1106 lines
36 KiB
C++
1106 lines
36 KiB
C++
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
|
|
|
|
|
|
#include <cstdlib>
|
|
#include <cstdio>
|
|
#include <cstring>
|
|
#include "verlet_cuda.h"
|
|
#include "neighbor.h"
|
|
#include "domain.h"
|
|
#include "comm.h"
|
|
#include "atom.h"
|
|
#include "atom_vec.h"
|
|
#include "force.h"
|
|
#include "pair.h"
|
|
#include "bond.h"
|
|
#include "angle.h"
|
|
#include "dihedral.h"
|
|
#include "improper.h"
|
|
#include "kspace.h"
|
|
#include "output.h"
|
|
#include "update.h"
|
|
#include "modify_cuda.h"
|
|
#include "compute.h"
|
|
#include "fix.h"
|
|
#include "timer.h"
|
|
#include "memory.h"
|
|
#include "error.h"
|
|
#include "cuda_wrapper_cu.h"
|
|
#include "thermo.h"
|
|
#include "cuda_pair_cu.h"
|
|
#include "cuda.h"
|
|
#include <ctime>
|
|
#include <cmath>
|
|
|
|
using namespace LAMMPS_NS;
|
|
|
|
#define MAX(a, b) ((a)>(b) ? (a) : (b))
|
|
#define MAKETIMEING
|
|
|
|
|
|
/* ----------------------------------------------------------------------
   Construct the integrator: forwards all arguments to the Verlet base
   class, then picks up the Cuda object from the LAMMPS instance.
------------------------------------------------------------------------- */

VerletCuda::VerletCuda(LAMMPS *lmp, int narg, char **arg)
  : Verlet(lmp, narg, arg)
{
  // the Cuda handle is taken from the LAMMPS instance
  cuda = lmp->cuda;

  // without a Cuda object this integrator cannot work; report through error->all()
  if (!cuda) {
    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
  }

  // run() reads the per-stage fix counters through this ModifyCuda view of modify
  modify_cuda = (ModifyCuda*) modify;
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
setup before run
|
|
------------------------------------------------------------------------- */
|
|
|
|
void VerletCuda::setup()
|
|
{
|
|
//debug related variables
|
|
cuda->debugdata[0]=0;
|
|
cuda->cu_debugdata->upload();
|
|
dotestatom=cuda->dotestatom;
|
|
int testatom=cuda->testatom;//48267;
|
|
|
|
|
|
MYDBG(printf("# CUDA VerletCuda::setup start\n"); )
|
|
|
|
cuda->oncpu = true;
|
|
cuda->begin_setup = true;
|
|
cuda->finished_run = false;
|
|
strcpy(update->integrate_style,"verlet");
|
|
|
|
time_pair=0;
|
|
time_kspace=0;
|
|
time_comm=0;
|
|
time_modify=0;
|
|
time_fulliterate=0;
|
|
|
|
atom->setup();
|
|
|
|
cuda_shared_atom* cu_atom = & cuda->shared_data.atom;
|
|
cuda_shared_domain* cu_domain = & cuda->shared_data.domain;
|
|
cuda_shared_pair* cu_pair = & cuda->shared_data.pair;
|
|
cu_atom->update_nlocal=1;
|
|
cu_atom->update_nmax=1;
|
|
|
|
if(atom->molecular||(force->kspace&&(not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = true;
|
|
|
|
cuda->setDomainParams();
|
|
|
|
|
|
if(cuda->shared_data.me==0)
|
|
printf("# CUDA: VerletCuda::setup: Allocate memory on device for maximum of %i atoms...\n", atom->nmax);
|
|
if(cuda->shared_data.me==0)
|
|
printf("# CUDA: Using precision: Global: %u X: %u V: %u F: %u PPPM: %u \n", CUDA_PRECISION==1?4:8,sizeof(X_FLOAT),sizeof(V_FLOAT),sizeof(F_FLOAT),sizeof(PPPM_FLOAT));
|
|
cuda->allocate();
|
|
|
|
|
|
if (comm->me == 0 && screen) fprintf(screen,"Setting up run ...\n");
|
|
|
|
// setup domain, communication and neighboring
|
|
// acquire ghosts
|
|
// build neighbor lists
|
|
|
|
if (triclinic) domain->x2lamda(atom->nlocal);
|
|
domain->pbc();
|
|
domain->reset_box();
|
|
comm->setup();
|
|
if (neighbor->style) neighbor->setup_bins();
|
|
comm->exchange();
|
|
if (atom->sortfreq > 0) atom->sort();
|
|
comm->borders();
|
|
if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
|
|
cuda->setSystemParams();
|
|
cuda->checkResize();
|
|
|
|
if(cuda->shared_data.me==0)
|
|
printf("# CUDA: VerletCuda::setup: Upload data...\n");
|
|
cuda->uploadAll();
|
|
neighbor->build();
|
|
neighbor->ncalls = 0;
|
|
cuda->uploadAllNeighborLists();
|
|
if(atom->mass)
|
|
cuda->cu_mass->upload();
|
|
|
|
if(cuda->cu_map_array)
|
|
cuda->cu_map_array->upload();
|
|
|
|
// compute all forces
|
|
|
|
ev_set(update->ntimestep);
|
|
if(elist_atom) cuda->shared_data.atom.need_eatom = 1;
|
|
if(vlist_atom) cuda->shared_data.atom.need_vatom = 1;
|
|
if(elist_atom||vlist_atom) cuda->checkResize();
|
|
|
|
|
|
int test_BpA_vs_TpA = true;
|
|
timespec starttime;
|
|
timespec endtime;
|
|
#ifdef NO_PREC_TIMING
|
|
double startsec,endsec;
|
|
#endif
|
|
//if(atom->molecular||(force->kspace&&(not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = false;
|
|
if(test_BpA_vs_TpA && cuda->shared_data.pair.cudable_force && force->pair &&(cuda->shared_data.pair.override_block_per_atom<0))
|
|
{
|
|
int StyleLoops=10;
|
|
if(cuda->shared_data.me==0)
|
|
printf("Test TpA\n");
|
|
cuda->shared_data.pair.use_block_per_atom = 0;
|
|
neighbor->build();
|
|
Cuda_Pair_GenerateXType(&cuda->shared_data);
|
|
if(cuda->cu_v_radius)
|
|
Cuda_Pair_GenerateVRadius(&cuda->shared_data);
|
|
if(cuda->cu_omega_rmass)
|
|
Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
|
|
force->pair->compute(eflag,vflag);
|
|
CudaWrapper_Sync();
|
|
#ifdef NO_PREC_TIMING
|
|
startsec = 1.0*clock()/CLOCKS_PER_SEC;
|
|
#endif
|
|
clock_gettime(CLOCK_REALTIME,&starttime);
|
|
for(int i=0;i<StyleLoops;i++)
|
|
{
|
|
Cuda_Pair_GenerateXType(&cuda->shared_data);
|
|
if(cuda->cu_v_radius)
|
|
Cuda_Pair_GenerateVRadius(&cuda->shared_data);
|
|
if(cuda->cu_omega_rmass)
|
|
Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
|
|
force->pair->compute(eflag,vflag);
|
|
CudaWrapper_Sync();
|
|
}
|
|
clock_gettime(CLOCK_REALTIME,&endtime);
|
|
|
|
double TpAtime=endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
|
|
#ifdef NO_PREC_TIMING
|
|
endsec = 1.0*clock()/CLOCKS_PER_SEC;
|
|
TpAtime = endsec - startsec;
|
|
#endif
|
|
if(cuda->shared_data.me==0)
|
|
printf("Test BpA\n");
|
|
cuda->shared_data.pair.use_block_per_atom = 1;
|
|
neighbor->build();
|
|
Cuda_Pair_GenerateXType(&cuda->shared_data);
|
|
if(cuda->cu_v_radius)
|
|
Cuda_Pair_GenerateVRadius(&cuda->shared_data);
|
|
if(cuda->cu_omega_rmass)
|
|
Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
|
|
force->pair->compute(eflag,vflag);
|
|
CudaWrapper_Sync();
|
|
|
|
clock_gettime(CLOCK_REALTIME,&starttime);
|
|
#ifdef NO_PREC_TIMING
|
|
startsec = 1.0*clock()/CLOCKS_PER_SEC;
|
|
#endif
|
|
for(int i=0;i<StyleLoops;i++)
|
|
{
|
|
Cuda_Pair_GenerateXType(&cuda->shared_data);
|
|
if(cuda->cu_v_radius)
|
|
Cuda_Pair_GenerateVRadius(&cuda->shared_data);
|
|
if(cuda->cu_omega_rmass)
|
|
Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
|
|
force->pair->compute(eflag,vflag);
|
|
CudaWrapper_Sync();
|
|
}
|
|
clock_gettime(CLOCK_REALTIME,&endtime);
|
|
double BpAtime=endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
|
|
#ifdef NO_PREC_TIMING
|
|
endsec = 1.0*clock()/CLOCKS_PER_SEC;
|
|
BpAtime = endsec - startsec;
|
|
#endif
|
|
|
|
if(cuda->shared_data.me==0)
|
|
printf("\n# CUDA: Timing of parallelisation layout with %i loops:\n",StyleLoops);
|
|
if(cuda->shared_data.me==0)
|
|
printf("# CUDA: BpA TpA\n %lf %lf\n",BpAtime,TpAtime);
|
|
if(BpAtime>TpAtime) cuda->shared_data.pair.use_block_per_atom = 0;
|
|
}
|
|
else
|
|
cuda->shared_data.pair.use_block_per_atom = cuda->shared_data.pair.override_block_per_atom;
|
|
//cuda->shared_data.pair.use_block_per_atom = 0;
|
|
if(atom->molecular||(force->kspace&&(not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = true;
|
|
neighbor->build();
|
|
neighbor->ncalls = 0;
|
|
|
|
force_clear();
|
|
|
|
cuda->cu_f->download();
|
|
if(cuda->cu_torque)
|
|
cuda->cu_torque->download();
|
|
|
|
//printf("# Verlet::setup: g f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);
|
|
|
|
MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute\n"); )
|
|
|
|
test_atom(testatom,"pre pair force");
|
|
|
|
if(cuda->shared_data.pair.cudable_force)
|
|
{
|
|
cuda->uploadAll();
|
|
Cuda_Pair_GenerateXType(&cuda->shared_data);
|
|
if(cuda->cu_v_radius)
|
|
Cuda_Pair_GenerateVRadius(&cuda->shared_data);
|
|
if(cuda->cu_omega_rmass)
|
|
Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
|
|
}
|
|
|
|
if (force->pair) force->pair->compute(eflag,vflag);
|
|
|
|
if(cuda->shared_data.pair.cudable_force)
|
|
{
|
|
if(cuda->shared_data.pair.collect_forces_later)
|
|
{
|
|
if(eflag) cuda->cu_eng_vdwl->upload();
|
|
if(eflag) cuda->cu_eng_coul->upload();
|
|
if(vflag) cuda->cu_virial->upload();
|
|
Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag);
|
|
if(eflag) cuda->cu_eng_vdwl->download();
|
|
if(eflag) cuda->cu_eng_coul->download();
|
|
if(vflag) cuda->cu_virial->download();
|
|
}
|
|
cuda->downloadAll();
|
|
}
|
|
|
|
test_atom(testatom,"post pair force");
|
|
|
|
MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute done\n"); )
|
|
//printf("# Verlet::setup: h f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);
|
|
|
|
if (atom->molecular) {
|
|
if (force->bond) force->bond->compute(eflag,vflag);
|
|
if (force->angle) force->angle->compute(eflag,vflag);
|
|
if (force->dihedral) force->dihedral->compute(eflag,vflag);
|
|
if (force->improper) force->improper->compute(eflag,vflag);
|
|
}
|
|
|
|
|
|
if(cuda->shared_data.pppm.cudable_force)
|
|
{
|
|
cuda->cu_tag ->upload();
|
|
cuda->cu_type->upload();
|
|
cuda->cu_x ->upload();
|
|
cuda->cu_v ->upload();
|
|
cuda->cu_f ->upload();
|
|
if(cu_atom->q_flag) cuda->cu_q->upload();
|
|
}
|
|
if (force->kspace) {
|
|
force->kspace->setup();
|
|
force->kspace->compute(eflag,vflag);
|
|
}
|
|
if(cuda->shared_data.pppm.cudable_force)
|
|
{
|
|
cuda->cu_f ->download();
|
|
}
|
|
|
|
test_atom(testatom,"post kspace");
|
|
|
|
cuda->uploadAll();
|
|
if (force->newton) comm->reverse_comm();
|
|
cuda->downloadAll();
|
|
|
|
test_atom(testatom,"post reverse comm");
|
|
|
|
if(cuda->shared_data.me==0)
|
|
printf("# CUDA: Total Device Memory useage post setup: %lf MB\n",1.0*CudaWrapper_CheckMemUseage()/1024/1024);
|
|
|
|
MYDBG( printf("# CUDA: VerletCuda::setup: call modify setup\n"); )
|
|
modify->setup(vflag);
|
|
|
|
MYDBG( printf("# CUDA: VerletCuda::setup: call modify setup done\n"); )
|
|
output->setup(1);
|
|
|
|
test_atom(testatom,"post setup");
|
|
|
|
MYDBG( printf("# CUDA: VerletCuda::setup: done\n"); )
|
|
cuda->finished_setup = true;
|
|
cuda->oncpu = false;
|
|
}
|
|
|
|
|
|
//this routine is in a messy state
|
|
void VerletCuda::setup_minimal(int flag)
|
|
{
|
|
|
|
|
|
dotestatom=0;
|
|
int testatom=104;
|
|
cuda->oncpu = true;
|
|
cuda->begin_setup = true;
|
|
cuda->finished_run = false;
|
|
MYDBG(printf("# CUDA VerletCuda::setup start\n"); )
|
|
strcpy(update->integrate_style,"verlet");
|
|
time_pair=0;
|
|
time_kspace=0;
|
|
time_comm=0;
|
|
time_modify=0;
|
|
time_fulliterate=0;
|
|
|
|
//cuda->allocate();
|
|
|
|
cuda_shared_atom* cu_atom = & cuda->shared_data.atom;
|
|
cuda_shared_domain* cu_domain = & cuda->shared_data.domain;
|
|
cuda_shared_pair* cu_pair = & cuda->shared_data.pair;
|
|
cu_atom->update_nlocal=1;
|
|
cu_atom->update_nmax=1;
|
|
|
|
if(atom->molecular) cuda->shared_data.pair.collect_forces_later = true;
|
|
|
|
cuda->setDomainParams();
|
|
|
|
|
|
|
|
if(cuda->shared_data.me==0)
|
|
printf("# CUDA: VerletCuda::setup: Allocate memory on device for maximum of %i atoms...\n", atom->nmax);
|
|
cuda->allocate();
|
|
|
|
|
|
|
|
|
|
// setup domain, communication and neighboring
|
|
// acquire ghosts
|
|
// build neighbor lists
|
|
|
|
if (flag) {
|
|
if (triclinic) domain->x2lamda(atom->nlocal);
|
|
domain->pbc();
|
|
domain->reset_box();
|
|
comm->setup();
|
|
if (neighbor->style) neighbor->setup_bins();
|
|
comm->exchange();
|
|
comm->borders();
|
|
if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
|
|
cuda->setSystemParams();
|
|
cuda->checkResize();
|
|
neighbor->build();
|
|
neighbor->ncalls = 0;
|
|
}
|
|
|
|
if(cuda->shared_data.me==0)
|
|
printf("# CUDA: VerletCuda::setup: Upload data...\n");
|
|
cuda->uploadAll();
|
|
cuda->uploadAllNeighborLists();
|
|
if(atom->mass)
|
|
cuda->cu_mass->upload();
|
|
|
|
if(cuda->cu_map_array)
|
|
cuda->cu_map_array->upload();
|
|
|
|
// compute all forces
|
|
|
|
ev_set(update->ntimestep);
|
|
if(elist_atom) cuda->shared_data.atom.need_eatom = 1;
|
|
if(vlist_atom) cuda->shared_data.atom.need_vatom = 1;
|
|
if(elist_atom||vlist_atom) cuda->checkResize();
|
|
|
|
force_clear();
|
|
cuda->cu_f->download();
|
|
|
|
//printf("# Verlet::setup: g f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);
|
|
|
|
cuda->cu_mass->upload();
|
|
MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute\n"); )
|
|
|
|
test_atom(testatom,"pre pair force");
|
|
|
|
if(cuda->shared_data.pair.cudable_force)
|
|
{
|
|
cuda->uploadAll();
|
|
Cuda_Pair_GenerateXType(&cuda->shared_data);
|
|
if(cuda->cu_v_radius)
|
|
Cuda_Pair_GenerateVRadius(&cuda->shared_data);
|
|
if(cuda->cu_omega_rmass)
|
|
Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
|
|
}
|
|
|
|
if (force->pair) force->pair->compute(eflag,vflag);
|
|
|
|
if(cuda->shared_data.pair.cudable_force)
|
|
{
|
|
if(cuda->shared_data.pair.collect_forces_later)
|
|
{
|
|
if(eflag) cuda->cu_eng_vdwl->upload();
|
|
if(eflag) cuda->cu_eng_coul->upload();
|
|
if(vflag) cuda->cu_virial->upload();
|
|
Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag);
|
|
if(eflag) cuda->cu_eng_vdwl->download();
|
|
if(eflag) cuda->cu_eng_coul->download();
|
|
if(vflag) cuda->cu_virial->download();
|
|
}
|
|
cuda->downloadAll();
|
|
}
|
|
|
|
test_atom(testatom,"post pair force");
|
|
|
|
MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute done\n"); )
|
|
//printf("# Verlet::setup: h f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);
|
|
|
|
if (atom->molecular) {
|
|
if (force->bond) force->bond->compute(eflag,vflag);
|
|
if (force->angle) force->angle->compute(eflag,vflag);
|
|
if (force->dihedral) force->dihedral->compute(eflag,vflag);
|
|
if (force->improper) force->improper->compute(eflag,vflag);
|
|
}
|
|
|
|
|
|
if(cuda->shared_data.pppm.cudable_force)
|
|
{
|
|
cuda->cu_tag ->upload();
|
|
cuda->cu_type->upload();
|
|
cuda->cu_x ->upload();
|
|
cuda->cu_v ->upload();
|
|
cuda->cu_f ->upload();
|
|
if(cu_atom->q_flag) cuda->cu_q->upload();
|
|
}
|
|
if (force->kspace) {
|
|
force->kspace->setup();
|
|
force->kspace->compute(eflag,vflag);
|
|
}
|
|
if(cuda->shared_data.pppm.cudable_force)
|
|
{
|
|
cuda->cu_f ->download();
|
|
}
|
|
|
|
test_atom(testatom,"post kspace");
|
|
|
|
cuda->uploadAll();
|
|
if (force->newton) comm->reverse_comm();
|
|
cuda->downloadAll();
|
|
|
|
test_atom(testatom,"post reverse comm");
|
|
|
|
if(cuda->shared_data.me==0)
|
|
printf("# CUDA: Total Device Memory useage post setup: %lf MB\n",1.0*CudaWrapper_CheckMemUseage()/1024/1024);
|
|
|
|
MYDBG( printf("# CUDA: VerletCuda::setup: call modify setup\n"); )
|
|
modify->setup(vflag);
|
|
|
|
MYDBG( printf("# CUDA: VerletCuda::setup: done\n"); )
|
|
cuda->finished_setup=true;
|
|
cuda->oncpu=false;
|
|
}
|
|
|
|
//#define TESTATOM
|
|
/* ----------------------------------------------------------------------
|
|
iterate for n steps
|
|
------------------------------------------------------------------------- */
|
|
|
|
void VerletCuda::run(int n)
|
|
{
|
|
dotestatom=cuda->dotestatom;
|
|
int testatom=cuda->testatom;//48267;
|
|
|
|
|
|
timespec starttime;
|
|
timespec endtime;
|
|
timespec starttotal;
|
|
timespec endtotal;
|
|
|
|
cuda->setTimingsZero();
|
|
|
|
static double testtime=0.0;
|
|
// clock_gettime(CLOCK_REALTIME,&starttime);
|
|
// clock_gettime(CLOCK_REALTIME,&endtime);
|
|
// testtime+=endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
|
|
// printf("Time: %lf\n",testtime);*/
|
|
|
|
|
|
cuda_shared_domain* cu_domain = & cuda->shared_data.domain;
|
|
|
|
int nflag,ntimestep,sortflag;
|
|
|
|
int n_initial_integrate = modify_cuda->n_initial_integrate;
|
|
int n_post_integrate = modify_cuda->n_post_integrate;
|
|
int n_final_integrate = modify_cuda->n_final_integrate;
|
|
int n_pre_exchange = modify_cuda->n_pre_exchange;
|
|
int n_pre_neighbor = modify_cuda->n_pre_neighbor;
|
|
int n_pre_force = modify_cuda->n_pre_force;
|
|
int n_post_force = modify_cuda->n_post_force;
|
|
int n_end_of_step = modify_cuda->n_end_of_step;
|
|
MYDBG(printf("# CUDA: Fixes: i_int: %i p_int: %i f_int: %i pr_exc: %i pr_neigh: %i pr_f: %i p_f: %i eos: %i\n",
|
|
n_initial_integrate,n_post_integrate,n_final_integrate,n_pre_exchange,n_pre_neighbor,n_pre_force,n_post_force,n_end_of_step);)
|
|
|
|
if (atom->sortfreq > 0) sortflag = 1;
|
|
else sortflag = 0;
|
|
|
|
|
|
if(cuda->shared_data.me==0)
|
|
{
|
|
if((not cuda->shared_data.pair.cudable_force)&&(force->pair))
|
|
error->warning("# CUDA: You asked for a Verlet integration using Cuda, "
|
|
"but selected a pair force which has not yet been ported to Cuda");
|
|
if((not cuda->shared_data.pppm.cudable_force)&&(force->kspace))
|
|
error->warning("# CUDA: You asked for a Verlet integration using Cuda, "
|
|
"but selected a kspace force which has not yet been ported to Cuda");
|
|
if(modify_cuda->n_post_integrate_host+modify_cuda->n_pre_exchange_host+modify_cuda->n_pre_neighbor_host+modify_cuda->n_pre_force_host+modify_cuda->n_post_force_host+modify_cuda->n_end_of_step_host+modify_cuda->n_initial_integrate_host+modify_cuda->n_final_integrate_host)
|
|
error->warning("# CUDA: You asked for a Verlet integration using Cuda, "
|
|
"but several fixes have not yet been ported to Cuda.\n"
|
|
"This can cause a severe speed penalty due to frequent data synchronization between host and GPU.");
|
|
if(atom->firstgroupname)
|
|
error->warning("Warning: firstgroupname is used, this will cause additional data transfers.");
|
|
}
|
|
cuda->uploadAll();
|
|
|
|
if(cuda->neighbor_decide_by_integrator && cuda->cu_xhold)
|
|
{
|
|
const int n=cuda->shared_data.atom.maxhold;
|
|
CudaWrapper_CopyData(cuda->cu_xhold->dev_data(),cuda->cu_x->dev_data(),n*sizeof(X_FLOAT));
|
|
CudaWrapper_CopyData((void*) &((X_FLOAT*)cuda->cu_xhold->dev_data())[n],(void*) &((X_FLOAT*)cuda->cu_x->dev_data())[atom->nmax],n*sizeof(X_FLOAT));
|
|
CudaWrapper_CopyData((void*) &((X_FLOAT*)cuda->cu_xhold->dev_data())[2*n],(void*) &((X_FLOAT*)cuda->cu_x->dev_data())[2*atom->nmax],n*sizeof(X_FLOAT));
|
|
}
|
|
|
|
cuda->shared_data.atom.reneigh_flag=0;
|
|
cuda->shared_data.atom.update_nlocal=1;
|
|
cuda->shared_data.atom.update_nmax=1;
|
|
cuda->shared_data.domain.update=1;
|
|
cuda->shared_data.buffer_new=1;
|
|
cuda->uploadtime=0;
|
|
cuda->downloadtime=0;
|
|
int firstreneigh=1;
|
|
|
|
for(int i = 0; i < n; i++)
|
|
{
|
|
ntimestep = ++update->ntimestep;
|
|
ev_set(ntimestep);
|
|
|
|
// initial time integration
|
|
|
|
test_atom(testatom,"Pre initial");
|
|
|
|
MYDBG( printf("# CUDA VerletCuda::iterate: before initial_integrate\n"); )
|
|
|
|
modify->initial_integrate(vflag);
|
|
|
|
MYDBG( printf("# CUDA VerletCuda::iterate: after initial_integrate\n"); )
|
|
|
|
if(n_post_integrate) modify->post_integrate();
|
|
|
|
|
|
|
|
// regular communication vs neighbor list rebuild
|
|
|
|
test_atom(testatom,"Pre Exchange");
|
|
|
|
MYDBG( printf("# CUDA VerletCuda::iterate: before neighbor decide\n"); )
|
|
nflag = neighbor->decide();
|
|
if(nflag == 0)
|
|
{
|
|
MYDBG( printf("# CUDA VerletCuda::iterate: communicate\n"); )
|
|
timer->stamp();
|
|
|
|
if((not (eflag||vflag))&&(cuda->shared_data.overlap_comm))
|
|
{
|
|
//overlap forward communication of ghost atom positions with inner force calculation (interactions between local atoms)
|
|
//build communication buffers
|
|
// printf("Pre forward_comm(1)\n");
|
|
clock_gettime(CLOCK_REALTIME,&starttotal);
|
|
cuda->shared_data.atom.reneigh_flag=0;
|
|
clock_gettime(CLOCK_REALTIME,&starttime);
|
|
timer->stamp();
|
|
comm->forward_comm(1);
|
|
timer->stamp(TIME_COMM);
|
|
clock_gettime(CLOCK_REALTIME,&endtime);
|
|
cuda->shared_data.cuda_timings.comm_forward_total+=
|
|
endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
|
|
|
|
//prepare force calculation
|
|
// printf("Pre force_clear\n");
|
|
force_clear();
|
|
// printf("Pre Generate XType\n");
|
|
Cuda_Pair_GenerateXType(&cuda->shared_data);
|
|
if(cuda->cu_v_radius)
|
|
Cuda_Pair_GenerateVRadius(&cuda->shared_data);
|
|
if(cuda->cu_omega_rmass)
|
|
Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
|
|
|
|
//start force calculation asynchronus
|
|
cuda->shared_data.comm.comm_phase=1;
|
|
// printf("Pre Force Compute\n");
|
|
force->pair->compute(eflag, vflag);
|
|
timer->stamp(TIME_PAIR);
|
|
//CudaWrapper_Sync();
|
|
|
|
//download comm buffers from GPU, perform MPI communication and upload buffers again
|
|
clock_gettime(CLOCK_REALTIME,&starttime);
|
|
// printf("Pre forward_comm(2)\n");
|
|
comm->forward_comm(2);
|
|
clock_gettime(CLOCK_REALTIME,&endtime);
|
|
cuda->shared_data.cuda_timings.comm_forward_total+=
|
|
endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
|
|
timer->stamp(TIME_COMM);
|
|
|
|
//wait for force calculation
|
|
//printf("Pre Synch\n");
|
|
CudaWrapper_Sync();
|
|
timer->stamp(TIME_PAIR);
|
|
|
|
//unpack communication buffers
|
|
clock_gettime(CLOCK_REALTIME,&starttime);
|
|
// printf("Pre forward_comm(3)\n");
|
|
comm->forward_comm(3);
|
|
clock_gettime(CLOCK_REALTIME,&endtime);
|
|
// printf("Post forward_comm(3)\n");
|
|
cuda->shared_data.cuda_timings.comm_forward_total+=
|
|
endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
|
|
|
|
timer->stamp(TIME_COMM);
|
|
MYDBG( printf("# CUDA VerletCuda::iterate: communicate done\n"); )
|
|
cuda->shared_data.cuda_timings.test1+=
|
|
endtotal.tv_sec-starttotal.tv_sec+1.0*(endtotal.tv_nsec-starttotal.tv_nsec)/1000000000;
|
|
}
|
|
else
|
|
{
|
|
//perform standard forward communication
|
|
//printf("Forward_comm\n");
|
|
clock_gettime(CLOCK_REALTIME,&starttime);
|
|
comm->forward_comm();
|
|
clock_gettime(CLOCK_REALTIME,&endtime);
|
|
//printf("Forward_comm_done\n");
|
|
cuda->shared_data.cuda_timings.comm_forward_total+=
|
|
endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
|
|
timer->stamp(TIME_COMM);
|
|
MYDBG( printf("# CUDA VerletCuda::iterate: communicate done\n"); )
|
|
}
|
|
}
|
|
else
|
|
{
|
|
int nlocalold=cuda->shared_data.atom.nlocal;
|
|
//if(firstreneigh)
|
|
{
|
|
cuda->shared_data.atom.update_nlocal=1;
|
|
cuda->shared_data.atom.update_nmax=1;
|
|
firstreneigh=0;
|
|
}
|
|
cuda->shared_data.buffer_new=1;
|
|
MYDBG( printf("# CUDA VerletCuda::iterate: neighbor\n"); )
|
|
cuda->setDomainParams();
|
|
if(n_pre_exchange) modify->pre_exchange();
|
|
if(atom->nlocal!=cuda->shared_data.atom.nlocal) //did someone add atoms during pre_exchange?
|
|
{
|
|
cuda->checkResize();
|
|
cuda->uploadAll();
|
|
}
|
|
|
|
//check domain changes
|
|
if(domain->triclinic) domain->x2lamda(atom->nlocal);
|
|
MYDBG( printf("# CUDA VerletCuda::iterate: neighbor pbc\n"); )
|
|
domain->pbc();
|
|
if(domain->box_change)
|
|
{
|
|
domain->reset_box();
|
|
comm->setup();
|
|
if (neighbor->style) neighbor->setup_bins();
|
|
|
|
}
|
|
timer->stamp();
|
|
MYDBG( printf("# CUDA VerletCuda::iterate: neighbor exchange\n"); )
|
|
|
|
//perform exchange of local atoms
|
|
clock_gettime(CLOCK_REALTIME,&starttime);
|
|
comm->exchange();
|
|
clock_gettime(CLOCK_REALTIME,&endtime);
|
|
|
|
//special and nspecial fields of the atom data are not currently transfered via the GPU buffer might be changed in the future
|
|
if(comm->nprocs>1)
|
|
{
|
|
clock_gettime(CLOCK_REALTIME,&starttime);
|
|
if(atom->special)
|
|
cuda->cu_special->upload();
|
|
if(atom->nspecial)
|
|
cuda->cu_nspecial->upload();
|
|
clock_gettime(CLOCK_REALTIME,&endtime);
|
|
cuda->shared_data.cuda_timings.test1+=
|
|
endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
|
|
}
|
|
|
|
cuda->shared_data.cuda_timings.comm_exchange_total+=
|
|
endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
|
|
|
|
if(nlocalold!=cuda->shared_data.atom.nlocal) cuda->shared_data.atom.update_nlocal=2;
|
|
|
|
//sort atoms
|
|
if (sortflag && ntimestep >= atom->nextsort) atom->sort();
|
|
MYDBG( printf("# CUDA VerletCuda::iterate: neighbor borders\n"); )
|
|
|
|
//generate ghost atom lists, and transfer ghost atom data
|
|
clock_gettime(CLOCK_REALTIME,&starttime);
|
|
comm->borders();
|
|
clock_gettime(CLOCK_REALTIME,&endtime);
|
|
cuda->shared_data.cuda_timings.comm_border_total+=
|
|
endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
|
|
|
|
clock_gettime(CLOCK_REALTIME,&starttime);
|
|
//atom index maps are generated on CPU, and need to be transfered to GPU if they are used
|
|
if(cuda->cu_map_array)
|
|
cuda->cu_map_array->upload();
|
|
|
|
|
|
if(domain->triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
|
|
|
|
if(n_pre_neighbor) modify->pre_neighbor();
|
|
|
|
cuda->shared_data.buffer_new=2;
|
|
if(atom->molecular) cuda->cu_molecule->download();
|
|
MYDBG( printf("# CUDA VerletCuda::iterate: neighbor build\n"); )
|
|
timer->stamp(TIME_COMM);
|
|
clock_gettime(CLOCK_REALTIME,&endtime);
|
|
cuda->shared_data.cuda_timings.test2+=
|
|
endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
|
|
|
|
//rebuild neighbor list
|
|
test_atom(testatom,"Pre Neighbor");
|
|
neighbor->build();
|
|
timer->stamp(TIME_NEIGHBOR);
|
|
MYDBG( printf("# CUDA VerletCuda::iterate: neighbor done\n"); )
|
|
|
|
//if bonded interactions are used (in this case collect_forces_later is true), transfer data which only changes upon exchange/border routines from GPU to CPU
|
|
if(cuda->shared_data.pair.collect_forces_later)
|
|
{
|
|
if(cuda->cu_molecule) cuda->cu_molecule->download();
|
|
cuda->cu_tag->download();
|
|
cuda->cu_type->download();
|
|
cuda->cu_mask->download();
|
|
if(cuda->cu_q) cuda->cu_q->download();
|
|
}
|
|
cuda->shared_data.comm.comm_phase=3;
|
|
}
|
|
|
|
test_atom(testatom,"Post Exchange");
|
|
|
|
// force computations
|
|
|
|
//only do force_clear if it has not been done during overlap of communication with local interactions
|
|
if(not((not (eflag||vflag))&&(cuda->shared_data.overlap_comm)&&(cuda->shared_data.comm.comm_phase<3)))
|
|
force_clear();
|
|
|
|
if(n_pre_force) modify->pre_force(vflag);
|
|
|
|
timer->stamp();
|
|
|
|
//if overlap of bonded interactions with nonbonded interactions takes place, download forces and positions
|
|
/* if(cuda->shared_data.pair.collect_forces_later)
|
|
{
|
|
cuda->cu_x->downloadAsync(2);
|
|
cuda->cu_f->downloadAsync(2);
|
|
}*/
|
|
|
|
if(force->pair)
|
|
{
|
|
if((not (eflag||vflag))&&(cuda->shared_data.overlap_comm)&&(cuda->shared_data.comm.comm_phase<3)&&cuda->shared_data.pair.cudable_force)
|
|
{
|
|
//second part of force calculations in case of overlaping it with commuincation. Only interactions between local and ghost atoms are done now
|
|
//regenerate data layout for force computations, its actually only needed for the ghost atoms
|
|
cuda->shared_data.comm.comm_phase=2;
|
|
|
|
timespec atime1,atime2;
|
|
clock_gettime(CLOCK_REALTIME,&atime1);
|
|
|
|
Cuda_Pair_GenerateXType(&cuda->shared_data);
|
|
if(cuda->cu_v_radius)
|
|
Cuda_Pair_GenerateVRadius(&cuda->shared_data);
|
|
if(cuda->cu_omega_rmass)
|
|
Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
|
|
|
|
clock_gettime(CLOCK_REALTIME,&atime2);
|
|
cuda->shared_data.cuda_timings.pair_xtype_conversion+=
|
|
atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000;
|
|
force->pair->compute(eflag, vflag);
|
|
|
|
}
|
|
else
|
|
{
|
|
//calculate complete pair interactions
|
|
if(not cuda->shared_data.pair.cudable_force) cuda->downloadAll();
|
|
else
|
|
{
|
|
//regenerate data layout for force computations, its actually only needed for the ghost atoms
|
|
timespec atime1,atime2;
|
|
clock_gettime(CLOCK_REALTIME,&atime1);
|
|
|
|
Cuda_Pair_GenerateXType(&cuda->shared_data);
|
|
if(cuda->cu_v_radius)
|
|
Cuda_Pair_GenerateVRadius(&cuda->shared_data);
|
|
if(cuda->cu_omega_rmass)
|
|
Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
|
|
|
|
clock_gettime(CLOCK_REALTIME,&atime2);
|
|
cuda->shared_data.cuda_timings.pair_xtype_conversion+=
|
|
atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000;
|
|
}
|
|
cuda->shared_data.comm.comm_phase=0;
|
|
force->pair->compute(eflag, vflag);
|
|
}
|
|
|
|
if(not cuda->shared_data.pair.cudable_force) cuda->uploadAll();
|
|
|
|
//wait for force calculation in case of not using overlap with bonded interactions
|
|
if(not cuda->shared_data.pair.collect_forces_later)
|
|
CudaWrapper_Sync();
|
|
|
|
timer->stamp(TIME_PAIR);
|
|
}
|
|
|
|
//calculate bonded interactions
|
|
if(atom->molecular)
|
|
{
|
|
cuda->cu_x->downloadAsync(2);
|
|
if(n_pre_force==0) Verlet::force_clear();
|
|
else cuda->cu_f->downloadAsync(2);
|
|
|
|
timer->stamp(TIME_PAIR);
|
|
|
|
test_atom(testatom,"pre bond force");
|
|
if(force->bond) force->bond->compute(eflag, vflag);
|
|
if(force->angle) force->angle->compute(eflag, vflag);
|
|
if(force->dihedral) force->dihedral->compute(eflag, vflag);
|
|
if(force->improper) force->improper->compute(eflag, vflag);
|
|
timer->stamp(TIME_BOND);
|
|
}
|
|
|
|
//collect forces in case pair force and bonded interactions were overlapped, and either no KSPACE or a GPU KSPACE style is used
|
|
if(cuda->shared_data.pair.collect_forces_later&&cuda->shared_data.pair.cudable_force&&(not (force->kspace&&(not cuda->shared_data.pppm.cudable_force))))
|
|
{
|
|
clock_gettime(CLOCK_REALTIME,&starttime);
|
|
cuda->cu_f->uploadAsync(2);
|
|
|
|
test_atom(testatom,"post molecular force");
|
|
|
|
|
|
if(eflag) cuda->cu_eng_vdwl->upload();
|
|
if(eflag) cuda->cu_eng_coul->upload();
|
|
if(vflag) cuda->cu_virial->upload();
|
|
Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag);
|
|
if(eflag) cuda->cu_eng_vdwl->download();
|
|
if(eflag) cuda->cu_eng_coul->download();
|
|
if(vflag) cuda->cu_virial->download();
|
|
timer->stamp(TIME_PAIR);
|
|
|
|
clock_gettime(CLOCK_REALTIME,&endtime);
|
|
cuda->shared_data.cuda_timings.pair_force_collection+=
|
|
endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
|
|
}
|
|
|
|
//compute kspace force
|
|
if(force->kspace)
|
|
{
|
|
if((not cuda->shared_data.pppm.cudable_force) && (not cuda->shared_data.pair.collect_forces_later))
|
|
cuda->downloadAll();
|
|
if((not cuda->shared_data.pppm.cudable_force) && (cuda->shared_data.pair.collect_forces_later) && (not atom->molecular))
|
|
{
|
|
cuda->cu_x->downloadAsync(2);
|
|
if(n_pre_force==0) Verlet::force_clear();
|
|
else cuda->cu_f->downloadAsync(2);
|
|
|
|
timer->stamp(TIME_PAIR);
|
|
}
|
|
|
|
force->kspace->compute(eflag,vflag);
|
|
if((not cuda->shared_data.pppm.cudable_force) && (not cuda->shared_data.pair.collect_forces_later))
|
|
cuda->uploadAll();
|
|
timer->stamp(TIME_KSPACE);
|
|
}
|
|
|
|
//collect forces in case pair forces and kspace was overlaped
|
|
if(cuda->shared_data.pair.collect_forces_later&&cuda->shared_data.pair.cudable_force&&((force->kspace&&(not cuda->shared_data.pppm.cudable_force))))
|
|
{
|
|
cuda->cu_f->uploadAsync(2);
|
|
|
|
clock_gettime(CLOCK_REALTIME,&starttime);
|
|
|
|
if(eflag) cuda->cu_eng_vdwl->upload();
|
|
if(eflag) cuda->cu_eng_coul->upload();
|
|
if(vflag) cuda->cu_virial->upload();
|
|
Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag);
|
|
if(eflag) cuda->cu_eng_vdwl->download();
|
|
if(eflag) cuda->cu_eng_coul->download();
|
|
if(vflag) cuda->cu_virial->download();
|
|
timer->stamp(TIME_PAIR);
|
|
|
|
clock_gettime(CLOCK_REALTIME,&endtime);
|
|
cuda->shared_data.cuda_timings.pair_force_collection+=
|
|
endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
|
|
}
|
|
|
|
//send forces on ghost atoms back to other GPU: THIS SHOULD NEVER HAPPEN
|
|
if(force->newton)
|
|
{
|
|
comm->reverse_comm();
|
|
timer->stamp(TIME_COMM);
|
|
}
|
|
test_atom(testatom,"post force");
|
|
// force modifications, final time integration, diagnostics
|
|
|
|
if(n_post_force) modify->post_force(vflag);
|
|
|
|
test_atom(testatom,"pre final");
|
|
|
|
modify->final_integrate();
|
|
|
|
test_atom(testatom,"post final");
|
|
|
|
if(n_end_of_step) modify->end_of_step();
|
|
|
|
// all output
|
|
|
|
test_atom(testatom,"pre output");
|
|
|
|
if(ntimestep == output->next)
|
|
{
|
|
if(not output->thermo->cudable)
|
|
cuda->downloadAll();
|
|
timer->stamp();
|
|
output->write(ntimestep);
|
|
timer->stamp(TIME_OUTPUT);
|
|
}
|
|
|
|
|
|
test_atom(testatom,"post output");
|
|
|
|
if(cuda->shared_data.atom.update_nlocal>0)
|
|
cuda->shared_data.atom.update_nlocal--;
|
|
if(cuda->shared_data.atom.update_nmax>0)
|
|
cuda->shared_data.atom.update_nmax--;
|
|
if(cuda->shared_data.domain.update>0)
|
|
cuda->shared_data.domain.update--;
|
|
if(cuda->shared_data.buffer_new>0)
|
|
cuda->shared_data.buffer_new--;
|
|
cuda->shared_data.atom.reneigh_flag=0;
|
|
}
|
|
|
|
|
|
cuda->downloadAll();
|
|
cuda->downloadAllNeighborLists();
|
|
cuda->shared_data.atom.update_nlocal=1;
|
|
cuda->shared_data.atom.update_nmax=1;
|
|
cuda->shared_data.buffer_new=1;
|
|
cuda->shared_data.domain.update=1;
|
|
cuda->oncpu = true;
|
|
cuda->finished_run = true;
|
|
}
|
|
|
|
|
|
/* ----------------------------------------------------------------------
|
|
clear force on own & ghost atoms
|
|
setup and clear other arrays as needed
|
|
------------------------------------------------------------------------- */
|
|
|
|
void VerletCuda::force_clear()
|
|
{
|
|
cuda->cu_f->memset_device(0);
|
|
if(cuda->cu_torque) cuda->cu_torque->memset_device(0);
|
|
return;
|
|
|
|
//The rest should not be necessary
|
|
int i;
|
|
for(i=0;i<atom->nlocal;i++)
|
|
{
|
|
atom->f[i][0]=0.0;
|
|
atom->f[i][1]=0.0;
|
|
atom->f[i][2]=0.0;
|
|
}
|
|
// clear force on all particles
|
|
// if either newton flag is set, also include ghosts
|
|
|
|
if (neighbor->includegroup == 0) {
|
|
int nall;
|
|
if (force->newton) nall = atom->nlocal + atom->nghost;
|
|
else nall = atom->nlocal;
|
|
if (torqueflag) {
|
|
double **torque = atom->torque;
|
|
for (i = 0; i < nall; i++) {
|
|
torque[i][0] = 0.0;
|
|
torque[i][1] = 0.0;
|
|
torque[i][2] = 0.0;
|
|
}
|
|
}
|
|
|
|
// neighbor includegroup flag is set
|
|
// clear force only on initial nfirst particles
|
|
// if either newton flag is set, also include ghosts
|
|
|
|
} else {
|
|
int nall = atom->nfirst;
|
|
|
|
|
|
if (torqueflag) {
|
|
double **torque = atom->torque;
|
|
for (i = 0; i < nall; i++) {
|
|
torque[i][0] = 0.0;
|
|
torque[i][1] = 0.0;
|
|
torque[i][2] = 0.0;
|
|
}
|
|
}
|
|
|
|
if (force->newton) {
|
|
nall = atom->nlocal + atom->nghost;
|
|
|
|
if (torqueflag) {
|
|
double **torque = atom->torque;
|
|
for (i = atom->nlocal; i < nall; i++) {
|
|
torque[i][0] = 0.0;
|
|
torque[i][1] = 0.0;
|
|
torque[i][2] = 0.0;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void VerletCuda::test_atom(int aatom, char* string) //printing properties of one atom for test purposes
|
|
{
|
|
if(not dotestatom) return;
|
|
bool check=false;
|
|
if(cuda->finished_setup) cuda->downloadAll();
|
|
for(int i=0;i<atom->nlocal+atom->nghost;i++)
|
|
{
|
|
if((atom->tag[i]==aatom)&&(i<atom->nlocal))
|
|
{
|
|
|
|
printf("%i # CUDA %s: %i %i %e %e %e %i ",comm->me,string,update->ntimestep,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i);
|
|
if(atom->molecular && (i<atom->nlocal))
|
|
{
|
|
printf(" // %i %i %i ",atom->num_bond[i],atom->num_angle[i],atom->num_dihedral[i]);
|
|
for(int k=0;k<atom->num_bond[i];k++)
|
|
printf("// %i %i ",atom->bond_type[i][k],atom->bond_atom[i][k]);
|
|
}
|
|
printf("\n");
|
|
}
|
|
if(i<atom->nlocal)
|
|
{
|
|
if((atom->v[i][0]<-100||atom->v[i][0]>100)||
|
|
(atom->v[i][1]<-100||atom->v[i][1]>100)||
|
|
(atom->v[i][2]<-100||atom->v[i][2]>100)||
|
|
(atom->v[i][0]!=atom->v[i][0])||
|
|
(atom->v[i][1]!=atom->v[i][1])||
|
|
(atom->v[i][2]!=atom->v[i][2]))
|
|
{printf("%i # CUDA %s velocity: %i %e %e %e %i\n",comm->me,string,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i); check=true;}
|
|
if((atom->f[i][0]<-10000||atom->f[i][0]>10000)||
|
|
(atom->f[i][1]<-10000||atom->f[i][1]>10000)||
|
|
(atom->f[i][2]<-10000||atom->f[i][2]>10000)||
|
|
(atom->f[i][0]!=atom->f[i][0])||
|
|
(atom->f[i][1]!=atom->f[i][1])||
|
|
(atom->f[i][2]!=atom->f[i][2]))
|
|
{printf("%i # CUDA %s force: %i %e %e %e %i\n",comm->me,string,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i); check=true;}
|
|
if(atom->tag[i]<=0)
|
|
printf("%i # CUDA %s tag: %i %e %e %e %i\n",comm->me,string,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i);
|
|
}
|
|
}
|
|
if(check) exit(0);
|
|
}
|