lammps/src/USER-CUDA/verlet_cuda.cpp

/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */


#include <cstdlib>
#include <cstdio>
#include <cstring>
#include "verlet_cuda.h"
#include "neighbor.h"
#include "domain.h"
#include "comm.h"
#include "atom.h"
#include "atom_vec.h"
#include "force.h"
#include "pair.h"
#include "bond.h"
#include "angle.h"
#include "dihedral.h"
#include "improper.h"
#include "kspace.h"
#include "output.h"
#include "update.h"
#include "modify_cuda.h"
#include "compute.h"
#include "fix.h"
#include "timer.h"
#include "memory.h"
#include "error.h"
#include "cuda_wrapper_cu.h"
#include "thermo.h"
#include "cuda_pair_cu.h"
#include "cuda.h"
#include <ctime>
#include <cmath>

using namespace LAMMPS_NS;

#define MAX(a, b) ((a)>(b) ? (a) : (b))
#define MAKETIMEING


VerletCuda::VerletCuda(LAMMPS *lmp, int narg, char **arg) : Verlet(lmp, narg, arg) {
  cuda = lmp->cuda;
   if(cuda == NULL)
        error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");

	modify_cuda=(ModifyCuda*) modify;
}

/* ----------------------------------------------------------------------
   setup before run
------------------------------------------------------------------------- */

void VerletCuda::setup()
{
	//debug related variables
	cuda->debugdata[0]=0;
	cuda->cu_debugdata->upload();
	dotestatom=cuda->dotestatom;
	int testatom=cuda->testatom;//48267;


	MYDBG(printf("# CUDA VerletCuda::setup start\n"); )

	cuda->oncpu = true;
	cuda->begin_setup = true;
	cuda->finished_run = false;
	strcpy(update->integrate_style,"verlet");

	time_pair=0;
	time_kspace=0;
	time_comm=0;
	time_modify=0;
	time_fulliterate=0;

    atom->setup();

	cuda_shared_atom*   cu_atom   = & cuda->shared_data.atom;
	cuda_shared_domain* cu_domain = & cuda->shared_data.domain;
	cuda_shared_pair*   cu_pair   = & cuda->shared_data.pair;
	cu_atom->update_nlocal=1;
	cu_atom->update_nmax=1;

	if(atom->molecular||(force->kspace&&(not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = true;

	cuda->setDomainParams();


	if(cuda->shared_data.me==0)
	printf("# CUDA: VerletCuda::setup: Allocate memory on device for maximum of %i atoms...\n", atom->nmax);
	if(cuda->shared_data.me==0)
	printf("# CUDA: Using precision: Global: %u X: %u V: %u F: %u PPPM: %u \n", CUDA_PRECISION==1?4:8,sizeof(X_FLOAT),sizeof(V_FLOAT),sizeof(F_FLOAT),sizeof(PPPM_FLOAT));
	cuda->allocate();


  if (comm->me == 0 && screen) fprintf(screen,"Setting up run ...\n");

  // setup domain, communication and neighboring
  // acquire ghosts
  // build neighbor lists

  if (triclinic) domain->x2lamda(atom->nlocal);
  domain->pbc();
  domain->reset_box();
  comm->setup();
  if (neighbor->style) neighbor->setup_bins();
  comm->exchange();
  if (atom->sortfreq > 0) atom->sort();
  comm->borders();
  if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
  cuda->setSystemParams();
  cuda->checkResize();

  if(cuda->shared_data.me==0)
  printf("# CUDA: VerletCuda::setup: Upload data...\n");
  cuda->uploadAll();
  neighbor->build();
  neighbor->ncalls = 0;
  cuda->uploadAllNeighborLists();
  if(atom->mass)
  cuda->cu_mass->upload();

  if(cuda->cu_map_array)
  cuda->cu_map_array->upload();

  // compute all forces

  ev_set(update->ntimestep);
  if(elist_atom) cuda->shared_data.atom.need_eatom = 1;
  if(vlist_atom) cuda->shared_data.atom.need_vatom = 1;
  if(elist_atom||vlist_atom) cuda->checkResize();


  int test_BpA_vs_TpA = true;
  timespec starttime;
  timespec endtime;
  #ifdef NO_PREC_TIMING
    double startsec,endsec;
  #endif
 	//if(atom->molecular||(force->kspace&&(not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = false;
   if(test_BpA_vs_TpA && cuda->shared_data.pair.cudable_force && force->pair &&(cuda->shared_data.pair.override_block_per_atom<0))
   {
     int StyleLoops=10;
     if(cuda->shared_data.me==0)
    printf("Test TpA\n");
     cuda->shared_data.pair.use_block_per_atom = 0;
     neighbor->build();
     Cuda_Pair_GenerateXType(&cuda->shared_data);
	 if(cuda->cu_v_radius)
	   Cuda_Pair_GenerateVRadius(&cuda->shared_data);
	 if(cuda->cu_omega_rmass)
	   Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
     force->pair->compute(eflag,vflag);
       CudaWrapper_Sync();
     #ifdef NO_PREC_TIMING
       startsec = 1.0*clock()/CLOCKS_PER_SEC;
     #endif
     clock_gettime(CLOCK_REALTIME,&starttime);
     for(int i=0;i<StyleLoops;i++)
     {
	   Cuda_Pair_GenerateXType(&cuda->shared_data);
	   if(cuda->cu_v_radius)
	     Cuda_Pair_GenerateVRadius(&cuda->shared_data);
	   if(cuda->cu_omega_rmass)
	     Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
       force->pair->compute(eflag,vflag);
       CudaWrapper_Sync();
     }
     clock_gettime(CLOCK_REALTIME,&endtime);

  	 double TpAtime=endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
     #ifdef NO_PREC_TIMING
       endsec = 1.0*clock()/CLOCKS_PER_SEC;
       TpAtime = endsec - startsec;
     #endif
     if(cuda->shared_data.me==0)
     printf("Test BpA\n");
     cuda->shared_data.pair.use_block_per_atom = 1;
     neighbor->build();
     Cuda_Pair_GenerateXType(&cuda->shared_data);
	 if(cuda->cu_v_radius)
	   Cuda_Pair_GenerateVRadius(&cuda->shared_data);
	 if(cuda->cu_omega_rmass)
	   Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
     force->pair->compute(eflag,vflag);
       CudaWrapper_Sync();

     clock_gettime(CLOCK_REALTIME,&starttime);
     #ifdef NO_PREC_TIMING
       startsec = 1.0*clock()/CLOCKS_PER_SEC;
     #endif
     for(int i=0;i<StyleLoops;i++)
     {
	   Cuda_Pair_GenerateXType(&cuda->shared_data);
	   if(cuda->cu_v_radius)
	     Cuda_Pair_GenerateVRadius(&cuda->shared_data);
	   if(cuda->cu_omega_rmass)
	     Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
       force->pair->compute(eflag,vflag);
       CudaWrapper_Sync();
     }
     clock_gettime(CLOCK_REALTIME,&endtime);
  	 double BpAtime=endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
      #ifdef NO_PREC_TIMING
       endsec = 1.0*clock()/CLOCKS_PER_SEC;
       BpAtime = endsec - startsec;
     #endif

	if(cuda->shared_data.me==0)
  	 printf("\n# CUDA: Timing of parallelisation layout with %i loops:\n",StyleLoops);
	if(cuda->shared_data.me==0)
  	 printf("# CUDA: BpA TpA\n %lf %lf\n",BpAtime,TpAtime);
  	 if(BpAtime>TpAtime) cuda->shared_data.pair.use_block_per_atom = 0;
   }
   else
   cuda->shared_data.pair.use_block_per_atom = cuda->shared_data.pair.override_block_per_atom;
   //cuda->shared_data.pair.use_block_per_atom = 0;
   if(atom->molecular||(force->kspace&&(not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = true;
   neighbor->build();
   neighbor->ncalls = 0;

   force_clear();

   cuda->cu_f->download();
   if(cuda->cu_torque)
   cuda->cu_torque->download();

  //printf("# Verlet::setup: g f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);

  MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute\n"); )

  test_atom(testatom,"pre pair force");

  if(cuda->shared_data.pair.cudable_force)
  {
	cuda->uploadAll();
	Cuda_Pair_GenerateXType(&cuda->shared_data);
	if(cuda->cu_v_radius)
	Cuda_Pair_GenerateVRadius(&cuda->shared_data);
	if(cuda->cu_omega_rmass)
	Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
  }

  if (force->pair) force->pair->compute(eflag,vflag);

  if(cuda->shared_data.pair.cudable_force)
  {
	if(cuda->shared_data.pair.collect_forces_later)
	{
	  if(eflag) cuda->cu_eng_vdwl->upload();
	  if(eflag) cuda->cu_eng_coul->upload();
	  if(vflag) cuda->cu_virial->upload();
      Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag);
	  if(eflag) cuda->cu_eng_vdwl->download();
	  if(eflag) cuda->cu_eng_coul->download();
	  if(vflag) cuda->cu_virial->download();
	}
	cuda->downloadAll();
  }

  test_atom(testatom,"post pair force");

  MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute done\n"); )
    //printf("# Verlet::setup: h f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);

  if (atom->molecular) {
    if (force->bond) force->bond->compute(eflag,vflag);
    if (force->angle) force->angle->compute(eflag,vflag);
    if (force->dihedral) force->dihedral->compute(eflag,vflag);
    if (force->improper) force->improper->compute(eflag,vflag);
  }


  if(cuda->shared_data.pppm.cudable_force)
  {
	cuda->cu_tag ->upload();
	cuda->cu_type->upload();
	cuda->cu_x   ->upload();
	cuda->cu_v   ->upload();
	cuda->cu_f   ->upload();
	if(cu_atom->q_flag) cuda->cu_q->upload();
  }
  if (force->kspace) {
    force->kspace->setup();
    force->kspace->compute(eflag,vflag);
  }
  if(cuda->shared_data.pppm.cudable_force)
  {
	cuda->cu_f   ->download();
  }

  test_atom(testatom,"post kspace");

  cuda->uploadAll();
  if (force->newton) comm->reverse_comm();
  cuda->downloadAll();

  test_atom(testatom,"post reverse comm");

 if(cuda->shared_data.me==0)
  printf("# CUDA: Total Device Memory useage post setup: %lf MB\n",1.0*CudaWrapper_CheckMemUseage()/1024/1024);

  MYDBG( printf("# CUDA: VerletCuda::setup: call modify setup\n"); )
  modify->setup(vflag);

  MYDBG( printf("# CUDA: VerletCuda::setup: call modify setup done\n"); )
  output->setup(1);

  test_atom(testatom,"post setup");

  MYDBG( printf("# CUDA: VerletCuda::setup: done\n"); )
  cuda->finished_setup = true;
  cuda->oncpu = false;
}


//this routine is in a messy state
void VerletCuda::setup_minimal(int flag)
{


	dotestatom=0;
	int testatom=104;
	cuda->oncpu = true;
	cuda->begin_setup = true;
	cuda->finished_run = false;
	MYDBG(printf("# CUDA VerletCuda::setup start\n"); )
	    strcpy(update->integrate_style,"verlet");
	    time_pair=0;
	    time_kspace=0;
	    time_comm=0;
	    time_modify=0;
		time_fulliterate=0;

	//cuda->allocate();

	cuda_shared_atom*   cu_atom   = & cuda->shared_data.atom;
	cuda_shared_domain* cu_domain = & cuda->shared_data.domain;
	cuda_shared_pair*   cu_pair   = & cuda->shared_data.pair;
	cu_atom->update_nlocal=1;
	cu_atom->update_nmax=1;

	if(atom->molecular) cuda->shared_data.pair.collect_forces_later = true;

	cuda->setDomainParams();


	if(cuda->shared_data.me==0)
	printf("# CUDA: VerletCuda::setup: Allocate memory on device for maximum of %i atoms...\n", atom->nmax);
	cuda->allocate();


  // setup domain, communication and neighboring
  // acquire ghosts
  // build neighbor lists

  if (flag) {
    if (triclinic) domain->x2lamda(atom->nlocal);
    domain->pbc();
    domain->reset_box();
    comm->setup();
    if (neighbor->style) neighbor->setup_bins();
    comm->exchange();
    comm->borders();
    if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
    cuda->setSystemParams();
    cuda->checkResize();
    neighbor->build();
    neighbor->ncalls = 0;
  }

	if(cuda->shared_data.me==0)
	printf("# CUDA: VerletCuda::setup: Upload data...\n");
	cuda->uploadAll();
    cuda->uploadAllNeighborLists();
 	if(atom->mass)
 	cuda->cu_mass->upload();

 	if(cuda->cu_map_array)
 	cuda->cu_map_array->upload();

  // compute all forces

  ev_set(update->ntimestep);
  if(elist_atom) cuda->shared_data.atom.need_eatom = 1;
  if(vlist_atom) cuda->shared_data.atom.need_vatom = 1;
  if(elist_atom||vlist_atom) cuda->checkResize();

  force_clear();
  cuda->cu_f->download();

  //printf("# Verlet::setup: g f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);

  cuda->cu_mass->upload();
  MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute\n"); )

  test_atom(testatom,"pre pair force");

  if(cuda->shared_data.pair.cudable_force)
  {
	cuda->uploadAll();
	Cuda_Pair_GenerateXType(&cuda->shared_data);
	if(cuda->cu_v_radius)
	Cuda_Pair_GenerateVRadius(&cuda->shared_data);
	if(cuda->cu_omega_rmass)
	Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
  }

  if (force->pair) force->pair->compute(eflag,vflag);

  if(cuda->shared_data.pair.cudable_force)
  {
	if(cuda->shared_data.pair.collect_forces_later)
	{
	  if(eflag) cuda->cu_eng_vdwl->upload();
	  if(eflag) cuda->cu_eng_coul->upload();
	  if(vflag) cuda->cu_virial->upload();
      Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag);
	  if(eflag) cuda->cu_eng_vdwl->download();
	  if(eflag) cuda->cu_eng_coul->download();
	  if(vflag) cuda->cu_virial->download();
	}
	cuda->downloadAll();
  }

  test_atom(testatom,"post pair force");

  MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute done\n"); )
    //printf("# Verlet::setup: h f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);

  if (atom->molecular) {
    if (force->bond) force->bond->compute(eflag,vflag);
    if (force->angle) force->angle->compute(eflag,vflag);
    if (force->dihedral) force->dihedral->compute(eflag,vflag);
    if (force->improper) force->improper->compute(eflag,vflag);
  }


  if(cuda->shared_data.pppm.cudable_force)
  {
	cuda->cu_tag ->upload();
	cuda->cu_type->upload();
	cuda->cu_x   ->upload();
	cuda->cu_v   ->upload();
	cuda->cu_f   ->upload();
	if(cu_atom->q_flag) cuda->cu_q->upload();
  }
  if (force->kspace) {
    force->kspace->setup();
    force->kspace->compute(eflag,vflag);
  }
  if(cuda->shared_data.pppm.cudable_force)
  {
	cuda->cu_f   ->download();
  }

  test_atom(testatom,"post kspace");

  cuda->uploadAll();
  if (force->newton) comm->reverse_comm();
  cuda->downloadAll();

  test_atom(testatom,"post reverse comm");

 if(cuda->shared_data.me==0)
  printf("# CUDA: Total Device Memory useage post setup: %lf MB\n",1.0*CudaWrapper_CheckMemUseage()/1024/1024);

  MYDBG( printf("# CUDA: VerletCuda::setup: call modify setup\n"); )
  modify->setup(vflag);

  MYDBG( printf("# CUDA: VerletCuda::setup: done\n"); )
  cuda->finished_setup=true;
  cuda->oncpu=false;
}

//#define TESTATOM
/* ----------------------------------------------------------------------
   iterate for n steps
------------------------------------------------------------------------- */

void VerletCuda::run(int n)
{
  dotestatom=cuda->dotestatom;
  int testatom=cuda->testatom;//48267;


  timespec starttime;
  timespec endtime;
  timespec starttotal;
  timespec endtotal;

  cuda->setTimingsZero();

  static double testtime=0.0;
//				clock_gettime(CLOCK_REALTIME,&starttime);
//  				clock_gettime(CLOCK_REALTIME,&endtime);
//				testtime+=endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
// 				printf("Time: %lf\n",testtime);*/


  cuda_shared_domain* cu_domain = & cuda->shared_data.domain;

  int nflag,ntimestep,sortflag;

  int n_initial_integrate = modify_cuda->n_initial_integrate;
  int n_post_integrate = modify_cuda->n_post_integrate;
  int n_final_integrate = modify_cuda->n_final_integrate;
  int n_pre_exchange = modify_cuda->n_pre_exchange;
  int n_pre_neighbor = modify_cuda->n_pre_neighbor;
  int n_pre_force = modify_cuda->n_pre_force;
  int n_post_force = modify_cuda->n_post_force;
  int n_end_of_step = modify_cuda->n_end_of_step;
  MYDBG(printf("# CUDA: Fixes: i_int: %i p_int: %i f_int: %i pr_exc: %i pr_neigh: %i pr_f: %i p_f: %i eos: %i\n",
	n_initial_integrate,n_post_integrate,n_final_integrate,n_pre_exchange,n_pre_neighbor,n_pre_force,n_post_force,n_end_of_step);)

  if (atom->sortfreq > 0) sortflag = 1;
  else sortflag = 0;


	if(cuda->shared_data.me==0)
	{
	  if((not cuda->shared_data.pair.cudable_force)&&(force->pair))
		  error->warning("# CUDA: You asked for a Verlet integration using Cuda, "
			             "but selected a pair force which has not yet been ported to Cuda");
	  if((not cuda->shared_data.pppm.cudable_force)&&(force->kspace))
		  error->warning("# CUDA: You asked for a Verlet integration using Cuda, "
						 "but selected a kspace force which has not yet been ported to Cuda");
      if(modify_cuda->n_post_integrate_host+modify_cuda->n_pre_exchange_host+modify_cuda->n_pre_neighbor_host+modify_cuda->n_pre_force_host+modify_cuda->n_post_force_host+modify_cuda->n_end_of_step_host+modify_cuda->n_initial_integrate_host+modify_cuda->n_final_integrate_host)
		  error->warning("# CUDA: You asked for a Verlet integration using Cuda, "
						 "but several fixes have not yet been ported to Cuda.\n"
						 "This can cause a severe speed penalty due to frequent data synchronization between host and GPU.");
	  if(atom->firstgroupname)
		  error->warning("Warning: firstgroupname is used, this will cause additional data transfers.");
	}
    cuda->uploadAll();

  if(cuda->neighbor_decide_by_integrator && cuda->cu_xhold)
  {
  	 const int n=cuda->shared_data.atom.maxhold;
 	 CudaWrapper_CopyData(cuda->cu_xhold->dev_data(),cuda->cu_x->dev_data(),n*sizeof(X_FLOAT));
 	 CudaWrapper_CopyData((void*) &((X_FLOAT*)cuda->cu_xhold->dev_data())[n],(void*) &((X_FLOAT*)cuda->cu_x->dev_data())[atom->nmax],n*sizeof(X_FLOAT));
 	 CudaWrapper_CopyData((void*) &((X_FLOAT*)cuda->cu_xhold->dev_data())[2*n],(void*) &((X_FLOAT*)cuda->cu_x->dev_data())[2*atom->nmax],n*sizeof(X_FLOAT));
  }

  cuda->shared_data.atom.reneigh_flag=0;
  cuda->shared_data.atom.update_nlocal=1;
  cuda->shared_data.atom.update_nmax=1;
  cuda->shared_data.domain.update=1;
  cuda->shared_data.buffer_new=1;
  cuda->uploadtime=0;
  cuda->downloadtime=0;
  int firstreneigh=1;

		for(int i = 0; i < n; i++)
		{
			ntimestep = ++update->ntimestep;
			ev_set(ntimestep);

			// initial time integration

			test_atom(testatom,"Pre initial");

			MYDBG( printf("# CUDA VerletCuda::iterate: before initial_integrate\n"); )

 			modify->initial_integrate(vflag);

			MYDBG( printf("# CUDA VerletCuda::iterate: after initial_integrate\n"); )

			if(n_post_integrate) modify->post_integrate();


			// regular communication vs neighbor list rebuild

			test_atom(testatom,"Pre Exchange");

			MYDBG( printf("# CUDA VerletCuda::iterate: before neighbor decide\n"); )
			nflag = neighbor->decide();
			if(nflag == 0)
			{
				MYDBG( printf("# CUDA VerletCuda::iterate: communicate\n"); )
				timer->stamp();

			    if((not (eflag||vflag))&&(cuda->shared_data.overlap_comm))
			    {
				  //overlap forward communication of ghost atom positions with inner force calculation (interactions between local atoms)
			      //build communication buffers
			//      printf("Pre forward_comm(1)\n");
			      clock_gettime(CLOCK_REALTIME,&starttotal);
	 			  cuda->shared_data.atom.reneigh_flag=0;
				  clock_gettime(CLOCK_REALTIME,&starttime);
 				  timer->stamp();
				  comm->forward_comm(1);
 				  timer->stamp(TIME_COMM);
				  clock_gettime(CLOCK_REALTIME,&endtime);
				  cuda->shared_data.cuda_timings.comm_forward_total+=
 						endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;

			      //prepare force calculation
			 //     printf("Pre force_clear\n");
			      force_clear();
			 //     printf("Pre Generate XType\n");
				  Cuda_Pair_GenerateXType(&cuda->shared_data);
				  if(cuda->cu_v_radius)
				    Cuda_Pair_GenerateVRadius(&cuda->shared_data);
				  if(cuda->cu_omega_rmass)
				    Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);

		          //start force calculation asynchronus
			      cuda->shared_data.comm.comm_phase=1;
			    //  printf("Pre Force Compute\n");
		          force->pair->compute(eflag, vflag);
			      timer->stamp(TIME_PAIR);
                  //CudaWrapper_Sync();

				  //download comm buffers from GPU, perform MPI communication and upload buffers again
				  clock_gettime(CLOCK_REALTIME,&starttime);
			   //   printf("Pre forward_comm(2)\n");
				  comm->forward_comm(2);
 				  clock_gettime(CLOCK_REALTIME,&endtime);
				  cuda->shared_data.cuda_timings.comm_forward_total+=
 						endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
 				  timer->stamp(TIME_COMM);

 				  //wait for force calculation
			      //printf("Pre Synch\n");
				  CudaWrapper_Sync();
				  timer->stamp(TIME_PAIR);

				  //unpack communication buffers
				  clock_gettime(CLOCK_REALTIME,&starttime);
			    //  printf("Pre forward_comm(3)\n");
				  comm->forward_comm(3);
				  clock_gettime(CLOCK_REALTIME,&endtime);
			  //    printf("Post forward_comm(3)\n");
				  cuda->shared_data.cuda_timings.comm_forward_total+=
 						endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;

 				  timer->stamp(TIME_COMM);
				  MYDBG( printf("# CUDA VerletCuda::iterate: communicate done\n"); )
				  cuda->shared_data.cuda_timings.test1+=
 						endtotal.tv_sec-starttotal.tv_sec+1.0*(endtotal.tv_nsec-starttotal.tv_nsec)/1000000000;
			    }
			    else
			    {
			  	  //perform standard forward communication
				//printf("Forward_comm\n");
				  clock_gettime(CLOCK_REALTIME,&starttime);
				  comm->forward_comm();
				  clock_gettime(CLOCK_REALTIME,&endtime);
				//printf("Forward_comm_done\n");
				  cuda->shared_data.cuda_timings.comm_forward_total+=
 					endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
 				  timer->stamp(TIME_COMM);
				  MYDBG( printf("# CUDA VerletCuda::iterate: communicate done\n"); )
			    }
			}
			else
			{
 				int nlocalold=cuda->shared_data.atom.nlocal;
 				//if(firstreneigh)
 				{
 				  cuda->shared_data.atom.update_nlocal=1;
  				  cuda->shared_data.atom.update_nmax=1;
 				  firstreneigh=0;
 				}
 				  cuda->shared_data.buffer_new=1;
				MYDBG( printf("# CUDA VerletCuda::iterate: neighbor\n"); )
 				cuda->setDomainParams();
				if(n_pre_exchange) modify->pre_exchange();
				if(atom->nlocal!=cuda->shared_data.atom.nlocal) //did someone add atoms during pre_exchange?
				{
					cuda->checkResize();
					cuda->uploadAll();
				}

				//check domain changes
				if(domain->triclinic) domain->x2lamda(atom->nlocal);
				MYDBG( printf("# CUDA VerletCuda::iterate: neighbor pbc\n"); )
				domain->pbc();
				if(domain->box_change)
				{
					domain->reset_box();
					comm->setup();
					if (neighbor->style) neighbor->setup_bins();

				}
				timer->stamp();
				MYDBG( printf("# CUDA VerletCuda::iterate: neighbor exchange\n"); )

				//perform exchange of local atoms
				clock_gettime(CLOCK_REALTIME,&starttime);
				comm->exchange();
				clock_gettime(CLOCK_REALTIME,&endtime);

				//special and nspecial fields of the atom data are not currently transfered via the GPU buffer might be changed in the future
				if(comm->nprocs>1)
				{
				  clock_gettime(CLOCK_REALTIME,&starttime);
				  if(atom->special)
					cuda->cu_special->upload();
				  if(atom->nspecial)
					cuda->cu_nspecial->upload();
				  clock_gettime(CLOCK_REALTIME,&endtime);
				  cuda->shared_data.cuda_timings.test1+=
 					  endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
				}

				cuda->shared_data.cuda_timings.comm_exchange_total+=
 					endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;

 				if(nlocalold!=cuda->shared_data.atom.nlocal) cuda->shared_data.atom.update_nlocal=2;

				//sort atoms
      			if (sortflag && ntimestep >= atom->nextsort) atom->sort();
				MYDBG( printf("# CUDA VerletCuda::iterate: neighbor borders\n"); )

				//generate ghost atom lists, and transfer ghost atom data
				clock_gettime(CLOCK_REALTIME,&starttime);
				comm->borders();
				clock_gettime(CLOCK_REALTIME,&endtime);
				cuda->shared_data.cuda_timings.comm_border_total+=
 					endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;

				clock_gettime(CLOCK_REALTIME,&starttime);
				//atom index maps are generated on CPU, and need to be transfered to GPU if they are used
				if(cuda->cu_map_array)
				  cuda->cu_map_array->upload();


				if(domain->triclinic) domain->lamda2x(atom->nlocal+atom->nghost);

				if(n_pre_neighbor) modify->pre_neighbor();

  				cuda->shared_data.buffer_new=2;
  				if(atom->molecular) cuda->cu_molecule->download();
				MYDBG( printf("# CUDA VerletCuda::iterate: neighbor build\n"); )
  			    timer->stamp(TIME_COMM);
				clock_gettime(CLOCK_REALTIME,&endtime);
				cuda->shared_data.cuda_timings.test2+=
 					endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;

			    //rebuild neighbor list
			    test_atom(testatom,"Pre Neighbor");
				neighbor->build();
				timer->stamp(TIME_NEIGHBOR);
				MYDBG( printf("# CUDA VerletCuda::iterate: neighbor done\n"); )

				//if bonded interactions are used (in this case collect_forces_later is true), transfer data which only changes upon exchange/border routines from GPU to CPU
				if(cuda->shared_data.pair.collect_forces_later)
				{
					if(cuda->cu_molecule) cuda->cu_molecule->download();
					cuda->cu_tag->download();
					cuda->cu_type->download();
					cuda->cu_mask->download();
		      		if(cuda->cu_q) cuda->cu_q->download();
				}
				cuda->shared_data.comm.comm_phase=3;
			}

			test_atom(testatom,"Post Exchange");

			// force computations

			//only do force_clear if it has not been done during overlap of communication with local interactions
			if(not((not (eflag||vflag))&&(cuda->shared_data.overlap_comm)&&(cuda->shared_data.comm.comm_phase<3)))
			force_clear();

			if(n_pre_force) modify->pre_force(vflag);

			timer->stamp();

  			//if overlap of bonded interactions with nonbonded interactions takes place, download forces and positions
 	/*	    if(cuda->shared_data.pair.collect_forces_later)
		    {
		      cuda->cu_x->downloadAsync(2);
		      cuda->cu_f->downloadAsync(2);
		    }*/

			if(force->pair)
			{
 				if((not (eflag||vflag))&&(cuda->shared_data.overlap_comm)&&(cuda->shared_data.comm.comm_phase<3)&&cuda->shared_data.pair.cudable_force)
				{
				  //second part of force calculations in case of overlaping it with commuincation. Only interactions between local and ghost atoms are done now
				  //regenerate data layout for force computations, its actually only needed for the ghost atoms
				  cuda->shared_data.comm.comm_phase=2;

      			  timespec atime1,atime2;
	  			  clock_gettime(CLOCK_REALTIME,&atime1);

 				  Cuda_Pair_GenerateXType(&cuda->shared_data);
				  if(cuda->cu_v_radius)
				    Cuda_Pair_GenerateVRadius(&cuda->shared_data);
				  if(cuda->cu_omega_rmass)
				    Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);

	  			  clock_gettime(CLOCK_REALTIME,&atime2);
	  			  cuda->shared_data.cuda_timings.pair_xtype_conversion+=
        			atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000;
				  force->pair->compute(eflag, vflag);

				}
				else
				{
				  //calculate complete pair interactions
 			      if(not cuda->shared_data.pair.cudable_force) cuda->downloadAll();
				  else
				  {
				    //regenerate data layout for force computations, its actually only needed for the ghost atoms
      			    timespec atime1,atime2;
	  			    clock_gettime(CLOCK_REALTIME,&atime1);

				    Cuda_Pair_GenerateXType(&cuda->shared_data);
				    if(cuda->cu_v_radius)
				      Cuda_Pair_GenerateVRadius(&cuda->shared_data);
				    if(cuda->cu_omega_rmass)
				      Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);

	  			    clock_gettime(CLOCK_REALTIME,&atime2);
	  			    cuda->shared_data.cuda_timings.pair_xtype_conversion+=
        				atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000;
				  }
				  cuda->shared_data.comm.comm_phase=0;
				  force->pair->compute(eflag, vflag);
				}

 			    if(not cuda->shared_data.pair.cudable_force) cuda->uploadAll();

 			    //wait for force calculation in case of not using overlap with bonded interactions
 			    if(not cuda->shared_data.pair.collect_forces_later)
 				  CudaWrapper_Sync();

				timer->stamp(TIME_PAIR);
			}

   			//calculate bonded interactions
		    if(atom->molecular)
			{
	          cuda->cu_x->downloadAsync(2);
		      if(n_pre_force==0) Verlet::force_clear();
		      else  cuda->cu_f->downloadAsync(2);

			  timer->stamp(TIME_PAIR);

			  test_atom(testatom,"pre bond force");
			  if(force->bond) force->bond->compute(eflag, vflag);
			  if(force->angle) force->angle->compute(eflag, vflag);
			  if(force->dihedral) force->dihedral->compute(eflag, vflag);
			  if(force->improper) force->improper->compute(eflag, vflag);
			  timer->stamp(TIME_BOND);
		    }

		   //collect forces in case pair force and bonded interactions were overlapped, and either no KSPACE or a GPU KSPACE style is used
	       	if(cuda->shared_data.pair.collect_forces_later&&cuda->shared_data.pair.cudable_force&&(not (force->kspace&&(not cuda->shared_data.pppm.cudable_force))))
	       	{
		      clock_gettime(CLOCK_REALTIME,&starttime);
		      cuda->cu_f->uploadAsync(2);

  		      test_atom(testatom,"post molecular force");


 	          if(eflag) cuda->cu_eng_vdwl->upload();
	          if(eflag) cuda->cu_eng_coul->upload();
	          if(vflag) cuda->cu_virial->upload();
              Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag);
	          if(eflag) cuda->cu_eng_vdwl->download();
	          if(eflag) cuda->cu_eng_coul->download();
	          if(vflag) cuda->cu_virial->download();
		      timer->stamp(TIME_PAIR);

			  clock_gettime(CLOCK_REALTIME,&endtime);
			  cuda->shared_data.cuda_timings.pair_force_collection+=
 					endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
	       	}

		    //compute kspace force
		   	if(force->kspace)
		   	{
			  if((not cuda->shared_data.pppm.cudable_force) && (not cuda->shared_data.pair.collect_forces_later))
				cuda->downloadAll();
			  if((not cuda->shared_data.pppm.cudable_force) && (cuda->shared_data.pair.collect_forces_later) && (not atom->molecular))
			  {
	            cuda->cu_x->downloadAsync(2);
		        if(n_pre_force==0) Verlet::force_clear();
		        else  cuda->cu_f->downloadAsync(2);

			    timer->stamp(TIME_PAIR);
			  }

			  force->kspace->compute(eflag,vflag);
			  if((not cuda->shared_data.pppm.cudable_force) && (not cuda->shared_data.pair.collect_forces_later))
				cuda->uploadAll();
			  timer->stamp(TIME_KSPACE);
		   	}

			//collect forces in case pair forces and kspace was overlaped
	   	    if(cuda->shared_data.pair.collect_forces_later&&cuda->shared_data.pair.cudable_force&&((force->kspace&&(not cuda->shared_data.pppm.cudable_force))))
	       	{
		      cuda->cu_f->uploadAsync(2);

			  clock_gettime(CLOCK_REALTIME,&starttime);

	          if(eflag) cuda->cu_eng_vdwl->upload();
	          if(eflag) cuda->cu_eng_coul->upload();
	      	  if(vflag) cuda->cu_virial->upload();
          	  Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag);
	      	  if(eflag) cuda->cu_eng_vdwl->download();
	      	  if(eflag) cuda->cu_eng_coul->download();
	      	  if(vflag) cuda->cu_virial->download();
		  	  timer->stamp(TIME_PAIR);

			  clock_gettime(CLOCK_REALTIME,&endtime);
  			  cuda->shared_data.cuda_timings.pair_force_collection+=
 					endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
	    	}

			//send forces on ghost atoms back to other GPU: THIS SHOULD NEVER HAPPEN
			if(force->newton)
			{
			  comm->reverse_comm();
			  timer->stamp(TIME_COMM);
			}
   		    test_atom(testatom,"post force");
			// force modifications, final time integration, diagnostics

			if(n_post_force) modify->post_force(vflag);

			test_atom(testatom,"pre final");

			modify->final_integrate();

			test_atom(testatom,"post final");

			if(n_end_of_step) modify->end_of_step();

			// all output

			test_atom(testatom,"pre output");

			if(ntimestep == output->next)
			{
			  if(not output->thermo->cudable)
			  cuda->downloadAll();
			  timer->stamp();
			  output->write(ntimestep);
			  timer->stamp(TIME_OUTPUT);
			}


			test_atom(testatom,"post output");

			if(cuda->shared_data.atom.update_nlocal>0)
			cuda->shared_data.atom.update_nlocal--;
  			if(cuda->shared_data.atom.update_nmax>0)
  			cuda->shared_data.atom.update_nmax--;
  			if(cuda->shared_data.domain.update>0)
  			cuda->shared_data.domain.update--;
  			if(cuda->shared_data.buffer_new>0)
  			cuda->shared_data.buffer_new--;
    		cuda->shared_data.atom.reneigh_flag=0;
		}


  		cuda->downloadAll();
 		cuda->downloadAllNeighborLists();
  		cuda->shared_data.atom.update_nlocal=1;
  		cuda->shared_data.atom.update_nmax=1;
  		cuda->shared_data.buffer_new=1;
  		cuda->shared_data.domain.update=1;
  		cuda->oncpu = true;
  		cuda->finished_run = true;
}


/* ----------------------------------------------------------------------
   clear force on own & ghost atoms
   setup and clear other arrays as needed
------------------------------------------------------------------------- */

void VerletCuda::force_clear()
{
  cuda->cu_f->memset_device(0);
  if(cuda->cu_torque) cuda->cu_torque->memset_device(0);
  return;

  //The rest should not be necessary
  int i;
  for(i=0;i<atom->nlocal;i++)
  {
  	atom->f[i][0]=0.0;
  	atom->f[i][1]=0.0;
  	atom->f[i][2]=0.0;
  }
  // clear force on all particles
  // if either newton flag is set, also include ghosts

  if (neighbor->includegroup == 0) {
    int nall;
    if (force->newton) nall = atom->nlocal + atom->nghost;
    else nall = atom->nlocal;
    if (torqueflag) {
      double **torque = atom->torque;
      for (i = 0; i < nall; i++) {
	torque[i][0] = 0.0;
	torque[i][1] = 0.0;
	torque[i][2] = 0.0;
      }
    }

  // neighbor includegroup flag is set
  // clear force only on initial nfirst particles
  // if either newton flag is set, also include ghosts

  } else {
    int nall = atom->nfirst;


    if (torqueflag) {
      double **torque = atom->torque;
      for (i = 0; i < nall; i++) {
	torque[i][0] = 0.0;
	torque[i][1] = 0.0;
	torque[i][2] = 0.0;
      }
    }

    if (force->newton) {
      nall = atom->nlocal + atom->nghost;

      if (torqueflag) {
	double **torque = atom->torque;
	for (i = atom->nlocal; i < nall; i++) {
	  torque[i][0] = 0.0;
	  torque[i][1] = 0.0;
	  torque[i][2] = 0.0;
	}
      }
    }
  }
}

void VerletCuda::test_atom(int aatom, char* string)  //printing properties of one atom for test purposes
{
  if(not dotestatom) return;
  bool check=false;
  if(cuda->finished_setup) cuda->downloadAll();
  for(int i=0;i<atom->nlocal+atom->nghost;i++)
  {
  if((atom->tag[i]==aatom)&&(i<atom->nlocal))
  	{

  	  printf("%i # CUDA %s: %i %i %e %e %e %i ",comm->me,string,update->ntimestep,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i);
  	  if(atom->molecular && (i<atom->nlocal))
	  {
	  	printf(" // %i %i %i ",atom->num_bond[i],atom->num_angle[i],atom->num_dihedral[i]);
	  	for(int k=0;k<atom->num_bond[i];k++)
	  	printf("// %i %i ",atom->bond_type[i][k],atom->bond_atom[i][k]);
	  }
	  printf("\n");
  	}
  if(i<atom->nlocal)
  {
  if((atom->v[i][0]<-100||atom->v[i][0]>100)||
  	 (atom->v[i][1]<-100||atom->v[i][1]>100)||
  	 (atom->v[i][2]<-100||atom->v[i][2]>100)||
  	 (atom->v[i][0]!=atom->v[i][0])||
  	 (atom->v[i][1]!=atom->v[i][1])||
  	 (atom->v[i][2]!=atom->v[i][2]))
  	{printf("%i # CUDA %s velocity: %i %e %e %e %i\n",comm->me,string,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i);  check=true;}
  if((atom->f[i][0]<-10000||atom->f[i][0]>10000)||
  	 (atom->f[i][1]<-10000||atom->f[i][1]>10000)||
  	 (atom->f[i][2]<-10000||atom->f[i][2]>10000)||
  	 (atom->f[i][0]!=atom->f[i][0])||
  	 (atom->f[i][1]!=atom->f[i][1])||
  	 (atom->f[i][2]!=atom->f[i][2]))
  	{printf("%i # CUDA %s force: %i %e %e %e %i\n",comm->me,string,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i);    check=true;}
  if(atom->tag[i]<=0)
  	printf("%i # CUDA %s tag: %i %e %e %e %i\n",comm->me,string,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i);
  }
  }
  if(check) exit(0);
}