/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#include <cstdlib>
#include <cstdio>
#include <cstring>
#include "verlet_cuda.h"
#include "neighbor.h"
#include "domain.h"
#include "comm.h"
#include "atom.h"
#include "atom_vec.h"
#include "force.h"
#include "pair.h"
#include "bond.h"
#include "angle.h"
#include "dihedral.h"
#include "improper.h"
#include "kspace.h"
#include "output.h"
#include "update.h"
#include "modify_cuda.h"
#include "compute.h"
#include "fix.h"
#include "timer.h"
#include "memory.h"
#include "error.h"
#include "cuda_wrapper_cu.h"
#include "thermo.h"
#include "cuda_pair_cu.h"
#include "cuda.h"
#include <ctime>
#include <cmath>
#ifdef _OPENMP
#include "omp.h"
#endif

using namespace LAMMPS_NS;

#define MAKETIMEING

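// The VerletCuda integrator requires the USER-CUDA runtime to be active
// ("-c on" on the LAMMPS command line). If the OpenMP package fix
// ("package_omp") is also present, external_force_clear is set so that
// force clearing is left to that package.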
VerletCuda::VerletCuda(LAMMPS* lmp, int narg, char** arg) : Verlet(lmp, narg, arg)
{
  cuda = lmp->cuda;

  if(cuda == NULL)
    error->all(FLERR, "You cannot use a /cuda class without activating 'cuda' acceleration. Provide '-c on' as a command-line argument to LAMMPS.");

  modify_cuda = (ModifyCuda*) modify;
  int ifix = modify->find_fix("package_omp");

  if(ifix >= 0) external_force_clear = 1;
}

/* ----------------------------------------------------------------------
   setup before run
------------------------------------------------------------------------- */

void VerletCuda::setup()
{
  //debug related variables
  cuda->debugdata[0] = 0;
  cuda->cu_debugdata->upload();
  dotestatom = cuda->dotestatom;
  int testatom = cuda->testatom; //48267;

  if(atom->nlocal == 0)
    error->warning(FLERR, "# CUDA: There are currently no atoms on one of the MPI processes. This is known to cause errors with the USER-CUDA package. Please use the 'processors' keyword to enforce a more balanced processor layout.");

  MYDBG(printf("# CUDA VerletCuda::setup start\n");)

  cuda->oncpu = true;
  cuda->begin_setup = true;
  cuda->finished_setup = false;
  cuda->finished_run = false;

  time_pair = 0;
  time_kspace = 0;
  time_comm = 0;
  time_modify = 0;
  time_fulliterate = 0;

  atom->setup();

  cuda_shared_atom* cu_atom = & cuda->shared_data.atom;
  cuda_shared_domain* cu_domain = & cuda->shared_data.domain;
  cuda_shared_pair* cu_pair = & cuda->shared_data.pair;
  cu_atom->update_nlocal = 1;
  cu_atom->update_nmax = 1;

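  // If bonded interactions are present, or KSpace is handled by a non-GPU
  // style, pair forces stay in the device-side per-thread buffers and are
  // summed later (Cuda_Pair_CollectForces), so that host-computed
  // contributions can be merged in before the final collection.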
  if(atom->molecular || (force->kspace && (not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = true;

  cuda->setDomainParams();

  if(cuda->shared_data.me == 0)
    printf("# CUDA: Using precision: Global: %u X: %u V: %u F: %u PPPM: %u \n", CUDA_PRECISION == 1 ? 4 : 8, (int) sizeof(X_FLOAT), (int) sizeof(V_FLOAT), (int) sizeof(F_FLOAT), (int) sizeof(PPPM_FLOAT));

  cuda->allocate();

  if(comm->me == 0 && screen) fprintf(screen, "Setting up run ...\n");

  // setup domain, communication and neighboring
  // acquire ghosts
  // build neighbor lists

  modify->setup_pre_exchange();

  if(triclinic) domain->x2lamda(atom->nlocal);

  domain->pbc();
  domain->reset_box();
  comm->setup();

  if(neighbor->style) neighbor->setup_bins();

  comm->exchange();

  if(atom->sortfreq > 0) atom->sort();

  comm->borders();

  if(triclinic) domain->lamda2x(atom->nlocal + atom->nghost);

  cuda->setSystemParams();
  cuda->checkResize();

  if(cuda->shared_data.me == 0)
    printf("# CUDA: VerletCuda::setup: Upload data...\n");

  cuda->uploadAll();
  neighbor->build();
  neighbor->ncalls = 0;

  if(atom->mass)
    cuda->cu_mass->upload();

  if(cuda->cu_map_array)
    cuda->cu_map_array->upload();

  // compute all forces

  ev_set(update->ntimestep);

  if(elist_atom) cuda->shared_data.atom.need_eatom = 1;

  if(vlist_atom) cuda->shared_data.atom.need_vatom = 1;

  if(elist_atom || vlist_atom) cuda->checkResize();

  int test_BpA_vs_TpA = true;
  my_times starttime;
  my_times endtime;
#ifdef NO_PREC_TIMING
  double startsec, endsec;
#endif

  //if(atom->molecular||(force->kspace&&(not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = false;

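  // Autotune the pair-kernel parallelization layout unless the user forced a
  // choice via override_block_per_atom: time StyleLoops pair-force evaluations
  // first with one thread per atom (TpA), then with one thread block per atom
  // (BpA), and keep whichever layout was faster. Elapsed time is measured via
  // my_gettime(CLOCK_REALTIME, ...) as tv_sec + tv_nsec/1e9; if NO_PREC_TIMING
  // is defined, a coarser clock()-based measurement replaces it.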
  if(test_BpA_vs_TpA && cuda->shared_data.pair.cudable_force && force->pair && (cuda->shared_data.pair.override_block_per_atom < 0)) {
    int StyleLoops = 10;

    if(cuda->shared_data.me == 0)
      printf("Test TpA\n");

    cuda->shared_data.pair.use_block_per_atom = 0;
    neighbor->build();
    Cuda_Pair_GenerateXType(&cuda->shared_data);

    if(cuda->cu_v_radius)
      Cuda_Pair_GenerateVRadius(&cuda->shared_data);

    if(cuda->cu_omega_rmass)
      Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);

    force->pair->compute(eflag, vflag);
    CudaWrapper_Sync();
#ifdef NO_PREC_TIMING
    startsec = 1.0 * clock() / CLOCKS_PER_SEC;
#endif
    my_gettime(CLOCK_REALTIME, &starttime);

    for(int i = 0; i < StyleLoops; i++) {
      Cuda_Pair_GenerateXType(&cuda->shared_data);

      if(cuda->cu_v_radius)
        Cuda_Pair_GenerateVRadius(&cuda->shared_data);

      if(cuda->cu_omega_rmass)
        Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);

      force->pair->compute(eflag, vflag);
      CudaWrapper_Sync();
    }

    my_gettime(CLOCK_REALTIME, &endtime);

    double TpAtime = endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
#ifdef NO_PREC_TIMING
    endsec = 1.0 * clock() / CLOCKS_PER_SEC;
    TpAtime = endsec - startsec;
#endif

    if(cuda->shared_data.me == 0)
      printf("Test BpA\n");

    cuda->shared_data.pair.use_block_per_atom = 1;
    neighbor->build();
    Cuda_Pair_GenerateXType(&cuda->shared_data);

    if(cuda->cu_v_radius)
      Cuda_Pair_GenerateVRadius(&cuda->shared_data);

    if(cuda->cu_omega_rmass)
      Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);

    force->pair->compute(eflag, vflag);
    CudaWrapper_Sync();

    my_gettime(CLOCK_REALTIME, &starttime);
#ifdef NO_PREC_TIMING
    startsec = 1.0 * clock() / CLOCKS_PER_SEC;
#endif

    for(int i = 0; i < StyleLoops; i++) {
      Cuda_Pair_GenerateXType(&cuda->shared_data);

      if(cuda->cu_v_radius)
        Cuda_Pair_GenerateVRadius(&cuda->shared_data);

      if(cuda->cu_omega_rmass)
        Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);

      force->pair->compute(eflag, vflag);
      CudaWrapper_Sync();
    }

    my_gettime(CLOCK_REALTIME, &endtime);
    double BpAtime = endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
#ifdef NO_PREC_TIMING
    endsec = 1.0 * clock() / CLOCKS_PER_SEC;
    BpAtime = endsec - startsec;
#endif

    if(cuda->shared_data.me == 0)
      printf("\n# CUDA: Timing of parallelisation layout with %i loops:\n", StyleLoops);

    if(cuda->shared_data.me == 0)
      printf("# CUDA: BpA TpA\n %lf %lf\n", BpAtime, TpAtime);

    if(BpAtime > TpAtime) cuda->shared_data.pair.use_block_per_atom = 0;
  } else
    cuda->shared_data.pair.use_block_per_atom = cuda->shared_data.pair.override_block_per_atom;

  //cuda->shared_data.pair.use_block_per_atom = 0;
  if(atom->molecular || (force->kspace && (not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = true;

  neighbor->build();
  neighbor->ncalls = 0;

  force_clear();

  modify->setup_pre_force(vflag);

  cuda->cu_f->download();

  if(cuda->cu_torque)
    cuda->cu_torque->download();

  //printf("# Verlet::setup: g f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);

  MYDBG(printf("# CUDA: VerletCuda::setup: initial force compute\n");)

  //test_atom(testatom,"pre pair force");

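  // Initial pair force evaluation. For a GPU pair style, positions/types
  // (and, where present, velocities/radii and omega/rmass) are first packed
  // into the device-side layouts the pair kernels expect; with
  // collect_forces_later the per-thread force buffers are reduced afterwards
  // by Cuda_Pair_CollectForces, together with the energy/virial accumulators.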
  if(cuda->shared_data.pair.cudable_force) {
    cuda->uploadAll();
    Cuda_Pair_GenerateXType(&cuda->shared_data);

    if(cuda->cu_v_radius)
      Cuda_Pair_GenerateVRadius(&cuda->shared_data);

    if(cuda->cu_omega_rmass)
      Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
  }

  if(force->pair) force->pair->compute(eflag, vflag);

  if(cuda->shared_data.pair.cudable_force) {
    if(cuda->shared_data.pair.collect_forces_later) {
      if(eflag) cuda->cu_eng_vdwl->upload();

      if(eflag) cuda->cu_eng_coul->upload();

      if(vflag) cuda->cu_virial->upload();

      Cuda_Pair_CollectForces(&cuda->shared_data, eflag, vflag);

      if(eflag) cuda->cu_eng_vdwl->download();

      if(eflag) cuda->cu_eng_coul->download();

      if(vflag) cuda->cu_virial->download();
    }

    cuda->downloadAll();
  }

  test_atom(testatom, "post pair force");

  MYDBG(printf("# CUDA: VerletCuda::setup: initial force compute done\n");)
  //printf("# Verlet::setup: h f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);

  if(atom->molecular) {
    if(force->bond) force->bond->compute(eflag, vflag);

    if(force->angle) force->angle->compute(eflag, vflag);

    if(force->dihedral) force->dihedral->compute(eflag, vflag);

    if(force->improper) force->improper->compute(eflag, vflag);
  }

  if(cuda->shared_data.pppm.cudable_force) {
    cuda->cu_tag ->upload();
    cuda->cu_type->upload();
    cuda->cu_x   ->upload();
    cuda->cu_v   ->upload();
    cuda->cu_f   ->upload();

    if(cu_atom->q_flag) cuda->cu_q->upload();
  }

  if(force->kspace) {
    force->kspace->setup();
    force->kspace->compute(eflag, vflag);
  }

  if(cuda->shared_data.pppm.cudable_force) {
    cuda->cu_f   ->download();
  }

  test_atom(testatom, "post kspace");

  cuda->uploadAll();

  if(force->newton) comm->reverse_comm();

  cuda->downloadAll();

  test_atom(testatom, "post reverse comm");

  if(cuda->shared_data.me == 0)
    printf("# CUDA: Total Device Memory usage post setup: %lf MB\n", 1.0 * CudaWrapper_CheckMemUsage() / 1024 / 1024);

  MYDBG(printf("# CUDA: VerletCuda::setup: call modify setup\n");)
  modify->setup(vflag);

  MYDBG(printf("# CUDA: VerletCuda::setup: call modify setup done\n");)
  output->setup(1);

  test_atom(testatom, "post setup");

  MYDBG(printf("# CUDA: VerletCuda::setup: done\n");)
  cuda->finished_setup = true;
  cuda->oncpu = false;
}

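// Reduced setup variant: allocates and uploads device data and recomputes the
// forces, but only performs the full domain/communication/neighbor
// re-initialization when flag is set (presumably for runs that skip the full
// setup).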
//this routine is in a messy state
void VerletCuda::setup_minimal(int flag)
{
  printf("SetupMinimal\n");
  dotestatom = 0;
  int testatom = 104;
  cuda->oncpu = true;
  cuda->begin_setup = true;
  cuda->finished_run = false;
  MYDBG(printf("# CUDA VerletCuda::setup start\n");)
  time_pair = 0;
  time_kspace = 0;
  time_comm = 0;
  time_modify = 0;
  time_fulliterate = 0;

  //cuda->allocate();

  cuda_shared_atom* cu_atom = & cuda->shared_data.atom;
  cuda_shared_domain* cu_domain = & cuda->shared_data.domain;
  cuda_shared_pair* cu_pair = & cuda->shared_data.pair;
  cu_atom->update_nlocal = 1;
  cu_atom->update_nmax = 1;

  if(atom->molecular) cuda->shared_data.pair.collect_forces_later = true;

  cuda->setDomainParams();

  if(cuda->shared_data.me == 0)
    printf("# CUDA: VerletCuda::setup: Allocate memory on device for maximum of %i atoms...\n", atom->nmax);

  cuda->allocate();

  // setup domain, communication and neighboring
  // acquire ghosts
  // build neighbor lists

  if(flag) {
    if(triclinic) domain->x2lamda(atom->nlocal);

    domain->pbc();
    domain->reset_box();
    comm->setup();

    if(neighbor->style) neighbor->setup_bins();

    comm->exchange();
    comm->borders();

    if(triclinic) domain->lamda2x(atom->nlocal + atom->nghost);

    cuda->setSystemParams();
    cuda->checkResize();
    neighbor->build();
    neighbor->ncalls = 0;
  }

  if(cuda->shared_data.me == 0)
    printf("# CUDA: VerletCuda::setup: Upload data...\n");

  cuda->uploadAll();
  cuda->uploadAllNeighborLists();

  if(atom->mass)
    cuda->cu_mass->upload();

  if(cuda->cu_map_array)
    cuda->cu_map_array->upload();

  // compute all forces

  ev_set(update->ntimestep);

  if(elist_atom) cuda->shared_data.atom.need_eatom = 1;

  if(vlist_atom) cuda->shared_data.atom.need_vatom = 1;

  if(elist_atom || vlist_atom) cuda->checkResize();

  force_clear();
  cuda->cu_f->download();

  //printf("# Verlet::setup: g f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);

  cuda->cu_mass->upload();
  MYDBG(printf("# CUDA: VerletCuda::setup: initial force compute\n");)

  test_atom(testatom, "pre pair force");

  if(cuda->shared_data.pair.cudable_force) {
    cuda->uploadAll();
    Cuda_Pair_GenerateXType(&cuda->shared_data);

    if(cuda->cu_v_radius)
      Cuda_Pair_GenerateVRadius(&cuda->shared_data);

    if(cuda->cu_omega_rmass)
      Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
  }

  if(force->pair) force->pair->compute(eflag, vflag);

  if(cuda->shared_data.pair.cudable_force) {
    if(cuda->shared_data.pair.collect_forces_later) {
      if(eflag) cuda->cu_eng_vdwl->upload();

      if(eflag) cuda->cu_eng_coul->upload();

      if(vflag) cuda->cu_virial->upload();

      Cuda_Pair_CollectForces(&cuda->shared_data, eflag, vflag);

      if(eflag) cuda->cu_eng_vdwl->download();

      if(eflag) cuda->cu_eng_coul->download();

      if(vflag) cuda->cu_virial->download();
    }

    cuda->downloadAll();
  }

  test_atom(testatom, "post pair force");

  MYDBG(printf("# CUDA: VerletCuda::setup: initial force compute done\n");)
  //printf("# Verlet::setup: h f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);

  if(atom->molecular) {
    if(force->bond) force->bond->compute(eflag, vflag);

    if(force->angle) force->angle->compute(eflag, vflag);

    if(force->dihedral) force->dihedral->compute(eflag, vflag);

    if(force->improper) force->improper->compute(eflag, vflag);
  }

  if(cuda->shared_data.pppm.cudable_force) {
    cuda->cu_tag ->upload();
    cuda->cu_type->upload();
    cuda->cu_x   ->upload();
    cuda->cu_v   ->upload();
    cuda->cu_f   ->upload();

    if(cu_atom->q_flag) cuda->cu_q->upload();
  }

  if(force->kspace) {
    force->kspace->setup();
    force->kspace->compute(eflag, vflag);
  }

  if(cuda->shared_data.pppm.cudable_force) {
    cuda->cu_f   ->download();
  }

  test_atom(testatom, "post kspace");

  cuda->uploadAll();

  if(force->newton) comm->reverse_comm();

  cuda->downloadAll();

  test_atom(testatom, "post reverse comm");

  if(cuda->shared_data.me == 0)
    printf("# CUDA: Total Device Memory usage post setup: %lf MB\n", 1.0 * CudaWrapper_CheckMemUsage() / 1024 / 1024);

  MYDBG(printf("# CUDA: VerletCuda::setup: call modify setup\n");)
  modify->setup(vflag);

  MYDBG(printf("# CUDA: VerletCuda::setup: done\n");)
  cuda->finished_setup = true;
  cuda->oncpu = false;
}

//#define TESTATOM
/* ----------------------------------------------------------------------
   iterate for n steps
------------------------------------------------------------------------- */

void VerletCuda::run(int n)
{
  dotestatom = cuda->dotestatom;
  int testatom = cuda->testatom; //48267;

  my_times starttime;
  my_times endtime;
  my_times starttotal;
  my_times endtotal;

  cuda->setTimingsZero();

  static double testtime = 0.0;
  //  my_gettime(CLOCK_REALTIME,&starttime);
  //  my_gettime(CLOCK_REALTIME,&endtime);
  //  testtime+=endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
  //  printf("Time: %lf\n",testtime);

  cuda_shared_domain* cu_domain = & cuda->shared_data.domain;

  int nflag, ntimestep, sortflag;

  int n_initial_integrate = modify_cuda->n_initial_integrate;
  int n_post_integrate = modify_cuda->n_post_integrate;
  int n_final_integrate = modify_cuda->n_final_integrate;
  int n_pre_exchange = modify_cuda->n_pre_exchange;
  int n_pre_neighbor = modify_cuda->n_pre_neighbor;
  int n_pre_force = modify_cuda->n_pre_force;
  int n_post_force = modify_cuda->n_post_force;
  int n_end_of_step = modify_cuda->n_end_of_step;
  MYDBG(printf("# CUDA: Fixes: i_int: %i p_int: %i f_int: %i pr_exc: %i pr_neigh: %i pr_f: %i p_f: %i eos: %i\n",
               n_initial_integrate, n_post_integrate, n_final_integrate, n_pre_exchange, n_pre_neighbor, n_pre_force, n_post_force, n_end_of_step);)

  if(atom->sortfreq > 0) sortflag = 1;
  else sortflag = 0;

  if(cuda->shared_data.me == 0) {
    if((not cuda->shared_data.pair.cudable_force) && (force->pair))
      error->warning(FLERR, "# CUDA: You asked for a Verlet integration using Cuda, "
                     "but selected a pair force which has not yet been ported to Cuda");

    if((not cuda->shared_data.pppm.cudable_force) && (force->kspace))
      error->warning(FLERR, "# CUDA: You asked for a Verlet integration using Cuda, "
                     "but selected a kspace force which has not yet been ported to Cuda");

    if(modify_cuda->n_post_integrate_host + modify_cuda->n_pre_exchange_host + modify_cuda->n_pre_neighbor_host + modify_cuda->n_pre_force_host + modify_cuda->n_post_force_host + modify_cuda->n_end_of_step_host + modify_cuda->n_initial_integrate_host + modify_cuda->n_final_integrate_host)
      error->warning(FLERR, "# CUDA: You asked for a Verlet integration using Cuda, "
                     "but several fixes have not yet been ported to Cuda.\n"
                     "This can cause a severe speed penalty due to frequent data synchronization between host and GPU.");

    if(atom->firstgroupname)
      error->warning(FLERR, "firstgroupname is used; this will cause additional data transfers.");
  }

  cuda->uploadAll();

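  // If neighboring is decided by the integrator, keep a device-side copy of
  // the current positions in cu_xhold (one coordinate block per dimension,
  // strided by atom->nmax in cu_x and by maxhold in cu_xhold), presumably as
  // the reference positions for the neighbor skin-distance check.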
  if(cuda->neighbor_decide_by_integrator && cuda->cu_xhold) {
    const int n = cuda->shared_data.atom.maxhold;
    CudaWrapper_CopyData(cuda->cu_xhold->dev_data(), cuda->cu_x->dev_data(), n * sizeof(X_FLOAT));
    CudaWrapper_CopyData((void*) & ((X_FLOAT*)cuda->cu_xhold->dev_data())[n], (void*) & ((X_FLOAT*)cuda->cu_x->dev_data())[atom->nmax], n * sizeof(X_FLOAT));
    CudaWrapper_CopyData((void*) & ((X_FLOAT*)cuda->cu_xhold->dev_data())[2 * n], (void*) & ((X_FLOAT*)cuda->cu_x->dev_data())[2 * atom->nmax], n * sizeof(X_FLOAT));
  }

  cuda->shared_data.atom.reneigh_flag = 0;
  cuda->shared_data.atom.update_nlocal = 1;
  cuda->shared_data.atom.update_nmax = 1;
  cuda->shared_data.atom.update_neigh = 1;
  cuda->shared_data.domain.update = 1;
  cuda->shared_data.buffer_new = 1;
  cuda->uploadtime = 0;
  cuda->downloadtime = 0;
  int firstreneigh = 1;

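  // Main time-integration loop: initial integrate, communicate or re-neighbor,
  // compute forces (pair on the GPU, bonded/KSpace on the host where needed),
  // final integrate, and output.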
  for(int i = 0; i < n; i++) {
    if(atom->nlocal == 0)
      error->warning(FLERR, "# CUDA: There are currently no atoms on one of the MPI processes. This is known to cause errors with the USER-CUDA package. Please use the 'processors' keyword to enforce a more balanced processor layout.");

    ntimestep = ++update->ntimestep;
    ev_set(ntimestep);

    // initial time integration

    test_atom(testatom, "Pre initial");

    MYDBG(printf("# CUDA VerletCuda::iterate: before initial_integrate\n");)

    modify->initial_integrate(vflag);

    MYDBG(printf("# CUDA VerletCuda::iterate: after initial_integrate\n");)

    if(n_post_integrate) modify->post_integrate();

    // regular communication vs neighbor list rebuild

    test_atom(testatom, "Pre Exchange");

    MYDBG(printf("# CUDA VerletCuda::iterate: before neighbor decide\n");)
    nflag = neighbor->decide();

    if(nflag == 0) {
      MYDBG(printf("# CUDA VerletCuda::iterate: communicate\n");)
      timer->stamp();

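      // When no energy/virial is requested and overlap_comm is enabled, the
      // forward communication of ghost positions is split into three phases
      // (pack, MPI exchange, unpack) so the MPI transfer can run while the
      // GPU computes the purely local pair interactions; the local-ghost
      // contributions are finished later in the force section.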
      if((not(eflag || vflag)) && (cuda->shared_data.overlap_comm)) {
        //overlap forward communication of ghost atom positions with inner force calculation (interactions between local atoms)
        //build communication buffers
        // printf("Pre forward_comm(1)\n");
        my_gettime(CLOCK_REALTIME, &starttotal);
        cuda->shared_data.atom.reneigh_flag = 0;
        my_gettime(CLOCK_REALTIME, &starttime);
        timer->stamp();
        comm->forward_comm(1);
        timer->stamp(TIME_COMM);
        my_gettime(CLOCK_REALTIME, &endtime);
        cuda->shared_data.cuda_timings.comm_forward_total +=
          endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;

        //prepare force calculation
        // printf("Pre force_clear\n");
        force_clear();
        // printf("Pre Generate XType\n");
        Cuda_Pair_GenerateXType(&cuda->shared_data);

        if(cuda->cu_v_radius)
          Cuda_Pair_GenerateVRadius(&cuda->shared_data);

        if(cuda->cu_omega_rmass)
          Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);

        //start force calculation asynchronously
        cuda->shared_data.comm.comm_phase = 1;
        force->pair->compute(eflag, vflag);
        timer->stamp(TIME_PAIR);
        //CudaWrapper_Sync();

        //download comm buffers from GPU, perform MPI communication and upload buffers again
        my_gettime(CLOCK_REALTIME, &starttime);
        comm->forward_comm(2);
        my_gettime(CLOCK_REALTIME, &endtime);
        cuda->shared_data.cuda_timings.comm_forward_total +=
          endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
        timer->stamp(TIME_COMM);

        //wait for force calculation
        CudaWrapper_Sync();
        timer->stamp(TIME_PAIR);

        //unpack communication buffers
        my_gettime(CLOCK_REALTIME, &starttime);
        comm->forward_comm(3);
        my_gettime(CLOCK_REALTIME, &endtime);
        cuda->shared_data.cuda_timings.comm_forward_total +=
          endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;

        timer->stamp(TIME_COMM);
        MYDBG(printf("# CUDA VerletCuda::iterate: communicate done\n");)
        my_gettime(CLOCK_REALTIME, &endtotal);
        cuda->shared_data.cuda_timings.test1 +=
          endtotal.tv_sec - starttotal.tv_sec + 1.0 * (endtotal.tv_nsec - starttotal.tv_nsec) / 1000000000;
      } else {
        //perform standard forward communication
        my_gettime(CLOCK_REALTIME, &starttime);
        comm->forward_comm();
        my_gettime(CLOCK_REALTIME, &endtime);
        cuda->shared_data.cuda_timings.comm_forward_total +=
          endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
        timer->stamp(TIME_COMM);
        MYDBG(printf("# CUDA VerletCuda::iterate: communicate done\n");)
      }
    } else {
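      // Re-neighboring path: exchange atoms that left the subdomain, rebuild
      // ghost lists via comm->borders(), optionally sort atoms, and rebuild
      // the neighbor lists; data that only changes on exchange/border steps
      // (tags, types, masks, molecule IDs, charges) is mirrored back to the
      // host when bonded/KSpace work is done on the CPU.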
      int nlocalold = cuda->shared_data.atom.nlocal;

      if(firstreneigh) {
        cuda->shared_data.atom.update_nlocal = 1;
        cuda->shared_data.atom.update_nmax = 1;
        firstreneigh = 0;
      }

      cuda->shared_data.buffer_new = 1;
      MYDBG(printf("# CUDA VerletCuda::iterate: neighbor\n");)
      cuda->setDomainParams();

      if(n_pre_exchange) modify->pre_exchange();

      if(atom->nlocal != cuda->shared_data.atom.nlocal) { //did someone add atoms during pre_exchange?
        cuda->checkResize();
        cuda->uploadAll();
      }

      //check domain changes
      if(domain->triclinic) domain->x2lamda(atom->nlocal);

      MYDBG(printf("# CUDA VerletCuda::iterate: neighbor pbc\n");)
      domain->pbc();

      if(domain->box_change) {
        domain->reset_box();
        comm->setup();

        if(neighbor->style) neighbor->setup_bins();
      }

      timer->stamp();
      MYDBG(printf("# CUDA VerletCuda::iterate: neighbor exchange\n");)

      //perform exchange of local atoms
      my_gettime(CLOCK_REALTIME, &starttime);
      comm->exchange();
      my_gettime(CLOCK_REALTIME, &endtime);
      cuda->shared_data.cuda_timings.comm_exchange_total +=
        endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;

      //the special and nspecial fields of the atom data are not currently transferred via the GPU buffer; this might change in the future
      if(comm->nprocs > 1) {
        my_gettime(CLOCK_REALTIME, &starttime);

        if(atom->special)
          cuda->cu_special->upload();

        if(atom->nspecial)
          cuda->cu_nspecial->upload();

        my_gettime(CLOCK_REALTIME, &endtime);
        cuda->shared_data.cuda_timings.test1 +=
          endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
      }

      if(nlocalold != cuda->shared_data.atom.nlocal) cuda->shared_data.atom.update_nlocal = 2;

      //sort atoms
      if(sortflag && ntimestep >= atom->nextsort) atom->sort();

      MYDBG(printf("# CUDA VerletCuda::iterate: neighbor borders\n");)

      //generate ghost atom lists, and transfer ghost atom data
      my_gettime(CLOCK_REALTIME, &starttime);
      comm->borders();
      my_gettime(CLOCK_REALTIME, &endtime);
      cuda->shared_data.cuda_timings.comm_border_total +=
        endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;

      my_gettime(CLOCK_REALTIME, &starttime);

      //atom index maps are generated on the CPU and need to be transferred to the GPU if they are used
      if(cuda->cu_map_array)
        cuda->cu_map_array->upload();

      if(domain->triclinic) domain->lamda2x(atom->nlocal + atom->nghost);

      if(n_pre_neighbor) modify->pre_neighbor();

      cuda->shared_data.buffer_new = 2;

      MYDBG(printf("# CUDA VerletCuda::iterate: neighbor build\n");)
      timer->stamp(TIME_COMM);
      my_gettime(CLOCK_REALTIME, &endtime);
      cuda->shared_data.cuda_timings.test2 +=
        endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;

      //rebuild neighbor list
      test_atom(testatom, "Pre Neighbor");
      neighbor->build(0);
      timer->stamp(TIME_NEIGHBOR);
      MYDBG(printf("# CUDA VerletCuda::iterate: neighbor done\n");)

      //if bonded interactions are used (collect_forces_later is then true), transfer data which only changes in the exchange/border routines from the GPU to the CPU
      if(cuda->shared_data.pair.collect_forces_later) {
        if(cuda->cu_molecule) cuda->cu_molecule->downloadAsync(2);

        cuda->cu_tag->downloadAsync(2);
        cuda->cu_type->downloadAsync(2);
        cuda->cu_mask->downloadAsync(2);

        if(cuda->cu_q) cuda->cu_q->downloadAsync(2);
      }

      cuda->shared_data.comm.comm_phase = 3;
    }

    test_atom(testatom, "Post Exchange");

    // force computations

    //only do force_clear if it has not been done during overlap of communication with local interactions
    if(not((not(eflag || vflag)) && (cuda->shared_data.overlap_comm) && (cuda->shared_data.comm.comm_phase < 3)))
      force_clear();

    if(n_pre_force) modify->pre_force(vflag);

    timer->stamp();

    //if overlap of bonded interactions with nonbonded interactions takes place, download forces and positions
    /*  if(cuda->shared_data.pair.collect_forces_later)
        {
          cuda->cu_x->downloadAsync(2);
          cuda->cu_f->downloadAsync(2);
        }*/

    if(force->pair) {
      if((not(eflag || vflag)) && (cuda->shared_data.overlap_comm) && (cuda->shared_data.comm.comm_phase < 3) && cuda->shared_data.pair.cudable_force) {
        //second part of the force calculation when it is overlapped with communication: only interactions between local and ghost atoms are computed now
        //regenerate the data layout for the force computation; it is actually only needed for the ghost atoms
        cuda->shared_data.comm.comm_phase = 2;

        my_times atime1, atime2;
        my_gettime(CLOCK_REALTIME, &atime1);

        Cuda_Pair_GenerateXType(&cuda->shared_data);

        if(cuda->cu_v_radius)
          Cuda_Pair_GenerateVRadius(&cuda->shared_data);

        if(cuda->cu_omega_rmass)
          Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);

        my_gettime(CLOCK_REALTIME, &atime2);
        cuda->shared_data.cuda_timings.pair_xtype_conversion +=
          atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
        force->pair->compute(eflag, vflag);

      } else {
        //calculate the complete pair interactions
        if(not cuda->shared_data.pair.cudable_force) cuda->downloadAll();
        else {
          //regenerate the data layout for the force computation; it is actually only needed for the ghost atoms
          my_times atime1, atime2;
          my_gettime(CLOCK_REALTIME, &atime1);

          Cuda_Pair_GenerateXType(&cuda->shared_data);

          if(cuda->cu_v_radius)
            Cuda_Pair_GenerateVRadius(&cuda->shared_data);

          if(cuda->cu_omega_rmass)
            Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);

          my_gettime(CLOCK_REALTIME, &atime2);
          cuda->shared_data.cuda_timings.pair_xtype_conversion +=
            atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
        }

        cuda->shared_data.comm.comm_phase = 0;
        force->pair->compute(eflag, vflag);
      }

      if(not cuda->shared_data.pair.cudable_force) cuda->uploadAll();

      //wait for the force calculation unless it is overlapped with the bonded interactions
      if(not cuda->shared_data.pair.collect_forces_later)
        CudaWrapper_Sync();

      timer->stamp(TIME_PAIR);
    }

    //calculate bonded interactions

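    // Bonded interactions (bond/angle/dihedral/improper) are computed by the
    // host styles, so positions are copied from the device first and the host
    // force array is either zeroed (when no pre-force fix ran) or refreshed
    // from the device; the CPU contributions are merged with the GPU pair
    // forces in the collection step below.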
    if(atom->molecular) {
      cuda->cu_x->downloadAsync(2);

      if(n_pre_force == 0) Verlet::force_clear();
      else cuda->cu_f->downloadAsync(2);

      timer->stamp(TIME_PAIR);

      if(neighbor->lastcall == update->ntimestep) {
        neighbor->build_topology();
        timer->stamp(TIME_NEIGHBOR);
      }

      test_atom(testatom, "pre bond force");

      if(force->bond) force->bond->compute(eflag, vflag);

      if(force->angle) force->angle->compute(eflag, vflag);

      if(force->dihedral) force->dihedral->compute(eflag, vflag);

      if(force->improper) force->improper->compute(eflag, vflag);

      timer->stamp(TIME_BOND);
    }

    //collect forces in case pair forces and bonded interactions were overlapped, and either no KSPACE or a GPU KSPACE style is used
    if(cuda->shared_data.pair.collect_forces_later && cuda->shared_data.pair.cudable_force && (not(force->kspace && (not cuda->shared_data.pppm.cudable_force)))) {
      my_gettime(CLOCK_REALTIME, &starttime);
      cuda->cu_f->uploadAsync(2);

      test_atom(testatom, "post molecular force");

      if(eflag) cuda->cu_eng_vdwl->upload();

      if(eflag) cuda->cu_eng_coul->upload();

      if(vflag) cuda->cu_virial->upload();

      Cuda_Pair_CollectForces(&cuda->shared_data, eflag, vflag);

      if(eflag) cuda->cu_eng_vdwl->download();

      if(eflag) cuda->cu_eng_coul->download();

      if(vflag) cuda->cu_virial->download();

      timer->stamp(TIME_PAIR);

      my_gettime(CLOCK_REALTIME, &endtime);
      cuda->shared_data.cuda_timings.pair_force_collection +=
        endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
    }

    //compute kspace force
    if(force->kspace) {
      if((not cuda->shared_data.pppm.cudable_force) && (not cuda->shared_data.pair.collect_forces_later))
        cuda->downloadAll();

      if((not cuda->shared_data.pppm.cudable_force) && (cuda->shared_data.pair.collect_forces_later) && (not atom->molecular)) {
        cuda->cu_x->downloadAsync(2);

        if(n_pre_force == 0) Verlet::force_clear();
        else cuda->cu_f->downloadAsync(2);

        timer->stamp(TIME_PAIR);
      }

      force->kspace->compute(eflag, vflag);

      if((not cuda->shared_data.pppm.cudable_force) && (not cuda->shared_data.pair.collect_forces_later))
        cuda->uploadAll();

      timer->stamp(TIME_KSPACE);
    }

    //collect forces in case pair forces and kspace were overlapped
    if(cuda->shared_data.pair.collect_forces_later && cuda->shared_data.pair.cudable_force && ((force->kspace && (not cuda->shared_data.pppm.cudable_force)))) {
      cuda->cu_f->uploadAsync(2);

      my_gettime(CLOCK_REALTIME, &starttime);

      if(eflag) cuda->cu_eng_vdwl->upload();

      if(eflag) cuda->cu_eng_coul->upload();

      if(vflag) cuda->cu_virial->upload();

      Cuda_Pair_CollectForces(&cuda->shared_data, eflag, vflag);

      if(eflag) cuda->cu_eng_vdwl->download();

      if(eflag) cuda->cu_eng_coul->download();

      if(vflag) cuda->cu_virial->download();

      timer->stamp(TIME_PAIR);

      my_gettime(CLOCK_REALTIME, &endtime);
      cuda->shared_data.cuda_timings.pair_force_collection +=
        endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
    }

    //send forces on ghost atoms back to their owning processes: THIS SHOULD NEVER HAPPEN
    if(force->newton) {
      comm->reverse_comm();
      timer->stamp(TIME_COMM);
    }

    test_atom(testatom, "post force");

    // force modifications, final time integration, diagnostics

    if(n_post_force) modify->post_force(vflag);

    test_atom(testatom, "pre final");

    modify->final_integrate();

    test_atom(testatom, "post final");

    if(n_end_of_step) modify->end_of_step();

    // all output

    test_atom(testatom, "pre output");

    if(ntimestep == output->next) {
      if(not output->thermo->cudable)
        cuda->downloadAll();

      timer->stamp();
      output->write(ntimestep);
      timer->stamp(TIME_OUTPUT);
    }

    test_atom(testatom, "post output");

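    // Step down the update/resize flags (update_nlocal, update_nmax,
    // update_neigh, domain.update, buffer_new) that were raised during setup
    // or re-neighboring; presumably other CUDA classes check them to decide
    // whether device buffers need refreshing.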
    if(cuda->shared_data.atom.update_nlocal > 0)
      cuda->shared_data.atom.update_nlocal--;

    if(cuda->shared_data.atom.update_nmax > 0)
      cuda->shared_data.atom.update_nmax--;

    if(cuda->shared_data.atom.update_neigh > 0)
      cuda->shared_data.atom.update_neigh--;

    if(cuda->shared_data.domain.update > 0)
      cuda->shared_data.domain.update--;

    if(cuda->shared_data.buffer_new > 0)
      cuda->shared_data.buffer_new--;

    cuda->shared_data.atom.reneigh_flag = 0;
  }

  cuda->downloadAll();
  cuda->downloadAllNeighborLists();
  cuda->shared_data.atom.update_nlocal = 1;
  cuda->shared_data.atom.update_nmax = 1;
  cuda->shared_data.atom.update_neigh = 1;
  cuda->shared_data.buffer_new = 1;
  cuda->shared_data.domain.update = 1;
  cuda->oncpu = true;
  cuda->finished_run = true;
}

/* ----------------------------------------------------------------------
   clear force on own & ghost atoms
   setup and clear other arrays as needed
------------------------------------------------------------------------- */

void VerletCuda::force_clear()
{
  cuda->cu_f->memset_device(0);

  if(cuda->cu_torque) cuda->cu_torque->memset_device(0);

  return;

  //NOTE: the early return above makes the host-side clearing below unreachable;
  //it is kept for reference only and should not be necessary
  int i;

  for(i = 0; i < atom->nlocal; i++) {
    atom->f[i][0] = 0.0;
    atom->f[i][1] = 0.0;
    atom->f[i][2] = 0.0;
  }

  // clear force on all particles
  // if either newton flag is set, also include ghosts

  if(neighbor->includegroup == 0) {
    int nall;

    if(force->newton) nall = atom->nlocal + atom->nghost;
    else nall = atom->nlocal;

    if(torqueflag) {
      double** torque = atom->torque;

      for(i = 0; i < nall; i++) {
        torque[i][0] = 0.0;
        torque[i][1] = 0.0;
        torque[i][2] = 0.0;
      }
    }

    // neighbor includegroup flag is set
    // clear force only on initial nfirst particles
    // if either newton flag is set, also include ghosts

  } else {
    int nall = atom->nfirst;

    if(torqueflag) {
      double** torque = atom->torque;

      for(i = 0; i < nall; i++) {
        torque[i][0] = 0.0;
        torque[i][1] = 0.0;
        torque[i][2] = 0.0;
      }
    }

    if(force->newton) {
      nall = atom->nlocal + atom->nghost;

      if(torqueflag) {
        double** torque = atom->torque;

        for(i = atom->nlocal; i < nall; i++) {
          torque[i][0] = 0.0;
          torque[i][1] = 0.0;
          torque[i][2] = 0.0;
        }
      }
    }
  }
}

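// Diagnostic helper: when dotestatom is set, print position/velocity/force
// (and bond topology) for the local atom with tag aatom at the given
// checkpoint, and exit if any local atom has a non-finite or wildly
// out-of-range velocity or force; atoms with an invalid tag are also reported.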
void VerletCuda::test_atom(int aatom, char* string)  //print properties of one atom for test purposes
{
  if(not dotestatom) return;

  bool check = false;

  if(cuda->finished_setup) cuda->downloadAll();

  for(int i = 0; i < atom->nlocal + atom->nghost; i++) {
    if((atom->tag[i] == aatom) && (i < atom->nlocal)) {

      printf("%i # CUDA %s: %i %i %e %e %e %i ", comm->me, string, update->ntimestep, atom->tag[i], atom->x[i][0], atom->v[i][0], atom->f[i][0], i);

      if(atom->molecular && (i < atom->nlocal)) {
        printf(" // %i %i %i ", atom->num_bond[i], atom->num_angle[i], atom->num_dihedral[i]);

        for(int k = 0; k < atom->num_bond[i]; k++)
          printf("// %i %i ", atom->bond_type[i][k], atom->bond_atom[i][k]);
      }

      printf("\n");
    }

    if(i < atom->nlocal) {
      if((atom->v[i][0] < -100 || atom->v[i][0] > 100) ||
         (atom->v[i][1] < -100 || atom->v[i][1] > 100) ||
         (atom->v[i][2] < -100 || atom->v[i][2] > 100) ||
         (atom->v[i][0] != atom->v[i][0]) ||
         (atom->v[i][1] != atom->v[i][1]) ||
         (atom->v[i][2] != atom->v[i][2])) {
        printf("%i # CUDA %s velocity: %i %e %e %e %i\n", comm->me, string, atom->tag[i], atom->x[i][0], atom->v[i][0], atom->f[i][0], i);
        check = true;
      }

      if((atom->f[i][0] < -10000 || atom->f[i][0] > 10000) ||
         (atom->f[i][1] < -10000 || atom->f[i][1] > 10000) ||
         (atom->f[i][2] < -10000 || atom->f[i][2] > 10000) ||
         (atom->f[i][0] != atom->f[i][0]) ||
         (atom->f[i][1] != atom->f[i][1]) ||
         (atom->f[i][2] != atom->f[i][2])) {
        printf("%i # CUDA %s force: %i %e %e %e %i\n", comm->me, string, atom->tag[i], atom->x[i][0], atom->v[i][0], atom->f[i][0], i);
        check = true;
      }

      if(atom->tag[i] <= 0)
        printf("%i # CUDA %s tag: %i %e %e %e %i\n", comm->me, string, atom->tag[i], atom->x[i][0], atom->v[i][0], atom->f[i][0], i);
    }
  }

  if(check) exit(0);
}