/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

#include "string.h"
#include "stdlib.h"
#include "fix_gpu.h"
#include "atom.h"
#include "force.h"
#include "pair.h"
#include "pair_hybrid.h"
#include "pair_hybrid_overlay.h"
#include "respa.h"
#include "input.h"
#include "timer.h"
#include "modify.h"
#include "update.h"
#include "domain.h"
#include "universe.h"
#include "gpu_extra.h"
#include "neighbor.h"
#include "citeme.h"
#include "error.h"

using namespace LAMMPS_NS;
using namespace FixConst;

enum{GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH};

extern int lmp_init_device(MPI_Comm world, MPI_Comm replica,
                           const int first_gpu, const int last_gpu,
                           const int gpu_mode, const double particle_split,
                           const int nthreads, const int t_per_atom,
                           const double cell_size, char *opencl_flags);
extern void lmp_clear_device();
extern double lmp_gpu_forces(double **f, double **tor, double *eatom,
                             double **vatom, double *virial, double &ecoul);

static const char cite_gpu_package[] =
  "GPU package (short-range, long-range and three-body potentials):\n\n"
  "@Article{Brown11,\n"
  " author = {W. M. Brown, P. Wang, S. J. Plimpton, A. N. Tharrington},\n"
  " title = {Implementing Molecular Dynamics on Hybrid High Performance Computers - Short Range Forces},\n"
  " journal = {Comp.~Phys.~Comm.},\n"
  " year =    2011,\n"
  " volume =  182,\n"
  " pages =   {898--911}\n"
  "}\n\n"
  "@Article{Brown12,\n"
  " author = {W. M. Brown, A. Kohlmeyer, S. J. Plimpton, A. N. Tharrington},\n"
  " title = {Implementing Molecular Dynamics on Hybrid High Performance Computers - Particle-Particle Particle-Mesh},\n"
  " journal = {Comp.~Phys.~Comm.},\n"
  " year =    2012,\n"
  " volume =  183,\n"
  " pages =   {449--459}\n"
  "}\n\n"
  "@Article{Brown13,\n"
  " author = {W. M. Brown, Y. Masako},\n"
  " title = {Implementing Molecular Dynamics on Hybrid High Performance Computers – Three-Body Potentials},\n"
  " journal = {Comp.~Phys.~Comm.},\n"
  " year =    2013,\n"
  " volume =  184,\n"
  " pages =   {2785--2793}\n"
  "}\n\n";

/* ---------------------------------------------------------------------- */

FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) :
  Fix(lmp, narg, arg)
{
  if (lmp->citeme) lmp->citeme->add(cite_gpu_package);

  if (lmp->cuda)
    error->all(FLERR,"Cannot use GPU package with USER-CUDA package enabled");

  if (narg < 4) error->all(FLERR,"Illegal package gpu command");

  int ngpu = atoi(arg[3]);
  if (ngpu <= 0) error->all(FLERR,"Illegal package gpu command");
  int first_gpu = 0;
  int last_gpu = ngpu-1;
  
  // options

  _gpu_mode = GPU_NEIGH;
  _particle_split = 1.0;
  int nthreads = 1;
  int threads_per_atom = -1;
  double binsize = -1;
  char *opencl_flags = NULL;

  int iarg = 4;
  while (iarg < narg) {
    if (strcmp(arg[iarg],"neigh") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command");
      if (strcmp(arg[iarg]+1,"yes") == 0) _gpu_mode = GPU_NEIGH;
      else if (strcmp(arg[iarg]+1,"no") == 0) _gpu_mode = GPU_FORCE;
      else if (strcmp(arg[iarg]+1,"hybrid") == 0) _gpu_mode = GPU_HYB_NEIGH;
      else error->all(FLERR,"Illegal package gpu command");
      iarg += 2;
    } else if (strcmp(arg[iarg],"split") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command");
      _particle_split = force->numeric(FLERR,arg[iarg+1]);
      if (_particle_split == 0.0 || _particle_split > 1.0)
        error->all(FLERR,"Illegal package GPU command");
      iarg += 2;
    } else if (strcmp(arg[iarg],"gpuID") == 0) {
      if (iarg+3 > narg) error->all(FLERR,"Illegal package gpu command");
      first_gpu = force->inumeric(FLERR,arg[iarg+1]);
      last_gpu = force->inumeric(FLERR,arg[iarg+2]);
      iarg += 3;
    } else if (strcmp(arg[iarg],"tpa") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command");
      threads_per_atom = force->inumeric(FLERR,arg[iarg+1]);
      iarg += 2;
    } else if (strcmp(arg[iarg],"nthreads") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command");
      nthreads = force->inumeric(FLERR,arg[iarg+1]);
      if (nthreads < 1) error->all(FLERR,"Illegal fix GPU command");
      iarg += 2;
    } else if (strcmp(arg[iarg],"binsize") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command");
      binsize = force->numeric(FLERR,arg[iarg+1]);
      if (binsize <= 0.0) error->all(FLERR,"Illegal fix GPU command");
      iarg += 2;
    } else if (strcmp(arg[iarg],"device") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command");
      opencl_flags = arg[iarg+1];
      iarg += 2;
    } else error->all(FLERR,"Illegal package gpu command");
  }

  // error check

  if ((_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH) && 
      domain->triclinic)
    error->all(FLERR,"Cannot use package gpu neigh yes with triclinic box");

  #ifndef _OPENMP
  if (nthreads > 1)
    error->all(FLERR,"No OpenMP support compiled in");
  #endif

  // set newton_pair = 0 since required by all GPU pair styles

  force->newton_pair = 0;
  if (force->newton_pair || force->newton_bond) force->newton = 1;
  else force->newton = 0;

  // pass params to GPU library

  int gpu_flag = lmp_init_device(universe->uworld, world, first_gpu, last_gpu,
                                 _gpu_mode, _particle_split, nthreads,
                                 threads_per_atom, binsize, opencl_flags);
  GPU_EXTRA::check_flag(gpu_flag,error,world);
}

/* ---------------------------------------------------------------------- */

FixGPU::~FixGPU()
{
  lmp_clear_device();
}

/* ---------------------------------------------------------------------- */

int FixGPU::setmask()
{
  int mask = 0;
  mask |= POST_FORCE;
  mask |= MIN_POST_FORCE;
  mask |= POST_FORCE_RESPA;
  return mask;
}

/* ---------------------------------------------------------------------- */

void FixGPU::init()
{
  // GPU package cannot be used with atom_style template
  
  if (atom->molecular == 2) 
    error->all(FLERR,"GPU package does not (yet) work with "
               "atom_style template");

  // hybrid cannot be used with force/neigh option

  if (_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH)
    if (force->pair_match("hybrid",1) != NULL ||
        force->pair_match("hybrid/overlay",1) != NULL)
      error->all(FLERR,"Cannot use pair hybrid with GPU neighbor list builds");
  if (_particle_split < 0)
    if (force->pair_match("hybrid",1) != NULL ||
        force->pair_match("hybrid/overlay",1) != NULL)
      error->all(FLERR,"GPU split param must be positive "
                 "for hybrid pair styles");

  // make sure fdotr virial is not accumulated multiple times
  
  if (force->pair_match("hybrid",1) != NULL) {
    PairHybrid *hybrid = (PairHybrid *) force->pair;
    for (int i = 0; i < hybrid->nstyles; i++)
      if (strstr(hybrid->keywords[i],"/gpu")==NULL)
        force->pair->no_virial_fdotr_compute = 1;
  } else if (force->pair_match("hybrid/overlay",1) != NULL) {
    PairHybridOverlay *hybrid = (PairHybridOverlay *) force->pair;
    for (int i = 0; i < hybrid->nstyles; i++)
      if (strstr(hybrid->keywords[i],"/gpu")==NULL)
        force->pair->no_virial_fdotr_compute = 1;
  }

  // rRESPA support

  if (strstr(update->integrate_style,"respa"))
    _nlevels_respa = ((Respa *) update->integrate)->nlevels;
}

/* ---------------------------------------------------------------------- */

void FixGPU::setup(int vflag)
{
  if (_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH)
    if (neighbor->exclude_setting()!=0)
      error->all(FLERR,
                 "Cannot use neigh_modify exclude with GPU neighbor builds");

  if (strstr(update->integrate_style,"verlet")) post_force(vflag);
  else {
    // in setup only, all forces calculated on GPU are put in the outer level
    ((Respa *) update->integrate)->copy_flevel_f(_nlevels_respa-1);
    post_force(vflag);
    ((Respa *) update->integrate)->copy_f_flevel(_nlevels_respa-1);
  }
}

/* ---------------------------------------------------------------------- */

void FixGPU::min_setup(int vflag)
{
  post_force(vflag);
}

/* ---------------------------------------------------------------------- */

void FixGPU::post_force(int vflag)
{
  timer->stamp();
  double lvirial[6];
  for (int i = 0; i < 6; i++) lvirial[i] = 0.0;
  double my_eng = lmp_gpu_forces(atom->f, atom->torque, force->pair->eatom,
                                 force->pair->vatom, lvirial,
                                 force->pair->eng_coul);

  force->pair->eng_vdwl += my_eng;
  force->pair->virial[0] += lvirial[0];
  force->pair->virial[1] += lvirial[1];
  force->pair->virial[2] += lvirial[2];
  force->pair->virial[3] += lvirial[3];
  force->pair->virial[4] += lvirial[4];
  force->pair->virial[5] += lvirial[5];

  if (force->pair->vflag_fdotr) force->pair->virial_fdotr_compute();
  timer->stamp(TIME_PAIR);
}

/* ---------------------------------------------------------------------- */

void FixGPU::min_post_force(int vflag)
{
  post_force(vflag);
}

/* ---------------------------------------------------------------------- */

void FixGPU::post_force_respa(int vflag, int ilevel, int iloop)
{
  post_force(vflag);
}

/* ---------------------------------------------------------------------- */

double FixGPU::memory_usage()
{
  double bytes = 0.0;
  // memory usage currently returned by pair routine
  return bytes;
}