lammps/src/GPU/fix_nve_asphere_gpu.cpp

// clang-format off
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://lammps.sandia.gov/, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */

#include "fix_nve_asphere_gpu.h"

#include "atom.h"
#include "atom_vec_ellipsoid.h"
#include "comm.h"
#include "error.h"
#include "force.h"
#include "gpu_extra.h"
#include "memory.h"
#include "neighbor.h"
#include "update.h"
#include <cmath>
#if (LAL_USE_OMP == 1)
#include <omp.h>
#endif

using namespace LAMMPS_NS;
using namespace FixConst;

#define INERTIA 0.2          // moment of inertia prefactor for ellipsoid

#define ME_qnormalize(q)                                                \
{                                                                       \
  double norm = 1.0 /                                                   \
    sqrt(q##_w*q##_w + q##_i*q##_i + q##_j*q##_j + q##_k*q##_k);        \
  q##_w *= norm;                                                        \
  q##_i *= norm;                                                        \
  q##_j *= norm;                                                        \
  q##_k *= norm;                                                        \
}

#define ME_mq_to_omega(m, quat, moments_0, moments_1, moments_2, w)     \
{                                                                       \
  double wbody_0, wbody_1, wbody_2;                                     \
  double rot_0, rot_1, rot_2, rot_3, rot_4, rot_5, rot_6, rot_7, rot_8; \
                                                                        \
  double w2 = quat##_w * quat##_w;                                      \
  double i2 = quat##_i * quat##_i;                                      \
  double j2 = quat##_j * quat##_j;                                      \
  double k2 = quat##_k * quat##_k;                                      \
  double twoij = 2.0 * quat##_i * quat##_j;                             \
  double twoik = 2.0 * quat##_i * quat##_k;                             \
  double twojk = 2.0 * quat##_j * quat##_k;                             \
  double twoiw = 2.0 * quat##_i * quat##_w;                             \
  double twojw = 2.0 * quat##_j * quat##_w;                             \
  double twokw = 2.0 * quat##_k * quat##_w;                             \
                                                                        \
  rot##_0 = w2 + i2 - j2 - k2;                                          \
  rot##_1 = twoij - twokw;                                              \
  rot##_2 = twojw + twoik;                                              \
                                                                        \
  rot##_3 = twoij + twokw;                                              \
  rot##_4 = w2 - i2 + j2 - k2;                                          \
  rot##_5 = twojk - twoiw;                                              \
                                                                        \
  rot##_6 = twoik - twojw;                                              \
  rot##_7 = twojk + twoiw;                                              \
  rot##_8 = w2 - i2 - j2 + k2;                                          \
                                                                        \
  wbody_0 = rot##_0*m##_0 + rot##_3*m##_1 + rot##_6*m##_2;              \
  wbody_1 = rot##_1*m##_0 + rot##_4*m##_1 + rot##_7*m##_2;              \
  wbody_2 = rot##_2*m##_0 + rot##_5*m##_1 + rot##_8*m##_2;              \
                                                                        \
  wbody_0 *= moments_0;                                                 \
  wbody_1 *= moments_1;                                                 \
  wbody_2 *= moments_2;                                                 \
                                                                        \
  w##_0 = rot##_0*wbody_0 + rot##_1*wbody_1 + rot##_2*wbody_2;          \
  w##_1 = rot##_3*wbody_0 + rot##_4*wbody_1 + rot##_5*wbody_2;          \
  w##_2 = rot##_6*wbody_0 + rot##_7*wbody_1 + rot##_8*wbody_2;          \
}

#define ME_omega_richardson(dtf,dtq,angmomin,quatin,torque,i0,i1,i2)    \
{                                                                       \
  angmomin[0] += dtf * torque[0];                                       \
  double angmom_0 = angmomin[0];                                        \
  angmomin[1] += dtf * torque[1];                                       \
  double angmom_1 = angmomin[1];                                        \
  angmomin[2] += dtf * torque[2];                                       \
  double angmom_2 = angmomin[2];                                        \
                                                                        \
  double quat_w = quatin[0];                                            \
  double quat_i = quatin[1];                                            \
  double quat_j = quatin[2];                                            \
  double quat_k = quatin[3];                                            \
                                                                        \
  double omega_0, omega_1, omega_2;                                     \
  ME_mq_to_omega(angmom,quat,i0,i1,i2,omega);                           \
                                                                        \
  double wq_0, wq_1, wq_2, wq_3;                                        \
  wq_0 = -omega_0*quat_i - omega_1*quat_j - omega_2*quat_k;             \
  wq_1 = quat_w*omega_0 + omega_1*quat_k - omega_2*quat_j;              \
  wq_2 = quat_w*omega_1 + omega_2*quat_i - omega_0*quat_k;              \
  wq_3 = quat_w*omega_2 + omega_0*quat_j - omega_1*quat_i;              \
                                                                        \
  double qfull_w, qfull_i, qfull_j, qfull_k;                            \
  qfull_w = quat_w + dtq * wq_0;                                        \
  qfull_i = quat_i + dtq * wq_1;                                        \
  qfull_j = quat_j + dtq * wq_2;                                        \
  qfull_k = quat_k + dtq * wq_3;                                        \
  ME_qnormalize(qfull);                                                 \
                                                                        \
  double qhalf_w, qhalf_i, qhalf_j, qhalf_k;                            \
  qhalf_w = quat_w + 0.5*dtq * wq_0;                                    \
  qhalf_i = quat_i + 0.5*dtq * wq_1;                                    \
  qhalf_j = quat_j + 0.5*dtq * wq_2;                                    \
  qhalf_k = quat_k + 0.5*dtq * wq_3;                                    \
  ME_qnormalize(qhalf);                                                 \
                                                                        \
  ME_mq_to_omega(angmom,qhalf,i0,i1,i2,omega);                          \
  wq_0 = -omega_0*qhalf_i - omega_1*qhalf_j - omega_2*qhalf_k;          \
  wq_1 = qhalf_w*omega_0 + omega_1*qhalf_k - omega_2*qhalf_j;           \
  wq_2 = qhalf_w*omega_1 + omega_2*qhalf_i - omega_0*qhalf_k;           \
  wq_3 = qhalf_w*omega_2 + omega_0*qhalf_j - omega_1*qhalf_i;           \
                                                                        \
  qhalf_w += 0.5*dtq * wq_0;                                            \
  qhalf_i += 0.5*dtq * wq_1;                                            \
  qhalf_j += 0.5*dtq * wq_2;                                            \
  qhalf_k += 0.5*dtq * wq_3;                                            \
  ME_qnormalize(qhalf);                                                 \
                                                                        \
  quat_w = 2.0*qhalf_w - qfull_w;                                       \
  quat_i = 2.0*qhalf_i - qfull_i;                                       \
  quat_j = 2.0*qhalf_j - qfull_j;                                       \
  quat_k = 2.0*qhalf_k - qfull_k;                                       \
  ME_qnormalize(quat);                                                  \
                                                                        \
  quatin[0] = quat_w;                                                   \
  quatin[1] = quat_i;                                                   \
  quatin[2] = quat_j;                                                   \
  quatin[3] = quat_k;                                                   \
}

/* ---------------------------------------------------------------------- */

FixNVEAsphereGPU::FixNVEAsphereGPU(LAMMPS *lmp, int narg, char **arg) :
  FixNVE(lmp, narg, arg)
{
  _dtfm = 0;
  _nlocal_max = 0;
  _inertia0 = 0;
  _inertia1 = 0;
  _inertia2 = 0;
}

/* ---------------------------------------------------------------------- */

void FixNVEAsphereGPU::init()
{
  avec = (AtomVecEllipsoid *) atom->style_match("ellipsoid");
  if (!avec)
    error->all(FLERR,"Compute nve/asphere requires atom style ellipsoid");

  // check that all particles are finite-size ellipsoids
  // no point particles allowed, spherical is OK

  int *ellipsoid = atom->ellipsoid;
  int *mask = atom->mask;
  int nlocal = atom->nlocal;

  for (int i = 0; i < nlocal; i++)
    if (mask[i] & groupbit)
      if (ellipsoid[i] < 0)
        error->one(FLERR,"Fix nve/asphere requires extended particles");

  FixNVE::init();
}

/* ---------------------------------------------------------------------- */

void FixNVEAsphereGPU::setup(int vflag)
{
  FixNVE::setup(vflag);
  reset_dt();
}

/* ---------------------------------------------------------------------- */

void FixNVEAsphereGPU::initial_integrate(int /*vflag*/)
{
  AtomVecEllipsoid::Bonus *bonus = avec->bonus;
  int *ellipsoid = atom->ellipsoid;
  double * _noalias const x = atom->x[0];
  double * _noalias const v = atom->v[0];
  const double * _noalias const f = atom->f[0];
  int *mask = atom->mask;

  double **angmom = atom->angmom;
  double **torque = atom->torque;
  int nlocal = atom->nlocal;
  if (igroup == atom->firstgroup) nlocal = atom->nfirst;

  // set timestep here since dt may have changed or come via rRESPA

  dtq = 0.5 * dtv;

  #if (LAL_USE_OMP == 1)
  #pragma omp parallel
  #endif
  {
    #if (LAL_USE_OMP == 1)
    const int nthreads = comm->nthreads;
    const int tid = omp_get_thread_num();
    const int idelta = nlocal / nthreads + 1;
    const int ifrom = tid * idelta;
    const int ito = MIN(ifrom + idelta, nlocal);
    const int ifrom3 = ifrom * 3;
    const int ito3 = ito * 3;
    #else
    const int tid = 0;
    const int ifrom = 0;
    const int ifrom3 = 0;
    const int ito = nlocal;
    const int ito3 = nlocal * 3;
    #endif

    #if (LAL_USE_OMP_SIMD == 1)
    #pragma omp simd
    #endif
    for (int i = ifrom3; i < ito3; i++) {
      v[i] += _dtfm[i] * f[i];
      x[i] += dtv * v[i];
    }

    // update angular momentum by 1/2 step
    if (igroup == 0) {
      #if (LAL_USE_OMP_SIMD == 1)
        // Workaround for compiler bug
        #ifdef __INTEL_COMPILER
        #pragma simd
        #else
        #pragma omp simd
        #endif
      #endif
      for (int i = ifrom; i < ito; i++) {
        double *quat = bonus[ellipsoid[i]].quat;
        ME_omega_richardson(dtf, dtq, angmom[i], quat, torque[i], _inertia0[i],
                            _inertia1[i], _inertia2[i]);
      }
    } else {
      #if (LAL_USE_OMP_SIMD == 1)
        // Workaround for compiler bug
        #ifdef __INTEL_COMPILER
        #pragma simd
        #else
        #pragma omp simd
        #endif
      #endif
      for (int i = ifrom; i < ito; i++) {
        if (mask[i] & groupbit) {
          double *quat = bonus[ellipsoid[i]].quat;
          ME_omega_richardson(dtf, dtq, angmom[i], quat, torque[i],
                              _inertia0[i], _inertia1[i], _inertia2[i]);
        }
      }
    }
  }
}

/* ---------------------------------------------------------------------- */

void FixNVEAsphereGPU::final_integrate()
{
  double * _noalias const v = atom->v[0];
  const double * _noalias const f = atom->f[0];
  double * _noalias const angmom = atom->angmom[0];
  const double * _noalias const torque = atom->torque[0];

  const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst :
    atom->nlocal;

  if (neighbor->ago == 0) {
    if (nlocal > _nlocal_max) {
      if (_nlocal_max) {
        memory->destroy(_dtfm);
        memory->destroy(_inertia0);
        memory->destroy(_inertia1);
        memory->destroy(_inertia2);
      }
      _nlocal_max = static_cast<int>(1.20 * nlocal);
      memory->create(_dtfm, _nlocal_max * 3, "fix_nve_gpu:dtfm");
      memory->create(_inertia0, _nlocal_max * 3, "fix_nve_gpu:inertia0");
      memory->create(_inertia1, _nlocal_max * 3, "fix_nve_gpu:inertia1");
      memory->create(_inertia2, _nlocal_max * 3, "fix_nve_gpu:inertia2");
    }
  }

  #if (LAL_USE_OMP == 1)
  #pragma omp parallel
  #endif
  {
    #if (LAL_USE_OMP == 1)
    const int nthreads = comm->nthreads;
    const int tid = omp_get_thread_num();
    const int idelta = nlocal / nthreads + 1;
    const int ifrom = tid * idelta;
    const int ito = MIN(ifrom + idelta, nlocal);
    const int ifrom3 = ifrom * 3;
    const int ito3 = ito * 3;
    #else
    const int tid = 0;
    const int ifrom = 0;
    const int ifrom3 = 0;
    const int ito = nlocal;
    const int ito3 = nlocal * 3;
    #endif

    double dtfo;
    if (neighbor->ago == 0) dtfo = reset_dt_omp(ifrom, ito, tid);
    else dtfo = dtf;

    #if (LAL_USE_OMP_SIMD == 1)
    #pragma omp simd
    #endif
    for (int i = ifrom3; i < ito3; i++) {
      v[i] += _dtfm[i] * f[i];
      angmom[i] += dtfo * torque[i];
    }
  }
}

void FixNVEAsphereGPU::reset_dt() {
  const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst :
    atom->nlocal;

  if (nlocal > _nlocal_max) {
    if (_nlocal_max) {
      memory->destroy(_dtfm);
      memory->destroy(_inertia0);
      memory->destroy(_inertia1);
      memory->destroy(_inertia2);
    }
    _nlocal_max = static_cast<int>(1.20 * nlocal);
    memory->create(_dtfm, _nlocal_max * 3, "fix_nve_gpu:dtfm");
    memory->create(_inertia0, _nlocal_max * 3, "fix_nve_gpu:inertia0");
    memory->create(_inertia1, _nlocal_max * 3, "fix_nve_gpu:inertia1");
    memory->create(_inertia2, _nlocal_max * 3, "fix_nve_gpu:inertia2");
  }

  #if (LAL_USE_OMP == 1)
  #pragma omp parallel
  #endif
  {
    #if (LAL_USE_OMP == 1)
    const int nthreads = comm->nthreads;
    const int tid = omp_get_thread_num();
    const int idelta = nlocal / nthreads + 1;
    const int ifrom = tid * idelta;
    const int ito = MIN(ifrom + idelta, nlocal);
    #else
    const int tid = 0;
    const int ifrom = 0;
    const int ito = nlocal;
    #endif
    reset_dt_omp(ifrom, ito, tid);
  }
}

double FixNVEAsphereGPU::reset_dt_omp(const int ifrom, const int ito,
                                      const int tid) {
  AtomVecEllipsoid::Bonus *bonus = avec->bonus;
  int *ellipsoid = atom->ellipsoid;
  const int * const mask = atom->mask;

  const double dtfo = 0.5 * update->dt * force->ftm2v;
  if (tid == 0) {
    dtv = update->dt;
    dtf = dtfo;
  }

  if (igroup == 0) {
    const double * const rmass = atom->rmass;
    int n = ifrom * 3;
    for (int i = ifrom; i < ito; i++) {
      const double dtfir = dtfo / rmass[i];
      _dtfm[n++] = dtfir;
      _dtfm[n++] = dtfir;
      _dtfm[n++] = dtfir;
      double *shape = bonus[ellipsoid[i]].shape;
      double idot = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]);
      if (idot != 0.0) idot = 1.0 / idot;
      _inertia0[i] = idot;
      idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]);
      if (idot != 0.0) idot = 1.0 / idot;
      _inertia1[i] = idot;
      idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]);
      if (idot != 0.0) idot = 1.0 / idot;
      _inertia2[i] = idot;
    }
  } else {
    const double * const rmass = atom->rmass;
    int n = ifrom * 3;
    for (int i = ifrom; i < ito; i++) {
      if (mask[i] & groupbit) {
        const double dtfir = dtfo / rmass[i];
        _dtfm[n++] = dtfir;
        _dtfm[n++] = dtfir;
        _dtfm[n++] = dtfir;
        double *shape = bonus[ellipsoid[i]].shape;
        double idot = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]);
        if (idot != 0.0) idot = 1.0 / idot;
        _inertia0[i] = idot;
        idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]);
        if (idot != 0.0) idot = 1.0 / idot;
        _inertia1[i] = idot;
        idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]);
        if (idot != 0.0) idot = 1.0 / idot;
        _inertia2[i] = idot;
      } else {
        _dtfm[n++] = 0.0;
        _dtfm[n++] = 0.0;
        _dtfm[n++] = 0.0;
      }
    }
  }
  return dtfo;
}

double FixNVEAsphereGPU::memory_usage()
{
  return FixNVE::memory_usage() + _nlocal_max * 12 * sizeof(double);
}