3239 lines
111 KiB
C++
3239 lines
111 KiB
C++
// clang-format off
|
|
/* ----------------------------------------------------------------------
|
|
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
|
https://www.lammps.org/, Sandia National Laboratories
|
|
Steve Plimpton, sjplimp@sandia.gov
|
|
|
|
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
|
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
|
certain rights in this software. This software is distributed under
|
|
the GNU General Public License.
|
|
|
|
See the README file in the top-level LAMMPS directory.
|
|
------------------------------------------------------------------------- */
|
|
|
|
/* ----------------------------------------------------------------------
|
|
Contributing authors: William McDoniel (RWTH Aachen University)
|
|
------------------------------------------------------------------------- */
|
|
|
|
#include "pppm_disp_intel.h"
|
|
|
|
#include "atom.h"
|
|
#include "comm.h"
|
|
#include "domain.h"
|
|
#include "error.h"
|
|
#include "force.h"
|
|
#include "gridcomm.h"
|
|
#include "math_const.h"
|
|
#include "math_special.h"
|
|
#include "memory.h"
|
|
#include "modify.h"
|
|
#include "suffix.h"
|
|
|
|
#include <cmath>
|
|
#include <cstring>
|
|
|
|
#include "omp_compat.h"
|
|
|
|
using namespace LAMMPS_NS;
|
|
using namespace MathConst;
|
|
using namespace MathSpecial;
|
|
|
|
#define MAXORDER 7
|
|
#define OFFSET 16384
|
|
#define SMALL 0.00001
|
|
#define LARGE 10000.0
|
|
#define EPS_HOC 1.0e-7
|
|
|
|
enum{GEOMETRIC,ARITHMETIC,SIXTHPOWER};
|
|
enum{REVERSE_RHO, REVERSE_RHO_G, REVERSE_RHO_A, REVERSE_RHO_NONE};
|
|
enum{FORWARD_IK, FORWARD_AD, FORWARD_IK_PERATOM, FORWARD_AD_PERATOM,
|
|
FORWARD_IK_G, FORWARD_AD_G, FORWARD_IK_PERATOM_G, FORWARD_AD_PERATOM_G,
|
|
FORWARD_IK_A, FORWARD_AD_A, FORWARD_IK_PERATOM_A, FORWARD_AD_PERATOM_A,
|
|
FORWARD_IK_NONE, FORWARD_AD_NONE, FORWARD_IK_PERATOM_NONE,
|
|
FORWARD_AD_PERATOM_NONE};
|
|
|
|
#ifdef FFT_SINGLE
|
|
#define ZEROF 0.0f
|
|
#define ONEF 1.0f
|
|
#else
|
|
#define ZEROF 0.0
|
|
#define ONEF 1.0
|
|
#endif
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
PPPMDispIntel::PPPMDispIntel(LAMMPS *lmp) : PPPMDisp(lmp)
|
|
{
|
|
suffix_flag |= Suffix::INTEL;
|
|
triclinic_support = 0;
|
|
|
|
order = 7;
|
|
order_6 = 7; //sets default stencil sizes to 7
|
|
|
|
perthread_density = nullptr;
|
|
particle_ekx = particle_eky = particle_ekz = nullptr;
|
|
particle_ekx0 = particle_eky0 = particle_ekz0 = nullptr;
|
|
particle_ekx1 = particle_eky1 = particle_ekz1 = nullptr;
|
|
particle_ekx2 = particle_eky2 = particle_ekz2 = nullptr;
|
|
particle_ekx3 = particle_eky3 = particle_ekz3 = nullptr;
|
|
particle_ekx4 = particle_eky4 = particle_ekz4 = nullptr;
|
|
particle_ekx5 = particle_eky5 = particle_ekz5 = nullptr;
|
|
particle_ekx6 = particle_eky6 = particle_ekz6 = nullptr;
|
|
|
|
rho_lookup = drho_lookup = nullptr;
|
|
rho6_lookup = drho6_lookup = nullptr;
|
|
rho_points = 0;
|
|
|
|
_use_table = _use_packing = _use_lrt = 0;
|
|
}
|
|
|
|
PPPMDispIntel::~PPPMDispIntel()
|
|
{
|
|
memory->destroy(perthread_density);
|
|
memory->destroy(particle_ekx);
|
|
memory->destroy(particle_eky);
|
|
memory->destroy(particle_ekz);
|
|
|
|
memory->destroy(rho_lookup);
|
|
memory->destroy(drho_lookup);
|
|
memory->destroy(rho6_lookup);
|
|
memory->destroy(drho6_lookup);
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
called once before run
|
|
------------------------------------------------------------------------- */
|
|
|
|
void PPPMDispIntel::init()
|
|
{
|
|
|
|
PPPMDisp::init();
|
|
fix = static_cast<FixIntel *>(modify->get_fix_by_id("package_intel"));
|
|
if (!fix) error->all(FLERR, "The 'package intel' command is required for /intel styles");
|
|
|
|
#ifdef _LMP_INTEL_OFFLOAD
|
|
_use_base = 0;
|
|
if (fix->offload_balance() != 0.0) {
|
|
_use_base = 1;
|
|
return;
|
|
}
|
|
#endif
|
|
|
|
fix->kspace_init_check();
|
|
|
|
_use_lrt = fix->lrt();
|
|
if (_use_lrt)
|
|
error->all(FLERR,
|
|
"LRT mode is currently not supported for pppm/disp/intel");
|
|
|
|
|
|
// For vectorization, we need some padding in the end
|
|
// The first thread computes on the global density
|
|
if ((comm->nthreads > 1) && !_use_lrt) {
|
|
int mygrid = MAX(ngrid,ngrid_6);
|
|
memory->destroy(perthread_density);
|
|
memory->create(perthread_density, comm->nthreads-1,
|
|
mygrid + INTEL_P3M_ALIGNED_MAXORDER,
|
|
"pppmdispintel:perthread_density");
|
|
}
|
|
|
|
_use_table = fix->pppm_table();
|
|
if (_use_table) {
|
|
rho_points = 5000;
|
|
memory->destroy(rho_lookup);
|
|
memory->create(rho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
|
|
"pppmdispintel:rho_lookup");
|
|
memory->destroy(rho6_lookup);
|
|
memory->create(rho6_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
|
|
"pppmdispintel:rho6_lookup");
|
|
|
|
if (differentiation_flag == 1) {
|
|
memory->destroy(drho_lookup);
|
|
memory->create(drho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
|
|
"pppmdispintel:drho_lookup");
|
|
memory->destroy(drho6_lookup);
|
|
memory->create(drho6_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
|
|
"pppmdispintel:drho6_lookup");
|
|
}
|
|
precompute_rho();
|
|
}
|
|
if (order > INTEL_P3M_MAXORDER)
|
|
error->all(FLERR,"PPPM order greater than supported by INTEL\n");
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
compute the PPPMDispIntel long-range force, energy, virial
|
|
------------------------------------------------------------------------- */
|
|
|
|
void PPPMDispIntel::compute(int eflag, int vflag)
|
|
{
|
|
#ifdef _LMP_INTEL_OFFLOAD
|
|
if (_use_base) {
|
|
PPPMDisp::compute(eflag, vflag);
|
|
return;
|
|
}
|
|
#endif
|
|
|
|
int i;
|
|
|
|
// convert atoms from box to lamda coords
|
|
|
|
ev_init(eflag,vflag);
|
|
|
|
if (evflag_atom && !peratom_allocate_flag) allocate_peratom();
|
|
|
|
if (triclinic == 0) boxlo = domain->boxlo;
|
|
else {
|
|
boxlo = domain->boxlo_lamda;
|
|
domain->x2lamda(atom->nlocal);
|
|
}
|
|
// extend size of per-atom arrays if necessary
|
|
|
|
if (atom->nmax > nmax) {
|
|
|
|
if (function[0]) memory->destroy(part2grid);
|
|
if (function[1] + function[2] + function[3]) memory->destroy(part2grid_6);
|
|
if (differentiation_flag == 1) {
|
|
memory->destroy(particle_ekx);
|
|
memory->destroy(particle_eky);
|
|
memory->destroy(particle_ekz);
|
|
if (function[2] == 1) {
|
|
memory->destroy(particle_ekx0);
|
|
memory->destroy(particle_eky0);
|
|
memory->destroy(particle_ekz0);
|
|
memory->destroy(particle_ekx1);
|
|
memory->destroy(particle_eky1);
|
|
memory->destroy(particle_ekz1);
|
|
memory->destroy(particle_ekx2);
|
|
memory->destroy(particle_eky2);
|
|
memory->destroy(particle_ekz2);
|
|
memory->destroy(particle_ekx3);
|
|
memory->destroy(particle_eky3);
|
|
memory->destroy(particle_ekz3);
|
|
memory->destroy(particle_ekx4);
|
|
memory->destroy(particle_eky4);
|
|
memory->destroy(particle_ekz4);
|
|
memory->destroy(particle_ekx5);
|
|
memory->destroy(particle_eky5);
|
|
memory->destroy(particle_ekz5);
|
|
memory->destroy(particle_ekx6);
|
|
memory->destroy(particle_eky6);
|
|
memory->destroy(particle_ekz6);
|
|
}
|
|
|
|
}
|
|
nmax = atom->nmax;
|
|
if (function[0]) memory->create(part2grid,nmax,3,"pppm/disp:part2grid");
|
|
if (function[1] + function[2] + function[3])
|
|
memory->create(part2grid_6,nmax,3,"pppm/disp:part2grid_6");
|
|
if (differentiation_flag == 1) {
|
|
memory->create(particle_ekx, nmax, "pppmdispintel:pekx");
|
|
memory->create(particle_eky, nmax, "pppmdispintel:peky");
|
|
memory->create(particle_ekz, nmax, "pppmdispintel:pekz");
|
|
if (function[2] == 1) {
|
|
memory->create(particle_ekx0, nmax, "pppmdispintel:pekx0");
|
|
memory->create(particle_eky0, nmax, "pppmdispintel:peky0");
|
|
memory->create(particle_ekz0, nmax, "pppmdispintel:pekz0");
|
|
memory->create(particle_ekx1, nmax, "pppmdispintel:pekx1");
|
|
memory->create(particle_eky1, nmax, "pppmdispintel:peky1");
|
|
memory->create(particle_ekz1, nmax, "pppmdispintel:pekz1");
|
|
memory->create(particle_ekx2, nmax, "pppmdispintel:pekx2");
|
|
memory->create(particle_eky2, nmax, "pppmdispintel:peky2");
|
|
memory->create(particle_ekz2, nmax, "pppmdispintel:pekz2");
|
|
memory->create(particle_ekx3, nmax, "pppmdispintel:pekx3");
|
|
memory->create(particle_eky3, nmax, "pppmdispintel:peky3");
|
|
memory->create(particle_ekz3, nmax, "pppmdispintel:pekz3");
|
|
memory->create(particle_ekx4, nmax, "pppmdispintel:pekx4");
|
|
memory->create(particle_eky4, nmax, "pppmdispintel:peky4");
|
|
memory->create(particle_ekz4, nmax, "pppmdispintel:pekz4");
|
|
memory->create(particle_ekx5, nmax, "pppmdispintel:pekx5");
|
|
memory->create(particle_eky5, nmax, "pppmdispintel:peky5");
|
|
memory->create(particle_ekz5, nmax, "pppmdispintel:pekz5");
|
|
memory->create(particle_ekx6, nmax, "pppmdispintel:pekx6");
|
|
memory->create(particle_eky6, nmax, "pppmdispintel:peky6");
|
|
memory->create(particle_ekz6, nmax, "pppmdispintel:pekz6");
|
|
}
|
|
}
|
|
}
|
|
energy = 0.0;
|
|
energy_1 = 0.0;
|
|
energy_6 = 0.0;
|
|
if (vflag) for (i = 0; i < 6; i++) virial_6[i] = virial_1[i] = 0.0;
|
|
|
|
// find grid points for all my particles
|
|
// distribute partcles' charges/dispersion coefficients on the grid
|
|
// communication between processors and remapping two fft
|
|
// Solution of poissons equation in k-space and backtransformation
|
|
// communication between processors
|
|
// calculation of forces
|
|
|
|
if (function[0]) {
|
|
|
|
//perform calculations for coulomb interactions only
|
|
|
|
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
|
|
particle_map<float,double>(delxinv, delyinv, delzinv, shift, part2grid,
|
|
nupper, nlower, nxlo_out, nylo_out, nzlo_out,
|
|
nxhi_out, nyhi_out, nzhi_out,
|
|
fix->get_mixed_buffers());
|
|
make_rho_c<float,double>(fix->get_mixed_buffers());
|
|
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
|
|
particle_map<double,double>(delxinv, delyinv, delzinv, shift, part2grid,
|
|
nupper, nlower, nxlo_out, nylo_out,
|
|
nzlo_out, nxhi_out, nyhi_out, nzhi_out,
|
|
fix->get_double_buffers());
|
|
make_rho_c<double,double>(fix->get_double_buffers());
|
|
} else {
|
|
particle_map<float,float>(delxinv, delyinv, delzinv, shift, part2grid,
|
|
nupper, nlower, nxlo_out, nylo_out, nzlo_out,
|
|
nxhi_out, nyhi_out, nzhi_out,
|
|
fix->get_single_buffers());
|
|
make_rho_c<float,float>(fix->get_single_buffers());
|
|
}
|
|
|
|
gc->reverse_comm(GridComm::KSPACE,this,1,sizeof(FFT_SCALAR),REVERSE_RHO,
|
|
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
|
|
|
|
brick2fft(nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in,
|
|
density_brick, density_fft, work1,remap);
|
|
|
|
if (differentiation_flag == 1) {
|
|
poisson_ad(work1, work2, density_fft, fft1, fft2,
|
|
nx_pppm, ny_pppm, nz_pppm, nfft,
|
|
nxlo_fft, nylo_fft, nzlo_fft, nxhi_fft, nyhi_fft, nzhi_fft,
|
|
nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in,
|
|
energy_1, greensfn, virial_1, vg,vg2, u_brick, v0_brick,
|
|
v1_brick, v2_brick, v3_brick, v4_brick, v5_brick);
|
|
|
|
gc->forward_comm(GridComm::KSPACE,this,1,sizeof(FFT_SCALAR),FORWARD_AD,
|
|
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
|
|
|
|
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
|
|
fieldforce_c_ad<float,double>(fix->get_mixed_buffers());
|
|
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
|
|
fieldforce_c_ad<double,double>(fix->get_double_buffers());
|
|
} else {
|
|
fieldforce_c_ad<float,float>(fix->get_single_buffers());
|
|
}
|
|
|
|
if (vflag_atom)
|
|
gc->forward_comm(GridComm::KSPACE,this,6,sizeof(FFT_SCALAR),
|
|
FORWARD_AD_PERATOM,gc_buf1,gc_buf2,MPI_FFT_SCALAR);
|
|
|
|
} else {
|
|
poisson_ik(work1, work2, density_fft, fft1, fft2,
|
|
nx_pppm, ny_pppm, nz_pppm, nfft,
|
|
nxlo_fft, nylo_fft, nzlo_fft, nxhi_fft, nyhi_fft, nzhi_fft,
|
|
nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in,
|
|
energy_1, greensfn, fkx, fky, fkz,fkx2, fky2, fkz2,
|
|
vdx_brick, vdy_brick, vdz_brick, virial_1, vg,vg2,
|
|
u_brick, v0_brick, v1_brick, v2_brick, v3_brick, v4_brick,
|
|
v5_brick);
|
|
|
|
gc->forward_comm(GridComm::KSPACE,this,3,sizeof(FFT_SCALAR),FORWARD_IK,
|
|
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
|
|
|
|
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
|
|
fieldforce_c_ik<float,double>(fix->get_mixed_buffers());
|
|
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
|
|
fieldforce_c_ik<double,double>(fix->get_double_buffers());
|
|
} else {
|
|
fieldforce_c_ik<float,float>(fix->get_single_buffers());
|
|
}
|
|
|
|
if (evflag_atom)
|
|
gc->forward_comm(GridComm::KSPACE,this,7,sizeof(FFT_SCALAR),
|
|
FORWARD_IK_PERATOM,gc_buf1,gc_buf2,MPI_FFT_SCALAR);
|
|
}
|
|
if (evflag_atom) fieldforce_c_peratom();
|
|
}
|
|
|
|
if (function[1]) {
|
|
|
|
//perform calculations for geometric mixing
|
|
|
|
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
|
|
particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6,
|
|
part2grid_6, nupper_6, nlower_6, nxlo_out_6,
|
|
nylo_out_6, nzlo_out_6, nxhi_out_6,
|
|
nyhi_out_6, nzhi_out_6,
|
|
fix->get_mixed_buffers());
|
|
make_rho_g<float,double>(fix->get_mixed_buffers());
|
|
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
|
|
particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6,
|
|
part2grid_6, nupper_6, nlower_6, nxlo_out_6,
|
|
nylo_out_6, nzlo_out_6, nxhi_out_6,
|
|
nyhi_out_6, nzhi_out_6,
|
|
fix->get_double_buffers());
|
|
make_rho_g<double,double>(fix->get_double_buffers());
|
|
} else {
|
|
particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6,
|
|
part2grid_6, nupper_6, nlower_6, nxlo_out_6,
|
|
nylo_out_6, nzlo_out_6, nxhi_out_6,
|
|
nyhi_out_6, nzhi_out_6,
|
|
fix->get_single_buffers());
|
|
make_rho_g<float,float>(fix->get_single_buffers());
|
|
}
|
|
|
|
gc6->reverse_comm(GridComm::KSPACE,this,1,sizeof(FFT_SCALAR),REVERSE_RHO_G,
|
|
gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
|
|
|
|
brick2fft(nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, nzhi_in_6,
|
|
density_brick_g, density_fft_g, work1_6,remap_6);
|
|
|
|
if (differentiation_flag == 1) {
|
|
poisson_ad(work1_6, work2_6, density_fft_g, fft1_6, fft2_6,
|
|
nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6,
|
|
nxlo_fft_6, nylo_fft_6, nzlo_fft_6, nxhi_fft_6,
|
|
nyhi_fft_6, nzhi_fft_6, nxlo_in_6, nylo_in_6, nzlo_in_6,
|
|
nxhi_in_6, nyhi_in_6, nzhi_in_6, energy_6, greensfn_6,
|
|
virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g, v1_brick_g,
|
|
v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g);
|
|
|
|
gc6->forward_comm(GridComm::KSPACE,this,1,sizeof(FFT_SCALAR),FORWARD_AD_G,
|
|
gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
|
|
|
|
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
|
|
fieldforce_g_ad<float,double>(fix->get_mixed_buffers());
|
|
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
|
|
fieldforce_g_ad<double,double>(fix->get_double_buffers());
|
|
} else {
|
|
fieldforce_g_ad<float,float>(fix->get_single_buffers());
|
|
}
|
|
|
|
if (vflag_atom)
|
|
gc6->forward_comm(GridComm::KSPACE,this,7,sizeof(FFT_SCALAR),
|
|
FORWARD_AD_PERATOM_G,gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
|
|
|
|
} else {
|
|
poisson_ik(work1_6, work2_6, density_fft_g, fft1_6, fft2_6,
|
|
nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6,
|
|
nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6,
|
|
nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6,
|
|
nzhi_in_6, energy_6, greensfn_6, fkx_6, fky_6, fkz_6,
|
|
fkx2_6, fky2_6, fkz2_6, vdx_brick_g, vdy_brick_g,
|
|
vdz_brick_g, virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g,
|
|
v1_brick_g, v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g);
|
|
|
|
gc6->forward_comm(GridComm::KSPACE,this,3,sizeof(FFT_SCALAR),FORWARD_IK_G,
|
|
gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
|
|
|
|
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
|
|
fieldforce_g_ik<float,double>(fix->get_mixed_buffers());
|
|
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
|
|
fieldforce_g_ik<double,double>(fix->get_double_buffers());
|
|
} else {
|
|
fieldforce_g_ik<float,float>(fix->get_single_buffers());
|
|
}
|
|
|
|
if (evflag_atom)
|
|
gc6->forward_comm(GridComm::KSPACE,this,6,sizeof(FFT_SCALAR),
|
|
FORWARD_IK_PERATOM_G,gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
|
|
}
|
|
|
|
if (evflag_atom) fieldforce_g_peratom();
|
|
}
|
|
|
|
if (function[2]) {
|
|
//perform calculations for arithmetic mixing
|
|
|
|
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
|
|
particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6,
|
|
part2grid_6, nupper_6, nlower_6,
|
|
nxlo_out_6, nylo_out_6, nzlo_out_6,
|
|
nxhi_out_6, nyhi_out_6, nzhi_out_6,
|
|
fix->get_mixed_buffers());
|
|
make_rho_a<float,double>(fix->get_mixed_buffers());
|
|
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
|
|
particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6,
|
|
part2grid_6, nupper_6, nlower_6, nxlo_out_6,
|
|
nylo_out_6, nzlo_out_6, nxhi_out_6,
|
|
nyhi_out_6, nzhi_out_6,
|
|
fix->get_double_buffers());
|
|
make_rho_a<double,double>(fix->get_double_buffers());
|
|
} else {
|
|
particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6,
|
|
part2grid_6, nupper_6, nlower_6, nxlo_out_6,
|
|
nylo_out_6, nzlo_out_6, nxhi_out_6,
|
|
nyhi_out_6, nzhi_out_6,
|
|
fix->get_single_buffers());
|
|
make_rho_a<float,float>(fix->get_single_buffers());
|
|
}
|
|
|
|
gc->reverse_comm(GridComm::KSPACE,this,7,sizeof(FFT_SCALAR),REVERSE_RHO_A,
|
|
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
|
|
|
|
brick2fft_a();
|
|
|
|
if (differentiation_flag == 1) {
|
|
poisson_ad(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6,
|
|
nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6,
|
|
nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6,
|
|
nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6,
|
|
nzhi_in_6, energy_6, greensfn_6, virial_6, vg_6, vg2_6,
|
|
u_brick_a3, v0_brick_a3, v1_brick_a3, v2_brick_a3,
|
|
v3_brick_a3, v4_brick_a3, v5_brick_a3);
|
|
poisson_2s_ad(density_fft_a0, density_fft_a6, u_brick_a0, v0_brick_a0,
|
|
v1_brick_a0, v2_brick_a0, v3_brick_a0, v4_brick_a0,
|
|
v5_brick_a0, u_brick_a6, v0_brick_a6, v1_brick_a6,
|
|
v2_brick_a6, v3_brick_a6, v4_brick_a6, v5_brick_a6);
|
|
poisson_2s_ad(density_fft_a1, density_fft_a5, u_brick_a1, v0_brick_a1,
|
|
v1_brick_a1, v2_brick_a1, v3_brick_a1, v4_brick_a1,
|
|
v5_brick_a1, u_brick_a5, v0_brick_a5, v1_brick_a5,
|
|
v2_brick_a5, v3_brick_a5, v4_brick_a5, v5_brick_a5);
|
|
poisson_2s_ad(density_fft_a2, density_fft_a4, u_brick_a2, v0_brick_a2,
|
|
v1_brick_a2, v2_brick_a2, v3_brick_a2, v4_brick_a2,
|
|
v5_brick_a2, u_brick_a4, v0_brick_a4, v1_brick_a4,
|
|
v2_brick_a4, v3_brick_a4, v4_brick_a4, v5_brick_a4);
|
|
|
|
gc6->forward_comm(GridComm::KSPACE,this,7,sizeof(FFT_SCALAR),FORWARD_AD_A,
|
|
gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
|
|
|
|
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
|
|
fieldforce_a_ad<float,double>(fix->get_mixed_buffers());
|
|
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
|
|
fieldforce_a_ad<double,double>(fix->get_double_buffers());
|
|
} else {
|
|
fieldforce_a_ad<float,float>(fix->get_single_buffers());
|
|
}
|
|
|
|
if (evflag_atom)
|
|
gc6->forward_comm(GridComm::KSPACE,this,42,sizeof(FFT_SCALAR),
|
|
FORWARD_AD_PERATOM_A,gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
|
|
|
|
} else {
|
|
poisson_ik(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6,
|
|
nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6,
|
|
nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6,
|
|
nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6,
|
|
nzhi_in_6, energy_6, greensfn_6, fkx_6, fky_6, fkz_6,fkx2_6,
|
|
fky2_6, fkz2_6, vdx_brick_a3, vdy_brick_a3, vdz_brick_a3,
|
|
virial_6, vg_6, vg2_6, u_brick_a3, v0_brick_a3, v1_brick_a3,
|
|
v2_brick_a3, v3_brick_a3, v4_brick_a3, v5_brick_a3);
|
|
poisson_2s_ik(density_fft_a0, density_fft_a6, vdx_brick_a0,
|
|
vdy_brick_a0, vdz_brick_a0, vdx_brick_a6, vdy_brick_a6,
|
|
vdz_brick_a6, u_brick_a0, v0_brick_a0, v1_brick_a0,
|
|
v2_brick_a0, v3_brick_a0, v4_brick_a0, v5_brick_a0,
|
|
u_brick_a6, v0_brick_a6, v1_brick_a6, v2_brick_a6,
|
|
v3_brick_a6, v4_brick_a6, v5_brick_a6);
|
|
poisson_2s_ik(density_fft_a1, density_fft_a5, vdx_brick_a1,
|
|
vdy_brick_a1, vdz_brick_a1, vdx_brick_a5, vdy_brick_a5,
|
|
vdz_brick_a5, u_brick_a1, v0_brick_a1, v1_brick_a1,
|
|
v2_brick_a1, v3_brick_a1, v4_brick_a1, v5_brick_a1,
|
|
u_brick_a5, v0_brick_a5, v1_brick_a5, v2_brick_a5,
|
|
v3_brick_a5, v4_brick_a5, v5_brick_a5);
|
|
poisson_2s_ik(density_fft_a2, density_fft_a4, vdx_brick_a2,
|
|
vdy_brick_a2, vdz_brick_a2, vdx_brick_a4, vdy_brick_a4,
|
|
vdz_brick_a4, u_brick_a2, v0_brick_a2, v1_brick_a2,
|
|
v2_brick_a2, v3_brick_a2, v4_brick_a2, v5_brick_a2,
|
|
u_brick_a4, v0_brick_a4, v1_brick_a4, v2_brick_a4,
|
|
v3_brick_a4, v4_brick_a4, v5_brick_a4);
|
|
|
|
gc6->forward_comm(GridComm::KSPACE,this,18,sizeof(FFT_SCALAR),FORWARD_IK_A,
|
|
gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
|
|
|
|
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
|
|
fieldforce_a_ik<float,double>(fix->get_mixed_buffers());
|
|
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
|
|
fieldforce_a_ik<double,double>(fix->get_double_buffers());
|
|
} else {
|
|
fieldforce_a_ik<float,float>(fix->get_single_buffers());
|
|
}
|
|
|
|
if (evflag_atom)
|
|
gc6->forward_comm(GridComm::KSPACE,this,49,sizeof(FFT_SCALAR),
|
|
FORWARD_IK_PERATOM_A,gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
|
|
}
|
|
|
|
if (evflag_atom) fieldforce_a_peratom();
|
|
}
|
|
|
|
if (function[3]) {
|
|
|
|
// perform calculations if no mixing rule applies
|
|
|
|
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
|
|
particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6,
|
|
part2grid_6, nupper_6, nlower_6, nxlo_out_6,
|
|
nylo_out_6, nzlo_out_6, nxhi_out_6,
|
|
nyhi_out_6, nzhi_out_6,
|
|
fix->get_mixed_buffers());
|
|
make_rho_none<float,double>(fix->get_mixed_buffers());
|
|
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
|
|
particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6,
|
|
part2grid_6, nupper_6, nlower_6, nxlo_out_6,
|
|
nylo_out_6, nzlo_out_6, nxhi_out_6,
|
|
nyhi_out_6, nzhi_out_6,
|
|
fix->get_double_buffers());
|
|
make_rho_none<double,double>(fix->get_double_buffers());
|
|
} else {
|
|
particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6,
|
|
part2grid_6, nupper_6, nlower_6, nxlo_out_6,
|
|
nylo_out_6, nzlo_out_6, nxhi_out_6,
|
|
nyhi_out_6, nzhi_out_6,
|
|
fix->get_single_buffers());
|
|
make_rho_none<float,float>(fix->get_single_buffers());
|
|
}
|
|
|
|
gc->reverse_comm(GridComm::KSPACE,this,1,sizeof(FFT_SCALAR),REVERSE_RHO_NONE,
|
|
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
|
|
|
|
brick2fft_none();
|
|
|
|
if (differentiation_flag == 1) {
|
|
|
|
int n = 0;
|
|
for (int k = 0; k<nsplit_alloc/2; k++) {
|
|
poisson_none_ad(n,n+1,density_fft_none[n],density_fft_none[n+1],
|
|
u_brick_none[n],u_brick_none[n+1],
|
|
v0_brick_none, v1_brick_none, v2_brick_none,
|
|
v3_brick_none, v4_brick_none, v5_brick_none);
|
|
n += 2;
|
|
}
|
|
|
|
gc6->forward_comm(GridComm::KSPACE,this,1,sizeof(FFT_SCALAR),
|
|
FORWARD_AD_NONE,gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
|
|
|
|
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
|
|
fieldforce_none_ad<float,double>(fix->get_mixed_buffers());
|
|
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
|
|
fieldforce_none_ad<double,double>(fix->get_double_buffers());
|
|
} else {
|
|
fieldforce_none_ad<float,float>(fix->get_single_buffers());
|
|
}
|
|
|
|
if (vflag_atom)
|
|
gc6->forward_comm(GridComm::KSPACE,this,6,sizeof(FFT_SCALAR),
|
|
FORWARD_AD_PERATOM_NONE,
|
|
gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
|
|
|
|
} else {
|
|
int n = 0;
|
|
for (int k = 0; k<nsplit_alloc/2; k++) {
|
|
|
|
poisson_none_ik(n,n+1,density_fft_none[n], density_fft_none[n+1],
|
|
vdx_brick_none[n], vdy_brick_none[n],
|
|
vdz_brick_none[n], vdx_brick_none[n+1],
|
|
vdy_brick_none[n+1], vdz_brick_none[n+1],
|
|
u_brick_none, v0_brick_none, v1_brick_none,
|
|
v2_brick_none, v3_brick_none, v4_brick_none,
|
|
v5_brick_none);
|
|
n += 2;
|
|
}
|
|
|
|
gc6->forward_comm(GridComm::KSPACE,this,3,sizeof(FFT_SCALAR),
|
|
FORWARD_IK_NONE,
|
|
gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
|
|
|
|
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
|
|
fieldforce_none_ik<float,double>(fix->get_mixed_buffers());
|
|
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
|
|
fieldforce_none_ik<double,double>(fix->get_double_buffers());
|
|
} else {
|
|
fieldforce_none_ik<float,float>(fix->get_single_buffers());
|
|
}
|
|
|
|
if (evflag_atom)
|
|
gc6->forward_comm(GridComm::KSPACE,this,7,sizeof(FFT_SCALAR),
|
|
FORWARD_IK_PERATOM_NONE,
|
|
gc6_buf1,gc6_buf2,MPI_FFT_SCALAR);
|
|
}
|
|
|
|
if (evflag_atom) fieldforce_none_peratom();
|
|
}
|
|
|
|
// update qsum and qsqsum, if atom count has changed and energy needed
|
|
|
|
if ((eflag_global || eflag_atom) && atom->natoms != natoms_original) {
|
|
qsum_qsq();
|
|
natoms_original = atom->natoms;
|
|
}
|
|
|
|
// sum energy across procs and add in volume-dependent term
|
|
|
|
const double qscale = force->qqrd2e * scale;
|
|
if (eflag_global) {
|
|
double energy_all;
|
|
MPI_Allreduce(&energy_1,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
|
|
energy_1 = energy_all;
|
|
MPI_Allreduce(&energy_6,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
|
|
energy_6 = energy_all;
|
|
|
|
energy_1 *= 0.5*volume;
|
|
energy_6 *= 0.5*volume;
|
|
|
|
energy_1 -= g_ewald*qsqsum/MY_PIS +
|
|
MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume);
|
|
energy_6 += - MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumij +
|
|
1.0/12.0*pow(g_ewald_6,6)*csum;
|
|
energy_1 *= qscale;
|
|
}
|
|
|
|
// sum virial across procs
|
|
|
|
if (vflag_global) {
|
|
double virial_all[6];
|
|
MPI_Allreduce(virial_1,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
|
|
for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i];
|
|
MPI_Allreduce(virial_6,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
|
|
for (i = 0; i < 6; i++) virial[i] += 0.5*volume*virial_all[i];
|
|
if (function[1]+function[2]+function[3]) {
|
|
double a = MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumij;
|
|
virial[0] -= a;
|
|
virial[1] -= a;
|
|
virial[2] -= a;
|
|
}
|
|
}
|
|
|
|
if (eflag_atom) {
|
|
if (function[0]) {
|
|
double *q = atom->q;
|
|
for (i = 0; i < atom->nlocal; i++) {
|
|
eatom[i] -= qscale*g_ewald*q[i]*q[i]/MY_PIS + qscale*MY_PI2*q[i]*
|
|
qsum / (g_ewald*g_ewald*volume); //coulomb self energy correction
|
|
}
|
|
}
|
|
if (function[1] + function[2] + function[3]) {
|
|
int tmp;
|
|
for (i = 0; i < atom->nlocal; i++) {
|
|
tmp = atom->type[i];
|
|
eatom[i] += - MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumi[tmp] +
|
|
1.0/12.0*pow(g_ewald_6,6)*cii[tmp];
|
|
}
|
|
}
|
|
}
|
|
|
|
if (vflag_atom) {
|
|
if (function[1] + function[2] + function[3]) {
|
|
int tmp;
|
|
for (i = 0; i < atom->nlocal; i++) {
|
|
tmp = atom->type[i];
|
|
//dispersion self virial correction
|
|
for (int n = 0; n < 3; n++) vatom[i][n] -= MY_PI*MY_PIS/(6*volume)*
|
|
pow(g_ewald_6,3)*csumi[tmp];
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// 2d slab correction
|
|
|
|
if (slabflag) slabcorr(eflag);
|
|
if (function[0]) energy += energy_1;
|
|
if (function[1] + function[2] + function[3]) energy += energy_6;
|
|
|
|
// convert atoms back from lamda to box coords
|
|
|
|
if (triclinic) domain->lamda2x(atom->nlocal);
|
|
}
|
|
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
/* ----------------------------------------------------------------------
|
|
find center grid pt for each of my particles
|
|
check that full stencil for the particle will fit in my 3d brick
|
|
store central grid pt indices in part2grid array
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class flt_t, class acc_t>
|
|
void PPPMDispIntel::particle_map(double delx, double dely, double delz,
|
|
double sft, int** p2g, int nup, int nlow,
|
|
int nxlo, int nylo, int nzlo,
|
|
int nxhi, int nyhi, int nzhi,
|
|
IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|
{
|
|
int nlocal = atom->nlocal;
|
|
int nthr = comm->nthreads;
|
|
|
|
if (!std::isfinite(boxlo[0]) || !std::isfinite(boxlo[1]) || !std::isfinite(boxlo[2]))
|
|
error->one(FLERR,"Non-numeric box dimensions - simulation unstable");
|
|
|
|
int flag = 0;
|
|
|
|
#if defined(_OPENMP)
|
|
#pragma omp parallel LMP_DEFAULT_NONE \
|
|
shared(nlocal, nthr, delx, dely, delz, sft, p2g, nup, nlow, nxlo,\
|
|
nylo, nzlo, nxhi, nyhi, nzhi) reduction(+:flag) if (!_use_lrt)
|
|
#endif
|
|
{
|
|
double **x = atom->x;
|
|
const flt_t lo0 = boxlo[0];
|
|
const flt_t lo1 = boxlo[1];
|
|
const flt_t lo2 = boxlo[2];
|
|
const flt_t xi = delx;
|
|
const flt_t yi = dely;
|
|
const flt_t zi = delz;
|
|
const flt_t fshift = sft;
|
|
|
|
|
|
int iifrom, iito, tid;
|
|
IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T));
|
|
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd reduction(+:flag)
|
|
#else
|
|
#pragma simd reduction(+:flag)
|
|
#endif
|
|
#pragma vector aligned
|
|
#endif
|
|
for (int i = iifrom; i < iito; i++) {
|
|
|
|
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
|
|
// current particle coord can be outside global and local box
|
|
// add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1
|
|
|
|
int nx = static_cast<int> ((x[i][0]-lo0)*xi+fshift) - OFFSET;
|
|
int ny = static_cast<int> ((x[i][1]-lo1)*yi+fshift) - OFFSET;
|
|
int nz = static_cast<int> ((x[i][2]-lo2)*zi+fshift) - OFFSET;
|
|
|
|
p2g[i][0] = nx;
|
|
p2g[i][1] = ny;
|
|
p2g[i][2] = nz;
|
|
|
|
// check that entire stencil around nx,ny,nz will fit in my 3d brick
|
|
|
|
if (nx+nlow < nxlo || nx+nup > nxhi ||
|
|
ny+nlow < nylo || ny+nup > nyhi ||
|
|
nz+nlow < nzlo || nz+nup > nzhi)
|
|
flag = 1;
|
|
}
|
|
}
|
|
|
|
if (flag) error->one(FLERR,"Out of range atoms - cannot compute PPPMDisp");
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
create discretized "density" on section of global grid due to my particles
|
|
density(x,y,z) = charge "density" at grid points of my 3d brick
|
|
(nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
|
|
in global grid
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class flt_t, class acc_t, int use_table>
|
|
void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|
{
|
|
// clear 3d density array
|
|
|
|
FFT_SCALAR * _noalias global_density =
|
|
&(density_brick[nzlo_out][nylo_out][nxlo_out]);
|
|
|
|
// loop over my charges, add their contribution to nearby grid points
|
|
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
|
|
// (dx,dy,dz) = distance to "lower left" grid pt
|
|
// (mx,my,mz) = global coords of moving stencil pt
|
|
|
|
int nlocal = atom->nlocal;
|
|
int nthr = comm->nthreads;
|
|
|
|
#if defined(_OPENMP)
|
|
#pragma omp parallel LMP_DEFAULT_NONE \
|
|
shared(nthr, nlocal, global_density) if (!_use_lrt)
|
|
#endif
|
|
{
|
|
double *q = atom->q;
|
|
double **x = atom->x;
|
|
|
|
const int nix = nxhi_out - nxlo_out + 1;
|
|
const int niy = nyhi_out - nylo_out + 1;
|
|
|
|
const flt_t lo0 = boxlo[0];
|
|
const flt_t lo1 = boxlo[1];
|
|
const flt_t lo2 = boxlo[2];
|
|
const flt_t xi = delxinv;
|
|
const flt_t yi = delyinv;
|
|
const flt_t zi = delzinv;
|
|
const flt_t fshiftone = shiftone;
|
|
const flt_t fdelvolinv = delvolinv;
|
|
|
|
int ifrom, ito, tid;
|
|
IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
|
|
FFT_SCALAR * _noalias my_density = tid == 0 ? global_density :
|
|
perthread_density[tid - 1];
|
|
// clear 3d density array
|
|
memset(my_density, 0, ngrid * sizeof(FFT_SCALAR));
|
|
|
|
for (int i = ifrom; i < ito; i++) {
|
|
|
|
int nx = part2grid[i][0];
|
|
int ny = part2grid[i][1];
|
|
int nz = part2grid[i][2];
|
|
|
|
int nysum = nlower + ny - nylo_out;
|
|
int nxsum = nlower + nx - nxlo_out;
|
|
int nzsum = (nlower + nz - nzlo_out)*nix*niy + nysum*nix + nxsum;
|
|
|
|
FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
|
|
FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
|
|
FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
|
|
|
|
_alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
|
|
if (use_table) {
|
|
dx = dx*half_rho_scale + half_rho_scale_plus;
|
|
int idx = dx;
|
|
dy = dy*half_rho_scale + half_rho_scale_plus;
|
|
int idy = dy;
|
|
dz = dz*half_rho_scale + half_rho_scale_plus;
|
|
int idz = dz;
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
|
rho[0][k] = rho_lookup[idx][k];
|
|
rho[1][k] = rho_lookup[idy][k];
|
|
rho[2][k] = rho_lookup[idz][k];
|
|
}
|
|
} else {
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = nlower; k <= nupper; k++) {
|
|
FFT_SCALAR r1,r2,r3;
|
|
r1 = r2 = r3 = ZEROF;
|
|
|
|
for (int l = order-1; l >= 0; l--) {
|
|
r1 = rho_coeff[l][k] + r1*dx;
|
|
r2 = rho_coeff[l][k] + r2*dy;
|
|
r3 = rho_coeff[l][k] + r3*dz;
|
|
}
|
|
rho[0][k-nlower] = r1;
|
|
rho[1][k-nlower] = r2;
|
|
rho[2][k-nlower] = r3;
|
|
}
|
|
}
|
|
|
|
FFT_SCALAR z0 = fdelvolinv * q[i];
|
|
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int n = 0; n < order; n++) {
|
|
int mz = n*nix*niy + nzsum;
|
|
FFT_SCALAR y0 = z0*rho[2][n];
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int m = 0; m < order; m++) {
|
|
int mzy = m*nix + mz;
|
|
FFT_SCALAR x0 = y0*rho[1][m];
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int l = 0; l < order; l++) {
|
|
int mzyx = l + mzy;
|
|
my_density[mzyx] += x0*rho[0][l];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// reduce all the perthread_densities into global_density
|
|
#if defined(_OPENMP)
|
|
#pragma omp parallel LMP_DEFAULT_NONE \
|
|
shared(nthr, global_density) if (!_use_lrt)
|
|
#endif
|
|
{
|
|
int ifrom, ito, tid;
|
|
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr);
|
|
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int i = ifrom; i < ito; i++) {
|
|
for (int j = 1; j < nthr; j++) {
|
|
global_density[i] += perthread_density[j-1][i];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
create discretized "density" on section of global grid due to my particles
|
|
density(x,y,z) = dispersion "density" at grid points of my 3d brick
|
|
(nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
|
|
in global grid --- geometric mixing
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class flt_t, class acc_t, int use_table>
|
|
void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|
{
|
|
// clear 3d density array
|
|
|
|
FFT_SCALAR * _noalias global_density =
|
|
&(density_brick_g[nzlo_out_6][nylo_out_6][nxlo_out_6]);
|
|
|
|
// loop over my charges, add their contribution to nearby grid points
|
|
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
|
|
// (dx,dy,dz) = distance to "lower left" grid pt
|
|
// (mx,my,mz) = global coords of moving stencil pt
|
|
|
|
int nlocal = atom->nlocal;
|
|
int nthr = comm->nthreads;
|
|
|
|
#if defined(_OPENMP)
|
|
#pragma omp parallel LMP_DEFAULT_NONE \
|
|
shared(nthr, nlocal, global_density) if (!_use_lrt)
|
|
#endif
|
|
{
|
|
int type;
|
|
double **x = atom->x;
|
|
|
|
const int nix = nxhi_out_6 - nxlo_out_6 + 1;
|
|
const int niy = nyhi_out_6 - nylo_out_6 + 1;
|
|
|
|
const flt_t lo0 = boxlo[0];
|
|
const flt_t lo1 = boxlo[1];
|
|
const flt_t lo2 = boxlo[2];
|
|
const flt_t xi = delxinv_6;
|
|
const flt_t yi = delyinv_6;
|
|
const flt_t zi = delzinv_6;
|
|
const flt_t fshiftone = shiftone_6;
|
|
const flt_t fdelvolinv = delvolinv_6;
|
|
|
|
int ifrom, ito, tid;
|
|
IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
|
|
FFT_SCALAR * _noalias my_density = tid == 0 ? global_density :
|
|
perthread_density[tid - 1];
|
|
|
|
// clear 3d density array
|
|
memset(my_density, 0, ngrid_6 * sizeof(FFT_SCALAR));
|
|
|
|
for (int i = ifrom; i < ito; i++) {
|
|
|
|
int nx = part2grid_6[i][0];
|
|
int ny = part2grid_6[i][1];
|
|
int nz = part2grid_6[i][2];
|
|
|
|
int nysum = nlower_6 + ny - nylo_out_6;
|
|
int nxsum = nlower_6 + nx - nxlo_out_6;
|
|
int nzsum = (nlower_6 + nz - nzlo_out_6)*nix*niy + nysum*nix + nxsum;
|
|
|
|
FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
|
|
FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
|
|
FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
|
|
|
|
_alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
|
|
if (use_table) {
|
|
dx = dx*half_rho_scale + half_rho_scale_plus;
|
|
int idx = dx;
|
|
dy = dy*half_rho_scale + half_rho_scale_plus;
|
|
int idy = dy;
|
|
dz = dz*half_rho_scale + half_rho_scale_plus;
|
|
int idz = dz;
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
|
rho[0][k] = rho6_lookup[idx][k];
|
|
rho[1][k] = rho6_lookup[idy][k];
|
|
rho[2][k] = rho6_lookup[idz][k];
|
|
}
|
|
} else {
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = nlower_6; k <= nupper_6; k++) {
|
|
FFT_SCALAR r1,r2,r3;
|
|
r1 = r2 = r3 = ZEROF;
|
|
|
|
for (int l = order_6-1; l >= 0; l--) {
|
|
r1 = rho_coeff_6[l][k] + r1*dx;
|
|
r2 = rho_coeff_6[l][k] + r2*dy;
|
|
r3 = rho_coeff_6[l][k] + r3*dz;
|
|
}
|
|
rho[0][k-nlower_6] = r1;
|
|
rho[1][k-nlower_6] = r2;
|
|
rho[2][k-nlower_6] = r3;
|
|
}
|
|
}
|
|
|
|
type = atom->type[i];
|
|
FFT_SCALAR z0 = fdelvolinv * B[type];
|
|
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int n = 0; n < order_6; n++) {
|
|
int mz = n*nix*niy + nzsum;
|
|
FFT_SCALAR y0 = z0*rho[2][n];
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int m = 0; m < order_6; m++) {
|
|
int mzy = m*nix + mz;
|
|
FFT_SCALAR x0 = y0*rho[1][m];
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int l = 0; l < order; l++) {
|
|
int mzyx = l + mzy;
|
|
my_density[mzyx] += x0*rho[0][l];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// reduce all the perthread_densities into global_density
|
|
#if defined(_OPENMP)
|
|
#pragma omp parallel LMP_DEFAULT_NONE \
|
|
shared(nthr, global_density) if (!_use_lrt)
|
|
#endif
|
|
{
|
|
int ifrom, ito, tid;
|
|
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6, nthr);
|
|
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int i = ifrom; i < ito; i++) {
|
|
for (int j = 1; j < nthr; j++) {
|
|
global_density[i] += perthread_density[j-1][i];
|
|
}
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
create discretized "density" on section of global grid due to my particles
|
|
density(x,y,z) = dispersion "density" at grid points of my 3d brick
|
|
(nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
|
|
in global grid --- arithmetic mixing
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class flt_t, class acc_t, int use_table>
|
|
void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|
{
|
|
// clear 3d density array
|
|
|
|
memset(&(density_brick_a0[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,ngrid_6*sizeof(FFT_SCALAR));
|
|
memset(&(density_brick_a1[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,ngrid_6*sizeof(FFT_SCALAR));
|
|
memset(&(density_brick_a2[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,ngrid_6*sizeof(FFT_SCALAR));
|
|
memset(&(density_brick_a3[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,ngrid_6*sizeof(FFT_SCALAR));
|
|
memset(&(density_brick_a4[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,ngrid_6*sizeof(FFT_SCALAR));
|
|
memset(&(density_brick_a5[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,ngrid_6*sizeof(FFT_SCALAR));
|
|
memset(&(density_brick_a6[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,ngrid_6*sizeof(FFT_SCALAR));
|
|
|
|
// loop over my charges, add their contribution to nearby grid points
|
|
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
|
|
// (dx,dy,dz) = distance to "lower left" grid pt
|
|
// (mx,my,mz) = global coords of moving stencil pt
|
|
|
|
int nlocal = atom->nlocal;
|
|
double **x = atom->x;
|
|
|
|
const flt_t lo0 = boxlo[0];
|
|
const flt_t lo1 = boxlo[1];
|
|
const flt_t lo2 = boxlo[2];
|
|
const flt_t xi = delxinv_6;
|
|
const flt_t yi = delyinv_6;
|
|
const flt_t zi = delzinv_6;
|
|
const flt_t fshiftone = shiftone_6;
|
|
const flt_t fdelvolinv = delvolinv_6;
|
|
|
|
for (int i = 0; i < nlocal; i++) {
|
|
|
|
int nx = part2grid_6[i][0];
|
|
int ny = part2grid_6[i][1];
|
|
int nz = part2grid_6[i][2];
|
|
|
|
int nxsum = nx + nlower_6;
|
|
int nysum = ny + nlower_6;
|
|
int nzsum = nz + nlower_6;
|
|
|
|
FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
|
|
FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
|
|
FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
|
|
|
|
_alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
|
|
if (use_table) {
|
|
dx = dx*half_rho_scale + half_rho_scale_plus;
|
|
int idx = dx;
|
|
dy = dy*half_rho_scale + half_rho_scale_plus;
|
|
int idy = dy;
|
|
dz = dz*half_rho_scale + half_rho_scale_plus;
|
|
int idz = dz;
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
|
rho[0][k] = rho6_lookup[idx][k];
|
|
rho[1][k] = rho6_lookup[idy][k];
|
|
rho[2][k] = rho6_lookup[idz][k];
|
|
}
|
|
} else {
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = nlower_6; k <= nupper_6; k++) {
|
|
FFT_SCALAR r1,r2,r3;
|
|
r1 = r2 = r3 = ZEROF;
|
|
|
|
for (int l = order_6-1; l >= 0; l--) {
|
|
r1 = rho_coeff_6[l][k] + r1*dx;
|
|
r2 = rho_coeff_6[l][k] + r2*dy;
|
|
r3 = rho_coeff_6[l][k] + r3*dz;
|
|
}
|
|
rho[0][k-nlower_6] = r1;
|
|
rho[1][k-nlower_6] = r2;
|
|
rho[2][k-nlower_6] = r3;
|
|
}
|
|
}
|
|
|
|
const int type = atom->type[i];
|
|
FFT_SCALAR z0 = fdelvolinv;
|
|
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int n = 0; n < order_6; n++) {
|
|
int mz = n + nzsum;
|
|
FFT_SCALAR y0 = z0*rho[2][n];
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int m = 0; m < order_6; m++) {
|
|
int my = m + nysum;
|
|
FFT_SCALAR x0 = y0*rho[1][m];
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int l = 0; l < order; l++) {
|
|
int mx = l + nxsum;
|
|
FFT_SCALAR w = x0*rho[0][l];
|
|
density_brick_a0[mz][my][mx] += w*B[7*type];
|
|
density_brick_a1[mz][my][mx] += w*B[7*type+1];
|
|
density_brick_a2[mz][my][mx] += w*B[7*type+2];
|
|
density_brick_a3[mz][my][mx] += w*B[7*type+3];
|
|
density_brick_a4[mz][my][mx] += w*B[7*type+4];
|
|
density_brick_a5[mz][my][mx] += w*B[7*type+5];
|
|
density_brick_a6[mz][my][mx] += w*B[7*type+6];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
create discretized "density" on section of global grid due to my particles
|
|
density(x,y,z) = dispersion "density" at grid points of my 3d brick
|
|
(nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
|
|
in global grid --- case when mixing rules don't apply
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class flt_t, class acc_t, int use_table>
|
|
void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|
{
|
|
|
|
FFT_SCALAR * _noalias global_density = &(density_brick_none[0][nzlo_out_6][nylo_out_6][nxlo_out_6]);
|
|
|
|
// loop over my charges, add their contribution to nearby grid points
|
|
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
|
|
// (dx,dy,dz) = distance to "lower left" grid pt
|
|
// (mx,my,mz) = global coords of moving stencil pt
|
|
|
|
int nlocal = atom->nlocal;
|
|
int nthr = comm->nthreads;
|
|
|
|
#if defined(_OPENMP)
|
|
#pragma omp parallel LMP_DEFAULT_NONE \
|
|
shared(nthr, nlocal, global_density) if (!_use_lrt)
|
|
#endif
|
|
{
|
|
double **x = atom->x;
|
|
|
|
const int nix = nxhi_out_6 - nxlo_out_6 + 1;
|
|
const int niy = nyhi_out_6 - nylo_out_6 + 1;
|
|
|
|
const flt_t lo0 = boxlo[0];
|
|
const flt_t lo1 = boxlo[1];
|
|
const flt_t lo2 = boxlo[2];
|
|
const flt_t xi = delxinv_6;
|
|
const flt_t yi = delyinv_6;
|
|
const flt_t zi = delzinv_6;
|
|
const flt_t fshiftone = shiftone_6;
|
|
const flt_t fdelvolinv = delvolinv_6;
|
|
|
|
int ifrom, ito, tid;
|
|
IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
|
|
FFT_SCALAR * _noalias my_density = tid == 0 ? global_density :
|
|
perthread_density[tid - 1];
|
|
// clear 3d density array
|
|
memset(my_density, 0, ngrid_6 * sizeof(FFT_SCALAR));
|
|
|
|
for (int i = ifrom; i < ito; i++) {
|
|
|
|
int nx = part2grid_6[i][0];
|
|
int ny = part2grid_6[i][1];
|
|
int nz = part2grid_6[i][2];
|
|
|
|
int nysum = nlower_6 + ny - nylo_out_6;
|
|
int nxsum = nlower_6 + nx - nxlo_out_6;
|
|
int nzsum = (nlower_6 + nz - nzlo_out_6)*nix*niy + nysum*nix + nxsum;
|
|
|
|
FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
|
|
FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
|
|
FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
|
|
|
|
_alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
|
|
if (use_table) {
|
|
dx = dx*half_rho_scale + half_rho_scale_plus;
|
|
int idx = dx;
|
|
dy = dy*half_rho_scale + half_rho_scale_plus;
|
|
int idy = dy;
|
|
dz = dz*half_rho_scale + half_rho_scale_plus;
|
|
int idz = dz;
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
|
rho[0][k] = rho6_lookup[idx][k];
|
|
rho[1][k] = rho6_lookup[idy][k];
|
|
rho[2][k] = rho6_lookup[idz][k];
|
|
}
|
|
} else {
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = nlower_6; k <= nupper_6; k++) {
|
|
FFT_SCALAR r1,r2,r3;
|
|
r1 = r2 = r3 = ZEROF;
|
|
|
|
for (int l = order_6-1; l >= 0; l--) {
|
|
r1 = rho_coeff_6[l][k] + r1*dx;
|
|
r2 = rho_coeff_6[l][k] + r2*dy;
|
|
r3 = rho_coeff_6[l][k] + r3*dz;
|
|
}
|
|
rho[0][k-nlower_6] = r1;
|
|
rho[1][k-nlower_6] = r2;
|
|
rho[2][k-nlower_6] = r3;
|
|
}
|
|
}
|
|
|
|
FFT_SCALAR z0 = fdelvolinv;
|
|
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int n = 0; n < order_6; n++) {
|
|
int mz = n*nix*niy + nzsum;
|
|
FFT_SCALAR y0 = z0*rho[2][n];
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int m = 0; m < order_6; m++) {
|
|
int mzy = m*nix + mz;
|
|
FFT_SCALAR x0 = y0*rho[1][m];
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int l = 0; l < order; l++) {
|
|
int mzyx = l + mzy;
|
|
for (int k = 0; k < nsplit; k++)
|
|
my_density[mzyx + k*ngrid_6] += x0*rho[0][l];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// reduce all the perthread_densities into global_density
|
|
#if defined(_OPENMP)
|
|
#pragma omp parallel LMP_DEFAULT_NONE \
|
|
shared(nthr, global_density) if (!_use_lrt)
|
|
#endif
|
|
{
|
|
int ifrom, ito, tid;
|
|
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6*nsplit, nthr);
|
|
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int i = ifrom; i < ito; i++) {
|
|
for (int j = 1; j < nthr; j++) {
|
|
global_density[i] += perthread_density[j-1][i];
|
|
}
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
interpolate from grid to get electric field & force on my particles
|
|
for ik scheme
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class flt_t, class acc_t, int use_table>
|
|
void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|
{
|
|
|
|
// loop over my charges, interpolate electric field from nearby grid points
|
|
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
|
|
// (dx,dy,dz) = distance to "lower left" grid pt
|
|
// (mx,my,mz) = global coords of moving stencil pt
|
|
// ek = 3 components of E-field on particle
|
|
|
|
//double *q = atom->q;
|
|
//double **x = atom->x;
|
|
//double **f = atom->f;
|
|
|
|
int nlocal = atom->nlocal;
|
|
int nthr = comm->nthreads;
|
|
|
|
|
|
#if defined(_OPENMP)
|
|
#pragma omp parallel LMP_DEFAULT_NONE \
|
|
shared(nlocal, nthr) if (!_use_lrt)
|
|
#endif
|
|
{
|
|
|
|
double *q = atom->q;
|
|
double **x = atom->x;
|
|
double **f = atom->f;
|
|
|
|
const flt_t lo0 = boxlo[0];
|
|
const flt_t lo1 = boxlo[1];
|
|
const flt_t lo2 = boxlo[2];
|
|
const flt_t xi = delxinv;
|
|
const flt_t yi = delyinv;
|
|
const flt_t zi = delzinv;
|
|
const flt_t fshiftone = shiftone;
|
|
const flt_t fqqrd2es = qqrd2e * scale;
|
|
|
|
int ifrom, ito, tid;
|
|
IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
|
|
|
|
_alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
|
|
_alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
|
|
|
|
for (int i = ifrom; i < ito; i++) {
|
|
int nx = part2grid[i][0];
|
|
int ny = part2grid[i][1];
|
|
int nz = part2grid[i][2];
|
|
|
|
int nxsum = nx + nlower;
|
|
int nysum = ny + nlower;
|
|
int nzsum = nz + nlower;;
|
|
|
|
FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
|
|
FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
|
|
FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
|
|
|
|
if (use_table) {
|
|
dx = dx*half_rho_scale + half_rho_scale_plus;
|
|
int idx = dx;
|
|
dy = dy*half_rho_scale + half_rho_scale_plus;
|
|
int idy = dy;
|
|
dz = dz*half_rho_scale + half_rho_scale_plus;
|
|
int idz = dz;
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
|
rho0[k] = rho_lookup[idx][k];
|
|
rho1[k] = rho_lookup[idy][k];
|
|
rho2[k] = rho_lookup[idz][k];
|
|
}
|
|
} else {
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = nlower; k <= nupper; k++) {
|
|
FFT_SCALAR r1 = rho_coeff[order-1][k];
|
|
FFT_SCALAR r2 = rho_coeff[order-1][k];
|
|
FFT_SCALAR r3 = rho_coeff[order-1][k];
|
|
for (int l = order-2; l >= 0; l--) {
|
|
r1 = rho_coeff[l][k] + r1*dx;
|
|
r2 = rho_coeff[l][k] + r2*dy;
|
|
r3 = rho_coeff[l][k] + r3*dz;
|
|
}
|
|
|
|
rho0[k-nlower] = r1;
|
|
rho1[k-nlower] = r2;
|
|
rho2[k-nlower] = r3;
|
|
}
|
|
}
|
|
|
|
_alignvar(FFT_SCALAR ekx_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR eky_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int n = 0; n < order; n++) {
|
|
int mz = n+nzsum;
|
|
FFT_SCALAR z0 = rho2[n];
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int m = 0; m < order; m++) {
|
|
int my = m+nysum;
|
|
FFT_SCALAR y0 = z0*rho1[m];
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int l = 0; l < order; l++) {
|
|
int mx = l+nxsum;
|
|
FFT_SCALAR x0 = y0*rho0[l];
|
|
ekx_arr[l] -= x0*vdx_brick[mz][my][mx];
|
|
eky_arr[l] -= x0*vdy_brick[mz][my][mx];
|
|
ekz_arr[l] -= x0*vdz_brick[mz][my][mx];
|
|
|
|
}
|
|
}
|
|
}
|
|
|
|
FFT_SCALAR ekx, eky, ekz;
|
|
ekx = eky = ekz = ZEROF;
|
|
|
|
for (int l = 0; l < order; l++) {
|
|
ekx += ekx_arr[l];
|
|
eky += eky_arr[l];
|
|
ekz += ekz_arr[l];
|
|
}
|
|
|
|
// convert E-field to force
|
|
|
|
const flt_t qfactor = fqqrd2es * q[i];
|
|
f[i][0] += qfactor*ekx;
|
|
f[i][1] += qfactor*eky;
|
|
if (slabflag != 2) f[i][2] += qfactor*ekz;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
interpolate from grid to get electric field & force on my particles
|
|
for ad scheme
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class flt_t, class acc_t, int use_table>
|
|
void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|
{
|
|
|
|
// loop over my charges, interpolate electric field from nearby grid points
|
|
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
|
|
// (dx,dy,dz) = distance to "lower left" grid pt
|
|
// (mx,my,mz) = global coords of moving stencil pt
|
|
// ek = 3 components of E-field on particle
|
|
|
|
//double *q = atom->q;
|
|
//double **x = atom->x;
|
|
//double **f = atom->f;
|
|
|
|
int nlocal = atom->nlocal;
|
|
int nthr = comm->nthreads;
|
|
|
|
FFT_SCALAR * _noalias const particle_ekx = this->particle_ekx;
|
|
FFT_SCALAR * _noalias const particle_eky = this->particle_eky;
|
|
FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz;
|
|
|
|
#if defined(_OPENMP)
|
|
#pragma omp parallel LMP_DEFAULT_NONE \
|
|
shared(nlocal, nthr) if (!_use_lrt)
|
|
#endif
|
|
{
|
|
|
|
double *prd;
|
|
if (triclinic == 0) prd = domain->prd;
|
|
else prd = domain->prd_lamda;
|
|
|
|
double *q = atom->q;
|
|
double **x = atom->x;
|
|
double **f = atom->f;
|
|
const flt_t ftwo_pi = MY_PI * 2.0;
|
|
const flt_t ffour_pi = MY_PI * 4.0;
|
|
|
|
const flt_t lo0 = boxlo[0];
|
|
const flt_t lo1 = boxlo[1];
|
|
const flt_t lo2 = boxlo[2];
|
|
const flt_t xi = delxinv;
|
|
const flt_t yi = delyinv;
|
|
const flt_t zi = delzinv;
|
|
const flt_t fshiftone = shiftone;
|
|
const flt_t fqqrd2es = qqrd2e * scale;
|
|
|
|
const double xprd = prd[0];
|
|
const double yprd = prd[1];
|
|
const double zprd = prd[2]*slab_volfactor;
|
|
|
|
const flt_t hx_inv = nx_pppm/xprd;
|
|
const flt_t hy_inv = ny_pppm/yprd;
|
|
const flt_t hz_inv = nz_pppm/zprd;
|
|
|
|
const flt_t fsf_coeff0 = sf_coeff[0];
|
|
const flt_t fsf_coeff1 = sf_coeff[1];
|
|
const flt_t fsf_coeff2 = sf_coeff[2];
|
|
const flt_t fsf_coeff3 = sf_coeff[3];
|
|
const flt_t fsf_coeff4 = sf_coeff[4];
|
|
const flt_t fsf_coeff5 = sf_coeff[5];
|
|
|
|
int ifrom, ito, tid;
|
|
IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
|
|
|
|
_alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
|
|
for (int i = ifrom; i < ito; i++) {
|
|
int nx = part2grid[i][0];
|
|
int ny = part2grid[i][1];
|
|
int nz = part2grid[i][2];
|
|
FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
|
|
FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
|
|
FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
|
|
|
|
int nxsum = nx + nlower;
|
|
int nysum = ny + nlower;
|
|
int nzsum = nz + nlower;
|
|
|
|
if (use_table) {
|
|
dx = dx*half_rho_scale + half_rho_scale_plus;
|
|
int idx = dx;
|
|
dy = dy*half_rho_scale + half_rho_scale_plus;
|
|
int idy = dy;
|
|
dz = dz*half_rho_scale + half_rho_scale_plus;
|
|
int idz = dz;
|
|
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
|
rho[0][k] = rho_lookup[idx][k];
|
|
rho[1][k] = rho_lookup[idy][k];
|
|
rho[2][k] = rho_lookup[idz][k];
|
|
drho[0][k] = drho_lookup[idx][k];
|
|
drho[1][k] = drho_lookup[idy][k];
|
|
drho[2][k] = drho_lookup[idz][k];
|
|
}
|
|
} else {
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = nlower; k <= nupper; k++) {
|
|
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
|
|
dr1 = dr2 = dr3 = ZEROF;
|
|
|
|
r1 = rho_coeff[order-1][k];
|
|
r2 = rho_coeff[order-1][k];
|
|
r3 = rho_coeff[order-1][k];
|
|
for (int l = order-2; l >= 0; l--) {
|
|
r1 = rho_coeff[l][k] + r1 * dx;
|
|
r2 = rho_coeff[l][k] + r2 * dy;
|
|
r3 = rho_coeff[l][k] + r3 * dz;
|
|
dr1 = drho_coeff[l][k] + dr1 * dx;
|
|
dr2 = drho_coeff[l][k] + dr2 * dy;
|
|
dr3 = drho_coeff[l][k] + dr3 * dz;
|
|
}
|
|
rho[0][k-nlower] = r1;
|
|
rho[1][k-nlower] = r2;
|
|
rho[2][k-nlower] = r3;
|
|
drho[0][k-nlower] = dr1;
|
|
drho[1][k-nlower] = dr2;
|
|
drho[2][k-nlower] = dr3;
|
|
}
|
|
}
|
|
_alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
|
|
particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int n = 0; n < order; n++) {
|
|
int mz = n + nzsum;
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int m = 0; m < order; m++) {
|
|
int my = m + nysum;
|
|
FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
|
|
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
|
|
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int l = 0; l < order; l++) {
|
|
int mx = l + nxsum;
|
|
ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx];
|
|
eky[l] += rho[0][l] * eky_p * u_brick[mz][my][mx];
|
|
ekz[l] += rho[0][l] * ekz_p * u_brick[mz][my][mx];
|
|
}
|
|
}
|
|
}
|
|
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int l = 0; l < order; l++) {
|
|
particle_ekx[i] += ekx[l];
|
|
particle_eky[i] += eky[l];
|
|
particle_ekz[i] += ekz[l];
|
|
}
|
|
}
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int i = ifrom; i < ito; i++) {
|
|
particle_ekx[i] *= hx_inv;
|
|
particle_eky[i] *= hy_inv;
|
|
particle_ekz[i] *= hz_inv;
|
|
|
|
// convert E-field to force
|
|
|
|
const flt_t qfactor = fqqrd2es * q[i];
|
|
const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i];
|
|
|
|
const flt_t s1 = x[i][0] * hx_inv;
|
|
const flt_t s2 = x[i][1] * hy_inv;
|
|
const flt_t s3 = x[i][2] * hz_inv;
|
|
flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
|
|
sf += fsf_coeff1 * sin(ffour_pi * s1);
|
|
sf *= twoqsq;
|
|
f[i][0] += qfactor * particle_ekx[i] - fqqrd2es * sf;
|
|
|
|
sf = fsf_coeff2 * sin(ftwo_pi * s2);
|
|
sf += fsf_coeff3 * sin(ffour_pi * s2);
|
|
sf *= twoqsq;
|
|
f[i][1] += qfactor * particle_eky[i] - fqqrd2es * sf;
|
|
|
|
sf = fsf_coeff4 * sin(ftwo_pi * s3);
|
|
sf += fsf_coeff5 * sin(ffour_pi * s3);
|
|
sf *= twoqsq;
|
|
|
|
if (slabflag != 2) f[i][2] += qfactor * particle_ekz[i] - fqqrd2es * sf;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
interpolate from grid to get dispersion field & force on my particles
|
|
for geometric mixing rule
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class flt_t, class acc_t, int use_table>
|
|
void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|
{
|
|
|
|
// loop over my charges, interpolate electric field from nearby grid points
|
|
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
|
|
// (dx,dy,dz) = distance to "lower left" grid pt
|
|
// (mx,my,mz) = global coords of moving stencil pt
|
|
// ek = 3 components of dispersion field on particle
|
|
|
|
int nlocal = atom->nlocal;
|
|
int nthr = comm->nthreads;
|
|
|
|
|
|
#if defined(_OPENMP)
|
|
#pragma omp parallel LMP_DEFAULT_NONE \
|
|
shared(nlocal, nthr) if (!_use_lrt)
|
|
#endif
|
|
{
|
|
|
|
double lj;
|
|
int type;
|
|
double **x = atom->x;
|
|
double **f = atom->f;
|
|
|
|
const flt_t lo0 = boxlo[0];
|
|
const flt_t lo1 = boxlo[1];
|
|
const flt_t lo2 = boxlo[2];
|
|
const flt_t xi = delxinv_6;
|
|
const flt_t yi = delyinv_6;
|
|
const flt_t zi = delzinv_6;
|
|
const flt_t fshiftone = shiftone_6;
|
|
|
|
int ifrom, ito, tid;
|
|
IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
|
|
|
|
_alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
|
|
_alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
|
|
|
|
for (int i = ifrom; i < ito; i++) {
|
|
int nx = part2grid_6[i][0];
|
|
int ny = part2grid_6[i][1];
|
|
int nz = part2grid_6[i][2];
|
|
|
|
int nxsum = nx + nlower_6;
|
|
int nysum = ny + nlower_6;
|
|
int nzsum = nz + nlower_6;
|
|
|
|
FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
|
|
FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
|
|
FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
|
|
|
|
if (use_table) {
|
|
dx = dx*half_rho_scale + half_rho_scale_plus;
|
|
int idx = dx;
|
|
dy = dy*half_rho_scale + half_rho_scale_plus;
|
|
int idy = dy;
|
|
dz = dz*half_rho_scale + half_rho_scale_plus;
|
|
int idz = dz;
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
|
rho0[k] = rho6_lookup[idx][k];
|
|
rho1[k] = rho6_lookup[idy][k];
|
|
rho2[k] = rho6_lookup[idz][k];
|
|
}
|
|
} else {
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = nlower_6; k <= nupper_6; k++) {
|
|
FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
|
|
FFT_SCALAR r2 = rho_coeff_6[order_6-1][k];
|
|
FFT_SCALAR r3 = rho_coeff_6[order_6-1][k];
|
|
for (int l = order_6-2; l >= 0; l--) {
|
|
r1 = rho_coeff_6[l][k] + r1*dx;
|
|
r2 = rho_coeff_6[l][k] + r2*dy;
|
|
r3 = rho_coeff_6[l][k] + r3*dz;
|
|
}
|
|
|
|
rho0[k-nlower_6] = r1;
|
|
rho1[k-nlower_6] = r2;
|
|
rho2[k-nlower_6] = r3;
|
|
}
|
|
}
|
|
|
|
_alignvar(FFT_SCALAR ekx_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR eky_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int n = 0; n < order_6; n++) {
|
|
int mz = n+nzsum;
|
|
FFT_SCALAR z0 = rho2[n];
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int m = 0; m < order_6; m++) {
|
|
int my = m+nysum;
|
|
FFT_SCALAR y0 = z0*rho1[m];
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int l = 0; l < order; l++) {
|
|
int mx = l+nxsum;
|
|
FFT_SCALAR x0 = y0*rho0[l];
|
|
ekx_arr[l] -= x0*vdx_brick_g[mz][my][mx];
|
|
eky_arr[l] -= x0*vdy_brick_g[mz][my][mx];
|
|
ekz_arr[l] -= x0*vdz_brick_g[mz][my][mx];
|
|
|
|
}
|
|
}
|
|
}
|
|
|
|
FFT_SCALAR ekx, eky, ekz;
|
|
ekx = eky = ekz = ZEROF;
|
|
|
|
for (int l = 0; l < order; l++) {
|
|
ekx += ekx_arr[l];
|
|
eky += eky_arr[l];
|
|
ekz += ekz_arr[l];
|
|
}
|
|
|
|
// convert E-field to force
|
|
|
|
type = atom->type[i];
|
|
lj = B[type];
|
|
f[i][0] += lj*ekx;
|
|
f[i][1] += lj*eky;
|
|
if (slabflag != 2) f[i][2] += lj*ekz;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
interpolate from grid to get dispersion field & force on my particles
|
|
for geometric mixing rule for ad scheme
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class flt_t, class acc_t, int use_table>
|
|
void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|
{
|
|
|
|
// loop over my charges, interpolate electric field from nearby grid points
|
|
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
|
|
// (dx,dy,dz) = distance to "lower left" grid pt
|
|
// (mx,my,mz) = global coords of moving stencil pt
|
|
// ek = 3 components of dispersion field on particle
|
|
|
|
int nlocal = atom->nlocal;
|
|
int nthr = comm->nthreads;
|
|
|
|
FFT_SCALAR * _noalias const particle_ekx = this->particle_ekx;
|
|
FFT_SCALAR * _noalias const particle_eky = this->particle_eky;
|
|
FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz;
|
|
|
|
#if defined(_OPENMP)
|
|
#pragma omp parallel LMP_DEFAULT_NONE \
|
|
shared(nlocal, nthr) if (!_use_lrt)
|
|
#endif
|
|
{
|
|
|
|
double *prd;
|
|
if (triclinic == 0) prd = domain->prd;
|
|
else prd = domain->prd_lamda;
|
|
|
|
double **x = atom->x;
|
|
double **f = atom->f;
|
|
const flt_t ftwo_pi = MY_PI * 2.0;
|
|
const flt_t ffour_pi = MY_PI * 4.0;
|
|
|
|
const flt_t lo0 = boxlo[0];
|
|
const flt_t lo1 = boxlo[1];
|
|
const flt_t lo2 = boxlo[2];
|
|
const flt_t xi = delxinv_6;
|
|
const flt_t yi = delyinv_6;
|
|
const flt_t zi = delzinv_6;
|
|
const flt_t fshiftone = shiftone_6;
|
|
|
|
const double xprd = prd[0];
|
|
const double yprd = prd[1];
|
|
const double zprd = prd[2]*slab_volfactor;
|
|
|
|
const flt_t hx_inv = nx_pppm_6/xprd;
|
|
const flt_t hy_inv = ny_pppm_6/yprd;
|
|
const flt_t hz_inv = nz_pppm_6/zprd;
|
|
|
|
const flt_t fsf_coeff0 = sf_coeff_6[0];
|
|
const flt_t fsf_coeff1 = sf_coeff_6[1];
|
|
const flt_t fsf_coeff2 = sf_coeff_6[2];
|
|
const flt_t fsf_coeff3 = sf_coeff_6[3];
|
|
const flt_t fsf_coeff4 = sf_coeff_6[4];
|
|
const flt_t fsf_coeff5 = sf_coeff_6[5];
|
|
|
|
int ifrom, ito, tid;
|
|
IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
|
|
|
|
_alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
|
|
for (int i = ifrom; i < ito; i++) {
|
|
int nx = part2grid_6[i][0];
|
|
int ny = part2grid_6[i][1];
|
|
int nz = part2grid_6[i][2];
|
|
FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
|
|
FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
|
|
FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
|
|
|
|
int nxsum = nx + nlower_6;
|
|
int nysum = ny + nlower_6;
|
|
int nzsum = nz + nlower_6;
|
|
|
|
if (use_table) {
|
|
dx = dx*half_rho_scale + half_rho_scale_plus;
|
|
int idx = dx;
|
|
dy = dy*half_rho_scale + half_rho_scale_plus;
|
|
int idy = dy;
|
|
dz = dz*half_rho_scale + half_rho_scale_plus;
|
|
int idz = dz;
|
|
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
|
rho[0][k] = rho6_lookup[idx][k];
|
|
rho[1][k] = rho6_lookup[idy][k];
|
|
rho[2][k] = rho6_lookup[idz][k];
|
|
drho[0][k] = drho6_lookup[idx][k];
|
|
drho[1][k] = drho6_lookup[idy][k];
|
|
drho[2][k] = drho6_lookup[idz][k];
|
|
}
|
|
} else {
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = nlower_6; k <= nupper_6; k++) {
|
|
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
|
|
dr1 = dr2 = dr3 = ZEROF;
|
|
|
|
r1 = rho_coeff_6[order_6-1][k];
|
|
r2 = rho_coeff_6[order_6-1][k];
|
|
r3 = rho_coeff_6[order_6-1][k];
|
|
for (int l = order_6-2; l >= 0; l--) {
|
|
r1 = rho_coeff_6[l][k] + r1 * dx;
|
|
r2 = rho_coeff_6[l][k] + r2 * dy;
|
|
r3 = rho_coeff_6[l][k] + r3 * dz;
|
|
dr1 = drho_coeff_6[l][k] + dr1 * dx;
|
|
dr2 = drho_coeff_6[l][k] + dr2 * dy;
|
|
dr3 = drho_coeff_6[l][k] + dr3 * dz;
|
|
}
|
|
rho[0][k-nlower_6] = r1;
|
|
rho[1][k-nlower_6] = r2;
|
|
rho[2][k-nlower_6] = r3;
|
|
drho[0][k-nlower_6] = dr1;
|
|
drho[1][k-nlower_6] = dr2;
|
|
drho[2][k-nlower_6] = dr3;
|
|
}
|
|
}
|
|
_alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
|
|
particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int n = 0; n < order_6; n++) {
|
|
int mz = n + nzsum;
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int m = 0; m < order_6; m++) {
|
|
int my = m + nysum;
|
|
FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
|
|
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
|
|
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int l = 0; l < order; l++) {
|
|
int mx = l + nxsum;
|
|
ekx[l] += drho[0][l] * ekx_p * u_brick_g[mz][my][mx];
|
|
eky[l] += rho[0][l] * eky_p * u_brick_g[mz][my][mx];
|
|
ekz[l] += rho[0][l] * ekz_p * u_brick_g[mz][my][mx];
|
|
}
|
|
}
|
|
}
|
|
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int l = 0; l < order; l++) {
|
|
particle_ekx[i] += ekx[l];
|
|
particle_eky[i] += eky[l];
|
|
particle_ekz[i] += ekz[l];
|
|
}
|
|
}
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int i = ifrom; i < ito; i++) {
|
|
particle_ekx[i] *= hx_inv;
|
|
particle_eky[i] *= hy_inv;
|
|
particle_ekz[i] *= hz_inv;
|
|
|
|
// convert E-field to force
|
|
|
|
const int type = atom->type[i];
|
|
const flt_t lj = B[type];
|
|
const flt_t twoljsq = 2.*lj*lj;
|
|
|
|
const flt_t s1 = x[i][0] * hx_inv;
|
|
const flt_t s2 = x[i][1] * hy_inv;
|
|
const flt_t s3 = x[i][2] * hz_inv;
|
|
flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
|
|
sf += fsf_coeff1 * sin(ffour_pi * s1);
|
|
sf *= twoljsq;
|
|
f[i][0] += lj * particle_ekx[i] - sf;
|
|
|
|
sf = fsf_coeff2 * sin(ftwo_pi * s2);
|
|
sf += fsf_coeff3 * sin(ffour_pi * s2);
|
|
sf *= twoljsq;
|
|
f[i][1] += lj * particle_eky[i] - sf;
|
|
|
|
sf = fsf_coeff4 * sin(ftwo_pi * s3);
|
|
sf += fsf_coeff5 * sin(ffour_pi * s3);
|
|
sf *= twoljsq;
|
|
|
|
if (slabflag != 2) f[i][2] += lj * particle_ekz[i] - sf;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
interpolate from grid to get dispersion field & force on my particles
|
|
for arithmetic mixing rule and ik scheme
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class flt_t, class acc_t, int use_table>
|
|
void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|
{
|
|
|
|
// loop over my charges, interpolate electric field from nearby grid points
|
|
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
|
|
// (dx,dy,dz) = distance to "lower left" grid pt
|
|
// (mx,my,mz) = global coords of moving stencil pt
|
|
// ek = 3 components of dispersion field on particle
|
|
|
|
int nlocal = atom->nlocal;
|
|
int nthr = comm->nthreads;
|
|
|
|
|
|
#if defined(_OPENMP)
|
|
#pragma omp parallel LMP_DEFAULT_NONE \
|
|
shared(nlocal, nthr) if (!_use_lrt)
|
|
#endif
|
|
{
|
|
double **x = atom->x;
|
|
double **f = atom->f;
|
|
|
|
const flt_t lo0 = boxlo[0];
|
|
const flt_t lo1 = boxlo[1];
|
|
const flt_t lo2 = boxlo[2];
|
|
const flt_t xi = delxinv_6;
|
|
const flt_t yi = delyinv_6;
|
|
const flt_t zi = delzinv_6;
|
|
const flt_t fshiftone = shiftone_6;
|
|
|
|
int ifrom, ito, tid;
|
|
IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
|
|
|
|
_alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
|
|
_alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
|
|
|
|
for (int i = ifrom; i < ito; i++) {
|
|
int nx = part2grid_6[i][0];
|
|
int ny = part2grid_6[i][1];
|
|
int nz = part2grid_6[i][2];
|
|
|
|
int nxsum = nx + nlower_6;
|
|
int nysum = ny + nlower_6;
|
|
int nzsum = nz + nlower_6;
|
|
|
|
FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
|
|
FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
|
|
FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
|
|
|
|
if (use_table) {
|
|
dx = dx*half_rho_scale + half_rho_scale_plus;
|
|
int idx = dx;
|
|
dy = dy*half_rho_scale + half_rho_scale_plus;
|
|
int idy = dy;
|
|
dz = dz*half_rho_scale + half_rho_scale_plus;
|
|
int idz = dz;
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
|
rho0[k] = rho6_lookup[idx][k];
|
|
rho1[k] = rho6_lookup[idy][k];
|
|
rho2[k] = rho6_lookup[idz][k];
|
|
}
|
|
} else {
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = nlower_6; k <= nupper_6; k++) {
|
|
FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
|
|
FFT_SCALAR r2 = rho_coeff_6[order_6-1][k];
|
|
FFT_SCALAR r3 = rho_coeff_6[order_6-1][k];
|
|
for (int l = order_6-2; l >= 0; l--) {
|
|
r1 = rho_coeff_6[l][k] + r1*dx;
|
|
r2 = rho_coeff_6[l][k] + r2*dy;
|
|
r3 = rho_coeff_6[l][k] + r3*dz;
|
|
}
|
|
|
|
rho0[k-nlower_6] = r1;
|
|
rho1[k-nlower_6] = r2;
|
|
rho2[k-nlower_6] = r3;
|
|
}
|
|
}
|
|
|
|
_alignvar(FFT_SCALAR ekx0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR eky0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekz0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekx1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR eky1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekz1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekx2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR eky2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekz2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekx3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR eky3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekz3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekx4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR eky4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekz4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekx5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR eky5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekz5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekx6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR eky6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekz6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
|
|
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int n = 0; n < order_6; n++) {
|
|
int mz = n+nzsum;
|
|
FFT_SCALAR z0 = rho2[n];
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int m = 0; m < order_6; m++) {
|
|
int my = m+nysum;
|
|
FFT_SCALAR y0 = z0*rho1[m];
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int l = 0; l < order; l++) {
|
|
int mx = l+nxsum;
|
|
FFT_SCALAR x0 = y0*rho0[l];
|
|
ekx0_arr[l] -= x0*vdx_brick_a0[mz][my][mx];
|
|
eky0_arr[l] -= x0*vdy_brick_a0[mz][my][mx];
|
|
ekz0_arr[l] -= x0*vdz_brick_a0[mz][my][mx];
|
|
ekx1_arr[l] -= x0*vdx_brick_a1[mz][my][mx];
|
|
eky1_arr[l] -= x0*vdy_brick_a1[mz][my][mx];
|
|
ekz1_arr[l] -= x0*vdz_brick_a1[mz][my][mx];
|
|
ekx2_arr[l] -= x0*vdx_brick_a2[mz][my][mx];
|
|
eky2_arr[l] -= x0*vdy_brick_a2[mz][my][mx];
|
|
ekz2_arr[l] -= x0*vdz_brick_a2[mz][my][mx];
|
|
ekx3_arr[l] -= x0*vdx_brick_a3[mz][my][mx];
|
|
eky3_arr[l] -= x0*vdy_brick_a3[mz][my][mx];
|
|
ekz3_arr[l] -= x0*vdz_brick_a3[mz][my][mx];
|
|
ekx4_arr[l] -= x0*vdx_brick_a4[mz][my][mx];
|
|
eky4_arr[l] -= x0*vdy_brick_a4[mz][my][mx];
|
|
ekz4_arr[l] -= x0*vdz_brick_a4[mz][my][mx];
|
|
ekx5_arr[l] -= x0*vdx_brick_a5[mz][my][mx];
|
|
eky5_arr[l] -= x0*vdy_brick_a5[mz][my][mx];
|
|
ekz5_arr[l] -= x0*vdz_brick_a5[mz][my][mx];
|
|
ekx6_arr[l] -= x0*vdx_brick_a6[mz][my][mx];
|
|
eky6_arr[l] -= x0*vdy_brick_a6[mz][my][mx];
|
|
ekz6_arr[l] -= x0*vdz_brick_a6[mz][my][mx];
|
|
}
|
|
}
|
|
}
|
|
|
|
FFT_SCALAR ekx0, eky0, ekz0, ekx1, eky1, ekz1, ekx2, eky2, ekz2;
|
|
FFT_SCALAR ekx3, eky3, ekz3, ekx4, eky4, ekz4, ekx5, eky5, ekz5;
|
|
FFT_SCALAR ekx6, eky6, ekz6;
|
|
ekx0 = eky0 = ekz0 = ZEROF;
|
|
ekx1 = eky1 = ekz1 = ZEROF;
|
|
ekx2 = eky2 = ekz2 = ZEROF;
|
|
ekx3 = eky3 = ekz3 = ZEROF;
|
|
ekx4 = eky4 = ekz4 = ZEROF;
|
|
ekx5 = eky5 = ekz5 = ZEROF;
|
|
ekx6 = eky6 = ekz6 = ZEROF;
|
|
|
|
for (int l = 0; l < order; l++) {
|
|
ekx0 += ekx0_arr[l];
|
|
eky0 += eky0_arr[l];
|
|
ekz0 += ekz0_arr[l];
|
|
ekx1 += ekx1_arr[l];
|
|
eky1 += eky1_arr[l];
|
|
ekz1 += ekz1_arr[l];
|
|
ekx2 += ekx2_arr[l];
|
|
eky2 += eky2_arr[l];
|
|
ekz2 += ekz2_arr[l];
|
|
ekx3 += ekx3_arr[l];
|
|
eky3 += eky3_arr[l];
|
|
ekz3 += ekz3_arr[l];
|
|
ekx4 += ekx4_arr[l];
|
|
eky4 += eky4_arr[l];
|
|
ekz4 += ekz4_arr[l];
|
|
ekx5 += ekx5_arr[l];
|
|
eky5 += eky5_arr[l];
|
|
ekz5 += ekz5_arr[l];
|
|
ekx6 += ekx6_arr[l];
|
|
eky6 += eky6_arr[l];
|
|
ekz6 += ekz6_arr[l];
|
|
}
|
|
|
|
// convert D-field to force
|
|
|
|
const int type = atom->type[i];
|
|
const FFT_SCALAR lj0 = B[7*type+6];
|
|
const FFT_SCALAR lj1 = B[7*type+5];
|
|
const FFT_SCALAR lj2 = B[7*type+4];
|
|
const FFT_SCALAR lj3 = B[7*type+3];
|
|
const FFT_SCALAR lj4 = B[7*type+2];
|
|
const FFT_SCALAR lj5 = B[7*type+1];
|
|
const FFT_SCALAR lj6 = B[7*type];
|
|
|
|
f[i][0] += lj0*ekx0 + lj1*ekx1 + lj2*ekx2 + lj3*ekx3 +
|
|
lj4*ekx4 + lj5*ekx5 + lj6*ekx6;
|
|
f[i][1] += lj0*eky0 + lj1*eky1 + lj2*eky2 + lj3*eky3 +
|
|
lj4*eky4 + lj5*eky5 + lj6*eky6;
|
|
if (slabflag != 2) f[i][2] += lj0*ekz0 + lj1*ekz1 + lj2*ekz2 +
|
|
lj3*ekz3 + lj4*ekz4 + lj5*ekz5 + lj6*ekz6;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
interpolate from grid to get dispersion field & force on my particles
|
|
for arithmetic mixing rule for the ad scheme
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class flt_t, class acc_t, int use_table>
|
|
void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|
{
|
|
|
|
// loop over my charges, interpolate electric field from nearby grid points
|
|
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
|
|
// (dx,dy,dz) = distance to "lower left" grid pt
|
|
// (mx,my,mz) = global coords of moving stencil pt
|
|
// ek = 3 components of dispersion field on particle
|
|
|
|
int nlocal = atom->nlocal;
|
|
int nthr = comm->nthreads;
|
|
|
|
FFT_SCALAR * _noalias const particle_ekx0 = this->particle_ekx0;
|
|
FFT_SCALAR * _noalias const particle_eky0 = this->particle_eky0;
|
|
FFT_SCALAR * _noalias const particle_ekz0 = this->particle_ekz0;
|
|
FFT_SCALAR * _noalias const particle_ekx1 = this->particle_ekx1;
|
|
FFT_SCALAR * _noalias const particle_eky1 = this->particle_eky1;
|
|
FFT_SCALAR * _noalias const particle_ekz1 = this->particle_ekz1;
|
|
FFT_SCALAR * _noalias const particle_ekx2 = this->particle_ekx2;
|
|
FFT_SCALAR * _noalias const particle_eky2 = this->particle_eky2;
|
|
FFT_SCALAR * _noalias const particle_ekz2 = this->particle_ekz2;
|
|
FFT_SCALAR * _noalias const particle_ekx3 = this->particle_ekx3;
|
|
FFT_SCALAR * _noalias const particle_eky3 = this->particle_eky3;
|
|
FFT_SCALAR * _noalias const particle_ekz3 = this->particle_ekz3;
|
|
FFT_SCALAR * _noalias const particle_ekx4 = this->particle_ekx4;
|
|
FFT_SCALAR * _noalias const particle_eky4 = this->particle_eky4;
|
|
FFT_SCALAR * _noalias const particle_ekz4 = this->particle_ekz4;
|
|
FFT_SCALAR * _noalias const particle_ekx5 = this->particle_ekx5;
|
|
FFT_SCALAR * _noalias const particle_eky5 = this->particle_eky5;
|
|
FFT_SCALAR * _noalias const particle_ekz5 = this->particle_ekz5;
|
|
FFT_SCALAR * _noalias const particle_ekx6 = this->particle_ekx6;
|
|
FFT_SCALAR * _noalias const particle_eky6 = this->particle_eky6;
|
|
FFT_SCALAR * _noalias const particle_ekz6 = this->particle_ekz6;
|
|
|
|
#if defined(_OPENMP)
|
|
#pragma omp parallel LMP_DEFAULT_NONE \
|
|
shared(nlocal, nthr) if (!_use_lrt)
|
|
#endif
|
|
{
|
|
|
|
double *prd;
|
|
if (triclinic == 0) prd = domain->prd;
|
|
else prd = domain->prd_lamda;
|
|
|
|
double **x = atom->x;
|
|
double **f = atom->f;
|
|
const flt_t ftwo_pi = MY_PI * 2.0;
|
|
const flt_t ffour_pi = MY_PI * 4.0;
|
|
|
|
const flt_t lo0 = boxlo[0];
|
|
const flt_t lo1 = boxlo[1];
|
|
const flt_t lo2 = boxlo[2];
|
|
const flt_t xi = delxinv_6;
|
|
const flt_t yi = delyinv_6;
|
|
const flt_t zi = delzinv_6;
|
|
const flt_t fshiftone = shiftone_6;
|
|
|
|
const double xprd = prd[0];
|
|
const double yprd = prd[1];
|
|
const double zprd = prd[2]*slab_volfactor;
|
|
|
|
const flt_t hx_inv = nx_pppm_6/xprd;
|
|
const flt_t hy_inv = ny_pppm_6/yprd;
|
|
const flt_t hz_inv = nz_pppm_6/zprd;
|
|
|
|
const flt_t fsf_coeff0 = sf_coeff_6[0];
|
|
const flt_t fsf_coeff1 = sf_coeff_6[1];
|
|
const flt_t fsf_coeff2 = sf_coeff_6[2];
|
|
const flt_t fsf_coeff3 = sf_coeff_6[3];
|
|
const flt_t fsf_coeff4 = sf_coeff_6[4];
|
|
const flt_t fsf_coeff5 = sf_coeff_6[5];
|
|
|
|
int ifrom, ito, tid;
|
|
IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
|
|
|
|
_alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
|
|
for (int i = ifrom; i < ito; i++) {
|
|
int nx = part2grid_6[i][0];
|
|
int ny = part2grid_6[i][1];
|
|
int nz = part2grid_6[i][2];
|
|
FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
|
|
FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
|
|
FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
|
|
|
|
int nxsum = nx + nlower_6;
|
|
int nysum = ny + nlower_6;
|
|
int nzsum = nz + nlower_6;
|
|
|
|
if (use_table) {
|
|
dx = dx*half_rho_scale + half_rho_scale_plus;
|
|
int idx = dx;
|
|
dy = dy*half_rho_scale + half_rho_scale_plus;
|
|
int idy = dy;
|
|
dz = dz*half_rho_scale + half_rho_scale_plus;
|
|
int idz = dz;
|
|
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
|
rho[0][k] = rho6_lookup[idx][k];
|
|
rho[1][k] = rho6_lookup[idy][k];
|
|
rho[2][k] = rho6_lookup[idz][k];
|
|
drho[0][k] = drho6_lookup[idx][k];
|
|
drho[1][k] = drho6_lookup[idy][k];
|
|
drho[2][k] = drho6_lookup[idz][k];
|
|
}
|
|
} else {
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = nlower_6; k <= nupper_6; k++) {
|
|
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
|
|
dr1 = dr2 = dr3 = ZEROF;
|
|
|
|
r1 = rho_coeff_6[order_6-1][k];
|
|
r2 = rho_coeff_6[order_6-1][k];
|
|
r3 = rho_coeff_6[order_6-1][k];
|
|
for (int l = order_6-2; l >= 0; l--) {
|
|
r1 = rho_coeff_6[l][k] + r1 * dx;
|
|
r2 = rho_coeff_6[l][k] + r2 * dy;
|
|
r3 = rho_coeff_6[l][k] + r3 * dz;
|
|
dr1 = drho_coeff_6[l][k] + dr1 * dx;
|
|
dr2 = drho_coeff_6[l][k] + dr2 * dy;
|
|
dr3 = drho_coeff_6[l][k] + dr3 * dz;
|
|
}
|
|
rho[0][k-nlower_6] = r1;
|
|
rho[1][k-nlower_6] = r2;
|
|
rho[2][k-nlower_6] = r3;
|
|
drho[0][k-nlower_6] = dr1;
|
|
drho[1][k-nlower_6] = dr2;
|
|
drho[2][k-nlower_6] = dr3;
|
|
}
|
|
}
|
|
_alignvar(FFT_SCALAR ekx0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR eky0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekz0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekx1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR eky1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekz1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekx2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR eky2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekz2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekx3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR eky3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekz3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekx4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR eky4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekz4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekx5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR eky5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekz5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekx6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR eky6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(FFT_SCALAR ekz6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
|
|
particle_ekx0[i] = particle_eky0[i] = particle_ekz0[i] = ZEROF;
|
|
particle_ekx1[i] = particle_eky1[i] = particle_ekz1[i] = ZEROF;
|
|
particle_ekx2[i] = particle_eky2[i] = particle_ekz2[i] = ZEROF;
|
|
particle_ekx3[i] = particle_eky3[i] = particle_ekz3[i] = ZEROF;
|
|
particle_ekx4[i] = particle_eky4[i] = particle_ekz4[i] = ZEROF;
|
|
particle_ekx5[i] = particle_eky5[i] = particle_ekz5[i] = ZEROF;
|
|
particle_ekx6[i] = particle_eky6[i] = particle_ekz6[i] = ZEROF;
|
|
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int n = 0; n < order_6; n++) {
|
|
int mz = n + nzsum;
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int m = 0; m < order_6; m++) {
|
|
int my = m + nysum;
|
|
FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
|
|
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
|
|
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int l = 0; l < order; l++) {
|
|
int mx = l + nxsum;
|
|
FFT_SCALAR x0 = drho[0][l] * ekx_p;
|
|
FFT_SCALAR y0 = rho[0][l] * eky_p;
|
|
FFT_SCALAR z0 = rho[0][l] * ekz_p;
|
|
|
|
ekx0[l] += x0 * u_brick_a0[mz][my][mx];
|
|
eky0[l] += y0 * u_brick_a0[mz][my][mx];
|
|
ekz0[l] += z0 * u_brick_a0[mz][my][mx];
|
|
ekx1[l] += x0 * u_brick_a1[mz][my][mx];
|
|
eky1[l] += y0 * u_brick_a1[mz][my][mx];
|
|
ekz1[l] += z0 * u_brick_a1[mz][my][mx];
|
|
ekx2[l] += x0 * u_brick_a2[mz][my][mx];
|
|
eky2[l] += y0 * u_brick_a2[mz][my][mx];
|
|
ekz2[l] += z0 * u_brick_a2[mz][my][mx];
|
|
ekx3[l] += x0 * u_brick_a3[mz][my][mx];
|
|
eky3[l] += y0 * u_brick_a3[mz][my][mx];
|
|
ekz3[l] += z0 * u_brick_a3[mz][my][mx];
|
|
ekx4[l] += x0 * u_brick_a4[mz][my][mx];
|
|
eky4[l] += y0 * u_brick_a4[mz][my][mx];
|
|
ekz4[l] += z0 * u_brick_a4[mz][my][mx];
|
|
ekx5[l] += x0 * u_brick_a5[mz][my][mx];
|
|
eky5[l] += y0 * u_brick_a5[mz][my][mx];
|
|
ekz5[l] += z0 * u_brick_a5[mz][my][mx];
|
|
ekx6[l] += x0 * u_brick_a6[mz][my][mx];
|
|
eky6[l] += y0 * u_brick_a6[mz][my][mx];
|
|
ekz6[l] += z0 * u_brick_a6[mz][my][mx];
|
|
}
|
|
}
|
|
}
|
|
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int l = 0; l < order; l++) {
|
|
particle_ekx0[i] += ekx0[l];
|
|
particle_eky0[i] += eky0[l];
|
|
particle_ekz0[i] += ekz0[l];
|
|
particle_ekx1[i] += ekx1[l];
|
|
particle_eky1[i] += eky1[l];
|
|
particle_ekz1[i] += ekz1[l];
|
|
particle_ekx2[i] += ekx2[l];
|
|
particle_eky2[i] += eky2[l];
|
|
particle_ekz2[i] += ekz2[l];
|
|
particle_ekx3[i] += ekx3[l];
|
|
particle_eky3[i] += eky3[l];
|
|
particle_ekz3[i] += ekz3[l];
|
|
particle_ekx4[i] += ekx4[l];
|
|
particle_eky4[i] += eky4[l];
|
|
particle_ekz4[i] += ekz4[l];
|
|
particle_ekx5[i] += ekx5[l];
|
|
particle_eky5[i] += eky5[l];
|
|
particle_ekz5[i] += ekz5[l];
|
|
particle_ekx6[i] += ekx6[l];
|
|
particle_eky6[i] += eky6[l];
|
|
particle_ekz6[i] += ekz6[l];
|
|
}
|
|
}
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int i = ifrom; i < ito; i++) {
|
|
particle_ekx0[i] *= hx_inv;
|
|
particle_eky0[i] *= hy_inv;
|
|
particle_ekz0[i] *= hz_inv;
|
|
particle_ekx1[i] *= hx_inv;
|
|
particle_eky1[i] *= hy_inv;
|
|
particle_ekz1[i] *= hz_inv;
|
|
particle_ekx2[i] *= hx_inv;
|
|
particle_eky2[i] *= hy_inv;
|
|
particle_ekz2[i] *= hz_inv;
|
|
particle_ekx3[i] *= hx_inv;
|
|
particle_eky3[i] *= hy_inv;
|
|
particle_ekz3[i] *= hz_inv;
|
|
particle_ekx4[i] *= hx_inv;
|
|
particle_eky4[i] *= hy_inv;
|
|
particle_ekz4[i] *= hz_inv;
|
|
particle_ekx5[i] *= hx_inv;
|
|
particle_eky5[i] *= hy_inv;
|
|
particle_ekz5[i] *= hz_inv;
|
|
particle_ekx6[i] *= hx_inv;
|
|
particle_eky6[i] *= hy_inv;
|
|
particle_ekz6[i] *= hz_inv;
|
|
|
|
// convert D-field to force
|
|
|
|
const int type = atom->type[i];
|
|
const FFT_SCALAR lj0 = B[7*type+6];
|
|
const FFT_SCALAR lj1 = B[7*type+5];
|
|
const FFT_SCALAR lj2 = B[7*type+4];
|
|
const FFT_SCALAR lj3 = B[7*type+3];
|
|
const FFT_SCALAR lj4 = B[7*type+2];
|
|
const FFT_SCALAR lj5 = B[7*type+1];
|
|
const FFT_SCALAR lj6 = B[7*type];
|
|
|
|
const flt_t s1 = x[i][0] * hx_inv;
|
|
const flt_t s2 = x[i][1] * hy_inv;
|
|
const flt_t s3 = x[i][2] * hz_inv;
|
|
flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
|
|
sf += fsf_coeff1 * sin(ffour_pi * s1);
|
|
sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3;
|
|
f[i][0] += lj0*particle_ekx0[i] + lj1*particle_ekx1[i] +
|
|
lj2*particle_ekx2[i] + lj3*particle_ekx3[i] + lj4*particle_ekx4[i] +
|
|
lj5*particle_ekx5[i] + lj6*particle_ekx6[i] - sf;
|
|
|
|
sf = fsf_coeff2 * sin(ftwo_pi * s2);
|
|
sf += fsf_coeff3 * sin(ffour_pi * s2);
|
|
sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3;
|
|
f[i][1] += lj0*particle_eky0[i] + lj1*particle_eky1[i] +
|
|
lj2*particle_eky2[i] + lj3*particle_eky3[i] + lj4*particle_eky4[i] +
|
|
lj5*particle_eky5[i] + lj6*particle_eky6[i] - sf;
|
|
|
|
sf = fsf_coeff4 * sin(ftwo_pi * s3);
|
|
sf += fsf_coeff5 * sin(ffour_pi * s3);
|
|
sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3;
|
|
if (slabflag != 2)
|
|
f[i][2] += lj0*particle_ekz0[i] + lj1*particle_ekz1[i] +
|
|
lj2*particle_ekz2[i] + lj3*particle_ekz3[i] + lj4*particle_ekz4[i] +
|
|
lj5*particle_ekz5[i] + lj6*particle_ekz6[i] - sf;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
interpolate from grid to get dispersion field & force on my particles
|
|
for no mixing rule and ik scheme
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class flt_t, class acc_t, int use_table>
|
|
void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|
{
|
|
|
|
// loop over my charges, interpolate electric field from nearby grid points
|
|
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
|
|
// (dx,dy,dz) = distance to "lower left" grid pt
|
|
// (mx,my,mz) = global coords of moving stencil pt
|
|
// ek = 3 components of dispersion field on particle
|
|
|
|
int nlocal = atom->nlocal;
|
|
int nthr = comm->nthreads;
|
|
|
|
|
|
#if defined(_OPENMP)
|
|
#pragma omp parallel LMP_DEFAULT_NONE \
|
|
shared(nlocal, nthr) if (!_use_lrt)
|
|
#endif
|
|
{
|
|
|
|
double lj;
|
|
int type;
|
|
double **x = atom->x;
|
|
double **f = atom->f;
|
|
|
|
const flt_t lo0 = boxlo[0];
|
|
const flt_t lo1 = boxlo[1];
|
|
const flt_t lo2 = boxlo[2];
|
|
const flt_t xi = delxinv_6;
|
|
const flt_t yi = delyinv_6;
|
|
const flt_t zi = delzinv_6;
|
|
const flt_t fshiftone = shiftone_6;
|
|
|
|
int ifrom, ito, tid;
|
|
IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
|
|
|
|
_alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
|
|
_alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
|
|
|
|
for (int i = ifrom; i < ito; i++) {
|
|
int nx = part2grid_6[i][0];
|
|
int ny = part2grid_6[i][1];
|
|
int nz = part2grid_6[i][2];
|
|
|
|
int nxsum = nx + nlower_6;
|
|
int nysum = ny + nlower_6;
|
|
int nzsum = nz + nlower_6;
|
|
|
|
FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
|
|
FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
|
|
FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
|
|
|
|
if (use_table) {
|
|
dx = dx*half_rho_scale + half_rho_scale_plus;
|
|
int idx = dx;
|
|
dy = dy*half_rho_scale + half_rho_scale_plus;
|
|
int idy = dy;
|
|
dz = dz*half_rho_scale + half_rho_scale_plus;
|
|
int idz = dz;
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
|
rho0[k] = rho6_lookup[idx][k];
|
|
rho1[k] = rho6_lookup[idy][k];
|
|
rho2[k] = rho6_lookup[idz][k];
|
|
}
|
|
} else {
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = nlower_6; k <= nupper_6; k++) {
|
|
FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
|
|
FFT_SCALAR r2 = rho_coeff_6[order_6-1][k];
|
|
FFT_SCALAR r3 = rho_coeff_6[order_6-1][k];
|
|
for (int l = order_6-2; l >= 0; l--) {
|
|
r1 = rho_coeff_6[l][k] + r1*dx;
|
|
r2 = rho_coeff_6[l][k] + r2*dy;
|
|
r3 = rho_coeff_6[l][k] + r3*dz;
|
|
}
|
|
|
|
rho0[k-nlower_6] = r1;
|
|
rho1[k-nlower_6] = r2;
|
|
rho2[k-nlower_6] = r3;
|
|
}
|
|
}
|
|
|
|
|
|
_alignvar(FFT_SCALAR ekx_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64);
|
|
_alignvar(FFT_SCALAR eky_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64);
|
|
_alignvar(FFT_SCALAR ekz_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64);
|
|
|
|
for (int k = 0; k < nsplit*INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
|
ekx_arr[k] = eky_arr[k] = ekz_arr[k] = ZEROF;
|
|
}
|
|
|
|
for (int k = 0; k < nsplit; k++) {
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int n = 0; n < order_6; n++) {
|
|
int mz = n+nzsum;
|
|
FFT_SCALAR z0 = rho2[n];
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int m = 0; m < order_6; m++) {
|
|
int my = m+nysum;
|
|
FFT_SCALAR y0 = z0*rho1[m];
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int l = 0; l < order; l++) {
|
|
int mx = l+nxsum;
|
|
FFT_SCALAR x0 = y0*rho0[l];
|
|
ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -=
|
|
x0*vdx_brick_none[k][mz][my][mx];
|
|
eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -=
|
|
x0*vdy_brick_none[k][mz][my][mx];
|
|
ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -=
|
|
x0*vdz_brick_none[k][mz][my][mx];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
_alignvar(FFT_SCALAR ekx[nsplit], 64);
|
|
_alignvar(FFT_SCALAR eky[nsplit], 64);
|
|
_alignvar(FFT_SCALAR ekz[nsplit], 64);
|
|
for (int k = 0; k < nsplit; k++) {
|
|
ekx[k] = eky[k] = ekz[k] = ZEROF;
|
|
}
|
|
|
|
for (int l = 0; l < order; l++) {
|
|
for (int k = 0; k < nsplit; k++) {
|
|
ekx[k] += ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
|
|
eky[k] += eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
|
|
ekz[k] += ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
|
|
}
|
|
}
|
|
|
|
// convert E-field to force
|
|
|
|
type = atom->type[i];
|
|
for (int k = 0; k < nsplit; k++) {
|
|
lj = B[nsplit*type + k];
|
|
f[i][0] += lj*ekx[k];
|
|
f[i][1] += lj*eky[k];
|
|
if (slabflag != 2) f[i][2] += lj*ekz[k];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
interpolate from grid to get dispersion field & force on my particles
|
|
for no mixing rule for the ad scheme
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class flt_t, class acc_t, int use_table>
|
|
void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|
{
|
|
// loop over my charges, interpolate electric field from nearby grid points
|
|
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
|
|
// (dx,dy,dz) = distance to "lower left" grid pt
|
|
// (mx,my,mz) = global coords of moving stencil pt
|
|
// ek = 3 components of dispersion field on particle
|
|
|
|
int nlocal = atom->nlocal;
|
|
int nthr = comm->nthreads;
|
|
|
|
#if defined(_OPENMP)
|
|
#pragma omp parallel LMP_DEFAULT_NONE \
|
|
shared(nlocal, nthr) if (!_use_lrt)
|
|
#endif
|
|
{
|
|
|
|
double *prd;
|
|
if (triclinic == 0) prd = domain->prd;
|
|
else prd = domain->prd_lamda;
|
|
|
|
double **x = atom->x;
|
|
double **f = atom->f;
|
|
const flt_t ftwo_pi = MY_PI * 2.0;
|
|
const flt_t ffour_pi = MY_PI * 4.0;
|
|
|
|
const flt_t lo0 = boxlo[0];
|
|
const flt_t lo1 = boxlo[1];
|
|
const flt_t lo2 = boxlo[2];
|
|
const flt_t xi = delxinv_6;
|
|
const flt_t yi = delyinv_6;
|
|
const flt_t zi = delzinv_6;
|
|
const flt_t fshiftone = shiftone_6;
|
|
|
|
const double xprd = prd[0];
|
|
const double yprd = prd[1];
|
|
const double zprd = prd[2]*slab_volfactor;
|
|
|
|
const flt_t hx_inv = nx_pppm_6/xprd;
|
|
const flt_t hy_inv = ny_pppm_6/yprd;
|
|
const flt_t hz_inv = nz_pppm_6/zprd;
|
|
|
|
const flt_t fsf_coeff0 = sf_coeff_6[0];
|
|
const flt_t fsf_coeff1 = sf_coeff_6[1];
|
|
const flt_t fsf_coeff2 = sf_coeff_6[2];
|
|
const flt_t fsf_coeff3 = sf_coeff_6[3];
|
|
const flt_t fsf_coeff4 = sf_coeff_6[4];
|
|
const flt_t fsf_coeff5 = sf_coeff_6[5];
|
|
|
|
int ifrom, ito, tid;
|
|
IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
|
|
|
|
_alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
_alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
|
|
|
for (int i = ifrom; i < ito; i++) {
|
|
int nx = part2grid_6[i][0];
|
|
int ny = part2grid_6[i][1];
|
|
int nz = part2grid_6[i][2];
|
|
FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
|
|
FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
|
|
FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
|
|
|
|
int nxsum = nx + nlower_6;
|
|
int nysum = ny + nlower_6;
|
|
int nzsum = nz + nlower_6;
|
|
|
|
if (use_table) {
|
|
dx = dx*half_rho_scale + half_rho_scale_plus;
|
|
int idx = dx;
|
|
dy = dy*half_rho_scale + half_rho_scale_plus;
|
|
int idy = dy;
|
|
dz = dz*half_rho_scale + half_rho_scale_plus;
|
|
int idz = dz;
|
|
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
|
rho[0][k] = rho6_lookup[idx][k];
|
|
rho[1][k] = rho6_lookup[idy][k];
|
|
rho[2][k] = rho6_lookup[idz][k];
|
|
drho[0][k] = drho6_lookup[idx][k];
|
|
drho[1][k] = drho6_lookup[idy][k];
|
|
drho[2][k] = drho6_lookup[idz][k];
|
|
}
|
|
} else {
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k = nlower_6; k <= nupper_6; k++) {
|
|
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
|
|
dr1 = dr2 = dr3 = ZEROF;
|
|
|
|
r1 = rho_coeff_6[order_6-1][k];
|
|
r2 = rho_coeff_6[order_6-1][k];
|
|
r3 = rho_coeff_6[order_6-1][k];
|
|
for (int l = order_6-2; l >= 0; l--) {
|
|
r1 = rho_coeff_6[l][k] + r1 * dx;
|
|
r2 = rho_coeff_6[l][k] + r2 * dy;
|
|
r3 = rho_coeff_6[l][k] + r3 * dz;
|
|
dr1 = drho_coeff_6[l][k] + dr1 * dx;
|
|
dr2 = drho_coeff_6[l][k] + dr2 * dy;
|
|
dr3 = drho_coeff_6[l][k] + dr3 * dz;
|
|
}
|
|
rho[0][k-nlower_6] = r1;
|
|
rho[1][k-nlower_6] = r2;
|
|
rho[2][k-nlower_6] = r3;
|
|
drho[0][k-nlower_6] = dr1;
|
|
drho[1][k-nlower_6] = dr2;
|
|
drho[2][k-nlower_6] = dr3;
|
|
}
|
|
}
|
|
_alignvar(FFT_SCALAR ekx[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64);
|
|
_alignvar(FFT_SCALAR eky[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64);
|
|
_alignvar(FFT_SCALAR ekz[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64);
|
|
|
|
for (int k = 0; k < nsplit*INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
|
ekx[k]=eky[k]=ekz[k]=ZEROF;
|
|
}
|
|
|
|
for (int k = 0; k < nsplit; k++) {
|
|
particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int n = 0; n < order_6; n++) {
|
|
int mz = n + nzsum;
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int m = 0; m < order_6; m++) {
|
|
int my = m + nysum;
|
|
FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
|
|
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
|
|
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int l = 0; l < order; l++) {
|
|
int mx = l + nxsum;
|
|
ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l] += drho[0][l] * ekx_p *
|
|
u_brick_none[k][mz][my][mx];
|
|
eky[k*INTEL_P3M_ALIGNED_MAXORDER+l] += rho[0][l] * eky_p *
|
|
u_brick_none[k][mz][my][mx];
|
|
ekz[k*INTEL_P3M_ALIGNED_MAXORDER+l] += rho[0][l] * ekz_p *
|
|
u_brick_none[k][mz][my][mx];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
_alignvar(FFT_SCALAR ekx_tot[nsplit], 64);
|
|
_alignvar(FFT_SCALAR eky_tot[nsplit], 64);
|
|
_alignvar(FFT_SCALAR ekz_tot[nsplit], 64);
|
|
for (int k = 0; k < nsplit; k++) {
|
|
ekx_tot[k] = eky_tot[k] = ekz_tot[k] = ZEROF;
|
|
}
|
|
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
|
#endif
|
|
for (int l = 0; l < order; l++) {
|
|
for (int k = 0; k < nsplit; k++) {
|
|
ekx_tot[k] += ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l];
|
|
eky_tot[k] += eky[k*INTEL_P3M_ALIGNED_MAXORDER+l];
|
|
ekz_tot[k] += ekz[k*INTEL_P3M_ALIGNED_MAXORDER+l];
|
|
}
|
|
}
|
|
|
|
for (int k = 0; k < nsplit; k++) {
|
|
ekx_tot[k] *= hx_inv;
|
|
eky_tot[k] *= hy_inv;
|
|
ekz_tot[k] *= hz_inv;
|
|
}
|
|
// convert D-field to force
|
|
|
|
const int type = atom->type[i];
|
|
|
|
const flt_t s1 = x[i][0] * hx_inv;
|
|
const flt_t s2 = x[i][1] * hy_inv;
|
|
const flt_t s3 = x[i][2] * hz_inv;
|
|
flt_t sf1 = fsf_coeff0 * sin(ftwo_pi * s1);
|
|
sf1 += fsf_coeff1 * sin(ffour_pi * s1);
|
|
|
|
flt_t sf2 = fsf_coeff2 * sin(ftwo_pi * s2);
|
|
sf2 += fsf_coeff3 * sin(ffour_pi * s2);
|
|
|
|
flt_t sf3 = fsf_coeff4 * sin(ftwo_pi * s3);
|
|
sf3 += fsf_coeff5 * sin(ffour_pi * s3);
|
|
for (int k = 0; k < nsplit; k++) {
|
|
const flt_t lj = B[nsplit*type + k];
|
|
const flt_t twoljsq = lj*lj * B[k] * 2;
|
|
flt_t sf = sf1*twoljsq;
|
|
f[i][0] += lj * ekx_tot[k] - sf;
|
|
sf = sf2*twoljsq;
|
|
f[i][1] += lj * eky_tot[k] - sf;
|
|
sf = sf3*twoljsq;
|
|
if (slabflag != 2) f[i][2] += lj * ekz_tot[k] - sf;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
precompute rho coefficients as a lookup table to save time in make_rho
|
|
and fieldforce. Instead of doing this polynomial for every atom 6 times
|
|
per time step, precompute it for some number of points.
|
|
------------------------------------------------------------------------- */
|
|
|
|
void PPPMDispIntel::precompute_rho()
|
|
{
|
|
half_rho_scale = (rho_points - 1.)/2.;
|
|
half_rho_scale_plus = half_rho_scale + 0.5;
|
|
|
|
if (function[0]) {
|
|
for (int i = 0; i < rho_points; i++) {
|
|
FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k=nlower; k<=nupper;k++) {
|
|
FFT_SCALAR r1 = ZEROF;
|
|
for (int l=order-1; l>=0; l--) {
|
|
r1 = rho_coeff[l][k] + r1*dx;
|
|
}
|
|
rho_lookup[i][k-nlower] = r1;
|
|
}
|
|
for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
|
rho_lookup[i][k] = 0;
|
|
}
|
|
if (differentiation_flag == 1) {
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k=nlower; k<=nupper;k++) {
|
|
FFT_SCALAR r1 = ZEROF;
|
|
for (int l=order-2; l>=0; l--) {
|
|
r1 = drho_coeff[l][k] + r1*dx;
|
|
}
|
|
drho_lookup[i][k-nlower] = r1;
|
|
}
|
|
for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
|
drho_lookup[i][k] = 0;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (function[1]+function[2]+function[3]) {
|
|
for (int i = 0; i < rho_points; i++) {
|
|
FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k=nlower_6; k<=nupper_6;k++) {
|
|
FFT_SCALAR r1 = ZEROF;
|
|
for (int l=order_6-1; l>=0; l--) {
|
|
r1 = rho_coeff_6[l][k] + r1*dx;
|
|
}
|
|
rho6_lookup[i][k-nlower_6] = r1;
|
|
}
|
|
for (int k = nupper_6-nlower_6+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
|
rho6_lookup[i][k] = 0;
|
|
}
|
|
if (differentiation_flag == 1) {
|
|
#if defined(LMP_SIMD_COMPILER)
|
|
#if defined(USE_OMP_SIMD)
|
|
#pragma omp simd
|
|
#else
|
|
#pragma simd
|
|
#endif
|
|
#endif
|
|
for (int k=nlower_6; k<=nupper_6;k++) {
|
|
FFT_SCALAR r1 = ZEROF;
|
|
for (int l=order_6-2; l>=0; l--) {
|
|
r1 = drho_coeff_6[l][k] + r1*dx;
|
|
}
|
|
drho6_lookup[i][k-nlower_6] = r1;
|
|
}
|
|
for (int k = nupper_6-nlower_6+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
|
drho6_lookup[i][k] = 0;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
Returns 0 if Intel optimizations for PPPM ignored due to offload
|
|
------------------------------------------------------------------------- */
|
|
|
|
#ifdef _LMP_INTEL_OFFLOAD
|
|
int PPPMDispIntel::use_base() {
|
|
return _use_base;
|
|
}
|
|
#endif
|