lammps/src/AMOEBA/amoeba_convolution.cpp

// clang-format off
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/ Sandia National Laboratories
   LAMMPS development team: developers@lammps.org

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

#include "amoeba_convolution.h"

#include "comm.h"
#include "fft3d_wrap.h"
#include "grid3d.h"
#include "memory.h"
#include "neighbor.h"
#include "remap_wrap.h"
#include "timer.h"

#include <cstring>

using namespace LAMMPS_NS;

// DEBUG

#define DEBUG_AMOEBA 0
#if DEBUG_AMOEBA
char *labels[7] =
  {(char *) "MPOLE_GRID", (char *) "POLAR_GRID",
   (char *) "POLAR_GRIDC", (char *) "DISP_GRID",
   (char *) "INDUCE_GRID", (char *) "INDUCE_GRIDC"};

enum{GRIDBRICK_OUT,GRIDBRICK_IN,FFT,CFFT1,CFFT2};
#endif
// END DEBUG

enum{MPOLE_GRID,POLAR_GRID,POLAR_GRIDC,DISP_GRID,INDUCE_GRID,INDUCE_GRIDC};

//#define SCALE 1
#define SCALE 0

static constexpr FFT_SCALAR ZEROF = 0.0;

/* ----------------------------------------------------------------------
   partition an FFT grid across processors
   both for a brick and FFT x pencil decomposition
   nx,nz,nz = global FFT grid size
   order = size of stencil in each dimension that maps atoms to grid
   adapted from PPPM::set_grid_local()
------------------------------------------------------------------------- */

AmoebaConvolution::AmoebaConvolution(LAMMPS *lmp, Pair *pair,
                                     int nx_caller, int ny_caller, int nz_caller,
                                     int order_caller, int which_caller) :
  Pointers(lmp)
{
  amoeba = pair;
  nx = nx_caller;
  ny = ny_caller;
  nz = nz_caller;
  order = order_caller;
  which = which_caller;

  flag3d = 1;
  if (which == POLAR_GRIDC || which == INDUCE_GRIDC) flag3d = 0;

  nfft_global = (bigint) nx * ny * nz;

  allocate_grid();
}

/* ----------------------------------------------------------------------
   free all memory
------------------------------------------------------------------------- */

AmoebaConvolution::~AmoebaConvolution()
{
  deallocate_grid();
}

/* ----------------------------------------------------------------------
   subset of FFT grids assigned to each proc may have changed
   called by load balancer when proc subdomains are adjusted
------------------------------------------------------------------------- */

void AmoebaConvolution::reset_grid()
{
  deallocate_grid();
  allocate_grid();
}

/* ----------------------------------------------------------------------
   allocate all local grid data structs: FFT, Grid3d, Remap
------------------------------------------------------------------------- */

void AmoebaConvolution::allocate_grid()
{
  // maxdist = max distance outside of subbox my owned atom may be
  // nlower,nupper = stencil size for mapping particles to FFT grid

  double maxdist = 0.5*neighbor->skin;
  int nlower = -(order-1)/2;
  int nupper = order/2;

  // Grid3d determines my owned + ghost grid cells
  // ghost cell extent depends on maxdist, nlower, nupper

  gc = new Grid3d(lmp,world,nx,ny,nz);
  gc->set_distance(maxdist);
  gc->set_stencil_atom(-nlower,nupper);

  gc->setup_grid(nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                 nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out);

  int nqty = flag3d ? 1 : 2;
  int ngc_buf1,ngc_buf2;

  gc->setup_comm(ngc_buf1,ngc_buf2);
  memory->create(gc_buf1,nqty*ngc_buf1,"amoeba:gc_buf1");
  memory->create(gc_buf2,nqty*ngc_buf2,"amoeba:gc_buf2");

  // x-pencil decomposition of FFT mesh
  // global indices range from 0 to N-1
  // each proc owns entire x-dimension, clumps of columns in y,z dimensions
  // npey_fft,npez_fft = # of procs in y,z dims
  // if nprocs is small enough, proc can own 1 or more entire xy planes,
  //   else proc owns 2d sub-blocks of yz plane
  // me_y,me_z = which proc (0-npe_fft-1) I am in y,z dimensions
  // nlo_fft,nhi_fft = lower/upper limit of the section
  //   of the global FFT mesh that I own in x-pencil decomposition

  int me = comm->me;
  int nprocs = comm->nprocs;

  int npey_fft,npez_fft;
  if (nz >= nprocs) {
    npey_fft = 1;
    npez_fft = nprocs;
  } else procs2grid2d(nprocs,ny,nz,npey_fft,npez_fft);

  int me_y = me % npey_fft;
  int me_z = me / npey_fft;

  nxlo_fft = 0;
  nxhi_fft = nx - 1;
  nylo_fft = me_y*ny/npey_fft;
  nyhi_fft = (me_y+1)*ny/npey_fft - 1;
  nzlo_fft = me_z*nz/npez_fft;
  nzhi_fft = (me_z+1)*nz/npez_fft - 1;

  // FFT grid sizes
  // nbrick_owned = owned grid points in brick decomp
  // nbrick_ghosts = owned + ghost grid points in grid decomp
  // nfft_owned = owned grid points in FFT decomp
  // ngrid_either = max of nbrick_onwed and nfft_owned
  // nfft = total FFT grid points

  nbrick_owned = (nxhi_in-nxlo_in+1) * (nyhi_in-nylo_in+1) *
    (nzhi_in-nzlo_in+1);
  nbrick_ghosts = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
    (nzhi_out-nzlo_out+1);
  nfft_owned = (nxhi_fft-nxlo_fft+1) * (nyhi_fft-nylo_fft+1) *
    (nzhi_fft-nzlo_fft+1);

  ngrid_either = MAX(nbrick_owned,nfft_owned);

  // instantiate FFT and Remap

  int tmp;

  fft1 = new FFT3d(lmp,world,nx,ny,nz,
                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                   1,0,&tmp,0);

  fft2 = new FFT3d(lmp,world,nx,ny,nz,
                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                   nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                   0,0,&tmp,0);

  remap = new Remap(lmp,world,
                    nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                    nqty,0,0,FFT_PRECISION,0);

  // memory allocations

  if (flag3d) {
    memory->create3d_offset(grid_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                            nxlo_out,nxhi_out,"amoeba:grid_brick");
    grid_brick_start = &grid_brick[nzlo_out][nylo_out][nxlo_out];
    cgrid_brick = nullptr;
  } else {
    memory->create4d_offset_last(cgrid_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
                                 nxlo_out,nxhi_out,2,"amoeba:cgrid_brick");
    grid_brick_start = &cgrid_brick[nzlo_out][nylo_out][nxlo_out][0];
    grid_brick = nullptr;
  }

  memory->create(grid_fft,ngrid_either,"amoeba:grid_fft");
  memory->create(cfft,2*ngrid_either,"amoeba:cfft");
  memory->create(remap_buf,nqty*nfft_owned,"amoeba:remap_buf");
}

/* ---------------------------------------------------------------------- */

void AmoebaConvolution::deallocate_grid()
{
  delete gc;

  memory->destroy(gc_buf1);
  memory->destroy(gc_buf2);

  delete fft1;
  delete fft2;
  delete remap;

  memory->destroy3d_offset(grid_brick,nzlo_out,nylo_out,nxlo_out);
  memory->destroy4d_offset_last(cgrid_brick,nzlo_out,nylo_out,nxlo_out);
  memory->destroy(grid_fft);
  memory->destroy(cfft);
  memory->destroy(remap_buf);
}

/* ----------------------------------------------------------------------
   zero brick grid, including ghosts
   can be 3d real or 4d complex array
   return pointer to data in brick grid, caller casts to 3d or 4d
------------------------------------------------------------------------- */

void *AmoebaConvolution::zero()
{
  if (flag3d) return zero_3d();
  return zero_4d();
}

/* ---------------------------------------------------------------------- */

void *AmoebaConvolution::zero_3d()
{
  if (!grid_brick) return nullptr;
  memset(&(grid_brick[nzlo_out][nylo_out][nxlo_out]),0,
         nbrick_ghosts*sizeof(FFT_SCALAR));
  return (void *) grid_brick;
}

/* ---------------------------------------------------------------------- */

void *AmoebaConvolution::zero_4d()
{
  if (!cgrid_brick) return nullptr;
  memset(&(cgrid_brick[nzlo_out][nylo_out][nxlo_out][0]),0,
         2*nbrick_ghosts*sizeof(FFT_SCALAR));
  return (void *) cgrid_brick;
}

/* ----------------------------------------------------------------------
   perform pre-convolution grid operations
   can be 3d real or 4d complex array
   return pointer to complex cfft vector
------------------------------------------------------------------------- */

FFT_SCALAR *AmoebaConvolution::pre_convolution()
{
  if (flag3d) return pre_convolution_3d();
  return pre_convolution_4d();
}

/* ----------------------------------------------------------------------
   perform pre-convolution grid operations for 3d grid_brick array
------------------------------------------------------------------------- */

FFT_SCALAR *AmoebaConvolution::pre_convolution_3d()
{
  int ix,iy,iz,n;

  // reverse comm for 3d brick grid + ghosts

#if DEBUG_AMOEBA
  debug_scalar(GRIDBRICK_OUT,"PRE Convo / PRE Grid3d");
#endif

  gc->reverse_comm(Grid3d::PAIR,amoeba,which,1,sizeof(FFT_SCALAR),
                   gc_buf1,gc_buf2,MPI_FFT_SCALAR);

#if DEBUG_AMOEBA
  debug_scalar(GRIDBRICK_IN,"PRE Convo / POST Grid3d");
  debug_file(GRIDBRICK_IN,"pre.convo.post.grid3d");
#endif

  // copy owned 3d brick grid values to FFT grid

  n = 0;
  for (iz = nzlo_in; iz <= nzhi_in; iz++)
    for (iy = nylo_in; iy <= nyhi_in; iy++)
      for (ix = nxlo_in; ix <= nxhi_in; ix++)
        grid_fft[n++] = grid_brick[iz][iy][ix];

  // remap FFT grid from brick to x pencil partitioning

  remap->perform(grid_fft,grid_fft,remap_buf);

#if DEBUG_AMOEBA
  debug_scalar(FFT,"PRE Convo / POST Remap");
  debug_file(FFT,"pre.convo.post.remap");
#endif

  // copy real values into complex grid

  n = 0;
  for (int i = 0; i < nfft_owned; i++) {
    cfft[n++] = grid_fft[i];
    cfft[n++] = ZEROF;
  }

  double time0,time1;

  if (timer->has_sync()) MPI_Barrier(world);
  time0 = platform::walltime();

  // perform forward FFT

  fft1->compute(cfft,cfft,FFT3d::FORWARD);
  time1 = platform::walltime();

  if (SCALE) {
    FFT_SCALAR scale = 1.0/nfft_global;
    for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
  }

  time_fft += time1 - time0;

#if DEBUG_AMOEBA
  debug_scalar(CFFT1,"PRE Convo / POST FFT");
  debug_file(CFFT1,"pre.convo.post.fft");
#endif
  return cfft;
}

/* ----------------------------------------------------------------------
   perform pre-convolution grid operations for 4d cgrid_brick array
------------------------------------------------------------------------- */

FFT_SCALAR *AmoebaConvolution::pre_convolution_4d()
{
  int ix,iy,iz,n;

  // reverse comm for 4d brick grid + ghosts

#if DEBUG_AMOEBA
  debug_scalar(GRIDBRICK_OUT,"PRE Convo / PRE Grid3d");
#endif

  gc->reverse_comm(Grid3d::PAIR,amoeba,which,2,sizeof(FFT_SCALAR),
                   gc_buf1,gc_buf2,MPI_FFT_SCALAR);

#if DEBUG_AMOEBA
  debug_scalar(GRIDBRICK_IN,"PRE Convo / POST Grid3d");
  debug_file(GRIDBRICK_IN,"pre.convo.post.grid3d");
#endif
  // copy owned 4d brick grid values to FFT grid

  n = 0;
  for (iz = nzlo_in; iz <= nzhi_in; iz++)
    for (iy = nylo_in; iy <= nyhi_in; iy++)
      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
        cfft[n++] = cgrid_brick[iz][iy][ix][0];
        cfft[n++] = cgrid_brick[iz][iy][ix][1];
      }

  // remap FFT grid from brick to x pencil partitioning
  // NOTE: could just setup FFT to start from brick decomp and skip remap

  remap->perform(cfft,cfft,remap_buf);

#if DEBUG_AMOEBA
  debug_scalar(FFT,"PRE Convo / POST Remap");
  debug_file(FFT,"pre.convo.post.remap");
#endif

  double time0,time1;

  if (timer->has_sync()) MPI_Barrier(world);
  time0 = platform::walltime();

  // perform forward FFT

  fft1->compute(cfft,cfft,FFT3d::FORWARD);
  time1 = platform::walltime();

  if (SCALE) {
    FFT_SCALAR scale = 1.0/nfft_global;
    for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
  }

  time_fft += time1  - time0;

#if DEBUG_AMOEBA
  debug_scalar(CFFT1,"PRE Convo / POST FFT");
  debug_file(CFFT1,"pre.convo.post.fft");
#endif
  return cfft;
}

/* ----------------------------------------------------------------------
   perform post-convolution grid operations
   can be 3d real or 4d complex array
   return pointer to data in brick grid, caller casts to 3d or 4d
------------------------------------------------------------------------- */

void *AmoebaConvolution::post_convolution()
{
  if (flag3d) return post_convolution_3d();
  return post_convolution_4d();
}

/* ----------------------------------------------------------------------
   perform post-convolution grid operations for 3d grid_brick array
------------------------------------------------------------------------- */

void *AmoebaConvolution::post_convolution_3d()
{
  int ix,iy,iz,n;

  // perform backward FFT
#if DEBUG_AMOEBA
  debug_scalar(CFFT1,"POST Convo / PRE FFT");
  debug_file(CFFT1,"post.convo.pre.fft");
#endif

  double time0,time1;

  if (timer->has_sync()) MPI_Barrier(world);
  time0 = platform::walltime();

  fft2->compute(cfft,cfft,FFT3d::BACKWARD);
  time1 = platform::walltime();

  time_fft += time1 - time0;

#if DEBUG_AMOEBA
  debug_scalar(CFFT2,"POST Convo / POST FFT");
  debug_file(CFFT2,"post.convo.post.fft");
#endif
  // copy real portion of 1d complex values into 3d real grid

  n = 0;
  for (iz = nzlo_in; iz <= nzhi_in; iz++)
    for (iy = nylo_in; iy <= nyhi_in; iy++)
      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
        grid_brick[iz][iy][ix] = cfft[n];
        n += 2;
      }

  // forward comm to populate ghost grid values

#if DEBUG_AMOEBA
  debug_scalar(GRIDBRICK_IN,"POST Convo / PRE grid3d");
  debug_file(GRIDBRICK_IN,"post.convo.pre.grid3d");
#endif
  gc->forward_comm(Grid3d::PAIR,amoeba,which,1,sizeof(FFT_SCALAR),
                   gc_buf1,gc_buf2,MPI_FFT_SCALAR);

  return (void *) grid_brick;
}

/* ----------------------------------------------------------------------
   perform post-convolution grid operations for 4d cgrid_brick array
------------------------------------------------------------------------- */

void *AmoebaConvolution::post_convolution_4d()
{
  int ix,iy,iz,n;

  // perform backward FFT

#if DEBUG_AMOEBA
  debug_scalar(CFFT1,"POST Convo / PRE FFT");
  debug_file(CFFT1,"post.convo.pre.fft");
#endif

  double time0,time1;

  if (timer->has_sync()) MPI_Barrier(world);
  time0 = platform::walltime();

  fft2->compute(cfft,cfft,FFT3d::BACKWARD);

  time1 = platform::walltime();

  time_fft += time1 - time0;

#if DEBUG_AMOEBA
  debug_scalar(CFFT2,"POST Convo / POST FFT");
  debug_file(CFFT2,"post.convo.post.fft");
#endif
  // copy 1d complex values into 4d complex grid

  n = 0;
  for (iz = nzlo_in; iz <= nzhi_in; iz++)
    for (iy = nylo_in; iy <= nyhi_in; iy++)
      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
        cgrid_brick[iz][iy][ix][0] = cfft[n++];
        cgrid_brick[iz][iy][ix][1] = cfft[n++];
      }

  // forward comm to populate ghost grid values

#if DEBUG_AMOEBA
  debug_scalar(GRIDBRICK_IN,"POST Convo / PRE grid3d");
  debug_file(GRIDBRICK_IN,"post.convo.pre.grid3d");
#endif
  gc->forward_comm(Grid3d::PAIR,amoeba,which,2,sizeof(FFT_SCALAR),
                   gc_buf1,gc_buf2,MPI_FFT_SCALAR);

  return (void *) cgrid_brick;
}

/* ----------------------------------------------------------------------
   map nprocs to NX by NY grid as PX by PY procs - return optimal px,py
   copy of PPPM::procs2grid2d()
------------------------------------------------------------------------- */

void AmoebaConvolution::procs2grid2d(int nprocs, int nx, int ny, int &px, int &py)
{
  // loop thru all possible factorizations of nprocs
  // surf = surface area of largest proc sub-domain
  // innermost if test minimizes surface area and surface/volume ratio

  int bestsurf = 2 * (nx + ny);
  int bestboxx = 0;
  int bestboxy = 0;

  int boxx,boxy,surf,ipx,ipy;

  ipx = 1;
  while (ipx <= nprocs) {
    if (nprocs % ipx == 0) {
      ipy = nprocs/ipx;
      boxx = nx/ipx;
      if (nx % ipx) boxx++;
      boxy = ny/ipy;
      if (ny % ipy) boxy++;
      surf = boxx + boxy;
      if (surf < bestsurf ||
          (surf == bestsurf && boxx*boxy > bestboxx*bestboxy)) {
        bestsurf = surf;
        bestboxx = boxx;
        bestboxy = boxy;
        px = ipx;
        py = ipy;
      }
    }
    ipx++;
  }
}

#if DEBUG_AMOEBA
/* ----------------------------------------------------------------------
   output a scalar value to screen
   array = which array is being summed over
---------------------------------------------------------------------- */

void AmoebaConvolution::debug_scalar(int array, const char *label)
{
  double sum = 0.0;

  if (array == GRIDBRICK_OUT) {
    if (flag3d) {
      for (int iz = nzlo_out; iz <= nzhi_out; iz++)
        for (int iy = nylo_out; iy <= nyhi_out; iy++)
          for (int ix = nxlo_out; ix <= nxhi_out; ix++)
            sum += grid_brick[iz][iy][ix];
    } else {
      for (int iz = nzlo_out; iz <= nzhi_out; iz++)
        for (int iy = nylo_out; iy <= nyhi_out; iy++)
          for (int ix = nxlo_out; ix <= nxhi_out; ix++) {
            sum += cgrid_brick[iz][iy][ix][0];
            sum += cgrid_brick[iz][iy][ix][1];
          }
    }
  }

  if (array == GRIDBRICK_IN) {
    if (flag3d) {
      for (int iz = nzlo_in; iz <= nzhi_in; iz++)
        for (int iy = nylo_in; iy <= nyhi_in; iy++)
          for (int ix = nxlo_in; ix <= nxhi_in; ix++)
            sum += grid_brick[iz][iy][ix];
    } else {
      for (int iz = nzlo_in; iz <= nzhi_in; iz++)
        for (int iy = nylo_in; iy <= nyhi_in; iy++)
          for (int ix = nxlo_in; ix <= nxhi_in; ix++) {
            sum += cgrid_brick[iz][iy][ix][0];
            sum += cgrid_brick[iz][iy][ix][1];
          }
    }
  }

  if (array == FFT) {
    if (flag3d) {
      for (int i = 0; i < nfft_owned; i++)
        sum += grid_fft[i];
    } else {
      for (int i = 0; i < 2*nfft_owned; i++)
        sum += cfft[i];
    }
  }

  if (array == CFFT1) {
    for (int i = 0; i < 2*nfft_owned; i++)
      sum += cfft[i];
  }

  if (array == CFFT2) {
    for (int i = 0; i < 2*nbrick_owned; i++)
      sum += cfft[i];
  }

  /*
  double sumall;
  MPI_Allreduce(&sum,&sumall,1,MPI_DOUBLE,MPI_SUM,world);
  if (comm->me == 0) printf("%s: %s: %12.8g\n",labels[which],label,sumall);
  */
}

/* ----------------------------------------------------------------------
   dump grid values to a file
   array = which array is being output
---------------------------------------------------------------------- */

void AmoebaConvolution::debug_file(int array, const char *label)
{
  FILE *fp;

  int me = comm->me;
  int nprocs = comm->nprocs;

  // open file

  if (me == 0) fp = fopen(fmt::format("tmp.{}.{}", labels[which], label).c_str(), "w");

  // file header
  // ncol = # of columns, including grid cell ID

  bigint ntot = nx * ny * nz;

  int ncol;
  char *columns;

  if (array == CFFT1 || array == CFFT2 || !flag3d) {
    ncol = 3;
    columns = (char *) "id real imag";
  } else {
    ncol = 2;
    columns = (char *) "id value";
  }

  char boundstr[9];          // encoding of boundary flags
  domain->boundary_string(boundstr);

  if (me == 0) {
    fprintf(fp,"ITEM: TIMESTEP\n");
    fprintf(fp,BIGINT_FORMAT "\n",update->ntimestep);
    fprintf(fp,"ITEM: NUMBER OF ATOMS\n");
    fprintf(fp,BIGINT_FORMAT "\n",ntot);
    fprintf(fp,"ITEM: BOX BOUNDS %s\n",boundstr);
    fprintf(fp,"%-1.16e %-1.16e\n",domain->boxlo[0],domain->boxhi[0]);
    fprintf(fp,"%-1.16e %-1.16e\n",domain->boxlo[1],domain->boxhi[1]);
    fprintf(fp,"%-1.16e %-1.16e\n",domain->boxlo[2],domain->boxhi[2]);
    fprintf(fp,"ITEM: ATOMS %s\n",columns);
  }

  // pack my values
  // ngrid = # of grid cells I own

  int ngrid;
  if (array == GRIDBRICK_IN) ngrid = nbrick_owned;
  else if (array == FFT) ngrid = nfft_owned;
  else if (array == CFFT1) ngrid = nfft_owned;
  else if (array == CFFT2) ngrid = nbrick_owned;

  int ngridmax;
  MPI_Allreduce(&ngrid,&ngridmax,1,MPI_INT,MPI_MAX,world);

  double *buf,*buf2;
  memory->create(buf,ncol*ngridmax,"amoeba:buf");
  memory->create(buf2,ncol*ngridmax,"amoeba:buf2");

  ngrid = 0;

  if (array == GRIDBRICK_IN) {
    if (flag3d) {
      for (int iz = nzlo_in; iz <= nzhi_in; iz++)
        for (int iy = nylo_in; iy <= nyhi_in; iy++)
          for (int ix = nxlo_in; ix <= nxhi_in; ix++) {
            int id = iz*ny*nx + iy*nx + ix + 1;
            buf[ncol*ngrid] = id;
            buf[ncol*ngrid+1] = grid_brick[iz][iy][ix];
            ngrid++;
          }
    } else {
      for (int iz = nzlo_in; iz <= nzhi_in; iz++)
        for (int iy = nylo_in; iy <= nyhi_in; iy++)
          for (int ix = nxlo_in; ix <= nxhi_in; ix++) {
            int id = iz*ny*nx + iy*nx + ix + 1;
            buf[ncol*ngrid] = id;
            buf[ncol*ngrid+1] = cgrid_brick[iz][iy][ix][0];
            buf[ncol*ngrid+2] = cgrid_brick[iz][iy][ix][1];
            ngrid++;
          }
    }
  }

  if (array == FFT) {
    if (flag3d) {
      int m = 0;
      for (int iz = nzlo_fft; iz <= nzhi_fft; iz++)
        for (int iy = nylo_fft; iy <= nyhi_fft; iy++)
          for (int ix = nxlo_fft; ix <= nxhi_fft; ix++) {
            int id = iz*ny*nx + iy*nx + ix + 1;
            buf[ncol*ngrid] = id;
            buf[ncol*ngrid+1] = grid_fft[m++];
            ngrid++;
          }
    } else {
      int m = 0;
      for (int iz = nzlo_fft; iz <= nzhi_fft; iz++)
        for (int iy = nylo_fft; iy <= nyhi_fft; iy++)
          for (int ix = nxlo_fft; ix <= nxhi_fft; ix++) {
            int id = iz*ny*nx + iy*nx + ix + 1;
            buf[ncol*ngrid] = id;
            buf[ncol*ngrid+1] = cfft[m++];
            buf[ncol*ngrid+2] = cfft[m++];
            ngrid++;
          }
    }
  }

  if (array == CFFT1) {
    int m = 0;
    for (int iz = nzlo_fft; iz <= nzhi_fft; iz++)
      for (int iy = nylo_fft; iy <= nyhi_fft; iy++)
        for (int ix = nxlo_fft; ix <= nxhi_fft; ix++) {
          int id = iz*ny*nx + iy*nx + ix + 1;
          buf[ncol*ngrid] = id;
          buf[ncol*ngrid+1] = cfft[m++];
          buf[ncol*ngrid+2] = cfft[m++];
          ngrid++;
        }
  }

  if (array == CFFT2) {
    int m = 0;
    for (int iz = nzlo_in; iz <= nzhi_in; iz++)
      for (int iy = nylo_in; iy <= nyhi_in; iy++)
        for (int ix = nxlo_in; ix <= nxhi_in; ix++) {
          int id = iz*ny*nx + iy*nx + ix + 1;
          buf[ncol*ngrid] = id;
          buf[ncol*ngrid+1] = cfft[m++];
          buf[ncol*ngrid+2] = cfft[m++];
          ngrid++;
        }
  }

  // proc 0 outputs values
  // pings other procs, send/recv of their values

  int tmp,nlines;
  MPI_Request request;
  MPI_Status status;

  if (me == 0) {
    for (int iproc = 0; iproc < nprocs; iproc++) {
      if (iproc) {
        MPI_Irecv(buf,ngridmax*ncol,MPI_DOUBLE,iproc,0,world,&request);
        MPI_Send(&tmp,0,MPI_INT,me+iproc,0,world);
        MPI_Wait(&request,&status);
        MPI_Get_count(&status,MPI_DOUBLE,&nlines);
        nlines /= ncol;
      } else nlines = ngrid;

      int n = 0;
      for (int m = 0; m < nlines; m++) {
        if (ncol == 2)
          fprintf(fp,"%d %12.8g\n",(int) buf[n],buf[n+1]);
        else if (ncol == 3)
          fprintf(fp,"%d %12.8g %12.8g\n",(int ) buf[n],buf[n+1],buf[n+2]);
        n += ncol;
      }
    }

  } else {
    MPI_Recv(&tmp,0,MPI_INT,0,0,world,MPI_STATUS_IGNORE);
    MPI_Rsend(buf,ngrid*ncol,MPI_DOUBLE,0,0,world);
  }

  // close file

  if (me == 0) fclose(fp);

  // clean up

  memory->destroy(buf);
  memory->destroy(buf2);
}
#endif