/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.

------------------------------------------------------------------------- */

#ifndef _CUDA_SHARED_H_
#define _CUDA_SHARED_H_

#include "cuda_precision.h"

#define CUDA_MAX_DEBUG_SIZE 1000  // size of the debugdata array (room for that many doubles, or twice as many ints)

struct dev_array {
  void* dev_data;    // pointer to memory address on the cuda device
  unsigned dim[3];   // array dimensions
};
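/* Illustrative sketch (not part of the original header): a dev_array only
   stores a raw device pointer plus its logical dimensions, so host code is
   responsible for allocating the memory and recording the sizes.  A minimal
   way to do that for an n x m table of X_CFLOAT values is shown below.  The
   guard macro CUDA_SHARED_EXAMPLES and the helper name dev_array_alloc_2d
   are assumptions made for this example; error checking is omitted. */
#ifdef CUDA_SHARED_EXAMPLES
#include <cuda_runtime.h>

static inline void dev_array_alloc_2d(dev_array& a, unsigned n, unsigned m)
{
  a.dim[0] = n;      // record the logical dimensions
  a.dim[1] = m;
  a.dim[2] = 0;      // unused third dimension
  cudaMalloc(&a.dev_data, (size_t) n * m * sizeof(X_CFLOAT));
}
#endif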
struct cuda_shared_atom {   // relevant data from the atom class
  dev_array dx;             // accumulated distance for binning settings
  dev_array x;              // position
  dev_array v;              // velocity
  dev_array f;              // force

  dev_array tag;            // global ID number
  dev_array type;           // atom type; there is a ghost type = ntypes (ntypescuda = ntypes + 1)
  dev_array mask;
  dev_array image;
  dev_array q;              // charges
  dev_array mass;           // per-type masses
  dev_array rmass;          // per-atom masses
  dev_array radius;         // per-atom radius
  dev_array density;
  dev_array omega;
  dev_array torque;
  dev_array molecule;

  dev_array special;
  int maxspecial;
  dev_array nspecial;
  int* special_flag;
  int molecular;

  dev_array eatom;          // per-atom energy
  dev_array vatom;          // per-atom virial
  int need_eatom;
  int need_vatom;

  dev_array x_type;         // position + type packed in an X_CFLOAT4 struct
  dev_array v_radius;       // velocity + radius packed in a V_CFLOAT4 struct; currently only used for the granular atom_style
  dev_array omega_rmass;    // omega + rmass packed in a V_CFLOAT4 struct; currently only used for the granular atom_style

  double* mass_host;        // remember per-type host pointer to masses
  //int natoms;             // total # of atoms in system, could be 0
  int nghost;               // # of ghost atoms on this proc
  int nlocal;               // # of owned atoms on this proc
  int nall;                 // total # of owned + ghost atoms on this proc
  int nmax;                 // max # of owned + ghost in arrays on this proc
  int ntypes;
  int q_flag;               // do we have charges?
  int rmass_flag;           // do we have per-atom masses?
  int firstgroup;
  int nfirst;

  int update_nlocal;
  int update_nmax;
  int update_neigh;

  dev_array xhold;          // position at last neighboring
  X_CFLOAT triggerneighsq;  // maximum squared displacement before reneighboring
  int reneigh_flag;         // is reneighboring necessary?
  int maxhold;              // size of xhold
  int dist_check;           // perform distance check for reneighboring

  dev_array binned_id;      // id of each binned atom (not the tag!)
  dev_array binned_idnew;   // new id of each binned atom for sorting; basically puts atom[binned_id[k]] at atom[binned_idnew[k]]
  float bin_extraspace;
  int bin_dim[3];
  int bin_nmax;
  dev_array map_array;
};

struct cuda_shared_pair {   // relevant data from the pair class
  char cudable_force;       // check for (cudable_force != 0)
  X_CFLOAT cut_global;
  X_CFLOAT cut_inner_global;
  X_CFLOAT cut_coul_global;
  double** cut;             // type-type cutoff
  double** cutsq;           // type-type cutoff, squared
  double** cut_inner;       // type-type inner cutoff
  double** cut_coul;        // type-type cutoff for coul
  double** coeff1;          // type-type pair parameters
  double** coeff2;
  double** coeff3;
  double** coeff4;
  double** coeff5;
  double** coeff6;
  double** coeff7;
  double** coeff8;
  double** coeff9;
  double** coeff10;
  double** offset;
  double* special_lj;
  double* special_coul;
  dev_array virial;         // ENERGY_CFLOAT
  dev_array eng_vdwl;       // ENERGY_CFLOAT
  dev_array eng_coul;       // ENERGY_CFLOAT
  X_CFLOAT cut_coulsq_global;
  F_CFLOAT g_ewald, kappa;
  int freeze_group_bit;
  dev_array coeff1_gm;
  dev_array coeff2_gm;
  dev_array coeff3_gm;
  dev_array coeff4_gm;
  dev_array coeff5_gm;
  dev_array coeff6_gm;
  dev_array coeff7_gm;
  dev_array coeff8_gm;
  dev_array coeff9_gm;
  dev_array coeff10_gm;
  int lastgridsize;
  int n_energy_virial;
  int collect_forces_later;
  int use_block_per_atom;
  int override_block_per_atom;
  bool neighall;
};

struct cuda_shared_domain { // relevant data from the domain class
  X_CFLOAT sublo[3];        // orthogonal box -> sub-box bounds on this proc
  X_CFLOAT subhi[3];
  X_CFLOAT boxlo[3];
  X_CFLOAT boxhi[3];
  X_CFLOAT prd[3];
  int periodicity[3];       // xyz periodicity as array
  int triclinic;
  X_CFLOAT xy;
  X_CFLOAT xz;
  X_CFLOAT yz;
  X_CFLOAT boxlo_lamda[3];
  X_CFLOAT boxhi_lamda[3];
  X_CFLOAT prd_lamda[3];
  X_CFLOAT h[6];
  X_CFLOAT h_inv[6];
  V_CFLOAT h_rate[6];
  int update;
};

struct cuda_shared_pppm {
  char cudable_force;
#ifdef FFT_CUFFT
  FFT_CFLOAT* work1;
  FFT_CFLOAT* work2;
  FFT_CFLOAT* work3;
  PPPM_CFLOAT* greensfn;
  PPPM_CFLOAT* fkx;
  PPPM_CFLOAT* fky;
  PPPM_CFLOAT* fkz;
  PPPM_CFLOAT* vg;
#endif
  int* part2grid;
  PPPM_CFLOAT* density_brick;
  int* density_brick_int;
  PPPM_CFLOAT density_intScale;
  PPPM_CFLOAT* vdx_brick;
  PPPM_CFLOAT* vdy_brick;
  PPPM_CFLOAT* vdz_brick;
  PPPM_CFLOAT* density_fft;
  ENERGY_CFLOAT* energy;
  ENERGY_CFLOAT* virial;
  int nxlo_in;
  int nxhi_in;
  int nxlo_out;
  int nxhi_out;
  int nylo_in;
  int nyhi_in;
  int nylo_out;
  int nyhi_out;
  int nzlo_in;
  int nzhi_in;
  int nzlo_out;
  int nzhi_out;
  int nx_pppm;
  int ny_pppm;
  int nz_pppm;
  PPPM_CFLOAT qqrd2e;
  int order;
  // float3 sublo;
  PPPM_CFLOAT* rho_coeff;
  int nmax;
  int nlocal;
  PPPM_CFLOAT* debugdata;
  PPPM_CFLOAT delxinv;
  PPPM_CFLOAT delyinv;
  PPPM_CFLOAT delzinv;
  int nlower;
  int nupper;
  PPPM_CFLOAT shiftone;
  PPPM_CFLOAT3* fH;
};

struct cuda_shared_comm {
  int maxswap;
  int maxlistlength;
  dev_array pbc;
  dev_array slablo;
  dev_array slabhi;
  dev_array multilo;
  dev_array multihi;
  dev_array sendlist;
  int grow_flag;
  int comm_phase;

  int nsend;
  int* nsend_swap;
  int* send_size;
  int* recv_size;
  double** buf_send;
  void** buf_send_dev;
  double** buf_recv;
  void** buf_recv_dev;
  void* buffer;
  int buffer_size;
  double overlap_split_ratio;
};

struct cuda_shared_neighlist { // member of CudaNeighList; has no instance in cuda_shared_data
  int maxlocal;
  int inum;                    // # of I atoms neighbors are stored for
  int inum_border2;
  dev_array inum_border;       // # of atoms which interact with border atoms
  dev_array ilist;             // local indices of I atoms
  dev_array ilist_border;
  dev_array numneigh;
  dev_array numneigh_inner;
  dev_array numneigh_border;
  dev_array firstneigh;
  dev_array neighbors;
  dev_array neighbors_border;
  dev_array neighbors_inner;
  int maxpage;
  dev_array page_pointers;
  dev_array* pages;
  int maxneighbors;
  int neigh_lists_per_page;
  double** cutneighsq;
  CUDA_CFLOAT* cu_cutneighsq;
  int* binned_id;
  int* bin_dim;
  int bin_nmax;
  float bin_extraspace;
  double maxcut;
  dev_array ex_type;
  int nex_type;
  dev_array ex1_bit;
  dev_array ex2_bit;
  int nex_group;
  dev_array ex_mol_bit;
  int nex_mol;
};

struct cuda_compile_settings {  // used to compare compile settings (i.e. precision) of the .cu files and the .cpp files
  int prec_glob;
  int prec_x;
  int prec_v;
  int prec_f;
  int prec_pppm;
  int prec_fft;
  int cufft;
  int arch;
};
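/* Illustrative sketch (not part of the original header): the .cu and .cpp
   translation units are compiled separately and may disagree on precision,
   FFT, or architecture settings, which is what cuda_compile_settings is meant
   to detect.  A field-by-field comparison along the following lines is one
   way to use it; the guard macro CUDA_SHARED_EXAMPLES and the function name
   compile_settings_match are assumptions made for this example. */
#ifdef CUDA_SHARED_EXAMPLES
static inline bool compile_settings_match(const cuda_compile_settings& a,
                                          const cuda_compile_settings& b)
{
  // all fields must agree, otherwise host and device code were built with
  // incompatible settings
  return a.prec_glob == b.prec_glob && a.prec_x   == b.prec_x   &&
         a.prec_v    == b.prec_v    && a.prec_f   == b.prec_f   &&
         a.prec_pppm == b.prec_pppm && a.prec_fft == b.prec_fft &&
         a.cufft     == b.cufft     && a.arch     == b.arch;
}
#endif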
struct cuda_timings_struct {
  // debug
  double test1;
  double test2;
  // transfers
  double transfer_upload_tmp_constr;
  double transfer_download_tmp_deconstr;
  // communication
  double comm_forward_total;
  double comm_forward_mpi_upper;
  double comm_forward_mpi_lower;
  double comm_forward_kernel_pack;
  double comm_forward_kernel_unpack;
  double comm_forward_kernel_self;
  double comm_forward_upload;
  double comm_forward_download;

  double comm_exchange_total;
  double comm_exchange_mpi;
  double comm_exchange_kernel_pack;
  double comm_exchange_kernel_unpack;
  double comm_exchange_kernel_fill;
  double comm_exchange_cpu_pack;
  double comm_exchange_upload;
  double comm_exchange_download;

  double comm_border_total;
  double comm_border_mpi;
  double comm_border_kernel_pack;
  double comm_border_kernel_unpack;
  double comm_border_kernel_self;
  double comm_border_kernel_buildlist;
  double comm_border_upload;
  double comm_border_download;

  // pair forces
  double pair_xtype_conversion;
  double pair_kernel;
  double pair_virial;
  double pair_force_collection;
  // neighbor
  double neigh_bin;
  double neigh_build;
  double neigh_special;
  // PPPM
  double pppm_particle_map;
  double pppm_make_rho;
  double pppm_brick2fft;
  double pppm_poisson;
  double pppm_fillbrick;
  double pppm_fieldforce;
  double pppm_compute;
};

struct cuda_shared_data {  // holds space for all relevant data from the different classes
  void* buffer;            // holds temporary GPU data [data used within subroutines that does not need to stay consistent outside of that routine]
  int buffersize;          // maximum size of buffer
  int buffer_new;          // should be 1 if the pointer to buffer has changed
  void* flag;
  void* debugdata;         // array for easily collecting debug data from the device; class Cuda contains the corresponding cu_debugdata and host array
  cuda_shared_atom atom;
  cuda_shared_pair pair;
  cuda_shared_domain domain;
  cuda_shared_pppm pppm;
  cuda_shared_comm comm;
  cuda_compile_settings compile_settings;
  cuda_timings_struct cuda_timings;
  int exchange_dim;
  int me;                  // MPI rank
  unsigned int datamask;
  int overlap_comm;
};

#endif // #ifndef _CUDA_SHARED_H_