/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#include <stdio.h>
#include <time.h>
#define MY_PREFIX neighbor
#define IncludeCommonNeigh
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "cuda_wrapper_cu.h"

#define _cutneighsq     MY_AP(cutneighsq)
#define _ex_type     	MY_AP(ex_type)
#define _nex_type     	MY_AP(nex_type)
#define _ex1_bit     	MY_AP(ex1_bit)
#define _ex2_bit     	MY_AP(ex2_bit)
#define _nex_group     	MY_AP(nex_group)
#define _ex_mol_bit     MY_AP(ex_mol_bit)
#define _nex_mol     	MY_AP(nex_mol)
__device__ __constant__ CUDA_CFLOAT* _cutneighsq;
__device__ __constant__ int* _ex_type;
__device__ __constant__ int _nex_type;
__device__ __constant__ int* _ex1_bit;
__device__ __constant__ int* _ex2_bit;
__device__ __constant__ int _nex_group;
__device__ __constant__ int* _ex_mol_bit;
__device__ __constant__ int _nex_mol;

#include "neighbor_cu.h"
#include "neighbor_kernel.cu"

void Cuda_Neighbor_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
  CUT_CHECK_ERROR("Cuda_PairLJCutCuda: before updateBuffer failed");

  int size = (unsigned)(sizeof(int) * 20 + sneighlist->bin_dim[0] * sneighlist->bin_dim[1] * sneighlist->bin_dim[2] * (sizeof(int) + sneighlist->bin_nmax * 3 * sizeof(CUDA_CFLOAT)));

  if(sdata->buffersize < size) {
    MYDBG(printf("Cuda_Neighbor Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)

    if(sdata->buffer != NULL) CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);

    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
  }

  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
  CUT_CHECK_ERROR("Cuda_PairLJCutCuda: updateBuffer failed");
}

int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
  if(sdata->buffer_new)
    Cuda_Neighbor_UpdateBuffer(sdata, sneighlist);

  // initialize only on first call
  CUDA_CFLOAT rez_bin_size[3] = {
    (1.0 * sneighlist->bin_dim[0] - 4.0) / (sdata->domain.subhi[0] - sdata->domain.sublo[0]),
    (1.0 * sneighlist->bin_dim[1] - 4.0) / (sdata->domain.subhi[1] - sdata->domain.sublo[1]),
    (1.0 * sneighlist->bin_dim[2] - 4.0) / (sdata->domain.subhi[2] - sdata->domain.sublo[2])
  };

  short init = 0;

  if(! init) {
    init = 0;
    cudaMemcpyToSymbol(MY_AP(x)              , & sdata->atom.x         .dev_data, sizeof(X_CFLOAT*));
    cudaMemcpyToSymbol(MY_AP(nall)         , & sdata->atom.nall                    , sizeof(unsigned));
    cudaMemcpyToSymbol(MY_AP(nmax)           , & sdata->atom.nmax                    , sizeof(unsigned));
    cudaMemcpyToSymbol(MY_AP(sublo)          ,   sdata->domain.sublo                 , sizeof(X_CFLOAT) * 3);
  }


  int3 layout = getgrid(sdata->atom.nall); // sneighlist->inum
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  my_times starttime, endtime;
  my_gettime(CLOCK_REALTIME, &starttime);

  cudaMemset((int*)(sdata->buffer), 0, sizeof(int) * (20 + (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2])) + 3 * sizeof(CUDA_CFLOAT) * (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2]) * (sneighlist->bin_nmax));

  Binning_Kernel <<< grid, threads>>> (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], sneighlist->bin_dim[2], rez_bin_size[0], rez_bin_size[1], rez_bin_size[2]);
  cudaThreadSynchronize();

  my_gettime(CLOCK_REALTIME, &endtime);
  sdata->cuda_timings.neigh_bin +=
    endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;


  int binning_error;
  cudaMemcpy((void*) &binning_error, (void*) sdata->buffer, 1 * sizeof(int), cudaMemcpyDeviceToHost);

  if(binning_error) {
    sneighlist->bin_extraspace += 0.05;
  } else {
    MYDBG(printf("CUDA: binning successful\n");)
  }
  CUT_CHECK_ERROR("Cuda_Binning: binning Kernel execution failed");
  return binning_error;
}

int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
  //Cuda_Neighbor_UpdateBuffer(sdata,sneighlist);
  CUDA_CFLOAT globcutoff = -1.0;

  short init = 0;

  if(! init) {
    init = 1;

    // !! LAMMPS indexes atom types starting with 1 !!

    unsigned cuda_ntypes = sdata->atom.ntypes + 1;

    unsigned nx = sizeof(CUDA_CFLOAT) * cuda_ntypes * cuda_ntypes;

    CUDA_CFLOAT* acutneighsq = (CUDA_CFLOAT*) malloc(nx);
    //printf("Allocate: %i\n",nx);
    sneighlist->cu_cutneighsq = (CUDA_CFLOAT*) CudaWrapper_AllocCudaData(nx);

    if(sneighlist->cutneighsq) {
      int cutoffsdiffer = 0;
      double cutoff0 = sneighlist->cutneighsq[1][1];

      for(int i = 1; i <= sdata->atom.ntypes; ++i) {
        for(int j = 1; j <= sdata->atom.ntypes; ++j) {
          acutneighsq[i * cuda_ntypes + j] = (CUDA_CFLOAT)(sneighlist->cutneighsq[i][j]);

          if((sneighlist->cutneighsq[i][j] - cutoff0) * (sneighlist->cutneighsq[i][j] - cutoff0) > 1e-6) cutoffsdiffer++;
        }
      }

      if(not cutoffsdiffer) globcutoff = (CUDA_CFLOAT) cutoff0;
    } else {
      MYEMUDBG(printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n");)
      return 0;
    }

    int size = 100;

    if(sdata->buffersize < size) {
      MYDBG(printf("Cuda_NeighborBuild Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
      CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
      sdata->buffer = CudaWrapper_AllocCudaData(size);
      sdata->buffersize = size;
      sdata->buffer_new++;
      MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
    }

    CudaWrapper_UploadCudaData(acutneighsq, sneighlist->cu_cutneighsq, nx);
    cudaMemcpyToSymbol(MY_AP(cutneighsq)       , &sneighlist->cu_cutneighsq       , sizeof(CUDA_CFLOAT*));

    cudaMemcpyToSymbol(MY_AP(cuda_ntypes)      , & cuda_ntypes                    , sizeof(unsigned));
    cudaMemcpyToSymbol(MY_AP(special_flag)     , sdata->atom.special_flag         , 4 * sizeof(int));
    cudaMemcpyToSymbol(MY_AP(molecular)        , & sdata->atom.molecular          , sizeof(int));
  }

  cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal), & sneighlist->firstneigh.dim[0]  , sizeof(unsigned));
  //cudaMemcpyToSymbol(MY_AP(firstneigh)       , & sneighlist->firstneigh.dev_data, sizeof(int*)     );
  cudaMemcpyToSymbol(MY_AP(ilist)            , & sneighlist->ilist     .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(inum)             , & sneighlist->inum               , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nlocal)           , & sdata->atom.nlocal             , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nall)             , & sdata->atom.nall            , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(numneigh)         , & sneighlist->numneigh  .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(type)             , & sdata->atom.type      .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(mask)             , & sdata->atom.mask      .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(tag)              , & sdata->atom.tag       .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(special)          , & sdata->atom.special   .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(maxspecial)       , & sdata->atom.maxspecial         , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nspecial)         , & sdata->atom.nspecial  .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(maxneighbors)     , & sneighlist->maxneighbors	 , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(debugdata)        , & sdata->debugdata	 , sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(overlap_comm)     , & sdata->overlap_comm, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(neighbors) 		  , & sneighlist->neighbors.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(ex_type) 		  , & sneighlist->ex_type.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(ex1_bit) 		  , & sneighlist->ex1_bit.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(ex2_bit) 		  , & sneighlist->ex2_bit.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(ex_mol_bit) 	  , & sneighlist->ex_mol_bit.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(nex_type)     	  , & sneighlist->nex_type, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nex_group)     	  , & sneighlist->nex_group, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nex_mol)     	  , & sneighlist->nex_mol, sizeof(int));

  if(sdata->overlap_comm) {
    cudaMemcpyToSymbol(MY_AP(numneigh_border)  , & sneighlist->numneigh_border .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(numneigh_inner)   , & sneighlist->numneigh_inner  .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(neighbors_inner)  , & sneighlist->neighbors_inner .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(ilist_border)     , & sneighlist->ilist_border    .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(inum_border)      , & sneighlist->inum_border     .dev_data, sizeof(int*));
  }

  //dim3 threads(sneighlist->bin_nmax,1,1);
  dim3 threads(MIN(128, sneighlist->bin_nmax), 1, 1);
  dim3 grid(sneighlist->bin_dim[0]*sneighlist->bin_dim[1], sneighlist->bin_dim[2], 1);

  //printf("Configuration: %i %i %i %i %i\n",grid.x,grid.y,threads.x,(sizeof(int)+3*sizeof(X_CFLOAT))*threads.x,sneighlist->bin_nmax);
  int buffer[20];
  buffer[0] = 1;
  buffer[1] = 0;
  CudaWrapper_UploadCudaData(buffer, sdata->buffer, 2 * sizeof(int));
  CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel error");
  //cudaMemset(sdata->debugdata,0,100*sizeof(int));
  unsigned int shared_size = (sizeof(int) + 3 * sizeof(CUDA_CFLOAT)) * threads.x;
  MYDBG(printf("Configuration: %i %i %i %u %i\n", grid.x, grid.y, threads.x, shared_size, sneighlist->bin_nmax);)
  //shared_size=2056;
  my_times starttime, endtime;
  my_gettime(CLOCK_REALTIME, &starttime);
  //for(int i=0;i<100;i++)
  {
    if(sdata->overlap_comm)
      NeighborBuildFullBin_OverlapComm_Kernel <<< grid, threads, shared_size>>>
      (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], globcutoff, sdata->pair.use_block_per_atom);
    else {
      int exclude = sneighlist->nex_mol | sneighlist->nex_group | sneighlist->nex_type;

      if(exclude)
        NeighborBuildFullBin_Kernel<1> <<< grid, threads, shared_size>>>
        (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], globcutoff, sdata->pair.use_block_per_atom, sdata->pair.neighall);
      else
        NeighborBuildFullBin_Kernel<0> <<< grid, threads, shared_size>>>
        (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], globcutoff, sdata->pair.use_block_per_atom, sdata->pair.neighall);
    }
    //NeighborBuildFullBin_Kernel_Restrict<<<grid,threads,(2*sizeof(int)+3*sizeof(X_CFLOAT))*threads.x+sizeof(int)>>>
    //	(sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff);

    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");
    my_gettime(CLOCK_REALTIME, &endtime);
    sdata->cuda_timings.neigh_build +=
      endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
    //dim3 threads,grid;
    CudaWrapper_DownloadCudaData(buffer, sdata->buffer, sizeof(int));

    if(buffer[0] >= 0 && true && sdata->atom.molecular) {
      //printf("Find Special: %i %i\n",sneighlist->inum,sdata->atom.nall);
      my_gettime(CLOCK_REALTIME, &starttime);
      int3 layout = getgrid(sdata->atom.nlocal, 0, 512);
      threads.x = layout.z;
      threads.y = 1;
      threads.z = 1;
      grid.x = layout.x;
      grid.y = layout.y;
      grid.z = 1;
      FindSpecial <<< grid, threads>>>(sdata->pair.use_block_per_atom);
      cudaThreadSynchronize();
      CUT_CHECK_ERROR("Cuda_NeighborBuild: FindSpecial kernel execution failed");
      my_gettime(CLOCK_REALTIME, &endtime);
      sdata->cuda_timings.neigh_special +=
        endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
    }
  }
  //printf("Neightime: %lf\n",sdata->cuda_timings.test1);
  CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");

  //CudaWrapper_DownloadCudaData(buffer, sneighlist->numneigh_border .dev_data, sizeof(int));

  MYDBG(printf("Cuda_NeighborBuildFullBin build neighbor list ... end\n");)
  return buffer[0];
}

int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
  MYDBG(printf("Cuda_NeighborBuildFullNsq build neighbor list ... start\n");)
  // initialize only on first call
  /*static*/ short init = 0;

  if(! init) {
    init = 1;

    // !! LAMMPS indexes atom types starting with 1 !!

    unsigned cuda_ntypes = sdata->atom.ntypes + 1;

    if(cuda_ntypes * cuda_ntypes > CUDA_MAX_TYPES2)
      printf("# CUDA: Cuda_PairLJCutCuda_Init: you need %u types. this is more than %u "
             "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 "
             "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);

    unsigned nx = sizeof(CUDA_CFLOAT) * cuda_ntypes * cuda_ntypes;
    CUDA_CFLOAT* acutneighsq = (CUDA_CFLOAT*) malloc(nx);

    if(sneighlist->cutneighsq) {
      for(int i = 1; i <= sdata->atom.ntypes; ++i) {
        for(int j = 1; j <= sdata->atom.ntypes; ++j) {
          acutneighsq[i * cuda_ntypes + j] = (CUDA_CFLOAT)(sneighlist->cutneighsq[i][j]);
          //printf("CUTOFFS: %i %i %i %e\n",i,j,cuda_ntypes,acutneighsq[i * cuda_ntypes + j]);
        }
      }
    } else {
      MYEMUDBG(printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n");)
      return 0;
    }

    int size = 100;

    if(sdata->buffersize < size) {
      MYDBG(printf("Cuda_NeighborBuild Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
      CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
      sdata->buffer = CudaWrapper_AllocCudaData(size);
      sdata->buffersize = size;
      sdata->buffer_new++;
      MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
    }

    cudaMemcpyToSymbol(MY_AP(buffer)           , & sdata->buffer                  , sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(cuda_ntypes)      , & cuda_ntypes                    , sizeof(unsigned));
    cudaMemcpyToSymbol(MY_AP(cutneighsq)       , acutneighsq                    , nx);
    cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal), & sneighlist->firstneigh.dim[0]  , sizeof(unsigned));
    cudaMemcpyToSymbol(MY_AP(firstneigh)       , & sneighlist->firstneigh.dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(ilist)            , & sneighlist->ilist     .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(inum)             , & sneighlist->inum               , sizeof(int));
    cudaMemcpyToSymbol(MY_AP(nlocal)           , & sdata->atom.nlocal             , sizeof(int));
    cudaMemcpyToSymbol(MY_AP(nall)             , & sdata->atom.nall               , sizeof(int));
    cudaMemcpyToSymbol(MY_AP(nmax)             , & sdata->atom.nmax               , sizeof(int));
    cudaMemcpyToSymbol(MY_AP(numneigh)         , & sneighlist->numneigh  .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(type)             , & sdata->atom.type      .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(x)                , & sdata->atom.x         .dev_data, sizeof(X_CFLOAT*));
    cudaMemcpyToSymbol(MY_AP(maxneighbors)     , & sneighlist->maxneighbors	 , sizeof(int));

    free(acutneighsq);
  }

  int3 layout = getgrid(sdata->atom.nlocal); // sneighlist->inum
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  int return_value = 1;
  CudaWrapper_UploadCudaData(& return_value, sdata->buffer, sizeof(int));

  CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel execution failed");
  NeighborBuildFullNsq_Kernel <<< grid, threads>>> ();
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");

  int buffer[20];
  CudaWrapper_DownloadCudaData(buffer, sdata->buffer, sizeof(int) * 20);
  MYDBG(printf("Cuda_NeighborBuildFullNSQ build neighbor list ... end\n");)
  return return_value = buffer[0];
}