diff --git a/lib/gpu/lj_gpu.cu b/lib/gpu/lj_gpu.cu
deleted file mode 100644
index 6ba89ba842..0000000000
--- a/lib/gpu/lj_gpu.cu
+++ /dev/null
@@ -1,466 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation. Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software. This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
-                         Peng Wang (Nvidia), penwang@nvidia.com
-                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
-#include
-#include
-#include
-#include "cudatimer.h"
-#include "lj_tex.h"
-#include "neigh.h"
-#include "cell.h"
-#include "lj_gpu_kernel.h"
-
-#ifdef WINDLL
-#define EXTERN extern "C" __declspec(dllexport)
-#else
-#define EXTERN
-#endif
-
-static float h_boxlo[3], h_boxhi[3];
-static float cell_size;
-static float *energy = NULL, *d_energy = NULL;
-static float3 *d_force = NULL, *f_temp = NULL, *v_temp = NULL, *d_virial = NULL;
-static float4 *d_pos = NULL, *temp_pos = NULL;
-static int *d_type = NULL;
-static int ncellx, ncelly, ncellz;
-
-static neigh_list_gpu d_neigh_list;
-static cell_list_gpu d_cell_list;
-
-#define TIMING(x)
-
-// ---------------------------------------------------------------------------
-// Return string with GPU info
-// ---------------------------------------------------------------------------
-EXTERN void lj_gpu_name(const int id, const int max_nbors, char * name)
-{
-  struct cudaDeviceProp prop;
-  CUDA_SAFE_CALL( cudaGetDeviceProperties(&prop, id) );
-#ifdef _WIN32
-  strcpy_s(name, strlen(prop.name)+1, prop.name);
-#else
-  strncpy(name, prop.name, strlen(prop.name)+1);
-#endif
-}
-
-// ---------------------------------------------------------------------------
-// Allocate memory on host and device and copy constants to device
-// ---------------------------------------------------------------------------
-EXTERN bool lj_gpu_init(int &ij_size, const int ntypes,
-                        double **cutsq, double **sigma,
-                        double **epsilon, double **host_lj1, double **host_lj2,
-                        double **host_lj3, double **host_lj4, double **offset,
-                        double *special_lj, double *boxlo, double *boxhi,
-                        double cellsize, double skin,
-                        const int max_nbors, const int gpu_id)
-{
-  int num_devices;
-
-  /* get device count */
-  CUDA_SAFE_CALL( cudaGetDeviceCount(&num_devices) );
-  if (num_devices == 0) {
-    printf("NO CUDA-capable GPU detected.\n");
-    exit(1);
-  }
-
-  if (gpu_id > num_devices) {
-    printf("gpu_id %d is larger than the number of GPUs %d\n",
-           gpu_id, num_devices);
-    exit(1);
-  }
-
-  /* set CUDA device to the specified GPU */
-  cudaThreadExit();
-  CUDA_SAFE_CALL( cudaSetDevice(gpu_id) );
-
-  ij_size=0;
-
-  cell_size = cellsize;
-  ncellx = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
-  ncelly = ceil(((boxhi[1] - boxlo[1]) + 2.0*cell_size) / cell_size);
-  ncellz = ceil(((boxhi[2] - boxlo[2]) + 2.0*cell_size) / cell_size);
-
-  for (int i = 0; i < 3; i++) {
-    h_boxhi[i] = boxhi[i];
-    h_boxlo[i] = boxlo[i];
-  }
-
-  init_force_const(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset);
-
-  init_cell_list_const(cellsize, skin, boxlo, boxhi);
-
-  return true;
-}
-
-// ---------------------------------------------------------------------------
-// Clear memory on host and device
-// ---------------------------------------------------------------------------
-EXTERN void lj_gpu_clear() {
-
-  free(energy);
-  free(v_temp);
-  CUDA_SAFE_CALL( cudaFreeHost(f_temp) );
-  if (d_force) CUDA_SAFE_CALL( cudaFree(d_force) );
-  if (d_energy) CUDA_SAFE_CALL( cudaFree(d_energy) );
-  if (d_virial) CUDA_SAFE_CALL( cudaFree(d_virial) );
-  if (d_pos) CUDA_SAFE_CALL( cudaFree(d_pos) );
-  if (d_type) CUDA_SAFE_CALL( cudaFree(d_type) );
-  if (temp_pos) CUDA_SAFE_CALL( cudaFreeHost(temp_pos) );
-  clear_neigh_list_gpu(d_neigh_list);
-  clear_cell_list_gpu(d_cell_list);
-
-  if (useCache) {
-    unbind_pos();
-    unbind_type();
-  }
-
-
-  //LJMF.clear();
-}
-
-
-template
-double _lj_gpu_neigh(double **force, double *virial,
-                     double **host_x, int *host_type, const int inum,
-                     const int nall, const int ago, const bool eflag, const bool vflag,
-                     const double *boxlo, const double *boxhi)
-{
-
-  double evdwl=0.0;
-
-  static int first_call = 1;
-
-  TIMING( static CUDATimer cuTimer );
-  TIMING( static CTimer cTimer );
-  TIMING( static CTimer cTimer2 );
-
-  double *atom_pos = host_x[0];
-
-  static int szTailList = inum*32;
-
-  TIMING( cTimer.Start() );
-  TIMING( cTimer2.Start() );
-
-  /* MPI communication just happened, reallocate space using new inum & nall
-     FIXME: this is costly: ~ total kernel time! Use a DIY GPU memory allocator.*/
-
-  if (first_call || ago == 0) {
-
-    if (!first_call) {
-      if (useCache) {
-        unbind_pos();
-        unbind_type();
-      }
-
-      CUDA_SAFE_CALL( cudaFree(d_force) );
-      CUDA_SAFE_CALL( cudaFree(d_energy) );
-      CUDA_SAFE_CALL( cudaFree(d_virial) );
-      CUDA_SAFE_CALL( cudaFree(d_pos) );
-      CUDA_SAFE_CALL( cudaFree(d_type) );
-
-      clear_neigh_list_gpu(d_neigh_list);
-
-      CUDA_SAFE_CALL( cudaFreeHost(f_temp) );
-      CUDA_SAFE_CALL( cudaFreeHost(temp_pos) );
-
-      free(energy);
-      free(v_temp);
-    }
-
-    CUDA_SAFE_CALL( cudaMalloc((void**)&d_force, inum*sizeof(float3)) );
-    CUDA_SAFE_CALL( cudaMalloc((void**)&d_energy, inum*sizeof(float)) );
-    CUDA_SAFE_CALL( cudaMalloc((void**)&d_virial, inum*3*sizeof(float3)) );
-    CUDA_SAFE_CALL( cudaMalloc((void**)&d_pos, nall*sizeof(float4)) );
-    CUDA_SAFE_CALL( cudaMalloc((void**)&d_type, nall*sizeof(int)) );
-
-    init_neigh_list_gpu(d_neigh_list, inum, NEIGH_BIN_SIZE, szTailList);
-
-    CUDA_SAFE_CALL( cudaMallocHost((void**)&temp_pos, nall*sizeof(float4)) );
-    CUDA_SAFE_CALL( cudaMallocHost((void**)&f_temp, inum*sizeof(float3)) );
-
-    energy = (float*) malloc(inum*sizeof(float));
-    v_temp = (float3*)malloc(inum*2*sizeof(float3));
-
-    if (useCache) {
-      bind_pos(d_pos, nall);
-      bind_type(d_type, nall);
-    }
-
-    first_call = 0;
-    CUDA_SAFE_CALL( cudaThreadSynchronize() );
-    CUDA_SAFE_CALL( cudaGetLastError() );
-    CUDA_SAFE_CALL( cudaMemcpy(d_type, host_type, nall*sizeof(int),
-                               cudaMemcpyHostToDevice) );
-
-  }
-
-  TIMING( static double mallocTime = 0. );
-  TIMING( mallocTime += cTimer2.GetET() );
-  TIMING( printf("malloc time = %f ms\n", mallocTime*1e3) );
-
-  TIMING( cTimer2.Start() );
-  for (int i = 0; i < 3*nall; i+=3) {
-    temp_pos[i/3] = make_float4(atom_pos[i], atom_pos[i+1], atom_pos[i+2], 0.f);
-  }
-
-  TIMING( static double copyTime = 0. );
-  TIMING( copyTime += cTimer2.GetET() );
-  TIMING( printf("position copy time = %f ms\n", copyTime*1e3) );
-
-
-  TIMING( cTimer2.Start() );
-  CUDA_SAFE_CALL( cudaMemcpy(d_pos, temp_pos, nall*sizeof(float4), cudaMemcpyHostToDevice) );
-
-  TIMING( static double h2dTime = 0. );
-  TIMING( h2dTime += cTimer2.GetET() );
-  TIMING( printf("h2d copy time = %f ms\n", h2dTime*1e3) );
-
-  TIMING( cTimer2.Start() );
-  if (ago == 0) {
-    build_neigh_list_gpu(d_pos,
-                         d_neigh_list,
-                         h_boxlo, h_boxhi, cell_size,
-                         inum, nall);
-  }
-  TIMING( static double neighTime = 0. );
-  TIMING( neighTime += cTimer2.GetET() );
-  TIMING( printf("Neigh List time = %f ms\n", neighTime*1e3) );
-
-  TIMING( cTimer2.Start() );
-  calc_lj_neigh_gpu(d_force, d_energy, d_virial,
-                    d_pos, d_type,
-                    d_neigh_list,
-                    inum, nall,
-                    eflag, vflag);
-  TIMING( static double forceTime = 0. );
-  TIMING( forceTime += cTimer2.GetET() );
-  TIMING( printf("Force time = %f ms\n", forceTime*1e3) );
-  TIMING( printf("GPU kernel time = %f ms\n", (forceTime + neighTime)*1e3) );
-
-
-  TIMING( cTimer2.Start() );
-  CUDA_SAFE_CALL( cudaMemcpy(f_temp, d_force, inum*sizeof(float3), cudaMemcpyDeviceToHost) );
-  TIMING( static double d2hTime = 0. );
-  TIMING( d2hTime += cTimer2.GetET() );
-  TIMING( printf("d2h copy time = %f ms\n", d2hTime*1e3) );
-  TIMING( printf("GPU-CPU data transfer time = %f ms\n", (h2dTime+d2hTime)*1e3) );
-
-  TIMING( cTimer2.Start() );
-
-  for (int i = 0; i < inum; i++) {
-    force[i][0] += f_temp[i].x;
-    force[i][1] += f_temp[i].y;
-    force[i][2] += f_temp[i].z;
-  }
-
-  if (eflag) {
-    CUDA_SAFE_CALL( cudaMemcpy(energy, d_energy,
-                               inum*sizeof(float), cudaMemcpyDeviceToHost) );
-    for (int i = 0; i < inum; i++) {
-      evdwl += energy[i];
-    }
-    evdwl *= 0.5f;
-  }
-
-  if (vflag) {
-    CUDA_SAFE_CALL( cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3),
-                               cudaMemcpyDeviceToHost) );
-    for (int i = 0; i < inum; i++) {
-      virial[0] += v_temp[2*i].x;
-      virial[1] += v_temp[2*i].y;
-      virial[2] += v_temp[2*i].z;
-      virial[3] += v_temp[2*i+1].x;
-      virial[4] += v_temp[2*i+1].y;
-      virial[5] += v_temp[2*i+1].z;
-    }
-    for (int i = 0; i < 6; i++)
-      virial[i] *= 0.5f;
-  }
-
-
-  TIMING( static double postTime = 0. );
-  TIMING( postTime += cTimer2.GetET() );
-  TIMING( printf("postprocess Time = %f ms\n", postTime*1e3) );
-  TIMING( printf("Data process time = %f ms\n", (postTime+copyTime)*1e3) );
-
-  TIMING( static double totalTime = 0. );
-  TIMING( totalTime += cTimer.GetET() );
-  TIMING( printf("lj_gpu time = %f ms\n", totalTime*1e3) );
-
-  return evdwl;
-
-}
-
-EXTERN double lj_gpu_neigh(double **force, double *virial,
-                           double **host_x, int *host_type,
-                           const int inum, const int nall,
-                           const int ago, const bool eflag, const bool vflag,
-                           const double *boxlo, const double *boxhi)
-{
-  return _lj_gpu_neigh(force, virial,
-                       host_x, host_type, inum, nall,
-                       ago, eflag, vflag, boxlo, boxhi);
-}
-
-
-template
-double _lj_gpu_cell(double **force, double *virial,
-                    double **host_x, int *host_type, const int inum,
-                    const int nall, const int ago,
-                    const bool eflag, const bool vflag,
-                    const double *boxlo, const double *boxhi)
-{
-
-  double evdwl=0.0;
-
-  static int ncell = ncellx*ncelly*ncellz;
-
-  static int first_call = 1;
-
-  // allocate memory on CPU and GPU
-  if (first_call || ago == 0) {
-    if (!first_call) {
-      if (useCache) {
-        unbind_pos();
-        unbind_type();
-      }
-
-      free(energy);
-      free(v_temp);
-
-      CUDA_SAFE_CALL( cudaFree(d_force) );
-      CUDA_SAFE_CALL( cudaFree(d_energy) );
-      CUDA_SAFE_CALL( cudaFree(d_virial) );
-
-      CUDA_SAFE_CALL( cudaFree(d_pos) );
-      CUDA_SAFE_CALL( cudaFree(d_type) );
-      CUDA_SAFE_CALL( cudaFreeHost(f_temp) );
-      CUDA_SAFE_CALL( cudaFreeHost(temp_pos) );
-
-      clear_cell_list_gpu(d_cell_list);
-    }
-
-    energy = (float*) malloc(inum*sizeof(float));
-    v_temp = (float3*)malloc(inum*2*sizeof(float3));
-
-
-    cudaMalloc((void**)&d_force, inum*sizeof(float3));
-    cudaMalloc((void**)&d_energy, inum*sizeof(float));
-    cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
-
-    CUDA_SAFE_CALL( cudaMalloc((void**)&d_pos, nall*sizeof(float4)) );
-    CUDA_SAFE_CALL( cudaMalloc((void**)&d_type, nall*sizeof(int)) );
-
-    CUDA_SAFE_CALL( cudaMallocHost((void**)&f_temp, inum*sizeof(float3)) );
-    CUDA_SAFE_CALL( cudaMallocHost((void**)&temp_pos, nall*sizeof(float4)) );
-
-    init_cell_list_gpu(d_cell_list, nall, ncell);
-
-    CUDA_SAFE_CALL( cudaMemcpy(d_type, host_type, nall*sizeof(int),
-                               cudaMemcpyHostToDevice) );
-
-    if (useCache) {
-      bind_pos(d_pos, nall);
-      bind_type(d_type, nall);
-    }
-
-    first_call = 0;
-  }
-
-  /* build cell-list on GPU */
-  double *atom_pos = host_x[0];
-  for (int i = 0; i < 3*nall; i+=3) {
-    temp_pos[i/3] = make_float4(atom_pos[i], atom_pos[i+1], atom_pos[i+2], 0.f);
-  }
-  CUDA_SAFE_CALL( cudaMemcpy(d_pos, temp_pos, nall*sizeof(float4),
-                             cudaMemcpyHostToDevice) );
-  if (ago == 0) {
-    build_cell_list_gpu(d_pos, d_cell_list, h_boxlo, h_boxhi,
-                        cell_size, inum, nall);
-  }
-
-  calc_lj_cell_gpu(d_force, d_energy, d_virial,
-                   d_pos, d_type, d_cell_list,
-                   inum, nall, ncellx,
-                   ncelly, ncellz, cell_size,
-                   eflag, vflag);
-
-  CUDA_SAFE_CALL( cudaMemcpy(f_temp, d_force, inum*sizeof(float3),
-                             cudaMemcpyDeviceToHost) );
-
-  for (int i = 0; i < inum; i++) {
-    force[i][0] += f_temp[i].x;
-    force[i][1] += f_temp[i].y;
-    force[i][2] += f_temp[i].z;
-  }
-
-  if (eflag) {
-    CUDA_SAFE_CALL( cudaMemcpy(energy, d_energy,
-                               inum*sizeof(float), cudaMemcpyDeviceToHost) );
-    for (int i = 0; i < inum; i++) {
-      evdwl += energy[i];
-    }
-    evdwl *= 0.5f;
-  }
-
-  if (vflag) {
-    CUDA_SAFE_CALL( cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3),
-                               cudaMemcpyDeviceToHost) );
-    for (int i = 0; i < inum; i++) {
-      virial[0] += v_temp[2*i].x;
-      virial[1] += v_temp[2*i].y;
-      virial[2] += v_temp[2*i].z;
-      virial[3] += v_temp[2*i+1].x;
-      virial[4] += v_temp[2*i+1].y;
-      virial[5] += v_temp[2*i+1].z;
-    }
-    for (int i = 0; i < 6; i++)
-      virial[i] *= 0.5f;
-  }
-
-  return evdwl;
-}
-
-EXTERN double lj_gpu_cell(double **force, double *virial,
-                          double **host_x, int *host_type,
-                          const int inum, const int nall,
-                          const int ago, const bool eflag, const bool vflag,
-                          const double *boxlo, const double *boxhi)
-{
-  return _lj_gpu_cell(force, virial,
-                      host_x, host_type, inum, nall,
-                      ago, eflag, vflag, boxlo, boxhi);
-}
-
-EXTERN void lj_gpu_time() {
-  /* cout.precision(4);
-     cout << "Atom copy: " << LJMF.time_atom.total_seconds() << " s.\n";
-     cout << "Neighbor copy: " << LJMF.time_nbor.total_seconds() << " s.\n";
-     cout << "LJ calc: " << LJMF.time_pair.total_seconds() << " s.\n";*/
-  //cout << "Answer copy: " << LJMF.time_answer.total_seconds() << " s.\n";
-}
-
-EXTERN int lj_gpu_num_devices() {
-  int num_devices;
-  CUDA_SAFE_CALL( cudaGetDeviceCount(&num_devices) );
-  return num_devices;
-}
-
-EXTERN double lj_gpu_bytes() {
-  return 0.0;
-}
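Both code paths in the removed file follow the same host-side staging pattern: pack LAMMPS's flat double-precision position array into a pinned `float4` buffer (one `float4` per atom, `.w` unused), upload it with a single `cudaMemcpy`, run the force kernel, and halve the accumulated per-atom energy and virial because each pair is counted twice. Below is a minimal, self-contained sketch of just the packing/upload step, using only standard CUDA runtime calls; the names `pack_and_upload` and `n_all` are illustrative and not taken from the removed file or its replacement.

```cpp
#include <cuda_runtime.h>
#include <cstdio>

// Sketch only (not the removed LAMMPS code): pack a flat double xyz array
// into a pinned float4 staging buffer, then push it to the device in one copy.
static void pack_and_upload(const double *atom_pos, int n_all,
                            float4 *staging, float4 *d_pos)
{
  // One float4 per atom; .w is unused padding, as in the removed file.
  for (int i = 0; i < 3 * n_all; i += 3)
    staging[i / 3] = make_float4((float)atom_pos[i], (float)atom_pos[i + 1],
                                 (float)atom_pos[i + 2], 0.f);
  cudaMemcpy(d_pos, staging, n_all * sizeof(float4), cudaMemcpyHostToDevice);
}

int main()
{
  const int n_all = 4;
  double atom_pos[3 * n_all] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1};

  float4 *staging = NULL, *d_pos = NULL;
  cudaMallocHost((void **)&staging, n_all * sizeof(float4));  // pinned host buffer
  cudaMalloc((void **)&d_pos, n_all * sizeof(float4));        // device positions

  pack_and_upload(atom_pos, n_all, staging, d_pos);
  printf("uploaded %d positions\n", n_all);

  cudaFreeHost(staging);
  cudaFree(d_pos);
  return 0;
}
```

The pinned (page-locked) staging buffer is what lets the host-to-device transfer run at full bandwidth; the cost of the extra pack loop is why the file's FIXME comment flags host-side reallocation and copying as comparable to the kernel time itself.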