From 621fa7d600d68e8bda629fa99fbb8fc1047a3f20 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Mon, 6 Oct 2014 22:59:05 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12588 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- lib/cuda/Makefile.defaults | 2 +- lib/cuda/atom_vec_cuda.cu | 56 +-- lib/cuda/atom_vec_cuda_kernel.cu | 124 +++--- lib/cuda/comm_cuda.cu | 78 ++-- lib/cuda/comm_cuda_kernel.cu | 68 +-- lib/cuda/compute_temp_cuda.cu | 20 +- lib/cuda/compute_temp_cuda_cu.h | 4 +- lib/cuda/compute_temp_cuda_kernel.cu | 14 +- lib/cuda/compute_temp_partial_cuda.cu | 24 +- lib/cuda/compute_temp_partial_cuda_cu.h | 4 +- lib/cuda/compute_temp_partial_cuda_kernel.cu | 18 +- lib/cuda/crm_cuda_utils.cu | 36 +- lib/cuda/cuda.cu | 12 +- lib/cuda/cuda_common.h | 72 +-- lib/cuda/cuda_pair.cu | 248 +++++------ lib/cuda/cuda_pair_kernel.cu | 304 ++++++------- lib/cuda/cuda_pair_virial_kernel_nc.cu | 10 +- lib/cuda/cuda_precision.h | 135 +++--- lib/cuda/cuda_shared.h | 102 ++--- lib/cuda/domain.cu | 42 +- lib/cuda/domain_kernel.cu | 38 +- lib/cuda/fft3d_cuda.cu | 12 +- lib/cuda/fft3d_cuda_cu.h | 2 +- lib/cuda/fft3d_cuda_kernel.cu | 8 +- lib/cuda/fix_addforce_cuda.cu | 16 +- lib/cuda/fix_addforce_cuda_cu.h | 2 +- lib/cuda/fix_addforce_cuda_kernel.cu | 12 +- lib/cuda/fix_aveforce_cuda.cu | 18 +- lib/cuda/fix_aveforce_cuda_cu.h | 4 +- lib/cuda/fix_aveforce_cuda_kernel.cu | 12 +- lib/cuda/fix_enforce2d_cuda.cu | 4 +- lib/cuda/fix_freeze_cuda.cu | 18 +- lib/cuda/fix_freeze_cuda_cu.h | 2 +- lib/cuda/fix_freeze_cuda_kernel.cu | 10 +- lib/cuda/fix_gravity_cuda.cu | 14 +- lib/cuda/fix_gravity_cuda_cu.h | 2 +- lib/cuda/fix_gravity_cuda_kernel.cu | 4 +- lib/cuda/fix_nh_cuda.cu | 38 +- lib/cuda/fix_nh_cuda_cu.h | 4 +- lib/cuda/fix_nh_cuda_kernel.cu | 60 +-- lib/cuda/fix_nve_cuda.cu | 24 +- lib/cuda/fix_nve_cuda_cu.h | 2 +- lib/cuda/fix_nve_cuda_kernel.cu | 40 +- lib/cuda/fix_set_force_cuda.cu | 16 +- lib/cuda/fix_set_force_cuda_cu.h | 2 +- lib/cuda/fix_set_force_cuda_kernel.cu | 12 +- lib/cuda/fix_shake_cuda.cu | 66 +-- lib/cuda/fix_shake_cuda_cu.h | 4 +- lib/cuda/fix_shake_cuda_kernel.cu | 410 +++++++++--------- lib/cuda/fix_temp_berendsen_cuda.cu | 4 +- lib/cuda/fix_temp_berendsen_cuda_kernel.cu | 2 +- lib/cuda/fix_temp_rescale_cuda.cu | 4 +- lib/cuda/fix_temp_rescale_cuda_kernel.cu | 2 +- lib/cuda/fix_temp_rescale_limit_cuda.cu | 4 +- .../fix_temp_rescale_limit_cuda_kernel.cu | 8 +- lib/cuda/fix_viscous_cuda.cu | 6 +- lib/cuda/fix_viscous_cuda_kernel.cu | 4 +- lib/cuda/neighbor.cu | 40 +- lib/cuda/neighbor_kernel.cu | 94 ++-- lib/cuda/pair_born_coul_long_cuda.cu | 4 +- .../pair_born_coul_long_cuda_kernel_nc.cu | 12 +- lib/cuda/pair_buck_coul_cut_cuda.cu | 4 +- lib/cuda/pair_buck_coul_long_cuda.cu | 4 +- lib/cuda/pair_buck_cuda.cu | 4 +- lib/cuda/pair_buck_cuda_kernel_nc.cu | 12 +- lib/cuda/pair_cg_cmm_coul_cut_cuda.cu | 4 +- lib/cuda/pair_cg_cmm_coul_debye_cuda.cu | 4 +- lib/cuda/pair_cg_cmm_coul_long_cuda.cu | 4 +- lib/cuda/pair_cg_cmm_cuda.cu | 4 +- lib/cuda/pair_cg_cmm_cuda_kernel_nc.cu | 24 +- lib/cuda/pair_eam_cuda.cu | 92 ++-- lib/cuda/pair_eam_cuda_kernel_nc.cu | 78 ++-- lib/cuda/pair_gran_hooke_cuda.cu | 58 +-- lib/cuda/pair_gran_hooke_cuda_kernel_nc.cu | 44 +- lib/cuda/pair_lj96_cut_cuda.cu | 4 +- lib/cuda/pair_lj96_cut_cuda_kernel_nc.cu | 10 +- lib/cuda/pair_lj_charmm_coul_charmm_cuda.cu | 14 +- lib/cuda/pair_lj_charmm_coul_charmm_cuda_cu.h | 2 +- ...ir_lj_charmm_coul_charmm_cuda_kernel_nc.cu | 24 +- ...air_lj_charmm_coul_charmm_implicit_cuda.cu | 20 +- 
...r_lj_charmm_coul_charmm_implicit_cuda_cu.h | 2 +- ...rmm_coul_charmm_implicit_cuda_kernel_nc.cu | 10 +- lib/cuda/pair_lj_charmm_coul_long_cuda.cu | 10 +- lib/cuda/pair_lj_charmm_coul_long_cuda_cu.h | 2 +- lib/cuda/pair_lj_class2_coul_cut_cuda.cu | 4 +- lib/cuda/pair_lj_class2_coul_long_cuda.cu | 4 +- lib/cuda/pair_lj_class2_cuda.cu | 6 +- lib/cuda/pair_lj_class2_cuda_kernel_nc.cu | 8 +- lib/cuda/pair_lj_cut_coul_cut_cuda.cu | 4 +- lib/cuda/pair_lj_cut_coul_debye_cuda.cu | 4 +- lib/cuda/pair_lj_cut_coul_long_cuda.cu | 4 +- lib/cuda/pair_lj_cut_cuda.cu | 6 +- lib/cuda/pair_lj_cut_cuda_kernel_nc.cu | 6 +- lib/cuda/pair_lj_cut_experimental_cuda.cu | 6 +- lib/cuda/pair_lj_expand_cuda.cu | 4 +- lib/cuda/pair_lj_expand_cuda_kernel_nc.cu | 14 +- lib/cuda/pair_lj_gromacs_coul_gromacs_cuda.cu | 24 +- .../pair_lj_gromacs_coul_gromacs_cuda_cu.h | 2 +- ..._lj_gromacs_coul_gromacs_cuda_kernel_nc.cu | 12 +- lib/cuda/pair_lj_gromacs_cuda.cu | 4 +- lib/cuda/pair_lj_gromacs_cuda_kernel_nc.cu | 16 +- lib/cuda/pair_lj_sdk_coul_cut_cuda.cu | 4 +- lib/cuda/pair_lj_sdk_coul_debye_cuda.cu | 4 +- lib/cuda/pair_lj_sdk_coul_long_cuda.cu | 4 +- lib/cuda/pair_lj_sdk_cuda.cu | 4 +- lib/cuda/pair_lj_sdk_cuda_kernel_nc.cu | 24 +- lib/cuda/pair_lj_smooth_cuda.cu | 4 +- lib/cuda/pair_lj_smooth_cuda_kernel_nc.cu | 14 +- lib/cuda/pair_morse_coul_long_cuda.cu | 4 +- .../pair_morse_coul_long_cuda_kernel_nc.cu | 12 +- lib/cuda/pair_morse_cuda.cu | 4 +- lib/cuda/pair_morse_cuda_kernel_nc.cu | 8 +- lib/cuda/pair_sw_cuda.cu | 18 +- lib/cuda/pair_sw_cuda_cu.h | 16 +- lib/cuda/pair_sw_cuda_kernel_nc.cu | 76 ++-- lib/cuda/pair_tersoff_cuda.cu | 26 +- lib/cuda/pair_tersoff_cuda_cu.h | 20 +- lib/cuda/pair_tersoff_cuda_kernel_nc.cu | 354 +++++++-------- lib/cuda/pppm_cuda.cu | 174 ++++---- lib/cuda/pppm_cuda_cu.h | 18 +- lib/cuda/pppm_cuda_kernel.cu | 96 ++-- lib/kokkos/Makefile.lammps | 4 + 122 files changed, 1934 insertions(+), 1929 deletions(-) diff --git a/lib/cuda/Makefile.defaults b/lib/cuda/Makefile.defaults index 98628750ec..55a65915dc 100644 --- a/lib/cuda/Makefile.defaults +++ b/lib/cuda/Makefile.defaults @@ -6,7 +6,7 @@ precision ?= 1 verbose ?= 1 #GPU architecture (compute capability): 13, 20, 21, 35 -arch ?= 21 +arch ?= 20 #Using cufft (should not be changed) cufft ?= 1 diff --git a/lib/cuda/atom_vec_cuda.cu b/lib/cuda/atom_vec_cuda.cu index 1bed9b2089..6162889c71 100644 --- a/lib/cuda/atom_vec_cuda.cu +++ b/lib/cuda/atom_vec_cuda.cu @@ -85,15 +85,15 @@ void Cuda_AtomVecCuda_UpdateNmax(cuda_shared_data* sdata) { cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(image) , & sdata->atom.image.dev_data, sizeof(int*)); - if(data_mask & Q_MASK) cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*)); + 
if(data_mask & Q_MASK) cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_CFLOAT*)); if(data_mask & MOLECULE_MASK) cudaMemcpyToSymbol(MY_AP(molecule) , & sdata->atom.molecule.dev_data, sizeof(int*)); @@ -121,9 +121,9 @@ void Cuda_AtomVecCuda_Init(cuda_shared_data* sdata) cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... post Nmax\n");) - cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd, 3 * sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_AP(sublo) , & sdata->domain.sublo, 3 * sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_AP(subhi) , & sdata->domain.subhi, 3 * sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd, 3 * sizeof(X_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(sublo) , & sdata->domain.sublo, 3 * sizeof(X_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(subhi) , & sdata->domain.subhi, 3 * sizeof(X_CFLOAT)); cudaMemcpyToSymbol(MY_AP(flag) , & sdata->flag, sizeof(int*)); cudaThreadSynchronize(); MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... end\n");) @@ -143,14 +143,14 @@ int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* b cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); int n_data_items = AtomVecCuda_CountDataItems(data_mask); - int size = (n * n_data_items) * sizeof(X_FLOAT); + int size = (n * n_data_items) * sizeof(X_CFLOAT); if(sdata->buffer_new or (size > sdata->buffersize)) Cuda_AtomVecCuda_UpdateBuffer(sdata, size); - X_FLOAT dx = 0.0; - X_FLOAT dy = 0.0; - X_FLOAT dz = 0.0; + X_CFLOAT dx = 0.0; + X_CFLOAT dy = 0.0; + X_CFLOAT dz = 0.0; if(pbc_flag != 0) { if(sdata->domain.triclinic == 0) { @@ -185,8 +185,8 @@ int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* b CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed"); if(not sdata->overlap_comm) - cudaMemcpy(buf_send, sdata->buffer, n* n_data_items* sizeof(X_FLOAT), cudaMemcpyDeviceToHost); - //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + cudaMemcpy(buf_send, sdata->buffer, n* n_data_items* sizeof(X_CFLOAT), cudaMemcpyDeviceToHost); + //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_CFLOAT), cudaMemcpyDeviceToHost); my_gettime(CLOCK_REALTIME, &time1); sdata->cuda_timings.comm_forward_download += @@ -216,16 +216,16 @@ int Cuda_AtomVecCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, in cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); int n_data_items = AtomVecCuda_CountDataItems(data_mask); - int size = (n * n_data_items) * sizeof(X_FLOAT); + int size = (n * n_data_items) * sizeof(X_CFLOAT); if(sdata->buffer_new or (size > sdata->buffersize)) Cuda_AtomVecCuda_UpdateBuffer(sdata, size); static int count = -1; count++; - X_FLOAT dx = 0.0; - X_FLOAT dy = 0.0; - X_FLOAT dz = 0.0; + X_CFLOAT dx = 0.0; + X_CFLOAT dy = 0.0; + X_CFLOAT dz = 0.0; if(pbc_flag != 0) { if(sdata->domain.triclinic == 0) { @@ -276,7 +276,7 @@ void Cuda_AtomVecCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); int n_data_items = AtomVecCuda_CountDataItems(data_mask); - int size = (n * n_data_items) * sizeof(X_FLOAT); + int size = (n * n_data_items) * sizeof(X_CFLOAT); if(sdata->buffer_new or (size > sdata->buffersize)) Cuda_AtomVecCuda_UpdateBuffer(sdata, size); @@ -289,7 +289,7 @@ void Cuda_AtomVecCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void my_gettime(CLOCK_REALTIME, &time1); 
if(not sdata->overlap_comm || iswap < 0) - cudaMemcpy(sdata->buffer, (void*)buf_recv, n_data_items * n * sizeof(X_FLOAT), cudaMemcpyHostToDevice); + cudaMemcpy(sdata->buffer, (void*)buf_recv, n_data_items * n * sizeof(X_CFLOAT), cudaMemcpyHostToDevice); my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.comm_forward_upload += @@ -463,14 +463,14 @@ int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, v int n_data_items = AtomVecCuda_CountDataItems(data_mask); - int size = nsend * n_data_items * sizeof(X_FLOAT); + int size = nsend * n_data_items * sizeof(X_CFLOAT); if(sdata->buffer_new or (size > sdata->buffersize)) Cuda_AtomVecCuda_UpdateBuffer(sdata, size); - X_FLOAT dx = 0.0; - X_FLOAT dy = 0.0; - X_FLOAT dz = 0.0; + X_CFLOAT dx = 0.0; + X_CFLOAT dy = 0.0; + X_CFLOAT dz = 0.0; if(pbc_flag != 0) { if(sdata->domain.triclinic == 0) { @@ -522,14 +522,14 @@ int Cuda_AtomVecCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int n_data_items = AtomVecCuda_CountDataItems(data_mask); - int size = n * n_data_items * sizeof(X_FLOAT); + int size = n * n_data_items * sizeof(X_CFLOAT); if(sdata->buffer_new or (size > sdata->buffersize)) Cuda_AtomVecCuda_UpdateBuffer(sdata, size); - X_FLOAT dx = 0.0; - X_FLOAT dy = 0.0; - X_FLOAT dz = 0.0; + X_CFLOAT dx = 0.0; + X_CFLOAT dy = 0.0; + X_CFLOAT dz = 0.0; if(pbc_flag != 0) { if(sdata->domain.triclinic == 0) { @@ -584,7 +584,7 @@ int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, voi int n_data_items = AtomVecCuda_CountDataItems(data_mask); - int size = n * n_data_items * sizeof(X_FLOAT); + int size = n * n_data_items * sizeof(X_CFLOAT); if(sdata->buffer_new or (size > sdata->buffersize)) Cuda_AtomVecCuda_UpdateBuffer(sdata, size); diff --git a/lib/cuda/atom_vec_cuda_kernel.cu b/lib/cuda/atom_vec_cuda_kernel.cu index 5e2f6a974f..b776892653 100644 --- a/lib/cuda/atom_vec_cuda_kernel.cu +++ b/lib/cuda/atom_vec_cuda_kernel.cu @@ -27,7 +27,7 @@ extern __shared__ int shared[]; template -__global__ void Cuda_AtomVecCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, void* buffer) +__global__ void Cuda_AtomVecCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, void* buffer) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; int* list = sendlist + iswap * maxlistlength; @@ -40,44 +40,44 @@ __global__ void Cuda_AtomVecCuda_PackComm_Kernel(int* sendlist, int n, int maxli int k = 0; if(data_mask & X_MASK) { - ((X_FLOAT*) buffer)[i + k * n] = _x[j] + dx; + ((X_CFLOAT*) buffer)[i + k * n] = _x[j] + dx; k++; - ((X_FLOAT*) buffer)[i + k * n] = _x[j + _nmax] + dy; + ((X_CFLOAT*) buffer)[i + k * n] = _x[j + _nmax] + dy; k++; - ((X_FLOAT*) buffer)[i + k * n] = _x[j + 2 * _nmax] + dz; + ((X_CFLOAT*) buffer)[i + k * n] = _x[j + 2 * _nmax] + dz; k++; } if(data_mask & V_MASK) { - ((X_FLOAT*) buffer)[i + k * n] = _v[j]; + ((X_CFLOAT*) buffer)[i + k * n] = _v[j]; k++; - ((X_FLOAT*) buffer)[i + k * n] = _v[j + _nmax]; + ((X_CFLOAT*) buffer)[i + k * n] = _v[j + _nmax]; k++; - ((X_FLOAT*) buffer)[i + k * n] = _v[j + 2 * _nmax]; + ((X_CFLOAT*) buffer)[i + k * n] = _v[j + 2 * _nmax]; k++; } if(data_mask & OMEGA_MASK) { - ((X_FLOAT*) buffer)[i + k * n] = _omega[j]; + ((X_CFLOAT*) buffer)[i + k * n] = _omega[j]; k++; - ((X_FLOAT*) buffer)[i + k * n] = _omega[j + _nmax]; + ((X_CFLOAT*) buffer)[i + k * n] = _omega[j + _nmax]; k++; - ((X_FLOAT*) buffer)[i + k * n] = _omega[j + 2 * 
_nmax]; + ((X_CFLOAT*) buffer)[i + k * n] = _omega[j + 2 * _nmax]; k++; } - if(data_mask & RADIUS_MASK)((X_FLOAT*) buffer)[i + k * n] = _radius[j]; + if(data_mask & RADIUS_MASK)((X_CFLOAT*) buffer)[i + k * n] = _radius[j]; k++; - if(data_mask & RMASS_MASK)((X_FLOAT*) buffer)[i + k * n] = _rmass[j]; + if(data_mask & RMASS_MASK)((X_CFLOAT*) buffer)[i + k * n] = _rmass[j]; k++; } } template -__global__ void Cuda_AtomVecCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first) +__global__ void Cuda_AtomVecCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; @@ -121,37 +121,37 @@ __global__ void Cuda_AtomVecCuda_UnpackComm_Kernel(int n, int first, void* buffe int k = 0; if(data_mask & X_MASK) { - _x[i + first] = ((X_FLOAT*) buffer)[i + k * n]; + _x[i + first] = ((X_CFLOAT*) buffer)[i + k * n]; k++; - _x[i + first + _nmax] = ((X_FLOAT*) buffer)[i + k * n]; + _x[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + k * n]; k++; - _x[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + k * n]; + _x[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + k * n]; k++; } if(data_mask & V_MASK) { - _v[i + first] = ((X_FLOAT*) buffer)[i + k * n]; + _v[i + first] = ((X_CFLOAT*) buffer)[i + k * n]; k++; - _v[i + first + _nmax] = ((X_FLOAT*) buffer)[i + k * n]; + _v[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + k * n]; k++; - _v[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + k * n]; + _v[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + k * n]; k++; } if(data_mask & OMEGA_MASK) { - _omega[i + first] = ((X_FLOAT*) buffer)[i + k * n]; + _omega[i + first] = ((X_CFLOAT*) buffer)[i + k * n]; k++; - _omega[i + first + _nmax] = ((X_FLOAT*) buffer)[i + k * n]; + _omega[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + k * n]; k++; - _omega[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + k * n]; + _omega[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + k * n]; k++; } - if(data_mask & RADIUS_MASK) _radius[i + first] = ((X_FLOAT*) buffer)[i + k * n]; + if(data_mask & RADIUS_MASK) _radius[i + first] = ((X_CFLOAT*) buffer)[i + k * n]; k++; - if(data_mask & RMASS_MASK) _rmass[i + first] = ((X_FLOAT*) buffer)[i + k * n]; + if(data_mask & RMASS_MASK) _rmass[i + first] = ((X_CFLOAT*) buffer)[i + k * n]; k++; } @@ -163,8 +163,8 @@ __global__ void Cuda_AtomVecCuda_PackExchangeList_Kernel(int n, int dim) double* buf = (double*) _buffer; buf = &buf[1]; - //X_FLOAT lo=slablo[iswap]; - //X_FLOAT hi=slabhi[iswap]; + //X_CFLOAT lo=slablo[iswap]; + //X_CFLOAT hi=slabhi[iswap]; int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; bool add = false; @@ -369,7 +369,7 @@ __global__ void Cuda_AtomVecCuda_UnpackExchange_Kernel(int dim, int nsend, int* } template -__global__ void Cuda_AtomVecCuda_PackBorder_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz) +__global__ void Cuda_AtomVecCuda_PackBorder_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; int* list = sendlist + iswap * maxlistlength; @@ -379,37 +379,37 @@ __global__ void Cuda_AtomVecCuda_PackBorder_Kernel(int* sendlist, int n, int max int m = 0; if(data_mask & X_MASK) { - ((X_FLOAT*) _buffer)[i + (m++)*n] = _x[j] + dx; - ((X_FLOAT*) _buffer)[i + (m++)*n] = _x[j + _nmax] + dy; - ((X_FLOAT*) 
_buffer)[i + (m++)*n] = _x[j + 2 * _nmax] + dz; + ((X_CFLOAT*) _buffer)[i + (m++)*n] = _x[j] + dx; + ((X_CFLOAT*) _buffer)[i + (m++)*n] = _x[j + _nmax] + dy; + ((X_CFLOAT*) _buffer)[i + (m++)*n] = _x[j + 2 * _nmax] + dz; } if(data_mask & V_MASK) { - ((X_FLOAT*) _buffer)[i + (m++)*n] = _v[j]; - ((X_FLOAT*) _buffer)[i + (m++)*n] = _v[j + _nmax]; - ((X_FLOAT*) _buffer)[i + (m++)*n] = _v[j + 2 * _nmax]; + ((X_CFLOAT*) _buffer)[i + (m++)*n] = _v[j]; + ((X_CFLOAT*) _buffer)[i + (m++)*n] = _v[j + _nmax]; + ((X_CFLOAT*) _buffer)[i + (m++)*n] = _v[j + 2 * _nmax]; } - if(data_mask & TAG_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _tag[j]; + if(data_mask & TAG_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _tag[j]; - if(data_mask & TYPE_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _type[j]; + if(data_mask & TYPE_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _type[j]; - if(data_mask & MASK_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _mask[j]; + if(data_mask & MASK_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _mask[j]; - if(data_mask & Q_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _q[j]; + if(data_mask & Q_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _q[j]; - if(data_mask & MOLECULE_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _molecule[j]; + if(data_mask & MOLECULE_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _molecule[j]; - if(data_mask & RADIUS_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _radius[i]; + if(data_mask & RADIUS_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _radius[i]; - if(data_mask & DENSITY_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _density[i]; + if(data_mask & DENSITY_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _density[i]; - if(data_mask & RMASS_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _rmass[i]; + if(data_mask & RMASS_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _rmass[i]; if(data_mask & OMEGA_MASK) { - ((X_FLOAT*) _buffer)[i + (m++)*n] = _omega[i]; - ((X_FLOAT*) _buffer)[i + (m++)*n] = _omega[i + _nmax]; - ((X_FLOAT*) _buffer)[i + (m++)*n] = _omega[i + 2 * _nmax]; + ((X_CFLOAT*) _buffer)[i + (m++)*n] = _omega[i]; + ((X_CFLOAT*) _buffer)[i + (m++)*n] = _omega[i + _nmax]; + ((X_CFLOAT*) _buffer)[i + (m++)*n] = _omega[i + 2 * _nmax]; } } } @@ -417,7 +417,7 @@ __global__ void Cuda_AtomVecCuda_PackBorder_Kernel(int* sendlist, int n, int max template -__global__ void Cuda_AtomVecCuda_PackBorder_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first) +__global__ void Cuda_AtomVecCuda_PackBorder_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; int* list = sendlist + iswap * maxlistlength; @@ -471,37 +471,37 @@ __global__ void Cuda_AtomVecCuda_UnpackBorder_Kernel(int n, int first) int m = 0; if(data_mask & X_MASK) { - _x[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n]; - _x[i + first + _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n]; - _x[i + first + 2 * _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n]; + _x[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n]; + _x[i + first + _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n]; + _x[i + first + 2 * _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n]; } if(data_mask & V_MASK) { - _v[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n]; - _v[i + first + _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n]; - _v[i + first + 2 * _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n]; + _v[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n]; + _v[i + first + _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n]; + _v[i + first + 2 * _nmax] = 
((X_CFLOAT*) _buffer)[i + (m++) * n]; } - if(data_mask & TAG_MASK) _tag[i + first] = static_cast(((X_FLOAT*) _buffer)[i + (m++) * n]); + if(data_mask & TAG_MASK) _tag[i + first] = static_cast(((X_CFLOAT*) _buffer)[i + (m++) * n]); - if(data_mask & TYPE_MASK) _type[i + first] = static_cast(((X_FLOAT*) _buffer)[i + (m++) * n]); + if(data_mask & TYPE_MASK) _type[i + first] = static_cast(((X_CFLOAT*) _buffer)[i + (m++) * n]); - if(data_mask & MASK_MASK) _mask[i + first] = static_cast(((X_FLOAT*) _buffer)[i + (m++) * n]); + if(data_mask & MASK_MASK) _mask[i + first] = static_cast(((X_CFLOAT*) _buffer)[i + (m++) * n]); - if(data_mask & Q_MASK) _q[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n]; + if(data_mask & Q_MASK) _q[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n]; - if(data_mask & MOLECULE_MASK) _molecule[i + first] = static_cast(((X_FLOAT*) _buffer)[i + (m++) * n]); + if(data_mask & MOLECULE_MASK) _molecule[i + first] = static_cast(((X_CFLOAT*) _buffer)[i + (m++) * n]); - if(data_mask & RADIUS_MASK) _radius[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n]; + if(data_mask & RADIUS_MASK) _radius[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n]; - if(data_mask & DENSITY_MASK) _density[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n]; + if(data_mask & DENSITY_MASK) _density[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n]; - if(data_mask & RMASS_MASK) _rmass[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n]; + if(data_mask & RMASS_MASK) _rmass[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n]; if(data_mask & OMEGA_MASK) { - _omega[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n]; - _omega[i + first + _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n]; - _omega[i + first + 2 * _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n]; + _omega[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n]; + _omega[i + first + _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n]; + _omega[i + first + 2 * _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n]; } } else { _flag[0] = 1; diff --git a/lib/cuda/comm_cuda.cu b/lib/cuda/comm_cuda.cu index ec95a8bfeb..36c71984b9 100644 --- a/lib/cuda/comm_cuda.cu +++ b/lib/cuda/comm_cuda.cu @@ -34,7 +34,7 @@ void Cuda_CommCuda_UpdateBuffer(cuda_shared_data* sdata, int n) { - int size = n * 3 * sizeof(X_FLOAT); + int size = n * 3 * sizeof(X_CFLOAT); if(sdata->buffersize < size) { MYDBG(printf("Cuda_ComputeTempCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) @@ -53,9 +53,9 @@ void Cuda_CommCuda_UpdateNmax(cuda_shared_data* sdata) { cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); } @@ -65,7 +65,7 @@ void Cuda_CommCuda_Init(cuda_shared_data* sdata) Cuda_CommCuda_UpdateNmax(sdata); int ntypesp = sdata->atom.ntypes + 1; cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &ntypesp, sizeof(int)); - cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd, 3 * sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd, 3 * sizeof(X_CFLOAT)); 
cudaMemcpyToSymbol(MY_AP(flag) , &sdata->flag, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(debugdata) , &sdata->debugdata, sizeof(int*)); } @@ -81,14 +81,14 @@ int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_ if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - int size = n * 3 * sizeof(X_FLOAT); + int size = n * 3 * sizeof(X_CFLOAT); if(sdata->buffer_new or (size > sdata->buffersize)) Cuda_CommCuda_UpdateBuffer(sdata, n); - X_FLOAT dx = 0.0; - X_FLOAT dy = 0.0; - X_FLOAT dz = 0.0; + X_CFLOAT dx = 0.0; + X_CFLOAT dy = 0.0; + X_CFLOAT dz = 0.0; if(pbc_flag != 0) { if(sdata->domain.triclinic == 0) { @@ -123,8 +123,8 @@ int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_ CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed"); if(not sdata->overlap_comm) - cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_FLOAT), cudaMemcpyDeviceToHost); - //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost); + //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_CFLOAT), cudaMemcpyDeviceToHost); my_gettime(CLOCK_REALTIME, &time1); sdata->cuda_timings.comm_forward_download += @@ -151,14 +151,14 @@ int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* b if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - int size = n * 6 * sizeof(X_FLOAT); + int size = n * 6 * sizeof(X_CFLOAT); if(sdata->buffer_new or (size > sdata->buffersize)) Cuda_CommCuda_UpdateBuffer(sdata, n); - X_FLOAT dx = 0.0; - X_FLOAT dy = 0.0; - X_FLOAT dz = 0.0; + X_CFLOAT dx = 0.0; + X_CFLOAT dy = 0.0; + X_CFLOAT dz = 0.0; if(pbc_flag != 0) { if(sdata->domain.triclinic == 0) { @@ -193,8 +193,8 @@ int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* b CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed"); if(not sdata->overlap_comm) - cudaMemcpy(buf_send, sdata->buffer, n * 6 * sizeof(X_FLOAT), cudaMemcpyDeviceToHost); - //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + cudaMemcpy(buf_send, sdata->buffer, n * 6 * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost); + //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_CFLOAT), cudaMemcpyDeviceToHost); my_gettime(CLOCK_REALTIME, &time1); sdata->cuda_timings.comm_forward_download += @@ -221,16 +221,16 @@ int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int f if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - int size = n * 3 * sizeof(X_FLOAT); + int size = n * 3 * sizeof(X_CFLOAT); if(sdata->buffer_new or (size > sdata->buffersize)) Cuda_CommCuda_UpdateBuffer(sdata, n); static int count = -1; count++; - X_FLOAT dx = 0.0; - X_FLOAT dy = 0.0; - X_FLOAT dz = 0.0; + X_CFLOAT dx = 0.0; + X_CFLOAT dy = 0.0; + X_CFLOAT dz = 0.0; if(pbc_flag != 0) { if(sdata->domain.triclinic == 0) { @@ -278,16 +278,16 @@ int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata, int n, int iswap, in if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - int size = n * 6 * sizeof(X_FLOAT); + int size = n * 6 * sizeof(X_CFLOAT); if(sdata->buffer_new or (size > sdata->buffersize)) Cuda_CommCuda_UpdateBuffer(sdata, n); static int count = -1; count++; - 
X_FLOAT dx = 0.0; - X_FLOAT dy = 0.0; - X_FLOAT dz = 0.0; + X_CFLOAT dx = 0.0; + X_CFLOAT dy = 0.0; + X_CFLOAT dz = 0.0; if(pbc_flag != 0) { if(sdata->domain.triclinic == 0) { @@ -334,7 +334,7 @@ void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* b if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - int size = n * 3 * sizeof(X_FLOAT); + int size = n * 3 * sizeof(X_CFLOAT); if(sdata->buffer_new or (size > sdata->buffersize)) Cuda_CommCuda_UpdateBuffer(sdata, n); @@ -347,7 +347,7 @@ void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* b my_gettime(CLOCK_REALTIME, &time1); if(not sdata->overlap_comm || iswap < 0) - cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_FLOAT), cudaMemcpyHostToDevice); + cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_CFLOAT), cudaMemcpyHostToDevice); my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.comm_forward_upload += @@ -375,7 +375,7 @@ void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata, int n, int first, void if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - int size = n * 6 * sizeof(X_FLOAT); + int size = n * 6 * sizeof(X_CFLOAT); if(sdata->buffer_new or (size > sdata->buffersize)) Cuda_CommCuda_UpdateBuffer(sdata, n); @@ -388,7 +388,7 @@ void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata, int n, int first, void my_gettime(CLOCK_REALTIME, &time1); if(not sdata->overlap_comm || iswap < 0) - cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 6 * sizeof(X_FLOAT), cudaMemcpyHostToDevice); + cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 6 * sizeof(X_CFLOAT), cudaMemcpyHostToDevice); my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.comm_forward_upload += @@ -414,22 +414,22 @@ int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata, int n, int first, void* b if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - int size = n * 3 * sizeof(F_FLOAT); + int size = n * 3 * sizeof(F_CFLOAT); if(sdata->buffer_new or (size > sdata->buffersize)) Cuda_CommCuda_UpdateBuffer(sdata, n); - F_FLOAT* buf = (F_FLOAT*)buf_send; - F_FLOAT* f_dev = (F_FLOAT*)sdata->atom.f.dev_data; + F_CFLOAT* buf = (F_CFLOAT*)buf_send; + F_CFLOAT* f_dev = (F_CFLOAT*)sdata->atom.f.dev_data; f_dev += first; - cudaMemcpy(buf, f_dev, n * sizeof(F_FLOAT), cudaMemcpyDeviceToHost); + cudaMemcpy(buf, f_dev, n * sizeof(F_CFLOAT), cudaMemcpyDeviceToHost); buf += n; f_dev += sdata->atom.nmax; - cudaMemcpy(buf, f_dev, n * sizeof(F_FLOAT), cudaMemcpyDeviceToHost); + cudaMemcpy(buf, f_dev, n * sizeof(F_CFLOAT), cudaMemcpyDeviceToHost); buf += n; f_dev += sdata->atom.nmax; - cudaMemcpy(buf, f_dev, n * sizeof(F_FLOAT), cudaMemcpyDeviceToHost); + cudaMemcpy(buf, f_dev, n * sizeof(F_CFLOAT), cudaMemcpyDeviceToHost); return n * 3; } @@ -442,7 +442,7 @@ void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata, int n, int iswap, void if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - int size = n * 3 * sizeof(F_FLOAT); + int size = n * 3 * sizeof(F_CFLOAT); if(sdata->buffer_new or (size > sdata->buffersize)) Cuda_CommCuda_UpdateBuffer(sdata, n); @@ -468,7 +468,7 @@ void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata, int n, int iswap, if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - int size = n * 3 * sizeof(X_FLOAT); + int size = n * 3 * sizeof(X_CFLOAT); 
if(sdata->buffer_new or (size > sdata->buffersize)) Cuda_CommCuda_UpdateBuffer(sdata, n); @@ -520,9 +520,9 @@ int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata, int bordergroup, int in my_gettime(CLOCK_REALTIME, &time1); if(style == 1) - Cuda_CommCuda_BuildSendlist_Single <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_FLOAT*) sdata->comm.slablo.dev_data, (X_FLOAT*) sdata->comm.slabhi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength); + Cuda_CommCuda_BuildSendlist_Single <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_CFLOAT*) sdata->comm.slablo.dev_data, (X_CFLOAT*) sdata->comm.slabhi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength); else - Cuda_CommCuda_BuildSendlist_Multi <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_FLOAT*) sdata->comm.multilo.dev_data, (X_FLOAT*) sdata->comm.multihi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength); + Cuda_CommCuda_BuildSendlist_Multi <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_CFLOAT*) sdata->comm.multilo.dev_data, (X_CFLOAT*) sdata->comm.multihi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength); cudaThreadSynchronize(); my_gettime(CLOCK_REALTIME, &time2); diff --git a/lib/cuda/comm_cuda_kernel.cu b/lib/cuda/comm_cuda_kernel.cu index f87b3af540..450b5243ae 100644 --- a/lib/cuda/comm_cuda_kernel.cu +++ b/lib/cuda/comm_cuda_kernel.cu @@ -21,7 +21,7 @@ This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ -__global__ void Cuda_CommCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, void* buffer) +__global__ void Cuda_CommCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, void* buffer) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; int* list = sendlist + iswap * maxlistlength; @@ -31,13 +31,13 @@ __global__ void Cuda_CommCuda_PackComm_Kernel(int* sendlist, int n, int maxlistl if(j > _nmax) _flag[0] = 1; - ((X_FLOAT*) buffer)[i] = _x[j] + dx; - ((X_FLOAT*) buffer)[i + 1 * n] = _x[j + _nmax] + dy; - ((X_FLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz; + ((X_CFLOAT*) buffer)[i] = _x[j] + dx; + ((X_CFLOAT*) buffer)[i + 1 * n] = _x[j + _nmax] + dy; + ((X_CFLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz; } } -__global__ void Cuda_CommCuda_PackCommVel_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, void* buffer) +__global__ void Cuda_CommCuda_PackCommVel_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, void* buffer) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; int* list = sendlist + iswap * maxlistlength; @@ -47,16 +47,16 @@ __global__ void Cuda_CommCuda_PackCommVel_Kernel(int* sendlist, int n, int maxli if(j > _nmax) _flag[0] = 1; - ((X_FLOAT*) buffer)[i] = _x[j] + dx; - ((X_FLOAT*) buffer)[i + 1 * n] = _x[j + _nmax] + dy; - ((X_FLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz; - ((X_FLOAT*) buffer)[i + 3 * n] = _v[j]; - ((X_FLOAT*) buffer)[i + 4 * n] = _v[j + _nmax]; - ((X_FLOAT*) buffer)[i + 5 * n] = _v[j + 2 * 
_nmax]; + ((X_CFLOAT*) buffer)[i] = _x[j] + dx; + ((X_CFLOAT*) buffer)[i + 1 * n] = _x[j + _nmax] + dy; + ((X_CFLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz; + ((X_CFLOAT*) buffer)[i + 3 * n] = _v[j]; + ((X_CFLOAT*) buffer)[i + 4 * n] = _v[j + _nmax]; + ((X_CFLOAT*) buffer)[i + 5 * n] = _v[j + 2 * _nmax]; } } -__global__ void Cuda_CommCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first) +__global__ void Cuda_CommCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; @@ -72,7 +72,7 @@ __global__ void Cuda_CommCuda_PackComm_Self_Kernel(int* sendlist, int n, int max } } -__global__ void Cuda_CommCuda_PackCommVel_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first) +__global__ void Cuda_CommCuda_PackCommVel_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; @@ -96,9 +96,9 @@ __global__ void Cuda_CommCuda_UnpackComm_Kernel(int n, int first, void* buffer) int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(i < n) { - _x[i + first] = ((X_FLOAT*) buffer)[i]; - _x[i + first + _nmax] = ((X_FLOAT*) buffer)[i + 1 * n]; - _x[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + 2 * n]; + _x[i + first] = ((X_CFLOAT*) buffer)[i]; + _x[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + 1 * n]; + _x[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + 2 * n]; } } @@ -108,12 +108,12 @@ __global__ void Cuda_CommCuda_UnpackCommVel_Kernel(int n, int first, void* buffe int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(i < n) { - _x[i + first] = ((X_FLOAT*) buffer)[i]; - _x[i + first + _nmax] = ((X_FLOAT*) buffer)[i + 1 * n]; - _x[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + 2 * n]; - _v[i + first] = ((X_FLOAT*) buffer)[i + 3 * n]; - _v[i + first + _nmax] = ((X_FLOAT*) buffer)[i + 4 * n]; - _v[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + 5 * n]; + _x[i + first] = ((X_CFLOAT*) buffer)[i]; + _x[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + 1 * n]; + _x[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + 2 * n]; + _v[i + first] = ((X_CFLOAT*) buffer)[i + 3 * n]; + _v[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + 4 * n]; + _v[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + 5 * n]; } } @@ -122,9 +122,9 @@ __global__ void Cuda_CommCuda_PackReverse_Kernel(int n, int first) int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(i < n) { - ((F_FLOAT*) _buffer)[i] = _f[i + first]; - ((F_FLOAT*) _buffer)[i + n] = _f[i + first + _nmax]; - ((F_FLOAT*) _buffer)[i + 2 * n] = _f[i + first + 2 * _nmax]; + ((F_CFLOAT*) _buffer)[i] = _f[i + first]; + ((F_CFLOAT*) _buffer)[i + n] = _f[i + first + _nmax]; + ((F_CFLOAT*) _buffer)[i + 2 * n] = _f[i + first + 2 * _nmax]; } } @@ -136,9 +136,9 @@ __global__ void Cuda_CommCuda_UnpackReverse_Kernel(int* sendlist, int n, int max if(i < n) { int j = list[i]; - _f[j] += ((F_FLOAT*)_buffer)[i]; - _f[j + _nmax] += ((F_FLOAT*) _buffer)[i + n]; - _f[j + 2 * _nmax] += ((F_FLOAT*) _buffer)[i + 2 * n]; + _f[j] += ((F_CFLOAT*)_buffer)[i]; + _f[j + _nmax] += ((F_CFLOAT*) _buffer)[i + n]; + _f[j + 2 * _nmax] += ((F_CFLOAT*) _buffer)[i + 2 * n]; } } @@ -161,11 +161,11 @@ __global__ void 
Cuda_CommCuda_UnpackReverse_Self_Kernel(int* sendlist, int n, in extern __shared__ int shared[]; __global__ void Cuda_CommCuda_BuildSendlist_Single(int bordergroup, int ineed, int atom_nfirst, - int nfirst, int nlast, int dim, int iswap, X_FLOAT* slablo, X_FLOAT* slabhi, int* sendlist, int maxlistlength) + int nfirst, int nlast, int dim, int iswap, X_CFLOAT* slablo, X_CFLOAT* slabhi, int* sendlist, int maxlistlength) { int* list = sendlist + iswap * maxlistlength; - X_FLOAT lo = slablo[iswap]; - X_FLOAT hi = slabhi[iswap]; + X_CFLOAT lo = slablo[iswap]; + X_CFLOAT hi = slabhi[iswap]; bool add = false; if(!bordergroup || ineed >= 2) { @@ -273,11 +273,11 @@ __global__ void Cuda_CommCuda_BuildSendlist_Single(int bordergroup, int ineed, i __global__ void Cuda_CommCuda_BuildSendlist_Multi(int bordergroup, int ineed, int atom_nfirst - , int nfirst, int nlast, int dim, int iswap, X_FLOAT* multilo, X_FLOAT* multihi, int* sendlist, int maxlistlength) + , int nfirst, int nlast, int dim, int iswap, X_CFLOAT* multilo, X_CFLOAT* multihi, int* sendlist, int maxlistlength) { int* list = sendlist + iswap * maxlistlength; - X_FLOAT* mlo = &multilo[iswap * _cuda_ntypes]; - X_FLOAT* mhi = &multihi[iswap * _cuda_ntypes]; + X_CFLOAT* mlo = &multilo[iswap * _cuda_ntypes]; + X_CFLOAT* mhi = &multihi[iswap * _cuda_ntypes]; int itype = 0; bool add = false; diff --git a/lib/cuda/compute_temp_cuda.cu b/lib/cuda/compute_temp_cuda.cu index ece4cf93a9..c7f7696ec0 100644 --- a/lib/cuda/compute_temp_cuda.cu +++ b/lib/cuda/compute_temp_cuda.cu @@ -33,7 +33,7 @@ void Cuda_ComputeTempCuda_UpdateBuffer(cuda_shared_data* sdata) { - int size = (unsigned)((sdata->atom.nlocal + 63) / 64.0) * 6 * sizeof(ENERGY_FLOAT); + int size = (unsigned)((sdata->atom.nlocal + 63) / 64.0) * 6 * sizeof(ENERGY_CFLOAT); if(sdata->buffersize < size) { MYDBG(printf("Cuda_ComputeTempCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) @@ -50,15 +50,15 @@ void Cuda_ComputeTempCuda_UpdateBuffer(cuda_shared_data* sdata) void Cuda_ComputeTempCuda_UpdateNmax(cuda_shared_data* sdata) { cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_CFLOAT*)); if(sdata->atom.rmass_flag) - cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int)); cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); } @@ -68,7 +68,7 @@ void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata) } -void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t) +void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t) { //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary Cuda_ComputeTempCuda_UpdateNmax(sdata); @@ -82,7 +82,7 @@ void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_F dim3 grid(layout.x, layout.y, 1); 
if(sdata->atom.nlocal > 0) { - Cuda_ComputeTempCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_FLOAT)>>> (groupbit); + Cuda_ComputeTempCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_CFLOAT)>>> (groupbit); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: compute_vector Kernel execution failed"); @@ -90,13 +90,13 @@ void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_F grid.x = 6; grid.y = 1; threads.x = 512; - Cuda_ComputeTempCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (oldgrid, t); + Cuda_ComputeTempCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: reduce_vector Kernel execution failed"); } } -void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t) +void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t) { //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary Cuda_ComputeTempCuda_UpdateNmax(sdata); @@ -111,7 +111,7 @@ void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_F if(sdata->atom.nlocal > 0) { CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: pre compute_scalar Kernel"); - Cuda_ComputeTempCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (groupbit); + Cuda_ComputeTempCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (groupbit); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: compute_scalar Kernel execution failed"); @@ -119,7 +119,7 @@ void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_F grid.x = 1; grid.y = 1; threads.x = 512; - Cuda_ComputeTempCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (oldgrid, t); + Cuda_ComputeTempCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: reduce_scalar Kernel execution failed"); } diff --git a/lib/cuda/compute_temp_cuda_cu.h b/lib/cuda/compute_temp_cuda_cu.h index 9ab43d727a..44ae387bd2 100644 --- a/lib/cuda/compute_temp_cuda_cu.h +++ b/lib/cuda/compute_temp_cuda_cu.h @@ -24,5 +24,5 @@ #include "cuda_shared.h" extern "C" void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata); -extern "C" void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t); -extern "C" void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t); +extern "C" void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t); +extern "C" void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t); diff --git a/lib/cuda/compute_temp_cuda_kernel.cu b/lib/cuda/compute_temp_cuda_kernel.cu index 79562a0e28..a36ac28d9d 100644 --- a/lib/cuda/compute_temp_cuda_kernel.cu +++ b/lib/cuda/compute_temp_cuda_kernel.cu @@ -21,7 +21,7 @@ This software is distributed under the GNU General Public License. 
------------------------------------------------------------------------- */ -extern __shared__ ENERGY_FLOAT sharedmem[]; +extern __shared__ ENERGY_CFLOAT sharedmem[]; __global__ void Cuda_ComputeTempCuda_Scalar_Kernel(int groupbit) @@ -40,7 +40,7 @@ __global__ void Cuda_ComputeTempCuda_Scalar_Kernel(int groupbit) } reduceBlock(sharedmem); - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer; if(threadIdx.x == 0) { buffer[(blockIdx.x * gridDim.y + blockIdx.y)] = sharedmem[0]; @@ -59,7 +59,7 @@ __global__ void Cuda_ComputeTempCuda_Vector_Kernel(int groupbit) if(i < _nlocal) if(_mask[i] & groupbit) { - V_FLOAT massone; + V_CFLOAT massone; if(_rmass_flag) massone = _rmass[i]; else massone = _mass[_type[i]]; @@ -78,7 +78,7 @@ __global__ void Cuda_ComputeTempCuda_Vector_Kernel(int groupbit) reduceBlock(&sharedmem[3 * blockDim.x]); reduceBlock(&sharedmem[4 * blockDim.x]); reduceBlock(&sharedmem[5 * blockDim.x]); - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer; if(threadIdx.x == 0) { buffer[(blockIdx.x * gridDim.y + blockIdx.y)] = sharedmem[0]; @@ -91,12 +91,12 @@ __global__ void Cuda_ComputeTempCuda_Vector_Kernel(int groupbit) } -__global__ void Cuda_ComputeTempCuda_Reduce_Kernel(int n, ENERGY_FLOAT* t) +__global__ void Cuda_ComputeTempCuda_Reduce_Kernel(int n, ENERGY_CFLOAT* t) { int i = 0; sharedmem[threadIdx.x] = 0; - ENERGY_FLOAT myforig = 0.0; - ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer; + ENERGY_CFLOAT myforig = 0.0; + ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer; buf = &buf[blockIdx.x * n]; while(i < n) { diff --git a/lib/cuda/compute_temp_partial_cuda.cu b/lib/cuda/compute_temp_partial_cuda.cu index bc78592640..df327e41ce 100644 --- a/lib/cuda/compute_temp_partial_cuda.cu +++ b/lib/cuda/compute_temp_partial_cuda.cu @@ -33,7 +33,7 @@ void Cuda_ComputeTempPartialCuda_UpdateBuffer(cuda_shared_data* sdata) { - int size = (unsigned)((sdata->atom.nlocal + 63) / 64.0) * 6 * sizeof(ENERGY_FLOAT); + int size = (unsigned)((sdata->atom.nlocal + 63) / 64.0) * 6 * sizeof(ENERGY_CFLOAT); if(sdata->buffersize < size) { MYDBG(printf("Cuda_ComputeTempPartialCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) @@ -50,15 +50,15 @@ void Cuda_ComputeTempPartialCuda_UpdateBuffer(cuda_shared_data* sdata) void Cuda_ComputeTempPartialCuda_UpdateNmax(cuda_shared_data* sdata) { cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_CFLOAT*)); if(sdata->atom.rmass_flag) - cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int)); cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); } @@ -68,7 +68,7 @@ void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata) } -void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag) +void 
Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag) { //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary Cuda_ComputeTempPartialCuda_UpdateNmax(sdata); @@ -82,20 +82,20 @@ void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, E dim3 grid(layout.x, layout.y, 1); if(sdata->atom.nlocal > 0) { - Cuda_ComputeTempPartialCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_FLOAT)>>> (groupbit, xflag, yflag, zflag); + Cuda_ComputeTempPartialCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_CFLOAT)>>> (groupbit, xflag, yflag, zflag); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: compute_vector Kernel execution failed"); int oldgrid = grid.x * grid.y; grid.x = 6; threads.x = 512; - Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (oldgrid, t); + Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: reduce_vector Kernel execution failed"); } } -void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag) +void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag) { //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary Cuda_ComputeTempPartialCuda_UpdateNmax(sdata); @@ -110,14 +110,14 @@ void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, E if(sdata->atom.nlocal > 0) { CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: pre compute_scalar Kernel"); - Cuda_ComputeTempPartialCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (groupbit, xflag, yflag, zflag); + Cuda_ComputeTempPartialCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (groupbit, xflag, yflag, zflag); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: compute_scalar Kernel execution failed"); int oldgrid = grid.x * grid.y; grid.x = 1; threads.x = 512; - Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (oldgrid, t); + Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: reduce_scalar Kernel execution failed"); } @@ -137,7 +137,7 @@ void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int grou dim3 grid(layout.x, layout.y, 1); if(sdata->atom.nlocal > 0) { - Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_FLOAT*) vbiasall); + Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_CFLOAT*) vbiasall); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed"); } @@ -157,7 +157,7 @@ void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int gro dim3 grid(layout.x, layout.y, 1); if(sdata->atom.nlocal > 0) { - Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_FLOAT*) vbiasall); + 
Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_CFLOAT*) vbiasall); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed"); } diff --git a/lib/cuda/compute_temp_partial_cuda_cu.h b/lib/cuda/compute_temp_partial_cuda_cu.h index 00fc8a7c36..5000156232 100644 --- a/lib/cuda/compute_temp_partial_cuda_cu.h +++ b/lib/cuda/compute_temp_partial_cuda_cu.h @@ -24,7 +24,7 @@ #include "cuda_shared.h" extern "C" void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata); -extern "C" void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag); -extern "C" void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag); +extern "C" void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag); +extern "C" void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag); extern "C" void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall); extern "C" void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall); diff --git a/lib/cuda/compute_temp_partial_cuda_kernel.cu b/lib/cuda/compute_temp_partial_cuda_kernel.cu index ec0fff334f..05138dac18 100644 --- a/lib/cuda/compute_temp_partial_cuda_kernel.cu +++ b/lib/cuda/compute_temp_partial_cuda_kernel.cu @@ -21,7 +21,7 @@ This software is distributed under the GNU General Public License. 
------------------------------------------------------------------------- */ -extern __shared__ ENERGY_FLOAT sharedmem[]; +extern __shared__ ENERGY_CFLOAT sharedmem[]; __global__ void Cuda_ComputeTempPartialCuda_Scalar_Kernel(int groupbit, int xflag, int yflag, int zflag) @@ -40,7 +40,7 @@ __global__ void Cuda_ComputeTempPartialCuda_Scalar_Kernel(int groupbit, int xfla } reduceBlock(sharedmem); - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer; if(threadIdx.x == 0) { buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0]; @@ -59,7 +59,7 @@ __global__ void Cuda_ComputeTempPartialCuda_Vector_Kernel(int groupbit, int xfla if(i < _nlocal) if(_mask[i] & groupbit) { - V_FLOAT massone; + V_CFLOAT massone; if(_rmass_flag) massone = _rmass[i]; else massone = _mass[_type[i]]; @@ -78,7 +78,7 @@ __global__ void Cuda_ComputeTempPartialCuda_Vector_Kernel(int groupbit, int xfla reduceBlock(&sharedmem[3 * blockDim.x]); reduceBlock(&sharedmem[4 * blockDim.x]); reduceBlock(&sharedmem[5 * blockDim.x]); - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer; if(threadIdx.x == 0) { buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0]; @@ -91,12 +91,12 @@ __global__ void Cuda_ComputeTempPartialCuda_Vector_Kernel(int groupbit, int xfla } -__global__ void Cuda_ComputeTempPartialCuda_Reduce_Kernel(int n, ENERGY_FLOAT* t) +__global__ void Cuda_ComputeTempPartialCuda_Reduce_Kernel(int n, ENERGY_CFLOAT* t) { int i = 0; sharedmem[threadIdx.x] = 0; - ENERGY_FLOAT myforig = 0.0; - ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer; + ENERGY_CFLOAT myforig = 0.0; + ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer; buf = &buf[blockIdx.x * n]; while(i < n) { @@ -117,7 +117,7 @@ __global__ void Cuda_ComputeTempPartialCuda_Reduce_Kernel(int n, ENERGY_FLOAT* t t[blockIdx.x] = myforig; } -__global__ void Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel(int groupbit, int xflag, int yflag, int zflag, V_FLOAT* vbiasall) +__global__ void Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel(int groupbit, int xflag, int yflag, int zflag, V_CFLOAT* vbiasall) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; @@ -140,7 +140,7 @@ __global__ void Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel(int groupbit, i } } -__global__ void Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel(int groupbit, int xflag, int yflag, int zflag, V_FLOAT* vbiasall) +__global__ void Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel(int groupbit, int xflag, int yflag, int zflag, V_CFLOAT* vbiasall) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; diff --git a/lib/cuda/crm_cuda_utils.cu b/lib/cuda/crm_cuda_utils.cu index 6337d0d015..1d52765ab8 100644 --- a/lib/cuda/crm_cuda_utils.cu +++ b/lib/cuda/crm_cuda_utils.cu @@ -640,11 +640,11 @@ static __device__ inline double tex1Dfetch_double(texture t, int i) return __hiloint2double(v.y, v.x); } -static __device__ inline X_FLOAT4 tex1Dfetch_double(texture t, int i) +static __device__ inline X_CFLOAT4 tex1Dfetch_double(texture t, int i) { int4 v = tex1Dfetch(t, 2 * i); int4 u = tex1Dfetch(t, 2 * i + 1); - X_FLOAT4 w; + X_CFLOAT4 w; w.x = __hiloint2double(v.y, v.x); w.y = __hiloint2double(v.w, v.z); @@ -664,7 +664,7 @@ inline void BindXTypeTexture(cuda_shared_data* sdata) #if X_PRECISION == 1 cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * 
sizeof(X_FLOAT4)); + cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(X_CFLOAT4)); #else cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int4)); @@ -672,7 +672,7 @@ inline void BindXTypeTexture(cuda_shared_data* sdata) #endif } -static __device__ inline X_FLOAT4 fetchXType(int i) +static __device__ inline X_CFLOAT4 fetchXType(int i) { #ifdef CUDA_USE_TEXTURE #if X_PRECISION == 1 @@ -692,11 +692,11 @@ static __device__ inline double tex1Dfetch_double_v(texture t, int i) return __hiloint2double(v.y, v.x); } -static __device__ inline V_FLOAT4 tex1Dfetch_double_v(texture t, int i) +static __device__ inline V_CFLOAT4 tex1Dfetch_double_v(texture t, int i) { int4 v = tex1Dfetch(t, 2 * i); int4 u = tex1Dfetch(t, 2 * i + 1); - V_FLOAT4 w; + V_CFLOAT4 w; w.x = __hiloint2double(v.y, v.x); w.y = __hiloint2double(v.w, v.z); @@ -716,7 +716,7 @@ inline void BindVRadiusTexture(cuda_shared_data* sdata) #if V_PRECISION == 1 cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc(); - cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * sizeof(X_FLOAT4)); + cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * sizeof(X_CFLOAT4)); #else cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc(); cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * 2 * sizeof(int4)); @@ -724,7 +724,7 @@ inline void BindVRadiusTexture(cuda_shared_data* sdata) #endif } -static __device__ inline V_FLOAT4 fetchVRadius(int i) +static __device__ inline V_CFLOAT4 fetchVRadius(int i) { #ifdef CUDA_USE_TEXTURE #if V_PRECISION == 1 @@ -747,7 +747,7 @@ inline void BindOmegaRmassTexture(cuda_shared_data* sdata) #if V_PRECISION == 1 cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc(); - cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * sizeof(X_FLOAT4)); + cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * sizeof(X_CFLOAT4)); #else cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc(); cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * 2 * sizeof(int4)); @@ -755,7 +755,7 @@ inline void BindOmegaRmassTexture(cuda_shared_data* sdata) #endif } -static __device__ inline V_FLOAT4 fetchOmegaRmass(int i) +static __device__ inline V_CFLOAT4 fetchOmegaRmass(int i) { #ifdef CUDA_USE_TEXTURE #if V_PRECISION == 1 @@ -775,11 +775,11 @@ static __device__ inline double tex1Dfetch_double_f(texture t, int i) return __hiloint2double(v.y, v.x); } -static __device__ inline F_FLOAT4 tex1Dfetch_double_f(texture t, int i) +static __device__ inline F_CFLOAT4 tex1Dfetch_double_f(texture t, int i) { int4 v = tex1Dfetch(t, 2 * i); int4 u = tex1Dfetch(t, 2 * i + 1); - F_FLOAT4 w; + F_CFLOAT4 w; w.x = __hiloint2double(v.y, v.x); w.y = __hiloint2double(v.w, v.z); @@ -799,7 +799,7 @@ inline void BindQTexture(cuda_shared_data* sdata) #if F_PRECISION == 1 cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc(); - cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(F_FLOAT)); + cudaBindTexture(0, 
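[Context for the crm_cuda_utils.cu hunks: every Bind*Texture/fetch* pair dispatches on the compile-time precision. With *_PRECISION == 1 the array is bound directly as float data, while in double precision it is bound as int2/int4 texels and each double is spliced back together from its two 32-bit words, since this generation of texture hardware cannot fetch 64-bit elements. A condensed sketch of the double path, with a hypothetical texture name and the era-appropriate texture-reference API:]

// One int4 texel carries two doubles; __hiloint2double(hi, lo) splices each
// double from its high and low 32-bit words, as tex1Dfetch_double_* do above.
texture<int4, 1, cudaReadModeElementType> d2_tex_sketch;

static __device__ inline double2 fetch_double2_sketch(int i)
{
    int4 v = tex1Dfetch(d2_tex_sketch, i);
    double2 w;
    w.x = __hiloint2double(v.y, v.x);
    w.y = __hiloint2double(v.w, v.z);
    return w;
}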
q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(F_CFLOAT)); #else cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc(); cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(int2)); @@ -807,7 +807,7 @@ inline void BindQTexture(cuda_shared_data* sdata) #endif } -static __device__ inline F_FLOAT fetchQ(int i) +static __device__ inline F_CFLOAT fetchQ(int i) { #ifdef CUDA_USE_TEXTURE #if F_PRECISION == 1 @@ -835,7 +835,7 @@ inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata,coeff_tex) #if F_PRECISION == 1 cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(X_FLOAT4)); + cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(X_CFLOAT4)); #else cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int4)); @@ -843,7 +843,7 @@ inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata,coeff_tex) #endif } -static __device__ inline X_FLOAT4 fetchXType(int i) +static __device__ inline X_CFLOAT4 fetchXType(int i) { #ifdef CUDA_USE_TEXTURE #if X_PRECISION == 1 @@ -863,7 +863,7 @@ static inline __device__ int sbmask(int j) return j >> SBBITS & 3; } -static inline __device__ void minimum_image(X_FLOAT4 &delta) +static inline __device__ void minimum_image(X_CFLOAT4 &delta) { if(_triclinic == 0) { if(_periodicity[0]) { @@ -907,7 +907,7 @@ static inline __device__ void minimum_image(X_FLOAT4 &delta) } } -static inline __device__ void closest_image(X_FLOAT4 &x1, X_FLOAT4 &x2, X_FLOAT4 &ci) +static inline __device__ void closest_image(X_CFLOAT4 &x1, X_CFLOAT4 &x2, X_CFLOAT4 &ci) { ci.x = x2.x - x1.x; ci.y = x2.y - x1.y; diff --git a/lib/cuda/cuda.cu b/lib/cuda/cuda.cu index b0c7a91776..d0dbf39bbd 100644 --- a/lib/cuda/cuda.cu +++ b/lib/cuda/cuda.cu @@ -4,12 +4,12 @@ void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata) { - sdata->compile_settings.prec_glob = sizeof(CUDA_FLOAT) / 4; - sdata->compile_settings.prec_x = sizeof(X_FLOAT) / 4; - sdata->compile_settings.prec_v = sizeof(V_FLOAT) / 4; - sdata->compile_settings.prec_f = sizeof(F_FLOAT) / 4; - sdata->compile_settings.prec_pppm = sizeof(PPPM_FLOAT) / 4; - sdata->compile_settings.prec_fft = sizeof(FFT_FLOAT) / 4; + sdata->compile_settings.prec_glob = sizeof(CUDA_CFLOAT) / 4; + sdata->compile_settings.prec_x = sizeof(X_CFLOAT) / 4; + sdata->compile_settings.prec_v = sizeof(V_CFLOAT) / 4; + sdata->compile_settings.prec_f = sizeof(F_CFLOAT) / 4; + sdata->compile_settings.prec_pppm = sizeof(PPPM_CFLOAT) / 4; + sdata->compile_settings.prec_fft = sizeof(FFT_CFLOAT) / 4; #ifdef FFT_CUFFT sdata->compile_settings.cufft = 1; diff --git a/lib/cuda/cuda_common.h b/lib/cuda/cuda_common.h index a6806bcfd8..4f31de3cc2 100644 --- a/lib/cuda/cuda_common.h +++ b/lib/cuda/cuda_common.h @@ -60,7 +60,7 @@ //#define &MY_AP(var) (MY_VAR_TO_STR2(MY_PREFIX) "_" MY_VAR_TO_STR2(var)) //#define &MY_AP(var) &(MY_AP(var)) #define CUDA_USE_TEXTURE -#define CUDA_USE_FLOAT4 +#define CUDA_USE_CFLOAT4 //constants used by many classes @@ -79,20 +79,20 @@ #define _h MY_AP(h) #define _h_inv MY_AP(h_inv) #define _h_rate MY_AP(h_rate) -__device__ __constant__ X_FLOAT _boxhi[3]; -__device__ __constant__ X_FLOAT _boxlo[3]; -__device__ __constant__ X_FLOAT _subhi[3]; -__device__ __constant__ 
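[Context for the cuda.cu hunk: Cuda_Cuda_GetCompileSettings() records each configured type's width as sizeof(T) / 4, i.e. 1 for single and 2 for double precision. The host side compares these values against its own build so that a precision mismatch between the CUDA library and the main LAMMPS executable is detected at startup instead of silently corrupting results. A sketch of that comparison, with a hypothetical struct mirroring the fields set above:]

struct compile_settings_sketch {
    int prec_glob, prec_x, prec_v, prec_f, prec_pppm, prec_fft;
};

// Returns true only if every precision the device library was compiled with
// (1 = 4-byte float, 2 = 8-byte double) matches the host build.
static bool precisions_match_sketch(const compile_settings_sketch& dev,
                                    const compile_settings_sketch& host)
{
    return dev.prec_glob == host.prec_glob && dev.prec_x   == host.prec_x   &&
           dev.prec_v    == host.prec_v    && dev.prec_f   == host.prec_f   &&
           dev.prec_pppm == host.prec_pppm && dev.prec_fft == host.prec_fft;
}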
X_FLOAT _sublo[3]; -__device__ __constant__ X_FLOAT _box_size[3]; -__device__ __constant__ X_FLOAT _prd[3]; +__device__ __constant__ X_CFLOAT _boxhi[3]; +__device__ __constant__ X_CFLOAT _boxlo[3]; +__device__ __constant__ X_CFLOAT _subhi[3]; +__device__ __constant__ X_CFLOAT _sublo[3]; +__device__ __constant__ X_CFLOAT _box_size[3]; +__device__ __constant__ X_CFLOAT _prd[3]; __device__ __constant__ int _periodicity[3]; __device__ __constant__ int _triclinic; -__device__ __constant__ X_FLOAT _boxhi_lamda[3]; -__device__ __constant__ X_FLOAT _boxlo_lamda[3]; -__device__ __constant__ X_FLOAT _prd_lamda[3]; -__device__ __constant__ X_FLOAT _h[6]; -__device__ __constant__ X_FLOAT _h_inv[6]; -__device__ __constant__ V_FLOAT _h_rate[6]; +__device__ __constant__ X_CFLOAT _boxhi_lamda[3]; +__device__ __constant__ X_CFLOAT _boxlo_lamda[3]; +__device__ __constant__ X_CFLOAT _prd_lamda[3]; +__device__ __constant__ X_CFLOAT _h[6]; +__device__ __constant__ X_CFLOAT _h_inv[6]; +__device__ __constant__ V_CFLOAT _h_rate[6]; //atom properties @@ -123,31 +123,31 @@ __device__ __constant__ V_FLOAT _h_rate[6]; #define _omega_rmass MY_AP(omega_rmass) #define _freeze_group_bit MY_AP(freeze_group_bit) #define _map_array MY_AP(map_array) -__device__ __constant__ X_FLOAT* _x; //holds pointer to positions -__device__ __constant__ V_FLOAT* _v; -__device__ __constant__ F_FLOAT* _f; +__device__ __constant__ X_CFLOAT* _x; //holds pointer to positions +__device__ __constant__ V_CFLOAT* _v; +__device__ __constant__ F_CFLOAT* _f; __device__ __constant__ int* _tag; __device__ __constant__ int* _type; __device__ __constant__ int* _mask; __device__ __constant__ int* _image; -__device__ __constant__ V_FLOAT* _mass; -__device__ __constant__ F_FLOAT* _q; -__device__ __constant__ V_FLOAT* _rmass; +__device__ __constant__ V_CFLOAT* _mass; +__device__ __constant__ F_CFLOAT* _q; +__device__ __constant__ V_CFLOAT* _rmass; __device__ __constant__ int _rmass_flag; -__device__ __constant__ ENERGY_FLOAT* _eatom; -__device__ __constant__ ENERGY_FLOAT* _vatom; -__device__ __constant__ X_FLOAT4* _x_type; //holds pointer to positions -__device__ __constant__ X_FLOAT* _radius; -__device__ __constant__ F_FLOAT* _density; -__device__ __constant__ V_FLOAT* _omega; -__device__ __constant__ F_FLOAT* _torque; +__device__ __constant__ ENERGY_CFLOAT* _eatom; +__device__ __constant__ ENERGY_CFLOAT* _vatom; +__device__ __constant__ X_CFLOAT4* _x_type; //holds pointer to positions +__device__ __constant__ X_CFLOAT* _radius; +__device__ __constant__ F_CFLOAT* _density; +__device__ __constant__ V_CFLOAT* _omega; +__device__ __constant__ F_CFLOAT* _torque; __device__ __constant__ int* _special; __device__ __constant__ int _maxspecial; __device__ __constant__ int* _nspecial; __device__ __constant__ int _special_flag[4]; __device__ __constant__ int* _molecule; -__device__ __constant__ V_FLOAT4* _v_radius; //holds pointer to positions -__device__ __constant__ V_FLOAT4* _omega_rmass; //holds pointer to positions +__device__ __constant__ V_CFLOAT4* _v_radius; //holds pointer to positions +__device__ __constant__ V_CFLOAT4* _omega_rmass; //holds pointer to positions __device__ __constant__ int _freeze_group_bit; __device__ __constant__ int* _map_array; @@ -226,8 +226,8 @@ __device__ __constant__ int* _neighbors; __device__ __constant__ int* _neighbors_border; __device__ __constant__ int* _neighbors_inner; __device__ __constant__ int* _reneigh_flag; -__device__ __constant__ X_FLOAT _triggerneighsq; -__device__ __constant__ X_FLOAT* _xhold; //holds pointer to 
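[Context for the cuda_common.h hunks: all of these declarations live in constant memory, and the MY_AP() macro prefixes each symbol with the active style name so that many separately compiled .cu units can hold their own copies without link collisions. For array-valued data only the device pointer sits in constant space; the host refreshes it with cudaMemcpyToSymbol whenever a buffer is reallocated (the Cuda_*_UpdateNmax functions). A minimal sketch of the pattern, with a hypothetical prefix and names:]

#include <cuda_runtime.h>

#define MY_AP2(pre, var) pre##_##var
#define MY_AP1(pre, var) MY_AP2(pre, var)
#define MY_AP(var) MY_AP1(demo, var)             // expands to demo_x, demo_boxhi, ...

__device__ __constant__ double* MY_AP(x);        // only the pointer is constant
__device__ __constant__ double  MY_AP(boxhi)[3];

void update_symbols_sketch(double* dev_x, const double boxhi[3])
{
    cudaMemcpyToSymbol(MY_AP(x), &dev_x, sizeof(double*));       // 8-byte pointer value
    cudaMemcpyToSymbol(MY_AP(boxhi), boxhi, 3 * sizeof(double)); // 24-byte array
}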
positions +__device__ __constant__ X_CFLOAT _triggerneighsq; +__device__ __constant__ X_CFLOAT* _xhold; //holds pointer to positions __device__ __constant__ int _maxhold; __device__ __constant__ int _dist_check; __device__ __constant__ int _neighbor_maxlocal; @@ -253,12 +253,12 @@ __device__ __constant__ unsigned _nghost; __device__ __constant__ unsigned _nlocal; __device__ __constant__ unsigned _nmax; __device__ __constant__ unsigned _cuda_ntypes; -__device__ __constant__ V_FLOAT _dtf; -__device__ __constant__ X_FLOAT _dtv; -__device__ __constant__ V_FLOAT _factor; -__device__ __constant__ ENERGY_FLOAT* _virial; -__device__ __constant__ ENERGY_FLOAT* _eng_vdwl; -__device__ __constant__ ENERGY_FLOAT* _eng_coul; +__device__ __constant__ V_CFLOAT _dtf; +__device__ __constant__ X_CFLOAT _dtv; +__device__ __constant__ V_CFLOAT _factor; +__device__ __constant__ ENERGY_CFLOAT* _virial; +__device__ __constant__ ENERGY_CFLOAT* _eng_vdwl; +__device__ __constant__ ENERGY_CFLOAT* _eng_coul; __device__ __constant__ int _molecular; //other general constants diff --git a/lib/cuda/cuda_pair.cu b/lib/cuda/cuda_pair.cu index 28ab269e48..74e5ef0711 100644 --- a/lib/cuda/cuda_pair.cu +++ b/lib/cuda/cuda_pair.cu @@ -55,30 +55,30 @@ enum COUL_FORCES {COUL_NONE, COUL_CHARMM, COUL_CHARMM_IMPLICIT, COUL_CUT, COUL_L #define _cutsq_global MY_AP(cutsq_global) #define _collect_forces_later MY_AP(collect_forces_later) -__device__ __constant__ X_FLOAT _cutsq[CUDA_MAX_TYPES2]; -__device__ __constant__ ENERGY_FLOAT _offset[CUDA_MAX_TYPES2]; -__device__ __constant__ F_FLOAT _special_lj[4]; -__device__ __constant__ F_FLOAT _special_coul[4]; -__device__ __constant__ X_FLOAT _cutsq_global; +__device__ __constant__ X_CFLOAT _cutsq[CUDA_MAX_TYPES2]; +__device__ __constant__ ENERGY_CFLOAT _offset[CUDA_MAX_TYPES2]; +__device__ __constant__ F_CFLOAT _special_lj[4]; +__device__ __constant__ F_CFLOAT _special_coul[4]; +__device__ __constant__ X_CFLOAT _cutsq_global; __device__ __constant__ int _collect_forces_later; -__device__ __constant__ F_FLOAT MY_AP(coeff1)[CUDA_MAX_TYPES2]; //pair force coefficients in case ntypes < CUDA_MAX_TYPES (coeffs fit into constant space) -__device__ __constant__ F_FLOAT MY_AP(coeff2)[CUDA_MAX_TYPES2]; -__device__ __constant__ F_FLOAT MY_AP(coeff3)[CUDA_MAX_TYPES2]; -__device__ __constant__ F_FLOAT MY_AP(coeff4)[CUDA_MAX_TYPES2]; -__device__ __constant__ F_FLOAT MY_AP(coeff5)[CUDA_MAX_TYPES2]; +__device__ __constant__ F_CFLOAT MY_AP(coeff1)[CUDA_MAX_TYPES2]; //pair force coefficients in case ntypes < CUDA_MAX_TYPES (coeffs fit into constant space) +__device__ __constant__ F_CFLOAT MY_AP(coeff2)[CUDA_MAX_TYPES2]; +__device__ __constant__ F_CFLOAT MY_AP(coeff3)[CUDA_MAX_TYPES2]; +__device__ __constant__ F_CFLOAT MY_AP(coeff4)[CUDA_MAX_TYPES2]; +__device__ __constant__ F_CFLOAT MY_AP(coeff5)[CUDA_MAX_TYPES2]; -__device__ __constant__ F_FLOAT* MY_AP(coeff1_gm); //pair force coefficients in case ntypes > CUDA_MAX_TYPES (coeffs do not fit into constant space) -__device__ __constant__ F_FLOAT* MY_AP(coeff2_gm); -__device__ __constant__ F_FLOAT* MY_AP(coeff3_gm); -__device__ __constant__ F_FLOAT* MY_AP(coeff4_gm); -__device__ __constant__ F_FLOAT* MY_AP(coeff5_gm); -__device__ __constant__ F_FLOAT* MY_AP(coeff6_gm); -__device__ __constant__ F_FLOAT* MY_AP(coeff7_gm); -__device__ __constant__ F_FLOAT* MY_AP(coeff8_gm); -__device__ __constant__ F_FLOAT* MY_AP(coeff9_gm); -__device__ __constant__ F_FLOAT* MY_AP(coeff10_gm); +__device__ __constant__ F_CFLOAT* MY_AP(coeff1_gm); //pair force coefficients in case 
ntypes > CUDA_MAX_TYPES (coeffs do not fit into constant space) +__device__ __constant__ F_CFLOAT* MY_AP(coeff2_gm); +__device__ __constant__ F_CFLOAT* MY_AP(coeff3_gm); +__device__ __constant__ F_CFLOAT* MY_AP(coeff4_gm); +__device__ __constant__ F_CFLOAT* MY_AP(coeff5_gm); +__device__ __constant__ F_CFLOAT* MY_AP(coeff6_gm); +__device__ __constant__ F_CFLOAT* MY_AP(coeff7_gm); +__device__ __constant__ F_CFLOAT* MY_AP(coeff8_gm); +__device__ __constant__ F_CFLOAT* MY_AP(coeff9_gm); +__device__ __constant__ F_CFLOAT* MY_AP(coeff10_gm); #define _coeff1_gm_tex MY_AP(coeff1_gm_tex) #if F_PRECISION == 1 @@ -159,17 +159,17 @@ texture _coeff10_gm_tex; #define _g_ewald MY_AP(g_ewald) #define _qqrd2e MY_AP(qqrd2e) #define _kappa MY_AP(kappa) -__device__ __constant__ X_FLOAT _cut_coulsq[CUDA_MAX_TYPES2]; -__device__ __constant__ X_FLOAT _cut_coulsq_global; -__device__ __constant__ F_FLOAT _g_ewald; -__device__ __constant__ F_FLOAT _qqrd2e; -__device__ __constant__ F_FLOAT _kappa; +__device__ __constant__ X_CFLOAT _cut_coulsq[CUDA_MAX_TYPES2]; +__device__ __constant__ X_CFLOAT _cut_coulsq_global; +__device__ __constant__ F_CFLOAT _g_ewald; +__device__ __constant__ F_CFLOAT _qqrd2e; +__device__ __constant__ F_CFLOAT _kappa; //inner cutoff #define _cut_innersq MY_AP(cut_innersq) #define _cut_innersq_global MY_AP(cut_innersq_global) -__device__ __constant__ X_FLOAT _cut_innersq[CUDA_MAX_TYPES2]; -__device__ __constant__ X_FLOAT _cut_innersq_global; +__device__ __constant__ X_CFLOAT _cut_innersq[CUDA_MAX_TYPES2]; +__device__ __constant__ X_CFLOAT _cut_innersq_global; template @@ -241,14 +241,14 @@ void Cuda_Pair_UpdateNmax_AllStyles(cuda_shared_data* sdata, cuda_shared_neighli cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); //Atom - cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*)); - cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_CFLOAT4*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_CFLOAT*)); //Other @@ -261,8 +261,8 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = { unsigned cuda_ntypes = sdata->atom.ntypes + 1; unsigned cuda_ntypes2 = cuda_ntypes * cuda_ntypes; - unsigned n = sizeof(F_FLOAT) * cuda_ntypes2; - unsigned nx = sizeof(X_FLOAT) * cuda_ntypes2; + unsigned n = sizeof(F_CFLOAT) * cuda_ntypes2; + unsigned nx = sizeof(X_CFLOAT) * cuda_ntypes2; //check if enough constant memory is available if((cuda_ntypes2 > CUDA_MAX_TYPES2) && !use_global_params) @@ -275,24 +275,24 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = //type 
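[Context for Cuda_Pair_Init_AllStyles: cuda_ntypes = ntypes + 1 because LAMMPS type indices are 1-based. Each per-type-pair table is staged on the host as a flat (ntypes+1)^2 array indexed i * cuda_ntypes + j, then copied either into the fixed-size __constant__ arrays (when cuda_ntypes2 <= CUDA_MAX_TYPES2) or into the *_gm global-memory buffers read through textures. A sketch of the staging step, matching the loops in this hunk:]

// Flatten a 1-based table coeff[i][j] into the layout the kernels index as
// coeff[itype * cuda_ntypes + jtype]; row and column 0 are unused padding.
static void flatten_coeff_sketch(double** coeff, float* flat, int ntypes)
{
    const int cuda_ntypes = ntypes + 1;
    for (int i = 1; i <= ntypes; ++i)
        for (int j = 1; j <= ntypes; ++j)
            flat[i * cuda_ntypes + j] = (float) coeff[i][j];
}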
conversion of cutoffs and parameters if(need_cut) { - X_FLOAT cutsq[cuda_ntypes2]; + X_CFLOAT cutsq[cuda_ntypes2]; for(int i = 1; i <= sdata->atom.ntypes; ++i) { for(int j = 1; j <= sdata->atom.ntypes; ++j) { - cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_global * sdata->pair.cut_global); + cutsq[i * cuda_ntypes + j] = (X_CFLOAT)(sdata->pair.cut_global * sdata->pair.cut_global); } } int cutsqdiffer = 0; - X_FLOAT cutsq_global; - cutsq_global = (X_FLOAT)(sdata->pair.cut_global * sdata->pair.cut_global); + X_CFLOAT cutsq_global; + cutsq_global = (X_CFLOAT)(sdata->pair.cut_global * sdata->pair.cut_global); if(sdata->pair.cut) { for(int i = 1; i <= sdata->atom.ntypes; ++i) { for(int j = i; j <= sdata->atom.ntypes; ++j) { if(sdata->pair.cut[i][j] > 1e-6) { - cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]); - cutsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]); + cutsq[i * cuda_ntypes + j] = (X_CFLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]); + cutsq[j * cuda_ntypes + i] = (X_CFLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]); } if(i == 1 && j == 1) cutsq_global = cutsq[i * cuda_ntypes + j]; @@ -307,8 +307,8 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = for(int i = 1; i <= sdata->atom.ntypes; ++i) { for(int j = i; j <= sdata->atom.ntypes; ++j) { if(sdata->pair.cut[i][j] > 1e-6) { - cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cutsq[i][j]); - cutsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cutsq[i][j]); + cutsq[i * cuda_ntypes + j] = (X_CFLOAT)(sdata->pair.cutsq[i][j]); + cutsq[j * cuda_ntypes + i] = (X_CFLOAT)(sdata->pair.cutsq[i][j]); } if(i == 1 && j == 1) cutsq_global = cutsq[i * cuda_ntypes + j]; @@ -326,28 +326,28 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = cudaMemcpyToSymbol(MY_AP(cutsq) , cutsq , nx); } - cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global , sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global , sizeof(X_CFLOAT)); } if(need_innercut) { - X_FLOAT cut_innersq[cuda_ntypes2]; + X_CFLOAT cut_innersq[cuda_ntypes2]; for(int i = 1; i <= sdata->atom.ntypes; ++i) { for(int j = 1; j <= sdata->atom.ntypes; ++j) { - cut_innersq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global); + cut_innersq[i * cuda_ntypes + j] = (X_CFLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global); } } int cutsqdiffer = 0; - X_FLOAT cut_innersq_global; - cut_innersq_global = (X_FLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global); + X_CFLOAT cut_innersq_global; + cut_innersq_global = (X_CFLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global); if(sdata->pair.cut_inner) { for(int i = 1; i <= sdata->atom.ntypes; ++i) { for(int j = i; j <= sdata->atom.ntypes; ++j) { if(sdata->pair.cut_inner[i][j] > 1e-6) { - cut_innersq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]); - cut_innersq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]); + cut_innersq[i * cuda_ntypes + j] = (X_CFLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]); + cut_innersq[j * cuda_ntypes + i] = (X_CFLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]); } if(i == 1 && j == 1) cut_innersq_global = cut_innersq[i * cuda_ntypes + j]; @@ -363,30 +363,30 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = 
cudaMemcpyToSymbol(MY_AP(cut_innersq) , cut_innersq , nx); } - cudaMemcpyToSymbol(MY_AP(cut_innersq_global) , &cut_innersq_global , sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(cut_innersq_global) , &cut_innersq_global , sizeof(X_CFLOAT)); } if(need_q) { - X_FLOAT cut_coulsq[cuda_ntypes2]; + X_CFLOAT cut_coulsq[cuda_ntypes2]; for(int i = 1; i <= sdata->atom.ntypes; ++i) { for(int j = 1; j <= sdata->atom.ntypes; ++j) { - cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global); + cut_coulsq[i * cuda_ntypes + j] = (X_CFLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global); } } int cutsqdiffer = 0; - X_FLOAT cut_coulsq_global; - cut_coulsq_global = (X_FLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global); + X_CFLOAT cut_coulsq_global; + cut_coulsq_global = (X_CFLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global); - if(sdata->pair.cut_coulsq_global > cut_coulsq_global) cut_coulsq_global = (X_FLOAT) sdata->pair.cut_coulsq_global; + if(sdata->pair.cut_coulsq_global > cut_coulsq_global) cut_coulsq_global = (X_CFLOAT) sdata->pair.cut_coulsq_global; if(sdata->pair.cut_coul) { for(int i = 1; i <= sdata->atom.ntypes; ++i) { for(int j = i; j <= sdata->atom.ntypes; ++j) { if(sdata->pair.cut_coul[i][j] > 1e-6) { - cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]); - cut_coulsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]); + cut_coulsq[i * cuda_ntypes + j] = (X_CFLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]); + cut_coulsq[j * cuda_ntypes + i] = (X_CFLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]); } if(i == 1 && j == 1) cut_coulsq_global = cut_coulsq[i * cuda_ntypes + j]; @@ -402,22 +402,22 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = cudaMemcpyToSymbol(MY_AP(cut_coulsq) , cut_coulsq , nx); } - cudaMemcpyToSymbol(MY_AP(cut_coulsq_global), &cut_coulsq_global , sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(cut_coulsq_global), &cut_coulsq_global , sizeof(X_CFLOAT)); } CUT_CHECK_ERROR("Cuda_Pair: init pre Coeff failed"); if(ncoeff > 0) { - F_FLOAT coeff1[cuda_ntypes2]; + F_CFLOAT coeff1[cuda_ntypes2]; for(int i = 1; i <= sdata->atom.ntypes; ++i) { for(int j = 1; j <= sdata->atom.ntypes; ++j) { - coeff1[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff1[i][j]; + coeff1[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff1[i][j]; } } if(use_global_params) { - cudaMemcpyToSymbol(MY_AP(coeff1_gm) , &sdata->pair.coeff1_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(coeff1_gm) , &sdata->pair.coeff1_gm.dev_data , sizeof(F_CFLOAT*)); cudaMemcpy((sdata->pair.coeff1_gm.dev_data), coeff1, n, cudaMemcpyHostToDevice); _coeff1_gm_tex.normalized = false; // access with normalized texture coordinates @@ -429,7 +429,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = #if F_PRECISION == 1 cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b failed"); - cudaBindTexture(0, coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); + cudaBindTexture(0, coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT)); CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c failed"); #else cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); @@ -445,16 +445,16 @@ void 
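[Context for the coeffN blocks that follow: each repeats one recipe on the use_global_params path. Publish the device buffer's address through a __constant__ symbol, copy the staged table into the buffer, then configure and bind a texture reference over it with point filtering and unnormalized coordinates (int2 texels in double precision). A condensed sketch of a single such block, with hypothetical names:]

texture<float, 1, cudaReadModeElementType> coeff_tex_sketch;
__device__ __constant__ float* d_coeff_gm_sketch;

static void upload_coeff_sketch(float* dev_buf, const float* host_tab, size_t bytes)
{
    cudaMemcpyToSymbol(d_coeff_gm_sketch, &dev_buf, sizeof(float*)); // for direct loads
    cudaMemcpy(dev_buf, host_tab, bytes, cudaMemcpyHostToDevice);

    coeff_tex_sketch.normalized = false;                // index by element, not [0,1)
    coeff_tex_sketch.filterMode = cudaFilterModePoint;  // no interpolation between texels
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    cudaBindTexture(0, coeff_tex_sketch, dev_buf, desc, bytes); // cached reads via tex1Dfetch
}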
Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 failed"); if(ncoeff > 1) { - F_FLOAT coeff2[cuda_ntypes2]; + F_CFLOAT coeff2[cuda_ntypes2]; for(int i = 1; i <= sdata->atom.ntypes; ++i) { for(int j = 1; j <= sdata->atom.ntypes; ++j) { - coeff2[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff2[i][j]; + coeff2[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff2[i][j]; } } if(use_global_params) { - cudaMemcpyToSymbol(MY_AP(coeff2_gm) , &sdata->pair.coeff2_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(coeff2_gm) , &sdata->pair.coeff2_gm.dev_data , sizeof(F_CFLOAT*)); cudaMemcpy(sdata->pair.coeff2_gm.dev_data, coeff2, n, cudaMemcpyHostToDevice); _coeff2_gm_tex.normalized = false; // access with normalized texture coordinates @@ -464,7 +464,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = #if F_PRECISION == 1 cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); + cudaBindTexture(0, coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT)); #else cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); cudaBindTexture(0, coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); @@ -477,16 +477,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = CUT_CHECK_ERROR("Cuda_Pair: init Coeff1 failed"); if(ncoeff > 2) { - F_FLOAT coeff3[cuda_ntypes2]; + F_CFLOAT coeff3[cuda_ntypes2]; for(int i = 1; i <= sdata->atom.ntypes; ++i) { for(int j = 1; j <= sdata->atom.ntypes; ++j) { - coeff3[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff3[i][j]; + coeff3[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff3[i][j]; } } if(use_global_params) { - cudaMemcpyToSymbol(MY_AP(coeff3_gm) , &sdata->pair.coeff3_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(coeff3_gm) , &sdata->pair.coeff3_gm.dev_data , sizeof(F_CFLOAT*)); cudaMemcpy(sdata->pair.coeff3_gm.dev_data, coeff3, n, cudaMemcpyHostToDevice); _coeff3_gm_tex.normalized = false; // access with normalized texture coordinates _coeff3_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no @@ -495,7 +495,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = #if F_PRECISION == 1 cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); + cudaBindTexture(0, coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT)); #else cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); cudaBindTexture(0, coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); @@ -507,16 +507,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = CUT_CHECK_ERROR("Cuda_Pair: init Coeff3 failed"); if(ncoeff > 3) { - F_FLOAT coeff4[cuda_ntypes2]; + F_CFLOAT coeff4[cuda_ntypes2]; for(int i = 1; i <= sdata->atom.ntypes; ++i) { for(int j = 1; j <= sdata->atom.ntypes; ++j) { - coeff4[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff4[i][j]; + coeff4[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff4[i][j]; } } if(use_global_params) { - 
cudaMemcpyToSymbol(MY_AP(coeff4_gm) , &sdata->pair.coeff4_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(coeff4_gm) , &sdata->pair.coeff4_gm.dev_data , sizeof(F_CFLOAT*)); cudaMemcpy(sdata->pair.coeff4_gm.dev_data, coeff4, n, cudaMemcpyHostToDevice); _coeff4_gm_tex.normalized = false; // access with normalized texture coordinates _coeff4_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no @@ -525,7 +525,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = #if F_PRECISION == 1 cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); + cudaBindTexture(0, coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT)); #else cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); cudaBindTexture(0, coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); @@ -537,16 +537,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = CUT_CHECK_ERROR("Cuda_Pair: init Coeff4 failed"); if(ncoeff > 4) { - F_FLOAT coeff5[cuda_ntypes2]; + F_CFLOAT coeff5[cuda_ntypes2]; for(int i = 1; i <= sdata->atom.ntypes; ++i) { for(int j = 1; j <= sdata->atom.ntypes; ++j) { - coeff5[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff5[i][j]; + coeff5[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff5[i][j]; } } if(use_global_params) { - cudaMemcpyToSymbol(MY_AP(coeff5_gm) , &sdata->pair.coeff5_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(coeff5_gm) , &sdata->pair.coeff5_gm.dev_data , sizeof(F_CFLOAT*)); cudaMemcpy(sdata->pair.coeff5_gm.dev_data, coeff5, n, cudaMemcpyHostToDevice); _coeff5_gm_tex.normalized = false; // access with normalized texture coordinates _coeff5_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no @@ -555,7 +555,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = #if F_PRECISION == 1 cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); + cudaBindTexture(0, coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT)); #else cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); cudaBindTexture(0, coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); @@ -567,16 +567,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = CUT_CHECK_ERROR("Cuda_Pair: init Coeff5 failed"); if(ncoeff > 5) { - F_FLOAT coeff6[cuda_ntypes2]; + F_CFLOAT coeff6[cuda_ntypes2]; for(int i = 1; i <= sdata->atom.ntypes; ++i) { for(int j = 1; j <= sdata->atom.ntypes; ++j) { - coeff6[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff6[i][j]; + coeff6[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff6[i][j]; } } if(use_global_params) { - cudaMemcpyToSymbol(MY_AP(coeff6_gm) , &sdata->pair.coeff6_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(coeff6_gm) , &sdata->pair.coeff6_gm.dev_data , sizeof(F_CFLOAT*)); cudaMemcpy(sdata->pair.coeff6_gm.dev_data, coeff6, n, cudaMemcpyHostToDevice); _coeff6_gm_tex.normalized = false; // access with normalized texture coordinates _coeff6_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no @@ 
-585,7 +585,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = #if F_PRECISION == 1 cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); + cudaBindTexture(0, coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT)); #else cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); cudaBindTexture(0, coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); @@ -596,16 +596,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = CUT_CHECK_ERROR("Cuda_Pair: init Coeff6 failed"); if(ncoeff > 6) { - F_FLOAT coeff7[cuda_ntypes2]; + F_CFLOAT coeff7[cuda_ntypes2]; for(int i = 1; i <= sdata->atom.ntypes; ++i) { for(int j = 1; j <= sdata->atom.ntypes; ++j) { - coeff7[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff7[i][j]; + coeff7[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff7[i][j]; } } if(use_global_params) { - cudaMemcpyToSymbol(MY_AP(coeff7_gm) , &sdata->pair.coeff7_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(coeff7_gm) , &sdata->pair.coeff7_gm.dev_data , sizeof(F_CFLOAT*)); cudaMemcpy(sdata->pair.coeff7_gm.dev_data, coeff7, n, cudaMemcpyHostToDevice); _coeff7_gm_tex.normalized = false; // access with normalized texture coordinates _coeff7_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no @@ -614,7 +614,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = #if F_PRECISION == 1 cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); + cudaBindTexture(0, coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT)); #else cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); cudaBindTexture(0, coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); @@ -625,16 +625,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = CUT_CHECK_ERROR("Cuda_Pair: init Coeff7 failed"); if(ncoeff > 7) { - F_FLOAT coeff8[cuda_ntypes2]; + F_CFLOAT coeff8[cuda_ntypes2]; for(int i = 1; i <= sdata->atom.ntypes; ++i) { for(int j = 1; j <= sdata->atom.ntypes; ++j) { - coeff8[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff8[i][j]; + coeff8[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff8[i][j]; } } if(use_global_params) { - cudaMemcpyToSymbol(MY_AP(coeff8_gm) , &sdata->pair.coeff8_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(coeff8_gm) , &sdata->pair.coeff8_gm.dev_data , sizeof(F_CFLOAT*)); cudaMemcpy(sdata->pair.coeff8_gm.dev_data, coeff8, n, cudaMemcpyHostToDevice); _coeff8_gm_tex.normalized = false; // access with normalized texture coordinates _coeff8_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no @@ -643,7 +643,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = #if F_PRECISION == 1 cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); + cudaBindTexture(0, coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, 
sdata->atom.nmax * sizeof(F_CFLOAT)); #else cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); cudaBindTexture(0, coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); @@ -654,16 +654,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = CUT_CHECK_ERROR("Cuda_Pair: init Coeff8 failed"); if(ncoeff > 8) { - F_FLOAT coeff9[cuda_ntypes2]; + F_CFLOAT coeff9[cuda_ntypes2]; for(int i = 1; i <= sdata->atom.ntypes; ++i) { for(int j = 1; j <= sdata->atom.ntypes; ++j) { - coeff9[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff9[i][j]; + coeff9[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff9[i][j]; } } if(use_global_params) { - cudaMemcpyToSymbol(MY_AP(coeff9_gm) , &sdata->pair.coeff9_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(coeff9_gm) , &sdata->pair.coeff9_gm.dev_data , sizeof(F_CFLOAT*)); cudaMemcpy(sdata->pair.coeff9_gm.dev_data, coeff9, n, cudaMemcpyHostToDevice); _coeff9_gm_tex.normalized = false; // access with normalized texture coordinates _coeff9_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no @@ -672,7 +672,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = #if F_PRECISION == 1 cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); - cudaBindTexture(0, coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); + cudaBindTexture(0, coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT)); #else cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc(); cudaBindTexture(0, coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); @@ -682,40 +682,40 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = CUT_CHECK_ERROR("Cuda_Pair: init Coeff9 failed"); - F_FLOAT special_lj[4]; + F_CFLOAT special_lj[4]; special_lj[0] = sdata->pair.special_lj[0]; special_lj[1] = sdata->pair.special_lj[1]; special_lj[2] = sdata->pair.special_lj[2]; special_lj[3] = sdata->pair.special_lj[3]; - X_FLOAT box_size[3] = { + X_CFLOAT box_size[3] = { sdata->domain.subhi[0] - sdata->domain.sublo[0], sdata->domain.subhi[1] - sdata->domain.sublo[1], sdata->domain.subhi[2] - sdata->domain.sublo[2] }; - cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3); + cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_CFLOAT) * 3); cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned)); - cudaMemcpyToSymbol(MY_AP(special_lj) , special_lj , sizeof(F_FLOAT) * 4); - cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(special_lj) , special_lj , sizeof(F_CFLOAT) * 4); + cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3); cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int)); if(need_q) { - F_FLOAT qqrd2e_tmp = sdata->pppm.qqrd2e; - F_FLOAT special_coul[4]; + F_CFLOAT qqrd2e_tmp = sdata->pppm.qqrd2e; + F_CFLOAT special_coul[4]; special_coul[0] = sdata->pair.special_coul[0]; special_coul[1] = 
sdata->pair.special_coul[1]; special_coul[2] = sdata->pair.special_coul[2]; special_coul[3] = sdata->pair.special_coul[3]; - cudaMemcpyToSymbol(MY_AP(special_coul) , special_coul , sizeof(F_FLOAT) * 4); - cudaMemcpyToSymbol(MY_AP(g_ewald) , &sdata->pair.g_ewald , sizeof(F_FLOAT)); - cudaMemcpyToSymbol(MY_AP(qqrd2e) , &qqrd2e_tmp , sizeof(F_FLOAT)); - cudaMemcpyToSymbol(MY_AP(kappa) , &sdata->pair.kappa , sizeof(F_FLOAT)); - cudaMemcpyToSymbol(MY_AP(eng_coul) , &sdata->pair.eng_coul.dev_data , sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(special_coul) , special_coul , sizeof(F_CFLOAT) * 4); + cudaMemcpyToSymbol(MY_AP(g_ewald) , &sdata->pair.g_ewald , sizeof(F_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(qqrd2e) , &qqrd2e_tmp , sizeof(F_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(kappa) , &sdata->pair.kappa , sizeof(F_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(eng_coul) , &sdata->pair.eng_coul.dev_data , sizeof(ENERGY_CFLOAT*)); } CUT_CHECK_ERROR("Cuda_Pair: init failed"); @@ -763,7 +763,7 @@ void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlis maxthreads = 64; } - int3 layout = getgrid(threadnum, sharedperproc * sizeof(ENERGY_FLOAT), maxthreads, true); //need to limit to 192 threads due to register limit + int3 layout = getgrid(threadnum, sharedperproc * sizeof(ENERGY_CFLOAT), maxthreads, true); //need to limit to 192 threads due to register limit threads.x = layout.z; threads.y = 1; threads.z = 1; @@ -771,9 +771,9 @@ void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlis grid.y = layout.y; grid.z = 1; - int size = (unsigned)(layout.y * layout.x) * sharedperproc * sizeof(ENERGY_FLOAT); + int size = (unsigned)(layout.y * layout.x) * sharedperproc * sizeof(ENERGY_CFLOAT); - if(sdata->pair.collect_forces_later) size += (unsigned)(sdata->atom.nmax * 3 * sizeof(F_FLOAT)); + if(sdata->pair.collect_forces_later) size += (unsigned)(sdata->atom.nmax * 3 * sizeof(F_CFLOAT)); Cuda_UpdateBuffer(sdata, size); @@ -787,7 +787,7 @@ void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlis my_gettime(CLOCK_REALTIME, &startpairtime); - MYDBG(printf("# CUDA: Cuda_Pair: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);) + MYDBG(printf("# CUDA: Cuda_Pair: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_CFLOAT)*threads.x);) } //Function which is called after the kernel invocation, collects energy and virial @@ -810,8 +810,8 @@ void Cuda_Pair_PostKernel_AllStyles(cuda_shared_data* sdata, dim3 &grid, int &sh grid.y = 1; dim3 threads(128, 1, 1); - MYDBG(printf("# CUDA: Cuda_Pair: virial compute kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);) - MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n); + MYDBG(printf("# CUDA: Cuda_Pair: virial compute kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_CFLOAT)*threads.x);) + MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>>(n); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_Pair: virial compute Kernel execution failed"); } @@ -863,15 +863,15 @@ void Cuda_Pair_UpdateNmax(cuda_shared_data* sdata) cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int)); cudaMemcpyToSymbol(MY_AP(nmax) 
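[Context for Cuda_Pair_PreKernel_AllStyles: every pair launch is sized through getgrid(), reserving sharedperproc ENERGY_CFLOAT accumulators per thread of dynamic shared memory and capping the block size (192, or 64 with charges) to stay inside the register budget. The grid is two-dimensional because grid.x alone is limited to 65535 blocks on the compute-capability-2.x hardware this library's Makefile targets. A simplified stand-in for that sizing logic:]

// Cover n work items with blocks of `threads` threads, splitting the block
// count over grid.x and grid.y whenever it exceeds the 65535-per-dimension cap.
static dim3 make_grid_sketch(int n, int threads)
{
    int blocks = (n + threads - 1) / threads;
    int gx = blocks, gy = 1;
    while (gx > 65535) {
        gy *= 2;
        gx = (blocks + gy - 1) / gy;
    }
    // launch as: kernel<<<grid, dim3(threads, 1, 1),
    //                     sharedperproc * sizeof(float) * threads>>>(...)
    return dim3(gx, gy, 1);
}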
, & sdata->atom.nmax , sizeof(int)); cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*)); - cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(radius) , & sdata->atom.radius .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_FLOAT4*)); - cudaMemcpyToSymbol(MY_AP(omega) , & sdata->atom.omega .dev_data, sizeof(V_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(omega_rmass), & sdata->atom.omega_rmass.dev_data, sizeof(V_FLOAT4*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_CFLOAT4*)); + cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold .dev_data, sizeof(X_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(radius) , & sdata->atom.radius .dev_data, sizeof(X_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_CFLOAT4*)); + cudaMemcpyToSymbol(MY_AP(omega) , & sdata->atom.omega .dev_data, sizeof(V_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(omega_rmass), & sdata->atom.omega_rmass.dev_data, sizeof(V_CFLOAT4*)); cudaMemcpyToSymbol(MY_AP(map_array), & sdata->atom.map_array .dev_data, sizeof(int*)); CUT_CHECK_ERROR("Cuda_Pair: updateNmax failed"); } @@ -999,7 +999,7 @@ void Cuda_Pair_CollectForces(cuda_shared_data* sdata, int eflag, int vflag) grid.y = 1; threads.x = 128; //printf("A grid.x: %i\n",grid.x); - MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n); + MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>>(n); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_Pair_CollectForces: virial compute Kernel execution failed"); } diff --git a/lib/cuda/cuda_pair_kernel.cu b/lib/cuda/cuda_pair_kernel.cu index 2c697f9c7e..1a377227b3 100644 --- a/lib/cuda/cuda_pair_kernel.cu +++ b/lib/cuda/cuda_pair_kernel.cu @@ -32,12 +32,12 @@ template __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_atom) { - ENERGY_FLOAT evdwl = ENERGY_F(0.0); - ENERGY_FLOAT ecoul = ENERGY_F(0.0); + ENERGY_CFLOAT evdwl = ENERGY_F(0.0); + ENERGY_CFLOAT ecoul = ENERGY_F(0.0); - ENERGY_FLOAT* sharedE; - ENERGY_FLOAT* sharedECoul; - ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; + ENERGY_CFLOAT* sharedE; + ENERGY_CFLOAT* sharedECoul; + ENERGY_CFLOAT* sharedV = &sharedmem[threadIdx.x]; if(eflag || eflag_atom) { sharedE = &sharedmem[threadIdx.x]; @@ -62,12 +62,12 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_ int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - X_FLOAT xtmp, ytmp, ztmp; - X_FLOAT4 myxtype; - F_FLOAT fxtmp, fytmp, fztmp, fpair; - F_FLOAT delx, dely, delz; - F_FLOAT factor_lj, factor_coul; - F_FLOAT qtmp; + X_CFLOAT xtmp, ytmp, ztmp; + X_CFLOAT4 myxtype; + F_CFLOAT fxtmp, fytmp, fztmp, fpair; + F_CFLOAT delx, dely, delz; + F_CFLOAT factor_lj, factor_coul; + F_CFLOAT qtmp; int itype, i, j; int jnum = 0; int* 
jlist; @@ -114,7 +114,7 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_ int jtype = static_cast (myxtype.w); - const F_FLOAT rsq = delx * delx + dely * dely + delz * delz; + const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz; bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]); @@ -171,7 +171,7 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_ } if(coul_type != COUL_NONE) { - const F_FLOAT qiqj = qtmp * fetchQ(j); + const F_CFLOAT qiqj = qtmp * fetchQ(j); if(qiqj * qiqj > 1e-8) { const bool in_coul_cutoff = @@ -188,7 +188,7 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_ break; case COUL_CUT: { - const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq); + const F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq); if(eflag) { ecoul += forcecoul; @@ -199,11 +199,11 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_ break; case COUL_DEBYE: { - const F_FLOAT r2inv = F_F(1.0) / rsq; - const X_FLOAT r = _RSQRT_(r2inv); - const X_FLOAT rinv = F_F(1.0) / r; - const F_FLOAT screening = _EXP_(-_kappa * r); - F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; + const F_CFLOAT r2inv = F_F(1.0) / rsq; + const X_CFLOAT r = _RSQRT_(r2inv); + const X_CFLOAT rinv = F_F(1.0) / r; + const F_CFLOAT screening = _EXP_(-_kappa * r); + F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; if(eflag) { ecoul += forcecoul * rinv; @@ -219,14 +219,14 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_ break; case COUL_LONG: { - const F_FLOAT r2inv = F_F(1.0) / rsq; - const F_FLOAT r = _RSQRT_(r2inv); - const F_FLOAT grij = _g_ewald * r; - const F_FLOAT expm2 = _EXP_(-grij * grij); - const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij); - const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2; - const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r); - F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); + const F_CFLOAT r2inv = F_F(1.0) / rsq; + const F_CFLOAT r = _RSQRT_(r2inv); + const F_CFLOAT grij = _g_ewald * r; + const F_CFLOAT expm2 = _EXP_(-grij * grij); + const F_CFLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij); + const F_CFLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2; + const F_CFLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r); + F_CFLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor; @@ -248,7 +248,7 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_ if(in_cutoff) { - F_FLOAT dxfp, dyfp, dzfp; + F_CFLOAT dxfp, dyfp, dzfp; fxtmp += dxfp = delx * fpair; fytmp += dyfp = dely * fpair; fztmp += dzfp = delz * fpair; @@ -268,10 +268,10 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_ __syncthreads(); if(ii < _inum) { - F_FLOAT* my_f; + F_CFLOAT* my_f; if(_collect_forces_later) { - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer; if(eflag) { buffer = &buffer[1 * gridDim.x * gridDim.y]; @@ -284,7 +284,7 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_ buffer = &buffer[6 * gridDim.x * gridDim.y]; } - my_f = (F_FLOAT*) buffer; + my_f = (F_CFLOAT*) buffer; my_f += i; *my_f = fxtmp; my_f += _nmax; @@ -337,14 +337,14 @@ __global__ void 
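[Context for the COUL_LONG branches below: the real-space Ewald interaction is evaluated with the Abramowitz & Stegun 7.1.26 polynomial fit for erfc (constants EWALD_P and A1..A5, with EWALD_F = 2/sqrt(pi)), which is far cheaper in the inner loop than calling erfc(). A standalone single-precision sketch of that branch, using the same constants as the kernel; the special-bond factor_coul correction is omitted:]

#define EWALD_F 1.12837917f   /* 2/sqrt(pi) */
#define EWALD_P 0.3275911f    /* A&S 7.1.26 fit constant */
#define A1  0.254829592f
#define A2 (-0.284496736f)
#define A3  1.421413741f
#define A4 (-1.453152027f)
#define A5  1.061405429f

// Real-space Ewald force contribution for charge product qiqj at squared
// distance rsq: prefactor * (erfc(g r) + EWALD_F * g r * exp(-g^2 r^2)) / r^2.
__device__ float coul_long_fpair_sketch(float rsq, float qiqj,
                                        float g_ewald, float qqrd2e)
{
    const float r      = sqrtf(rsq);
    const float grij   = g_ewald * r;
    const float expm2  = expf(-grij * grij);
    const float t      = 1.0f / (1.0f + EWALD_P * grij);
    const float erfc_r = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
    const float prefactor = qqrd2e * qiqj / r;
    return prefactor * (erfc_r + EWALD_F * grij * expm2) / rsq;
}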
Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_ if(ii >= _inum) return; - ENERGY_FLOAT evdwl = ENERGY_F(0.0); - ENERGY_FLOAT ecoul = ENERGY_F(0.0); - F_FLOAT3* sharedVirial1; - F_FLOAT3* sharedVirial2; - F_FLOAT* sharedEnergy; - F_FLOAT* sharedEnergyCoul; + ENERGY_CFLOAT evdwl = ENERGY_F(0.0); + ENERGY_CFLOAT ecoul = ENERGY_F(0.0); + F_CFLOAT3* sharedVirial1; + F_CFLOAT3* sharedVirial2; + F_CFLOAT* sharedEnergy; + F_CFLOAT* sharedEnergyCoul; - F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0]; + F_CFLOAT3* sharedForce = (F_CFLOAT3*) &sharedmem[0]; if(vflag) { sharedVirial1 = &sharedForce[64]; @@ -356,25 +356,25 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_ if(eflag) { if(vflag || vflag_atom) - sharedEnergy = (F_FLOAT*) &sharedVirial2[64]; + sharedEnergy = (F_CFLOAT*) &sharedVirial2[64]; else - sharedEnergy = (F_FLOAT*) &sharedForce[64]; + sharedEnergy = (F_CFLOAT*) &sharedForce[64]; if(coul_type != COUL_NONE) - sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64]; + sharedEnergyCoul = (F_CFLOAT*) &sharedEnergy[64]; } - F_FLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) }; - F_FLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) }; - F_FLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) }; + F_CFLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) }; + F_CFLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) }; + F_CFLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) }; - X_FLOAT xtmp, ytmp, ztmp; - X_FLOAT4 myxtype; - F_FLOAT delx, dely, delz; - F_FLOAT factor_lj, factor_coul; - F_FLOAT fpair; - F_FLOAT qtmp; + X_CFLOAT xtmp, ytmp, ztmp; + X_CFLOAT4 myxtype; + F_CFLOAT delx, dely, delz; + F_CFLOAT factor_lj, factor_coul; + F_CFLOAT fpair; + F_CFLOAT qtmp; int itype, jnum, i, j; int* jlist; @@ -413,7 +413,7 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_ delz = ztmp - myxtype.z; int jtype = static_cast (myxtype.w); - const F_FLOAT rsq = delx * delx + dely * dely + delz * delz; + const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz; bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? 
_cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]); bool in_coul_cutoff; @@ -471,7 +471,7 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_ } if(coul_type != COUL_NONE) { - const F_FLOAT qiqj = qtmp * fetchQ(j); + const F_CFLOAT qiqj = qtmp * fetchQ(j); if(qiqj * qiqj > (1e-8f)) { in_coul_cutoff = @@ -492,14 +492,14 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_ break; case COUL_LONG: { - const F_FLOAT r2inv = F_F(1.0) / rsq; - const F_FLOAT r = _RSQRT_(r2inv); - const F_FLOAT grij = _g_ewald * r; - const F_FLOAT expm2 = _EXP_(-grij * grij); - const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij); - const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2; - const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r); - F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); + const F_CFLOAT r2inv = F_F(1.0) / rsq; + const F_CFLOAT r = _RSQRT_(r2inv); + const F_CFLOAT grij = _g_ewald * r; + const F_CFLOAT expm2 = _EXP_(-grij * grij); + const F_CFLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij); + const F_CFLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2; + const F_CFLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r); + F_CFLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor; @@ -514,11 +514,11 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_ break; case COUL_DEBYE: { - const F_FLOAT r2inv = F_F(1.0) / rsq; - const X_FLOAT r = _RSQRT_(r2inv); - const X_FLOAT rinv = F_F(1.0) / r; - const F_FLOAT screening = _EXP_(-_kappa * r); - F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; + const F_CFLOAT r2inv = F_F(1.0) / rsq; + const X_CFLOAT r = _RSQRT_(r2inv); + const X_CFLOAT rinv = F_F(1.0) / r; + const F_CFLOAT screening = _EXP_(-_kappa * r); + F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; if(eflag) { ecoul += forcecoul * rinv; @@ -530,7 +530,7 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_ break; case COUL_CUT: { - const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq); + const F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq); if(eflag) { ecoul += forcecoul; @@ -549,7 +549,7 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_ if(in_cutoff || in_coul_cutoff) { - F_FLOAT dxfp, dyfp, dzfp; + F_CFLOAT dxfp, dyfp, dzfp; partialForce.x += dxfp = delx * fpair; partialForce.y += dyfp = dely * fpair; partialForce.z += dzfp = delz * fpair; @@ -613,10 +613,10 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_ if(threadIdx.x == 0) { - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer; if(eflag) { - ENERGY_FLOAT tmp_evdwl; + ENERGY_CFLOAT tmp_evdwl; buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergy[0]; if(eflag_atom) @@ -635,7 +635,7 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_ } if(vflag) { - ENERGY_FLOAT tmp; + ENERGY_CFLOAT tmp; buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].x; if(vflag_atom) _vatom[i + 0 * _nmax] = tmp; @@ -663,10 +663,10 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_ buffer = &buffer[6 * gridDim.x * gridDim.y]; } - F_FLOAT* my_f; + 
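[Context for the Pair_Kernel_BpA hunks: in the block-per-atom kernels one 64-thread block walks a single atom's neighbor list in parallel, so each thread only holds partialForce/partialVirial contributions. The 64-entry offsets into sharedmem above are the per-thread slots that must be reduced before thread 0 commits atom i's totals, and the ENERGY_F(0.5) factors account for the full neighbor list visiting each pair from both atoms. A sketch of that cooperative force reduction for a fixed 64-thread block:]

// Fold 64 per-thread partial forces for the same atom down to sf[0];
// the caller then lets threadIdx.x == 0 write sf[0] to the force buffer.
__device__ void reduce_force64_sketch(float3* sf, float3 partial)
{
    sf[threadIdx.x] = partial;
    __syncthreads();
    for (int stride = 32; stride > 0; stride >>= 1) {
        if (threadIdx.x < stride) {
            sf[threadIdx.x].x += sf[threadIdx.x + stride].x;
            sf[threadIdx.x].y += sf[threadIdx.x + stride].y;
            sf[threadIdx.x].z += sf[threadIdx.x + stride].z;
        }
        __syncthreads();
    }
}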
F_CFLOAT* my_f; if(_collect_forces_later) { - my_f = (F_FLOAT*) buffer; + my_f = (F_CFLOAT*) buffer; my_f += i; *my_f = sharedForce[0].x; my_f += _nmax; @@ -688,12 +688,12 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_ template __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase) { - ENERGY_FLOAT evdwl = ENERGY_F(0.0); - ENERGY_FLOAT ecoul = ENERGY_F(0.0); + ENERGY_CFLOAT evdwl = ENERGY_F(0.0); + ENERGY_CFLOAT ecoul = ENERGY_F(0.0); - ENERGY_FLOAT* sharedE; - ENERGY_FLOAT* sharedECoul; - ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; + ENERGY_CFLOAT* sharedE; + ENERGY_CFLOAT* sharedECoul; + ENERGY_CFLOAT* sharedV = &sharedmem[threadIdx.x]; if(eflag || eflag_atom) { sharedE = &sharedmem[threadIdx.x]; @@ -718,12 +718,12 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - X_FLOAT xtmp, ytmp, ztmp; - X_FLOAT4 myxtype; - F_FLOAT fxtmp, fytmp, fztmp, fpair; - F_FLOAT delx, dely, delz; - F_FLOAT factor_lj, factor_coul; - F_FLOAT qtmp; + X_CFLOAT xtmp, ytmp, ztmp; + X_CFLOAT4 myxtype; + F_CFLOAT fxtmp, fytmp, fztmp, fpair; + F_CFLOAT delx, dely, delz; + F_CFLOAT factor_lj, factor_coul; + F_CFLOAT qtmp; int itype, i, j; int jnum = 0; int* jlist; @@ -774,7 +774,7 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf int jtype = static_cast (myxtype.w); - const F_FLOAT rsq = delx * delx + dely * dely + delz * delz; + const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz; bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]); @@ -831,7 +831,7 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf } if(coul_type != COUL_NONE) { - const F_FLOAT qiqj = qtmp * fetchQ(j); + const F_CFLOAT qiqj = qtmp * fetchQ(j); if(qiqj * qiqj > 1e-8) { const bool in_coul_cutoff = @@ -848,7 +848,7 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf break; case COUL_CUT: { - const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq); + const F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq); if(eflag) { ecoul += forcecoul; @@ -859,11 +859,11 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf break; case COUL_DEBYE: { - const F_FLOAT r2inv = F_F(1.0) / rsq; - const X_FLOAT r = _RSQRT_(r2inv); - const X_FLOAT rinv = F_F(1.0) / r; - const F_FLOAT screening = _EXP_(-_kappa * r); - F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; + const F_CFLOAT r2inv = F_F(1.0) / rsq; + const X_CFLOAT r = _RSQRT_(r2inv); + const X_CFLOAT rinv = F_F(1.0) / r; + const F_CFLOAT screening = _EXP_(-_kappa * r); + F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; if(eflag) { ecoul += forcecoul * rinv; @@ -879,14 +879,14 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf break; case COUL_LONG: { - const F_FLOAT r2inv = F_F(1.0) / rsq; - const F_FLOAT r = _RSQRT_(r2inv); - const F_FLOAT grij = _g_ewald * r; - const F_FLOAT expm2 = _EXP_(-grij * grij); - const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij); - const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2; - const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r); - F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); + const F_CFLOAT r2inv = F_F(1.0) / rsq; + const F_CFLOAT r = 
_RSQRT_(r2inv); + const F_CFLOAT grij = _g_ewald * r; + const F_CFLOAT expm2 = _EXP_(-grij * grij); + const F_CFLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij); + const F_CFLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2; + const F_CFLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r); + F_CFLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor; @@ -909,7 +909,7 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf if(in_cutoff) { - F_FLOAT dxfp, dyfp, dzfp; + F_CFLOAT dxfp, dyfp, dzfp; fxtmp += dxfp = delx * fpair; fytmp += dyfp = dely * fpair; fztmp += dzfp = delz * fpair; @@ -929,10 +929,10 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf __syncthreads(); if(ii < (comm_phase < 2 ? _inum : _inum_border[0])) { - F_FLOAT* my_f; + F_CFLOAT* my_f; if(_collect_forces_later) { - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer; if(eflag) { buffer = &buffer[1 * gridDim.x * gridDim.y]; @@ -945,7 +945,7 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf buffer = &buffer[6 * gridDim.x * gridDim.y]; } - my_f = (F_FLOAT*) buffer; + my_f = (F_CFLOAT*) buffer; my_f += i; *my_f = fxtmp; my_f += _nmax; @@ -998,14 +998,14 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf if(ii >= (comm_phase < 2 ? _inum : _inum_border[0])) return; - ENERGY_FLOAT evdwl = ENERGY_F(0.0); - ENERGY_FLOAT ecoul = ENERGY_F(0.0); - F_FLOAT3* sharedVirial1; - F_FLOAT3* sharedVirial2; - F_FLOAT* sharedEnergy; - F_FLOAT* sharedEnergyCoul; + ENERGY_CFLOAT evdwl = ENERGY_F(0.0); + ENERGY_CFLOAT ecoul = ENERGY_F(0.0); + F_CFLOAT3* sharedVirial1; + F_CFLOAT3* sharedVirial2; + F_CFLOAT* sharedEnergy; + F_CFLOAT* sharedEnergyCoul; - F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0]; + F_CFLOAT3* sharedForce = (F_CFLOAT3*) &sharedmem[0]; if(vflag) { sharedVirial1 = &sharedForce[64]; @@ -1017,25 +1017,25 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf if(eflag) { if(vflag || vflag_atom) - sharedEnergy = (F_FLOAT*) &sharedVirial2[64]; + sharedEnergy = (F_CFLOAT*) &sharedVirial2[64]; else - sharedEnergy = (F_FLOAT*) &sharedForce[64]; + sharedEnergy = (F_CFLOAT*) &sharedForce[64]; if(coul_type != COUL_NONE) - sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64]; + sharedEnergyCoul = (F_CFLOAT*) &sharedEnergy[64]; } - F_FLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) }; - F_FLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) }; - F_FLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) }; + F_CFLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) }; + F_CFLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) }; + F_CFLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) }; - X_FLOAT xtmp, ytmp, ztmp; - X_FLOAT4 myxtype; - F_FLOAT delx, dely, delz; - F_FLOAT factor_lj, factor_coul; - F_FLOAT fpair; - F_FLOAT qtmp; + X_CFLOAT xtmp, ytmp, ztmp; + X_CFLOAT4 myxtype; + F_CFLOAT delx, dely, delz; + F_CFLOAT factor_lj, factor_coul; + F_CFLOAT fpair; + F_CFLOAT qtmp; int itype, jnum, i, j; int* jlist; @@ -1074,7 +1074,7 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf delz = ztmp - myxtype.z; int jtype = static_cast (myxtype.w); - const F_FLOAT rsq = delx * delx + dely * dely + delz * delz; + const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz; bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? 
_cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]); bool in_coul_cutoff; @@ -1132,7 +1132,7 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf } if(coul_type != COUL_NONE) { - const F_FLOAT qiqj = qtmp * fetchQ(j); + const F_CFLOAT qiqj = qtmp * fetchQ(j); if(qiqj * qiqj > (1e-8f)) { in_coul_cutoff = @@ -1153,14 +1153,14 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf break; case COUL_LONG: { - const F_FLOAT r2inv = F_F(1.0) / rsq; - const F_FLOAT r = _RSQRT_(r2inv); - const F_FLOAT grij = _g_ewald * r; - const F_FLOAT expm2 = _EXP_(-grij * grij); - const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij); - const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2; - const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r); - F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); + const F_CFLOAT r2inv = F_F(1.0) / rsq; + const F_CFLOAT r = _RSQRT_(r2inv); + const F_CFLOAT grij = _g_ewald * r; + const F_CFLOAT expm2 = _EXP_(-grij * grij); + const F_CFLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij); + const F_CFLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2; + const F_CFLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r); + F_CFLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor; @@ -1175,11 +1175,11 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf break; case COUL_DEBYE: { - const F_FLOAT r2inv = F_F(1.0) / rsq; - const X_FLOAT r = _RSQRT_(r2inv); - const X_FLOAT rinv = F_F(1.0) / r; - const F_FLOAT screening = _EXP_(-_kappa * r); - F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; + const F_CFLOAT r2inv = F_F(1.0) / rsq; + const X_CFLOAT r = _RSQRT_(r2inv); + const X_CFLOAT rinv = F_F(1.0) / r; + const F_CFLOAT screening = _EXP_(-_kappa * r); + F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; if(eflag) { ecoul += forcecoul * rinv; @@ -1191,7 +1191,7 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf break; case COUL_CUT: { - const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq); + const F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq); if(eflag) { ecoul += forcecoul; @@ -1210,7 +1210,7 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf if(in_cutoff || in_coul_cutoff) { - F_FLOAT dxfp, dyfp, dzfp; + F_CFLOAT dxfp, dyfp, dzfp; partialForce.x += dxfp = delx * fpair; partialForce.y += dyfp = dely * fpair; partialForce.z += dzfp = delz * fpair; @@ -1274,10 +1274,10 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf if(threadIdx.x == 0) { - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer; if(eflag) { - ENERGY_FLOAT tmp_evdwl; + ENERGY_CFLOAT tmp_evdwl; buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergy[0]; if(eflag_atom) @@ -1296,7 +1296,7 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf } if(vflag) { - ENERGY_FLOAT tmp; + ENERGY_CFLOAT tmp; buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].x; if(vflag_atom) _vatom[i + 0 * _nmax] = tmp; @@ -1324,10 +1324,10 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf buffer = &buffer[6 * gridDim.x * gridDim.y]; } - 
F_FLOAT* my_f; + F_CFLOAT* my_f; if(_collect_forces_later) { - my_f = (F_FLOAT*) buffer; + my_f = (F_CFLOAT*) buffer; my_f += i; *my_f = sharedForce[0].x; my_f += _nmax; @@ -1350,7 +1350,7 @@ __global__ void Pair_GenerateXType_Kernel() int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(i < _nall) { - X_FLOAT4 xtype; + X_CFLOAT4 xtype; xtype.x = _x[i]; xtype.y = _x[i + _nmax]; xtype.z = _x[i + 2 * _nmax]; @@ -1365,7 +1365,7 @@ __global__ void Pair_GenerateVRadius_Kernel() int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(i < _nall) { - V_FLOAT4 vradius; + V_CFLOAT4 vradius; vradius.x = _v[i]; vradius.y = _v[i + _nmax]; vradius.z = _v[i + 2 * _nmax]; @@ -1379,7 +1379,7 @@ __global__ void Pair_GenerateOmegaRmass_Kernel() int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(i < _nall) { - V_FLOAT4 omegarmass; + V_CFLOAT4 omegarmass; omegarmass.x = _omega[i]; omegarmass.y = _omega[i + _nmax]; omegarmass.z = _omega[i + 2 * _nmax]; @@ -1393,7 +1393,7 @@ __global__ void Pair_RevertXType_Kernel() int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(i < _nall) { - X_FLOAT4 xtype = _x_type[i]; + X_CFLOAT4 xtype = _x_type[i]; _x[i] = xtype.x; _x[i + _nmax] = xtype.y; _x[i + 2 * _nmax] = xtype.z; @@ -1407,7 +1407,7 @@ __global__ void Pair_BuildXHold_Kernel() int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(i < _nall) { - X_FLOAT4 xtype = _x_type[i]; + X_CFLOAT4 xtype = _x_type[i]; _xhold[i] = xtype.x; _xhold[i + _nmax] = xtype.y; _xhold[i + 2 * _nmax] = xtype.z; @@ -1421,10 +1421,10 @@ __global__ void Pair_CollectForces_Kernel(int nperblock, int n) if(i >= _nlocal) return; - ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer; + ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer; - F_FLOAT* buf_f = (F_FLOAT*) &buf[nperblock * n]; - F_FLOAT* my_f = _f + i; + F_CFLOAT* buf_f = (F_CFLOAT*) &buf[nperblock * n]; + F_CFLOAT* my_f = _f + i; buf_f += i; *my_f += * buf_f; my_f += _nmax; diff --git a/lib/cuda/cuda_pair_virial_kernel_nc.cu b/lib/cuda/cuda_pair_virial_kernel_nc.cu index 3987bde43e..97b249e3f5 100644 --- a/lib/cuda/cuda_pair_virial_kernel_nc.cu +++ b/lib/cuda/cuda_pair_virial_kernel_nc.cu @@ -21,12 +21,12 @@ This software is distributed under the GNU General Public License. 
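   A note on the layout being reduced in this file: each pair-kernel block
   above leaves one ENERGY_CFLOAT partial per quantity in _buffer, at
   buffer[blockIdx.x * gridDim.y + blockIdx.y + k * gridDim.x * gridDim.y],
   i.e. one plane of gridDim.x * gridDim.y entries per quantity k
   (evdwl first, then ecoul if present, then the six virial terms).
   A minimal sketch of the final sum over one plane, with hypothetical
   names (nblocks, host_buffer) not taken from this library:

       ENERGY_CFLOAT total_evdwl = ENERGY_F(0.0);
       for (int b = 0; b < nblocks; ++b)      // plane k = 0 holds evdwl
         total_evdwl += host_buffer[b + 0 * nblocks];

   In practice the PairVirialCompute_reduce kernel below performs this sum
   on the device, one block per quantity.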
------------------------------------------------------------------------- */ -extern __shared__ ENERGY_FLOAT sharedmem[]; +extern __shared__ ENERGY_CFLOAT sharedmem[]; static inline __device__ void PairVirialCompute_A_Kernel(int eflag, int vflag, int coulflag = 0) { __syncthreads(); - ENERGY_FLOAT* shared = sharedmem; + ENERGY_CFLOAT* shared = sharedmem; if(eflag) { reduceBlock(shared); @@ -49,7 +49,7 @@ static inline __device__ void PairVirialCompute_A_Kernel(int eflag, int vflag, i if(threadIdx.x == 0) { shared = sharedmem; - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer; if(eflag) { buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5) * shared[0]; @@ -79,8 +79,8 @@ static inline __device__ void PairVirialCompute_A_Kernel(int eflag, int vflag, i __global__ void MY_AP(PairVirialCompute_reduce)(int n) { sharedmem[threadIdx.x] = ENERGY_F(0.0); - ENERGY_FLOAT sum = ENERGY_F(0.0); - ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer; + ENERGY_CFLOAT sum = ENERGY_F(0.0); + ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer; buf = &buf[blockIdx.x * n]; //if(blockIdx.x==2) buf=&buf[n]; diff --git a/lib/cuda/cuda_precision.h b/lib/cuda/cuda_precision.h index 7582c41de1..2abcc658a3 100644 --- a/lib/cuda/cuda_precision.h +++ b/lib/cuda/cuda_precision.h @@ -25,7 +25,7 @@ #define CUDA_PRECISION_H_ /* This File gives Type definitions for mixed precision calculation in the cuda part of LAMMPS-CUDA. * Predefined behaviour is given by global CUDA_PRECISION (can be overwritten during compilation). - * ***_FLOAT: type definition of given property + * ***_CFLOAT: type definition of given property * ***_F: constant extension in code (1.0 is interpreted as double while 1.0f is interpreted as float, now use: 1.0CUDA_F) */ @@ -39,17 +39,17 @@ #ifdef CUDA_PRECISION #if CUDA_PRECISION == 1 -#define CUDA_FLOAT float +#define CUDA_CFLOAT float #define CUDA_F(x) x##f #endif #if CUDA_PRECISION == 2 -#define CUDA_FLOAT double +#define CUDA_CFLOAT double #define CUDA_F(x) x #endif #endif #ifndef CUDA_PRECISION -#define CUDA_FLOAT double +#define CUDA_CFLOAT double #define CUDA_F(x) x #define CUDA_PRECISION 2 #endif @@ -59,17 +59,17 @@ #ifdef FFT_PRECISION_CU #if FFT_PRECISION_CU == 1 -#define FFT_FLOAT float +#define FFT_CFLOAT float #define FFT_F(x) x##f #endif #if FFT_PRECISION_CU == 2 -#define FFT_FLOAT double +#define FFT_CFLOAT double #define FFT_F(x) x #endif #endif #ifndef FFT_PRECISION_CU -#define FFT_FLOAT CUDA_FLOAT +#define FFT_CFLOAT CUDA_CFLOAT #define FFT_F(x) CUDA_F(x) #define FFT_PRECISION_CU CUDA_PRECISION #endif @@ -84,24 +84,24 @@ #ifdef PPPM_PRECISION #if PPPM_PRECISION == 1 -#define PPPM_FLOAT float +#define PPPM_CFLOAT float #ifdef float3 -#define PPPM_FLOAT3 float3 +#define PPPM_CFLOAT3 float3 #else -struct PPPM_FLOAT3 { - PPPM_FLOAT x; - PPPM_FLOAT y; - PPPM_FLOAT z; +struct PPPM_CFLOAT3 { + PPPM_CFLOAT x; + PPPM_CFLOAT y; + PPPM_CFLOAT z; }; #endif #define PPPM_F(x) x##f #endif #if PPPM_PRECISION == 2 -#define PPPM_FLOAT double -struct PPPM_FLOAT3 { - PPPM_FLOAT x; - PPPM_FLOAT y; - PPPM_FLOAT z; +#define PPPM_CFLOAT double +struct PPPM_CFLOAT3 { + PPPM_CFLOAT x; + PPPM_CFLOAT y; + PPPM_CFLOAT z; }; #define PPPM_F(x) x #endif @@ -115,17 +115,17 @@ struct PPPM_FLOAT3 { #ifdef F_PRECISION #if F_PRECISION == 1 -#define F_FLOAT float +#define F_CFLOAT float #define F_F(x) x##f #endif #if F_PRECISION == 2 -#define F_FLOAT double +#define F_CFLOAT double #define F_F(x) x #endif #endif #ifndef F_PRECISION -#define F_FLOAT CUDA_FLOAT +#define F_CFLOAT 
CUDA_CFLOAT #define F_F(x) CUDA_F(x) #define F_PRECISION CUDA_PRECISION #endif @@ -141,48 +141,49 @@ struct PPPM_FLOAT3 { #endif #if F_PRECISION == 2 -struct F_FLOAT2 { - F_FLOAT x; - F_FLOAT y; +struct F_CFLOAT2 { + F_CFLOAT x; + F_CFLOAT y; }; -struct F_FLOAT3 { - F_FLOAT x; - F_FLOAT y; - F_FLOAT z; +struct F_CFLOAT3 { + F_CFLOAT x; + F_CFLOAT y; + F_CFLOAT z; }; -struct F_FLOAT4 { - F_FLOAT x; - F_FLOAT y; - F_FLOAT z; - F_FLOAT w; +struct F_CFLOAT4 { + F_CFLOAT x; + F_CFLOAT y; + F_CFLOAT z; + F_CFLOAT w; }; #else -#define F_FLOAT2 float2 -#define F_FLOAT3 float3 -#define F_FLOAT4 float4 +#define F_CFLOAT2 float2 +#define F_CFLOAT3 float3 +#define F_CFLOAT4 float4 #endif + //-------------------------------- //-----------ENERGY----------------- //-------------------------------- #ifndef ENERGY_PRECISION -#define ENERGY_FLOAT CUDA_FLOAT +#define ENERGY_CFLOAT CUDA_CFLOAT #define ENERGY_F(x) CUDA_F(x) #endif #ifdef ENERGY_PRECISION #if ENERGY_PRECISION == 1 -#define ENERGY_FLOAT float +#define ENERGY_CFLOAT float #define ENERGY_F(x) x##f #endif #if ENERGY_PRECISION == 2 -#define ENERGY_FLOAT double +#define ENERGY_CFLOAT double #define ENERGY_F(x) x #endif #endif #ifndef ENERGY_PRECISION -#define ENERGY_FLOAT CUDA_FLOAT +#define ENERGY_CFLOAT CUDA_CFLOAT #define ENERGY_F(x) CUDA_F(x) #define ENERGY_PRECISION CUDA_PRECISION #endif @@ -193,41 +194,41 @@ struct F_FLOAT4 { #ifdef X_PRECISION #if X_PRECISION == 1 -#define X_FLOAT float +#define X_CFLOAT float #define X_F(x) x##f #endif #if X_PRECISION == 2 -#define X_FLOAT double +#define X_CFLOAT double #define X_F(x) x #endif #endif #ifndef X_PRECISION -#define X_FLOAT CUDA_FLOAT +#define X_CFLOAT CUDA_CFLOAT #define X_F(x) CUDA_F(x) #define X_PRECISION CUDA_PRECISION #endif #if X_PRECISION == 2 -struct X_FLOAT2 { - X_FLOAT x; - X_FLOAT y; +struct X_CFLOAT2 { + X_CFLOAT x; + X_CFLOAT y; }; -struct X_FLOAT3 { - X_FLOAT x; - X_FLOAT y; - X_FLOAT z; +struct X_CFLOAT3 { + X_CFLOAT x; + X_CFLOAT y; + X_CFLOAT z; }; -struct X_FLOAT4 { - X_FLOAT x; - X_FLOAT y; - X_FLOAT z; - X_FLOAT w; +struct X_CFLOAT4 { + X_CFLOAT x; + X_CFLOAT y; + X_CFLOAT z; + X_CFLOAT w; }; #else -#define X_FLOAT2 float2 -#define X_FLOAT3 float3 -#define X_FLOAT4 float4 +#define X_CFLOAT2 float2 +#define X_CFLOAT3 float3 +#define X_CFLOAT4 float4 #endif //-------------------------------- @@ -236,30 +237,30 @@ struct X_FLOAT4 { #ifdef V_PRECISION #if V_PRECISION == 1 -#define V_FLOAT float +#define V_CFLOAT float #define V_F(x) x##f #endif #if V_PRECISION == 2 -#define V_FLOAT double +#define V_CFLOAT double #define V_F(x) x #endif #endif #ifndef V_PRECISION -#define V_FLOAT CUDA_FLOAT +#define V_CFLOAT CUDA_CFLOAT #define V_F(x) CUDA_F(x) #define V_PRECISION CUDA_PRECISION #endif #if V_PRECISION == 2 -struct V_FLOAT4 { - V_FLOAT x; - V_FLOAT y; - V_FLOAT z; - V_FLOAT w; +struct V_CFLOAT4 { + V_CFLOAT x; + V_CFLOAT y; + V_CFLOAT z; + V_CFLOAT w; }; #else -#define V_FLOAT4 float4 +#define V_CFLOAT4 float4 #endif #ifdef NO_PREC_TIMING diff --git a/lib/cuda/cuda_shared.h b/lib/cuda/cuda_shared.h index 1d29336b00..ac28e81757 100644 --- a/lib/cuda/cuda_shared.h +++ b/lib/cuda/cuda_shared.h @@ -61,9 +61,9 @@ struct cuda_shared_atom { // relevent data from atom class int need_eatom; int need_vatom; - dev_array x_type; // position + type in X_FLOAT4 struct - dev_array v_radius; // velociyt + radius in V_FLOAT4 struct currently only used for granular atom_style - dev_array omega_rmass; // velociyt + radius in V_FLOAT4 struct currently only used for granular atom_style + dev_array 
x_type; // position + type in X_CFLOAT4 struct + dev_array v_radius; // velociyt + radius in V_CFLOAT4 struct currently only used for granular atom_style + dev_array omega_rmass; // velociyt + radius in V_CFLOAT4 struct currently only used for granular atom_style double* mass_host; // remember per-type host pointer to masses //int natoms; // total # of atoms in system, could be 0 @@ -82,7 +82,7 @@ struct cuda_shared_atom { // relevent data from atom class int update_neigh; dev_array xhold; // position at last neighboring - X_FLOAT triggerneighsq; // maximum square movement before reneighboring + X_CFLOAT triggerneighsq; // maximum square movement before reneighboring int reneigh_flag; // is reneighboring necessary int maxhold; // size of xhold int dist_check; //perform distance check for reneighboring @@ -96,9 +96,9 @@ struct cuda_shared_atom { // relevent data from atom class struct cuda_shared_pair { // relevent data from pair class char cudable_force; // check for (cudable_force!=0) - X_FLOAT cut_global; - X_FLOAT cut_inner_global; - X_FLOAT cut_coul_global; + X_CFLOAT cut_global; + X_CFLOAT cut_inner_global; + X_CFLOAT cut_coul_global; double** cut; // type-type cutoff double** cutsq; // type-type cutoff double** cut_inner; // type-type cutoff for coul @@ -116,11 +116,11 @@ struct cuda_shared_pair { // relevent data from pair class double** offset; double* special_lj; double* special_coul; - dev_array virial; // ENERGY_FLOAT - dev_array eng_vdwl; // ENERGY_FLOAT - dev_array eng_coul; // ENERGY_FLOAT - X_FLOAT cut_coulsq_global; - F_FLOAT g_ewald, kappa; + dev_array virial; // ENERGY_CFLOAT + dev_array eng_vdwl; // ENERGY_CFLOAT + dev_array eng_coul; // ENERGY_CFLOAT + X_CFLOAT cut_coulsq_global; + F_CFLOAT g_ewald, kappa; int freeze_group_bit; dev_array coeff1_gm; @@ -144,48 +144,48 @@ struct cuda_shared_pair { // relevent data from pair class }; struct cuda_shared_domain { // relevent data from domain class - X_FLOAT sublo[3]; // orthogonal box -> sub-box bounds on this proc - X_FLOAT subhi[3]; - X_FLOAT boxlo[3]; - X_FLOAT boxhi[3]; - X_FLOAT prd[3]; + X_CFLOAT sublo[3]; // orthogonal box -> sub-box bounds on this proc + X_CFLOAT subhi[3]; + X_CFLOAT boxlo[3]; + X_CFLOAT boxhi[3]; + X_CFLOAT prd[3]; int periodicity[3]; // xyz periodicity as array int triclinic; - X_FLOAT xy; - X_FLOAT xz; - X_FLOAT yz; - X_FLOAT boxlo_lamda[3]; - X_FLOAT boxhi_lamda[3]; - X_FLOAT prd_lamda[3]; - X_FLOAT h[6]; - X_FLOAT h_inv[6]; - V_FLOAT h_rate[6]; + X_CFLOAT xy; + X_CFLOAT xz; + X_CFLOAT yz; + X_CFLOAT boxlo_lamda[3]; + X_CFLOAT boxhi_lamda[3]; + X_CFLOAT prd_lamda[3]; + X_CFLOAT h[6]; + X_CFLOAT h_inv[6]; + V_CFLOAT h_rate[6]; int update; }; struct cuda_shared_pppm { char cudable_force; #ifdef FFT_CUFFT - FFT_FLOAT* work1; - FFT_FLOAT* work2; - FFT_FLOAT* work3; - PPPM_FLOAT* greensfn; - PPPM_FLOAT* fkx; - PPPM_FLOAT* fky; - PPPM_FLOAT* fkz; - PPPM_FLOAT* vg; + FFT_CFLOAT* work1; + FFT_CFLOAT* work2; + FFT_CFLOAT* work3; + PPPM_CFLOAT* greensfn; + PPPM_CFLOAT* fkx; + PPPM_CFLOAT* fky; + PPPM_CFLOAT* fkz; + PPPM_CFLOAT* vg; #endif int* part2grid; - PPPM_FLOAT* density_brick; + PPPM_CFLOAT* density_brick; int* density_brick_int; - PPPM_FLOAT density_intScale; - PPPM_FLOAT* vdx_brick; - PPPM_FLOAT* vdy_brick; - PPPM_FLOAT* vdz_brick; - PPPM_FLOAT* density_fft; - ENERGY_FLOAT* energy; - ENERGY_FLOAT* virial; + PPPM_CFLOAT density_intScale; + PPPM_CFLOAT* vdx_brick; + PPPM_CFLOAT* vdy_brick; + PPPM_CFLOAT* vdz_brick; + PPPM_CFLOAT* density_fft; + ENERGY_CFLOAT* energy; + ENERGY_CFLOAT* virial; int 
nxlo_in; int nxhi_in; int nxlo_out; @@ -201,20 +201,20 @@ struct cuda_shared_pppm { int nx_pppm; int ny_pppm; int nz_pppm; - PPPM_FLOAT qqrd2e; + PPPM_CFLOAT qqrd2e; int order; // float3 sublo; - PPPM_FLOAT* rho_coeff; + PPPM_CFLOAT* rho_coeff; int nmax; int nlocal; - PPPM_FLOAT* debugdata; - PPPM_FLOAT delxinv; - PPPM_FLOAT delyinv; - PPPM_FLOAT delzinv; + PPPM_CFLOAT* debugdata; + PPPM_CFLOAT delxinv; + PPPM_CFLOAT delyinv; + PPPM_CFLOAT delzinv; int nlower; int nupper; - PPPM_FLOAT shiftone; - PPPM_FLOAT3* fH; + PPPM_CFLOAT shiftone; + PPPM_CFLOAT3* fH; }; struct cuda_shared_comm { @@ -262,7 +262,7 @@ struct cuda_shared_neighlist { // member of CudaNeighList, has no instance in cu int maxneighbors; int neigh_lists_per_page; double** cutneighsq; - CUDA_FLOAT* cu_cutneighsq; + CUDA_CFLOAT* cu_cutneighsq; int* binned_id; int* bin_dim; int bin_nmax; diff --git a/lib/cuda/domain.cu b/lib/cuda/domain.cu index 9dddbf65fd..e6993c6d1d 100644 --- a/lib/cuda/domain.cu +++ b/lib/cuda/domain.cu @@ -49,8 +49,8 @@ void Cuda_Domain_UpdateNmax(cuda_shared_data* sdata) { cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(image) , & sdata->atom.image.dev_data, sizeof(int*)); @@ -58,19 +58,19 @@ void Cuda_Domain_UpdateNmax(cuda_shared_data* sdata) void Cuda_Domain_UpdateDomain(cuda_shared_data* sdata) { - cudaMemcpyToSymbol(MY_AP(boxlo) , sdata->domain.boxlo , 3 * sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_AP(boxhi) , sdata->domain.boxhi , 3 * sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo , 3 * sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_AP(subhi) , sdata->domain.subhi , 3 * sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd , 3 * sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(boxlo) , sdata->domain.boxlo , 3 * sizeof(X_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(boxhi) , sdata->domain.boxhi , 3 * sizeof(X_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo , 3 * sizeof(X_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(subhi) , sdata->domain.subhi , 3 * sizeof(X_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd , 3 * sizeof(X_CFLOAT)); cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , 3 * sizeof(int)); cudaMemcpyToSymbol(MY_AP(triclinic) , & sdata->domain.triclinic , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(boxlo_lamda) , sdata->domain.boxlo_lamda , 3 * sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_AP(boxhi_lamda) , sdata->domain.boxhi_lamda , 3 * sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_AP(prd_lamda) , sdata->domain.prd_lamda , 3 * sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_AP(h) , sdata->domain.h , 6 * sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_AP(h_inv) , sdata->domain.h_inv , 6 * sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_AP(h_rate) , sdata->domain.h_rate , 6 * sizeof(V_FLOAT)); + cudaMemcpyToSymbol(MY_AP(boxlo_lamda) , sdata->domain.boxlo_lamda , 3 * sizeof(X_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(boxhi_lamda) , sdata->domain.boxhi_lamda , 3 * sizeof(X_CFLOAT)); + 
cudaMemcpyToSymbol(MY_AP(prd_lamda) , sdata->domain.prd_lamda , 3 * sizeof(X_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(h) , sdata->domain.h , 6 * sizeof(X_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(h_inv) , sdata->domain.h_inv , 6 * sizeof(X_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(h_rate) , sdata->domain.h_rate , 6 * sizeof(V_CFLOAT)); cudaMemcpyToSymbol(MY_AP(flag) , &sdata->flag , sizeof(int*)); cudaMemcpyToSymbol(MY_AP(debugdata) , &sdata->debugdata , sizeof(int*)); } @@ -94,15 +94,15 @@ void Cuda_Domain_PBC(cuda_shared_data* sdata, int deform_remap, int deform_group int sharedmem = 0; - if(box_change) sharedmem = 6 * sizeof(X_FLOAT); + if(box_change) sharedmem = 6 * sizeof(X_CFLOAT); int3 layout = getgrid(sdata->atom.nlocal, sharedmem); dim3 threads(layout.z, 1, 1); dim3 grid(layout.x, layout.y, 1); sharedmem *= threads.x; - if((box_change) && (sdata->buffer_new or (6 * sizeof(X_FLOAT)*grid.x * grid.y > sdata->buffersize))) - Cuda_Domain_UpdateBuffer(sdata, layout.x * layout.y * 6 * sizeof(X_FLOAT)); + if((box_change) && (sdata->buffer_new or (6 * sizeof(X_CFLOAT)*grid.x * grid.y > sdata->buffersize))) + Cuda_Domain_UpdateBuffer(sdata, layout.x * layout.y * 6 * sizeof(X_CFLOAT)); Domain_PBC_Kernel <<< grid, threads, sharedmem>>>(deform_remap, deform_groupbit, box_change); @@ -111,13 +111,13 @@ void Cuda_Domain_PBC(cuda_shared_data* sdata, int deform_remap, int deform_group CUT_CHECK_ERROR("Cuda_Domain_PBC: Kernel execution failed"); if(box_change) { - X_FLOAT buf2[6 * layout.x * layout.y]; - X_FLOAT* buf = buf2; + X_CFLOAT buf2[6 * layout.x * layout.y]; + X_CFLOAT* buf = buf2; int flag; - cudaMemcpy(buf, sdata->buffer, 6 * layout.x * layout.y * sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + cudaMemcpy(buf, sdata->buffer, 6 * layout.x * layout.y * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost); cudaMemcpy(&flag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); //printf("Flag: %i\n",flag); - X_FLOAT min, max; + X_CFLOAT min, max; min = 1.0 * BIG; max = -1.0 * BIG; @@ -160,7 +160,7 @@ void Cuda_Domain_PBC(cuda_shared_data* sdata, int deform_remap, int deform_group if(n<128) threads.x=32; else if(n<256) threads.x=64; else threads.x=128; - sharedmem=n*sizeof(X_FLOAT); + sharedmem=n*sizeof(X_CFLOAT); grid.x=6; grid.y=1; Domain_reduceBoxExtent<<>>(extent,n); diff --git a/lib/cuda/domain_kernel.cu b/lib/cuda/domain_kernel.cu index fedb7807a8..090ca53a91 100644 --- a/lib/cuda/domain_kernel.cu +++ b/lib/cuda/domain_kernel.cu @@ -21,7 +21,7 @@ This software is distributed under the GNU General Public License. 
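   The kernels below apply the triclinic box matrix stored as six X_CFLOAT
   values (LAMMPS convention: _h[0..2] are the box edge lengths xx, yy, zz
   and _h[3..5] the tilts yz, xz, xy). Written out, the lamda-to-x mapping
   used by Domain_lamda2x_Kernel is, for a lamda-space point (lx, ly, lz):

       x = _h[0]*lx + _h[5]*ly + _h[4]*lz + _boxlo[0];
       y =            _h[1]*ly + _h[3]*lz + _boxlo[1];
       z =                       _h[2]*lz + _boxlo[2];

   Domain_x2lamda_Kernel inverts this with the precomputed _h_inv entries.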
------------------------------------------------------------------------- */ -extern __shared__ X_FLOAT sharedmem[]; +extern __shared__ X_CFLOAT sharedmem[]; #define BIG 1e10 __global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box_change) @@ -29,9 +29,9 @@ __global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box int idim, otherdims; int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - X_FLOAT lo[3]; - X_FLOAT hi[3]; - X_FLOAT* period; + X_CFLOAT lo[3]; + X_CFLOAT hi[3]; + X_CFLOAT* period; if(_triclinic == 0) { lo[0] = _boxlo[0]; @@ -54,11 +54,11 @@ __global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box } - X_FLOAT tmpx = X_F(0.5) * (hi[0] + lo[0]); - X_FLOAT tmpy = X_F(0.5) * (hi[1] + lo[1]); - X_FLOAT tmpz = X_F(0.5) * (hi[2] + lo[2]); + X_CFLOAT tmpx = X_F(0.5) * (hi[0] + lo[0]); + X_CFLOAT tmpy = X_F(0.5) * (hi[1] + lo[1]); + X_CFLOAT tmpz = X_F(0.5) * (hi[2] + lo[2]); - X_FLOAT* buf = (X_FLOAT*) _buffer; + X_CFLOAT* buf = (X_CFLOAT*) _buffer; buf += blockIdx.x * gridDim.y + blockIdx.y; buf[0] = tmpx; buf += gridDim.x * gridDim.y; @@ -181,12 +181,12 @@ __global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box __syncthreads(); if(box_change) { - X_FLOAT minx = BIG; - X_FLOAT maxx = -BIG; - X_FLOAT miny = BIG; - X_FLOAT maxy = -BIG; - X_FLOAT minz = BIG; - X_FLOAT maxz = -BIG; + X_CFLOAT minx = BIG; + X_CFLOAT maxx = -BIG; + X_CFLOAT miny = BIG; + X_CFLOAT maxy = -BIG; + X_CFLOAT minz = BIG; + X_CFLOAT maxz = -BIG; if(not _periodicity[0]) { sharedmem[threadIdx.x] = tmpx; @@ -231,7 +231,7 @@ __global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box } if(threadIdx.x == 0) { - buf = (X_FLOAT*) _buffer; + buf = (X_CFLOAT*) _buffer; buf += blockIdx.x * gridDim.y + blockIdx.y; buf[0] = minx; buf += gridDim.x * gridDim.y; @@ -250,7 +250,7 @@ __global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box __global__ void Domain_reduceBoxExtent(double* extent, int n) { - X_FLOAT* buf = (X_FLOAT*) _buffer; + X_CFLOAT* buf = (X_CFLOAT*) _buffer; buf += blockIdx.x * n; copyGlobToShared(buf, sharedmem, n); @@ -267,8 +267,8 @@ __global__ void Domain_lamda2x_Kernel(int n) int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(i < n) { - X_FLOAT ytmp = _x[i + _nmax]; - X_FLOAT ztmp = _x[i + 2 * _nmax]; + X_CFLOAT ytmp = _x[i + _nmax]; + X_CFLOAT ztmp = _x[i + 2 * _nmax]; _x[i] = _h[0] * _x[i] + _h[5] * ytmp + _h[4] * ztmp + _boxlo[0]; _x[i + _nmax] = _h[1] * ytmp + _h[3] * ztmp + _boxlo[1]; _x[i + 2 * _nmax] = _h[2] * ztmp + _boxlo[2]; @@ -279,7 +279,7 @@ __global__ void Domain_x2lamda_Kernel(int n) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - X_FLOAT delta[3]; + X_CFLOAT delta[3]; if(i < n) { delta[0] = _x[i] - _boxlo[0]; diff --git a/lib/cuda/fft3d_cuda.cu b/lib/cuda/fft3d_cuda.cu index d5ac077f9d..22e8c26bac 100644 --- a/lib/cuda/fft3d_cuda.cu +++ b/lib/cuda/fft3d_cuda.cu @@ -25,15 +25,15 @@ #include "cuda_precision.h" #include "cuda_common.h" struct FFT_DATA { - FFT_FLOAT re; - FFT_FLOAT im; + FFT_CFLOAT re; + FFT_CFLOAT im; }; #include "fft3d_cuda_cu.h" #include "fft3d_cuda_kernel.cu" #include -void initfftdata(double* in, FFT_FLOAT* out, int nfast, int nmid, int nslow) +void initfftdata(double* in, FFT_CFLOAT* out, int nfast, int nmid, int nslow) { dim3 grid; @@ -62,7 +62,7 @@ void permute(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow) threads.x = nfast * 2; threads.y = 
1; threads.z = 1; - permute_kernel <<< grid, threads, 0>>>((FFT_FLOAT*)in, (FFT_FLOAT*)out); + permute_kernel <<< grid, threads, 0>>>((FFT_CFLOAT*)in, (FFT_CFLOAT*)out); cudaThreadSynchronize(); MYDBG(printf("ERROR-CUDA permute_kernel: %s\n", cudaGetErrorString(cudaGetLastError()))); } @@ -78,7 +78,7 @@ void permute_scale(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow) threads.x = nfast * 2; threads.y = 1; threads.z = 1; - permute_kernel <<< grid, threads, 0>>>((FFT_FLOAT*)in, (FFT_FLOAT*)out); + permute_kernel <<< grid, threads, 0>>>((FFT_CFLOAT*)in, (FFT_CFLOAT*)out); cudaThreadSynchronize(); } void permute_part(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo) @@ -92,7 +92,7 @@ void permute_part(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow, i threads.x = (khi - klo + 1) * 2; threads.y = 1; threads.z = 1; - permute_part_kernel <<< grid, threads, 0>>>((FFT_FLOAT*)in, (FFT_FLOAT*)out, nfast, nmid, nslow, ihi, ilo, jhi, jlo, khi, klo); + permute_part_kernel <<< grid, threads, 0>>>((FFT_CFLOAT*)in, (FFT_CFLOAT*)out, nfast, nmid, nslow, ihi, ilo, jhi, jlo, khi, klo); cudaThreadSynchronize(); } diff --git a/lib/cuda/fft3d_cuda_cu.h b/lib/cuda/fft3d_cuda_cu.h index 6447d8e125..bb1445c055 100644 --- a/lib/cuda/fft3d_cuda_cu.h +++ b/lib/cuda/fft3d_cuda_cu.h @@ -23,7 +23,7 @@ #include "cuda_shared.h" -extern "C" void initfftdata(double* in, FFT_FLOAT* out, int nfast, int nmid, int nslow); +extern "C" void initfftdata(double* in, FFT_CFLOAT* out, int nfast, int nmid, int nslow); extern "C" void permute(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow); extern "C" void permute_scale(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow); extern "C" void permute_part(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo); diff --git a/lib/cuda/fft3d_cuda_kernel.cu b/lib/cuda/fft3d_cuda_kernel.cu index 8a1be74bb1..cd8e4a1b73 100644 --- a/lib/cuda/fft3d_cuda_kernel.cu +++ b/lib/cuda/fft3d_cuda_kernel.cu @@ -21,24 +21,24 @@ This software is distributed under the GNU General Public License. 
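   The FFT kernels below treat complex data as interleaved re/im pairs
   (matching the FFT_DATA struct of two FFT_CFLOATs), so initfftdata_kernel
   packs a real-valued double grid into complex storage by zeroing every
   imaginary slot. A host-side equivalent, for illustration only (n is a
   hypothetical element count):

       for (int i = 0; i < n; ++i) {
         out[2 * i]     = (FFT_CFLOAT) in[i];  // real part
         out[2 * i + 1] = 0;                   // imaginary part
       }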
------------------------------------------------------------------------- */ -__global__ void initfftdata_kernel(double* in, FFT_FLOAT* out) +__global__ void initfftdata_kernel(double* in, FFT_CFLOAT* out) { out[2 * (((blockIdx.x * gridDim.y + blockIdx.y)*blockDim.x) + threadIdx.x)] = in[((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x]; out[2 * (((blockIdx.x * gridDim.y + blockIdx.y)*blockDim.x) + threadIdx.x) + 1] = 0; } -__global__ void permute_kernel(FFT_FLOAT* in, FFT_FLOAT* out) +__global__ void permute_kernel(FFT_CFLOAT* in, FFT_CFLOAT* out) { out[2 * (((threadIdx.x / 2)*gridDim.x + blockIdx.x)*gridDim.y + blockIdx.y) + threadIdx.x - 2 * (threadIdx.x / 2)] = in[((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x]; } -__global__ void permute_scale_kernel(FFT_FLOAT* in, FFT_FLOAT* out) +__global__ void permute_scale_kernel(FFT_CFLOAT* in, FFT_CFLOAT* out) { out[2 * (((threadIdx.x / 2)*gridDim.x + blockIdx.x)*gridDim.y + blockIdx.y) + threadIdx.x - 2 * (threadIdx.x / 2)] = in[((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x] * gridDim.x * gridDim.y * blockDim.x * 0.5; } -__global__ void permute_part_kernel(FFT_FLOAT* in, FFT_FLOAT* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo) +__global__ void permute_part_kernel(FFT_CFLOAT* in, FFT_CFLOAT* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo) { { out[2 * ((threadIdx.x / 2) * (ihi - ilo + 1) * (jhi - jlo + 1) + (blockIdx.x) * (jhi - jlo + 1) + blockIdx.y - jlo) + threadIdx.x - 2 * (threadIdx.x / 2)] = in[2 * (blockIdx.x + ilo) * nmid * nslow + 2 * (blockIdx.y + jlo) * nmid + threadIdx.x + 2 * klo]; diff --git a/lib/cuda/fix_addforce_cuda.cu b/lib/cuda/fix_addforce_cuda.cu index c1ecefba45..9c2a761f89 100644 --- a/lib/cuda/fix_addforce_cuda.cu +++ b/lib/cuda/fix_addforce_cuda.cu @@ -33,10 +33,10 @@ void Cuda_FixAddForceCuda_UpdateBuffer(cuda_shared_data* sdata) { - int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_FLOAT)); + int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT)); dim3 threads(layout.z, 1, 1); dim3 grid(layout.x, layout.y, 1); - int size = (unsigned)(layout.z * layout.y * layout.x) * 4 * sizeof(F_FLOAT); + int size = (unsigned)(layout.z * layout.y * layout.x) * 4 * sizeof(F_CFLOAT); if(sdata->buffersize < size) { MYDBG(printf("Cuda_FixAddForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) @@ -55,8 +55,8 @@ void Cuda_FixAddForceCuda_UpdateNmax(cuda_shared_data* sdata) cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*)); } void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata) @@ -64,7 +64,7 @@ void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata) Cuda_FixAddForceCuda_UpdateNmax(sdata); } -void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT axvalue, F_FLOAT ayvalue, F_FLOAT azvalue, F_FLOAT* aforiginal) +void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue, F_CFLOAT* 
aforiginal) { if(sdata->atom.update_nmax) Cuda_FixAddForceCuda_UpdateNmax(sdata); @@ -75,18 +75,18 @@ void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLO if(sdata->buffer_new) Cuda_FixAddForceCuda_UpdateBuffer(sdata); - int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_FLOAT)); + int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT)); dim3 threads(layout.z, 1, 1); dim3 grid(layout.x, layout.y, 1); - Cuda_FixAddForceCuda_PostForce_Kernel <<< grid, threads, threads.x* 4* sizeof(F_FLOAT)>>> (groupbit, axvalue, ayvalue, azvalue); + Cuda_FixAddForceCuda_PostForce_Kernel <<< grid, threads, threads.x* 4* sizeof(F_CFLOAT)>>> (groupbit, axvalue, ayvalue, azvalue); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Compute Kernel execution failed"); int oldgrid = grid.x; grid.x = 4; threads.x = 512; - reduce_foriginal <<< grid, threads, threads.x* sizeof(F_FLOAT)>>> (oldgrid, aforiginal); + reduce_foriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, aforiginal); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Reduce Kernel execution failed"); diff --git a/lib/cuda/fix_addforce_cuda_cu.h b/lib/cuda/fix_addforce_cuda_cu.h index 1bf59300c9..3e587db271 100644 --- a/lib/cuda/fix_addforce_cuda_cu.h +++ b/lib/cuda/fix_addforce_cuda_cu.h @@ -24,4 +24,4 @@ #include "cuda_shared.h" extern "C" void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata); -extern "C" void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT axvalue, F_FLOAT ayvalue, F_FLOAT azvalue, F_FLOAT* aforiginal); +extern "C" void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue, F_CFLOAT* aforiginal); diff --git a/lib/cuda/fix_addforce_cuda_kernel.cu b/lib/cuda/fix_addforce_cuda_kernel.cu index e0265f3797..750d80d047 100644 --- a/lib/cuda/fix_addforce_cuda_kernel.cu +++ b/lib/cuda/fix_addforce_cuda_kernel.cu @@ -21,10 +21,10 @@ This software is distributed under the GNU General Public License. 
------------------------------------------------------------------------- */ -extern __shared__ F_FLOAT sharedmem[]; +extern __shared__ F_CFLOAT sharedmem[]; -__global__ void Cuda_FixAddForceCuda_PostForce_Kernel(int groupbit, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue) +__global__ void Cuda_FixAddForceCuda_PostForce_Kernel(int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; @@ -51,7 +51,7 @@ __global__ void Cuda_FixAddForceCuda_PostForce_Kernel(int groupbit, F_FLOAT xval reduceBlock(&sharedmem[blockDim.x]); reduceBlock(&sharedmem[2 * blockDim.x]); reduceBlock(&sharedmem[3 * blockDim.x]); - F_FLOAT* buffer = (F_FLOAT*) _buffer; + F_CFLOAT* buffer = (F_CFLOAT*) _buffer; if(threadIdx.x == 0) { buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0]; @@ -63,12 +63,12 @@ __global__ void Cuda_FixAddForceCuda_PostForce_Kernel(int groupbit, F_FLOAT xval } -__global__ void reduce_foriginal(int n, F_FLOAT* foriginal) +__global__ void reduce_foriginal(int n, F_CFLOAT* foriginal) { int i = 0; sharedmem[threadIdx.x] = 0; - F_FLOAT myforig = 0.0; - F_FLOAT* buf = (F_FLOAT*) _buffer; + F_CFLOAT myforig = 0.0; + F_CFLOAT* buf = (F_CFLOAT*) _buffer; buf = &buf[blockIdx.x * n]; while(i < n) { diff --git a/lib/cuda/fix_aveforce_cuda.cu b/lib/cuda/fix_aveforce_cuda.cu index 600f1d95e0..a9c19d65f3 100644 --- a/lib/cuda/fix_aveforce_cuda.cu +++ b/lib/cuda/fix_aveforce_cuda.cu @@ -33,10 +33,10 @@ void Cuda_FixAveForceCuda_UpdateBuffer(cuda_shared_data* sdata) { - int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_FLOAT)); + int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT)); dim3 threads(layout.z, 1, 1); dim3 grid(layout.x, layout.y, 1); - int size = (unsigned)(layout.z * layout.y * layout.x) * 4 * sizeof(F_FLOAT); + int size = (unsigned)(layout.z * layout.y * layout.x) * 4 * sizeof(F_CFLOAT); if(sdata->buffersize < size) { MYDBG(printf("Cuda_FixAveForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) @@ -55,8 +55,8 @@ void Cuda_FixAveForceCuda_UpdateNmax(cuda_shared_data* sdata) cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*)); } void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata) @@ -64,7 +64,7 @@ void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata) Cuda_FixAveForceCuda_UpdateNmax(sdata); } -void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, F_FLOAT* aforiginal) +void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, F_CFLOAT* aforiginal) { if(sdata->atom.update_nmax) Cuda_FixAveForceCuda_UpdateNmax(sdata); @@ -75,25 +75,25 @@ void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, if(sdata->buffer_new) Cuda_FixAveForceCuda_UpdateBuffer(sdata); - int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_FLOAT)); + int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT)); dim3 threads(layout.z, 1, 1); dim3 grid(layout.x, layout.y, 1); - Cuda_FixAveForceCuda_PostForce_FOrg_Kernel <<< grid, threads, threads.x* 
4* sizeof(F_FLOAT)>>> (groupbit); + Cuda_FixAveForceCuda_PostForce_FOrg_Kernel <<< grid, threads, threads.x* 4* sizeof(F_CFLOAT)>>> (groupbit); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Compute Kernel execution failed"); int oldgrid = grid.x; grid.x = 4; threads.x = 512; - Cuda_FixAveForceCuda_reduce_foriginal <<< grid, threads, threads.x* sizeof(F_FLOAT)>>> (oldgrid, aforiginal); + Cuda_FixAveForceCuda_reduce_foriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, aforiginal); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Reduce Kernel execution failed"); } -void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, F_FLOAT axvalue, F_FLOAT ayvalue, F_FLOAT azvalue) +void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue) { int3 layout = getgrid(sdata->atom.nlocal); dim3 threads(layout.z, 1, 1); diff --git a/lib/cuda/fix_aveforce_cuda_cu.h b/lib/cuda/fix_aveforce_cuda_cu.h index 6d58a472e0..6c8553f052 100644 --- a/lib/cuda/fix_aveforce_cuda_cu.h +++ b/lib/cuda/fix_aveforce_cuda_cu.h @@ -24,5 +24,5 @@ #include "cuda_shared.h" extern "C" void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata); -extern "C" void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, F_FLOAT* aforiginal); -extern "C" void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, F_FLOAT axvalue, F_FLOAT ayvalue, F_FLOAT azvalue); +extern "C" void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, F_CFLOAT* aforiginal); +extern "C" void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue); diff --git a/lib/cuda/fix_aveforce_cuda_kernel.cu b/lib/cuda/fix_aveforce_cuda_kernel.cu index 37d80d92e8..8d397a3289 100644 --- a/lib/cuda/fix_aveforce_cuda_kernel.cu +++ b/lib/cuda/fix_aveforce_cuda_kernel.cu @@ -21,7 +21,7 @@ This software is distributed under the GNU General Public License. 
------------------------------------------------------------------------- */ -extern __shared__ F_FLOAT sharedmem[]; +extern __shared__ F_CFLOAT sharedmem[]; __global__ void Cuda_FixAveForceCuda_PostForce_FOrg_Kernel(int groupbit) @@ -44,7 +44,7 @@ __global__ void Cuda_FixAveForceCuda_PostForce_FOrg_Kernel(int groupbit) reduceBlock(&sharedmem[blockDim.x]); reduceBlock(&sharedmem[2 * blockDim.x]); reduceBlock(&sharedmem[3 * blockDim.x]); - F_FLOAT* buffer = (F_FLOAT*) _buffer; + F_CFLOAT* buffer = (F_CFLOAT*) _buffer; if(threadIdx.x == 0) { buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0]; @@ -55,12 +55,12 @@ __global__ void Cuda_FixAveForceCuda_PostForce_FOrg_Kernel(int groupbit) } -__global__ void Cuda_FixAveForceCuda_reduce_foriginal(int n, F_FLOAT* foriginal) +__global__ void Cuda_FixAveForceCuda_reduce_foriginal(int n, F_CFLOAT* foriginal) { int i = 0; sharedmem[threadIdx.x] = 0; - F_FLOAT myforig = 0.0; - F_FLOAT* buf = (F_FLOAT*) _buffer; + F_CFLOAT myforig = 0.0; + F_CFLOAT* buf = (F_CFLOAT*) _buffer; buf = &buf[blockIdx.x * n]; while(i < n) { @@ -81,7 +81,7 @@ __global__ void Cuda_FixAveForceCuda_reduce_foriginal(int n, F_FLOAT* foriginal) foriginal[blockIdx.x] = myforig; } -__global__ void Cuda_FixAveForceCuda_PostForce_Set_Kernel(int groupbit, int xflag, int yflag, int zflag, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue) +__global__ void Cuda_FixAveForceCuda_PostForce_Set_Kernel(int groupbit, int xflag, int yflag, int zflag, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; diff --git a/lib/cuda/fix_enforce2d_cuda.cu b/lib/cuda/fix_enforce2d_cuda.cu index cc48ed070d..124311f71f 100644 --- a/lib/cuda/fix_enforce2d_cuda.cu +++ b/lib/cuda/fix_enforce2d_cuda.cu @@ -34,8 +34,8 @@ void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata) cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*)); } void Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit) diff --git a/lib/cuda/fix_freeze_cuda.cu b/lib/cuda/fix_freeze_cuda.cu index 613c76bbde..c805418622 100644 --- a/lib/cuda/fix_freeze_cuda.cu +++ b/lib/cuda/fix_freeze_cuda.cu @@ -32,10 +32,10 @@ void Cuda_FixFreezeCuda_UpdateBuffer(cuda_shared_data* sdata) { - int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT)); + int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT)); dim3 threads(layout.z, 1, 1); dim3 grid(layout.x, layout.y, 1); - int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_FLOAT); + int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_CFLOAT); if(sdata->buffersize < size) { MYDBG(printf("Cuda_FixFreezeCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) @@ -55,9 +55,9 @@ void Cuda_FixFreezeCuda_UpdateNmax(cuda_shared_data* sdata) cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(x) , & 
sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(torque) , & sdata->atom.torque .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(torque) , & sdata->atom.torque .dev_data, sizeof(F_CFLOAT*)); } @@ -68,7 +68,7 @@ void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata) } -void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT* foriginal) +void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT* foriginal) { if(sdata->atom.update_nmax) Cuda_FixFreezeCuda_UpdateNmax(sdata); @@ -80,18 +80,18 @@ void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT Cuda_FixFreezeCuda_UpdateBuffer(sdata); - int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT)); + int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT)); dim3 threads(layout.z, 1, 1); dim3 grid(layout.x, layout.y, 1); - Cuda_FixFreezeCuda_PostForce_Kernel <<< grid, threads, threads.x* 3* sizeof(F_FLOAT)>>> (groupbit); + Cuda_FixFreezeCuda_PostForce_Kernel <<< grid, threads, threads.x* 3* sizeof(F_CFLOAT)>>> (groupbit); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_Cuda_FixFreezeCuda_PostForce: fix add_force post_force compute Kernel execution failed"); int oldgrid = grid.x; grid.x = 3; threads.x = 512; - Cuda_FixFreezeCuda_Reduce_FOriginal <<< grid, threads, threads.x* sizeof(F_FLOAT)>>> (oldgrid, foriginal); + Cuda_FixFreezeCuda_Reduce_FOriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, foriginal); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_Cuda_FixFreezeCuda_PostForce: fix add_force post_force reduce Kernel execution failed"); diff --git a/lib/cuda/fix_freeze_cuda_cu.h b/lib/cuda/fix_freeze_cuda_cu.h index e8b21a9558..0c31c3feee 100644 --- a/lib/cuda/fix_freeze_cuda_cu.h +++ b/lib/cuda/fix_freeze_cuda_cu.h @@ -24,4 +24,4 @@ #include "cuda_shared.h" extern "C" void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata); -extern "C" void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT* foriginal); +extern "C" void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT* foriginal); diff --git a/lib/cuda/fix_freeze_cuda_kernel.cu b/lib/cuda/fix_freeze_cuda_kernel.cu index 5f5057c87d..a23ee8317f 100644 --- a/lib/cuda/fix_freeze_cuda_kernel.cu +++ b/lib/cuda/fix_freeze_cuda_kernel.cu @@ -21,7 +21,7 @@ This software is distributed under the GNU General Public License. 
------------------------------------------------------------------------- */ -extern __shared__ F_FLOAT sharedmem[]; +extern __shared__ F_CFLOAT sharedmem[]; __global__ void Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit) @@ -49,7 +49,7 @@ __global__ void Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit) reduceBlock(sharedmem); reduceBlock(&sharedmem[blockDim.x]); reduceBlock(&sharedmem[2 * blockDim.x]); - F_FLOAT* buffer = (F_FLOAT*)_buffer; + F_CFLOAT* buffer = (F_CFLOAT*)_buffer; if(threadIdx.x == 0) { buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0]; @@ -59,12 +59,12 @@ __global__ void Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit) } -__global__ void Cuda_FixFreezeCuda_Reduce_FOriginal(int n, F_FLOAT* foriginal) +__global__ void Cuda_FixFreezeCuda_Reduce_FOriginal(int n, F_CFLOAT* foriginal) { int i = 0; sharedmem[threadIdx.x] = 0; - F_FLOAT myforig = 0.0; - F_FLOAT* buf = (F_FLOAT*)_buffer; + F_CFLOAT myforig = 0.0; + F_CFLOAT* buf = (F_CFLOAT*)_buffer; buf = &buf[blockIdx.x * n]; while(i < n) { diff --git a/lib/cuda/fix_gravity_cuda.cu b/lib/cuda/fix_gravity_cuda.cu index 0fc7051b86..dd9ab9b7bb 100644 --- a/lib/cuda/fix_gravity_cuda.cu +++ b/lib/cuda/fix_gravity_cuda.cu @@ -32,10 +32,10 @@ void Cuda_FixGravityCuda_UpdateBuffer(cuda_shared_data* sdata) { - int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT)); + int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT)); dim3 threads(layout.z, 1, 1); dim3 grid(layout.x, layout.y, 1); - int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_FLOAT); + int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_CFLOAT); if(sdata->buffersize < size) { MYDBG(printf("Cuda_FixGravityCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) @@ -55,12 +55,12 @@ void Cuda_FixGravityCuda_UpdateNmax(cuda_shared_data* sdata) cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int)); - cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_CFLOAT*)); } void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata) @@ -70,7 +70,7 @@ void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata) } -void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT xacc, F_FLOAT yacc, F_FLOAT zacc) +void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xacc, F_CFLOAT yacc, F_CFLOAT zacc) { if(sdata->atom.update_nmax) Cuda_FixGravityCuda_UpdateNmax(sdata); diff --git a/lib/cuda/fix_gravity_cuda_cu.h b/lib/cuda/fix_gravity_cuda_cu.h index 014b71f011..46043a152c 100644 --- a/lib/cuda/fix_gravity_cuda_cu.h +++ b/lib/cuda/fix_gravity_cuda_cu.h @@ -24,4 +24,4 @@ 
#include "cuda_shared.h" extern "C" void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata); -extern "C" void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT xacc, F_FLOAT yacc, F_FLOAT zacc); +extern "C" void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xacc, F_CFLOAT yacc, F_CFLOAT zacc); diff --git a/lib/cuda/fix_gravity_cuda_kernel.cu b/lib/cuda/fix_gravity_cuda_kernel.cu index ba58d39bc8..271033060d 100644 --- a/lib/cuda/fix_gravity_cuda_kernel.cu +++ b/lib/cuda/fix_gravity_cuda_kernel.cu @@ -21,13 +21,13 @@ This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ -__global__ void Cuda_FixGravityCuda_PostForce_Kernel(int groupbit, F_FLOAT xacc, F_FLOAT yacc, F_FLOAT zacc) +__global__ void Cuda_FixGravityCuda_PostForce_Kernel(int groupbit, F_CFLOAT xacc, F_CFLOAT yacc, F_CFLOAT zacc) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(i < _nlocal) if(_mask[i] & groupbit) { - F_FLOAT mass = _rmass_flag ? _rmass[i] : _mass[_type[i]]; + F_CFLOAT mass = _rmass_flag ? _rmass[i] : _mass[_type[i]]; _f[i] += mass * xacc; _f[i + 1 * _nmax] += mass * yacc; _f[i + 2 * _nmax] += mass * zacc; diff --git a/lib/cuda/fix_nh_cuda.cu b/lib/cuda/fix_nh_cuda.cu index cda10bd588..6e3138a57f 100644 --- a/lib/cuda/fix_nh_cuda.cu +++ b/lib/cuda/fix_nh_cuda.cu @@ -32,21 +32,21 @@ void Cuda_FixNHCuda_UpdateNmax(cuda_shared_data* sdata) { - cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(debugdata) , & sdata->debugdata, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data, sizeof(V_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_FLOAT*)); //might be moved to a neighbor record in sdata + cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_CFLOAT*)); //might be moved to a neighbor record in sdata cudaMemcpyToSymbol(MY_AP(maxhold) , & sdata->atom.maxhold, sizeof(int)); //might be moved to a neighbor record in sdata cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata - cudaMemcpyToSymbol(MY_AP(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT)); //might be moved to a neighbor record in sdata + cudaMemcpyToSymbol(MY_AP(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_CFLOAT)); //might be moved to a neighbor record in sdata } 
void Cuda_FixNHCuda_UpdateBuffer(cuda_shared_data* sdata) @@ -67,12 +67,12 @@ void Cuda_FixNHCuda_UpdateBuffer(cuda_shared_data* sdata) cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata } -void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf) +void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf) { - cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(dtf) , & dtf , sizeof(V_FLOAT)); - cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(dtf) , & dtf , sizeof(V_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_CFLOAT)); cudaMemcpyToSymbol(MY_AP(dist_check), & sdata->atom.dist_check , sizeof(int)); cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag , sizeof(int)); // Cuda_FixNHCuda_UpdateNmax(sdata); @@ -97,8 +97,8 @@ void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* fa if(sdata->buffer_new) Cuda_FixNHCuda_UpdateBuffer(sdata); - F_FLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]}; - F_FLOAT3 factor2; + F_CFLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]}; + F_CFLOAT3 factor2; if(p_triclinic) { factor2.x = factor_h[3], factor2.y = factor_h[4]; @@ -125,8 +125,8 @@ void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int gro if(sdata->buffer_new) Cuda_FixNHCuda_UpdateBuffer(sdata); - F_FLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]}; - F_FLOAT3 factor2; + F_CFLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]}; + F_CFLOAT3 factor2; if(p_triclinic) { factor2.x = factor_h[3], factor2.y = factor_h[4]; @@ -143,7 +143,7 @@ void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int gro } -void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp +void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_CFLOAT factor_eta, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp { my_times atime1, atime2; my_gettime(CLOCK_REALTIME, &atime1); @@ -237,8 +237,8 @@ void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata, int gro if(sdata->buffer_new) Cuda_FixNHCuda_UpdateBuffer(sdata); - F_FLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]}; - F_FLOAT3 factor2; + F_CFLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]}; + F_CFLOAT3 factor2; if(p_triclinic) { factor2.x = factor_h[3], factor2.y = factor_h[4]; diff --git a/lib/cuda/fix_nh_cuda_cu.h b/lib/cuda/fix_nh_cuda_cu.h index ba6203cfd0..a358502882 100644 --- a/lib/cuda/fix_nh_cuda_cu.h +++ b/lib/cuda/fix_nh_cuda_cu.h @@ -23,9 +23,9 @@ #include "cuda_shared.h" -extern "C" void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf); +extern "C" void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf); extern "C" void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic); //mynlocal can be nfirst if firstgroup==igroup see cpp -extern "C" void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta, int mynlocal); 
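/* When reading these F_CFLOAT/V_CFLOAT/X_CFLOAT signatures, recall the
   scheme from cuda_precision.h: each *_CFLOAT typedef resolves to float
   or double according to its *_PRECISION setting, and the matching
   *_F(x) macro appends the f suffix in single precision so that literals
   do not silently promote the arithmetic to double. A hypothetical
   standalone kernel, not part of this patch, to illustrate: */

__global__ void demo_scale(V_CFLOAT* v, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    v[i] *= V_F(0.5); // expands to 0.5f if V_PRECISION == 1, 0.5 if == 2
}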
//mynlocal can be nfirst if firstgroup==igroup see cpp +extern "C" void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_CFLOAT factor_eta, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp extern "C" void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic); //mynlocal can be nfirst if firstgroup==igroup see cpp extern "C" void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp extern "C" void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp diff --git a/lib/cuda/fix_nh_cuda_kernel.cu b/lib/cuda/fix_nh_cuda_kernel.cu index 8e14fa7d87..dcd7426097 100644 --- a/lib/cuda/fix_nh_cuda_kernel.cu +++ b/lib/cuda/fix_nh_cuda_kernel.cu @@ -21,14 +21,14 @@ This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ -static inline __device__ void check_distance(X_FLOAT &xtmp, X_FLOAT &ytmp, X_FLOAT &ztmp, int &i, int groupbit) +static inline __device__ void check_distance(X_CFLOAT &xtmp, X_CFLOAT &ytmp, X_CFLOAT &ztmp, int &i, int groupbit) { if(_dist_check) { - X_FLOAT d = X_F(0.0); + X_CFLOAT d = X_F(0.0); if(i < _nlocal) { - X_FLOAT tmp = xtmp - _xhold[i]; + X_CFLOAT tmp = xtmp - _xhold[i]; d = tmp * tmp; tmp = ytmp - _xhold[i + _maxhold]; d += tmp * tmp; @@ -43,15 +43,15 @@ static inline __device__ void check_distance(X_FLOAT &xtmp, X_FLOAT &ytmp, X_FLO } } -__global__ void FixNHCuda_nh_v_press_Kernel(int groupbit, F_FLOAT3 factor, int p_triclinic, F_FLOAT3 factor2) +__global__ void FixNHCuda_nh_v_press_Kernel(int groupbit, F_CFLOAT3 factor, int p_triclinic, F_CFLOAT3 factor2) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(i < _nlocal && _mask[i] & groupbit) { - V_FLOAT* my_v = _v + i; - V_FLOAT vx = my_v[0]; - V_FLOAT vy = my_v[_nmax]; - V_FLOAT vz = my_v[2 * _nmax]; + V_CFLOAT* my_v = _v + i; + V_CFLOAT vx = my_v[0]; + V_CFLOAT vy = my_v[_nmax]; + V_CFLOAT vz = my_v[2 * _nmax]; vx *= factor.x; vy *= factor.y; vz *= factor.z; @@ -71,12 +71,12 @@ __global__ void FixNHCuda_nh_v_press_Kernel(int groupbit, F_FLOAT3 factor, int p } -__global__ void FixNHCuda_nh_v_temp_Kernel(int groupbit, F_FLOAT factor_eta) +__global__ void FixNHCuda_nh_v_temp_Kernel(int groupbit, F_CFLOAT factor_eta) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(i < _nlocal && _mask[i] & groupbit) { - V_FLOAT* my_v = _v + i; + V_CFLOAT* my_v = _v + i; my_v[0] *= factor_eta; my_v[_nmax] *= factor_eta; my_v[2 * _nmax] *= factor_eta; @@ -84,22 +84,22 @@ __global__ void FixNHCuda_nh_v_temp_Kernel(int groupbit, F_FLOAT factor_eta) } -__global__ void FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel(int groupbit, F_FLOAT3 factor, int p_triclinic, F_FLOAT3 factor2) +__global__ void FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel(int groupbit, F_CFLOAT3 factor, int p_triclinic, F_CFLOAT3 factor2) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(i < _nlocal && _mask[i] & groupbit) { - F_FLOAT* my_f = _f + i; - V_FLOAT* my_v = _v + i; + F_CFLOAT* my_f = _f + i; + V_CFLOAT* my_v = _v + i; - V_FLOAT dtfm = _dtf; + V_CFLOAT dtfm = _dtf; if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i]; else dtfm *= V_F(1.0) / _mass[_type[i]]; - V_FLOAT vx = my_v[0]; - V_FLOAT vy = my_v[_nmax]; - V_FLOAT vz = my_v[2 * _nmax]; + 
V_CFLOAT vx = my_v[0]; + V_CFLOAT vy = my_v[_nmax]; + V_CFLOAT vz = my_v[2 * _nmax]; vx *= factor.x; vy *= factor.y; vz *= factor.z; @@ -125,10 +125,10 @@ __global__ void FixNHCuda_nve_v_Kernel(int groupbit) int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(i < _nlocal && _mask[i] & groupbit) { - F_FLOAT* my_f = _f + i; - V_FLOAT* my_v = _v + i; + F_CFLOAT* my_f = _f + i; + V_CFLOAT* my_v = _v + i; - V_FLOAT dtfm = _dtf; + V_CFLOAT dtfm = _dtf; if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i]; else dtfm *= V_F(1.0) / _mass[_type[i]]; @@ -145,13 +145,13 @@ __global__ void FixNHCuda_nve_v_Kernel(int groupbit) __global__ void FixNHCuda_nve_x_Kernel(int groupbit) { - X_FLOAT xtmp, ytmp, ztmp; + X_CFLOAT xtmp, ytmp, ztmp; int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(i < _nlocal && _mask[i] & groupbit) { - V_FLOAT* my_v = _v + i; - X_FLOAT* my_x = _x + i; + V_CFLOAT* my_v = _v + i; + X_CFLOAT* my_x = _x + i; xtmp = *my_x += _dtv * *my_v; my_v += _nmax; @@ -166,23 +166,23 @@ __global__ void FixNHCuda_nve_x_Kernel(int groupbit) } -__global__ void FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel(int groupbit, F_FLOAT3 factor, int p_triclinic, F_FLOAT3 factor2) +__global__ void FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel(int groupbit, F_CFLOAT3 factor, int p_triclinic, F_CFLOAT3 factor2) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(i < _nlocal && _mask[i] & groupbit) { - F_FLOAT* my_f = _f + i; - V_FLOAT* my_v = _v + i; + F_CFLOAT* my_f = _f + i; + V_CFLOAT* my_v = _v + i; - V_FLOAT dtfm = _dtf; + V_CFLOAT dtfm = _dtf; if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i]; else dtfm *= V_F(1.0) / _mass[_type[i]]; - V_FLOAT vx = my_v[0] + dtfm * my_f[0]; - V_FLOAT vy = my_v[_nmax] + dtfm * my_f[_nmax]; - V_FLOAT vz = my_v[2 * _nmax] + dtfm * my_f[2 * _nmax]; + V_CFLOAT vx = my_v[0] + dtfm * my_f[0]; + V_CFLOAT vy = my_v[_nmax] + dtfm * my_f[_nmax]; + V_CFLOAT vz = my_v[2 * _nmax] + dtfm * my_f[2 * _nmax]; vx *= factor.x; vy *= factor.y; diff --git a/lib/cuda/fix_nve_cuda.cu b/lib/cuda/fix_nve_cuda.cu index 8acddcd6f1..0bbef6bb9e 100644 --- a/lib/cuda/fix_nve_cuda.cu +++ b/lib/cuda/fix_nve_cuda.cu @@ -32,19 +32,19 @@ void Cuda_FixNVECuda_UpdateNmax(cuda_shared_data* sdata) { - cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_FLOAT*)); //might be moved to a neighbor record in sdata + cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(xhold) , 
& sdata->atom.xhold.dev_data, sizeof(X_CFLOAT*)); //might be moved to a neighbor record in sdata cudaMemcpyToSymbol(MY_AP(maxhold) , & sdata->atom.maxhold, sizeof(int)); //might be moved to a neighbor record in sdata cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata - cudaMemcpyToSymbol(MY_AP(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT)); //might be moved to a neighbor record in sdata + cudaMemcpyToSymbol(MY_AP(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_CFLOAT)); //might be moved to a neighbor record in sdata } void Cuda_FixNVECuda_UpdateBuffer(cuda_shared_data* sdata) @@ -65,12 +65,12 @@ void Cuda_FixNVECuda_UpdateBuffer(cuda_shared_data* sdata) cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata } -void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf) +void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf) { - cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(dtf) , & dtf , sizeof(V_FLOAT)); - cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(dtf) , & dtf , sizeof(V_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_CFLOAT)); cudaMemcpyToSymbol(MY_AP(dist_check), & sdata->atom.dist_check , sizeof(int)); cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag , sizeof(int)); // Cuda_FixNVECuda_UpdateNmax(sdata); diff --git a/lib/cuda/fix_nve_cuda_cu.h b/lib/cuda/fix_nve_cuda_cu.h index 90b393c9ec..a35d3eb15e 100644 --- a/lib/cuda/fix_nve_cuda_cu.h +++ b/lib/cuda/fix_nve_cuda_cu.h @@ -23,6 +23,6 @@ #include "cuda_shared.h" -extern "C" void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf); +extern "C" void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf); extern "C" void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal); extern "C" void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal); diff --git a/lib/cuda/fix_nve_cuda_kernel.cu b/lib/cuda/fix_nve_cuda_kernel.cu index c99439adb4..d1c0efcd71 100644 --- a/lib/cuda/fix_nve_cuda_kernel.cu +++ b/lib/cuda/fix_nve_cuda_kernel.cu @@ -21,11 +21,11 @@ This software is distributed under the GNU General Public License. 
------------------------------------------------------------------------- */ -static inline __device__ void check_distance(X_FLOAT &xtmp, X_FLOAT &ytmp, X_FLOAT &ztmp, int &i, int groupbit) +static inline __device__ void check_distance(X_CFLOAT &xtmp, X_CFLOAT &ytmp, X_CFLOAT &ztmp, int &i, int groupbit) { if(_dist_check) { - X_FLOAT tmp = xtmp - _xhold[i]; - X_FLOAT d = tmp * tmp; + X_CFLOAT tmp = xtmp - _xhold[i]; + X_CFLOAT d = tmp * tmp; tmp = ytmp - _xhold[i + _maxhold]; d += tmp * tmp; tmp = ztmp - _xhold[i + 2 * _maxhold]; @@ -41,7 +41,7 @@ static inline __device__ void check_distance(X_FLOAT &xtmp, X_FLOAT &ytmp, X_FLO __global__ void FixNVECuda_InitialIntegrate_Kernel(int groupbit) { - X_FLOAT xtmp, ytmp, ztmp; + X_CFLOAT xtmp, ytmp, ztmp; #ifdef CUDA_USE_BINNING const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y; @@ -50,16 +50,16 @@ __global__ void FixNVECuda_InitialIntegrate_Kernel(int groupbit) const int i = 3 * blockDim.x * bin + threadIdx.x; if(_mask[i] & groupbit) { - F_FLOAT* my_f = _binned_f + i; - V_FLOAT* my_v = _binned_v + i; - X_FLOAT* my_x = _binned_x + i; + F_CFLOAT* my_f = _binned_f + i; + V_CFLOAT* my_v = _binned_v + i; + X_CFLOAT* my_x = _binned_x + i; - V_FLOAT dtfm = _dtf + V_CFLOAT dtfm = _dtf; if(_rmass_flag) dtfm *= V_F(1.0) / _binned_rmass[i]; else dtfm *= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]]; - V_FLOAT v_mem; + V_CFLOAT v_mem; v_mem = *my_v += dtfm * (*my_f); xtmp = *my_x += _dtv * v_mem; my_f += blockDim.x; @@ -80,16 +80,16 @@ __global__ void FixNVECuda_InitialIntegrate_Kernel(int groupbit) int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(i < _nlocal && _mask[i] & groupbit) { - F_FLOAT* my_f = _f + i; - V_FLOAT* my_v = _v + i; - X_FLOAT* my_x = _x + i; + F_CFLOAT* my_f = _f + i; + V_CFLOAT* my_v = _v + i; + X_CFLOAT* my_x = _x + i; - V_FLOAT dtfm = _dtf; + V_CFLOAT dtfm = _dtf; if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i]; else dtfm *= V_F(1.0) / _mass[_type[i]]; - V_FLOAT v_mem; + V_CFLOAT v_mem; v_mem = *my_v += dtfm * (*my_f); xtmp = *my_x += _dtv * v_mem; my_f += _nmax; @@ -119,10 +119,10 @@ __global__ void FixNVECuda_FinalIntegrate_Kernel(int groupbit) const int i = 3 * blockDim.x * bin + threadIdx.x; if(_mask[i] & groupbit) { - F_FLOAT* my_f = _binned_f + i; - V_FLOAT* my_v = _binned_v + i; + F_CFLOAT* my_f = _binned_f + i; + V_CFLOAT* my_v = _binned_v + i; - V_FLOAT dtfm = _dtf + V_CFLOAT dtfm = _dtf; if(_rmass_flag) dtfm *= V_F(1.0) / _binned_rmass[i]; else dtfm *= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]]; @@ -142,10 +142,10 @@ __global__ void FixNVECuda_FinalIntegrate_Kernel(int groupbit) int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(i < _nlocal && _mask[i] & groupbit) { - F_FLOAT* my_f = _f + i; - V_FLOAT* my_v = _v + i; + F_CFLOAT* my_f = _f + i; + V_CFLOAT* my_v = _v + i; - V_FLOAT dtfm = _dtf; + V_CFLOAT dtfm = _dtf; if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i]; else dtfm *= V_F(1.0) / _mass[_type[i]]; diff --git a/lib/cuda/fix_set_force_cuda.cu b/lib/cuda/fix_set_force_cuda.cu index afa1a4789c..6e8a4e9107 100644 --- a/lib/cuda/fix_set_force_cuda.cu +++ b/lib/cuda/fix_set_force_cuda.cu @@ -32,10 +32,10 @@ void Cuda_FixSetForceCuda_UpdateBuffer(cuda_shared_data* sdata) { - int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT)); + int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT)); dim3 threads(layout.z, 1, 1); dim3 grid(layout.x, layout.y, 1); - int size = (unsigned)(layout.z * layout.y * layout.x) * 3 *
sizeof(F_FLOAT); + int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_CFLOAT); if(sdata->buffersize < size) { MYDBG(printf("Cuda_FixSetForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) @@ -55,8 +55,8 @@ void Cuda_FixSetForceCuda_UpdateNmax(cuda_shared_data* sdata) cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*)); } void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata) @@ -66,7 +66,7 @@ void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata) } -void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue, F_FLOAT* foriginal, int flagx, int flagy, int flagz) +void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue, F_CFLOAT* foriginal, int flagx, int flagy, int flagz) { if(sdata->atom.update_nmax) Cuda_FixSetForceCuda_UpdateNmax(sdata); @@ -78,18 +78,18 @@ void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLO Cuda_FixSetForceCuda_UpdateBuffer(sdata); - int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT)); + int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT)); dim3 threads(layout.z, 1, 1); dim3 grid(layout.x, layout.y, 1); - Cuda_FixSetForceCuda_PostForce_Kernel <<< grid, threads, threads.x* 3* sizeof(F_FLOAT)>>> (groupbit, xvalue, yvalue, zvalue, flagx, flagy, flagz); + Cuda_FixSetForceCuda_PostForce_Kernel <<< grid, threads, threads.x* 3* sizeof(F_CFLOAT)>>> (groupbit, xvalue, yvalue, zvalue, flagx, flagy, flagz); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_Cuda_FixSetForceCuda_PostForce: fix add_force post_force compute Kernel execution failed"); int oldgrid = grid.x; grid.x = 3; threads.x = 512; - Cuda_FixSetForceCuda_Reduce_FOriginal <<< grid, threads, threads.x* sizeof(F_FLOAT)>>> (oldgrid, foriginal); + Cuda_FixSetForceCuda_Reduce_FOriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, foriginal); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_Cuda_FixSetForceCuda_PostForce: fix add_force post_force reduce Kernel execution failed"); diff --git a/lib/cuda/fix_set_force_cuda_cu.h b/lib/cuda/fix_set_force_cuda_cu.h index 63e528acce..0b8ec287e3 100644 --- a/lib/cuda/fix_set_force_cuda_cu.h +++ b/lib/cuda/fix_set_force_cuda_cu.h @@ -24,4 +24,4 @@ #include "cuda_shared.h" extern "C" void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata); -extern "C" void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue, F_FLOAT* foriginal, int flagx, int flagy, int flagz); +extern "C" void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue, F_CFLOAT* foriginal, int flagx, int flagy, int flagz); diff --git a/lib/cuda/fix_set_force_cuda_kernel.cu b/lib/cuda/fix_set_force_cuda_kernel.cu index ee1a590622..bb9ca84a9a 100644 --- a/lib/cuda/fix_set_force_cuda_kernel.cu +++ b/lib/cuda/fix_set_force_cuda_kernel.cu @@ -21,10 +21,10 @@ This software is distributed 
under the GNU General Public License. ------------------------------------------------------------------------- */ -extern __shared__ F_FLOAT sharedmem[]; +extern __shared__ F_CFLOAT sharedmem[]; -__global__ void Cuda_FixSetForceCuda_PostForce_Kernel(int groupbit, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue, int flagx, int flagy, int flagz) +__global__ void Cuda_FixSetForceCuda_PostForce_Kernel(int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue, int flagx, int flagy, int flagz) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; sharedmem[threadIdx.x] = 0; @@ -48,7 +48,7 @@ __global__ void Cuda_FixSetForceCuda_PostForce_Kernel(int groupbit, F_FLOAT xval reduceBlock(sharedmem); reduceBlock(&sharedmem[blockDim.x]); reduceBlock(&sharedmem[2 * blockDim.x]); - F_FLOAT* buffer = (F_FLOAT*)_buffer; + F_CFLOAT* buffer = (F_CFLOAT*)_buffer; if(threadIdx.x == 0) { buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0]; @@ -58,12 +58,12 @@ __global__ void Cuda_FixSetForceCuda_PostForce_Kernel(int groupbit, F_FLOAT xval } -__global__ void Cuda_FixSetForceCuda_Reduce_FOriginal(int n, F_FLOAT* foriginal) +__global__ void Cuda_FixSetForceCuda_Reduce_FOriginal(int n, F_CFLOAT* foriginal) { int i = 0; sharedmem[threadIdx.x] = 0; - F_FLOAT myforig = 0.0; - F_FLOAT* buf = (F_FLOAT*)_buffer; + F_CFLOAT myforig = 0.0; + F_CFLOAT* buf = (F_CFLOAT*)_buffer; buf = &buf[blockIdx.x * n]; while(i < n) { diff --git a/lib/cuda/fix_shake_cuda.cu b/lib/cuda/fix_shake_cuda.cu index e27f54e968..5dc296db98 100644 --- a/lib/cuda/fix_shake_cuda.cu +++ b/lib/cuda/fix_shake_cuda.cu @@ -41,37 +41,37 @@ __device__ __constant__ int* _shake_atom; __device__ __constant__ int* _shake_type; __device__ __constant__ int* _shake_flag; -__device__ __constant__ X_FLOAT3* _xshake; -__device__ __constant__ F_FLOAT _dtfsq; -__device__ __constant__ X_FLOAT* _bond_distance; -__device__ __constant__ X_FLOAT* _angle_distance; +__device__ __constant__ X_CFLOAT3* _xshake; +__device__ __constant__ F_CFLOAT _dtfsq; +__device__ __constant__ X_CFLOAT* _bond_distance; +__device__ __constant__ X_CFLOAT* _angle_distance; __device__ __constant__ int _max_iter; -__device__ __constant__ X_FLOAT _tolerance; +__device__ __constant__ X_CFLOAT _tolerance; #include "fix_shake_cuda_kernel.cu" void Cuda_FixShakeCuda_UpdateNmax(cuda_shared_data* sdata) { - cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(map_array), & sdata->atom.map_array .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom.dev_data, sizeof(ENERGY_FLOAT*)); 
+ cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom.dev_data, sizeof(ENERGY_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(debugdata), & sdata->debugdata , sizeof(int*)); } void Cuda_FixShakeCuda_UpdateDomain(cuda_shared_data* sdata) { cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity , sizeof(int) * 3); - cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd , sizeof(X_FLOAT) * 3); + cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd , sizeof(X_CFLOAT) * 3); cudaMemcpyToSymbol(MY_AP(triclinic) , &sdata->domain.triclinic , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(h) , sdata->domain.h , sizeof(X_FLOAT) * 6); + cudaMemcpyToSymbol(MY_AP(h) , sdata->domain.h , sizeof(X_CFLOAT) * 6); } void Cuda_FixShakeCuda_UpdateBuffer(cuda_shared_data* sdata, int size) @@ -89,10 +89,10 @@ void Cuda_FixShakeCuda_UpdateBuffer(cuda_shared_data* sdata, int size) cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*)); } -void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, F_FLOAT dtfsq, +void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, F_CFLOAT dtfsq, void* shake_flag, void* shake_atom, void* shake_type, void* xshake, void* bond_distance, void* angle_distance, void* virial, - int max_iter, X_FLOAT tolerance) + int max_iter, X_CFLOAT tolerance) { Cuda_FixShakeCuda_UpdateNmax(sdata); Cuda_FixShakeCuda_UpdateDomain(sdata); @@ -100,17 +100,17 @@ void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, F_FLOAT dtfsq, cudaMemcpyToSymbol(MY_AP(shake_type) , & shake_type , sizeof(void*)); cudaMemcpyToSymbol(MY_AP(shake_flag) , & shake_flag , sizeof(void*)); cudaMemcpyToSymbol(MY_AP(xshake) , & xshake , sizeof(void*)); - cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_AP(dtfsq) , & dtfsq , sizeof(F_FLOAT)); + cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(dtfsq) , & dtfsq , sizeof(F_CFLOAT)); cudaMemcpyToSymbol(MY_AP(bond_distance) , & bond_distance , sizeof(void*)); cudaMemcpyToSymbol(MY_AP(angle_distance) , & angle_distance , sizeof(void*)); cudaMemcpyToSymbol(MY_AP(virial) , & virial , sizeof(void*)); cudaMemcpyToSymbol(MY_AP(flag) , &sdata->flag , sizeof(int*)); cudaMemcpyToSymbol(MY_AP(max_iter) , &max_iter , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(tolerance) , &tolerance , sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(tolerance) , &tolerance , sizeof(X_CFLOAT)); if(sdata->atom.mass_host) - cudaMemcpyToSymbol(MY_AP(mass), & sdata->atom.mass.dev_data , sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(mass), & sdata->atom.mass.dev_data , sizeof(V_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag , sizeof(int)); // @@ -149,16 +149,16 @@ void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata, int vflag, int vflag_atom, if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - int3 layout = getgrid(sdata->atom.nlocal, 6 * sizeof(ENERGY_FLOAT), 64); + int3 layout = getgrid(sdata->atom.nlocal, 6 * sizeof(ENERGY_CFLOAT), 64); dim3 threads(layout.z, 1, 1); dim3 grid(layout.x, layout.y, 1); if(sdata->buffer_new) - Cuda_FixShakeCuda_UpdateBuffer(sdata, grid.x * grid.y * 6 * sizeof(ENERGY_FLOAT)); + Cuda_FixShakeCuda_UpdateBuffer(sdata, grid.x * grid.y * 6 * sizeof(ENERGY_CFLOAT)); BindXTypeTexture(sdata); - FixShakeCuda_Shake_Kernel <<< grid, threads, 6* threads.x* sizeof(ENERGY_FLOAT)>>> (vflag, vflag_atom, list, nlist); + FixShakeCuda_Shake_Kernel <<< grid, threads, 6* threads.x* sizeof(ENERGY_CFLOAT)>>> (vflag, vflag_atom, list, 
nlist); cudaThreadSynchronize(); CUT_CHECK_ERROR("FixShakeCuda_Shake: Kernel execution failed"); @@ -168,7 +168,7 @@ void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata, int vflag, int vflag_atom, grid.x = 6; grid.y = 1; threads.x = 256; - MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n); + MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>>(n); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_FixShakeCuda: (no binning) virial compute Kernel execution failed"); } @@ -183,14 +183,14 @@ int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - int size = n * 3 * sizeof(X_FLOAT); + int size = n * 3 * sizeof(X_CFLOAT); if(sdata->buffer_new or (size > sdata->buffersize)) Cuda_FixShakeCuda_UpdateBuffer(sdata, size); - X_FLOAT dx = 0.0; - X_FLOAT dy = 0.0; - X_FLOAT dz = 0.0; + X_CFLOAT dx = 0.0; + X_CFLOAT dy = 0.0; + X_CFLOAT dz = 0.0; if(pbc_flag != 0) { if(sdata->domain.triclinic == 0) { @@ -212,7 +212,7 @@ int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* cudaMemset(sdata->flag, 0, sizeof(int)); FixShakeCuda_PackComm_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz); cudaThreadSynchronize(); - cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost); int aflag; cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); @@ -232,16 +232,16 @@ int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, i if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - int size = n * 3 * sizeof(X_FLOAT); + int size = n * 3 * sizeof(X_CFLOAT); if(sdata->buffer_new or (size > sdata->buffersize)) Cuda_FixShakeCuda_UpdateBuffer(sdata, size); static int count = -1; count++; - X_FLOAT dx = 0.0; - X_FLOAT dy = 0.0; - X_FLOAT dz = 0.0; + X_CFLOAT dx = 0.0; + X_CFLOAT dy = 0.0; + X_CFLOAT dz = 0.0; if(pbc_flag != 0) { if(sdata->domain.triclinic == 0) { @@ -278,7 +278,7 @@ void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, voi if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - int size = n * 3 * sizeof(X_FLOAT); + int size = n * 3 * sizeof(X_CFLOAT); if(sdata->buffer_new or (size > sdata->buffersize)) Cuda_FixShakeCuda_UpdateBuffer(sdata, size); @@ -288,7 +288,7 @@ void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, voi dim3 grid(layout.x, layout.y, 1); if(sdata->atom.nlocal > 0) { - cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_FLOAT), cudaMemcpyHostToDevice); + cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_CFLOAT), cudaMemcpyHostToDevice); FixShakeCuda_UnpackComm_Kernel <<< grid, threads, 0>>>(n, first); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_FixShakeCuda_UnpackComm: Kernel execution failed"); diff --git a/lib/cuda/fix_shake_cuda_cu.h b/lib/cuda/fix_shake_cuda_cu.h index 9b808a7216..227dd2c00f 100644 --- a/lib/cuda/fix_shake_cuda_cu.h +++ b/lib/cuda/fix_shake_cuda_cu.h @@ -22,10 +22,10 @@ ------------------------------------------------------------------------- */ #include "cuda_shared.h" -extern "C" void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, F_FLOAT dtfsq, +extern "C" 
void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, F_CFLOAT dtfsq, void* shake_flag, void* shake_atom, void* shake_type, void* xshake, void* bond_distance, void* angle_distance, void* virial, - int max_iter, X_FLOAT tolerance); + int max_iter, X_CFLOAT tolerance); extern "C" void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata); extern "C" void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata, int vflag, int vflag_atom, int* list, int nlist); extern "C" int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag); diff --git a/lib/cuda/fix_shake_cuda_kernel.cu b/lib/cuda/fix_shake_cuda_kernel.cu index da176d0770..2c22d5ca85 100644 --- a/lib/cuda/fix_shake_cuda_kernel.cu +++ b/lib/cuda/fix_shake_cuda_kernel.cu @@ -21,12 +21,12 @@ This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ -__device__ void v_tally(int &vflag_global, int &vflag_atom, int &n, int* list, ENERGY_FLOAT total, ENERGY_FLOAT* v) +__device__ void v_tally(int &vflag_global, int &vflag_atom, int &n, int* list, ENERGY_CFLOAT total, ENERGY_CFLOAT* v) { /*if(vflag_global) { - ENERGY_FLOAT fraction = n/total; - ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; + ENERGY_CFLOAT fraction = n/total; + ENERGY_CFLOAT* shared = &sharedmem[threadIdx.x]; *shared += fraction*v[0]; shared+=blockDim.x; *shared += fraction*v[1]; shared+=blockDim.x; *shared += fraction*v[2]; shared+=blockDim.x; @@ -35,11 +35,11 @@ __device__ void v_tally(int &vflag_global, int &vflag_atom, int &n, int* list, E *shared += fraction*v[5]; }*/ if(vflag_atom) { - ENERGY_FLOAT fraction = ENERGY_F(1.0) / total; + ENERGY_CFLOAT fraction = ENERGY_F(1.0) / total; for(int i = 0; i < n; i++) { int m = list[i]; - ENERGY_FLOAT* myvatom = &_vatom[m]; + ENERGY_CFLOAT* myvatom = &_vatom[m]; *myvatom += fraction * v[0]; myvatom += _nmax; @@ -56,7 +56,7 @@ __device__ void v_tally(int &vflag_global, int &vflag_atom, int &n, int* list, E } } -inline __device__ void minimum_image(X_FLOAT3 &delta) +inline __device__ void minimum_image(X_CFLOAT3 &delta) { if(_triclinic == 0) { if(_periodicity[0]) { @@ -106,14 +106,14 @@ __global__ void FixShakeCuda_UnconstrainedUpdate_Kernel() if(i >= _nlocal) return; - X_FLOAT3 my_xshake = {X_F(0.0), X_F(0.0), X_F(0.0)}; + X_CFLOAT3 my_xshake = {X_F(0.0), X_F(0.0), X_F(0.0)}; if(_shake_flag[i]) { - F_FLOAT* my_f = _f + i; - V_FLOAT* my_v = _v + i; - X_FLOAT* my_x = _x + i; + F_CFLOAT* my_f = _f + i; + V_CFLOAT* my_v = _v + i; + X_CFLOAT* my_x = _x + i; - V_FLOAT dtfmsq = _dtfsq; + V_CFLOAT dtfmsq = _dtfsq; if(_rmass_flag) dtfmsq *= V_F(1.0) / _rmass[i]; else dtfmsq *= V_F(1.0) / _mass[_type[i]]; @@ -138,20 +138,20 @@ __global__ void FixShakeCuda_UnconstrainedUpdate_Kernel() __device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m) { int nlist, list[2]; - ENERGY_FLOAT v[6]; - X_FLOAT invmass0, invmass1; + ENERGY_CFLOAT v[6]; + X_CFLOAT invmass0, invmass1; // local atom IDs and constraint distances int i0 = _map_array[_shake_atom[m]]; int i1 = _map_array[_shake_atom[m + _nmax]]; - X_FLOAT bond1 = _bond_distance[_shake_type[m]]; + X_CFLOAT bond1 = _bond_distance[_shake_type[m]]; // r01 = distance vec between atoms, with PBC - X_FLOAT3 r01; + X_CFLOAT3 r01; - X_FLOAT4 x_i0, x_i1; + X_CFLOAT4 x_i0, x_i1; x_i0 = fetchXType(i0); x_i1 = fetchXType(i1); @@ -162,9 +162,9 @@ __device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m) // s01 = distance vec after 
unconstrained update, with PBC - X_FLOAT3 s01; - X_FLOAT3 xs_i0 = _xshake[i0]; - X_FLOAT3 xs_i1 = _xshake[i1]; + X_CFLOAT3 s01; + X_CFLOAT3 xs_i0 = _xshake[i0]; + X_CFLOAT3 xs_i1 = _xshake[i1]; s01.x = xs_i0.x - xs_i1.x; s01.y = xs_i0.y - xs_i1.y; @@ -173,8 +173,8 @@ __device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m) // scalar distances between atoms - X_FLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z; - X_FLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z; + X_CFLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z; + X_CFLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z; // a,b,c = coeffs in quadratic equation for lamda @@ -186,14 +186,14 @@ __device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m) invmass1 = X_F(1.0) / _mass[static_cast (x_i1.w)]; } - X_FLOAT a = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq; - X_FLOAT b = X_F(2.0) * (invmass0 + invmass1) * + X_CFLOAT a = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq; + X_CFLOAT b = X_F(2.0) * (invmass0 + invmass1) * (s01.x * r01.x + s01.y * r01.y + s01.z * r01.z); - X_FLOAT c = s01sq - bond1 * bond1; + X_CFLOAT c = s01sq - bond1 * bond1; // error check - X_FLOAT determ = b * b - X_F(4.0) * a * c; + X_CFLOAT determ = b * b - X_F(4.0) * a * c; if(determ < X_F(0.0)) { _flag[0]++; @@ -202,7 +202,7 @@ __device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m) // exact quadratic solution for lamda - X_FLOAT lamda, lamda1, lamda2; + X_CFLOAT lamda, lamda1, lamda2; lamda1 = -b + _SQRT_(determ); lamda2 = -lamda1 - X_F(2.0) * b; lamda1 *= X_F(1.0) / (X_F(2.0) * a); @@ -233,8 +233,8 @@ __device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m) } if(vflag || vflag_atom) { - ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; - X_FLOAT factor = nlist; + ENERGY_CFLOAT* shared = &sharedmem[threadIdx.x]; + X_CFLOAT factor = nlist; v[0] = lamda * r01.x * r01.x; *shared = factor * v[0]; shared += blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5 @@ -262,22 +262,22 @@ __device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m) __device__ void FixShakeCuda_Shake3(int &vflag, int &vflag_atom, int &m) { int nlist, list[3]; - ENERGY_FLOAT v[6]; - X_FLOAT invmass0, invmass1, invmass2; + ENERGY_CFLOAT v[6]; + X_CFLOAT invmass0, invmass1, invmass2; // local atom IDs and constraint distances int i0 = _map_array[_shake_atom[m]]; int i1 = _map_array[_shake_atom[m + _nmax]]; int i2 = _map_array[_shake_atom[m + 2 * _nmax]]; - X_FLOAT bond1 = _bond_distance[_shake_type[m]]; - X_FLOAT bond2 = _bond_distance[_shake_type[m + _nmax]]; + X_CFLOAT bond1 = _bond_distance[_shake_type[m]]; + X_CFLOAT bond2 = _bond_distance[_shake_type[m + _nmax]]; // r01 = distance vec between atoms, with PBC - X_FLOAT3 r01, r02; + X_CFLOAT3 r01, r02; - X_FLOAT4 x_i0, x_i1, x_i2; + X_CFLOAT4 x_i0, x_i1, x_i2; x_i0 = fetchXType(i0); x_i1 = fetchXType(i1); x_i2 = fetchXType(i2); @@ -294,10 +294,10 @@ __device__ void FixShakeCuda_Shake3(int &vflag, int &vflag_atom, int &m) // s01 = distance vec after unconstrained update, with PBC - X_FLOAT3 s01, s02; - X_FLOAT3 xs_i0 = _xshake[i0]; - X_FLOAT3 xs_i1 = _xshake[i1]; - X_FLOAT3 xs_i2 = _xshake[i2]; + X_CFLOAT3 s01, s02; + X_CFLOAT3 xs_i0 = _xshake[i0]; + X_CFLOAT3 xs_i1 = _xshake[i1]; + X_CFLOAT3 xs_i2 = _xshake[i2]; s01.x = xs_i0.x - xs_i1.x; s01.y = xs_i0.y - xs_i1.y; @@ -311,10 +311,10 @@ __device__ void FixShakeCuda_Shake3(int &vflag, int &vflag_atom, int &m) 
// scalar distances between atoms - X_FLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z; - X_FLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z; - X_FLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z; - X_FLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z; + X_CFLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z; + X_CFLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z; + X_CFLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z; + X_CFLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z; // a,b,c = coeffs in quadratic equation for lamda @@ -328,48 +328,48 @@ __device__ void FixShakeCuda_Shake3(int &vflag, int &vflag_atom, int &m) invmass2 = X_F(1.0) / _mass[static_cast (x_i2.w)]; } - X_FLOAT a11 = X_F(2.0) * (invmass0 + invmass1) * + X_CFLOAT a11 = X_F(2.0) * (invmass0 + invmass1) * (s01.x * r01.x + s01.y * r01.y + s01.z * r01.z); - X_FLOAT a12 = X_F(2.0) * invmass0 * + X_CFLOAT a12 = X_F(2.0) * invmass0 * (s01.x * r02.x + s01.y * r02.y + s01.z * r02.z); - X_FLOAT a21 = X_F(2.0) * invmass0 * + X_CFLOAT a21 = X_F(2.0) * invmass0 * (s02.x * r01.x + s02.y * r01.y + s02.z * r01.z); - X_FLOAT a22 = X_F(2.0) * (invmass0 + invmass2) * + X_CFLOAT a22 = X_F(2.0) * (invmass0 + invmass2) * (s02.x * r02.x + s02.y * r02.y + s02.z * r02.z); // error check - X_FLOAT determ = a11 * a22 - a12 * a21; + X_CFLOAT determ = a11 * a22 - a12 * a21; if(determ == X_F(0.0)) _flag[0]++; - X_FLOAT determinv = X_F(1.0) / determ; + X_CFLOAT determinv = X_F(1.0) / determ; - X_FLOAT a11inv = a22 * determinv; - X_FLOAT a12inv = -a12 * determinv; - X_FLOAT a21inv = -a21 * determinv; - X_FLOAT a22inv = a11 * determinv; + X_CFLOAT a11inv = a22 * determinv; + X_CFLOAT a12inv = -a12 * determinv; + X_CFLOAT a21inv = -a21 * determinv; + X_CFLOAT a22inv = a11 * determinv; // quadratic correction coeffs - X_FLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z); + X_CFLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z); - X_FLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq; - X_FLOAT quad1_0202 = invmass0 * invmass0 * r02sq; - X_FLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102; + X_CFLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq; + X_CFLOAT quad1_0202 = invmass0 * invmass0 * r02sq; + X_CFLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102; - X_FLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq; - X_FLOAT quad2_0101 = invmass0 * invmass0 * r01sq; - X_FLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102; + X_CFLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq; + X_CFLOAT quad2_0101 = invmass0 * invmass0 * r01sq; + X_CFLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102; // iterate until converged - X_FLOAT lamda01 = X_F(0.0); - X_FLOAT lamda02 = X_F(0.0); + X_CFLOAT lamda01 = X_F(0.0); + X_CFLOAT lamda02 = X_F(0.0); int niter = 0; int done = 0; - X_FLOAT quad1, quad2, b1, b2, lamda01_new, lamda02_new; + X_CFLOAT quad1, quad2, b1, b2, lamda01_new, lamda02_new; //maybe all running full loop? 
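// two coupled bond constraints: each pass of the loop below solves the
// linearized 2x2 system for (lamda01,lamda02) with the precomputed inverse
// a11inv..a22inv and folds the quad1_*/quad2_* second-order terms back into
// the right-hand side; a thread is done once both multipliers change by less
// than _tolerance, while __any() keeps the warp iterating until every
// constraint cluster has converged or _max_iter is reached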
while(__any(!done) && niter < _max_iter) { @@ -425,8 +425,8 @@ __device__ void FixShakeCuda_Shake3(int &vflag, int &vflag_atom, int &m) } if(vflag || vflag_atom) { - ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; - X_FLOAT factor = X_F(2.0) / X_F(3.0) * nlist; + ENERGY_CFLOAT* shared = &sharedmem[threadIdx.x]; + X_CFLOAT factor = X_F(2.0) / X_F(3.0) * nlist; v[0] = lamda01 * r01.x * r01.x + lamda02 * r02.x * r02.x; *shared = factor * v[0]; shared += blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5 @@ -453,8 +453,8 @@ __device__ void FixShakeCuda_Shake3(int &vflag, int &vflag_atom, int &m) __device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m) { int nlist, list[4]; - ENERGY_FLOAT v[6]; - X_FLOAT invmass0, invmass1, invmass2, invmass3; + ENERGY_CFLOAT v[6]; + X_CFLOAT invmass0, invmass1, invmass2, invmass3; // local atom IDs and constraint distances @@ -462,15 +462,15 @@ __device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m) int i1 = _map_array[_shake_atom[m + _nmax]]; int i2 = _map_array[_shake_atom[m + 2 * _nmax]]; int i3 = _map_array[_shake_atom[m + 3 * _nmax]]; - X_FLOAT bond1 = _bond_distance[_shake_type[m]]; - X_FLOAT bond2 = _bond_distance[_shake_type[m + _nmax]]; - X_FLOAT bond3 = _bond_distance[_shake_type[m + 2 * _nmax]]; + X_CFLOAT bond1 = _bond_distance[_shake_type[m]]; + X_CFLOAT bond2 = _bond_distance[_shake_type[m + _nmax]]; + X_CFLOAT bond3 = _bond_distance[_shake_type[m + 2 * _nmax]]; // r01 = distance vec between atoms, with PBC - X_FLOAT3 r01, r02, r03; + X_CFLOAT3 r01, r02, r03; - X_FLOAT4 x_i0, x_i1, x_i2, x_i3; + X_CFLOAT4 x_i0, x_i1, x_i2, x_i3; x_i0 = fetchXType(i0); x_i1 = fetchXType(i1); x_i2 = fetchXType(i2); @@ -493,11 +493,11 @@ __device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m) // s01 = distance vec after unconstrained update, with PBC - X_FLOAT3 s01, s02, s03; - X_FLOAT3 xs_i0 = _xshake[i0]; - X_FLOAT3 xs_i1 = _xshake[i1]; - X_FLOAT3 xs_i2 = _xshake[i2]; - X_FLOAT3 xs_i3 = _xshake[i3]; + X_CFLOAT3 s01, s02, s03; + X_CFLOAT3 xs_i0 = _xshake[i0]; + X_CFLOAT3 xs_i1 = _xshake[i1]; + X_CFLOAT3 xs_i2 = _xshake[i2]; + X_CFLOAT3 xs_i3 = _xshake[i3]; s01.x = xs_i0.x - xs_i1.x; s01.y = xs_i0.y - xs_i1.y; @@ -516,12 +516,12 @@ __device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m) // scalar distances between atoms - X_FLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z; - X_FLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z; - X_FLOAT r03sq = r03.x * r03.x + r03.y * r03.y + r03.z * r03.z; - X_FLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z; - X_FLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z; - X_FLOAT s03sq = s03.x * s03.x + s03.y * s03.y + s03.z * s03.z; + X_CFLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z; + X_CFLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z; + X_CFLOAT r03sq = r03.x * r03.x + r03.y * r03.y + r03.z * r03.z; + X_CFLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z; + X_CFLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z; + X_CFLOAT s03sq = s03.x * s03.x + s03.y * s03.y + s03.z * s03.z; // a,b,c = coeffs in quadratic equation for lamda @@ -537,79 +537,79 @@ __device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m) invmass3 = X_F(1.0) / _mass[static_cast (x_i3.w)]; } - X_FLOAT a11 = X_F(2.0) * (invmass0 + invmass1) * + X_CFLOAT a11 = X_F(2.0) * (invmass0 + invmass1) * (s01.x * r01.x + s01.y * r01.y + s01.z * r01.z); - 
X_FLOAT a12 = X_F(2.0) * invmass0 * + X_CFLOAT a12 = X_F(2.0) * invmass0 * (s01.x * r02.x + s01.y * r02.y + s01.z * r02.z); - X_FLOAT a13 = X_F(2.0) * invmass0 * + X_CFLOAT a13 = X_F(2.0) * invmass0 * (s01.x * r03.x + s01.y * r03.y + s01.z * r03.z); - X_FLOAT a21 = X_F(2.0) * invmass0 * + X_CFLOAT a21 = X_F(2.0) * invmass0 * (s02.x * r01.x + s02.y * r01.y + s02.z * r01.z); - X_FLOAT a22 = X_F(2.0) * (invmass0 + invmass2) * + X_CFLOAT a22 = X_F(2.0) * (invmass0 + invmass2) * (s02.x * r02.x + s02.y * r02.y + s02.z * r02.z); - X_FLOAT a23 = X_F(2.0) * (invmass0) * + X_CFLOAT a23 = X_F(2.0) * (invmass0) * (s02.x * r03.x + s02.y * r03.y + s02.z * r03.z); - X_FLOAT a31 = X_F(2.0) * (invmass0) * + X_CFLOAT a31 = X_F(2.0) * (invmass0) * (s03.x * r01.x + s03.y * r01.y + s03.z * r01.z); - X_FLOAT a32 = X_F(2.0) * (invmass0) * + X_CFLOAT a32 = X_F(2.0) * (invmass0) * (s03.x * r02.x + s03.y * r02.y + s03.z * r02.z); - X_FLOAT a33 = X_F(2.0) * (invmass0 + invmass3) * + X_CFLOAT a33 = X_F(2.0) * (invmass0 + invmass3) * (s03.x * r03.x + s03.y * r03.y + s03.z * r03.z); // error check - X_FLOAT determ = a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 - + X_CFLOAT determ = a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 - a11 * a23 * a32 - a12 * a21 * a33 - a13 * a22 * a31; if(determ == X_F(0.0)) _flag[0]++; - X_FLOAT determinv = X_F(1.0) / determ; + X_CFLOAT determinv = X_F(1.0) / determ; - X_FLOAT a11inv = determinv * (a22 * a33 - a23 * a32); - X_FLOAT a12inv = -determinv * (a12 * a33 - a13 * a32); - X_FLOAT a13inv = determinv * (a12 * a23 - a13 * a22); - X_FLOAT a21inv = -determinv * (a21 * a33 - a23 * a31); - X_FLOAT a22inv = determinv * (a11 * a33 - a13 * a31); - X_FLOAT a23inv = -determinv * (a11 * a23 - a13 * a21); - X_FLOAT a31inv = determinv * (a21 * a32 - a22 * a31); - X_FLOAT a32inv = -determinv * (a11 * a32 - a12 * a31); - X_FLOAT a33inv = determinv * (a11 * a22 - a12 * a21); + X_CFLOAT a11inv = determinv * (a22 * a33 - a23 * a32); + X_CFLOAT a12inv = -determinv * (a12 * a33 - a13 * a32); + X_CFLOAT a13inv = determinv * (a12 * a23 - a13 * a22); + X_CFLOAT a21inv = -determinv * (a21 * a33 - a23 * a31); + X_CFLOAT a22inv = determinv * (a11 * a33 - a13 * a31); + X_CFLOAT a23inv = -determinv * (a11 * a23 - a13 * a21); + X_CFLOAT a31inv = determinv * (a21 * a32 - a22 * a31); + X_CFLOAT a32inv = -determinv * (a11 * a32 - a12 * a31); + X_CFLOAT a33inv = determinv * (a11 * a22 - a12 * a21); // quadratic correction coeffs - X_FLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z); - X_FLOAT r0103 = (r01.x * r03.x + r01.y * r03.y + r01.z * r03.z); - X_FLOAT r0203 = (r02.x * r03.x + r02.y * r03.y + r02.z * r03.z); + X_CFLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z); + X_CFLOAT r0103 = (r01.x * r03.x + r01.y * r03.y + r01.z * r03.z); + X_CFLOAT r0203 = (r02.x * r03.x + r02.y * r03.y + r02.z * r03.z); - X_FLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq; - X_FLOAT quad1_0202 = invmass0 * invmass0 * r02sq; - X_FLOAT quad1_0303 = invmass0 * invmass0 * r03sq; - X_FLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102; - X_FLOAT quad1_0103 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0103; - X_FLOAT quad1_0203 = X_F(2.0) * invmass0 * invmass0 * r0203; + X_CFLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq; + X_CFLOAT quad1_0202 = invmass0 * invmass0 * r02sq; + X_CFLOAT quad1_0303 = invmass0 * invmass0 * r03sq; + X_CFLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102; + X_CFLOAT quad1_0103 = 
X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0103; + X_CFLOAT quad1_0203 = X_F(2.0) * invmass0 * invmass0 * r0203; - X_FLOAT quad2_0101 = invmass0 * invmass0 * r01sq; - X_FLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq; - X_FLOAT quad2_0303 = invmass0 * invmass0 * r03sq; - X_FLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102; - X_FLOAT quad2_0103 = X_F(2.0) * invmass0 * invmass0 * r0103; - X_FLOAT quad2_0203 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0203; + X_CFLOAT quad2_0101 = invmass0 * invmass0 * r01sq; + X_CFLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq; + X_CFLOAT quad2_0303 = invmass0 * invmass0 * r03sq; + X_CFLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102; + X_CFLOAT quad2_0103 = X_F(2.0) * invmass0 * invmass0 * r0103; + X_CFLOAT quad2_0203 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0203; - X_FLOAT quad3_0101 = invmass0 * invmass0 * r01sq; - X_FLOAT quad3_0202 = invmass0 * invmass0 * r02sq; - X_FLOAT quad3_0303 = (invmass0 + invmass3) * (invmass0 + invmass3) * r03sq; - X_FLOAT quad3_0102 = X_F(2.0) * invmass0 * invmass0 * r0102; - X_FLOAT quad3_0103 = X_F(2.0) * (invmass0 + invmass3) * invmass0 * r0103; - X_FLOAT quad3_0203 = X_F(2.0) * (invmass0 + invmass3) * invmass0 * r0203; + X_CFLOAT quad3_0101 = invmass0 * invmass0 * r01sq; + X_CFLOAT quad3_0202 = invmass0 * invmass0 * r02sq; + X_CFLOAT quad3_0303 = (invmass0 + invmass3) * (invmass0 + invmass3) * r03sq; + X_CFLOAT quad3_0102 = X_F(2.0) * invmass0 * invmass0 * r0102; + X_CFLOAT quad3_0103 = X_F(2.0) * (invmass0 + invmass3) * invmass0 * r0103; + X_CFLOAT quad3_0203 = X_F(2.0) * (invmass0 + invmass3) * invmass0 * r0203; // iterate until converged - X_FLOAT lamda01 = X_F(0.0); - X_FLOAT lamda02 = X_F(0.0); - X_FLOAT lamda03 = X_F(0.0); + X_CFLOAT lamda01 = X_F(0.0); + X_CFLOAT lamda02 = X_F(0.0); + X_CFLOAT lamda03 = X_F(0.0); int niter = 0; int done = 0; - X_FLOAT quad1, quad2, quad3, b1, b2, b3, lamda01_new, lamda02_new, lamda03_new; + X_CFLOAT quad1, quad2, quad3, b1, b2, b3, lamda01_new, lamda02_new, lamda03_new; //maybe all running full loop? 
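// same scheme for three bonds to the central atom: the 3x3 inverse
// a11inv..a33inv updates (lamda01,lamda02,lamda03) each pass, with the
// quad1_*/quad2_*/quad3_* terms supplying the quadratic correction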
while(__any(!done) && niter < _max_iter) { @@ -692,8 +692,8 @@ __device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m) } if(vflag || vflag_atom) { - ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; - X_FLOAT factor = X_F(2.0) / X_F(4.0) * nlist; + ENERGY_CFLOAT* shared = &sharedmem[threadIdx.x]; + X_CFLOAT factor = X_F(2.0) / X_F(4.0) * nlist; v[0] = lamda01 * r01.x * r01.x + lamda02 * r02.x * r02.x + lamda03 * r03.x * r03.x; *shared = factor * v[0]; shared += blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5 @@ -720,23 +720,23 @@ __device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m) __device__ void FixShakeCuda_Shake3Angle(int &vflag, int &vflag_atom, int &m) { int nlist, list[3]; - ENERGY_FLOAT v[6]; - X_FLOAT invmass0, invmass1, invmass2; + ENERGY_CFLOAT v[6]; + X_CFLOAT invmass0, invmass1, invmass2; // local atom IDs and constraint distances int i0 = _map_array[_shake_atom[m]]; int i1 = _map_array[_shake_atom[m + _nmax]]; int i2 = _map_array[_shake_atom[m + 2 * _nmax]]; - X_FLOAT bond1 = _bond_distance[_shake_type[m]]; - X_FLOAT bond2 = _bond_distance[_shake_type[m + _nmax]]; - X_FLOAT bond12 = _angle_distance[_shake_type[m + 2 * _nmax]]; + X_CFLOAT bond1 = _bond_distance[_shake_type[m]]; + X_CFLOAT bond2 = _bond_distance[_shake_type[m + _nmax]]; + X_CFLOAT bond12 = _angle_distance[_shake_type[m + 2 * _nmax]]; // r01 = distance vec between atoms, with PBC - X_FLOAT3 r01, r02, r12; + X_CFLOAT3 r01, r02, r12; - X_FLOAT4 x_i0, x_i1, x_i2; + X_CFLOAT4 x_i0, x_i1, x_i2; x_i0 = fetchXType(i0); x_i1 = fetchXType(i1); x_i2 = fetchXType(i2); @@ -758,10 +758,10 @@ __device__ void FixShakeCuda_Shake3Angle(int &vflag, int &vflag_atom, int &m) // s01 = distance vec after unconstrained update, with PBC - X_FLOAT3 s01, s02, s12; - X_FLOAT3 xs_i0 = _xshake[i0]; - X_FLOAT3 xs_i1 = _xshake[i1]; - X_FLOAT3 xs_i2 = _xshake[i2]; + X_CFLOAT3 s01, s02, s12; + X_CFLOAT3 xs_i0 = _xshake[i0]; + X_CFLOAT3 xs_i1 = _xshake[i1]; + X_CFLOAT3 xs_i2 = _xshake[i2]; s01.x = xs_i0.x - xs_i1.x; s01.y = xs_i0.y - xs_i1.y; @@ -780,12 +780,12 @@ __device__ void FixShakeCuda_Shake3Angle(int &vflag, int &vflag_atom, int &m) // scalar distances between atoms - X_FLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z; - X_FLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z; - X_FLOAT r12sq = r12.x * r12.x + r12.y * r12.y + r12.z * r12.z; - X_FLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z; - X_FLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z; - X_FLOAT s12sq = s12.x * s12.x + s12.y * s12.y + s12.z * s12.z; + X_CFLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z; + X_CFLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z; + X_CFLOAT r12sq = r12.x * r12.x + r12.y * r12.y + r12.z * r12.z; + X_CFLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z; + X_CFLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z; + X_CFLOAT s12sq = s12.x * s12.x + s12.y * s12.y + s12.z * s12.z; // a,b,c = coeffs in quadratic equation for lamda @@ -799,79 +799,79 @@ __device__ void FixShakeCuda_Shake3Angle(int &vflag, int &vflag_atom, int &m) invmass2 = X_F(1.0) / _mass[static_cast (x_i2.w)]; } - X_FLOAT a11 = X_F(2.0) * (invmass0 + invmass1) * + X_CFLOAT a11 = X_F(2.0) * (invmass0 + invmass1) * (s01.x * r01.x + s01.y * r01.y + s01.z * r01.z); - X_FLOAT a12 = X_F(2.0) * invmass0 * + X_CFLOAT a12 = X_F(2.0) * invmass0 * (s01.x * r02.x + s01.y * r02.y + s01.z * r02.z); - X_FLOAT a13 = - X_F(2.0) * 
invmass1 * + X_CFLOAT a13 = - X_F(2.0) * invmass1 * (s01.x * r12.x + s01.y * r12.y + s01.z * r12.z); - X_FLOAT a21 = X_F(2.0) * invmass0 * + X_CFLOAT a21 = X_F(2.0) * invmass0 * (s02.x * r01.x + s02.y * r01.y + s02.z * r01.z); - X_FLOAT a22 = X_F(2.0) * (invmass0 + invmass2) * + X_CFLOAT a22 = X_F(2.0) * (invmass0 + invmass2) * (s02.x * r02.x + s02.y * r02.y + s02.z * r02.z); - X_FLOAT a23 = X_F(2.0) * invmass2 * + X_CFLOAT a23 = X_F(2.0) * invmass2 * (s02.x * r12.x + s02.y * r12.y + s02.z * r12.z); - X_FLOAT a31 = - X_F(2.0) * invmass1 * + X_CFLOAT a31 = - X_F(2.0) * invmass1 * (s12.x * r01.x + s12.y * r01.y + s12.z * r01.z); - X_FLOAT a32 = X_F(2.0) * invmass2 * + X_CFLOAT a32 = X_F(2.0) * invmass2 * (s12.x * r02.x + s12.y * r02.y + s12.z * r02.z); - X_FLOAT a33 = X_F(2.0) * (invmass1 + invmass2) * + X_CFLOAT a33 = X_F(2.0) * (invmass1 + invmass2) * (s12.x * r12.x + s12.y * r12.y + s12.z * r12.z); // inverse of matrix - X_FLOAT determ = a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 - + X_CFLOAT determ = a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 - a11 * a23 * a32 - a12 * a21 * a33 - a13 * a22 * a31; if(determ == X_F(0.0)) _flag[0]++; - X_FLOAT determinv = X_F(1.0) / determ; + X_CFLOAT determinv = X_F(1.0) / determ; - X_FLOAT a11inv = determinv * (a22 * a33 - a23 * a32); - X_FLOAT a12inv = -determinv * (a12 * a33 - a13 * a32); - X_FLOAT a13inv = determinv * (a12 * a23 - a13 * a22); - X_FLOAT a21inv = -determinv * (a21 * a33 - a23 * a31); - X_FLOAT a22inv = determinv * (a11 * a33 - a13 * a31); - X_FLOAT a23inv = -determinv * (a11 * a23 - a13 * a21); - X_FLOAT a31inv = determinv * (a21 * a32 - a22 * a31); - X_FLOAT a32inv = -determinv * (a11 * a32 - a12 * a31); - X_FLOAT a33inv = determinv * (a11 * a22 - a12 * a21); + X_CFLOAT a11inv = determinv * (a22 * a33 - a23 * a32); + X_CFLOAT a12inv = -determinv * (a12 * a33 - a13 * a32); + X_CFLOAT a13inv = determinv * (a12 * a23 - a13 * a22); + X_CFLOAT a21inv = -determinv * (a21 * a33 - a23 * a31); + X_CFLOAT a22inv = determinv * (a11 * a33 - a13 * a31); + X_CFLOAT a23inv = -determinv * (a11 * a23 - a13 * a21); + X_CFLOAT a31inv = determinv * (a21 * a32 - a22 * a31); + X_CFLOAT a32inv = -determinv * (a11 * a32 - a12 * a31); + X_CFLOAT a33inv = determinv * (a11 * a22 - a12 * a21); // quadratic correction coeffs - X_FLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z); - X_FLOAT r0112 = (r01.x * r12.x + r01.y * r12.y + r01.z * r12.z); - X_FLOAT r0212 = (r02.x * r12.x + r02.y * r12.y + r02.z * r12.z); + X_CFLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z); + X_CFLOAT r0112 = (r01.x * r12.x + r01.y * r12.y + r01.z * r12.z); + X_CFLOAT r0212 = (r02.x * r12.x + r02.y * r12.y + r02.z * r12.z); - X_FLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq; - X_FLOAT quad1_0202 = invmass0 * invmass0 * r02sq; - X_FLOAT quad1_1212 = invmass1 * invmass1 * r12sq; - X_FLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102; - X_FLOAT quad1_0112 = - X_F(2.0) * (invmass0 + invmass1) * invmass1 * r0112; - X_FLOAT quad1_0212 = - X_F(2.0) * invmass0 * invmass1 * r0212; + X_CFLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq; + X_CFLOAT quad1_0202 = invmass0 * invmass0 * r02sq; + X_CFLOAT quad1_1212 = invmass1 * invmass1 * r12sq; + X_CFLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102; + X_CFLOAT quad1_0112 = - X_F(2.0) * (invmass0 + invmass1) * invmass1 * r0112; + X_CFLOAT quad1_0212 = - X_F(2.0) * invmass0 * invmass1 * r0212; - X_FLOAT quad2_0101 = invmass0 * 
invmass0 * r01sq; - X_FLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq; - X_FLOAT quad2_1212 = invmass2 * invmass2 * r12sq; - X_FLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102; - X_FLOAT quad2_0112 = X_F(2.0) * invmass0 * invmass2 * r0112; - X_FLOAT quad2_0212 = X_F(2.0) * (invmass0 + invmass2) * invmass2 * r0212; + X_CFLOAT quad2_0101 = invmass0 * invmass0 * r01sq; + X_CFLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq; + X_CFLOAT quad2_1212 = invmass2 * invmass2 * r12sq; + X_CFLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102; + X_CFLOAT quad2_0112 = X_F(2.0) * invmass0 * invmass2 * r0112; + X_CFLOAT quad2_0212 = X_F(2.0) * (invmass0 + invmass2) * invmass2 * r0212; - X_FLOAT quad3_0101 = invmass1 * invmass1 * r01sq; - X_FLOAT quad3_0202 = invmass2 * invmass2 * r02sq; - X_FLOAT quad3_1212 = (invmass1 + invmass2) * (invmass1 + invmass2) * r12sq; - X_FLOAT quad3_0102 = - X_F(2.0) * invmass1 * invmass2 * r0102; - X_FLOAT quad3_0112 = - X_F(2.0) * (invmass1 + invmass2) * invmass1 * r0112; - X_FLOAT quad3_0212 = X_F(2.0) * (invmass1 + invmass2) * invmass2 * r0212; + X_CFLOAT quad3_0101 = invmass1 * invmass1 * r01sq; + X_CFLOAT quad3_0202 = invmass2 * invmass2 * r02sq; + X_CFLOAT quad3_1212 = (invmass1 + invmass2) * (invmass1 + invmass2) * r12sq; + X_CFLOAT quad3_0102 = - X_F(2.0) * invmass1 * invmass2 * r0102; + X_CFLOAT quad3_0112 = - X_F(2.0) * (invmass1 + invmass2) * invmass1 * r0112; + X_CFLOAT quad3_0212 = X_F(2.0) * (invmass1 + invmass2) * invmass2 * r0212; // iterate until converged - X_FLOAT lamda01 = X_F(0.0); - X_FLOAT lamda02 = X_F(0.0); - X_FLOAT lamda12 = X_F(0.0); + X_CFLOAT lamda01 = X_F(0.0); + X_CFLOAT lamda02 = X_F(0.0); + X_CFLOAT lamda12 = X_F(0.0); int niter = 0; int done = 0; - X_FLOAT quad1, quad2, quad3, b1, b2, b3, lamda01_new, lamda02_new, lamda12_new; + X_CFLOAT quad1, quad2, quad3, b1, b2, b3, lamda01_new, lamda02_new, lamda12_new; //maybe all running full loop? 
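// here the angle enters as a third distance constraint: bond12 (taken from
// _angle_distance) fixes the 1-2 separation, so the same 3x3 iteration
// updates (lamda01,lamda02,lamda12)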
while(__any(!done) && niter < _max_iter) { @@ -947,8 +947,8 @@ __device__ void FixShakeCuda_Shake3Angle(int &vflag, int &vflag_atom, int &m) } if(vflag || vflag_atom) { - ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; - X_FLOAT factor = X_F(2.0) / X_F(3.0) * nlist; + ENERGY_CFLOAT* shared = &sharedmem[threadIdx.x]; + X_CFLOAT factor = X_F(2.0) / X_F(3.0) * nlist; v[0] = lamda01 * r01.x * r01.x + lamda02 * r02.x * r02.x + lamda12 * r12.x * r12.x; *shared = factor * v[0]; shared += blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5 @@ -986,7 +986,7 @@ __global__ void FixShakeCuda_Shake_Kernel(int vflag, int vflag_atom, int* list, else if(sflag == 4) FixShakeCuda_Shake4(vflag, vflag_atom, m); else FixShakeCuda_Shake3Angle(vflag, vflag_atom, m); } else { - ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; + ENERGY_CFLOAT* shared = &sharedmem[threadIdx.x]; *shared = ENERGY_F(0.0); shared += blockDim.x; *shared = ENERGY_F(0.0); @@ -1008,7 +1008,7 @@ __global__ void FixShakeCuda_Shake_Kernel(int vflag, int vflag_atom, int* list, } -__global__ void FixShakeCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz) +__global__ void FixShakeCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; int* list = sendlist + iswap * maxlistlength; @@ -1018,15 +1018,15 @@ __global__ void FixShakeCuda_PackComm_Kernel(int* sendlist, int n, int maxlistle if(j > _nmax) _flag[0] = 1; - X_FLOAT3 xs = _xshake[j]; - ((X_FLOAT*) _buffer)[i] = xs.x + dx; - ((X_FLOAT*) _buffer)[i + 1 * n] = xs.y + dy; - ((X_FLOAT*) _buffer)[i + 2 * n] = xs.z + dz; + X_CFLOAT3 xs = _xshake[j]; + ((X_CFLOAT*) _buffer)[i] = xs.x + dx; + ((X_CFLOAT*) _buffer)[i + 1 * n] = xs.y + dy; + ((X_CFLOAT*) _buffer)[i + 2 * n] = xs.z + dz; } } -__global__ void FixShakeCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first) +__global__ void FixShakeCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; int* list = sendlist + iswap * maxlistlength; @@ -1036,7 +1036,7 @@ __global__ void FixShakeCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxl if(j > _nmax) _flag[0] = 1; - X_FLOAT3 xs = _xshake[j]; + X_CFLOAT3 xs = _xshake[j]; xs.x += dx; xs.y += dy; xs.z += dz; @@ -1050,10 +1050,10 @@ __global__ void FixShakeCuda_UnpackComm_Kernel(int n, int first) int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(i < n) { - X_FLOAT3 xs; - xs.x = ((X_FLOAT*) _buffer)[i]; - xs.y = ((X_FLOAT*) _buffer)[i + 1 * n]; - xs.z = ((X_FLOAT*) _buffer)[i + 2 * n]; + X_CFLOAT3 xs; + xs.x = ((X_CFLOAT*) _buffer)[i]; + xs.y = ((X_CFLOAT*) _buffer)[i + 1 * n]; + xs.z = ((X_CFLOAT*) _buffer)[i + 2 * n]; _xshake[i + first] = xs; } } diff --git a/lib/cuda/fix_temp_berendsen_cuda.cu b/lib/cuda/fix_temp_berendsen_cuda.cu index b99608dda5..1694be6a20 100644 --- a/lib/cuda/fix_temp_berendsen_cuda.cu +++ b/lib/cuda/fix_temp_berendsen_cuda.cu @@ -36,7 +36,7 @@ void Cuda_FixTempBerendsenCuda_UpdateNmax(cuda_shared_data* sdata) cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); cudaMemcpyToSymbol(MY_AP(nmax) 
, & sdata->atom.nmax , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_CFLOAT*)); } void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata) @@ -48,7 +48,7 @@ void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata) void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor) { - V_FLOAT factor = afactor; + V_CFLOAT factor = afactor; if(sdata->atom.update_nmax) Cuda_FixTempBerendsenCuda_UpdateNmax(sdata); diff --git a/lib/cuda/fix_temp_berendsen_cuda_kernel.cu b/lib/cuda/fix_temp_berendsen_cuda_kernel.cu index 2d3b04ace5..e38ad48895 100644 --- a/lib/cuda/fix_temp_berendsen_cuda_kernel.cu +++ b/lib/cuda/fix_temp_berendsen_cuda_kernel.cu @@ -23,7 +23,7 @@ -__global__ void Cuda_FixTempBerendsenCuda_EndOfStep_Kernel(int groupbit, V_FLOAT factor) +__global__ void Cuda_FixTempBerendsenCuda_EndOfStep_Kernel(int groupbit, V_CFLOAT factor) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; diff --git a/lib/cuda/fix_temp_rescale_cuda.cu b/lib/cuda/fix_temp_rescale_cuda.cu index 171156519b..7f1daef7dd 100644 --- a/lib/cuda/fix_temp_rescale_cuda.cu +++ b/lib/cuda/fix_temp_rescale_cuda.cu @@ -36,7 +36,7 @@ void Cuda_FixTempRescaleCuda_UpdateNmax(cuda_shared_data* sdata) cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_CFLOAT*)); } void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata) @@ -48,7 +48,7 @@ void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata) void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor) { - V_FLOAT factor = afactor; + V_CFLOAT factor = afactor; //if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep so it might miss an update step Cuda_FixTempRescaleCuda_UpdateNmax(sdata); //if(sdata->atom.update_nlocal) diff --git a/lib/cuda/fix_temp_rescale_cuda_kernel.cu b/lib/cuda/fix_temp_rescale_cuda_kernel.cu index 2e34ec592f..4aacf9a7d0 100644 --- a/lib/cuda/fix_temp_rescale_cuda_kernel.cu +++ b/lib/cuda/fix_temp_rescale_cuda_kernel.cu @@ -23,7 +23,7 @@ -__global__ void Cuda_FixTempRescaleCuda_EndOfStep_Kernel(int groupbit, V_FLOAT factor) +__global__ void Cuda_FixTempRescaleCuda_EndOfStep_Kernel(int groupbit, V_CFLOAT factor) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; diff --git a/lib/cuda/fix_temp_rescale_limit_cuda.cu b/lib/cuda/fix_temp_rescale_limit_cuda.cu index 72028a124e..36a83379ea 100644 --- a/lib/cuda/fix_temp_rescale_limit_cuda.cu +++ b/lib/cuda/fix_temp_rescale_limit_cuda.cu @@ -36,7 +36,7 @@ void Cuda_FixTempRescaleLimitCuda_UpdateNmax(cuda_shared_data* sdata) cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_CFLOAT*)); } void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata) @@ -48,7 +48,7 @@ void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata) void 
Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor, double limit) { - V_FLOAT factor = afactor; + V_CFLOAT factor = afactor; //if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep so it might miss an update step Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata); //if(sdata->atom.update_nlocal) diff --git a/lib/cuda/fix_temp_rescale_limit_cuda_kernel.cu b/lib/cuda/fix_temp_rescale_limit_cuda_kernel.cu index eda86ccdce..1f5ac87077 100644 --- a/lib/cuda/fix_temp_rescale_limit_cuda_kernel.cu +++ b/lib/cuda/fix_temp_rescale_limit_cuda_kernel.cu @@ -23,15 +23,15 @@ -__global__ void Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel(int groupbit, V_FLOAT factor, V_FLOAT limit) +__global__ void Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel(int groupbit, V_CFLOAT factor, V_CFLOAT limit) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(i < _nlocal) if(_mask[i] & groupbit) { - V_FLOAT vx = _v[i]; - V_FLOAT vy = _v[i + _nmax]; - V_FLOAT vz = _v[i + 2 * _nmax]; + V_CFLOAT vx = _v[i]; + V_CFLOAT vy = _v[i + _nmax]; + V_CFLOAT vz = _v[i + 2 * _nmax]; vx *= factor; vy *= factor; vz *= factor; diff --git a/lib/cuda/fix_viscous_cuda.cu b/lib/cuda/fix_viscous_cuda.cu index 03a019bc9f..5b160532b9 100644 --- a/lib/cuda/fix_viscous_cuda.cu +++ b/lib/cuda/fix_viscous_cuda.cu @@ -35,8 +35,8 @@ void Cuda_FixViscousCuda_UpdateNmax(cuda_shared_data* sdata) cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); } @@ -60,7 +60,7 @@ void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit, void* dim3 threads(layout.z, 1, 1); dim3 grid(layout.x, layout.y, 1); - Cuda_FixViscousCuda_PostForce_Kernel <<< grid, threads, 0>>> (groupbit, (F_FLOAT*) gamma); + Cuda_FixViscousCuda_PostForce_Kernel <<< grid, threads, 0>>> (groupbit, (F_CFLOAT*) gamma); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_Cuda_FixViscousCuda_PostForce: Kernel execution failed"); diff --git a/lib/cuda/fix_viscous_cuda_kernel.cu b/lib/cuda/fix_viscous_cuda_kernel.cu index 2c3397715f..1087d7512f 100644 --- a/lib/cuda/fix_viscous_cuda_kernel.cu +++ b/lib/cuda/fix_viscous_cuda_kernel.cu @@ -21,13 +21,13 @@ This software is distributed under the GNU General Public License. 
------------------------------------------------------------------------- */ -__global__ void Cuda_FixViscousCuda_PostForce_Kernel(int groupbit, F_FLOAT* gamma) +__global__ void Cuda_FixViscousCuda_PostForce_Kernel(int groupbit, F_CFLOAT* gamma) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(i < _nlocal) if(_mask[i] & groupbit) { - F_FLOAT drag = gamma[_type[i]]; + F_CFLOAT drag = gamma[_type[i]]; _f[i] -= drag * _v[i]; _f[i + 1 * _nmax] -= drag * _v[i + 1 * _nmax]; _f[i + 2 * _nmax] -= drag * _v[i + 2 * _nmax]; diff --git a/lib/cuda/neighbor.cu b/lib/cuda/neighbor.cu index 53af1e93f2..212a115e37 100644 --- a/lib/cuda/neighbor.cu +++ b/lib/cuda/neighbor.cu @@ -38,7 +38,7 @@ #define _nex_group MY_AP(nex_group) #define _ex_mol_bit MY_AP(ex_mol_bit) #define _nex_mol MY_AP(nex_mol) -__device__ __constant__ CUDA_FLOAT* _cutneighsq; +__device__ __constant__ CUDA_CFLOAT* _cutneighsq; __device__ __constant__ int* _ex_type; __device__ __constant__ int _nex_type; __device__ __constant__ int* _ex1_bit; @@ -54,7 +54,7 @@ void Cuda_Neighbor_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* { CUT_CHECK_ERROR("Cuda_PairLJCutCuda: before updateBuffer failed"); - int size = (unsigned)(sizeof(int) * 20 + sneighlist->bin_dim[0] * sneighlist->bin_dim[1] * sneighlist->bin_dim[2] * (sizeof(int) + sneighlist->bin_nmax * 3 * sizeof(CUDA_FLOAT))); + int size = (unsigned)(sizeof(int) * 20 + sneighlist->bin_dim[0] * sneighlist->bin_dim[1] * sneighlist->bin_dim[2] * (sizeof(int) + sneighlist->bin_nmax * 3 * sizeof(CUDA_CFLOAT))); if(sdata->buffersize < size) { MYDBG(printf("Cuda_Neighbor Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) @@ -77,7 +77,7 @@ int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) Cuda_Neighbor_UpdateBuffer(sdata, sneighlist); // initialize only on first call - CUDA_FLOAT rez_bin_size[3] = { + CUDA_CFLOAT rez_bin_size[3] = { (1.0 * sneighlist->bin_dim[0] - 4.0) / (sdata->domain.subhi[0] - sdata->domain.sublo[0]), (1.0 * sneighlist->bin_dim[1] - 4.0) / (sdata->domain.subhi[1] - sdata->domain.sublo[1]), (1.0 * sneighlist->bin_dim[2] - 4.0) / (sdata->domain.subhi[2] - sdata->domain.sublo[2]) @@ -87,10 +87,10 @@ int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) if(! 
init) { init = 0; - cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(unsigned)); cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(unsigned)); - cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo , sizeof(X_FLOAT) * 3); + cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo , sizeof(X_CFLOAT) * 3); } @@ -101,7 +101,7 @@ int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) my_times starttime, endtime; my_gettime(CLOCK_REALTIME, &starttime); - cudaMemset((int*)(sdata->buffer), 0, sizeof(int) * (20 + (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2])) + 3 * sizeof(CUDA_FLOAT) * (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2]) * (sneighlist->bin_nmax)); + cudaMemset((int*)(sdata->buffer), 0, sizeof(int) * (20 + (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2])) + 3 * sizeof(CUDA_CFLOAT) * (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2]) * (sneighlist->bin_nmax)); Binning_Kernel <<< grid, threads>>> (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], sneighlist->bin_dim[2], rez_bin_size[0], rez_bin_size[1], rez_bin_size[2]); cudaThreadSynchronize(); @@ -126,7 +126,7 @@ int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) { //Cuda_Neighbor_UpdateBuffer(sdata,sneighlist); - CUDA_FLOAT globcutoff = -1.0; + CUDA_CFLOAT globcutoff = -1.0; short init = 0; @@ -137,11 +137,11 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn unsigned cuda_ntypes = sdata->atom.ntypes + 1; - unsigned nx = sizeof(CUDA_FLOAT) * cuda_ntypes * cuda_ntypes; + unsigned nx = sizeof(CUDA_CFLOAT) * cuda_ntypes * cuda_ntypes; - CUDA_FLOAT* acutneighsq = (CUDA_FLOAT*) malloc(nx); + CUDA_CFLOAT* acutneighsq = (CUDA_CFLOAT*) malloc(nx); //printf("Allocate: %i\n",nx); - sneighlist->cu_cutneighsq = (CUDA_FLOAT*) CudaWrapper_AllocCudaData(nx); + sneighlist->cu_cutneighsq = (CUDA_CFLOAT*) CudaWrapper_AllocCudaData(nx); if(sneighlist->cutneighsq) { int cutoffsdiffer = 0; @@ -149,13 +149,13 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn for(int i = 1; i <= sdata->atom.ntypes; ++i) { for(int j = 1; j <= sdata->atom.ntypes; ++j) { - acutneighsq[i * cuda_ntypes + j] = (CUDA_FLOAT)(sneighlist->cutneighsq[i][j]); + acutneighsq[i * cuda_ntypes + j] = (CUDA_CFLOAT)(sneighlist->cutneighsq[i][j]); if((sneighlist->cutneighsq[i][j] - cutoff0) * (sneighlist->cutneighsq[i][j] - cutoff0) > 1e-6) cutoffsdiffer++; } } - if(not cutoffsdiffer) globcutoff = (CUDA_FLOAT) cutoff0; + if(not cutoffsdiffer) globcutoff = (CUDA_CFLOAT) cutoff0; } else { MYEMUDBG(printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n");) return 0; @@ -173,7 +173,7 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn } CudaWrapper_UploadCudaData(acutneighsq, sneighlist->cu_cutneighsq, nx); - cudaMemcpyToSymbol(MY_AP(cutneighsq) , &sneighlist->cu_cutneighsq , sizeof(CUDA_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(cutneighsq) , &sneighlist->cu_cutneighsq , sizeof(CUDA_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , & cuda_ntypes , sizeof(unsigned)); cudaMemcpyToSymbol(MY_AP(special_flag) , sdata->atom.special_flag 
, 4 * sizeof(int)); @@ -218,14 +218,14 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn dim3 threads(MIN(128, sneighlist->bin_nmax), 1, 1); dim3 grid(sneighlist->bin_dim[0]*sneighlist->bin_dim[1], sneighlist->bin_dim[2], 1); - //printf("Configuration: %i %i %i %i %i\n",grid.x,grid.y,threads.x,(sizeof(int)+3*sizeof(X_FLOAT))*threads.x,sneighlist->bin_nmax); + //printf("Configuration: %i %i %i %i %i\n",grid.x,grid.y,threads.x,(sizeof(int)+3*sizeof(X_CFLOAT))*threads.x,sneighlist->bin_nmax); int buffer[20]; buffer[0] = 1; buffer[1] = 0; CudaWrapper_UploadCudaData(buffer, sdata->buffer, 2 * sizeof(int)); CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel error"); //cudaMemset(sdata->debugdata,0,100*sizeof(int)); - unsigned int shared_size = (sizeof(int) + 3 * sizeof(CUDA_FLOAT)) * threads.x; + unsigned int shared_size = (sizeof(int) + 3 * sizeof(CUDA_CFLOAT)) * threads.x; MYDBG(printf("Configuration: %i %i %i %u %i\n", grid.x, grid.y, threads.x, shared_size, sneighlist->bin_nmax);) //shared_size=2056; my_times starttime, endtime; @@ -245,7 +245,7 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn NeighborBuildFullBin_Kernel<0> <<< grid, threads, shared_size>>> (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], globcutoff, sdata->pair.use_block_per_atom, sdata->pair.neighall); } - //NeighborBuildFullBin_Kernel_Restrict<<<grid,threads,(sizeof(int)+3*sizeof(CUDA_FLOAT))*threads.x>>> + //NeighborBuildFullBin_Kernel_Restrict<<<grid,threads,(sizeof(int)+3*sizeof(CUDA_CFLOAT))*threads.x>>> // (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff); cudaThreadSynchronize(); @@ -301,13 +301,13 @@ int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sn "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 " "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2); - unsigned nx = sizeof(CUDA_FLOAT) * cuda_ntypes * cuda_ntypes; - CUDA_FLOAT* acutneighsq = (CUDA_FLOAT*) malloc(nx); + unsigned nx = sizeof(CUDA_CFLOAT) * cuda_ntypes * cuda_ntypes; + CUDA_CFLOAT* acutneighsq = (CUDA_CFLOAT*) malloc(nx); if(sneighlist->cutneighsq) { for(int i = 1; i <= sdata->atom.ntypes; ++i) { for(int j = 1; j <= sdata->atom.ntypes; ++j) { - acutneighsq[i * cuda_ntypes + j] = (CUDA_FLOAT)(sneighlist->cutneighsq[i][j]); + acutneighsq[i * cuda_ntypes + j] = (CUDA_CFLOAT)(sneighlist->cutneighsq[i][j]); //printf("CUTOFFS: %i %i %i %e\n",i,j,cuda_ntypes,acutneighsq[i * cuda_ntypes + j]); } } @@ -339,7 +339,7 @@ int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sn cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); cudaMemcpyToSymbol(MY_AP(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(maxneighbors) , & sneighlist->maxneighbors , sizeof(int)); free(acutneighsq); diff --git a/lib/cuda/neighbor_kernel.cu b/lib/cuda/neighbor_kernel.cu index 3892f5ec29..7c4d1c37ad 100644 --- a/lib/cuda/neighbor_kernel.cu +++ b/lib/cuda/neighbor_kernel.cu @@ -24,26 +24,26 @@ #define SBBITS 30 __global__ void Binning_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, int bin_dim_z, - CUDA_FLOAT rez_bin_size_x, CUDA_FLOAT rez_bin_size_y, CUDA_FLOAT rez_bin_size_z) + CUDA_CFLOAT rez_bin_size_x, CUDA_CFLOAT rez_bin_size_y,
CUDA_CFLOAT rez_bin_size_z) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; /*int* bin_count=(int*) _buffer; bin_count=bin_count+20; - CUDA_FLOAT* binned_x=(CUDA_FLOAT*)(bin_count+bin_dim_x*bin_dim_y*bin_dim_z);*/ - CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer; + CUDA_CFLOAT* binned_x=(CUDA_CFLOAT*)(bin_count+bin_dim_x*bin_dim_y*bin_dim_z);*/ + CUDA_CFLOAT* binned_x = (CUDA_CFLOAT*) _buffer; binned_x = &binned_x[2]; int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax]; if(i < _nall) { // copy atom position from global device memory to local register // in this 3 steps to get as much coalesced access as possible - X_FLOAT* my_x = _x + i; - CUDA_FLOAT x_i = *my_x; + X_CFLOAT* my_x = _x + i; + CUDA_CFLOAT x_i = *my_x; my_x += _nmax; - CUDA_FLOAT y_i = *my_x; + CUDA_CFLOAT y_i = *my_x; my_x += _nmax; - CUDA_FLOAT z_i = *my_x; + CUDA_CFLOAT z_i = *my_x; // calculate flat bin index @@ -102,7 +102,7 @@ __device__ inline int exclusion(int &i, int &j, int &itype, int &jtype) return 0; } -extern __shared__ CUDA_FLOAT shared[]; +extern __shared__ CUDA_CFLOAT shared[]; __device__ inline int find_special(int3 &n, int* list, int &tag, int3 flag) { @@ -114,12 +114,12 @@ __device__ inline int find_special(int3 &n, int* list, int &tag, int3 flag) } template -__global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_FLOAT globcutoff, int block_style, bool neighall) +__global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_CFLOAT globcutoff, int block_style, bool neighall) { int natoms = neighall ? _nall : _nlocal; //const bool domol=false; int bin_dim_z = gridDim.y; - CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer; + CUDA_CFLOAT* binned_x = (CUDA_CFLOAT*) _buffer; binned_x = &binned_x[2]; int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax]; int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y; @@ -129,19 +129,19 @@ __global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bi int bin_c = bin_count[bin]; - CUDA_FLOAT cut; + CUDA_CFLOAT cut; if(globcutoff > 0) cut = globcutoff; int i = _nall; - CUDA_FLOAT* my_x; - CUDA_FLOAT x_i, y_i, z_i; + CUDA_CFLOAT* my_x; + CUDA_CFLOAT x_i, y_i, z_i; for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) { int actIdx = threadIdx.x + actOffset; - CUDA_FLOAT* other_x = shared; + CUDA_CFLOAT* other_x = shared; int* other_id = (int*) &other_x[3 * blockDim.x]; if(actIdx < bin_c) { @@ -206,10 +206,10 @@ __global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bi cut = _cutneighsq[itype * _cuda_ntypes + jtype]; } - CUDA_FLOAT delx = x_i - other_x[kk]; - CUDA_FLOAT dely = y_i - other_x[kk + blockDim.x]; - CUDA_FLOAT delz = z_i - other_x[kk + 2 * blockDim.x]; - CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz; + CUDA_CFLOAT delx = x_i - other_x[kk]; + CUDA_CFLOAT dely = y_i - other_x[kk + blockDim.x]; + CUDA_CFLOAT delz = z_i - other_x[kk + 2 * blockDim.x]; + CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz; if(rsq <= cut && i != j) { @@ -268,10 +268,10 @@ __global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bi cut = _cutneighsq[itype * _cuda_ntypes + jtype]; } - CUDA_FLOAT delx = x_i - other_x[k]; - CUDA_FLOAT dely = y_i - other_x[k + blockDim.x]; - CUDA_FLOAT delz = z_i - other_x[k + 2 * blockDim.x]; - CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz; + CUDA_CFLOAT delx = x_i - 
other_x[k]; + CUDA_CFLOAT dely = y_i - other_x[k + blockDim.x]; + CUDA_CFLOAT delz = z_i - other_x[k + 2 * blockDim.x]; + CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz; if(rsq <= cut && i != j) { if(jnum < _maxneighbors) { @@ -378,10 +378,10 @@ __global__ void FindSpecial(int block_style) _numneigh[i] = jnum; } -__global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_FLOAT globcutoff, int block_style) +__global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_CFLOAT globcutoff, int block_style) { int bin_dim_z = gridDim.y; - CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer; + CUDA_CFLOAT* binned_x = (CUDA_CFLOAT*) _buffer; binned_x = &binned_x[2]; int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax]; int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y; @@ -391,19 +391,19 @@ __global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_ int bin_c = bin_count[bin]; - CUDA_FLOAT cut; + CUDA_CFLOAT cut; if(globcutoff > 0) cut = globcutoff; int i = _nall; - CUDA_FLOAT* my_x; - CUDA_FLOAT x_i, y_i, z_i; + CUDA_CFLOAT* my_x; + CUDA_CFLOAT x_i, y_i, z_i; for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) { int actIdx = threadIdx.x + actOffset; - CUDA_FLOAT* other_x = shared; + CUDA_CFLOAT* other_x = shared; int* other_id = (int*) &other_x[3 * blockDim.x]; if(actIdx < bin_c) { @@ -469,10 +469,10 @@ __global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_ cut = _cutneighsq[itype * _cuda_ntypes + jtype]; } - CUDA_FLOAT delx = x_i - other_x[kk]; - CUDA_FLOAT dely = y_i - other_x[kk + blockDim.x]; - CUDA_FLOAT delz = z_i - other_x[kk + 2 * blockDim.x]; - CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz; + CUDA_CFLOAT delx = x_i - other_x[kk]; + CUDA_CFLOAT dely = y_i - other_x[kk + blockDim.x]; + CUDA_CFLOAT delz = z_i - other_x[kk + 2 * blockDim.x]; + CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz; if(rsq <= cut && i != j) { @@ -549,10 +549,10 @@ __global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_ cut = _cutneighsq[itype * _cuda_ntypes + jtype]; } - CUDA_FLOAT delx = x_i - other_x[k]; - CUDA_FLOAT dely = y_i - other_x[k + blockDim.x]; - CUDA_FLOAT delz = z_i - other_x[k + 2 * blockDim.x]; - CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz; + CUDA_CFLOAT delx = x_i - other_x[k]; + CUDA_CFLOAT dely = y_i - other_x[k + blockDim.x]; + CUDA_CFLOAT delz = z_i - other_x[k + 2 * blockDim.x]; + CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz; if(rsq <= cut && i != j) { if((j >= _nlocal) && (i_border < 0)) @@ -612,12 +612,12 @@ __global__ void NeighborBuildFullNsq_Kernel() int* buffer = (int*) _buffer; if(i < _nlocal) { - X_FLOAT* my_x = _x + i; - CUDA_FLOAT x_i = *my_x; + X_CFLOAT* my_x = _x + i; + CUDA_CFLOAT x_i = *my_x; my_x += _nmax; - CUDA_FLOAT y_i = *my_x; + CUDA_CFLOAT y_i = *my_x; my_x += _nmax; - CUDA_FLOAT z_i = *my_x; + CUDA_CFLOAT z_i = *my_x; int jnum = 0; int* jlist = _firstneigh[i]; _ilist[i] = i; @@ -627,15 +627,15 @@ __global__ void NeighborBuildFullNsq_Kernel() for(int j = 0; j < _nall; ++j) { my_x = _x + j; - CUDA_FLOAT x_j = *my_x; + CUDA_CFLOAT x_j = *my_x; my_x += _nmax; - CUDA_FLOAT y_j = *my_x; + CUDA_CFLOAT y_j = *my_x; my_x += _nmax; - CUDA_FLOAT z_j = *my_x; - CUDA_FLOAT delx = x_i - x_j; - CUDA_FLOAT dely = y_i - y_j; - CUDA_FLOAT delz = z_i - z_j; - CUDA_FLOAT rsq = delx * delx + dely * dely + 
delz * delz; + CUDA_CFLOAT z_j = *my_x; + CUDA_CFLOAT delx = x_i - x_j; + CUDA_CFLOAT dely = y_i - y_j; + CUDA_CFLOAT delz = z_i - z_j; + CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz; int jtype = _type[j]; if(rsq <= _cutneighsq[itype * _cuda_ntypes + jtype] && i != j) { diff --git a/lib/cuda/pair_born_coul_long_cuda.cu b/lib/cuda/pair_born_coul_long_cuda.cu index e6d66733e7..2783c58136 100644 --- a/lib/cuda/pair_born_coul_long_cuda.cu +++ b/lib/cuda/pair_born_coul_long_cuda.cu @@ -60,10 +60,10 @@ void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* s if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_born_coul_long_cuda_kernel_nc.cu b/lib/cuda/pair_born_coul_long_cuda_kernel_nc.cu index bc79848b08..4599dbca20 100644 --- a/lib/cuda/pair_born_coul_long_cuda_kernel_nc.cu +++ b/lib/cuda/pair_born_coul_long_cuda_kernel_nc.cu @@ -20,13 +20,13 @@ This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ -__device__ inline F_FLOAT PairBornCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) +__device__ inline F_CFLOAT PairBornCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl) { - const F_FLOAT r2inv = F_F(1.0) / rsq; - const F_FLOAT r = _RSQRT_(r2inv); - const F_FLOAT r6inv = r2inv * r2inv * r2inv; - const F_FLOAT rexp = _EXP_((_sigma[ij_type] - r) * _rhoinv[ij_type]); - const F_FLOAT forceborn = _a[ij_type] * _rhoinv[ij_type] * r * rexp - + const F_CFLOAT r2inv = F_F(1.0) / rsq; + const F_CFLOAT r = _RSQRT_(r2inv); + const F_CFLOAT r6inv = r2inv * r2inv * r2inv; + const F_CFLOAT rexp = _EXP_((_sigma[ij_type] - r) * _rhoinv[ij_type]); + const F_CFLOAT forceborn = _a[ij_type] * _rhoinv[ij_type] * r * rexp - F_F(6.0) * _c[ij_type] * r6inv + F_F(8.0) * _d[ij_type] * r2inv * r6inv; if(eflag) evdwl += factor_lj * (_a[ij_type] * rexp - _c[ij_type] * r6inv diff --git a/lib/cuda/pair_buck_coul_cut_cuda.cu b/lib/cuda/pair_buck_coul_cut_cuda.cu index ba61f5e036..a105ee88ea 100644 --- a/lib/cuda/pair_buck_coul_cut_cuda.cu +++ b/lib/cuda/pair_buck_coul_cut_cuda.cu @@ -58,10 +58,10 @@ void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sn if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, 
vflag); } diff --git a/lib/cuda/pair_buck_coul_long_cuda.cu b/lib/cuda/pair_buck_coul_long_cuda.cu index f4e7203f83..547904d810 100644 --- a/lib/cuda/pair_buck_coul_long_cuda.cu +++ b/lib/cuda/pair_buck_coul_long_cuda.cu @@ -59,10 +59,10 @@ void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* s if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_buck_cuda.cu b/lib/cuda/pair_buck_cuda.cu index b7ca740c00..ed6080ba4a 100644 --- a/lib/cuda/pair_buck_cuda.cu +++ b/lib/cuda/pair_buck_cuda.cu @@ -60,10 +60,10 @@ void Cuda_PairBuckCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlis if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_buck_cuda_kernel_nc.cu b/lib/cuda/pair_buck_cuda_kernel_nc.cu index 8ce7d28654..2a9828d03a 100644 --- a/lib/cuda/pair_buck_cuda_kernel_nc.cu +++ b/lib/cuda/pair_buck_cuda_kernel_nc.cu @@ -20,13 +20,13 @@ This software is distributed under the GNU General Public License. 
------------------------------------------------------------------------- */ -__device__ inline F_FLOAT PairBuckCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) +__device__ inline F_CFLOAT PairBuckCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl) { - const F_FLOAT r2inv = F_F(1.0) / rsq; - const F_FLOAT r6inv = r2inv * r2inv * r2inv; - const F_FLOAT r = _RSQRT_(r2inv); - const F_FLOAT rexp = _EXP_(-r * _rhoinv[ij_type]); - const F_FLOAT forcebuck = _buck1[ij_type] * r * rexp - _buck2[ij_type] * r6inv; + const F_CFLOAT r2inv = F_F(1.0) / rsq; + const F_CFLOAT r6inv = r2inv * r2inv * r2inv; + const F_CFLOAT r = _RSQRT_(r2inv); + const F_CFLOAT rexp = _EXP_(-r * _rhoinv[ij_type]); + const F_CFLOAT forcebuck = _buck1[ij_type] * r * rexp - _buck2[ij_type] * r6inv; if(eflag) evdwl += factor_lj * (_a[ij_type] * rexp - _c[ij_type] * r6inv - _offset[ij_type]); diff --git a/lib/cuda/pair_cg_cmm_coul_cut_cuda.cu b/lib/cuda/pair_cg_cmm_coul_cut_cuda.cu index 7f9f853437..bafb83705a 100644 --- a/lib/cuda/pair_cg_cmm_coul_cut_cuda.cu +++ b/lib/cuda/pair_cg_cmm_coul_cut_cuda.cu @@ -65,10 +65,10 @@ void Cuda_PairCGCMMCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* s if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_cg_cmm_coul_debye_cuda.cu b/lib/cuda/pair_cg_cmm_coul_debye_cuda.cu index 43bedca883..ee18b81899 100644 --- a/lib/cuda/pair_cg_cmm_coul_debye_cuda.cu +++ b/lib/cuda/pair_cg_cmm_coul_debye_cuda.cu @@ -65,10 +65,10 @@ void Cuda_PairCGCMMCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_cg_cmm_coul_long_cuda.cu b/lib/cuda/pair_cg_cmm_coul_long_cuda.cu index ed1bbf0cfc..67c6ebaa7f 100644 --- a/lib/cuda/pair_cg_cmm_coul_long_cuda.cu +++ b/lib/cuda/pair_cg_cmm_coul_long_cuda.cu @@ -65,10 +65,10 @@ void Cuda_PairCGCMMCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, 
sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_cg_cmm_cuda.cu b/lib/cuda/pair_cg_cmm_cuda.cu index 7ec1ebff99..ccc4782ec0 100644 --- a/lib/cuda/pair_cg_cmm_cuda.cu +++ b/lib/cuda/pair_cg_cmm_cuda.cu @@ -71,10 +71,10 @@ void Cuda_PairCGCMMCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighli if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_cg_cmm_cuda_kernel_nc.cu b/lib/cuda/pair_cg_cmm_cuda_kernel_nc.cu index 85b41605bd..a19903b41a 100644 --- a/lib/cuda/pair_cg_cmm_cuda_kernel_nc.cu +++ b/lib/cuda/pair_cg_cmm_cuda_kernel_nc.cu @@ -21,28 +21,28 @@ This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ -__device__ inline F_FLOAT PairCGCMMCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) //0.11 of 0.4 +__device__ inline F_CFLOAT PairCGCMMCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl) //0.11 of 0.4 { - const F_FLOAT r2inv = F_F(1.0) / rsq; + const F_CFLOAT r2inv = F_F(1.0) / rsq; const int cg_type = _cg_type[ij_type]; - const F_FLOAT r4inv = r2inv * r2inv; - const F_FLOAT rNinv_first = cg_type != CG_LJ9_6 ? r4inv : _RSQRT_(rsq); - const F_FLOAT rNinv_second = cg_type != CG_LJ12_4 ? -r2inv : -F_F(1.0); - const F_FLOAT forcelj = r4inv * (_lj1[ij_type] * r4inv * rNinv_first + _lj2[ij_type] * rNinv_second); + const F_CFLOAT r4inv = r2inv * r2inv; + const F_CFLOAT rNinv_first = cg_type != CG_LJ9_6 ? r4inv : _RSQRT_(rsq); + const F_CFLOAT rNinv_second = cg_type != CG_LJ12_4 ? 
-r2inv : -F_F(1.0); + const F_CFLOAT forcelj = r4inv * (_lj1[ij_type] * r4inv * rNinv_first + _lj2[ij_type] * rNinv_second); if(eflag) evdwl += factor_lj * (r4inv * (_lj3[ij_type] * r4inv * rNinv_first + _lj4[ij_type] * rNinv_second) - _offset[ij_type]); return factor_lj * forcelj * r2inv; } -/*__device__ inline F_FLOAT PairCGCMMCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +/*__device__ inline F_CFLOAT PairCGCMMCuda_Eval(const F_CFLOAT& rsq,const int ij_type,F_CFLOAT& factor_lj,int& eflag, ENERGY_CFLOAT& evdwl) { const int cg_type = tex1Dfetch(_coeff5_gm_tex,ij_type); - const F_FLOAT r2inv = F_F(1.0)/rsq; - const F_FLOAT r4inv = r2inv*r2inv; - const F_FLOAT rNinv_first = cg_type!=CG_LJ9_6?r4inv:_RSQRT_(rsq); - const F_FLOAT rNinv_second = cg_type!=CG_LJ12_4?r2inv:F_F(1.0); - const F_FLOAT forcelj = r4inv * (tex1Dfetch(_coeff1_gm_tex,ij_type)*r4inv*rNinv_first - tex1Dfetch(_coeff2_gm_tex,ij_type)*rNinv_second); + const F_CFLOAT r2inv = F_F(1.0)/rsq; + const F_CFLOAT r4inv = r2inv*r2inv; + const F_CFLOAT rNinv_first = cg_type!=CG_LJ9_6?r4inv:_RSQRT_(rsq); + const F_CFLOAT rNinv_second = cg_type!=CG_LJ12_4?r2inv:F_F(1.0); + const F_CFLOAT forcelj = r4inv * (tex1Dfetch(_coeff1_gm_tex,ij_type)*r4inv*rNinv_first - tex1Dfetch(_coeff2_gm_tex,ij_type)*rNinv_second); if(eflag) evdwl += factor_lj*(r4inv*(tex1Dfetch(_coeff3_gm_tex,ij_type)*r4inv*rNinv_first-tex1Dfetch(_coeff4_gm_tex,ij_type)*rNinv_second)); return factor_lj*forcelj*r2inv; diff --git a/lib/cuda/pair_eam_cuda.cu b/lib/cuda/pair_eam_cuda.cu index cb20343770..563af7744d 100644 --- a/lib/cuda/pair_eam_cuda.cu +++ b/lib/cuda/pair_eam_cuda.cu @@ -39,18 +39,18 @@ #define _rho MY_AP(rho) #define _fp MY_AP(fp) -__device__ __constant__ F_FLOAT MY_AP(rdr); -__device__ __constant__ F_FLOAT MY_AP(rdrho); +__device__ __constant__ F_CFLOAT MY_AP(rdr); +__device__ __constant__ F_CFLOAT MY_AP(rdrho); __device__ __constant__ int MY_AP(nr); __device__ __constant__ int MY_AP(nrho); __device__ __constant__ int MY_AP(nfrho); __device__ __constant__ int MY_AP(nrhor); __device__ __constant__ int MY_AP(nz2r); -__device__ __constant__ F_FLOAT* MY_AP(frho_spline); -__device__ __constant__ F_FLOAT* MY_AP(rhor_spline); -__device__ __constant__ F_FLOAT* MY_AP(z2r_spline); -__device__ __constant__ F_FLOAT* MY_AP(rho); -__device__ __constant__ F_FLOAT* MY_AP(fp); +__device__ __constant__ F_CFLOAT* MY_AP(frho_spline); +__device__ __constant__ F_CFLOAT* MY_AP(rhor_spline); +__device__ __constant__ F_CFLOAT* MY_AP(z2r_spline); +__device__ __constant__ F_CFLOAT* MY_AP(rho); +__device__ __constant__ F_CFLOAT* MY_AP(fp); #define _rhor_spline_tex MY_AP(rhor_spline_tex) #if F_PRECISION == 1 @@ -115,10 +115,10 @@ inline void BindEAMTextures(cuda_shared_data* sdata) void Cuda_PairEAMCuda_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) { CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateBuffer failed"); - int3 layout = getgrid(sneighlist->inum, 7 * sizeof(F_FLOAT)); + int3 layout = getgrid(sneighlist->inum, 7 * sizeof(F_CFLOAT)); dim3 threads(layout.z, 1, 1); dim3 grid(layout.x, layout.y, 1); - int size = (unsigned)(layout.y * layout.x) * 7 * sizeof(F_FLOAT); + int size = (unsigned)(layout.y * layout.x) * 7 * sizeof(F_CFLOAT); if(sdata->buffersize < size) { MYDBG(printf("Cuda_PairEAMCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) @@ -151,13 +151,13 @@ void Cuda_PairEAMCuda_UpdateNeighbor(cuda_shared_data* sdata, cuda_shared_neighl void 
Cuda_PairEAMCuda_UpdateNmax(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) { CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateNmax failed"); - cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*)); - cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_CFLOAT4*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_CFLOAT*)); CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateNmax failed"); } @@ -175,18 +175,18 @@ void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata, double rdr, double rdrho, in "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=99 " "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2); - unsigned nI = sizeof(F_FLOAT) * cuda_ntypes * cuda_ntypes; + unsigned nI = sizeof(F_CFLOAT) * cuda_ntypes * cuda_ntypes; - X_FLOAT cutsq_global; - cutsq_global = (X_FLOAT)(sdata->pair.cut_global); - cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global , sizeof(X_FLOAT)); + X_CFLOAT cutsq_global; + cutsq_global = (X_CFLOAT)(sdata->pair.cut_global); + cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global , sizeof(X_CFLOAT)); - F_FLOAT* coeff_buf = new F_FLOAT[cuda_ntypes * cuda_ntypes]; + F_CFLOAT* coeff_buf = new F_CFLOAT[cuda_ntypes * cuda_ntypes]; for(int i = 0; i < cuda_ntypes; i++) coeff_buf[i] = type2frho[i]; - cudaMemcpyToSymbol(MY_AP(coeff1) , coeff_buf , cuda_ntypes * sizeof(F_FLOAT)); + cudaMemcpyToSymbol(MY_AP(coeff1) , coeff_buf , cuda_ntypes * sizeof(F_CFLOAT)); for(int i = 0; i < cuda_ntypes * cuda_ntypes; i++) coeff_buf[i] = (&type2rhor[0][0])[i]; @@ -197,34 +197,34 @@ void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata, double rdr, double rdrho, in cudaMemcpyToSymbol(MY_AP(coeff3) , coeff_buf , nI); delete [] coeff_buf; - X_FLOAT box_size[3] = { + X_CFLOAT box_size[3] = { sdata->domain.subhi[0] - sdata->domain.sublo[0], sdata->domain.subhi[1] - sdata->domain.sublo[1], sdata->domain.subhi[2] - sdata->domain.sublo[2] }; - F_FLOAT rdr_F = rdr; - F_FLOAT rdrho_F = rdrho; - cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3); + F_CFLOAT rdr_F = rdr; + F_CFLOAT rdrho_F = rdrho; + cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_CFLOAT) * 3); cudaMemcpyToSymbol(MY_AP(cuda_ntypes), & cuda_ntypes , sizeof(unsigned)); - cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity, sizeof(int) * 3); 
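/* Editor's note -- a minimal sketch, not part of the patch, of the cudaMemcpyToSymbol
   pattern this Init()/UpdateNmax() code relies on: the host copies a device pointer
   value (or a small scalar) into a __device__ __constant__ symbol so kernels can read
   it without extra launch arguments. The MY_AP expansion and the typedef below are
   assumptions modeled on how the symbols are used, not quotes from cuda_common.h. */
#include <cuda_runtime.h>
#define MY_AP(name) pair_eam_gm_##name            // hypothetical per-style name prefix
typedef double F_CFLOAT;                          // assumed: double-precision build
__device__ __constant__ F_CFLOAT* MY_AP(rho);     // constant-memory pointer, read as _rho in kernels
__device__ __constant__ F_CFLOAT  MY_AP(rdr);     // constant-memory scalar

void upload_eam_constants(F_CFLOAT* rho_dev, double rdr)
{
  F_CFLOAT rdr_F = rdr;                           // convert to the kernel's precision first, as the patch does
  cudaMemcpyToSymbol(MY_AP(rho), &rho_dev, sizeof(F_CFLOAT*)); // copies the pointer value itself
  cudaMemcpyToSymbol(MY_AP(rdr), &rdr_F, sizeof(F_CFLOAT));
}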
cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(rdr), &rdr_F, sizeof(F_FLOAT)); - cudaMemcpyToSymbol(MY_AP(rdrho), &rdrho_F, sizeof(F_FLOAT)); + cudaMemcpyToSymbol(MY_AP(rdr), &rdr_F, sizeof(F_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(rdrho), &rdrho_F, sizeof(F_CFLOAT)); cudaMemcpyToSymbol(MY_AP(nr), &nr, sizeof(int)); cudaMemcpyToSymbol(MY_AP(nrho), &nrho, sizeof(int)); cudaMemcpyToSymbol(MY_AP(nfrho), &nfrho, sizeof(int)); cudaMemcpyToSymbol(MY_AP(nrhor), &nrhor, sizeof(int)); - cudaMemcpyToSymbol(MY_AP(rho), &rho, sizeof(F_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(fp), &fp, sizeof(F_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(frho_spline), &frho_spline, sizeof(F_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(rhor_spline), &rhor_spline, sizeof(F_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(z2r_spline), &z2r_spline, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(rho), &rho, sizeof(F_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(fp), &fp, sizeof(F_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(frho_spline), &frho_spline, sizeof(F_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(rhor_spline), &rhor_spline, sizeof(F_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(z2r_spline), &z2r_spline, sizeof(F_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(nrhor), &nrhor, sizeof(int)); - rhor_spline_size = nrhor * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_FLOAT); - z2r_spline_size = nz2r * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_FLOAT); + rhor_spline_size = nrhor * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_CFLOAT); + z2r_spline_size = nz2r * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_CFLOAT); rhor_spline_pointer = rhor_spline; z2r_spline_pointer = z2r_spline; @@ -249,8 +249,8 @@ void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlis if(sdata->buffer_new) Cuda_PairEAMCuda_UpdateBuffer(sdata, sneighlist); - cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_CFLOAT*)); int sharedperproc = 0; @@ -258,7 +258,7 @@ void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlis if(vflag || vflag_atom) sharedperproc = 7; - int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT)); + int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_CFLOAT)); dim3 threads(layout.z, 1, 1); dim3 grid(layout.x, layout.y, 1); @@ -270,7 +270,7 @@ void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlis MYDBG(printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n", eflag, vflag);) CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 1 problems before kernel invocation"); - PairEAMCuda_Kernel1 <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom); + PairEAMCuda_Kernel1 <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 1 execution failed"); @@ -288,7 +288,7 @@ void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlis if(vflag || vflag_atom) sharedperproc = 7; - int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT)); + int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_CFLOAT)); dim3 
threads(layout.z, 1, 1); dim3 grid(layout.x, layout.y, 1); @@ -300,7 +300,7 @@ void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlis MYDBG(printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n", eflag, vflag);) CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 2 problems before kernel invocation"); - PairEAMCuda_Kernel2 <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom); + PairEAMCuda_Kernel2 <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom); CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 start failed"); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 execution failed"); @@ -310,7 +310,7 @@ void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlis grid.x = sharedperproc; grid.y = 1; threads.x = 256; - MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)*sharedperproc>>>(n); + MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)*sharedperproc>>>(n); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_PairEAMCuda: virial compute Kernel execution failed"); } @@ -324,19 +324,19 @@ void Cuda_PairEAMCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* int3 layout = getgrid(n, 0); dim3 threads(layout.z, 1, 1); dim3 grid(layout.x, layout.y, 1); - F_FLOAT* buf = (F_FLOAT*)(& ((double*)sdata->buffer)[eam_buff_offset]); + F_CFLOAT* buf = (F_CFLOAT*)(& ((double*)sdata->buffer)[eam_buff_offset]); PairEAMCuda_PackComm_Kernel <<< grid, threads, 0>>> ((int*) sdata->comm.sendlist.dev_data, n , sdata->comm.maxlistlength, iswap, buf); cudaThreadSynchronize(); - cudaMemcpy(buf_send, buf, n* sizeof(F_FLOAT), cudaMemcpyDeviceToHost); + cudaMemcpy(buf_send, buf, n* sizeof(F_CFLOAT), cudaMemcpyDeviceToHost); cudaThreadSynchronize(); } void Cuda_PairEAMCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, void* fp) { - F_FLOAT* fp_first = &(((F_FLOAT*) fp)[first]); - cudaMemcpy(fp_first, buf_recv, n * sizeof(F_FLOAT), cudaMemcpyHostToDevice); + F_CFLOAT* fp_first = &(((F_CFLOAT*) fp)[first]); + cudaMemcpy(fp_first, buf_recv, n * sizeof(F_CFLOAT), cudaMemcpyHostToDevice); } #undef _type2frho diff --git a/lib/cuda/pair_eam_cuda_kernel_nc.cu b/lib/cuda/pair_eam_cuda_kernel_nc.cu index 458945418a..9ec3ae2757 100644 --- a/lib/cuda/pair_eam_cuda_kernel_nc.cu +++ b/lib/cuda/pair_eam_cuda_kernel_nc.cu @@ -24,7 +24,7 @@ -static __device__ inline F_FLOAT4 fetchRhor(int i) +static __device__ inline F_CFLOAT4 fetchRhor(int i) { #ifdef CUDA_USE_TEXTURE #if F_PRECISION == 1 @@ -37,7 +37,7 @@ static __device__ inline F_FLOAT4 fetchRhor(int i) #endif } -static __device__ inline F_FLOAT4 fetchZ2r(int i) +static __device__ inline F_CFLOAT4 fetchZ2r(int i) { #ifdef CUDA_USE_TEXTURE #if F_PRECISION == 1 @@ -52,8 +52,8 @@ static __device__ inline F_FLOAT4 fetchZ2r(int i) __global__ void PairEAMCuda_Kernel1(int eflag, int vflag, int eflag_atom, int vflag_atom) { - ENERGY_FLOAT* sharedE; - ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; + ENERGY_CFLOAT* sharedE; + ENERGY_CFLOAT* sharedV = &sharedmem[threadIdx.x]; if(eflag || eflag_atom) { @@ -73,9 +73,9 @@ __global__ void PairEAMCuda_Kernel1(int eflag, int vflag, int eflag_atom, int vf int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - X_FLOAT xtmp, ytmp, ztmp; - X_FLOAT4 myxtype; - F_FLOAT delx, dely, delz; + X_CFLOAT xtmp, ytmp, ztmp; + X_CFLOAT4 myxtype; + F_CFLOAT 
delx, dely, delz; int itype; int i = _nlocal; int jnum = 0; @@ -109,17 +109,17 @@ __global__ void PairEAMCuda_Kernel1(int eflag, int vflag, int eflag_atom, int vf dely = ytmp - myxtype.y; delz = ztmp - myxtype.z; int jtype = static_cast<int> (myxtype.w); - const F_FLOAT rsq = delx * delx + dely * dely + delz * delz; + const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz; if(rsq < _cutsq_global) { - F_FLOAT p = sqrt(rsq) * _rdr + F_F(1.0); + F_CFLOAT p = sqrt(rsq) * _rdr + F_F(1.0); int m = static_cast<int>(p); m = MIN(m, _nr - 1); p -= m; p = MIN(p, F_F(1.0)); int k = (static_cast<int> (_type2rhor[jtype * _cuda_ntypes + itype]) * (_nr + 1) + m) * 2; - F_FLOAT4 c = fetchRhor(k + 1); + F_CFLOAT4 c = fetchRhor(k + 1); _rho[i] += ((c.w * p + c.x) * p + c.y) * p + c.z; } } @@ -127,12 +127,12 @@ __global__ void PairEAMCuda_Kernel1(int eflag, int vflag, int eflag_atom, int vf if(ii < _inum) { - F_FLOAT p = _rho[i] * _rdrho + F_F(1.0); + F_CFLOAT p = _rho[i] * _rdrho + F_F(1.0); int m = static_cast<int>(p); m = MAX(1, MIN(m, _nrho - 1)); p -= m; p = MIN(p, F_F(1.0)); - F_FLOAT* coeff = &_frho_spline[(static_cast<int> (_type2frho[itype]) * (_nrho + 1) + m) * EAM_COEFF_LENGTH]; + F_CFLOAT* coeff = &_frho_spline[(static_cast<int> (_type2frho[itype]) * (_nrho + 1) + m) * EAM_COEFF_LENGTH]; _fp[i] = (coeff[0] * p + coeff[1]) * p + coeff[2]; if(eflag || eflag_atom) { @@ -148,17 +148,17 @@ __global__ void PairEAMCuda_Kernel1(int eflag, int vflag, int eflag_atom, int vf _eatom[i] += sharedmem[threadIdx.x]; reduceBlock(sharedmem); - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer; buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(2.0) * sharedmem[0]; } } __global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vflag_atom) { - ENERGY_FLOAT evdwl = ENERGY_F(0.0); + ENERGY_CFLOAT evdwl = ENERGY_F(0.0); - ENERGY_FLOAT* sharedE; - ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; + ENERGY_CFLOAT* sharedE; + ENERGY_CFLOAT* sharedV = &sharedmem[threadIdx.x]; if(eflag || eflag_atom) { @@ -178,10 +178,10 @@ __global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vf int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - X_FLOAT xtmp, ytmp, ztmp; - X_FLOAT4 myxtype; - F_FLOAT fxtmp, fytmp, fztmp, fpair; - F_FLOAT delx, dely, delz; + X_CFLOAT xtmp, ytmp, ztmp; + X_CFLOAT4 myxtype; + F_CFLOAT fxtmp, fytmp, fztmp, fpair; + F_CFLOAT delx, dely, delz; int itype, i; int jnum = 0; int* jlist; @@ -206,7 +206,7 @@ __global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vf _rho[i] = F_F(0.0); } - if(ii < gridDim.x * gridDim.y) evdwl = ((ENERGY_FLOAT*) _buffer)[ii]; + if(ii < gridDim.x * gridDim.y) evdwl = ((ENERGY_CFLOAT*) _buffer)[ii]; __syncthreads(); @@ -219,35 +219,35 @@ __global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vf dely = ytmp - myxtype.y; delz = ztmp - myxtype.z; int jtype = static_cast<int> (myxtype.w); - const F_FLOAT rsq = delx * delx + dely * dely + delz * delz; + const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz; if(rsq < _cutsq_global) { - F_FLOAT r = _SQRT_(rsq); - F_FLOAT p = r * _rdr + F_F(1.0); + F_CFLOAT r = _SQRT_(rsq); + F_CFLOAT p = r * _rdr + F_F(1.0); int m = static_cast<int>(p); m = MIN(m, _nr - 1); p -= m; p = MIN(p, F_F(1.0)); int k = (static_cast<int> (_type2rhor[itype * _cuda_ntypes + jtype]) * (_nr + 1) + m) * 2; - F_FLOAT4 c = fetchRhor(k); - F_FLOAT rhoip = (c.x * p + c.y) * p + c.z; k = (static_cast<int> (_type2rhor[jtype * _cuda_ntypes + itype]) * (_nr + 1) + m) * 2; c = fetchRhor(k); - F_FLOAT rhojp = (c.x * p + c.y) * p + c.z; + F_CFLOAT rhojp = (c.x * p + c.y) * p + c.z; k = (static_cast<int> (_type2z2r[itype * _cuda_ntypes + jtype]) * (_nr + 1) + m) * 2; c = fetchZ2r(k); - F_FLOAT z2p = (c.x * p + c.y) * p + c.z; + F_CFLOAT z2p = (c.x * p + c.y) * p + c.z; c = fetchZ2r(k + 1); - F_FLOAT z2 = ((c.w * p + c.x) * p + c.y) * p + c.z; + F_CFLOAT z2 = ((c.w * p + c.x) * p + c.y) * p + c.z; - F_FLOAT recip = F_F(1.0) / r; - F_FLOAT phi = z2 * recip; - F_FLOAT phip = z2p * recip - phi * recip; - F_FLOAT psip = _fp[i] * rhojp + _fp[j] * rhoip + phip; + F_CFLOAT recip = F_F(1.0) / r; + F_CFLOAT phi = z2 * recip; + F_CFLOAT phip = z2p * recip - phi * recip; + F_CFLOAT psip = _fp[i] * rhojp + _fp[j] * rhoip + phip; fpair = -psip * recip; - F_FLOAT dxfp, dyfp, dzfp; + F_CFLOAT dxfp, dyfp, dzfp; fxtmp += dxfp = delx * fpair; fytmp += dyfp = dely * fpair; fztmp += dzfp = delz * fpair; @@ -268,10 +268,10 @@ __global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vf __syncthreads(); if(ii < _inum) { - F_FLOAT* my_f; + F_CFLOAT* my_f; if(_collect_forces_later) { - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer; if(eflag) { buffer = &buffer[1 * gridDim.x * gridDim.y]; @@ -281,7 +281,7 @@ __global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vf buffer = &buffer[6 * gridDim.x * gridDim.y]; } - my_f = (F_FLOAT*) buffer; + my_f = (F_CFLOAT*) buffer; my_f += i; *my_f = fxtmp; my_f += _nmax; @@ -320,7 +320,7 @@ __global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vf if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, 0); } -__global__ void PairEAMCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, F_FLOAT* buffer) +__global__ void PairEAMCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, F_CFLOAT* buffer) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; int* list = sendlist + iswap * maxlistlength; @@ -331,7 +331,7 @@ __global__ void PairEAMCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlen } } -__global__ void PairEAMCuda_UnpackComm_Kernel(int n, int first, F_FLOAT* buffer) +__global__ void PairEAMCuda_UnpackComm_Kernel(int n, int first, F_CFLOAT* buffer) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; diff --git a/lib/cuda/pair_gran_hooke_cuda.cu b/lib/cuda/pair_gran_hooke_cuda.cu index 5c143240cb..a3cadf5b19 100644 --- a/lib/cuda/pair_gran_hooke_cuda.cu +++ b/lib/cuda/pair_gran_hooke_cuda.cu @@ -37,10 +37,10 @@ void Cuda_PairGranHookeCuda_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) { CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: before updateBuffer failed"); - int3 layout = getgrid(sneighlist->inum, 7 * sizeof(ENERGY_FLOAT)); + int3 layout = getgrid(sneighlist->inum, 7 * sizeof(ENERGY_CFLOAT)); dim3 threads(layout.z, 1, 1); dim3 grid(layout.x, layout.y, 1); - int size = (unsigned)(layout.y * layout.x) * 7 * sizeof(ENERGY_FLOAT); + int size = (unsigned)(layout.y * layout.x) * 7 * sizeof(ENERGY_CFLOAT); if(sdata->buffersize < size) { MYDBG(printf("Cuda_PairGranHookeCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) @@ -72,15 +72,15 @@ void Cuda_PairGranHookeCuda_UpdateNmax(cuda_shared_data* sdata, cuda_shared_neig cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
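/* Editor's note -- a minimal sketch, not part of the patch, of the two addressing
   conventions visible in the kernels above: per-atom vectors are stored structure-of-
   arrays (component c of atom i sits at i + c*nmax) and a 2D grid is flattened into a
   single thread index. The typedefs are assumptions for a single-precision build. */
#include <cuda_runtime.h>
typedef float  F_CFLOAT;
typedef float4 X_CFLOAT4;   // assumed: x,y,z position plus the atom type stored in .w (cf. myxtype.w)

__global__ void scale_forces(F_CFLOAT* f, const X_CFLOAT4* x_type, int nmax, int nlocal, F_CFLOAT s)
{
  // the same flattened-grid index expression used throughout the patch
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < nlocal) {
    int itype = static_cast<int>(x_type[i].w);    // one coalesced load yields position and type
    (void) itype;                                 // a real kernel would index per-type tables here
    f[i]            *= s;                         // fx
    f[i + nmax]     *= s;                         // fy -- strided by nmax for coalesced access
    f[i + 2 * nmax] *= s;                         // fz
  }
}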
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*)); - cudaMemcpyToSymbol(MY_AP(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_FLOAT4*)); - cudaMemcpyToSymbol(MY_AP(omega_rmass), & sdata->atom.omega_rmass.dev_data, sizeof(V_FLOAT4*)); - cudaMemcpyToSymbol(MY_AP(torque) , & sdata->atom.torque .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_CFLOAT4*)); + cudaMemcpyToSymbol(MY_AP(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_CFLOAT4*)); + cudaMemcpyToSymbol(MY_AP(omega_rmass), & sdata->atom.omega_rmass.dev_data, sizeof(V_CFLOAT4*)); + cudaMemcpyToSymbol(MY_AP(torque) , & sdata->atom.torque .dev_data, sizeof(F_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(maxneighbors), &sneighlist->maxneighbors , sizeof(int)); - cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(debugdata) , & sdata->debugdata , sizeof(int*)); cudaMemcpyToSymbol(MY_AP(freeze_group_bit) , & sdata->pair.freeze_group_bit, sizeof(int)); @@ -101,32 +101,32 @@ void Cuda_PairGranHookeCuda_Init(cuda_shared_data* sdata) "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES_PLUS_ONE - 1); unsigned cuda_ntypes2 = cuda_ntypes * cuda_ntypes; - unsigned n = sizeof(F_FLOAT) * cuda_ntypes2; + unsigned n = sizeof(F_CFLOAT) * cuda_ntypes2; - F_FLOAT coeffs1[cuda_ntypes2]; - coeffs1[0] = (F_FLOAT) sdata->pair.coeff1[0][0]; - coeffs1[1] = (F_FLOAT) sdata->pair.coeff1[0][1]; - coeffs1[2] = (F_FLOAT) sdata->pair.coeff1[1][0]; - F_FLOAT coeffs3[cuda_ntypes2]; - coeffs3[0] = (F_FLOAT) sdata->pair.coeff1[1][1]; - F_FLOAT coeffs2[cuda_ntypes2]; - coeffs2[0] = (F_FLOAT) sdata->pair.coeff2[0][0]; - coeffs2[1] = (F_FLOAT) sdata->pair.coeff2[0][1]; + F_CFLOAT coeffs1[cuda_ntypes2]; + coeffs1[0] = (F_CFLOAT) sdata->pair.coeff1[0][0]; + coeffs1[1] = (F_CFLOAT) sdata->pair.coeff1[0][1]; + coeffs1[2] = (F_CFLOAT) sdata->pair.coeff1[1][0]; + F_CFLOAT coeffs3[cuda_ntypes2]; + coeffs3[0] = (F_CFLOAT) sdata->pair.coeff1[1][1]; + F_CFLOAT coeffs2[cuda_ntypes2]; + coeffs2[0] = (F_CFLOAT) sdata->pair.coeff2[0][0]; + coeffs2[1] = (F_CFLOAT) sdata->pair.coeff2[0][1]; - X_FLOAT box_size[3] = { + X_CFLOAT box_size[3] = { sdata->domain.subhi[0] - sdata->domain.sublo[0], sdata->domain.subhi[1] - sdata->domain.sublo[1], sdata->domain.subhi[2] - sdata->domain.sublo[2] }; //printf("n: %i %i\n",n,CUDA_MAX_TYPES2); - cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3); + cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_CFLOAT) * 3); cudaMemcpyToSymbol(MY_AP(cuda_ntypes), & cuda_ntypes , sizeof(unsigned)); cudaMemcpyToSymbol(MY_AP(coeff1) , coeffs1 , n); cudaMemcpyToSymbol(MY_AP(coeff2) , coeffs2 , n); cudaMemcpyToSymbol(MY_AP(coeff3) , coeffs3 , n); - 
cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity, sizeof(int) * 3); CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: init failed"); } @@ -156,7 +156,7 @@ void Cuda_PairGranHookeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* snei if(vflag) sharedperproc += 6; - int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT), 128); + int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_CFLOAT), 128); dim3 threads(layout.z, 1, 1); dim3 grid(layout.x, layout.y, 1); @@ -168,11 +168,11 @@ void Cuda_PairGranHookeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* snei Cuda_PairGranHookeCuda_Init(sdata); } - MYDBG(printf("# CUDA: Cuda_PairGranHookeCuda: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);) + MYDBG(printf("# CUDA: Cuda_PairGranHookeCuda: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_CFLOAT)*threads.x);) CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) pre pair lj cut Kernel problems before kernel invocation"); - PairGranHookeCuda_Kernel <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom, (int**)sneighlist->firstneigh.dev_data, sneighlist->binned_id - , (F_FLOAT) sdata->pair.coeff1[0][0], (F_FLOAT) sdata->pair.coeff1[1][0], (F_FLOAT) sdata->pair.coeff1[1][1], (F_FLOAT) sdata->pair.coeff2[0][0]); + PairGranHookeCuda_Kernel <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom, (int**)sneighlist->firstneigh.dev_data, sneighlist->binned_id + , (F_CFLOAT) sdata->pair.coeff1[0][0], (F_CFLOAT) sdata->pair.coeff1[1][0], (F_CFLOAT) sdata->pair.coeff1[1][1], (F_CFLOAT) sdata->pair.coeff2[0][0]); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) pair lj cut Kernel execution failed"); @@ -181,7 +181,7 @@ void Cuda_PairGranHookeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* snei grid.x = sharedperproc; grid.y = 1; threads.x = 256; - MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n); + MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>>(n); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) virial compute Kernel execution failed"); } diff --git a/lib/cuda/pair_gran_hooke_cuda_kernel_nc.cu b/lib/cuda/pair_gran_hooke_cuda_kernel_nc.cu index e6a4ed2b8a..7b4f752461 100644 --- a/lib/cuda/pair_gran_hooke_cuda_kernel_nc.cu +++ b/lib/cuda/pair_gran_hooke_cuda_kernel_nc.cu @@ -23,12 +23,12 @@ __global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, int vflag_atom, int** firstneight, int* binned_id - , F_FLOAT kn, F_FLOAT gamman, F_FLOAT gammat, F_FLOAT xmu) + , F_CFLOAT kn, F_CFLOAT gamman, F_CFLOAT gammat, F_CFLOAT xmu) { - ENERGY_FLOAT evdwl = ENERGY_F(0.0); + ENERGY_CFLOAT evdwl = ENERGY_F(0.0); - ENERGY_FLOAT* sharedE; - ENERGY_FLOAT* sharedV; + ENERGY_CFLOAT* sharedE; + ENERGY_CFLOAT* sharedV; if(eflag || eflag_atom) { sharedE = 
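The launch configuration above is the pattern every pair style in this patch follows: sharedperproc counts the quantities reduced per block (one energy slot when eflag is set, six more virial slots when vflag is), the dynamic shared-memory size is sharedperproc * sizeof(ENERGY_CFLOAT) * threads.x, and a second kernel, PairVirialCompute_reduce, folds the per-block partial sums. A minimal sketch of the per-block half, assuming a power-of-two blockDim.x (128 and 256 are used above); illustrative, not the actual reduction kernel:

  extern __shared__ ENERGY_CFLOAT sharedmem[];   // sized at launch time

  // Reduce one quantity's blockDim.x slots down to slot[0] (illustrative).
  __device__ void reduce_one_quantity(ENERGY_CFLOAT* slot)
  {
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
      if (threadIdx.x < s) slot[threadIdx.x] += slot[threadIdx.x + s];
      __syncthreads();
    }
    // slot[0] now holds this block's partial sum; a follow-up kernel in the
    // style of PairVirialCompute_reduce sums the partials across blocks.
  }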
&sharedmem[threadIdx.x]; @@ -51,18 +51,18 @@ __global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, i MYEMUDBG(if(ii == 0) printf("# CUDA: PairGranHookeCuda_Kernel: -- no binning --\n");) - X_FLOAT xtmp, ytmp, ztmp; + X_CFLOAT xtmp, ytmp, ztmp; - X_FLOAT4 myxtype; - V_FLOAT4 myvradius, ovradius; - F_FLOAT fxtmp, fytmp, fztmp, torquextmp, torqueytmp, torqueztmp; - F_FLOAT delx, dely, delz; - F_FLOAT radi, radj, radsum, r, rsqinv; - F_FLOAT vr1, vr2, vr3, vnnr, vn1, vn2, vn3, vt1, vt2, vt3; - F_FLOAT wr1, wr2, wr3; - F_FLOAT vtr1, vtr2, vtr3, vrel; - F_FLOAT meff, damp, ccel, tor1, tor2, tor3; - F_FLOAT fn, fs, ft, fs1, fs2, fs3; + X_CFLOAT4 myxtype; + V_CFLOAT4 myvradius, ovradius; + F_CFLOAT fxtmp, fytmp, fztmp, torquextmp, torqueytmp, torqueztmp; + F_CFLOAT delx, dely, delz; + F_CFLOAT radi, radj, radsum, r, rsqinv; + F_CFLOAT vr1, vr2, vr3, vnnr, vn1, vn2, vn3, vt1, vt2, vt3; + F_CFLOAT wr1, wr2, wr3; + F_CFLOAT vtr1, vtr2, vtr3, vrel; + F_CFLOAT meff, damp, ccel, tor1, tor2, tor3; + F_CFLOAT fn, fs, ft, fs1, fs2, fs3; int jnum = 0; int i, j; @@ -108,10 +108,10 @@ __global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, i radj = ovradius.w; radsum = radi + radj; - const F_FLOAT rsq = delx * delx + dely * dely + delz * delz; + const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz; if(rsq < radsum * radsum) { - const F_FLOAT rinv = _RSQRT_(rsq); + const F_CFLOAT rinv = _RSQRT_(rsq); r = F_F(1.0) / rinv; rsqinv = F_F(1.0) / rsq; @@ -135,8 +135,8 @@ __global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, i vt3 = vr3 - vn3; // relative rotational velocity - V_FLOAT4 omegarmass_i = fetchOmegaRmass(i); - V_FLOAT4 omegarmass_j = fetchOmegaRmass(j); + V_CFLOAT4 omegarmass_i = fetchOmegaRmass(i); + V_CFLOAT4 omegarmass_j = fetchOmegaRmass(j); wr1 = (radi * omegarmass_i.x + radj * omegarmass_j.x) * rinv; wr2 = (radi * omegarmass_i.y + radj * omegarmass_j.y) * rinv; @@ -165,7 +165,7 @@ __global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, i fs2 = -ft * vtr2; fs3 = -ft * vtr3; - F_FLOAT dxfp, dyfp, dzfp; + F_CFLOAT dxfp, dyfp, dzfp; fxtmp += dxfp = delx * ccel + fs1; fytmp += dyfp = dely * ccel + fs2; fztmp += dzfp = delz * ccel + fs3; @@ -194,13 +194,13 @@ __global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, i __syncthreads(); if(ii < _inum) { - F_FLOAT* my_f = _f + i; + F_CFLOAT* my_f = _f + i; *my_f += fxtmp; my_f += _nmax; *my_f += fytmp; my_f += _nmax; *my_f += fztmp; - F_FLOAT* my_torque = _torque + i; + F_CFLOAT* my_torque = _torque + i; *my_torque += torquextmp; my_torque += _nmax; *my_torque += torqueytmp; diff --git a/lib/cuda/pair_lj96_cut_cuda.cu b/lib/cuda/pair_lj96_cut_cuda.cu index 1d40a3c82e..13f3738b18 100644 --- a/lib/cuda/pair_lj96_cut_cuda.cu +++ b/lib/cuda/pair_lj96_cut_cuda.cu @@ -63,10 +63,10 @@ void Cuda_PairLJ96CutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneigh if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); 
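The force and torque write-back above (*my_f += fxtmp; my_f += _nmax; ...) relies on the structure-of-arrays layout used on the device: _f is a single flat array of length 3 * _nmax holding every fx, then every fy, then every fz, so component c of atom i sits at _f[c * _nmax + i] and advancing the pointer by _nmax hops between components of the same atom. Consecutive threads (consecutive i) then touch consecutive addresses in each component plane, keeping the writes coalesced. A minimal restatement of the idiom:

  // Structure-of-arrays accumulate, as in the write-back above (illustrative).
  __device__ inline void soa_add3(F_CFLOAT* base, int nmax, int i,
                                  F_CFLOAT vx, F_CFLOAT vy, F_CFLOAT vz)
  {
    F_CFLOAT* p = base + i;   // x-component plane
    *p += vx;  p += nmax;     // hop to the y plane, same atom index
    *p += vy;  p += nmax;     // hop to the z plane
    *p += vz;
  }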
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_lj96_cut_cuda_kernel_nc.cu b/lib/cuda/pair_lj96_cut_cuda_kernel_nc.cu index f3c2477be6..330f90336e 100644 --- a/lib/cuda/pair_lj96_cut_cuda_kernel_nc.cu +++ b/lib/cuda/pair_lj96_cut_cuda_kernel_nc.cu @@ -21,12 +21,12 @@ This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ -__device__ inline F_FLOAT PairLJ96CutCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) +__device__ inline F_CFLOAT PairLJ96CutCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl) { - const F_FLOAT r2inv = F_F(1.0) / rsq; - const F_FLOAT r6inv = r2inv * r2inv * r2inv; - const F_FLOAT r3inv = _SQRT_(r6inv); - const F_FLOAT forcelj = r6inv * (_lj1[ij_type] * r3inv - _lj2[ij_type]); + const F_CFLOAT r2inv = F_F(1.0) / rsq; + const F_CFLOAT r6inv = r2inv * r2inv * r2inv; + const F_CFLOAT r3inv = _SQRT_(r6inv); + const F_CFLOAT forcelj = r6inv * (_lj1[ij_type] * r3inv - _lj2[ij_type]); if(eflag) evdwl += factor_lj * (r6inv * (_lj3[ij_type] * r3inv - _lj4[ij_type]) - _offset[ij_type]); diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_cuda.cu b/lib/cuda/pair_lj_charmm_coul_charmm_cuda.cu index 752f3bd47d..0a9ad068cd 100644 --- a/lib/cuda/pair_lj_charmm_coul_charmm_cuda.cu +++ b/lib/cuda/pair_lj_charmm_coul_charmm_cuda.cu @@ -33,12 +33,12 @@ #include -void Cuda_PairLJCharmmCoulCharmmCuda_Init(cuda_shared_data* sdata, F_FLOAT cut_coul_innersq, F_FLOAT denom_lj_inv, F_FLOAT denom_coul_inv) +void Cuda_PairLJCharmmCoulCharmmCuda_Init(cuda_shared_data* sdata, F_CFLOAT cut_coul_innersq, F_CFLOAT denom_lj_inv, F_CFLOAT denom_coul_inv) { Cuda_Pair_Init_AllStyles(sdata, 4, true, true, true); - cudaMemcpyToSymbol(MY_AP(cut_coul_innersq_global) , &cut_coul_innersq , sizeof(F_FLOAT)); - cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv , sizeof(F_FLOAT)); - cudaMemcpyToSymbol(MY_AP(denom_coul_inv) , &denom_coul_inv , sizeof(F_FLOAT)); + cudaMemcpyToSymbol(MY_AP(cut_coul_innersq_global) , &cut_coul_innersq , sizeof(F_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv , sizeof(F_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(denom_coul_inv) , &denom_coul_inv , sizeof(F_CFLOAT)); return; } @@ -46,7 +46,7 @@ void Cuda_PairLJCharmmCoulCharmmCuda_Init(cuda_shared_data* sdata, F_FLOAT cut_c void Cuda_PairLJCharmmCoulCharmmCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, - int eflag_atom, int vflag_atom, F_FLOAT denom_lj, F_FLOAT cut_coul_innersq, F_FLOAT denom_coul) + int eflag_atom, int vflag_atom, F_CFLOAT denom_lj, F_CFLOAT cut_coul_innersq, F_CFLOAT denom_coul) { static short init = 0; @@ -65,10 +65,10 @@ void Cuda_PairLJCharmmCoulCharmmCuda(cuda_shared_data* sdata, cuda_shared_neighl if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, 
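PairLJ96CutCuda_Eval above shows the contract every *_Eval function in this patch satisfies: it receives the squared distance and the type-pair index, optionally accumulates per-pair energy into evdwl, and returns the scalar prefactor fpair such that the pair force on atom i is fpair * (delx, dely, delz); the generic Pair_Kernel_TpA/BpA kernels own the neighbor loop and apply that prefactor. A toy version of both halves with hypothetical names (ToyLJEval and toy_pair_accumulate are not LAMMPS functions):

  // Hypothetical style-specific half: returns fpair = -(dE/dr)/r for 12-6 LJ.
  __device__ inline F_CFLOAT ToyLJEval(const F_CFLOAT rsq,
                                       const F_CFLOAT lj1, const F_CFLOAT lj2)
  {
    const F_CFLOAT r2inv = F_F(1.0) / rsq;
    const F_CFLOAT r6inv = r2inv * r2inv * r2inv;
    return r6inv * (lj1 * r6inv - lj2) * r2inv;
  }

  // Hypothetical generic half: what the templated neighbor-loop kernels do
  // with the returned prefactor.
  __device__ inline void toy_pair_accumulate(F_CFLOAT delx, F_CFLOAT dely,
                                             F_CFLOAT delz, F_CFLOAT cutsq,
                                             F_CFLOAT lj1, F_CFLOAT lj2,
                                             F_CFLOAT& fx, F_CFLOAT& fy,
                                             F_CFLOAT& fz)
  {
    const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
    if (rsq < cutsq) {
      const F_CFLOAT fpair = ToyLJEval(rsq, lj1, lj2);
      fx += delx * fpair;  fy += dely * fpair;  fz += delz * fpair;
    }
  }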
vflag); } diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_cuda_cu.h b/lib/cuda/pair_lj_charmm_coul_charmm_cuda_cu.h index e44d2941f2..9e6fb21943 100644 --- a/lib/cuda/pair_lj_charmm_coul_charmm_cuda_cu.h +++ b/lib/cuda/pair_lj_charmm_coul_charmm_cuda_cu.h @@ -23,4 +23,4 @@ #include "cuda_shared.h" -extern "C" void Cuda_PairLJCharmmCoulCharmmCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_FLOAT denom_lj, F_FLOAT cut_coul_innersq, F_FLOAT denom_coul); +extern "C" void Cuda_PairLJCharmmCoulCharmmCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_CFLOAT denom_lj, F_CFLOAT cut_coul_innersq, F_CFLOAT denom_coul); diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu b/lib/cuda/pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu index d4ed2f48af..d741e7ac9d 100644 --- a/lib/cuda/pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu +++ b/lib/cuda/pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu @@ -20,24 +20,24 @@ This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ -__device__ inline F_FLOAT PairLJCharmmCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) +__device__ inline F_CFLOAT PairLJCharmmCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl) { - const F_FLOAT r2inv = F_F(1.0) / rsq; - const F_FLOAT r6inv = r2inv * r2inv * r2inv; - F_FLOAT forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]); - F_FLOAT philj, switch1; + const F_CFLOAT r2inv = F_F(1.0) / rsq; + const F_CFLOAT r6inv = r2inv * r2inv * r2inv; + F_CFLOAT forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]); + F_CFLOAT philj, switch1; if(rsq > _cut_innersq_global) { switch1 = (_cutsq_global - rsq) * (_cutsq_global - rsq) * (_cutsq_global + F_F(2.0) * rsq - F_F(3.0) * _cut_innersq_global) * _denom_lj_inv; - const F_FLOAT switch2 = F_F(12.0) * rsq * (_cutsq_global - rsq) * + const F_CFLOAT switch2 = F_F(12.0) * rsq * (_cutsq_global - rsq) * (rsq - _cut_innersq_global) * _denom_lj_inv; philj = r6inv * (_lj3[ij_type] * r6inv - _lj4[ij_type]); forcelj = forcelj * switch1 + philj * switch2; } if(eflag) { - ENERGY_FLOAT evdwl_tmp = factor_lj; + ENERGY_CFLOAT evdwl_tmp = factor_lj; if(rsq > _cut_innersq_global) { evdwl_tmp *= philj * switch1; @@ -50,16 +50,16 @@ __device__ inline F_FLOAT PairLJCharmmCuda_Eval(const F_FLOAT &rsq, const int ij return factor_lj * forcelj * r2inv; } -__device__ inline F_FLOAT CoulCharmmCuda_Eval(const F_FLOAT &rsq, F_FLOAT &factor_coul, int &eflag, ENERGY_FLOAT &ecoul, F_FLOAT qij) +__device__ inline F_CFLOAT CoulCharmmCuda_Eval(const F_CFLOAT &rsq, F_CFLOAT &factor_coul, int &eflag, ENERGY_CFLOAT &ecoul, F_CFLOAT qij) { - F_FLOAT forcecoul; - ENERGY_FLOAT ecoul_tmp = forcecoul = _qqrd2e * qij * _RSQRT_(rsq) * factor_coul; + F_CFLOAT forcecoul; + ENERGY_CFLOAT ecoul_tmp = forcecoul = _qqrd2e * qij * _RSQRT_(rsq) * factor_coul; if(rsq > _cut_coul_innersq_global) { - const F_FLOAT switch1 = (_cut_coulsq_global - rsq) * (_cut_coulsq_global - rsq) * + const F_CFLOAT switch1 = (_cut_coulsq_global - rsq) * (_cut_coulsq_global - rsq) * (_cut_coulsq_global + F_F(2.0) * rsq - F_F(3.0) * _cut_coul_innersq_global) * _denom_coul_inv; ecoul_tmp *= switch1; - const F_FLOAT switch2 = F_F(12.0) * rsq * (_cut_coulsq_global - rsq) * + const F_CFLOAT switch2 = F_F(12.0) * rsq * 
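The _denom_lj_inv and _denom_coul_inv constants multiplied into switch1 and switch2 above are inverses precomputed on the host so the kernel never divides: for the standard CHARMM switching function the denominator is (rc^2 - rin^2)^3. A sketch of the host-side setup this implies, hedged since it is inferred from the kernel expressions rather than shown in this patch:

  // Host-side precomputation implied by the switch1/switch2 expressions above
  // (illustrative; cut and cut_inner are the outer and inner switching radii).
  static double charmm_denom_inv(double cut, double cut_inner)
  {
    const double d = cut * cut - cut_inner * cut_inner;   // rc^2 - rin^2
    return 1.0 / (d * d * d);  // kernel multiplies by this instead of dividing
  }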
(_cut_coulsq_global - rsq) * (rsq - _cut_coul_innersq_global) * _denom_coul_inv; forcecoul *= switch1 + switch2; } diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda.cu b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda.cu index 31df02b2ef..a960947f5d 100644 --- a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda.cu +++ b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda.cu @@ -30,9 +30,9 @@ #define _cut_coul_innersq_global MY_AP(cut_coul_innersq_global) #define _denom_lj_inv MY_AP(denom_lj_inv) #define _denom_coul_inv MY_AP(denom_coul_inv) -__device__ __constant__ F_FLOAT _cut_coul_innersq_global; -__device__ __constant__ F_FLOAT _denom_lj_inv; -__device__ __constant__ F_FLOAT _denom_coul_inv; +__device__ __constant__ F_CFLOAT _cut_coul_innersq_global; +__device__ __constant__ F_CFLOAT _denom_lj_inv; +__device__ __constant__ F_CFLOAT _denom_coul_inv; #include "pair_lj_charmm_coul_charmm_implicit_cuda_cu.h" @@ -40,12 +40,12 @@ __device__ __constant__ F_FLOAT _denom_coul_inv; #include -void Cuda_PairLJCharmmCoulCharmmImplicitCuda_Init(cuda_shared_data* sdata, F_FLOAT cut_coul_innersq, F_FLOAT denom_lj_inv, F_FLOAT denom_coul_inv) +void Cuda_PairLJCharmmCoulCharmmImplicitCuda_Init(cuda_shared_data* sdata, F_CFLOAT cut_coul_innersq, F_CFLOAT denom_lj_inv, F_CFLOAT denom_coul_inv) { Cuda_Pair_Init_AllStyles(sdata, 4, true, true, true); - cudaMemcpyToSymbol(MY_AP(cut_coul_innersq_global) , &cut_coul_innersq , sizeof(F_FLOAT)); - cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv , sizeof(F_FLOAT)); - cudaMemcpyToSymbol(MY_AP(denom_coul_inv) , &denom_coul_inv , sizeof(F_FLOAT)); + cudaMemcpyToSymbol(MY_AP(cut_coul_innersq_global) , &cut_coul_innersq , sizeof(F_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv , sizeof(F_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(denom_coul_inv) , &denom_coul_inv , sizeof(F_CFLOAT)); return; } @@ -53,7 +53,7 @@ void Cuda_PairLJCharmmCoulCharmmImplicitCuda_Init(cuda_shared_data* sdata, F_FLO void Cuda_PairLJCharmmCoulCharmmImplicitCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, - int eflag_atom, int vflag_atom, F_FLOAT denom_lj, F_FLOAT cut_coul_innersq, F_FLOAT denom_coul) + int eflag_atom, int vflag_atom, F_CFLOAT denom_lj, F_CFLOAT cut_coul_innersq, F_CFLOAT denom_coul) { static short init = 0; @@ -72,10 +72,10 @@ void Cuda_PairLJCharmmCoulCharmmImplicitCuda(cuda_shared_data* sdata, cuda_share if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_cu.h b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_cu.h index c410906957..b76b08075d 100644 --- a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_cu.h +++ b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_cu.h @@ -23,4 +23,4 @@ #include "cuda_shared.h" -extern "C" void Cuda_PairLJCharmmCoulCharmmImplicitCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int 
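The #define / __device__ __constant__ pairs above are the per-style symbol-namespacing idiom used across lib/cuda: each pair style is compiled as its own translation unit, so MY_AP() prefixes every constant-memory symbol with a style-specific name, and the host side of cudaMemcpyToSymbol uses the same macro so the two always agree. A sketch with an assumed expansion (the real MY_AP is defined elsewhere in the package):

  // Assumed expansion of the namespacing macro (illustrative):
  #define MY_AP(name) pair_lj_charmm_coul_charmm_implicit_cuda_##name

  #define _denom_lj_inv MY_AP(denom_lj_inv)         // short alias for kernels
  __device__ __constant__ F_CFLOAT _denom_lj_inv;   // one symbol per style

  // Host side uses the same macro, so the names can never drift apart:
  //   cudaMemcpyToSymbol(MY_AP(denom_lj_inv), &denom_lj_inv, sizeof(F_CFLOAT));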
vflag_atom, F_FLOAT denom_lj, F_FLOAT cut_coul_innersq, F_FLOAT denom_coul); +extern "C" void Cuda_PairLJCharmmCoulCharmmImplicitCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_CFLOAT denom_lj, F_CFLOAT cut_coul_innersq, F_CFLOAT denom_coul); diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu index 6a20b8626a..53dec1acef 100644 --- a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu +++ b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu @@ -21,16 +21,16 @@ This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ -__device__ inline F_FLOAT CoulCharmmImplicitCuda_Eval(const F_FLOAT &rsq, F_FLOAT &factor_coul, int &eflag, ENERGY_FLOAT &ecoul, F_FLOAT qij) +__device__ inline F_CFLOAT CoulCharmmImplicitCuda_Eval(const F_CFLOAT &rsq, F_CFLOAT &factor_coul, int &eflag, ENERGY_CFLOAT &ecoul, F_CFLOAT qij) { - F_FLOAT forcecoul; - ENERGY_FLOAT ecoul_tmp = forcecoul = _qqrd2e * qij * (F_F(1.0) / rsq) * factor_coul; + F_CFLOAT forcecoul; + ENERGY_CFLOAT ecoul_tmp = forcecoul = _qqrd2e * qij * (F_F(1.0) / rsq) * factor_coul; if(rsq > _cut_coul_innersq_global) { - const F_FLOAT switch1 = (_cut_coulsq_global - rsq) * (_cut_coulsq_global - rsq) * + const F_CFLOAT switch1 = (_cut_coulsq_global - rsq) * (_cut_coulsq_global - rsq) * (_cut_coulsq_global + F_F(2.0) * rsq - F_F(3.0) * _cut_coul_innersq_global) * _denom_coul_inv; ecoul_tmp *= switch1; - const F_FLOAT switch2 = F_F(12.0) * rsq * (_cut_coulsq_global - rsq) * + const F_CFLOAT switch2 = F_F(12.0) * rsq * (_cut_coulsq_global - rsq) * (rsq - _cut_coul_innersq_global) * _denom_coul_inv; forcecoul *= (switch1 + switch2); } diff --git a/lib/cuda/pair_lj_charmm_coul_long_cuda.cu b/lib/cuda/pair_lj_charmm_coul_long_cuda.cu index 0096f7757e..b7783cbc99 100644 --- a/lib/cuda/pair_lj_charmm_coul_long_cuda.cu +++ b/lib/cuda/pair_lj_charmm_coul_long_cuda.cu @@ -32,10 +32,10 @@ #include -void Cuda_PairLJCharmmCoulLongCuda_Init(cuda_shared_data* sdata, F_FLOAT denom_lj_inv) +void Cuda_PairLJCharmmCoulLongCuda_Init(cuda_shared_data* sdata, F_CFLOAT denom_lj_inv) { Cuda_Pair_Init_AllStyles(sdata, 4, true, true, true); - cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv , sizeof(F_FLOAT)); + cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv , sizeof(F_CFLOAT)); return; } @@ -43,7 +43,7 @@ void Cuda_PairLJCharmmCoulLongCuda_Init(cuda_shared_data* sdata, F_FLOAT denom_l void Cuda_PairLJCharmmCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, - int eflag_atom, int vflag_atom, F_FLOAT denom_lj) + int eflag_atom, int vflag_atom, F_CFLOAT denom_lj) { static short init = 0; @@ -62,10 +62,10 @@ void Cuda_PairLJCharmmCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlis if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); 
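CoulCharmmImplicitCuda_Eval above differs from the explicit-solvent version in exactly one factor: F_F(1.0) / rsq replaces _RSQRT_(rsq). That is the implicit-solvent model with a distance-dependent dielectric (eps proportional to r), which turns the pair energy from q_i q_j / r into q_i q_j / r^2. A minimal restatement:

  // Implicit-solvent Coulomb factor (illustrative): with eps ~ r the energy
  // is qqrd2e * qi * qj / r^2, hence 1/rsq instead of rsqrt(rsq).
  __device__ inline F_CFLOAT coul_implicit(F_CFLOAT qqrd2e, F_CFLOAT qij,
                                           F_CFLOAT rsq, F_CFLOAT factor_coul)
  {
    return qqrd2e * qij * (F_F(1.0) / rsq) * factor_coul;
  }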
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_lj_charmm_coul_long_cuda_cu.h b/lib/cuda/pair_lj_charmm_coul_long_cuda_cu.h index 34b0b722ef..b8f5592ff4 100644 --- a/lib/cuda/pair_lj_charmm_coul_long_cuda_cu.h +++ b/lib/cuda/pair_lj_charmm_coul_long_cuda_cu.h @@ -23,4 +23,4 @@ #include "cuda_shared.h" -extern "C" void Cuda_PairLJCharmmCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_FLOAT denom_lj); +extern "C" void Cuda_PairLJCharmmCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_CFLOAT denom_lj); diff --git a/lib/cuda/pair_lj_class2_coul_cut_cuda.cu b/lib/cuda/pair_lj_class2_coul_cut_cuda.cu index f09a480534..b1be3e132f 100644 --- a/lib/cuda/pair_lj_class2_coul_cut_cuda.cu +++ b/lib/cuda/pair_lj_class2_coul_cut_cuda.cu @@ -58,10 +58,10 @@ void Cuda_PairLJClass2CoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_lj_class2_coul_long_cuda.cu b/lib/cuda/pair_lj_class2_coul_long_cuda.cu index f20c74c33a..4d35089024 100644 --- a/lib/cuda/pair_lj_class2_coul_long_cuda.cu +++ b/lib/cuda/pair_lj_class2_coul_long_cuda.cu @@ -58,10 +58,10 @@ void Cuda_PairLJClass2CoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlis if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_lj_class2_cuda.cu b/lib/cuda/pair_lj_class2_cuda.cu index a72e31fd9c..c18fc1f4dd 100644 --- a/lib/cuda/pair_lj_class2_cuda.cu +++ b/lib/cuda/pair_lj_class2_cuda.cu @@ -52,7 +52,7 @@ void Cuda_PairLJClass2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneig dim3 grid, threads; int sharedperproc; - //int maxthreads=192*sizeof(double)/sizeof(F_FLOAT); + //int maxthreads=192*sizeof(double)/sizeof(F_CFLOAT); //if(CUDA_ARCH==20) maxthreads*=2; //cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt,cudaFuncCachePreferL1); Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192); @@ -60,10 +60,10 @@ void Cuda_PairLJClass2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneig if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, 
eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_lj_class2_cuda_kernel_nc.cu b/lib/cuda/pair_lj_class2_cuda_kernel_nc.cu index 761e985ec8..ae4168b537 100644 --- a/lib/cuda/pair_lj_class2_cuda_kernel_nc.cu +++ b/lib/cuda/pair_lj_class2_cuda_kernel_nc.cu @@ -21,11 +21,11 @@ This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ -__device__ inline F_FLOAT PairLJClass2Cuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) +__device__ inline F_CFLOAT PairLJClass2Cuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl) { - const F_FLOAT r2inv = F_F(1.0) / rsq; - const F_FLOAT r6inv = r2inv * r2inv * r2inv; - const F_FLOAT r3inv = _SQRT_(r6inv); + const F_CFLOAT r2inv = F_F(1.0) / rsq; + const F_CFLOAT r6inv = r2inv * r2inv * r2inv; + const F_CFLOAT r3inv = _SQRT_(r6inv); if(eflag) evdwl += factor_lj * (r6inv * (_lj3[ij_type] * r3inv - _lj4[ij_type]) - _offset[ij_type]); diff --git a/lib/cuda/pair_lj_cut_coul_cut_cuda.cu b/lib/cuda/pair_lj_cut_coul_cut_cuda.cu index 88ba0300cf..99232a6a85 100644 --- a/lib/cuda/pair_lj_cut_coul_cut_cuda.cu +++ b/lib/cuda/pair_lj_cut_coul_cut_cuda.cu @@ -58,10 +58,10 @@ void Cuda_PairLJCutCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* s if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_lj_cut_coul_debye_cuda.cu b/lib/cuda/pair_lj_cut_coul_debye_cuda.cu index fdbe594768..7efa352062 100644 --- a/lib/cuda/pair_lj_cut_coul_debye_cuda.cu +++ b/lib/cuda/pair_lj_cut_coul_debye_cuda.cu @@ -58,10 +58,10 @@ void Cuda_PairLJCutCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_lj_cut_coul_long_cuda.cu b/lib/cuda/pair_lj_cut_coul_long_cuda.cu index 
316bb68351..8c02bfabeb 100644 --- a/lib/cuda/pair_lj_cut_coul_long_cuda.cu +++ b/lib/cuda/pair_lj_cut_coul_long_cuda.cu @@ -58,10 +58,10 @@ void Cuda_PairLJCutCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_lj_cut_cuda.cu b/lib/cuda/pair_lj_cut_cuda.cu index 4f2796e958..68798bf3fc 100644 --- a/lib/cuda/pair_lj_cut_cuda.cu +++ b/lib/cuda/pair_lj_cut_cuda.cu @@ -52,7 +52,7 @@ void Cuda_PairLJCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighli dim3 grid, threads; int sharedperproc; - //int maxthreads=192*sizeof(double)/sizeof(F_FLOAT); + //int maxthreads=192*sizeof(double)/sizeof(F_CFLOAT); //if(CUDA_ARCH==20) maxthreads*=2; //cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt,cudaFuncCachePreferL1); Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192); @@ -60,10 +60,10 @@ void Cuda_PairLJCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighli if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_lj_cut_cuda_kernel_nc.cu b/lib/cuda/pair_lj_cut_cuda_kernel_nc.cu index 2517a006e9..660ac4cbe3 100644 --- a/lib/cuda/pair_lj_cut_cuda_kernel_nc.cu +++ b/lib/cuda/pair_lj_cut_cuda_kernel_nc.cu @@ -21,10 +21,10 @@ This software is distributed under the GNU General Public License. 
------------------------------------------------------------------------- */ -__device__ inline F_FLOAT PairLJCutCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) +__device__ inline F_CFLOAT PairLJCutCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl) { - const F_FLOAT r2inv = F_F(1.0) / rsq; - const F_FLOAT r6inv = r2inv * r2inv * r2inv; + const F_CFLOAT r2inv = F_F(1.0) / rsq; + const F_CFLOAT r6inv = r2inv * r2inv * r2inv; if(eflag) evdwl += factor_lj * (r6inv * (_lj3[ij_type] * r6inv - _lj4[ij_type]) - _offset[ij_type]); diff --git a/lib/cuda/pair_lj_cut_experimental_cuda.cu b/lib/cuda/pair_lj_cut_experimental_cuda.cu index 4df5755326..7c3f4c22f2 100644 --- a/lib/cuda/pair_lj_cut_experimental_cuda.cu +++ b/lib/cuda/pair_lj_cut_experimental_cuda.cu @@ -51,7 +51,7 @@ void Cuda_PairLJCutExperimentalCuda(cuda_shared_data* sdata, cuda_shared_neighli dim3 grid, threads; int sharedperproc; - //int maxthreads=192*sizeof(double)/sizeof(F_FLOAT); + //int maxthreads=192*sizeof(double)/sizeof(F_CFLOAT); //if(CUDA_ARCH==20) maxthreads*=2; //cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt,cudaFuncCachePreferL1); Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192); @@ -64,10 +64,10 @@ void Cuda_PairLJCutExperimentalCuda(cuda_shared_data* sdata, cuda_shared_neighli if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA_opt - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom, sdata->comm.comm_phase); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom, sdata->comm.comm_phase); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_lj_expand_cuda.cu b/lib/cuda/pair_lj_expand_cuda.cu index 290c9a7a97..186af1f574 100644 --- a/lib/cuda/pair_lj_expand_cuda.cu +++ b/lib/cuda/pair_lj_expand_cuda.cu @@ -62,10 +62,10 @@ void Cuda_PairLJExpandCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneig if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_lj_expand_cuda_kernel_nc.cu b/lib/cuda/pair_lj_expand_cuda_kernel_nc.cu index fc03d6fbf4..ae11c12c3d 100644 --- a/lib/cuda/pair_lj_expand_cuda_kernel_nc.cu +++ b/lib/cuda/pair_lj_expand_cuda_kernel_nc.cu @@ -20,14 +20,14 @@ This software is distributed under the GNU General Public License. 
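Note the experimental lj/cut path above: it launches Pair_Kernel_TpA_opt and passes sdata->comm.comm_phase, and like every launch in this patch it runs on streams[1] rather than the default stream, presumably so force kernels can overlap with work queued on other streams during communication phases. A sketch of the stream setup such launches assume (illustrative; the actual creation happens elsewhere in the package):

  // Illustrative setup for the <<<grid, threads, shmem, streams[1]>>>
  // launches seen throughout this patch; toy_init_streams is hypothetical.
  cudaStream_t streams[2];

  void toy_init_streams()
  {
    cudaStreamCreate(&streams[0]);   // e.g. packing / transfer work
    cudaStreamCreate(&streams[1]);   // pair-force kernels
  }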
------------------------------------------------------------------------- */ -__device__ inline F_FLOAT PairLJExpandCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) +__device__ inline F_CFLOAT PairLJExpandCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl) { - const F_FLOAT r = _SQRT_(rsq); - const F_FLOAT rshift = r - _shift[ij_type]; - const F_FLOAT rshiftsq = rshift * rshift; - const F_FLOAT r2inv = F_F(1.0) / rshiftsq; - const F_FLOAT r6inv = r2inv * r2inv * r2inv; - const F_FLOAT forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]); + const F_CFLOAT r = _SQRT_(rsq); + const F_CFLOAT rshift = r - _shift[ij_type]; + const F_CFLOAT rshiftsq = rshift * rshift; + const F_CFLOAT r2inv = F_F(1.0) / rshiftsq; + const F_CFLOAT r6inv = r2inv * r2inv * r2inv; + const F_CFLOAT forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]); if(eflag) evdwl += factor_lj * (r6inv * (_lj3[ij_type] * r6inv - _lj4[ij_type]) - _offset[ij_type]); diff --git a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda.cu b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda.cu index 354f06b54b..33f9cc0e84 100644 --- a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda.cu +++ b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda.cu @@ -37,10 +37,10 @@ #define _coulsw1 MY_AP(coulsw1) #define _coulsw2 MY_AP(coulsw2) #define _coulsw5 MY_AP(coulsw5) -__device__ __constant__ F_FLOAT _cut_coul_inner_global; -__device__ __constant__ F_FLOAT _coulsw1; -__device__ __constant__ F_FLOAT _coulsw2; -__device__ __constant__ F_FLOAT _coulsw5; +__device__ __constant__ F_CFLOAT _cut_coul_inner_global; +__device__ __constant__ F_CFLOAT _coulsw1; +__device__ __constant__ F_CFLOAT _coulsw2; +__device__ __constant__ F_CFLOAT _coulsw5; #include "pair_lj_gromacs_coul_gromacs_cuda_cu.h" @@ -48,13 +48,13 @@ __device__ __constant__ F_FLOAT _coulsw5; #include -void Cuda_PairLJGromacsCoulGromacsCuda_Init(cuda_shared_data* sdata, F_FLOAT cut_coul_inner, F_FLOAT coulsw1, F_FLOAT coulsw2, F_FLOAT coulsw5) +void Cuda_PairLJGromacsCoulGromacsCuda_Init(cuda_shared_data* sdata, F_CFLOAT cut_coul_inner, F_CFLOAT coulsw1, F_CFLOAT coulsw2, F_CFLOAT coulsw5) { Cuda_Pair_Init_AllStyles(sdata, 9, true, true, true); - cudaMemcpyToSymbol(MY_AP(cut_coul_inner_global) , &cut_coul_inner , sizeof(F_FLOAT)); - cudaMemcpyToSymbol(MY_AP(coulsw1) , &coulsw1 , sizeof(F_FLOAT)); - cudaMemcpyToSymbol(MY_AP(coulsw2) , &coulsw2 , sizeof(F_FLOAT)); - cudaMemcpyToSymbol(MY_AP(coulsw5) , &coulsw5 , sizeof(F_FLOAT)); + cudaMemcpyToSymbol(MY_AP(cut_coul_inner_global) , &cut_coul_inner , sizeof(F_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(coulsw1) , &coulsw1 , sizeof(F_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(coulsw2) , &coulsw2 , sizeof(F_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(coulsw5) , &coulsw5 , sizeof(F_CFLOAT)); return; } @@ -62,7 +62,7 @@ void Cuda_PairLJGromacsCoulGromacsCuda_Init(cuda_shared_data* sdata, F_FLOAT cut void Cuda_PairLJGromacsCoulGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, - int eflag_atom, int vflag_atom, F_FLOAT cut_coul_inner, F_FLOAT coulsw1, F_FLOAT coulsw2, F_FLOAT coulsw5) + int eflag_atom, int vflag_atom, F_CFLOAT cut_coul_inner, F_CFLOAT coulsw1, F_CFLOAT coulsw2, F_CFLOAT coulsw5) { static short init = 0; @@ -80,10 +80,10 @@ void Cuda_PairLJGromacsCoulGromacsCuda(cuda_shared_data* sdata, cuda_shared_neig if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> 
(eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_cu.h b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_cu.h index 0e3b078166..ababa5b43a 100644 --- a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_cu.h +++ b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_cu.h @@ -23,4 +23,4 @@ #include "cuda_shared.h" -extern "C" void Cuda_PairLJGromacsCoulGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_FLOAT cut_coul_inner, F_FLOAT coulsw1, F_FLOAT coulsw2, F_FLOAT coulsw5); +extern "C" void Cuda_PairLJGromacsCoulGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_CFLOAT cut_coul_inner, F_CFLOAT coulsw1, F_CFLOAT coulsw2, F_CFLOAT coulsw5); diff --git a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu index ee6dda06f0..7094c59817 100644 --- a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu +++ b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu @@ -21,23 +21,23 @@ This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ -__device__ inline F_FLOAT CoulGromacsCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_coul, int &eflag, ENERGY_FLOAT &ecoul, F_FLOAT qij) +__device__ inline F_CFLOAT CoulGromacsCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_coul, int &eflag, ENERGY_CFLOAT &ecoul, F_CFLOAT qij) { if(qij != F_F(0.0)) { - F_FLOAT ecoul_tmp; - F_FLOAT forcecoul = _RSQRT_(rsq); + F_CFLOAT ecoul_tmp; + F_CFLOAT forcecoul = _RSQRT_(rsq); if(eflag) ecoul_tmp = forcecoul - _coulsw5; if(rsq > _cut_coul_inner_global * _cut_coul_inner_global) { - const F_FLOAT r = F_F(1.0) / forcecoul; - const F_FLOAT tc = r - _cut_coul_inner_global; + const F_CFLOAT r = F_F(1.0) / forcecoul; + const F_CFLOAT tc = r - _cut_coul_inner_global; forcecoul += r * tc * tc * (_coulsw1 + _coulsw2 * tc); if(eflag) ecoul_tmp -= tc * tc * tc * (_coulsw1 * (F_F(1.0) / F_F(3.0)) + _coulsw2 * tc * (F_F(1.0) / F_F(4.0))); } - F_FLOAT qprod = _qqrd2e * qij * factor_coul; + F_CFLOAT qprod = _qqrd2e * qij * factor_coul; forcecoul *= qprod; if(eflag) { diff --git a/lib/cuda/pair_lj_gromacs_cuda.cu b/lib/cuda/pair_lj_gromacs_cuda.cu index 35cc94a3cf..ba5b7dd6c3 100644 --- a/lib/cuda/pair_lj_gromacs_cuda.cu +++ b/lib/cuda/pair_lj_gromacs_cuda.cu @@ -64,10 +64,10 @@ void Cuda_PairLJGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* snei if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* 
sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); diff --git a/lib/cuda/pair_lj_gromacs_cuda_kernel_nc.cu b/lib/cuda/pair_lj_gromacs_cuda_kernel_nc.cu index bf9d042e65..0f7401cc6c 100644 --- a/lib/cuda/pair_lj_gromacs_cuda_kernel_nc.cu +++ b/lib/cuda/pair_lj_gromacs_cuda_kernel_nc.cu @@ -21,14 +21,14 @@ This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ -__device__ inline F_FLOAT PairLJGromacsCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) +__device__ inline F_CFLOAT PairLJGromacsCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl) { - F_FLOAT tlj; - const F_FLOAT r2inv = F_F(1.0) / rsq; - const F_FLOAT r = _RSQRT_(r2inv); - const F_FLOAT r6inv = r2inv * r2inv * r2inv; - F_FLOAT forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]); - const X_FLOAT cut_lj_innersq = (_cut_innersq_global > X_F(0.0) ? _cut_innersq_global : _cut_innersq[ij_type]); + F_CFLOAT tlj; + const F_CFLOAT r2inv = F_F(1.0) / rsq; + const F_CFLOAT r = _RSQRT_(r2inv); + const F_CFLOAT r6inv = r2inv * r2inv * r2inv; + F_CFLOAT forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]); + const X_CFLOAT cut_lj_innersq = (_cut_innersq_global > X_F(0.0) ? _cut_innersq_global : _cut_innersq[ij_type]); if(rsq > cut_lj_innersq) { tlj = r - _SQRT_(cut_lj_innersq); @@ -36,7 +36,7 @@ __device__ inline F_FLOAT PairLJGromacsCuda_Eval(const F_FLOAT &rsq, const int i } if(eflag) { - ENERGY_FLOAT evdwl_tmp = r6inv * (_lj3[ij_type] * r6inv - _lj4[ij_type]); + ENERGY_CFLOAT evdwl_tmp = r6inv * (_lj3[ij_type] * r6inv - _lj4[ij_type]); if(rsq > cut_lj_innersq) { evdwl_tmp += tlj * tlj * tlj * diff --git a/lib/cuda/pair_lj_sdk_coul_cut_cuda.cu b/lib/cuda/pair_lj_sdk_coul_cut_cuda.cu index 8647b1a62e..2948c31af0 100644 --- a/lib/cuda/pair_lj_sdk_coul_cut_cuda.cu +++ b/lib/cuda/pair_lj_sdk_coul_cut_cuda.cu @@ -65,10 +65,10 @@ void Cuda_PairLJSDKCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* s if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_lj_sdk_coul_debye_cuda.cu b/lib/cuda/pair_lj_sdk_coul_debye_cuda.cu index 48dddcae6a..44490f5a07 100644 --- a/lib/cuda/pair_lj_sdk_coul_debye_cuda.cu +++ b/lib/cuda/pair_lj_sdk_coul_debye_cuda.cu @@ -65,10 +65,10 @@ void Cuda_PairLJSDKCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, 
streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_lj_sdk_coul_long_cuda.cu b/lib/cuda/pair_lj_sdk_coul_long_cuda.cu index 6cbe15c7ab..92c31b6217 100644 --- a/lib/cuda/pair_lj_sdk_coul_long_cuda.cu +++ b/lib/cuda/pair_lj_sdk_coul_long_cuda.cu @@ -65,10 +65,10 @@ void Cuda_PairLJSDKCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_lj_sdk_cuda.cu b/lib/cuda/pair_lj_sdk_cuda.cu index a6fcf7f7a0..0e4cf0a89f 100644 --- a/lib/cuda/pair_lj_sdk_cuda.cu +++ b/lib/cuda/pair_lj_sdk_cuda.cu @@ -71,10 +71,10 @@ void Cuda_PairLJSDKCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighli if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_lj_sdk_cuda_kernel_nc.cu b/lib/cuda/pair_lj_sdk_cuda_kernel_nc.cu index f8f2474551..109dbb01fb 100644 --- a/lib/cuda/pair_lj_sdk_cuda_kernel_nc.cu +++ b/lib/cuda/pair_lj_sdk_cuda_kernel_nc.cu @@ -21,28 +21,28 @@ This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ -__device__ inline F_FLOAT PairLJSDKCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) //0.11 of 0.4 +__device__ inline F_CFLOAT PairLJSDKCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl) //0.11 of 0.4 { - const F_FLOAT r2inv = F_F(1.0) / rsq; + const F_CFLOAT r2inv = F_F(1.0) / rsq; const int lj_type = _lj_type[ij_type]; - const F_FLOAT r4inv = r2inv * r2inv; - const F_FLOAT rNinv_first = lj_type != CG_LJ9_6 ? r4inv : _RSQRT_(rsq); - const F_FLOAT rNinv_second = lj_type != CG_LJ12_4 ? -r2inv : -F_F(1.0); - const F_FLOAT forcelj = r4inv * (_lj1[ij_type] * r4inv * rNinv_first + _lj2[ij_type] * rNinv_second); + const F_CFLOAT r4inv = r2inv * r2inv; + const F_CFLOAT rNinv_first = lj_type != CG_LJ9_6 ? r4inv : _RSQRT_(rsq); + const F_CFLOAT rNinv_second = lj_type != CG_LJ12_4 ? 
-r2inv : -F_F(1.0); + const F_CFLOAT forcelj = r4inv * (_lj1[ij_type] * r4inv * rNinv_first + _lj2[ij_type] * rNinv_second); if(eflag) evdwl += factor_lj * (r4inv * (_lj3[ij_type] * r4inv * rNinv_first + _lj4[ij_type] * rNinv_second) - _offset[ij_type]); return factor_lj * forcelj * r2inv; } -/*__device__ inline F_FLOAT PairLJSDKCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +/*__device__ inline F_CFLOAT PairLJSDKCuda_Eval(const F_CFLOAT& rsq,const int ij_type,F_CFLOAT& factor_lj,int& eflag, ENERGY_CFLOAT& evdwl) { const int lj_type = tex1Dfetch(_coeff5_gm_tex,ij_type); - const F_FLOAT r2inv = F_F(1.0)/rsq; - const F_FLOAT r4inv = r2inv*r2inv; - const F_FLOAT rNinv_first = lj_type!=CG_LJ9_6?r4inv:_RSQRT_(rsq); - const F_FLOAT rNinv_second = lj_type!=CG_LJ12_4?r2inv:F_F(1.0); - const F_FLOAT forcelj = r4inv * (tex1Dfetch(_coeff1_gm_tex,ij_type)*r4inv*rNinv_first - tex1Dfetch(_coeff2_gm_tex,ij_type)*rNinv_second); + const F_CFLOAT r2inv = F_F(1.0)/rsq; + const F_CFLOAT r4inv = r2inv*r2inv; + const F_CFLOAT rNinv_first = lj_type!=CG_LJ9_6?r4inv:_RSQRT_(rsq); + const F_CFLOAT rNinv_second = lj_type!=CG_LJ12_4?r2inv:F_F(1.0); + const F_CFLOAT forcelj = r4inv * (tex1Dfetch(_coeff1_gm_tex,ij_type)*r4inv*rNinv_first - tex1Dfetch(_coeff2_gm_tex,ij_type)*rNinv_second); if(eflag) evdwl += factor_lj*(r4inv*(tex1Dfetch(_coeff3_gm_tex,ij_type)*r4inv*rNinv_first-tex1Dfetch(_coeff4_gm_tex,ij_type)*rNinv_second)); return factor_lj*forcelj*r2inv; diff --git a/lib/cuda/pair_lj_smooth_cuda.cu b/lib/cuda/pair_lj_smooth_cuda.cu index aa1df9e6be..a57b5bad9b 100644 --- a/lib/cuda/pair_lj_smooth_cuda.cu +++ b/lib/cuda/pair_lj_smooth_cuda.cu @@ -65,10 +65,10 @@ void Cuda_PairLJSmoothCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneig if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_lj_smooth_cuda_kernel_nc.cu b/lib/cuda/pair_lj_smooth_cuda_kernel_nc.cu index c1bb3b0785..5355a4e59b 100644 --- a/lib/cuda/pair_lj_smooth_cuda_kernel_nc.cu +++ b/lib/cuda/pair_lj_smooth_cuda_kernel_nc.cu @@ -21,15 +21,15 @@ This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ -__device__ inline F_FLOAT PairLJSmoothCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) +__device__ inline F_CFLOAT PairLJSmoothCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl) { - F_FLOAT fskin, t, tsq, forcelj; - const F_FLOAT r2inv = F_F(1.0) / rsq; - const F_FLOAT r = _RSQRT_(r2inv); - const F_FLOAT r6inv = r2inv * r2inv * r2inv; + F_CFLOAT fskin, t, tsq, forcelj; + const F_CFLOAT r2inv = F_F(1.0) / rsq; + const F_CFLOAT r = _RSQRT_(r2inv); + const F_CFLOAT r6inv = r2inv * r2inv * r2inv; - X_FLOAT cut_lj_innersq = (_cut_innersq_global > X_F(0.0) ? 
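The two ternaries in PairLJSDKCuda_Eval above fold three SDK/CG-CMM functional forms into the single force expression forcelj = r4inv * (lj1 * r4inv * rNinv_first + lj2 * rNinv_second). Spelled out per lj_type:

  // lj_type     rNinv_first        rNinv_second    resulting forcelj
  // CG_LJ12_4   r4inv              -1              lj1/r^12 - lj2/r^4
  // CG_LJ9_6    rsqrt(rsq) = 1/r   -r2inv          lj1/r^9  - lj2/r^6
  // CG_LJ12_6   r4inv              -r2inv          lj1/r^12 - lj2/r^6
  // (the energy uses the same selectors with lj3/lj4, minus the offset)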
_cut_innersq_global : _cut_innersq[ij_type]); + X_CFLOAT cut_lj_innersq = (_cut_innersq_global > X_F(0.0) ? _cut_innersq_global : _cut_innersq[ij_type]); if(rsq < cut_lj_innersq) { forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]); @@ -43,7 +43,7 @@ __device__ inline F_FLOAT PairLJSmoothCuda_Eval(const F_FLOAT &rsq, const int ij } if(eflag) { - ENERGY_FLOAT evdwl_tmp; + ENERGY_CFLOAT evdwl_tmp; if(rsq < cut_lj_innersq) { evdwl_tmp = r6inv * (_lj3[ij_type] * r6inv - _lj4[ij_type]) - diff --git a/lib/cuda/pair_morse_coul_long_cuda.cu b/lib/cuda/pair_morse_coul_long_cuda.cu index 7512eb0567..47e18cb4dc 100644 --- a/lib/cuda/pair_morse_coul_long_cuda.cu +++ b/lib/cuda/pair_morse_coul_long_cuda.cu @@ -60,10 +60,10 @@ void Cuda_PairMorseCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_morse_coul_long_cuda_kernel_nc.cu b/lib/cuda/pair_morse_coul_long_cuda_kernel_nc.cu index f6b436f0d6..2cc24514bf 100644 --- a/lib/cuda/pair_morse_coul_long_cuda_kernel_nc.cu +++ b/lib/cuda/pair_morse_coul_long_cuda_kernel_nc.cu @@ -20,13 +20,13 @@ This software is distributed under the GNU General Public License. 
------------------------------------------------------------------------- */ -__device__ inline F_FLOAT PairMorseR6Cuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) +__device__ inline F_CFLOAT PairMorseR6Cuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl) { - const F_FLOAT r2inv = F_F(1.0) / rsq; - const F_FLOAT r = _SQRT_(rsq); - const F_FLOAT r4inv = r2inv * r2inv; - const F_FLOAT dr = r - _r0[ij_type]; - const F_FLOAT dexp = _EXP_(-_alpha[ij_type] * dr); + const F_CFLOAT r2inv = F_F(1.0) / rsq; + const F_CFLOAT r = _SQRT_(rsq); + const F_CFLOAT r4inv = r2inv * r2inv; + const F_CFLOAT dr = r - _r0[ij_type]; + const F_CFLOAT dexp = _EXP_(-_alpha[ij_type] * dr); if(eflag) evdwl += factor_lj * (_d0[ij_type] * (dexp * dexp - F_F(2.0) * dexp) + _c0[ij_type] * r4inv * r4inv * r4inv - _offset[ij_type]); diff --git a/lib/cuda/pair_morse_cuda.cu b/lib/cuda/pair_morse_cuda.cu index b2a651b916..7c2d9205a3 100644 --- a/lib/cuda/pair_morse_cuda.cu +++ b/lib/cuda/pair_morse_cuda.cu @@ -62,10 +62,10 @@ void Cuda_PairMorseCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighli if(sdata->pair.use_block_per_atom) Pair_Kernel_BpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); else Pair_Kernel_TpA - <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); } diff --git a/lib/cuda/pair_morse_cuda_kernel_nc.cu b/lib/cuda/pair_morse_cuda_kernel_nc.cu index 0b3baac412..3b5932a1b0 100644 --- a/lib/cuda/pair_morse_cuda_kernel_nc.cu +++ b/lib/cuda/pair_morse_cuda_kernel_nc.cu @@ -20,11 +20,11 @@ This software is distributed under the GNU General Public License. 
------------------------------------------------------------------------- */ -__device__ inline F_FLOAT PairMorseCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) +__device__ inline F_CFLOAT PairMorseCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl) { - const F_FLOAT r = _SQRT_(rsq); - const F_FLOAT dr = r - _r0[ij_type]; - const F_FLOAT dexp = _EXP_(-_alpha[ij_type] * dr); + const F_CFLOAT r = _SQRT_(rsq); + const F_CFLOAT dr = r - _r0[ij_type]; + const F_CFLOAT dexp = _EXP_(-_alpha[ij_type] * dr); if(eflag) evdwl += factor_lj * (_d0[ij_type] * (dexp * dexp - F_F(2.0) * dexp) - _offset[ij_type]); diff --git a/lib/cuda/pair_sw_cuda.cu b/lib/cuda/pair_sw_cuda.cu index e96c558c68..4ba35b23a8 100644 --- a/lib/cuda/pair_sw_cuda.cu +++ b/lib/cuda/pair_sw_cuda.cu @@ -34,16 +34,16 @@ __device__ __constant__ ParamSW_Float params_sw[MANYBODY_NPAIR* MANYBODY_NPAIR* void Cuda_PairSWCuda_Init(cuda_shared_data* sdata, ParamSW_Float* params_host, void* map_host, void* elem2param_host, int nelements_h) { unsigned cuda_ntypes = sdata->atom.ntypes + 1; - X_FLOAT box_size[3] = { + X_CFLOAT box_size[3] = { sdata->domain.subhi[0] - sdata->domain.sublo[0], sdata->domain.subhi[1] - sdata->domain.sublo[1], sdata->domain.subhi[2] - sdata->domain.sublo[2] }; - cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3); + cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_CFLOAT) * 3); cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned)); - cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3); cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int)); cudaMemcpyToSymbol(params_sw, params_host , sizeof(ParamSW_Float)*nelements_h * nelements_h * nelements_h); @@ -55,13 +55,13 @@ void Cuda_PairSWCuda_Init(cuda_shared_data* sdata, ParamSW_Float* params_host, v void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) { static int glob_ij_size = 0; - static F_FLOAT4* glob_r_ij = NULL; + static F_CFLOAT4* glob_r_ij = NULL; static int* glob_numneigh_red = NULL; static int* glob_neighbors_red = NULL; static int* glob_neightype_red = NULL; - if(glob_ij_size < sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT)) { - glob_ij_size = sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT); + if(glob_ij_size < sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_CFLOAT)) { + glob_ij_size = sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_CFLOAT); cudaFree(glob_r_ij); cudaFree(glob_numneigh_red); cudaFree(glob_neighbors_red); @@ -73,7 +73,7 @@ void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, cudaMemcpyToSymbol(_glob_numneigh_red, &glob_numneigh_red , sizeof(int*)); cudaMemcpyToSymbol(_glob_neighbors_red, &glob_neighbors_red , sizeof(int*)); cudaMemcpyToSymbol(_glob_neightype_red, &glob_neightype_red , sizeof(int*)); - cudaMemcpyToSymbol(_glob_r_ij, &glob_r_ij , sizeof(F_FLOAT4*)); + cudaMemcpyToSymbol(_glob_r_ij, 
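The static glob_* pointers above implement a grow-only scratch-buffer scheme for the Stillinger-Weber neighbor precomputation: the recorded byte size only ever increases, and on growth the old device buffers are freed, larger ones allocated, and the device-visible pointers refreshed through cudaMemcpyToSymbol (the cudaMalloc calls fall outside the hunks shown). A minimal sketch of the pattern, with hypothetical names:

  // Grow-only device scratch buffer (illustrative; toy_grow is hypothetical).
  static size_t toy_capacity = 0;
  static void*  toy_scratch  = NULL;

  void toy_grow(size_t needed)
  {
    if (toy_capacity >= needed) return;   // fast path: still big enough
    toy_capacity = needed;
    cudaFree(toy_scratch);                // cudaFree(NULL) is a safe no-op
    cudaMalloc(&toy_scratch, needed);
    // ...then cudaMemcpyToSymbol the new pointer so kernels pick it up.
  }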
&glob_r_ij , sizeof(F_CFLOAT4*)); } dim3 grid, threads; @@ -112,7 +112,7 @@ void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, my_gettime(CLOCK_REALTIME, &time1); //actual force calculation - unsigned int sharedsize = (sharedperproc * sizeof(ENERGY_FLOAT) + 4 * sizeof(F_FLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure + unsigned int sharedsize = (sharedperproc * sizeof(ENERGY_CFLOAT) + 4 * sizeof(F_CFLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure if(eflag) { if(vflag) diff --git a/lib/cuda/pair_sw_cuda_cu.h b/lib/cuda/pair_sw_cuda_cu.h index c3713a3989..2ff8c2bd36 100644 --- a/lib/cuda/pair_sw_cuda_cu.h +++ b/lib/cuda/pair_sw_cuda_cu.h @@ -24,14 +24,14 @@ #include "cuda_shared.h" struct ParamSW_Float { - F_FLOAT epsilon, sigma; - F_FLOAT littlea, lambda, gamma, costheta; - F_FLOAT biga, bigb; - F_FLOAT powerp, powerq; - F_FLOAT tol; - F_FLOAT cut, cutsq; - F_FLOAT sigma_gamma, lambda_epsilon, lambda_epsilon2; - F_FLOAT c1, c2, c3, c4, c5, c6; + F_CFLOAT epsilon, sigma; + F_CFLOAT littlea, lambda, gamma, costheta; + F_CFLOAT biga, bigb; + F_CFLOAT powerp, powerq; + F_CFLOAT tol; + F_CFLOAT cut, cutsq; + F_CFLOAT sigma_gamma, lambda_epsilon, lambda_epsilon2; + F_CFLOAT c1, c2, c3, c4, c5, c6; int ielement, jelement, kelement; }; diff --git a/lib/cuda/pair_sw_cuda_kernel_nc.cu b/lib/cuda/pair_sw_cuda_kernel_nc.cu index ade74808ee..d1e7fc7157 100644 --- a/lib/cuda/pair_sw_cuda_kernel_nc.cu +++ b/lib/cuda/pair_sw_cuda_kernel_nc.cu @@ -27,10 +27,10 @@ -__device__ void twobody(int iparam, F_FLOAT rsq, F_FLOAT &fforce, - int eflag, ENERGY_FLOAT &eng) +__device__ void twobody(int iparam, F_CFLOAT rsq, F_CFLOAT &fforce, + int eflag, ENERGY_CFLOAT &eng) { - F_FLOAT r, rp, rq, rainv, expsrainv; + F_CFLOAT r, rp, rq, rainv, expsrainv; r = sqrt(rsq); rp = pow(r, -params_sw[iparam].powerp); @@ -44,14 +44,14 @@ __device__ void twobody(int iparam, F_FLOAT rsq, F_FLOAT &fforce, } __device__ void threebody(int paramij, int paramik, int paramijk, - F_FLOAT4 &delr1, - F_FLOAT4 &delr2, - F_FLOAT3 &fj, F_FLOAT3 &fk, int eflag, ENERGY_FLOAT &eng) + F_CFLOAT4 &delr1, + F_CFLOAT4 &delr2, + F_CFLOAT3 &fj, F_CFLOAT3 &fk, int eflag, ENERGY_CFLOAT &eng) { - F_FLOAT r1, rinvsq1, rainv1, gsrainv1, gsrainvsq1, expgsrainv1; - F_FLOAT r2, rinvsq2, rainv2, gsrainv2, gsrainvsq2, expgsrainv2; - F_FLOAT rinv12, cs, delcs, delcssq, facexp, facrad, frad1, frad2; - F_FLOAT facang, facang12, csfacang, csfac1, csfac2; + F_CFLOAT r1, rinvsq1, rainv1, gsrainv1, gsrainvsq1, expgsrainv1; + F_CFLOAT r2, rinvsq2, rainv2, gsrainv2, gsrainvsq2, expgsrainv2; + F_CFLOAT rinv12, cs, delcs, delcssq, facexp, facrad, frad1, frad2; + F_CFLOAT facang, facang12, csfacang, csfac1, csfac2; r1 = sqrt(delr1.w); rinvsq1 = F_F(1.0) / delr1.w; @@ -99,14 +99,14 @@ __device__ void threebody(int paramij, int paramik, int paramijk, } __device__ void threebody_fj(int paramij, int paramik, int paramijk, - F_FLOAT4 &delr1, - F_FLOAT4 &delr2, - F_FLOAT3 &fj) + F_CFLOAT4 &delr1, + F_CFLOAT4 &delr2, + F_CFLOAT3 &fj) { - F_FLOAT r1, rinvsq1, rainv1, gsrainv1, gsrainvsq1, expgsrainv1; - F_FLOAT r2, rainv2, gsrainv2, expgsrainv2; - F_FLOAT rinv12, cs, delcs, delcssq, facexp, facrad, frad1; - F_FLOAT facang, facang12, csfacang, csfac1; + F_CFLOAT r1, rinvsq1, rainv1, gsrainv1, gsrainvsq1, expgsrainv1; + F_CFLOAT r2, rainv2, gsrainv2, expgsrainv2; + F_CFLOAT rinv12, cs, delcs, delcssq, facexp, facrad, frad1; + F_CFLOAT facang, facang12, csfacang, csfac1; r1 = sqrt(delr1.w); 
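The driver in pair_sw_cuda.cu above sizes dynamic shared memory as (sharedperproc * sizeof(ENERGY_CFLOAT) + 4 * sizeof(F_CFLOAT)) * threads.x: per-thread energy/virial accumulators plus the four scratch floats its comment mentions. The byte arithmetic, spelled out for an assumed 128-thread block, double precision, and sharedperproc = 7 (one energy slot plus six virial slots), all illustrative values:

#include <cstddef>
#include <cstdio>

int main() {
  const size_t sharedperproc = 7;   // 1 energy + 6 virial accumulators (assumed)
  const size_t threads_x = 128;     // hypothetical block size
  const size_t sharedsize =
      (sharedperproc * sizeof(double) + 4 * sizeof(double)) * threads_x;
  // (7*8 + 4*8) * 128 = 11264 bytes per block, comfortably inside the
  // 48 KiB shared-memory limit of compute capability 2.x parts.
  std::printf("%zu bytes of dynamic shared memory per block\n", sharedsize);
  return 0;
}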
rinvsq1 = F_F(1.0) / delr1.w; @@ -143,15 +143,15 @@ __device__ void threebody_fj(int paramij, int paramik, int paramijk, } -__global__ void Pair_SW_Kernel_TpA_RIJ()//F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) +__global__ void Pair_SW_Kernel_TpA_RIJ()//F_CFLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) { int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(ii >= _nall) return; - X_FLOAT4 myxtype; - F_FLOAT4 delij; - F_FLOAT xtmp, ytmp, ztmp; + X_CFLOAT4 myxtype; + F_CFLOAT4 delij; + F_CFLOAT xtmp, ytmp, ztmp; int itype, jnum, i, j; int* jlist; int neigh_red = 0; @@ -195,18 +195,18 @@ __global__ void Pair_SW_Kernel_TpA_RIJ()//F_FLOAT4* _glob_r_ij,int* _glob_numnei template -__global__ void Pair_SW_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_FLOAT* _glob_zeta_ij,F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) +__global__ void Pair_SW_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_CFLOAT* _glob_zeta_ij,F_CFLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) { - ENERGY_FLOAT evdwl = ENERGY_F(0.0); + ENERGY_CFLOAT evdwl = ENERGY_F(0.0); - ENERGY_FLOAT* sharedE = &sharedmem[threadIdx.x]; - ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; + ENERGY_CFLOAT* sharedE = &sharedmem[threadIdx.x]; + ENERGY_CFLOAT* sharedV = &sharedmem[threadIdx.x]; - F_FLOAT* shared_F_F = (F_FLOAT*) sharedmem; + F_CFLOAT* shared_F_F = (F_CFLOAT*) sharedmem; - if((eflag || eflag_atom) && (vflagm || vflag_atom)) shared_F_F = (F_FLOAT*) &sharedmem[7 * blockDim.x]; - else if(eflag) shared_F_F = (F_FLOAT*) &sharedmem[blockDim.x]; - else if(vflagm) shared_F_F = (F_FLOAT*) &sharedmem[6 * blockDim.x]; + if((eflag || eflag_atom) && (vflagm || vflag_atom)) shared_F_F = (F_CFLOAT*) &sharedmem[7 * blockDim.x]; + else if(eflag) shared_F_F = (F_CFLOAT*) &sharedmem[blockDim.x]; + else if(vflagm) shared_F_F = (F_CFLOAT*) &sharedmem[6 * blockDim.x]; shared_F_F += threadIdx.x; @@ -231,9 +231,9 @@ __global__ void Pair_SW_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_FLOAT* _ //#define jnum_red (static_cast (shared_F_F[3*blockDim.x])) int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - X_FLOAT4 myxtype_i, myxtype_j, myxtype_k; - F_FLOAT4 delij, delik, deljk; - F_FLOAT fpair; + X_CFLOAT4 myxtype_i, myxtype_j, myxtype_k; + F_CFLOAT4 delij, delik, deljk; + F_CFLOAT fpair; int itype, i, j; int* jlist_red; @@ -277,7 +277,7 @@ __global__ void Pair_SW_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_FLOAT* _ volatile int iparam_ji = elem2param[(jtype * nelements + itype) * nelements + itype]; if(delij.w < params_sw[iparam_ij].cutsq) { - F_FLOAT dxfp, dyfp, dzfp; + F_CFLOAT dxfp, dyfp, dzfp; twobody(iparam_ij, delij.w, fpair, eflag, evdwl); fxtmp += dxfp = delij.x * fpair; fytmp += dyfp = delij.y * fpair; @@ -316,7 +316,7 @@ __global__ void Pair_SW_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_FLOAT* _ vec3_scale(F_F(-1.0), delik, delik); if(delik.w <= params_sw[iparam_ijk].cutsq) { - F_FLOAT3 fj, fk; + F_CFLOAT3 fj, fk; threebody(iparam_ij, iparam_ik, iparam_ijk, delij, delik, fj, fk, eflag, evdwl); fxtmp -= fj.x + fk.x; @@ -377,7 +377,7 @@ __global__ void Pair_SW_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_FLOAT* _ vec3_scale(F_F(-1.0), delij, delij); if(deljk.w <= params_sw[iparam_jik].cutsq) { - F_FLOAT3 fj; + F_CFLOAT3 fj; threebody_fj(iparam_ji, iparam_jk, iparam_jik, delij, deljk, fj); @@ -397,10 
+397,10 @@ __global__ void Pair_SW_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_FLOAT* _ __syncthreads(); if(ii < _inum) { - F_FLOAT* my_f; + F_CFLOAT* my_f; if(_collect_forces_later) { - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer; if(eflag) { buffer = &buffer[1 * gridDim.x * gridDim.y]; @@ -410,7 +410,7 @@ __global__ void Pair_SW_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_FLOAT* _ buffer = &buffer[6 * gridDim.x * gridDim.y]; } - my_f = (F_FLOAT*) buffer; + my_f = (F_CFLOAT*) buffer; my_f += i; *my_f = fxtmp; my_f += _nmax; diff --git a/lib/cuda/pair_tersoff_cuda.cu b/lib/cuda/pair_tersoff_cuda.cu index e138c62b57..a7156abea4 100644 --- a/lib/cuda/pair_tersoff_cuda.cu +++ b/lib/cuda/pair_tersoff_cuda.cu @@ -26,8 +26,8 @@ #include "pair_tersoff_cuda_cu.h" __device__ __constant__ Param_Float params[MANYBODY_NPAIR* MANYBODY_NPAIR* MANYBODY_NPAIR]; -__device__ __constant__ F_FLOAT* _glob_zeta_ij; //zeta_ij -__device__ __constant__ F_FLOAT4* _glob_r_ij; //r_ij (x,y,z,r^2) for pairs within force cutoff +__device__ __constant__ F_CFLOAT* _glob_zeta_ij; //zeta_ij +__device__ __constant__ F_CFLOAT4* _glob_r_ij; //r_ij (x,y,z,r^2) for pairs within force cutoff __device__ __constant__ bool _zbl; //is tersoff zbl? @@ -39,16 +39,16 @@ __device__ __constant__ bool _zbl; //is tersoff zbl? void Cuda_PairTersoffCuda_Init(cuda_shared_data* sdata, Param_Float* params_host, void* map_host, void* elem2param_host, int nelements_h, bool zbl) { unsigned cuda_ntypes = sdata->atom.ntypes + 1; - X_FLOAT box_size[3] = { + X_CFLOAT box_size[3] = { sdata->domain.subhi[0] - sdata->domain.sublo[0], sdata->domain.subhi[1] - sdata->domain.sublo[1], sdata->domain.subhi[2] - sdata->domain.sublo[2] }; - cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3); + cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_CFLOAT) * 3); cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned)); - cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3); cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int)); cudaMemcpyToSymbol(params, params_host , sizeof(Param_Float)*nelements_h * nelements_h * nelements_h); @@ -61,15 +61,15 @@ void Cuda_PairTersoffCuda_Init(cuda_shared_data* sdata, Param_Float* params_host void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) { - static F_FLOAT* glob_zeta_ij = NULL; + static F_CFLOAT* glob_zeta_ij = NULL; static int glob_zeta_ij_size = 0; - static F_FLOAT4* glob_r_ij = NULL; + static F_CFLOAT4* glob_r_ij = NULL; static int* glob_numneigh_red = NULL; static int* glob_neighbors_red = NULL; static int* glob_neightype_red = NULL; - if(glob_zeta_ij_size < sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT)) { - glob_zeta_ij_size = sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT); + if(glob_zeta_ij_size < sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_CFLOAT)) { + glob_zeta_ij_size = sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_CFLOAT); 
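Both the SW and the Tersoff host drivers keep their reduced neighbor lists and pair vectors in file-static device pointers and, as in the size check above, reallocate only when the required byte count grows, so steady-state timesteps pay no cudaMalloc/cudaFree cost. The pattern in isolation (the helper name and the omitted error handling are illustrative, matching the original's unchecked calls):

#include <cuda_runtime.h>
#include <cstddef>

static void* glob_buf = NULL;   // grows monotonically, never shrinks
static size_t glob_buf_size = 0;

void ensure_device_capacity(size_t needed)
{
  if (glob_buf_size < needed) {
    glob_buf_size = needed;
    cudaFree(glob_buf);                    // cudaFree(NULL) is a no-op
    cudaMalloc(&glob_buf, glob_buf_size);  // grow-only reallocation
  }
}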
cudaFree(glob_zeta_ij); cudaFree(glob_r_ij); cudaFree(glob_numneigh_red); @@ -83,8 +83,8 @@ void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneigh cudaMemcpyToSymbol(_glob_numneigh_red, &glob_numneigh_red , sizeof(int*)); cudaMemcpyToSymbol(_glob_neighbors_red, &glob_neighbors_red , sizeof(int*)); cudaMemcpyToSymbol(_glob_neightype_red, &glob_neightype_red , sizeof(int*)); - cudaMemcpyToSymbol(_glob_r_ij, &glob_r_ij , sizeof(F_FLOAT4*)); - cudaMemcpyToSymbol(_glob_zeta_ij, &glob_zeta_ij , sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(_glob_r_ij, &glob_r_ij , sizeof(F_CFLOAT4*)); + cudaMemcpyToSymbol(_glob_zeta_ij, &glob_zeta_ij , sizeof(F_CFLOAT*)); } dim3 grid, threads; @@ -127,7 +127,7 @@ void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneigh my_gettime(CLOCK_REALTIME, &time1); //actual force calculation - unsigned int sharedsize = (sharedperproc * sizeof(ENERGY_FLOAT) + 4 * sizeof(F_FLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure + unsigned int sharedsize = (sharedperproc * sizeof(ENERGY_CFLOAT) + 4 * sizeof(F_CFLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure if(eflag) { if(vflag) diff --git a/lib/cuda/pair_tersoff_cuda_cu.h b/lib/cuda/pair_tersoff_cuda_cu.h index e4eb81827f..f3a45e9c8b 100644 --- a/lib/cuda/pair_tersoff_cuda_cu.h +++ b/lib/cuda/pair_tersoff_cuda_cu.h @@ -24,18 +24,18 @@ #include "cuda_shared.h" struct Param_Float { - F_FLOAT lam1, lam2, lam3; - F_FLOAT c, d, h; - F_FLOAT gamma, powerm; - F_FLOAT powern, beta; - F_FLOAT biga, bigb, bigd, bigr; - F_FLOAT cut, cutsq; - F_FLOAT c1, c2, c3, c4; + F_CFLOAT lam1, lam2, lam3; + F_CFLOAT c, d, h; + F_CFLOAT gamma, powerm; + F_CFLOAT powern, beta; + F_CFLOAT biga, bigb, bigd, bigr; + F_CFLOAT cut, cutsq; + F_CFLOAT c1, c2, c3, c4; int ielement, jelement, kelement; int powermint; - //F_FLOAT Z_i,Z_j; - F_FLOAT ZBLcut, ZBLexpscale; - F_FLOAT a_ij, premult; + //F_CFLOAT Z_i,Z_j; + F_CFLOAT ZBLcut, ZBLexpscale; + F_CFLOAT a_ij, premult; }; extern "C" void Cuda_PairTersoffCuda_Init(cuda_shared_data* sdata, Param_Float* params_host, void* map_host, void* elem2param_host, int nelements_h, bool zbl); diff --git a/lib/cuda/pair_tersoff_cuda_kernel_nc.cu b/lib/cuda/pair_tersoff_cuda_kernel_nc.cu index e5143b36ad..c340bcbd38 100644 --- a/lib/cuda/pair_tersoff_cuda_kernel_nc.cu +++ b/lib/cuda/pair_tersoff_cuda_kernel_nc.cu @@ -28,7 +28,7 @@ template static inline __device__ void PairVirialCompute_A_Kernel_Template() { __syncthreads(); - ENERGY_FLOAT* shared = sharedmem; + ENERGY_CFLOAT* shared = sharedmem; if(eflag) { reduceBlock(shared); @@ -46,7 +46,7 @@ static inline __device__ void PairVirialCompute_A_Kernel_Template() if(threadIdx.x == 0) { shared = sharedmem; - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer; if(eflag) { buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5) * shared[0]; @@ -70,19 +70,19 @@ static inline __device__ void PairVirialCompute_A_Kernel_Template() __global__ void virial_fdotr_compute_kernel(int eflag) { int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - ENERGY_FLOAT* sharedE = (ENERGY_FLOAT*) &sharedmem[0]; - ENERGY_FLOAT* sharedVirial = (ENERGY_FLOAT*) &sharedE[blockDim.x]; + ENERGY_CFLOAT* sharedE = (ENERGY_CFLOAT*) &sharedmem[0]; + ENERGY_CFLOAT* sharedVirial = (ENERGY_CFLOAT*) &sharedE[blockDim.x]; sharedE += threadIdx.x; sharedVirial += threadIdx.x; if(i < _nlocal) { - F_FLOAT x = _x[i]; - F_FLOAT y = _x[i + 
_nmax]; - F_FLOAT z = _x[i + 2 * _nmax]; - F_FLOAT fx = _f[i]; - F_FLOAT fy = _f[i + _nmax]; - F_FLOAT fz = _f[i + 2 * _nmax]; + F_CFLOAT x = _x[i]; + F_CFLOAT y = _x[i + _nmax]; + F_CFLOAT z = _x[i + 2 * _nmax]; + F_CFLOAT fx = _f[i]; + F_CFLOAT fy = _f[i + _nmax]; + F_CFLOAT fz = _f[i + 2 * _nmax]; //if(fz*z*fz*z>1e-5) printf("V %i %i %e %e %e %e %e %e\n",i,_tag[i],x,y,z,fx,fy,fz); sharedVirial[0] = fx * x; sharedVirial[1 * blockDim.x] = fy * y; @@ -99,7 +99,7 @@ __global__ void virial_fdotr_compute_kernel(int eflag) sharedVirial[5 * blockDim.x] = 0; } - sharedVirial = (ENERGY_FLOAT*) &sharedmem[0]; + sharedVirial = (ENERGY_CFLOAT*) &sharedmem[0]; sharedVirial += blockDim.x; reduceBlockP2(sharedVirial); reduceBlockP2(&sharedVirial[1 * blockDim.x]); @@ -109,7 +109,7 @@ __global__ void virial_fdotr_compute_kernel(int eflag) reduceBlockP2(&sharedVirial[5 * blockDim.x]); if(threadIdx.x < 6) { - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer; if(eflag) buffer = &buffer[gridDim.x * gridDim.y]; @@ -122,47 +122,47 @@ __global__ void virial_fdotr_compute_kernel(int eflag) #define vec3_add(X,Y,Z) Z.x = X.x+Y.x; Z.y = X.y+Y.y; Z.z = X.z+Y.z; #define vec3_dot(X,Y) (X.x*Y.x + X.y*Y.y + X.z*Y.z)*/ -__device__ inline void vec3_scale(F_FLOAT k, F_FLOAT3 &x, F_FLOAT3 &y) +__device__ inline void vec3_scale(F_CFLOAT k, F_CFLOAT3 &x, F_CFLOAT3 &y) { y.x = k * x.x; y.y = k * x.y; y.z = k * x.z; } -__device__ inline void vec3_scale(F_FLOAT k, F_FLOAT4 &x, F_FLOAT3 &y) +__device__ inline void vec3_scale(F_CFLOAT k, F_CFLOAT4 &x, F_CFLOAT3 &y) { y.x = k * x.x; y.y = k * x.y; y.z = k * x.z; } -__device__ inline void vec3_scale(F_FLOAT k, F_FLOAT4 &x, F_FLOAT4 &y) +__device__ inline void vec3_scale(F_CFLOAT k, F_CFLOAT4 &x, F_CFLOAT4 &y) { y.x = k * x.x; y.y = k * x.y; y.z = k * x.z; } -__device__ inline void vec3_scaleadd(F_FLOAT k, F_FLOAT3 &x, F_FLOAT3 &y, F_FLOAT3 &z) +__device__ inline void vec3_scaleadd(F_CFLOAT k, F_CFLOAT3 &x, F_CFLOAT3 &y, F_CFLOAT3 &z) { z.x = k * x.x + y.x; z.y = k * x.y + y.y; z.z = k * x.z + y.z; } -__device__ inline void vec3_add(F_FLOAT3 &x, F_FLOAT3 &y, F_FLOAT3 &z) +__device__ inline void vec3_add(F_CFLOAT3 &x, F_CFLOAT3 &y, F_CFLOAT3 &z) { z.x = x.x + y.x; z.y = x.y + y.y; z.z = x.z + y.z; } -__device__ inline F_FLOAT vec3_dot(F_FLOAT3 x, F_FLOAT3 y) +__device__ inline F_CFLOAT vec3_dot(F_CFLOAT3 x, F_CFLOAT3 y) { return x.x * y.x + x.y * y.y + x.z * y.z; } -__device__ inline F_FLOAT vec3_dot(F_FLOAT4 x, F_FLOAT4 y) +__device__ inline F_CFLOAT vec3_dot(F_CFLOAT4 x, F_CFLOAT4 y) { return x.x * y.x + x.y * y.y + x.z * y.z; } @@ -171,7 +171,7 @@ __device__ inline F_FLOAT vec3_dot(F_FLOAT4 x, F_FLOAT4 y) Fermi-like smoothing function ------------------------------------------------------------------------- */ -__device__ inline F_FLOAT F_fermi(F_FLOAT &r, int &iparam) +__device__ inline F_CFLOAT F_fermi(F_CFLOAT &r, int &iparam) { return F_F(1.0) / (F_F(1.0) + exp(-params[iparam].ZBLexpscale * (r - params[iparam].ZBLcut))); } @@ -180,56 +180,56 @@ __device__ inline F_FLOAT F_fermi(F_FLOAT &r, int &iparam) Fermi-like smoothing function derivative with respect to r ------------------------------------------------------------------------- */ -__device__ inline F_FLOAT F_fermi_d(F_FLOAT &r, int &iparam) +__device__ inline F_CFLOAT F_fermi_d(F_CFLOAT &r, int &iparam) { - volatile const F_FLOAT tmp = exp(-params[iparam].ZBLexpscale * (r - params[iparam].ZBLcut)); + volatile const F_CFLOAT tmp = exp(-params[iparam].ZBLexpscale * (r - 
params[iparam].ZBLcut)); return params[iparam].ZBLexpscale * tmp / ((F_F(1.0) + tmp) * (F_F(1.0) + tmp)); } -__device__ inline F_FLOAT ters_fc(F_FLOAT r, F_FLOAT ters_R, F_FLOAT ters_D) +__device__ inline F_CFLOAT ters_fc(F_CFLOAT r, F_CFLOAT ters_R, F_CFLOAT ters_D) { return (r < ters_R - ters_D) ? F_F(1.0) : ((r > ters_R + ters_D) ? F_F(0.0) : F_F(0.5) * (F_F(1.0) - sin(PI2 * (r - ters_R) / ters_D))); } -__device__ inline F_FLOAT ters_fc_d(F_FLOAT r, F_FLOAT ters_R, F_FLOAT ters_D) +__device__ inline F_CFLOAT ters_fc_d(F_CFLOAT r, F_CFLOAT ters_R, F_CFLOAT ters_D) { return ((r < ters_R - ters_D) || (r > ters_R + ters_D)) ? F_F(0.0) : -(PI4 / ters_D) * cos(PI2 * (r - ters_R) / ters_D); } -__device__ inline F_FLOAT ters_gijk(F_FLOAT &cos_theta, int iparam) +__device__ inline F_CFLOAT ters_gijk(F_CFLOAT &cos_theta, int iparam) { - F_FLOAT ters_c = params[iparam].c; - F_FLOAT ters_d = params[iparam].d; + F_CFLOAT ters_c = params[iparam].c; + F_CFLOAT ters_d = params[iparam].d; return params[iparam].gamma * (F_F(1.0) + pow(params[iparam].c / params[iparam].d, F_F(2.0)) - pow(ters_c, F_F(2.0)) / (pow(ters_d, F_F(2.0)) + pow(params[iparam].h - cos_theta, F_F(2.0)))); } -__device__ F_FLOAT ters_gijk2(F_FLOAT &cos_theta, int iparam) +__device__ F_CFLOAT ters_gijk2(F_CFLOAT &cos_theta, int iparam) { - F_FLOAT ters_c = params[iparam].c; - F_FLOAT ters_d = params[iparam].d; + F_CFLOAT ters_c = params[iparam].c; + F_CFLOAT ters_d = params[iparam].d; return params[iparam].gamma * (F_F(1.0) + pow(ters_c / ters_d, F_F(2.0)) - pow(ters_c, F_F(2.0)) / (pow(ters_d, F_F(2.0)) + pow(params[iparam].h - cos_theta, F_F(2.0)))); } -__device__ inline F_FLOAT ters_gijk_d(F_FLOAT costheta, int iparam) +__device__ inline F_CFLOAT ters_gijk_d(F_CFLOAT costheta, int iparam) { - F_FLOAT numerator = -F_F(2.0) * pow(params[iparam].c, F_F(2.0)) * (params[iparam].h - costheta); - F_FLOAT denominator = pow(pow(params[iparam].d, F_F(2.0)) + + F_CFLOAT numerator = -F_F(2.0) * pow(params[iparam].c, F_F(2.0)) * (params[iparam].h - costheta); + F_CFLOAT denominator = pow(pow(params[iparam].d, F_F(2.0)) + pow(params[iparam].h - costheta, F_F(2.0)), F_F(2.0)); return params[iparam].gamma * numerator / denominator; } -__device__ inline F_FLOAT zeta(int iparam, const F_FLOAT rsqij, const F_FLOAT rsqik, - F_FLOAT3 &delij, F_FLOAT3 &delik) +__device__ inline F_CFLOAT zeta(int iparam, const F_CFLOAT rsqij, const F_CFLOAT rsqik, + F_CFLOAT3 &delij, F_CFLOAT3 &delik) { - F_FLOAT rij, rik, costheta, arg, ex_delr; + F_CFLOAT rij, rik, costheta, arg, ex_delr; rij = sqrt(rsqij); rik = sqrt(rsqik); @@ -245,13 +245,13 @@ __device__ inline F_FLOAT zeta(int iparam, const F_FLOAT rsqij, const F_FLOAT rs (params[iparam].c * params[iparam].c) / ((params[iparam].d * params[iparam].d) + (params[iparam].h - costheta) * (params[iparam].h - costheta))); } -__device__ void repulsive(int iparam, F_FLOAT rsq, F_FLOAT &fforce, - int eflag, ENERGY_FLOAT &eng) +__device__ void repulsive(int iparam, F_CFLOAT rsq, F_CFLOAT &fforce, + int eflag, ENERGY_CFLOAT &eng) { - F_FLOAT r, tmp_fc, tmp_fc_d, tmp_exp; + F_CFLOAT r, tmp_fc, tmp_fc_d, tmp_exp; - F_FLOAT ters_R = params[iparam].bigr; - F_FLOAT ters_D = params[iparam].bigd; + F_CFLOAT ters_R = params[iparam].bigr; + F_CFLOAT ters_D = params[iparam].bigd; r = sqrt(rsq); tmp_fc = ters_fc(r, ters_R, ters_D); tmp_fc_d = ters_fc_d(r, ters_R, ters_D); @@ -262,18 +262,18 @@ __device__ void repulsive(int iparam, F_FLOAT rsq, F_FLOAT &fforce, if(eflag) eng += tmp_fc * params[iparam].biga * tmp_exp; } else { - F_FLOAT 
const fforce_ters = params[iparam].biga * tmp_exp * (tmp_fc_d - tmp_fc * params[iparam].lam1); - ENERGY_FLOAT eng_ters = tmp_fc * params[iparam].biga * tmp_exp; + F_CFLOAT const fforce_ters = params[iparam].biga * tmp_exp * (tmp_fc_d - tmp_fc * params[iparam].lam1); + ENERGY_CFLOAT eng_ters = tmp_fc * params[iparam].biga * tmp_exp; - F_FLOAT r_ov_a = r / params[iparam].a_ij; - F_FLOAT phi = F_F(0.1818) * exp(-F_F(3.2) * r_ov_a) + F_F(0.5099) * exp(-F_F(0.9423) * r_ov_a) + + F_CFLOAT r_ov_a = r / params[iparam].a_ij; + F_CFLOAT phi = F_F(0.1818) * exp(-F_F(3.2) * r_ov_a) + F_F(0.5099) * exp(-F_F(0.9423) * r_ov_a) + F_F(0.2802) * exp(-F_F(0.4029) * r_ov_a) + F_F(0.02817) * exp(-F_F(0.2016) * r_ov_a); - F_FLOAT dphi = (F_F(1.0) / params[iparam].a_ij) * (-F_F(3.2) * F_F(0.1818) * exp(-F_F(3.2) * r_ov_a) - + F_CFLOAT dphi = (F_F(1.0) / params[iparam].a_ij) * (-F_F(3.2) * F_F(0.1818) * exp(-F_F(3.2) * r_ov_a) - F_F(0.9423) * F_F(0.5099) * exp(-F_F(0.9423) * r_ov_a) - F_F(0.4029) * F_F(0.2802) * exp(-F_F(0.4029) * r_ov_a) - F_F(0.2016) * F_F(0.02817) * exp(-F_F(0.2016) * r_ov_a)); - F_FLOAT fforce_ZBL = params[iparam].premult / (-r * r) * phi + params[iparam].premult / r * dphi; - ENERGY_FLOAT eng_ZBL = params[iparam].premult * (F_F(1.0) / r) * phi; + F_CFLOAT fforce_ZBL = params[iparam].premult / (-r * r) * phi + params[iparam].premult / r * dphi; + ENERGY_CFLOAT eng_ZBL = params[iparam].premult * (F_F(1.0) / r) * phi; fforce = -(-F_fermi_d(r, iparam) * (eng_ZBL - eng_ters) + fforce_ZBL + F_fermi(r, iparam) * (fforce_ters - fforce_ZBL)) / r; @@ -286,7 +286,7 @@ __device__ void repulsive(int iparam, F_FLOAT rsq, F_FLOAT &fforce, /* ---------------------------------------------------------------------- */ -__device__ inline F_FLOAT ters_fa(F_FLOAT r, int iparam, F_FLOAT ters_R, F_FLOAT ters_D) +__device__ inline F_CFLOAT ters_fa(F_CFLOAT r, int iparam, F_CFLOAT ters_R, F_CFLOAT ters_D) { if(r > ters_R + ters_D) return F_F(0.0); @@ -298,7 +298,7 @@ __device__ inline F_FLOAT ters_fa(F_FLOAT r, int iparam, F_FLOAT ters_R, F_FLOAT /* ---------------------------------------------------------------------- */ -__device__ inline F_FLOAT ters_fa_d(F_FLOAT r, int iparam, F_FLOAT ters_R, F_FLOAT ters_D) +__device__ inline F_CFLOAT ters_fa_d(F_CFLOAT r, int iparam, F_CFLOAT ters_R, F_CFLOAT ters_D) { if(r > ters_R + ters_D) return F_F(0.0); @@ -313,9 +313,9 @@ __device__ inline F_FLOAT ters_fa_d(F_FLOAT r, int iparam, F_FLOAT ters_R, F_FLO /* ---------------------------------------------------------------------- */ -__device__ inline F_FLOAT ters_bij(F_FLOAT zeta, int iparam) +__device__ inline F_CFLOAT ters_bij(F_CFLOAT zeta, int iparam) { - F_FLOAT tmp = params[iparam].beta * zeta; + F_CFLOAT tmp = params[iparam].beta * zeta; if(tmp > params[iparam].c1) return F_F(1.0) / sqrt(tmp); @@ -332,9 +332,9 @@ __device__ inline F_FLOAT ters_bij(F_FLOAT zeta, int iparam) /* ---------------------------------------------------------------------- */ -__device__ inline F_FLOAT ters_bij_d(F_FLOAT zeta, int iparam) +__device__ inline F_CFLOAT ters_bij_d(F_CFLOAT zeta, int iparam) { - F_FLOAT tmp = params[iparam].beta * zeta; + F_CFLOAT tmp = params[iparam].beta * zeta; if(tmp > params[iparam].c1) return params[iparam].beta * -F_F(0.5) * pow(tmp, -F_F(1.5)); @@ -348,17 +348,17 @@ __device__ inline F_FLOAT ters_bij_d(F_FLOAT zeta, int iparam) if(tmp < params[iparam].c3) return -F_F(0.5) * params[iparam].beta * pow(tmp, params[iparam].powern - F_F(1.0)); - F_FLOAT tmp_n = pow(tmp, params[iparam].powern); + F_CFLOAT tmp_n = 
pow(tmp, params[iparam].powern); return -F_F(0.5) * pow(F_F(1.0) + tmp_n, -F_F(1.0) - (F_F(1.0) / (F_F(2.0) * params[iparam].powern))) * tmp_n / zeta; } -__device__ void force_zeta(int iparam, F_FLOAT rsq, F_FLOAT zeta_ij, - F_FLOAT &fforce, F_FLOAT &prefactor, - int eflag, F_FLOAT &eng) +__device__ void force_zeta(int iparam, F_CFLOAT rsq, F_CFLOAT zeta_ij, + F_CFLOAT &fforce, F_CFLOAT &prefactor, + int eflag, F_CFLOAT &eng) { - F_FLOAT r, fa, fa_d, bij; - F_FLOAT ters_R = params[iparam].bigr; - F_FLOAT ters_D = params[iparam].bigd; + F_CFLOAT r, fa, fa_d, bij; + F_CFLOAT ters_R = params[iparam].bigr; + F_CFLOAT ters_D = params[iparam].bigd; r = sqrt(rsq); fa = ters_fa(r, iparam, ters_R, ters_D); fa_d = ters_fa_d(r, iparam, ters_R, ters_D); @@ -369,12 +369,12 @@ __device__ void force_zeta(int iparam, F_FLOAT rsq, F_FLOAT zeta_ij, if(eflag) eng += bij * fa; } -__device__ void force_zeta_prefactor_force(int iparam, F_FLOAT rsq, F_FLOAT zeta_ij, - F_FLOAT &fforce, F_FLOAT &prefactor) +__device__ void force_zeta_prefactor_force(int iparam, F_CFLOAT rsq, F_CFLOAT zeta_ij, + F_CFLOAT &fforce, F_CFLOAT &prefactor) { - F_FLOAT r, fa, fa_d, bij; - F_FLOAT ters_R = params[iparam].bigr; - F_FLOAT ters_D = params[iparam].bigd; + F_CFLOAT r, fa, fa_d, bij; + F_CFLOAT ters_R = params[iparam].bigr; + F_CFLOAT ters_D = params[iparam].bigd; r = sqrt(rsq); fa = ters_fa(r, iparam, ters_R, ters_D); fa_d = ters_fa_d(r, iparam, ters_R, ters_D); @@ -383,23 +383,23 @@ __device__ void force_zeta_prefactor_force(int iparam, F_FLOAT rsq, F_FLOAT zeta prefactor = -F_F(0.5) * fa * ters_bij_d(zeta_ij, iparam); } -__device__ void force_zeta_prefactor(int iparam, F_FLOAT rsq, F_FLOAT zeta_ij, - F_FLOAT &prefactor) +__device__ void force_zeta_prefactor(int iparam, F_CFLOAT rsq, F_CFLOAT zeta_ij, + F_CFLOAT &prefactor) { - F_FLOAT r, fa; + F_CFLOAT r, fa; r = sqrt(rsq); fa = ters_fa(r, iparam, params[iparam].bigr, params[iparam].bigd); prefactor = -F_F(0.5) * fa * ters_bij_d(zeta_ij, iparam); } -__device__ void costheta_d(F_FLOAT3 &rij_hat, F_FLOAT &rij, - F_FLOAT3 &rik_hat, F_FLOAT &rik, - F_FLOAT3 &dri, F_FLOAT3 &drj, F_FLOAT3 &drk) +__device__ void costheta_d(F_CFLOAT3 &rij_hat, F_CFLOAT &rij, + F_CFLOAT3 &rik_hat, F_CFLOAT &rik, + F_CFLOAT3 &dri, F_CFLOAT3 &drj, F_CFLOAT3 &drk) { // first element is derivative wrt Ri, second wrt Rj, third wrt Rk - F_FLOAT cos_theta = vec3_dot(rij_hat, rik_hat); + F_CFLOAT cos_theta = vec3_dot(rij_hat, rik_hat); vec3_scaleadd(-cos_theta, rij_hat, rik_hat, drj); vec3_scale(F_F(1.0) / rij, drj, drj); @@ -409,14 +409,14 @@ __device__ void costheta_d(F_FLOAT3 &rij_hat, F_FLOAT &rij, vec3_scale(-F_F(1.0), dri, dri); } -__device__ void ters_zetaterm_d(F_FLOAT prefactor, - F_FLOAT3 &rij_hat, F_FLOAT rij, - F_FLOAT3 &rik_hat, F_FLOAT rik, - F_FLOAT3 &dri, F_FLOAT3 &drj, F_FLOAT3 &drk, +__device__ void ters_zetaterm_d(F_CFLOAT prefactor, + F_CFLOAT3 &rij_hat, F_CFLOAT rij, + F_CFLOAT3 &rik_hat, F_CFLOAT rik, + F_CFLOAT3 &dri, F_CFLOAT3 &drj, F_CFLOAT3 &drk, int iparam) { - F_FLOAT ex_delr, ex_delr_d, tmp; - F_FLOAT3 dcosdri, dcosdrj, dcosdrk; + F_CFLOAT ex_delr, ex_delr_d, tmp; + F_CFLOAT3 dcosdri, dcosdrj, dcosdrk; if(params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik)); else tmp = params[iparam].lam3 * (rij - rik); @@ -430,20 +430,20 @@ __device__ void ters_zetaterm_d(F_FLOAT prefactor, else ex_delr_d = params[iparam].lam3 * ex_delr; - const F_FLOAT cos_theta = vec3_dot(rij_hat, rik_hat); + const 
F_CFLOAT cos_theta = vec3_dot(rij_hat, rik_hat); costheta_d(rij_hat, rij, rik_hat, rik, dcosdri, dcosdrj, dcosdrk); - const F_FLOAT gijk = params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d) - + const F_CFLOAT gijk = params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d) - (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d + (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta))); - const F_FLOAT numerator = -F_F(2.0) * params[iparam].c * params[iparam].c * (params[iparam].h - cos_theta); - const F_FLOAT denominator = (params[iparam].d * params[iparam].d) + + const F_CFLOAT numerator = -F_F(2.0) * params[iparam].c * params[iparam].c * (params[iparam].h - cos_theta); + const F_CFLOAT denominator = (params[iparam].d * params[iparam].d) + (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta); - const F_FLOAT gijk_d = params[iparam].gamma * numerator / (denominator * denominator); // compute the derivative wrt Ri + const F_CFLOAT gijk_d = params[iparam].gamma * numerator / (denominator * denominator); // compute the derivative wrt Ri // dri = -dfc*gijk*ex_delr*rik_hat; // dri += fc*gijk_d*ex_delr*dcosdri; // dri += fc*gijk*ex_delr_d*(rik_hat - rij_hat); - const F_FLOAT fc = ters_fc(rik, params[iparam].bigr, params[iparam].bigd); - const F_FLOAT dfc = ters_fc_d(rik, params[iparam].bigr, params[iparam].bigd); + const F_CFLOAT fc = ters_fc(rik, params[iparam].bigr, params[iparam].bigd); + const F_CFLOAT dfc = ters_fc_d(rik, params[iparam].bigr, params[iparam].bigd); vec3_scale(-dfc * gijk * ex_delr, rik_hat, dri); @@ -470,12 +470,12 @@ __device__ void ters_zetaterm_d(F_FLOAT prefactor, vec3_scale(prefactor, drk, drk); } -__device__ void ters_zetaterm_d_fi(F_FLOAT &prefactor, - F_FLOAT3 &rij_hat, F_FLOAT &rij, - F_FLOAT3 &rik_hat, F_FLOAT &rik, - F_FLOAT3 &dri, int &iparam) +__device__ void ters_zetaterm_d_fi(F_CFLOAT &prefactor, + F_CFLOAT3 &rij_hat, F_CFLOAT &rij, + F_CFLOAT3 &rik_hat, F_CFLOAT &rik, + F_CFLOAT3 &dri, int &iparam) { - F_FLOAT ex_delr, ex_delr_d, tmp; + F_CFLOAT ex_delr, ex_delr_d, tmp; if(params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik)); else tmp = params[iparam].lam3 * (rij - rik); @@ -488,11 +488,11 @@ __device__ void ters_zetaterm_d_fi(F_FLOAT &prefactor, ex_delr_d = F_F(3.0) * (params[iparam].lam3 * params[iparam].lam3 * params[iparam].lam3) * (rij - rik) * (rij - rik) * ex_delr; else ex_delr_d = params[iparam].lam3 * ex_delr; - const F_FLOAT cos_theta = vec3_dot(rij_hat, rik_hat); + const F_CFLOAT cos_theta = vec3_dot(rij_hat, rik_hat); //costheta_d(rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); - F_FLOAT3 dcosdri; + F_CFLOAT3 dcosdri; vec3_scaleadd(-cos_theta, rij_hat, rik_hat, dri); vec3_scale(F_F(1.0) / rij, dri, dri); vec3_scaleadd(-cos_theta, rik_hat, rij_hat, dcosdri); @@ -500,15 +500,15 @@ __device__ void ters_zetaterm_d_fi(F_FLOAT &prefactor, vec3_add(dri, dcosdri, dcosdri); vec3_scale(-F_F(1.0), dcosdri, dcosdri); - const F_FLOAT gijk = params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d) - + const F_CFLOAT gijk = params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d) - (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d + (params[iparam].h - cos_theta) * 
(params[iparam].h - cos_theta))); - const F_FLOAT numerator = -F_F(2.0) * params[iparam].c * params[iparam].c * (params[iparam].h - cos_theta); - const F_FLOAT denominator = (params[iparam].d * params[iparam].d) + + const F_CFLOAT numerator = -F_F(2.0) * params[iparam].c * params[iparam].c * (params[iparam].h - cos_theta); + const F_CFLOAT denominator = (params[iparam].d * params[iparam].d) + (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta); - const F_FLOAT gijk_d = params[iparam].gamma * numerator / (denominator * denominator); // compute the derivative wrt Ri + const F_CFLOAT gijk_d = params[iparam].gamma * numerator / (denominator * denominator); // compute the derivative wrt Ri // - const F_FLOAT fc = ters_fc(rik, params[iparam].bigr, params[iparam].bigd); - const F_FLOAT dfc = ters_fc_d(rik, params[iparam].bigr, params[iparam].bigd); + const F_CFLOAT fc = ters_fc(rik, params[iparam].bigr, params[iparam].bigd); + const F_CFLOAT dfc = ters_fc_d(rik, params[iparam].bigr, params[iparam].bigd); vec3_scale(-dfc * gijk * ex_delr, rik_hat, dri); vec3_scaleadd(fc * gijk_d * ex_delr, dcosdri, dri, dri); @@ -518,12 +518,12 @@ __device__ void ters_zetaterm_d_fi(F_FLOAT &prefactor, } -__device__ void ters_zetaterm_d_fj(F_FLOAT &prefactor, - F_FLOAT3 &rij_hat, F_FLOAT &rij, - F_FLOAT3 &rik_hat, F_FLOAT &rik, - F_FLOAT3 &drj, int &iparam) +__device__ void ters_zetaterm_d_fj(F_CFLOAT &prefactor, + F_CFLOAT3 &rij_hat, F_CFLOAT &rij, + F_CFLOAT3 &rik_hat, F_CFLOAT &rik, + F_CFLOAT3 &drj, int &iparam) { - F_FLOAT ex_delr, ex_delr_d, tmp; + F_CFLOAT ex_delr, ex_delr_d, tmp; if(params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik)); else tmp = params[iparam].lam3 * (rij - rik); @@ -536,30 +536,30 @@ __device__ void ters_zetaterm_d_fj(F_FLOAT &prefactor, ex_delr_d = F_F(3.0) * (params[iparam].lam3 * params[iparam].lam3 * params[iparam].lam3) * (rij - rik) * (rij - rik) * ex_delr; else ex_delr_d = params[iparam].lam3 * ex_delr; - const F_FLOAT cos_theta = vec3_dot(rij_hat, rik_hat); + const F_CFLOAT cos_theta = vec3_dot(rij_hat, rik_hat); vec3_scaleadd(-cos_theta, rij_hat, rik_hat, drj); vec3_scale(F_F(1.0) / rij, drj, drj); - const F_FLOAT gijk = params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d) - + const F_CFLOAT gijk = params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d) - (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d + (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta))); - const F_FLOAT numerator = -F_F(2.0) * params[iparam].c * params[iparam].c * (params[iparam].h - cos_theta); - const F_FLOAT denominator = (params[iparam].d * params[iparam].d) + + const F_CFLOAT numerator = -F_F(2.0) * params[iparam].c * params[iparam].c * (params[iparam].h - cos_theta); + const F_CFLOAT denominator = (params[iparam].d * params[iparam].d) + (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta); - const F_FLOAT gijk_d = params[iparam].gamma * numerator / (denominator * denominator); // compute the derivative wrt Ri + const F_CFLOAT gijk_d = params[iparam].gamma * numerator / (denominator * denominator); // compute the derivative wrt Ri - const F_FLOAT fc = ters_fc(rik, params[iparam].bigr, params[iparam].bigd); + const F_CFLOAT fc = ters_fc(rik, params[iparam].bigr, params[iparam].bigd); vec3_scale(fc * gijk_d * ex_delr, drj, drj); 
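Every ters_zetaterm_d* variant in these hunks inlines the same angular term gijk = gamma * (1 + c^2/d^2 - c^2/(d^2 + (h - cos_theta)^2)) and its derivative gijk_d with respect to cos_theta. A standalone central-difference check of that derivative, with Si-like Tersoff constants assumed purely for illustration:

#include <cmath>
#include <cstdio>

int main() {
  const double gamma = 1.0, c = 100390.0, d = 16.217, h = -0.59825;
  auto g = [&](double cs) {
    return gamma * (1.0 + (c * c) / (d * d)
                    - (c * c) / ((d * d) + (h - cs) * (h - cs)));
  };
  auto g_d = [&](double cs) {   // same form as the inlined gijk_d
    const double numerator = -2.0 * c * c * (h - cs);
    const double denominator = (d * d) + (h - cs) * (h - cs);
    return gamma * numerator / (denominator * denominator);
  };
  const double cs = 0.3, eps = 1e-6;
  std::printf("analytic %.10g vs central difference %.10g\n",
              g_d(cs), (g(cs + eps) - g(cs - eps)) / (2.0 * eps));
  return 0;
}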
vec3_scaleadd(fc * gijk * ex_delr_d, rij_hat, drj, drj); vec3_scale(prefactor, drj, drj); } -__device__ void ters_zetaterm_d_fk(F_FLOAT &prefactor, - F_FLOAT3 &rij_hat, F_FLOAT &rij, - F_FLOAT3 &rik_hat, F_FLOAT &rik, - F_FLOAT3 &drk, int &iparam) +__device__ void ters_zetaterm_d_fk(F_CFLOAT &prefactor, + F_CFLOAT3 &rij_hat, F_CFLOAT &rij, + F_CFLOAT3 &rik_hat, F_CFLOAT &rik, + F_CFLOAT3 &drk, int &iparam) { - F_FLOAT ex_delr, ex_delr_d, tmp; + F_CFLOAT ex_delr, ex_delr_d, tmp; if(params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik)); else tmp = params[iparam].lam3 * (rij - rik); @@ -572,19 +572,19 @@ __device__ void ters_zetaterm_d_fk(F_FLOAT &prefactor, ex_delr_d = F_F(3.0) * (params[iparam].lam3 * params[iparam].lam3 * params[iparam].lam3) * (rij - rik) * (rij - rik) * ex_delr; else ex_delr_d = params[iparam].lam3 * ex_delr; - const F_FLOAT cos_theta = vec3_dot(rij_hat, rik_hat); + const F_CFLOAT cos_theta = vec3_dot(rij_hat, rik_hat); vec3_scaleadd(-cos_theta, rik_hat, rij_hat, drk); vec3_scale(F_F(1.0) / rik, drk, drk); - const F_FLOAT gijk = params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d) - + const F_CFLOAT gijk = params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d) - (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d + (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta))); - const F_FLOAT numerator = -F_F(2.0) * params[iparam].c * params[iparam].c * (params[iparam].h - cos_theta); - const F_FLOAT denominator = (params[iparam].d * params[iparam].d) + + const F_CFLOAT numerator = -F_F(2.0) * params[iparam].c * params[iparam].c * (params[iparam].h - cos_theta); + const F_CFLOAT denominator = (params[iparam].d * params[iparam].d) + (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta); - const F_FLOAT gijk_d = params[iparam].gamma * numerator / (denominator * denominator); // compute the derivative wrt Ri + const F_CFLOAT gijk_d = params[iparam].gamma * numerator / (denominator * denominator); // compute the derivative wrt Ri - const F_FLOAT fc = ters_fc(rik, params[iparam].bigr, params[iparam].bigd); - const F_FLOAT dfc = ters_fc_d(rik, params[iparam].bigr, params[iparam].bigd); + const F_CFLOAT fc = ters_fc(rik, params[iparam].bigr, params[iparam].bigd); + const F_CFLOAT dfc = ters_fc_d(rik, params[iparam].bigr, params[iparam].bigd); vec3_scale(fc * gijk_d * ex_delr, drk, drk); vec3_scaleadd(dfc * gijk * ex_delr, rik_hat, drk, drk); @@ -592,13 +592,13 @@ __device__ void ters_zetaterm_d_fk(F_FLOAT &prefactor, vec3_scale(prefactor, drk, drk); } -__device__ void attractive(int iparam, F_FLOAT prefactor, - F_FLOAT4 &delij, - F_FLOAT4 &delik, - F_FLOAT3 &fi, F_FLOAT3 &fj, F_FLOAT3 &fk) +__device__ void attractive(int iparam, F_CFLOAT prefactor, + F_CFLOAT4 &delij, + F_CFLOAT4 &delik, + F_CFLOAT3 &fi, F_CFLOAT3 &fj, F_CFLOAT3 &fk) { - F_FLOAT3 rij_hat, rik_hat; - F_FLOAT rij, rijinv, rik, rikinv; + F_CFLOAT3 rij_hat, rik_hat; + F_CFLOAT rij, rijinv, rik, rikinv; rij = sqrt(delij.w); rijinv = F_F(1.0) / rij; @@ -611,13 +611,13 @@ __device__ void attractive(int iparam, F_FLOAT prefactor, ters_zetaterm_d(prefactor, rij_hat, rij, rik_hat, rik, fi, fj, fk, iparam); } -__device__ void attractive_fi(int &iparam, F_FLOAT &prefactor, - F_FLOAT4 &delij, - F_FLOAT4 &delik, - F_FLOAT3 &f) +__device__ void attractive_fi(int &iparam, F_CFLOAT 
&prefactor, + F_CFLOAT4 &delij, + F_CFLOAT4 &delik, + F_CFLOAT3 &f) { - F_FLOAT3 rij_hat, rik_hat; - F_FLOAT rij, rijinv, rik, rikinv; + F_CFLOAT3 rij_hat, rik_hat; + F_CFLOAT rij, rijinv, rik, rikinv; rij = sqrt(delij.w); rijinv = F_F(1.0) / rij; @@ -630,13 +630,13 @@ __device__ void attractive_fi(int &iparam, F_FLOAT &prefactor, ters_zetaterm_d_fi(prefactor, rij_hat, rij, rik_hat, rik, f, iparam); } -__device__ void attractive_fj(int iparam, F_FLOAT prefactor, - F_FLOAT4 &delij, - F_FLOAT4 &delik, - F_FLOAT3 &f) +__device__ void attractive_fj(int iparam, F_CFLOAT prefactor, + F_CFLOAT4 &delij, + F_CFLOAT4 &delik, + F_CFLOAT3 &f) { - F_FLOAT3 rij_hat, rik_hat; - F_FLOAT rij, rijinv, rik, rikinv; + F_CFLOAT3 rij_hat, rik_hat; + F_CFLOAT rij, rijinv, rik, rikinv; rij = sqrt(delij.w); rijinv = F_F(1.0) / rij; @@ -649,13 +649,13 @@ __device__ void attractive_fj(int iparam, F_FLOAT prefactor, ters_zetaterm_d_fj(prefactor, rij_hat, rij, rik_hat, rik, f, iparam); } -__device__ void attractive_fk(int iparam, F_FLOAT prefactor, - F_FLOAT4 &delij, - F_FLOAT4 &delik, - F_FLOAT3 &f) +__device__ void attractive_fk(int iparam, F_CFLOAT prefactor, + F_CFLOAT4 &delij, + F_CFLOAT4 &delik, + F_CFLOAT3 &f) { - F_FLOAT3 rij_hat, rik_hat; - F_FLOAT rij, rijinv, rik, rikinv; + F_CFLOAT3 rij_hat, rik_hat; + F_CFLOAT rij, rijinv, rik, rikinv; rij = sqrt(delij.w); rijinv = F_F(1.0) / rij; @@ -668,15 +668,15 @@ __device__ void attractive_fk(int iparam, F_FLOAT prefactor, ters_zetaterm_d_fk(prefactor, rij_hat, rij, rik_hat, rik, f, iparam); } -__global__ void Pair_Tersoff_Kernel_TpA_RIJ()//F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) +__global__ void Pair_Tersoff_Kernel_TpA_RIJ()//F_CFLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) { int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if(ii >= _nall) return; - X_FLOAT4 myxtype; - F_FLOAT4 delij; - F_FLOAT xtmp, ytmp, ztmp; + X_CFLOAT4 myxtype; + F_CFLOAT4 delij; + F_CFLOAT xtmp, ytmp, ztmp; int itype, jnum, i, j; int* jlist; int neigh_red = 0; @@ -719,7 +719,7 @@ __global__ void Pair_Tersoff_Kernel_TpA_RIJ()//F_FLOAT4* _glob_r_ij,int* _glob_n } -__global__ void Pair_Tersoff_Kernel_TpA_ZetaIJ()//F_FLOAT* _glob_zeta_ij,F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) +__global__ void Pair_Tersoff_Kernel_TpA_ZetaIJ()//F_CFLOAT* _glob_zeta_ij,F_CFLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) { int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; @@ -727,8 +727,8 @@ __global__ void Pair_Tersoff_Kernel_TpA_ZetaIJ()//F_FLOAT* _glob_zeta_ij,F_FLOAT if(ii >= _nall) return; - F_FLOAT4 delij; - F_FLOAT4 delik; + F_CFLOAT4 delij; + F_CFLOAT4 delik; int itype, jnum, i, j; int* jlist; @@ -751,8 +751,8 @@ __global__ void Pair_Tersoff_Kernel_TpA_ZetaIJ()//F_FLOAT* _glob_zeta_ij,F_FLOAT int iparam_ij = elem2param[(itype * nelements + jtype) * nelements + jtype]; if(delij.w < params[iparam_ij].cutsq) { - F_FLOAT zeta_ij = 0.0; - F_FLOAT3 delij3 = {delij.x, delij.y, delij.z}; + F_CFLOAT zeta_ij = 0.0; + F_CFLOAT3 delij3 = {delij.x, delij.y, delij.z}; for(int kk = 0; kk < jnum; kk++) { if(jj == kk) continue; @@ -762,9 +762,9 @@ __global__ void Pair_Tersoff_Kernel_TpA_ZetaIJ()//F_FLOAT* _glob_zeta_ij,F_FLOAT int ktype = _glob_neightype_red[i + kk * _nall]; delik = _glob_r_ij[i + kk * _nall]; - F_FLOAT3 delik3 = {delik.x, delik.y, delik.z}; + F_CFLOAT3 
delik3 = {delik.x, delik.y, delik.z}; int iparam_ijk = elem2param[(itype * nelements + jtype) * nelements + ktype]; - const F_FLOAT rsqki = delik.w; + const F_CFLOAT rsqki = delik.w; if(rsqki <= params[iparam_ijk].cutsq) zeta_ij += zeta(iparam_ijk, delij.w, rsqki, delij3, delik3); @@ -783,18 +783,18 @@ __global__ void Pair_Tersoff_Kernel_TpA_ZetaIJ()//F_FLOAT* _glob_zeta_ij,F_FLOAT //back15: num 12 steps 10: ZetaIJ/TPA 0.0137/0.0287 //pow eliminated // num 12 steps 10: ZetaIJ/TPA 0.0137/0.027 template -__global__ void Pair_Tersoff_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_FLOAT* _glob_zeta_ij,F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) +__global__ void Pair_Tersoff_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_CFLOAT* _glob_zeta_ij,F_CFLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) { - ENERGY_FLOAT evdwl = ENERGY_F(0.0); + ENERGY_CFLOAT evdwl = ENERGY_F(0.0); - ENERGY_FLOAT* sharedE = &sharedmem[threadIdx.x]; - ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; + ENERGY_CFLOAT* sharedE = &sharedmem[threadIdx.x]; + ENERGY_CFLOAT* sharedV = &sharedmem[threadIdx.x]; - F_FLOAT* shared_F_F = (F_FLOAT*) sharedmem; + F_CFLOAT* shared_F_F = (F_CFLOAT*) sharedmem; - if((eflag || eflag_atom) && (vflagm || vflag_atom)) shared_F_F = (F_FLOAT*) &sharedmem[7 * blockDim.x]; - else if(eflag) shared_F_F = (F_FLOAT*) &sharedmem[blockDim.x]; - else if(vflagm) shared_F_F = (F_FLOAT*) &sharedmem[6 * blockDim.x]; + if((eflag || eflag_atom) && (vflagm || vflag_atom)) shared_F_F = (F_CFLOAT*) &sharedmem[7 * blockDim.x]; + else if(eflag) shared_F_F = (F_CFLOAT*) &sharedmem[blockDim.x]; + else if(vflagm) shared_F_F = (F_CFLOAT*) &sharedmem[6 * blockDim.x]; shared_F_F += threadIdx.x; @@ -820,10 +820,10 @@ __global__ void Pair_Tersoff_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_FLO int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - X_FLOAT4 myxtype_i, myxtype_j, myxtype_k; - F_FLOAT4 delij, delik, deljk; - F_FLOAT fpair; - F_FLOAT prefactor_ij, prefactor_ji; + X_CFLOAT4 myxtype_i, myxtype_j, myxtype_k; + F_CFLOAT4 delij, delik, deljk; + F_CFLOAT fpair; + F_CFLOAT prefactor_ij, prefactor_ji; int itype, i, j; int* jlist_red; @@ -868,7 +868,7 @@ __global__ void Pair_Tersoff_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_FLO volatile int iparam_ji = elem2param[(jtype * nelements + itype) * nelements + itype]; if(delij.w < params[iparam_ij].cutsq) { - F_FLOAT dxfp, dyfp, dzfp; + F_CFLOAT dxfp, dyfp, dzfp; repulsive(iparam_ij, delij.w, fpair, eflag, evdwl); fxtmp += dxfp = delij.x * fpair; fytmp += dyfp = delij.y * fpair; @@ -941,7 +941,7 @@ __global__ void Pair_Tersoff_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_FLO if(delik.w <= params[iparam_ijk].cutsq) { if(vflagm) { - F_FLOAT3 fi, fj, fk; + F_CFLOAT3 fi, fj, fk; attractive(iparam_ijk, prefactor_ij, delij, delik, fi, fj, fk); fxtmp += fi.x; @@ -969,7 +969,7 @@ __global__ void Pair_Tersoff_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_FLO sharedV[4 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.x * fk.z; sharedV[5 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.y * fk.z; } else { - F_FLOAT3 fi; //local variable + F_CFLOAT3 fi; //local variable attractive_fi(iparam_ijk, prefactor_ij, delij, delik, fi); fxtmp += fi.x; @@ -1008,7 +1008,7 @@ __global__ void Pair_Tersoff_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_FLO vec3_scale(F_F(-1.0), delij, delij); if(deljk.w <= params[iparam_jik].cutsq) { - F_FLOAT3 
ftmp; //local variable attractive_fj(iparam_jik, prefactor_ji, delij, deljk, ftmp); @@ -1016,7 +1016,7 @@ __global__ void Pair_Tersoff_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_FLO fytmp += ftmp.y; fztmp += ftmp.z; int iparam_jk = elem2param[(jtype * nelements + ktype) * nelements + ktype]; - F_FLOAT prefactor_jk; + F_CFLOAT prefactor_jk; force_zeta_prefactor(iparam_jk, deljk.w, _glob_zeta_ij[j + kk * _nall], prefactor_jk); attractive_fk(iparam_jki, prefactor_jk, @@ -1037,10 +1037,10 @@ __global__ void Pair_Tersoff_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_FLO __syncthreads(); if(ii < _inum) { - F_FLOAT* my_f; + F_CFLOAT* my_f; if(_collect_forces_later) { - ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer; if(eflag) { buffer = &buffer[1 * gridDim.x * gridDim.y]; @@ -1050,7 +1050,7 @@ __global__ void Pair_Tersoff_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_FLO buffer = &buffer[6 * gridDim.x * gridDim.y]; } - my_f = (F_FLOAT*) buffer; + my_f = (F_CFLOAT*) buffer; my_f += i; *my_f = fxtmp; my_f += _nmax; diff --git a/lib/cuda/pppm_cuda.cu b/lib/cuda/pppm_cuda.cu index 19c2a23a68..367b080f04 100644 --- a/lib/cuda/pppm_cuda.cu +++ b/lib/cuda/pppm_cuda.cu @@ -34,25 +34,25 @@ #define MIN(a,b) ((a) < (b) ? (a) : (b)) #define MAX(a,b) ((a) > (b) ? (a) : (b)) -__device__ __constant__ FFT_FLOAT* work1; -__device__ __constant__ FFT_FLOAT* work2; -__device__ __constant__ FFT_FLOAT* work3; -__device__ __constant__ PPPM_FLOAT* greensfn; -__device__ __constant__ PPPM_FLOAT* gf_b; -__device__ __constant__ PPPM_FLOAT* fkx; -__device__ __constant__ PPPM_FLOAT* fky; -__device__ __constant__ PPPM_FLOAT* fkz; -__device__ __constant__ PPPM_FLOAT* vg; +__device__ __constant__ FFT_CFLOAT* work1; +__device__ __constant__ FFT_CFLOAT* work2; +__device__ __constant__ FFT_CFLOAT* work3; +__device__ __constant__ PPPM_CFLOAT* greensfn; +__device__ __constant__ PPPM_CFLOAT* gf_b; +__device__ __constant__ PPPM_CFLOAT* fkx; +__device__ __constant__ PPPM_CFLOAT* fky; +__device__ __constant__ PPPM_CFLOAT* fkz; +__device__ __constant__ PPPM_CFLOAT* vg; __device__ __constant__ int* part2grid; -__device__ __constant__ PPPM_FLOAT* density_brick; +__device__ __constant__ PPPM_CFLOAT* density_brick; __device__ __constant__ int* density_brick_int; -__device__ __constant__ PPPM_FLOAT density_intScale; -__device__ __constant__ PPPM_FLOAT* vdx_brick; -__device__ __constant__ PPPM_FLOAT* vdy_brick; -__device__ __constant__ PPPM_FLOAT* vdz_brick; -__device__ __constant__ PPPM_FLOAT* density_fft; -__device__ __constant__ ENERGY_FLOAT* energy; -__device__ __constant__ ENERGY_FLOAT* virial; +__device__ __constant__ PPPM_CFLOAT density_intScale; +__device__ __constant__ PPPM_CFLOAT* vdx_brick; +__device__ __constant__ PPPM_CFLOAT* vdy_brick; +__device__ __constant__ PPPM_CFLOAT* vdz_brick; +__device__ __constant__ PPPM_CFLOAT* density_fft; +__device__ __constant__ ENERGY_CFLOAT* energy; +__device__ __constant__ ENERGY_CFLOAT* virial; __device__ __constant__ int nxlo_in; __device__ __constant__ int nxhi_in; __device__ __constant__ int nxlo_out; @@ -75,19 +75,19 @@ __device__ __constant__ int nx_pppm; __device__ __constant__ int ny_pppm; __device__ __constant__ int nz_pppm; __device__ __constant__ int slabflag; -__device__ __constant__ PPPM_FLOAT qqrd2e; +__device__ __constant__ PPPM_CFLOAT qqrd2e; __device__ __constant__ int order; //__device__ __constant__ float3 sublo; -__device__ __constant__ PPPM_FLOAT* rho_coeff; +__device__ __constant__ PPPM_CFLOAT* rho_coeff; __device__ 
__constant__ int nmax; __device__ __constant__ int nlocal; -__device__ __constant__ PPPM_FLOAT* debugdata; -__device__ __constant__ PPPM_FLOAT delxinv; -__device__ __constant__ PPPM_FLOAT delyinv; -__device__ __constant__ PPPM_FLOAT delzinv; +__device__ __constant__ PPPM_CFLOAT* debugdata; +__device__ __constant__ PPPM_CFLOAT delxinv; +__device__ __constant__ PPPM_CFLOAT delyinv; +__device__ __constant__ PPPM_CFLOAT delzinv; __device__ __constant__ int nlower; __device__ __constant__ int nupper; -__device__ __constant__ PPPM_FLOAT shiftone; +__device__ __constant__ PPPM_CFLOAT shiftone; #include "pppm_cuda_kernel.cu" @@ -100,14 +100,14 @@ void pppm_device_init(void* cu_density_brick, void* cu_vdx_brick, void* cu_vdy_b ) { CUT_CHECK_ERROR("ERROR-CUDA poisson_init Start"); - cudaMemcpyToSymbol(density_brick, &cu_density_brick, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(density_brick_int, &cu_density_brick_int, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(vdx_brick, &cu_vdx_brick, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(vdy_brick, &cu_vdy_brick, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(vdz_brick, &cu_vdz_brick, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(density_fft, &cu_density_fft, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(energy, &cu_energy, sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol(virial, &cu_virial, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(density_brick, &cu_density_brick, sizeof(PPPM_CFLOAT*)); + cudaMemcpyToSymbol(density_brick_int, &cu_density_brick_int, sizeof(PPPM_CFLOAT*)); + cudaMemcpyToSymbol(vdx_brick, &cu_vdx_brick, sizeof(PPPM_CFLOAT*)); + cudaMemcpyToSymbol(vdy_brick, &cu_vdy_brick, sizeof(PPPM_CFLOAT*)); + cudaMemcpyToSymbol(vdz_brick, &cu_vdz_brick, sizeof(PPPM_CFLOAT*)); + cudaMemcpyToSymbol(density_fft, &cu_density_fft, sizeof(PPPM_CFLOAT*)); + cudaMemcpyToSymbol(energy, &cu_energy, sizeof(ENERGY_CFLOAT*)); + cudaMemcpyToSymbol(virial, &cu_virial, sizeof(ENERGY_CFLOAT*)); cudaMemcpyToSymbol(nxlo_in, &cu_nxlo_in, sizeof(int)); cudaMemcpyToSymbol(nxhi_in, &cu_nxhi_in, sizeof(int)); cudaMemcpyToSymbol(nxlo_out, &cu_nxlo_out, sizeof(int)); @@ -130,68 +130,68 @@ void pppm_device_init(void* cu_density_brick, void* cu_vdx_brick, void* cu_vdy_b cudaMemcpyToSymbol(nx_pppm, &cu_nx_pppm, sizeof(int)); cudaMemcpyToSymbol(ny_pppm, &cu_ny_pppm, sizeof(int)); cudaMemcpyToSymbol(nz_pppm, &cu_nz_pppm, sizeof(int)); - cudaMemcpyToSymbol(work1, &cu_work1, sizeof(FFT_FLOAT*)); - cudaMemcpyToSymbol(work2, &cu_work2, sizeof(FFT_FLOAT*)); - cudaMemcpyToSymbol(work3, &cu_work3, sizeof(FFT_FLOAT*)); - cudaMemcpyToSymbol(greensfn, &cu_greensfn, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(gf_b, &cu_gf_b, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(fkx, &cu_fkx, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(fky, &cu_fky, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(fkz, &cu_fkz, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(vg, &cu_vg, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(work1, &cu_work1, sizeof(FFT_CFLOAT*)); + cudaMemcpyToSymbol(work2, &cu_work2, sizeof(FFT_CFLOAT*)); + cudaMemcpyToSymbol(work3, &cu_work3, sizeof(FFT_CFLOAT*)); + cudaMemcpyToSymbol(greensfn, &cu_greensfn, sizeof(PPPM_CFLOAT*)); + cudaMemcpyToSymbol(gf_b, &cu_gf_b, sizeof(PPPM_CFLOAT*)); + cudaMemcpyToSymbol(fkx, &cu_fkx, sizeof(PPPM_CFLOAT*)); + cudaMemcpyToSymbol(fky, &cu_fky, sizeof(PPPM_CFLOAT*)); + cudaMemcpyToSymbol(fkz, &cu_fkz, sizeof(PPPM_CFLOAT*)); + cudaMemcpyToSymbol(vg, &cu_vg, sizeof(PPPM_CFLOAT*)); - PPPM_FLOAT cu_qqrd2e_a = cu_qqrd2e; - cudaMemcpyToSymbol(qqrd2e, &cu_qqrd2e_a, sizeof(PPPM_FLOAT)); + 
PPPM_CFLOAT cu_qqrd2e_a = cu_qqrd2e; + cudaMemcpyToSymbol(qqrd2e, &cu_qqrd2e_a, sizeof(PPPM_CFLOAT)); cudaMemcpyToSymbol(order, &cu_order, sizeof(int)); - cudaMemcpyToSymbol(rho_coeff, &cu_rho_coeff, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol(debugdata, &cu_debugdata, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(rho_coeff, &cu_rho_coeff, sizeof(PPPM_CFLOAT*)); + cudaMemcpyToSymbol(debugdata, &cu_debugdata, sizeof(PPPM_CFLOAT*)); CUT_CHECK_ERROR("ERROR-CUDA poisson_init"); - /*if(sizeof(CUDA_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision\n"); + /*if(sizeof(CUDA_CFLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision\n"); #ifdef PPPM_PRECISION - if(sizeof(PPPM_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for pppm core\n"); - if(sizeof(PPPM_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for pppm core\n"); + if(sizeof(PPPM_CFLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for pppm core\n"); + if(sizeof(PPPM_CFLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for pppm core\n"); #endif #ifdef ENERGY_PRECISION - if(sizeof(ENERGY_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for energy\n"); - if(sizeof(ENERGY_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for energy\n"); + if(sizeof(ENERGY_CFLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for energy\n"); + if(sizeof(ENERGY_CFLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for energy\n"); #endif #ifdef ENERGY_PRECISION - if(sizeof(FFT_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for fft\n"); - if(sizeof(FFT_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for fft\n"); + if(sizeof(FFT_CFLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for fft\n"); + if(sizeof(FFT_CFLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for fft\n"); #endif #ifdef X_PRECISION - if(sizeof(X_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for positions\n"); - if(sizeof(X_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for positions\n"); + if(sizeof(X_CFLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for positions\n"); + if(sizeof(X_CFLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for positions\n"); #endif #ifdef F_PRECISION - if(sizeof(F_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for forces\n"); - if(sizeof(F_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for forces\n"); + if(sizeof(F_CFLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for forces\n"); + if(sizeof(F_CFLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for forces\n"); #endif*/ } -void pppm_device_init_setup(cuda_shared_data* sdata, PPPM_FLOAT cu_shiftone, PPPM_FLOAT cu_delxinv, PPPM_FLOAT cu_delyinv, PPPM_FLOAT cu_delzinv, int cu_nlower, int cu_nupper) +void pppm_device_init_setup(cuda_shared_data* sdata, PPPM_CFLOAT cu_shiftone, PPPM_CFLOAT cu_delxinv, PPPM_CFLOAT cu_delyinv, PPPM_CFLOAT cu_delzinv, int cu_nlower, int cu_nupper) { - cudaMemcpyToSymbol(delxinv, &cu_delxinv, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol(delyinv, &cu_delyinv, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol(delzinv, &cu_delzinv, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol(shiftone, &cu_shiftone, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol(delxinv, 
&cu_delxinv, sizeof(PPPM_CFLOAT)); + cudaMemcpyToSymbol(delyinv, &cu_delyinv, sizeof(PPPM_CFLOAT)); + cudaMemcpyToSymbol(delzinv, &cu_delzinv, sizeof(PPPM_CFLOAT)); + cudaMemcpyToSymbol(shiftone, &cu_shiftone, sizeof(PPPM_CFLOAT)); cudaMemcpyToSymbol(nlower, &cu_nlower, sizeof(int)); cudaMemcpyToSymbol(nupper, &cu_nupper, sizeof(int)); - cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo, 3 * sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_AP(subhi) , sdata->domain.subhi, 3 * sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_AP(boxlo) , sdata->domain.boxlo, 3 * sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo, 3 * sizeof(X_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(subhi) , sdata->domain.subhi, 3 * sizeof(X_CFLOAT)); + cudaMemcpyToSymbol(MY_AP(boxlo) , sdata->domain.boxlo, 3 * sizeof(X_CFLOAT)); CUT_CHECK_ERROR("ERROR-CUDA pppm_init_setup"); } void pppm_device_update(cuda_shared_data* sdata, void* cu_part2grid, int nlocala, int nmaxa) { cudaMemcpyToSymbol(part2grid, &cu_part2grid, sizeof(int*)); - cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); - cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*)); + cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_CFLOAT*)); cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); //cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal .dev_data, sizeof(int)); cudaMemcpyToSymbol(nlocal , &nlocala, sizeof(int)); @@ -207,7 +207,7 @@ void pppm_update_nlocal(int nlocala) } -void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald) +void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_CFLOAT unitkx, PPPM_CFLOAT unitky, PPPM_CFLOAT unitkz, PPPM_CFLOAT g_ewald) { dim3 grid; dim3 threads; @@ -223,8 +223,8 @@ void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLO CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_fkxyz_vg "); } -void Cuda_PPPM_setup_greensfn(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald, - int nbx, int nby, int nbz, PPPM_FLOAT xprd, PPPM_FLOAT yprd, PPPM_FLOAT zprd_slab) +void Cuda_PPPM_setup_greensfn(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_CFLOAT unitkx, PPPM_CFLOAT unitky, PPPM_CFLOAT unitkz, PPPM_CFLOAT g_ewald, + int nbx, int nby, int nbz, PPPM_CFLOAT xprd, PPPM_CFLOAT yprd, PPPM_CFLOAT zprd_slab) { dim3 grid; dim3 threads; @@ -356,15 +356,15 @@ void poisson_energy(int nxlo_fft, int nxhi_fft, int nylo_fft, int nyhi_fft, int threads.x = nxhi_fft - nxlo_fft + 1; threads.y = 1; threads.z = 1; - poisson_energy_kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(nxlo_fft, nylo_fft, nzlo_fft, vflag); + poisson_energy_kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>>(nxlo_fft, nylo_fft, nzlo_fft, vflag); cudaThreadSynchronize(); CUT_CHECK_ERROR("ERROR-CUDA poisson_energy end "); } -ENERGY_FLOAT sum_energy(void* cu_virial, void* cu_energy, int nx_pppma, int ny_pppma, int nz_pppma, int vflag, ENERGY_FLOAT* cpu_virial) +ENERGY_CFLOAT sum_energy(void* cu_virial, void* cu_energy, int nx_pppma, int ny_pppma, int nz_pppma, int vflag, ENERGY_CFLOAT* cpu_virial) { - ENERGY_FLOAT host_energy = 0; + 
+  ENERGY_CFLOAT host_energy = 0;
   dim3 grid;
   dim3 threads;
@@ -374,7 +374,7 @@ ENERGY_FLOAT sum_energy(void* cu_virial, void* cu_energy, int nx_pppma, int ny_p
   threads.x = ny_pppma;
   threads.y = 1;
   threads.z = 1;
-  sum_energy_kernel1 <<< grid, threads, ny_pppma* sizeof(ENERGY_FLOAT)>>>(vflag);
+  sum_energy_kernel1 <<< grid, threads, ny_pppma* sizeof(ENERGY_CFLOAT)>>>(vflag);
   cudaThreadSynchronize();
   CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel1 ");
@@ -384,20 +384,20 @@ ENERGY_FLOAT sum_energy(void* cu_virial, void* cu_energy, int nx_pppma, int ny_p
   threads.x = nz_pppma;
   threads.y = 1;
   threads.z = 1;
-  sum_energy_kernel2 <<< grid, threads, nz_pppma* sizeof(ENERGY_FLOAT)>>>(vflag);
+  sum_energy_kernel2 <<< grid, threads, nz_pppma* sizeof(ENERGY_CFLOAT)>>>(vflag);
   cudaThreadSynchronize();
   CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel2 ");
 
-  cudaMemcpy((void*)(&host_energy), cu_energy, sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost);
+  cudaMemcpy((void*)(&host_energy), cu_energy, sizeof(ENERGY_CFLOAT), cudaMemcpyDeviceToHost);
 
   if(vflag)
-    cudaMemcpy((void*) cpu_virial, (void*) cu_virial, 6 * sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost);
+    cudaMemcpy((void*) cpu_virial, (void*) cu_virial, 6 * sizeof(ENERGY_CFLOAT), cudaMemcpyDeviceToHost);
 
   CUT_CHECK_ERROR("ERROR-CUDA sumenergy_memcopy");
 
   return host_energy;
 }
 
-void cuda_make_rho(cuda_shared_data* sdata, void* flag, PPPM_FLOAT* cu_density_intScale, int ihi, int ilo, int jhi, int jlo, int khi, int klo, void* cu_density_brick, void* cu_density_brick_int)
+void cuda_make_rho(cuda_shared_data* sdata, void* flag, PPPM_CFLOAT* cu_density_intScale, int ihi, int ilo, int jhi, int jlo, int khi, int klo, void* cu_density_brick, void* cu_density_brick_int)
 {
   CUT_CHECK_ERROR("cuda_make_rho begin");
   dim3 grid, threads;
@@ -408,17 +408,17 @@ void cuda_make_rho(cuda_shared_data* sdata, void* flag, PPPM_FLOAT* cu_density_i
   threads.x = 32;
   threads.y = 1;
   threads.z = 1;
-  int sharedmemsize = (32 + 32 * (sdata->pppm.nupper - sdata->pppm.nlower + 1) + sdata->pppm.order * (sdata->pppm.order / 2 - (1 - sdata->pppm.order) / 2 + 1)) * sizeof(PPPM_FLOAT);
+  int sharedmemsize = (32 + 32 * (sdata->pppm.nupper - sdata->pppm.nlower + 1) + sdata->pppm.order * (sdata->pppm.order / 2 - (1 - sdata->pppm.order) / 2 + 1)) * sizeof(PPPM_CFLOAT);
 
   do {
     cpu_flag[0] = 0;
     cpu_flag[1] = 0;
     cpu_flag[2] = 0;
-    cudaMemcpyToSymbol(density_intScale, cu_density_intScale, sizeof(PPPM_FLOAT*));
+    cudaMemcpyToSymbol(density_intScale, cu_density_intScale, sizeof(PPPM_CFLOAT*));
     CUT_CHECK_ERROR("ERROR-CUDA make_rho pre Z");
     cudaMemset(flag, 0, 3 * sizeof(int));
     CUT_CHECK_ERROR("ERROR-CUDA make_rho pre A");
-    cudaMemset(cu_density_brick, 0, (khi - klo + 1) * (jhi - jlo + 1) * (ihi - ilo + 1)*sizeof(PPPM_FLOAT));
+    cudaMemset(cu_density_brick, 0, (khi - klo + 1) * (jhi - jlo + 1) * (ihi - ilo + 1)*sizeof(PPPM_CFLOAT));
     CUT_CHECK_ERROR("ERROR-CUDA make_rho pre B");
     cudaMemset(cu_density_brick_int, 0, (khi - klo + 1) * (jhi - jlo + 1) * (ihi - ilo + 1)*sizeof(int));
     CUT_CHECK_ERROR("ERROR-CUDA make_rho pre C");
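The sharedmemsize arithmetic above carves one dynamically sized shared-memory allocation into pieces: 32 per-thread scratch slots, 32 slots per stencil point, and the rho_coeff table. A minimal sketch of the launch mechanism, with illustrative sizes and names (not the real kernel):

// smem.cu -- dynamic shared memory sized at launch, as cuda_make_rho does
#include <cuda_runtime.h>

typedef float PPPM_CFLOAT;

extern __shared__ PPPM_CFLOAT sharedmem[];   // backed by the 3rd launch parameter

__global__ void partitioned_smem(int table_len)
{
  PPPM_CFLOAT* scratch = sharedmem;                // blockDim.x entries
  PPPM_CFLOAT* table   = &sharedmem[blockDim.x];   // table_len entries after them
  scratch[threadIdx.x] = (PPPM_CFLOAT) threadIdx.x;
  if(threadIdx.x < table_len) table[threadIdx.x] = 1.0f;
  __syncthreads();
}

int main()
{
  int order = 5;
  int table_len = order * order;
  // total bytes = scratch + table, mirroring the host-side computation above
  int sharedmemsize = (32 + table_len) * sizeof(PPPM_CFLOAT);
  partitioned_smem<<<1, 32, sharedmemsize>>>(table_len);
  cudaDeviceSynchronize();
  return 0;
}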
@@ -483,14 +483,14 @@ void cuda_fieldforce(cuda_shared_data* sdata, void* flag)
   threads.x = 32;
   threads.y = 1;
   threads.z = 1;
-  int sharedmemsize = (32 + 3 * 32 * (sdata->pppm.nupper - sdata->pppm.nlower + 1) + sdata->pppm.order * (sdata->pppm.order / 2 - (1 - sdata->pppm.order) / 2 + 1)) * sizeof(PPPM_FLOAT);
+  int sharedmemsize = (32 + 3 * 32 * (sdata->pppm.nupper - sdata->pppm.nlower + 1) + sdata->pppm.order * (sdata->pppm.order / 2 - (1 - sdata->pppm.order) / 2 + 1)) * sizeof(PPPM_CFLOAT);
 
   fieldforce_kernel <<< grid, threads, sharedmemsize>>> (sdata->pppm.nupper - sdata->pppm.nlower + 1, 32 / (sdata->pppm.nupper - sdata->pppm.nlower + 1), (int*) flag);
   cudaThreadSynchronize();
   CUT_CHECK_ERROR("ERROR-CUDA fieldforce");
 }
 
-double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_FLOAT* buf, ENERGY_FLOAT* dev_buf)
+double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_CFLOAT* buf, ENERGY_CFLOAT* dev_buf)
 {
   dim3 grid, threads;
   grid.x = (sdata->atom.nlocal + 31) / 32;
@@ -499,9 +499,9 @@ double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_FLOAT* buf, ENERGY_F
   threads.x = 32;
   threads.y = 1;
   threads.z = 1;
-  slabcorr_energy_kernel <<< grid, threads, 32* sizeof(ENERGY_FLOAT)>>>(dev_buf);
+  slabcorr_energy_kernel <<< grid, threads, 32* sizeof(ENERGY_CFLOAT)>>>(dev_buf);
   cudaThreadSynchronize();
-  cudaMemcpy((void*) buf, dev_buf, grid.x* sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost);
+  cudaMemcpy((void*) buf, dev_buf, grid.x* sizeof(ENERGY_CFLOAT), cudaMemcpyDeviceToHost);
 
   double dipole_all = 0.0;
@@ -511,7 +511,7 @@ double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_FLOAT* buf, ENERGY_F
   return dipole_all;
 }
 
-void cuda_slabcorr_force(cuda_shared_data* sdata, F_FLOAT ffact)
+void cuda_slabcorr_force(cuda_shared_data* sdata, F_CFLOAT ffact)
 {
   dim3 grid, threads;
   grid.x = (sdata->atom.nlocal + 31) / 32;
@@ -528,7 +528,7 @@ void sum_virial(double* host_virial)
 {
 }
 
-void pppm_initfftdata(cuda_shared_data* sdata, PPPM_FLOAT* in, FFT_FLOAT* out)
+void pppm_initfftdata(cuda_shared_data* sdata, PPPM_CFLOAT* in, FFT_CFLOAT* out)
 {
   int nslow = sdata->pppm.nzhi_in - sdata->pppm.nzlo_in;
   int nmid = sdata->pppm.nyhi_in - sdata->pppm.nylo_in;
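cuda_slabcorr_energy above is a two-level reduction: each 32-thread block folds its values in shared memory and writes one partial result to dev_buf, then the host copies grid.x partials back and finishes the sum (the dipole_all loop). A self-contained sketch of the same pattern, with illustrative names:

// partial_sum.cu -- per-block reduction, finished on the host
#include <cstdio>
#include <cuda_runtime.h>

typedef float ENERGY_CFLOAT;

__global__ void partial_sum_kernel(const ENERGY_CFLOAT* in, ENERGY_CFLOAT* buf, int n)
{
  extern __shared__ ENERGY_CFLOAT s[];
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  s[threadIdx.x] = (i < n) ? in[i] : 0.0f;
  __syncthreads();
  // tree reduction; blockDim.x must be a power of two (32 here, as in the diff)
  for(int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
    if(threadIdx.x < stride) s[threadIdx.x] += s[threadIdx.x + stride];
    __syncthreads();
  }
  if(threadIdx.x == 0) buf[blockIdx.x] = s[0];   // one partial per block
}

int main()
{
  const int n = 1000, block = 32;
  const int grid = (n + block - 1) / block;      // 32 blocks
  ENERGY_CFLOAT *d_in, *d_buf;
  cudaMalloc(&d_in, n * sizeof(ENERGY_CFLOAT));
  cudaMalloc(&d_buf, grid * sizeof(ENERGY_CFLOAT));
  ENERGY_CFLOAT* h_in = new ENERGY_CFLOAT[n];
  for(int i = 0; i < n; i++) h_in[i] = 1.0f;     // expected sum is n
  cudaMemcpy(d_in, h_in, n * sizeof(ENERGY_CFLOAT), cudaMemcpyHostToDevice);
  partial_sum_kernel<<<grid, block, block * sizeof(ENERGY_CFLOAT)>>>(d_in, d_buf, n);
  ENERGY_CFLOAT h_buf[64];
  cudaMemcpy(h_buf, d_buf, grid * sizeof(ENERGY_CFLOAT), cudaMemcpyDeviceToHost);
  double total = 0.0;
  for(int i = 0; i < grid; i++) total += h_buf[i];  // host finishes, as in the diff
  printf("sum = %g (expect %d)\n", total, n);
  delete[] h_in;
  return 0;
}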
diff --git a/lib/cuda/pppm_cuda_cu.h b/lib/cuda/pppm_cuda_cu.h
index a22e811c38..03c0706197 100644
--- a/lib/cuda/pppm_cuda_cu.h
+++ b/lib/cuda/pppm_cuda_cu.h
@@ -30,10 +30,10 @@ extern "C" void pppm_device_init(void* cu_density_brick, void* cu_vdx_brick, voi
                                  , int cu_nxlo_fft, int cu_nxhi_fft, int cu_nylo_fft, int cu_nyhi_fft, int cu_nzlo_fft, int cu_nzhi_fft, void* cu_gf_b
                                  , double cu_qqrd2e, int cu_order, void* cu_rho_coeff, void* cu_debugdata, void* cu_density_brick_lock, int slabflag
                                 );
-extern "C" void pppm_device_init_setup(cuda_shared_data* sdata, PPPM_FLOAT shiftone, PPPM_FLOAT delxinv, PPPM_FLOAT delyinv, PPPM_FLOAT delzinv, int nlower, int nupper);
-extern "C" void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald);
-extern "C" void Cuda_PPPM_setup_greensfn(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald,
-                                         int nbx, int nby, int nbz, PPPM_FLOAT xprd, PPPM_FLOAT yprd, PPPM_FLOAT zprd_slab);
+extern "C" void pppm_device_init_setup(cuda_shared_data* sdata, PPPM_CFLOAT shiftone, PPPM_CFLOAT delxinv, PPPM_CFLOAT delyinv, PPPM_CFLOAT delzinv, int nlower, int nupper);
+extern "C" void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_CFLOAT unitkx, PPPM_CFLOAT unitky, PPPM_CFLOAT unitkz, PPPM_CFLOAT g_ewald);
+extern "C" void Cuda_PPPM_setup_greensfn(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_CFLOAT unitkx, PPPM_CFLOAT unitky, PPPM_CFLOAT unitkz, PPPM_CFLOAT g_ewald,
+                                         int nbx, int nby, int nbz, PPPM_CFLOAT xprd, PPPM_CFLOAT yprd, PPPM_CFLOAT zprd_slab);
 extern "C" void pppm_device_update(cuda_shared_data* sdata, void* cu_part2grid, int nlocala, int nmaxa);
 extern "C" void pppm_update_nlocal(int nlocala);
@@ -45,11 +45,11 @@ extern "C" void poisson_vdx_brick(int ihi, int ilo, int jhi, int jlo, int khi, i
 extern "C" void poisson_vdy_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm);
 extern "C" void poisson_vdz_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm);
 extern "C" void poisson_energy(int nxlo_fft, int nxhi_fft, int nylo_fft, int nyhi_fft, int nzlo_fft, int nzhi_fft, int vflag);
-extern "C" ENERGY_FLOAT sum_energy(void* cu_virial, void* cu_energy, int nx_pppma, int ny_pppma, int nz_pppma, int vflag, ENERGY_FLOAT* cpu_virial);
+extern "C" ENERGY_CFLOAT sum_energy(void* cu_virial, void* cu_energy, int nx_pppma, int ny_pppma, int nz_pppma, int vflag, ENERGY_CFLOAT* cpu_virial);
 extern "C" int cuda_particle_map(cuda_shared_data* sdata, void* flag);
-extern "C" void cuda_make_rho(cuda_shared_data* sdata, void* flag, PPPM_FLOAT* cu_density_intScale, int ihi, int ilo, int jhi, int jlo, int khi, int klo, void* cu_density_brick, void* cu_density_brick_int);
+extern "C" void cuda_make_rho(cuda_shared_data* sdata, void* flag, PPPM_CFLOAT* cu_density_intScale, int ihi, int ilo, int jhi, int jlo, int khi, int klo, void* cu_density_brick, void* cu_density_brick_int);
 extern "C" void cuda_fieldforce(cuda_shared_data* sdata, void* flag);
-extern "C" double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_FLOAT* buf, ENERGY_FLOAT* dev_buf);
-extern "C" void cuda_slabcorr_force(cuda_shared_data* sdata, F_FLOAT ffact);
-extern "C" void pppm_initfftdata(cuda_shared_data* sdata, PPPM_FLOAT* in, FFT_FLOAT* out);
+extern "C" double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_CFLOAT* buf, ENERGY_CFLOAT* dev_buf);
+extern "C" void cuda_slabcorr_force(cuda_shared_data* sdata, F_CFLOAT ffact);
+extern "C" void pppm_initfftdata(cuda_shared_data* sdata, PPPM_CFLOAT* in, FFT_CFLOAT* out);
 
 #endif /*PPPM_CUDA_CU_H_*/
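Every change in this header is the same rename; no signature changes shape. The *_CFLOAT names are typedefs selected by the library's precision settings. As a purely hypothetical illustration of the kind of mapping behind them (the real definitions live in the library's precision header and are more involved):

// hypothetical precision mapping -- illustrative only, not the actual header
#ifdef CUDA_PRECISION_DOUBLE
typedef double PPPM_CFLOAT;
typedef double ENERGY_CFLOAT;
typedef double FFT_CFLOAT;
typedef double X_CFLOAT;
typedef double F_CFLOAT;
#define PPPM_F(x) x        /* literal stays double: PPPM_F(0.5) -> 0.5 */
#else
typedef float PPPM_CFLOAT;
typedef float ENERGY_CFLOAT;
typedef float FFT_CFLOAT;
typedef float X_CFLOAT;
typedef float F_CFLOAT;
#define PPPM_F(x) x##f     /* literal becomes float: PPPM_F(0.5) -> 0.5f */
#endif

A mapping of this shape would also explain why the kernels below write constants as PPPM_F(0.0) rather than 0.0: the macro keeps single-precision builds free of accidental double-precision literals.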
diff --git a/lib/cuda/pppm_cuda_kernel.cu b/lib/cuda/pppm_cuda_kernel.cu
index 25a81866f0..c2f5937749 100644
--- a/lib/cuda/pppm_cuda_kernel.cu
+++ b/lib/cuda/pppm_cuda_kernel.cu
@@ -65,13 +65,13 @@ __device__ void reduceBlock(double* data)
   }
 }
 
-extern __shared__ PPPM_FLOAT sharedmem[];
+extern __shared__ PPPM_CFLOAT sharedmem[];
 
-__global__ void setup_fkxyz_vg(PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald)
+__global__ void setup_fkxyz_vg(PPPM_CFLOAT unitkx, PPPM_CFLOAT unitky, PPPM_CFLOAT unitkz, PPPM_CFLOAT g_ewald)
 {
-  PPPM_FLOAT my_fkx = unitkx * (int(threadIdx.x) - nx_pppm * (2 * int(threadIdx.x) / nx_pppm));
-  PPPM_FLOAT my_fky = unitky * (int(blockIdx.y) - ny_pppm * (2 * int(blockIdx.y) / ny_pppm));
-  PPPM_FLOAT my_fkz = unitkz * (int(blockIdx.x) - nz_pppm * (2 * int(blockIdx.x) / nz_pppm));
+  PPPM_CFLOAT my_fkx = unitkx * (int(threadIdx.x) - nx_pppm * (2 * int(threadIdx.x) / nx_pppm));
+  PPPM_CFLOAT my_fky = unitky * (int(blockIdx.y) - ny_pppm * (2 * int(blockIdx.y) / ny_pppm));
+  PPPM_CFLOAT my_fkz = unitkz * (int(blockIdx.x) - nz_pppm * (2 * int(blockIdx.x) / nz_pppm));
 
   if((blockIdx.x == 0) && (blockIdx.y == 0)) fkx[threadIdx.x] = my_fkx;
@@ -85,8 +85,8 @@ __global__ void setup_fkxyz_vg(PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT
      (blockIdx.y >= nylo_fft) && (blockIdx.y <= nyhi_fft) &&
      (threadIdx.x >= nxlo_fft) && (threadIdx.x <= nxhi_fft)) {
     int n = ((int(blockIdx.x) - nzlo_fft) * (nyhi_fft - nylo_fft + 1) + int(blockIdx.y) - nylo_fft) * (nxhi_fft - nxlo_fft + 1) + int(threadIdx.x) - nxlo_fft;
-    PPPM_FLOAT sqk = my_fkx * my_fkx + my_fky * my_fky + my_fkz * my_fkz;
-    PPPM_FLOAT vterm = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(-2.0) * (PPPM_F(1.0) / sqk + PPPM_F(0.25) / (g_ewald * g_ewald));
+    PPPM_CFLOAT sqk = my_fkx * my_fkx + my_fky * my_fky + my_fkz * my_fkz;
+    PPPM_CFLOAT vterm = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(-2.0) * (PPPM_F(1.0) / sqk + PPPM_F(0.25) / (g_ewald * g_ewald));
     vg[6 * n + 0] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fkx * my_fkx;
     vg[6 * n + 1] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fky * my_fky;
     vg[6 * n + 2] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fkz * my_fkz;
@@ -97,9 +97,9 @@ __global__ void setup_fkxyz_vg(PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT
   }
 }
 
-__device__ PPPM_FLOAT gf_denom(PPPM_FLOAT x, PPPM_FLOAT y, PPPM_FLOAT z)
+__device__ PPPM_CFLOAT gf_denom(PPPM_CFLOAT x, PPPM_CFLOAT y, PPPM_CFLOAT z)
 {
-  PPPM_FLOAT sx, sy, sz;
+  PPPM_CFLOAT sx, sy, sz;
   sz = sy = sx = PPPM_F(0.0);
 
   for(int l = order - 1; l >= 0; l--) {
@@ -108,22 +108,22 @@ __device__ PPPM_FLOAT gf_denom(PPPM_FLOAT x, PPPM_FLOAT y, PPPM_FLOAT z)
     sz = gf_b[l] + sz * z;
   }
 
-  PPPM_FLOAT s = sx * sy * sz;
+  PPPM_CFLOAT s = sx * sy * sz;
   return s * s;
 }
 
-__global__ void setup_greensfn(PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald,
+__global__ void setup_greensfn(PPPM_CFLOAT unitkx, PPPM_CFLOAT unitky, PPPM_CFLOAT unitkz, PPPM_CFLOAT g_ewald,
                                int nbx, int nby, int nbz,
-                               PPPM_FLOAT xprd, PPPM_FLOAT yprd, PPPM_FLOAT zprd_slab)
+                               PPPM_CFLOAT xprd, PPPM_CFLOAT yprd, PPPM_CFLOAT zprd_slab)
 {
-  PPPM_FLOAT sqk;
+  PPPM_CFLOAT sqk;
   int nx, ny, nz, kper, lper, mper, k, l, m;
-  PPPM_FLOAT snx, sny, snz, snx2, sny2, snz2;
-  PPPM_FLOAT argx, argy, argz, wx, wy, wz, sx, sy, sz, qx, qy, qz;
-  PPPM_FLOAT sum1, dot1, dot2;
-  PPPM_FLOAT numerator, denominator;
+  PPPM_CFLOAT snx, sny, snz, snx2, sny2, snz2;
+  PPPM_CFLOAT argx, argy, argz, wx, wy, wz, sx, sy, sz, qx, qy, qz;
+  PPPM_CFLOAT sum1, dot1, dot2;
+  PPPM_CFLOAT numerator, denominator;
 
-  PPPM_FLOAT form = PPPM_F(1.0);
+  PPPM_CFLOAT form = PPPM_F(1.0);
   int n = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
   m = blockIdx.x;
   l = blockIdx.y;
@@ -188,7 +188,7 @@ __global__ void setup_greensfn(PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT
 __global__ void poisson_scale_kernel()
 {
   int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  FFT_FLOAT scaleinv = FFT_F(1.0) / (gridDim.x * gridDim.y * blockDim.x);
+  FFT_CFLOAT scaleinv = FFT_F(1.0) / (gridDim.x * gridDim.y * blockDim.x);
   work1[2 * i] *= scaleinv * greensfn[i];
   work1[2 * i + 1] *= scaleinv * greensfn[i];
 }
@@ -249,10 +249,10 @@ __global__ void poisson_vdz_brick_kernel(int ilo, int jlo, int klo)
 
 __global__ void poisson_energy_kernel(int nxlo_fft, int nylo_fft, int nzlo_fft, int vflag)
 {
-  ENERGY_FLOAT scaleinv = FFT_F(1.0) / (nx_pppm * ny_pppm * nz_pppm);
+  ENERGY_CFLOAT scaleinv = FFT_F(1.0) / (nx_pppm * ny_pppm * nz_pppm);
   int i = (blockIdx.x + nzlo_fft) * ny_pppm * nx_pppm + (blockIdx.y + nylo_fft) * nx_pppm + threadIdx.x + nxlo_fft;
-  ENERGY_FLOAT* s_energy = (ENERGY_FLOAT*) sharedmem;
-  ENERGY_FLOAT myenergy = scaleinv * scaleinv * greensfn[i] * (work1[2 * i] * work1[2 * i] + work1[2 * i + 1] * work1[2 * i + 1]);
+  ENERGY_CFLOAT* s_energy = (ENERGY_CFLOAT*) sharedmem;
+  ENERGY_CFLOAT myenergy = scaleinv * scaleinv * greensfn[i] * (work1[2 * i] * work1[2 * i] + work1[2 * i + 1] * work1[2 * i + 1]);
   s_energy[threadIdx.x] = myenergy;
   __syncthreads();
@@ -278,8 +278,8 @@ __global__ void poisson_energy_kernel(int nxlo_fft, int nylo_fft, int nzlo_fft,
 
 __global__ void sum_energy_kernel1(int vflag)
 {
-  ENERGY_FLOAT myenergy = energy[(blockIdx.x * ny_pppm + threadIdx.x)];
-  ENERGY_FLOAT* s_energy = (ENERGY_FLOAT*) sharedmem;
+  ENERGY_CFLOAT myenergy = energy[(blockIdx.x * ny_pppm + threadIdx.x)];
+  ENERGY_CFLOAT* s_energy = (ENERGY_CFLOAT*) sharedmem;
   s_energy[threadIdx.x] = myenergy;
   __syncthreads();
   reduceBlock(s_energy);
@@ -305,8 +305,8 @@ __global__ void sum_energy_kernel1(int vflag)
 
 __global__ void sum_energy_kernel2(int vflag)
 {
-  ENERGY_FLOAT myenergy = energy[threadIdx.x * ny_pppm];
-  ENERGY_FLOAT* s_energy = (ENERGY_FLOAT*) sharedmem;
+  ENERGY_CFLOAT myenergy = energy[threadIdx.x * ny_pppm];
+  ENERGY_CFLOAT* s_energy = (ENERGY_CFLOAT*) sharedmem;
   s_energy[threadIdx.x] = myenergy;
   __syncthreads();
   reduceBlock(s_energy);
@@ -329,9 +329,9 @@ __global__ void sum_energy_kernel2(int vflag)
   }
 }
 
-__device__ PPPM_FLOAT rho1d(int k, PPPM_FLOAT d, PPPM_FLOAT* srho_coeff)
+__device__ PPPM_CFLOAT rho1d(int k, PPPM_CFLOAT d, PPPM_CFLOAT* srho_coeff)
 {
-  PPPM_FLOAT rho1d_tmp = PPPM_F(0.0);
+  PPPM_CFLOAT rho1d_tmp = PPPM_F(0.0);
 
   for(int l = order - 1; l >= 0; l--)
     rho1d_tmp = srho_coeff[l * order + k - (1 - order) / 2] + rho1d_tmp * d;
@@ -345,7 +345,7 @@ __global__ void particle_map_kernel(int* flag)
   if(i < nlocal) {
     int nx, ny, nz;
-    PPPM_FLOAT shift = PPPM_F(0.5) - shiftone; //+OFFSET;
+    PPPM_CFLOAT shift = PPPM_F(0.5) - shiftone; //+OFFSET;
     nx = (int)((_x[i] - _boxlo[0]) * delxinv + shift); // - OFFSET;
     ny = (int)((_x[i + nmax] - _boxlo[1]) * delyinv + shift); // - OFFSET;
     nz = (int)((_x[i + 2 * nmax] - _boxlo[2]) * delzinv + shift); // - OFFSET;
@@ -391,7 +391,7 @@ __global__ void make_rho_kernelA()
 
   if(i < nlocal) {
-    PPPM_FLOAT dx, dy, dz, x0, y0, z0;
+    PPPM_CFLOAT dx, dy, dz, x0, y0, z0;
     nx = part2grid[i];
     ny = part2grid[i + nmax];
     nz = part2grid[i + 2 * nmax];
@@ -442,7 +442,7 @@ __global__ void make_rho_kernel(int* flag, int read_threads_at_same_time)
   int nelements = nupper - nlower + 1;
   int* idx = (int*) sharedmem;
   int* sdensity_brick_int = &idx[blockDim.x];
-  PPPM_FLOAT* srho_coeff = (PPPM_FLOAT*) &sdensity_brick_int[nelements * blockDim.x];
+  PPPM_CFLOAT* srho_coeff = (PPPM_CFLOAT*) &sdensity_brick_int[nelements * blockDim.x];
 
   if(threadIdx.x < order * (order / 2 - (1 - order) / 2 + 1))
     srho_coeff[threadIdx.x] = rho_coeff[threadIdx.x];
@@ -454,7 +454,7 @@ __global__ void make_rho_kernel(int* flag, int read_threads_at_same_time)
 
   if(false) {
     if(i < nlocal) {
-      PPPM_FLOAT dx, dy, dz, x0, y0, z0;
+      PPPM_CFLOAT dx, dy, dz, x0, y0, z0;
       nx = part2grid[i];
       ny = part2grid[i + nmax];
      nz = part2grid[i + 2 * nmax];
@@ -497,7 +497,7 @@ __global__ void make_rho_kernel(int* flag, int read_threads_at_same_time)
     i = blockIdx.x * blockDim.x + threadIdx.x;
     {
-      PPPM_FLOAT dx, dy, dz, x0, y0, z0, qtmp;
+      PPPM_CFLOAT dx, dy, dz, x0, y0, z0, qtmp;
 
       if(i < nlocal) {
         qtmp = _q[i];
@@ -575,8 +575,8 @@ __global__ void fieldforce_kernel(int elements_per_thread, int read_threads_at_s
   // ek = 3 components of E-field on particle
 
   i = blockIdx.x * blockDim.x + threadIdx.x;
   int* idx = (int*) sharedmem;
-  PPPM_FLOAT* tmp_brick = (PPPM_FLOAT*) &idx[blockDim.x];
-  PPPM_FLOAT* srho_coeff = (PPPM_FLOAT*) &tmp_brick[3 * blockDim.x * elements_per_thread];
+  PPPM_CFLOAT* tmp_brick = (PPPM_CFLOAT*) &idx[blockDim.x];
+  PPPM_CFLOAT* srho_coeff = (PPPM_CFLOAT*) &tmp_brick[3 * blockDim.x * elements_per_thread];
 
   if(threadIdx.x < order * (order / 2 - (1 - order) / 2 + 1))
     srho_coeff[threadIdx.x] = rho_coeff[threadIdx.x];
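rho1d above evaluates the order-point charge-assignment weights as a polynomial in the grid offset d, by Horner's rule over column k of the coefficient table. A host-side sketch with the classic order-3 (triangular-shaped cloud) coefficients; the table layout mirrors the srho_coeff indexing in the kernel, but the values are only a plausible example:

// rho1d_sketch.cpp -- Horner evaluation of column k of the coefficient table
#include <cstdio>

typedef float PPPM_CFLOAT;

PPPM_CFLOAT rho1d_host(int k, PPPM_CFLOAT d, const PPPM_CFLOAT* coeff, int order)
{
  PPPM_CFLOAT rho1d_tmp = 0.0f;
  for(int l = order - 1; l >= 0; l--)                       // same loop as the kernel
    rho1d_tmp = coeff[l * order + k - (1 - order) / 2] + rho1d_tmp * d;
  return rho1d_tmp;
}

int main()
{
  const int order = 3;               // k runs from (1-order)/2 = -1 to order/2 = 1
  const PPPM_CFLOAT coeff[9] = {
    0.125f, 0.75f, 0.125f,           // constant terms for k = -1, 0, 1
   -0.5f,   0.0f,  0.5f,             // linear terms
    0.5f,  -1.0f,  0.5f              // quadratic terms
  };
  PPPM_CFLOAT sum = 0.0f;
  for(int k = (1 - order) / 2; k <= order / 2; k++) {
    PPPM_CFLOAT w = rho1d_host(k, 0.1f, coeff, order);
    printf("k=%2d  w=%g\n", k, w);
    sum += w;
  }
  printf("sum=%g (weights of a valid stencil sum to 1)\n", sum);
  return 0;
}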
@@ -584,8 +584,8 @@ __global__ void fieldforce_kernel(int elements_per_thread, int read_threads_at_s
   __syncthreads();
   {
     int l, m, n, nx, ny, nz, my, mz;
-    PPPM_FLOAT dx, dy, dz, x0, y0, z0;
-    PPPM_FLOAT ek[3];
+    PPPM_CFLOAT dx, dy, dz, x0, y0, z0;
+    PPPM_CFLOAT ek[3];
 
     if(i < nlocal) {
       nx = part2grid[i];
@@ -652,9 +652,9 @@ __global__ void fieldforce_kernel(int elements_per_thread, int read_threads_at_s
   }
 }
 
-__global__ void slabcorr_energy_kernel(ENERGY_FLOAT* buf)
+__global__ void slabcorr_energy_kernel(ENERGY_CFLOAT* buf)
 {
-  ENERGY_FLOAT* dipole = (ENERGY_FLOAT*) sharedmem;
+  ENERGY_CFLOAT* dipole = (ENERGY_CFLOAT*) sharedmem;
   int i = blockIdx.x * blockDim.x + threadIdx.x;
 
   if(i < nlocal)
@@ -668,7 +668,7 @@ __global__ void slabcorr_energy_kernel(ENERGY_FLOAT* buf)
   if(threadIdx.x == 0) buf[blockIdx.x] = dipole[0];
 }
 
-__global__ void slabcorr_force_kernel(F_FLOAT ffact)
+__global__ void slabcorr_force_kernel(F_CFLOAT ffact)
 {
   int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -677,13 +677,13 @@ __global__ void slabcorr_force_kernel(F_FLOAT ffact)
 }
 
-__global__ void initfftdata_core_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
+__global__ void initfftdata_core_kernel(PPPM_CFLOAT* in, FFT_CFLOAT* out)
 {
   out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] = in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
   out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x) + 1] = 0;
 }
 
-__global__ void initfftdata_z_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
+__global__ void initfftdata_z_kernel(PPPM_CFLOAT* in, FFT_CFLOAT* out)
 {
   if(slabflag) {
     if(blockIdx.x < nzlo_in - nzlo_out)
@@ -697,7 +697,7 @@ __global__ void initfftdata_z_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
     out[2 * ((((blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + (nzhi_out - nzlo_in)) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
 }
 
-__global__ void initfftdata_y_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
+__global__ void initfftdata_y_kernel(PPPM_CFLOAT* in, FFT_CFLOAT* out)
 {
   if(blockIdx.y < nylo_in - nylo_out)
     out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + (2 * (nyhi_in + 1) - nylo_in - nyhi_out) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
@@ -706,7 +706,7 @@ __global__ void initfftdata_y_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
     out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + (nyhi_out - nylo_in)) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
 }
 
-__global__ void initfftdata_x_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
+__global__ void initfftdata_x_kernel(PPPM_CFLOAT* in, FFT_CFLOAT* out)
 {
   if(threadIdx.x < nxlo_in - nxlo_out)
     out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
@@ -715,7 +715,7 @@ __global__ void initfftdata_x_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
     out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
 }
 
-__global__ void initfftdata_yz_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
+__global__ void initfftdata_yz_kernel(PPPM_CFLOAT* in, FFT_CFLOAT* out)
 {
   if(slabflag) {
     if(blockIdx.x < nzlo_in - nzlo_out)
@@ -744,7 +744,7 @@ __global__ void initfftdata_yz_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
     out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
 }
 
-__global__ void initfftdata_xz_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
+__global__ void initfftdata_xz_kernel(PPPM_CFLOAT* in, FFT_CFLOAT* out)
 {
   if(blockIdx.x < nzhi_out - nzhi_in)
     if(threadIdx.x < nxlo_in - nxlo_out)
@@ -773,7 +773,7 @@ __global__ void initfftdata_xz_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
   }
 }
 
-__global__ void initfftdata_xy_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
+__global__ void initfftdata_xy_kernel(PPPM_CFLOAT* in, FFT_CFLOAT* out)
 {
   if(blockIdx.y < nyhi_out - nyhi_in)
     if(threadIdx.x < nxlo_in - nxlo_out)
@@ -792,7 +792,7 @@ __global__ void initfftdata_xy_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
     out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
 }
 
-__global__ void initfftdata_xyz_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
+__global__ void initfftdata_xyz_kernel(PPPM_CFLOAT* in, FFT_CFLOAT* out)
 {
   if(blockIdx.x < nzhi_out - nzhi_in)
     if(blockIdx.y < nyhi_out - nyhi_in)
diff --git a/lib/kokkos/Makefile.lammps b/lib/kokkos/Makefile.lammps
index a655b062bd..dd7f4539d9 100644
--- a/lib/kokkos/Makefile.lammps
+++ b/lib/kokkos/Makefile.lammps
@@ -131,8 +131,12 @@
 KOKKOS_INC += -Xcompiler -fopenmp
 else
 KOKKOS_INC += -fopenmp
 endif
+ifeq ($(CUDA), yes)
+KOKKOS_LINK += -Xcompiler -fopenmp
+else
 KOKKOS_LINK += -fopenmp
 endif
+endif
 ifeq ($(HWLOC),yes)
 KOKKOS_INC += -DKOKKOS_HAVE_HWLOC -I$(HWLOC_PATH)/include
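A note on the final Makefile.lammps hunk: when the Kokkos build links through nvcc, nvcc does not accept -fopenmp directly, so the flag has to be forwarded to the host compiler as -Xcompiler -fopenmp (the same wrapping KOKKOS_INC already uses a few lines up). The new ifeq ($(CUDA), yes) guard selects the wrapped form for the link line in CUDA builds and falls back to plain -fopenmp otherwise; the added trailing endif closes the new conditional.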