diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index cc593e4263..b3bbabadc3 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1631,9 +1631,9 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, const __global numtyp4 *restrict thetai3, const __global int *restrict igrid, const __global numtyp2 *restrict grid, - __global numtyp *restrict fdip_phi1, - __global numtyp *restrict fdip_phi2, - __global numtyp *restrict fdip_sum_phi, + __global acctyp *restrict fdip_phi1, + __global acctyp *restrict fdip_phi2, + __global acctyp *restrict fdip_sum_phi, const int bsorder, const int inum, const int nzlo_out, const int nylo_out, const int nxlo_out, const int ngridxy, @@ -1843,7 +1843,7 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, } int idx; - numtyp fdip_buf[20]; + acctyp fdip_buf[20]; fdip_buf[0] = (numtyp)0.0; fdip_buf[1] = tuv100_1; @@ -1917,7 +1917,7 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1, const __global numtyp4 *restrict thetai3, const __global int *restrict igrid, const __global numtyp2 *restrict grid, - __global numtyp *restrict fphi, + __global acctyp *restrict fphi, const int bsorder, const int inum, const numtyp felec, const int nzlo_out, const int nylo_out, const int nxlo_out, const int ngridxy, diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index a20c3886d5..a7f98fa5be 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -250,7 +250,7 @@ class BaseAmoeba { UCL_Vector _thetai1, _thetai2, _thetai3; UCL_Vector _igrid; UCL_Vector _cgrid_brick; - UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; + UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; int _max_thetai_size; int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out; int _ngridx, _ngridy, _ngridz, _num_grid_points; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 713015b5c5..d3d4103953 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -203,10 +203,7 @@ void PairAmoebaGPU::init_style() if (gpu_mode == GPU_FORCE) error->all(FLERR,"Pair style amoeba/gpu does not support neigh no for now"); - if (tq_size == sizeof(double)) - tq_single = false; - else - tq_single = true; + tq_single = tq_size != sizeof(double); // replace with the gpu counterpart @@ -739,23 +736,44 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) // field and fieldp may already have some nonzero values from kspace (udirect1) int nlocal = atom->nlocal; - double *field_ptr = (double *)fieldp_pinned; + if (tq_single) { + auto field_ptr = (float *)fieldp_pinned; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - field[i][0] += field_ptr[idx]; - field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; - } + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } - double* fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + auto fieldp_ptr = (float *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } + } else { + auto field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + auto fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } } + } @@ -960,23 +978,44 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) amoeba_gpu_update_fieldp(&fieldp_pinned); int inum = atom->nlocal; - double *field_ptr = (double *)fieldp_pinned; + if (tq_single) { + auto field_ptr = (float *)fieldp_pinned; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - field[i][0] += field_ptr[idx]; - field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; - } + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } - double* fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + auto fieldp_ptr = (float *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } + } else { + auto field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + auto fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } } + // accumulate timing information @@ -1139,32 +1178,63 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, &fdip_sum_phi_pinned); int nlocal = atom->nlocal; - double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; - for (int i = 0; i < nlocal; i++) { - int n = i; - for (int m = 0; m < 10; m++) { - fdip_phi1[i][m] = _fdip_phi1_ptr[n]; - n += nlocal; + if (tq_single) { + auto _fdip_phi1_ptr = (float *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; + } } - } - double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned; - for (int i = 0; i < nlocal; i++) { - int n = i; - for (int m = 0; m < 10; m++) { - fdip_phi2[i][m] = _fdip_phi2_ptr[n]; - n += nlocal; + auto _fdip_phi2_ptr = (float *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } } - } - double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; - for (int i = 0; i < nlocal; i++) { - int n = i; - for (int m = 0; m < 20; m++) { - fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; - n += nlocal; + auto _fdip_sum_phi_ptr = (float *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } + } + + } else { + auto _fdip_phi1_ptr = (double *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; + } + } + + auto _fdip_phi2_ptr = (double *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } + } + + auto _fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } } } + } /* ---------------------------------------------------------------------- @@ -1447,15 +1517,26 @@ void PairAmoebaGPU::polar_kspace() } else { void* fphi_pinned = nullptr; amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned, felec); - - double *_fphi_ptr = (double *)fphi_pinned; - for (int i = 0; i < nlocal; i++) { - int idx = i; - for (int m = 0; m < 20; m++) { - fphi[i][m] = _fphi_ptr[idx]; - idx += nlocal; + if (tq_single) { + auto _fphi_ptr = (float *)fphi_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = i; + for (int m = 0; m < 20; m++) { + fphi[i][m] = _fphi_ptr[idx]; + idx += nlocal; + } + } + } else { + auto _fphi_ptr = (double *)fphi_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = i; + for (int m = 0; m < 20; m++) { + fphi[i][m] = _fphi_ptr[idx]; + idx += nlocal; + } } } + }