From d6412dc97b86c0dc95a8f9a014ab75796bd6c12f Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 8 Jul 2023 00:50:19 -0500 Subject: [PATCH] Attempted to resolve issues with switching from acctyp4 to acctyp3 in tep, fieldp since the changes in PR #3675, noting some changes with Intel OCL PR #3663 --- lib/gpu/lal_base_amoeba.cpp | 5 +++++ lib/gpu/lal_base_amoeba.h | 2 +- lib/gpu/lal_hippo.cpp | 16 ++++++++++------ src/GPU/pair_amoeba_gpu.cpp | 30 +++++++++++++++--------------- src/GPU/pair_hippo_gpu.cpp | 30 +++++++++++++++--------------- 5 files changed, 46 insertions(+), 37 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 0821a33b06..c41c66fb5f 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -420,6 +420,11 @@ void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); atom->add_extra_data(); + if (_max_tep_size>_max_fieldp_size) { + _max_fieldp_size = _max_tep_size; + _fieldp.resize(_max_fieldp_size*6); + } + *fieldp_ptr=_fieldp.host.begin(); // specify the correct cutoff and alpha values diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 0eaaafeb1e..f415b30334 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -203,7 +203,7 @@ class BaseAmoeba { virtual void update_fieldp(void **fieldp_ptr) { *fieldp_ptr=_fieldp.host.begin(); // _fieldp store both arrays, one after another - _fieldp.update_host(_max_fieldp_size*8,false); + _fieldp.update_host(_max_fieldp_size*6,false); } /// setup a plan for FFT, where size is the number of elements diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 24ffae8de2..8d6ad5dfb2 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -211,7 +211,7 @@ void HippoT::compute_repulsion(const int /*ago*/, const int inum_full, if (inum_full>this->_max_tep_size) { 
this->_max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10); - this->_tep.resize(this->_max_tep_size*4); + this->_tep.resize(this->_max_tep_size*3); } *tep_ptr=this->_tep.host.begin(); @@ -226,7 +226,7 @@ void HippoT::compute_repulsion(const int /*ago*/, const int inum_full, repulsion(this->_eflag,this->_vflag); // copy tep from device to host - this->_tep.update_host(this->_max_tep_size*4,false); + this->_tep.update_host(this->_max_tep_size*3,false); } // --------------------------------------------------------------------------- @@ -366,7 +366,7 @@ void HippoT::compute_multipole_real(const int /*ago*/, const int inum_full, if (inum_full>this->_max_tep_size) { this->_max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10); - this->_tep.resize(this->_max_tep_size*4); + this->_tep.resize(this->_max_tep_size*3); } *tep_ptr=this->_tep.host.begin(); @@ -376,7 +376,7 @@ void HippoT::compute_multipole_real(const int /*ago*/, const int inum_full, multipole_real(this->_eflag,this->_vflag); // copy tep from device to host - this->_tep.update_host(this->_max_tep_size*4,false); + this->_tep.update_host(this->_max_tep_size*3,false); } // --------------------------------------------------------------------------- @@ -434,6 +434,10 @@ void HippoT::compute_udirect2b(int * /*host_amtype*/, int * /*host_amgroup*/, do this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, host_pval); this->atom->add_extra_data(); + if (this->_max_tep_size>this->_max_fieldp_size) { + this->_max_fieldp_size = this->_max_tep_size; + this->_fieldp.resize(this->_max_fieldp_size*6); + } *fieldp_ptr=this->_fieldp.host.begin(); this->_off2_polar = off2_polar; @@ -442,7 +446,7 @@ void HippoT::compute_udirect2b(int * /*host_amtype*/, int * /*host_amgroup*/, do // copy field and fieldp from device to host (_fieldp store both arrays, one after another) - this->_fieldp.update_host(this->_max_fieldp_size*8,false); + this->_fieldp.update_host(this->_max_fieldp_size*6,false); } //
--------------------------------------------------------------------------- @@ -580,7 +584,7 @@ void HippoT::compute_polar_real(int * /*host_amtype*/, int * /*host_amgroup*/, d this->device->add_ans_object(this->ans); // copy tep from device to host - this->_tep.update_host(this->_max_tep_size*4,false); + this->_tep.update_host(this->_max_tep_size*3,false); } // --------------------------------------------------------------------------- diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index b7e1f8e118..8b3cf2cfea 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -745,15 +745,15 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) auto field_ptr = (float *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { - int idx = 4*i; + int idx = 3*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; field[i][2] += field_ptr[idx+2]; } - field_ptr += 4*inum; + field_ptr += 3*inum; for (int i = 0; i < nlocal; i++) { - int idx = 4*i; + int idx = 3*i; fieldp[i][0] += field_ptr[idx]; fieldp[i][1] += field_ptr[idx+1]; fieldp[i][2] += field_ptr[idx+2]; @@ -762,15 +762,15 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) auto field_ptr = (double *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { - int idx = 4*i; + int idx = 3*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; field[i][2] += field_ptr[idx+2]; } - field_ptr += 4*inum; + field_ptr += 3*inum; for (int i = 0; i < nlocal; i++) { - int idx = 4*i; + int idx = 3*i; fieldp[i][0] += field_ptr[idx]; fieldp[i][1] += field_ptr[idx+1]; fieldp[i][2] += field_ptr[idx+2]; @@ -976,15 +976,15 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) auto field_ptr = (float *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { - int idx = 4*i; + int idx = 3*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; field[i][2] += field_ptr[idx+2]; } - field_ptr += 4*inum; + field_ptr += 3*inum; for (int i = 0; i < 
nlocal; i++) { - int idx = 4*i; + int idx = 3*i; fieldp[i][0] += field_ptr[idx]; fieldp[i][1] += field_ptr[idx+1]; fieldp[i][2] += field_ptr[idx+2]; @@ -993,15 +993,15 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) auto field_ptr = (double *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { - int idx = 4*i; + int idx = 3*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; field[i][2] += field_ptr[idx+2]; } - field_ptr += 4*inum; + field_ptr += 3*inum; for (int i = 0; i < nlocal; i++) { - int idx = 4*i; + int idx = 3*i; fieldp[i][0] += field_ptr[idx]; fieldp[i][1] += field_ptr[idx+1]; fieldp[i][2] += field_ptr[idx+2]; @@ -2029,9 +2029,9 @@ void PairAmoebaGPU::compute_force_from_torque(const numtyp* tq_ptr, int nlocal = atom->nlocal; for (i = 0; i < nlocal; i++) { - _tq[0] = tq_ptr[4*i]; - _tq[1] = tq_ptr[4*i+1]; - _tq[2] = tq_ptr[4*i+2]; + _tq[0] = tq_ptr[3*i]; + _tq[1] = tq_ptr[3*i+1]; + _tq[2] = tq_ptr[3*i+2]; torque2force(i,_tq,fix,fiy,fiz,force_comp); iz = zaxis2local[i]; diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index f87676ec08..256b4088d8 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -859,15 +859,15 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) auto field_ptr = (float *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { - int idx = 4*i; + int idx = 3*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; field[i][2] += field_ptr[idx+2]; } - field_ptr += 4*inum; + field_ptr += 3*inum; for (int i = 0; i < nlocal; i++) { - int idx = 4*i; + int idx = 3*i; fieldp[i][0] += field_ptr[idx]; fieldp[i][1] += field_ptr[idx+1]; fieldp[i][2] += field_ptr[idx+2]; @@ -877,15 +877,15 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) auto field_ptr = (double *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { - int idx = 4*i; + int idx = 3*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; field[i][2] += field_ptr[idx+2]; } - 
field_ptr += 4*inum; + field_ptr += 3*inum; for (int i = 0; i < nlocal; i++) { - int idx = 4*i; + int idx = 3*i; fieldp[i][0] += field_ptr[idx]; fieldp[i][1] += field_ptr[idx+1]; fieldp[i][2] += field_ptr[idx+2]; @@ -1087,15 +1087,15 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) auto *field_ptr = (float *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { - int idx = 4*i; + int idx = 3*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; field[i][2] += field_ptr[idx+2]; } - field_ptr += 4*inum; + field_ptr += 3*inum; for (int i = 0; i < nlocal; i++) { - int idx = 4*i; + int idx = 3*i; fieldp[i][0] += field_ptr[idx]; fieldp[i][1] += field_ptr[idx+1]; fieldp[i][2] += field_ptr[idx+2]; @@ -1105,15 +1105,15 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) auto *field_ptr = (double *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { - int idx = 4*i; + int idx = 3*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; field[i][2] += field_ptr[idx+2]; } - field_ptr += 4*inum; + field_ptr += 3*inum; for (int i = 0; i < nlocal; i++) { - int idx = 4*i; + int idx = 3*i; fieldp[i][0] += field_ptr[idx]; fieldp[i][1] += field_ptr[idx+1]; fieldp[i][2] += field_ptr[idx+2]; @@ -1456,9 +1456,9 @@ void PairHippoGPU::compute_force_from_torque(const numtyp* tq_ptr, int nlocal = atom->nlocal; for (i = 0; i < nlocal; i++) { - _tq[0] = tq_ptr[4*i]; - _tq[1] = tq_ptr[4*i+1]; - _tq[2] = tq_ptr[4*i+2]; + _tq[0] = tq_ptr[3*i]; + _tq[1] = tq_ptr[3*i+1]; + _tq[2] = tq_ptr[3*i+2]; torque2force(i,_tq,fix,fiy,fiz,force_comp); iz = zaxis2local[i];