Attempted to resolve the memory access runtime errors when acquiring single and mixed precision arrays from the GPU lib
This commit is contained in:
@ -1631,9 +1631,9 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
|
|||||||
const __global numtyp4 *restrict thetai3,
|
const __global numtyp4 *restrict thetai3,
|
||||||
const __global int *restrict igrid,
|
const __global int *restrict igrid,
|
||||||
const __global numtyp2 *restrict grid,
|
const __global numtyp2 *restrict grid,
|
||||||
__global numtyp *restrict fdip_phi1,
|
__global acctyp *restrict fdip_phi1,
|
||||||
__global numtyp *restrict fdip_phi2,
|
__global acctyp *restrict fdip_phi2,
|
||||||
__global numtyp *restrict fdip_sum_phi,
|
__global acctyp *restrict fdip_sum_phi,
|
||||||
const int bsorder, const int inum,
|
const int bsorder, const int inum,
|
||||||
const int nzlo_out, const int nylo_out,
|
const int nzlo_out, const int nylo_out,
|
||||||
const int nxlo_out, const int ngridxy,
|
const int nxlo_out, const int ngridxy,
|
||||||
@ -1843,7 +1843,7 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
|
|||||||
}
|
}
|
||||||
|
|
||||||
int idx;
|
int idx;
|
||||||
numtyp fdip_buf[20];
|
acctyp fdip_buf[20];
|
||||||
|
|
||||||
fdip_buf[0] = (numtyp)0.0;
|
fdip_buf[0] = (numtyp)0.0;
|
||||||
fdip_buf[1] = tuv100_1;
|
fdip_buf[1] = tuv100_1;
|
||||||
@ -1917,7 +1917,7 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1,
|
|||||||
const __global numtyp4 *restrict thetai3,
|
const __global numtyp4 *restrict thetai3,
|
||||||
const __global int *restrict igrid,
|
const __global int *restrict igrid,
|
||||||
const __global numtyp2 *restrict grid,
|
const __global numtyp2 *restrict grid,
|
||||||
__global numtyp *restrict fphi,
|
__global acctyp *restrict fphi,
|
||||||
const int bsorder, const int inum, const numtyp felec,
|
const int bsorder, const int inum, const numtyp felec,
|
||||||
const int nzlo_out, const int nylo_out,
|
const int nzlo_out, const int nylo_out,
|
||||||
const int nxlo_out, const int ngridxy,
|
const int nxlo_out, const int ngridxy,
|
||||||
|
|||||||
@ -250,7 +250,7 @@ class BaseAmoeba {
|
|||||||
UCL_Vector<numtyp4,numtyp4> _thetai1, _thetai2, _thetai3;
|
UCL_Vector<numtyp4,numtyp4> _thetai1, _thetai2, _thetai3;
|
||||||
UCL_Vector<int,int> _igrid;
|
UCL_Vector<int,int> _igrid;
|
||||||
UCL_Vector<numtyp2,numtyp2> _cgrid_brick;
|
UCL_Vector<numtyp2,numtyp2> _cgrid_brick;
|
||||||
UCL_Vector<numtyp,numtyp> _fdip_phi1, _fdip_phi2, _fdip_sum_phi;
|
UCL_Vector<acctyp,acctyp> _fdip_phi1, _fdip_phi2, _fdip_sum_phi;
|
||||||
int _max_thetai_size;
|
int _max_thetai_size;
|
||||||
int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out;
|
int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out;
|
||||||
int _ngridx, _ngridy, _ngridz, _num_grid_points;
|
int _ngridx, _ngridy, _ngridz, _num_grid_points;
|
||||||
|
|||||||
@ -203,10 +203,7 @@ void PairAmoebaGPU::init_style()
|
|||||||
if (gpu_mode == GPU_FORCE)
|
if (gpu_mode == GPU_FORCE)
|
||||||
error->all(FLERR,"Pair style amoeba/gpu does not support neigh no for now");
|
error->all(FLERR,"Pair style amoeba/gpu does not support neigh no for now");
|
||||||
|
|
||||||
if (tq_size == sizeof(double))
|
tq_single = tq_size != sizeof(double);
|
||||||
tq_single = false;
|
|
||||||
else
|
|
||||||
tq_single = true;
|
|
||||||
|
|
||||||
// replace with the gpu counterpart
|
// replace with the gpu counterpart
|
||||||
|
|
||||||
@ -739,23 +736,44 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp)
|
|||||||
// field and fieldp may already have some nonzero values from kspace (udirect1)
|
// field and fieldp may already have some nonzero values from kspace (udirect1)
|
||||||
|
|
||||||
int nlocal = atom->nlocal;
|
int nlocal = atom->nlocal;
|
||||||
double *field_ptr = (double *)fieldp_pinned;
|
if (tq_single) {
|
||||||
|
auto field_ptr = (float *)fieldp_pinned;
|
||||||
|
|
||||||
for (int i = 0; i < nlocal; i++) {
|
for (int i = 0; i < nlocal; i++) {
|
||||||
int idx = 4*i;
|
int idx = 4*i;
|
||||||
field[i][0] += field_ptr[idx];
|
field[i][0] += field_ptr[idx];
|
||||||
field[i][1] += field_ptr[idx+1];
|
field[i][1] += field_ptr[idx+1];
|
||||||
field[i][2] += field_ptr[idx+2];
|
field[i][2] += field_ptr[idx+2];
|
||||||
}
|
}
|
||||||
|
|
||||||
double* fieldp_ptr = (double *)fieldp_pinned;
|
auto fieldp_ptr = (float *)fieldp_pinned;
|
||||||
fieldp_ptr += 4*inum;
|
fieldp_ptr += 4*inum;
|
||||||
for (int i = 0; i < nlocal; i++) {
|
for (int i = 0; i < nlocal; i++) {
|
||||||
int idx = 4*i;
|
int idx = 4*i;
|
||||||
fieldp[i][0] += fieldp_ptr[idx];
|
fieldp[i][0] += fieldp_ptr[idx];
|
||||||
fieldp[i][1] += fieldp_ptr[idx+1];
|
fieldp[i][1] += fieldp_ptr[idx+1];
|
||||||
fieldp[i][2] += fieldp_ptr[idx+2];
|
fieldp[i][2] += fieldp_ptr[idx+2];
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
auto field_ptr = (double *)fieldp_pinned;
|
||||||
|
|
||||||
|
for (int i = 0; i < nlocal; i++) {
|
||||||
|
int idx = 4*i;
|
||||||
|
field[i][0] += field_ptr[idx];
|
||||||
|
field[i][1] += field_ptr[idx+1];
|
||||||
|
field[i][2] += field_ptr[idx+2];
|
||||||
|
}
|
||||||
|
|
||||||
|
auto fieldp_ptr = (double *)fieldp_pinned;
|
||||||
|
fieldp_ptr += 4*inum;
|
||||||
|
for (int i = 0; i < nlocal; i++) {
|
||||||
|
int idx = 4*i;
|
||||||
|
fieldp[i][0] += fieldp_ptr[idx];
|
||||||
|
fieldp[i][1] += fieldp_ptr[idx+1];
|
||||||
|
fieldp[i][2] += fieldp_ptr[idx+2];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -960,23 +978,44 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp)
|
|||||||
amoeba_gpu_update_fieldp(&fieldp_pinned);
|
amoeba_gpu_update_fieldp(&fieldp_pinned);
|
||||||
|
|
||||||
int inum = atom->nlocal;
|
int inum = atom->nlocal;
|
||||||
double *field_ptr = (double *)fieldp_pinned;
|
if (tq_single) {
|
||||||
|
auto field_ptr = (float *)fieldp_pinned;
|
||||||
|
|
||||||
for (int i = 0; i < nlocal; i++) {
|
for (int i = 0; i < nlocal; i++) {
|
||||||
int idx = 4*i;
|
int idx = 4*i;
|
||||||
field[i][0] += field_ptr[idx];
|
field[i][0] += field_ptr[idx];
|
||||||
field[i][1] += field_ptr[idx+1];
|
field[i][1] += field_ptr[idx+1];
|
||||||
field[i][2] += field_ptr[idx+2];
|
field[i][2] += field_ptr[idx+2];
|
||||||
}
|
}
|
||||||
|
|
||||||
double* fieldp_ptr = (double *)fieldp_pinned;
|
auto fieldp_ptr = (float *)fieldp_pinned;
|
||||||
fieldp_ptr += 4*inum;
|
fieldp_ptr += 4*inum;
|
||||||
for (int i = 0; i < nlocal; i++) {
|
for (int i = 0; i < nlocal; i++) {
|
||||||
int idx = 4*i;
|
int idx = 4*i;
|
||||||
fieldp[i][0] += fieldp_ptr[idx];
|
fieldp[i][0] += fieldp_ptr[idx];
|
||||||
fieldp[i][1] += fieldp_ptr[idx+1];
|
fieldp[i][1] += fieldp_ptr[idx+1];
|
||||||
fieldp[i][2] += fieldp_ptr[idx+2];
|
fieldp[i][2] += fieldp_ptr[idx+2];
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
auto field_ptr = (double *)fieldp_pinned;
|
||||||
|
|
||||||
|
for (int i = 0; i < nlocal; i++) {
|
||||||
|
int idx = 4*i;
|
||||||
|
field[i][0] += field_ptr[idx];
|
||||||
|
field[i][1] += field_ptr[idx+1];
|
||||||
|
field[i][2] += field_ptr[idx+2];
|
||||||
|
}
|
||||||
|
|
||||||
|
auto fieldp_ptr = (double *)fieldp_pinned;
|
||||||
|
fieldp_ptr += 4*inum;
|
||||||
|
for (int i = 0; i < nlocal; i++) {
|
||||||
|
int idx = 4*i;
|
||||||
|
fieldp[i][0] += fieldp_ptr[idx];
|
||||||
|
fieldp[i][1] += fieldp_ptr[idx+1];
|
||||||
|
fieldp[i][2] += fieldp_ptr[idx+2];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// accumulate timing information
|
// accumulate timing information
|
||||||
|
|
||||||
@ -1139,32 +1178,63 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1,
|
|||||||
&fdip_sum_phi_pinned);
|
&fdip_sum_phi_pinned);
|
||||||
|
|
||||||
int nlocal = atom->nlocal;
|
int nlocal = atom->nlocal;
|
||||||
double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned;
|
if (tq_single) {
|
||||||
for (int i = 0; i < nlocal; i++) {
|
auto _fdip_phi1_ptr = (float *)fdip_phi1_pinned;
|
||||||
int n = i;
|
for (int i = 0; i < nlocal; i++) {
|
||||||
for (int m = 0; m < 10; m++) {
|
int n = i;
|
||||||
fdip_phi1[i][m] = _fdip_phi1_ptr[n];
|
for (int m = 0; m < 10; m++) {
|
||||||
n += nlocal;
|
fdip_phi1[i][m] = _fdip_phi1_ptr[n];
|
||||||
|
n += nlocal;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned;
|
auto _fdip_phi2_ptr = (float *)fdip_phi2_pinned;
|
||||||
for (int i = 0; i < nlocal; i++) {
|
for (int i = 0; i < nlocal; i++) {
|
||||||
int n = i;
|
int n = i;
|
||||||
for (int m = 0; m < 10; m++) {
|
for (int m = 0; m < 10; m++) {
|
||||||
fdip_phi2[i][m] = _fdip_phi2_ptr[n];
|
fdip_phi2[i][m] = _fdip_phi2_ptr[n];
|
||||||
n += nlocal;
|
n += nlocal;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned;
|
auto _fdip_sum_phi_ptr = (float *)fdip_sum_phi_pinned;
|
||||||
for (int i = 0; i < nlocal; i++) {
|
for (int i = 0; i < nlocal; i++) {
|
||||||
int n = i;
|
int n = i;
|
||||||
for (int m = 0; m < 20; m++) {
|
for (int m = 0; m < 20; m++) {
|
||||||
fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n];
|
fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n];
|
||||||
n += nlocal;
|
n += nlocal;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
auto _fdip_phi1_ptr = (double *)fdip_phi1_pinned;
|
||||||
|
for (int i = 0; i < nlocal; i++) {
|
||||||
|
int n = i;
|
||||||
|
for (int m = 0; m < 10; m++) {
|
||||||
|
fdip_phi1[i][m] = _fdip_phi1_ptr[n];
|
||||||
|
n += nlocal;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
auto _fdip_phi2_ptr = (double *)fdip_phi2_pinned;
|
||||||
|
for (int i = 0; i < nlocal; i++) {
|
||||||
|
int n = i;
|
||||||
|
for (int m = 0; m < 10; m++) {
|
||||||
|
fdip_phi2[i][m] = _fdip_phi2_ptr[n];
|
||||||
|
n += nlocal;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
auto _fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned;
|
||||||
|
for (int i = 0; i < nlocal; i++) {
|
||||||
|
int n = i;
|
||||||
|
for (int m = 0; m < 20; m++) {
|
||||||
|
fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n];
|
||||||
|
n += nlocal;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
@ -1447,15 +1517,26 @@ void PairAmoebaGPU::polar_kspace()
|
|||||||
} else {
|
} else {
|
||||||
void* fphi_pinned = nullptr;
|
void* fphi_pinned = nullptr;
|
||||||
amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned, felec);
|
amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned, felec);
|
||||||
|
if (tq_single) {
|
||||||
double *_fphi_ptr = (double *)fphi_pinned;
|
auto _fphi_ptr = (float *)fphi_pinned;
|
||||||
for (int i = 0; i < nlocal; i++) {
|
for (int i = 0; i < nlocal; i++) {
|
||||||
int idx = i;
|
int idx = i;
|
||||||
for (int m = 0; m < 20; m++) {
|
for (int m = 0; m < 20; m++) {
|
||||||
fphi[i][m] = _fphi_ptr[idx];
|
fphi[i][m] = _fphi_ptr[idx];
|
||||||
idx += nlocal;
|
idx += nlocal;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
auto _fphi_ptr = (double *)fphi_pinned;
|
||||||
|
for (int i = 0; i < nlocal; i++) {
|
||||||
|
int idx = i;
|
||||||
|
for (int m = 0; m < 20; m++) {
|
||||||
|
fphi[i][m] = _fphi_ptr[idx];
|
||||||
|
idx += nlocal;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user