Replacing mul24 with fast_mul in device code.

This commit is contained in:
W. Michael Brown
2011-10-23 18:04:42 -04:00
parent b0ee11cadd
commit cab77720de
21 changed files with 46 additions and 45 deletions

View File

@ -147,7 +147,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {

View File

@ -184,7 +184,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;

View File

@ -216,7 +216,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;

View File

@ -25,7 +25,7 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#define atom_info(t_per_atom, ii, tid, offset) \
tid=THREAD_ID_X; \
offset=tid & (t_per_atom-1); \
ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
#define nbor_info_e(nbor_mem, nbor_stride, t_per_atom, ii, offset, \
i, numj, stride, list_end, nbor) \
@ -34,9 +34,9 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
nbor+=nbor_stride; \
numj=*nbor; \
nbor+=nbor_stride; \
list_end=nbor+mul24(nbor_stride,numj); \
nbor+=mul24(offset,nbor_stride); \
stride=mul24(t_per_atom,nbor_stride);
list_end=nbor+fast_mul(nbor_stride,numj); \
nbor+=fast_mul(offset,nbor_stride); \
stride=fast_mul(t_per_atom,nbor_stride);
#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
eflag, vflag, ans, engv) \

View File

@ -37,12 +37,12 @@ __kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
__global int *list_end=nbor+fast_mul(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24(iw,ntypes);
int itype=fast_mul(iw,ntypes);
int newj=0;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
@ -90,7 +90,7 @@ __kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
cutsq[ii]=cut_form[ii].x;
form[ii]=cut_form[ii].y;
}
ii+=mul24((int)BLOCK_SIZE_X,(int)BLOCK_ID_X)+start;
ii+=fast_mul((int)BLOCK_SIZE_X,(int)BLOCK_ID_X)+start;
__syncthreads();
if (ii<inum) {
@ -99,12 +99,12 @@ __kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
__global int *list_end=nbor+fast_mul(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
int newj=0;
for ( ; nbor<list_end; nbor+=nbor_pitch) {

View File

@ -196,7 +196,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
kappa[1]*=r;
kappa[2]*=r;
int mtype=mul24(ntypes,itype)+jtype;
int mtype=fast_mul(ntypes,itype)+jtype;
numtyp sigma = sig_eps[mtype].x;
numtyp epsilon = sig_eps[mtype].y;
numtyp varrho = sigma/(h12+gum[0]*sigma);

View File

@ -131,7 +131,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
kappa[1]*=r;
kappa[2]*=r;
int mtype=mul24(ntypes,itype)+jtype;
int mtype=fast_mul(ntypes,itype)+jtype;
numtyp sigma = sig_eps[mtype].x;
numtyp epsilon = sig_eps[mtype].y;
numtyp varrho = sigma/(h12+gum[0]*sigma);
@ -357,7 +357,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {

View File

@ -139,7 +139,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {

View File

@ -140,7 +140,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {

View File

@ -178,7 +178,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;

View File

@ -171,7 +171,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;

View File

@ -176,7 +176,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;

View File

@ -142,7 +142,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {

View File

@ -140,7 +140,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {

View File

@ -21,7 +21,7 @@ __kernel void kernel_unpack(__global int *dev_nbor, __global int *dev_ij,
const int inum, const int t_per_atom) {
int tid=THREAD_ID_X;
int offset=tid & (t_per_atom-1);
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
int ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
if (ii<inum) {
__global int *nbor=dev_nbor+ii+inum;
@ -30,8 +30,8 @@ __kernel void kernel_unpack(__global int *dev_nbor, __global int *dev_ij,
__global int *list=dev_ij+*nbor;
__global int *list_end=list+numj;
list+=offset;
nbor+=mul24(ii,t_per_atom-1)+offset;
int stride=mul24(t_per_atom,inum);
nbor+=fast_mul(ii,t_per_atom-1)+offset;
int stride=fast_mul(t_per_atom,inum);
for ( ; list<list_end; list++) {
*nbor=*list;

View File

@ -217,7 +217,7 @@ __kernel void kernel_special(__global int *dev_nbor,
__global int *nspecial, __global int *special,
int inum, int nt, int max_nbors, int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
int ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid & (t_per_atom-1);
@ -234,15 +234,15 @@ __kernel void kernel_special(__global int *dev_nbor,
stride=inum;
list=dev_nbor+stride+ii;
numj=*list;
list+=stride+mul24(ii,t_per_atom-1);
stride=mul24(inum,t_per_atom);
list_end=list+mul24(int(numj/t_per_atom),stride)+(numj & (t_per_atom-1));
list+=stride+fast_mul(ii,t_per_atom-1);
stride=fast_mul(inum,t_per_atom);
list_end=list+fast_mul(int(numj/t_per_atom),stride)+(numj & (t_per_atom-1));
list+=offset;
} else {
stride=1;
list=host_nbor_list+(ii-inum)*max_nbors;
numj=host_numj[ii-inum];
list_end=list+mul24(numj,stride);
list_end=list+fast_mul(numj,stride);
}
for ( ; list<list_end; list+=stride) {

View File

@ -53,7 +53,7 @@ __kernel void particle_map(__global numtyp4 *x_, __global numtyp *q_,
// Resequence the atom indices to avoid collisions during atomic ops
int nthreads=GLOBAL_SIZE_X;
ii=mul24(ii,PPPM_BLOCK_1D);
ii=fast_mul(ii,PPPM_BLOCK_1D);
ii-=(ii/nthreads)*(nthreads-1);
int nx,ny,nz;
@ -130,21 +130,21 @@ __kernel void make_rho(__global int *counts, __global grdtyp4 *atoms,
y_stop-=ny-nlocal_y+1;
if (nz>=nlocal_z)
z_stop-=nz-nlocal_z+1;
int z_stride=mul24(nlocal_x,nlocal_y);
int z_stride=fast_mul(nlocal_x,nlocal_y);
int loop_count=npts_x/PENCIL_SIZE+1;
int nx=fid;
int pt=mul24(nz,mul24(npts_y,npts_x))+mul24(ny,npts_x)+nx;
int pt=fast_mul(nz,fast_mul(npts_y,npts_x))+fast_mul(ny,npts_x)+nx;
for (int i=0 ; i<loop_count; i++) {
for (int n=0; n<order; n++)
ans[n][tid]=(grdtyp)0.0;
if (nx<nlocal_x && nz<npts_z) {
int z_pos=mul24(nz+z_start-order_m_1,z_stride);
int z_pos=fast_mul(nz+z_start-order_m_1,z_stride);
for (int m=z_start; m<z_stop; m++) {
int y_pos=mul24(ny+y_start-order_m_1,nlocal_x);
int y_pos=fast_mul(ny+y_start-order_m_1,nlocal_x);
for (int l=y_start; l<y_stop; l++) {
int pos=z_pos+y_pos+nx;
int natoms=mul24(counts[pos],atom_stride);
int natoms=fast_mul(counts[pos],atom_stride);
for (int row=pos; row<natoms; row+=atom_stride) {
grdtyp4 delta=atoms[row];
@ -240,13 +240,13 @@ __kernel void interp(__global numtyp4 *x_, __global numtyp *q_,
}
}
int mz=mul24(nz,npts_yx)+nx;
int mz=fast_mul(nz,npts_yx)+nx;
for (int n=0; n<order; n++) {
grdtyp rho1d_2=(grdtyp)0.0;
for (int k=order2+n; k>=n; k-=order)
rho1d_2=rho_coeff[k]+rho1d_2*dz;
grdtyp z0=qs*rho1d_2;
int my=mz+mul24(ny,npts_x);
int my=mz+fast_mul(ny,npts_x);
for (int m=0; m<order; m++) {
grdtyp y0=z0*rho1d_1[m][tid];
for (int l=0; l<order; l++) {

View File

@ -105,10 +105,10 @@ ucl_inline double fetch_q(const int& i, const double *q) { return q[i]; }
#endif
#if (__CUDA_ARCH__ < 200)
#define mul24 __mul24
#define fast_mul __mul24
#define MEM_THREADS 16
#else
#define mul24(X,Y) (X)*(Y)
// Plain integer multiply fallback. The whole expansion is parenthesized so the
// macro keeps its meaning inside larger expressions (e.g. a / fast_mul(b,c)).
#define fast_mul(X,Y) ((X)*(Y))
#define MEM_THREADS 32
#endif
@ -144,6 +144,7 @@ typedef struct _double4 double4;
#ifdef GENERIC_OCL
#define fast_mul mul24
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)

View File

@ -197,7 +197,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
gpu_times3(aTe1,a1,temp);
numtyp sigma, epsilon;
int mtype=mul24(ntypes,itype)+jtype;
int mtype=fast_mul(ntypes,itype)+jtype;
sigma = sig_eps[mtype].x;
epsilon = sig_eps[mtype].y*factor_lj;

View File

@ -100,7 +100,7 @@ __kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
rhat[2] = r[2]*rnorm;
numtyp sigma, epsilon;
int mtype=mul24(ntypes,itype)+jtype;
int mtype=fast_mul(ntypes,itype)+jtype;
sigma = sig_eps[mtype].x;
epsilon = sig_eps[mtype].y*factor_lj;
@ -335,7 +335,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
rhat[2] = r[2]*rnorm;
numtyp sigma, epsilon;
int mtype=mul24(ntypes,itype)+jtype;
int mtype=fast_mul(ntypes,itype)+jtype;
sigma = sig_eps[mtype].x;
epsilon = sig_eps[mtype].y*factor_lj;
@ -577,7 +577,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {

View File

@ -8,6 +8,6 @@ mkdir /tmp/cpp5678/lgpu
foreach file ( $files )
# /bin/cp $file /tmp/cpp5678/$file:t:t
# ------ Sed Replace
sed -i 's/atom->dev_engv/ans->dev_engv/g' $file
sed -i.bak 's/atom->dev_engv/ans->dev_engv/g' $file
end