diff --git a/lib/gpu/geryon/ocl_timer.h b/lib/gpu/geryon/ocl_timer.h index 1f56aeb364..bdfec64f54 100644 --- a/lib/gpu/geryon/ocl_timer.h +++ b/lib/gpu/geryon/ocl_timer.h @@ -38,8 +38,8 @@ namespace ucl_opencl { /// Class for timing OpenCL events class UCL_Timer { public: - inline UCL_Timer() : _total_time(0.0f), _initialized(false) { } - inline UCL_Timer(UCL_Device &dev) : _total_time(0.0f), _initialized(false) + inline UCL_Timer() : _total_time(0.0f), _initialized(false), has_measured_time(false) { } + inline UCL_Timer(UCL_Device &dev) : _total_time(0.0f), _initialized(false), has_measured_time(false) { init(dev); } inline ~UCL_Timer() { clear(); } @@ -52,6 +52,7 @@ class UCL_Timer { _initialized=false; _total_time=0.0; } + has_measured_time = false; } /// Initialize default command queue for timing @@ -64,25 +65,39 @@ class UCL_Timer { _cq=cq; clRetainCommandQueue(_cq); _initialized=true; + has_measured_time = false; } /// Start timing on default command queue - inline void start() { UCL_OCL_MARKER(_cq,&start_event); } + inline void start() { + UCL_OCL_MARKER(_cq,&start_event); + has_measured_time = false; + } /// Stop timing on default command queue - inline void stop() { UCL_OCL_MARKER(_cq,&stop_event); } + inline void stop() { + UCL_OCL_MARKER(_cq,&stop_event); + has_measured_time = true; + } /// Block until the start event has been reached on device - inline void sync_start() - { CL_SAFE_CALL(clWaitForEvents(1,&start_event)); } + inline void sync_start() { + CL_SAFE_CALL(clWaitForEvents(1,&start_event)); + has_measured_time = false; + } /// Block until the stop event has been reached on device - inline void sync_stop() - { CL_SAFE_CALL(clWaitForEvents(1,&stop_event)); } + inline void sync_stop() { + CL_SAFE_CALL(clWaitForEvents(1,&stop_event)); + has_measured_time = true; + } /// Set the time elapsed to zero (not the total_time) - inline void zero() - { UCL_OCL_MARKER(_cq,&start_event); UCL_OCL_MARKER(_cq,&stop_event); } + inline void zero() { + has_measured_time = false; + UCL_OCL_MARKER(_cq,&start_event); + UCL_OCL_MARKER(_cq,&stop_event); + } /// Set the total time to zero inline void zero_total() { _total_time=0.0; } @@ -97,6 +112,7 @@ class UCL_Timer { /// Return the time (ms) of last start to stop - Forces synchronization inline double time() { + if(!has_measured_time) return 0.0; cl_ulong tstart,tend; CL_SAFE_CALL(clWaitForEvents(1,&stop_event)); CL_SAFE_CALL(clGetEventProfilingInfo(stop_event, @@ -107,6 +123,7 @@ class UCL_Timer { sizeof(cl_ulong), &tstart, NULL)); clReleaseEvent(start_event); clReleaseEvent(stop_event); + has_measured_time = false; return (tend-tstart)*t_factor; } @@ -123,8 +140,9 @@ class UCL_Timer { cl_event start_event, stop_event; cl_command_queue _cq; double _total_time; - bool _initialized; double t_factor; + bool _initialized; + bool has_measured_time; }; } // namespace diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp index 49b11f3ae0..11aa695f08 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp @@ -292,7 +292,8 @@ public: #if ! defined( KOKKOS_ENABLE_CUDA_LDG_INTRINSIC ) if ( 0 == r ) { - Kokkos::abort("Cuda const random access View using Cuda texture memory requires Kokkos to allocate the View's memory"); + //Kokkos::abort("Cuda const random access View using Cuda texture memory requires Kokkos to allocate the View's memory"); + return handle_type(); } #endif diff --git a/src/USER-UEF/fix_nh_uef.cpp b/src/USER-UEF/fix_nh_uef.cpp index cd0b2ba268..bfa4549286 100644 --- a/src/USER-UEF/fix_nh_uef.cpp +++ b/src/USER-UEF/fix_nh_uef.cpp @@ -536,10 +536,26 @@ void FixNHUef::pre_exchange() rotate_x(rot); rotate_f(rot); - // put all atoms in the new box - double **x = atom->x; + // this is a generalization of what is done in domain->image_flip(...) + int ri[3][3]; + uefbox->get_inverse_cob(ri); imageint *image = atom->image; int nlocal = atom->nlocal; + for (int i=0; i> IMGBITS & IMGMASK) - IMGMAX; + iold[2] = (image[i] >> IMG2BITS) - IMGMAX; + inew[0] = ri[0][0]*iold[0] + ri[0][1]*iold[1] + ri[0][2]*iold[2]; + inew[1] = ri[1][0]*iold[0] + ri[1][1]*iold[1] + ri[1][2]*iold[2]; + inew[2] = ri[2][0]*iold[0] + ri[2][1]*iold[1] + ri[2][2]*iold[2]; + image[i] = ((imageint) (inew[0] + IMGMAX) & IMGMASK) | + (((imageint) (inew[1] + IMGMAX) & IMGMASK) << IMGBITS) | + (((imageint) (inew[2] + IMGMAX) & IMGMASK) << IMG2BITS); + } + + // put all atoms in the new box + double **x = atom->x; for (int i=0; iremap(x[i],image[i]); // move atoms to the right processors diff --git a/src/USER-UEF/uef_utils.cpp b/src/USER-UEF/uef_utils.cpp index a5498d605f..a2e6cb291e 100644 --- a/src/USER-UEF/uef_utils.cpp +++ b/src/USER-UEF/uef_utils.cpp @@ -30,47 +30,54 @@ namespace LAMMPS_NS { UEFBox::UEFBox() { + // initial box (also an inverse eigenvector matrix of automorphisms) + double x = 0.327985277605681; double y = 0.591009048506103; double z = 0.736976229099578; l0[0][0]= z; l0[0][1]= y; l0[0][2]= x; l0[1][0]=-x; l0[1][1]= z; l0[1][2]=-y; l0[2][0]=-y; l0[2][1]= x; l0[2][2]= z; + // spectra of the two automorpisms (log of eigenvalues) + w1[0]=-1.177725211523360; w1[1]=-0.441448620566067; w1[2]= 1.619173832089425; w2[0]= w1[1]; w2[1]= w1[2]; w2[2]= w1[0]; + // initialize theta // strain = w1 * theta1 + w2 * theta2 + theta[0]=theta[1]=0; - //set up the initial box l and change of basis matrix r + for (int k=0;k<3;k++) - for (int j=0;j<3;j++) - { + for (int j=0;j<3;j++) { l[k][j] = l0[k][j]; r[j][k]=(j==k); + ri[j][k]=(j==k); } // get the initial rotation and upper triangular matrix + rotation_matrix(rot, lrot ,l); // this is just a way to calculate the automorphisms // themselves, which play a minor role in the calculations // it's overkill, but only called once + double t1[3][3]; double t1i[3][3]; double t2[3][3]; double t2i[3][3]; double l0t[3][3]; for (int k=0; k<3; ++k) - for (int j=0; j<3; ++j) - { + for (int j=0; j<3; ++j) { t1[k][j] = exp(w1[k])*l0[k][j]; t1i[k][j] = exp(-w1[k])*l0[k][j]; t2[k][j] = exp(w2[k])*l0[k][j]; @@ -82,8 +89,7 @@ UEFBox::UEFBox() mul_m2(l0t,t2); mul_m2(l0t,t2i); for (int k=0; k<3; ++k) - for (int j=0; j<3; ++j) - { + for (int j=0; j<3; ++j) { a1[k][j] = round(t1[k][j]); a1i[k][j] = round(t1i[k][j]); a2[k][j] = round(t2[k][j]); @@ -92,6 +98,7 @@ UEFBox::UEFBox() // winv used to transform between // strain increments and theta increments + winv[0][0] = w2[1]; winv[0][1] = -w2[0]; winv[1][0] = -w1[1]; @@ -102,7 +109,9 @@ UEFBox::UEFBox() winv[k][j] /= d; } -// get volume-correct r basis in: basis*cbrt(vol) = q*r +/* ---------------------------------------------------------------------- + get volume-correct r basis in: basis*cbrt(vol) = q*r +------------------------------------------------------------------------- */ void UEFBox::get_box(double x[3][3], double v) { v = cbrtf(v); @@ -111,7 +120,9 @@ void UEFBox::get_box(double x[3][3], double v) x[k][j] = lrot[k][j]*v; } -// get rotation matrix q in: basis = q*r +/* ---------------------------------------------------------------------- + get rotation matrix q in: basis = q*r +------------------------------------------------------------------------- */ void UEFBox::get_rot(double x[3][3]) { for (int k=0;k<3;k++) @@ -119,20 +130,32 @@ void UEFBox::get_rot(double x[3][3]) x[k][j]=rot[k][j]; } -// diagonal, incompressible deformation +/* ---------------------------------------------------------------------- + get inverse change of basis matrix +------------------------------------------------------------------------- */ +void UEFBox::get_inverse_cob(int x[3][3]) +{ + for (int k=0;k<3;k++) + for (int j=0;j<3;j++) + x[k][j]=ri[k][j]; +} + +/* ---------------------------------------------------------------------- + apply diagonal, incompressible deformation +------------------------------------------------------------------------- */ void UEFBox::step_deform(const double ex, const double ey) { // increment theta values used in the reduction + theta[0] +=winv[0][0]*ex + winv[0][1]*ey; theta[1] +=winv[1][0]*ex + winv[1][1]*ey; - // deformation of the box. reduce() needs to - // be called regularly or calculation will become - // unstable + // deformation of the box. reduce() needs to be called regularly or + // calculation will become unstable + double eps[3]; eps[0]=ex; eps[1] = ey; eps[2] = -ex-ey; - for (int k=0;k<3;k++) - { + for (int k=0;k<3;k++) { eps[k] = exp(eps[k]); l[k][0] = eps[k]*l[k][0]; l[k][1] = eps[k]*l[k][1]; @@ -140,68 +163,84 @@ void UEFBox::step_deform(const double ex, const double ey) } rotation_matrix(rot,lrot, l); } -// reuduce the current basis + +/* ---------------------------------------------------------------------- + reduce the current basis +------------------------------------------------------------------------- */ bool UEFBox::reduce() { - // determine how many times to apply the automorphisms - // and find new theta values + // determine how many times to apply the automorphisms and find new theta + // values + int f1 = round(theta[0]); int f2 = round(theta[1]); theta[0] -= f1; theta[1] -= f2; - // store old change or basis matrix to determine if it - // changes + // store old change or basis matrix to determine if it changes + int r0[3][3]; for (int k=0;k<3;k++) for (int j=0;j<3;j++) r0[k][j]=r[k][j]; - // this modifies the old change basis matrix to - // handle the case where the automorphism transforms - // the box but the reduced basis doesn't change + // this modifies the old change basis matrix to handle the case where the + // automorphism transforms the box but the reduced basis doesn't change // (r0 should still equal r at the end) + if (f1 > 0) for (int k=0;k 0) for (int k=0;k (-q)*m = (-r) will hold row-wise + if (r[0][0] < 0){ neg_row(q,0); neg_row(r,0); } if (r[1][1] < 0){ neg_row(q,1); neg_row(r,1); } if (r[2][2] < 0){ neg_row(q,2); neg_row(r,2); } } - - -//sort columns in order of increasing length -void col_sort(double b[3][3],int r[3][3]) +/* ---------------------------------------------------------------------- + sort columns of b in order of increasing length + mimic column operations on ri and r +------------------------------------------------------------------------- */ +void col_sort(double b[3][3],int r[3][3],int ri[3][3]) { - if (col_prod(b,0,0)>col_prod(b,1,1)) - { + if (col_prod(b,0,0)>col_prod(b,1,1)) { col_swap(b,0,1); col_swap(r,0,1); + col_swap(ri,0,1); } - if (col_prod(b,0,0)>col_prod(b,2,2)) - { + if (col_prod(b,0,0)>col_prod(b,2,2)) { col_swap(b,0,2); col_swap(r,0,2); + col_swap(ri,0,2); } - if (col_prod(b,1,1)>col_prod(b,2,2)) - { + if (col_prod(b,1,1)>col_prod(b,2,2)) { col_swap(b,1,2); col_swap(r,1,2); + col_swap(ri,1,2); } } - -// 1-2 reduction (Graham-Schmidt) -void red12(double b[3][3],int r[3][3]) +/* ---------------------------------------------------------------------- + 1-2 reduction (Graham-Schmidt) +------------------------------------------------------------------------- */ +void red12(double b[3][3],int r[3][3],int ri[3][3]) { int y = round(col_prod(b,0,1)/col_prod(b,0,0)); b[0][1] -= y*b[0][0]; @@ -276,16 +318,23 @@ void red12(double b[3][3],int r[3][3]) r[0][1] -= y*r[0][0]; r[1][1] -= y*r[1][0]; r[2][1] -= y*r[2][0]; - if (col_prod(b,1,1) < col_prod(b,0,0)) - { + + ri[0][0] += y*ri[0][1]; + ri[1][0] += y*ri[1][1]; + ri[2][0] += y*ri[2][1]; + + if (col_prod(b,1,1) < col_prod(b,0,0)) { col_swap(b,0,1); col_swap(r,0,1); - red12(b,r); + col_swap(ri,0,1); + red12(b,r,ri); } } -// The Semaev condition for a 3-reduced basis -void red3(double b[3][3], int r[3][3]) +/* ---------------------------------------------------------------------- + Apply the Semaev condition for a 3-reduced basis +------------------------------------------------------------------------- */ +void red3(double b[3][3], int r[3][3], int ri[3][3]) { double b11 = col_prod(b,0,0); double b22 = col_prod(b,1,1); @@ -304,63 +353,97 @@ void red3(double b[3][3], int r[3][3]) x1v[0] = floor(y1); x1v[1] = x1v[0]+1; x2v[0] = floor(y2); x2v[1] = x2v[0]+1; for (int k=0;k<2;k++) - for (int j=0;j<2;j++) - { + for (int j=0;j<2;j++) { double a[3]; a[0] = b[0][2] + x1v[k]*b[0][0] + x2v[j]*b[0][1]; a[1] = b[1][2] + x1v[k]*b[1][0] + x2v[j]*b[1][1]; a[2] = b[2][2] + x1v[k]*b[2][0] + x2v[j]*b[2][1]; double val=a[0]*a[0]+a[1]*a[1]+a[2]*a[2]; - if (val T col_prod(T x[3][3], int c1, int c2) { @@ -56,8 +57,7 @@ T col_prod(T x[3][3], int c1, int c2) template void col_swap(T x[3][3], int c1, int c2) { - for (int k=0;k<3;k++) - { + for (int k=0;k<3;k++) { T t = x[k][c2]; x[k][c2]=x[k][c1]; x[k][c1]=t; @@ -101,9 +101,21 @@ bool mat_same(T x1[3][3], T x2[3][3]) } template -void mul_m1(T m1[3][3], const T m2[3][3]) +void transpose(T m[3][3]) { T t[3][3]; + for (int k=0;k<3;k++) + for (int j=k+1;j<3;j++) { + T x = m[k][j]; + m[k][j] = m[j][k]; + m[j][k] = x; + } +} + +template +void mul_m1(T1 m1[3][3], const T2 m2[3][3]) +{ + T1 t[3][3]; for (int k=0;k<3;k++) for (int j=0;j<3;j++) t[k][j]=m1[k][j]; @@ -113,10 +125,10 @@ void mul_m1(T m1[3][3], const T m2[3][3]) m1[k][j] = t[k][0]*m2[0][j] + t[k][1]*m2[1][j] + t[k][2]*m2[2][j]; } -template -void mul_m2(const T m1[3][3], T m2[3][3]) +template +void mul_m2(const T1 m1[3][3], T2 m2[3][3]) { - T t[3][3]; + T2 t[3][3]; for (int k=0;k<3;k++) for (int j=0;j<3;j++) t[k][j]=m2[k][j]; diff --git a/src/fix_group.cpp b/src/fix_group.cpp index 2447002bc5..68d74b8b1f 100644 --- a/src/fix_group.cpp +++ b/src/fix_group.cpp @@ -84,7 +84,7 @@ idregion(NULL), idvar(NULL), idprop(NULL) idprop = new char[n]; strcpy(idprop,arg[iarg+1]); iarg += 2; - } else if (strcmp(arg[iarg],"every") == 0) { + } else if (strcmp(arg[iarg],"every") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal group command"); nevery = force->inumeric(FLERR,arg[iarg+1]); if (nevery <= 0) error->all(FLERR,"Illegal group command"); diff --git a/src/group.cpp b/src/group.cpp index 9d33da9acb..dd5e53bb3c 100644 --- a/src/group.cpp +++ b/src/group.cpp @@ -754,6 +754,18 @@ void Group::read_restart(FILE *fp) // computations on a group of atoms // ---------------------------------------------------------------------- +/* ---------------------------------------------------------------------- + count atoms in group all +------------------------------------------------------------------------- */ + +bigint Group::count_all() +{ + bigint nme = atom->nlocal; + bigint nall; + MPI_Allreduce(&nme,&nall,1,MPI_LMP_BIGINT,MPI_SUM,world); + return nall; +} + /* ---------------------------------------------------------------------- count atoms in group ------------------------------------------------------------------------- */ diff --git a/src/group.h b/src/group.h index 6b6cbb1def..962d37b32a 100644 --- a/src/group.h +++ b/src/group.h @@ -37,6 +37,7 @@ class Group : protected Pointers { void write_restart(FILE *); void read_restart(FILE *); + bigint count_all(); // count atoms in group all bigint count(int); // count atoms in group bigint count(int,int); // count atoms in group & region double mass(int); // total mass of atoms in group diff --git a/src/thermo.cpp b/src/thermo.cpp index ade7a3c333..ddbbd0f496 100644 --- a/src/thermo.cpp +++ b/src/thermo.cpp @@ -1698,7 +1698,7 @@ void Thermo::compute_timeremain() void Thermo::compute_atoms() { - bivalue = atom->natoms; + bivalue = group->count_all(); } /* ---------------------------------------------------------------------- */