Misc Improvements to GPU Package

- Optimizations for molecular systems
- Improved kernel performance and greater CPU overlap
- Reduced GPU-to-CPU communication for discrete devices
- Switched classic Intel makefiles to use LLVM-based compilers
- Prefetch optimizations supported for OpenCL
- Optimized data repack for quaternions
2023-03-05 21:03:12 -08:00
parent 142876a59b
commit 37f22c8627
151 changed files with 1085 additions and 617 deletions
--- a/lib/gpu/lal_atom.cpp
+++ b/lib/gpu/lal_atom.cpp
@@ -114,7 +114,7 @@ bool AtomT::alloc(const int nall) {
                                UCL_READ_ONLY)==UCL_SUCCESS);
    gpu_bytes+=q.device.row_bytes();
  }
-  if (_rot && !_host_view) {
+  if (_rot) {
    success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
                                   UCL_READ_ONLY)==UCL_SUCCESS);
    gpu_bytes+=quat.device.row_bytes();
@@ -182,11 +182,9 @@ bool AtomT::add_fields(const bool charge, const bool rot,
  if (rot && !_rot) {
    _rot=true;
    _other=true;
-    if (!_host_view) {
-      success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
-                                     UCL_READ_ONLY)==UCL_SUCCESS);
-      gpu_bytes+=quat.device.row_bytes();
-    }
+    success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
+                                   UCL_READ_ONLY)==UCL_SUCCESS);
+    gpu_bytes+=quat.device.row_bytes();
  }

  if (vel && !_vel) {
@@ -451,7 +449,7 @@ template <class numtyp, class acctyp>
 void AtomT::compile_kernels(UCL_Device &dev) {
  std::string flags = "";
  atom_program=new UCL_Program(dev);
-  atom_program->load_string(atom,flags,nullptr,screen);
+  atom_program->load_string(atom,flags.c_str(),nullptr,stderr);
  k_cast_x.set_function(*atom_program,"kernel_cast_x");
  _compiled=true;
 }