git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7180 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2011-10-24 17:49:41 +00:00
parent 5a03365db1
commit f3018d53bb
5 changed files with 50 additions and 69 deletions
--- a/src/USER-CUDA/atom_vec_atomic_cuda.cpp
+++ b/src/USER-CUDA/atom_vec_atomic_cuda.cpp
@@ -286,14 +286,14 @@ int AtomVecAtomicCuda::pack_exchange(int dim, double *buf)
  }
  if(max_nsend==0) grow_copylist(200);
-  
+
  int nsend_atoms = Cuda_AtomVecAtomicCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
  if(nsend_atoms>max_nsend) {grow_copylist(nsend_atoms+100);}
  if(nsend_atoms*NCUDAEXCHANGE>*maxsend) 
  {
  	grow_send((int) (nsend_atoms+100)*NCUDAEXCHANGE,buf_pointer,0);
-  	Cuda_AtomVecAtomicCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
+   	Cuda_AtomVecAtomicCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
  }
  int nlocal=atom->nlocal-nsend_atoms;
@@ -395,6 +395,7 @@ int AtomVecAtomicCuda::unpack_exchange(double *buf)
    }
  }
  cuda->shared_data.atom.nlocal=nlocal;
  if(atom->nlocal!=nlocal)
  cuda->shared_data.atom.update_nlocal=2;
  atom->nlocal=nlocal;
  mfirst+=m;
--- a/src/USER-CUDA/cuda.cpp
+++ b/src/USER-CUDA/cuda.cpp
@@ -130,11 +130,11 @@ Cuda::Cuda(LAMMPS *lmp) : Pointers(lmp)
 	downloadtime=0;
 	dotiming=false;
-    dotestatom = false;
+  dotestatom = false;
-    testatom = 0;	
+  testatom = 0;
 	oncpu = true;
-    self_comm = 0;
+  self_comm = 0;
 	MYDBG( printf("# CUDA: Cuda::Cuda Done...\n");)
 	//cCudaData<double, float, yx >  
 }
@@ -267,10 +267,10 @@ void Cuda::accelerator(int narg, char** arg)
 	cu_virial     = 0;
 	cu_eatom      = 0;
 	cu_vatom      = 0;
-	cu_radius	  = 0;
+	cu_radius	    = 0;
 	cu_density	  = 0;
-	cu_omega	  = 0;
+	cu_omega	    = 0;
-	cu_torque	  = 0;
+	cu_torque	    = 0;
 	cu_special 	  = 0;
 	cu_nspecial   = 0;
@@ -299,8 +299,11 @@ void Cuda::setSharedDataZero()
 	shared_data.atom.q_flag = 0;
 	shared_data.atom.need_eatom = 0;
 	shared_data.atom.need_vatom = 0;
  shared_data.atom.update_nmax = 1;
  shared_data.atom.update_nlocal = 1;
  shared_data.atom.update_neigh = 1;
-    shared_data.pair.cudable_force = 0;
+  shared_data.pair.cudable_force = 0;
 	shared_data.pair.collect_forces_later = 0;
 	shared_data.pair.use_block_per_atom = 0;
 	shared_data.pair.override_block_per_atom = -1;
@@ -429,14 +432,6 @@ void Cuda::checkResize()
 		if(cu_atom->q_flag)
 			{delete cu_q;          cu_q         = new cCudaData<double, F_FLOAT, x > ((double*)atom->q, & cu_atom->q         , atom->nmax  );}// cu_q->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
 /*
 		if(force->pair)
 		if(force->pair->eatom)
 			{delete cu_eatom;          cu_eatom         = new cCudaData<double, ENERGY_FLOAT, x > (force->pair->eatom, & cu_atom->eatom         , atom->nmax  );}// cu_eatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
 		if(force->pair)
 		if(force->pair->vatom)
 			{delete cu_vatom;          cu_vatom         = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)force->pair->vatom, & cu_atom->vatom         , atom->nmax,6  );}// cu_vatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
 */
 		if(atom->radius)
 		{
 			delete cu_radius;     cu_radius    = new cCudaData<double, X_FLOAT, x > (atom->radius    , & cu_atom->radius     , atom->nmax  );
@@ -444,11 +439,6 @@ void Cuda::checkResize()
 		    delete cu_omega_rmass;   cu_omega_rmass  = new cCudaData<V_FLOAT, V_FLOAT, x> (omega_rmass , & cu_atom->omega_rmass      , atom->nmax*4);
 		}
 		/*		
 		if(atom->density)
 			{delete cu_density;    cu_density   = new cCudaData<double, F_FLOAT, x > (atom->density   , & cu_atom->density     , atom->nmax  );}
 		*/
 		if(atom->omega)
 			{delete cu_omega;      cu_omega     = new cCudaData<double, V_FLOAT, yx > (((double*) atom->omega)    , & cu_atom->omega     , atom->nmax,3  );}
@@ -464,12 +454,10 @@ void Cuda::checkResize()
 		shared_data.atom.special_flag = neighbor->special_flag;
 		shared_data.atom.molecular = atom->molecular;
-  	    cu_atom->update_nmax = 2;
+    cu_atom->update_nmax = 2;
-	    cu_atom->nmax        = atom->nmax;
+    cu_atom->nmax        = atom->nmax;
 	    //delete [] x_type; 			x_type 		= new X_FLOAT4[atom->nmax];
 		delete cu_x_type;           cu_x_type   = new cCudaData<X_FLOAT, X_FLOAT, x> (x_type , & cu_atom->x_type      , atom->nmax*4);
 	   // shared_data.buffer_new = 2;
 	}
 	if(((cu_xhold==NULL)||(cu_xhold->get_dim()[0]<neighbor->maxhold))&&neighbor->xhold)
@@ -488,6 +476,12 @@ void Cuda::checkResize()
 	  {
 	  	cu_map_array   = new cCudaData<int, int, x > (atom->get_map_array()   , & cu_atom->map_array     , atom->get_map_size()  );
 	  }
 	  else
 	  if(cu_map_array->dev_size()/sizeof(int)<atom->get_map_size())
 	  {
 	    delete cu_map_array;
      cu_map_array   = new cCudaData<int, int, x > (atom->get_map_array()   , & cu_atom->map_array     , atom->get_map_size()  );
 	  }
 	}
@@ -512,11 +506,6 @@ void Cuda::checkResize()
 	if(atom->radius)
 	if(cu_radius->get_host_data() != atom->radius) cu_radius->set_host_data((double*) (atom->radius));
 	/*
 	if(atom->density)
 	if(cu_density->get_host_data() != atom->density) cu_density->set_host_data((double*) (atom->density));
 	*/
 	if(atom->omega)
 	if(cu_omega->get_host_data() != atom->omega) cu_omega->set_host_data((double*) (atom->omega));
@@ -558,7 +547,7 @@ void Cuda::evsetup_eatom_vatom(int eflag_atom,int vflag_atom)
    	if(not cu_vatom) 
    		cu_vatom         = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)force->pair->vatom, & (shared_data.atom.vatom)         , atom->nmax ,6 );// cu_vatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
    	cu_vatom->set_host_data((double*)force->pair->vatom); 
-		cu_vatom->memset_device(0);
+		  cu_vatom->memset_device(0);
    }
 }
@@ -579,16 +568,9 @@ void Cuda::uploadAll()
 	cu_image->upload();
 	if(shared_data.atom.q_flag) cu_q    ->upload();
 	//printf("A3\n");
 	//if(shared_data.atom.need_eatom) cu_eatom->upload();
 	//printf("A4\n");
 	//if(shared_data.atom.need_vatom) cu_vatom->upload();
 	//printf("A5\n");
 	if(atom->rmass)             cu_rmass->upload();
 	if(atom->radius)            cu_radius->upload();
 	//	if(atom->density)           cu_density->upload();
 	if(atom->omega)             cu_omega->upload();
 	if(atom->torque)            cu_torque->upload();
 	if(atom->special)           cu_special->upload();
@@ -631,7 +613,6 @@ void Cuda::downloadAll()
 	if(atom->rmass)             cu_rmass->download();
 	if(atom->radius)            cu_radius->download();
 	//	if(atom->density)           cu_density->download();
 	if(atom->omega)             cu_omega->download();
 	if(atom->torque)            cu_torque->download();
 	if(atom->special)           cu_special->download();
@@ -747,13 +728,13 @@ void Cuda::setTimingsZero()
 	shared_data.cuda_timings.neigh_special = 0;
 	//PPPM
- 	shared_data.cuda_timings.pppm_particle_map; 
+ 	shared_data.cuda_timings.pppm_particle_map = 0;
-    shared_data.cuda_timings.pppm_make_rho; 
+  shared_data.cuda_timings.pppm_make_rho = 0;
-    shared_data.cuda_timings.pppm_brick2fft; 
+  shared_data.cuda_timings.pppm_brick2fft = 0;
-    shared_data.cuda_timings.pppm_poisson; 
+  shared_data.cuda_timings.pppm_poisson = 0;
-    shared_data.cuda_timings.pppm_fillbrick; 
+  shared_data.cuda_timings.pppm_fillbrick = 0;
-    shared_data.cuda_timings.pppm_fieldforce; 
+  shared_data.cuda_timings.pppm_fieldforce = 0;
-    shared_data.cuda_timings.pppm_compute; 
+  shared_data.cuda_timings.pppm_compute = 0;
 	CudaWrapper_CheckUploadTime(true);
 	CudaWrapper_CheckDownloadTime(true);
@@ -789,8 +770,8 @@ void Cuda::print_timings()
 	printf(" Exchange MPI            \t %lf \n",shared_data.cuda_timings.comm_exchange_mpi);
 	printf(" Exchange Kernel Pack    \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_pack);
 	printf(" Exchange Kernel Unpack  \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_unpack);
-    printf(" Exchange Kernel Fill    \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_fill);
+  printf(" Exchange Kernel Fill    \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_fill);
-    printf(" Exchange CPU Pack	     \t %lf \n",shared_data.cuda_timings.comm_exchange_cpu_pack);
+  printf(" Exchange CPU Pack	     \t %lf \n",shared_data.cuda_timings.comm_exchange_cpu_pack);
 	printf(" Exchange Upload         \t %lf \n",shared_data.cuda_timings.comm_exchange_upload);
 	printf(" Exchange Download       \t %lf \n",shared_data.cuda_timings.comm_exchange_download);
 	printf("\n");
--- a/src/USER-CUDA/cuda_neigh_list.cpp
+++ b/src/USER-CUDA/cuda_neigh_list.cpp
@@ -111,6 +111,7 @@ void CudaNeighList::dev_alloc()
 	neighbors_inner = new int[sneighlist.maxlocal*sneighlist.maxneighbors];
 	cu_neighbors_inner = new cCudaData<int, int, x> (neighbors_inner	 , & sneighlist.neighbors_inner , sneighlist.maxlocal*sneighlist.maxneighbors );
 	}
 	cuda->shared_data.atom.update_neigh=2;
 	MYDBG( printf("# CUDA: CudaNeighList::dev_alloc() ... end\n"); )
 }
--- a/src/USER-CUDA/neigh_full_cuda.cpp
+++ b/src/USER-CUDA/neigh_full_cuda.cpp
@@ -250,6 +250,7 @@ void NeighborCuda::full_bin_cuda(NeighList *list)
  }*/
  list->cuda_list->cu_numneigh->download();
  list->cuda_list->cu_ilist->download();
  cuda->shared_data.atom.update_neigh=2;
 	//printf("Done\n");
  MYDBG(printf(" # CUDA::NeighFullBinCuda ... end\n");)
--- a/src/USER-CUDA/verlet_cuda.cpp
+++ b/src/USER-CUDA/verlet_cuda.cpp
@@ -564,6 +564,7 @@ void VerletCuda::run(int n)
  cuda->shared_data.atom.reneigh_flag=0;
  cuda->shared_data.atom.update_nlocal=1;
  cuda->shared_data.atom.update_nmax=1;
  cuda->shared_data.atom.update_neigh=1;
  cuda->shared_data.domain.update=1;
  cuda->shared_data.buffer_new=1;
  cuda->uploadtime=0;
@@ -627,14 +628,12 @@ void VerletCuda::run(int n)
 		          //start force calculation asynchronus
 			      cuda->shared_data.comm.comm_phase=1;
 			    //  printf("Pre Force Compute\n");
 		          force->pair->compute(eflag, vflag);
 			      timer->stamp(TIME_PAIR);
                  //CudaWrapper_Sync();
 				  //download comm buffers from GPU, perform MPI communication and upload buffers again
 				  clock_gettime(CLOCK_REALTIME,&starttime);
 			   //   printf("Pre forward_comm(2)\n");
 				  comm->forward_comm(2);
 				  clock_gettime(CLOCK_REALTIME,&endtime);
 				  cuda->shared_data.cuda_timings.comm_forward_total+=
@@ -642,16 +641,13 @@ void VerletCuda::run(int n)
 				  timer->stamp(TIME_COMM);
 				  //wait for force calculation
 			      //printf("Pre Synch\n");
 				  CudaWrapper_Sync();
 				  timer->stamp(TIME_PAIR);			
 				  //unpack communication buffers
 				  clock_gettime(CLOCK_REALTIME,&starttime);
 			    //  printf("Pre forward_comm(3)\n");
 				  comm->forward_comm(3);
 				  clock_gettime(CLOCK_REALTIME,&endtime);
 			  //    printf("Post forward_comm(3)\n");
 				  cuda->shared_data.cuda_timings.comm_forward_total+=
 						endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
@@ -663,11 +659,9 @@ void VerletCuda::run(int n)
 			    else
 			    {
 			  	  //perform standard forward communication
 				//printf("Forward_comm\n");
 				  clock_gettime(CLOCK_REALTIME,&starttime);
 				  comm->forward_comm();
 				  clock_gettime(CLOCK_REALTIME,&endtime);
 				//printf("Forward_comm_done\n");
 				  cuda->shared_data.cuda_timings.comm_forward_total+=
 					endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
 				  timer->stamp(TIME_COMM);
@@ -677,13 +671,13 @@ void VerletCuda::run(int n)
 			else
 			{
 				int nlocalold=cuda->shared_data.atom.nlocal;
- 				//if(firstreneigh)
+ 				if(firstreneigh)
 				{
 				  cuda->shared_data.atom.update_nlocal=1; 
-  				  cuda->shared_data.atom.update_nmax=1;
+  				cuda->shared_data.atom.update_nmax=1;
 				  firstreneigh=0;
 				}
- 				  cuda->shared_data.buffer_new=1;
+ 				cuda->shared_data.buffer_new=1;
 				MYDBG( printf("# CUDA VerletCuda::iterate: neighbor\n"); )
 				cuda->setDomainParams();
 				if(n_pre_exchange) modify->pre_exchange();
@@ -759,10 +753,10 @@ void VerletCuda::run(int n)
 				cuda->shared_data.cuda_timings.test2+=
 					endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
-			    //rebuild neighbor list
+			  //rebuild neighbor list
-			    test_atom(testatom,"Pre Neighbor");
+			  test_atom(testatom,"Pre Neighbor");
 				neighbor->build();
-				timer->stamp(TIME_NEIGHBOR);
+  			timer->stamp(TIME_NEIGHBOR);
 				MYDBG( printf("# CUDA VerletCuda::iterate: neighbor done\n"); )
 				//if bonded interactions are used (in this case collect_forces_later is true), transfer data which only changes upon exchange/border routines from GPU to CPU 
@@ -772,7 +766,7 @@ void VerletCuda::run(int n)
 					cuda->cu_tag->download();
 					cuda->cu_type->download();
 					cuda->cu_mask->download();
-		      		if(cuda->cu_q) cuda->cu_q->download();
+		   		if(cuda->cu_q) cuda->cu_q->download();
 				}
 				cuda->shared_data.comm.comm_phase=3;
 			}
@@ -969,14 +963,16 @@ void VerletCuda::run(int n)
 			test_atom(testatom,"post output");
 			if(cuda->shared_data.atom.update_nlocal>0)
-			cuda->shared_data.atom.update_nlocal--;
+			  cuda->shared_data.atom.update_nlocal--;
-  			if(cuda->shared_data.atom.update_nmax>0)
+  		if(cuda->shared_data.atom.update_nmax>0)
-  			cuda->shared_data.atom.update_nmax--;
+  		  cuda->shared_data.atom.update_nmax--;
-  			if(cuda->shared_data.domain.update>0)
+      if(cuda->shared_data.atom.update_neigh>0)
        cuda->shared_data.atom.update_neigh--;
  		if(cuda->shared_data.domain.update>0)
  			cuda->shared_data.domain.update--;
-  			if(cuda->shared_data.buffer_new>0)
+  		if(cuda->shared_data.buffer_new>0)
  			cuda->shared_data.buffer_new--;
-    		cuda->shared_data.atom.reneigh_flag=0;
+    	cuda->shared_data.atom.reneigh_flag=0;
 		}
@@ -984,6 +980,7 @@ void VerletCuda::run(int n)
 		cuda->downloadAllNeighborLists();
  		cuda->shared_data.atom.update_nlocal=1;
  		cuda->shared_data.atom.update_nmax=1;
      cuda->shared_data.atom.update_neigh=1;
  		cuda->shared_data.buffer_new=1;
  		cuda->shared_data.domain.update=1;
  		cuda->oncpu = true;