git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@5537 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
@ -30,7 +30,7 @@ CUDR_CPP = mpic++ -DMPI_GERYON -I$(CUDA_HOME)/include
|
|||||||
CUDR_OPTS = -O3 -ffast-math -funroll-loops -DMPI_GERYON
|
CUDR_OPTS = -O3 -ffast-math -funroll-loops -DMPI_GERYON
|
||||||
|
|
||||||
BIN_DIR = ./
|
BIN_DIR = ./
|
||||||
OBJ_DIR = ./obj
|
OBJ_DIR = ./
|
||||||
LIB_DIR = ./
|
LIB_DIR = ./
|
||||||
AR = ar
|
AR = ar
|
||||||
BSH = /bin/sh
|
BSH = /bin/sh
|
||||||
|
|||||||
@ -26,11 +26,11 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
|
|||||||
CUDA_LIB = -L$(CUDA_HOME)/lib64
|
CUDA_LIB = -L$(CUDA_HOME)/lib64
|
||||||
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
|
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
|
||||||
|
|
||||||
CUDR_CPP = mpic++ -DMPI_GERYON
|
CUDR_CPP = mpic++ -DMPI_GERYON -openmp
|
||||||
CUDR_OPTS = -O2 -xSSE2 -ip -use-intel-optimized-headers -fno-alias
|
CUDR_OPTS = -O2 -xSSE2 -ip -use-intel-optimized-headers -fno-alias
|
||||||
|
|
||||||
BIN_DIR = ./
|
BIN_DIR = ./
|
||||||
OBJ_DIR = ./obj
|
OBJ_DIR = ./
|
||||||
LIB_DIR = ./
|
LIB_DIR = ./
|
||||||
AR = ar
|
AR = ar
|
||||||
BSH = /bin/sh
|
BSH = /bin/sh
|
||||||
|
|||||||
@ -28,7 +28,7 @@ CUDR_CPP = mpic++ -DMPI_GERYON
|
|||||||
CUDR_OPTS = -O3 -DMPI_GERYON -ffast-math -funroll-loops
|
CUDR_OPTS = -O3 -DMPI_GERYON -ffast-math -funroll-loops
|
||||||
|
|
||||||
BIN_DIR = ./
|
BIN_DIR = ./
|
||||||
OBJ_DIR = ./obj
|
OBJ_DIR = ./
|
||||||
LIB_DIR = ./
|
LIB_DIR = ./
|
||||||
AR = ar
|
AR = ar
|
||||||
|
|
||||||
|
|||||||
@ -30,7 +30,7 @@ CUDR_CPP = mpic++ -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
|
|||||||
CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias
|
CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias
|
||||||
|
|
||||||
BIN_DIR = ./
|
BIN_DIR = ./
|
||||||
OBJ_DIR = ./obj
|
OBJ_DIR = ./
|
||||||
LIB_DIR = ./
|
LIB_DIR = ./
|
||||||
AR = ar
|
AR = ar
|
||||||
BSH = /bin/sh
|
BSH = /bin/sh
|
||||||
|
|||||||
@ -22,7 +22,7 @@ OCL_LINK = -lOpenCL
|
|||||||
OCL_PREC = -D_SINGLE_SINGLE
|
OCL_PREC = -D_SINGLE_SINGLE
|
||||||
|
|
||||||
BIN_DIR = ./
|
BIN_DIR = ./
|
||||||
OBJ_DIR = ./ocl_obj
|
OBJ_DIR = ./
|
||||||
LIB_DIR = ./
|
LIB_DIR = ./
|
||||||
AR = ar
|
AR = ar
|
||||||
BSH = /bin/sh
|
BSH = /bin/sh
|
||||||
|
|||||||
@ -27,7 +27,7 @@ CUDR_CPP = mpicxx -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
|
|||||||
CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias
|
CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias
|
||||||
|
|
||||||
BIN_DIR = ./
|
BIN_DIR = ./
|
||||||
OBJ_DIR = ./obj
|
OBJ_DIR = ./
|
||||||
LIB_DIR = ./
|
LIB_DIR = ./
|
||||||
AR = ar
|
AR = ar
|
||||||
|
|
||||||
|
|||||||
@ -30,7 +30,7 @@ CUDR_CPP = mpic++
|
|||||||
CUDR_OPTS = -O2 -m32 -g
|
CUDR_OPTS = -O2 -m32 -g
|
||||||
|
|
||||||
BIN_DIR = ./
|
BIN_DIR = ./
|
||||||
OBJ_DIR = ./obj
|
OBJ_DIR = ./
|
||||||
LIB_DIR = ./
|
LIB_DIR = ./
|
||||||
AR = ar
|
AR = ar
|
||||||
BSH = /bin/sh
|
BSH = /bin/sh
|
||||||
|
|||||||
@ -22,7 +22,7 @@ OCL_LINK = -framework OpenCL
|
|||||||
OCL_PREC = -D_SINGLE_SINGLE
|
OCL_PREC = -D_SINGLE_SINGLE
|
||||||
|
|
||||||
BIN_DIR = ./
|
BIN_DIR = ./
|
||||||
OBJ_DIR = ./ocl_obj
|
OBJ_DIR = ./
|
||||||
LIB_DIR = ./
|
LIB_DIR = ./
|
||||||
AR = ar
|
AR = ar
|
||||||
BSH = /bin/sh
|
BSH = /bin/sh
|
||||||
|
|||||||
@ -47,6 +47,7 @@ OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
|
|||||||
$(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
|
$(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
|
||||||
$(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
|
$(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
|
||||||
$(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
|
$(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
|
||||||
|
$(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
|
||||||
$(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
|
$(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
|
||||||
$(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \
|
$(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \
|
||||||
$(CUDPP)
|
$(CUDPP)
|
||||||
@ -59,6 +60,7 @@ PTXS = $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
|
|||||||
$(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h \
|
$(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h \
|
||||||
$(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h \
|
$(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h \
|
||||||
$(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h \
|
$(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h \
|
||||||
|
$(OBJ_DIR)/crml_cut_gpu_kernel.ptx $(OBJ_DIR)/crml_cut_gpu_ptx.h \
|
||||||
$(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \
|
$(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \
|
||||||
$(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h
|
$(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h
|
||||||
|
|
||||||
@ -169,6 +171,18 @@ $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_me
|
|||||||
$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
|
$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
|
||||||
$(CUDR) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)
|
$(CUDR) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)
|
||||||
|
|
||||||
|
$(OBJ_DIR)/crml_gpu_kernel.ptx: crml_gpu_kernel.cu pair_gpu_precision.h
|
||||||
|
$(CUDA) --ptx -DNV_KERNEL -o $@ crml_gpu_kernel.cu
|
||||||
|
|
||||||
|
$(OBJ_DIR)/crml_gpu_ptx.h: $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_kernel.ptx
|
||||||
|
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h
|
||||||
|
|
||||||
|
$(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
|
||||||
|
$(CUDR) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR)
|
||||||
|
|
||||||
|
$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp
|
||||||
|
$(CUDR) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR)
|
||||||
|
|
||||||
$(OBJ_DIR)/lj96_cut_gpu_kernel.ptx: lj96_cut_gpu_kernel.cu pair_gpu_precision.h
|
$(OBJ_DIR)/lj96_cut_gpu_kernel.ptx: lj96_cut_gpu_kernel.cu pair_gpu_precision.h
|
||||||
$(CUDA) --ptx -DNV_KERNEL -o $@ lj96_cut_gpu_kernel.cu
|
$(CUDA) --ptx -DNV_KERNEL -o $@ lj96_cut_gpu_kernel.cu
|
||||||
|
|
||||||
|
|||||||
@ -37,12 +37,14 @@ OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
|
|||||||
$(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
|
$(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
|
||||||
$(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
|
$(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
|
||||||
$(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
|
$(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
|
||||||
|
$(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
|
||||||
$(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
|
$(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
|
||||||
$(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o
|
$(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o
|
||||||
KERS = $(OBJ_DIR)/pair_gpu_atom_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h \
|
KERS = $(OBJ_DIR)/pair_gpu_atom_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h \
|
||||||
$(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h \
|
$(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h \
|
||||||
$(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h \
|
$(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h \
|
||||||
$(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h \
|
$(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h \
|
||||||
|
$(OBJ_DIR)/crml_gpu_cl.h \
|
||||||
$(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h
|
$(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h
|
||||||
|
|
||||||
OCL_EXECS = $(BIN_DIR)/ocl_get_devices
|
OCL_EXECS = $(BIN_DIR)/ocl_get_devices
|
||||||
@ -112,6 +114,15 @@ $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_me
|
|||||||
$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
|
$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
|
||||||
$(OCL) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)
|
$(OCL) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)
|
||||||
|
|
||||||
|
$(OBJ_DIR)/crml_gpu_cl.h: crml_gpu_kernel.cu
|
||||||
|
$(BSH) ./geryon/file_to_cstr.sh crml_gpu_kernel.cu $(OBJ_DIR)/crml_gpu_cl.h;
|
||||||
|
|
||||||
|
$(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
|
||||||
|
$(OCL) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR)
|
||||||
|
|
||||||
|
$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp
|
||||||
|
$(OCL) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR)
|
||||||
|
|
||||||
$(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu
|
$(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu
|
||||||
$(BSH) ./geryon/file_to_cstr.sh lj96_cut_gpu_kernel.cu $(OBJ_DIR)/lj96_cut_gpu_cl.h;
|
$(BSH) ./geryon/file_to_cstr.sh lj96_cut_gpu_kernel.cu $(OBJ_DIR)/lj96_cut_gpu_cl.h;
|
||||||
|
|
||||||
|
|||||||
@ -46,7 +46,7 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
|
|||||||
CMMMF.device->init_message(screen,"cg/cmm",first_gpu,last_gpu);
|
CMMMF.device->init_message(screen,"cg/cmm",first_gpu,last_gpu);
|
||||||
|
|
||||||
bool message=false;
|
bool message=false;
|
||||||
if (world_me==0 && screen)
|
if (CMMMF.device->replica_me()==0 && screen)
|
||||||
message=true;
|
message=true;
|
||||||
|
|
||||||
if (message) {
|
if (message) {
|
||||||
@ -62,14 +62,14 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
CMMMF.device->world_barrier();
|
||||||
if (message)
|
if (message)
|
||||||
fprintf(screen,"Done.\n");
|
fprintf(screen,"Done.\n");
|
||||||
|
|
||||||
for (int i=0; i<procs_per_gpu; i++) {
|
for (int i=0; i<procs_per_gpu; i++) {
|
||||||
if (message) {
|
if (message) {
|
||||||
if (last_gpu-first_gpu==0)
|
if (last_gpu-first_gpu==0)
|
||||||
fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
|
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
|
||||||
else
|
else
|
||||||
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
|
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
|
||||||
last_gpu,i);
|
last_gpu,i);
|
||||||
@ -83,7 +83,7 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
|
|||||||
if (!init_ok)
|
if (!init_ok)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
MPI_Barrier(CMMMF.device->gpu_comm);
|
CMMMF.device->gpu_barrier();
|
||||||
if (message)
|
if (message)
|
||||||
fprintf(screen,"Done.\n");
|
fprintf(screen,"Done.\n");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -91,14 +91,14 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
|
|
||||||
acctyp energy=(numtyp)0;
|
acctyp energy=(acctyp)0;
|
||||||
acctyp4 f;
|
acctyp4 f;
|
||||||
f.x=(numtyp)0;
|
f.x=(acctyp)0;
|
||||||
f.y=(numtyp)0;
|
f.y=(acctyp)0;
|
||||||
f.z=(numtyp)0;
|
f.z=(acctyp)0;
|
||||||
acctyp virial[6];
|
acctyp virial[6];
|
||||||
for (int i=0; i<6; i++)
|
for (int i=0; i<6; i++)
|
||||||
virial[i]=(numtyp)0;
|
virial[i]=(acctyp)0;
|
||||||
|
|
||||||
__global int *nbor=dev_nbor+ii;
|
__global int *nbor=dev_nbor+ii;
|
||||||
int i=*nbor;
|
int i=*nbor;
|
||||||
@ -203,14 +203,14 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
|
|
||||||
acctyp energy=(numtyp)0;
|
acctyp energy=(acctyp)0;
|
||||||
acctyp4 f;
|
acctyp4 f;
|
||||||
f.x=(numtyp)0;
|
f.x=(acctyp)0;
|
||||||
f.y=(numtyp)0;
|
f.y=(acctyp)0;
|
||||||
f.z=(numtyp)0;
|
f.z=(acctyp)0;
|
||||||
acctyp virial[6];
|
acctyp virial[6];
|
||||||
for (int i=0; i<6; i++)
|
for (int i=0; i<6; i++)
|
||||||
virial[i]=(numtyp)0;
|
virial[i]=(acctyp)0;
|
||||||
|
|
||||||
__global int *nbor=dev_nbor+ii;
|
__global int *nbor=dev_nbor+ii;
|
||||||
int i=*nbor;
|
int i=*nbor;
|
||||||
|
|||||||
@ -48,7 +48,7 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
|
|||||||
CMMLMF.device->init_message(screen,"cg/cmm/coul/long",first_gpu,last_gpu);
|
CMMLMF.device->init_message(screen,"cg/cmm/coul/long",first_gpu,last_gpu);
|
||||||
|
|
||||||
bool message=false;
|
bool message=false;
|
||||||
if (world_me==0 && screen)
|
if (CMMLMF.device->replica_me()==0 && screen)
|
||||||
message=true;
|
message=true;
|
||||||
|
|
||||||
if (message) {
|
if (message) {
|
||||||
@ -66,14 +66,14 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
CMMLMF.device->world_barrier();
|
||||||
if (message)
|
if (message)
|
||||||
fprintf(screen,"Done.\n");
|
fprintf(screen,"Done.\n");
|
||||||
|
|
||||||
for (int i=0; i<procs_per_gpu; i++) {
|
for (int i=0; i<procs_per_gpu; i++) {
|
||||||
if (message) {
|
if (message) {
|
||||||
if (last_gpu-first_gpu==0)
|
if (last_gpu-first_gpu==0)
|
||||||
fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
|
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
|
||||||
else
|
else
|
||||||
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
|
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
|
||||||
last_gpu,i);
|
last_gpu,i);
|
||||||
@ -88,7 +88,7 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
|
|||||||
if (!init_ok)
|
if (!init_ok)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
MPI_Barrier(CMMLMF.device->gpu_comm);
|
CMMLMF.device->gpu_barrier();
|
||||||
if (message)
|
if (message)
|
||||||
fprintf(screen,"Done.\n");
|
fprintf(screen,"Done.\n");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -114,15 +114,15 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
|||||||
sp_lj[7]=sp_lj_in[7];
|
sp_lj[7]=sp_lj_in[7];
|
||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
acctyp energy=(numtyp)0;
|
acctyp energy=(acctyp)0;
|
||||||
acctyp e_coul=(numtyp)0;
|
acctyp e_coul=(acctyp)0;
|
||||||
acctyp4 f;
|
acctyp4 f;
|
||||||
f.x=(numtyp)0;
|
f.x=(acctyp)0;
|
||||||
f.y=(numtyp)0;
|
f.y=(acctyp)0;
|
||||||
f.z=(numtyp)0;
|
f.z=(acctyp)0;
|
||||||
acctyp virial[6];
|
acctyp virial[6];
|
||||||
for (int i=0; i<6; i++)
|
for (int i=0; i<6; i++)
|
||||||
virial[i]=(numtyp)0;
|
virial[i]=(acctyp)0;
|
||||||
|
|
||||||
__global int *nbor=dev_nbor+ii;
|
__global int *nbor=dev_nbor+ii;
|
||||||
int i=*nbor;
|
int i=*nbor;
|
||||||
@ -256,15 +256,15 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
|
|
||||||
acctyp energy=(numtyp)0;
|
acctyp energy=(acctyp)0;
|
||||||
acctyp e_coul=(numtyp)0;
|
acctyp e_coul=(acctyp)0;
|
||||||
acctyp4 f;
|
acctyp4 f;
|
||||||
f.x=(numtyp)0;
|
f.x=(acctyp)0;
|
||||||
f.y=(numtyp)0;
|
f.y=(acctyp)0;
|
||||||
f.z=(numtyp)0;
|
f.z=(acctyp)0;
|
||||||
acctyp virial[6];
|
acctyp virial[6];
|
||||||
for (int i=0; i<6; i++)
|
for (int i=0; i<6; i++)
|
||||||
virial[i]=(numtyp)0;
|
virial[i]=(acctyp)0;
|
||||||
|
|
||||||
__global int *nbor=dev_nbor+ii;
|
__global int *nbor=dev_nbor+ii;
|
||||||
int i=*nbor;
|
int i=*nbor;
|
||||||
|
|||||||
@ -69,7 +69,7 @@ bool gb_gpu_init(const int ntypes, const double gamma,
|
|||||||
GBMF.device->init_message(screen,"gayberne",first_gpu,last_gpu);
|
GBMF.device->init_message(screen,"gayberne",first_gpu,last_gpu);
|
||||||
|
|
||||||
bool message=false;
|
bool message=false;
|
||||||
if (world_me==0 && screen)
|
if (GBMF.device->replica_me()==0 && screen)
|
||||||
message=true;
|
message=true;
|
||||||
|
|
||||||
if (message) {
|
if (message) {
|
||||||
@ -86,14 +86,14 @@ bool gb_gpu_init(const int ntypes, const double gamma,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
GBMF.device->world_barrier();
|
||||||
if (message)
|
if (message)
|
||||||
fprintf(screen,"Done.\n");
|
fprintf(screen,"Done.\n");
|
||||||
|
|
||||||
for (int i=0; i<procs_per_gpu; i++) {
|
for (int i=0; i<procs_per_gpu; i++) {
|
||||||
if (message) {
|
if (message) {
|
||||||
if (last_gpu-first_gpu==0)
|
if (last_gpu-first_gpu==0)
|
||||||
fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
|
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
|
||||||
else
|
else
|
||||||
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
|
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
|
||||||
last_gpu,i);
|
last_gpu,i);
|
||||||
@ -108,7 +108,7 @@ bool gb_gpu_init(const int ntypes, const double gamma,
|
|||||||
if (!init_ok)
|
if (!init_ok)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
MPI_Barrier(GBMF.device->gpu_comm);
|
GBMF.device->gpu_barrier();
|
||||||
if (message)
|
if (message)
|
||||||
fprintf(screen,"Done.\n");
|
fprintf(screen,"Done.\n");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -105,18 +105,18 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
|
|
||||||
acctyp energy=(numtyp)0;
|
acctyp energy=(acctyp)0;
|
||||||
acctyp4 f;
|
acctyp4 f;
|
||||||
f.x=(numtyp)0;
|
f.x=(acctyp)0;
|
||||||
f.y=(numtyp)0;
|
f.y=(acctyp)0;
|
||||||
f.z=(numtyp)0;
|
f.z=(acctyp)0;
|
||||||
acctyp4 tor;
|
acctyp4 tor;
|
||||||
tor.x=(numtyp)0;
|
tor.x=(acctyp)0;
|
||||||
tor.y=(numtyp)0;
|
tor.y=(acctyp)0;
|
||||||
tor.z=(numtyp)0;
|
tor.z=(acctyp)0;
|
||||||
acctyp virial[6];
|
acctyp virial[6];
|
||||||
for (int i=0; i<6; i++)
|
for (int i=0; i<6; i++)
|
||||||
virial[i]=(numtyp)0;
|
virial[i]=(acctyp)0;
|
||||||
|
|
||||||
__global int *nbor=dev_nbor+ii;
|
__global int *nbor=dev_nbor+ii;
|
||||||
int i=*nbor;
|
int i=*nbor;
|
||||||
|
|||||||
@ -42,14 +42,14 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
|
|
||||||
acctyp energy=(numtyp)0;
|
acctyp energy=(acctyp)0;
|
||||||
acctyp4 f;
|
acctyp4 f;
|
||||||
f.x=(numtyp)0;
|
f.x=(acctyp)0;
|
||||||
f.y=(numtyp)0;
|
f.y=(acctyp)0;
|
||||||
f.z=(numtyp)0;
|
f.z=(acctyp)0;
|
||||||
acctyp virial[6];
|
acctyp virial[6];
|
||||||
for (int i=0; i<6; i++)
|
for (int i=0; i<6; i++)
|
||||||
virial[i]=(numtyp)0;
|
virial[i]=(acctyp)0;
|
||||||
|
|
||||||
__global int *nbor=dev_nbor+ii;
|
__global int *nbor=dev_nbor+ii;
|
||||||
int i=*nbor;
|
int i=*nbor;
|
||||||
@ -276,14 +276,14 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
|
|
||||||
acctyp energy=(numtyp)0;
|
acctyp energy=(acctyp)0;
|
||||||
acctyp4 f;
|
acctyp4 f;
|
||||||
f.x=(numtyp)0;
|
f.x=(acctyp)0;
|
||||||
f.y=(numtyp)0;
|
f.y=(acctyp)0;
|
||||||
f.z=(numtyp)0;
|
f.z=(acctyp)0;
|
||||||
acctyp virial[6];
|
acctyp virial[6];
|
||||||
for (int i=0; i<6; i++)
|
for (int i=0; i<6; i++)
|
||||||
virial[i]=(numtyp)0;
|
virial[i]=(acctyp)0;
|
||||||
|
|
||||||
__global int *nbor=dev_ij+ii;
|
__global int *nbor=dev_ij+ii;
|
||||||
int i=*nbor;
|
int i=*nbor;
|
||||||
@ -385,14 +385,14 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
|
|
||||||
acctyp energy=(numtyp)0;
|
acctyp energy=(acctyp)0;
|
||||||
acctyp4 f;
|
acctyp4 f;
|
||||||
f.x=(numtyp)0;
|
f.x=(acctyp)0;
|
||||||
f.y=(numtyp)0;
|
f.y=(acctyp)0;
|
||||||
f.z=(numtyp)0;
|
f.z=(acctyp)0;
|
||||||
acctyp virial[6];
|
acctyp virial[6];
|
||||||
for (int i=0; i<6; i++)
|
for (int i=0; i<6; i++)
|
||||||
virial[i]=(numtyp)0;
|
virial[i]=(acctyp)0;
|
||||||
|
|
||||||
__global int *nbor=dev_ij+ii;
|
__global int *nbor=dev_ij+ii;
|
||||||
int i=*nbor;
|
int i=*nbor;
|
||||||
|
|||||||
@ -222,7 +222,7 @@ void GB_GPU_MemoryT::clear() {
|
|||||||
single[4]=0;
|
single[4]=0;
|
||||||
single[5]=atom->cast_time();
|
single[5]=atom->cast_time();
|
||||||
|
|
||||||
MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
|
MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,device->replica());
|
||||||
double avg_split=hd_balancer.all_avg_split();
|
double avg_split=hd_balancer.all_avg_split();
|
||||||
|
|
||||||
_max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+
|
_max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+
|
||||||
@ -230,12 +230,13 @@ void GB_GPU_MemoryT::clear() {
|
|||||||
shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+
|
shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+
|
||||||
gamma_upsilon_mu.row_bytes();
|
gamma_upsilon_mu.row_bytes();
|
||||||
double mpi_max_bytes;
|
double mpi_max_bytes;
|
||||||
MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);
|
MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
|
||||||
|
device->replica());
|
||||||
double max_mb=mpi_max_bytes/(1024*1024);
|
double max_mb=mpi_max_bytes/(1024*1024);
|
||||||
|
|
||||||
if (device->world_me()==0)
|
if (device->replica_me()==0)
|
||||||
if (screen && times[3]>0.0) {
|
if (screen && times[3]>0.0) {
|
||||||
int world_size=device->world_size();
|
int replica_size=device->replica_size();
|
||||||
|
|
||||||
fprintf(screen,"\n\n-------------------------------------");
|
fprintf(screen,"\n\n-------------------------------------");
|
||||||
fprintf(screen,"--------------------------------\n");
|
fprintf(screen,"--------------------------------\n");
|
||||||
@ -244,15 +245,15 @@ void GB_GPU_MemoryT::clear() {
|
|||||||
fprintf(screen,"--------------------------------\n");
|
fprintf(screen,"--------------------------------\n");
|
||||||
|
|
||||||
if (device->procs_per_gpu()==1) {
|
if (device->procs_per_gpu()==1) {
|
||||||
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/world_size);
|
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size);
|
||||||
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/world_size);
|
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size);
|
||||||
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/world_size);
|
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size);
|
||||||
if (nbor->gpu_nbor())
|
if (nbor->gpu_nbor())
|
||||||
fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/world_size);
|
fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size);
|
||||||
else
|
else
|
||||||
fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/world_size);
|
fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/replica_size);
|
||||||
fprintf(screen,"Force calc: %.4f s.\n",times[3]/world_size);
|
fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size);
|
||||||
fprintf(screen,"LJ calc: %.4f s.\n",times[4]/world_size);
|
fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size);
|
||||||
}
|
}
|
||||||
fprintf(screen,"Average split: %.4f.\n",avg_split);
|
fprintf(screen,"Average split: %.4f.\n",avg_split);
|
||||||
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
|
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
|
||||||
|
|||||||
@ -95,7 +95,7 @@ class UCL_Device {
|
|||||||
/** \note You cannot delete the default stream **/
|
/** \note You cannot delete the default stream **/
|
||||||
inline void pop_command_queue() {
|
inline void pop_command_queue() {
|
||||||
if (_cq.size()<2) return;
|
if (_cq.size()<2) return;
|
||||||
CUDA_SAFE_CALL_NS(cudaStreamDestroy(_cq.back()));
|
CUDA_DESTRUCT_CALL_NS(cudaStreamDestroy(_cq.back()));
|
||||||
_cq.pop_back();
|
_cq.pop_back();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -290,7 +290,7 @@ inline void UCL_Device::print_all(std::ostream &out) {
|
|||||||
else
|
else
|
||||||
out << "Unknown\n";
|
out << "Unknown\n";
|
||||||
#endif
|
#endif
|
||||||
#if CUDART_VERSION >= 3000
|
#if CUDART_VERSION >= 3010
|
||||||
out << " Concurrent kernel execution: ";
|
out << " Concurrent kernel execution: ";
|
||||||
if (_properties[i].concurrentKernels)
|
if (_properties[i].concurrentKernels)
|
||||||
out << "Yes\n";
|
out << "Yes\n";
|
||||||
|
|||||||
@ -18,6 +18,11 @@
|
|||||||
#define NVC_GERYON_EXIT assert(0==1)
|
#define NVC_GERYON_EXIT assert(0==1)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef UCL_DEBUG
|
||||||
|
#define UCL_SYNC_DEBUG
|
||||||
|
#define UCL_DESTRUCT_CHECK
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef UCL_NO_API_CHECK
|
#ifndef UCL_NO_API_CHECK
|
||||||
|
|
||||||
#define CUDA_SAFE_CALL_NS( call) do { \
|
#define CUDA_SAFE_CALL_NS( call) do { \
|
||||||
@ -53,5 +58,17 @@
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef UCL_DESTRUCT_CHECK
|
||||||
|
|
||||||
|
#define CUDA_DESTRUCT_CALL( call) CUDA_SAFE_CALL( call)
|
||||||
|
#define CUDA_DESTRUCT_CALL_NS( call) CUDA_SAFE_CALL_NS( call)
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#define CUDA_DESTRUCT_CALL( call) call
|
||||||
|
#define CUDA_DESTRUCT_CALL_NS( call) call
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
@ -233,7 +233,7 @@ inline UCL_Device::UCL_Device() {
|
|||||||
&_properties.back().canMapHostMemory,
|
&_properties.back().canMapHostMemory,
|
||||||
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
|
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
|
||||||
#endif
|
#endif
|
||||||
#if CUDA_VERSION >= 3000
|
#if CUDA_VERSION >= 3010
|
||||||
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
|
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
|
||||||
&_properties.back().concurrentKernels,
|
&_properties.back().concurrentKernels,
|
||||||
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev));
|
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev));
|
||||||
@ -339,7 +339,7 @@ inline void UCL_Device::print_all(std::ostream &out) {
|
|||||||
else
|
else
|
||||||
out << "No\n";
|
out << "No\n";
|
||||||
#endif
|
#endif
|
||||||
#if CUDA_VERSION >= 3000
|
#if CUDA_VERSION >= 3010
|
||||||
out << " Concurrent kernel execution: ";
|
out << " Concurrent kernel execution: ";
|
||||||
if (_properties[i].concurrentKernels)
|
if (_properties[i].concurrentKernels)
|
||||||
out << "Yes\n";
|
out << "Yes\n";
|
||||||
|
|||||||
@ -18,6 +18,11 @@
|
|||||||
#define NVD_GERYON_EXIT assert(0==1)
|
#define NVD_GERYON_EXIT assert(0==1)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef UCL_DEBUG
|
||||||
|
#define UCL_SYNC_DEBUG
|
||||||
|
#define UCL_DESTRUCT_CHECK
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef UCL_NO_API_CHECK
|
#ifndef UCL_NO_API_CHECK
|
||||||
|
|
||||||
#define CU_SAFE_CALL_NS( call ) do { \
|
#define CU_SAFE_CALL_NS( call ) do { \
|
||||||
@ -53,5 +58,17 @@
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef UCL_DESTRUCT_CHECK
|
||||||
|
|
||||||
|
#define CU_DESTRUCT_CALL( call) CU_SAFE_CALL( call)
|
||||||
|
#define CU_DESTRUCT_CALL_NS( call) CU_SAFE_CALL_NS( call)
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#define CU_DESTRUCT_CALL( call) call
|
||||||
|
#define CU_DESTRUCT_CALL_NS( call) call
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
@ -78,7 +78,7 @@ inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
|
|||||||
template <class mat_type>
|
template <class mat_type>
|
||||||
inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
|
inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
|
||||||
if (kind!=UCL_NOT_PINNED)
|
if (kind!=UCL_NOT_PINNED)
|
||||||
CU_SAFE_CALL(cuMemFreeHost(mat.begin()));
|
CU_DESTRUCT_CALL(cuMemFreeHost(mat.begin()));
|
||||||
else
|
else
|
||||||
free(mat.begin());
|
free(mat.begin());
|
||||||
}
|
}
|
||||||
@ -134,7 +134,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
|
|||||||
|
|
||||||
template <class mat_type>
|
template <class mat_type>
|
||||||
inline void _device_free(mat_type &mat) {
|
inline void _device_free(mat_type &mat) {
|
||||||
CU_SAFE_CALL(cuMemFree(mat.cbegin()));
|
CU_DESTRUCT_CALL(cuMemFree(mat.cbegin()));
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) {
|
inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) {
|
||||||
|
|||||||
@ -41,8 +41,8 @@ class UCL_Timer {
|
|||||||
/** \note init() must be called to reuse timer after a clear() **/
|
/** \note init() must be called to reuse timer after a clear() **/
|
||||||
inline void clear() {
|
inline void clear() {
|
||||||
if (_initialized) {
|
if (_initialized) {
|
||||||
CU_SAFE_CALL(cuEventDestroy(start_event));
|
CU_DESTRUCT_CALL(cuEventDestroy(start_event));
|
||||||
CU_SAFE_CALL(cuEventDestroy(stop_event));
|
CU_DESTRUCT_CALL(cuEventDestroy(stop_event));
|
||||||
_initialized=false;
|
_initialized=false;
|
||||||
_total_time=0.0;
|
_total_time=0.0;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -265,10 +265,10 @@ inline UCL_Device::UCL_Device() {
|
|||||||
inline UCL_Device::~UCL_Device() {
|
inline UCL_Device::~UCL_Device() {
|
||||||
if (_device>-1) {
|
if (_device>-1) {
|
||||||
for (size_t i=0; i<_cq.size(); i++) {
|
for (size_t i=0; i<_cq.size(); i++) {
|
||||||
CL_SAFE_CALL(clReleaseCommandQueue(_cq.back()));
|
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back()));
|
||||||
_cq.pop_back();
|
_cq.pop_back();
|
||||||
}
|
}
|
||||||
CL_SAFE_CALL(clReleaseContext(_context));
|
CL_DESTRUCT_CALL(clReleaseContext(_context));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -51,9 +51,9 @@ class UCL_Program {
|
|||||||
/** \note Must call init() after each clear **/
|
/** \note Must call init() after each clear **/
|
||||||
inline void clear() {
|
inline void clear() {
|
||||||
if (_init_done) {
|
if (_init_done) {
|
||||||
CL_SAFE_CALL(clReleaseProgram(_program));
|
CL_DESTRUCT_CALL(clReleaseProgram(_program));
|
||||||
CL_SAFE_CALL(clReleaseContext(_context));
|
CL_DESTRUCT_CALL(clReleaseContext(_context));
|
||||||
CL_SAFE_CALL(clReleaseCommandQueue(_cq));
|
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
|
||||||
_init_done=false;
|
_init_done=false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -41,7 +41,7 @@ class UCL_Timer {
|
|||||||
/** \note init() must be called to reuse timer after a clear() **/
|
/** \note init() must be called to reuse timer after a clear() **/
|
||||||
inline void clear() {
|
inline void clear() {
|
||||||
if (_initialized) {
|
if (_initialized) {
|
||||||
CL_SAFE_CALL(clReleaseCommandQueue(_cq));
|
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
|
||||||
clReleaseEvent(start_event);
|
clReleaseEvent(start_event);
|
||||||
clReleaseEvent(stop_event);
|
clReleaseEvent(stop_event);
|
||||||
_initialized=false;
|
_initialized=false;
|
||||||
|
|||||||
@ -45,7 +45,7 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
|
|||||||
LJ96MF.device->init_message(screen,"lj96/cut",first_gpu,last_gpu);
|
LJ96MF.device->init_message(screen,"lj96/cut",first_gpu,last_gpu);
|
||||||
|
|
||||||
bool message=false;
|
bool message=false;
|
||||||
if (world_me==0 && screen)
|
if (LJ96MF.device->replica_me()==0 && screen)
|
||||||
message=true;
|
message=true;
|
||||||
|
|
||||||
if (message) {
|
if (message) {
|
||||||
@ -61,14 +61,14 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
LJ96MF.device->world_barrier();
|
||||||
if (message)
|
if (message)
|
||||||
fprintf(screen,"Done.\n");
|
fprintf(screen,"Done.\n");
|
||||||
|
|
||||||
for (int i=0; i<procs_per_gpu; i++) {
|
for (int i=0; i<procs_per_gpu; i++) {
|
||||||
if (message) {
|
if (message) {
|
||||||
if (last_gpu-first_gpu==0)
|
if (last_gpu-first_gpu==0)
|
||||||
fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
|
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
|
||||||
else
|
else
|
||||||
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
|
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
|
||||||
last_gpu,i);
|
last_gpu,i);
|
||||||
@ -82,7 +82,7 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
|
|||||||
if (!init_ok)
|
if (!init_ok)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
MPI_Barrier(LJ96MF.device->gpu_comm);
|
LJ96MF.device->gpu_barrier();
|
||||||
if (message)
|
if (message)
|
||||||
fprintf(screen,"Done.\n");
|
fprintf(screen,"Done.\n");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -91,14 +91,14 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
|
|
||||||
acctyp energy=(numtyp)0;
|
acctyp energy=(acctyp)0;
|
||||||
acctyp4 f;
|
acctyp4 f;
|
||||||
f.x=(numtyp)0;
|
f.x=(acctyp)0;
|
||||||
f.y=(numtyp)0;
|
f.y=(acctyp)0;
|
||||||
f.z=(numtyp)0;
|
f.z=(acctyp)0;
|
||||||
acctyp virial[6];
|
acctyp virial[6];
|
||||||
for (int i=0; i<6; i++)
|
for (int i=0; i<6; i++)
|
||||||
virial[i]=(numtyp)0;
|
virial[i]=(acctyp)0;
|
||||||
|
|
||||||
__global int *nbor=dev_nbor+ii;
|
__global int *nbor=dev_nbor+ii;
|
||||||
int i=*nbor;
|
int i=*nbor;
|
||||||
@ -196,14 +196,14 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
|
|
||||||
acctyp energy=(numtyp)0;
|
acctyp energy=(acctyp)0;
|
||||||
acctyp4 f;
|
acctyp4 f;
|
||||||
f.x=(numtyp)0;
|
f.x=(acctyp)0;
|
||||||
f.y=(numtyp)0;
|
f.y=(acctyp)0;
|
||||||
f.z=(numtyp)0;
|
f.z=(acctyp)0;
|
||||||
acctyp virial[6];
|
acctyp virial[6];
|
||||||
for (int i=0; i<6; i++)
|
for (int i=0; i<6; i++)
|
||||||
virial[i]=(numtyp)0;
|
virial[i]=(acctyp)0;
|
||||||
|
|
||||||
__global int *nbor=dev_nbor+ii;
|
__global int *nbor=dev_nbor+ii;
|
||||||
int i=*nbor;
|
int i=*nbor;
|
||||||
|
|||||||
@ -46,7 +46,7 @@ bool ljl_gpu_init(const int ntypes, double **cutsq,
|
|||||||
LJLMF.device->init_message(screen,"lj/cut",first_gpu,last_gpu);
|
LJLMF.device->init_message(screen,"lj/cut",first_gpu,last_gpu);
|
||||||
|
|
||||||
bool message=false;
|
bool message=false;
|
||||||
if (world_me==0 && screen)
|
if (LJLMF.device->replica_me()==0 && screen)
|
||||||
message=true;
|
message=true;
|
||||||
|
|
||||||
if (message) {
|
if (message) {
|
||||||
@ -62,14 +62,14 @@ bool ljl_gpu_init(const int ntypes, double **cutsq,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
LJLMF.device->world_barrier();
|
||||||
if (message)
|
if (message)
|
||||||
fprintf(screen,"Done.\n");
|
fprintf(screen,"Done.\n");
|
||||||
|
|
||||||
for (int i=0; i<procs_per_gpu; i++) {
|
for (int i=0; i<procs_per_gpu; i++) {
|
||||||
if (message) {
|
if (message) {
|
||||||
if (last_gpu-first_gpu==0)
|
if (last_gpu-first_gpu==0)
|
||||||
fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
|
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
|
||||||
else
|
else
|
||||||
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
|
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
|
||||||
last_gpu,i);
|
last_gpu,i);
|
||||||
@ -83,7 +83,7 @@ bool ljl_gpu_init(const int ntypes, double **cutsq,
|
|||||||
if (!init_ok)
|
if (!init_ok)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
MPI_Barrier(LJLMF.device->gpu_comm);
|
LJLMF.device->gpu_barrier();
|
||||||
if (message)
|
if (message)
|
||||||
fprintf(screen,"Done.\n");
|
fprintf(screen,"Done.\n");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -91,14 +91,14 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
|
|
||||||
acctyp energy=(numtyp)0;
|
acctyp energy=(acctyp)0;
|
||||||
acctyp4 f;
|
acctyp4 f;
|
||||||
f.x=(numtyp)0;
|
f.x=(acctyp)0;
|
||||||
f.y=(numtyp)0;
|
f.y=(acctyp)0;
|
||||||
f.z=(numtyp)0;
|
f.z=(acctyp)0;
|
||||||
acctyp virial[6];
|
acctyp virial[6];
|
||||||
for (int i=0; i<6; i++)
|
for (int i=0; i<6; i++)
|
||||||
virial[i]=(numtyp)0;
|
virial[i]=(acctyp)0;
|
||||||
|
|
||||||
__global int *nbor=dev_nbor+ii;
|
__global int *nbor=dev_nbor+ii;
|
||||||
int i=*nbor;
|
int i=*nbor;
|
||||||
@ -195,14 +195,14 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
|
|
||||||
acctyp energy=(numtyp)0;
|
acctyp energy=(acctyp)0;
|
||||||
acctyp4 f;
|
acctyp4 f;
|
||||||
f.x=(numtyp)0;
|
f.x=(acctyp)0;
|
||||||
f.y=(numtyp)0;
|
f.y=(acctyp)0;
|
||||||
f.z=(numtyp)0;
|
f.z=(acctyp)0;
|
||||||
acctyp virial[6];
|
acctyp virial[6];
|
||||||
for (int i=0; i<6; i++)
|
for (int i=0; i<6; i++)
|
||||||
virial[i]=(numtyp)0;
|
virial[i]=(acctyp)0;
|
||||||
|
|
||||||
__global int *nbor=dev_nbor+ii;
|
__global int *nbor=dev_nbor+ii;
|
||||||
int i=*nbor;
|
int i=*nbor;
|
||||||
|
|||||||
@ -47,7 +47,7 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
|
|||||||
LJCMF.device->init_message(screen,"lj/cut/coul/cut",first_gpu,last_gpu);
|
LJCMF.device->init_message(screen,"lj/cut/coul/cut",first_gpu,last_gpu);
|
||||||
|
|
||||||
bool message=false;
|
bool message=false;
|
||||||
if (world_me==0 && screen)
|
if (LJCMF.device->replica_me()==0 && screen)
|
||||||
message=true;
|
message=true;
|
||||||
|
|
||||||
if (message) {
|
if (message) {
|
||||||
@ -65,14 +65,14 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
LJCMF.device->world_barrier();
|
||||||
if (message)
|
if (message)
|
||||||
fprintf(screen,"Done.\n");
|
fprintf(screen,"Done.\n");
|
||||||
|
|
||||||
for (int i=0; i<procs_per_gpu; i++) {
|
for (int i=0; i<procs_per_gpu; i++) {
|
||||||
if (message) {
|
if (message) {
|
||||||
if (last_gpu-first_gpu==0)
|
if (last_gpu-first_gpu==0)
|
||||||
fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
|
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
|
||||||
else
|
else
|
||||||
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
|
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
|
||||||
last_gpu,i);
|
last_gpu,i);
|
||||||
@ -87,7 +87,7 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
|
|||||||
if (!init_ok)
|
if (!init_ok)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
MPI_Barrier(LJCMF.device->gpu_comm);
|
LJCMF.device->gpu_barrier();
|
||||||
if (message)
|
if (message)
|
||||||
fprintf(screen,"Done.\n");
|
fprintf(screen,"Done.\n");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -106,15 +106,15 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
|||||||
sp_lj[7]=sp_lj_in[7];
|
sp_lj[7]=sp_lj_in[7];
|
||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
acctyp energy=(numtyp)0;
|
acctyp energy=(acctyp)0;
|
||||||
acctyp e_coul=(numtyp)0;
|
acctyp e_coul=(acctyp)0;
|
||||||
acctyp4 f;
|
acctyp4 f;
|
||||||
f.x=(numtyp)0;
|
f.x=(acctyp)0;
|
||||||
f.y=(numtyp)0;
|
f.y=(acctyp)0;
|
||||||
f.z=(numtyp)0;
|
f.z=(acctyp)0;
|
||||||
acctyp virial[6];
|
acctyp virial[6];
|
||||||
for (int i=0; i<6; i++)
|
for (int i=0; i<6; i++)
|
||||||
virial[i]=(numtyp)0;
|
virial[i]=(acctyp)0;
|
||||||
|
|
||||||
__global int *nbor=dev_nbor+ii;
|
__global int *nbor=dev_nbor+ii;
|
||||||
int i=*nbor;
|
int i=*nbor;
|
||||||
@ -234,15 +234,15 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
|
|
||||||
acctyp energy=(numtyp)0;
|
acctyp energy=(acctyp)0;
|
||||||
acctyp e_coul=(numtyp)0;
|
acctyp e_coul=(acctyp)0;
|
||||||
acctyp4 f;
|
acctyp4 f;
|
||||||
f.x=(numtyp)0;
|
f.x=(acctyp)0;
|
||||||
f.y=(numtyp)0;
|
f.y=(acctyp)0;
|
||||||
f.z=(numtyp)0;
|
f.z=(acctyp)0;
|
||||||
acctyp virial[6];
|
acctyp virial[6];
|
||||||
for (int i=0; i<6; i++)
|
for (int i=0; i<6; i++)
|
||||||
virial[i]=(numtyp)0;
|
virial[i]=(acctyp)0;
|
||||||
|
|
||||||
__global int *nbor=dev_nbor+ii;
|
__global int *nbor=dev_nbor+ii;
|
||||||
int i=*nbor;
|
int i=*nbor;
|
||||||
|
|||||||
@ -48,7 +48,7 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
|
|||||||
LJCLMF.device->init_message(screen,"lj/cut/coul/long",first_gpu,last_gpu);
|
LJCLMF.device->init_message(screen,"lj/cut/coul/long",first_gpu,last_gpu);
|
||||||
|
|
||||||
bool message=false;
|
bool message=false;
|
||||||
if (world_me==0 && screen)
|
if (LJCLMF.device->replica_me()==0 && screen)
|
||||||
message=true;
|
message=true;
|
||||||
|
|
||||||
if (message) {
|
if (message) {
|
||||||
@ -66,14 +66,14 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
LJCLMF.device->world_barrier();
|
||||||
if (message)
|
if (message)
|
||||||
fprintf(screen,"Done.\n");
|
fprintf(screen,"Done.\n");
|
||||||
|
|
||||||
for (int i=0; i<procs_per_gpu; i++) {
|
for (int i=0; i<procs_per_gpu; i++) {
|
||||||
if (message) {
|
if (message) {
|
||||||
if (last_gpu-first_gpu==0)
|
if (last_gpu-first_gpu==0)
|
||||||
fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
|
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
|
||||||
else
|
else
|
||||||
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
|
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
|
||||||
last_gpu,i);
|
last_gpu,i);
|
||||||
@ -88,7 +88,7 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
|
|||||||
if (!init_ok)
|
if (!init_ok)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
MPI_Barrier(LJCLMF.device->gpu_comm);
|
LJCLMF.device->gpu_barrier();
|
||||||
if (message)
|
if (message)
|
||||||
fprintf(screen,"Done.\n");
|
fprintf(screen,"Done.\n");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -114,15 +114,15 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
|||||||
sp_lj[7]=sp_lj_in[7];
|
sp_lj[7]=sp_lj_in[7];
|
||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
acctyp energy=(numtyp)0;
|
acctyp energy=(acctyp)0;
|
||||||
acctyp e_coul=(numtyp)0;
|
acctyp e_coul=(acctyp)0;
|
||||||
acctyp4 f;
|
acctyp4 f;
|
||||||
f.x=(numtyp)0;
|
f.x=(acctyp)0;
|
||||||
f.y=(numtyp)0;
|
f.y=(acctyp)0;
|
||||||
f.z=(numtyp)0;
|
f.z=(acctyp)0;
|
||||||
acctyp virial[6];
|
acctyp virial[6];
|
||||||
for (int i=0; i<6; i++)
|
for (int i=0; i<6; i++)
|
||||||
virial[i]=(numtyp)0;
|
virial[i]=(acctyp)0;
|
||||||
|
|
||||||
__global int *nbor=dev_nbor+ii;
|
__global int *nbor=dev_nbor+ii;
|
||||||
int i=*nbor;
|
int i=*nbor;
|
||||||
@ -248,15 +248,15 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
|
|
||||||
acctyp energy=(numtyp)0;
|
acctyp energy=(acctyp)0;
|
||||||
acctyp e_coul=(numtyp)0;
|
acctyp e_coul=(acctyp)0;
|
||||||
acctyp4 f;
|
acctyp4 f;
|
||||||
f.x=(numtyp)0;
|
f.x=(acctyp)0;
|
||||||
f.y=(numtyp)0;
|
f.y=(acctyp)0;
|
||||||
f.z=(numtyp)0;
|
f.z=(acctyp)0;
|
||||||
acctyp virial[6];
|
acctyp virial[6];
|
||||||
for (int i=0; i<6; i++)
|
for (int i=0; i<6; i++)
|
||||||
virial[i]=(numtyp)0;
|
virial[i]=(acctyp)0;
|
||||||
|
|
||||||
__global int *nbor=dev_nbor+ii;
|
__global int *nbor=dev_nbor+ii;
|
||||||
int i=*nbor;
|
int i=*nbor;
|
||||||
|
|||||||
@ -30,7 +30,8 @@ __win_sort _win_sort;
|
|||||||
|
|
||||||
template <class numtyp, class acctyp>
|
template <class numtyp, class acctyp>
|
||||||
PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false),
|
PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false),
|
||||||
_vflag(false),_inum(0),_ilist(NULL) {
|
_vflag(false),_inum(0),_ilist(NULL),
|
||||||
|
_newton(false) {
|
||||||
#ifndef USE_OPENCL
|
#ifndef USE_OPENCL
|
||||||
sort_config.op = CUDPP_ADD;
|
sort_config.op = CUDPP_ADD;
|
||||||
sort_config.datatype = CUDPP_UINT;
|
sort_config.datatype = CUDPP_UINT;
|
||||||
@ -64,7 +65,13 @@ int PairGPUAtomT::bytes_per_atom() const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class numtyp, class acctyp>
|
template <class numtyp, class acctyp>
|
||||||
bool PairGPUAtomT::alloc(const int max_atoms) {
|
bool PairGPUAtomT::alloc(const int inum, const int nall) {
|
||||||
|
_max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||||
|
if (_newton)
|
||||||
|
_max_local=_max_atoms;
|
||||||
|
else
|
||||||
|
_max_local=static_cast<int>(static_cast<double>(inum)*1.10);
|
||||||
|
|
||||||
bool success=true;
|
bool success=true;
|
||||||
|
|
||||||
int ans_elements=4;
|
int ans_elements=4;
|
||||||
@ -79,10 +86,10 @@ bool PairGPUAtomT::alloc(const int max_atoms) {
|
|||||||
// Allocate storage for CUDPP sort
|
// Allocate storage for CUDPP sort
|
||||||
#ifndef USE_OPENCL
|
#ifndef USE_OPENCL
|
||||||
#ifdef WINDLL
|
#ifdef WINDLL
|
||||||
_win_sort_alloc(max_atoms);
|
_win_sort_alloc(_max_atoms);
|
||||||
#else
|
#else
|
||||||
if (_gpu_nbor) {
|
if (_gpu_nbor) {
|
||||||
CUDPPResult result = cudppPlan(&sort_plan, sort_config, max_atoms, 1, 0);
|
CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0);
|
||||||
if (CUDPP_SUCCESS != result)
|
if (CUDPP_SUCCESS != result)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -92,23 +99,23 @@ bool PairGPUAtomT::alloc(const int max_atoms) {
|
|||||||
// -------------------------- Host allocations
|
// -------------------------- Host allocations
|
||||||
// Get a host write only buffer
|
// Get a host write only buffer
|
||||||
#ifdef GPU_CAST
|
#ifdef GPU_CAST
|
||||||
success=success && (host_x_cast.alloc(max_atoms*3,*dev,
|
success=success && (host_x_cast.alloc(_max_atoms*3,*dev,
|
||||||
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
|
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
|
||||||
success=success && (host_type_cast.alloc(max_atoms,*dev,
|
success=success && (host_type_cast.alloc(_max_atoms,*dev,
|
||||||
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
|
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
|
||||||
#else
|
#else
|
||||||
success=success && (host_x.alloc(max_atoms*4,*dev,
|
success=success && (host_x.alloc(_max_atoms*4,*dev,
|
||||||
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
|
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
|
||||||
#endif
|
#endif
|
||||||
success=success && (host_ans.alloc(ans_elements*max_atoms,*dev)==UCL_SUCCESS);
|
success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
|
||||||
success=success && (host_engv.alloc(_ev_fields*max_atoms,*dev)==UCL_SUCCESS);
|
success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
|
||||||
// Buffer for casting only if different precisions
|
// Buffer for casting only if different precisions
|
||||||
if (_charge)
|
if (_charge)
|
||||||
success=success && (host_q.alloc(max_atoms,*dev,
|
success=success && (host_q.alloc(_max_atoms,*dev,
|
||||||
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
|
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
|
||||||
// Buffer for casting only if different precisions
|
// Buffer for casting only if different precisions
|
||||||
if (_rot)
|
if (_rot)
|
||||||
success=success && (host_quat.alloc(max_atoms*4,*dev,
|
success=success && (host_quat.alloc(_max_atoms*4,*dev,
|
||||||
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
|
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
|
||||||
|
|
||||||
|
|
||||||
@ -128,43 +135,44 @@ bool PairGPUAtomT::alloc(const int max_atoms) {
|
|||||||
dev_q.view(host_q);
|
dev_q.view(host_q);
|
||||||
} else {
|
} else {
|
||||||
#ifdef GPU_CAST
|
#ifdef GPU_CAST
|
||||||
success=success && (UCL_SUCCESS==dev_x.alloc(max_atoms*4,*dev));
|
success=success && (UCL_SUCCESS==dev_x.alloc(_max_atoms*4,*dev));
|
||||||
success=success && (UCL_SUCCESS==
|
success=success && (UCL_SUCCESS==
|
||||||
dev_x_cast.alloc(max_atoms*3,*dev,UCL_READ_ONLY));
|
dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
|
||||||
success=success && (UCL_SUCCESS==
|
success=success && (UCL_SUCCESS==
|
||||||
dev_type_cast.alloc(max_atoms,*dev,UCL_READ_ONLY));
|
dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
|
||||||
_gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
|
_gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
|
||||||
#else
|
#else
|
||||||
success=success && (UCL_SUCCESS==
|
success=success && (UCL_SUCCESS==
|
||||||
dev_x.alloc(max_atoms*4,*dev,UCL_READ_ONLY));
|
dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
|
||||||
#endif
|
#endif
|
||||||
success=success && (dev_engv.alloc(_ev_fields*max_atoms,*dev,
|
success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
|
||||||
UCL_WRITE_ONLY)==UCL_SUCCESS);
|
UCL_WRITE_ONLY)==UCL_SUCCESS);
|
||||||
success=success && (dev_ans.alloc(ans_elements*max_atoms,
|
success=success && (dev_ans.alloc(ans_elements*_max_local,
|
||||||
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
|
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
|
||||||
if (_charge) {
|
if (_charge) {
|
||||||
success=success && (dev_q.alloc(max_atoms,*dev,
|
success=success && (dev_q.alloc(_max_atoms,*dev,
|
||||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||||
_gpu_bytes+=dev_q.row_bytes();
|
_gpu_bytes+=dev_q.row_bytes();
|
||||||
}
|
}
|
||||||
if (_rot) {
|
if (_rot) {
|
||||||
success=success && (dev_quat.alloc(max_atoms*4,*dev,
|
success=success && (dev_quat.alloc(_max_atoms*4,*dev,
|
||||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||||
_gpu_bytes+=dev_quat.row_bytes();
|
_gpu_bytes+=dev_quat.row_bytes();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (_gpu_nbor) {
|
if (_gpu_nbor) {
|
||||||
success=success && (dev_cell_id.alloc(max_atoms,*dev)==UCL_SUCCESS);
|
success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
|
||||||
success=success && (dev_particle_id.alloc(max_atoms,*dev)==UCL_SUCCESS);
|
success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
|
||||||
_gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
|
_gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
|
||||||
if (_bonds) {
|
if (_bonds) {
|
||||||
success=success && (dev_tag.alloc(max_atoms,*dev)==UCL_SUCCESS);
|
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
|
||||||
_gpu_bytes+=dev_tag.row_bytes();
|
_gpu_bytes+=dev_tag.row_bytes();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
_gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes();
|
_gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes();
|
||||||
|
|
||||||
|
_allocated=true;
|
||||||
return success;
|
return success;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -188,13 +196,12 @@ bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
|
|||||||
_ev_fields=6+_e_fields;
|
_ev_fields=6+_e_fields;
|
||||||
|
|
||||||
// Initialize atom and nbor data
|
// Initialize atom and nbor data
|
||||||
int max_local=static_cast<int>(static_cast<double>(inum)*1.10);
|
int ef_inum=inum;
|
||||||
if (max_local==0)
|
if (ef_inum==0)
|
||||||
max_local=1000;
|
ef_inum=1000;
|
||||||
if (nall<=inum)
|
int ef_nall=nall;
|
||||||
_max_atoms=max_local*2;
|
if (ef_nall<=ef_inum)
|
||||||
else
|
ef_nall=ef_inum*2;
|
||||||
_max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
|
|
||||||
|
|
||||||
// Initialize timers for the selected device
|
// Initialize timers for the selected device
|
||||||
time_pos.init(*dev);
|
time_pos.init(*dev);
|
||||||
@ -209,8 +216,7 @@ bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
|
|||||||
compile_kernels(*dev);
|
compile_kernels(*dev);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
_allocated=true;
|
return success && alloc(ef_inum,ef_nall);
|
||||||
return success && alloc(_max_atoms);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class numtyp, class acctyp>
|
template <class numtyp, class acctyp>
|
||||||
@ -285,7 +291,7 @@ double PairGPUAtomT::host_memory_usage() const {
|
|||||||
atom_bytes+=4;
|
atom_bytes+=4;
|
||||||
int ans_bytes=atom_bytes+_ev_fields;
|
int ans_bytes=atom_bytes+_ev_fields;
|
||||||
return _max_atoms*atom_bytes*sizeof(numtyp)+
|
return _max_atoms*atom_bytes*sizeof(numtyp)+
|
||||||
ans_bytes*(_max_atoms)*sizeof(acctyp)+
|
ans_bytes*(_max_local)*sizeof(acctyp)+
|
||||||
sizeof(PairGPUAtom<numtyp,acctyp>);
|
sizeof(PairGPUAtom<numtyp,acctyp>);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -77,11 +77,9 @@ class PairGPUAtom {
|
|||||||
inline bool resize(const int inum, const int nall, bool &success) {
|
inline bool resize(const int inum, const int nall, bool &success) {
|
||||||
_inum=inum;
|
_inum=inum;
|
||||||
_nall=nall;
|
_nall=nall;
|
||||||
if (nall>_max_atoms) {
|
if (inum>_max_local || nall>_max_atoms) {
|
||||||
clear_resize();
|
clear_resize();
|
||||||
_max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
|
success = success && alloc(inum,nall);
|
||||||
_allocated=true;
|
|
||||||
success = success && alloc(_max_atoms);
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
@ -203,6 +201,19 @@ class PairGPUAtom {
|
|||||||
ucl_copy(dev_v,view,false);
|
ucl_copy(dev_v,view,false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device
|
||||||
|
template <class dev_typ, class t1, class t2>
|
||||||
|
inline void self_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v,
|
||||||
|
UCL_H_Vec<numtyp> &buffer, t1 **one, t2 **two) {
|
||||||
|
for (int i=0; i<n; i++) {
|
||||||
|
buffer[i*2]=static_cast<numtyp>(one[i][i]);
|
||||||
|
buffer[i*2+1]=static_cast<numtyp>(two[i][i]);
|
||||||
|
}
|
||||||
|
UCL_H_Vec<dev_typ> view;
|
||||||
|
view.view((dev_typ*)buffer.begin(),n,*dev);
|
||||||
|
ucl_copy(dev_v,view,false);
|
||||||
|
}
|
||||||
|
|
||||||
// -------------------------COPY TO GPU ----------------------------------
|
// -------------------------COPY TO GPU ----------------------------------
|
||||||
|
|
||||||
/// Cast positions and types to write buffer
|
/// Cast positions and types to write buffer
|
||||||
@ -386,16 +397,18 @@ class PairGPUAtom {
|
|||||||
|
|
||||||
bool _compiled;
|
bool _compiled;
|
||||||
|
|
||||||
bool alloc(const int max_atoms);
|
bool alloc(const int inum, const int nall);
|
||||||
|
|
||||||
bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
|
bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
|
||||||
int _max_atoms, _nall, _inum, _e_fields, _ev_fields;
|
int _max_local, _max_atoms, _nall, _inum, _e_fields, _ev_fields;
|
||||||
bool _gpu_nbor, _bonds;
|
bool _gpu_nbor, _bonds;
|
||||||
int *_ilist;
|
int *_ilist;
|
||||||
double _time_cast;
|
double _time_cast;
|
||||||
|
|
||||||
double _gpu_bytes;
|
double _gpu_bytes;
|
||||||
|
|
||||||
|
bool _newton;
|
||||||
|
|
||||||
#ifndef USE_OPENCL
|
#ifndef USE_OPENCL
|
||||||
CUDPPConfiguration sort_config;
|
CUDPPConfiguration sort_config;
|
||||||
CUDPPHandle sort_plan;
|
CUDPPHandle sort_plan;
|
||||||
|
|||||||
@ -65,11 +65,9 @@ class PairGPUBalance {
|
|||||||
inline double all_avg_split() {
|
inline double all_avg_split() {
|
||||||
if (_load_balance) {
|
if (_load_balance) {
|
||||||
double _all_avg_split=0.0;
|
double _all_avg_split=0.0;
|
||||||
int nprocs;
|
|
||||||
MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
|
|
||||||
MPI_Reduce(&_avg_split,&_all_avg_split,1,MPI_DOUBLE,MPI_SUM,0,
|
MPI_Reduce(&_avg_split,&_all_avg_split,1,MPI_DOUBLE,MPI_SUM,0,
|
||||||
MPI_COMM_WORLD);
|
_device->replica());
|
||||||
_all_avg_split/=nprocs;
|
_all_avg_split/=_device->replica_size();
|
||||||
return _all_avg_split/_avg_count;
|
return _all_avg_split/_avg_count;
|
||||||
} else
|
} else
|
||||||
return _actual_split;
|
return _actual_split;
|
||||||
@ -83,10 +81,10 @@ class PairGPUBalance {
|
|||||||
inline void start_timer() {
|
inline void start_timer() {
|
||||||
if (_measure_this_step) {
|
if (_measure_this_step) {
|
||||||
_device->gpu->sync();
|
_device->gpu->sync();
|
||||||
MPI_Barrier(_device->gpu_comm);
|
_device->gpu_barrier();
|
||||||
_device_time.start();
|
_device_time.start();
|
||||||
_device->gpu->sync();
|
_device->gpu->sync();
|
||||||
MPI_Barrier(_device->gpu_comm);
|
_device->gpu_barrier();
|
||||||
_device->start_host_timer();
|
_device->start_host_timer();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -178,7 +176,7 @@ void PairGPUBalanceT::balance(const double cpu_time, const bool gpu_nbor) {
|
|||||||
cpu_gpu_time[2]=(_device->host_time()-cpu_time)/_inum_full;
|
cpu_gpu_time[2]=(_device->host_time()-cpu_time)/_inum_full;
|
||||||
|
|
||||||
MPI_Allreduce(cpu_gpu_time,max_times,3,MPI_DOUBLE,MPI_MAX,
|
MPI_Allreduce(cpu_gpu_time,max_times,3,MPI_DOUBLE,MPI_MAX,
|
||||||
_device->gpu_comm);
|
_device->gpu_comm());
|
||||||
double split=(max_times[0]+max_times[2])/(max_times[0]+max_times[1]);
|
double split=(max_times[0]+max_times[2])/(max_times[0]+max_times[1]);
|
||||||
split*=_HD_BALANCE_GAP;
|
split*=_HD_BALANCE_GAP;
|
||||||
|
|
||||||
|
|||||||
@ -34,19 +34,28 @@ PairGPUDeviceT::~PairGPUDevice() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class numtyp, class acctyp>
|
template <class numtyp, class acctyp>
|
||||||
bool PairGPUDeviceT::init_device(const int first_gpu, const int last_gpu,
|
bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
|
||||||
const int gpu_mode, const double p_split) {
|
const int first_gpu, const int last_gpu,
|
||||||
|
const int gpu_mode, const double p_split,
|
||||||
|
const int nthreads) {
|
||||||
|
_nthreads=nthreads;
|
||||||
|
|
||||||
if (_device_init)
|
if (_device_init)
|
||||||
return true;
|
return true;
|
||||||
_device_init=true;
|
_device_init=true;
|
||||||
|
_comm_world=world;
|
||||||
|
_comm_replica=replica;
|
||||||
_first_device=first_gpu;
|
_first_device=first_gpu;
|
||||||
_last_device=last_gpu;
|
_last_device=last_gpu;
|
||||||
_gpu_mode=gpu_mode;
|
_gpu_mode=gpu_mode;
|
||||||
_particle_split=p_split;
|
_particle_split=p_split;
|
||||||
|
|
||||||
// Get the rank within the world
|
// Get the rank/size within the world
|
||||||
MPI_Comm_rank(MPI_COMM_WORLD,&_world_me);
|
MPI_Comm_rank(_comm_world,&_world_me);
|
||||||
MPI_Comm_size(MPI_COMM_WORLD,&_world_size);
|
MPI_Comm_size(_comm_world,&_world_size);
|
||||||
|
// Get the rank/size within the replica
|
||||||
|
MPI_Comm_rank(_comm_replica,&_replica_me);
|
||||||
|
MPI_Comm_size(_comm_replica,&_replica_size);
|
||||||
|
|
||||||
// Get the names of all nodes
|
// Get the names of all nodes
|
||||||
int name_length;
|
int name_length;
|
||||||
@ -54,7 +63,7 @@ bool PairGPUDeviceT::init_device(const int first_gpu, const int last_gpu,
|
|||||||
char node_names[MPI_MAX_PROCESSOR_NAME*_world_size];
|
char node_names[MPI_MAX_PROCESSOR_NAME*_world_size];
|
||||||
MPI_Get_processor_name(node_name,&name_length);
|
MPI_Get_processor_name(node_name,&name_length);
|
||||||
MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names,
|
MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names,
|
||||||
MPI_MAX_PROCESSOR_NAME,MPI_CHAR,MPI_COMM_WORLD);
|
MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world);
|
||||||
std::string node_string=std::string(node_name);
|
std::string node_string=std::string(node_name);
|
||||||
|
|
||||||
// Get the number of procs per node
|
// Get the number of procs per node
|
||||||
@ -80,7 +89,7 @@ bool PairGPUDeviceT::init_device(const int first_gpu, const int last_gpu,
|
|||||||
|
|
||||||
// Set up a per node communicator and find rank within
|
// Set up a per node communicator and find rank within
|
||||||
MPI_Comm node_comm;
|
MPI_Comm node_comm;
|
||||||
MPI_Comm_split(MPI_COMM_WORLD, split_id, 0, &node_comm);
|
MPI_Comm_split(_comm_world, split_id, 0, &node_comm);
|
||||||
int node_rank;
|
int node_rank;
|
||||||
MPI_Comm_rank(node_comm,&node_rank);
|
MPI_Comm_rank(node_comm,&node_rank);
|
||||||
|
|
||||||
@ -90,8 +99,8 @@ bool PairGPUDeviceT::init_device(const int first_gpu, const int last_gpu,
|
|||||||
int my_gpu=node_rank/_procs_per_gpu;
|
int my_gpu=node_rank/_procs_per_gpu;
|
||||||
|
|
||||||
// Set up a per device communicator
|
// Set up a per device communicator
|
||||||
MPI_Comm_split(node_comm,my_gpu,0,&gpu_comm);
|
MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
|
||||||
MPI_Comm_rank(gpu_comm,&_gpu_rank);
|
MPI_Comm_rank(_comm_gpu,&_gpu_rank);
|
||||||
|
|
||||||
gpu=new UCL_Device();
|
gpu=new UCL_Device();
|
||||||
if (my_gpu>=gpu->num_devices())
|
if (my_gpu>=gpu->num_devices())
|
||||||
@ -111,10 +120,13 @@ bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal,
|
|||||||
return false;
|
return false;
|
||||||
if (_init_count==0) {
|
if (_init_count==0) {
|
||||||
// Initialize atom and nbor data
|
// Initialize atom and nbor data
|
||||||
if (!atom.init(nlocal,nall,charge,rot,*gpu,gpu_nbor,
|
int ef_nlocal=nlocal;
|
||||||
|
if (_particle_split<1.0 && _particle_split>0.0)
|
||||||
|
ef_nlocal=static_cast<int>(_particle_split*nlocal);
|
||||||
|
if (!atom.init(ef_nlocal,nall,charge,rot,*gpu,gpu_nbor,
|
||||||
gpu_nbor && maxspecial>0))
|
gpu_nbor && maxspecial>0))
|
||||||
return false;
|
return false;
|
||||||
if (!nbor.init(nlocal,host_nlocal,max_nbors,maxspecial,*gpu,gpu_nbor,
|
if (!nbor.init(ef_nlocal,host_nlocal,max_nbors,maxspecial,*gpu,gpu_nbor,
|
||||||
gpu_host,pre_cut))
|
gpu_host,pre_cut))
|
||||||
return false;
|
return false;
|
||||||
nbor.cell_size(cell_size);
|
nbor.cell_size(cell_size);
|
||||||
@ -136,7 +148,7 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name,
|
|||||||
std::string fs=toa(gpu->free_gigabytes())+"/";
|
std::string fs=toa(gpu->free_gigabytes())+"/";
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (_world_me == 0 && screen) {
|
if (_replica_me == 0 && screen) {
|
||||||
fprintf(screen,"\n-------------------------------------");
|
fprintf(screen,"\n-------------------------------------");
|
||||||
fprintf(screen,"-------------------------------------\n");
|
fprintf(screen,"-------------------------------------\n");
|
||||||
fprintf(screen,"- Using GPGPU acceleration for %s:\n",name);
|
fprintf(screen,"- Using GPGPU acceleration for %s:\n",name);
|
||||||
@ -175,14 +187,14 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split,
|
|||||||
single[3]=time_pair.total_seconds();
|
single[3]=time_pair.total_seconds();
|
||||||
single[4]=atom.cast_time();
|
single[4]=atom.cast_time();
|
||||||
|
|
||||||
MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
|
MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
|
||||||
|
|
||||||
double my_max_bytes=max_bytes;
|
double my_max_bytes=max_bytes;
|
||||||
double mpi_max_bytes;
|
double mpi_max_bytes;
|
||||||
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);
|
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
|
||||||
double max_mb=mpi_max_bytes/(1024.0*1024.0);
|
double max_mb=mpi_max_bytes/(1024.0*1024.0);
|
||||||
|
|
||||||
if (world_me()==0)
|
if (replica_me()==0)
|
||||||
if (screen && times[3]>0.0) {
|
if (screen && times[3]>0.0) {
|
||||||
fprintf(screen,"\n\n-------------------------------------");
|
fprintf(screen,"\n\n-------------------------------------");
|
||||||
fprintf(screen,"--------------------------------\n");
|
fprintf(screen,"--------------------------------\n");
|
||||||
@ -191,14 +203,14 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split,
|
|||||||
fprintf(screen,"--------------------------------\n");
|
fprintf(screen,"--------------------------------\n");
|
||||||
|
|
||||||
if (procs_per_gpu()==1) {
|
if (procs_per_gpu()==1) {
|
||||||
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_world_size);
|
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size);
|
||||||
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_world_size);
|
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size);
|
||||||
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_world_size);
|
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size);
|
||||||
if (nbor.gpu_nbor())
|
if (nbor.gpu_nbor())
|
||||||
fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/_world_size);
|
fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/_replica_size);
|
||||||
else
|
else
|
||||||
fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_world_size);
|
fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size);
|
||||||
fprintf(screen,"Force calc: %.4f s.\n",times[3]/_world_size);
|
fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size);
|
||||||
}
|
}
|
||||||
fprintf(screen,"Average split: %.4f.\n",avg_split);
|
fprintf(screen,"Average split: %.4f.\n",avg_split);
|
||||||
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
|
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
|
||||||
@ -239,10 +251,11 @@ double PairGPUDeviceT::host_memory_usage() const {
|
|||||||
template class PairGPUDevice<PRECISION,ACC_PRECISION>;
|
template class PairGPUDevice<PRECISION,ACC_PRECISION>;
|
||||||
PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
|
PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
|
||||||
|
|
||||||
bool lmp_init_device(const int first_gpu, const int last_gpu,
|
bool lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
|
||||||
const int gpu_mode, const double particle_split) {
|
const int last_gpu, const int gpu_mode,
|
||||||
return pair_gpu_device.init_device(first_gpu,last_gpu,gpu_mode,
|
const double particle_split, const int nthreads) {
|
||||||
particle_split);
|
return pair_gpu_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
|
||||||
|
particle_split,nthreads);
|
||||||
}
|
}
|
||||||
|
|
||||||
void lmp_clear_device() {
|
void lmp_clear_device() {
|
||||||
@ -261,3 +274,4 @@ double lmp_gpu_forces(double **f, double **tor, double *eatom,
|
|||||||
}
|
}
|
||||||
return 0.0;
|
return 0.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -34,8 +34,9 @@ class PairGPUDevice {
|
|||||||
/// Initialize the device for use by this process
|
/// Initialize the device for use by this process
|
||||||
/** Sets up a per-device MPI communicator for load balancing and initializes
|
/** Sets up a per-device MPI communicator for load balancing and initializes
|
||||||
* the device (>=first_gpu and <=last_gpu) that this proc will be using **/
|
* the device (>=first_gpu and <=last_gpu) that this proc will be using **/
|
||||||
bool init_device(const int first_gpu, const int last_gpu,
|
bool init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
|
||||||
const int gpu_mode, const double particle_split);
|
const int last_gpu, const int gpu_mode,
|
||||||
|
const double particle_split, const int nthreads);
|
||||||
|
|
||||||
/// Initialize the device for Atom and Neighbor storage
|
/// Initialize the device for Atom and Neighbor storage
|
||||||
/** \param rot True if quaternions need to be stored
|
/** \param rot True if quaternions need to be stored
|
||||||
@ -83,12 +84,26 @@ class PairGPUDevice {
|
|||||||
|
|
||||||
/// Return the number of procs sharing a device (size of device commincator)
|
/// Return the number of procs sharing a device (size of device commincator)
|
||||||
inline int procs_per_gpu() const { return _procs_per_gpu; }
|
inline int procs_per_gpu() const { return _procs_per_gpu; }
|
||||||
/// Return my rank in the device communicator
|
/// Return the number of threads per proc
|
||||||
inline int gpu_rank() const { return _gpu_rank; }
|
inline int num_threads() const { return _nthreads; }
|
||||||
/// My rank within all processes
|
/// My rank within all processes
|
||||||
inline int world_me() const { return _world_me; }
|
inline int world_me() const { return _world_me; }
|
||||||
/// Total number of processes
|
/// Total number of processes
|
||||||
inline int world_size() const { return _world_size; }
|
inline int world_size() const { return _world_size; }
|
||||||
|
/// MPI Barrier for world
|
||||||
|
inline void world_barrier() { MPI_Barrier(_comm_world); }
|
||||||
|
/// Return the replica MPI communicator
|
||||||
|
inline MPI_Comm & replica() { return _comm_replica; }
|
||||||
|
/// My rank within replica communicator
|
||||||
|
inline int replica_me() const { return _replica_me; }
|
||||||
|
/// Number of procs in replica communicator
|
||||||
|
inline int replica_size() const { return _replica_size; }
|
||||||
|
/// Return the per-GPU MPI communicator
|
||||||
|
inline MPI_Comm & gpu_comm() { return _comm_gpu; }
|
||||||
|
/// Return my rank in the device communicator
|
||||||
|
inline int gpu_rank() const { return _gpu_rank; }
|
||||||
|
/// MPI Barrier for gpu
|
||||||
|
inline void gpu_barrier() { MPI_Barrier(_comm_gpu); }
|
||||||
/// Return the 'mode' for acceleration: GPU_FORCE or GPU_NEIGH
|
/// Return the 'mode' for acceleration: GPU_FORCE or GPU_NEIGH
|
||||||
inline int gpu_mode() const { return _gpu_mode; }
|
inline int gpu_mode() const { return _gpu_mode; }
|
||||||
/// Index of first device used by a node
|
/// Index of first device used by a node
|
||||||
@ -104,8 +119,6 @@ class PairGPUDevice {
|
|||||||
|
|
||||||
/// Geryon Device
|
/// Geryon Device
|
||||||
UCL_Device *gpu;
|
UCL_Device *gpu;
|
||||||
/// Device communicator
|
|
||||||
MPI_Comm gpu_comm;
|
|
||||||
|
|
||||||
enum{GPU_FORCE, GPU_NEIGH};
|
enum{GPU_FORCE, GPU_NEIGH};
|
||||||
|
|
||||||
@ -122,8 +135,10 @@ class PairGPUDevice {
|
|||||||
private:
|
private:
|
||||||
int _init_count;
|
int _init_count;
|
||||||
bool _device_init;
|
bool _device_init;
|
||||||
int _procs_per_gpu, _gpu_rank, _world_me, _world_size;
|
MPI_Comm _comm_world, _comm_replica, _comm_gpu;
|
||||||
int _gpu_mode, _first_device, _last_device;
|
int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
|
||||||
|
_replica_size;
|
||||||
|
int _gpu_mode, _first_device, _last_device, _nthreads;
|
||||||
double _particle_split;
|
double _particle_split;
|
||||||
double _cpu_full;
|
double _cpu_full;
|
||||||
|
|
||||||
|
|||||||
@ -85,6 +85,7 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define MAX_SHARED_TYPES 8
|
#define MAX_SHARED_TYPES 8
|
||||||
|
#define MAX_BIO_SHARED_TYPES 128
|
||||||
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
|
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
Reference in New Issue
Block a user