Merge branch 'master' into gpu-bugfix

This commit is contained in:
Axel Kohlmeyer
2020-08-27 17:05:26 -04:00
3176 changed files with 119404 additions and 111819 deletions

View File

@ -3,13 +3,13 @@
// -------------------------------------------------------------
// $Revision: 3572$
// $Date: 2010-11-23 13:04:43 -0700 (Tue, 23 Nov 2010) $
// -------------------------------------------------------------
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
// -------------------------------------------------------------
#ifndef __CUDPP_PLAN_H__
#define __CUDPP_PLAN_H__
typedef void* KernelPointer;
extern "C" size_t getNumCTAs(KernelPointer kernel);
@ -30,10 +30,10 @@ void computeNumCTAs(T kernel, unsigned int bytesDynamicSharedMem, size_t threads
/** @brief Base class for CUDPP Plan data structures
*
* CUDPPPlan and its subclasses provide the internal (i.e. not visible to the
* library user) infrastructure for planning algorithm execution. They
* library user) infrastructure for planning algorithm execution. They
* own intermediate storage for CUDPP algorithms as well as, in some cases,
* information about optimal execution configuration for the present hardware.
*
*
*/
class CUDPPPlan
{
@ -91,7 +91,7 @@ public:
CUDPPScanPlan *m_scanPlan; //!< @internal Compact performs a scan of type unsigned int using this plan
unsigned int* m_d_outputIndices; //!< @internal Output address of compacted elements; this is the result of scan
};
class CUDPPRadixSortPlan : public CUDPPPlan
@ -99,7 +99,7 @@ class CUDPPRadixSortPlan : public CUDPPPlan
public:
CUDPPRadixSortPlan(CUDPPConfiguration config, size_t numElements);
virtual ~CUDPPRadixSortPlan();
bool m_bKeysOnly;
bool m_bManualCoalesce;
bool m_bUsePersistentCTAs;
@ -123,22 +123,22 @@ class CUDPPSparseMatrixVectorMultiplyPlan : public CUDPPPlan
public:
CUDPPSparseMatrixVectorMultiplyPlan(CUDPPConfiguration config, size_t numNZElts,
const void *A,
const unsigned int *rowindx,
const unsigned int *rowindx,
const unsigned int *indx, size_t numRows);
virtual ~CUDPPSparseMatrixVectorMultiplyPlan();
CUDPPSegmentedScanPlan *m_segmentedScanPlan; //!< @internal Performs a segmented scan of type T using this plan
void *m_d_prod; //!< @internal Vector of products (of an element in A and its corresponding (thats is
//! belongs to the same row) element in x; this is the input and output of
//! belongs to the same row) element in x; this is the input and output of
//! segmented scan
unsigned int *m_d_flags; //!< @internal Vector of flags where a flag is set if an element of A is the first element
//! of its row; this is the flags vector for segmented scan
unsigned int *m_d_rowFinalIndex; //!< @internal Vector of row end indices, which for each row specifies an index in A
//! which is the last element of that row. Resides in GPU memory.
//! which is the last element of that row. Resides in GPU memory.
unsigned int *m_d_rowIndex; //!< @internal Vector of row end indices, which for each row specifies an index in A
//! which is the first element of that row. Resides in GPU memory.
unsigned int *m_d_index; //!<@internal Vector of column numbers one for each element in A
void *m_d_A; //!<@internal The A matrix
//! which is the first element of that row. Resides in GPU memory.
unsigned int *m_d_index; //!<@internal Vector of column numbers one for each element in A
void *m_d_A; //!<@internal The A matrix
unsigned int *m_rowFinalIndex; //!< @internal Vector of row end indices, which for each row specifies an index in A
//! which is the last element of that row. Resides in CPU memory.
size_t m_numRows; //!< Number of rows

View File

@ -1,32 +1,32 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
/* CUda UTility Library */
@ -36,7 +36,7 @@
#include <cuda_runtime.h>
#ifdef _WIN32
# pragma warning( disable : 4996 ) // disable deprecated warning
# pragma warning( disable : 4996 ) // disable deprecated warning
#endif
#ifdef __cplusplus
@ -50,8 +50,8 @@ extern "C" {
# else
# define DLL_MAPPING __declspec(dllimport)
# endif
#else
# define DLL_MAPPING
#else
# define DLL_MAPPING
#endif
#ifdef _WIN32
@ -64,7 +64,7 @@ extern "C" {
////////////////////////////////////////////////////////////////////////////
//! CUT bool type
////////////////////////////////////////////////////////////////////////////
enum CUTBoolean
enum CUTBoolean
{
CUTFalse = 0,
CUTTrue = 1
@ -72,11 +72,11 @@ extern "C" {
////////////////////////////////////////////////////////////////////////////
//! Deallocate memory allocated within Cutil
//! @param pointer to memory
//! @param pointer to memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
void CUTIL_API
cutFree( void* ptr);
cutFree( void* ptr);
////////////////////////////////////////////////////////////////////////////
//! Helper for bank conflict checking (should only be used with the
@ -95,7 +95,7 @@ extern "C" {
DLL_MAPPING
void CUTIL_API
cutCheckBankAccess( unsigned int tidx, unsigned int tidy, unsigned int tidz,
unsigned int bdimx, unsigned int bdimy,
unsigned int bdimx, unsigned int bdimy,
unsigned int bdimz, const char* file, const int line,
const char* aname, const int index);
@ -141,8 +141,8 @@ extern "C" {
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutReadFilef( const char* filename, float** data, unsigned int* len,
CUTBoolean CUTIL_API
cutReadFilef( const char* filename, float** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
@ -157,8 +157,8 @@ extern "C" {
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutReadFiled( const char* filename, double** data, unsigned int* len,
CUTBoolean CUTIL_API
cutReadFiled( const char* filename, double** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
@ -173,7 +173,7 @@ extern "C" {
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
CUTBoolean CUTIL_API
cutReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
@ -183,13 +183,13 @@ extern "C" {
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutReadFileui( const char* filename, unsigned int** data,
CUTBoolean CUTIL_API
cutReadFileui( const char* filename, unsigned int** data,
unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
@ -199,13 +199,13 @@ extern "C" {
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutReadFileb( const char* filename, char** data, unsigned int* len,
CUTBoolean CUTIL_API
cutReadFileb( const char* filename, char** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
@ -220,12 +220,12 @@ extern "C" {
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutReadFileub( const char* filename, unsigned char** data,
CUTBoolean CUTIL_API
cutReadFileub( const char* filename, unsigned char** data,
unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing single precision floating point
//! Write a data file \filename containing single precision floating point
//! data
//! @return CUTTrue if writing the file succeeded, otherwise false
//! @param filename name of the file to write
@ -234,12 +234,12 @@ extern "C" {
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
CUTBoolean CUTIL_API
cutWriteFilef( const char* filename, const float* data, unsigned int len,
const float epsilon, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing double precision floating point
//! Write a data file \filename containing double precision floating point
//! data
//! @return CUTTrue if writing the file succeeded, otherwise false
//! @param filename name of the file to write
@ -248,7 +248,7 @@ extern "C" {
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
CUTBoolean CUTIL_API
cutWriteFiled( const char* filename, const float* data, unsigned int len,
const double epsilon, bool verbose = false);
@ -260,7 +260,7 @@ extern "C" {
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
CUTBoolean CUTIL_API
cutWriteFilei( const char* filename, const int* data, unsigned int len,
bool verbose = false);
@ -272,8 +272,8 @@ extern "C" {
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutWriteFileui( const char* filename,const unsigned int* data,
CUTBoolean CUTIL_API
cutWriteFileui( const char* filename,const unsigned int* data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
@ -284,8 +284,8 @@ extern "C" {
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutWriteFileb( const char* filename, const char* data, unsigned int len,
CUTBoolean CUTIL_API
cutWriteFileb( const char* filename, const char* data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
@ -296,7 +296,7 @@ extern "C" {
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
CUTBoolean CUTIL_API
cutWriteFileub( const char* filename,const unsigned char* data,
unsigned int len, bool verbose = false);
@ -307,7 +307,7 @@ extern "C" {
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
@ -326,11 +326,11 @@ extern "C" {
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutLoadPPMub( const char* file, unsigned char** data,
cutLoadPPMub( const char* file, unsigned char** data,
unsigned int *w,unsigned int *h);
////////////////////////////////////////////////////////////////////////////
//! Load PPM image file (with unsigned char as data element type), padding
//! Load PPM image file (with unsigned char as data element type), padding
//! 4th component
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param file name of the image file
@ -340,7 +340,7 @@ extern "C" {
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutLoadPPM4ub( const char* file, unsigned char** data,
cutLoadPPM4ub( const char* file, unsigned char** data,
unsigned int *w,unsigned int *h);
////////////////////////////////////////////////////////////////////////////
@ -350,13 +350,13 @@ extern "C" {
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutLoadPGMi( const char* file, unsigned int** data,
cutLoadPGMi( const char* file, unsigned int** data,
unsigned int* w, unsigned int* h);
////////////////////////////////////////////////////////////////////////////
@ -366,13 +366,13 @@ extern "C" {
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutLoadPGMs( const char* file, unsigned short** data,
cutLoadPGMs( const char* file, unsigned short** data,
unsigned int* w, unsigned int* h);
////////////////////////////////////////////////////////////////////////////
@ -381,7 +381,7 @@ extern "C" {
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
@ -399,7 +399,7 @@ extern "C" {
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutSavePGMub( const char* file, unsigned char* data,
cutSavePGMub( const char* file, unsigned char* data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
@ -411,11 +411,11 @@ extern "C" {
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutSavePPMub( const char* file, unsigned char *data,
cutSavePPMub( const char* file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Save PPM image file (with unsigned char as data element type, padded to
//! Save PPM image file (with unsigned char as data element type, padded to
//! 4 bytes)
//! @param file name of the image file
//! @param data handle to the data read
@ -424,7 +424,7 @@ extern "C" {
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutSavePPM4ub( const char* file, unsigned char *data,
cutSavePPM4ub( const char* file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
@ -465,15 +465,15 @@ extern "C" {
////////////////////////////////////////////////////////////////////////////
// Command line arguments: General notes
// * All command line arguments begin with '--' followed by the token;
// * All command line arguments begin with '--' followed by the token;
// token and value are separated by '='; example --samples=50
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
// (without whitespaces)
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
//! Check if command line argument \a flag-name is given
//! @return CUTTrue if command line argument \a flag_name has been given,
//! @return CUTTrue if command line argument \a flag_name has been given,
//! otherwise 0
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
@ -481,7 +481,7 @@ extern "C" {
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutCheckCmdLineFlag( const int argc, const char** argv,
cutCheckCmdLineFlag( const int argc, const char** argv,
const char* flag_name);
////////////////////////////////////////////////////////////////////////////
@ -495,7 +495,7 @@ extern "C" {
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutGetCmdLineArgumenti( const int argc, const char** argv,
cutGetCmdLineArgumenti( const int argc, const char** argv,
const char* arg_name, int* val);
////////////////////////////////////////////////////////////////////////////
@ -509,7 +509,7 @@ extern "C" {
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutGetCmdLineArgumentf( const int argc, const char** argv,
cutGetCmdLineArgumentf( const int argc, const char** argv,
const char* arg_name, float* val);
////////////////////////////////////////////////////////////////////////////
@ -523,7 +523,7 @@ extern "C" {
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutGetCmdLineArgumentstr( const int argc, const char** argv,
cutGetCmdLineArgumentstr( const int argc, const char** argv,
const char* arg_name, char** val);
////////////////////////////////////////////////////////////////////////////
@ -538,8 +538,8 @@ extern "C" {
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutGetCmdLineArgumentListstr( const int argc, const char** argv,
const char* arg_name, char** val,
cutGetCmdLineArgumentListstr( const int argc, const char** argv,
const char* arg_name, char** val,
unsigned int* len);
////////////////////////////////////////////////////////////////////////////
@ -556,46 +556,46 @@ extern "C" {
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays
//! @return CUTTrue if \a reference and \a data are identical,
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
CUTBoolean CUTIL_API
cutComparef( const float* reference, const float* data,
const unsigned int len);
////////////////////////////////////////////////////////////////////////////
//! Compare two integer arrays
//! @return CUTTrue if \a reference and \a data are identical,
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutComparei( const int* reference, const int* data,
const unsigned int len );
CUTBoolean CUTIL_API
cutComparei( const int* reference, const int* data,
const unsigned int len );
////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned char arrays
//! @return CUTTrue if \a reference and \a data are identical,
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
CUTBoolean CUTIL_API
cutCompareub( const unsigned char* reference, const unsigned char* data,
const unsigned int len );
const unsigned int len );
////////////////////////////////////////////////////////////////////////////////
//! Compare two integer arrays witha n epsilon tolerance for equality
//! @return CUTTrue if \a reference and \a data are identical,
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
@ -609,7 +609,7 @@ extern "C" {
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays with an epsilon tolerance for equality
//! @return CUTTrue if \a reference and \a data are identical,
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
@ -617,14 +617,14 @@ extern "C" {
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
CUTBoolean CUTIL_API
cutComparefe( const float* reference, const float* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays using L2-norm with an epsilon tolerance for
//! Compare two float arrays using L2-norm with an epsilon tolerance for
//! equality
//! @return CUTTrue if \a reference and \a data are identical,
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
@ -632,7 +632,7 @@ extern "C" {
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
CUTBoolean CUTIL_API
cutCompareL2fe( const float* reference, const float* data,
const unsigned int len, const float epsilon );
@ -645,7 +645,7 @@ extern "C" {
//! @param name of the new timer, 0 if the creation failed
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
CUTBoolean CUTIL_API
cutCreateTimer( unsigned int* name);
////////////////////////////////////////////////////////////////////////////
@ -654,7 +654,7 @@ extern "C" {
//! @param name of the timer to delete
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
CUTBoolean CUTIL_API
cutDeleteTimer( unsigned int name);
////////////////////////////////////////////////////////////////////////////
@ -662,7 +662,7 @@ extern "C" {
//! @param name name of the timer to start
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
CUTBoolean CUTIL_API
cutStartTimer( const unsigned int name);
////////////////////////////////////////////////////////////////////////////
@ -670,7 +670,7 @@ extern "C" {
//! @param name name of the timer to stop
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
CUTBoolean CUTIL_API
cutStopTimer( const unsigned int name);
////////////////////////////////////////////////////////////////////////////
@ -678,27 +678,27 @@ extern "C" {
//! @param name name of the timer to reset.
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
CUTBoolean CUTIL_API
cutResetTimer( const unsigned int name);
////////////////////////////////////////////////////////////////////////////
//! Returns total execution time in milliseconds for the timer over all
//! Returns total execution time in milliseconds for the timer over all
//! runs since the last reset or timer creation.
//! @param name name of the timer to return the time of
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
float CUTIL_API
float CUTIL_API
cutGetTimerValue( const unsigned int name);
////////////////////////////////////////////////////////////////////////////
//! Return the average time in milliseconds for timer execution as the
//! Return the average time in milliseconds for timer execution as the
//! total time for the timer dividied by the number of completed (stopped)
//! runs the timer has made.
//! Excludes the current running time if the timer is currently running.
//! @param name name of the timer to return the time of
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
float CUTIL_API
float CUTIL_API
cutGetAverageTimerValue( const unsigned int name);
////////////////////////////////////////////////////////////////////////////
@ -764,7 +764,7 @@ extern "C" {
fprintf(stderr, "Cut error in file '%s' in line %i.\n", \
__FILE__, __LINE__); \
exit(EXIT_FAILURE); \
}
}
//! Check for CUDA error
# define CUT_CHECK_ERROR(errorMessage) do { \
@ -802,7 +802,7 @@ extern "C" {
// void macros for performance reasons
# define CUT_CHECK_ERROR(errorMessage)
# define CUT_CHECK_ERROR_GL()
# define CUT_CONDITION( val)
# define CUT_CONDITION( val)
# define CU_SAFE_CALL_NO_SYNC( call) call
# define CU_SAFE_CALL( call) call
# define CUDA_SAFE_CALL_NO_SYNC( call) call

View File

@ -625,9 +625,9 @@ int UCL_Device::set_platform_accelerator(int pid) {
for (int n=0; n<_num_platforms; n++) {
set_platform(n);
for (int i=0; i<num_devices(); i++) {
if (_properties[i].device_type==CL_DEVICE_TYPE_CPU ||
_properties[i].device_type==CL_DEVICE_TYPE_GPU ||
_properties[i].device_type==CL_DEVICE_TYPE_ACCELERATOR) {
if ((_properties[i].device_type & CL_DEVICE_TYPE_CPU) ||
(_properties[i].device_type & CL_DEVICE_TYPE_GPU) ||
(_properties[i].device_type & CL_DEVICE_TYPE_ACCELERATOR)) {
found = 1;
break;
}

View File

@ -1,4 +1,4 @@
// **************************************************************************
// **************************************************************************
// preprocessor.cu
// -------------------
// W. Michael Brown (ORNL)

View File

@ -65,7 +65,7 @@ class Soft : public BaseAtomic<numtyp, acctyp> {
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kßernels
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types

View File

@ -308,7 +308,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
}
#define threebody(delr1x, delr1y, delr1z, eflag, energy) \
#define threebody(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z, eflag, energy) \
{ \
numtyp r1 = ucl_sqrt(rsq1); \
numtyp rinvsq1 = ucl_recip(rsq1); \
@ -361,7 +361,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
} \
}
#define threebody_half(delr1x, delr1y, delr1z) \
#define threebody_half(delr1x, delr1y, delr1z, delr2x, delr2y, delr2z) \
{ \
numtyp r1 = ucl_sqrt(rsq1); \
numtyp rinvsq1 = ucl_recip(rsq1); \
@ -511,7 +511,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
sw_costheta_ijk=sw3_ijkparam.z;
numtyp fjx, fjy, fjz, fkx, fky, fkz;
threebody(delr1x,delr1y,delr1z,eflag,energy);
threebody(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z,eflag,energy);
f.x -= fjx + fkx;
f.y -= fjy + fky;
@ -665,12 +665,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
sw_costheta_ijk=sw3_ijkparam.z;
numtyp fjx, fjy, fjz;
//if (evatom==0) {
threebody_half(delr1x,delr1y,delr1z);
//} else {
// numtyp fkx, fky, fkz;
// threebody(delr1x,delr1y,delr1z,eflag,energy);
//}
threebody_half(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z);
f.x += fjx;
f.y += fjy;
@ -819,7 +814,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
sw_costheta_ijk=sw3_ijkparam.z;
numtyp fjx, fjy, fjz, fkx, fky, fkz;
threebody(delr1x,delr1y,delr1z,eflag,energy);
threebody(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z,eflag,energy);
f.x += fjx;
f.y += fjy;

View File

@ -250,11 +250,10 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
&elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->dev_short_nbor, &_cutshortsq, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// re-allocate zetaij if necessary
int nall = this->_nall;

View File

@ -165,13 +165,10 @@ _texture( ts5_tex,int4);
#endif
__kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp *restrict cutsq,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const numtyp _cutshortsq,
const int inum, const int nbor_pitch,
const int t_per_atom) {
__local int n_stride;
@ -185,8 +182,6 @@ __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
@ -200,9 +195,6 @@ __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
@ -210,7 +202,7 @@ __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
if (rsq<_cutshortsq) {
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
@ -460,7 +452,8 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
// rsq<cutsq[ijparam]
if (rsq >= cutsq[ijparam]) continue;
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
@ -574,6 +567,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
delr1[1] = jx.y-ix.y;
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 >= cutsq[ijparam]) continue;
numtyp r1 = ucl_sqrt(rsq1);
numtyp r1inv = ucl_rsqrt(rsq1);
@ -715,7 +709,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__local int red_acc[2*BLOCK_PAIR];
__local int ijnum_shared[BLOCK_PAIR];
__syncthreads();
@ -749,7 +743,6 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delr1[3];
@ -796,21 +789,14 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
red_acc[2*m+0] = ijnum;
red_acc[2*m+1] = offset_k;
ijnum_shared[m] = ijnum;
break;
}
}
numtyp r1 = ucl_sqrt(rsq1);
numtyp r1inv = ucl_rsqrt(rsq1);
int offset_kf;
if (ijnum >= 0) {
offset_kf = offset_k;
} else {
ijnum = red_acc[2*m+0];
offset_kf = red_acc[2*m+1];
}
if (ijnum < 0) ijnum = ijnum_shared[m];
// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
@ -853,7 +839,6 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
delr2[2] = kx.z-jx.z;
numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];
if (rsq2 > cutsq[jikparam]) continue;
numtyp r2 = ucl_sqrt(rsq2);
numtyp r2inv = ucl_rsqrt(rsq2);
numtyp4 ts1_param, ts2_param, ts4_param;
@ -953,7 +938,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__local int red_acc[2*BLOCK_PAIR];
__local int red_acc[BLOCK_PAIR];
__syncthreads();
@ -987,7 +972,6 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delr1[3];
@ -1034,21 +1018,14 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
red_acc[2*m+0] = ijnum;
red_acc[2*m+1] = offset_k;
red_acc[m] = ijnum;
break;
}
}
numtyp r1 = ucl_sqrt(rsq1);
numtyp r1inv = ucl_rsqrt(rsq1);
int offset_kf;
if (ijnum >= 0) {
offset_kf = offset_k;
} else {
ijnum = red_acc[2*m+0];
offset_kf = red_acc[2*m+1];
}
if (ijnum < 0) ijnum = red_acc[m];
// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;

View File

@ -192,7 +192,7 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
_allocated=true;
this->_max_bytes=ts1.row_bytes()+ts2.row_bytes()+ts3.row_bytes()+
ts4.row_bytes()+cutsq.row_bytes()+
ts4.row_bytes()+ts5.row_bytes()+cutsq.row_bytes()+
map.row_bytes()+elem2param.row_bytes()+_zetaij.row_bytes();
return 0;
}
@ -250,11 +250,10 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
&elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->dev_short_nbor, &_cutshortsq, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// re-allocate zetaij if necessary
int nall = this->_nall;

View File

@ -165,13 +165,10 @@ _texture( ts5_tex,int4);
#endif
__kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp *restrict cutsq,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const numtyp _cutshortsq,
const int inum, const int nbor_pitch,
const int t_per_atom) {
__local int n_stride;
@ -185,8 +182,6 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
@ -200,9 +195,6 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
@ -210,7 +202,7 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
if (rsq<_cutshortsq) {
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
@ -461,7 +453,8 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
// rsq<cutsq[ijparam]
if (rsq >= cutsq[ijparam]) continue;
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
@ -578,6 +571,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
delr1[1] = jx.y-ix.y;
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 >= cutsq[ijparam]) continue;
numtyp r1 = ucl_sqrt(rsq1);
numtyp r1inv = ucl_rsqrt(rsq1);
@ -725,7 +719,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__local int red_acc[2*BLOCK_PAIR];
__local int ijnum_shared[BLOCK_PAIR];
__syncthreads();
@ -759,7 +753,6 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delr1[3];
@ -806,21 +799,14 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
red_acc[2*m+0] = ijnum;
red_acc[2*m+1] = offset_k;
ijnum_shared[m] = ijnum;
break;
}
}
numtyp r1 = ucl_sqrt(rsq1);
numtyp r1inv = ucl_rsqrt(rsq1);
int offset_kf;
if (ijnum >= 0) {
offset_kf = offset_k;
} else {
ijnum = red_acc[2*m+0];
offset_kf = red_acc[2*m+1];
}
if (ijnum < 0) ijnum = ijnum_shared[m];
// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
@ -863,7 +849,6 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
delr2[2] = kx.z-jx.z;
numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];
if (rsq2 > cutsq[jikparam]) continue;
numtyp r2 = ucl_sqrt(rsq2);
numtyp r2inv = ucl_rsqrt(rsq2);
numtyp4 ts1_param, ts2_param, ts4_param, ts5_param;
@ -892,6 +877,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@ -971,7 +957,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__local int red_acc[2*BLOCK_PAIR];
__local int ijnum_shared[BLOCK_PAIR];
__syncthreads();
@ -1005,7 +991,6 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delr1[3];
@ -1052,21 +1037,14 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
red_acc[2*m+0] = ijnum;
red_acc[2*m+1] = offset_k;
ijnum_shared[m] = ijnum;
break;
}
}
numtyp r1 = ucl_sqrt(rsq1);
numtyp r1inv = ucl_rsqrt(rsq1);
int offset_kf;
if (ijnum >= 0) {
offset_kf = offset_k;
} else {
ijnum = red_acc[2*m+0];
offset_kf = red_acc[2*m+1];
}
if (ijnum < 0) ijnum = ijnum_shared[m];
// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;

View File

@ -275,11 +275,10 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
&elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->dev_short_nbor, &_cutshortsq, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// re-allocate zetaij if necessary
int nall = this->_nall;

View File

@ -168,13 +168,10 @@ _texture( ts6_tex,int4);
#endif
__kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp *restrict cutsq,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const numtyp _cutshortsq,
const int inum, const int nbor_pitch,
const int t_per_atom) {
__local int n_stride;
@ -188,8 +185,6 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
@ -203,9 +198,6 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
@ -213,7 +205,7 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
if (rsq<_cutshortsq) {
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
@ -474,7 +466,8 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
// rsq<cutsq[ijparam]
if (rsq >= cutsq[ijparam]) continue;
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
@ -594,6 +587,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
delr1[1] = jx.y-ix.y;
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 >= cutsq[ijparam]) continue;
numtyp r1 = ucl_sqrt(rsq1);
numtyp r1inv = ucl_rsqrt(rsq1);
@ -735,7 +729,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__local int red_acc[2*BLOCK_PAIR];
__local int ijnum_shared[BLOCK_PAIR];
__syncthreads();
@ -769,7 +763,6 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delr1[3];
@ -816,21 +809,14 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
red_acc[2*m+0] = ijnum;
red_acc[2*m+1] = offset_k;
ijnum_shared[m] = ijnum;
break;
}
}
numtyp r1 = ucl_sqrt(rsq1);
numtyp r1inv = ucl_rsqrt(rsq1);
int offset_kf;
if (ijnum >= 0) {
offset_kf = offset_k;
} else {
ijnum = red_acc[2*m+0];
offset_kf = red_acc[2*m+1];
}
if (ijnum < 0) ijnum = ijnum_shared[m];
// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
@ -873,7 +859,6 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
delr2[2] = kx.z-jx.z;
numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];
if (rsq2 > cutsq[jikparam]) continue;
numtyp r2 = ucl_sqrt(rsq2);
numtyp r2inv = ucl_rsqrt(rsq2);
numtyp4 ts1_param, ts2_param, ts4_param;
@ -899,6 +884,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@ -972,7 +958,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__local int red_acc[2*BLOCK_PAIR];
__local int red_acc[BLOCK_PAIR];
__syncthreads();
@ -1006,7 +992,6 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delr1[3];
@ -1053,21 +1038,14 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
red_acc[2*m+0] = ijnum;
red_acc[2*m+1] = offset_k;
red_acc[m] = ijnum;
break;
}
}
numtyp r1 = ucl_sqrt(rsq1);
numtyp r1inv = ucl_rsqrt(rsq1);
int offset_kf;
if (ijnum >= 0) {
offset_kf = offset_k;
} else {
ijnum = red_acc[2*m+0];
offset_kf = red_acc[2*m+1];
}
if (ijnum < 0) ijnum = red_acc[m];
// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;

View File

@ -10,7 +10,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
begin :
email : pl.rodolfo@gmail.com
dekoning@ifi.unicamp.br
***************************************************************************/
@ -38,7 +38,7 @@ template <class numtyp, class acctyp>
UFMT::~UFM() {
clear();
}
template <class numtyp, class acctyp>
int UFMT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
@ -46,9 +46,9 @@ int UFMT::bytes_per_atom(const int max_nbors) const {
template <class numtyp, class acctyp>
int UFMT::init(const int ntypes,
double **host_cutsq, double **host_uf1,
double **host_uf2, double **host_uf3,
double **host_uf4, double **host_offset,
double **host_cutsq, double **host_uf1,
double **host_uf2, double **host_uf3,
double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
@ -78,11 +78,11 @@ int UFMT::init(const int ntypes,
uf1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,uf1,host_write,host_uf1,host_uf2,
host_cutsq);
host_cutsq);
uf3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,uf3,host_write,host_uf3,host_uf4,
host_offset);
this->atom->type_pack4(ntypes,lj_types,uf3,host_write,host_uf3,host_uf2,
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
@ -96,18 +96,17 @@ int UFMT::init(const int ntypes,
template <class numtyp, class acctyp>
void UFMT::reinit(const int ntypes, double **host_cutsq, double **host_uf1,
double **host_uf2, double **host_uf3,
double **host_uf4, double **host_offset) {
double **host_uf2, double **host_uf3, double **host_offset) {
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<_lj_types*_lj_types; i++)
host_write[i]=0.0;
this->atom->type_pack4(ntypes,_lj_types,uf1,host_write,host_uf1,host_uf2,
host_cutsq);
this->atom->type_pack4(ntypes,_lj_types,uf3,host_write,host_uf3,host_uf4,
this->atom->type_pack4(ntypes,_lj_types,uf3,host_write,host_uf3,host_uf2,
host_offset);
}
@ -145,7 +144,7 @@ void UFMT::loop(const bool _eflag, const bool _vflag) {
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
@ -157,12 +156,12 @@ void UFMT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->x, &uf1, &uf3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &uf1, &uf3, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
}

View File

@ -10,7 +10,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
begin :
email : pl.rodolfo@gmail.com
dekoning@ifi.unicamp.br
***************************************************************************/
@ -32,7 +32,7 @@ class UFM : public BaseAtomic<numtyp, acctyp> {
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
@ -41,16 +41,15 @@ class UFM : public BaseAtomic<numtyp, acctyp> {
* - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq,
double **host_uf1, double **host_uf2, double **host_uf3,
double **host_uf4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Send updated coeffs from host to device (to be compatible with fix adapt)
void reinit(const int ntypes, double **host_cutsq,
double **host_uf1, double **host_uf2, double **host_uf3,
double **host_uf4, double **host_offset);
void reinit(const int ntypes, double **host_cutsq, double **host_uf1,
double **host_uf2, double **host_uf3, double **host_offset);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
@ -65,7 +64,7 @@ class UFM : public BaseAtomic<numtyp, acctyp> {
/// uf1.x = uf1, uf1.y = uf2, uf1.z = cutsq
UCL_D_Vec<numtyp4> uf1;
/// uf3.x = uf3, uf3.y = uf4, uf3.z = offset
/// uf3.x = uf3, uf3.y = uf2, uf3.z = offset
UCL_D_Vec<numtyp4> uf3;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
@ -73,7 +72,7 @@ class UFM : public BaseAtomic<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
/// Number of atom types
int _lj_types;
private:

View File

@ -10,7 +10,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
begin :
email : pl.rodolfo@gmail.com
dekoning@ifi.unicamp.br
***************************************************************************/
@ -30,10 +30,10 @@ static UFM<PRECISION,ACC_PRECISION> UFMLMF;
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int ufml_gpu_init(const int ntypes, double **cutsq, double **host_uf1,
double **host_uf2, double **host_uf3, double **host_uf4,
double **offset, double *special_lj, const int inum, const int nall,
const int max_nbors, const int maxspecial, const double cell_size,
int &gpu_mode, FILE *screen) {
double **host_uf2, double **host_uf3, double **offset,
double *special_lj, const int inum, const int nall,
const int max_nbors, const int maxspecial, const double cell_size,
int &gpu_mode, FILE *screen) {
UFMLMF.clear();
gpu_mode=UFMLMF.device->gpu_mode();
double gpu_split=UFMLMF.device->particle_split();
@ -57,8 +57,8 @@ int ufml_gpu_init(const int ntypes, double **cutsq, double **host_uf1,
int init_ok=0;
if (world_me==0)
init_ok=UFMLMF.init(ntypes, cutsq, host_uf1, host_uf2, host_uf3,
host_uf4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
UFMLMF.device->world_barrier();
if (message)
@ -74,12 +74,12 @@ int ufml_gpu_init(const int ntypes, double **cutsq, double **host_uf1,
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=UFMLMF.init(ntypes, cutsq, host_uf1, host_uf2, host_uf3, host_uf4,
init_ok=UFMLMF.init(ntypes, cutsq, host_uf1, host_uf2, host_uf3,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen);
UFMLMF.device->gpu_barrier();
if (message)
if (message)
fprintf(screen,"Done.\n");
}
if (message)
@ -94,19 +94,18 @@ int ufml_gpu_init(const int ntypes, double **cutsq, double **host_uf1,
// Copy updated coeffs from host to device
// ---------------------------------------------------------------------------
void ufml_gpu_reinit(const int ntypes, double **cutsq, double **host_uf1,
double **host_uf2, double **host_uf3, double **host_uf4,
double **offset) {
double **host_uf2, double **host_uf3, double **offset) {
int world_me=UFMLMF.device->world_me();
int gpu_rank=UFMLMF.device->gpu_rank();
int procs_per_gpu=UFMLMF.device->procs_per_gpu();
if (world_me==0)
UFMLMF.reinit(ntypes, cutsq, host_uf1, host_uf2, host_uf3, host_uf4, offset);
UFMLMF.reinit(ntypes, cutsq, host_uf1, host_uf2, host_uf3, offset);
UFMLMF.device->world_barrier();
for (int i=0; i<procs_per_gpu; i++) {
if (gpu_rank==i && world_me!=0)
UFMLMF.reinit(ntypes, cutsq, host_uf1, host_uf2, host_uf3, host_uf4, offset);
UFMLMF.reinit(ntypes, cutsq, host_uf1, host_uf2, host_uf3, offset);
UFMLMF.device->gpu_barrier();
}
}
@ -125,8 +124,8 @@ int ** ufml_gpu_compute_n(const int ago, const int inum_full,
return UFMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
}
void ufml_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,

View File

@ -180,8 +180,8 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
ucl_copy(map,dview_map,false);
_allocated=true;
this->_max_bytes=param1.row_bytes()+param2.row_bytes()+param3.row_bytes()+param4.row_bytes()+param5.row_bytes()+
map.row_bytes()+elem2param.row_bytes();
this->_max_bytes=param1.row_bytes()+param2.row_bytes()+param3.row_bytes()+
param4.row_bytes()+param5.row_bytes()+map.row_bytes()+elem2param.row_bytes();
return 0;
}
@ -233,11 +233,10 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &param4, &map,
&elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->dev_short_nbor, &_cutshortsq, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1

View File

@ -137,13 +137,10 @@ _texture( param5_tex,int4);
#endif
__kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict param4,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const numtyp _cutshortsq,
const int inum, const int nbor_pitch,
const int t_per_atom) {
__local int n_stride;
@ -157,8 +154,6 @@ __kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
@ -172,9 +167,6 @@ __kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
@ -182,7 +174,7 @@ __kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<param4[ijparam].x) { //param4[ijparam].x = r0sq; //param4[ijparam].z=cutsq
if (rsq<_cutshortsq) {
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;