Merge branch 'master' into gpu-bugfix
This commit is contained in:
@ -3,13 +3,13 @@
|
||||
// -------------------------------------------------------------
|
||||
// $Revision: 3572$
|
||||
// $Date: 2010-11-23 13:04:43 -0700 (Tue, 23 Nov 2010) $
|
||||
// -------------------------------------------------------------
|
||||
// -------------------------------------------------------------
|
||||
// This source code is distributed under the terms of license.txt
|
||||
// in the root directory of this source distribution.
|
||||
// -------------------------------------------------------------
|
||||
// -------------------------------------------------------------
|
||||
#ifndef __CUDPP_PLAN_H__
|
||||
#define __CUDPP_PLAN_H__
|
||||
|
||||
|
||||
typedef void* KernelPointer;
|
||||
|
||||
extern "C" size_t getNumCTAs(KernelPointer kernel);
|
||||
@ -30,10 +30,10 @@ void computeNumCTAs(T kernel, unsigned int bytesDynamicSharedMem, size_t threads
|
||||
/** @brief Base class for CUDPP Plan data structures
|
||||
*
|
||||
* CUDPPPlan and its subclasses provide the internal (i.e. not visible to the
|
||||
* library user) infrastructure for planning algorithm execution. They
|
||||
* library user) infrastructure for planning algorithm execution. They
|
||||
* own intermediate storage for CUDPP algorithms as well as, in some cases,
|
||||
* information about optimal execution configuration for the present hardware.
|
||||
*
|
||||
*
|
||||
*/
|
||||
class CUDPPPlan
|
||||
{
|
||||
@ -91,7 +91,7 @@ public:
|
||||
|
||||
CUDPPScanPlan *m_scanPlan; //!< @internal Compact performs a scan of type unsigned int using this plan
|
||||
unsigned int* m_d_outputIndices; //!< @internal Output address of compacted elements; this is the result of scan
|
||||
|
||||
|
||||
};
|
||||
|
||||
class CUDPPRadixSortPlan : public CUDPPPlan
|
||||
@ -99,7 +99,7 @@ class CUDPPRadixSortPlan : public CUDPPPlan
|
||||
public:
|
||||
CUDPPRadixSortPlan(CUDPPConfiguration config, size_t numElements);
|
||||
virtual ~CUDPPRadixSortPlan();
|
||||
|
||||
|
||||
bool m_bKeysOnly;
|
||||
bool m_bManualCoalesce;
|
||||
bool m_bUsePersistentCTAs;
|
||||
@ -123,22 +123,22 @@ class CUDPPSparseMatrixVectorMultiplyPlan : public CUDPPPlan
|
||||
public:
|
||||
CUDPPSparseMatrixVectorMultiplyPlan(CUDPPConfiguration config, size_t numNZElts,
|
||||
const void *A,
|
||||
const unsigned int *rowindx,
|
||||
const unsigned int *rowindx,
|
||||
const unsigned int *indx, size_t numRows);
|
||||
virtual ~CUDPPSparseMatrixVectorMultiplyPlan();
|
||||
|
||||
CUDPPSegmentedScanPlan *m_segmentedScanPlan; //!< @internal Performs a segmented scan of type T using this plan
|
||||
void *m_d_prod; //!< @internal Vector of products (of an element in A and its corresponding (thats is
|
||||
//! belongs to the same row) element in x; this is the input and output of
|
||||
//! belongs to the same row) element in x; this is the input and output of
|
||||
//! segmented scan
|
||||
unsigned int *m_d_flags; //!< @internal Vector of flags where a flag is set if an element of A is the first element
|
||||
//! of its row; this is the flags vector for segmented scan
|
||||
unsigned int *m_d_rowFinalIndex; //!< @internal Vector of row end indices, which for each row specifies an index in A
|
||||
//! which is the last element of that row. Resides in GPU memory.
|
||||
//! which is the last element of that row. Resides in GPU memory.
|
||||
unsigned int *m_d_rowIndex; //!< @internal Vector of row end indices, which for each row specifies an index in A
|
||||
//! which is the first element of that row. Resides in GPU memory.
|
||||
unsigned int *m_d_index; //!<@internal Vector of column numbers one for each element in A
|
||||
void *m_d_A; //!<@internal The A matrix
|
||||
//! which is the first element of that row. Resides in GPU memory.
|
||||
unsigned int *m_d_index; //!<@internal Vector of column numbers one for each element in A
|
||||
void *m_d_A; //!<@internal The A matrix
|
||||
unsigned int *m_rowFinalIndex; //!< @internal Vector of row end indices, which for each row specifies an index in A
|
||||
//! which is the last element of that row. Resides in CPU memory.
|
||||
size_t m_numRows; //!< Number of rows
|
||||
|
||||
@ -1,32 +1,32 @@
|
||||
/*
|
||||
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO USER:
|
||||
* NOTICE TO USER:
|
||||
*
|
||||
* This source code is subject to NVIDIA ownership rights under U.S. and
|
||||
* international Copyright laws.
|
||||
* This source code is subject to NVIDIA ownership rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
|
||||
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
|
||||
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
|
||||
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
|
||||
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
|
||||
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
|
||||
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
|
||||
* OR PERFORMANCE OF THIS SOURCE CODE.
|
||||
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
|
||||
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
|
||||
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
|
||||
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
|
||||
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
|
||||
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
|
||||
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
|
||||
* OR PERFORMANCE OF THIS SOURCE CODE.
|
||||
*
|
||||
* U.S. Government End Users. This source code is a "commercial item" as
|
||||
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
|
||||
* "commercial computer software" and "commercial computer software
|
||||
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
|
||||
* and is provided to the U.S. Government only as a commercial end item.
|
||||
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
|
||||
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
|
||||
* U.S. Government End Users. This source code is a "commercial item" as
|
||||
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
|
||||
* "commercial computer software" and "commercial computer software
|
||||
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
|
||||
* and is provided to the U.S. Government only as a commercial end item.
|
||||
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
|
||||
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
|
||||
* source code with only those rights set forth herein.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
/* CUda UTility Library */
|
||||
|
||||
@ -36,7 +36,7 @@
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
# pragma warning( disable : 4996 ) // disable deprecated warning
|
||||
# pragma warning( disable : 4996 ) // disable deprecated warning
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
@ -50,8 +50,8 @@ extern "C" {
|
||||
# else
|
||||
# define DLL_MAPPING __declspec(dllimport)
|
||||
# endif
|
||||
#else
|
||||
# define DLL_MAPPING
|
||||
#else
|
||||
# define DLL_MAPPING
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
@ -64,7 +64,7 @@ extern "C" {
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! CUT bool type
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
enum CUTBoolean
|
||||
enum CUTBoolean
|
||||
{
|
||||
CUTFalse = 0,
|
||||
CUTTrue = 1
|
||||
@ -72,11 +72,11 @@ extern "C" {
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Deallocate memory allocated within Cutil
|
||||
//! @param pointer to memory
|
||||
//! @param pointer to memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
void CUTIL_API
|
||||
cutFree( void* ptr);
|
||||
cutFree( void* ptr);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Helper for bank conflict checking (should only be used with the
|
||||
@ -95,7 +95,7 @@ extern "C" {
|
||||
DLL_MAPPING
|
||||
void CUTIL_API
|
||||
cutCheckBankAccess( unsigned int tidx, unsigned int tidy, unsigned int tidz,
|
||||
unsigned int bdimx, unsigned int bdimy,
|
||||
unsigned int bdimx, unsigned int bdimy,
|
||||
unsigned int bdimz, const char* file, const int line,
|
||||
const char* aname, const int index);
|
||||
|
||||
@ -141,8 +141,8 @@ extern "C" {
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutReadFilef( const char* filename, float** data, unsigned int* len,
|
||||
CUTBoolean CUTIL_API
|
||||
cutReadFilef( const char* filename, float** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -157,8 +157,8 @@ extern "C" {
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutReadFiled( const char* filename, double** data, unsigned int* len,
|
||||
CUTBoolean CUTIL_API
|
||||
cutReadFiled( const char* filename, double** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -173,7 +173,7 @@ extern "C" {
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
CUTBoolean CUTIL_API
|
||||
cutReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -183,13 +183,13 @@ extern "C" {
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutReadFileui( const char* filename, unsigned int** data,
|
||||
CUTBoolean CUTIL_API
|
||||
cutReadFileui( const char* filename, unsigned int** data,
|
||||
unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -199,13 +199,13 @@ extern "C" {
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutReadFileb( const char* filename, char** data, unsigned int* len,
|
||||
CUTBoolean CUTIL_API
|
||||
cutReadFileb( const char* filename, char** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -220,12 +220,12 @@ extern "C" {
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutReadFileub( const char* filename, unsigned char** data,
|
||||
CUTBoolean CUTIL_API
|
||||
cutReadFileub( const char* filename, unsigned char** data,
|
||||
unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing single precision floating point
|
||||
//! Write a data file \filename containing single precision floating point
|
||||
//! data
|
||||
//! @return CUTTrue if writing the file succeeded, otherwise false
|
||||
//! @param filename name of the file to write
|
||||
@ -234,12 +234,12 @@ extern "C" {
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
CUTBoolean CUTIL_API
|
||||
cutWriteFilef( const char* filename, const float* data, unsigned int len,
|
||||
const float epsilon, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing double precision floating point
|
||||
//! Write a data file \filename containing double precision floating point
|
||||
//! data
|
||||
//! @return CUTTrue if writing the file succeeded, otherwise false
|
||||
//! @param filename name of the file to write
|
||||
@ -248,7 +248,7 @@ extern "C" {
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
CUTBoolean CUTIL_API
|
||||
cutWriteFiled( const char* filename, const float* data, unsigned int len,
|
||||
const double epsilon, bool verbose = false);
|
||||
|
||||
@ -260,7 +260,7 @@ extern "C" {
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
CUTBoolean CUTIL_API
|
||||
cutWriteFilei( const char* filename, const int* data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
@ -272,8 +272,8 @@ extern "C" {
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutWriteFileui( const char* filename,const unsigned int* data,
|
||||
CUTBoolean CUTIL_API
|
||||
cutWriteFileui( const char* filename,const unsigned int* data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -284,8 +284,8 @@ extern "C" {
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutWriteFileb( const char* filename, const char* data, unsigned int len,
|
||||
CUTBoolean CUTIL_API
|
||||
cutWriteFileb( const char* filename, const char* data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -296,7 +296,7 @@ extern "C" {
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
CUTBoolean CUTIL_API
|
||||
cutWriteFileub( const char* filename,const unsigned char* data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
@ -307,7 +307,7 @@ extern "C" {
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -326,11 +326,11 @@ extern "C" {
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutLoadPPMub( const char* file, unsigned char** data,
|
||||
cutLoadPPMub( const char* file, unsigned char** data,
|
||||
unsigned int *w,unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PPM image file (with unsigned char as data element type), padding
|
||||
//! Load PPM image file (with unsigned char as data element type), padding
|
||||
//! 4th component
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param file name of the image file
|
||||
@ -340,7 +340,7 @@ extern "C" {
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutLoadPPM4ub( const char* file, unsigned char** data,
|
||||
cutLoadPPM4ub( const char* file, unsigned char** data,
|
||||
unsigned int *w,unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -350,13 +350,13 @@ extern "C" {
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutLoadPGMi( const char* file, unsigned int** data,
|
||||
cutLoadPGMi( const char* file, unsigned int** data,
|
||||
unsigned int* w, unsigned int* h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -366,13 +366,13 @@ extern "C" {
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutLoadPGMs( const char* file, unsigned short** data,
|
||||
cutLoadPGMs( const char* file, unsigned short** data,
|
||||
unsigned int* w, unsigned int* h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -381,7 +381,7 @@ extern "C" {
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -399,7 +399,7 @@ extern "C" {
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutSavePGMub( const char* file, unsigned char* data,
|
||||
cutSavePGMub( const char* file, unsigned char* data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -411,11 +411,11 @@ extern "C" {
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutSavePPMub( const char* file, unsigned char *data,
|
||||
cutSavePPMub( const char* file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PPM image file (with unsigned char as data element type, padded to
|
||||
//! Save PPM image file (with unsigned char as data element type, padded to
|
||||
//! 4 bytes)
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
@ -424,7 +424,7 @@ extern "C" {
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutSavePPM4ub( const char* file, unsigned char *data,
|
||||
cutSavePPM4ub( const char* file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -465,15 +465,15 @@ extern "C" {
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Command line arguments: General notes
|
||||
// * All command line arguments begin with '--' followed by the token;
|
||||
// * All command line arguments begin with '--' followed by the token;
|
||||
// token and value are separated by '='; example --samples=50
|
||||
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||
// (without whitespaces)
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Check if command line argument \a flag-name is given
|
||||
//! @return CUTTrue if command line argument \a flag_name has been given,
|
||||
//! @return CUTTrue if command line argument \a flag_name has been given,
|
||||
//! otherwise 0
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
@ -481,7 +481,7 @@ extern "C" {
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutCheckCmdLineFlag( const int argc, const char** argv,
|
||||
cutCheckCmdLineFlag( const int argc, const char** argv,
|
||||
const char* flag_name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -495,7 +495,7 @@ extern "C" {
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutGetCmdLineArgumenti( const int argc, const char** argv,
|
||||
cutGetCmdLineArgumenti( const int argc, const char** argv,
|
||||
const char* arg_name, int* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -509,7 +509,7 @@ extern "C" {
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutGetCmdLineArgumentf( const int argc, const char** argv,
|
||||
cutGetCmdLineArgumentf( const int argc, const char** argv,
|
||||
const char* arg_name, float* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -523,7 +523,7 @@ extern "C" {
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutGetCmdLineArgumentstr( const int argc, const char** argv,
|
||||
cutGetCmdLineArgumentstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -538,8 +538,8 @@ extern "C" {
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutGetCmdLineArgumentListstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val,
|
||||
cutGetCmdLineArgumentListstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val,
|
||||
unsigned int* len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -556,46 +556,46 @@ extern "C" {
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! otherwise CUTFalse
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
CUTBoolean CUTIL_API
|
||||
cutComparef( const float* reference, const float* data,
|
||||
const unsigned int len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two integer arrays
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! otherwise CUTFalse
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutComparei( const int* reference, const int* data,
|
||||
const unsigned int len );
|
||||
CUTBoolean CUTIL_API
|
||||
cutComparei( const int* reference, const int* data,
|
||||
const unsigned int len );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! otherwise CUTFalse
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
CUTBoolean CUTIL_API
|
||||
cutCompareub( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len );
|
||||
const unsigned int len );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two integer arrays witha n epsilon tolerance for equality
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! otherwise CUTFalse
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
@ -609,7 +609,7 @@ extern "C" {
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays with an epsilon tolerance for equality
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! otherwise CUTFalse
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
@ -617,14 +617,14 @@ extern "C" {
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
CUTBoolean CUTIL_API
|
||||
cutComparefe( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays using L2-norm with an epsilon tolerance for
|
||||
//! Compare two float arrays using L2-norm with an epsilon tolerance for
|
||||
//! equality
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! otherwise CUTFalse
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
@ -632,7 +632,7 @@ extern "C" {
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
CUTBoolean CUTIL_API
|
||||
cutCompareL2fe( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
@ -645,7 +645,7 @@ extern "C" {
|
||||
//! @param name of the new timer, 0 if the creation failed
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
CUTBoolean CUTIL_API
|
||||
cutCreateTimer( unsigned int* name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -654,7 +654,7 @@ extern "C" {
|
||||
//! @param name of the timer to delete
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
CUTBoolean CUTIL_API
|
||||
cutDeleteTimer( unsigned int name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -662,7 +662,7 @@ extern "C" {
|
||||
//! @param name name of the timer to start
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
CUTBoolean CUTIL_API
|
||||
cutStartTimer( const unsigned int name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -670,7 +670,7 @@ extern "C" {
|
||||
//! @param name name of the timer to stop
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
CUTBoolean CUTIL_API
|
||||
cutStopTimer( const unsigned int name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -678,27 +678,27 @@ extern "C" {
|
||||
//! @param name name of the timer to reset.
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
CUTBoolean CUTIL_API
|
||||
cutResetTimer( const unsigned int name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Returns total execution time in milliseconds for the timer over all
|
||||
//! Returns total execution time in milliseconds for the timer over all
|
||||
//! runs since the last reset or timer creation.
|
||||
//! @param name name of the timer to return the time of
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
float CUTIL_API
|
||||
float CUTIL_API
|
||||
cutGetTimerValue( const unsigned int name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Return the average time in milliseconds for timer execution as the
|
||||
//! Return the average time in milliseconds for timer execution as the
|
||||
//! total time for the timer dividied by the number of completed (stopped)
|
||||
//! runs the timer has made.
|
||||
//! Excludes the current running time if the timer is currently running.
|
||||
//! @param name name of the timer to return the time of
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
float CUTIL_API
|
||||
float CUTIL_API
|
||||
cutGetAverageTimerValue( const unsigned int name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
@ -764,7 +764,7 @@ extern "C" {
|
||||
fprintf(stderr, "Cut error in file '%s' in line %i.\n", \
|
||||
__FILE__, __LINE__); \
|
||||
exit(EXIT_FAILURE); \
|
||||
}
|
||||
}
|
||||
|
||||
//! Check for CUDA error
|
||||
# define CUT_CHECK_ERROR(errorMessage) do { \
|
||||
@ -802,7 +802,7 @@ extern "C" {
|
||||
// void macros for performance reasons
|
||||
# define CUT_CHECK_ERROR(errorMessage)
|
||||
# define CUT_CHECK_ERROR_GL()
|
||||
# define CUT_CONDITION( val)
|
||||
# define CUT_CONDITION( val)
|
||||
# define CU_SAFE_CALL_NO_SYNC( call) call
|
||||
# define CU_SAFE_CALL( call) call
|
||||
# define CUDA_SAFE_CALL_NO_SYNC( call) call
|
||||
|
||||
@ -625,9 +625,9 @@ int UCL_Device::set_platform_accelerator(int pid) {
|
||||
for (int n=0; n<_num_platforms; n++) {
|
||||
set_platform(n);
|
||||
for (int i=0; i<num_devices(); i++) {
|
||||
if (_properties[i].device_type==CL_DEVICE_TYPE_CPU ||
|
||||
_properties[i].device_type==CL_DEVICE_TYPE_GPU ||
|
||||
_properties[i].device_type==CL_DEVICE_TYPE_ACCELERATOR) {
|
||||
if ((_properties[i].device_type & CL_DEVICE_TYPE_CPU) ||
|
||||
(_properties[i].device_type & CL_DEVICE_TYPE_GPU) ||
|
||||
(_properties[i].device_type & CL_DEVICE_TYPE_ACCELERATOR)) {
|
||||
found = 1;
|
||||
break;
|
||||
}
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// **************************************************************************
|
||||
// **************************************************************************
|
||||
// preprocessor.cu
|
||||
// -------------------
|
||||
// W. Michael Brown (ORNL)
|
||||
|
||||
@ -65,7 +65,7 @@ class Soft : public BaseAtomic<numtyp, acctyp> {
|
||||
/// Special LJ values
|
||||
UCL_D_Vec<numtyp> sp_lj;
|
||||
|
||||
/// If atom type constants fit in shared memory, use fast kßernels
|
||||
/// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
/// Number of atom types
|
||||
|
||||
@ -308,7 +308,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
|
||||
|
||||
}
|
||||
|
||||
#define threebody(delr1x, delr1y, delr1z, eflag, energy) \
|
||||
#define threebody(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z, eflag, energy) \
|
||||
{ \
|
||||
numtyp r1 = ucl_sqrt(rsq1); \
|
||||
numtyp rinvsq1 = ucl_recip(rsq1); \
|
||||
@ -361,7 +361,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
|
||||
} \
|
||||
}
|
||||
|
||||
#define threebody_half(delr1x, delr1y, delr1z) \
|
||||
#define threebody_half(delr1x, delr1y, delr1z, delr2x, delr2y, delr2z) \
|
||||
{ \
|
||||
numtyp r1 = ucl_sqrt(rsq1); \
|
||||
numtyp rinvsq1 = ucl_recip(rsq1); \
|
||||
@ -511,7 +511,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
||||
sw_costheta_ijk=sw3_ijkparam.z;
|
||||
|
||||
numtyp fjx, fjy, fjz, fkx, fky, fkz;
|
||||
threebody(delr1x,delr1y,delr1z,eflag,energy);
|
||||
threebody(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z,eflag,energy);
|
||||
|
||||
f.x -= fjx + fkx;
|
||||
f.y -= fjy + fky;
|
||||
@ -665,12 +665,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
||||
sw_costheta_ijk=sw3_ijkparam.z;
|
||||
|
||||
numtyp fjx, fjy, fjz;
|
||||
//if (evatom==0) {
|
||||
threebody_half(delr1x,delr1y,delr1z);
|
||||
//} else {
|
||||
// numtyp fkx, fky, fkz;
|
||||
// threebody(delr1x,delr1y,delr1z,eflag,energy);
|
||||
//}
|
||||
threebody_half(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z);
|
||||
|
||||
f.x += fjx;
|
||||
f.y += fjy;
|
||||
@ -819,7 +814,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
sw_costheta_ijk=sw3_ijkparam.z;
|
||||
|
||||
numtyp fjx, fjy, fjz, fkx, fky, fkz;
|
||||
threebody(delr1x,delr1y,delr1z,eflag,energy);
|
||||
threebody(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z,eflag,energy);
|
||||
|
||||
f.x += fjx;
|
||||
f.y += fjy;
|
||||
|
||||
@ -250,11 +250,10 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
|
||||
&elem2param, &_nelements, &_nparams,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &_cutshortsq, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
int nall = this->_nall;
|
||||
|
||||
@ -165,13 +165,10 @@ _texture( ts5_tex,int4);
|
||||
#endif
|
||||
|
||||
__kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict cutsq,
|
||||
const __global int *restrict map,
|
||||
const __global int *restrict elem2param,
|
||||
const int nelements, const int nparams,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global int * dev_short_nbor,
|
||||
const numtyp _cutshortsq,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
__local int n_stride;
|
||||
@ -185,8 +182,6 @@ __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
int ncount = 0;
|
||||
int m = nbor;
|
||||
@ -200,9 +195,6 @@ __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
@ -210,7 +202,7 @@ __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<cutsq[ijparam]) {
|
||||
if (rsq<_cutshortsq) {
|
||||
dev_short_nbor[nbor_short] = nj;
|
||||
nbor_short += n_stride;
|
||||
ncount++;
|
||||
@ -460,7 +452,8 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
// rsq<cutsq[ijparam]
|
||||
if (rsq >= cutsq[ijparam]) continue;
|
||||
|
||||
numtyp feng[2];
|
||||
numtyp ijparam_lam1 = ts1[ijparam].x;
|
||||
numtyp4 ts2_ijparam = ts2[ijparam];
|
||||
@ -574,6 +567,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
|
||||
delr1[1] = jx.y-ix.y;
|
||||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
if (rsq1 >= cutsq[ijparam]) continue;
|
||||
|
||||
numtyp r1 = ucl_sqrt(rsq1);
|
||||
numtyp r1inv = ucl_rsqrt(rsq1);
|
||||
@ -715,7 +709,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
__local int red_acc[2*BLOCK_PAIR];
|
||||
__local int ijnum_shared[BLOCK_PAIR];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
@ -749,7 +743,6 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delr1[3];
|
||||
@ -796,21 +789,14 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
||||
k &= NEIGHMASK;
|
||||
if (k == i) {
|
||||
ijnum = nbor_k;
|
||||
red_acc[2*m+0] = ijnum;
|
||||
red_acc[2*m+1] = offset_k;
|
||||
ijnum_shared[m] = ijnum;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
numtyp r1 = ucl_sqrt(rsq1);
|
||||
numtyp r1inv = ucl_rsqrt(rsq1);
|
||||
int offset_kf;
|
||||
if (ijnum >= 0) {
|
||||
offset_kf = offset_k;
|
||||
} else {
|
||||
ijnum = red_acc[2*m+0];
|
||||
offset_kf = red_acc[2*m+1];
|
||||
}
|
||||
if (ijnum < 0) ijnum = ijnum_shared[m];
|
||||
|
||||
// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||
int idx = ijnum;
|
||||
@ -853,7 +839,6 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
||||
delr2[2] = kx.z-jx.z;
|
||||
numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];
|
||||
|
||||
if (rsq2 > cutsq[jikparam]) continue;
|
||||
numtyp r2 = ucl_sqrt(rsq2);
|
||||
numtyp r2inv = ucl_rsqrt(rsq2);
|
||||
numtyp4 ts1_param, ts2_param, ts4_param;
|
||||
@ -953,7 +938,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
__local int red_acc[2*BLOCK_PAIR];
|
||||
__local int red_acc[BLOCK_PAIR];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
@ -987,7 +972,6 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delr1[3];
|
||||
@ -1034,21 +1018,14 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
k &= NEIGHMASK;
|
||||
if (k == i) {
|
||||
ijnum = nbor_k;
|
||||
red_acc[2*m+0] = ijnum;
|
||||
red_acc[2*m+1] = offset_k;
|
||||
red_acc[m] = ijnum;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
numtyp r1 = ucl_sqrt(rsq1);
|
||||
numtyp r1inv = ucl_rsqrt(rsq1);
|
||||
int offset_kf;
|
||||
if (ijnum >= 0) {
|
||||
offset_kf = offset_k;
|
||||
} else {
|
||||
ijnum = red_acc[2*m+0];
|
||||
offset_kf = red_acc[2*m+1];
|
||||
}
|
||||
if (ijnum < 0) ijnum = red_acc[m];
|
||||
|
||||
// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||
int idx = ijnum;
|
||||
|
||||
@ -192,7 +192,7 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
|
||||
|
||||
_allocated=true;
|
||||
this->_max_bytes=ts1.row_bytes()+ts2.row_bytes()+ts3.row_bytes()+
|
||||
ts4.row_bytes()+cutsq.row_bytes()+
|
||||
ts4.row_bytes()+ts5.row_bytes()+cutsq.row_bytes()+
|
||||
map.row_bytes()+elem2param.row_bytes()+_zetaij.row_bytes();
|
||||
return 0;
|
||||
}
|
||||
@ -250,11 +250,10 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
|
||||
&elem2param, &_nelements, &_nparams,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &_cutshortsq, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
int nall = this->_nall;
|
||||
|
||||
@ -165,13 +165,10 @@ _texture( ts5_tex,int4);
|
||||
#endif
|
||||
|
||||
__kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict cutsq,
|
||||
const __global int *restrict map,
|
||||
const __global int *restrict elem2param,
|
||||
const int nelements, const int nparams,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global int * dev_short_nbor,
|
||||
const numtyp _cutshortsq,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
__local int n_stride;
|
||||
@ -185,8 +182,6 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
int ncount = 0;
|
||||
int m = nbor;
|
||||
@ -200,9 +195,6 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
@ -210,7 +202,7 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<cutsq[ijparam]) {
|
||||
if (rsq<_cutshortsq) {
|
||||
dev_short_nbor[nbor_short] = nj;
|
||||
nbor_short += n_stride;
|
||||
ncount++;
|
||||
@ -461,7 +453,8 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
// rsq<cutsq[ijparam]
|
||||
if (rsq >= cutsq[ijparam]) continue;
|
||||
|
||||
numtyp feng[2];
|
||||
numtyp ijparam_lam1 = ts1[ijparam].x;
|
||||
numtyp4 ts2_ijparam = ts2[ijparam];
|
||||
@ -578,6 +571,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
|
||||
delr1[1] = jx.y-ix.y;
|
||||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
if (rsq1 >= cutsq[ijparam]) continue;
|
||||
|
||||
numtyp r1 = ucl_sqrt(rsq1);
|
||||
numtyp r1inv = ucl_rsqrt(rsq1);
|
||||
@ -725,7 +719,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
__local int red_acc[2*BLOCK_PAIR];
|
||||
__local int ijnum_shared[BLOCK_PAIR];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
@ -759,7 +753,6 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delr1[3];
|
||||
@ -806,21 +799,14 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
k &= NEIGHMASK;
|
||||
if (k == i) {
|
||||
ijnum = nbor_k;
|
||||
red_acc[2*m+0] = ijnum;
|
||||
red_acc[2*m+1] = offset_k;
|
||||
ijnum_shared[m] = ijnum;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
numtyp r1 = ucl_sqrt(rsq1);
|
||||
numtyp r1inv = ucl_rsqrt(rsq1);
|
||||
int offset_kf;
|
||||
if (ijnum >= 0) {
|
||||
offset_kf = offset_k;
|
||||
} else {
|
||||
ijnum = red_acc[2*m+0];
|
||||
offset_kf = red_acc[2*m+1];
|
||||
}
|
||||
if (ijnum < 0) ijnum = ijnum_shared[m];
|
||||
|
||||
// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||
int idx = ijnum;
|
||||
@ -863,7 +849,6 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
delr2[2] = kx.z-jx.z;
|
||||
numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];
|
||||
|
||||
if (rsq2 > cutsq[jikparam]) continue;
|
||||
numtyp r2 = ucl_sqrt(rsq2);
|
||||
numtyp r2inv = ucl_rsqrt(rsq2);
|
||||
numtyp4 ts1_param, ts2_param, ts4_param, ts5_param;
|
||||
@ -892,6 +877,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
||||
int idx = nbor_k;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
|
||||
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
||||
numtyp prefactor_jk = zeta_jk.y;
|
||||
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
|
||||
@ -971,7 +957,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
__local int red_acc[2*BLOCK_PAIR];
|
||||
__local int ijnum_shared[BLOCK_PAIR];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
@ -1005,7 +991,6 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delr1[3];
|
||||
@ -1052,21 +1037,14 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
k &= NEIGHMASK;
|
||||
if (k == i) {
|
||||
ijnum = nbor_k;
|
||||
red_acc[2*m+0] = ijnum;
|
||||
red_acc[2*m+1] = offset_k;
|
||||
ijnum_shared[m] = ijnum;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
numtyp r1 = ucl_sqrt(rsq1);
|
||||
numtyp r1inv = ucl_rsqrt(rsq1);
|
||||
int offset_kf;
|
||||
if (ijnum >= 0) {
|
||||
offset_kf = offset_k;
|
||||
} else {
|
||||
ijnum = red_acc[2*m+0];
|
||||
offset_kf = red_acc[2*m+1];
|
||||
}
|
||||
if (ijnum < 0) ijnum = ijnum_shared[m];
|
||||
|
||||
// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||
int idx = ijnum;
|
||||
|
||||
@ -275,11 +275,10 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
|
||||
&elem2param, &_nelements, &_nparams,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &_cutshortsq, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
int nall = this->_nall;
|
||||
|
||||
@ -168,13 +168,10 @@ _texture( ts6_tex,int4);
|
||||
#endif
|
||||
|
||||
__kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict cutsq,
|
||||
const __global int *restrict map,
|
||||
const __global int *restrict elem2param,
|
||||
const int nelements, const int nparams,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global int * dev_short_nbor,
|
||||
const numtyp _cutshortsq,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
__local int n_stride;
|
||||
@ -188,8 +185,6 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
int ncount = 0;
|
||||
int m = nbor;
|
||||
@ -203,9 +198,6 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
@ -213,7 +205,7 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<cutsq[ijparam]) {
|
||||
if (rsq<_cutshortsq) {
|
||||
dev_short_nbor[nbor_short] = nj;
|
||||
nbor_short += n_stride;
|
||||
ncount++;
|
||||
@ -474,7 +466,8 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
// rsq<cutsq[ijparam]
|
||||
if (rsq >= cutsq[ijparam]) continue;
|
||||
|
||||
numtyp feng[2];
|
||||
numtyp ijparam_lam1 = ts1[ijparam].x;
|
||||
numtyp4 ts2_ijparam = ts2[ijparam];
|
||||
@ -594,6 +587,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
|
||||
delr1[1] = jx.y-ix.y;
|
||||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
if (rsq1 >= cutsq[ijparam]) continue;
|
||||
|
||||
numtyp r1 = ucl_sqrt(rsq1);
|
||||
numtyp r1inv = ucl_rsqrt(rsq1);
|
||||
@ -735,7 +729,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
__local int red_acc[2*BLOCK_PAIR];
|
||||
__local int ijnum_shared[BLOCK_PAIR];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
@ -769,7 +763,6 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delr1[3];
|
||||
@ -816,21 +809,14 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
k &= NEIGHMASK;
|
||||
if (k == i) {
|
||||
ijnum = nbor_k;
|
||||
red_acc[2*m+0] = ijnum;
|
||||
red_acc[2*m+1] = offset_k;
|
||||
ijnum_shared[m] = ijnum;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
numtyp r1 = ucl_sqrt(rsq1);
|
||||
numtyp r1inv = ucl_rsqrt(rsq1);
|
||||
int offset_kf;
|
||||
if (ijnum >= 0) {
|
||||
offset_kf = offset_k;
|
||||
} else {
|
||||
ijnum = red_acc[2*m+0];
|
||||
offset_kf = red_acc[2*m+1];
|
||||
}
|
||||
if (ijnum < 0) ijnum = ijnum_shared[m];
|
||||
|
||||
// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||
int idx = ijnum;
|
||||
@ -873,7 +859,6 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
delr2[2] = kx.z-jx.z;
|
||||
numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];
|
||||
|
||||
if (rsq2 > cutsq[jikparam]) continue;
|
||||
numtyp r2 = ucl_sqrt(rsq2);
|
||||
numtyp r2inv = ucl_rsqrt(rsq2);
|
||||
numtyp4 ts1_param, ts2_param, ts4_param;
|
||||
@ -899,6 +884,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
||||
int idx = nbor_k;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
|
||||
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
||||
numtyp prefactor_jk = zeta_jk.y;
|
||||
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
|
||||
@ -972,7 +958,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
__local int red_acc[2*BLOCK_PAIR];
|
||||
__local int red_acc[BLOCK_PAIR];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
@ -1006,7 +992,6 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delr1[3];
|
||||
@ -1053,21 +1038,14 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
k &= NEIGHMASK;
|
||||
if (k == i) {
|
||||
ijnum = nbor_k;
|
||||
red_acc[2*m+0] = ijnum;
|
||||
red_acc[2*m+1] = offset_k;
|
||||
red_acc[m] = ijnum;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
numtyp r1 = ucl_sqrt(rsq1);
|
||||
numtyp r1inv = ucl_rsqrt(rsq1);
|
||||
int offset_kf;
|
||||
if (ijnum >= 0) {
|
||||
offset_kf = offset_k;
|
||||
} else {
|
||||
ijnum = red_acc[2*m+0];
|
||||
offset_kf = red_acc[2*m+1];
|
||||
}
|
||||
if (ijnum < 0) ijnum = red_acc[m];
|
||||
|
||||
// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||
int idx = ijnum;
|
||||
|
||||
@ -10,7 +10,7 @@
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : pl.rodolfo@gmail.com
|
||||
dekoning@ifi.unicamp.br
|
||||
***************************************************************************/
|
||||
@ -38,7 +38,7 @@ template <class numtyp, class acctyp>
|
||||
UFMT::~UFM() {
|
||||
clear();
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int UFMT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors);
|
||||
@ -46,9 +46,9 @@ int UFMT::bytes_per_atom(const int max_nbors) const {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int UFMT::init(const int ntypes,
|
||||
double **host_cutsq, double **host_uf1,
|
||||
double **host_uf2, double **host_uf3,
|
||||
double **host_uf4, double **host_offset,
|
||||
double **host_cutsq, double **host_uf1,
|
||||
double **host_uf2, double **host_uf3,
|
||||
double **host_offset,
|
||||
double *host_special_lj, const int nlocal,
|
||||
const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
@ -78,11 +78,11 @@ int UFMT::init(const int ntypes,
|
||||
|
||||
uf1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,uf1,host_write,host_uf1,host_uf2,
|
||||
host_cutsq);
|
||||
host_cutsq);
|
||||
|
||||
uf3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,uf3,host_write,host_uf3,host_uf4,
|
||||
host_offset);
|
||||
this->atom->type_pack4(ntypes,lj_types,uf3,host_write,host_uf3,host_uf2,
|
||||
host_offset);
|
||||
|
||||
UCL_H_Vec<double> dview;
|
||||
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
@ -96,18 +96,17 @@ int UFMT::init(const int ntypes,
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void UFMT::reinit(const int ntypes, double **host_cutsq, double **host_uf1,
|
||||
double **host_uf2, double **host_uf3,
|
||||
double **host_uf4, double **host_offset) {
|
||||
double **host_uf2, double **host_uf3, double **host_offset) {
|
||||
// Allocate a host write buffer for data initialization
|
||||
UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
|
||||
UCL_WRITE_ONLY);
|
||||
|
||||
|
||||
for (int i=0; i<_lj_types*_lj_types; i++)
|
||||
host_write[i]=0.0;
|
||||
|
||||
|
||||
this->atom->type_pack4(ntypes,_lj_types,uf1,host_write,host_uf1,host_uf2,
|
||||
host_cutsq);
|
||||
this->atom->type_pack4(ntypes,_lj_types,uf3,host_write,host_uf3,host_uf4,
|
||||
this->atom->type_pack4(ntypes,_lj_types,uf3,host_write,host_uf3,host_uf2,
|
||||
host_offset);
|
||||
}
|
||||
|
||||
@ -145,7 +144,7 @@ void UFMT::loop(const bool _eflag, const bool _vflag) {
|
||||
vflag=1;
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
@ -157,12 +156,12 @@ void UFMT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->k_pair_fast.run(&this->atom->x, &uf1, &uf3, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch,
|
||||
&vflag, &ainum, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->x, &uf1, &uf3, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
}
|
||||
|
||||
@ -10,7 +10,7 @@
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : pl.rodolfo@gmail.com
|
||||
dekoning@ifi.unicamp.br
|
||||
***************************************************************************/
|
||||
@ -32,7 +32,7 @@ class UFM : public BaseAtomic<numtyp, acctyp> {
|
||||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
@ -41,16 +41,15 @@ class UFM : public BaseAtomic<numtyp, acctyp> {
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init(const int ntypes, double **host_cutsq,
|
||||
double **host_uf1, double **host_uf2, double **host_uf3,
|
||||
double **host_uf4, double **host_offset, double *host_special_lj,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
double **host_offset, double *host_special_lj,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen);
|
||||
|
||||
|
||||
/// Send updated coeffs from host to device (to be compatible with fix adapt)
|
||||
void reinit(const int ntypes, double **host_cutsq,
|
||||
double **host_uf1, double **host_uf2, double **host_uf3,
|
||||
double **host_uf4, double **host_offset);
|
||||
|
||||
void reinit(const int ntypes, double **host_cutsq, double **host_uf1,
|
||||
double **host_uf2, double **host_uf3, double **host_offset);
|
||||
|
||||
/// Clear all host and device data
|
||||
/** \note This is called at the beginning of the init() routine **/
|
||||
void clear();
|
||||
@ -65,7 +64,7 @@ class UFM : public BaseAtomic<numtyp, acctyp> {
|
||||
|
||||
/// uf1.x = uf1, uf1.y = uf2, uf1.z = cutsq
|
||||
UCL_D_Vec<numtyp4> uf1;
|
||||
/// uf3.x = uf3, uf3.y = uf4, uf3.z = offset
|
||||
/// uf3.x = uf3, uf3.y = uf2, uf3.z = offset
|
||||
UCL_D_Vec<numtyp4> uf3;
|
||||
/// Special LJ values
|
||||
UCL_D_Vec<numtyp> sp_lj;
|
||||
@ -73,7 +72,7 @@ class UFM : public BaseAtomic<numtyp, acctyp> {
|
||||
/// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
/// Number of atom types
|
||||
/// Number of atom types
|
||||
int _lj_types;
|
||||
|
||||
private:
|
||||
|
||||
@ -10,7 +10,7 @@
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : pl.rodolfo@gmail.com
|
||||
dekoning@ifi.unicamp.br
|
||||
***************************************************************************/
|
||||
@ -30,10 +30,10 @@ static UFM<PRECISION,ACC_PRECISION> UFMLMF;
|
||||
// Allocate memory on host and device and copy constants to device
|
||||
// ---------------------------------------------------------------------------
|
||||
int ufml_gpu_init(const int ntypes, double **cutsq, double **host_uf1,
|
||||
double **host_uf2, double **host_uf3, double **host_uf4,
|
||||
double **offset, double *special_lj, const int inum, const int nall,
|
||||
const int max_nbors, const int maxspecial, const double cell_size,
|
||||
int &gpu_mode, FILE *screen) {
|
||||
double **host_uf2, double **host_uf3, double **offset,
|
||||
double *special_lj, const int inum, const int nall,
|
||||
const int max_nbors, const int maxspecial, const double cell_size,
|
||||
int &gpu_mode, FILE *screen) {
|
||||
UFMLMF.clear();
|
||||
gpu_mode=UFMLMF.device->gpu_mode();
|
||||
double gpu_split=UFMLMF.device->particle_split();
|
||||
@ -57,8 +57,8 @@ int ufml_gpu_init(const int ntypes, double **cutsq, double **host_uf1,
|
||||
int init_ok=0;
|
||||
if (world_me==0)
|
||||
init_ok=UFMLMF.init(ntypes, cutsq, host_uf1, host_uf2, host_uf3,
|
||||
host_uf4, offset, special_lj, inum, nall, 300,
|
||||
maxspecial, cell_size, gpu_split, screen);
|
||||
offset, special_lj, inum, nall, 300,
|
||||
maxspecial, cell_size, gpu_split, screen);
|
||||
|
||||
UFMLMF.device->world_barrier();
|
||||
if (message)
|
||||
@ -74,12 +74,12 @@ int ufml_gpu_init(const int ntypes, double **cutsq, double **host_uf1,
|
||||
fflush(screen);
|
||||
}
|
||||
if (gpu_rank==i && world_me!=0)
|
||||
init_ok=UFMLMF.init(ntypes, cutsq, host_uf1, host_uf2, host_uf3, host_uf4,
|
||||
init_ok=UFMLMF.init(ntypes, cutsq, host_uf1, host_uf2, host_uf3,
|
||||
offset, special_lj, inum, nall, 300, maxspecial,
|
||||
cell_size, gpu_split, screen);
|
||||
|
||||
UFMLMF.device->gpu_barrier();
|
||||
if (message)
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
}
|
||||
if (message)
|
||||
@ -94,19 +94,18 @@ int ufml_gpu_init(const int ntypes, double **cutsq, double **host_uf1,
|
||||
// Copy updated coeffs from host to device
|
||||
// ---------------------------------------------------------------------------
|
||||
void ufml_gpu_reinit(const int ntypes, double **cutsq, double **host_uf1,
|
||||
double **host_uf2, double **host_uf3, double **host_uf4,
|
||||
double **offset) {
|
||||
double **host_uf2, double **host_uf3, double **offset) {
|
||||
int world_me=UFMLMF.device->world_me();
|
||||
int gpu_rank=UFMLMF.device->gpu_rank();
|
||||
int procs_per_gpu=UFMLMF.device->procs_per_gpu();
|
||||
|
||||
|
||||
if (world_me==0)
|
||||
UFMLMF.reinit(ntypes, cutsq, host_uf1, host_uf2, host_uf3, host_uf4, offset);
|
||||
UFMLMF.reinit(ntypes, cutsq, host_uf1, host_uf2, host_uf3, offset);
|
||||
UFMLMF.device->world_barrier();
|
||||
|
||||
|
||||
for (int i=0; i<procs_per_gpu; i++) {
|
||||
if (gpu_rank==i && world_me!=0)
|
||||
UFMLMF.reinit(ntypes, cutsq, host_uf1, host_uf2, host_uf3, host_uf4, offset);
|
||||
UFMLMF.reinit(ntypes, cutsq, host_uf1, host_uf2, host_uf3, offset);
|
||||
UFMLMF.device->gpu_barrier();
|
||||
}
|
||||
}
|
||||
@ -125,8 +124,8 @@ int ** ufml_gpu_compute_n(const int ago, const int inum_full,
|
||||
return UFMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
|
||||
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||
vatom, host_start, ilist, jnum, cpu_time, success);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void ufml_gpu_compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *ilist, int *numj,
|
||||
int **firstneigh, const bool eflag, const bool vflag,
|
||||
|
||||
@ -180,8 +180,8 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
|
||||
ucl_copy(map,dview_map,false);
|
||||
|
||||
_allocated=true;
|
||||
this->_max_bytes=param1.row_bytes()+param2.row_bytes()+param3.row_bytes()+param4.row_bytes()+param5.row_bytes()+
|
||||
map.row_bytes()+elem2param.row_bytes();
|
||||
this->_max_bytes=param1.row_bytes()+param2.row_bytes()+param3.row_bytes()+
|
||||
param4.row_bytes()+param5.row_bytes()+map.row_bytes()+elem2param.row_bytes();
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -233,11 +233,10 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, ¶m4, &map,
|
||||
&elem2param, &_nelements, &_nparams,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &_cutshortsq, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
|
||||
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
|
||||
|
||||
@ -137,13 +137,10 @@ _texture( param5_tex,int4);
|
||||
#endif
|
||||
|
||||
__kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict param4,
|
||||
const __global int *restrict map,
|
||||
const __global int *restrict elem2param,
|
||||
const int nelements, const int nparams,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global int * dev_short_nbor,
|
||||
const numtyp _cutshortsq,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
__local int n_stride;
|
||||
@ -157,8 +154,6 @@ __kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
int ncount = 0;
|
||||
int m = nbor;
|
||||
@ -172,9 +167,6 @@ __kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
@ -182,7 +174,7 @@ __kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<param4[ijparam].x) { //param4[ijparam].x = r0sq; //param4[ijparam].z=cutsq
|
||||
if (rsq<_cutshortsq) {
|
||||
dev_short_nbor[nbor_short] = nj;
|
||||
nbor_short += n_stride;
|
||||
ncount++;
|
||||
|
||||
Reference in New Issue
Block a user