diff --git a/doc/src/Eqs/HADRESS_AT_pair_coul_dsf.jpg b/doc/src/Eqs/HADRESS_AT_pair_coul_dsf.jpg
new file mode 100644
index 0000000000..61399489d6
Binary files /dev/null and b/doc/src/Eqs/HADRESS_AT_pair_coul_dsf.jpg differ
diff --git a/doc/src/Eqs/HADRESS_AT_pair_coul_dsf.tex b/doc/src/Eqs/HADRESS_AT_pair_coul_dsf.tex
new file mode 100644
index 0000000000..5c908aa6fe
--- /dev/null
+++ b/doc/src/Eqs/HADRESS_AT_pair_coul_dsf.tex
@@ -0,0 +1,11 @@
+\documentclass[12pt]{article}
+\pagestyle{empty}
+
+\begin{document}
+$$
+ V^{AT} =
+ q_iq_j \left[ \frac{\mbox{erfc} (\alpha r)}{r} - \frac{\mbox{erfc} (\alpha r_c)}{r_c} +
+ \left( \frac{\mbox{erfc} (\alpha r_c)}{r_c^2} + \frac{2\alpha}{\sqrt{\pi}}\frac{\exp (-\alpha^2 r^2_c)}{r_c} \right)(r-r_c) \right] \qquad r < r_c
+$$
+
+\end{document}
diff --git a/doc/src/Eqs/HADRESS_AT_pair_lj.jpg b/doc/src/Eqs/HADRESS_AT_pair_lj.jpg
new file mode 100644
index 0000000000..6953f07941
Binary files /dev/null and b/doc/src/Eqs/HADRESS_AT_pair_lj.jpg differ
diff --git a/doc/src/Eqs/HADRESS_AT_pair_lj.tex b/doc/src/Eqs/HADRESS_AT_pair_lj.tex
new file mode 100644
index 0000000000..99a251a129
--- /dev/null
+++ b/doc/src/Eqs/HADRESS_AT_pair_lj.tex
@@ -0,0 +1,12 @@
+\documentclass[12pt]{article}
+\pagestyle{empty}
+
+\begin{document}
+
+$$
+ V^{AT} = 4 \epsilon \left[ \left(\frac{\sigma}{r}\right)^{12} -
+ \left(\frac{\sigma}{r}\right)^6 \right]
+ \qquad r < r_c
+$$
+
+\end{document}
diff --git a/doc/src/Eqs/HADRESS_CG_pair_lj.jpg b/doc/src/Eqs/HADRESS_CG_pair_lj.jpg
new file mode 100644
index 0000000000..71b7751930
Binary files /dev/null and b/doc/src/Eqs/HADRESS_CG_pair_lj.jpg differ
diff --git a/doc/src/Eqs/HADRESS_CG_pair_lj.tex b/doc/src/Eqs/HADRESS_CG_pair_lj.tex
new file mode 100644
index 0000000000..c66fb30105
--- /dev/null
+++ b/doc/src/Eqs/HADRESS_CG_pair_lj.tex
@@ -0,0 +1,12 @@
+\documentclass[12pt]{article}
+\pagestyle{empty}
+
+\begin{document}
+
+$$
+ V^{CG} = 4 \epsilon \left[ \left(\frac{\sigma}{r}\right)^{12} -
+ \left(\frac{\sigma}{r}\right)^6 \right]
+ \qquad r < r_c
+$$
+
+\end{document}
diff --git a/doc/src/Eqs/HADRESS_Switching_Function_Cylinder.jpg b/doc/src/Eqs/HADRESS_Switching_Function_Cylinder.jpg
new file mode 100644
index 0000000000..e7dda32d4d
Binary files /dev/null and b/doc/src/Eqs/HADRESS_Switching_Function_Cylinder.jpg differ
diff --git a/doc/src/Eqs/HADRESS_Switching_Function_Cylinder.tex b/doc/src/Eqs/HADRESS_Switching_Function_Cylinder.tex
new file mode 100644
index 0000000000..d45783ab91
--- /dev/null
+++ b/doc/src/Eqs/HADRESS_Switching_Function_Cylinder.tex
@@ -0,0 +1,16 @@
+\documentclass[12pt]{article}
+\pagestyle{empty}
+
+\begin{document}
+
+\begin{eqnarray*}
+\lambda(r)=\left\{
+\begin{array}{ll}
+1 & r \leq r_{at}\\
+\cos^2\left(\frac{\pi(r-r_{at})}{2r_{hy}}\right) & r_{at}< r \leq r_{at}+r_{hy}\\
+0 & r > r_{at}+r_{hy}
+\end{array}
+\right.
+\end{eqnarray*}
+
+\end{document}
diff --git a/doc/src/Eqs/HADRESS_Switching_Function_Slab.jpg b/doc/src/Eqs/HADRESS_Switching_Function_Slab.jpg
new file mode 100644
index 0000000000..b9f03bf571
Binary files /dev/null and b/doc/src/Eqs/HADRESS_Switching_Function_Slab.jpg differ
diff --git a/doc/src/Eqs/HADRESS_Switching_Function_Slab.tex b/doc/src/Eqs/HADRESS_Switching_Function_Slab.tex
new file mode 100644
index 0000000000..19f9728646
--- /dev/null
+++ b/doc/src/Eqs/HADRESS_Switching_Function_Slab.tex
@@ -0,0 +1,14 @@
+\documentclass[12pt]{article}
+\pagestyle{empty}
+
+\begin{document}
+$$
+\lambda(x)=\left\{
+\begin{array}{ll}
+1 & |x| \leq L_{AT}/2\\
+\cos^2\left(\frac{\pi(x-L_{AT}/2)}{2L_{HY}}\right) & \frac{L_{AT}}{2}< |x| \leq \frac{L_{AT}}{2}+L_{HY}\\
+0 & |x| > \frac{L_{AT}}{2}+L_{HY}
+\end{array}
+\right.
+$$
+\end{document}
diff --git a/doc/src/Eqs/HADRESS_Switching_Function_Sphere.jpg b/doc/src/Eqs/HADRESS_Switching_Function_Sphere.jpg
new file mode 100644
index 0000000000..e7dda32d4d
Binary files /dev/null and b/doc/src/Eqs/HADRESS_Switching_Function_Sphere.jpg differ
diff --git a/doc/src/Eqs/HADRESS_Switching_Function_Sphere.tex b/doc/src/Eqs/HADRESS_Switching_Function_Sphere.tex
new file mode 100644
index 0000000000..d45783ab91
--- /dev/null
+++ b/doc/src/Eqs/HADRESS_Switching_Function_Sphere.tex
@@ -0,0 +1,16 @@
+\documentclass[12pt]{article}
+\pagestyle{empty}
+
+\begin{document}
+
+\begin{eqnarray*}
+\lambda(r)=\left\{
+\begin{array}{ll}
+1 & r \leq r_{at}\\
+\cos^2\left(\frac{\pi(r-r_{at})}{2r_{hy}}\right) & r_{at}< r \leq r_{at}+r_{hy}\\
+0 & r > r_{at}+r_{hy}
+\end{array}
+\right.
+\end{eqnarray*}
+
+\end{document}
diff --git a/doc/src/Eqs/HADRESS_System_Hamiltonian.jpg b/doc/src/Eqs/HADRESS_System_Hamiltonian.jpg
new file mode 100644
index 0000000000..4b3780d3e9
Binary files /dev/null and b/doc/src/Eqs/HADRESS_System_Hamiltonian.jpg differ
diff --git a/doc/src/Eqs/HADRESS_System_Hamiltonian.tex b/doc/src/Eqs/HADRESS_System_Hamiltonian.tex
new file mode 100644
index 0000000000..bbc0ca4ad7
--- /dev/null
+++ b/doc/src/Eqs/HADRESS_System_Hamiltonian.tex
@@ -0,0 +1,10 @@
+\documentclass[12pt]{article}
+\pagestyle{empty}
+
+\begin{document}
+
+\begin{eqnarray}\label{hadress_H}
+&&H = K + V^{int} + \sum_{\alpha} \left\{{\lambda_\alpha} {V^{AT}_\alpha} + {(1 - \lambda_\alpha)} {V^{CG}_\alpha} \right\}\\ \nonumber
+\end{eqnarray}
+
+\end{document}
diff --git a/doc/src/Eqs/HADRESS_System_Potentials.jpg b/doc/src/Eqs/HADRESS_System_Potentials.jpg
new file mode 100644
index 0000000000..7d0d6356c4
Binary files /dev/null and b/doc/src/Eqs/HADRESS_System_Potentials.jpg differ
diff --git a/doc/src/Eqs/HADRESS_System_Potentials.tex b/doc/src/Eqs/HADRESS_System_Potentials.tex
new file mode 100644
index 0000000000..ecae7d0bcc
--- /dev/null
+++ b/doc/src/Eqs/HADRESS_System_Potentials.tex
@@ -0,0 +1,11 @@
+\documentclass[12pt]{article}
+
+\pagestyle{empty}
+\begin{document}
+
+\begin{eqnarray}\label{hadress_V}
+&& V^{AT}_\alpha \equiv \displaystyle\frac{1}{2}\sum_{\beta,\beta\neq \alpha}^{N} \sum_{ij} V^{AT}(|\textbf{r}_{\alpha i} - \textbf{r}_{\beta j}|)\\ \nonumber
+&& V^{CG}_\alpha \equiv \displaystyle\frac{1}{2}\sum_{\beta,\beta\neq \alpha}^{N} V^{CG}(|\textbf{R}_\alpha - \textbf{R}_\beta|)
+\end{eqnarray}
+
+\end{document}
diff --git a/doc/src/JPG/HADRESS_MODEL_LAMMPS.png b/doc/src/JPG/HADRESS_MODEL_LAMMPS.png
new file mode 100644
index 0000000000..5626fef663
Binary files /dev/null and b/doc/src/JPG/HADRESS_MODEL_LAMMPS.png differ
diff --git a/doc/src/fix_lambdah_calc.txt b/doc/src/fix_lambdah_calc.txt
new file mode 100644
index 0000000000..4c264b32b8
--- /dev/null
+++ b/doc/src/fix_lambdah_calc.txt
@@ -0,0 +1,212 @@
+"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
+
+:link(lws,http://lammps.sandia.gov)
+:link(ld,Manual.html)
+:link(lc,Section_commands.html#comm)
+
+:line
+
+fix lambdah command :h3
+
+[Syntax:]
+
+fix ID group-ID lambdah/calc NH-mol LHY LAT Pflag δλ dTp TpStart TpEnd HYShape Dflag Δx dTd TdStart TdEnd σ R ρ0 c fileflag :pre
+
+ID is documented in "fix"_fix.html command :ulb,l
+group-ID has to be all :l
+lambdah/calc = style name of this fix command :l
+NH-mol = Number of molecular types within the low resolution :l
+LHY = Length of Hybrid region :l
+LAT = Length of Atomistic (high resolution) region :l
+Pflag = {0} or {1} :l
+ {0} Constant-pressure route is off
+ {1} Constant-pressure route is on :pre
+δλ = Bin size in constant-pressure route :l
+dTp = Time step interval of constant-pressure route :l
+TpStart = Starting time step of constant-pressure route :l
+TpEnd = Ending time step of constant-pressure route :l
+HYShape = Shape of Hybrid region : {slab}, {sphere}, {cylinder} :l
+ {slab} is for rectangular hybrid region
+ {sphere} is for spherical hybrid region
+ {cylinder} is for cylindrical hybrid region :pre
+Dflag = {0} or {1} :l
+ {0} Constant-density route is off
+ {1} Constant-density route is on :pre
+Δx = Bin size in constant-density route (length unit) :l
+dTd = Time step interval of constant-density route :l
+TdStart = Starting time step of constant-density route :l
+TdEnd = Ending time step of constant-density route :l
+σ = Width of gaussian function in constant-density route (length unit) :l
+R = Range of gaussian function in constant-density route (length unit) :l
+ρ0 = Reference number density in constant-density route :l
+c = Prefactor in constant-density route (energy unit) :l
+fileflag = {0} or {1} :l
+ {0} Do not employ density-balancing file
+ {1} Employ density-balancing file :pre
+
+:ule
+
+[Examples:]
+
+fix 1 all lambdah/calc 1 25 60 1 0.02 1000 150000 300000 slab 1 1.5 500 400000 700000 6 2 0.1 2 0
+fix 1 all lambdah/calc 1 25 60 1 0.02 1000 100000 200000 sphere 1 1.5 500 300000 700000 6 2 0.1 2 0 :pre
+
+[Description:]
+
+The Hamiltonian adaptive resolution simulation scheme (H-AdResS) is a dual-resolution simulation method that
+joins models with different levels of complexity for the same system within a global Hamiltonian framework "(Potestio2013_1)"_#Potestio2013_1, "(Potestio2013_2)"_#Potestio2013_2, "(Heidari2016)"_#Heidari2016.
+
+Depending on the shape of the Hybrid region, which might be either slab, sphere or cylinder, this fix calculates
+the resolution of every atom based on the center of mass of its molecule.
+The following switching function is defined for a simulation box whose atomistic region is limited to \[-0.5LAT 0.5LAT\]:
+
+:c,image(Eqs/HADRESS_Switching_Function_Slab.jpg)
+
+The following switching function is defined for a spherical/cylindrical atomistic region located at the middle of the simulation box:
+
+:c,image(Eqs/HADRESS_Switching_Function_Sphere.jpg)
+
+A setup of a Hamiltonian Adaptive Resolution Simulation is shown below. The box is partitioned into three
+ different region types, namely: Coarse-grained (CG), Hybrid (HY), and Atomistic (AT). In each region,
+ the resolution of each molecule (here water) is determined by the instantaneous value of the
+ smooth function λ represented above the simulation snapshot.
+
+:c,image(JPG/HADRESS_MODEL_LAMMPS.png)
+
+{NH-mol} determines the number of molecular types within the low resolution. For instance, for a system containing
+coarse-grained water molecules in the coarse-grained region, this number equals one. However, for a system containing
+water molecules and ions such as Na and Cl, which interact differently in the coarse-grained region,
+this number is 3.
+
+The {LHY} specifies the length of the hybrid region. For the cases of cylindrical or spherical hybrid regions, this quantity denotes {rHY}.
+
+The {LAT} determines the length of the atomistic region. For the cases of cylindrical or spherical hybrid regions, this quantity denotes {rAT}.
+
+The {Pflag} switches off and on the constant-pressure route.
+
+The {δλ} denotes the bin size over the hybrid region. In the on-the-fly method of averaging the drift forces,
+particles are sorted into uniformly spaced λ bins of {δλ} side.
+
+The {dTp} denotes the time intervals in constant-pressure route at which the averaged drift forces are applied on the molecules of the hybrid region.
+
+The {TpStart} denotes the time step at which the simulation of the constant-pressure route is started.
+
+The {TpEnd} specifies the ending time step of the constant-pressure route.
+
+The {HYShape} specifies the geometry of the Hybrid region. This could be {slab}, {sphere}, {cylinder}.
+
+{Dflag} switches off and on the constant-density route.
+
+{Δx} is the bin size by which the simulation box is discretized in the constant-density route.
+
+{dTd} is the time interval in constant-density route at which the averaged thermodynamic forces are applied.
+
+{TdStart} is the starting time step of constant-density route.
+
+{TdEnd} is the ending time step of constant-density route.
+
+{σ} is the width of Gaussian function in the constant-density route.
+
+{R} is the range of Gaussian function in the constant-density route.
+
+{ρ0} is the reference density in the constant-density route.
+
+{c} is the prefactor in the constant-density route.
+
+{fileflag} denotes a flag whether the file containing the density-balancing force is employed or not.
+
+:line
+
+[Restart, fix_modify, output, run start/stop, minimize info:]
+
+No information about this fix is written to "binary restart
+files"_restart.html.
+
+This fix creates a file named "Mean_Comp_Density.txt" in which the compensation forces are printed.
+This file is created at {TdStart} and is updated every {dTd}.
+The updating process of the file is finished at time step {TdEnd}.
+For those equilibrated simulations starting at a time step larger than {TdEnd},
+the file "Mean_Comp_Density.txt" is loaded in this fix.
+
+:line
+
+[Restrictions:]
+
+This fix calculates the center of mass of the particles. Thus at the beginning of the calculation,
+it is required that all atoms belonging to a molecule are on the same side of the box.
+
+To employ the H-AdResS scheme, the full/hars atom style has to be used:
+
+ atom_style full/hars :pre
+
+To perform HAdResS, Data File should contain the following extra information with respect to the Data File defined in full atom style:
+
+\[1\] [mol_H] determines the number of molecular types in the low resolution (coarse-grained) region.
+
+\[2\] [representative_flag] determines which atom carries the molecule's information
+(center of mass, molecule's resolution, ...) in the low resolution (coarse-grained) region.
+
+\[3\] [mol_type] denotes the type of the molecule in the low resolution (coarse-grained) region.
+
+The following example is extracted from a Data File in which the simulation box contains water molecules and the ions of sodium and chlorine:
+
+30720 atoms
+20480 bonds
+10240 angles :pre
+
+4 atom types
+[1 mol_H types]
+1 bond types
+1 angle types :pre
+
+-99.968000 99.968000 xlo xhi
+-20.793600 20.793600 ylo yhi
+-20.793600 20.793600 zlo zhi :pre
+
+Masses :pre
+
+1 15.999400
+2 1.007940
+3 22.9898
+4 35.453 :pre
+
+Atoms
+#atomID molecule-tag atom-type q [representative_flag mol_type] x y z
+1 1 1 -0.847200 1 1 -99.654503 -19.897600 -20.192101
+2 1 2 0.423600 0 1 -100.568001 -19.999300 -20.586599
+3 1 2 0.423600 0 1 -99.777702 -20.103100 -19.221300
+4 2 1 -0.847200 1 1 -97.826401 -17.709900 -20.127100
+5 2 2 0.423600 0 1 -96.938400 -18.071301 -19.842800
+6 2 2 0.423600 0 1 -97.735100 -16.718800 -20.030100
+7 3 3 1.0 1 1 -97.429398 -20.402201 -17.494900
+8 3 4 -1.0 1 1 -96.834000 -19.671400 -17.160999
+.
+.
+. :pre
+
+As shown, the representative_flag of the oxygen atoms is equal to 1, and
+since sodium and chlorine are single-atom ions, their representative_flags are also equal to 1.
+The interactions of water molecules and ions are the same in the coarse-grained region,
+thus they all carry the same molecular type (mol_type).
+
+:line
+
+[Related commands:]
+
+"pair_lj_hars.html"_pair_lj_hars.html
+
+[Default:] none
+
+:line
+
+
+:link(Potestio2013_1)
+[(Potestio2013_1)] R. Potestio, S. Fritsch, P. Espanol, R. Delgado-Buscalioni, K. Kremer, R. Everaers, and D. Donadio, {Hamiltonian Adaptive Resolution Simulation for Molecular Liquids}, "Phys. Rev. Lett. \[110\],
+108301 (2013)"_http://dx.doi.org/10.1103/PhysRevLett.110.108301
+
+:link(Potestio2013_2)
+[(Potestio2013_2)] R. Potestio, S. Fritsch, P. Espanol, R. Delgado-Buscalioni, K. Kremer, R. Everaers, and D. Donadio, {Monte Carlo Adaptive Resolution Simulation of Multicomponent Molecular Liquids}, "Phys. Rev. Lett. \[111\],
+060601 (2013)"_http://dx.doi.org/10.1103/PhysRevLett.111.060601
+
+:link(Heidari2016)
+[(Heidari2016)] M. Heidari, R. Cortes-Huerto, D. Donadio and R. Potestio, {Accurate and general treatment of electrostatic interaction in Hamiltonian adaptive resolution simulations}, "EPJST (2016)"
diff --git a/doc/src/pair_lj_hars.txt b/doc/src/pair_lj_hars.txt
new file mode 100644
index 0000000000..8ded669048
--- /dev/null
+++ b/doc/src/pair_lj_hars.txt
@@ -0,0 +1,179 @@
+"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
+
+:link(lws,http://lammps.sandia.gov)
+:link(ld,Manual.html)
+:link(lc,Section_commands.html#comm)
+
+:line
+
+pair_style lj/cut/hars/at command :h3
+pair_style lj/cut/coul/dsf/hars/at command :h3
+pair_style lj/cut/hars/cg command :h3
+
+[Syntax:]
+
+pair_style style args :pre
+
+style = {lj/cut/hars/at} or {lj/cut/coul/dsf/hars/at} or {lj/cut/hars/cg}
+args = list of arguments for a particular style :ul
+ {lj/cut/hars/at} args = cutoff All_AT Flag_Load_File
+ cutoff = global cutoff for Lennard Jones interactions (distance units)
+ All_AT = Fully atomic simulation flag, = {0} or {1}
+ {0} Fully atomic simulation is off and HAdResS is on
+ {1} Fully atomic simulation is on and HAdResS is off
+ Flag_Load_File = Flag of employing compensation energy file, = {0} or {1}
+ {0} Do not employ compensation energy until TpStart
+ {1} Employ compensation energy file immediately :pre
+ {lj/cut/coul/dsf/hars/at} args = Alpha LJcutoff Coulcutoff All_AT Flag_Load_File
+ Alpha = Damping coefficient in DSF potential (1.0/distance units)
+ LJcutoff = global cutoff for Lennard Jones interactions (distance units)
+ Coulcutoff = global cutoff for DSF coulombic interactions (distance units)
+ All_AT = Fully atomic simulation flag, = {0} or {1}
+ {0} Fully atomic simulation is off and HAdResS is on
+ {1} Fully atomic simulation is on and HAdResS is off
+ Flag_Load_File = Flag of employing compensation energy file, = {0} or {1}
+ {0} Do not employ compensation energy until TpStart
+ {1} Employ compensation energy file immediately :pre
+ {lj/cut/hars/cg} args = cutoff All_CG Flag_Load_File
+ cutoff = global cutoff for Lennard Jones interactions (distance units)
+ All_CG = Fully coarse-grained simulation flag, = {0} or {1}
+ {0} Fully coarse-grained simulation is off and HAdResS is on
+ {1} Fully coarse-grained simulation is on and HAdResS is off
+ Flag_Load_File = Flag of employing compensation energy file, = {0} or {1}
+ {0} Do not employ compensation energy until TpStart
+ {1} Employ compensation energy file immediately
+:pre
+
+[Examples:]
+
+pair_style hybrid/overlay lj/cut/hars/cg 2.469416506 0 0 lj/cut/hars/at 0.2 10.0 12.0 0 0
+pair_style hybrid/overlay lj/cut/hars/cg 1.1224 1 0 lj/cut/hars/at 1.5 1 0 :pre
+
+[Description:]
+
+In the H-AdResS scheme, the description of the interactions within a system of particles is given in terms
+of a global Hamiltonian function H, which has the following form "(Potestio2013_1)"_#Potestio2013_1, "(Potestio2013_2)"_#Potestio2013_2, "(Heidari2016)"_#Heidari2016:
+
+:c,image(Eqs/HADRESS_System_Hamiltonian.jpg)
+
+The term K is the atomistic kinetic energy, and Vint consists of all the intramolecular bonded interactions (e.g. bond stretching).
+The value of the switching function is determined by the sizes LAT
+LHY of the atomistic and hybrid regions, respectively, and of the specific geometry of the atomistic region.
+
+In the Hamiltonian, the non-bonded potential energy contribution of each molecule is given by a weighted sum of two terms
+VαCG and VαAT, defined as:
+
+:c,image(Eqs/HADRESS_System_Potentials.jpg)
+
+The {lj/cut/hars/at} styles compute the standard 12/6 Lennard-Jones potential for the atoms located in atomistic (high resolution) and hybrid region.
+The general formula is given by
+
+:c,image(Eqs/HADRESS_AT_pair_lj.jpg)
+
+rc is the cutoff.
+
+Style {lj/cut/coul/dsf/hars/at} computes the standard 12/6 Lennard-Jones and Coulomb interactions for atoms of atomistic (high resolution) and hybrid region.
+The Coulombic term is computed via the damped shifted force model introduced by "Fennell et al."_#Fennell, given by:
+
+:c,image(Eqs/HADRESS_AT_pair_coul_dsf.jpg)
+
+where {alpha} is the damping parameter and erfc() is the complementary
+error-function. This potential is essentially a short-range,
+spherically-truncated, charge-neutralized, shifted, pairwise {1/r}
+summation.
+
+The {lj/cut/hars/cg} styles compute the standard 12/6 Lennard-Jones potential for the atoms located in the low resolution (coarse-grained) and hybrid region.
+The general formula is given by
+
+:c,image(Eqs/HADRESS_CG_pair_lj.jpg)
+
+rc is the cutoff.
+As mentioned above, the interactions in the coarse-grained region are computed based on the center of mass of the particles.
+
+Important Note: For dual resolution simulations, it is required to use hybrid/overlay to include
+both resolution pair-styles.
+
+For all of dual resolution pair styles, the following coefficients must
+be defined for each pair of atoms types via the
+"pair_coeff"_pair_coeff.html command as in the examples below
+
+epsilon (energy units) :l
+sigma (distance units) :l
+cutoff (distance units) :l
+
+For examples:
+
+ pair_coeff * * lj/cut/hars/cg 1.0 2.2
+ pair_coeff 1 1 lj/cut/coul/dsf/hars/at 0.15535 3.166
+ pair_coeff * 2 lj/cut/coul/dsf/hars/at 0.0000 0.0000 :pre
+
+Note that sigma is defined in the LJ formula as the zero-crossing
+distance for the potential, not as the energy minimum at 2^(1/6)
+sigma.
+
+All potentials have to be shifted at the cutoff through the command
+
+ pair_modify shift yes :pre
+
+
+:line
+
+
+[Mixing, shift, table, tail correction, restart, rRESPA info]:
+
+All of the {lj/cut} pair styles support the
+"pair_modify"_pair_modify.html shift option for the energy of the
+Lennard-Jones portion of the pair interaction.
+
+All of the {lj/cut} pair styles write their information to "binary
+restart files"_restart.html, so pair_style and pair_coeff commands do
+not need to be specified in an input script that reads a restart file.
+
+The pair styles do not support the use of the rRESPA hierarchy.
+
+Each pair style creates a file named "Mean_Comp_Energy_XX.txt", where the file name's suffix "XX" is replaced by "AT" and "CG" for atomistic and coarse-grained pairwise interactions respectively.
+In these files the averaged compensation energy as function of the resolution (λ) is printed. Each file is created at {TpStart} and is updated every {dTp}.
+The updating process of the files is finished at time step {TpEnd}.
+For those equilibrated simulations starting at a time step larger than {TpEnd}, the file "Mean_Comp_Energy_XX.txt" is loaded by each pair style. For more information,
+see "fix_lambdah_calc"_fix_lambdah_calc.html.
+
+:line
+
+[Restrictions:]
+
+In HAdResS, it is required to include both high resolution (atomistic)
+and low resolution (coarse-grained) force fields together through
+
+ pair_style hybrid/overlay :pre
+An example of such setup is given above.
+
+To employ the H-AdResS scheme, the full/hars atom style as well as "(fix_lambdah_calc)"_fix_lambdah_calc.html have to be used:
+
+ atom_style full/hars :pre
+
+ fix ID group-ID lambdah/calc ... :pre
+
+:line
+
+[Related commands:]
+
+"fix_lambdah_calc"_fix_lambdah_calc.html, "pair_coeff"_pair_coeff.html
+
+[Default:] none
+
+:line
+
+:link(Potestio2013_1)
+[(Potestio2013_1)] R. Potestio, S. Fritsch, P. Espanol, R. Delgado-Buscalioni, K. Kremer, R. Everaers, and D. Donadio, {Hamiltonian Adaptive Resolution Simulation for Molecular Liquids}, "Phys. Rev. Lett. \[110\],
+108301 (2013)"_http://dx.doi.org/10.1103/PhysRevLett.110.108301
+
+:link(Potestio2013_2)
+[(Potestio2013_2)] R. Potestio, S. Fritsch, P. Espanol, R. Delgado-Buscalioni, K. Kremer, R. Everaers, and D. Donadio, {Monte Carlo Adaptive Resolution Simulation of Multicomponent Molecular Liquids}, "Phys. Rev. Lett. \[111\],
+060601 (2013)"_http://dx.doi.org/10.1103/PhysRevLett.111.060601
+
+:link(Heidari2016)
+[(Heidari2016)] M. Heidari, R. Cortes-Huerto, D. Donadio and R. Potestio, {Accurate and general treatment of electrostatic interaction in Hamiltonian adaptive resolution simulations}, "EPJST (2016)"
+
+:link(Fennell)
+[(Fennell)] C. J. Fennell, J. D. Gezelter, J Chem Phys, 124,
+234104 (2006).
diff --git a/examples/USER/hadress/H-AdResS/HADRESS_Water b/examples/USER/hadress/H-AdResS/HADRESS_Water
new file mode 100644
index 0000000000..d33e5b2dda
--- /dev/null
+++ b/examples/USER/hadress/H-AdResS/HADRESS_Water
@@ -0,0 +1,123 @@
+# SPC/E water box benchmark
+# This is HAdResS setup for simulation of water molecules.
+# The water molecules are simulated and coupled into two resolution:
+# High resolution in which atomistic SPC/E forcefields are present and
+# Low resolution in which coarse-grained WCA forcefield is computed.
+# For more information see Documentation and the following paper:
+# Heidari et al., Eur. Phys. J. Special Topics, 2016.
+
+# You need to have the file "restart.waterT300.50000" for running the script.
+# Thus you need to run first the equilibration script (LongSetup) for 50000 timesteps,
+# and then set it as the restart file.
+
+units real
+atom_style full/hars
+boundary p p p
+
+#read_data HAdResS_SPC.data
+read_restart restart.waterT300.50000
+
+variable root index waterT300
+variable Nrun equal 1000000
+variable Nf equal 100
+variable Ne equal 10
+variable Nr equal ${Nf}/${Ne}
+variable Ndump equal 500
+variable Nrestart equal 10000
+variable Nr_rdf equal 0.5*${Nrun}/${Ne}
+
+variable Comp_AT_Size equal 60.0
+variable Comp_HY_Size equal 25.0
+variable Hyb_Shape string slab
+
+variable Comp_Dens_nMolType equal 1
+variable Comp_Pres_Switch equal 1
+variable Comp_Pres_Start equal 150000
+variable Comp_Pres_End equal 300000
+variable Comp_Pres_Freq equal 1000
+variable Comp_Pres_Lambda_Size equal 0.02
+
+variable Comp_Dens_Switch equal 1
+variable Comp_Dens_Start equal 500000
+variable Comp_Dens_End equal 900000
+variable Comp_Dens_Freq equal 1000
+variable Comp_Dens_Alpha equal 4.0
+variable Comp_Dens_Sigma equal 6.0
+variable Comp_Dens_Range_Int equal 2.0
+variable Comp_Dens_Bin_Size equal 1.5
+variable Comp_Dens_Ref_Dens equal 0.1
+variable Load_File_Flag equal 0
+
+
+variable Text equal 300.0
+variable Pext equal 1.0
+
+pair_style hybrid/overlay lj/cut/hars/cg 2.469416506 0 0 lj/cut/coul/dsf/hars/at 0.2 10.0 12.0 0 0
+
+pair_coeff * * lj/cut/hars/cg 1.0 2.2
+
+pair_coeff 1 1 lj/cut/coul/dsf/hars/at 0.15535 3.166
+pair_coeff * 2 lj/cut/coul/dsf/hars/at 0.0000 0.0000
+pair_modify shift yes
+
+#######################################################################
+
+bond_style harmonic
+angle_style harmonic
+dihedral_style none
+improper_style none
+
+bond_coeff 1 1000.00 1.000
+angle_coeff 1 100.0 109.47
+
+#special_bonds lj/cut/coul/dsf/hars/at 0.0 0.0 0.5
+special_bonds lj/coul 0.0 0.0 0.5
+
+neighbor 1.0 bin
+neigh_modify every 1 delay 10 check yes
+
+fix LAMBDACALC all lambdah/calc ${Comp_Dens_nMolType} ${Comp_HY_Size} ${Comp_AT_Size} ${Comp_Pres_Switch} ${Comp_Pres_Lambda_Size} ${Comp_Pres_Freq} ${Comp_Pres_Start} ${Comp_Pres_End} ${Hyb_Shape} &
+ ${Comp_Dens_Switch} ${Comp_Dens_Bin_Size} ${Comp_Dens_Freq} ${Comp_Dens_Start} ${Comp_Dens_End} ${Comp_Dens_Sigma} ${Comp_Dens_Range_Int} ${Comp_Dens_Ref_Dens} ${Comp_Dens_Alpha} ${Load_File_Flag}
+
+fix 1 all shake 0.0001 20 0 b 1 a 1
+fix 3 all nve
+fix 4 all langevin 300.0 300.0 100 9892571
+
+timestep 1.0
+
+########################################################################
+compute FFX all property/atom fx
+compute SumFFX all reduce sum c_FFX
+
+compute FFY all property/atom fy
+compute SumFFY all reduce sum c_FFY
+
+compute FFZ all property/atom fz
+compute SumFFZ all reduce sum c_FFZ
+
+compute T all temp
+fix TempAve all ave/time ${Ne} ${Nr} ${Nf} c_T
+
+compute Pperatom all stress/atom NULL
+variable Patom atom -1.0*c_Pperatom[1]/(3*0.5*(yhi-ylo)*(zhi-zlo))
+
+variable P equal press
+fix PressAve all ave/time ${Ne} ${Nr} ${Nf} v_P
+
+fix PressureP all ave/spatial 5 1000 5000 x lower 3.0 v_Patom ave one file Pres.profile
+fix DensityD all ave/spatial 5 1000 5000 x lower 3.0 density/number ave one file Dens.profile
+
+compute rdf all rdf 100 1 1 # oxygen-oxygen
+fix rdf all ave/time 10 1000 10000 c_rdf file O.rdf mode vector
+
+########################################################################
+thermo_style custom step temp f_TempAve press f_PressAve density ke ebond eangle evdwl ecoul etotal c_SumFFX c_SumFFY c_SumFFZ
+
+thermo_modify flush yes
+thermo ${Nf}
+
+dump 2 all custom 500 wat.lammpstrj id type x y z vx vy vz
+
+restart ${Nrestart} restart.${root}
+
+run ${Nrun}
diff --git a/examples/USER/hadress/Relaxation/LongSetup b/examples/USER/hadress/Relaxation/LongSetup
new file mode 100644
index 0000000000..386966536e
--- /dev/null
+++ b/examples/USER/hadress/Relaxation/LongSetup
@@ -0,0 +1,107 @@
+# SPC/E water box benchmark
+# HAdResS Relaxation Setup
+# Maziar Heidari, Max Planck Institute for Polymer Research
+# (heidari@mpip-mainz.mpg.de))
+
+units real
+atom_style full/hars
+boundary p p p
+
+#read_data HAdResS_SPC.data
+#read_restart restart.waterT300.10000
+
+variable root index waterT300
+variable Nrun equal 50000
+variable Nf equal 100
+variable Ne equal 10
+variable Nr equal ${Nf}/${Ne}
+variable Ndump equal 1000
+variable Nrestart equal 10000
+variable Nr_rdf equal 0.5*${Nrun}/${Ne}
+
+variable Text equal 300.0
+variable Pext equal 1.0
+
+variable x index 1
+variable y index 1
+variable z index 1
+
+variable xx equal 40*$x
+variable yy equal 8*$y
+variable zz equal 8*$z
+
+variable half_xx equal 0.5*${xx}
+variable half_yy equal 0.5*${yy}
+variable half_zz equal 0.5*${zz}
+
+#######################################################################
+molecule mol1 molecule
+
+lattice fcc 5.0
+region box block -${half_xx} ${half_xx} -${half_yy} ${half_yy} -${half_zz} ${half_zz}
+create_box 2 box bond/types 1 extra/bond/per/atom 2 angle/types 1 extra/angle/per/atom 1 dihedral/types 0 improper/types 0 extra/special/per/atom 2
+create_atoms 0 box mol mol1 10
+
+mass 1 15.9994
+mass 2 1.00794
+
+pair_style lj/cut/coul/dsf 0.2 10.0 12.0
+
+pair_coeff 1 1 0.15535 3.166
+pair_coeff * 2 0.0000 0.0000
+########################################################################
+
+bond_style harmonic
+angle_style harmonic
+
+bond_coeff 1 1000.00 1.000
+angle_coeff 1 100.0 109.47
+
+special_bonds lj/coul 0.0 0.0 0.5
+
+neighbor 1.0 bin
+neigh_modify every 1 delay 10 check yes
+
+fix 1 all shake 0.0001 20 0 b 1 a 1
+fix 2 all npt temp ${Text} ${Text} 100.0 iso ${Pext} ${Pext} 1000.0
+
+velocity all create 300 432567 dist uniform
+
+timestep 1.0
+
+########################################################################
+compute FFX all property/atom fx
+compute SumFFX all reduce sum c_FFX
+
+compute FFY all property/atom fy
+compute SumFFY all reduce sum c_FFY
+
+compute FFZ all property/atom fz
+compute SumFFZ all reduce sum c_FFZ
+
+compute T all temp
+fix TempAve all ave/time ${Ne} ${Nr} ${Nf} c_T
+
+compute Pperatom all stress/atom NULL
+variable Patom atom -1.0*c_Pperatom[1]/(3*0.5*(yhi-ylo)*(zhi-zlo))
+
+variable P equal press
+fix PressAve all ave/time ${Ne} ${Nr} ${Nf} v_P
+
+fix PressureP all ave/spatial 10 1000 10000 x lower 1.0 v_Patom ave one file Pres.profile
+fix DensityD all ave/spatial 10 1000 10000 x lower 1.0 density/number ave one file Dens.profile
+
+compute rdf all rdf 100 1 1 # oxygen-oxygen
+fix rdf all ave/time 10 1000 10000 c_rdf file O.rdf mode vector
+
+########################################################################
+thermo_style custom step temp f_TempAve press f_PressAve density vol ke ebond eangle evdwl ecoul etotal c_SumFFX c_SumFFY c_SumFFZ
+
+thermo_modify flush yes
+thermo ${Nf}
+
+dump trj all atom ${Ndump} wat.lammpstrj
+
+restart ${Nrestart} restart.${root}
+
+run ${Nrun}
diff --git a/examples/USER/hadress/Relaxation/molecule b/examples/USER/hadress/Relaxation/molecule
new file mode 100644
index 0000000000..8e37ca578a
--- /dev/null
+++ b/examples/USER/hadress/Relaxation/molecule
@@ -0,0 +1,70 @@
+# molecule
+
+3 atoms
+2 bonds
+1 angles
+0 dihedrals
+0 impropers
+
+Coords
+
+1 0 0 0
+2 0.790422368 0.612562225 0
+3 -0.790422368 0.612562225 0
+
+Types
+
+1 1
+2 2
+3 2
+
+moltypeH
+1 1
+2 1
+3 1
+
+replambdaH
+
+1 1
+2 0
+3 0
+
+Charges
+
+1 -0.8472
+2 0.4236
+3 0.4236
+
+Masses
+
+1 15.9994
+2 1.00794
+3 1.00794
+
+Diameters
+
+1 1.563111282
+2 0.622023675
+3 0.622023675
+
+Bonds
+
+1 1 1 2
+2 1 1 3
+
+Angles
+
+1 1 2 1 3
+
+Special Bond Counts
+
+1 2 0 0
+2 1 0 0
+3 1 0 0
+
+Special Bonds
+
+1 2 3
+2 1
+3 1
+
diff --git a/lib/gpu/Makefile.linux b/lib/gpu/Makefile.linux
index d77487648e..1d40d4b21d 100644
--- a/lib/gpu/Makefile.linux
+++ b/lib/gpu/Makefile.linux
@@ -7,7 +7,7 @@
EXTRAMAKE = Makefile.lammps.standard
-ifeq($(CUDA_HOME),)
+ifeq ($(CUDA_HOME),)
CUDA_HOME = /usr/local/cuda
endif
diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h
index 3b7781753c..2d2a751f85 100644
--- a/lib/gpu/geryon/nvd_device.h
+++ b/lib/gpu/geryon/nvd_device.h
@@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
- certain rights in this software. This software is distributed under
+ certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@@ -35,7 +35,7 @@ namespace ucl_cudadr {
// --------------------------------------------------------------------------
// - COMMAND QUEUE STUFF
// --------------------------------------------------------------------------
-typedef CUstream command_queue;
+typedef CUstream command_queue;
inline void ucl_sync(CUstream &stream) {
CU_SAFE_CALL(cuStreamSynchronize(stream));
@@ -59,21 +59,21 @@ struct NVDProperties {
/// Class for looking at device properties
/** \note Calls to change the device outside of the class results in incorrect
- * behavior
+ * behavior
* \note There is no error checking for indexing past the number of devices **/
class UCL_Device {
public:
/// Collect properties for every GPU on the node
/** \note You must set the active GPU with set() before using the device **/
inline UCL_Device();
-
+
inline ~UCL_Device();
/// Returns 1 (For compatibility with OpenCL)
inline int num_platforms() { return 1; }
/// Return a string with name and info of the current platform
- inline std::string platform_name()
+ inline std::string platform_name()
{ return "NVIDIA Corporation NVIDIA CUDA Driver"; }
/// Delete any contexts/data and set the platform number to be used
@@ -97,24 +97,24 @@ class UCL_Device {
/// Returns the default stream for the current device
inline command_queue & cq() { return cq(0); }
-
+
/// Returns the stream indexed by i
inline command_queue & cq(const int i) { return _cq[i]; }
-
+
/// Block until all commands in the default stream have completed
inline void sync() { sync(0); }
-
+
/// Block until all commands in the specified stream have completed
inline void sync(const int i) { ucl_sync(cq(i)); }
-
+
/// Get the number of command queues currently available on device
- inline int num_queues()
+ inline int num_queues()
{ return _cq.size(); }
-
+
/// Add a stream for device computations
inline void push_command_queue() {
- _cq.push_back(CUstream());
- CU_SAFE_CALL(cuStreamCreate(&_cq.back(),0));
+ _cq.push_back(CUstream());
+ CU_SAFE_CALL(cuStreamCreate(&_cq.back(),0));
}
/// Remove a stream for device computations
@@ -124,19 +124,19 @@ class UCL_Device {
CU_SAFE_CALL_NS(cuStreamDestroy(_cq.back()));
_cq.pop_back();
}
-
+
/// Set the default command queue (by default this is the null stream)
- /** \param i index of the command queue (as added by push_command_queue())
+ /** \param i index of the command queue (as added by push_command_queue())
If i is 0, the default command queue is set to the null stream **/
inline void set_command_queue(const int i) {
if (i==0) _cq[0]=0;
else _cq[0]=_cq[i];
}
-
+
/// Get the current CUDA device name
inline std::string name() { return name(_device); }
/// Get the CUDA device name
- inline std::string name(const int i)
+ inline std::string name(const int i)
{ return std::string(_properties[i].name); }
/// Get a string telling the type of the current device
@@ -148,38 +148,38 @@ class UCL_Device {
inline int device_type() { return device_type(_device); }
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type(const int i) { return UCL_GPU; }
-
+
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory() { return shared_memory(_device); }
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
-
+
/// Returns true if double precision is support for the current device
inline bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is support for the device
inline bool double_precision(const int i) {return arch(i)>=1.3;}
-
+
/// Get the number of compute units on the current device
inline unsigned cus() { return cus(_device); }
/// Get the number of compute units
- inline unsigned cus(const int i)
+ inline unsigned cus(const int i)
{ return _properties[i].multiProcessorCount; }
/// Get the number of cores in the current device
inline unsigned cores() { return cores(_device); }
/// Get the number of cores
- inline unsigned cores(const int i)
- { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
+ inline unsigned cores(const int i)
+ { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
else if (arch(i)<2.1) return _properties[i].multiProcessorCount*32;
else if (arch(i)<3.0) return _properties[i].multiProcessorCount*48;
else return _properties[i].multiProcessorCount*192; }
-
+
/// Get the gigabytes of global memory in the current device
inline double gigabytes() { return gigabytes(_device); }
/// Get the gigabytes of global memory
- inline double gigabytes(const int i)
+ inline double gigabytes(const int i)
{ return static_cast(_properties[i].totalGlobalMem)/1073741824; }
-
+
/// Get the bytes of global memory in the current device
inline size_t bytes() { return bytes(_device); }
/// Get the bytes of global memory
@@ -188,13 +188,13 @@ class UCL_Device {
// Get the gigabytes of free memory in the current device
inline double free_gigabytes() { return free_gigabytes(_device); }
// Get the gigabytes of free memory
- inline double free_gigabytes(const int i)
+ inline double free_gigabytes(const int i)
{ return static_cast(free_bytes(i))/1073741824; }
-
+
// Get the bytes of free memory in the current device
inline size_t free_bytes() { return free_bytes(_device); }
// Get the bytes of free memory
- inline size_t free_bytes(const int i) {
+ inline size_t free_bytes(const int i) {
CUDA_INT_TYPE dfree, dtotal;
CU_SAFE_CALL_NS(cuMemGetInfo(&dfree, &dtotal));
return static_cast(dfree);
@@ -203,21 +203,21 @@ class UCL_Device {
/// Return the GPGPU compute capability for current device
inline double arch() { return arch(_device); }
/// Return the GPGPU compute capability
- inline double arch(const int i)
+ inline double arch(const int i)
{ return static_cast(_properties[i].minor)/10+_properties[i].major;}
-
+
/// Clock rate in GHz for current device
inline double clock_rate() { return clock_rate(_device); }
/// Clock rate in GHz
- inline double clock_rate(const int i)
+ inline double clock_rate(const int i)
{ return _properties[i].p.clockRate*1e-6;}
-
+
/// Get the maximum number of threads per block
inline size_t group_size() { return group_size(_device); }
/// Get the maximum number of threads per block
- inline size_t group_size(const int i)
+ inline size_t group_size(const int i)
{ return _properties[i].p.maxThreadsPerBlock; }
-
+
/// Return the maximum memory pitch in bytes for current device
inline size_t max_pitch() { return max_pitch(_device); }
/// Return the maximum memory pitch in bytes
@@ -242,7 +242,7 @@ class UCL_Device {
{ return fission_by_counts(_device); }
/// True if splitting device into subdevices by specified counts supported
inline bool fission_by_counts(const int i)
- { return false; }
+ { return false; }
/// True if splitting device into subdevices by affinity domains supported
inline bool fission_by_affinity()
{ return fission_by_affinity(_device); }
@@ -259,7 +259,7 @@ class UCL_Device {
/// List all devices along with all properties
inline void print_all(std::ostream &out);
-
+
private:
int _device, _num_devices;
std::vector _properties;
@@ -279,16 +279,16 @@ UCL_Device::UCL_Device() {
CU_SAFE_CALL_NS(cuDeviceComputeCapability(&major,&minor,m));
if (major==9999)
continue;
-
+
_properties.push_back(NVDProperties());
_properties.back().device_id=dev;
_properties.back().major=major;
_properties.back().minor=minor;
-
+
char namecstr[1024];
CU_SAFE_CALL_NS(cuDeviceGetName(namecstr,1024,m));
_properties.back().name=namecstr;
-
+
CU_SAFE_CALL_NS(cuDeviceTotalMem(&_properties.back().totalGlobalMem,m));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().multiProcessorCount,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
@@ -296,23 +296,23 @@ UCL_Device::UCL_Device() {
CU_SAFE_CALL_NS(cuDeviceGetProperties(&_properties.back().p,m));
#if CUDA_VERSION >= 2020
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
- &_properties.back().kernelExecTimeoutEnabled,
+ &_properties.back().kernelExecTimeoutEnabled,
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().integrated,
CU_DEVICE_ATTRIBUTE_INTEGRATED, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
- &_properties.back().canMapHostMemory,
+ &_properties.back().canMapHostMemory,
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
- CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().computeMode,
+ CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().computeMode,
CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,dev));
#endif
#if CUDA_VERSION >= 3010
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
- &_properties.back().concurrentKernels,
+ &_properties.back().concurrentKernels,
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
- &_properties.back().ECCEnabled,
+ &_properties.back().ECCEnabled,
CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
#endif
}
@@ -365,7 +365,7 @@ void UCL_Device::print_all(std::ostream &out) {
cuDriverGetVersion(&driver_version);
out << "CUDA Driver Version: "
<< driver_version/1000 << "." << driver_version%100
- << std::endl;
+ << std::endl;
#endif
if (num_devices() == 0)
diff --git a/lib/gpu/geryon/nvd_kernel.h b/lib/gpu/geryon/nvd_kernel.h
index e0bfb1bb5e..d03a715e1b 100644
--- a/lib/gpu/geryon/nvd_kernel.h
+++ b/lib/gpu/geryon/nvd_kernel.h
@@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
- certain rights in this software. This software is distributed under
+ certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@@ -35,15 +35,15 @@ template class UCL_D_Mat;
template class UCL_Vector;
template class UCL_Matrix;
#define UCL_MAX_KERNEL_ARGS 256
-
+
/// Class storing 1 or more kernel functions from a single string or file
class UCL_Program {
public:
inline UCL_Program(UCL_Device &device) { _cq=device.cq(); }
- inline UCL_Program(UCL_Device &device, const void *program,
- const char *flags="", std::string *log=NULL) {
+ inline UCL_Program(UCL_Device &device, const void *program,
+ const char *flags="", std::string *log=NULL) {
_cq=device.cq();
- init(device);
+ init(device);
load_string(program,flags,log);
}
@@ -61,20 +61,20 @@ class UCL_Program {
std::string *log=NULL) {
std::ifstream in(filename);
if (!in || in.is_open()==false) {
- #ifndef UCL_NO_EXIT
- std::cerr << "UCL Error: Could not open kernel file: "
+ #ifndef UCL_NO_EXIT
+ std::cerr << "UCL Error: Could not open kernel file: "
<< filename << std::endl;
UCL_GERYON_EXIT;
#endif
return UCL_FILE_NOT_FOUND;
}
-
+
std::string program((std::istreambuf_iterator(in)),
std::istreambuf_iterator());
in.close();
return load_string(program.c_str(),flags,log);
}
-
+
/// Load a program from a string and compile with flags
inline int load_string(const void *program, const char *flags="",
std::string *log=NULL) {
@@ -94,12 +94,12 @@ class UCL_Program {
CUresult err=cuModuleLoadDataEx(&_module,program,num_opts,
options,(void **)values);
-
+
if (log!=NULL)
*log=std::string(clog);
-
+
if (err != CUDA_SUCCESS) {
- #ifndef UCL_NO_EXIT
+ #ifndef UCL_NO_EXIT
std::cerr << std::endl
<< "----------------------------------------------------------\n"
<< " UCL Error: Error compiling PTX Program...\n"
@@ -108,24 +108,24 @@ class UCL_Program {
#endif
return UCL_COMPILE_ERROR;
}
-
+
return UCL_SUCCESS;
- }
-
+ }
+
/// Load a precompiled program from a file
inline int load_binary(const char *filename) {
CUmodule _module;
CUresult err = cuModuleLoad(&_module,filename);
if (err==301) {
- #ifndef UCL_NO_EXIT
- std::cerr << "UCL Error: Could not open binary kernel file: "
+ #ifndef UCL_NO_EXIT
+ std::cerr << "UCL Error: Could not open binary kernel file: "
<< filename << std::endl;
UCL_GERYON_EXIT;
#endif
return UCL_FILE_NOT_FOUND;
} else if (err!=CUDA_SUCCESS) {
- #ifndef UCL_NO_EXIT
- std::cerr << "UCL Error: Error loading binary kernel file: "
+ #ifndef UCL_NO_EXIT
+ std::cerr << "UCL Error: Error loading binary kernel file: "
<< filename << std::endl;
UCL_GERYON_EXIT;
#endif
@@ -138,7 +138,7 @@ class UCL_Program {
// return UCL_ERROR;
return UCL_SUCCESS;
}
-
+
friend class UCL_Kernel;
private:
CUmodule _module;
@@ -149,23 +149,23 @@ class UCL_Program {
/// Class for dealing with CUDA Driver kernels
class UCL_Kernel {
public:
- UCL_Kernel() : _dimensions(1), _num_args(0) {
+ UCL_Kernel() : _dimensions(1), _num_args(0) {
#if CUDA_VERSION < 4000
_param_size=0;
#endif
- _num_blocks[0]=0;
+ _num_blocks[0]=0;
}
-
- UCL_Kernel(UCL_Program &program, const char *function) :
+
+ UCL_Kernel(UCL_Program &program, const char *function) :
_dimensions(1), _num_args(0) {
#if CUDA_VERSION < 4000
_param_size=0;
#endif
- _num_blocks[0]=0;
- set_function(program,function);
- _cq=program._cq;
+ _num_blocks[0]=0;
+ set_function(program,function);
+ _cq=program._cq;
}
-
+
~UCL_Kernel() {}
/// Clear any function associated with the kernel
@@ -189,7 +189,7 @@ class UCL_Kernel {
/// Set the kernel argument.
/** If not a device pointer, this must be repeated each time the argument
- * changes
+ * changes
* \note To set kernel parameter i (i>0), parameter i-1 must be set **/
template
inline void set_arg(const unsigned index, const dtype * const arg) {
@@ -202,27 +202,27 @@ class UCL_Kernel {
CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], arg, sizeof(dtype)));
#endif
else
- assert(0==1); // Must add kernel parameters in sequential order
+ assert(0==1); // Must add kernel parameters in sequential order
}
-
+
/// Set a geryon container as a kernel argument.
template
- inline void set_arg(const UCL_D_Vec * const arg)
+ inline void set_arg(const UCL_D_Vec * const arg)
{ set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument.
template
- inline void set_arg(const UCL_D_Mat * const arg)
+ inline void set_arg(const UCL_D_Mat * const arg)
{ set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument.
template
- inline void set_arg(const UCL_Vector * const arg)
+ inline void set_arg(const UCL_Vector * const arg)
{ set_arg(&arg->device.begin()); }
/// Set a geryon container as a kernel argument.
template
- inline void set_arg(const UCL_Matrix * const arg)
+ inline void set_arg(const UCL_Matrix * const arg)
{ set_arg(&arg->device.begin()); }
/// Add a kernel argument.
@@ -257,37 +257,37 @@ class UCL_Kernel {
/// Add a geryon container as a kernel argument.
template
- inline void add_arg(const UCL_D_Vec * const arg)
+ inline void add_arg(const UCL_D_Vec * const arg)
{ add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument.
template
- inline void add_arg(const UCL_D_Mat * const arg)
+ inline void add_arg(const UCL_D_Mat * const arg)
{ add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument.
template
- inline void add_arg(const UCL_Vector * const arg)
+ inline void add_arg(const UCL_Vector * const arg)
{ add_arg(&arg->device.begin()); }
/// Add a geryon container as a kernel argument.
template
- inline void add_arg(const UCL_Matrix * const arg)
+ inline void add_arg(const UCL_Matrix * const arg)
{ add_arg(&arg->device.begin()); }
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
- inline void set_size(const size_t num_blocks, const size_t block_size) {
- _dimensions=1;
- _num_blocks[0]=num_blocks;
+ inline void set_size(const size_t num_blocks, const size_t block_size) {
+ _dimensions=1;
+ _num_blocks[0]=num_blocks;
_num_blocks[1]=1;
_num_blocks[2]=1;
#if CUDA_VERSION >= 4000
_block_size[0]=block_size;
_block_size[1]=1;
_block_size[2]=1;
- #else
+ #else
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1));
#endif
}
@@ -303,43 +303,43 @@ class UCL_Kernel {
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
- const size_t block_size_x, const size_t block_size_y) {
- _dimensions=2;
- _num_blocks[0]=num_blocks_x;
- _num_blocks[1]=num_blocks_y;
+ const size_t block_size_x, const size_t block_size_y) {
+ _dimensions=2;
+ _num_blocks[0]=num_blocks_x;
+ _num_blocks[1]=num_blocks_y;
_num_blocks[2]=1;
#if CUDA_VERSION >= 4000
_block_size[0]=block_size_x;
_block_size[1]=block_size_y;
_block_size[2]=1;
- #else
+ #else
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1));
#endif
}
-
+
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue for the kernel is changed to cq **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y,
- command_queue &cq)
+ command_queue &cq)
{_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
- const size_t block_size_x,
+ const size_t block_size_x,
const size_t block_size_y, const size_t block_size_z) {
- _dimensions=2;
- _num_blocks[0]=num_blocks_x;
- _num_blocks[1]=num_blocks_y;
- _num_blocks[2]=1;
+ _dimensions=2;
+ _num_blocks[0]=num_blocks_x;
+ _num_blocks[1]=num_blocks_y;
+ _num_blocks[2]=1;
#if CUDA_VERSION >= 4000
_block_size[0]=block_size_x;
_block_size[1]=block_size_y;
_block_size[2]=block_size_z;
- #else
+ #else
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,
block_size_z));
#endif
@@ -352,10 +352,10 @@ class UCL_Kernel {
const size_t block_size_x, const size_t block_size_y,
const size_t block_size_z, command_queue &cq) {
_cq=cq;
- set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
+ set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
block_size_z);
}
-
+
/// Run the kernel in the default command queue
inline void run() {
#if CUDA_VERSION >= 4000
@@ -367,12 +367,12 @@ class UCL_Kernel {
CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
#endif
}
-
+
/// Clear any arguments associated with the kernel
- inline void clear_args() {
- _num_args=0;
+ inline void clear_args() {
+ _num_args=0;
#if CUDA_VERSION < 4000
- _offsets.clear();
+ _offsets.clear();
_param_size=0;
#endif
}
@@ -390,7 +390,7 @@ class UCL_Kernel {
unsigned _num_blocks[3];
unsigned _num_args;
friend class UCL_Texture;
-
+
#if CUDA_VERSION >= 4000
unsigned _block_size[3];
void * _kernel_args[UCL_MAX_KERNEL_ARGS];
diff --git a/lib/gpu/geryon/nvd_mat.h b/lib/gpu/geryon/nvd_mat.h
index 51cfe1d56f..042e2978c3 100644
--- a/lib/gpu/geryon/nvd_mat.h
+++ b/lib/gpu/geryon/nvd_mat.h
@@ -17,12 +17,12 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
- certain rights in this software. This software is distributed under
+ certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
/*! \file */
-
+
#ifndef NVD_MAT_H
#define NVD_MAT_H
@@ -52,6 +52,6 @@ namespace ucl_cudadr {
#include "ucl_print.h"
#undef UCL_PRINT_ALLOW
-} // namespace ucl_cudadr
+} // namespace ucl_cudadr
#endif
diff --git a/lib/gpu/geryon/nvd_memory.h b/lib/gpu/geryon/nvd_memory.h
index 5f7b98ba5c..0484e33de6 100644
--- a/lib/gpu/geryon/nvd_memory.h
+++ b/lib/gpu/geryon/nvd_memory.h
@@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
- certain rights in this software. This software is distributed under
+ certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@@ -46,7 +46,7 @@ typedef CUdeviceptr device_ptr;
// - HOST MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template
-inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
+inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
CUresult err=CUDA_SUCCESS;
if (kind==UCL_NOT_PINNED)
@@ -62,7 +62,7 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
}
template
-inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
+inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
CUresult err=CUDA_SUCCESS;
if (kind==UCL_NOT_PINNED)
@@ -95,7 +95,7 @@ inline int _host_resize(mat_type &mat, const size_t n) {
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
else if (mat.kind()==UCL_WRITE_ONLY)
err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
- else
+ else
err=cuMemAllocHost((void **)mat.host_ptr(),n);
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
@@ -130,30 +130,30 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
const size_t cols, size_t &pitch,
const enum UCL_MEMOPT kind) {
CUresult err;
- CUDA_INT_TYPE upitch;
+ CUDA_INT_TYPE upitch;
err=cuMemAllocPitch(&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows,16);
- pitch=static_cast(upitch);
+ pitch=static_cast(upitch);
if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR;
mat.cq()=cm.cq();
return UCL_SUCCESS;
-}
+}
template
inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
const size_t cols, size_t &pitch,
const enum UCL_MEMOPT kind) {
CUresult err;
- unsigned upitch;
+ unsigned upitch;
err=cuMemAllocPitch(&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows,16);
- pitch=static_cast(upitch);
+ pitch=static_cast(upitch);
if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR;
mat.cq()=d.cq();
return UCL_SUCCESS;
-}
+}
template
inline void _device_free(mat_type &mat) {
@@ -175,33 +175,33 @@ inline int _device_resize(mat_type &mat, const size_t rows,
const size_t cols, size_t &pitch) {
_device_free(mat);
CUresult err;
- CUDA_INT_TYPE upitch;
+ CUDA_INT_TYPE upitch;
err=cuMemAllocPitch(&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows,16);
- pitch=static_cast(upitch);
+ pitch=static_cast(upitch);
if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
-}
+}
-inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) {
+inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) {
*ptr=in;
}
template
-inline void _device_view(CUdeviceptr *ptr, numtyp *in) {
- *ptr=0;
+inline void _device_view(CUdeviceptr *ptr, numtyp *in) {
+ *ptr=0;
}
-inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in,
- const size_t offset, const size_t numsize) {
+inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in,
+ const size_t offset, const size_t numsize) {
*ptr=in+offset*numsize;
}
template
inline void _device_view(CUdeviceptr *ptr, numtyp *in,
- const size_t offset, const size_t numsize) {
- *ptr=0;
+ const size_t offset, const size_t numsize) {
+ *ptr=0;
}
// --------------------------------------------------------------------------
@@ -211,13 +211,13 @@ template
inline void _device_image_alloc(mat_type &mat, copy_type &cm, const size_t rows,
const size_t cols) {
assert(0==1);
-}
+}
template
inline void _device_image_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
const size_t cols) {
assert(0==1);
-}
+}
template
inline void _device_image_free(mat_type &mat) {
@@ -245,7 +245,7 @@ inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
// - HELPER FUNCTIONS FOR MEMCPY ROUTINES
// --------------------------------------------------------------------------
-inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch,
+inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch,
const size_t spitch, const size_t cols,
const size_t rows) {
ins.srcXInBytes=0;
@@ -257,13 +257,13 @@ inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch,
ins.WidthInBytes=cols;
ins.Height=rows;
}
-
+
template struct _nvd_set_2D_mem;
-template <> struct _nvd_set_2D_mem<1>
+template <> struct _nvd_set_2D_mem<1>
{ static CUmemorytype a() { return CU_MEMORYTYPE_HOST; } };
-template <> struct _nvd_set_2D_mem<2>
+template <> struct _nvd_set_2D_mem<2>
{ static CUmemorytype a() { return CU_MEMORYTYPE_ARRAY; } };
-template struct _nvd_set_2D_mem
+template struct _nvd_set_2D_mem
{ static CUmemorytype a() { return CU_MEMORYTYPE_DEVICE; } };
@@ -285,7 +285,7 @@ template<> struct _ucl_memcpy<2,2> {
assert(0==1);
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
@@ -297,7 +297,7 @@ template<> struct _ucl_memcpy<2,2> {
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
@@ -322,7 +322,7 @@ template<> struct _ucl_memcpy<2,0> {
assert(0==1);
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
@@ -334,7 +334,7 @@ template<> struct _ucl_memcpy<2,0> {
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
@@ -359,7 +359,7 @@ template<> struct _ucl_memcpy<2,1> {
assert(0==1);
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
@@ -371,7 +371,7 @@ template<> struct _ucl_memcpy<2,1> {
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
@@ -396,7 +396,7 @@ template<> struct _ucl_memcpy<0,2> {
assert(0==1);
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
@@ -408,7 +408,7 @@ template<> struct _ucl_memcpy<0,2> {
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
@@ -433,7 +433,7 @@ template<> struct _ucl_memcpy<1,2> {
assert(0==1);
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
@@ -445,7 +445,7 @@ template<> struct _ucl_memcpy<1,2> {
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
@@ -470,7 +470,7 @@ template <> struct _ucl_memcpy<1,0> {
CU_SAFE_CALL(cuMemcpyDtoHAsync(dst.begin(),src.cbegin(),n,cq));
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
@@ -482,7 +482,7 @@ template <> struct _ucl_memcpy<1,0> {
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
@@ -507,7 +507,7 @@ template <> struct _ucl_memcpy<0,1> {
CU_SAFE_CALL(cuMemcpyHtoDAsync(dst.cbegin(),src.begin(),n,cq));
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
@@ -519,7 +519,7 @@ template <> struct _ucl_memcpy<0,1> {
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
@@ -542,7 +542,7 @@ template <> struct _ucl_memcpy<1,1> {
CUstream &cq)
{ memcpy(dst.begin(),src.begin(),n); }
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
@@ -554,7 +554,7 @@ template <> struct _ucl_memcpy<1,1> {
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
@@ -579,18 +579,18 @@ template struct _ucl_memcpy {
CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin(),src.cbegin(),n,cq));
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
if (p1::PADDED==0 || p2::PADDED==0) {
size_t src_offset=0, dst_offset=0;
- for (size_t i=0; i::a();
@@ -601,12 +601,12 @@ template struct _ucl_memcpy {
}
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
if (p1::PADDED==0 || p2::PADDED==0) {
size_t src_offset=0, dst_offset=0;
- for (size_t i=0; i
-inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
- const size_t spitch, const size_t cols,
+inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
+ const size_t spitch, const size_t cols,
const size_t rows) {
_ucl_memcpy::mc(dst,dpitch,src,spitch,cols,
rows);
}
template
-inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
- const size_t spitch, const size_t cols,
+inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
+ const size_t spitch, const size_t cols,
const size_t rows,CUstream &cq) {
_ucl_memcpy::mc(dst,dpitch,src,spitch,cols,
rows,cq);
}
-} // namespace ucl_cudart
+} // namespace ucl_cudart
#endif
diff --git a/lib/gpu/geryon/nvd_texture.h b/lib/gpu/geryon/nvd_texture.h
index 07650263a5..965595a448 100644
--- a/lib/gpu/geryon/nvd_texture.h
+++ b/lib/gpu/geryon/nvd_texture.h
@@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
- certain rights in this software. This software is distributed under
+ certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@@ -28,7 +28,7 @@
#include "nvd_mat.h"
namespace ucl_cudadr {
-
+
/// Class storing a texture reference
class UCL_Texture {
public:
@@ -38,39 +38,39 @@ class UCL_Texture {
inline UCL_Texture(UCL_Program &prog, const char *texture_name)
{ get_texture(prog,texture_name); }
/// Set the texture reference for this object
- inline void get_texture(UCL_Program &prog, const char *texture_name)
+ inline void get_texture(UCL_Program &prog, const char *texture_name)
{ CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); }
/// Bind a float array where each fetch grabs a vector of length numel
template
- inline void bind_float(UCL_D_Vec &vec, const unsigned numel)
+ inline void bind_float(UCL_D_Vec &vec, const unsigned numel)
{ _bind_float(vec,numel); }
/// Bind a float array where each fetch grabs a vector of length numel
template
- inline void bind_float(UCL_D_Mat &vec, const unsigned numel)
+ inline void bind_float(UCL_D_Mat &vec, const unsigned numel)
{ _bind_float(vec,numel); }
/// Bind a float array where each fetch grabs a vector of length numel
template
- inline void bind_float(UCL_Vector &vec, const unsigned numel)
+ inline void bind_float(UCL_Vector &vec, const unsigned numel)
{ _bind_float(vec.device,numel); }
/// Bind a float array where each fetch grabs a vector of length numel
template
- inline void bind_float(UCL_Matrix &vec, const unsigned numel)
+ inline void bind_float(UCL_Matrix &vec, const unsigned numel)
{ _bind_float(vec.device,numel); }
/// Unbind the texture reference from the memory allocation
inline void unbind() { }
- /// Make a texture reference available to kernel
- inline void allow(UCL_Kernel &kernel) {
+ /// Make a texture reference available to kernel
+ inline void allow(UCL_Kernel &kernel) {
#if CUDA_VERSION < 4000
- CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
+ CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
#endif
}
-
+
private:
CUtexref _tex;
friend class UCL_Kernel;
@@ -80,7 +80,7 @@ class UCL_Texture {
#ifdef UCL_DEBUG
assert(numel!=0 && numel<5);
#endif
- CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(),
+ CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(),
vec.numel()*vec.element_size()));
if (vec.element_size()==sizeof(float))
CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_FLOAT, numel));
diff --git a/lib/gpu/geryon/nvd_timer.h b/lib/gpu/geryon/nvd_timer.h
index 4c3e993e0d..aefbaea0c3 100644
--- a/lib/gpu/geryon/nvd_timer.h
+++ b/lib/gpu/geryon/nvd_timer.h
@@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
- certain rights in this software. This software is distributed under
+ certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@@ -41,7 +41,7 @@ class UCL_Timer {
/// Clear any data associated with timer
/** \note init() must be called to reuse timer after a clear() **/
inline void clear() {
- if (_initialized) {
+ if (_initialized) {
CU_DESTRUCT_CALL(cuEventDestroy(start_event));
CU_DESTRUCT_CALL(cuEventDestroy(stop_event));
_initialized=false;
@@ -63,16 +63,16 @@ class UCL_Timer {
/// Start timing on command queue
inline void start() { CU_SAFE_CALL(cuEventRecord(start_event,_cq)); }
-
+
/// Stop timing on command queue
inline void stop() { CU_SAFE_CALL(cuEventRecord(stop_event,_cq)); }
-
+
/// Block until the start event has been reached on device
- inline void sync_start()
+ inline void sync_start()
{ CU_SAFE_CALL(cuEventSynchronize(start_event)); }
/// Block until the stop event has been reached on device
- inline void sync_stop()
+ inline void sync_stop()
{ CU_SAFE_CALL(cuEventSynchronize(stop_event)); }
/// Set the time elapsed to zero (not the total_time)
@@ -80,29 +80,29 @@ class UCL_Timer {
CU_SAFE_CALL(cuEventRecord(start_event,_cq));
CU_SAFE_CALL(cuEventRecord(stop_event,_cq));
}
-
+
/// Set the total time to zero
inline void zero_total() { _total_time=0.0; }
-
+
/// Add time from previous start and stop to total
/** Forces synchronization **/
- inline double add_to_total()
+ inline double add_to_total()
{ double t=time(); _total_time+=t; return t/1000.0; }
-
+
/// Add a user specified time to the total (ms)
inline void add_time_to_total(const double t) { _total_time+=t; }
-
+
/// Return the time (ms) of last start to stop - Forces synchronization
- inline double time() {
+ inline double time() {
float timer;
CU_SAFE_CALL(cuEventSynchronize(stop_event));
CU_SAFE_CALL( cuEventElapsedTime(&timer,start_event,stop_event) );
- return timer;
+ return timer;
}
-
+
/// Return the time (s) of last start to stop - Forces synchronization
inline double seconds() { return time()/1000.0; }
-
+
/// Return the total time in ms
inline double total_time() { return _total_time; }
diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h
index 8dadcf2efd..20656c8489 100644
--- a/lib/gpu/geryon/ocl_device.h
+++ b/lib/gpu/geryon/ocl_device.h
@@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
- certain rights in this software. This software is distributed under
+ certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@@ -40,13 +40,13 @@
#include "ucl_types.h"
namespace ucl_opencl {
-
+
// --------------------------------------------------------------------------
// - COMMAND QUEUE STUFF
// --------------------------------------------------------------------------
-typedef cl_command_queue command_queue;
+typedef cl_command_queue command_queue;
typedef cl_context context_type;
-
+
inline void ucl_sync(cl_command_queue &cq) {
CL_SAFE_CALL(clFinish(cq));
}
@@ -76,19 +76,19 @@ struct OCLProperties {
/// Class for looking at data parallel device properties
/** \note Calls to change the device outside of the class results in incorrect
- * behavior
+ * behavior
* \note There is no error checking for indexing past the number of devices **/
class UCL_Device {
public:
/// Collect properties for every device on the node
/** \note You must set the active GPU with set() before using the device **/
inline UCL_Device();
-
+
inline ~UCL_Device();
/// Return the number of platforms (0 if error or no platforms)
inline int num_platforms() { return _num_platforms; }
-
+
/// Return a string with name and info of the current platform
inline std::string platform_name();
@@ -104,38 +104,38 @@ class UCL_Device {
* be allocated for use. clear() is called to delete any contexts and
* associated data from previous calls to set(). **/
inline int set(int num);
-
+
/// Delete any context and associated data stored from a call to set()
inline void clear();
/// Get the current device number
inline int device_num() { return _device; }
-
+
/// Returns the context for the current device
inline cl_context & context() { return _context; }
-
+
/// Returns the default stream for the current device
inline command_queue & cq() { return cq(_default_cq); }
-
+
/// Returns the stream indexed by i
inline command_queue & cq(const int i) { return _cq[i]; }
-
+
/// Set the default command queue
- /** \param i index of the command queue (as added by push_command_queue())
+ /** \param i index of the command queue (as added by push_command_queue())
If i is 0, the command queue created with device initialization is
used **/
inline void set_command_queue(const int i) { _default_cq=i; }
-
+
/// Block until all commands in the default stream have completed
inline void sync() { sync(_default_cq); }
-
+
/// Block until all commands in the specified stream have completed
inline void sync(const int i) { ucl_sync(cq(i)); }
-
+
/// Get the number of command queues currently available on device
- inline int num_queues()
+ inline int num_queues()
{ return _cq.size(); }
-
+
/// Add a command queue for device computations (with profiling enabled)
inline void push_command_queue() {
cl_int errorv;
@@ -143,7 +143,7 @@ class UCL_Device {
_cq.back()=clCreateCommandQueue(_context,_cl_device,
CL_QUEUE_PROFILING_ENABLE,&errorv);
if (errorv!=CL_SUCCESS) {
- std::cerr << "Could not create command queue on device: " << name()
+ std::cerr << "Could not create command queue on device: " << name()
<< std::endl;
UCL_GERYON_EXIT;
}
@@ -160,76 +160,76 @@ class UCL_Device {
/// Get the current OpenCL device name
inline std::string name() { return name(_device); }
/// Get the OpenCL device name
- inline std::string name(const int i)
+ inline std::string name(const int i)
{ return std::string(_properties[i].name); }
/// Get a string telling the type of the current device
inline std::string device_type_name() { return device_type_name(_device); }
/// Get a string telling the type of the device
inline std::string device_type_name(const int i);
-
+
/// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type() { return device_type(_device); }
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type(const int i);
-
+
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory() { return shared_memory(_device); }
/// Returns true if host memory is efficiently addressable from device
- inline bool shared_memory(const int i)
+ inline bool shared_memory(const int i)
{ return _shared_mem_device(_properties[i].device_type); }
-
+
/// Returns true if double precision is support for the current device
inline bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is support for the device
- inline bool double_precision(const int i)
+ inline bool double_precision(const int i)
{return _properties[i].double_precision;}
-
+
/// Get the number of compute units on the current device
inline unsigned cus() { return cus(_device); }
/// Get the number of compute units
- inline unsigned cus(const int i)
+ inline unsigned cus(const int i)
{ return _properties[i].compute_units; }
/// Get the gigabytes of global memory in the current device
inline double gigabytes() { return gigabytes(_device); }
/// Get the gigabytes of global memory
- inline double gigabytes(const int i)
+ inline double gigabytes(const int i)
{ return static_cast(_properties[i].global_mem)/1073741824; }
/// Get the bytes of global memory in the current device
inline size_t bytes() { return bytes(_device); }
/// Get the bytes of global memory
inline size_t bytes(const int i) { return _properties[i].global_mem; }
-
+
/// Return the GPGPU revision number for current device
//inline double revision() { return revision(_device); }
/// Return the GPGPU revision number
- //inline double revision(const int i)
+ //inline double revision(const int i)
// { return //static_cast(_properties[i].minor)/10+_properties[i].major;}
-
+
/// Clock rate in GHz for current device
inline double clock_rate() { return clock_rate(_device); }
/// Clock rate in GHz
inline double clock_rate(const int i) { return _properties[i].clock*1e-3;}
-
+
/// Return the address alignment in bytes
inline int alignment() { return alignment(_device); }
/// Return the address alignment in bytes
inline int alignment(const int i) { return _properties[i].alignment; }
-
+
/// Return the timer resolution
inline size_t timer_resolution() { return timer_resolution(_device); }
/// Return the timer resolution
- inline size_t timer_resolution(const int i)
+ inline size_t timer_resolution(const int i)
{ return _properties[i].timer_resolution; }
-
+
/// Get the maximum number of threads per block
inline size_t group_size() { return group_size(_device); }
/// Get the maximum number of threads per block
- inline size_t group_size(const int i)
+ inline size_t group_size(const int i)
{ return _properties[i].work_group_size; }
-
+
/// Return the maximum memory pitch in bytes for current device
inline size_t max_pitch() { return max_pitch(_device); }
/// Return the maximum memory pitch in bytes
@@ -254,7 +254,7 @@ class UCL_Device {
{ return fission_by_counts(_device); }
/// True if splitting device into subdevices by specified counts supported
inline bool fission_by_counts(const int i)
- { return _properties[i].partition_counts; }
+ { return _properties[i].partition_counts; }
/// True if splitting device into subdevices by affinity domains supported
inline bool fission_by_affinity()
{ return fission_by_affinity(_device); }
@@ -271,10 +271,10 @@ class UCL_Device {
/// List all devices along with all properties
inline void print_all(std::ostream &out);
-
+
/// Return the OpenCL type for the device
inline cl_device_id & cl_device() { return _cl_device; }
-
+
private:
int _num_platforms; // Number of platforms
int _platform; // UCL_Device ID for current platform
@@ -287,7 +287,7 @@ class UCL_Device {
std::vector _cl_devices; // OpenCL IDs for all devices
int _num_devices; // Number of devices
std::vector _properties; // Properties for each device
-
+
inline void add_properties(cl_device_id);
inline int create_context();
int _default_cq;
@@ -300,7 +300,7 @@ UCL_Device::UCL_Device() {
// --- Get Number of Platforms
cl_uint nplatforms;
cl_int errorv=clGetPlatformIDs(20,_cl_platforms,&nplatforms);
-
+
if (errorv!=CL_SUCCESS) {
_num_platforms=0;
return;
@@ -328,18 +328,18 @@ void UCL_Device::clear() {
int UCL_Device::set_platform(int pid) {
clear();
cl_int errorv;
-
+
_cl_device=0;
_device=-1;
_num_devices=0;
_default_cq=0;
-
+
#ifdef UCL_DEBUG
assert(pid
namespace ucl_opencl {
-
+
class UCL_Texture;
template class UCL_D_Vec;
template class UCL_D_Mat;
@@ -41,10 +41,10 @@ class UCL_Program {
public:
inline UCL_Program() : _init_done(false) {}
inline UCL_Program(UCL_Device &device) : _init_done(false) { init(device); }
- inline UCL_Program(UCL_Device &device, const void *program,
- const char *flags="", std::string *log=NULL) :
- _init_done(false) {
- init(device);
+ inline UCL_Program(UCL_Device &device, const void *program,
+ const char *flags="", std::string *log=NULL) :
+ _init_done(false) {
+ init(device);
load_string(program,flags,log);
}
@@ -56,7 +56,7 @@ class UCL_Program {
_device=device.cl_device();
_context=device.context();
_cq=device.cq();
- CL_SAFE_CALL(clRetainContext(_context));
+ CL_SAFE_CALL(clRetainContext(_context));
CL_SAFE_CALL(clRetainCommandQueue(_cq));
_init_done=true;
}
@@ -65,7 +65,7 @@ class UCL_Program {
/** \note Must call init() after each clear **/
inline void clear() {
if (_init_done) {
- CL_DESTRUCT_CALL(clReleaseProgram(_program));
+ CL_DESTRUCT_CALL(clReleaseProgram(_program));
CL_DESTRUCT_CALL(clReleaseContext(_context));
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
_init_done=false;
@@ -77,20 +77,20 @@ class UCL_Program {
std::string *log=NULL) {
std::ifstream in(filename);
if (!in || in.is_open()==false) {
- #ifndef UCL_NO_EXIT
- std::cerr << "UCL Error: Could not open kernel file: "
+ #ifndef UCL_NO_EXIT
+ std::cerr << "UCL Error: Could not open kernel file: "
<< filename << std::endl;
UCL_GERYON_EXIT;
#endif
return UCL_FILE_NOT_FOUND;
}
-
+
std::string program((std::istreambuf_iterator(in)),
std::istreambuf_iterator());
in.close();
return load_string(program.c_str(),flags,log);
}
-
+
/// Load a program from a string and compile with flags
inline int load_string(const void *program, const char *flags="",
std::string *log=NULL) {
@@ -103,23 +103,23 @@ class UCL_Program {
CL_CHECK_ERR(error_flag);
cl_build_status build_status;
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,
- CL_PROGRAM_BUILD_STATUS,
+ CL_PROGRAM_BUILD_STATUS,
sizeof(cl_build_status),&build_status,
NULL));
-
+
if (build_status != CL_SUCCESS || log!=NULL) {
size_t ms;
- CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,0,
+ CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,0,
NULL, &ms));
- char build_log[ms];
+ char build_log[ms];
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,ms,
build_log, NULL));
-
+
if (log!=NULL)
*log=std::string(build_log);
-
+
if (build_status != CL_SUCCESS) {
- #ifndef UCL_NO_EXIT
+ #ifndef UCL_NO_EXIT
std::cerr << std::endl
<< "----------------------------------------------------------\n"
<< " UCL Error: Error compiling OpenCL Program ("
@@ -130,10 +130,10 @@ class UCL_Program {
return UCL_COMPILE_ERROR;
}
}
-
+
return UCL_SUCCESS;
}
-
+
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return _cq; }
/// Change the default command queue associated with matrix
@@ -143,7 +143,7 @@ class UCL_Program {
private:
bool _init_done;
cl_program _program;
- cl_device_id _device;
+ cl_device_id _device;
cl_context _context;
cl_command_queue _cq;
};
@@ -153,7 +153,7 @@ class UCL_Kernel {
public:
UCL_Kernel() : _dimensions(1), _function_set(false), _num_args(0)
{ _block_size[0]=0; _num_blocks[0]=0; }
-
+
inline UCL_Kernel(UCL_Program &program, const char *function) :
_dimensions(1), _function_set(false), _num_args(0)
{ _block_size[0]=0; _num_blocks[0]=0; set_function(program,function); }
@@ -178,48 +178,48 @@ class UCL_Kernel {
/** If not a device pointer, this must be repeated each time the argument
* changes **/
template
- inline void set_arg(const cl_uint index, const dtype * const arg) {
- CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg));
+ inline void set_arg(const cl_uint index, const dtype * const arg) {
+ CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg));
if (index>_num_args) {
_num_args=index;
#ifdef UCL_DEBUG
if (_num_args>_kernel_info_nargs) {
- std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
+ std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
<< _kernel_info_name << std::endl;
assert(0==1);
}
#endif
}
}
-
+
/// Set a geryon container as a kernel argument.
template
- inline void set_arg(const UCL_D_Vec * const arg)
+ inline void set_arg(const UCL_D_Vec * const arg)
{ set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument.
template
- inline void set_arg(const UCL_D_Mat * const arg)
+ inline void set_arg(const UCL_D_Mat * const arg)
{ set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument.
template
- inline void set_arg(const UCL_Vector * const arg)
+ inline void set_arg(const UCL_Vector * const arg)
{ set_arg(&arg->device.begin()); }
/// Set a geryon container as a kernel argument.
template
- inline void set_arg(const UCL_Matrix * const arg)
+ inline void set_arg(const UCL_Matrix * const arg)
{ set_arg(&arg->device.begin()); }
/// Add a kernel argument.
template
inline void add_arg(const dtype * const arg) {
- CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg));
- _num_args++;
+ CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg));
+ _num_args++;
#ifdef UCL_DEBUG
if (_num_args>_kernel_info_nargs) {
- std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
+ std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
<< _kernel_info_name << std::endl;
assert(0==1);
}
@@ -228,31 +228,31 @@ class UCL_Kernel {
/// Add a geryon container as a kernel argument.
template
- inline void add_arg(const UCL_D_Vec * const arg)
+ inline void add_arg(const UCL_D_Vec * const arg)
{ add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument.
template
- inline void add_arg(const UCL_D_Mat * const arg)
+ inline void add_arg(const UCL_D_Mat * const arg)
{ add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument.
template
- inline void add_arg(const UCL_Vector * const arg)
+ inline void add_arg(const UCL_Vector * const arg)
{ add_arg(&arg->device.begin()); }
/// Add a geryon container as a kernel argument.
template
- inline void add_arg(const UCL_Matrix * const arg)
+ inline void add_arg(const UCL_Matrix * const arg)
{ add_arg(&arg->device.begin()); }
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
- inline void set_size(const size_t num_blocks, const size_t block_size) {
- _dimensions=1;
- _num_blocks[0]=num_blocks*block_size;
- _block_size[0]=block_size;
+ inline void set_size(const size_t num_blocks, const size_t block_size) {
+ _dimensions=1;
+ _num_blocks[0]=num_blocks*block_size;
+ _block_size[0]=block_size;
}
/// Set the number of thread blocks and the number of threads in each block
@@ -266,36 +266,36 @@ class UCL_Kernel {
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
- const size_t block_size_x, const size_t block_size_y) {
- _dimensions=2;
- _num_blocks[0]=num_blocks_x*block_size_x;
- _block_size[0]=block_size_x;
- _num_blocks[1]=num_blocks_y*block_size_y;
- _block_size[1]=block_size_y;
+ const size_t block_size_x, const size_t block_size_y) {
+ _dimensions=2;
+ _num_blocks[0]=num_blocks_x*block_size_x;
+ _block_size[0]=block_size_x;
+ _num_blocks[1]=num_blocks_y*block_size_y;
+ _block_size[1]=block_size_y;
}
-
+
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue for the kernel is changed to cq **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y,
- command_queue &cq)
+ command_queue &cq)
{_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
- const size_t block_size_x,
+ const size_t block_size_x,
const size_t block_size_y, const size_t block_size_z) {
- _dimensions=3;
+ _dimensions=3;
const size_t num_blocks_z=1;
- _num_blocks[0]=num_blocks_x*block_size_x;
- _block_size[0]=block_size_x;
- _num_blocks[1]=num_blocks_y*block_size_y;
- _block_size[1]=block_size_y;
- _num_blocks[2]=num_blocks_z*block_size_z;
- _block_size[2]=block_size_z;
+ _num_blocks[0]=num_blocks_x*block_size_x;
+ _block_size[0]=block_size_x;
+ _num_blocks[1]=num_blocks_y*block_size_y;
+ _block_size[1]=block_size_y;
+ _num_blocks[2]=num_blocks_z*block_size_z;
+ _block_size[2]=block_size_z;
}
/// Set the number of thread blocks and the number of threads in each block
@@ -305,13 +305,13 @@ class UCL_Kernel {
const size_t block_size_x, const size_t block_size_y,
const size_t block_size_z, command_queue &cq) {
_cq=cq;
- set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
+ set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
block_size_z);
}
-
+
/// Run the kernel in the default command queue
inline void run();
-
+
/// Clear any arguments associated with the kernel
inline void clear_args() { _num_args=0; }
@@ -320,7 +320,7 @@ class UCL_Kernel {
/// Change the default command queue associated with matrix
inline void cq(command_queue &cq_in) { _cq=cq_in; }
#include "ucl_arg_kludge.h"
-
+
private:
cl_kernel _kernel;
cl_program _program;
@@ -328,7 +328,7 @@ class UCL_Kernel {
size_t _block_size[3];
size_t _num_blocks[3];
bool _function_set;
-
+
cl_command_queue _cq; // The default command queue for this kernel
unsigned _num_args;
@@ -348,7 +348,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function)
CL_SAFE_CALL(clRetainProgram(_program));
cl_int error_flag;
_kernel=clCreateKernel(program._program,function,&error_flag);
-
+
if (error_flag!=CL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not find function: " << function
@@ -357,7 +357,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function)
#endif
return UCL_FUNCTION_NOT_FOUND;
}
-
+
#ifdef UCL_DEBUG
_kernel_info_name=function;
cl_uint nargs;
@@ -375,7 +375,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function)
#endif
#endif
- return UCL_SUCCESS;
+ return UCL_SUCCESS;
}
void UCL_Kernel::run() {
diff --git a/lib/gpu/geryon/ocl_mat.h b/lib/gpu/geryon/ocl_mat.h
index 2909d72a72..3135594dc3 100644
--- a/lib/gpu/geryon/ocl_mat.h
+++ b/lib/gpu/geryon/ocl_mat.h
@@ -17,12 +17,12 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
- certain rights in this software. This software is distributed under
+ certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
/*! \file */
-
+
#ifndef OCL_MAT_H
#define OCL_MAT_H
@@ -54,6 +54,6 @@ namespace ucl_opencl {
#include "ucl_print.h"
#undef UCL_PRINT_ALLOW
-} // namespace ucl_cudart
+} // namespace ucl_cudart
#endif
diff --git a/lib/gpu/geryon/ocl_memory.h b/lib/gpu/geryon/ocl_memory.h
index 7aed0a1a8c..28bb88941f 100644
--- a/lib/gpu/geryon/ocl_memory.h
+++ b/lib/gpu/geryon/ocl_memory.h
@@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
- certain rights in this software. This software is distributed under
+ certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@@ -36,10 +36,10 @@ namespace ucl_opencl {
// --------------------------------------------------------------------------
struct ocl_kernel_dim {
size_t x,y,z;
- ocl_kernel_dim(size_t _x = 1, size_t _y = 1, size_t _z = 1) :
+ ocl_kernel_dim(size_t _x = 1, size_t _y = 1, size_t _z = 1) :
x(_x), y(_y), z(_z) {}
operator size_t * () { return (size_t *)this; }
- operator const size_t * () const { return (const size_t *)this; }
+ operator const size_t * () const { return (const size_t *)this; }
};
typedef ocl_kernel_dim ucl_kernel_dim;
@@ -53,13 +53,13 @@ typedef cl_mem device_ptr;
// --------------------------------------------------------------------------
template
-inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
+inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
cl_int error_flag;
cl_context context;
CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
&context,NULL));
-
+
cl_mem_flags buffer_perm;
cl_map_flags map_perm;
if (kind2==UCL_NOT_SPECIFIED) {
@@ -88,7 +88,7 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
else
buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
-
+
if (kind==UCL_READ_ONLY) {
#ifdef CL_VERSION_1_2
buffer_perm=buffer_perm | CL_MEM_HOST_READ_ONLY;
@@ -102,9 +102,9 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
} else
map_perm=CL_MAP_READ | CL_MAP_WRITE;
}
-
+
mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag);
- if (error_flag != CL_SUCCESS)
+ if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE,
@@ -125,7 +125,7 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) {
CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_FLAGS,sizeof(orig_flags),
&orig_flags,NULL));
orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR;
-
+
mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n,
*mat.host_ptr(), &error_flag);
@@ -135,7 +135,7 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) {
}
template
-inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
+inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
cl_mem_flags buffer_perm;
cl_map_flags map_perm;
@@ -160,7 +160,7 @@ inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
cl_int error_flag;
mat.cbegin()=clCreateBuffer(dev.context(),buffer_perm,n,NULL,&error_flag);
- if (error_flag != CL_SUCCESS)
+ if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
@@ -210,7 +210,7 @@ inline int _host_resize(mat_type &mat, const size_t n) {
map_perm=CL_MAP_READ | CL_MAP_WRITE;
mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag);
- if (error_flag != CL_SUCCESS)
+ if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
@@ -248,7 +248,7 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
else
assert(0==1);
mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
- if (error_flag != CL_SUCCESS)
+ if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
mat.cq()=cm.cq();
CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
@@ -278,7 +278,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
assert(0==1);
mat.cbegin()=clCreateBuffer(dev.context(),flag,n,NULL,
&error_flag);
- if (error_flag != CL_SUCCESS)
+ if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
mat.cq()=dev.cq();
CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
@@ -304,7 +304,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t rows,
if (dev.device_type()!=UCL_CPU && cols%256!=0)
padded_cols+=256-cols%256;
pitch=padded_cols*sizeof(typename mat_type::data_type);
- return _device_alloc(mat,dev,pitch*rows,kind);
+ return _device_alloc(mat,dev,pitch*rows,kind);
}
template
@@ -342,7 +342,7 @@ inline int _device_resize(mat_type &mat, const size_t n) {
else
assert(0==1);
mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
- if (error_flag != CL_SUCCESS)
+ if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
@@ -380,7 +380,7 @@ inline int _device_resize(mat_type &mat, const size_t rows,
else
assert(0==1);
mat.cbegin()=clCreateBuffer(context,flag,pitch*rows,NULL,&error_flag);
- if (error_flag != CL_SUCCESS)
+ if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
@@ -396,21 +396,21 @@ inline void _host_zero(void *ptr, const size_t n) {
inline void _ocl_build(cl_program &program, cl_device_id &device,
const char* options = "") {
clBuildProgram(program,1,&device,options,NULL,NULL);
-
+
cl_build_status build_status;
- CL_SAFE_CALL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS,
+ CL_SAFE_CALL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS,
sizeof(cl_build_status),&build_status,
NULL));
if (build_status == CL_SUCCESS)
return;
-
+
size_t ms;
- CL_SAFE_CALL(clGetProgramBuildInfo(program, device,CL_PROGRAM_BUILD_LOG, 0,
+ CL_SAFE_CALL(clGetProgramBuildInfo(program, device,CL_PROGRAM_BUILD_LOG, 0,
NULL, &ms));
- char build_log[ms];
+ char build_log[ms];
CL_SAFE_CALL(clGetProgramBuildInfo(program,device,CL_PROGRAM_BUILD_LOG,ms,
build_log, NULL));
-
+
std::cerr << std::endl
<< "----------------------------------------------------------\n"
<< " Error compiling OpenCL Program...\n"
@@ -423,13 +423,13 @@ inline void _ocl_kernel_from_source(cl_context &context, cl_device_id &device,
cl_kernel &kernel, const char *function,
const char *options="") {
cl_int error_flag;
-
+
cl_program program=clCreateProgramWithSource(context,lines,source,
NULL,&error_flag);
- CL_CHECK_ERR(error_flag);
+ CL_CHECK_ERR(error_flag);
_ocl_build(program,device,options);
kernel=clCreateKernel(program,function,&error_flag);
- CL_CHECK_ERR(error_flag);
+ CL_CHECK_ERR(error_flag);
}
template
@@ -452,17 +452,17 @@ inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
cl_device_id device;
CL_SAFE_CALL(clGetContextInfo(context,CL_CONTEXT_DEVICES,
sizeof(cl_device_id),&device,NULL));
-
+
const char * szero[3]={
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
"__kernel void _device_zero(__global NUMTYP *a, const int offset)",
" { int gid=get_global_id(0)+offset; a[gid]=(NUMTYP)0; }"
};
-
+
cl_kernel kzero;
_ocl_kernel_from_source(context,device,szero,3,kzero,"_device_zero",
_UCL_DATA_ID::numtyp_flag());
-
+
cl_int offset=mat.offset();
CL_SAFE_CALL(clSetKernelArg(kzero,0,sizeof(cl_mem),(void *)&mat.begin()));
CL_SAFE_CALL(clSetKernelArg(kzero,1,sizeof(cl_int),(void *)&offset));
@@ -486,7 +486,7 @@ template<> struct _ucl_memcpy<2,2> {
assert(0==1);
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, cl_command_queue &cq,
const cl_bool block,
@@ -504,7 +504,7 @@ template<> struct _ucl_memcpy<2,0> {
assert(0==1);
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, cl_command_queue &cq,
const cl_bool block,
@@ -522,7 +522,7 @@ template<> struct _ucl_memcpy<2,1> {
assert(0==1);
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, cl_command_queue &cq,
const cl_bool block,
@@ -540,7 +540,7 @@ template<> struct _ucl_memcpy<0,2> {
assert(0==1);
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, cl_command_queue &cq,
const cl_bool block,
@@ -558,7 +558,7 @@ template<> struct _ucl_memcpy<1,2> {
assert(0==1);
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, cl_command_queue &cq,
const cl_bool block,
@@ -587,9 +587,9 @@ template <> struct _ucl_memcpy<1,0> {
dst.begin(),0,NULL,NULL));
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
- const size_t rows, cl_command_queue &cq,
+ const size_t rows, cl_command_queue &cq,
const cl_bool block,
size_t dst_offset, size_t src_offset) {
if (src.cbegin()==dst.cbegin()) {
@@ -602,20 +602,20 @@ template <> struct _ucl_memcpy<1,0> {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 2NS\n";
#endif
- if (spitch==dpitch && dst.cols()==src.cols() &&
+ if (spitch==dpitch && dst.cols()==src.cols() &&
src.cols()==cols/src.element_size())
CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,
spitch*rows,
(char *)dst.begin()+dst_offset,0,NULL,
NULL));
else
- for (size_t i=0; i struct _ucl_memcpy<0,1> {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 3S\n";
#endif
- return;
+ return;
}
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 3NS\n";
@@ -639,9 +639,9 @@ template <> struct _ucl_memcpy<0,1> {
src.begin(),0,NULL,NULL));
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
- const size_t rows, cl_command_queue &cq,
+ const size_t rows, cl_command_queue &cq,
const cl_bool block,
size_t dst_offset, size_t src_offset) {
if (src.cbegin()==dst.cbegin()) {
@@ -649,12 +649,12 @@ template <> struct _ucl_memcpy<0,1> {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 4S\n";
#endif
- return;
+ return;
}
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 4NS\n";
#endif
- if (spitch==dpitch && dst.cols()==src.cols() &&
+ if (spitch==dpitch && dst.cols()==src.cols() &&
src.cols()==cols/src.element_size())
CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,
spitch*rows,
@@ -667,7 +667,7 @@ template <> struct _ucl_memcpy<0,1> {
NULL));
src_offset+=spitch;
dst_offset+=dpitch;
- }
+ }
}
};
@@ -687,33 +687,33 @@ template struct _ucl_memcpy {
#ifdef UCL_DBG_MEM_TRACE
else std::cerr << "UCL_COPY 6S\n";
#endif
-
+
if (block==CL_TRUE) ucl_sync(cq);
}
template
- static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
+ static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, cl_command_queue &cq,
const cl_bool block,
size_t dst_offset, size_t src_offset) {
- if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
+ if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 7NS\n";
#endif
- if (spitch==dpitch && dst.cols()==src.cols() &&
+ if (spitch==dpitch && dst.cols()==src.cols() &&
src.cols()==cols/src.element_size())
CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset,
dst_offset,spitch*rows,0,NULL,NULL));
-
+
else
- for (size_t i=0; i
-inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
- const size_t spitch, const size_t cols,
+inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
+ const size_t spitch, const size_t cols,
const size_t rows) {
_ucl_memcpy::mc(dst,dpitch,src,spitch,cols,
rows,dst.cq(),CL_TRUE,
@@ -745,15 +745,15 @@ inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
}
template
-inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
- const size_t spitch, const size_t cols,
+inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
+ const size_t spitch, const size_t cols,
const size_t rows,cl_command_queue &cq) {
_ucl_memcpy::mc(dst,dpitch,src,spitch,cols,
rows,cq,CL_FALSE,
dst.byteoff(),src.byteoff());
}
-} // namespace ucl_cudart
+} // namespace ucl_cudart
#endif
diff --git a/lib/gpu/geryon/ocl_texture.h b/lib/gpu/geryon/ocl_texture.h
index 8e72c51730..0e60045f55 100644
--- a/lib/gpu/geryon/ocl_texture.h
+++ b/lib/gpu/geryon/ocl_texture.h
@@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
- certain rights in this software. This software is distributed under
+ certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@@ -28,7 +28,7 @@
#include "ocl_mat.h"
namespace ucl_opencl {
-
+
/// Class storing a texture reference
class UCL_Texture {
public:
@@ -46,9 +46,9 @@ class UCL_Texture {
/// Unbind the texture reference from the memory allocation
inline void unbind() { }
- /// Make a texture reference available to kernel
+ /// Make a texture reference available to kernel
inline void allow(UCL_Kernel &kernel) { }
-
+
private:
friend class UCL_Kernel;
};
diff --git a/lib/gpu/geryon/ocl_timer.h b/lib/gpu/geryon/ocl_timer.h
index 627d19d66f..66b79dcab1 100644
--- a/lib/gpu/geryon/ocl_timer.h
+++ b/lib/gpu/geryon/ocl_timer.h
@@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
- certain rights in this software. This software is distributed under
+ certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@@ -67,33 +67,33 @@ class UCL_Timer {
clRetainCommandQueue(_cq);
_initialized=true;
}
-
+
/// Start timing on default command queue
inline void start() { UCL_OCL_MARKER(_cq,&start_event); }
-
+
/// Stop timing on default command queue
inline void stop() { UCL_OCL_MARKER(_cq,&stop_event); }
-
+
/// Block until the start event has been reached on device
- inline void sync_start()
+ inline void sync_start()
{ CL_SAFE_CALL(clWaitForEvents(1,&start_event)); }
/// Block until the stop event has been reached on device
- inline void sync_stop()
+ inline void sync_stop()
{ CL_SAFE_CALL(clWaitForEvents(1,&stop_event)); }
/// Set the time elapsed to zero (not the total_time)
- inline void zero()
- { UCL_OCL_MARKER(_cq,&start_event); UCL_OCL_MARKER(_cq,&stop_event); }
-
+ inline void zero()
+ { UCL_OCL_MARKER(_cq,&start_event); UCL_OCL_MARKER(_cq,&stop_event); }
+
/// Set the total time to zero
inline void zero_total() { _total_time=0.0; }
-
+
/// Add time from previous start and stop to total
/** Forces synchronization **/
- inline double add_to_total()
+ inline double add_to_total()
{ double t=time(); _total_time+=t; return t/1000.0; }
-
+
/// Add a user specified time to the total (ms)
inline void add_time_to_total(const double t) { _total_time+=t; }
@@ -107,12 +107,12 @@ class UCL_Timer {
CL_SAFE_CALL(clGetEventProfilingInfo(start_event,
CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &tstart, NULL));
- return (tend-tstart)*t_factor;
+ return (tend-tstart)*t_factor;
}
-
+
/// Return the time (s) of last start to stop - Forces synchronization
inline double seconds() { return time()/1000.0; }
-
+
/// Return the total time in ms
inline double total_time() { return _total_time; }
diff --git a/lib/gpu/geryon/ucl_arg_kludge.h b/lib/gpu/geryon/ucl_arg_kludge.h
index 646aa4d68f..eea913863d 100644
--- a/lib/gpu/geryon/ucl_arg_kludge.h
+++ b/lib/gpu/geryon/ucl_arg_kludge.h
@@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
- certain rights in this software. This software is distributed under
+ certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@@ -38,47 +38,47 @@
template
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) {
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
}
template
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6) {
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6);
}
template
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7) {
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7);
}
template
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8) {
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8);
}
template
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9) {
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
}
template
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) {
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
}
template
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
run();
}
@@ -434,8 +434,8 @@
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6);
run();
}
@@ -444,8 +444,8 @@
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7);
run();
}
@@ -454,8 +454,8 @@
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8);
run();
}
@@ -464,8 +464,8 @@
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
run();
}
@@ -474,8 +474,8 @@
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
run();
}
@@ -486,9 +486,9 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
- add_arg(a11);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+ add_arg(a11);
run();
}
@@ -499,8 +499,8 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12);
run();
}
@@ -512,9 +512,9 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
- add_arg(a11); add_arg(a12); add_arg(a13);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+ add_arg(a11); add_arg(a12); add_arg(a13);
run();
}
@@ -525,9 +525,9 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
- add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+ add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
run();
}
@@ -538,9 +538,9 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
- add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+ add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
run();
}
@@ -553,10 +553,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
- add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
- add_arg(a16);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+ add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+ add_arg(a16);
run();
}
@@ -569,10 +569,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
- add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
- add_arg(a16); add_arg(a17);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+ add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+ add_arg(a16); add_arg(a17);
run();
}
@@ -585,10 +585,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
- add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
- add_arg(a16); add_arg(a17); add_arg(a18);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+ add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+ add_arg(a16); add_arg(a17); add_arg(a18);
run();
}
@@ -601,10 +601,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
- add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
- add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+ add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+ add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
run();
}
@@ -617,10 +617,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
- add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
- add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+ add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+ add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
run();
}
@@ -635,10 +635,10 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
- add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
- add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+ add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+ add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21);
run();
}
@@ -654,10 +654,10 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
- add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
- add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+ add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+ add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22);
run();
}
@@ -673,10 +673,10 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
- add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
- add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+ add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+ add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23);
run();
}
@@ -692,10 +692,10 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
- add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
- add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+ add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+ add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
run();
}
@@ -711,11 +711,11 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
- add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
- add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
- add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+ add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+ add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+ add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
run();
}
@@ -732,11 +732,11 @@
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
- add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
- add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
- add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+ add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+ add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+ add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26);
run();
}
@@ -754,11 +754,11 @@
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
- add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
- add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
- add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+ add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+ add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+ add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27);
run();
}
@@ -776,12 +776,12 @@
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
- add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
- add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
- add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
- add_arg(a26); add_arg(a27); add_arg(a28);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+ add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+ add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+ add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+ add_arg(a26); add_arg(a27); add_arg(a28);
run();
}
@@ -798,11 +798,11 @@
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
- add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
- add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
- add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+ add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+ add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+ add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
run();
}
@@ -820,11 +820,11 @@
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
clear_args();
- add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
- add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
- add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
- add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
- add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
- add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
+ add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+ add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+ add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+ add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+ add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+ add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
run();
}
diff --git a/lib/gpu/geryon/ucl_basemat.h b/lib/gpu/geryon/ucl_basemat.h
index 4edf83e057..1ded9f043b 100644
--- a/lib/gpu/geryon/ucl_basemat.h
+++ b/lib/gpu/geryon/ucl_basemat.h
@@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
- certain rights in this software. This software is distributed under
+ certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@@ -52,10 +52,10 @@
/// Base class for vector/matrix containers
/** All containers are associated with a default command queue.
* For CUDA, this is the default stream.
- *
- * The default queue is used for asynchonrous operations on the container
+ *
+ * The default queue is used for asynchonrous operations on the container
* that do not specify a queue. For OpenCL, this queue is also used in
- * calls for reserving and copying memory **/
+ * calls for reserving and copying memory **/
class UCL_BaseMat {
public:
UCL_BaseMat() : _cq(0), _kind(UCL_VIEW) { }
@@ -68,8 +68,8 @@ class UCL_BaseMat {
inline void sync() { ucl_sync(_cq); }
/// Return the type/permissions of memory allocation
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, UCL_NOT_PINNED
- * or UCL_VIEW **/
- inline enum UCL_MEMOPT kind() const { return _kind; }
+ * or UCL_VIEW **/
+ inline enum UCL_MEMOPT kind() const { return _kind; }
inline bool shared_mem_device() {
#ifdef _OCL_MAT
@@ -79,12 +79,12 @@ class UCL_BaseMat {
cl_device_type device_type;
CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE,
sizeof(device_type),&device_type,NULL));
- return _shared_mem_device(device_type);
+ return _shared_mem_device(device_type);
#else
return false;
#endif
}
-
+
protected:
command_queue _cq;
enum UCL_MEMOPT _kind;
diff --git a/lib/gpu/geryon/ucl_copy.h b/lib/gpu/geryon/ucl_copy.h
index c6bff97a8c..c906a14f30 100644
--- a/lib/gpu/geryon/ucl_copy.h
+++ b/lib/gpu/geryon/ucl_copy.h
@@ -17,33 +17,33 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
- certain rights in this software. This software is distributed under
+ certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
-
+
/***************************************************************************
The ucl_copy and ucl_cast_copy routines provide a general prototype for
copying data between host and device memory (including texture memory)
for the matrix and vector types in nvc_memory.
-
- For host/host and host/device transfers, typecasting is performed
- automatically as necessary.
-
- The routines are written so that all branches can be removed by the
+
+ For host/host and host/device transfers, typecasting is performed
+ automatically as necessary.
+
+ The routines are written so that all branches can be removed by the
compiler during template instantiation.
-
+
The routines currently assume row-major ordering for all types.
-
+
For asynchronous copy in the default command queue, async is boolean true;
For asynchronous copy in a specified command queue, async is command queue
Otherwise, set async to boolean false;
-
+
When performing frequent data copies that require casting, it is more
efficient to allocate a casting buffer once and then pass that buffer
to the copy routine. This can be accomplished with the ucl_cast_copy
routines.
-
- Examples
+
+ Examples
(x's represent alignment padding - to maintain alignment)
(o's represent a larger matrix in memory)
(vectors represented as single row)
@@ -51,18 +51,18 @@
dst src command
----------------------------------------------------------------
0 1 2 3 4 <-- 0 1 2 3 4 ucl_copy(dst,src,async)
-
+
0 1 2 3 <-- 0 1 2 3 4 ucl_copy(dst,src,4,async)
-
+
0 1 2 <-- 0 1 2 3 4 5 ucl_copy(dst,src,async)
- 3 4 5
-
+ 3 4 5
+
0 1 2 3 4 5 <-- 0 1 2 ucl_copy(dst,src,async)
3 4 5
-
+
0 1 2 <-- 0 1 2 ucl_copy(dst,src,async)
3 4 5 3 4 5
-
+
0 1 2 <-- 0 1 2 ucl_copy(dst,src,6,async)
3 4 5 3 4 5
5 6 7
@@ -70,33 +70,33 @@
0 1 2 <-- 0 1 2 3 ucl_copy(dst,src,2,3,async)
4 5 6 4 5 6 7
8 9 10 11
-
+
0 1 2 x x <-- 0 1 2 ucl_copy(dst,src,async)
3 4 5 x x 3 4 5
-
+
0 1 2 <-- 0 1 2 x x ucl_copy(dst,src,async)
3 4 5 3 4 5 x x
-
+
0 1 2 o o <-- 0 1 2 ucl_copy(dst,src,2,3,async)
3 4 5 o o 3 4 5
- o o o o o
+ o o o o o
0 1 2 o o <-- 0 1 2 3 4 5 ucl_copy(dst,src,2,3,async)
- 3 4 5 o o
- o o o o o
+ 3 4 5 o o
+ o o o o o
0 1 o o o <-- 0 1 2 3 4 5 ucl_copy(dst,src,2,2,async)
- 2 3 o o o
- o o o o o
+ 2 3 o o o
+ o o o o o
0 1 2 o o <-- 0 1 2 3 4 ucl_copy(dst,src,2,3,async)
5 6 7 o o 5 6 7 8 9
o o o o o 10 11 12 13 14
-
+
0 1 2 5 6 7 <-- 0 1 2 3 4 ucl_copy(dst,src,2,3,async)
5 6 7 8 9
10 11 12 13 14
-
+
***************************************************************************/
// Only allow this file to be included by nvc_memory.h and ocl_memory.h
@@ -124,7 +124,7 @@ inline void _check_ucl_copy_perm(mat1 &dst, mat2 &src) {
assert(0==1);
}
}
-}
+}
// --------------------------------------------------------------------------
// - HOST-HOST COPY ROUTINES
@@ -182,7 +182,7 @@ template <> struct _host_host_copy<1,1> {
return;
}
#endif
-
+
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 8NS\n";
#endif
@@ -212,7 +212,7 @@ template struct _host_host_copy {
static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols) {
assert(0==1);
- }
+ }
};
// --------------------------------------------------------------------------
@@ -242,20 +242,20 @@ template struct _ucl_cast_copy<1,host_type2> {
template
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) {
- // Asynchronous currently pointless here
+ // Asynchronous currently pointless here
#ifdef UCL_DEBUG
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
- if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
- if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
- #endif
+ if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
+ if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
+ #endif
if (mat1::VECTOR) {
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
src.row_bytes(),cols*sizeof(typename mat2::data_type),rows);
for (size_t i=0; i(cast_buffer[i]);
} else {
- if (mat2::VECTOR)
+ if (mat2::VECTOR)
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
cols*sizeof(typename mat2::data_type),
cols*sizeof(typename mat2::data_type),rows);
@@ -276,23 +276,23 @@ template struct _ucl_cast_copy<1,host_type2> {
}
template
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
- const size_t cols, mat3 &cast_buffer,
+ const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
- // Asynchronous currently pointless here
+ // Asynchronous currently pointless here
#ifdef UCL_DEBUG
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
- if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
- if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
- #endif
+ if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
+ if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
+ #endif
if (mat1::VECTOR) {
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
src.row_bytes(),cols*sizeof(typename mat2::data_type),rows,cq);
- cast_buffer.sync();
+ cast_buffer.sync();
for (size_t i=0; i(cast_buffer[i]);
} else {
- if (mat2::VECTOR)
+ if (mat2::VECTOR)
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
cols*sizeof(typename mat2::data_type),
cols*sizeof(typename mat2::data_type),rows,cq);
@@ -338,7 +338,7 @@ template struct _ucl_cast_copy {
assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
- if (mat3::VECTOR==0) {
+ if (mat3::VECTOR==0) {
assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
assert(dst.rows()>=rows && dst.cols()>=cols);
}
@@ -404,9 +404,9 @@ template struct _ucl_cast_copy {
#ifdef UCL_DEBUG
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
- if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
- if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
- if (mat3::VECTOR==0) {
+ if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
+ if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
+ if (mat3::VECTOR==0) {
assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
assert(dst.rows()>=rows && dst.cols()>=cols);
}
@@ -472,23 +472,23 @@ template <> struct _ucl_cast_copy<1,1> {
template
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer, command_queue &cq) {
- assert(0==1);
+ assert(0==1);
}
template
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer) {
- assert(0==1);
+ assert(0==1);
}
template
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) {
- assert(0==1);
+ assert(0==1);
}
template
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
- assert(0==1);
+ assert(0==1);
}
};
@@ -497,23 +497,23 @@ template <> struct _ucl_cast_copy<0,0> {
template
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer, command_queue &cq) {
- assert(0==1);
+ assert(0==1);
}
template
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer) {
- assert(0==1);
+ assert(0==1);
}
template
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) {
- assert(0==1);
+ assert(0==1);
}
template
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
- assert(0==1);
+ assert(0==1);
}
};
@@ -525,7 +525,7 @@ template <> struct _ucl_cast_copy<0,0> {
/** \param numel Number of elements (not bytes) to copy
* \param cast_buffer Buffer on host with enough storage for casting
* - If the data types for the two matrices are same, no cast performed
- * - Padding for 2D matrices is not considered in this routine.
+ * - Padding for 2D matrices is not considered in this routine.
* - Currently does not handle textures **/
template
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
@@ -551,7 +551,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
* \param async Perform non-blocking copy on default stream
* \param cast_buffer Buffer on host with enough storage for casting
* - If the data types for the two matrices are same, no cast performed
- * - Padding for 2D matrices is not considered in this routine.
+ * - Padding for 2D matrices is not considered in this routine.
* - Currently does not handle textures **/
template
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
@@ -580,7 +580,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
* buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can
* be passed to an alternative copy routine.
- * - Padding for 2D matrices is not considered in this routine.
+ * - Padding for 2D matrices is not considered in this routine.
* - Currently does not handle textures **/
template
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
@@ -593,7 +593,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
#endif
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy::hhc(dst,src,numel);
- else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
+ else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) {
UCL_H_Vec cast_buffer;
@@ -606,8 +606,8 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
_ucl_cast_copy::cc(dst,src,numel,
cast_buffer,cq);
}
- } else
- ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type),cq);
+ } else
+ ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type),cq);
}
/// Copy matrix/vector (memory already allocated)
@@ -619,7 +619,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
* buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can
* be passed to an alternative copy routine.
- * - Padding for 2D matrices is not considered in this routine.
+ * - Padding for 2D matrices is not considered in this routine.
* - The default stream is used for asynchronous copy
* - Currently does not handle textures **/
template
@@ -648,7 +648,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
cast_buffer);
}
} else
- ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type));
+ ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type));
}
// --------------------------------------------------------------------------
@@ -659,11 +659,11 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
/** \param async Perform non-blocking copy on default stream
* \param cast_buffer Buffer on host with enough storage for casting
* - If src is a vector, routine assumes row-major rows by cols copy
- * - If src is a matrix, routine will copy upper left tile of matrix
+ * - If src is a matrix, routine will copy upper left tile of matrix
* - If dst is a vector, routine assumes row-major rows by cols copy
- * - If dst is a matrix, routine will copy into left tile of matrix
+ * - If dst is a matrix, routine will copy into left tile of matrix
* - If the data types for the two matrices are same, no cast performed
- * - Padding for 2D matrices is not considered in this routine.
+ * - Padding for 2D matrices is not considered in this routine.
* - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/
template
@@ -686,16 +686,16 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
/// Asynchronous copy subset matrix rows,cols with cast (Device/Host transfer)
/** \param cast_buffer Buffer on host with enough storage for casting
* - If src is a vector, routine assumes row-major rows by cols copy
- * - If src is a matrix, routine will copy upper left tile of matrix
+ * - If src is a matrix, routine will copy upper left tile of matrix
* - If dst is a vector, routine assumes row-major rows by cols copy
- * - If dst is a matrix, routine will copy into upper left tile of matrix
+ * - If dst is a matrix, routine will copy into upper left tile of matrix
* - If the data types for the two matrices are same, no cast performed
- * - Padding for 2D matrices is not considered in this routine.
+ * - Padding for 2D matrices is not considered in this routine.
* - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/
template
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
- const size_t cols, mat3 &cast_buffer,
+ const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,rows,cols,cq);
@@ -710,11 +710,11 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
/// Asynchronous copy of subset matrix rows,cols (memory already allocated)
/** - If src is a vector, routine assumes row-major rows by cols copy
- * - If src is a matrix, routine will copy upper left tile of matrix
+ * - If src is a matrix, routine will copy upper left tile of matrix
* - If dst is a vector, routine assumes row-major rows by cols copy
- * - If dst is a matrix, routine will copy into left tile of matrix
+ * - If dst is a matrix, routine will copy into left tile of matrix
* - If the data types of the two matrices are not the same,
- * casting will be performed automatically as long as the copy is
+ * casting will be performed automatically as long as the copy is
* not device to device. For host/device transfers, a temporary
* buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can
@@ -730,7 +730,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
#endif
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy::hhc(dst,src,rows,cols);
- else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
+ else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) {
UCL_H_Vec cast_buffer;
@@ -773,9 +773,9 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
/// Copy subset of matrix rows,cols (memory already allocated)
/** \param async Perform non-blocking copy (ignored for host to host copy)
* - If src is a vector, routine assumes row-major rows by cols copy
- * - If src is a matrix, routine will copy upper left tile of matrix
+ * - If src is a matrix, routine will copy upper left tile of matrix
* - If dst is a vector, routine assumes row-major rows by cols copy
- * - If dst is a matrix, routine will copy into left tile of matrix
+ * - If dst is a matrix, routine will copy into left tile of matrix
* - If the data types of the two matrices are not the same,
* casting will be performed automatically as long as the copy is
* not device to device. For host/device transfers, a temporary
@@ -796,7 +796,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
ucl_copy(dst,src,rows,cols,dst.cq());
else if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy::hhc(dst,src,rows,cols);
- else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
+ else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) {
UCL_H_Vec cast_buffer;
@@ -846,7 +846,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
* \param cast_buffer Buffer on host with enough storage for casting
* - If the data types for the two matrices are same, no cast performed
* - The number of bytes copied is determined by entire src data
- * - Padding for 2D matrices is not considered in this routine.
+ * - Padding for 2D matrices is not considered in this routine.
* - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/
template
@@ -866,7 +866,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src,
/** \param cast_buffer Buffer on host with enough storage for casting
* - If the data types for the two matrices are same, no cast performed
* - The number of bytes copied is determined by entire src data
- * - Padding for 2D matrices is not considered in this routine.
+ * - Padding for 2D matrices is not considered in this routine.
* - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/
template
@@ -885,7 +885,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src,
/// Asynchronous copy of matrix/vector (memory already allocated)
/** - The number of bytes copied is determined by entire src data
* - If the data types of the two matrices are not the same,
- * casting will be performed automatically as long as the copy is
+ * casting will be performed automatically as long as the copy is
* not device to device. For host/device transfers, a temporary
* buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can
@@ -924,7 +924,7 @@ template
inline void ucl_copy(mat1 &dst, const mat2 &src, const bool async) {
if (async)
ucl_copy(dst,src,dst.cq());
- else if (dst.row_bytes()==src.row_bytes() &&
+ else if (dst.row_bytes()==src.row_bytes() &&
src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW &&
(int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,src.row_size()*src.rows(),async);
diff --git a/lib/gpu/geryon/ucl_d_mat.h b/lib/gpu/geryon/ucl_d_mat.h
index f1aaa27903..da55cc6ebc 100644
--- a/lib/gpu/geryon/ucl_d_mat.h
+++ b/lib/gpu/geryon/ucl_d_mat.h
@@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
- certain rights in this software. This software is distributed under
+ certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@@ -37,23 +37,23 @@ class UCL_D_Mat : public UCL_BaseMat {
ROW_MAJOR = 1,
VECTOR = 0
};
- typedef numtyp data_type;
+ typedef numtyp data_type;
UCL_D_Mat() : _cols(0) {}
~UCL_D_Mat() { _device_free(*this); }
-
+
/// Construct with specified rows and cols
/** \sa alloc() **/
UCL_D_Mat(const size_t rows, const size_t cols, UCL_Device &device,
- const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
+ const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
_cols(0) { alloc(rows,cols,device,kind); }
-
+
/// Row major matrix on device
/** The kind parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
- * \param cq Default command queue for operations copied from another mat
+ * \param cq Default command queue for operations copied from another mat
* \note - Coalesced access using adjacent cols on same row
* UCL_D_Mat(row,col) given by array[row*row_size()+col]
* \return UCL_SUCCESS if the memory allocation is successful **/
@@ -65,7 +65,7 @@ class UCL_D_Mat : public UCL_BaseMat {
int err=_device_alloc(*this,cq,rows,cols,_pitch,kind);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
- std::cerr << "UCL Error: Could not allocate "
+ std::cerr << "UCL Error: Could not allocate "
<< rows*cols*sizeof(numtyp) << " bytes on device.\n";
UCL_GERYON_EXIT;
#endif
@@ -82,9 +82,9 @@ class UCL_D_Mat : public UCL_BaseMat {
#ifdef _OCL_MAT
_offset=0;
#endif
- return err;
+ return err;
}
-
+
/// Row major matrix on device
/** The kind parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
@@ -118,15 +118,15 @@ class UCL_D_Mat : public UCL_BaseMat {
#ifdef _OCL_MAT
_offset=0;
#endif
- return err;
+ return err;
}
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * \param stride Number of _elements_ between the start of each row **/
+ * allocating container when using CUDA APIs
+ * \param stride Number of _elements_ between the start of each row **/
template
inline void view(ucl_type &input, const size_t rows, const size_t cols,
const size_t stride) {
@@ -145,7 +145,7 @@ class UCL_D_Mat : public UCL_BaseMat {
#else
_device_view(&_array,input.begin());
#endif
-
+
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
@@ -157,39 +157,39 @@ class UCL_D_Mat : public UCL_BaseMat {
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs **/
template
- inline void view(ucl_type &input, const size_t rows, const size_t cols)
+ inline void view(ucl_type &input, const size_t rows, const size_t cols)
{ view(input,rows,cols,input.row_size()); }
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
+ * allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding)
* will be used for view **/
template
inline void view(ucl_type &input, const size_t cols)
{ view(input,1,cols); }
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
+ * allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding)
* will be used for view **/
template
- inline void view(ucl_type &input)
+ inline void view(ucl_type &input)
{ view(input,input.rows(),input.cols()); }
-
+
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * \param stride Number of _elements_ between the start of each row **/
+ * allocating container when using CUDA APIs
+ * \param stride Number of _elements_ between the start of each row **/
template
inline void view(ptr_type input, const size_t rows, const size_t cols,
- const size_t stride, UCL_Device &dev) {
+ const size_t stride, UCL_Device &dev) {
clear();
_kind=UCL_VIEW;
_cols=cols;
@@ -215,7 +215,7 @@ class UCL_D_Mat : public UCL_BaseMat {
template
inline void view(ptr_type input, const size_t rows, const size_t cols,
UCL_Device &dev) { view(input,rows,cols,cols,dev); }
-
+
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
@@ -223,13 +223,13 @@ class UCL_D_Mat : public UCL_BaseMat {
template
inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
{ view(input,1,cols,dev); }
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * \param stride Number of _elements_ between the start of each row **/
+ * allocating container when using CUDA APIs
+ * \param stride Number of _elements_ between the start of each row **/
template
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols, const size_t stride) {
@@ -248,7 +248,7 @@ class UCL_D_Mat : public UCL_BaseMat {
#else
_device_view(&_array,input.begin(),offset,sizeof(numtyp));
#endif
-
+
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
@@ -261,45 +261,45 @@ class UCL_D_Mat : public UCL_BaseMat {
* allocating container when using CUDA APIs **/
template
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
- const size_t cols)
+ const size_t cols)
{ view_offset(offset,input,rows,cols,input.row_size()); }
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
+ * allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding)
* will be used for view **/
template
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
{ view_offset(offset,input,1,cols); }
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
+ * allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding)
* will be used for view **/
template
- inline void view_offset(const size_t offset, ucl_type &input) {
- if (input.rows()==1)
+ inline void view_offset(const size_t offset, ucl_type &input) {
+ if (input.rows()==1)
view_offset(offset,input,1,input.cols()-offset);
- else
+ else
view_offset(offset,input,input.rows()-offset/input.row_size(),
input.cols());
}
-
+
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * \param stride Number of _elements_ between the start of each row **/
+ * allocating container when using CUDA APIs
+ * \param stride Number of _elements_ between the start of each row **/
template
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
const size_t cols,const size_t stride,
- UCL_Device &dev) {
+ UCL_Device &dev) {
clear();
_kind=UCL_VIEW;
_cols=cols;
@@ -307,7 +307,7 @@ class UCL_D_Mat : public UCL_BaseMat {
_pitch=stride*sizeof(numtyp);
_row_size=stride;
this->_cq=dev.cq();
-
+
#ifdef _OCL_MAT
_array=input;
_offset=offset;
@@ -320,7 +320,7 @@ class UCL_D_Mat : public UCL_BaseMat {
_array=input+offset;
#endif
#endif
-
+
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
@@ -332,20 +332,20 @@ class UCL_D_Mat : public UCL_BaseMat {
* allocating container when using CUDA APIs **/
template
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
- const size_t cols, UCL_Device &dev)
+ const size_t cols, UCL_Device &dev)
{ view_offset(offset,input,rows,cols,cols,dev); }
-
+
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs **/
template
- inline void view_offset(const size_t offset, ptr_type input,
+ inline void view_offset(const size_t offset, ptr_type input,
const size_t cols, UCL_Device &dev)
{ view_offset(offset,input,1,cols,dev); }
-
+
/// Free memory and set size to 0
- inline void clear()
+ inline void clear()
{ _device_free(*this); _cols=0; _kind=UCL_VIEW; }
/// Resize the allocation to contain cols elements
@@ -356,7 +356,7 @@ class UCL_D_Mat : public UCL_BaseMat {
int err=_device_resize(*this,rows,cols,_pitch);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
- std::cerr << "UCL Error: Could not allocate "
+ std::cerr << "UCL Error: Could not allocate "
<< rows*cols*sizeof(numtyp) << " bytes on device.\n";
UCL_GERYON_EXIT;
#endif
@@ -372,13 +372,13 @@ class UCL_D_Mat : public UCL_BaseMat {
#ifdef _OCL_MAT
_offset=0;
#endif
- return err;
+ return err;
}
-
+
/// Resize (only if bigger) the allocation to contain rows x cols elements
/** \note Cannot be used on views **/
inline int resize_ib(const int rows, const int cols)
- { if (cols>_cols || rows>_rows) return resize(rows,cols);
+ { if (cols>_cols || rows>_rows) return resize(rows,cols);
else return UCL_SUCCESS; }
/// Set each element to zero asynchronously in the default command_queue
@@ -386,10 +386,10 @@ class UCL_D_Mat : public UCL_BaseMat {
/// Set first n elements to zero asynchronously in the default command_queue
inline void zero(const int n) { zero(n,_cq); }
/// Set each element to zero asynchronously
- inline void zero(command_queue &cq)
+ inline void zero(command_queue &cq)
{ _device_zero(*this,row_bytes()*_rows,cq); }
/// Set first n elements to zero asynchronously
- inline void zero(const int n, command_queue &cq)
+ inline void zero(const int n, command_queue &cq)
{ _device_zero(*this,n*sizeof(numtyp),cq); }
@@ -445,7 +445,7 @@ class UCL_D_Mat : public UCL_BaseMat {
inline size_t row_bytes() const { return _pitch; }
/// Get the size in bytes of 1 element
inline int element_size() const { return sizeof(numtyp); }
-
+
#ifdef _OCL_MAT
/// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
@@ -459,7 +459,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/// Return the offset (in bytes) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t byteoff() const { return offset()*sizeof(numtyp); }
-
+
private:
size_t _pitch, _row_size, _rows, _cols;
diff --git a/lib/gpu/geryon/ucl_d_vec.h b/lib/gpu/geryon/ucl_d_vec.h
index fc1977f4b5..99a6c939c6 100644
--- a/lib/gpu/geryon/ucl_d_vec.h
+++ b/lib/gpu/geryon/ucl_d_vec.h
@@ -17,14 +17,14 @@
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
- certain rights in this software. This software is distributed under
+ certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW
-/// Row vector on device
+/// Row vector on device
template
class UCL_D_Vec : public UCL_BaseMat {
public:
@@ -37,7 +37,7 @@ class UCL_D_Vec : public UCL_BaseMat {
ROW_MAJOR = 1,
VECTOR = 1
};
- typedef numtyp data_type;
+ typedef numtyp data_type;
UCL_D_Vec() : _cols(0) {}
~UCL_D_Vec() { _device_free(*this); }
@@ -45,7 +45,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/// Construct with n columns
/** \sa alloc() **/
UCL_D_Vec(const size_t n, UCL_Device &device,
- const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
+ const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
_cols(0) { alloc(n,device,kind); }
/// Set up host vector with 'cols' columns and reserve memory
@@ -58,7 +58,7 @@ class UCL_D_Vec : public UCL_BaseMat {
template
inline int alloc(const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
-
+
clear();
_row_bytes=cols*sizeof(numtyp);
@@ -82,8 +82,8 @@ class UCL_D_Vec : public UCL_BaseMat {
#ifdef _OCL_MAT
_offset=0;
#endif
- return err;
- }
+ return err;
+ }
/// Set up host vector with 'cols' columns and reserve memory
/** The kind parameter controls memory optimizations as follows:
@@ -116,7 +116,7 @@ class UCL_D_Vec : public UCL_BaseMat {
#ifdef _OCL_MAT
_offset=0;
#endif
- return err;
+ return err;
}
/// Do not allocate memory, instead use an existing allocation from Geryon
@@ -142,18 +142,18 @@ class UCL_D_Vec : public UCL_BaseMat {
#else
_device_view(&_array,input.begin());
#endif
-
+
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
}
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * \param stride Number of _elements_ between the start of each row **/
+ * allocating container when using CUDA APIs
+ * \param stride Number of _elements_ between the start of each row **/
template
inline void view(ucl_type &input, const size_t rows, const size_t cols,
const size_t stride) { view(input,rows,cols); }
@@ -162,24 +162,24 @@ class UCL_D_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
+ * allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding)
* will be used for view **/
template
inline void view(ucl_type &input, const size_t cols)
{ view(input,1,cols); }
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
+ * allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding)
* will be used for view **/
template
- inline void view(ucl_type &input)
+ inline void view(ucl_type &input)
{ view(input,input.rows()*input.row_size()); }
-
+
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
@@ -205,15 +205,15 @@ class UCL_D_Vec : public UCL_BaseMat {
CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
#endif
}
-
+
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * \param stride Number of _elements_ between the start of each row **/
+ * allocating container when using CUDA APIs
+ * \param stride Number of _elements_ between the start of each row **/
template
inline void view(ptr_type input, const size_t rows, const size_t cols,
- const size_t stride, UCL_Device &dev)
+ const size_t stride, UCL_Device &dev)
{ view(input,rows,cols,stride); }
/// Do not allocate memory, instead use an existing allocation
@@ -223,7 +223,7 @@ class UCL_D_Vec : public UCL_BaseMat {
template
inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
{ view(input,1,cols,dev); }
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
@@ -248,45 +248,45 @@ class UCL_D_Vec : public UCL_BaseMat {
#else
_device_view(&_array,input.begin(),offset,sizeof(numtyp));
#endif
-
+
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
}
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * \param stride Number of _elements_ between the start of each row **/
+ * allocating container when using CUDA APIs
+ * \param stride Number of _elements_ between the start of each row **/
template
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
- const size_t cols, const size_t stride)
+ const size_t cols, const size_t stride)
{ view_offset(offset,input,rows,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
+ * allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding)
* will be used for view **/
template
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
{ view_offset(offset,input,1,cols); }
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
+ * allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding)
* will be used for view **/
template
- inline void view_offset(const size_t offset, ucl_type &input)
+ inline void view_offset(const size_t offset, ucl_type &input)
{ view_offset(offset,input,input.rows()*input.row_size()-offset); }
-
+
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
@@ -302,7 +302,7 @@ class UCL_D_Vec : public UCL_BaseMat {
_cols=cols;
_row_bytes=_cols*sizeof(numtyp);
this->_cq=dev.cq();
-
+
#ifdef _OCL_MAT
_array=input;
_offset=offset;
@@ -315,20 +315,20 @@ class UCL_D_Vec : public UCL_BaseMat {
_array=input+offset;
#endif
#endif
-
+
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
}
-
+
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * \param stride Number of _elements_ between the start of each row **/
+ * allocating container when using CUDA APIs
+ * \param stride Number of _elements_ between the start of each row **/
template
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
- const size_t cols,const size_t stride,UCL_Device &dev)
+ const size_t cols,const size_t stride,UCL_Device &dev)
{ view_offset(offset,input,rows,cols,stride); }
/// Do not allocate memory, instead use an existing allocation
@@ -336,12 +336,12 @@ class UCL_D_Vec : public UCL_BaseMat {
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs **/
template
- inline void view_offset(const size_t offset, ptr_type input,
+ inline void view_offset(const size_t offset, ptr_type input,
const size_t cols, UCL_Device &dev)
{ view_offset(offset,input,1,cols,dev); }
-
+
/// Free memory and set size to 0
- inline void clear()
+ inline void clear()
{ _device_free(*this); _cols=0; _kind=UCL_VIEW; }
/// Resize the allocation to contain cols elements
@@ -369,9 +369,9 @@ class UCL_D_Vec : public UCL_BaseMat {
#ifdef _OCL_MAT
_offset=0;
#endif
- return err;
+ return err;
}
-
+
/// Resize (only if bigger) the allocation to contain cols elements
/** \note Cannot be used on views **/
inline int resize_ib(const int cols)
@@ -384,7 +384,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/// Set each element to zero asynchronously
inline void zero(command_queue &cq) { _device_zero(*this,row_bytes(),cq); }
/// Set first n elements to zero asynchronously
- inline void zero(const int n, command_queue &cq)
+ inline void zero(const int n, command_queue &cq)
{ _device_zero(*this,n*sizeof(numtyp),cq); }
#ifdef _UCL_DEVICE_PTR_MAT
@@ -402,7 +402,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/// For CUDA-RT, get device pointer to one past last element
inline numtyp * end() const { return _end; }
#endif
-
+
#ifdef _UCL_DEVICE_PTR_MAT
/// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object
@@ -427,10 +427,10 @@ class UCL_D_Vec : public UCL_BaseMat {
inline const numtyp ** cbegin() const { return &_array; }
/// For CUDA-RT, allocate row vector and bind texture
inline void safe_alloc(const size_t cols, UCL_Device &dev,
- textureReference *t)
+ textureReference *t)
{ alloc(cols,dev); assign_texture(t); bind(); }
/// For CUDA-RT, assign a texture to matrix
- inline void assign_texture(textureReference *t) { _tex_ptr=t; }
+ inline void assign_texture(textureReference *t) { _tex_ptr=t; }
/// For CUDA-RT, bind to texture
inline void bind() {
cuda_gb_get_channel(_channel);
@@ -456,7 +456,7 @@ class UCL_D_Vec : public UCL_BaseMat {
inline size_t row_bytes() const { return _row_bytes; }
/// Get the size in bytes of 1 element
inline int element_size() const { return sizeof(numtyp); }
-
+
#ifdef _OCL_MAT
/// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
@@ -473,7 +473,7 @@ class UCL_D_Vec : public UCL_BaseMat {
private:
size_t _row_bytes, _row_size, _rows, _cols;
-
+
#ifdef _UCL_DEVICE_PTR_MAT
device_ptr _array;
#else
diff --git a/lib/gpu/geryon/ucl_h_mat.h b/lib/gpu/geryon/ucl_h_mat.h
index dc6da3de0c..1df3c2de4b 100644
--- a/lib/gpu/geryon/ucl_h_mat.h
+++ b/lib/gpu/geryon/ucl_h_mat.h
@@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
- certain rights in this software. This software is distributed under
+ certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@@ -37,21 +37,21 @@ class UCL_H_Mat : public UCL_BaseMat {
ROW_MAJOR = 1,
VECTOR = 0
};
- typedef numtyp data_type;
-
+ typedef numtyp data_type;
+
UCL_H_Mat() : _cols(0) {
#ifdef _OCL_MAT
_carray=(cl_mem)(0);
#endif
}
~UCL_H_Mat() { _host_free(*this); }
-
+
/// Construct with specied number of rows and columns
/** \sa alloc() **/
- UCL_H_Mat(const size_t rows, const size_t cols, UCL_Device &device,
- const enum UCL_MEMOPT kind=UCL_READ_WRITE)
+ UCL_H_Mat(const size_t rows, const size_t cols, UCL_Device &device,
+ const enum UCL_MEMOPT kind=UCL_READ_WRITE)
{ _cols=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); }
-
+
/// Set up host matrix with specied # of rows/cols and reserve memory
/** The kind parameter controls memory pinning as follows:
* - UCL_READ_WRITE - Specify that you will read and write from host
@@ -74,7 +74,7 @@ class UCL_H_Mat : public UCL_BaseMat {
<< " bytes on host.\n";
_row_bytes=0;
UCL_GERYON_EXIT;
- #endif
+ #endif
_row_bytes=0;
return err;
}
@@ -84,7 +84,7 @@ class UCL_H_Mat : public UCL_BaseMat {
_kind=kind;
_end=_array+rows*cols;
return err;
- }
+ }
/// Set up host matrix with specied # of rows/cols and reserve memory
/** The kind parameter controls memory pinning as follows:
@@ -117,15 +117,15 @@ class UCL_H_Mat : public UCL_BaseMat {
_kind=kind;
_end=_array+rows*cols;
return err;
- }
-
+ }
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * - Viewing a device container on the host is not supported
- * \param stride Number of _elements_ between the start of each row **/
+ * allocating container when using CUDA APIs
+ * - Viewing a device container on the host is not supported
+ * \param stride Number of _elements_ between the start of each row **/
template
inline void view(ucl_type &input, const size_t rows, const size_t cols,
const size_t stride) {
@@ -149,45 +149,45 @@ class UCL_H_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * - Viewing a device container on the host is not supported **/
+ * allocating container when using CUDA APIs
+ * - Viewing a device container on the host is not supported **/
template
- inline void view(ucl_type &input, const size_t rows, const size_t cols)
+ inline void view(ucl_type &input, const size_t rows, const size_t cols)
{ view(input,rows,cols,input.row_size()); }
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
+ * allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding)
- * will be used for view
- * - Viewing a device container on the host is not supported **/
+ * will be used for view
+ * - Viewing a device container on the host is not supported **/
template
inline void view(ucl_type &input, const size_t cols)
{ view(input,1,cols); }
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
+ * allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding)
- * will be used for view when using CUDA APIs
- * - Viewing a device container on the host is not supported **/
+ * will be used for view when using CUDA APIs
+ * - Viewing a device container on the host is not supported **/
template
- inline void view(ucl_type &input)
+ inline void view(ucl_type &input)
{ view(input,input.rows(),input.cols()); }
-
+
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * - Viewing a device pointer on the host is not supported
- * \param stride Number of _elements_ between the start of each row **/
+ * allocating container when using CUDA APIs
+ * - Viewing a device pointer on the host is not supported
+ * \param stride Number of _elements_ between the start of each row **/
template
inline void view(ptr_type *input, const size_t rows, const size_t cols,
- const size_t stride, UCL_Device &dev) {
+ const size_t stride, UCL_Device &dev) {
assert(rows==1 || stride==cols);
clear();
_kind=UCL_VIEW;
@@ -197,40 +197,40 @@ class UCL_H_Mat : public UCL_BaseMat {
this->_cq=dev.cq();
_array=input;
_end=_array+_cols;
-
+
#ifdef _OCL_MAT
_host_view(*this,dev,_row_bytes*rows);
- #endif
+ #endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * - Viewing a device pointer on the host is not supported **/
+ * allocating container when using CUDA APIs
+ * - Viewing a device pointer on the host is not supported **/
template
inline void view(ptr_type *input, const size_t rows, const size_t cols,
UCL_Device &dev) { view(input,rows,cols,cols,dev); }
-
+
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * - Viewing a device pointer on the host is not supported **/
+ * allocating container when using CUDA APIs
+ * - Viewing a device pointer on the host is not supported **/
template
inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
{ view(input,1,cols,dev); }
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * - Viewing a device container on the host is not supported
- * \param stride Number of _elements_ between the start of each row **/
+ * allocating container when using CUDA APIs
+ * - Viewing a device container on the host is not supported
+ * \param stride Number of _elements_ between the start of each row **/
template
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
- const size_t cols, const size_t stride) {
+ const size_t cols, const size_t stride) {
assert(rows==1 || stride==cols);
clear();
_kind=UCL_VIEW;
@@ -244,81 +244,81 @@ class UCL_H_Mat : public UCL_BaseMat {
_host_view(*this,input,_row_bytes*_rows);
#endif
}
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * - Viewing a device container on the host is not supported **/
+ * allocating container when using CUDA APIs
+ * - Viewing a device container on the host is not supported **/
template
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
- const size_t cols)
+ const size_t cols)
{ view_offset(offset,input,rows,cols,input.row_size()); }
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
+ * allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding)
- * will be used for view
- * - Viewing a device container on the host is not supported **/
+ * will be used for view
+ * - Viewing a device container on the host is not supported **/
template
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
{ view_offset(offset,input,1,cols); }
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
+ * allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding)
- * will be used for view
- * - Viewing a device container on the host is not supported **/
+ * will be used for view
+ * - Viewing a device container on the host is not supported **/
template
- inline void view_offset(const size_t offset, ucl_type &input) {
- if (input.rows()==1)
+ inline void view_offset(const size_t offset, ucl_type &input) {
+ if (input.rows()==1)
view_offset(offset,input,1,input.cols()-offset);
- else
+ else
view_offset(offset,input,input.rows()-offset/input.row_size(),
input.cols());
}
-
+
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container
- * - Viewing a device pointer on the host is not supported **/
+ * allocating container
+ * - Viewing a device pointer on the host is not supported **/
template
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
const size_t cols, UCL_Device &dev)
{ view(input+offset,rows,cols,dev); }
-
+
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * - Viewing a device pointer on the host is not supported
- * \param stride Number of _elements_ between the start of each row **/
+ * allocating container when using CUDA APIs
+ * - Viewing a device pointer on the host is not supported
+ * \param stride Number of _elements_ between the start of each row **/
template
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
- const size_t cols,const size_t stride,UCL_Device &dev)
+ const size_t cols,const size_t stride,UCL_Device &dev)
{ view(input+offset,rows,cols,stride,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * - Viewing a device pointer on the host is not supported **/
+ * allocating container when using CUDA APIs
+ * - Viewing a device pointer on the host is not supported **/
template
- inline void view_offset(const size_t offset, ptr_type *input,
+ inline void view_offset(const size_t offset, ptr_type *input,
const size_t cols, UCL_Device &dev)
{ view(input+offset,1,cols,dev); }
-
+
/// Free memory and set size to 0
- inline void clear()
- { _host_free(*this); _cols=0; _kind=UCL_VIEW; }
+ inline void clear()
+ { _host_free(*this); _cols=0; _kind=UCL_VIEW; }
/// Resize the allocation to rows x cols elements
/** \note Cannot be used on views **/
@@ -333,7 +333,7 @@ class UCL_H_Mat : public UCL_BaseMat {
<< " bytes on host.\n";
_row_bytes=0;
UCL_GERYON_EXIT;
- #endif
+ #endif
_row_bytes=0;
return err;
}
@@ -347,7 +347,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/// Resize (only if bigger) the allocation to contain rows x cols elements
/** \note Cannot be used on views **/
inline int resize_ib(const int rows, const int cols)
- { if (cols>_cols || rows>_rows) return resize(rows,cols);
+ { if (cols>_cols || rows>_rows) return resize(rows,cols);
else return UCL_SUCCESS; }
/// Set each element to zero
@@ -376,21 +376,21 @@ class UCL_H_Mat : public UCL_BaseMat {
inline size_t row_bytes() const { return _row_bytes; }
/// Get the size in bytes of 1 element
inline int element_size() const { return sizeof(numtyp); }
-
+
/// Get element at index i
inline numtyp & operator[](const int i) { return _array[i]; }
/// Get element at index i
inline const numtyp & operator[](const int i) const { return _array[i]; }
- /// 2D access (row should always be 0)
- inline numtyp & operator()(const int row, const int col)
+ /// 2D access (row should always be 0)
+ inline numtyp & operator()(const int row, const int col)
{ return _array[row*_cols+col]; }
- /// 2D access (row should always be 0)
+ /// 2D access (row should always be 0)
inline const numtyp & operator()(const int row, const int col) const
{ return _array[row*_cols+col]; }
-
+
/// Returns pointer to memory pointer for allocation on host
inline numtyp ** host_ptr() { return &_array; }
-
+
/// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t offset() const { return 0; }
@@ -409,14 +409,14 @@ class UCL_H_Mat : public UCL_BaseMat {
/// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA)
inline const void ** cbegin() const { return (const void **)&_array; }
#endif
-
+
private:
numtyp *_array, *_end;
size_t _row_bytes, _rows, _cols;
#ifdef _OCL_MAT
device_ptr _carray;
- #endif
+ #endif
};
#endif
diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h
index 773facdea0..a9d64349d9 100644
--- a/lib/gpu/geryon/ucl_h_vec.h
+++ b/lib/gpu/geryon/ucl_h_vec.h
@@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
- certain rights in this software. This software is distributed under
+ certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@@ -37,21 +37,21 @@ class UCL_H_Vec : public UCL_BaseMat {
ROW_MAJOR = 1,
VECTOR = 1
};
- typedef numtyp data_type;
-
+ typedef numtyp data_type;
+
UCL_H_Vec() : _cols(0) {
#ifdef _OCL_MAT
_carray=(cl_mem)(0);
#endif
}
~UCL_H_Vec() { _host_free(*this); }
-
+
/// Construct with n columns
/** \sa alloc() **/
- UCL_H_Vec(const size_t n, UCL_Device &device,
- const enum UCL_MEMOPT kind=UCL_READ_WRITE)
+ UCL_H_Vec(const size_t n, UCL_Device &device,
+ const enum UCL_MEMOPT kind=UCL_READ_WRITE)
{ _cols=0; _kind=UCL_VIEW; alloc(n,device,kind); }
-
+
/// Set up host vector with 'cols' columns and reserve memory
/** The kind parameter controls memory pinning as follows:
* - UCL_READ_WRITE - Specify that you will read and write from host
@@ -84,7 +84,7 @@ class UCL_H_Vec : public UCL_BaseMat {
_kind=kind;
_end=_array+cols;
return err;
- }
+ }
/// Set up host vector with 'cols' columns and reserve memory
/** The kind parameter controls memory pinning as follows:
@@ -108,7 +108,7 @@ class UCL_H_Vec : public UCL_BaseMat {
<< " bytes on host.\n";
_row_bytes=0;
UCL_GERYON_EXIT;
- #endif
+ #endif
_row_bytes=0;
return err;
}
@@ -118,13 +118,13 @@ class UCL_H_Vec : public UCL_BaseMat {
_end=_array+cols;
return err;
}
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * - Viewing a device container on the host is not supported **/
+ * allocating container when using CUDA APIs
+ * - Viewing a device container on the host is not supported **/
template
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
#ifdef UCL_DEBUG
@@ -143,14 +143,14 @@ class UCL_H_Vec : public UCL_BaseMat {
CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
#endif
}
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
- * - Viewing a device container on the host is not supported
- * \param stride Number of _elements_ between the start of each row **/
+ * - Viewing a device container on the host is not supported
+ * \param stride Number of _elements_ between the start of each row **/
template
inline void view(ucl_type &input, const size_t rows, const size_t cols,
const size_t stride) { view(input,rows,cols); }
@@ -159,31 +159,31 @@ class UCL_H_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
+ * allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding)
- * will be used for view
- * - Viewing a device container on the host is not supported **/
+ * will be used for view
+ * - Viewing a device container on the host is not supported **/
template
inline void view(ucl_type &input, const size_t cols)
{ view(input,1,cols); }
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container
+ * allocating container
* - If a matrix is used a input, all elements (including padding)
- * will be used for view
- * - Viewing a device container on the host is not supported **/
+ * will be used for view
+ * - Viewing a device container on the host is not supported **/
template
- inline void view(ucl_type &input)
+ inline void view(ucl_type &input)
{ view(input,input.rows()*input.row_size()); }
-
+
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * - Viewing a device pointer on the host is not supported **/
+ * allocating container when using CUDA APIs
+ * - Viewing a device pointer on the host is not supported **/
template
inline void view(ptr_type *input, const size_t rows, const size_t cols,
UCL_Device &dev) {
@@ -197,38 +197,38 @@ class UCL_H_Vec : public UCL_BaseMat {
this->_cq=dev.cq();
_array=input;
_end=_array+_cols;
-
+
#ifdef _OCL_MAT
_host_view(*this,dev,_row_bytes);
- #endif
+ #endif
}
-
+
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
- * - Viewing a device pointer on the host is not supported
- * \param stride Number of _elements_ between the start of each row **/
+ * - Viewing a device pointer on the host is not supported
+ * \param stride Number of _elements_ between the start of each row **/
template
inline void view(ptr_type *input, const size_t rows, const size_t cols,
- const size_t stride, UCL_Device &dev)
+ const size_t stride, UCL_Device &dev)
{ view(input,rows,cols,stride); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
- * - Viewing a device pointer on the host is not supported **/
+ * - Viewing a device pointer on the host is not supported **/
template
inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
{ view(input,1,cols,dev); }
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
- * - Viewing a device container on the host is not supported **/
+ * - Viewing a device container on the host is not supported **/
template
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols) {
@@ -246,76 +246,76 @@ class UCL_H_Vec : public UCL_BaseMat {
_host_view(*this,input,_row_bytes);
#endif
}
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * - Viewing a device container on the host is not supported
- * \param stride Number of _elements_ between the start of each row **/
+ * allocating container when using CUDA APIs
+ * - Viewing a device container on the host is not supported
+ * \param stride Number of _elements_ between the start of each row **/
template
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
- const size_t cols, const size_t stride)
+ const size_t cols, const size_t stride)
{ view_offset(offset,input,rows,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
+ * allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding)
- * will be used for view
- * - Viewing a device container on the host is not supported **/
+ * will be used for view
+ * - Viewing a device container on the host is not supported **/
template
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
{ view_offset(offset,input,1,cols); }
-
+
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
+ * allocating container when using CUDA APIs
* - If a matrix is used a input, all elements (including padding)
- * will be used for view
- * - Viewing a device container on the host is not supported **/
+ * will be used for view
+ * - Viewing a device container on the host is not supported **/
template
- inline void view_offset(const size_t offset, ucl_type &input)
+ inline void view_offset(const size_t offset, ucl_type &input)
{ view_offset(offset,input,input.rows()*input.row_size()-offset); }
-
+
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * - Viewing a device pointer on the host is not supported **/
+ * allocating container when using CUDA APIs
+ * - Viewing a device pointer on the host is not supported **/
template
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
const size_t cols, UCL_Device &dev)
{ view(input+offset,rows,cols,dev); }
-
+
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * - Viewing a device pointer on the host is not supported
- * \param stride Number of _elements_ between the start of each row **/
+ * allocating container when using CUDA APIs
+ * - Viewing a device pointer on the host is not supported
+ * \param stride Number of _elements_ between the start of each row **/
template
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
- const size_t cols,const size_t stride,UCL_Device &dev)
+ const size_t cols,const size_t stride,UCL_Device &dev)
{ view(input+offset,rows,cols,stride,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
- * allocating container when using CUDA APIs
- * - Viewing a device pointer on the host is not supported **/
+ * allocating container when using CUDA APIs
+ * - Viewing a device pointer on the host is not supported **/
template
- inline void view_offset(const size_t offset, ptr_type *input,
+ inline void view_offset(const size_t offset, ptr_type *input,
const size_t cols, UCL_Device &dev)
{ view(input+offset,1,cols,dev); }
-
+
/// Free memory and set size to 0
- inline void clear()
+ inline void clear()
{ _host_free(*this); _kind=UCL_VIEW; _cols=0; }
/// Resize the allocation to contain cols elements
@@ -324,7 +324,7 @@ class UCL_H_Vec : public UCL_BaseMat {
assert(_kind!=UCL_VIEW);
_row_bytes=cols*sizeof(numtyp);
int err=_host_resize(*this,_row_bytes);
-
+
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes
@@ -340,7 +340,7 @@ class UCL_H_Vec : public UCL_BaseMat {
_end=_array+cols;
return err;
}
-
+
/// Resize (only if bigger) the allocation to contain cols elements
/** \note Cannot be used on views **/
inline int resize_ib(const int cols)
@@ -348,7 +348,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/// Set each element to zero
inline void zero() { _host_zero(_array,row_bytes()); }
-
+
/// Set first n elements to zero
inline void zero(const int n) { _host_zero(_array,n*sizeof(numtyp)); }
@@ -373,35 +373,35 @@ class UCL_H_Vec : public UCL_BaseMat {
inline size_t row_bytes() const { return _row_bytes; }
/// Get the size in bytes of 1 element
inline int element_size() const { return sizeof(numtyp); }
-
+
/// Get element at index i
inline numtyp & operator[](const int i) { return _array[i]; }
/// Get element at index i
inline const numtyp & operator[](const int i) const { return _array[i]; }
- /// 2D access (row should always be 0)
- inline numtyp & operator()(const int row, const int col)
+ /// 2D access (row should always be 0)
+ inline numtyp & operator()(const int row, const int col)
{ return _array[col]; }
- /// 2D access (row should always be 0)
+ /// 2D access (row should always be 0)
inline const numtyp & operator()(const int row, const int col) const
{ return _array[col]; }
-
+
/// Returns pointer to memory pointer for allocation on host
inline numtyp ** host_ptr() { return &_array; }
-
+
/// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t offset() const { return 0; }
/// Return the offset (in bytes) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t byteoff() const { return 0; }
-
+
#ifdef _OCL_MAT
/// For OpenCL, returns a reference to the cl_mem object
inline device_ptr & cbegin() { return _carray; }
/// For OpenCL, returns a reference to the cl_mem object
inline const device_ptr & cbegin() const { return _carray; }
#endif
-
+
private:
numtyp *_array, *_end;
size_t _row_bytes, _cols;
diff --git a/lib/gpu/geryon/ucl_matrix.h b/lib/gpu/geryon/ucl_matrix.h
index 301325b454..b93d1c7f68 100644
--- a/lib/gpu/geryon/ucl_matrix.h
+++ b/lib/gpu/geryon/ucl_matrix.h
@@ -34,25 +34,25 @@ class UCL_Matrix {
ROW_MAJOR = 1,
VECTOR = 0
};
- typedef hosttype data_type;
+ typedef hosttype data_type;
/// Host Allocation
UCL_H_Mat host;
-
+
/// Device Allocation
UCL_D_Mat device;
UCL_Matrix() { }
~UCL_Matrix() { }
-
+
/// Construct with specied number of rows and columns
/** \sa alloc() **/
- UCL_Matrix(const size_t rows, const size_t cols, UCL_Device &acc,
+ UCL_Matrix(const size_t rows, const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ _ucl_s_obj_help< ucl_same_type::ans >::
alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
-
+
/// Set up host matrix with specied # of rows/cols and reserve memory
/** The kind1 parameter controls memory access from the host
* - UCL_READ_WRITE - Specify that you will read and write from host
@@ -74,7 +74,7 @@ class UCL_Matrix {
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type::ans >::
alloc(host,device,_buffer,rows,cols,cq,kind1,kind2); }
-
+
/// Set up host matrix with specied # of rows/cols and reserve memory
/** The kind1 parameter controls memory access from the host
* - UCL_READ_WRITE - Specify that you will read and write from host
@@ -92,9 +92,9 @@ class UCL_Matrix {
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type::ans >::
alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
-
+
/// Free memory and set size to 0
- inline void clear()
+ inline void clear()
{ host.clear(); device.clear(); }
/// Resize the allocation to contain cols elements
@@ -106,10 +106,10 @@ class UCL_Matrix {
return _ucl_s_obj_help< ucl_same_type::ans >::
dev_resize(device,host,_buffer,rows,cols);
}
-
+
/// Resize (only if bigger) the allocation to contain cols elements
inline int resize_ib(const int new_rows, const int new_cols)
- { if (new_rows>rows() || new_cols>cols()) return resize(new_rows,new_cols);
+ { if (new_rows>rows() || new_cols>cols()) return resize(new_rows,new_cols);
else return UCL_SUCCESS; }
/// Set each element to zero (asynchronously on device)
@@ -118,14 +118,14 @@ class UCL_Matrix {
inline void zero(const int n) { zero(n,cq()); }
/// Set each element to zero (asynchronously on device)
inline void zero(command_queue &cq) {
- host.zero();
+ host.zero();
if (device.kind()!=UCL_VIEW) device.zero(cq);
else if (_buffer.numel()>0) _buffer.zero();
}
/// Set first n elements to zero (asynchronously on device)
- inline void zero(const int n, command_queue &cq) {
- host.zero(n);
- if (device.kind()!=UCL_VIEW) device.zero(n,cq);
+ inline void zero(const int n, command_queue &cq) {
+ host.zero(n);
+ if (device.kind()!=UCL_VIEW) device.zero(n,cq);
else if (_buffer.numel()>0) _buffer.zero();
}
@@ -136,26 +136,26 @@ class UCL_Matrix {
/// Get the number of columns
inline size_t cols() const { return host.cols(); }
/// Get the memory usage (bytes) of the s-object (including any buffers)
- inline size_t host_mem_usage()
+ inline size_t host_mem_usage()
{ return host.row_bytes()*host.rows()+_buffer.row_bytes()*_buffer.rows(); }
/// Get the memory usage (bytes) of the s-object (including any buffers)
- inline size_t device_mem_usage()
+ inline size_t device_mem_usage()
{ return device.row_bytes()*device.rows(); }
-
+
/// Get element at index i
inline hosttype & operator[](const int i) { return host[i]; }
/// Get element at index i
inline const hosttype & operator[](const int i) const { return host[i]; }
- /// 2D access (row should always be 0)
- inline hosttype & operator()(const int row, const int col)
+ /// 2D access (row should always be 0)
+ inline hosttype & operator()(const int row, const int col)
{ return host(row,col); }
- /// 2D access (row should always be 0)
+ /// 2D access (row should always be 0)
inline const hosttype & operator()(const int row, const int col) const
{ return host(row,col); }
-
+
/// Returns pointer to memory pointer for allocation on host
inline hosttype ** host_ptr() { return host.host_ptr(); }
-
+
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return host.cq(); }
/// Change the default command queue associated with this data
@@ -172,7 +172,7 @@ class UCL_Matrix {
/// Update the allocation on the host asynchronously
- inline void update_host()
+ inline void update_host()
{ _ucl_s_obj_help< ucl_same_type::ans >::
copy(host,device,_buffer,true); }
/// Update the allocation on the host (true for asynchronous copy)
@@ -202,7 +202,7 @@ class UCL_Matrix {
/// Update the allocation on the device asynchronously
- inline void update_device()
+ inline void update_device()
{ _ucl_s_obj_help< ucl_same_type::ans >::
copy(device,host,_buffer,true); }
/// Update the allocation on the device (true for asynchronous copy)
diff --git a/lib/gpu/geryon/ucl_nv_kernel.h b/lib/gpu/geryon/ucl_nv_kernel.h
index bdba8ff7ae..437631ec3a 100644
--- a/lib/gpu/geryon/ucl_nv_kernel.h
+++ b/lib/gpu/geryon/ucl_nv_kernel.h
@@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
- certain rights in this software. This software is distributed under
+ certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@@ -53,9 +53,9 @@ typedef struct _double4 double4;
#define BLOCK_SIZE_Y blockDim.y
#define __kernel extern "C" __global__
#define __local __shared__
-#define __global
+#define __global
#define atom_add atomicAdd
-#define ucl_inline static __inline__ __device__
+#define ucl_inline static __inline__ __device__
#endif
diff --git a/lib/gpu/geryon/ucl_print.h b/lib/gpu/geryon/ucl_print.h
index 87b3d3d7ff..98ae8a8c06 100644
--- a/lib/gpu/geryon/ucl_print.h
+++ b/lib/gpu/geryon/ucl_print.h
@@ -17,10 +17,10 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
- certain rights in this software. This software is distributed under
+ certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
-
+
// Only allow this file to be included by nvc_memory.h and ocl_memory.h
#ifdef UCL_PRINT_ALLOW
@@ -40,7 +40,7 @@ template <> struct _ucl_print<1> {
}
template
static inline void p(mat_type &mat, const size_t rows, const size_t cols,
- std::ostream &out, const std::string delim,
+ std::ostream &out, const std::string delim,
const std::string row_delim) {
int offset=0;
int row_size=cols;
@@ -58,12 +58,12 @@ template <> struct _ucl_print<1> {
}
template
static inline void p(const mat_type &mat,const size_t rows,const size_t cols,
- std::ostream &out,const std::string delim,
+ std::ostream &out,const std::string delim,
const std::string row_delim, UCL_Device &dev) {
- p(mat,rows,cols,out,delim,row_delim);
+ p(mat,rows,cols,out,delim,row_delim);
}
};
-
+
template struct _ucl_print {
template
static inline void p(mat_type &mat, const size_t n, std::ostream &out,
@@ -83,7 +83,7 @@ template struct _ucl_print {
}
template