add utility functions to detect UTF-8 characters and substitute with ASCII equivalents
This commit is contained in:
@ -77,6 +77,12 @@ and parsing files or arguments.
|
||||
.. doxygenfunction:: trim_comment
|
||||
:project: progguide
|
||||
|
||||
.. doxygenfunction:: has_utf8
|
||||
:project: progguide
|
||||
|
||||
.. doxygenfunction:: utf8_subst
|
||||
:project: progguide
|
||||
|
||||
.. doxygenfunction:: count_words(const char *text)
|
||||
:project: progguide
|
||||
|
||||
|
||||
@ -548,7 +548,8 @@ int utils::expand_args(const char *file, int line, int narg, char **arg,
|
||||
Return string without leading or trailing whitespace
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
std::string utils::trim(const std::string &line) {
|
||||
std::string utils::trim(const std::string &line)
|
||||
{
|
||||
int beg = re_match(line.c_str(),"\\S+");
|
||||
int end = re_match(line.c_str(),"\\s+$");
|
||||
if (beg < 0) beg = 0;
|
||||
@ -561,7 +562,8 @@ std::string utils::trim(const std::string &line) {
|
||||
Return string without trailing # comment
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
std::string utils::trim_comment(const std::string &line) {
|
||||
std::string utils::trim_comment(const std::string &line)
|
||||
{
|
||||
auto end = line.find_first_of("#");
|
||||
if (end != std::string::npos) {
|
||||
return line.substr(0, end);
|
||||
@ -569,6 +571,51 @@ std::string utils::trim_comment(const std::string &line) {
|
||||
return std::string(line);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Replace UTF-8 encoded chars with known ASCII equivalents
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* Replace UTF-8 encoded characters that have a known ASCII equivalent.
 *
 * Recognized characters:
 *   U+02D6 MODIFIER LETTER PLUS SIGN   -> '+'
 *   U+02D7 MODIFIER LETTER MINUS SIGN  -> '-'
 *   U+2063 INVISIBLE SEPARATOR         -> ' '
 *   U+2064 INVISIBLE PLUS              -> '+'
 *   U+2212 MINUS SIGN                  -> '-'
 *
 * Every other byte of the input (plain ASCII, multi-byte characters without
 * a known replacement, truncated or malformed sequences) is copied to the
 * result unchanged, so text is never silently corrupted.
 *
 * \param  line  string that should be converted
 * \return new string with the ASCII replacements applied */
std::string utils::utf8_subst(const std::string &line)
{
  using size_type = std::string::size_type;

  const auto *const in = reinterpret_cast<const unsigned char *>(line.data());
  const size_type len = line.size();

  std::string out;
  out.reserve(len);    // the result is never longer than the input

  size_type i = 0;
  while (i < len) {
    const unsigned char lead = in[i];

    // sequence length announced by the lead byte: 110xxxxx, 1110xxxx, 11110xxx
    size_type nbytes = 1;
    if ((lead & 0xe0U) == 0xc0U) nbytes = 2;
    else if ((lead & 0xf0U) == 0xe0U) nbytes = 3;
    else if ((lead & 0xf8U) == 0xf0U) nbytes = 4;

    // all following bytes must exist and look like 10xxxxxx; otherwise the
    // sequence is truncated or malformed and the lead byte is passed through
    // on its own so that the remaining bytes are examined individually
    for (size_type k = 1; k < nbytes; ++k) {
      if ((i + k >= len) || ((in[i + k] & 0xc0U) != 0x80U)) {
        nbytes = 1;
        break;
      }
    }

    // look up the ASCII replacement; '\0' means "no known replacement"
    char subst = '\0';
    if (nbytes == 2) {
      if (lead == 0xcbU) {
        if (in[i + 1] == 0x96U) subst = '+';         // MODIFIER LETTER PLUS SIGN (U+02D6)
        else if (in[i + 1] == 0x97U) subst = '-';    // MODIFIER LETTER MINUS SIGN (U+02D7)
      }
    } else if (nbytes == 3) {
      if (lead == 0xe2U) {
        if ((in[i + 1] == 0x81U) && (in[i + 2] == 0xa3U)) subst = ' ';         // INVISIBLE SEPARATOR (U+2063)
        else if ((in[i + 1] == 0x81U) && (in[i + 2] == 0xa4U)) subst = '+';    // INVISIBLE PLUS (U+2064)
        else if ((in[i + 1] == 0x88U) && (in[i + 2] == 0x92U)) subst = '-';    // MINUS SIGN (U+2212)
      }
    }

    if (subst != '\0')
      out += subst;
    else
      out.append(line, i, nbytes);    // keep unknown characters intact

    i += nbytes;
  }
  return out;
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
return number of words
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
46
src/utils.h
46
src/utils.h
@ -197,18 +197,60 @@ namespace LAMMPS_NS {
|
||||
|
||||
/** Trim leading and trailing whitespace. Like TRIM() in Fortran.
|
||||
*
|
||||
* \param line string that should be trimmed
|
||||
* \param line string that should be trimmed
|
||||
* \return new string without whitespace (string) */
|
||||
|
||||
std::string trim(const std::string &line);
|
||||
|
||||
/** Return string with anything from '#' onward removed
|
||||
*
|
||||
* \param line string that should be trimmed
|
||||
* \param line string that should be trimmed
|
||||
* \return new string without comment (string) */
|
||||
|
||||
std::string trim_comment(const std::string &line);
|
||||
|
||||
/** Check if a string will likely have UTF-8 encoded characters
|
||||
*
|
||||
* UTF-8 uses the 7-bit standard ASCII table for the first 128 characters and
|
||||
* all other characters are encoded as multiple bytes. For the multi-byte
|
||||
* characters the first byte has either the highest two, three, or four bits
|
||||
* set followed by a zero bit and followed by one, two, or three more bytes,
|
||||
* respectively, where the highest bit is set and the second highest bit set
|
||||
* to 0. The remaining bits combined are the character code, which is thus
|
||||
* limited to 21-bits.
|
||||
*
|
||||
* For the sake of efficiency this test only checks if a character in the string
|
||||
* has the highest two bits set and thus is likely a UTF-8 character. It does not verify that the bytes form a valid UTF-8 sequence.
|
||||
*
|
||||
\verbatim embed:rst
|
||||
|
||||
*See also*
|
||||
:cpp:func:`utils::utf8_subst`
|
||||
|
||||
\endverbatim
|
||||
* \param line string that should be checked
|
||||
* \return true if string contains UTF-8 encoded characters (bool) */
|
||||
|
||||
inline bool has_utf8(const std::string &line)
{
  // The lead byte of every UTF-8 multi-byte character has its two highest
  // bits set (11xxxxxx). Both bits must be compared against the mask: a plain
  // "byte & 0xc0" is also non-zero for ordinary ASCII characters from '@'
  // (0x40) upward, which would flag nearly any text as UTF-8.
  for (const char c : line) {
    if ((static_cast<unsigned char>(c) & 0xc0U) == 0xc0U) return true;
  }
  return false;
}
|
||||
|
||||
/** Replace known UTF-8 characters with ASCII equivalents
|
||||
*
|
||||
\verbatim embed:rst
|
||||
|
||||
*See also*
|
||||
:cpp:func:`utils::has_utf8`
|
||||
|
||||
\endverbatim
|
||||
* \param line string that should be converted
|
||||
* \return new string with ascii replacements (string) */
|
||||
|
||||
std::string utf8_subst(const std::string &line);
|
||||
|
||||
/** Count words in string with custom choice of separating characters
|
||||
*
|
||||
* \param text string that should be searched
|
||||
|
||||
@ -54,6 +54,23 @@ TEST(Utils, trim_comment)
|
||||
ASSERT_THAT(trimmed, StrEq("some text "));
|
||||
}
|
||||
|
||||
TEST(Utils, has_utf8)
{
    // same three-character text, once with an ASCII hyphen-minus and once
    // with the multi-byte MINUS SIGN (U+2212)
    const char *plain_text = " -2";
    const char *fancy_text = " −2";

    ASSERT_FALSE(utils::has_utf8(plain_text));
    ASSERT_TRUE(utils::has_utf8(fancy_text));
}
|
||||
|
||||
TEST(Utils, utf8_subst)
{
    // substituting the MINUS SIGN (U+2212) variant must yield the same text
    // as passing the pure ASCII variant through unchanged
    const std::string from_plain = utils::utf8_subst(" -2");
    const std::string from_fancy = utils::utf8_subst(" −2");

    ASSERT_TRUE(from_plain == from_fancy);
}
|
||||
|
||||
TEST(Utils, count_words)
|
||||
{
|
||||
ASSERT_EQ(utils::count_words("some text # comment"), 4);
|
||||
|
||||
Reference in New Issue
Block a user