diff --git a/doc/src/Developer_utils.rst b/doc/src/Developer_utils.rst index bf1cca7413..17b6f13bad 100644 --- a/doc/src/Developer_utils.rst +++ b/doc/src/Developer_utils.rst @@ -77,6 +77,12 @@ and parsing files or arguments. .. doxygenfunction:: trim_comment :project: progguide +.. doxygenfunction:: has_utf8 + :project: progguide + +.. doxygenfunction:: utf8_subst + :project: progguide + .. doxygenfunction:: count_words(const char *text) :project: progguide diff --git a/src/utils.cpp b/src/utils.cpp index a58b384225..bd514226df 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -548,7 +548,8 @@ int utils::expand_args(const char *file, int line, int narg, char **arg, Return string without leading or trailing whitespace ------------------------------------------------------------------------- */ -std::string utils::trim(const std::string &line) { +std::string utils::trim(const std::string &line) +{ int beg = re_match(line.c_str(),"\\S+"); int end = re_match(line.c_str(),"\\s+$"); if (beg < 0) beg = 0; @@ -561,7 +562,8 @@ std::string utils::trim(const std::string &line) { Return string without trailing # comment ------------------------------------------------------------------------- */ -std::string utils::trim_comment(const std::string &line) { +std::string utils::trim_comment(const std::string &line) +{ auto end = line.find_first_of("#"); if (end != std::string::npos) { return line.substr(0, end); @@ -569,6 +571,51 @@ std::string utils::trim_comment(const std::string &line) { return std::string(line); } +/* ---------------------------------------------------------------------- + Replace UTF-8 encoded chars with known ASCII equivalents +------------------------------------------------------------------------- */ + +std::string utils::utf8_subst(const std::string &line) +{ + const unsigned char * const in = (const unsigned char *)line.c_str(); + const int len = line.size(); + std::string out; + + for (int i=0; i < len; ++i) { + + // UTF-8 2-byte character + if 
((in[i] & 0xe0U) == 0xc0U) { + if ((i+1) < len) { + // MODIFIER LETTER PLUS SIGN (U+02D6) + if ((in[i] == 0xcbU) && (in[i+1] == 0x96U)) + out += '+', ++i; + // MODIFIER LETTER MINUS SIGN (U+02D7) + if ((in[i] == 0xcbU) && (in[i+1] == 0x97U)) + out += '-', ++i; + } + // UTF-8 3-byte character + } else if ((in[i] & 0xf0U) == 0xe0U) { + if ((i+2) < len) { + // INVISIBLE SEPARATOR (U+2063) + if ((in[i] == 0xe2U) && (in[i+1] == 0x81U) && (in[i+2] == 0xa3U)) + out += ' ', i += 2; + // INVISIBLE PLUS (U+2064) + if ((in[i] == 0xe2U) && (in[i+1] == 0x81U) && (in[i+2] == 0xa4U)) + out += '+', i += 2; + // MINUS SIGN (U+2212) + if ((in[i] == 0xe2U) && (in[i+1] == 0x88U) && (in[i+2] == 0x92U)) + out += '-', i += 2; + } + // UTF-8 4-byte character + } else if ((in[i] & 0xe8U) == 0xf0U) { + if ((i+3) < len) { + ; + } + } else out += in[i]; + } + return out; +} + /* ---------------------------------------------------------------------- return number of words ------------------------------------------------------------------------- */ diff --git a/src/utils.h b/src/utils.h index 52f7933caf..ab1d52c57a 100644 --- a/src/utils.h +++ b/src/utils.h @@ -197,18 +197,60 @@ namespace LAMMPS_NS { /** Trim leading and trailing whitespace. Like TRIM() in Fortran. * - * \param line string that should be trimmed + * \param line string that should be trimmed * \return new string without whitespace (string) */ std::string trim(const std::string &line); /** Return string with anything from '#' onward removed * - * \param line string that should be trimmed + * \param line string that should be trimmed * \return new string without comment (string) */ std::string trim_comment(const std::string &line); + /** Check if a string will likely have UTF-8 encoded characters + * + * UTF-8 uses the 7-bit standard ASCII table for the first 127 characters and + * all other characters are encoded as multiple bytes. 
/** Check if a string contains UTF-8 encoded (i.e. non-ASCII) characters
 *
 * UTF-8 uses the 7-bit standard ASCII table for the first 128 characters
 * (codes 0 to 127) and all other characters are encoded as multiple bytes.
 * For the multi-byte characters the first byte has either the highest two,
 * three, or four bits set followed by a zero bit and followed by one, two,
 * or three more bytes, respectively, where the highest bit is set and the
 * second highest bit set to 0.  The remaining bits combined are the
 * character code, which is thus limited to 21-bits.
 *
 * For the sake of efficiency this test does not validate the encoding.  It
 * only checks whether any byte in the string has its highest bit set.  That
 * is the case for every byte of a multi-byte UTF-8 character and never for
 * a 7-bit ASCII character.
 *
\verbatim embed:rst

*See also*
   :cpp:func:`utils::utf8_subst`

\endverbatim
 * \param line  string that should be checked
 * \return true if string contains UTF-8 encoded characters (bool) */

inline bool has_utf8(const std::string &line)
{
  // Convert each char to unsigned so the bit test does not depend on
  // whether plain char is signed on this platform.
  for (const unsigned char c : line)
    if (c & 0x80U) return true;
  return false;
}
utils::utf8_subst(utf8_string); + ASSERT_TRUE(ascii == utf8); +} + TEST(Utils, count_words) { ASSERT_EQ(utils::count_words("some text # comment"), 4);