add utility functions to detect UTF-8 characters and substitute with ASCII equivalents

2021-01-25 01:32:56 -05:00
parent 70998c0509
commit 72168d6780
4 changed files with 116 additions and 4 deletions
--- a/doc/src/Developer_utils.rst
+++ b/doc/src/Developer_utils.rst
@ -77,6 +77,12 @@ and parsing files or arguments.
 .. doxygenfunction:: trim_comment
   :project: progguide

+.. doxygenfunction:: has_utf8
+   :project: progguide
+
+.. doxygenfunction:: utf8_subst
+   :project: progguide
+
 .. doxygenfunction:: count_words(const char *text)
   :project: progguide

--- a/src/utils.cpp
+++ b/src/utils.cpp
@ -548,7 +548,8 @@ int utils::expand_args(const char *file, int line, int narg, char **arg,
   Return string without leading or trailing whitespace
 ------------------------------------------------------------------------- */

-std::string utils::trim(const std::string &line) {
+std::string utils::trim(const std::string &line)
+{
  int beg = re_match(line.c_str(),"\\S+");
  int end = re_match(line.c_str(),"\\s+$");
  if (beg < 0) beg = 0;
@ -561,7 +562,8 @@ std::string utils::trim(const std::string &line) {
   Return string without trailing # comment
 ------------------------------------------------------------------------- */

-std::string utils::trim_comment(const std::string &line) {
+std::string utils::trim_comment(const std::string &line)
+{
  auto end = line.find_first_of("#");
  if (end != std::string::npos) {
    return line.substr(0, end);
@ -569,6 +571,51 @@ std::string utils::trim_comment(const std::string &line) {
  return std::string(line);
 }

+/* ----------------------------------------------------------------------
+   Replace UTF-8 encoded chars with known ASCII equivalents.
+   Bytes that are not part of a known substitution (plain ASCII as well
+   as unrecognized or truncated multi-byte sequences) are copied unchanged.
+------------------------------------------------------------------------- */
+
+std::string utils::utf8_subst(const std::string &line)
+{
+  const unsigned char * const in = (const unsigned char *)line.c_str();
+  const std::size_t len = line.size();
+  std::string out;
+
+  // a substitution never makes the text longer, so len is an upper bound
+  out.reserve(len);
+
+  for (std::size_t i=0; i < len; ++i) {
+    char subst = '\0';       // ASCII replacement; '\0' means "no known substitute"
+    std::size_t extra = 0;   // continuation bytes consumed together with in[i]
+
+    // UTF-8 2-byte character
+    if ((in[i] & 0xe0U) == 0xc0U) {
+      if ((i+1) < len) {
+        // MODIFIER LETTER PLUS SIGN (U+02D6)
+        if ((in[i] == 0xcbU) && (in[i+1] == 0x96U))
+          subst = '+';
+        // MODIFIER LETTER MINUS SIGN (U+02D7)
+        else if ((in[i] == 0xcbU) && (in[i+1] == 0x97U))
+          subst = '-';
+        if (subst) extra = 1;
+      }
+    // UTF-8 3-byte character
+    } else if ((in[i] & 0xf0U) == 0xe0U) {
+      if ((i+2) < len) {
+        // INVISIBLE SEPARATOR (U+2063)
+        if ((in[i] == 0xe2U) && (in[i+1] == 0x81U) && (in[i+2] == 0xa3U))
+          subst = ' ';
+        // INVISIBLE PLUS (U+2064)
+        else if ((in[i] == 0xe2U) && (in[i+1] == 0x81U) && (in[i+2] == 0xa4U))
+          subst = '+';
+        // MINUS SIGN (U+2212)
+        else if ((in[i] == 0xe2U) && (in[i+1] == 0x88U) && (in[i+2] == 0x92U))
+          subst = '-';
+        if (subst) extra = 2;
+      }
+    // UTF-8 4-byte character: lead byte is 11110xxx, hence the 0xf8 mask
+    } else if ((in[i] & 0xf8U) == 0xf0U) {
+      if ((i+3) < len) {
+        ;  // no substitutions defined for 4-byte characters yet
+      }
+    }
+
+    if (subst) {
+      // emit the replacement and skip the continuation bytes it stands for
+      out += subst;
+      i += extra;
+    } else {
+      // keep the byte as is, so that unknown characters are not mangled
+      out += static_cast<char>(in[i]);
+    }
+  }
+  return out;
+}
+
 /* ----------------------------------------------------------------------
   return number of words
 ------------------------------------------------------------------------- */
--- a/src/utils.h
+++ b/src/utils.h
@ -197,18 +197,60 @@ namespace LAMMPS_NS {

    /** Trim leading and trailing whitespace. Like TRIM() in Fortran.
     *
-     * \param line string that should be trimmed
+     * \param line  string that should be trimmed
     * \return new string without whitespace (string) */

    std::string trim(const std::string &line);

    /** Return string with anything from '#' onward removed
     *
-     * \param line string that should be trimmed
+     * \param line  string that should be trimmed
     * \return new string without comment (string) */

    std::string trim_comment(const std::string &line);

+    /** Check if a string will likely have UTF-8 encoded characters
+     *
+     * UTF-8 uses the 7-bit standard ASCII table for the first 128 characters
+     * (codes 0 to 127) and all other characters are encoded as multiple bytes.
+     * For the multi-byte characters the first byte has either the highest two,
+     * three, or four bits set followed by a zero bit and followed by one, two,
+     * or three more bytes, respectively, where the highest bit is set and the
+     * second highest bit set to 0.  The remaining bits combined are the
+     * character code, which is thus limited to 21-bits.
+     *
+     * For the sake of efficiency this test only checks if a character in the
+     * string has the highest bit set and thus is likely part of a UTF-8
+     * multi-byte character.  It does not verify that the bytes form a valid
+     * UTF-8 sequence.
+     *
+\verbatim embed:rst
+
+*See also*
+   :cpp:func:`utils::utf8_subst`
+
+\endverbatim
+     * \param line  string that should be checked
+     * \return true if string contains UTF-8 encoded characters (bool) */
+
+    inline bool has_utf8(const std::string &line)
+    {
+      // 7-bit ASCII never sets bit 7; every byte of a multi-byte character does
+      for (const unsigned char c : line) if (c & 0x80U) return true;
+      return false;
+    }
+
+    /** Replace known UTF-8 characters with ASCII equivalents
+     *
+     * The substitutions currently implemented in utils.cpp are:
+     * MODIFIER LETTER PLUS SIGN (U+02D6) to '+', MODIFIER LETTER MINUS SIGN
+     * (U+02D7) to '-', INVISIBLE SEPARATOR (U+2063) to ' ', INVISIBLE PLUS
+     * (U+2064) to '+', and MINUS SIGN (U+2212) to '-'.
+     *
+\verbatim embed:rst
+
+*See also*
+   :cpp:func:`utils::has_utf8`
+
+\endverbatim
+     * \param line  string that should be converted
+     * \return new string with ascii replacements (string) */
+
+    std::string utf8_subst(const std::string &line);
+
    /** Count words in string with custom choice of separating characters
     *
     * \param text string that should be searched
--- a/unittest/utils/test_utils.cpp
+++ b/unittest/utils/test_utils.cpp
@ -54,6 +54,23 @@ TEST(Utils, trim_comment)
    ASSERT_THAT(trimmed, StrEq("some text "));
 }

+TEST(Utils, has_utf8)
+{
+    // number written with the plain ASCII hyphen-minus: must not be flagged
+    const std::string plain_text(" -2");
+    ASSERT_FALSE(utils::has_utf8(plain_text));
+
+    // same number written with MINUS SIGN (U+2212): must be flagged
+    const std::string unicode_text(" −2");
+    ASSERT_TRUE(utils::has_utf8(unicode_text));
+}
+
+TEST(Utils, utf8_subst)
+{
+    // plain ASCII input has to pass through unchanged
+    const char ascii_string[] = " -2";
+    auto ascii = utils::utf8_subst(ascii_string);
+    ASSERT_THAT(ascii, StrEq(" -2"));
+
+    // MINUS SIGN (U+2212) has to be replaced with the ASCII hyphen-minus.
+    // Compare against the expected literal instead of the other result, so
+    // that the test cannot pass when both conversions are wrong the same way.
+    const char utf8_string[] = " −2";
+    auto utf8  = utils::utf8_subst(utf8_string);
+    ASSERT_THAT(utf8, StrEq(" -2"));
+}
+
 TEST(Utils, count_words)
 {
    ASSERT_EQ(utils::count_words("some text # comment"), 4);