diff --git a/doc/src/Commands_parse.rst b/doc/src/Commands_parse.rst index 37283823d7..64d5100715 100644 --- a/doc/src/Commands_parse.rst +++ b/doc/src/Commands_parse.rst @@ -162,3 +162,26 @@ LAMMPS: triple quotes can be nested in the usual manner. See the doc pages for those commands for examples. Only one of level of nesting is allowed, but that should be sufficient for most use cases. + +.. admonition:: ASCII versus UTF-8 + :class: note + + LAMMPS expects and processes 7-bit ASCII format text internally. + Many modern environments use UTF-8 encoding, which is a superset + of the 7-bit ASCII character table and thus mostly compatible. + However, there are several non-ASCII characters that can look + very similar to their ASCII equivalents or are invisible (so they + look like a blank), but are encoded differently. Web browsers, + PDF viewers, and document editors are known to sometimes replace one + with the other for better-looking output. However, that can + lead to problems, for instance, when using copy-and-paste of input + file examples from web pages, or when using a document editor + (not a dedicated plain text editor) for writing LAMMPS inputs. + LAMMPS will try to detect this and substitute the non-ASCII + characters with their ASCII equivalents where known. A warning + is also printed when this occurs. It is + recommended to avoid such characters altogether in LAMMPS input, + data, and potential files. The replacement tables are likely + incomplete and depend on users reporting problems with processing + correct-looking input containing UTF-8 encoded non-ASCII + characters. diff --git a/doc/src/Developer_utils.rst b/doc/src/Developer_utils.rst index bf1cca7413..17b6f13bad 100644 --- a/doc/src/Developer_utils.rst +++ b/doc/src/Developer_utils.rst @@ -77,6 +77,12 @@ and parsing files or arguments. .. doxygenfunction:: trim_comment :project: progguide +.. doxygenfunction:: has_utf8 + :project: progguide + +.. 
doxygenfunction:: utf8_subst + :project: progguide + .. doxygenfunction:: count_words(const char *text) :project: progguide diff --git a/src/input.cpp b/src/input.cpp index 457cf74b1a..df5cf0efbe 100644 --- a/src/input.cpp +++ b/src/input.cpp @@ -106,6 +106,7 @@ Input::Input(LAMMPS *lmp, int argc, char **argv) : Pointers(lmp) label_active = 0; labelstr = nullptr; jump_skip = 0; + utf8_warn = true; if (me == 0) { nfile = 1; @@ -421,6 +422,16 @@ void Input::parse() ptr++; } + if (utils::has_utf8(copy)) { + std::string buf = utils::utf8_subst(copy); + strcpy(copy,buf.c_str()); + if (utf8_warn && (comm->me == 0)) + error->warning(FLERR,"Detected non-ASCII characters in input. " + "Will try to continue by replacing with ASCII " + "equivalents where known."); + utf8_warn = false; + } + // perform $ variable substitution (print changes) // except if searching for a label since earlier variable may not be defined diff --git a/src/input.h b/src/input.h index b8ffb276f9..a86b10a686 100644 --- a/src/input.h +++ b/src/input.h @@ -54,6 +54,7 @@ class Input : protected Pointers { int label_active; // 0 = no label, 1 = looking for label char *labelstr; // label string being looked for int jump_skip; // 1 if skipping next jump, 0 otherwise + bool utf8_warn; // true if need to warn about UTF-8 chars FILE **infiles; // list of open input files diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index a9d82ca913..2cc0de72e1 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -210,6 +210,7 @@ std::string ValueTokenizer::next_string() { int ValueTokenizer::next_int() { if (has_next()) { std::string current = tokens.next(); + if (utils::has_utf8(current)) current = utils::utf8_subst(current); if (!utils::is_integer(current)) { throw InvalidIntegerException(current); } @@ -225,6 +226,7 @@ int ValueTokenizer::next_int() { bigint ValueTokenizer::next_bigint() { if (has_next()) { std::string current = tokens.next(); + if (utils::has_utf8(current)) current = utils::utf8_subst(current); 
if (!utils::is_integer(current)) { throw InvalidIntegerException(current); } @@ -240,6 +242,7 @@ bigint ValueTokenizer::next_bigint() { tagint ValueTokenizer::next_tagint() { if (has_next()) { std::string current = tokens.next(); + if (utils::has_utf8(current)) current = utils::utf8_subst(current); if (!utils::is_integer(current)) { throw InvalidIntegerException(current); } @@ -255,6 +258,7 @@ tagint ValueTokenizer::next_tagint() { double ValueTokenizer::next_double() { if (has_next()) { std::string current = tokens.next(); + if (utils::has_utf8(current)) current = utils::utf8_subst(current); if (!utils::is_double(current)) { throw InvalidFloatException(current); } diff --git a/src/utils.cpp b/src/utils.cpp index a58b384225..80800397d3 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -237,28 +237,27 @@ double utils::numeric(const char *file, int line, const char *str, if (str) n = strlen(str); if (n == 0) { - if (do_abort) - lmp->error->one(file,line,"Expected floating point parameter instead of" - " NULL or empty string in input script or data file"); - else - lmp->error->all(file,line,"Expected floating point parameter instead of" - " NULL or empty string in input script or data file"); - } - - for (int i = 0; i < n; i++) { - if (isdigit(str[i])) continue; - if (str[i] == '-' || str[i] == '+' || str[i] == '.') continue; - if (str[i] == 'e' || str[i] == 'E') continue; - std::string msg("Expected floating point parameter instead of '"); - msg += str; - msg += "' in input script or data file"; + const char msg[] = "Expected floating point parameter instead of" + " NULL or empty string in input script or data file"; if (do_abort) lmp->error->one(file,line,msg); else lmp->error->all(file,line,msg); } - return atof(str); + std::string buf(str); + if (has_utf8(buf)) buf = utf8_subst(buf); + + if (buf.find_first_not_of("0123456789-+.eE") != std::string::npos) { + std::string msg("Expected floating point parameter instead of '"); + msg += buf + "' in input script or data 
file"; + if (do_abort) + lmp->error->one(file,line,msg); + else + lmp->error->all(file,line,msg); + } + + return atof(buf.c_str()); } /* ---------------------------------------------------------------------- @@ -274,26 +273,27 @@ int utils::inumeric(const char *file, int line, const char *str, if (str) n = strlen(str); if (n == 0) { - if (do_abort) - lmp->error->one(file,line,"Expected integer parameter instead of " - "NULL or empty string in input script or data file"); - else - lmp->error->all(file,line,"Expected integer parameter instead of " - "NULL or empty string in input script or data file"); - } - - for (int i = 0; i < n; i++) { - if (isdigit(str[i]) || str[i] == '-' || str[i] == '+') continue; - std::string msg("Expected integer parameter instead of '"); - msg += str; - msg += "' in input script or data file"; + const char msg[] = "Expected integer parameter instead of" + " NULL or empty string in input script or data file"; if (do_abort) lmp->error->one(file,line,msg); else lmp->error->all(file,line,msg); } - return atoi(str); + std::string buf(str); + if (has_utf8(buf)) buf = utf8_subst(buf); + + if (buf.find_first_not_of("0123456789-+") != std::string::npos) { + std::string msg("Expected integer parameter instead of '"); + msg += buf + "' in input script or data file"; + if (do_abort) + lmp->error->one(file,line,msg); + else + lmp->error->all(file,line,msg); + } + + return atoi(buf.c_str()); } /* ---------------------------------------------------------------------- @@ -309,26 +309,27 @@ bigint utils::bnumeric(const char *file, int line, const char *str, if (str) n = strlen(str); if (n == 0) { - if (do_abort) - lmp->error->one(file,line,"Expected integer parameter instead of " - "NULL or empty string in input script or data file"); - else - lmp->error->all(file,line,"Expected integer parameter instead of " - "NULL or empty string in input script or data file"); - } - - for (int i = 0; i < n; i++) { - if (isdigit(str[i]) || str[i] == '-' || str[i] == 
'+') continue; - std::string msg("Expected integer parameter instead of '"); - msg += str; - msg += "' in input script or data file"; + const char msg[] = "Expected integer parameter instead of" + " NULL or empty string in input script or data file"; if (do_abort) lmp->error->one(file,line,msg); else lmp->error->all(file,line,msg); } - return ATOBIGINT(str); + std::string buf(str); + if (has_utf8(buf)) buf = utf8_subst(buf); + + if (buf.find_first_not_of("0123456789-+") != std::string::npos) { + std::string msg("Expected integer parameter instead of '"); + msg += buf + "' in input script or data file"; + if (do_abort) + lmp->error->one(file,line,msg); + else + lmp->error->all(file,line,msg); + } + + return ATOBIGINT(buf.c_str()); } /* ---------------------------------------------------------------------- @@ -344,26 +345,27 @@ tagint utils::tnumeric(const char *file, int line, const char *str, if (str) n = strlen(str); if (n == 0) { - if (do_abort) - lmp->error->one(file,line,"Expected integer parameter instead of " - "NULL or empty string in input script or data file"); - else - lmp->error->all(file,line,"Expected integer parameter instead of " - "NULL or empty string in input script or data file"); - } - - for (int i = 0; i < n; i++) { - if (isdigit(str[i]) || str[i] == '-' || str[i] == '+') continue; - std::string msg("Expected integer parameter instead of '"); - msg += str; - msg += "' in input script or data file"; + const char msg[] = "Expected integer parameter instead of" + " NULL or empty string in input script or data file"; if (do_abort) lmp->error->one(file,line,msg); else lmp->error->all(file,line,msg); } - return ATOTAGINT(str); + std::string buf(str); + if (has_utf8(buf)) buf = utf8_subst(buf); + + if (buf.find_first_not_of("0123456789-+") != std::string::npos) { + std::string msg("Expected integer parameter instead of '"); + msg += buf + "' in input script or data file"; + if (do_abort) + lmp->error->one(file,line,msg); + else + 
/* ----------------------------------------------------------------------
   Replace UTF-8 encoded chars with known ASCII equivalents

   Recognized multi-byte sequences are replaced by a single ASCII
   character.  Every other byte, including multi-byte sequences without
   a known ASCII equivalent and sequences truncated by the end of the
   string, is copied to the result unchanged.  The result is therefore
   never longer than the input.
------------------------------------------------------------------------- */

// Same signature as the declaration in utils.h; repeated here so that the
// fully qualified definition below does not depend on a using-directive.
namespace LAMMPS_NS {
namespace utils {
  std::string utf8_subst(const std::string &line);
}
}

std::string LAMMPS_NS::utils::utf8_subst(const std::string &line)
{
  const unsigned char *const in =
    reinterpret_cast<const unsigned char *>(line.data());
  const std::size_t len = line.size();

  std::string out;
  out.reserve(len);             // substitutions can only shrink the string

  std::size_t i = 0;
  while (i < len) {
    const unsigned char lead = in[i];
    std::size_t nbytes = 1;     // input bytes consumed when substituting
    char ascii = '\0';          // replacement character, '\0' if none known

    if (((lead & 0xe0U) == 0xc0U) && ((i+1) < len)) {
      // UTF-8 2-byte character
      nbytes = 2;
      if (lead == 0xcbU) {
        // MODIFIER LETTER PLUS SIGN (U+02D6)
        if (in[i+1] == 0x96U) ascii = '+';
        // MODIFIER LETTER MINUS SIGN (U+02D7)
        else if (in[i+1] == 0x97U) ascii = '-';
      }
    } else if (((lead & 0xf0U) == 0xe0U) && ((i+2) < len)) {
      // UTF-8 3-byte character
      nbytes = 3;
      if (lead == 0xe2U) {
        // INVISIBLE SEPARATOR (U+2063)
        if ((in[i+1] == 0x81U) && (in[i+2] == 0xa3U)) ascii = ' ';
        // INVISIBLE PLUS (U+2064)
        else if ((in[i+1] == 0x81U) && (in[i+2] == 0xa4U)) ascii = '+';
        // MINUS SIGN (U+2212)
        else if ((in[i+1] == 0x88U) && (in[i+2] == 0x92U)) ascii = '-';
      }
    }
    // UTF-8 4-byte characters, i.e. (lead & 0xf8U) == 0xf0U, have no known
    // ASCII equivalents yet and thus are passed through like any other byte.

    if (ascii != '\0') {
      out += ascii;
      i += nbytes;
    } else {
      // plain ASCII, continuation byte, or lead byte of an unknown
      // sequence: keep the byte so that the text is not corrupted
      out += static_cast<char>(lead);
      ++i;
    }
  }
  return out;
}
/** Check if a string will likely have UTF-8 encoded characters
 *
 * UTF-8 encodes the 128 characters of the 7-bit standard ASCII table as
 * single bytes with the highest bit cleared; all other characters are
 * encoded as multiple bytes.  For the multi-byte characters the first byte
 * has either the highest two, three, or four bits set followed by a zero
 * bit and is followed by one, two, or three more bytes, respectively, where
 * the highest bit is set and the second highest bit is set to 0.  The
 * remaining bits combined are the character code, which is thus limited
 * to 21 bits.
 *
 * For the sake of efficiency this test only checks whether any byte in the
 * string has the highest bit set, which makes it very likely that the string
 * contains a UTF-8 character.  It cannot tell whether this is a valid UTF-8
 * character or whether it is a 2-byte, 3-byte, or 4-byte character.
 *
\verbatim embed:rst

*See also*
   :cpp:func:`utils::utf8_subst`

\endverbatim
 * \param line  string that should be checked
 * \return      true if string contains UTF-8 encoded characters (bool) */

inline bool has_utf8(const std::string &line)
{
  // inspect every byte, including any embedded NUL characters; iterating
  // over the string avoids comparing a signed index with line.size()
  for (const char c : line)
    if (static_cast<unsigned char>(c) & 0x80U) return true;
  return false;
}