apply UTF-8 character replacement before creating tokens

This commit is contained in:
Axel Kohlmeyer
2021-03-19 16:10:37 -04:00
committed by GitHub
parent 6503a7c3ba
commit 9707771f1c

View File

@ -35,12 +35,14 @@ TokenizerException::TokenizerException(const std::string &msg, const std::string
/** Class for splitting text into words
*
* This tokenizer will break down a string into sub-strings (i.e., words)
* separated by the given separator characters.
* separated by the given separator characters. If the string contains
* certain known UTF-8 characters, they will be replaced by their ASCII
* equivalents before processing the string.
*
\verbatim embed:rst
*See also*
:cpp:class:`ValueTokenizer`, :cpp:func:`utils::split_words`
:cpp:class:`ValueTokenizer`, :cpp:func:`utils::split_words`, :cpp:func:`utils::utf8_subst`
\endverbatim
*
@ -50,6 +52,8 @@ TokenizerException::TokenizerException(const std::string &msg, const std::string
Tokenizer::Tokenizer(const std::string &str, const std::string &separators) :
    text(str), separators(separators), start(0), ntokens(std::string::npos)
{
    // The substitution is applied to the stored copy of the input, once,
    // so known UTF-8 characters are already replaced by their ASCII
    // equivalents when reset() runs below.
    const bool contains_utf8 = utils::has_utf8(text);
    if (contains_utf8) {
        text = utils::utf8_subst(text);
    }

    reset();
}
@ -205,7 +209,6 @@ std::string ValueTokenizer::next_string() {
* \return value of next token */
int ValueTokenizer::next_int() {
std::string current = tokens.next();
if (utils::has_utf8(current)) current = utils::utf8_subst(current);
if (!utils::is_integer(current)) {
throw InvalidIntegerException(current);
}
@ -217,7 +220,6 @@ int ValueTokenizer::next_int() {
* \return value of next token */
bigint ValueTokenizer::next_bigint() {
std::string current = tokens.next();
if (utils::has_utf8(current)) current = utils::utf8_subst(current);
if (!utils::is_integer(current)) {
throw InvalidIntegerException(current);
}
@ -229,7 +231,6 @@ bigint ValueTokenizer::next_bigint() {
* \return value of next token */
tagint ValueTokenizer::next_tagint() {
std::string current = tokens.next();
if (utils::has_utf8(current)) current = utils::utf8_subst(current);
if (!utils::is_integer(current)) {
throw InvalidIntegerException(current);
}
@ -241,7 +242,6 @@ tagint ValueTokenizer::next_tagint() {
* \return value of next token */
double ValueTokenizer::next_double() {
std::string current = tokens.next();
if (utils::has_utf8(current)) current = utils::utf8_subst(current);
if (!utils::is_double(current)) {
throw InvalidFloatException(current);
}