add UTF-8 substitution and detection

This commit is contained in:
Axel Kohlmeyer
2022-09-04 06:45:11 -04:00
parent 34104bbbe9
commit 26e1b7c44d
7 changed files with 57 additions and 26 deletions

View File

@ -1185,6 +1185,7 @@ void Atom::data_atoms(int n, char *buf, tagint id_offset, tagint mol_offset,
coord[1] >= sublo[1] && coord[1] < subhi[1] && coord[1] >= sublo[1] && coord[1] < subhi[1] &&
coord[2] >= sublo[2] && coord[2] < subhi[2]) { coord[2] >= sublo[2] && coord[2] < subhi[2]) {
avec->data_atom(xdata,imagedata,values,typestr); avec->data_atom(xdata,imagedata,values,typestr);
typestr = utils::utf8_subst(typestr);
if (id_offset) tag[nlocal-1] += id_offset; if (id_offset) tag[nlocal-1] += id_offset;
if (mol_offset) molecule[nlocal-1] += mol_offset; if (mol_offset) molecule[nlocal-1] += mol_offset;
// clang-format on // clang-format on
@ -1291,7 +1292,7 @@ void Atom::data_bonds(int n, char *buf, int *count, tagint id_offset,
// Bonds line is: number(ignored), bond type, atomID 1, atomID 2 // Bonds line is: number(ignored), bond type, atomID 1, atomID 2
if (nwords > 0) { if (nwords > 0) {
if (nwords != 4) error->all(FLERR, "Incorrect format in {}: {}", location, utils::trim(buf)); if (nwords != 4) error->all(FLERR, "Incorrect format in {}: {}", location, utils::trim(buf));
typestr = values[1]; typestr = utils::utf8_subst(values[1]);
atom1 = utils::tnumeric(FLERR, values[2], false, lmp); atom1 = utils::tnumeric(FLERR, values[2], false, lmp);
atom2 = utils::tnumeric(FLERR, values[3], false, lmp); atom2 = utils::tnumeric(FLERR, values[3], false, lmp);
if (id_offset) { if (id_offset) {
@ -1388,7 +1389,7 @@ void Atom::data_angles(int n, char *buf, int *count, tagint id_offset,
// Angles line is: number(ignored), angle type, atomID 1, atomID 2, atomID 3 // Angles line is: number(ignored), angle type, atomID 1, atomID 2, atomID 3
if (nwords > 0) { if (nwords > 0) {
if (nwords != 5) error->all(FLERR, "Incorrect format in {}: {}", location, utils::trim(buf)); if (nwords != 5) error->all(FLERR, "Incorrect format in {}: {}", location, utils::trim(buf));
typestr = values[1]; typestr = utils::utf8_subst(values[1]);
atom1 = utils::tnumeric(FLERR, values[2], false, lmp); atom1 = utils::tnumeric(FLERR, values[2], false, lmp);
atom2 = utils::tnumeric(FLERR, values[3], false, lmp); atom2 = utils::tnumeric(FLERR, values[3], false, lmp);
atom3 = utils::tnumeric(FLERR, values[4], false, lmp); atom3 = utils::tnumeric(FLERR, values[4], false, lmp);
@ -1501,7 +1502,7 @@ void Atom::data_dihedrals(int n, char *buf, int *count, tagint id_offset,
// Dihedrals line is: number(ignored), bond type, atomID 1, atomID 2, atomID 3, atomID 4 // Dihedrals line is: number(ignored), bond type, atomID 1, atomID 2, atomID 3, atomID 4
if (nwords > 0) { if (nwords > 0) {
if (nwords != 6) error->all(FLERR, "Incorrect format in {}: {}", location, utils::trim(buf)); if (nwords != 6) error->all(FLERR, "Incorrect format in {}: {}", location, utils::trim(buf));
typestr = values[1]; typestr = utils::utf8_subst(values[1]);
atom1 = utils::tnumeric(FLERR, values[2], false, lmp); atom1 = utils::tnumeric(FLERR, values[2], false, lmp);
atom2 = utils::tnumeric(FLERR, values[3], false, lmp); atom2 = utils::tnumeric(FLERR, values[3], false, lmp);
atom3 = utils::tnumeric(FLERR, values[4], false, lmp); atom3 = utils::tnumeric(FLERR, values[4], false, lmp);
@ -1633,7 +1634,7 @@ void Atom::data_impropers(int n, char *buf, int *count, tagint id_offset,
// Impropers line is: number(ignored), bond type, atomID 1, atomID 2, atomID 3, atomID 4 // Impropers line is: number(ignored), bond type, atomID 1, atomID 2, atomID 3, atomID 4
if (nwords > 0) { if (nwords > 0) {
if (nwords != 6) error->all(FLERR, "Incorrect format in {}: {}", location, utils::trim(buf)); if (nwords != 6) error->all(FLERR, "Incorrect format in {}: {}", location, utils::trim(buf));
typestr = values[1]; typestr = utils::utf8_subst(values[1]);
atom1 = utils::tnumeric(FLERR, values[2], false, lmp); atom1 = utils::tnumeric(FLERR, values[2], false, lmp);
atom2 = utils::tnumeric(FLERR, values[3], false, lmp); atom2 = utils::tnumeric(FLERR, values[3], false, lmp);
atom3 = utils::tnumeric(FLERR, values[4], false, lmp); atom3 = utils::tnumeric(FLERR, values[4], false, lmp);
@ -1934,30 +1935,44 @@ void Atom::set_mass(const char *file, int line, int itype, double value)
void Atom::set_mass(const char *file, int line, int /*narg*/, char **arg) void Atom::set_mass(const char *file, int line, int /*narg*/, char **arg)
{ {
if (mass == nullptr) error->all(file,line, "Cannot set atom mass for atom style {}", atom_style); if (mass == nullptr)
error->all(file,line, "Cannot set per-type atom mass for atom style {}", atom_style);
std::string typestr = utils::trim(arg[0]); // clang-format on
if (!isdigit(typestr[0]) && typestr[0] != '*') { std::string typestr = utils::utf8_subst(utils::trim(arg[0]));
int itype = lmap->find(typestr,Atom::ATOM); switch (utils::is_type(typestr)) {
if (itype == -1) error->all(file,line,"Invalid type for mass set");
mass[itype] = utils::numeric(FLERR,arg[1],false,lmp);
mass_setflag[itype] = 1;
if (mass[itype] <= 0.0) error->all(file,line,"Invalid mass value"); case 0: { // numeric
} else { int lo, hi;
int lo,hi; utils::bounds(file, line, typestr.c_str(), 1, ntypes, lo, hi, error);
utils::bounds(file,line,arg[0],1,ntypes,lo,hi,error); if ((lo < 1) || (hi > ntypes))
if ((lo < 1) || (hi > ntypes)) error->all(file, line, "Invalid atom type {} for atom mass", typestr);
error->all(file,line,"Invalid type {} for atom mass {}", arg[1]);
const double value = utils::numeric(FLERR,arg[1],false,lmp); const double value = utils::numeric(FLERR, arg[1], false, lmp);
if (value <= 0.0) error->all(file,line,"Invalid atom mass value {}", value); if (value <= 0.0)
error->all(file, line, "Invalid atom mass value {} for type {}", value, typestr);
for (int itype = lo; itype <= hi; itype++) { for (int itype = lo; itype <= hi; itype++) {
mass[itype] = value; mass[itype] = value;
mass_setflag[itype] = 1; mass_setflag[itype] = 1;
}
break;
} }
case 1: { // type label
if (!atom->labelmapflag) error->all(FLERR, "Invalid atom type {} for setting mass", typestr);
int itype = lmap->find(typestr, Atom::ATOM);
if (itype == -1) error->all(file, line, "Invalid type {} for setting mass", typestr);
mass[itype] = utils::numeric(FLERR, arg[1], false, lmp);
mass_setflag[itype] = 1;
break;
}
default: // invalid
error->one(FLERR, "Invalid mass setting");
break;
} }
// clang-format off
} }
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------

View File

@ -1666,7 +1666,8 @@ void AtomVec::data_atom(double *coord, imageint imagetmp, const std::vector<std:
initialize other peratom quantities initialize other peratom quantities
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
void AtomVec::data_atom(double *coord, imageint imagetmp, const std::vector<std::string> &values, std::string &extract) void AtomVec::data_atom(double *coord, imageint imagetmp, const std::vector<std::string> &values,
std::string &extract)
{ {
int m, n, datatype, cols; int m, n, datatype, cols;
void *pdata; void *pdata;

View File

@ -141,7 +141,7 @@ void LabelMap::modify_lmap(int narg, char **arg)
int itype = utils::inumeric(FLERR, arg[iarg++], false, lmp); int itype = utils::inumeric(FLERR, arg[iarg++], false, lmp);
if ((itype < 1) || (itype > ntypes)) if ((itype < 1) || (itype > ntypes))
error->all(FLERR, "Labelmap {} type {} must be within 1-{}", tlabel, itype, ntypes); error->all(FLERR, "Labelmap {} type {} must be within 1-{}", tlabel, itype, ntypes);
std::string slabel = utils::trim(arg[iarg++]); std::string slabel = utils::utf8_subst(utils::trim(arg[iarg++]));
if (utils::is_type(slabel) != 1) if (utils::is_type(slabel) != 1)
error->all(FLERR, "Type label string {} for {} type {} is invalid", slabel, tlabel, itype); error->all(FLERR, "Type label string {} for {} type {} is invalid", slabel, tlabel, itype);
int found = search(slabel, (*labels_map)); int found = search(slabel, (*labels_map));

View File

@ -2159,6 +2159,7 @@ void ReadData::typelabels(int mode)
} }
if (nwords != 2) if (nwords != 2)
error->all(FLERR, "Invalid format in section: {} Type Labels: {}", labeltypes[mode], buf); error->all(FLERR, "Invalid format in section: {} Type Labels: {}", labeltypes[mode], buf);
values[1] = utils::utf8_subst(values[1]);
if (utils::is_type(values[1]) != 1) error->all(FLERR, "Invalid type label {}", values[1]); if (utils::is_type(values[1]) != 1) error->all(FLERR, "Invalid type label {}", values[1]);
int itype = utils::inumeric(FLERR, values[0], false, lmp); int itype = utils::inumeric(FLERR, values[0], false, lmp);
if ((itype < 1) || (itype > lntypes)) if ((itype < 1) || (itype > lntypes))

View File

@ -1162,8 +1162,10 @@ int utils::is_type(const std::string &str)
// TODO: the first two checks below are not really needed with this function. // TODO: the first two checks below are not really needed with this function.
// If a type label has at least one character that is not a digit or '*' // If a type label has at least one character that is not a digit or '*'
// it can be identified by this function as type label due to the check above. // it can be identified by this function as type label due to the check above.
// Whitespace and multi-byte characters are not allowed.
if (isdigit(str[0]) || (str[0] == '*') || (str[0] == '#')) return -1; if (isdigit(str[0]) || (str[0] == '*') || (str[0] == '#')) return -1;
if (str.find_first_of(" \t\r\n\f") != std::string::npos) return -1; if (str.find_first_of(" \t\r\n\f") != std::string::npos) return -1;
if (has_utf8(utf8_subst(str))) return -1;
return 1; return 1;
} }

View File

@ -64,18 +64,28 @@ TEST_F(SetTest, NoBoxAtoms)
command("create_box 4 box"); command("create_box 4 box");
command("labelmap atom 2 N1"); command("labelmap atom 2 N1");
command("labelmap atom 3 O1 4 H1"); command("labelmap atom 3 O1 4 H1");
command("mass * 1.0");
command("mass O1 3.0");
command("mass N1 2.0");
command("mass H1 4.0");
END_HIDE_OUTPUT(); END_HIDE_OUTPUT();
ASSERT_NE(atom->lmap, nullptr); ASSERT_NE(atom->lmap, nullptr);
ASSERT_FALSE(atom->lmap->is_complete(Atom::ATOM)); ASSERT_FALSE(atom->lmap->is_complete(Atom::ATOM));
ASSERT_DOUBLE_EQ(atom->mass[1], 1.0);
ASSERT_DOUBLE_EQ(atom->mass[2], 2.0);
ASSERT_DOUBLE_EQ(atom->mass[3], 3.0);
ASSERT_DOUBLE_EQ(atom->mass[4], 4.0);
BEGIN_HIDE_OUTPUT(); BEGIN_HIDE_OUTPUT();
command("labelmap atom 1 C1 2 N2 3 ' O#' 1 C1 4 H# 2 N3"); // second '#' starts comment command("labelmap atom 1 C1 2 N2 3 ' O#' 1 C1 4 H# 2 N3"); // second '#' starts comment
command("mass \"O#\" 10.0");
END_HIDE_OUTPUT(); END_HIDE_OUTPUT();
ASSERT_TRUE(atom->lmap->is_complete(Atom::ATOM)); ASSERT_TRUE(atom->lmap->is_complete(Atom::ATOM));
ASSERT_EQ(atom->lmap->find("C1", Atom::ATOM), 1); ASSERT_EQ(atom->lmap->find("C1", Atom::ATOM), 1);
ASSERT_EQ(atom->lmap->find("N2", Atom::ATOM), 2); ASSERT_EQ(atom->lmap->find("N2", Atom::ATOM), 2);
ASSERT_EQ(atom->lmap->find("O#", Atom::ATOM), 3); ASSERT_EQ(atom->lmap->find("O#", Atom::ATOM), 3);
ASSERT_EQ(atom->lmap->find("H", Atom::ATOM), 4); ASSERT_EQ(atom->lmap->find("H", Atom::ATOM), 4);
ASSERT_DOUBLE_EQ(atom->mass[3], 10.0);
TEST_FAILURE(".*ERROR: Labelmap atom type 0 must be within 1-4.*", TEST_FAILURE(".*ERROR: Labelmap atom type 0 must be within 1-4.*",
command("labelmap atom 0 C1");); command("labelmap atom 0 C1"););

View File

@ -141,8 +141,8 @@ TEST(Utils, count_words_with_extra_spaces)
TEST(Utils, join_words) TEST(Utils, join_words)
{ {
std::vector<std::string> words = {"one", "two", "three" }; std::vector<std::string> words = {"one", "two", "three"};
auto combined = utils::join_words(words, " "); auto combined = utils::join_words(words, " ");
ASSERT_THAT(combined, StrEq("one two three")); ASSERT_THAT(combined, StrEq("one two three"));
combined = utils::join_words(words, ""); combined = utils::join_words(words, "");
ASSERT_THAT(combined, StrEq("onetwothree")); ASSERT_THAT(combined, StrEq("onetwothree"));
@ -469,6 +469,8 @@ TEST(Utils, valid_label)
ASSERT_EQ(utils::is_type("@X2=&X1"), 1); ASSERT_EQ(utils::is_type("@X2=&X1"), 1);
ASSERT_EQ(utils::is_type("|Na|Cl|H2O|"), 1); ASSERT_EQ(utils::is_type("|Na|Cl|H2O|"), 1);
ASSERT_EQ(utils::is_type("CA(1)/CB(1)"), 1); ASSERT_EQ(utils::is_type("CA(1)/CB(1)"), 1);
ASSERT_EQ(utils::is_type("A-B"), 1); // ASCII
ASSERT_EQ(utils::is_type("A−B"), 1); // UTF-8
} }
TEST(Utils, invalid_label) TEST(Utils, invalid_label)