add UTF-8 substitution and detection

This commit is contained in:
Axel Kohlmeyer
2022-09-04 06:45:11 -04:00
parent 34104bbbe9
commit 26e1b7c44d
7 changed files with 57 additions and 26 deletions

View File

@ -1185,6 +1185,7 @@ void Atom::data_atoms(int n, char *buf, tagint id_offset, tagint mol_offset,
coord[1] >= sublo[1] && coord[1] < subhi[1] &&
coord[2] >= sublo[2] && coord[2] < subhi[2]) {
avec->data_atom(xdata,imagedata,values,typestr);
typestr = utils::utf8_subst(typestr);
if (id_offset) tag[nlocal-1] += id_offset;
if (mol_offset) molecule[nlocal-1] += mol_offset;
// clang-format on
@ -1291,7 +1292,7 @@ void Atom::data_bonds(int n, char *buf, int *count, tagint id_offset,
// Bonds line is: number(ignored), bond type, atomID 1, atomID 2
if (nwords > 0) {
if (nwords != 4) error->all(FLERR, "Incorrect format in {}: {}", location, utils::trim(buf));
typestr = values[1];
typestr = utils::utf8_subst(values[1]);
atom1 = utils::tnumeric(FLERR, values[2], false, lmp);
atom2 = utils::tnumeric(FLERR, values[3], false, lmp);
if (id_offset) {
@ -1388,7 +1389,7 @@ void Atom::data_angles(int n, char *buf, int *count, tagint id_offset,
// Angles line is: number(ignored), angle type, atomID 1, atomID 2, atomID 3
if (nwords > 0) {
if (nwords != 5) error->all(FLERR, "Incorrect format in {}: {}", location, utils::trim(buf));
typestr = values[1];
typestr = utils::utf8_subst(values[1]);
atom1 = utils::tnumeric(FLERR, values[2], false, lmp);
atom2 = utils::tnumeric(FLERR, values[3], false, lmp);
atom3 = utils::tnumeric(FLERR, values[4], false, lmp);
@ -1501,7 +1502,7 @@ void Atom::data_dihedrals(int n, char *buf, int *count, tagint id_offset,
// Dihedrals line is: number(ignored), dihedral type, atomID 1, atomID 2, atomID 3, atomID 4
if (nwords > 0) {
if (nwords != 6) error->all(FLERR, "Incorrect format in {}: {}", location, utils::trim(buf));
typestr = values[1];
typestr = utils::utf8_subst(values[1]);
atom1 = utils::tnumeric(FLERR, values[2], false, lmp);
atom2 = utils::tnumeric(FLERR, values[3], false, lmp);
atom3 = utils::tnumeric(FLERR, values[4], false, lmp);
@ -1633,7 +1634,7 @@ void Atom::data_impropers(int n, char *buf, int *count, tagint id_offset,
// Impropers line is: number(ignored), improper type, atomID 1, atomID 2, atomID 3, atomID 4
if (nwords > 0) {
if (nwords != 6) error->all(FLERR, "Incorrect format in {}: {}", location, utils::trim(buf));
typestr = values[1];
typestr = utils::utf8_subst(values[1]);
atom1 = utils::tnumeric(FLERR, values[2], false, lmp);
atom2 = utils::tnumeric(FLERR, values[3], false, lmp);
atom3 = utils::tnumeric(FLERR, values[4], false, lmp);
@ -1934,30 +1935,44 @@ void Atom::set_mass(const char *file, int line, int itype, double value)
// Set per-type masses from text arguments: arg[0] selects the atom type(s),
// arg[1] holds the mass value. `file` and `line` are forwarded to most of the
// error calls below; the third (unnamed) argument is unused.
// NOTE(review): this span looks like a diff hunk whose removed and added lines
// are shown interleaved without +/- markers (e.g. `typestr` is declared twice
// and several checks appear in two spellings) -- confirm against the real file
// before relying on statement order here.
void Atom::set_mass(const char *file, int line, int /*narg*/, char **arg)
{
// per-type masses cannot be set when the `mass` array is not allocated
if (mass == nullptr) error->all(file,line, "Cannot set atom mass for atom style {}", atom_style);
if (mass == nullptr)
error->all(file,line, "Cannot set per-type atom mass for atom style {}", atom_style);
// if/else variant: anything not starting with a digit or '*' is looked up as a type label
std::string typestr = utils::trim(arg[0]);
if (!isdigit(typestr[0]) && typestr[0] != '*') {
int itype = lmap->find(typestr,Atom::ATOM);
if (itype == -1) error->all(file,line,"Invalid type for mass set");
mass[itype] = utils::numeric(FLERR,arg[1],false,lmp);
mass_setflag[itype] = 1;
// clang-format on
// switch variant: the trimmed type argument goes through utils::utf8_subst()
// and is then classified by utils::is_type()
std::string typestr = utils::utf8_subst(utils::trim(arg[0]));
switch (utils::is_type(typestr)) {
if (mass[itype] <= 0.0) error->all(file,line,"Invalid mass value");
} else {
case 0: { // numeric
// lo/hi are filled in by utils::bounds() and re-checked against 1..ntypes below
int lo, hi;
utils::bounds(file,line,arg[0],1,ntypes,lo,hi,error);
utils::bounds(file, line, typestr.c_str(), 1, ntypes, lo, hi, error);
if ((lo < 1) || (hi > ntypes))
// NOTE(review): the next format string has two {} placeholders but only one argument
error->all(file,line,"Invalid type {} for atom mass {}", arg[1]);
error->all(file, line, "Invalid atom type {} for atom mass", typestr);
// the mass value is parsed once, must be strictly positive, and is then
// assigned to every type in lo..hi
const double value = utils::numeric(FLERR, arg[1], false, lmp);
if (value <= 0.0) error->all(file,line,"Invalid atom mass value {}", value);
if (value <= 0.0)
error->all(file, line, "Invalid atom mass value {} for type {}", value, typestr);
for (int itype = lo; itype <= hi; itype++) {
mass[itype] = value;
mass_setflag[itype] = 1;
}
break;
}
case 1: { // type label
// labels are only accepted when labelmapflag is set; lmap->find() returning -1
// is treated as an unknown label
// NOTE(review): this check reports FLERR while the lookup error below reports
// the caller's file/line -- confirm which is intended
if (!atom->labelmapflag) error->all(FLERR, "Invalid atom type {} for setting mass", typestr);
int itype = lmap->find(typestr, Atom::ATOM);
if (itype == -1) error->all(file, line, "Invalid type {} for setting mass", typestr);
// NOTE(review): unlike the numeric case, the parsed value is stored without a
// `<= 0.0` check in this branch -- confirm whether that is intended
mass[itype] = utils::numeric(FLERR, arg[1], false, lmp);
mass_setflag[itype] = 1;
break;
}
default: // invalid
// NOTE(review): this is the only error->one() call in the function and it
// passes FLERR instead of file/line -- confirm whether error->all(file, line, ...)
// was meant
error->one(FLERR, "Invalid mass setting");
break;
}
// clang-format off
}
/* ----------------------------------------------------------------------

View File

@ -1666,7 +1666,8 @@ void AtomVec::data_atom(double *coord, imageint imagetmp, const std::vector<std:
initialize other peratom quantities
------------------------------------------------------------------------- */
void AtomVec::data_atom(double *coord, imageint imagetmp, const std::vector<std::string> &values, std::string &extract)
void AtomVec::data_atom(double *coord, imageint imagetmp, const std::vector<std::string> &values,
std::string &extract)
{
int m, n, datatype, cols;
void *pdata;

View File

@ -141,7 +141,7 @@ void LabelMap::modify_lmap(int narg, char **arg)
int itype = utils::inumeric(FLERR, arg[iarg++], false, lmp);
if ((itype < 1) || (itype > ntypes))
error->all(FLERR, "Labelmap {} type {} must be within 1-{}", tlabel, itype, ntypes);
std::string slabel = utils::trim(arg[iarg++]);
std::string slabel = utils::utf8_subst(utils::trim(arg[iarg++]));
if (utils::is_type(slabel) != 1)
error->all(FLERR, "Type label string {} for {} type {} is invalid", slabel, tlabel, itype);
int found = search(slabel, (*labels_map));

View File

@ -2159,6 +2159,7 @@ void ReadData::typelabels(int mode)
}
if (nwords != 2)
error->all(FLERR, "Invalid format in section: {} Type Labels: {}", labeltypes[mode], buf);
values[1] = utils::utf8_subst(values[1]);
if (utils::is_type(values[1]) != 1) error->all(FLERR, "Invalid type label {}", values[1]);
int itype = utils::inumeric(FLERR, values[0], false, lmp);
if ((itype < 1) || (itype > lntypes))

View File

@ -1162,8 +1162,10 @@ int utils::is_type(const std::string &str)
// TODO: the first two checks below are not really needed with this function.
// If a type label has at least one character that is not a digit or '*'
// it can be identified by this function as type label due to the check above.
// Whitespace and multi-byte characters are not allowed.
if (isdigit(str[0]) || (str[0] == '*') || (str[0] == '#')) return -1;
if (str.find_first_of(" \t\r\n\f") != std::string::npos) return -1;
if (has_utf8(utf8_subst(str))) return -1;
return 1;
}

View File

@ -64,18 +64,28 @@ TEST_F(SetTest, NoBoxAtoms)
command("create_box 4 box");
command("labelmap atom 2 N1");
command("labelmap atom 3 O1 4 H1");
command("mass * 1.0");
command("mass O1 3.0");
command("mass N1 2.0");
command("mass H1 4.0");
END_HIDE_OUTPUT();
ASSERT_NE(atom->lmap, nullptr);
ASSERT_FALSE(atom->lmap->is_complete(Atom::ATOM));
ASSERT_DOUBLE_EQ(atom->mass[1], 1.0);
ASSERT_DOUBLE_EQ(atom->mass[2], 2.0);
ASSERT_DOUBLE_EQ(atom->mass[3], 3.0);
ASSERT_DOUBLE_EQ(atom->mass[4], 4.0);
BEGIN_HIDE_OUTPUT();
command("labelmap atom 1 C1 2 N2 3 ' O#' 1 C1 4 H# 2 N3"); // second '#' starts comment
command("mass \"O#\" 10.0");
END_HIDE_OUTPUT();
ASSERT_TRUE(atom->lmap->is_complete(Atom::ATOM));
ASSERT_EQ(atom->lmap->find("C1", Atom::ATOM), 1);
ASSERT_EQ(atom->lmap->find("N2", Atom::ATOM), 2);
ASSERT_EQ(atom->lmap->find("O#", Atom::ATOM), 3);
ASSERT_EQ(atom->lmap->find("H", Atom::ATOM), 4);
ASSERT_DOUBLE_EQ(atom->mass[3], 10.0);
TEST_FAILURE(".*ERROR: Labelmap atom type 0 must be within 1-4.*",
command("labelmap atom 0 C1"););

View File

@ -469,6 +469,8 @@ TEST(Utils, valid_label)
ASSERT_EQ(utils::is_type("@X2=&X1"), 1);
ASSERT_EQ(utils::is_type("|Na|Cl|H2O|"), 1);
ASSERT_EQ(utils::is_type("CA(1)/CB(1)"), 1);
ASSERT_EQ(utils::is_type("A-B"), 1); // ASCII
ASSERT_EQ(utils::is_type("A−B"), 1); // UTF-8
}
TEST(Utils, invalid_label)