Skip to content

Commit

Permalink
Update _filter_xyz to ignore lines after cartesian coordinates in standard xyz files
Browse files Browse the repository at this point in the history
  • Loading branch information
coltonbh committed Apr 13, 2022
1 parent d04a380 commit 9c22bdc
Show file tree
Hide file tree
Showing 2 changed files with 223 additions and 1 deletion.
10 changes: 9 additions & 1 deletion qcelemental/molparse/from_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -671,7 +671,7 @@ def process_variable(matchobj):
xyz1strict = re.compile(r"\A" + r"(?P<nat>\d+)" + r"\Z")
SIMPLENUCLEUS = r"""((?P<E>[A-Z]{1,3})|(?P<Z>\d{1,3}))"""
atom_cartesian_strict = re.compile(
r"\A" + r"(?P<nucleus>" + SIMPLENUCLEUS + r")" + SEP + CARTXYZ + r"\Z", re.IGNORECASE | re.VERBOSE
r"\A" + r"(?P<nucleus>" + SIMPLENUCLEUS + r")" + SEP + CARTXYZ + ".*" + r"\Z", re.IGNORECASE | re.VERBOSE
)

xyz1 = re.compile(r"\A" + r"(?P<nat>\d+)" + r"[\s,]*" + r"((?P<ubohr>(bohr|au))|(?P<uang>ang))?" + r"\Z", re.IGNORECASE)
Expand Down Expand Up @@ -737,9 +737,17 @@ def process_atom_cartesian(matchobj):
for iln, line in enumerate(string.split("\n")):
line = line.strip()
if iln == 0:
try:
num_atoms = int(line)
except ValueError:
# Not a standard xyz format; continue with regular process
num_atoms = None
line = re.sub(xyz1strict, "", line)
elif iln == 1:
continue
elif num_atoms and iln > num_atoms + 1:
# If standard xyz ignore everything after cartesian coords
break
else:
line = re.sub(atom_cartesian_strict, process_atom_cartesian, line)
if line:
Expand Down
214 changes: 214 additions & 0 deletions qcelemental/tests/test_molparse_from_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import qcelemental
from qcelemental.models import Molecule
from qcelemental.molparse.from_string import _filter_xyz
from qcelemental.testing import compare, compare_molrecs, compare_recursive, compare_values, tnm

_arrays_prov_stamp = {"creator": "QCElemental", "version": "1.0", "routine": "qcelemental.molparse.from_arrays"}
Expand Down Expand Up @@ -921,6 +922,219 @@ def test_xyzp_qm_7e():
assert compare_molrecs(fullans, final["qm"], tnm() + ": full qm")


@pytest.mark.parametrize(
"string,elbl,geom",
(
(
"""5
gdb 1 157.7118 157.70997 157.70699 0. 13.21 -0.3877 0.1171 0.5048 35.3641 0.044749 -40.47893 -40.476062 -40.475117 -40.498597 6.469
C -0.0126981359 1.0858041578 0.0080009958 -0.535689
H 0.002150416 -0.0060313176 0.0019761204 0.133921
H 1.0117308433 1.4637511618 0.0002765748 0.133922
H -0.540815069 1.4475266138 -0.8766437152 0.133923
H -0.5238136345 1.4379326443 0.9063972942 0.133923
1341.307 1341.3284 1341.365 1562.6731 1562.7453 3038.3205 3151.6034 3151.6788 3151.7078
C C
InChI=1S/CH4/h1H4 InChI=1S/CH4/h1H4""",
["C", "H", "H", "H", "H"],
[
-0.0126981359,
1.0858041578,
0.0080009958,
0.002150416,
-0.0060313176,
0.0019761204,
1.0117308433,
1.4637511618,
0.0002765748,
-0.540815069,
1.4475266138,
-0.8766437152,
-0.5238136345,
1.4379326443,
0.9063972942,
],
),
(
"""20
gdb 52625 3.48434 0.81389 0.77349 4.0931 85.49 -0.2471 0.0275 0.2746 1578.0163 0.16756 -403.158572 -403.147447 -403.146503 -403.196607 38.82
C 0.0219866132 1.4617007325 0.0778162941 -0.5003
C 0.0172170008 0.0062570163 0.0278221402 0.060798
C 0.0082754818 -1.1968179556 -0.0219490036 0.058561
C -0.0048543985 -2.6632285567 -0.0864436684 0.086487
C -1.4625346892 -3.1775528493 -0.0739515886 -0.401401
C 0.724612823 -3.1404242978 -1.3630324305 -0.401392
N 0.7132087245 -3.1481618818 1.1179206119 -0.295483
C 0.9247284349 -4.4479813471 1.4397739679 0.162556
O 0.5626675182 -5.4147891002 0.7982523365 -0.336127
H 1.0431199948 1.8581192318 0.0749725881 0.150933
H -0.4997253695 1.8847111976 -0.7874860262 0.154682
H -0.4780438734 1.8320081388 0.979321682 0.150927
H -1.9940013133 -2.8054928472 -0.9536444371 0.11928
H -1.4636215253 -4.2685570577 -0.0833553058 0.154594
H -1.981857154 -2.824763824 0.8204749775 0.116725
H 1.749444309 -2.7614376976 -1.3787041159 0.11672
H 0.7443784569 -4.2310758294 -1.3847506905 0.154572
H 0.2030559276 -2.7681902401 -2.2485640369 0.11929
H 1.0701803562 -2.4436691628 1.7431541049 0.251085
H 1.4874481325 -4.5389119798 2.3916335805 0.077494
9.6558 74.9555 86.4308 130.4187 209.6956 213.4664 245.0078 277.3227 292.8775 293.3837 344.8375 364.9338 411.1024 508.5101 542.9334 570.3714 618.7374 820.0082 829.2715 936.286 986.4059 1028.4996 1033.1508 1051.8084 1054.6924 1076.9172 1172.1103 1179.8358 1222.2743 1289.1699 1387.6509 1408.9379 1414.4085 1431.5059 1476.1153 1477.0789 1478.6118 1481.2338 1494.3428 1511.202 1519.6039 1792.8486 2370.6229 2920.0767 3029.2922 3050.3711 3054.2985 3090.8073 3092.1476 3124.8526 3128.7105 3148.8786 3152.1387 3639.6323
CC#CC(C)(C)NC=O CC#CC(C)(C)NC=O
InChI=1S/C7H11NO/c1-4-5-7(2,3)8-6-9/h6H,1-3H3,(H,8,9) InChI=1S/C7H11NO/c1-4-5-7(2,3)8-6-9/h6H,1-3H3,(H,8,9)
""",
["C", "C", "C", "C", "C", "C", "N", "C", "O", "H", "H", "H", "H", "H", "H", "H", "H", "H", "H", "H"],
[
0.0219866132,
1.4617007325,
0.0778162941,
0.0172170008,
0.0062570163,
0.0278221402,
0.0082754818,
-1.1968179556,
-0.0219490036,
-0.0048543985,
-2.6632285567,
-0.0864436684,
-1.4625346892,
-3.1775528493,
-0.0739515886,
0.724612823,
-3.1404242978,
-1.3630324305,
0.7132087245,
-3.1481618818,
1.1179206119,
0.9247284349,
-4.4479813471,
1.4397739679,
0.5626675182,
-5.4147891002,
0.7982523365,
1.0431199948,
1.8581192318,
0.0749725881,
-0.4997253695,
1.8847111976,
-0.7874860262,
-0.4780438734,
1.8320081388,
0.979321682,
-1.9940013133,
-2.8054928472,
-0.9536444371,
-1.4636215253,
-4.2685570577,
-0.0833553058,
-1.981857154,
-2.824763824,
0.8204749775,
1.749444309,
-2.7614376976,
-1.3787041159,
0.7443784569,
-4.2310758294,
-1.3847506905,
0.2030559276,
-2.7681902401,
-2.2485640369,
1.0701803562,
-2.4436691628,
1.7431541049,
1.4874481325,
-4.5389119798,
2.3916335805,
],
),
(
"""17
gdb 107395 3.22974 1.27619 1.028 3.0629 69.23 -0.225 0.0454 0.2704 1155.6964 0.136978 -455.1439 -455.135732 -455.134788 -455.176691 31.549
N 0.0166534686 1.2958609713 -0.1502735485 -0.580455
C -0.0682337319 -0.0600800983 -0.0096861238 0.457461
N -1.1305816717 -0.7597824714 0.1390863864 -0.408126
C -0.7009986293 -2.1439574926 0.1108213578 0.311595
C -1.4984924036 -3.0589993793 1.002599169 -0.138474
O -2.8585678608 -3.0949107355 0.6105794873 -0.417767
C -0.0617590849 -2.6053899658 -1.1855277252 -0.354308
C 0.8014538633 -2.0927793706 -0.0638522254 0.002142
O 1.1296621 -0.7079465651 -0.1012706145 -0.24761
H 0.8924648972 1.705797935 0.136582908 0.26608
H -0.8059467993 1.7929265086 0.153937696 0.270518
H -1.1201801933 -4.0847553601 0.9397586373 0.099132
H -1.3951151651 -2.7241068393 2.0475874391 0.085667
H -3.1252104475 -2.1733539114 0.5051511429 0.295168
H -0.0217120799 -3.6759992339 -1.3586546368 0.121153
H -0.1831965908 -1.9943417386 -2.0738763382 0.124725
H 1.5549266689 -2.687136563 0.4367361986 0.113098
70.6081 153.9333 180.5159 278.5491 345.0174 381.8663 413.9812 430.6681 448.7324 482.9536 553.6703 659.1349 739.6911 741.9888 783.4372 872.8339 876.0968 943.3731 959.9083 980.6717 997.5511 1046.3298 1069.8255 1090.0307 1128.0369 1148.246 1187.611 1267.6525 1296.0396 1370.3317 1379.5593 1424.1337 1443.8219 1468.4333 1503.1705 1614.9407 1733.6758 2971.4118 3070.4849 3127.1322 3195.3399 3220.8339 3586.8344 3701.0432 3786.2797
NC1=NC2(CO)CC2O1 NC1=N[C@@]2(CO)C[C@H]2O1
InChI=1S/C5H8N2O2/c6-4-7-5(2-8)1-3(5)9-4/h3,8H,1-2H2,(H2,6,7) InChI=1S/C5H8N2O2/c6-4-7-5(2-8)1-3(5)9-4/h3,8H,1-2H2,(H2,6,7)/t3-,5-/m1/s1
""",
["N", "C", "N", "C", "C", "O", "C", "C", "O", "H", "H", "H", "H", "H", "H", "H", "H"],
[
0.0166534686,
1.2958609713,
-0.1502735485,
-0.0682337319,
-0.0600800983,
-0.0096861238,
-1.1305816717,
-0.7597824714,
0.1390863864,
-0.7009986293,
-2.1439574926,
0.1108213578,
-1.4984924036,
-3.0589993793,
1.002599169,
-2.8585678608,
-3.0949107355,
0.6105794873,
-0.0617590849,
-2.6053899658,
-1.1855277252,
0.8014538633,
-2.0927793706,
-0.0638522254,
1.1296621,
-0.7079465651,
-0.1012706145,
0.8924648972,
1.705797935,
0.136582908,
-0.8059467993,
1.7929265086,
0.153937696,
-1.1201801933,
-4.0847553601,
0.9397586373,
-1.3951151651,
-2.7241068393,
2.0475874391,
-3.1252104475,
-2.1733539114,
0.5051511429,
-0.0217120799,
-3.6759992339,
-1.3586546368,
-0.1831965908,
-1.9943417386,
-2.0738763382,
1.5549266689,
-2.687136563,
0.4367361986,
],
),
),
)
def test_xyz_gdb_format(string, elbl, geom):
    """Check strict ``_filter_xyz`` parsing of the parametrized xyz strings.

    Each case supplies an xyz-style string whose atom lines carry an extra
    trailing column and which is followed by additional non-coordinate lines.
    The filter must leave nothing unprocessed and must return exactly the
    expected element labels (``elbl``) and flat coordinate list (``geom``).
    """
    leftover, parsed = _filter_xyz(string, strict=True)

    # Nothing may remain unconsumed once the coordinate block has been read.
    assert not leftover
    assert parsed["elbl"] == elbl
    assert parsed["geom"] == geom


subject8 = """\
3
stuffs
Expand Down

0 comments on commit 9c22bdc

Please sign in to comment.