Source code for qcelemental.molparse.from_string

import pprint
import re
from typing import Dict, Tuple, Union

from ..exceptions import ChoicesError, MoleculeFormatError, ValidationError
from ..util import filter_comments, provenance_stamp
from . import pubchem
from .from_arrays import from_input_arrays
from .regex import CARTXYZ, CHGMULT, ENDL, NUCLEUS, NUMBER, SEP

__all__ = ["from_string"]


def from_string(
    molstr: str,
    dtype: str = None,
    *,
    name: str = None,
    fix_com: bool = None,
    fix_orientation: bool = None,
    fix_symmetry: str = None,
    return_processed: bool = False,
    enable_qm: bool = True,
    enable_efp: bool = True,
    missing_enabled_return_qm: str = "none",
    missing_enabled_return_efp: str = "none",
    verbose=1,
) -> Union[Dict, Tuple[Dict, Dict]]:
    r"""Construct a molecule dictionary from any recognized string format.

    Parameters
    ----------
    molstr
        Multiline string specification of molecule in a recognized format.
    dtype
        {'xyz', 'xyz+', 'psi4', 'psi4+'}
        Molecule format name; see below for details. When ``None``, the
        formats are tried in the order psi4, xyz, xyz+, psi4+ and the first
        one that consumes the whole string is used.
    return_processed
        Additionally return intermediate dictionary.
    enable_qm
        Consider quantum mechanical domain in processing the string contents
        into the returned molrec.
    enable_efp
        Consider effective fragment potential domain in processing the string
        contents into the returned molrec. Only relevant if `dtype` supports
        EFP.
    missing_enabled_return_qm
        {'minimal', 'none', 'error'}
        If `enable_qm=True`, what to do if it has no atoms/fragments?
        Respectively, return a fully valid but empty molrec, return empty
        dictionary, or throw error.
    missing_enabled_return_efp
        {'minimal', 'none', 'error'}
        If `enable_efp=True`, what to do if it has no atoms/fragments?
        Respectively, return a fully valid but empty molrec, return empty
        dictionary, or throw error.
    name
        Override `molstr` information for label for molecule; should be valid
        Python identifier. One of a very limited number of fields (three others
        follow) for trumping `molstr`. Provided for convenience, since the
        alternative would be to collect the resulting molrec (discarding the
        Mol if called from class), editing it, then remaking the Mol.
    fix_com
        Override `molstr` information for whether translation of `geom` is
        allowed or disallowed.
    fix_orientation
        Override `molstr` information for whether rotation of `geom` is
        allowed or disallowed.
    fix_symmetry
        Override `molstr` information for maximal point group symmetry which
        geometry should be treated.
    verbose
        At ``>= 2``, print the input string, the intermediate dictionary, and
        the final molrec.

    Returns
    -------
    molrec : dict
        Molecule dictionary spec. See :py:func:`from_arrays`.
    molinit : dict, optional
        Intermediate "molrec"-like dictionary containing `molstr` info after
        parsing by this function but before the validation and defaulting of
        `from_arrays` that returns the proper `molrec`.
        Only provided if `return_processed` is True.

    Raises
    ------
    qcelemental.MoleculeFormatError
        After processing of `molstr`, only an empty string should remain.
        Anything left is a syntax error. With ``dtype=None``, the error with
        the shortest message among the attempted formats is raised.
    KeyError
        If `dtype` is neither ``None`` nor one of the recognized format names.

    Notes
    -----
    Several formats are interpretable:

    .. code-block:: none

        xyz - Strict XYZ format
        -----------------------

            String Layout
            -------------
            <number of atoms>
            comment line
            <element_symbol or atomic_number> <x> <y> <z>
            ...
            <element_symbol or atomic_number> <x> <y> <z>

            QM Domain
            ---------
            Specifiable: geom, elem/elez (element identity)
            Inaccessible: mass, real (vs. ghost), elbl (user label), name,
                          units (assumed [A]), input_units_to_au,
                          fix_com/orientation/symmetry, fragmentation,
                          molecular_charge, molecular_multiplicity

            Notes
            -----
            <number of atoms> is pattern-matched but ignored.

        xyz+ - Enhanced XYZ format
        --------------------------

            String Layout
            -------------
            <number of atoms> [<bohr|au|ang>]
            [<molecular_charge> <molecular_multiplicity>] comment line
            <psi4_nucleus_spec> <x> <y> <z>
            ...
            <psi4_nucleus_spec> <x> <y> <z>

            QM Domain
            ---------
            Specifiable: geom, elem/elez (element identity), mass, real (vs.
                         ghost), elbl (user label), units (defaults [A]),
                         molecular_charge, molecular_multiplicity
            Inaccessible: name, input_units_to_au,
                          fix_com/orientation/symmetry, fragmentation

            Notes
            -----
            <number of atoms> is pattern-matched but ignored.

        psi4 - Psi4 molecule {...} format
        ---------------------------------

            QM Domain
            ---------
            Specifiable: geom, elem/elez (element identity), mass, real (vs.
                         ghost), elbl (user label), units (defaults [A]),
                         fix_com/orientation/symmetry, fragment_separators,
                         fragment_charges, fragment_multiplicities,
                         molecular_charge, molecular_multiplicity
            Inaccessible: name, input_units_to_au

            PubChem
            -------
            pubchem : <cid|name|formula> [*]
            A string like the above searches the PubChem database and
            substitutes the below. Adding the wildcard searches for multiple
            matches and raises ChoicesError with matches for further
            consideration attached.
            Specifiable: geom, elem/elez (element identity), units (fixed [A]),
                         molecular_charge, molecular_multiplicity (fixed
                         singlet), name

            EFP Domain
            ----------
            Specifiable: units, fix_com/orientation/symmetry, fragment_files,
                         hint_types, geom_hints
            Inaccessible: anything atomic or fragment details -- geom,
                          elem/elez (element identity), mass, real (vs. ghost),
                          elbl (user label), fragment_separators,
                          fragment_charges, fragment_multiplicities,
                          molecular_charge, molecular_multiplicity

        psi4+ - Psi4 non-Cartesian molecule {...} format
        ------------------------------------------------
            Like `dtype=psi4` (although combination with EFP not tested)
            except that instead of pure-Cartesian geometry, allow variables,
            zmatrix, and un-fully-specified geometries. *Not* MolSSI standard,
            but we're not dropping zmatrix yet. Note that in Psi4 internal
            coordinates defined through a zmatrix have no bearing on geometry
            optimization internals or constraints.

    """
    if verbose >= 2:
        print("<<< FROM_STRING\n", molstr, "\n>>>")

    # << 1 >>  str-->str -- discard comments
    molstr = filter_comments(molstr.strip())

    def parse_as_xyz_ish(molstr, strict):
        molinit = {}

        # << 2 >>  str-->dict -- process atoms, units[, chg, mult]
        molstr, processed = _filter_xyz(molstr, strict=strict)
        molinit.update(processed)

        # `dtype` is read from the enclosing scope at raise time, so the message
        # reflects whatever label the caller has currently set.
        if molstr:
            raise MoleculeFormatError(f"""Unprocessable Molecule remnants under {dtype}:\n{molstr}""")

        return molstr, molinit

    def parse_as_psi4_ish(molstr, unsettled):
        molinit = {}

        # Notes
        # * *_filter functions must fill non-overlapping fields
        # * not recc but can add to downstream by appending to str

        # << 2.1 >>  str-->str -- process pubchem into str for downfunction
        molstr, processed = _filter_pubchem(molstr)
        molinit.update(processed)

        # << 2.2 >>  str-->dict -- process units, com, orient, symm
        molstr, processed = _filter_universals(molstr)
        molinit.update(processed)

        # << 2.3 >>  str-->dict -- process efp frags
        molstr, processed = _filter_libefp(molstr)
        molinit.update(processed)

        # << 2.4 >>  str-->dict -- process atoms, chg, mult, frags
        molstr, processed = _filter_mints(molstr, unsettled=unsettled)
        molinit.update(processed)

        if molstr:
            raise MoleculeFormatError(f"""Unprocessable Molecule remnants under {dtype}:\n{molstr}""")

        return molstr, molinit

    if dtype == "xyz":
        molstr, molinit = parse_as_xyz_ish(molstr, strict=True)

    elif dtype == "xyz+":
        molstr, molinit = parse_as_xyz_ish(molstr, strict=False)

    elif dtype == "psi4":
        molstr, molinit = parse_as_psi4_ish(molstr, unsettled=False)

    elif dtype == "psi4+":
        molstr, molinit = parse_as_psi4_ish(molstr, unsettled=True)

    elif dtype is None:
        dtype = "[psi4, xyz, xyz+, psi4+]"  # for error message

        # Formats are attempted in this order; the first that leaves no
        # unparsed remnants wins.
        attempts = (
            ("psi4", parse_as_psi4_ish, {"unsettled": False}),
            ("xyz", parse_as_xyz_ish, {"strict": True}),
            ("xyz+", parse_as_xyz_ish, {"strict": False}),
            ("psi4+", parse_as_psi4_ish, {"unsettled": True}),
        )

        min_error = None
        for label, parser, parser_kwargs in attempts:
            try:
                molstr, molinit = parser(molstr, **parser_kwargs)
            except MoleculeFormatError as err:
                # Keep the error with the shortest message (fewest remnants);
                # strict `<` means the earliest attempt wins ties.
                if min_error is None or len(str(err)) < len(str(min_error)):
                    min_error = err
            else:
                dtype = label
                break
        else:
            # every format failed
            raise min_error

    else:
        raise KeyError(f"Molecule: dtype of `{dtype}` not recognized.")

    # << 3 >>  args-->dict -- process name, com, orient, symm from arguments
    processed = _filter_kwargs(name, fix_com, fix_orientation, fix_symmetry)
    molinit.update(processed)

    if verbose >= 2:
        print("\nFROM_STRING (", dtype, ") --> FROM_INPUT_ARRAYS <<<")
        pprint.pprint(molinit)
        print(">>>\n")

    # << 4 >>  dict-->molspec
    molrec = from_input_arrays(
        speclabel=True,
        enable_qm=enable_qm,
        enable_efp=enable_efp,
        missing_enabled_return_qm=missing_enabled_return_qm,
        missing_enabled_return_efp=missing_enabled_return_efp,
        **molinit,
    )

    # replace from_arrays stamp with from_string stamp
    if "qm" in molrec and molrec["qm"]:
        molrec["qm"]["provenance"] = provenance_stamp(__name__)
    if "efp" in molrec and molrec["efp"]:
        molrec["efp"]["provenance"] = provenance_stamp(__name__)

    if verbose >= 2:
        print("\nFROM_STRING MOLREC <<<", molrec, ">>>\n")

    if return_processed:
        return molrec, molinit
    else:
        return molrec
# TODO maybe molrec needs a "fix_loose" flag to signal the receiver can symmetrize

# pubchemerror = re.compile(r'^\s*PubchemError\s*$', re.IGNORECASE)
# pubcheminput = re.compile(r'^\s*PubchemInput\s*$', re.IGNORECASE)
# # N.B. Anything starting with PubchemError will be handled correctly by the molecule parser
# # in libmints, which will just print the rest of the string and exit gracefully.

pubchemre = re.compile(r"\Apubchem" + r"\s*:\s*" + r"(?P<pubsearch>(([\S ]+)))\Z", re.IGNORECASE)


def _filter_pubchem(string):
    """Find any "pubchem:" lines in `string`, make call to the pubchem database
    and return the XYZ results back to `string`.

    Returns
    -------
    str, dict
        The input with each ``pubchem:`` line replaced by the fetched molecule
        string (blank lines dropped), and a dictionary that gains ``name`` and
        ``molecular_charge`` when exactly one result was found.

    Raises
    ------
    ValidationError
        If the lookup fails or the returned molecule string reports
        "Input Error".
    ChoicesError
        If the lookup does not return exactly one result; carries a
        ``{cid: iupac}`` mapping of the candidates.

    Author: @andysim

    """

    def process_pubchem(matchobj):
        pubsearch = matchobj.group("pubsearch")

        # search pubchem for the provided string
        try:
            results = pubchem.get_pubchem_results(pubsearch)
        except Exception as e:
            # Not every exception carries `.message`; fall back to str(e) so the
            # real failure is reported instead of an AttributeError.
            raise ValidationError(getattr(e, "message", str(e))) from e

        if pubsearch.endswith("*"):
            pubsearch = pubsearch[:-1]
        if len(results) == 1:
            # There's only 1 result - use it
            xyz = results[0].get_molecule_string()
            processed["name"] = "IUPAC {}".format(results[0].name())
            processed["molecular_charge"] = float(results[0].molecular_charge)
            if "Input Error" in xyz:
                raise ValidationError(xyz)
        else:
            # There are multiple results -- print and exit
            # * formerly, this checked for (then used) any exact match, but now (LAB; Sep 2018), disabling that
            #   since user explicitly added '*' char & "best match" (not available formerly) returned w/o '*'
            msg = "\tPubchemError\n"
            msg += "\tMultiple pubchem results were found. Replace\n\n\t\tpubchem:%s\n\n" % (pubsearch)
            msg += "\twith the Chemical ID number or exact name from one of the following and re-run.\n\n"
            msg += "\t Chemical ID IUPAC Name\n\n"
            ematches = {}
            for result in results:
                msg += "%s" % (result)
                ematches[result.cid] = result.iupac
            raise ChoicesError(msg, ematches)

        # remove PubchemInput first line and assert [A]
        xyz = xyz.replace("PubchemInput", "units ang")
        return xyz

    reconstitute = []
    processed = {}

    for line in string.split("\n"):
        line = re.sub(pubchemre, process_pubchem, line.strip())
        if line:
            reconstitute.append(line)

    return "\n".join(reconstitute), processed


def _filter_kwargs(name, fix_com, fix_orientation, fix_symmetry):
    """Collect the override arguments that are not ``None`` into a dictionary
    keyed by argument name.

    """
    processed = {}

    if name is not None:
        processed["name"] = name
    if fix_com is not None:
        processed["fix_com"] = fix_com
    if fix_orientation is not None:
        processed["fix_orientation"] = fix_orientation
    if fix_symmetry is not None:
        processed["fix_symmetry"] = fix_symmetry

    return processed


com = re.compile(r"\A(no_com|nocom)\Z", re.IGNORECASE)
orient = re.compile(r"\A(no_reorient|noreorient)\Z", re.IGNORECASE)
# dots in "a.u." are escaped so only the literal abbreviation is accepted
bohrang = re.compile(r"\Aunits?[\s=]+((?P<ubohr>(bohr|au|a\.u\.))|(?P<uang>(ang|angstrom)))\Z", re.IGNORECASE)
symmetry = re.compile(r"\Asymmetry[\s=]+(?P<pg>\w+)\Z", re.IGNORECASE)


def _filter_universals(string):
    """Process multiline `string` for fix_ and unit markers, returning a string
    of unprocessed `string` and a dictionary of processed fields.

    fix_com
    fix_orientation
    fix_symmetry
    #input_units_to_au (not settable)
    units

    Only the first occurrence of each marker is consumed; later repeats are
    left in the returned string.

    """

    def process_com(matchobj):
        processed["fix_com"] = True
        return ""

    def process_orient(matchobj):
        processed["fix_orientation"] = True
        return ""

    def process_bohrang(matchobj):
        if matchobj.group("uang"):
            processed["units"] = "Angstrom"
        elif matchobj.group("ubohr"):
            processed["units"] = "Bohr"
        return ""

    def process_symmetry(matchobj):
        processed["fix_symmetry"] = matchobj.group("pg").lower()
        return ""

    reconstitute = []
    processed = {}
    # `re.subn` substitution counts double as "already seen" flags
    com_found = False
    orient_found = False
    bohrang_found = False
    symmetry_found = False

    for line in string.split("\n"):
        line = line.strip()
        if not com_found:
            line, com_found = re.subn(com, process_com, line)
        if not orient_found:
            line, orient_found = re.subn(orient, process_orient, line)
        if not bohrang_found:
            line, bohrang_found = re.subn(bohrang, process_bohrang, line)
        if not symmetry_found:
            line, symmetry_found = re.subn(symmetry, process_symmetry, line)
        if line:
            reconstitute.append(line)

    return "\n".join(reconstitute), processed


# fmt: off
fragment_marker = re.compile(r'^\s*--\s*$', re.MULTILINE)
efpxyzabc = re.compile(
    r'\A' + r'efp' + SEP + r'(?P<efpfile>(\w+))' + SEP +
    r'(?P<x>' + NUMBER + r')' + SEP +
    r'(?P<y>' + NUMBER + r')' + SEP +
    r'(?P<z>' + NUMBER + r')' + SEP +
    r'(?P<a>' + NUMBER + r')' + SEP +
    r'(?P<b>' + NUMBER + r')' + SEP +
    r'(?P<c>' + NUMBER + r')' + ENDL + r'\Z',
    re.IGNORECASE | re.VERBOSE)
efppoints = re.compile(
    r'\A' + r'efp' + SEP + r'(?P<efpfile>(\w+))' + ENDL +
    r'[\s,]*' + r'(?P<x1>' + NUMBER + r')' + SEP +
                r'(?P<y1>' + NUMBER + r')' + SEP +
                r'(?P<z1>' + NUMBER + r')' + ENDL +
    r'[\s,]*' + r'(?P<x2>' + NUMBER + r')' + SEP +
                r'(?P<y2>' + NUMBER + r')' + SEP +
                r'(?P<z2>' + NUMBER + r')' + ENDL +
    r'[\s,]*' + r'(?P<x3>' + NUMBER + r')' + SEP +
                r'(?P<y3>' + NUMBER + r')' + SEP +
                r'(?P<z3>' + NUMBER + r')' + ENDL + r'\Z',
    re.IGNORECASE | re.MULTILINE | re.VERBOSE)
# fmt: on


def _filter_libefp(string):
    """Extract EFP fragment specifications from the `--`-separated blocks of
    `string`.

    Returns
    -------
    str, dict
        The blocks that were not consumed, rejoined with ``--`` separators, and
        a dictionary with parallel lists ``fragment_files``, ``hint_types``
        ('xyzabc' or 'points'), and ``geom_hints`` (6 or 9 floats per fragment).

    """

    def process_efpxyzabc(matchobj):
        processed["fragment_files"].append(matchobj.group("efpfile"))
        processed["hint_types"].append("xyzabc")
        processed["geom_hints"].append(
            [
                float(matchobj.group("x")),
                float(matchobj.group("y")),
                float(matchobj.group("z")),
                float(matchobj.group("a")),
                float(matchobj.group("b")),
                float(matchobj.group("c")),
            ]
        )
        return ""

    def process_efppoints(matchobj):
        processed["fragment_files"].append(matchobj.group("efpfile"))
        processed["hint_types"].append("points")
        processed["geom_hints"].append(
            [
                float(matchobj.group("x1")),
                float(matchobj.group("y1")),
                float(matchobj.group("z1")),
                float(matchobj.group("x2")),
                float(matchobj.group("y2")),
                float(matchobj.group("z2")),
                float(matchobj.group("x3")),
                float(matchobj.group("y3")),
                float(matchobj.group("z3")),
            ]
        )
        return ""

    reconstitute = []
    processed = {}
    processed["fragment_files"] = []
    processed["hint_types"] = []
    processed["geom_hints"] = []

    # handle `--`-demarcated blocks
    for frag in re.split(fragment_marker, string):
        frag = re.sub(efpxyzabc, process_efpxyzabc, frag.strip())
        frag = re.sub(efppoints, process_efppoints, frag)
        if frag:
            reconstitute.append(frag)

    return "\n--\n".join(reconstitute), processed


# `fragment_marker` (defined above) is reused here for the QM fragments.
cgmp = re.compile(r"\A" + CHGMULT + r"\Z", re.VERBOSE)

VAR = r"(-?[a-z][a-z0-9_]*)"  # slight cheat to allow neg in `variable`
NUCLABEL = r"([A-Z]{1,3}((_\w+)|(\d+))?)"
ANCHORTO = r"((\d+)|" + NUCLABEL + r")"
ANCHORVAL = r"(" + NUMBER + r"|" + VAR + ")"

# fmt: off
atom_cartesian = re.compile(r'\A' + r'(?P<nucleus>' + NUCLEUS + r')' + SEP + CARTXYZ + r'\Z',
                            re.IGNORECASE | re.VERBOSE)
atom_vcart = re.compile(r'\A' + r'(?P<nucleus>' + NUCLEUS + r')' + SEP +
                        r'(?P<Xval>' + ANCHORVAL + r')' + SEP +
                        r'(?P<Yval>' + ANCHORVAL + r')' + SEP +
                        r'(?P<Zval>' + ANCHORVAL + r')' + r'\Z',
                        re.IGNORECASE | re.VERBOSE)
atom_zmat1 = re.compile(r'\A' + r'(?P<nucleus>' + NUCLEUS + r')' + r'\Z',
                        re.IGNORECASE | re.VERBOSE)
atom_zmat2 = re.compile(r'\A' + r'(?P<nucleus>' + NUCLEUS + r')' + SEP +
                        r'(?P<Ridx>' + ANCHORTO + r')' + SEP + r'(?P<Rval>' + ANCHORVAL + r')' + r'\Z',
                        re.IGNORECASE | re.VERBOSE)
atom_zmat3 = re.compile(r'\A' + r'(?P<nucleus>' + NUCLEUS + r')' + SEP +
                        r'(?P<Ridx>' + ANCHORTO + r')' + SEP + r'(?P<Rval>' + ANCHORVAL + r')' + SEP +
                        r'(?P<Aidx>' + ANCHORTO + r')' + SEP + r'(?P<Aval>' + ANCHORVAL + r')' + r'\Z',
                        re.IGNORECASE | re.VERBOSE)
atom_zmat4 = re.compile(r'\A' + r'(?P<nucleus>' + NUCLEUS + r')' + SEP +
                        r'(?P<Ridx>' + ANCHORTO + r')' + SEP + r'(?P<Rval>' + ANCHORVAL + r')' + SEP +
                        r'(?P<Aidx>' + ANCHORTO + r')' + SEP + r'(?P<Aval>' + ANCHORVAL + r')' + SEP +
                        r'(?P<Didx>' + ANCHORTO + r')' + SEP + r'(?P<Dval>' + ANCHORVAL + r')' + r'\Z',
                        re.IGNORECASE | re.VERBOSE)
variable = re.compile(
    r'\A' + r'(?P<varname>' + VAR + r')' + r'\s*=\s*' + r'(?P<varvalue>((tda)|(' + NUMBER + r')))' + r'\Z',
    re.IGNORECASE | re.VERBOSE,
)
# fmt: on


def _filter_mints(string, unsettled=False):
    r"""Handle extracting fragment, atom, and chg/mult lines from `string`.

    Parameters
    ----------
    string : str
        Multiline molecule text, optionally split into fragments by ``--``
        lines.
    unsettled : bool, optional
        Whether to allow variable entries and zmat structure, accumulating into
        geom_unsettled, rather than pure numerical Cartesian entries,
        accumulating into geom.

    Returns
    -------
    str, dict
        Returns first a subset (plus some fragment separation guidance) of
        `string` containing the unmatched contents. These are generally input
        violations unless handled by a subsequent processing function.
        Returns second a dictionary with processed extractions. Contains (some
        optional) the following keys.

            molecular_charge : float, optional
            molecular_multiplicity : int, optional
            geom (`unsettled=False` only)
            geom_unsettled, variables (`unsettled=True` only)
            elbl
            fragment_separators
            fragment_charges
            fragment_multiplicities

    """

    def process_system_cgmp(matchobj):
        """Handles optional special first fragment with sole contents overall chg/mult."""

        processed["molecular_charge"] = float(matchobj.group("chg"))
        processed["molecular_multiplicity"] = int(matchobj.group("mult"))
        return ""

    def filter_fragment(fstring):
        """Handles extraction from everything within a fragment marker "--" of a
        single chg/mult (or None/None) and multiple atom lines.

        """

        def process_fragment_cgmp(matchobj):
            processed["fragment_charges"].append(float(matchobj.group("chg")))
            processed["fragment_multiplicities"].append(int(matchobj.group("mult")))
            return ""

        def process_atom_cartesian(matchobj):
            processed["elbl"].append(matchobj.group("nucleus"))
            processed["geom"].append(float(matchobj.group("x")))
            processed["geom"].append(float(matchobj.group("y")))
            processed["geom"].append(float(matchobj.group("z")))
            return ""

        def process_atom_unsettled(matchobj):
            processed["elbl"].append(matchobj.group("nucleus"))
            geo = []
            # Which named groups exist depends on which atom_* pattern matched;
            # values are stored as raw strings, not converted to float.
            if "Xval" in matchobj.groupdict():
                geo.append(matchobj.group("Xval"))
                geo.append(matchobj.group("Yval"))
                geo.append(matchobj.group("Zval"))
            if "Rval" in matchobj.groupdict():
                geo.append(matchobj.group("Ridx"))
                geo.append(matchobj.group("Rval"))
            if "Aval" in matchobj.groupdict():
                geo.append(matchobj.group("Aidx"))
                geo.append(matchobj.group("Aval"))
            if "Dval" in matchobj.groupdict():
                geo.append(matchobj.group("Didx"))
                geo.append(matchobj.group("Dval"))
            processed["geom_unsettled"].append(geo)
            return ""

        def process_variable(matchobj):
            processed["variables"].append((matchobj.group("varname"), matchobj.group("varvalue")))
            return ""

        freconstitute = []
        # A fragment that starts after at least one atom records where it begins.
        start_atom = len(processed["elbl"])
        if start_atom > 0:
            processed["fragment_separators"].append(start_atom)

        fcgmp_found = False
        for line in fstring.split("\n"):
            line = line.strip()
            # only the first chg/mult line in a fragment is consumed
            if not fcgmp_found:
                line, fcgmp_found = re.subn(cgmp, process_fragment_cgmp, line)
            if unsettled:
                line = re.sub(atom_vcart, process_atom_unsettled, line)
                line = re.sub(atom_zmat1, process_atom_unsettled, line)
                line = re.sub(atom_zmat2, process_atom_unsettled, line)
                line = re.sub(atom_zmat3, process_atom_unsettled, line)
                line = re.sub(atom_zmat4, process_atom_unsettled, line)
                line = re.sub(variable, process_variable, line)
            else:
                line = re.sub(atom_cartesian, process_atom_cartesian, line)
            if line:
                freconstitute.append(line)

        # keep fragment_charges/multiplicities aligned with the fragment count
        if not fcgmp_found:
            processed["fragment_charges"].append(None)
            processed["fragment_multiplicities"].append(None)

        return "\n".join(freconstitute), processed

    reconstitute = []
    processed = {}
    processed["elbl"] = []
    processed["fragment_separators"] = []
    processed["fragment_charges"] = []
    processed["fragment_multiplicities"] = []
    if unsettled:
        processed["geom_unsettled"] = []
        processed["variables"] = []
    else:
        processed["geom"] = []

    # handle `--`-demarcated blocks
    for ifr, frag in enumerate(re.split(fragment_marker, string)):
        frag = frag.strip()
        if ifr == 0 and cgmp.match(frag):
            # first block is nothing but "chg mult": whole-molecule values
            frag = re.sub(cgmp, process_system_cgmp, frag)
        else:
            frag, processed = filter_fragment(frag)
        if frag:
            reconstitute.append(frag)

    return "\n--\n".join(reconstitute), processed


xyz1strict = re.compile(r"\A" + r"(?P<nat>\d+)" + r"\Z")
SIMPLENUCLEUS = r"""((?P<E>[A-Z]{1,3})|(?P<Z>\d{1,3}))"""
atom_cartesian_strict = re.compile(
    r"\A" + r"(?P<nucleus>" + SIMPLENUCLEUS + r")" + SEP + CARTXYZ + r"\Z", re.IGNORECASE | re.VERBOSE
)

xyz1 = re.compile(r"\A" + r"(?P<nat>\d+)" + r"[\s,]*" + r"((?P<ubohr>(bohr|au))|(?P<uang>ang))?" + r"\Z", re.IGNORECASE)
xyz2 = re.compile(r"\A" + CHGMULT, re.VERBOSE)
# `atom_cartesian` (defined above with the other atom patterns) is reused for xyz+ atom lines.


def _filter_xyz(string, strict):
    r"""Handle extracting atom, units, and chg/mult lines from `string`.

    Parameters
    ----------
    string : str
        Multiline XYZ-style text: atom-count line, comment line, atom lines.
    strict : bool
        Whether to enforce a strict XYZ file format or to allow units,
        chg/mult, and add'l atom info.

    Returns
    -------
    str, dict
        Returns first a subset of `string` containing the unmatched contents.
        These are generally input violations unless handled by a subsequent
        processing function.
        Returns second a dictionary with processed extractions. Contains (some
        optional) the following keys.

            molecular_charge : float, optional (`strict=False` only)
            molecular_multiplicity : int, optional (`strict=False` only)
            geom
            elbl
            units : {'Angstrom', 'Bohr'} (`Bohr` `strict=False` only)
            geom_hints : always an empty list (no EFP)

    """

    def process_bohrang(matchobj):
        # the atom count ("nat" group) is matched but deliberately not used
        if matchobj.group("uang"):
            processed["units"] = "Angstrom"
        elif matchobj.group("ubohr"):
            processed["units"] = "Bohr"
        return ""

    def process_system_cgmp(matchobj):
        processed["molecular_charge"] = float(matchobj.group("chg"))
        processed["molecular_multiplicity"] = int(matchobj.group("mult"))
        return ""

    def process_atom_cartesian(matchobj):
        processed["elbl"].append(matchobj.group("nucleus"))
        processed["geom"].append(float(matchobj.group("x")))
        processed["geom"].append(float(matchobj.group("y")))
        processed["geom"].append(float(matchobj.group("z")))
        return ""

    # nat = 0
    reconstitute = []
    processed = {}
    processed["geom"] = []
    processed["elbl"] = []

    if strict:
        for iln, line in enumerate(string.split("\n")):
            line = line.strip()
            if iln == 0:
                line = re.sub(xyz1strict, "", line)
            elif iln == 1:
                # comment line: ignored entirely
                continue
            else:
                line = re.sub(atom_cartesian_strict, process_atom_cartesian, line)
            if line:
                reconstitute.append(line)
    else:
        for iln, line in enumerate(string.split("\n")):
            line = line.strip()
            if iln == 0:
                line = re.sub(xyz1, process_bohrang, line)
            elif iln == 1:
                # an optional leading "chg mult" is harvested; the rest is comment
                line = re.sub(xyz2, process_system_cgmp, line)
            else:
                line = re.sub(atom_cartesian, process_atom_cartesian, line)
            if line and iln != 1:
                reconstitute.append(line)

    if "units" not in processed:
        processed["units"] = "Angstrom"

    # if len(processed['geom']) != nat:
    #     raise ValidationError

    processed["geom_hints"] = []  # no EFP
    return "\n".join(reconstitute), processed