# Source code for qcelemental.molparse.from_string

import pprint
import re
from typing import Dict, Tuple, Union

from ..exceptions import ChoicesError, MoleculeFormatError, ValidationError
from ..util import filter_comments, provenance_stamp
from . import pubchem
from .from_arrays import from_input_arrays
from .regex import CARTXYZ, CHGMULT, ENDL, NUCLEUS, NUMBER, SEP

__all__ = ["from_string"]


def from_string(
    molstr: str,
    dtype: str = None,
    *,
    name: str = None,
    fix_com: bool = None,
    fix_orientation: bool = None,
    fix_symmetry: str = None,
    return_processed: bool = False,
    enable_qm: bool = True,
    enable_efp: bool = True,
    missing_enabled_return_qm: str = "none",
    missing_enabled_return_efp: str = "none",
    verbose: int = 1,
) -> Union[Dict, Tuple[Dict, Dict]]:
    r"""Construct a molecule dictionary from any recognized string format.

    Parameters
    ----------
    molstr
        Multiline string specification of molecule in a recognized format.
    dtype
        {'xyz', 'xyz+', 'psi4', 'psi4+'}
        Molecule format name; see below for details. If None, the formats
        are tried in the order psi4, xyz, xyz+, psi4+ and the first one that
        parses without remnants is used.
    return_processed
        Additionally return intermediate dictionary.
    enable_qm
        Consider quantum mechanical domain in processing the string contents
        into the returned molrec.
    enable_efp
        Consider effective fragment potential domain in processing the string
        contents into the returned molrec. Only relevant if `dtype` supports EFP.
    missing_enabled_return_qm
        {'minimal', 'none', 'error'}
        If `enable_qm=True`, what to do if it has no atoms/fragments?
        Respectively, return a fully valid but empty molrec, return empty
        dictionary, or throw error.
    missing_enabled_return_efp
        {'minimal', 'none', 'error'}
        If `enable_efp=True`, what to do if it has no atoms/fragments?
        Respectively, return a fully valid but empty molrec, return empty
        dictionary, or throw error.
    name
        Override `molstr` information for label for molecule; should
        be valid Python identifier. One of a very limited number of
        fields (three others follow) for trumping `molstr`. Provided
        for convenience, since the alternative would be collect the
        resulting molrec (discarding the Mol if called from class),
        editing it, then remaking the Mol.
    fix_com
        Override `molstr` information for whether translation of `geom`
        is allowed or disallowed.
    fix_orientation
        Override `molstr` information for whether rotation of `geom`
        is allowed or disallowed.
    fix_symmetry
        Override `molstr` information for maximal point group symmetry
        which geometry should be treated.
    verbose
        Quantity of printing. At 2 or higher, the input string, the
        intermediate dictionary, and the final molrec are printed.

    Returns
    -------
    molrec : dict
        Molecule dictionary spec. See :py:func:`from_arrays`.
    molinit : dict, optional
        Intermediate "molrec"-like dictionary containing `molstr` info after
        parsing by this function but before the validation and defaulting of
        `from_arrays` that returns the proper `molrec`.
        Only provided if `return_processed` is True.

    Raises
    ------
    qcelemental.MoleculeFormatError
        After processing of `molstr`, only an empty string should remain.
        Anything left is a syntax error. When `dtype` is None and every
        format fails, the error with the shortest message is raised.
    KeyError
        If `dtype` is not None and not one of the recognized format names.

    Notes
    -----
    Several formats are interpretable:

    .. code-block:: none

        xyz - Strict XYZ format
        -----------------------

            String Layout
            -------------
            <number of atoms>
            comment line
            <element_symbol or atomic_number> <x> <y> <z>
            ...
            <element_symbol or atomic_number> <x> <y> <z>

            QM Domain
            ---------
            Specifiable: geom, elem/elez (element identity)
            Inaccessible: mass, real (vs. ghost), elbl (user label), name, units (assumed [A]),
                          input_units_to_au, fix_com/orientation/symmetry, fragmentation,
                          molecular_charge, molecular_multiplicity

            Notes
            -----
            <number of atoms> is pattern-matched but ignored.

        xyz+ - Enhanced XYZ format
        --------------------------

            String Layout
            -------------
            <number of atoms> [<bohr|au|ang>]
            [<molecular_charge> <molecular_multiplicity>] comment line
            <psi4_nucleus_spec> <x> <y> <z>
            ...
            <psi4_nucleus_spec> <x> <y> <z>

            QM Domain
            ---------
            Specifiable: geom, elem/elez (element identity), mass, real (vs. ghost), elbl (user label),
                         units (defaults [A]), molecular_charge, molecular_multiplicity
            Inaccessible: name, input_units_to_au, fix_com/orientation/symmetry, fragmentation

            Notes
            -----
            <number of atoms> is pattern-matched but ignored.

        psi4 - Psi4 molecule {...} format
        ---------------------------------

            QM Domain
            ---------
            Specifiable: geom, elem/elez (element identity), mass, real (vs. ghost), elbl (user label),
                         units (defaults [A]), fix_com/orientation/symmetry, fragment_separators,
                         fragment_charges, fragment_multiplicities, molecular_charge, molecular_multiplicity
            Inaccessible: name, input_units_to_au

                PubChem
                -------
                pubchem : <cid|name|formula> [*]

                A string like the above searches the PubChem database and substitutes the below. Adding the wildcard
                searches for multiple matches and raises ChoicesError with matches for further consideration attached.

                Specifiable: geom, elem/elez (element identity), units (fixed [A]), molecular_charge,
                             molecular_multiplicity (fixed singlet), name

            EFP Domain
            ----------
            Specifiable: units, fix_com/orientation/symmetry, fragment_files, hint_types, geom_hints
            Inaccessible: anything atomic or fragment details -- geom, elem/elez (element identity),
                          mass, real (vs. ghost), elbl (user label), fragment_separators, fragment_charges,
                          fragment_multiplicities, molecular_charge, molecular_multiplicity

        psi4+ - Psi4 non-Cartesian molecule {...} format
        ------------------------------------------------
        Like `dtype=psi4` (although combination with EFP not tested) except
        that instead of pure-Cartesian geometry, allow variables, zmatrix,
        and un-fully-specified geometries. *Not* MolSSI standard, but we're
        not dropping zmatrix yet. Note that in Psi4 internal coordinates
        defined through a zmatrix have no bearing on geometry
        optimization internals or constraints.

    """
    if verbose >= 2:
        print("<<< FROM_STRING\n", molstr, "\n>>>")

    # << 1 >>  str-->str -- discard comments
    molstr = filter_comments(molstr.strip())

    # NOTE: both parsers below read `dtype` from the enclosing scope when
    # composing their error message, so its value at call time matters.
    def parse_as_xyz_ish(molstr, strict):
        molinit = {}

        # << 2 >>  str-->dict -- process atoms, units[, chg, mult]
        molstr, processed = _filter_xyz(molstr, strict=strict)
        molinit.update(processed)

        if molstr:
            raise MoleculeFormatError(f"""Unprocessable Molecule remnants under {dtype}:\n{molstr}""")

        return molstr, molinit

    def parse_as_psi4_ish(molstr, unsettled):
        molinit = {}

        # Notes
        # * *_filter functions must fill non-overlapping fields
        # * not recc but can add to downstream by appending to str

        # << 2.1 >>  str-->str -- process pubchem into str for downfunction
        molstr, processed = _filter_pubchem(molstr)
        molinit.update(processed)

        # << 2.2 >>  str-->dict -- process units, com, orient, symm
        molstr, processed = _filter_universals(molstr)
        molinit.update(processed)

        # << 2.3 >>  str-->dict -- process efp frags
        molstr, processed = _filter_libefp(molstr)
        molinit.update(processed)

        # << 2.4 >>  str-->dict -- process atoms, chg, mult, frags
        molstr, processed = _filter_mints(molstr, unsettled=unsettled)
        molinit.update(processed)

        if molstr:
            raise MoleculeFormatError(f"""Unprocessable Molecule remnants under {dtype}:\n{molstr}""")

        return molstr, molinit

    if dtype == "xyz":
        molstr, molinit = parse_as_xyz_ish(molstr, strict=True)

    elif dtype == "xyz+":
        molstr, molinit = parse_as_xyz_ish(molstr, strict=False)

    elif dtype == "psi4":
        molstr, molinit = parse_as_psi4_ish(molstr, unsettled=False)

    elif dtype == "psi4+":
        molstr, molinit = parse_as_psi4_ish(molstr, unsettled=True)

    elif dtype is None:
        # Placeholder shown in error messages while no format has succeeded yet.
        dtype = "[psi4, xyz, xyz+, psi4+]"  # for error message

        # Formats are attempted in this order; the first clean parse wins.
        attempts = (
            ("psi4", parse_as_psi4_ish, {"unsettled": False}),
            ("xyz", parse_as_xyz_ish, {"strict": True}),
            ("xyz+", parse_as_xyz_ish, {"strict": False}),
            ("psi4+", parse_as_psi4_ish, {"unsettled": True}),
        )

        min_error = None
        min_error_length = None
        for label, parser, parser_kwargs in attempts:
            try:
                # On failure the assignment never happens, so `molstr` stays
                # intact for the next attempt.
                molstr, molinit = parser(molstr, **parser_kwargs)
            except MoleculeFormatError as err:
                # Keep the failure with the shortest message (fewest
                # unprocessed remnants); strict `<` keeps the earliest on ties.
                if min_error is None or len(str(err)) < min_error_length:
                    min_error_length = len(str(err))
                    min_error = err
            else:
                dtype = label
                break
        else:
            # Every format failed.
            raise min_error
    else:
        raise KeyError(f"Molecule: dtype of `{dtype}` not recognized.")

    # << 3 >>  args-->dict -- process name, com, orient, symm from arguments
    processed = _filter_kwargs(name, fix_com, fix_orientation, fix_symmetry)
    molinit.update(processed)

    if verbose >= 2:
        print("\nFROM_STRING (", dtype, ") --> FROM_INPUT_ARRAYS <<<")
        pprint.pprint(molinit)
        print(">>>\n")

    # << 4 >>  dict-->molspec
    molrec = from_input_arrays(
        speclabel=True,
        enable_qm=enable_qm,
        enable_efp=enable_efp,
        missing_enabled_return_qm=missing_enabled_return_qm,
        missing_enabled_return_efp=missing_enabled_return_efp,
        **molinit,
    )

    # replace from_arrays stamp with from_string stamp
    if "qm" in molrec and molrec["qm"]:
        molrec["qm"]["provenance"] = provenance_stamp(__name__)
    if "efp" in molrec and molrec["efp"]:
        molrec["efp"]["provenance"] = provenance_stamp(__name__)

    if verbose >= 2:
        print("\nFROM_STRING MOLREC <<<", molrec, ">>>\n")

    if return_processed:
        return molrec, molinit
    else:
        return molrec


# TODO maybe molrec needs a "fix_loose" flag to signal the receiver can symmetrize
#    pubchemerror = re.compile(r'^\s*PubchemError\s*$', re.IGNORECASE)
#    pubcheminput = re.compile(r'^\s*PubchemInput\s*$', re.IGNORECASE)
#        # N.B. Anything starting with PubchemError will be handled correctly by the molecule parser
#        # in libmints, which will just print the rest of the string and exit gracefully.

# A whole (stripped) line of the form `pubchem : <search text>`, case-insensitive.
pubchemre = re.compile(r"\Apubchem" + r"\s*:\s*" + r"(?P<pubsearch>(([\S ]+)))\Z", re.IGNORECASE)


def _filter_pubchem(string):
    """Find any "pubchem:" lines in `string`, make call to the pubchem database
    and return the XYZ results back to `string`.

    Author: @andysim

    Parameters
    ----------
    string : str
        Multiline molecule specification.

    Returns
    -------
    str, dict
        Returns first `string` with every line stripped, blank lines dropped,
        and each "pubchem:" line replaced by the molecule string obtained
        from the database (its ``PubchemInput`` marker rewritten to
        ``units ang``).
        Returns second a dictionary that, when a single database hit was
        substituted, holds `name` and `molecular_charge`.

    Raises
    ------
    ValidationError
        If the database query raises, or the returned molecule string
        contains "Input Error".
    ChoicesError
        If the query does not produce exactly one result; the candidates
        are attached as a ``{cid: iupac}`` dictionary.

    """

    def process_pubchem(matchobj):
        pubsearch = matchobj.group("pubsearch")

        # search pubchem for the provided string
        try:
            results = pubchem.get_pubchem_results(pubsearch)
        except Exception as e:
            # Not every exception carries a `.message` attribute (built-in
            # Python 3 exceptions do not), so fall back to str(e) instead of
            # masking the real failure with an AttributeError.
            raise ValidationError(getattr(e, "message", str(e))) from e

        # the trailing wildcard is not part of the name echoed to the user
        if pubsearch.endswith("*"):
            pubsearch = pubsearch[:-1]
        if len(results) == 1:
            # There's only 1 result - use it
            xyz = results[0].get_molecule_string()
            processed["name"] = "IUPAC {}".format(results[0].name())
            processed["molecular_charge"] = float(results[0].molecular_charge)
            if "Input Error" in xyz:
                raise ValidationError(xyz)
        else:
            # There are multiple results -- print and exit
            # * formerly, this checked for (then used) any exact match, but now (LAB; Sep 2018), disabling that
            #   since user explicitly added '*' char & "best match" (not available formerly) returned w/o '*'
            msg = "\tPubchemError\n"
            msg += "\tMultiple pubchem results were found. Replace\n\n\t\tpubchem:%s\n\n" % (pubsearch)
            msg += "\twith the Chemical ID number or exact name from one of the following and re-run.\n\n"
            msg += "\t Chemical ID     IUPAC Name\n\n"
            ematches = {}
            for result in results:
                msg += "%s" % (result)
                ematches[result.cid] = result.iupac
            raise ChoicesError(msg, ematches)

        # remove PubchemInput first line and assert [A]
        xyz = xyz.replace("PubchemInput", "units ang")
        return xyz

    reconstitute = []
    processed = {}  # filled by `process_pubchem` through its closure

    for line in string.split("\n"):
        line = re.sub(pubchemre, process_pubchem, line.strip())
        if line:
            reconstitute.append(line)

    return "\n".join(reconstitute), processed


def _filter_kwargs(name, fix_com, fix_orientation, fix_symmetry):
    """Gather the explicitly supplied override arguments into a dictionary.

    Each argument that is not None is stored under its own name; arguments
    left as None are omitted, so falsy-but-set values such as ``False`` are
    kept.

    Returns
    -------
    dict
        Subset of `name`, `fix_com`, `fix_orientation`, `fix_symmetry`.

    """
    candidates = (
        ("name", name),
        ("fix_com", fix_com),
        ("fix_orientation", fix_orientation),
        ("fix_symmetry", fix_symmetry),
    )
    return {field: value for field, value in candidates if value is not None}


# Each pattern must match an entire (stripped) line, case-insensitively.
com = re.compile(r"\A(no_com|nocom)\Z", re.IGNORECASE)
orient = re.compile(r"\A(no_reorient|noreorient)\Z", re.IGNORECASE)
# The dots of `a.u.` are escaped so that only literal periods are accepted.
bohrang = re.compile(r"\Aunits?[\s=]+((?P<ubohr>(bohr|au|a\.u\.))|(?P<uang>(ang|angstrom)))\Z", re.IGNORECASE)
symmetry = re.compile(r"\Asymmetry[\s=]+(?P<pg>\w+)\Z", re.IGNORECASE)


def _filter_universals(string):
    """Process multiline `string` for fix_ and unit markers,
    returning a string of unprocessed `string` and a dictionary of
    processed fields.

    fix_com
    fix_orientation
    fix_symmetry
    #input_units_to_au (not settable)
    units

    Each kind of marker is consumed at most once. A repeated marker is left
    in the returned string as an unprocessed remnant.

    Returns
    -------
    str, dict
        Returns first the lines of `string` (stripped, blank lines dropped)
        that were not consumed, joined by newlines.
        Returns second a dictionary with any of `fix_com` (True),
        `fix_orientation` (True), `units` ('Angstrom' or 'Bohr'), and
        `fix_symmetry` (lowercased point group) that were found.

    """

    def process_com(matchobj):
        processed["fix_com"] = True
        return ""

    def process_orient(matchobj):
        processed["fix_orientation"] = True
        return ""

    def process_bohrang(matchobj):
        if matchobj.group("uang"):
            processed["units"] = "Angstrom"
        elif matchobj.group("ubohr"):
            processed["units"] = "Bohr"
        return ""

    def process_symmetry(matchobj):
        processed["fix_symmetry"] = matchobj.group("pg").lower()
        return ""

    reconstitute = []
    processed = {}

    # Markers still being looked for, in the order they are tried on a line.
    pending = [
        (com, process_com),
        (orient, process_orient),
        (bohrang, process_bohrang),
        (symmetry, process_symmetry),
    ]

    for line in string.split("\n"):
        line = line.strip()
        # Iterate over a snapshot since found markers are retired in the loop.
        for rule in tuple(pending):
            pattern, handler = rule
            line, nfound = pattern.subn(handler, line)
            if nfound:
                pending.remove(rule)
        if line:
            reconstitute.append(line)

    return "\n".join(reconstitute), processed


# fmt: off
# A line holding nothing but `--` (optionally surrounded by whitespace);
# used to split a molecule string into fragments.
fragment_marker = re.compile(r'^\s*--\s*$', re.MULTILINE)

# Whole-text match of an EFP fragment given as `efp <file> x y z a b c`:
# the literal `efp`, a `\w+` file name, then six NUMBER fields captured as
# x, y, z, a, b, c. SEP, NUMBER, and ENDL are imported from `.regex`.
efpxyzabc = re.compile(
    r'\A' + r'efp' + SEP + r'(?P<efpfile>(\w+))' + SEP +
    r'(?P<x>' + NUMBER + r')' + SEP + r'(?P<y>' + NUMBER + r')' + SEP + r'(?P<z>' + NUMBER + r')' + SEP +
    r'(?P<a>' + NUMBER + r')' + SEP + r'(?P<b>' + NUMBER + r')' + SEP + r'(?P<c>' + NUMBER + r')' + ENDL + r'\Z',
    re.IGNORECASE | re.VERBOSE)

# Whole-text match of an EFP fragment given as `efp <file>` followed by three
# coordinate triples (x1 y1 z1, x2 y2 z2, x3 y3 z3). Each triple may be
# preceded by whitespace/commas and is terminated by ENDL.
efppoints = re.compile(
    r'\A' + r'efp' + SEP + r'(?P<efpfile>(\w+))' + ENDL +
    r'[\s,]*' + r'(?P<x1>' + NUMBER + r')' + SEP + r'(?P<y1>' + NUMBER + r')' + SEP + r'(?P<z1>' + NUMBER + r')' + ENDL +
    r'[\s,]*' + r'(?P<x2>' + NUMBER + r')' + SEP + r'(?P<y2>' + NUMBER + r')' + SEP + r'(?P<z2>' + NUMBER + r')' + ENDL +
    r'[\s,]*' + r'(?P<x3>' + NUMBER + r')' + SEP + r'(?P<y3>' + NUMBER + r')' + SEP + r'(?P<z3>' + NUMBER + r')' + ENDL + r'\Z',
    re.IGNORECASE | re.MULTILINE | re.VERBOSE)
# fmt: on


def _filter_libefp(string):
    """Extract EFP fragment specifications from `string`.

    The input is split on fragment markers (`--`). Within each stripped
    fragment, text matching the `efpxyzabc` or `efppoints` patterns is
    removed and recorded.

    Returns
    -------
    str, dict
        Returns first the non-empty fragment remainders rejoined with
        ``"\\n--\\n"``.
        Returns second a dictionary with three parallel lists:
        `fragment_files` (captured file names), `hint_types` ("xyzabc" or
        "points"), and `geom_hints` (six or nine floats respectively).

    """
    processed = {"fragment_files": [], "hint_types": [], "geom_hints": []}

    def _record(match, hint_type, fields):
        # Store one EFP fragment and consume the matched text.
        processed["fragment_files"].append(match.group("efpfile"))
        processed["hint_types"].append(hint_type)
        processed["geom_hints"].append([float(match.group(field)) for field in fields])
        return ""

    def process_efpxyzabc(match):
        return _record(match, "xyzabc", ("x", "y", "z", "a", "b", "c"))

    def process_efppoints(match):
        return _record(match, "points", ("x1", "y1", "z1", "x2", "y2", "z2", "x3", "y3", "z3"))

    leftovers = []

    # handle `--`-demarcated blocks
    for chunk in fragment_marker.split(string):
        remainder = efpxyzabc.sub(process_efpxyzabc, chunk.strip())
        remainder = efppoints.sub(process_efppoints, remainder)
        if remainder:
            leftovers.append(remainder)

    return "\n--\n".join(leftovers), processed


# A line holding nothing but `--`; same pattern as the `fragment_marker`
# bound earlier in this module.
fragment_marker = re.compile(r"^\s*--\s*$", re.MULTILINE)
# Text consisting solely of a charge/multiplicity specification (CHGMULT is
# imported from `.regex`).
cgmp = re.compile(r"\A" + CHGMULT + r"\Z", re.VERBOSE)

# Building blocks for the non-Cartesian ("unsettled") atom lines below.
# VAR: a letter followed by letters, digits, or underscores, with an optional
# leading minus sign.
VAR = r"(-?[a-z][a-z0-9_]*)"  # slight cheat to allow neg in `variable`
# NUCLABEL: one to three letters, optionally followed by `_<word>` or digits.
NUCLABEL = r"([A-Z]{1,3}((_\w+)|(\d+))?)"
# ANCHORTO: reference to another atom, either by integer or by NUCLABEL.
ANCHORTO = r"((\d+)|" + NUCLABEL + r")"
# ANCHORVAL: a coordinate value, either a literal NUMBER or a VAR name.
ANCHORVAL = r"(" + NUMBER + r"|" + VAR + ")"

# fmt: off
# Nucleus followed by CARTXYZ (imported from `.regex`). This name is bound
# again further down the module from the same pattern expression.
atom_cartesian = re.compile(r'\A' + r'(?P<nucleus>' + NUCLEUS + r')' + SEP + CARTXYZ + r'\Z',
                            re.IGNORECASE | re.VERBOSE)
# Nucleus followed by three ANCHORVAL entries (numbers and/or variable names).
atom_vcart = re.compile(r'\A' + r'(?P<nucleus>' + NUCLEUS + r')' + SEP +
                        r'(?P<Xval>' + ANCHORVAL + r')' + SEP +
                        r'(?P<Yval>' + ANCHORVAL + r')' + SEP +
                        r'(?P<Zval>' + ANCHORVAL + r')' + r'\Z',
                        re.IGNORECASE | re.VERBOSE)
# Z-matrix style lines: a nucleus followed by zero, one, two, or three
# (anchor, value) pairs named R, A, and D.
atom_zmat1 = re.compile(r'\A' + r'(?P<nucleus>' + NUCLEUS + r')' + r'\Z',
                        re.IGNORECASE | re.VERBOSE)
atom_zmat2 = re.compile(r'\A' + r'(?P<nucleus>' + NUCLEUS + r')' + SEP +
                        r'(?P<Ridx>' + ANCHORTO + r')' + SEP + r'(?P<Rval>' + ANCHORVAL + r')' + r'\Z',
                        re.IGNORECASE | re.VERBOSE)
atom_zmat3 = re.compile(r'\A' + r'(?P<nucleus>' + NUCLEUS + r')' + SEP +
                        r'(?P<Ridx>' + ANCHORTO + r')' + SEP + r'(?P<Rval>' + ANCHORVAL + r')' + SEP +
                        r'(?P<Aidx>' + ANCHORTO + r')' + SEP + r'(?P<Aval>' + ANCHORVAL + r')' + r'\Z',
                        re.IGNORECASE | re.VERBOSE)
atom_zmat4 = re.compile(r'\A' + r'(?P<nucleus>' + NUCLEUS + r')' + SEP +
                        r'(?P<Ridx>' + ANCHORTO + r')' + SEP + r'(?P<Rval>' + ANCHORVAL + r')' + SEP +
                        r'(?P<Aidx>' + ANCHORTO + r')' + SEP + r'(?P<Aval>' + ANCHORVAL + r')' + SEP +
                        r'(?P<Didx>' + ANCHORTO + r')' + SEP + r'(?P<Dval>' + ANCHORVAL + r')' + r'\Z',
                        re.IGNORECASE | re.VERBOSE)
# Variable assignment `<name> = <value>` where the value is a NUMBER or the
# keyword `tda` (presumably the tetrahedral angle -- confirm downstream).
variable = re.compile(
    r'\A' + r'(?P<varname>' + VAR + r')' + r'\s*=\s*' + r'(?P<varvalue>((tda)|(' + NUMBER + r')))' + r'\Z',
    re.IGNORECASE | re.VERBOSE,
)
# fmt: on
# fmt: on


def _filter_mints(string, unsettled=False):
    r"""Pull fragment, atom, and chg/mult lines out of `string`.

    Parameters
    ----------
    string : str
        Multiline molecule specification, with fragments separated by `--`
        lines.
    unsettled : bool, optional
        Whether to allow variable entries and zmat structure, accumulating into
        geom_unsettled, rather than pure numerical Cartesian entries,
        accumulating into geom.

    Returns
    -------
    str, dict
        Returns first a subset (plus some fragment separation guidance) of
            `string` containing the unmatched contents. These are generally input
            violations unless handled by a subsequent processing function.
        Returns second a dictionary with processed extractions. Contains (some
            optional) the following keys.

            molecular_charge : float, optional
            molecular_multiplicity : int, optional
            geom (`unsettled=False`) or geom_unsettled, variables (`unsettled=True`)
            elbl
            fragment_separators
            fragment_charges
            fragment_multiplicities

    """
    processed = {
        "elbl": [],
        "fragment_separators": [],
        "fragment_charges": [],
        "fragment_multiplicities": [],
    }
    if unsettled:
        processed["geom_unsettled"] = []
        processed["variables"] = []
    else:
        processed["geom"] = []

    # --- substitution callbacks: each records its match and consumes the text

    def record_system_cgmp(match):
        """Overall chg/mult from a leading fragment that contains nothing else."""
        processed["molecular_charge"] = float(match.group("chg"))
        processed["molecular_multiplicity"] = int(match.group("mult"))
        return ""

    def record_fragment_cgmp(match):
        processed["fragment_charges"].append(float(match.group("chg")))
        processed["fragment_multiplicities"].append(int(match.group("mult")))
        return ""

    def record_cartesian_atom(match):
        processed["elbl"].append(match.group("nucleus"))
        for axis in ("x", "y", "z"):
            processed["geom"].append(float(match.group(axis)))
        return ""

    def record_unsettled_atom(match):
        processed["elbl"].append(match.group("nucleus"))
        available = match.groupdict()
        entry = []
        # Which groups exist depends on which atom pattern produced the match.
        if "Xval" in available:
            entry.extend(match.group("Xval", "Yval", "Zval"))
        for anchor, value in (("Ridx", "Rval"), ("Aidx", "Aval"), ("Didx", "Dval")):
            if value in available:
                entry.append(match.group(anchor))
                entry.append(match.group(value))
        processed["geom_unsettled"].append(entry)
        return ""

    def record_variable(match):
        processed["variables"].append((match.group("varname"), match.group("varvalue")))
        return ""

    # Patterns applied, in order, to every line of a fragment.
    if unsettled:
        line_rules = (
            (atom_vcart, record_unsettled_atom),
            (atom_zmat1, record_unsettled_atom),
            (atom_zmat2, record_unsettled_atom),
            (atom_zmat3, record_unsettled_atom),
            (atom_zmat4, record_unsettled_atom),
            (variable, record_variable),
        )
    else:
        line_rules = ((atom_cartesian, record_cartesian_atom),)

    def consume_fragment(block):
        """Process one `--`-delimited fragment: at most one chg/mult line
        (None/None recorded when absent) and any number of atom lines.
        Returns the unmatched lines."""
        first_atom = len(processed["elbl"])
        if first_atom > 0:
            # atoms already exist, so this fragment starts a new partition
            processed["fragment_separators"].append(first_atom)

        unmatched = []
        have_cgmp = False
        for text in block.split("\n"):
            text = text.strip()
            if not have_cgmp:
                text, hits = cgmp.subn(record_fragment_cgmp, text)
                have_cgmp = bool(hits)
            for pattern, recorder in line_rules:
                text = pattern.sub(recorder, text)
            if text:
                unmatched.append(text)

        if not have_cgmp:
            processed["fragment_charges"].append(None)
            processed["fragment_multiplicities"].append(None)

        return "\n".join(unmatched)

    remnants = []

    # handle `--`-demarcated blocks
    for position, chunk in enumerate(fragment_marker.split(string)):
        chunk = chunk.strip()
        if position == 0 and cgmp.match(chunk):
            chunk = cgmp.sub(record_system_cgmp, chunk)
        else:
            chunk = consume_fragment(chunk)
        if chunk:
            remnants.append(chunk)

    return "\n--\n".join(remnants), processed


# Strict XYZ first line: an atom count and nothing else.
xyz1strict = re.compile(r"\A" + r"(?P<nat>\d+)" + r"\Z")
# Strict XYZ nucleus: an element symbol of one to three letters (group `E`)
# or an integer of one to three digits (group `Z`).
SIMPLENUCLEUS = r"""((?P<E>[A-Z]{1,3})|(?P<Z>\d{1,3}))"""
atom_cartesian_strict = re.compile(
    r"\A" + r"(?P<nucleus>" + SIMPLENUCLEUS + r")" + SEP + CARTXYZ + r"\Z", re.IGNORECASE | re.VERBOSE
)

# Enhanced XYZ first line: an atom count optionally followed by a units
# keyword (`bohr`/`au` or `ang`), case-insensitive.
xyz1 = re.compile(r"\A" + r"(?P<nat>\d+)" + r"[\s,]*" + r"((?P<ubohr>(bohr|au))|(?P<uang>ang))?" + r"\Z", re.IGNORECASE)
# Enhanced XYZ second line: CHGMULT anchored only at the start, so any text
# may follow the charge/multiplicity.
xyz2 = re.compile(r"\A" + CHGMULT, re.VERBOSE)
# Full nucleus specification followed by CARTXYZ; rebinds the name already
# bound earlier in this module from the same pattern expression.
atom_cartesian = re.compile(
    r"\A" + r"(?P<nucleus>" + NUCLEUS + r")" + SEP + CARTXYZ + r"\Z", re.IGNORECASE | re.VERBOSE
)


def _filter_xyz(string, strict):
    r"""Pull atom, units, and chg/mult lines out of an XYZ-style `string`.

    The first line is the atom-count header, the second line is the comment
    line (never reported as a remnant), and all later lines are atom lines.

    Parameters
    ----------
    strict : bool
        Whether to enforce a strict XYZ file format or to allow units, chg/mult,
        and add'l atom info.

    Returns
    -------
    str, dict
        Returns first a subset of `string` containing the unmatched contents.
        These are generally input violations unless handled by a subsequent
        processing function.
        Returns second a dictionary with processed extractions. Contains (some
            optional) the following keys.

            molecular_charge : float, optional (`strict=False` only)
            molecular_multiplicity : int, optional (`strict=False` only)
            geom
            elbl
            units : {'Angstrom', 'Bohr'} (`Bohr` `strict=False` only)
            geom_hints : always an empty list (no EFP)

    """
    processed = {"geom": [], "elbl": []}

    def process_bohrang(match):
        # The atom count in the header is matched but deliberately unused.
        if match.group("uang"):
            processed["units"] = "Angstrom"
        elif match.group("ubohr"):
            processed["units"] = "Bohr"
        return ""

    def process_system_cgmp(match):
        processed["molecular_charge"] = float(match.group("chg"))
        processed["molecular_multiplicity"] = int(match.group("mult"))
        return ""

    def process_atom_cartesian(match):
        processed["elbl"].append(match.group("nucleus"))
        for axis in ("x", "y", "z"):
            processed["geom"].append(float(match.group(axis)))
        return ""

    # Select the header/atom patterns for the requested flavor.
    if strict:
        header_pattern, header_repl = xyz1strict, ""
        atom_pattern = atom_cartesian_strict
    else:
        header_pattern, header_repl = xyz1, process_bohrang
        atom_pattern = atom_cartesian

    leftovers = []
    for lineno, text in enumerate(string.split("\n")):
        text = text.strip()
        if lineno == 0:
            text = header_pattern.sub(header_repl, text)
        elif lineno == 1:
            # Comment line: never a remnant. Outside strict mode a leading
            # chg/mult is harvested from it for its side effect only.
            if not strict:
                xyz2.sub(process_system_cgmp, text)
            continue
        else:
            text = atom_pattern.sub(process_atom_cartesian, text)
        if text:
            leftovers.append(text)

    # units default to Angstrom when the header did not specify them
    if "units" not in processed:
        processed["units"] = "Angstrom"

    processed["geom_hints"] = []  # no EFP

    return "\n".join(leftovers), processed