From 88225246ac528cca98c22f693b8ac1718febeaf1 Mon Sep 17 00:00:00 2001
From: CalCraven <nicholas.c.craven@vanderbilt.edu>
Date: Sun, 16 Jun 2024 08:57:05 -0500
Subject: [PATCH 1/2] Adjust and move pandas dataframe conversion to external
 module

---
 gmso/core/topology.py                |  77 ------
 gmso/external/__init__.py            |   1 +
 gmso/external/convert_dataframe.py   | 381 +++++++++++++++++++++++++++
 gmso/tests/test_convert_dataframe.py | 153 +++++++++++
 gmso/tests/test_topology.py          |  88 +------
 5 files changed, 536 insertions(+), 164 deletions(-)
 create mode 100644 gmso/external/convert_dataframe.py
 create mode 100644 gmso/tests/test_convert_dataframe.py

diff --git a/gmso/core/topology.py b/gmso/core/topology.py
index 856adaa7f..f3abd8a9a 100644
--- a/gmso/core/topology.py
+++ b/gmso/core/topology.py
@@ -1237,83 +1237,6 @@ def write_forcefield(self, filename, overwrite=False):
         ff = self.get_forcefield()
         ff.to_xml(filename=filename, overwrite=overwrite)
 
-    def to_dataframe(self, parameter="sites", site_attrs=None, unyts_bool=True):
-        """Return a pandas dataframe object for the sites in a topology
-
-        Parameters
-        ----------
-        parameter : str, default='sites'
-            A string determining what aspects of the gmso topology will be reported.
-            Options are: 'sites', 'bonds', 'angles', 'dihedrals', and 'impropers'. Defaults to 'sites'.
-        site_attrs : list of str, default=None
-             List of strings that are attributes of the topology site and can be included as entries in the pandas dataframe.
-            Examples of these can be found by printing `topology.sites[0].__dict__`.
-            See https://gmso.mosdef.org/en/stable/data_structures.html#gmso.Atom for additional information on labeling.
-        unyts_bool: bool, default=True
-            Determine if numerical values are saved as unyt quantities or floats. See
-            https://unyt.readthedocs.io/en/stable/usage.html
-            for more information about manipulating unyt quantities.
-            Default is True.
-
-        Returns
-        -------
-        Pandas Dataframe
-            A pandas.Dataframe object, see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
-            for further information.
-
-        Examples
-        ________
-        >>> topology.to_dataframe(parameter = 'sites', site_attrs = ['charge'])
-            This will return a dataframe with a listing of the sites and include the charges that correspond to each site.
-        >>> topology.to_dataframe(parameter = 'dihedrals', site_attrs = ['positions'])
-            This will return a dataframe with a listing of the sites that make up each dihedral, the positions of each of
-            those sites, and the parameters that are associated with the dihedrals.
-
-        Notes
-        ____
-        A dataframe is easily manipulated. In order to change the rounding to two decimals places for a column named `label`:
-            >>> df['label'] = df['label'].round(2)
-        The column labels can also be easily modified. This line can take a dataframe `df` and rename a column labeled
-        `Atom0` to `newname` using a dictionary.
-            >>> df.rename(columns = {'Atom0':'newname'})
-        See https://pandas.pydata.org/pandas-docs/stable/getting_started/intro_tutorials/index.html for further information.
-        """
-        from gmso.utils.io import import_
-
-        pd = import_("pandas")
-        if not site_attrs:
-            site_attrs = []
-        df = pd.DataFrame()
-        if not self.is_typed():
-            raise GMSOError(
-                "This topology is not typed, please type this object before converting to a pandas dataframe"
-            )
-        if parameter == "sites":
-            df["atom_types"] = list(site.atom_type.name for site in self.sites)
-            df["names"] = list(site.name for site in self.sites)
-            for attr in site_attrs:
-                df = self._parse_dataframe_attrs(df, attr, parameter, unyts_bool)
-        elif parameter in ["bonds", "angles", "dihedrals", "impropers"]:
-            if len(getattr(self, parameter)) == 0:
-                raise GMSOError(
-                    f"There arent any {parameter} in the topology. The dataframe would be empty."
-                )
-            df = self._pandas_from_parameters(
-                df,
-                parameter=parameter,
-                site_attrs=site_attrs,
-                unyts_bool=unyts_bool,
-            )
-            df = self._parse_parameter_expression(df, parameter, unyts_bool)
-        else:
-            raise AttributeError(
-                "{} is not yet supported for outputting parameters to a dataframe. \
-            Please use  one of 'sites', 'bonds', 'angles', 'dihedrals', or \
-            'impropers'".format(str(parameter))
-            )
-
-        return df
-
     def get_forcefield(self):
         """Get an instance of gmso.ForceField out of this topology
 
diff --git a/gmso/external/__init__.py b/gmso/external/__init__.py
index bf94976d6..8a0451072 100644
--- a/gmso/external/__init__.py
+++ b/gmso/external/__init__.py
@@ -1,6 +1,7 @@
 # ruff: noqa: F401
 """Support for various in-memory representations of chemical systems."""
 
+from .convert_dataframe import to_dataframeDict
 from .convert_hoomd import (
     to_gsd_snapshot,
     to_hoomd_forcefield,
diff --git a/gmso/external/convert_dataframe.py b/gmso/external/convert_dataframe.py
new file mode 100644
index 000000000..894f42f82
--- /dev/null
+++ b/gmso/external/convert_dataframe.py
@@ -0,0 +1,381 @@
+"""Module support for converting to/from Pandas DataFrame objects."""
+
+import functools
+import warnings
+from collections.abc import Iterable
+
+import numpy as np
+import unyt as u
+
+from gmso import Topology
+from gmso.core.views import PotentialFilters
+from gmso.exceptions import GMSOError
+from gmso.utils.io import import_
+
+pd = import_("pandas")
+pfilter = PotentialFilters.UNIQUE_PARAMETERS
+
+
+def to_dataframeDict(
+    topology: Topology,
+    parameters: "str | list[str]" = "all",
+    format: str = "default",
+    columns: list[str] = None,
+    handle_unyts: str = "to_headers",
+) -> dict:
+    """Return a dictionary of pandas dataframe objects for a topology.
+
+    Parameters
+    ----------
+    topology : gmso.Topology, required
+        Topology to use for converting values
+    parameters : str or list of str, optional, default='all'
+        A string determining what aspects of the gmso topology will be reported.
+        Options are: 'all', 'sites', 'bonds', 'angles', 'dihedrals', and 'impropers'. Defaults to 'all'. Can pass multiple strings as a list.
+    format : str, optional, default='default'
+        The output formatting style for the dataframe.
+        Options are 'default', 'specific_columns', 'publication', 'remove_duplicates'. Defaults to 'default'
+        'default' will output default column values of ["name", "atom_type.name", "atom_type.parameters", "charge", "mass"],
+        and any additional attributes in the `columns` argument.
+        'specific_columns' will only output the attributes from the `columns` argument.
+        'publication' will use the default outputs, but remove duplicate values from the dataframes. It adds a column labeled
+        'Atom Indices' to the `sites` dataframe, which enumerates the site indices that share the given `atom_type.name`.
+        'remove_duplicates' will remove duplicate rows from the dataframe. Unless `columns` is given, the only column
+        reported for sites is `atom_type.name` and for connections it is the `<connection>_type.member_classes`; no
+        'Atom Indices' column is added. Because 'specific_columns' and 'remove_duplicates' are specific to a given
+        Topology element, the `parameters` argument must be one of
+        {"sites", "bonds", "angles", "dihedrals", "impropers"}, not {"all"}.
+    columns : list of str, optional, default=None
+        List of strings that are attributes of the topology site and can be included as entries in the pandas dataframe.
+        Examples of these can be found by printing `topology.sites[0].__dict__` or `topology.bonds[0].__dict__`.
+        See https://gmso.mosdef.org/en/stable/data_structures.html#gmso.Atom for additional information on labeling.
+    handle_unyts: str, optional, default='to_headers'
+        The placement/recording of unyt quantities in dataframe.
+        Options are 'to_headers', 'with_data', 'no_unyts'
+        Determines if numerical values in the DataFrame are saved as unyt quantities or floats. Default case, 'to_headers',
+        puts the unyts as strings to go with the column header of the dataframe.
+        `with_data` leaves any values alone, so any values in the Topology that are unyt quantities will stay that way.
+        `no_unyts` strips any unyt values and converts to a float in the associated element of the dataframe.
+        See https://unyt.readthedocs.io/en/stable/usage.html
+        for more information about manipulating unyt quantities.
+
+    Returns
+    -------
+    Dictionary of Pandas Dataframe
+        A python dictionary of pandas.Dataframe object, see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
+        for further information. The keys of this dictionary are the attributes of the topology that are associated with each DataFrame. These
+        can be `sites`, `bonds`, `angles`, `dihedrals`, `impropers`, which are determined from the argument `parameters`.
+
+    Raises
+    ------
+    GMSOError
+        If the topology is not typed.
+    ValueError
+        If `format`, `parameters` or `handle_unyts` is not one of the options listed above.
+    AssertionError
+        If `parameters` is 'all' while `format` is 'specific_columns' or 'remove_duplicates'.
+    AttributeError
+        If a requested column is not an attribute of the topology elements.
+
+
+    Examples
+    --------
+    # example topology to use
+    ``` python
+    >>> import gmso
+    >>> import mbuild as mb
+    >>> from gmso.parameterization import apply
+    >>> cpd = mb.load("C", smiles=True)
+    >>> top = cpd.to_gmso()
+    >>> ff = gmso.ForceField("oplsaa")
+    >>> ptop = apply(top, ff)
+    ```
+
+
+    >>> gmso.external.convert_dataframe.to_dataframeDict(ptop, parameters='sites', columns=['charge'], handle_unyts="to_headers")
+        This will return a dictionary holding a dataframe with a listing of the sites, including the charges that correspond to each site.
+        ```
+        {'sites':
+            name atom_type.name  epsilon (kJ/mol)  sigma (nm)   charge (elementary_charge)  mass (amu)
+            0    C       opls_138          0.276144        0.35  -0.24      12.011
+            1    H       opls_140          0.125520        0.25   0.06       1.008
+            2    H       opls_140          0.125520        0.25   0.06       1.008
+            3    H       opls_140          0.125520        0.25   0.06       1.008
+            4    H       opls_140          0.125520        0.25   0.06       1.008
+        }
+        ```
+
+    >>> gmso.external.convert_dataframe.to_dataframeDict(ptop, parameters='bonds')
+        This will return a dictionary whose 'bonds' entry is a dataframe listing the name of each bond, the member
+        classes of its bond type, and the parameters that are associated with that bond type.
+
+    Notes
+    -----
+    A dataframe is easily manipulated. In order to change the rounding to two decimals places for a column named `label`:
+        >>> df['label'] = df['label'].round(2)
+    The column labels can also be easily modified. This line can take a dataframe `df` and rename a column labeled
+    `Atom0` to `newname` using a dictionary.
+        >>> df.rename(columns = {'Atom0':'newname'})
+    See https://pandas.pydata.org/pandas-docs/stable/getting_started/intro_tutorials/index.html for further information.
+    """
+    if columns is None:
+        columns = []
+    if not topology.is_typed():
+        raise GMSOError(
+            "This topology is not typed, please type this object before converting to a pandas dataframe"
+        )
+    outDict = {}  # dictionary of dataframes to write out
+
+    connectionsList = [
+        "bonds",
+        "angles",
+        "dihedrals",
+        "impropers",
+    ]  # these can be handled generally
+    # default columns, shared by the 'default' and 'publication' formats
+    defaultColumnsDict = {
+        param: [
+            "name",
+            f"{param[:-1]}_type.member_classes",
+            f"{param[:-1]}_type.parameters",
+        ]
+        for param in connectionsList
+    }
+    defaultColumnsDict["sites"] = [
+        "name",
+        "atom_type.name",
+        "atom_type.parameters",
+        "charge",
+        "mass",
+    ]
+
+    # Get columns from format methods
+    remove_duplicatesBool = False  # flag to remove duplicate parameters from the dataframe and put indices into a new column
+    if format == "default":
+        columnsDict = defaultColumnsDict
+        if isinstance(columns, list):
+            columnsDict = {
+                key: columnsDict[key] + columns for key in columnsDict.keys()
+            }  # add in any provided columns
+    elif format == "specific_columns":
+        assert parameters != "all", (
+            f"When formatting for specific columns, please set parameter argument to be one of {['sites']+connectionsList}."
+            " Otherwise use a format of default."
+        )
+        if isinstance(parameters, str):
+            parametersList = [parameters]
+        else:
+            parametersList = parameters
+        columnsDict = {parameter: columns for parameter in parametersList}
+    elif format == "publication":
+        columnsDict = defaultColumnsDict
+        remove_duplicatesBool = True
+    elif format == "remove_duplicates":
+        assert parameters != "all", (
+            f"When removing duplicates, please set parameter argument to be one of {['sites']+connectionsList}."
+            " Otherwise use a format of default."
+        )
+        if not columns and parameters == "sites":  # default values
+            columns = ["atom_type.name"]
+        elif not columns and parameters in connectionsList:
+            columns = [f"{parameters[:-1]}_type.member_classes"]
+        columnsDict = {parameters: columns}
+    else:
+        raise ValueError(
+            f"Available options for format are 'default', 'specific_columns', 'publication', or 'remove_duplicates'. The incorrect argument passed was {format=}."
+        )
+
+    if parameters == "all":
+        parametersList = ["sites"] + connectionsList
+    elif parameters in connectionsList or parameters == "sites":
+        parametersList = [parameters]
+    elif isinstance(parameters, list) and all(
+        [parameter in connectionsList + ["sites"] for parameter in parameters]
+    ):
+        parametersList = parameters
+    else:
+        allowed_parameters = "', '".join(connectionsList)
+        raise ValueError(
+            f"Parameters argument {parameters} must be one of: 'all', 'sites', '{allowed_parameters}'."
+        )
+
+    for param in parametersList:
+        if not getattr(topology, f"n_{param}"):
+            warnings.warn(
+                UserWarning(
+                    f"Topology {topology} has no {param}, so adding a None element to dictionary"
+                )
+            )
+            outDict[param] = None
+            continue
+        dataList, headers = _generate_component_lists(
+            topology, param, columnsDict.get(param)
+        )
+        # handle unyts in values
+        dataList, headers = _parse_unyts(handle_unyts, dataList, headers)
+        outDict[param] = pd.DataFrame(dict(zip(headers, dataList)))  # create dataframe
+
+    if remove_duplicatesBool:  # collapse rows that share the same typing
+        # only requested elements are in outDict; None marks an empty element
+        for param, df in outDict.items():
+            if df is None:
+                continue
+            elif param == "sites":
+                outDict[param] = _add_duplicate_indices_to_sites_dataframe(df)
+            else:
+                outDict[param] = _remove_duplicate_connections(df, param)
+
+    if format == "remove_duplicates":
+        # rebind instead of mutating in place, so empty (None) elements are skipped
+        for param, df in outDict.items():
+            if df is not None:
+                outDict[param] = df.drop_duplicates(ignore_index=True)
+
+    return outDict
+
+
+def _parse_unyts(handle_unyts, dataList, columnsList):
+    """Apply the requested unit-handling mode to the column data and headers."""
+    if handle_unyts not in ("to_headers", "with_data", "no_unyts"):
+        raise ValueError(
+            f"Supplied the argument {handle_unyts=} of {type(handle_unyts)}, but must provide one of the arguments 'to_headers', 'with_data', or 'no_unyts'."
+        )
+    if handle_unyts == "to_headers":
+        # headers must be built first, while the data still carries its units
+        columnsList = _parse_unyts_to_headers(dataList, columnsList)
+    if handle_unyts != "with_data":
+        # both remaining modes store plain floats; "with_data" leaves values as-is
+        dataList = _parse_unyts_no_unytss(dataList)
+    return dataList, columnsList
+
+
+def _parse_unyts_no_unytss(dataList) -> list:
+    for idx, column in enumerate(dataList):  # columns are replaced in place
+        if isinstance(column[0], u.unyt_array):  # first entry decides for the column
+            dataList[idx] = [float(entry) for entry in column]
+    return dataList
+
+
+def _parse_unyts_to_headers(dataList, columns) -> list:
+    """Return the headers, with " (unit)" appended where a column carries units."""
+    headers = []
+    for values, header in zip(dataList, columns):
+        first = values[0]  # its units are assumed to hold for the whole column
+        if isinstance(first, u.unyt_array):
+            header = header + f" ({str(first.units)})"
+        headers.append(header)
+    return headers
+
+
+def _generate_component_lists(topology, parameter, columns) -> tuple:
+    """Collect the column data and headers for one element type of a topology.
+
+    Every entry of `columns` is a dotted attribute path evaluated on each member
+    of `getattr(topology, parameter)`. The first value of a column decides how it
+    is expanded: dictionaries give one column per key, non-scalar unyt arrays and
+    tuples/lists give one column per component, anything else stays one column.
+
+    Returns
+    -------
+    tuple of (list, list)
+        The column data and the column headers, in matching order.
+    """
+    outList, columnsList = [], []
+    for column in columns:
+        valuesList = _recursive_getattr(topology, parameter, column)
+        first = valuesList[0]
+        if isinstance(first, dict):
+            # the keys of the first dictionary are assumed to hold for every entry
+            keys = list(first.keys())
+            outList.extend([[value[key] for value in valuesList] for key in keys])
+            columnsList.extend(keys)
+        elif isinstance(first, u.unyt_array) and not isinstance(
+            first, u.unyt_quantity
+        ):
+            outList.extend(np.array(valuesList).T)
+            if column == "position":
+                columnsList.extend(["x", "y", "z"])
+            else:
+                columnsList.extend(f"{column}-({i})" for i in range(len(first)))
+        elif isinstance(first, (tuple, list)):  # could be connection_members
+            outList.extend(np.array(valuesList).T)
+            if "connection_members" in column:
+                names = [f"{parameter} member ({i})" for i in range(len(first))]
+            else:
+                names = [f"{column}-({i})" for i in range(len(first))]
+            columnsList.extend(names)
+        else:
+            outList.append(valuesList)
+            columnsList.append(column)
+    return outList, columnsList
+
+
+def _recursive_getattr(topology, attr, attr_attr):
+    """Return `attr_attr` resolved on every member of `getattr(topology, attr)`.
+
+    `attr_attr` is a dotted path such as "atom_type.name", resolved one segment at
+    a time. Raises AttributeError, chained to the lookup failure, if one is missing.
+    """
+    path = attr_attr.split(".")
+
+    def _getattr(obj, attr1):
+        try:
+            return getattr(obj, attr1)
+        except AttributeError as err:
+            raise AttributeError(
+                f"The GMSO Topology is missing the requested attribute {attr1} from {obj}.{attr_attr}"
+            ) from err
+
+    return [functools.reduce(_getattr, [x] + path) for x in getattr(topology, attr)]
+
+
+def _add_duplicate_indices_to_sites_dataframe(df: pd.DataFrame) -> pd.DataFrame:
+    """Keep one row per "atom_type.name", listing all its indices in "Atom Indices"."""
+    names = df["atom_type.name"]  # the column that defines a duplicate site
+    # build each label once per unique name instead of rescanning for every row
+    labels = {n: ", ".join(map(str, df.index[names == n])) for n in names.unique()}
+    df["Atom Indices"] = names.map(labels)
+    return df[~names.duplicated()].reset_index()
+
+
+def _remove_duplicate_connections(df: pd.DataFrame, parameter) -> pd.DataFrame:
+    """Keep the first row for every distinct set of connection member classes.
+
+    `parameter` is one of "bonds", "angles", "dihedrals" or "impropers"; it fixes
+    how many "<connection>_type.member_classes-(i)" columns identify a duplicate.
+    """
+    member_counts = {"bonds": 2, "angles": 3, "dihedrals": 4, "impropers": 4}
+    indices = range(member_counts[parameter])
+    key_columns = [f"{parameter[:-1]}_type.member_classes-({i})" for i in indices]
+    unique_rows = df.drop_duplicates(subset=key_columns)
+    return unique_rows.reset_index(drop=True)
+
+
+def multi_topology_dataframe(topologies: list) -> dict:
+    """Combine the dataframes of an iterable of topologies, one entry per element type.
+
+    Each topology is converted with the "publication" format and the frames of each
+    element type are concatenated without "Atom Indices" and duplicate rows.
+    Element types absent from every topology are left out of the result.
+    """
+    assert isinstance(topologies, Iterable)
+    # materialize before validating so a one-shot iterator keeps its first element
+    topList = list(topologies)
+    assert topList and isinstance(topList[0], Topology)
+    dictList = [to_dataframeDict(top, format="publication") for top in topList]
+    concatDict = {}
+    for parameter in ["sites", "bonds", "angles", "dihedrals", "impropers"]:
+        dfsList = [d[parameter] for d in dictList if d.get(parameter) is not None]
+        if not dfsList:
+            continue
+        # `columns=` is required: a bare label would be looked up among the rows
+        dfout = pd.concat(
+            [df.drop(columns="Atom Indices", errors="ignore") for df in dfsList]
+        )
+        concatDict[parameter] = dfout.drop_duplicates().reset_index()
+
+    return concatDict
+
+
+def generate_topology_report(topologies: list) -> pd.DataFrame:
+    """Generate information of 2D structure and parameters for an iterable of Topologies (placeholder: no body yet, so calling it returns None)."""
diff --git a/gmso/tests/test_convert_dataframe.py b/gmso/tests/test_convert_dataframe.py
new file mode 100644
index 000000000..a0103604d
--- /dev/null
+++ b/gmso/tests/test_convert_dataframe.py
@@ -0,0 +1,153 @@
+import numpy as np
+import pytest
+import unyt as u
+
+from gmso.external.convert_dataframe import (
+    _recursive_getattr,
+    multi_topology_dataframe,
+    to_dataframeDict,
+)
+from gmso.tests.base_test import BaseTest
+from gmso.utils.io import has_pandas
+
+
+@pytest.mark.skipif(not has_pandas, reason="Pandas is not installed")
+class TestConvertDataFrame(BaseTest):
+    """Tests for converting typed topologies to dictionaries of dataframes."""
+
+    @staticmethod
+    def _check_lengths(topology, expected):
+        """Check dataframe lengths, requesting elements one by one and all at once."""
+        for parameter, length in expected.items():
+            dfDict = to_dataframeDict(topology, parameters=parameter)
+            assert len(dfDict[parameter]) == length
+        allDict = to_dataframeDict(topology, parameters="all")
+        assert [len(allDict.get(key)) for key in expected] == list(expected.values())
+
+    def test_recursive_sites(self, typed_ethane):
+        """Dotted attribute paths are resolved on every site."""
+        expected = [site.atom_type.atomclass for site in typed_ethane.sites]
+        out = _recursive_getattr(typed_ethane, "sites", "atom_type.atomclass")
+        assert list(out) == expected
+
+    def test_recursive_dihedrals(self, typed_ethane):
+        """Dotted attribute paths are resolved on every dihedral."""
+        expected = [
+            dihedral.dihedral_type.member_types for dihedral in typed_ethane.dihedrals
+        ]
+        path = "dihedral_type.member_types"
+        assert list(_recursive_getattr(typed_ethane, "dihedrals", path)) == expected
+
+    def test_to_dataframeDict(self, typed_ethane):
+        """Ethane yields one row per site, bond, angle and dihedral."""
+        expected = {
+            "sites": 8,
+            "bonds": 7,
+            "angles": 12,
+            "dihedrals": 9,
+        }
+        self._check_lengths(typed_ethane, expected)
+
+    def test_dataframe_impropers(self, benzeneTopology):
+        """Benzene additionally yields one row per improper."""
+        expected = {
+            "sites": 12,
+            "bonds": 12,
+            "angles": 18,
+            "dihedrals": 24,
+            "impropers": 6,
+        }
+        self._check_lengths(benzeneTopology, expected)
+
+    def test_dataframe_default_columns(self, typed_ethane):
+        """The default format reports the default site columns."""
+        expected_columns = {
+            "name",
+            "atom_type.name",
+            "sigma",
+            "epsilon",
+            "charge",
+            "mass",
+        }
+        dfDict = to_dataframeDict(typed_ethane, "sites", handle_unyts="no_unyts")
+        assert set(dfDict["sites"].columns) == expected_columns
+
+    def test_dataframe_specified_columns(self, typed_ethane):
+        """Only the requested columns are reported, with positions split into x, y, z."""
+        dfDict = to_dataframeDict(
+            typed_ethane,
+            "sites",
+            columns=["name", "position", "group"],
+            format="specific_columns",
+        )
+        assert list(dfDict["sites"].columns) == ["name", "x", "y", "z", "group"]
+
+    def test_dataframe_publication(self, benzeneTopology):
+        """The publication format keeps one row per unique type."""
+        dfDict = to_dataframeDict(benzeneTopology, "all", format="publication")
+        sites = dfDict["sites"]
+        assert len(sites.index) == 2
+        assert len(sites.columns) == 8
+        assert "Atom Indices" in sites.columns
+        # each unique atom type lists the six site indices that carry it
+        assert sites["Atom Indices"].loc[0] == ", ".join(map(str, np.arange(6)))
+        assert sites["Atom Indices"].loc[1] == ", ".join(map(str, np.arange(6, 12)))
+
+        expected = {
+            "bonds": 2,
+            "angles": 2,
+            "dihedrals": 3,
+            "impropers": 1,
+        }
+        for connect, check in expected.items():
+            assert len(dfDict[connect].index) == check
+
+    def test_dataframe_remove_duplicates(self, benzeneTopology):
+        """The remove_duplicates format keeps only the unique identifying columns."""
+        dfDict = to_dataframeDict(benzeneTopology, "sites", format="remove_duplicates")
+        sites = dfDict["sites"]
+        assert len(sites.index) == 2
+        assert len(sites.columns) == 1
+        assert "Atom Indices" not in sites.columns
+
+        expected = {
+            "bonds": 2,
+            "angles": 2,
+            "dihedrals": 3,
+            "impropers": 1,
+        }
+        for connect, check in expected.items():
+            dfDict = to_dataframeDict(
+                benzeneTopology, connect, format="remove_duplicates"
+            )
+            assert len(dfDict[connect].index) == check
+
+    def test_dataframe_unyts(self, typed_ethane):
+        """Charges keep their units with "with_data" and become floats with "no_unyts"."""
+        expected = {
+            "with_data": u.unyt_quantity,
+            "no_unyts": float,
+        }
+        for handle_unyts, charge_type in expected.items():
+            dfDict = to_dataframeDict(
+                typed_ethane, "all", format="publication", handle_unyts=handle_unyts
+            )
+            assert isinstance(dfDict["sites"]["charge"].loc[0], charge_type)
+
+    def test_multi_topology_dataframe(self, benzeneTopology, spce_water):
+        """Duplicate rows are dropped when the frames of several topologies are merged."""
+        dfDict = multi_topology_dataframe(
+            [benzeneTopology, spce_water, benzeneTopology]
+        )
+        expected = {
+            "sites": 4,
+            "bonds": 3,
+            "angles": 3,
+            "dihedrals": 3,
+            "impropers": 1,
+        }
+        for connect, check in expected.items():
+            df = dfDict.get(connect)
+            # a missing dataframe can never match an integer row count
+            assert df is not None
+            assert len(df.index) == check
diff --git a/gmso/tests/test_topology.py b/gmso/tests/test_topology.py
index 49e246b76..8fd8ed634 100644
--- a/gmso/tests/test_topology.py
+++ b/gmso/tests/test_topology.py
@@ -21,7 +21,7 @@
 from gmso.exceptions import GMSOError
 from gmso.external.convert_parmed import from_parmed
 from gmso.tests.base_test import BaseTest
-from gmso.utils.io import get_fn, has_pandas, has_parmed, import_
+from gmso.utils.io import get_fn, has_parmed, import_
 from gmso.utils.units import GMSO_UnitRegistry as UnitReg
 
 if has_parmed:
@@ -715,92 +715,6 @@ def test_topology_set_scaling_factors_none(self):
         with pytest.raises(ValueError):
             top.set_scaling_factors(None, None)
 
-    @pytest.mark.skipif(not has_pandas, reason="Pandas is not installed")
-    def test_to_dataframe(self, typed_ethane):
-        assert len(typed_ethane.to_dataframe()) == 8
-        assert len(typed_ethane.to_dataframe(parameter="bonds")) == 7
-        assert len(typed_ethane.to_dataframe(parameter="angles")) == 12
-        assert len(typed_ethane.to_dataframe(parameter="dihedrals")) == 9
-        assert np.isclose(
-            float(
-                typed_ethane.to_dataframe(site_attrs=["charge", "position"])[
-                    "charge (e)"
-                ][0]
-            ),
-            typed_ethane.sites[0]
-            .charge.in_units(
-                u.Unit("elementary_charge", registry=UnitReg.default_reg())
-            )
-            .to_value(),
-        )
-        assert (
-            typed_ethane.to_dataframe(site_attrs=["atom_type.name"])["atom_type.name"][
-                0
-            ]
-            == "opls_135"
-        )
-        assert np.allclose(
-            float(typed_ethane.to_dataframe(site_attrs=["charge", "position"])["x"][0]),
-            0,
-        )
-        assert np.allclose(
-            float(
-                typed_ethane.to_dataframe(
-                    parameter="bonds", site_attrs=["charge", "position"]
-                )["charge Atom0 (e)"][0]
-            ),
-            typed_ethane.bonds[0]
-            .connection_members[0]
-            .charge.in_units(
-                u.Unit("elementary_charge", registry=UnitReg.default_reg())
-            )
-            .to_value(),
-        )
-        with pytest.raises(AttributeError) as e:
-            typed_ethane.to_dataframe(site_attrs=["missingattr"])
-        assert str(e.value) == "The attribute missingattr is not in this gmso object."
-        with pytest.raises(AttributeError) as e:
-            typed_ethane.to_dataframe(site_attrs=["missingattr.missingattr"])
-        assert (
-            str(e.value)
-            == "The attribute missingattr.missingattr is not in this gmso object."
-        )
-        with pytest.raises(AttributeError) as e:
-            typed_ethane.to_dataframe(site_attrs=["missingattr.attr"])
-        assert (
-            str(e.value) == "The attribute missingattr.attr is not in this gmso object."
-        )
-        with pytest.raises(AttributeError) as e:
-            typed_ethane.to_dataframe(parameter="bonds", site_attrs=["missingattr"])
-        assert str(e.value) == "The attribute missingattr is not in this gmso object."
-        with pytest.raises(AttributeError) as e:
-            typed_ethane.to_dataframe(
-                parameter="bonds", site_attrs=["missingattr.attr"]
-            )
-        assert (
-            str(e.value) == "The attribute missingattr.attr is not in this gmso object."
-        )
-        with pytest.raises(GMSOError) as e:
-            top = Topology()
-            top.to_dataframe(parameter="bonds")
-            assert (
-                str(e.value)
-                == "There arent any bonds in the topology. The dataframe would be empty."
-            )
-
-    @pytest.mark.skipif(not has_pandas, reason="Pandas is not installed")
-    def test_pandas_from_parameters(self, typed_ethane):
-        pd = import_("pandas")
-        df = pd.DataFrame()
-        assert np.allclose(
-            float(
-                typed_ethane._pandas_from_parameters(df, "bonds", ["positions"])[
-                    "x Atom1 (nm)"
-                ][6]
-            ),
-            -0.03570001,
-        )
-
     def test_is_typed_check(self, typed_chloroethanol):
         groups = [
             "sites",

From b364e48c43ff478a795da16cf2491f83fbb761c3 Mon Sep 17 00:00:00 2001
From: CalCraven <nicholas.c.craven@vanderbilt.edu>
Date: Sun, 16 Jun 2024 08:57:49 -0500
Subject: [PATCH 2/2] Fix bug with finding hoomd minor version when checking
 for gaff PeriodicImpropers

---
 gmso/external/convert_hoomd.py | 2 +-
 gmso/tests/test_hoomd.py       | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/gmso/external/convert_hoomd.py b/gmso/external/convert_hoomd.py
index 6779dafef..59a40db53 100644
--- a/gmso/external/convert_hoomd.py
+++ b/gmso/external/convert_hoomd.py
@@ -1275,7 +1275,7 @@ def _parse_improper_forces(
             base_units,
         )
 
-    if int(hoomd_version[0]) >= 4 and int(hoomd_version[1]) >= 5:
+    if int(hoomd_version[0]) + float(hoomd_version[1]) * 0.1 >= 4.5:
         itype_group_map = {
             "HarmonicImproperPotential": {
                 "container": hoomd.md.improper.Harmonic,
diff --git a/gmso/tests/test_hoomd.py b/gmso/tests/test_hoomd.py
index f3032871e..5b2c74082 100644
--- a/gmso/tests/test_hoomd.py
+++ b/gmso/tests/test_hoomd.py
@@ -388,7 +388,8 @@ def test_zero_charges(self):
     @pytest.mark.skipif(not has_hoomd, reason="hoomd is not installed")
     @pytest.mark.skipif(not has_mbuild, reason="mbuild not installed")
     @pytest.mark.skipif(
-        int(hoomd_version[0]) <= 3.8, reason="Deprecated features in HOOMD 4"
+        int(hoomd_version[0]) + float(hoomd_version[1]) * 0.1 < 4.5,
+        reason="Feature added in HOOMD 4.5",
     )
     def test_gaff_sim(self, gaff_forcefield):
         base_units = {