Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ dependencies:
- openff-units
- pip
- tqdm
- prolif

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you also add the dependency to the pyproject.toml?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes can do :)

- pyyaml
# for testing
- coverage
Expand Down
306 changes: 306 additions & 0 deletions src/openfe_analysis/prolif.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,306 @@
from __future__ import annotations

import numpy as np
from typing import Any, Dict, Optional, Sequence, Tuple, Literal
import warnings

import MDAnalysis as mda
from MDAnalysis.guesser.tables import vdwradii as MDA_VDWRADII

import prolif as plf

from .utils.plotting import plot_prolif_3d, plot_prolif_lignetwork


class ProLIFAnalysis:
"""
ProLIF interaction fingerprint analysis for an OpenFEReader Universe.
"""

def __init__(
self,
universe: mda.Universe,
ligand_ag: mda.AtomGroup,
water_order: int = 3,
protein_cutoff: float = 12.0,
water_cutoff: float = 8.0,
interactions: Optional[Sequence[str] | str] = None,
guess_bonds: bool = True,
vdwradii: Optional[Dict[str, float]] = None,
) -> None:
"""

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the other analysis classes we ended up going with the doc string style of putting the doc string at the function level instead of having it in init. Could you maybe change that here as well?

Initialize the ProLIF analysis.

Parameters
----------
universe
MDAnalysis Universe containing topology and trajectory.
ligand_ag
mda.AtomGroup representing the ligand.
water_order
Maximum WaterBridge interaction order (water-water interaction).
Only used if "WaterBridge" is tracked.
protein_cutoff
Distance cutoff in angstrom used to define the protein pocket
around the ligand.
water_cutoff
Distance cutoff in angstrom used to define waters considered
around the ligand/protein pocket.
interactions
Which interactions to track:
- None: ProLIF defaults

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you list the current defaults here or link to the repo?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will add the link to the defaults of ProLIF and also a test case to see that those are captured

- "all": all registered (non-bridged; depends on ProLIF version)
- Sequence[str]: explicit list like ["VdWContact", "HBDonor"]
guess_bonds
If True, guess bonds for (protein, ligand, water) so ProLIF can
recognize donors/acceptors and bonded hydrogens.
vdwradii
Optional dict of van der Waals radii used by MDAnalysis bond guesser.
Useful when your topology contains types the guesser doesn't know
(e.g. "Cl", "Na"). If None, uses coded defaults.
"""
self.universe = universe
self.ligand_ag = ligand_ag
self.water_order = water_order

self.frames: Optional[np.ndarray] = None
self.times: Optional[np.ndarray] = None
self.n_frames: Optional[int] = None
self.ifp_df = None

# --- Guess bonds once on stable selections so RDKit/ProLIF can detect HBonds ---
if guess_bonds:

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a lot happening in init here, maybe it would make sense to put some of these into private functions (e.g. to guess bonds, to build the fingerprints) so that they are easier to test separately and for a better overview of the different things that are going on.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agree, can split it up a little bit, since it is quite crowded currently

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@talagayev i created a small utils function for the guessing of bonds here :

as part of this PR #92
Please let me know what you think about it!

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hannahbaumann looks good to me, will adjust the code to import it from there :)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is merged now, so feel to try out the utils function here.

if vdwradii is None:
vdwradii = dict(MDA_VDWRADII)
vdwradii.update(
{
"Cl": vdwradii["CL"],
"Br": vdwradii["BR"],
"Na": vdwradii["NA"],
}
)

# Protein: guess on the full protein so any pocket residue later has bonds
universe.select_atoms("protein").guess_bonds(vdwradii=vdwradii)

# Ligand: stable group
self.ligand_ag.guess_bonds(vdwradii=vdwradii)

# Water: only if you care about water-mediated interactions
wat_all = universe.select_atoms("water")
if wat_all.n_atoms:
wat_all.guess_bonds(vdwradii=vdwradii)

self.protein_ag = self.universe.select_atoms(
f"protein and byres around {protein_cutoff} group ligand",
ligand=self.ligand_ag,
updating=True,
)
self.water_ag = self.universe.select_atoms(
f"water and byres around {water_cutoff} (group ligand or group pocket)",
ligand=self.ligand_ag,
pocket=self.protein_ag,
updating=True,
)

available = plf.Fingerprint.list_available(show_bridged=True)

fp_interactions: Optional[list[str] | str]

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're not consistent with this in this repo yet, but I think it would be great to go with the new syntax in new PRs for the type hints (e.g. list[str] | str | None), here and elsewhere.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup yup sounds good 🙂

if interactions is None:
fp_interactions = None

elif interactions == "all":
fp_interactions = "all"

else:
# Cover case of false interaction
missing = [i for i in interactions if i not in available]
if missing:
raise ValueError(

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like ProLIF would also raise an error for missing interactions, so this may not be necessary.
_check_valid_interactions

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or just a test for this ValueError would be great to see if it gets raised.

f"Unknown interaction(s): {missing}. " f"Available: {available}"
)
fp_interactions = list(interactions)

self._parameters = None
if (
fp_interactions is not None
and fp_interactions != "all"

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Check if water interactions are part of "all" in ProLIF.

and "WaterBridge" in fp_interactions
):
if self.water_ag.n_atoms == 0:
warnings.warn(
"WaterBridge selected but water selection is empty at the initial "
"frame; removing WaterBridge from the requested interactions.",
UserWarning,
stacklevel=2,
)
fp_interactions = [
interaction
for interaction in fp_interactions
if interaction != "WaterBridge"
]
else:
self._parameters = {
"WaterBridge": {"water": self.water_ag, "order": self.water_order}
}

if not fp_interactions:
self.fp = plf.Fingerprint(parameters=self._parameters)
else:
self.fp = plf.Fingerprint(
interactions=fp_interactions,
parameters=self._parameters,
)

def run(
Comment thread
hannahbaumann marked this conversation as resolved.
self,
*,
start: Optional[int] = None,
stop: Optional[int] = None,
step: Optional[int] = None,
residues: Optional[Literal["all"] | Sequence[str | int]] = None,
progress: bool = True,
n_jobs: Optional[int] = None,
parallel_strategy: Optional[Literal["chunk", "queue"]] = None,
converter_kwargs: Optional[Tuple[Dict[str, Any], Dict[str, Any]]] = None,
) -> "ProLIFAnalysis":
"""
Run the fingerprint calculation over a slice of the trajectory.

Parameters
----------
start, stop, step
Trajectory slicing parameters.
residues
Passed to ProLIF: ``"all"`` to track every residue, or an explicit
sequence of residue identifiers. If None, ProLIF's default is used
and interactions with atoms are identified.
progress
Show progress bar.
n_jobs
Number of workers for parallel execution.).
parallel_strategy
ProLIF parallel strategy. If None, this wrapper sets:
- "chunk" for n_jobs None/1
- "queue" for n_jobs > 1
converter_kwargs
Two dicts: (ligand_kwargs, protein_kwargs) forwarded to the MDAnalysis→RDKit
converter. If None, we default to:
- ligand: {"inferrer": None, "implicit_hydrogens": False} (avoid valence issues)
- protein: {"implicit_hydrogens": False} (use topology bonds)

Returns
-------
self
Returned for fluent chaining.
"""
# Due to FEReader trajectory only certain strategies work with the format
if parallel_strategy is None:
# avoid ProLIF trying to pickle FEReader/netCDF trajectory to auto-pick strategy
parallel_strategy = "chunk" if (n_jobs is None or n_jobs == 1) else "queue"

_slice = slice(start, stop, step)
traj = self.universe.trajectory[_slice]

try:
n_total = len(self.universe.trajectory)
s0, s1, s2 = _slice.indices(n_total)
self.frames = np.arange(s0, s1, s2, dtype=int)
self.n_frames = len(traj)

if (
hasattr(self.universe.trajectory, "times")
and self.universe.trajectory.times is not None
):
self.times = np.asarray(self.universe.trajectory.times)[self.frames]
elif getattr(self.universe.trajectory, "dt", None) is not None:
self.times = self.frames * self.universe.trajectory.dt
else:
self.times = None
except Exception:
self.frames = None
self.times = None
self.n_frames = None

if converter_kwargs is None:
# Avoid Valence errors
converter_kwargs = (
{"inferrer": None, "implicit_hydrogens": False}, # ligand
{"implicit_hydrogens": False}, # protein
)

self.fp.run(
traj,
self.ligand_ag,
self.protein_ag,
residues=residues,
converter_kwargs=converter_kwargs,
progress=progress,
n_jobs=n_jobs,
parallel_strategy=parallel_strategy,
)

return self

@property
def ifp(self):
"""
Convenience accessor for underlying ProLIF fingerprint results.
"""
return getattr(self.fp, "ifp", None)

# For now, depending on what we do withe the data
def to_dataframe(self, **kwargs):
"""
Transform fingerprint results to pd.DataFrame.
"""
df = self.fp.to_dataframe(**kwargs)
self.ifp_df = df

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if it's necessary to store it here.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hm seing the dataframe and with the dataframe being also kinda a quite utils style representation of results I am thinking if it makes sense if we would have other cases where we would want to show results in dataframes just as an utils function accesible for all other functions 🤔

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We will remove the part of storing to self since it could be modified by calling to_dataframe with different kwargs.

return df

def plot_lignetwork(self, ligand_mol=None, **kwargs):
"""
2D ProLIF ligand-network visualization.
"""
return plot_prolif_lignetwork(self, ligand_mol, **kwargs)

plot_2d = plot_lignetwork

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we could also rename the function above to plot_2d or similar and remove the alias here?


def plot_barcode(

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we also move these plotting functions into the plotting script?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah yes my mistake, think I had a different correct version locally, need to try to find it and can the commit and push it, my bad🙈

self,
*,
figsize: tuple[int, int] = (8, 10),
dpi: int = 100,
interactive: bool = True,
n_frame_ticks: int = 10,
residues_tick_location: Literal["top", "bottom"] = "top",
xlabel: str = "Frame",
subplots_kwargs: Optional[dict] = None,
tight_layout_kwargs: Optional[dict] = None,
):
"""
Barcode plot of interactions across frames.
"""
if not self.ifp:
raise RuntimeError(
"No ProLIF fingerprint data found. Run `analysis.run(...)` first."
)

return self.fp.plot_barcode(
figsize=figsize,
dpi=dpi,
interactive=interactive,
n_frame_ticks=n_frame_ticks,
residues_tick_location=residues_tick_location,
xlabel=xlabel,
subplots_kwargs=subplots_kwargs,
tight_layout_kwargs=tight_layout_kwargs,
)

def plot_3d(self, ligand_mol=None, protein_mol=None, water_mol=None, **kwargs):
"""
3D ProLIF interaction visualization using py3Dmol.
"""
return plot_prolif_3d(
self, ligand_mol, protein_mol, water_mol=water_mol, **kwargs
)
Loading
Loading