From 639fbbe81e637d22c690655c8decbc19a391d817 Mon Sep 17 00:00:00 2001
From: Josh Horton <Josh.Horton@newcastle.ac.uk>
Date: Fri, 16 Jan 2026 10:40:05 +0000
Subject: [PATCH 01/17] add compare and rank method for FEMaps

---
 cinnabar/compare.py            | 275 +++++++++++++++++++++++++++++++++
 cinnabar/stats.py              | 160 ++++++++++++++++---
 cinnabar/tests/test_compare.py |  48 ++++++
 3 files changed, 463 insertions(+), 20 deletions(-)
 create mode 100644 cinnabar/compare.py
 create mode 100644 cinnabar/tests/test_compare.py

diff --git a/cinnabar/compare.py b/cinnabar/compare.py
new file mode 100644
index 00000000..90e85ed0
--- /dev/null
+++ b/cinnabar/compare.py
@@ -0,0 +1,275 @@
+import string
+
+import numpy as np
+
+from cinnabar import FEMap
+from typing import Literal
+import pandas as pd
+from collections import defaultdict
+from cinnabar.stats import AVAILABLE_STATS
+import itertools
+
+
+
+def compare_and_rank_femaps(
+    femaps: list[FEMap],
+    labels: list[str],
+    prediction_type: Literal["nodewise", "edgewise", "pairwise"] = "pairwise",
+    rank_metric: Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU"] = "MUE",
+    metrics_to_compute: list[Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU"]] | None = None,
+    num_bootstraps: int = 1000,
+    confidence_level: float = 0.95,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """
+    Compare and rank multiple FEMaps based on the chosen performance metric and return an ordered table of
+    results using the compact letter display (CLD) system.
+
+    Parameters
+    ----------
+    femaps : list[FEMap]
+        A list of FEMap instances to compare.
+    labels : list[str]
+        A list of labels corresponding to each FEMap used in the output tables.
+    prediction_type: Literal["nodewise", "edgewise", "pairwise"], optional
+        The type of prediction to evaluate, by default "pairwise".
+    rank_metric : Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU"], optional
+        The metric used to rank the models, by default "MUE".
+    metrics_to_compute : list[Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU"]], optional
+        A list of metrics to compute for each model. If None, all metrics appropriate for the `prediction_type` will be computed.
+    num_bootstraps : int, optional
+        The number of bootstrap samples to use for estimating confidence intervals, by default 1000.
+    confidence_level : float, optional
+        The confidence level for the intervals, by default 0.95.
+
+    Note
+    ----
+    - We assume that all FEMaps have been evaluated on the same test set.
+    - Prediction types "nodewise", "edgewise", and "pairwise" correspond to DGs, edgewise DDGs and pairwise DDGs (back calculated from nodewise DGs) respectively.
+    - When we have more than 2 models, we apply multiple testing correction to the pairwise comparisons using the ``Holm`` method by default.
+
+    Returns
+    -------
+    tuple[pd.DataFrame, pd.DataFrame]
+        A tuple containing two DataFrames:
+        - The first DataFrame contains the computed metrics for each model.
+        - The second DataFrame contains the pairwise comparison results between models.
+    """
+
+
+    # get the predictions and experimental values from the FEMaps and align them into a DataFrame
+    predictions_by_key = defaultdict(dict)
+    for label, femap in zip(labels, femaps):
+        graph = femap.to_legacy_graph()
+        if prediction_type == "nodewise":
+            #TODO  should we expose a shift or centering option here?
+            for node, data in graph.nodes(data=True):
+                predictions_by_key[node][f"{label}_calc"] = data["calc_DG"]
+                if "exp_DG" not in predictions_by_key[node]:
+                    predictions_by_key[node]["exp"] = data["exp_DG"]
+        elif prediction_type == "edgewise":
+            for a, b, data in graph.edges(data=True):
+                # we assume all edges have been run in the same direction
+                key = (a, b)
+                predictions_by_key[key][f"{label}_calc"] = data["calc_DDG"]
+                if "exp_DDG" not in predictions_by_key[key]:
+                    predictions_by_key[key]["exp"] = data["exp_DDG"]
+        elif prediction_type == "pairwise":
+            nodes = list(graph.nodes())
+            for a, b in itertools.combinations(nodes, 2):
+                exp = graph.nodes[b]["exp_DG"] - graph.nodes[a]["exp_DG"]
+                calc = graph.nodes[b]["calc_DG"] - graph.nodes[a]["calc_DG"]
+                key = (a, b)
+                predictions_by_key[key][f"{label}_calc"] = calc
+                if "exp" not in predictions_by_key[key]:
+                    predictions_by_key[key]["exp"] = exp
+        else:
+            raise ValueError(f"Invalid prediction_type: {prediction_type}")
+
+    predictions_df = pd.DataFrame(predictions_by_key.values())
+
+    # set metrics to compute based on best practices if not provided
+    if metrics_to_compute is None:
+        if prediction_type in ["pairwise", "edgewise"]:
+            metrics_to_compute = ["MUE", "RMSE"]
+        elif prediction_type == "nodewise":
+            metrics_to_compute = ["MUE", "RMSE", "RAE", "R2", "rho", "KTAU"]
+
+    # check we can compute all requested metrics
+    for metric in metrics_to_compute:
+        if metric not in AVAILABLE_STATS:
+            raise ValueError(f"Metric {metric} is not available.")
+
+    metrics_by_label =  dict((label, dict((metric, []) for metric in metrics_to_compute)) for label in labels)
+    pairwise_metrics = {}
+    # compute bootstrap metrics for each model using the same bootstrap samples, joint bootstrap
+    for _ in range(num_bootstraps):
+        bootstrap_sample = predictions_df.sample(frac=1.0, replace=True)
+        for label in labels:
+            y_true = bootstrap_sample["exp"].values
+            y_pred = bootstrap_sample[f"{label}_calc"].values
+            for metric in metrics_to_compute:
+                value = AVAILABLE_STATS[metric](y_true, y_pred)
+                metrics_by_label[label][metric].append(value)
+
+    # compute pairwise differences for ranking metric
+    for i, label_i in enumerate(labels):
+        for j, label_j in enumerate(labels):
+            if j <= i:
+                continue
+            diffs = np.array(metrics_by_label[label_i][rank_metric]) - np.array(metrics_by_label[label_j][rank_metric])
+            pairwise_metrics[(label_i, label_j)] = diffs
+
+    # summarize all metrics
+    summary_data = []
+    for label in labels:
+        row = {"Model": label}
+        for metric in metrics_to_compute:
+            # compute the sample metric
+            x = predictions_df[f"{label}_calc"].values
+            y = predictions_df["exp"].values
+            sample_value = AVAILABLE_STATS[metric](y, x)
+            bootstrap_values = np.array(metrics_by_label[label][metric])
+            lower = np.percentile(bootstrap_values, (1 - confidence_level) / 2 * 100)
+            upper = np.percentile(bootstrap_values, (1 + confidence_level) / 2 * 100)
+            row[f"{metric}"] = sample_value
+            row[f"{metric}_CI_Lower"] = lower
+            row[f"{metric}_CI_Upper"] = upper
+        summary_data.append(row)
+    summary_df = pd.DataFrame(summary_data)
+
+    # summarize pairwise comparisons with corrected p-values
+    comparison_data = []
+    for (label_i, label_j), diffs in pairwise_metrics.items():
+        lower = np.percentile(diffs, (1 - confidence_level) / 2 * 100)
+        upper = np.percentile(diffs, (1 + confidence_level) / 2 * 100)
+        # calculate the p-value as the fraction of bootstrap samples that cross zero
+        # use a 2-tailed test
+        # inspired by https://www.nature.com/articles/s42004-025-01428-y
+        p_value = 2 * min(np.mean(diffs < 0), np.mean(diffs > 0))
+        comparison_data.append({
+            "Model 1": label_i,
+            "Model 2": label_j,
+            f"Diff in {rank_metric}": summary_df[summary_df["Model"] == label_i][rank_metric].values[0] - summary_df[summary_df["Model"] == label_j][rank_metric].values[0],
+            "CI Lower": lower,
+            "CI Upper": upper,
+            "p-value": p_value,
+            "significant": p_value < 0.05
+        })
+    comparison_df = pd.DataFrame(comparison_data)
+
+    # if we have more than 2 models, apply multiple testing correction
+    if len(labels) > 2:
+        from statsmodels.stats.multitest import multipletests
+        p_values = comparison_df["p-value"].values
+        reject, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method="holm")
+        comparison_df["p-value corrected"] = pvals_corrected
+        # add corrected significance
+        comparison_df["significant"] = pvals_corrected < 0.05
+
+    # rank the models and apply the CLD
+    # the order depends on whether lower or higher is better for the rank metric
+    if rank_metric in ["MUE", "RMSE", "RAE"]:
+        ascending = True  # lower is better
+    else:
+        ascending = False  # higher is better
+    ordered_labels = summary_df.sort_values(by=rank_metric, ascending=ascending).Model.tolist()
+    cld_assignment = _apply_cld(comparison_df, ordered_labels)
+    summary_df["CLD"] = summary_df["Model"].map(cld_assignment)
+
+    return summary_df, comparison_df
+
+
+def _build_significance_lookup(pairwise_df: pd.DataFrame):
+    """
+    Build a lookup dictionary for significance between model pairs.
+
+    Parameters
+    ----------
+    pairwise_df : pd.DataFrame
+        DataFrame containing pairwise comparison results with a 'significant' column.
+
+    Returns
+    -------
+    dict[frozenset, bool]
+        A dictionary mapping frozensets of model pairs to their significance (True if significant, False otherwise).
+    """
+    sig = {}
+    for _, row in pairwise_df.iterrows():
+        m1, m2 = row["Model 1"], row["Model 2"]
+        sig[frozenset((m1, m2))] = row.significant
+    return sig
+
+
+def _apply_cld(comparison_df: pd.DataFrame, ordered_labels: list[str]) -> dict[str, str]:
+    """
+    Apply the Compact Letter Display (CLD) system to the pairwise comparison results using the insert-absorb method.
+
+    Parameters
+    ----------
+    comparison_df : pd.DataFrame
+        DataFrame containing pairwise comparison results with a 'significant' column.
+    ordered_labels : list[str]
+        List of model labels sorted by metric performance (best to worst).
+
+    Returns
+    -------
+    dict[str, str]
+        A dictionary mapping each model label to its assigned CLD letters.
+    """
+
+    sig = _build_significance_lookup(comparison_df)
+
+    # Each letter is represented as a set of methods which are not significantly different
+    letters = []
+
+    def is_compatible(meth, let):
+        """Check if method can join this letter set"""
+        for other in let:
+            if sig.get(frozenset((meth, other)), False):
+                return False
+        return True
+
+    def absorb(lets):
+        """Remove letters that are strict subsets of others"""
+        absorbed = []
+        for i, a in enumerate(lets):
+            redundant = False
+            for j, b in enumerate(lets):
+                if i != j and a < b:  # a is a strict subset of b
+                    redundant = True
+                    break
+            if not redundant:
+                absorbed.append(a)
+        return absorbed
+
+    # For each method in order, try to insert into existing letters or create new letter
+    for method in ordered_labels:
+        inserted = False
+
+        # Try inserting into existing letters
+        for letter in letters:
+            if is_compatible(method, letter):
+                letter.add(method)
+                inserted = True
+
+        # If no insertion possible, create new letter
+        if not inserted:
+            letters.append({method})
+
+        # Absorb step
+        letters = absorb(letters)
+
+    # Convert to method → string mapping
+    alphabet = string.ascii_lowercase
+    if len(letters) > len(alphabet):
+        raise ValueError("Too many letters required for CLD")
+
+    result = {m: "" for m in ordered_labels}
+    for i, letter in enumerate(letters):
+        for m in letter:
+            result[m] += alphabet[i]
+
+    return result
+
+
+
diff --git a/cinnabar/stats.py b/cinnabar/stats.py
index 7a864c3c..6aa1e216 100644
--- a/cinnabar/stats.py
+++ b/cinnabar/stats.py
@@ -15,6 +15,144 @@
 )
 
 
+def calculate_rmse(
+    y_true: np.ndarray,
+    y_pred: np.ndarray,
+) -> float:
+    """Compute root mean squared error between true and predicted values.
+
+    Parameters
+    ----------
+    y_true : ndarray with shape (N,)
+        True values
+    y_pred : ndarray with shape (N,)
+        Predicted values
+
+    Returns
+    -------
+    rmse : float
+        RMSE between true and predicted values
+    """
+    return np.sqrt(sklearn.metrics.mean_squared_error(y_true, y_pred))
+
+
+def calculate_mue(
+    y_true: np.ndarray,
+    y_pred: np.ndarray,
+) -> float:
+    """Compute mean unsigned error between true and predicted values.
+
+    Parameters
+    ----------
+    y_true : ndarray with shape (N,)
+        True values
+    y_pred : ndarray with shape (N,)
+        Predicted values
+
+    Returns
+    -------
+    mue : float
+        MUE between true and predicted values
+    """
+    return sklearn.metrics.mean_absolute_error(y_true, y_pred)
+
+
+def calculate_rae(
+    y_true: np.ndarray,
+    y_pred: np.ndarray,
+) -> float:
+    """Compute relative absolute error between true and predicted values.
+
+    Parameters
+    ----------
+    y_true : ndarray with shape (N,)
+        True values
+    y_pred : ndarray with shape (N,)
+        Predicted values
+
+    Returns
+    -------
+    rae : float
+        RAE between true and predicted values
+    """
+    MAE = sklearn.metrics.mean_absolute_error(y_true, y_pred)
+    mean = np.mean(y_true)
+    MAD = np.sum([np.abs(mean - i) for i in y_true]) / float(len(y_true))
+    return MAE / MAD
+
+
+def calculate_r2(
+    y_true: np.ndarray,
+    y_pred: np.ndarray,
+) -> float:
+    """Compute R^2 between true and predicted values.
+
+    Parameters
+    ----------
+    y_true : ndarray with shape (N,)
+        True values
+    y_pred : ndarray with shape (N,)
+        Predicted values
+
+    Returns
+    -------
+    r2 : float
+        R^2 between true and predicted values
+    """
+    _, _, r_value, _, _ = scipy.stats.linregress(y_true, y_pred)
+    return r_value**2
+
+
+def calculate_pearson_r(
+    y_true: np.ndarray,
+    y_pred: np.ndarray,
+) -> float:
+    """Compute Pearson's r between true and predicted values.
+
+    Parameters
+    ----------
+    y_true : ndarray with shape (N,)
+        True values
+    y_pred : ndarray with shape (N,)
+        Predicted values
+
+    Returns
+    -------
+    r : float
+        Pearson's r between true and predicted values
+    """
+    return scipy.stats.pearsonr(y_true, y_pred)[0]
+
+
+def calculate_kendalls_tau(
+    y_true: np.ndarray,
+    y_pred: np.ndarray,
+) -> float:
+    """Compute Kendall's tau between true and predicted values.
+
+    Parameters
+    ----------
+    y_true : ndarray with shape (N,)
+        True values
+    y_pred : ndarray with shape (N,)
+        Predicted values
+
+    Returns
+    -------
+    tau : float
+        Kendall's tau between true and predicted values
+    """
+    return scipy.stats.kendalltau(y_true, y_pred)[0]
+
+AVAILABLE_STATS = {
+    "RMSE": calculate_rmse,
+    "MUE": calculate_mue,
+    "RAE": calculate_rae,
+    "R2": calculate_r2,
+    "rho": calculate_pearson_r,
+    "KTAU": calculate_kendalls_tau,
+}
+
 def bootstrap_statistic(
     y_true: np.ndarray,
     y_pred: np.ndarray,
@@ -75,12 +213,6 @@ def compute_statistic(y_true_sample: np.ndarray, y_pred_sample: np.ndarray, stat
 
         """
 
-        def calc_RAE(y_true_sample: np.ndarray, y_pred_sample: np.ndarray):
-            MAE = sklearn.metrics.mean_absolute_error(y_true_sample, y_pred_sample)
-            mean = np.mean(y_true_sample)
-            MAD = np.sum([np.abs(mean - i) for i in y_true_sample]) / float(len(y_true_sample))
-            return MAE / MAD
-
         def calc_RRMSE(y_true_sample: np.ndarray, y_pred_sample: np.ndarray):
             rmse = np.sqrt(sklearn.metrics.mean_squared_error(y_true_sample, y_pred_sample))
             mean_exp = np.mean(y_true_sample)
@@ -88,21 +220,9 @@ def calc_RRMSE(y_true_sample: np.ndarray, y_pred_sample: np.ndarray):
             rrmse = np.sqrt(rmse**2 / mds)
             return rrmse
 
-        if statistic == "RMSE":
-            return np.sqrt(sklearn.metrics.mean_squared_error(y_true_sample, y_pred_sample))
-        elif statistic == "MUE":
-            return sklearn.metrics.mean_absolute_error(y_true_sample, y_pred_sample)
-        elif statistic == "R2":
-            slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(y_true_sample, y_pred_sample)
-            return r_value**2
-        elif statistic == "rho":
-            return scipy.stats.pearsonr(y_true_sample, y_pred_sample)[0]
-        elif statistic == "RAE":
-            return calc_RAE(y_true_sample, y_pred_sample)
-        elif statistic == "KTAU":
-            return scipy.stats.kendalltau(y_true_sample, y_pred_sample)[0]
-        else:
+        if statistic not in AVAILABLE_STATS:
             raise Exception("unknown statistic '{}'".format(statistic))
+        return AVAILABLE_STATS[statistic](y_true_sample, y_pred_sample)
 
     # not used?
     def unique_differences(x):
diff --git a/cinnabar/tests/test_compare.py b/cinnabar/tests/test_compare.py
new file mode 100644
index 00000000..866f1733
--- /dev/null
+++ b/cinnabar/tests/test_compare.py
@@ -0,0 +1,48 @@
+from cinnabar.compare import compare_and_rank_femaps
+import networkx as nx
+import numpy as np
+from cinnabar import FEMap
+
+
+def test_compare_and_rank_femaps(fe_map):
+    graph1 = nx.MultiDiGraph()
+    graph2 = nx.MultiDiGraph()
+    for a, b, data in fe_map.to_networkx().edges(data=True):
+        new_data = data.copy()
+        if data["source"] != "reverse" and data["computational"]:
+            # add noise to the result
+            new_result = data["DG"] + np.random.normal(0, data["uncertainty"].m) * data["DG"].u
+            new_data["DG"] = new_result
+            graph1.add_edge(a, b, **new_data)
+            # and add the reverse edge
+            rev_data = new_data.copy()
+            rev_data["source"] = "reverse"
+            rev_data["DG"] = -new_data["DG"]
+            graph1.add_edge(b, a, **rev_data)
+            # add a large value to the second graph to simulate a bad prediction
+            new_result2 = data["DG"] + 1.5 * data["DG"].u
+            new_data["DG"] = new_result2
+            graph2.add_edge(a, b, **new_data)
+            # add the reverse edge
+            rev_data2 = new_data.copy()
+            rev_data2["source"] = "reverse"
+            rev_data2["DG"] = -new_data["DG"]
+            graph2.add_edge(b, a, **rev_data2)
+
+        else:
+            graph1.add_edge(a, b, **data)
+            graph2.add_edge(a, b, **data)
+    fe_map_2 = FEMap.from_networkx(graph1)
+    fe_map_3 = FEMap.from_networkx(graph2)
+
+    t1, t2 = compare_and_rank_femaps([fe_map, fe_map_2, fe_map_3], ["FE Map 1", "FE Map 2", "FE Map 3"], prediction_type="nodewise", rank_metric="rho")
+    print(t1)
+    print(t2)
+    # check that FE Map 3 is ranked worst
+    assert t1[t1["Model"] == "FE Map 3"]["CLD"].values[0] == "b"
+    # check that FE Map 1 and FE Map 2 are ranked better
+    assert t1[t1["Model"] == "FE Map 1"]["CLD"].values[0] == "a"
+    assert t1[t1["Model"] == "FE Map 2"]["CLD"].values[0] == "a"
+    # check that the comparison table has all three models and corrected p-values
+    assert len(t2) == 3
+    assert "p-value corrected" in t2.columns

From 1ab43807334cfaaff0d338d6110c3e96b40b8f48 Mon Sep 17 00:00:00 2001
From: Josh Horton <Josh.Horton@newcastle.ac.uk>
Date: Fri, 16 Jan 2026 11:13:01 +0000
Subject: [PATCH 02/17] fix pairwise comparisons, remove prints

---
 cinnabar/compare.py            | 3 ++-
 cinnabar/tests/test_compare.py | 3 +--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/cinnabar/compare.py b/cinnabar/compare.py
index 90e85ed0..b770e500 100644
--- a/cinnabar/compare.py
+++ b/cinnabar/compare.py
@@ -74,7 +74,8 @@ def compare_and_rank_femaps(
                 if "exp_DDG" not in predictions_by_key[key]:
                     predictions_by_key[key]["exp"] = data["exp_DDG"]
         elif prediction_type == "pairwise":
-            nodes = list(graph.nodes())
+            # incase the networks nodes are in a different order, sort them first to get consistent pairs
+            nodes = sorted(list(graph.nodes()))
             for a, b in itertools.combinations(nodes, 2):
                 exp = graph.nodes[b]["exp_DG"] - graph.nodes[a]["exp_DG"]
                 calc = graph.nodes[b]["calc_DG"] - graph.nodes[a]["calc_DG"]
diff --git a/cinnabar/tests/test_compare.py b/cinnabar/tests/test_compare.py
index 866f1733..2dee2876 100644
--- a/cinnabar/tests/test_compare.py
+++ b/cinnabar/tests/test_compare.py
@@ -36,8 +36,7 @@ def test_compare_and_rank_femaps(fe_map):
     fe_map_3 = FEMap.from_networkx(graph2)
 
     t1, t2 = compare_and_rank_femaps([fe_map, fe_map_2, fe_map_3], ["FE Map 1", "FE Map 2", "FE Map 3"], prediction_type="nodewise", rank_metric="rho")
-    print(t1)
-    print(t2)
+
     # check that FE Map 3 is ranked worst
     assert t1[t1["Model"] == "FE Map 3"]["CLD"].values[0] == "b"
     # check that FE Map 1 and FE Map 2 are ranked better

From b0dbba7c119a10e6109a3935a9e5e1c294219011 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 16 Jan 2026 11:13:49 +0000
Subject: [PATCH 03/17] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 cinnabar/compare.py            | 39 +++++++++++++++++-----------------
 cinnabar/stats.py              |  2 ++
 cinnabar/tests/test_compare.py | 10 +++++++--
 3 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/cinnabar/compare.py b/cinnabar/compare.py
index 90e85ed0..b8338f49 100644
--- a/cinnabar/compare.py
+++ b/cinnabar/compare.py
@@ -1,14 +1,13 @@
+import itertools
 import string
+from collections import defaultdict
+from typing import Literal
 
 import numpy as np
+import pandas as pd
 
 from cinnabar import FEMap
-from typing import Literal
-import pandas as pd
-from collections import defaultdict
 from cinnabar.stats import AVAILABLE_STATS
-import itertools
-
 
 
 def compare_and_rank_femaps(
@@ -55,13 +54,12 @@ def compare_and_rank_femaps(
         - The second DataFrame contains the pairwise comparison results between models.
     """
 
-
     # get the predictions and experimental values from the FEMaps and align them into a DataFrame
     predictions_by_key = defaultdict(dict)
     for label, femap in zip(labels, femaps):
         graph = femap.to_legacy_graph()
         if prediction_type == "nodewise":
-            #TODO  should we expose a shift or centering option here?
+            # TODO  should we expose a shift or centering option here?
             for node, data in graph.nodes(data=True):
                 predictions_by_key[node][f"{label}_calc"] = data["calc_DG"]
                 if "exp_DG" not in predictions_by_key[node]:
@@ -99,7 +97,7 @@ def compare_and_rank_femaps(
         if metric not in AVAILABLE_STATS:
             raise ValueError(f"Metric {metric} is not available.")
 
-    metrics_by_label =  dict((label, dict((metric, []) for metric in metrics_to_compute)) for label in labels)
+    metrics_by_label = dict((label, dict((metric, []) for metric in metrics_to_compute)) for label in labels)
     pairwise_metrics = {}
     # compute bootstrap metrics for each model using the same bootstrap samples, joint bootstrap
     for _ in range(num_bootstraps):
@@ -146,20 +144,24 @@ def compare_and_rank_femaps(
         # use a 2-tailed test
         # inspired by https://www.nature.com/articles/s42004-025-01428-y
         p_value = 2 * min(np.mean(diffs < 0), np.mean(diffs > 0))
-        comparison_data.append({
-            "Model 1": label_i,
-            "Model 2": label_j,
-            f"Diff in {rank_metric}": summary_df[summary_df["Model"] == label_i][rank_metric].values[0] - summary_df[summary_df["Model"] == label_j][rank_metric].values[0],
-            "CI Lower": lower,
-            "CI Upper": upper,
-            "p-value": p_value,
-            "significant": p_value < 0.05
-        })
+        comparison_data.append(
+            {
+                "Model 1": label_i,
+                "Model 2": label_j,
+                f"Diff in {rank_metric}": summary_df[summary_df["Model"] == label_i][rank_metric].values[0]
+                - summary_df[summary_df["Model"] == label_j][rank_metric].values[0],
+                "CI Lower": lower,
+                "CI Upper": upper,
+                "p-value": p_value,
+                "significant": p_value < 0.05,
+            }
+        )
     comparison_df = pd.DataFrame(comparison_data)
 
     # if we have more than 2 models, apply multiple testing correction
     if len(labels) > 2:
         from statsmodels.stats.multitest import multipletests
+
         p_values = comparison_df["p-value"].values
         reject, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method="holm")
         comparison_df["p-value corrected"] = pvals_corrected
@@ -270,6 +272,3 @@ def absorb(lets):
             result[m] += alphabet[i]
 
     return result
-
-
-
diff --git a/cinnabar/stats.py b/cinnabar/stats.py
index 6aa1e216..a2346b08 100644
--- a/cinnabar/stats.py
+++ b/cinnabar/stats.py
@@ -144,6 +144,7 @@ def calculate_kendalls_tau(
     """
     return scipy.stats.kendalltau(y_true, y_pred)[0]
 
+
 AVAILABLE_STATS = {
     "RMSE": calculate_rmse,
     "MUE": calculate_mue,
@@ -153,6 +154,7 @@ def calculate_kendalls_tau(
     "KTAU": calculate_kendalls_tau,
 }
 
+
 def bootstrap_statistic(
     y_true: np.ndarray,
     y_pred: np.ndarray,
diff --git a/cinnabar/tests/test_compare.py b/cinnabar/tests/test_compare.py
index 866f1733..6d284efb 100644
--- a/cinnabar/tests/test_compare.py
+++ b/cinnabar/tests/test_compare.py
@@ -1,7 +1,8 @@
-from cinnabar.compare import compare_and_rank_femaps
 import networkx as nx
 import numpy as np
+
 from cinnabar import FEMap
+from cinnabar.compare import compare_and_rank_femaps
 
 
 def test_compare_and_rank_femaps(fe_map):
@@ -35,7 +36,12 @@ def test_compare_and_rank_femaps(fe_map):
     fe_map_2 = FEMap.from_networkx(graph1)
     fe_map_3 = FEMap.from_networkx(graph2)
 
-    t1, t2 = compare_and_rank_femaps([fe_map, fe_map_2, fe_map_3], ["FE Map 1", "FE Map 2", "FE Map 3"], prediction_type="nodewise", rank_metric="rho")
+    t1, t2 = compare_and_rank_femaps(
+        [fe_map, fe_map_2, fe_map_3],
+        ["FE Map 1", "FE Map 2", "FE Map 3"],
+        prediction_type="nodewise",
+        rank_metric="rho",
+    )
     print(t1)
     print(t2)
     # check that FE Map 3 is ranked worst

From 739ba9907051e35e8a4808a023a66a1ddd30a303 Mon Sep 17 00:00:00 2001
From: Josh Horton <Josh.Horton@newcastle.ac.uk>
Date: Tue, 19 May 2026 10:33:54 +0100
Subject: [PATCH 04/17] rework to use a single femap, remove pairwise, add more
 tests and a tutorial

---
 cinnabar/compare.py                     | 155 +++---
 cinnabar/tests/test_compare.py          | 167 +++++--
 docs/api.rst                            |   1 +
 docs/tutorials/index.rst                |   1 +
 docs/tutorials/results_comparison.ipynb |   1 +
 examples/results_comparison.ipynb       | 598 ++++++++++++++++++++++++
 6 files changed, 808 insertions(+), 115 deletions(-)
 create mode 120000 docs/tutorials/results_comparison.ipynb
 create mode 100644 examples/results_comparison.ipynb

diff --git a/cinnabar/compare.py b/cinnabar/compare.py
index c2cd5dc7..c76e7e9f 100644
--- a/cinnabar/compare.py
+++ b/cinnabar/compare.py
@@ -1,4 +1,3 @@
-import itertools
 import string
 from collections import defaultdict
 from typing import Literal
@@ -7,33 +6,30 @@
 import pandas as pd
 
 from cinnabar import FEMap
-from cinnabar.stats import AVAILABLE_STATS
+from cinnabar.stats import _AVAILABLE_STATS
 
 
-def compare_and_rank_femaps(
-    femaps: list[FEMap],
-    labels: list[str],
-    prediction_type: Literal["nodewise", "edgewise", "pairwise"] = "pairwise",
-    rank_metric: Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU"] = "MUE",
-    metrics_to_compute: list[Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU"]] | None = None,
-    num_bootstraps: int = 1000,
+def compare_and_rank_results(
+    femap: FEMap,
+    prediction_type: Literal["nodewise", "edgewise"] = "edgewise",
+    rank_metric: Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU", "PI"] = "MUE",
+    metrics_to_compute: list[Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU","PI"]] | None = None,
+    num_bootstraps: int = 1_000,
     confidence_level: float = 0.95,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """
-    Compare and rank multiple FEMaps based on the chosen performance metric and return an ordered table of
+    Compare and rank multiple result sources on a single FEMap based on the chosen performance metric and return an ordered table of
     results using the compact letter display (CLD) system.
 
     Parameters
     ----------
-    femaps : list[FEMap]
-        A list of FEMap instances to compare.
-    labels : list[str]
-        A list of labels corresponding to each FEMap used in the output tables.
-    prediction_type: Literal["nodewise", "edgewise", "pairwise"], optional
-        The type of prediction to evaluate, by default "pairwise".
-    rank_metric : Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU"], optional
+    femap : FEMap
+        An FEMap instance with results from multiple sources to compare.
+    prediction_type: Literal["nodewise", "edgewise"], optional
+        The type of prediction to evaluate, by default "edgewise".
+    rank_metric : Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU", "PI"], optional
         The metric used to rank the models, by default "MUE".
-    metrics_to_compute : list[Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU"]], optional
+    metrics_to_compute : list[Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU", "PI"]], optional
         A list of metrics to compute for each model. If None, all metrics appropriate for the `prediction_type` will be computed.
     num_bootstraps : int, optional
         The number of bootstrap samples to use for estimating confidence intervals, by default 1000.
@@ -42,8 +38,9 @@ def compare_and_rank_femaps(
 
     Note
     ----
-    - We assume that all FEMaps have been evaluated on the same test set.
-    - Prediction types "nodewise", "edgewise", and "pairwise" correspond to DGs, edgewise DDGs and pairwise DDGs (back calculated from nodewise DGs) respectively.
+    - The comparison method uses a joint bootstrapping procedure that generates a distribution of differences in the rank metric and checks for significant differences using a method inspired by. [1]_
+    - Each source must be evaluated on the same set of edges.
+    - Prediction types "nodewise" and "edgewise" correspond to DGs and edgewise DDGs respectively.
     - When we have more than 2 models, we apply multiple testing correction to the pairwise comparisons using the ``Holm`` method by default.
 
     Returns
@@ -52,82 +49,100 @@ def compare_and_rank_femaps(
         A tuple containing two DataFrames:
         - The first DataFrame contains the computed metrics for each model.
         - The second DataFrame contains the pairwise comparison results between models.
+
+    References
+    ----------
+    .. [1] https://www.nature.com/articles/s42004-025-01428-y
     """
 
     # get the predictions and experimental values from the FEMaps and align them into a DataFrame
     predictions_by_key = defaultdict(dict)
-    for label, femap in zip(labels, femaps):
-        graph = femap.to_legacy_graph()
-        if prediction_type == "nodewise":
-            # TODO  should we expose a shift or centering option here?
-            for node, data in graph.nodes(data=True):
-                predictions_by_key[node][f"{label}_calc"] = data["calc_DG"]
-                if "exp_DG" not in predictions_by_key[node]:
-                    predictions_by_key[node]["exp"] = data["exp_DG"]
-        elif prediction_type == "edgewise":
-            for a, b, data in graph.edges(data=True):
-                # we assume all edges have been run in the same direction
-                key = (a, b)
-                predictions_by_key[key][f"{label}_calc"] = data["calc_DDG"]
-                if "exp_DDG" not in predictions_by_key[key]:
-                    predictions_by_key[key]["exp"] = data["exp_DDG"]
-        elif prediction_type == "pairwise":
-            # incase the networks nodes are in a different order, sort them first to get consistent pairs
-            nodes = sorted(list(graph.nodes()))
-            for a, b in itertools.combinations(nodes, 2):
-                exp = graph.nodes[b]["exp_DG"] - graph.nodes[a]["exp_DG"]
-                calc = graph.nodes[b]["calc_DG"] - graph.nodes[a]["calc_DG"]
-                key = (a, b)
-                predictions_by_key[key][f"{label}_calc"] = calc
-                if "exp" not in predictions_by_key[key]:
-                    predictions_by_key[key]["exp"] = exp
-        else:
-            raise ValueError(f"Invalid prediction_type: {prediction_type}")
+    if prediction_type == "nodewise":
+        # make sure we have absolute values
+        femap.generate_absolute_values()
+        abs_df = femap.get_absolute_dataframe()
+        # get the computational sources we want to compare
+        sources = abs_df[abs_df["computational"] == True]["source"].unique()
+        # for each row add the node prediction
+        for _, row in abs_df.iterrows():
+            node_label = row["label"]
+            if not row["computational"]:
+                source_label = "exp"
+            else:
+                source_label = f"{row["source"]}_calc"
+            predictions_by_key[node_label][source_label] = row["DG (kcal/mol)"]
+
+    elif prediction_type == "edgewise":
+        rel_df = femap.get_relative_dataframe()
+        sources =rel_df[rel_df["computational"] == True]["source"].unique()
+        for _, row in rel_df.iterrows():
+            key = (row["labelA"], row["labelB"])
+            if not row["computational"]:
+                source_label = "exp"
+            else:
+                source_label = f"{row["source"]}_calc"
+            predictions_by_key[key][source_label] = row["DDG (kcal/mol)"]
+    else:
+        raise ValueError(f"Invalid prediction_type: {prediction_type}")
 
     predictions_df = pd.DataFrame(predictions_by_key.values())
 
+    # check that we have experimental values for all edges
+    if "exp" not in predictions_df.columns:
+        raise ValueError("Experimental values are required to rank the results.")
+
+    # check that we have the same number of values for all sources
+    for source in sources:
+        # check if any values for this source are missing
+        if any(predictions_df[f"{source}_calc"].isna()):
+            raise ValueError(f"Missing predictions for source {source}, all sources must have the same number of predictions.")
+
     # set metrics to compute based on best practices if not provided
     if metrics_to_compute is None:
-        if prediction_type in ["pairwise", "edgewise"]:
+        if prediction_type == "edgewise":
             metrics_to_compute = ["MUE", "RMSE"]
         elif prediction_type == "nodewise":
-            metrics_to_compute = ["MUE", "RMSE", "RAE", "R2", "rho", "KTAU"]
+            metrics_to_compute = ["MUE", "RMSE", "RAE", "R2", "rho", "KTAU", "PI"]
+
+    # we must compute the rank metric however it is possible to miss it
+    if rank_metric not in metrics_to_compute:
+        metrics_to_compute.append(rank_metric)
 
     # check we can compute all requested metrics
     for metric in metrics_to_compute:
-        if metric not in AVAILABLE_STATS:
+        if metric not in _AVAILABLE_STATS:
             raise ValueError(f"Metric {metric} is not available.")
 
-    metrics_by_label = dict((label, dict((metric, []) for metric in metrics_to_compute)) for label in labels)
+    metrics_by_source = dict((source, dict((metric, []) for metric in metrics_to_compute)) for source in sources)
     pairwise_metrics = {}
     # compute bootstrap metrics for each model using the same bootstrap samples, joint bootstrap
     for _ in range(num_bootstraps):
         bootstrap_sample = predictions_df.sample(frac=1.0, replace=True)
-        for label in labels:
+        for source in sources:
             y_true = bootstrap_sample["exp"].values
-            y_pred = bootstrap_sample[f"{label}_calc"].values
+            y_pred = bootstrap_sample[f"{source}_calc"].values
             for metric in metrics_to_compute:
-                value = AVAILABLE_STATS[metric](y_true, y_pred)
-                metrics_by_label[label][metric].append(value)
+                value = _AVAILABLE_STATS[metric](y_true, y_pred)
+                metrics_by_source[source][metric].append(value)
 
     # compute pairwise differences for ranking metric
-    for i, label_i in enumerate(labels):
-        for j, label_j in enumerate(labels):
+    for i, source_i in enumerate(sources):
+        for j, source_j in enumerate(sources):
             if j <= i:
                 continue
-            diffs = np.array(metrics_by_label[label_i][rank_metric]) - np.array(metrics_by_label[label_j][rank_metric])
-            pairwise_metrics[(label_i, label_j)] = diffs
+            diffs = np.array(metrics_by_source[source_i][rank_metric]) - np.array(metrics_by_source[source_j][rank_metric])
+            pairwise_metrics[(source_i, source_j)] = diffs
 
     # summarize all metrics
     summary_data = []
-    for label in labels:
-        row = {"Model": label}
+    for source in sources:
+        row = {"Model": source}
         for metric in metrics_to_compute:
             # compute the sample metric
-            x = predictions_df[f"{label}_calc"].values
+            x = predictions_df[f"{source}_calc"].values
             y = predictions_df["exp"].values
-            sample_value = AVAILABLE_STATS[metric](y, x)
-            bootstrap_values = np.array(metrics_by_label[label][metric])
+            sample_value = _AVAILABLE_STATS[metric](y, x)
+            bootstrap_values = np.array(metrics_by_source[source][metric])
             lower = np.percentile(bootstrap_values, (1 - confidence_level) / 2 * 100)
             upper = np.percentile(bootstrap_values, (1 + confidence_level) / 2 * 100)
             row[f"{metric}"] = sample_value
@@ -138,7 +153,7 @@ def compare_and_rank_femaps(
 
     # summarize pairwise comparisons with corrected p-values
     comparison_data = []
-    for (label_i, label_j), diffs in pairwise_metrics.items():
+    for (source_i, source_j), diffs in pairwise_metrics.items():
         lower = np.percentile(diffs, (1 - confidence_level) / 2 * 100)
         upper = np.percentile(diffs, (1 + confidence_level) / 2 * 100)
         # calculate the p-value as the fraction of bootstrap samples that cross zero
@@ -147,10 +162,10 @@ def compare_and_rank_femaps(
         p_value = 2 * min(np.mean(diffs < 0), np.mean(diffs > 0))
         comparison_data.append(
             {
-                "Model 1": label_i,
-                "Model 2": label_j,
-                f"Diff in {rank_metric}": summary_df[summary_df["Model"] == label_i][rank_metric].values[0]
-                - summary_df[summary_df["Model"] == label_j][rank_metric].values[0],
+                "Model 1": source_i,
+                "Model 2": source_j,
+                f"Diff in {rank_metric}": summary_df[summary_df["Model"] == source_i][rank_metric].values[0]
+                - summary_df[summary_df["Model"] == source_j][rank_metric].values[0],
                 "CI Lower": lower,
                 "CI Upper": upper,
                 "p-value": p_value,
@@ -160,7 +175,7 @@ def compare_and_rank_femaps(
     comparison_df = pd.DataFrame(comparison_data)
 
     # if we have more than 2 models, apply multiple testing correction
-    if len(labels) > 2:
+    if len(sources) > 2:
         from statsmodels.stats.multitest import multipletests
 
         p_values = comparison_df["p-value"].values
diff --git a/cinnabar/tests/test_compare.py b/cinnabar/tests/test_compare.py
index 8f55d3de..0e8c6094 100644
--- a/cinnabar/tests/test_compare.py
+++ b/cinnabar/tests/test_compare.py
@@ -1,53 +1,130 @@
-import networkx as nx
 import numpy as np
+import pytest
 
 from cinnabar import FEMap
-from cinnabar.compare import compare_and_rank_femaps
-
-
-def test_compare_and_rank_femaps(fe_map):
-    graph1 = nx.MultiDiGraph()
-    graph2 = nx.MultiDiGraph()
-    for a, b, data in fe_map.to_networkx().edges(data=True):
-        new_data = data.copy()
-        if data["source"] != "reverse" and data["computational"]:
-            # add noise to the result
-            new_result = data["DG"] + np.random.normal(0, data["uncertainty"].m) * data["DG"].u
-            new_data["DG"] = new_result
-            graph1.add_edge(a, b, **new_data)
-            # and add the reverse edge
-            rev_data = new_data.copy()
-            rev_data["source"] = "reverse"
-            rev_data["DG"] = -new_data["DG"]
-            graph1.add_edge(b, a, **rev_data)
-            # add a large value to the second graph to simulate a bad prediction
-            new_result2 = data["DG"] + 1.5 * data["DG"].u
-            new_data["DG"] = new_result2
-            graph2.add_edge(a, b, **new_data)
-            # add the reverse edge
-            rev_data2 = new_data.copy()
-            rev_data2["source"] = "reverse"
-            rev_data2["DG"] = -new_data["DG"]
-            graph2.add_edge(b, a, **rev_data2)
+from cinnabar.compare import compare_and_rank_results
 
+from openff.units import unit
+
+
+def test_compare_and_rank_results(fe_map):
+    np.random.seed(42)
+
+    compare_map = FEMap()
+    for m in fe_map:
+        if m.computational:
+            # add the result with a new source
+            compare_map.add_relative_calculation(
+                labelA=m.labelA,
+                labelB=m.labelB,
+                value=m.DG,
+                uncertainty=m.uncertainty,
+                source="original"
+            )
+            # add the data again under a second source with some noise added
+            compare_map.add_relative_calculation(
+                labelA=m.labelA,
+                labelB=m.labelB,
+                value=np.random.normal(m.DG.m, m.uncertainty.m) * unit.kilocalorie_per_mole,
+                uncertainty=m.uncertainty,
+                source="perturbed"
+            )
+            # add a third set of data with a lot of noise to get a significant difference
+            compare_map.add_relative_calculation(
+                labelA=m.labelA,
+                labelB=m.labelB,
+                value=np.random.normal(m.DG.m, 12.0 * m.uncertainty.m) * unit.kilocalorie_per_mole,
+                uncertainty=m.uncertainty,
+                source="noisy"
+            )
         else:
-            graph1.add_edge(a, b, **data)
-            graph2.add_edge(a, b, **data)
-    fe_map_2 = FEMap.from_networkx(graph1)
-    fe_map_3 = FEMap.from_networkx(graph2)
-
-    t1, t2 = compare_and_rank_femaps(
-        [fe_map, fe_map_2, fe_map_3],
-        ["FE Map 1", "FE Map 2", "FE Map 3"],
-        prediction_type="nodewise",
-        rank_metric="rho",
+            # add the experimental data
+            compare_map.add_measurement(m)
+
+    summary_df, comparison_df = compare_and_rank_results(
+        compare_map,
+        prediction_type="edgewise",
+        rank_metric="MUE",
+        metrics_to_compute=["MUE", "RMSE"],
     )
 
-    # check that FE Map 3 is ranked worst
-    assert t1[t1["Model"] == "FE Map 3"]["CLD"].values[0] == "b"
-    # check that FE Map 1 and FE Map 2 are ranked better
-    assert t1[t1["Model"] == "FE Map 1"]["CLD"].values[0] == "a"
-    assert t1[t1["Model"] == "FE Map 2"]["CLD"].values[0] == "a"
+    # make sure the MUE and RMSE have been calculated and recorded in the summary table
+    for metric in ["MUE", "RMSE"]:
+        assert metric in summary_df.columns
+        for ci in ["Upper", "Lower"]:
+            assert f"{metric}_CI_{ci}" in summary_df.columns
+    # check that noisy is ranked worst
+    assert summary_df[summary_df["Model"] == "noisy"]["CLD"].values[0] == "b"
+    # check that original and perturbed are ranked better
+    assert summary_df[summary_df["Model"] == "original"]["CLD"].values[0] == "a"
+    assert summary_df[summary_df["Model"] == "perturbed"]["CLD"].values[0] == "a"
     # check that the comparison table has all three models and corrected p-values
-    assert len(t2) == 3
-    assert "p-value corrected" in t2.columns
+    assert len(comparison_df) == 3
+    assert "p-value corrected" in comparison_df.columns
+
+
+def test_invalid_prediction_type(fe_map):
+    with pytest.raises(ValueError, match="Invalid prediction_type: pairwise"):
+        compare_and_rank_results(
+            fe_map,
+            prediction_type="pairwise",
+            rank_metric="MUE",
+            metrics_to_compute=["MUE", "RMSE"],
+        )
+
+
+def test_missing_experimental_data(fe_map):
+    new_map = FEMap()
+    for m in fe_map:
+        if m.computational:
+            new_map.add_measurement(m)
+
+    with pytest.raises(ValueError, match="Experimental values are required to rank the results."):
+        compare_and_rank_results(
+            new_map,
+        )
+
+
+def test_missing_source_data(fe_map):
+    new_map = FEMap()
+    for i, m in enumerate(fe_map):
+        if not m.computational:
+            new_map.add_measurement(m)
+        else:
+            new_map.add_relative_calculation(
+                labelA=m.labelA,
+                labelB=m.labelB,
+                value=m.DG,
+                uncertainty=m.uncertainty,
+                source="original"
+            )
+            # for even I add the second source as well
+            if i % 2 == 0:
+                new_map.add_relative_calculation(
+                    labelA=m.labelA,
+                    labelB=m.labelB,
+                    value=m.DG,
+                    uncertainty=m.uncertainty,
+                    source="perturbed"
+                )
+
+    with pytest.raises(ValueError, match="Missing predictions for source perturbed, all sources must have the same number of predictions."):
+        compare_and_rank_results(
+            new_map,
+        )
+
+
+def test_bad_metric(fe_map):
+    with pytest.raises(ValueError, match="Metric bad_metric is not available."):
+        compare_and_rank_results(
+            fe_map,
+            rank_metric="bad_metric",
+        )
+
+
+def test_missing_rank_metric(fe_map):
+    _, _ = compare_and_rank_results(
+        fe_map,
+        rank_metric="MUE",
+        metrics_to_compute=["RMSE"],  # miss the rank metric from the compute list and it should still work
+    )
\ No newline at end of file
diff --git a/docs/api.rst b/docs/api.rst
index e1446938..0823b3cd 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -6,6 +6,7 @@ API Documentation
 
    cinnabar.classification_metrics
    cinnabar.cli
+   cinnabar.compare
    cinnabar.conversion
    cinnabar.estimators
    cinnabar.femap
diff --git a/docs/tutorials/index.rst b/docs/tutorials/index.rst
index 4880d7fb..39b4cd13 100644
--- a/docs/tutorials/index.rst
+++ b/docs/tutorials/index.rst
@@ -14,3 +14,4 @@ API Tutorials
     :maxdepth: 1
 
     cinnabar-api
+    results_comparison
diff --git a/docs/tutorials/results_comparison.ipynb b/docs/tutorials/results_comparison.ipynb
new file mode 120000
index 00000000..adf9f0f3
--- /dev/null
+++ b/docs/tutorials/results_comparison.ipynb
@@ -0,0 +1 @@
+../../examples/results_comparison.ipynb
\ No newline at end of file
diff --git a/examples/results_comparison.ipynb b/examples/results_comparison.ipynb
new file mode 100644
index 00000000..e755343a
--- /dev/null
+++ b/examples/results_comparison.ipynb
@@ -0,0 +1,598 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Results comparison framework for benchmarking using ``Cinnabar``\n",
+    "\n",
+    "Comparing two sets of free energy predictions by eye is deceptively difficult. Two methods can show different RMSE or MUE values on a plot yet be statistically indistinguishable given the size of the dataset. ``Cinnabar`` addresses this with a rigorous comparison framework built on a **joint bootstrapping** method to build a distribution of metric differences and identify statistically significant differences between methods.\n",
+    "\n",
+    "This tutorial walks through the methodology and demonstrates how to use the ``compare_and_rank_results`` function with example RBFE data.\n",
+    "\n",
+    "<div class=\"alert alert-block alert-info\">\n",
+    "<b>Reference:</b> The comparison methodology implemented here is inspired by the approach described in <a href=\"https://www.nature.com/articles/s42004-025-01428-y\">Valsson <i>et al.</i></a>\n",
+    "</div>"
+   ],
+   "id": "f07f14d1b152062e"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Understanding the comparison method\n",
+    "\n",
+    "The comparison is built around a **joint bootstrapping procedure** that generates a distribution of differences in the chosen evaluation metric. The full workflow can be broken down into the following steps:\n",
+    "\n",
+    "1. **Define** the evaluation metric (e.g. RMSE, MUE) and the sets of results to compare — either ``edgewise`` (ΔΔG) or ``nodewise`` (ΔG).\n",
+    "2. **Bootstrap jointly** draw paired resamples of the data and recompute the evaluation metric for every source on the *same* resample. This preserves the correlation structure between methods and avoids artificially inflating differences.\n",
+    "3. **Compute two-sided p-values** from the fraction of bootstrap differences that cross zero.\n",
+    "4. **Apply the Holm multiple testing correction** when comparing more than two sets of results.\n",
+    "5. **Report confidence intervals** for the metric differences directly from the bootstrap distribution.\n",
+    "6. **Rank and group** results using the compact letter display (CLD) via the insert-absorb algorithm. Methods that share a letter are *not significantly different*; methods that share no letters *are* significantly different."
+   ],
+   "id": "f9e3ff514773b116"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Limitations\n",
+    "\n",
+    "<div class=\"alert alert-block alert-warning\">\n",
+    "<b>Edgewise independence:</b> RBFE calculations are inherently correlated because a weakly connected network is required to estimate ΔG for each ligand. Bootstrapping edges therefore tends to <i>underestimate</i> confidence intervals as the effective sample size is closer to <code>N_ligands</code> than to <code>N_edges</code> as assumed by the method.\n",
+    "</div>\n",
+    "\n",
+    "<div class=\"alert alert-block alert-warning\">\n",
+    "<b>Nodewise correlations:</b> ΔG estimates back-calculated from ΔΔGs via the MLE estimator are not independent across ligands as the estimator uses information from all edges simultaneously. Nodewise comparisons are most appropriate for <b>absolute</b> binding free energy (ABFE) predictions, where each ligand's estimate is obtained independently.\n",
+    "</div>\n",
+    "\n",
+    "### Requirements\n",
+    "\n",
+    "- Experimental measurements must be available for every ligand in the ``FEMap``.\n",
+    "- All sources must provide predictions for exactly the **same** set of ligands or ligand pairs."
+   ],
+   "id": "88669d7f9af4abf8"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Loading example RBFE data\n",
+    "\n",
+    "For this tutorial we use RBFE data generated with [OpenFE](https://docs.openfree.energy/en/latest/) which is included with ``cinnabar``'s test suite. You can replace these files with your own outputs. See the [FEMap tutorial](cinnabar-api.ipynb) for details on building an ``FEMap`` from different data formats.\n",
+    "\n",
+    "The two data files have a simple tabular format:"
+   ],
+   "id": "35186bc4db8fa6ca"
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2026-05-19T09:21:46.574932Z",
+     "start_time": "2026-05-19T09:21:46.423124Z"
+    }
+   },
+   "source": [
+    "! head ../cinnabar/data/experimental_data.csv"
+   ],
+   "id": "1d4764ff8e34299c",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Ligand,expt_DG,expt_dDG\r\n",
+      "CAT-13a,-8.83,0.10\r\n",
+      "CAT-13b,-9.11,0.10\r\n",
+      "CAT-13c,-9.31,0.10\r\n",
+      "CAT-13d,-10.46,0.10\r\n",
+      "CAT-13e,-9.95,0.10\r\n",
+      "CAT-13f,-9.08,0.10\r\n",
+      "CAT-13g,-9.08,0.10\r\n",
+      "CAT-13h,-9.62,0.10\r\n",
+      "CAT-13i,-9.26,0.10\r\n"
+     ]
+    }
+   ],
+   "execution_count": 1
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2026-05-19T09:21:46.713070Z",
+     "start_time": "2026-05-19T09:21:46.575967Z"
+    }
+   },
+   "source": [
+    "! head ../cinnabar/data/computational_data.csv"
+   ],
+   "id": "68317fab7ba5d0c8",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Ligand1,Ligand2,calc_DDG,calc_dDDG(MBAR),calc_dDDG(additional)\r\n",
+      "CAT-13b,CAT-17g,0.36,0.11,0.0\r\n",
+      "CAT-13a,CAT-17g,-0.02,0.1,0.0\r\n",
+      "CAT-13e,CAT-17g,1.5,0.11,0.0\r\n",
+      "CAT-4m,CAT-4c,0.78,0.1,0.0\r\n",
+      "CAT-13k,CAT-4d,-0.59,0.11,0.0\r\n",
+      "CAT-24,CAT-17e,1.98,0.08,0.0\r\n",
+      "CAT-13g,CAT-17g,0.86,0.15,0.0\r\n",
+      "CAT-13d,CAT-13h,1.46,0.1,0.0\r\n",
+      "CAT-13a,CAT-17i,-0.76,0.11,0.0\r\n"
+     ]
+    }
+   ],
+   "execution_count": 2
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2026-05-19T09:21:47.429987Z",
+     "start_time": "2026-05-19T09:21:46.714383Z"
+    }
+   },
+   "source": [
+    "# fmt: off\n",
+    "from cinnabar.compare import compare_and_rank_results\n",
+    "from cinnabar.femap import FEMap\n",
+    "\n",
+    "%matplotlib inline\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from openff.units import unit\n",
+    "\n",
+    "femap = FEMap()\n",
+    "\n",
+    "# load the computational results (source 1)\n",
+    "rbfe_results = pd.read_csv(\"../cinnabar/data/computational_data.csv\")\n",
+    "\n",
+    "for _, result in rbfe_results.iterrows():\n",
+    "    femap.add_relative_calculation(\n",
+    "        labelA=result[\"Ligand1\"],\n",
+    "        labelB=result[\"Ligand2\"],\n",
+    "        value=result[\"calc_DDG\"] * unit.kilocalorie_per_mole,\n",
+    "        uncertainty=result[\"calc_dDDG(MBAR)\"] * unit.kilocalorie_per_mole,\n",
+    "        source=\"OpenFE\",\n",
+    "    )\n",
+    "\n",
+    "# load the experimental values\n",
+    "experimental_results = pd.read_csv(\"../cinnabar/data/experimental_data.csv\")\n",
+    "\n",
+    "for _, exp_row in experimental_results.iterrows():\n",
+    "    femap.add_experimental_measurement(\n",
+    "        label=exp_row[\"Ligand\"],\n",
+    "        value=exp_row[\"expt_DG\"] * unit.kilocalorie_per_mole,\n",
+    "        uncertainty=exp_row[\"expt_dDG\"] * unit.kilocalorie_per_mole,\n",
+    "        source=\"Experimental\",\n",
+    "    )\n",
+    "# fmt: on"
+   ],
+   "id": "cd9c5ddc07ceeb9",
+   "outputs": [],
+   "execution_count": 3
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Creating additional result sets for comparison\n",
+    "\n",
+    "In a real benchmarking study you would load a second (and third) set of calculated values here from a different force field, a different simulation protocol, or a different software package. You just need to add them to the same ``FEMap`` with a distinct ``source`` string.\n",
+    "\n",
+    "For this tutorial we generate two synthetic alternatives by perturbing the original values with random noise, giving us full control over how similar or different the methods appear."
+   ],
+   "id": "7dc78019e981e2f9"
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2026-05-19T09:21:47.502421Z",
+     "start_time": "2026-05-19T09:21:47.468442Z"
+    }
+   },
+   "source": [
+    "np.random.seed(42)  # for reproducibility\n",
+    "\n",
+    "# Source 2: slight perturbation (similar accuracy to source 1)\n",
+    "for _, result in rbfe_results.iterrows():\n",
+    "    femap.add_relative_calculation(\n",
+    "        labelA=result[\"Ligand1\"],\n",
+    "        labelB=result[\"Ligand2\"],\n",
+    "        value=np.random.normal(\n",
+    "            loc=result[\"calc_DDG\"],\n",
+    "            scale=result[\"calc_dDDG(MBAR)\"]\n",
+    "        ) * unit.kilocalorie_per_mole,\n",
+    "        uncertainty=result[\"calc_dDDG(MBAR)\"] * unit.kilocalorie_per_mole,\n",
+    "        source=\"OpenFE_perturbed\",\n",
+    "    )\n",
+    "\n",
+    "# Source 3: large perturbation (noticeably worse accuracy)\n",
+    "for _, result in rbfe_results.iterrows():\n",
+    "    femap.add_relative_calculation(\n",
+    "        labelA=result[\"Ligand1\"],\n",
+    "        labelB=result[\"Ligand2\"],\n",
+    "        value=np.random.normal(\n",
+    "            loc=result[\"calc_DDG\"],\n",
+    "            scale=12.0 * result[\"calc_dDDG(MBAR)\"]\n",
+    "        ) * unit.kilocalorie_per_mole,\n",
+    "        uncertainty=result[\"calc_dDDG(MBAR)\"] * unit.kilocalorie_per_mole,\n",
+    "        source=\"OpenFE_noisy\",\n",
+    "    )"
+   ],
+   "id": "71a8a6b537f85455",
+   "outputs": [],
+   "execution_count": 4
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Running the comparison\n",
+    "\n",
+    "We now call ``compare_and_rank_results`` with the ``FEMap`` containing all three sources. Key parameters:\n",
+    "\n",
+    "| Parameter | Description                                                                          |\n",
+    "| --- |--------------------------------------------------------------------------------------|\n",
+    "| ``prediction_type`` | ``\"edgewise\"`` (ΔΔG) or ``\"nodewise\"`` (ΔG)                                          |\n",
+    "| ``rank_metric`` | The metric used to rank and compare methods.                                         |\n",
+    "| ``metrics_to_compute`` | Metrics reported in the summary table. Defaults to ``[\"MUE\", \"RMSE\"]`` for edgewise. |\n",
+    "| ``num_bootstraps`` | Number of joint bootstrap resamples (1 000 is a sensible default).                   |\n",
+    "| ``confidence_level`` | Width of the reported confidence intervals, default ``0.95`` (95 %).                 |\n"
+   ],
+   "id": "4df80ede3e08ddf0"
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2026-05-19T09:21:49.156781Z",
+     "start_time": "2026-05-19T09:21:47.504921Z"
+    }
+   },
+   "source": [
+    "summary_df, comparison_df = compare_and_rank_results(\n",
+    "    femap=femap,\n",
+    "    prediction_type=\"edgewise\",\n",
+    "    rank_metric=\"MUE\",  # the metric used for ranking the results\n",
+    "    metrics_to_compute=[\"MUE\", \"RMSE\"],  # metrics to report in the summary table\n",
+    "    num_bootstraps=1_000,\n",
+    "    confidence_level=0.95,  # report 95 % confidence intervals\n",
+    ")"
+   ],
+   "id": "3d987cc6b1ddf37e",
+   "outputs": [],
+   "execution_count": 5
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### The summary table\n",
+    "\n",
+    "The summary table has one row per source. For each requested metric you get:\n",
+    "\n",
+    "- the **sample value** (metric computed on all data points),\n",
+    "- ``*_CI_Lower`` / ``*_CI_Upper``: the **bootstrap confidence interval** bounds, and\n",
+    "- **``CLD``**: the compact letter display assignment.\n",
+    "\n",
+    "Sources that **share a CLD letter** are *not significantly different* from each other for the chosen ``rank_metric`` (after multiple testing correction). Sources with **no letters in common** *are* significantly different. For example, three methods labelled ``\"a\"``, ``\"b\"``, and ``\"b\"`` tell you: the first and second/third are significantly different from each other, but the second is not significantly different from the third method."
+   ],
+   "id": "48fb1d4e6b173827"
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2026-05-19T09:21:49.183272Z",
+     "start_time": "2026-05-19T09:21:49.157518Z"
+    }
+   },
+   "source": [
+    "summary_df"
+   ],
+   "id": "cb590e7befe1d980",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "              Model       MUE  MUE_CI_Lower  MUE_CI_Upper      RMSE  \\\n",
+       "0            OpenFE  0.867586      0.710513      1.021414  1.053002   \n",
+       "1      OpenFE_noisy  1.166930      0.971467      1.361760  1.386575   \n",
+       "2  OpenFE_perturbed  0.859956      0.698599      1.023484  1.056510   \n",
+       "\n",
+       "   RMSE_CI_Lower  RMSE_CI_Upper CLD  \n",
+       "0       0.869701       1.221763   a  \n",
+       "1       1.175243       1.577053   b  \n",
+       "2       0.867684       1.228928   a  "
+      ],
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Model</th>\n",
+       "      <th>MUE</th>\n",
+       "      <th>MUE_CI_Lower</th>\n",
+       "      <th>MUE_CI_Upper</th>\n",
+       "      <th>RMSE</th>\n",
+       "      <th>RMSE_CI_Lower</th>\n",
+       "      <th>RMSE_CI_Upper</th>\n",
+       "      <th>CLD</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>OpenFE</td>\n",
+       "      <td>0.867586</td>\n",
+       "      <td>0.710513</td>\n",
+       "      <td>1.021414</td>\n",
+       "      <td>1.053002</td>\n",
+       "      <td>0.869701</td>\n",
+       "      <td>1.221763</td>\n",
+       "      <td>a</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>OpenFE_noisy</td>\n",
+       "      <td>1.166930</td>\n",
+       "      <td>0.971467</td>\n",
+       "      <td>1.361760</td>\n",
+       "      <td>1.386575</td>\n",
+       "      <td>1.175243</td>\n",
+       "      <td>1.577053</td>\n",
+       "      <td>b</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>OpenFE_perturbed</td>\n",
+       "      <td>0.859956</td>\n",
+       "      <td>0.698599</td>\n",
+       "      <td>1.023484</td>\n",
+       "      <td>1.056510</td>\n",
+       "      <td>0.867684</td>\n",
+       "      <td>1.228928</td>\n",
+       "      <td>a</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 6
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### The comparison table\n",
+    "\n",
+    "The comparison table has one row per unique source pair and records:\n",
+    "\n",
+    "- ``Diff in <rank_metric>``: observed difference in the ranking metric (source 1 minus source 2, on all data),\n",
+    "- ``CI Lower`` / ``CI Upper``: bootstrap confidence interval around that difference,\n",
+    "- ``p-value``: two-sided bootstrap p-value,\n",
+    "- ``p-value corrected``: Holm-corrected p-value *(appears automatically when >2 sources are present)*,\n",
+    "- ``significant``: ``True`` if the (corrected) p-value is below 0.05."
+   ],
+   "id": "c9ec8f1dd33a19bb"
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2026-05-19T09:21:49.211626Z",
+     "start_time": "2026-05-19T09:21:49.190516Z"
+    }
+   },
+   "source": [
+    "comparison_df"
+   ],
+   "id": "330b38a6f59bf249",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "        Model 1           Model 2  Diff in MUE  CI Lower  CI Upper  p-value  \\\n",
+       "0        OpenFE      OpenFE_noisy    -0.299344 -0.511742 -0.088689    0.008   \n",
+       "1        OpenFE  OpenFE_perturbed     0.007630 -0.015944  0.030025    0.562   \n",
+       "2  OpenFE_noisy  OpenFE_perturbed     0.306974  0.096280  0.518630    0.008   \n",
+       "\n",
+       "   significant  p-value corrected  \n",
+       "0         True              0.024  \n",
+       "1        False              0.562  \n",
+       "2         True              0.024  "
+      ],
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Model 1</th>\n",
+       "      <th>Model 2</th>\n",
+       "      <th>Diff in MUE</th>\n",
+       "      <th>CI Lower</th>\n",
+       "      <th>CI Upper</th>\n",
+       "      <th>p-value</th>\n",
+       "      <th>significant</th>\n",
+       "      <th>p-value corrected</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>OpenFE</td>\n",
+       "      <td>OpenFE_noisy</td>\n",
+       "      <td>-0.299344</td>\n",
+       "      <td>-0.511742</td>\n",
+       "      <td>-0.088689</td>\n",
+       "      <td>0.008</td>\n",
+       "      <td>True</td>\n",
+       "      <td>0.024</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>OpenFE</td>\n",
+       "      <td>OpenFE_perturbed</td>\n",
+       "      <td>0.007630</td>\n",
+       "      <td>-0.015944</td>\n",
+       "      <td>0.030025</td>\n",
+       "      <td>0.562</td>\n",
+       "      <td>False</td>\n",
+       "      <td>0.562</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>OpenFE_noisy</td>\n",
+       "      <td>OpenFE_perturbed</td>\n",
+       "      <td>0.306974</td>\n",
+       "      <td>0.096280</td>\n",
+       "      <td>0.518630</td>\n",
+       "      <td>0.008</td>\n",
+       "      <td>True</td>\n",
+       "      <td>0.024</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 7
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Because we have three sources here the ``p-value corrected`` column is present along with the ``p-value`` from the bootstrap test. ``OpenFE_noisy`` — with 12× the per-edge noise — should appear in a distinct CLD group from the other two, while ``OpenFE`` and ``OpenFE_perturbed`` may share a letter if their difference is not large enough to be resolved at this sample size.\n",
+    "\n",
+    "<div class=\"alert alert-block alert-info\">\n",
+    "<b>Tip:</b> When comparing only <i>two</i> sources the <code>p-value corrected</code> column is absent (no correction needed) and the raw <code>p-value</code> drives the <code>significant</code> flag directly.\n",
+    "</div>"
+   ],
+   "id": "46a1d694c36bee59"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Additional options\n",
+    "\n",
+    "### Choosing the right metrics\n",
+    "\n",
+    "The available metrics depend on ``prediction_type``:\n",
+    "\n",
+    "| Metric | Edgewise | Nodewise | Notes |\n",
+    "| --- | :---: | :---: | --- |\n",
+    "| ``MUE`` | ✓ | ✓ | Mean unsigned error |\n",
+    "| ``RMSE`` | ✓ | ✓ | Root mean squared error |\n",
+    "| ``RAE`` | ✓ | ✓ | Relative absolute error vs. a naïve mean predictor |\n",
+    "| ``R2`` | — | ✓ | R² (Pearson r²); not meaningful for relative ΔΔG data |\n",
+    "| ``rho`` | — | ✓ | Pearson r |\n",
+    "| ``KTAU`` | — | ✓ | Kendall's τ |\n",
+    "| ``PI`` | — | ✓ | Predictive index (Pearlman *et al.*) |\n",
+    "\n",
+    "Correlation metrics (R², ρ, KTAU, PI) are only meaningful for nodewise ΔG comparisons where the data have an absolute scale. For edgewise comparisons the sign of a ΔΔG is arbitrary, so error metrics (MUE, RMSE) are the default.\n",
+    "\n",
+    "If ``metrics_to_compute=None`` (the default), cinnabar automatically selects ``[\"MUE\", \"RMSE\"]`` for edgewise and ``[\"MUE\", \"RMSE\", \"RAE\", \"R2\", \"rho\", \"KTAU\", \"PI\"]`` for nodewise comparisons.\n",
+    "\n",
+    "### Adjusting the confidence level\n",
+    "\n",
+    "The ``confidence_level`` parameter (default ``0.95``) controls the width of the reported CIs. Pass a different value to tighten or loosen the intervals:"
+   ],
+   "id": "4d0c3678b72cf1b7"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2026-05-19T09:21:50.857180Z",
+     "start_time": "2026-05-19T09:21:49.212873Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "summary_df, comparison_df = compare_and_rank_results(\n",
+    "    femap=femap,\n",
+    "    rank_metric=\"MUE\",\n",
+    "    metrics_to_compute=[\"MUE\", \"RMSE\"],\n",
+    "    confidence_level=0.90,  # 90 % CIs — narrower, easier to detect differences\n",
+    ")"
+   ],
+   "id": "e5f167ac67af2e25",
+   "outputs": [],
+   "execution_count": 8
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "## Recap\n",
+    "\n",
+    "- ``compare_and_rank_results`` performs **joint bootstrapping** across all sources, ensuring paired, fair comparisons on the same resampled data.\n",
+    "- Two tables are returned: a **summary** table (per-source metric values, CIs, and CLD letters) and a **comparison** table (pairwise differences, CIs, and p-values).\n",
+    "- The **CLD** encodes statistical groupings: sources sharing a letter are *not* significantly different; sources with no shared letter *are* significantly different.\n",
+    "- With **more than two sources**, Holm-corrected p-values appear automatically in ``p-value corrected`` and drive the ``significant`` flags.\n",
+    "- Use ``prediction_type=\"nodewise\"`` only for independent per-ligand ΔG estimates (e.g. ABFE), *not* for MLE-derived values from a relative network."
+   ],
+   "id": "3265c28aba157f15"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From 7e383ce5ca04aecf034187c5f7cc112dfcf73958 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 19 May 2026 09:34:57 +0000
Subject: [PATCH 05/17] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 cinnabar/compare.py               |  16 ++-
 cinnabar/tests/test_compare.py    |  32 ++----
 examples/results_comparison.ipynb | 175 +++++++++++++++---------------
 3 files changed, 106 insertions(+), 117 deletions(-)

diff --git a/cinnabar/compare.py b/cinnabar/compare.py
index c76e7e9f..d5ae7003 100644
--- a/cinnabar/compare.py
+++ b/cinnabar/compare.py
@@ -13,7 +13,7 @@ def compare_and_rank_results(
     femap: FEMap,
     prediction_type: Literal["nodewise", "edgewise"] = "edgewise",
     rank_metric: Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU", "PI"] = "MUE",
-    metrics_to_compute: list[Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU","PI"]] | None = None,
+    metrics_to_compute: list[Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU", "PI"]] | None = None,
     num_bootstraps: int = 1_000,
     confidence_level: float = 0.95,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
@@ -69,18 +69,18 @@ def compare_and_rank_results(
             if not row["computational"]:
                 source_label = "exp"
             else:
-                source_label = f"{row["source"]}_calc"
+                source_label = f"{row['source']}_calc"
             predictions_by_key[node_label][source_label] = row["DG (kcal/mol)"]
 
     elif prediction_type == "edgewise":
         rel_df = femap.get_relative_dataframe()
-        sources =rel_df[rel_df["computational"] == True]["source"].unique()
+        sources = rel_df[rel_df["computational"] == True]["source"].unique()
         for _, row in rel_df.iterrows():
             key = (row["labelA"], row["labelB"])
             if not row["computational"]:
                 source_label = "exp"
             else:
-                source_label = f"{row["source"]}_calc"
+                source_label = f"{row['source']}_calc"
             predictions_by_key[key][source_label] = row["DDG (kcal/mol)"]
     else:
         raise ValueError(f"Invalid prediction_type: {prediction_type}")
@@ -95,7 +95,9 @@ def compare_and_rank_results(
     for source in sources:
         # check if any values for this source are missing
         if any(predictions_df[f"{source}_calc"].isna()):
-            raise ValueError(f"Missing predictions for source {source}, all sources must have the same number of predictions.")
+            raise ValueError(
+                f"Missing predictions for source {source}, all sources must have the same number of predictions."
+            )
 
     # set metrics to compute based on best practices if not provided
     if metrics_to_compute is None:
@@ -130,7 +132,9 @@ def compare_and_rank_results(
         for j, source_j in enumerate(sources):
             if j <= i:
                 continue
-            diffs = np.array(metrics_by_source[source_i][rank_metric]) - np.array(metrics_by_source[source_j][rank_metric])
+            diffs = np.array(metrics_by_source[source_i][rank_metric]) - np.array(
+                metrics_by_source[source_j][rank_metric]
+            )
             pairwise_metrics[(source_i, source_j)] = diffs
 
     # summarize all metrics
diff --git a/cinnabar/tests/test_compare.py b/cinnabar/tests/test_compare.py
index 0e8c6094..10165dde 100644
--- a/cinnabar/tests/test_compare.py
+++ b/cinnabar/tests/test_compare.py
@@ -1,11 +1,10 @@
 import numpy as np
 import pytest
+from openff.units import unit
 
 from cinnabar import FEMap
 from cinnabar.compare import compare_and_rank_results
 
-from openff.units import unit
-
 
 def test_compare_and_rank_results(fe_map):
     np.random.seed(42)
@@ -15,11 +14,7 @@ def test_compare_and_rank_results(fe_map):
         if m.computational:
             # add the result with a new source
             compare_map.add_relative_calculation(
-                labelA=m.labelA,
-                labelB=m.labelB,
-                value=m.DG,
-                uncertainty=m.uncertainty,
-                source="original"
+                labelA=m.labelA, labelB=m.labelB, value=m.DG, uncertainty=m.uncertainty, source="original"
             )
             # add the data again under a second source with some noise added
             compare_map.add_relative_calculation(
@@ -27,7 +22,7 @@ def test_compare_and_rank_results(fe_map):
                 labelB=m.labelB,
                 value=np.random.normal(m.DG.m, m.uncertainty.m) * unit.kilocalorie_per_mole,
                 uncertainty=m.uncertainty,
-                source="perturbed"
+                source="perturbed",
             )
             # add a third set of data with a lot of noise to get a significant difference
             compare_map.add_relative_calculation(
@@ -35,7 +30,7 @@ def test_compare_and_rank_results(fe_map):
                 labelB=m.labelB,
                 value=np.random.normal(m.DG.m, 12.0 * m.uncertainty.m) * unit.kilocalorie_per_mole,
                 uncertainty=m.uncertainty,
-                source="noisy"
+                source="noisy",
             )
         else:
             # add the experimental data
@@ -92,23 +87,18 @@ def test_missing_source_data(fe_map):
             new_map.add_measurement(m)
         else:
             new_map.add_relative_calculation(
-                labelA=m.labelA,
-                labelB=m.labelB,
-                value=m.DG,
-                uncertainty=m.uncertainty,
-                source="original"
+                labelA=m.labelA, labelB=m.labelB, value=m.DG, uncertainty=m.uncertainty, source="original"
             )
             # for even I add the second source as well
             if i % 2 == 0:
                 new_map.add_relative_calculation(
-                    labelA=m.labelA,
-                    labelB=m.labelB,
-                    value=m.DG,
-                    uncertainty=m.uncertainty,
-                    source="perturbed"
+                    labelA=m.labelA, labelB=m.labelB, value=m.DG, uncertainty=m.uncertainty, source="perturbed"
                 )
 
-    with pytest.raises(ValueError, match="Missing predictions for source perturbed, all sources must have the same number of predictions."):
+    with pytest.raises(
+        ValueError,
+        match="Missing predictions for source perturbed, all sources must have the same number of predictions.",
+    ):
         compare_and_rank_results(
             new_map,
         )
@@ -127,4 +117,4 @@ def test_missing_rank_metric(fe_map):
         fe_map,
         rank_metric="MUE",
         metrics_to_compute=["RMSE"],  # miss the rank metric from the compute list and it should still work
-    )
\ No newline at end of file
+    )
diff --git a/examples/results_comparison.ipynb b/examples/results_comparison.ipynb
index e755343a..9b58eaf2 100644
--- a/examples/results_comparison.ipynb
+++ b/examples/results_comparison.ipynb
@@ -2,6 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
+   "id": "f07f14d1b152062e",
    "metadata": {},
    "source": [
     "# Results comparison framework for benchmarking using ``Cinnabar``\n",
@@ -13,11 +14,11 @@
     "<div class=\"alert alert-block alert-info\">\n",
     "<b>Reference:</b> The comparison methodology implemented here is inspired by the approach described in <a href=\"https://www.nature.com/articles/s42004-025-01428-y\">Valsson <i>et al.</i></a>\n",
     "</div>"
-   ],
-   "id": "f07f14d1b152062e"
+   ]
   },
   {
    "cell_type": "markdown",
+   "id": "f9e3ff514773b116",
    "metadata": {},
    "source": [
     "## Understanding the comparison method\n",
@@ -30,11 +31,11 @@
     "4. **Apply the Holm multiple testing correction** when comparing more than two sets of results.\n",
     "5. **Report confidence intervals** for the metric differences directly from the bootstrap distribution.\n",
     "6. **Rank and group** results using the compact letter display (CLD) via the insert-absorb algorithm. Methods that share a letter are *not significantly different*; methods that share no letters *are* significantly different."
-   ],
-   "id": "f9e3ff514773b116"
+   ]
   },
   {
    "cell_type": "markdown",
+   "id": "88669d7f9af4abf8",
    "metadata": {},
    "source": [
     "## Limitations\n",
@@ -51,11 +52,11 @@
     "\n",
     "- Experimental measurements must be available for every ligand in the ``FEMap``.\n",
     "- All sources must provide predictions for exactly the **same** set of ligands or ligand pairs."
-   ],
-   "id": "88669d7f9af4abf8"
+   ]
   },
   {
    "cell_type": "markdown",
+   "id": "35186bc4db8fa6ca",
    "metadata": {},
    "source": [
     "## Loading example RBFE data\n",
@@ -63,21 +64,18 @@
     "For this tutorial we use RBFE data generated with [OpenFE](https://docs.openfree.energy/en/latest/) which is included with ``cinnabar``'s test suite. You can replace these files with your own outputs. See the [FEMap tutorial](cinnabar-api.ipynb) for details on building an ``FEMap`` from different data formats.\n",
     "\n",
     "The two data files have a simple tabular format:"
-   ],
-   "id": "35186bc4db8fa6ca"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 1,
+   "id": "1d4764ff8e34299c",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2026-05-19T09:21:46.574932Z",
      "start_time": "2026-05-19T09:21:46.423124Z"
     }
    },
-   "source": [
-    "! head ../cinnabar/data/experimental_data.csv"
-   ],
-   "id": "1d4764ff8e34299c",
    "outputs": [
     {
      "name": "stdout",
@@ -96,20 +94,20 @@
      ]
     }
    ],
-   "execution_count": 1
+   "source": [
+    "! head ../cinnabar/data/experimental_data.csv"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 2,
+   "id": "68317fab7ba5d0c8",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2026-05-19T09:21:46.713070Z",
      "start_time": "2026-05-19T09:21:46.575967Z"
     }
    },
-   "source": [
-    "! head ../cinnabar/data/computational_data.csv"
-   ],
-   "id": "68317fab7ba5d0c8",
    "outputs": [
     {
      "name": "stdout",
@@ -128,16 +126,21 @@
      ]
     }
    ],
-   "execution_count": 2
+   "source": [
+    "! head ../cinnabar/data/computational_data.csv"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 3,
+   "id": "cd9c5ddc07ceeb9",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2026-05-19T09:21:47.429987Z",
      "start_time": "2026-05-19T09:21:46.714383Z"
     }
    },
+   "outputs": [],
    "source": [
     "# fmt: off\n",
     "from cinnabar.compare import compare_and_rank_results\n",
@@ -173,13 +176,11 @@
     "        source=\"Experimental\",\n",
     "    )\n",
     "# fmt: on"
-   ],
-   "id": "cd9c5ddc07ceeb9",
-   "outputs": [],
-   "execution_count": 3
+   ]
   },
   {
    "cell_type": "markdown",
+   "id": "7dc78019e981e2f9",
    "metadata": {},
    "source": [
     "## Creating additional result sets for comparison\n",
@@ -187,17 +188,19 @@
     "In a real benchmarking study you would load a second (and third) set of calculated values here from a different force field, a different simulation protocol, or a different software package. You just need to add them to the same ``FEMap`` with a distinct ``source`` string.\n",
     "\n",
     "For this tutorial we generate two synthetic alternatives by perturbing the original values with random noise, giving us full control over how similar or different the methods appear."
-   ],
-   "id": "7dc78019e981e2f9"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 4,
+   "id": "71a8a6b537f85455",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2026-05-19T09:21:47.502421Z",
      "start_time": "2026-05-19T09:21:47.468442Z"
     }
    },
+   "outputs": [],
    "source": [
     "np.random.seed(42)  # for reproducibility\n",
     "\n",
@@ -206,10 +209,7 @@
     "    femap.add_relative_calculation(\n",
     "        labelA=result[\"Ligand1\"],\n",
     "        labelB=result[\"Ligand2\"],\n",
-    "        value=np.random.normal(\n",
-    "            loc=result[\"calc_DDG\"],\n",
-    "            scale=result[\"calc_dDDG(MBAR)\"]\n",
-    "        ) * unit.kilocalorie_per_mole,\n",
+    "        value=np.random.normal(loc=result[\"calc_DDG\"], scale=result[\"calc_dDDG(MBAR)\"]) * unit.kilocalorie_per_mole,\n",
     "        uncertainty=result[\"calc_dDDG(MBAR)\"] * unit.kilocalorie_per_mole,\n",
     "        source=\"OpenFE_perturbed\",\n",
     "    )\n",
@@ -219,20 +219,16 @@
     "    femap.add_relative_calculation(\n",
     "        labelA=result[\"Ligand1\"],\n",
     "        labelB=result[\"Ligand2\"],\n",
-    "        value=np.random.normal(\n",
-    "            loc=result[\"calc_DDG\"],\n",
-    "            scale=12.0 * result[\"calc_dDDG(MBAR)\"]\n",
-    "        ) * unit.kilocalorie_per_mole,\n",
+    "        value=np.random.normal(loc=result[\"calc_DDG\"], scale=12.0 * result[\"calc_dDDG(MBAR)\"])\n",
+    "        * unit.kilocalorie_per_mole,\n",
     "        uncertainty=result[\"calc_dDDG(MBAR)\"] * unit.kilocalorie_per_mole,\n",
     "        source=\"OpenFE_noisy\",\n",
     "    )"
-   ],
-   "id": "71a8a6b537f85455",
-   "outputs": [],
-   "execution_count": 4
+   ]
   },
   {
    "cell_type": "markdown",
+   "id": "4df80ede3e08ddf0",
    "metadata": {},
    "source": [
     "## Running the comparison\n",
@@ -246,17 +242,19 @@
     "| ``metrics_to_compute`` | Metrics reported in the summary table. Defaults to ``[\"MUE\", \"RMSE\"]`` for edgewise. |\n",
     "| ``num_bootstraps`` | Number of joint bootstrap resamples (1 000 is a sensible default).                   |\n",
     "| ``confidence_level`` | Width of the reported confidence intervals, default ``0.95`` (95 %).                 |\n"
-   ],
-   "id": "4df80ede3e08ddf0"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 5,
+   "id": "3d987cc6b1ddf37e",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2026-05-19T09:21:49.156781Z",
      "start_time": "2026-05-19T09:21:47.504921Z"
     }
    },
+   "outputs": [],
    "source": [
     "summary_df, comparison_df = compare_and_rank_results(\n",
     "    femap=femap,\n",
@@ -266,13 +264,11 @@
     "    num_bootstraps=1_000,\n",
     "    confidence_level=0.95,  # report 95 % confidence intervals\n",
     ")"
-   ],
-   "id": "3d987cc6b1ddf37e",
-   "outputs": [],
-   "execution_count": 5
+   ]
   },
   {
    "cell_type": "markdown",
+   "id": "48fb1d4e6b173827",
    "metadata": {},
    "source": [
     "### The summary table\n",
@@ -284,35 +280,21 @@
     "- **``CLD``**: the compact letter display assignment.\n",
     "\n",
     "Sources that **share a CLD letter** are *not significantly different* from each other for the chosen ``rank_metric`` (after multiple testing correction). Sources with **no letters in common** *are* significantly different. For example, three methods labelled ``\"a\"``, ``\"b\"``, and ``\"b\"`` tell you: the first and second/third are significantly different from each other, but the second is not significantly different from the third method."
-   ],
-   "id": "48fb1d4e6b173827"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 6,
+   "id": "cb590e7befe1d980",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2026-05-19T09:21:49.183272Z",
      "start_time": "2026-05-19T09:21:49.157518Z"
     }
    },
-   "source": [
-    "summary_df"
-   ],
-   "id": "cb590e7befe1d980",
    "outputs": [
     {
      "data": {
-      "text/plain": [
-       "              Model       MUE  MUE_CI_Lower  MUE_CI_Upper      RMSE  \\\n",
-       "0            OpenFE  0.867586      0.710513      1.021414  1.053002   \n",
-       "1      OpenFE_noisy  1.166930      0.971467      1.361760  1.386575   \n",
-       "2  OpenFE_perturbed  0.859956      0.698599      1.023484  1.056510   \n",
-       "\n",
-       "   RMSE_CI_Lower  RMSE_CI_Upper CLD  \n",
-       "0       0.869701       1.221763   a  \n",
-       "1       1.175243       1.577053   b  \n",
-       "2       0.867684       1.228928   a  "
-      ],
       "text/html": [
        "<div>\n",
        "<style scoped>\n",
@@ -379,6 +361,17 @@
        "  </tbody>\n",
        "</table>\n",
        "</div>"
+      ],
+      "text/plain": [
+       "              Model       MUE  MUE_CI_Lower  MUE_CI_Upper      RMSE  \\\n",
+       "0            OpenFE  0.867586      0.710513      1.021414  1.053002   \n",
+       "1      OpenFE_noisy  1.166930      0.971467      1.361760  1.386575   \n",
+       "2  OpenFE_perturbed  0.859956      0.698599      1.023484  1.056510   \n",
+       "\n",
+       "   RMSE_CI_Lower  RMSE_CI_Upper CLD  \n",
+       "0       0.869701       1.221763   a  \n",
+       "1       1.175243       1.577053   b  \n",
+       "2       0.867684       1.228928   a  "
       ]
      },
      "execution_count": 6,
@@ -386,10 +379,13 @@
      "output_type": "execute_result"
     }
    ],
-   "execution_count": 6
+   "source": [
+    "summary_df"
+   ]
   },
   {
    "cell_type": "markdown",
+   "id": "c9ec8f1dd33a19bb",
    "metadata": {},
    "source": [
     "### The comparison table\n",
@@ -401,35 +397,21 @@
     "- ``p-value``: two-sided bootstrap p-value,\n",
     "- ``p-value corrected``: Holm-corrected p-value *(appears automatically when >2 sources are present)*,\n",
     "- ``significant``: ``True`` if the (corrected) p-value is below 0.05."
-   ],
-   "id": "c9ec8f1dd33a19bb"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 7,
+   "id": "330b38a6f59bf249",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2026-05-19T09:21:49.211626Z",
      "start_time": "2026-05-19T09:21:49.190516Z"
     }
    },
-   "source": [
-    "comparison_df"
-   ],
-   "id": "330b38a6f59bf249",
    "outputs": [
     {
      "data": {
-      "text/plain": [
-       "        Model 1           Model 2  Diff in MUE  CI Lower  CI Upper  p-value  \\\n",
-       "0        OpenFE      OpenFE_noisy    -0.299344 -0.511742 -0.088689    0.008   \n",
-       "1        OpenFE  OpenFE_perturbed     0.007630 -0.015944  0.030025    0.562   \n",
-       "2  OpenFE_noisy  OpenFE_perturbed     0.306974  0.096280  0.518630    0.008   \n",
-       "\n",
-       "   significant  p-value corrected  \n",
-       "0         True              0.024  \n",
-       "1        False              0.562  \n",
-       "2         True              0.024  "
-      ],
       "text/html": [
        "<div>\n",
        "<style scoped>\n",
@@ -496,6 +478,17 @@
        "  </tbody>\n",
        "</table>\n",
        "</div>"
+      ],
+      "text/plain": [
+       "        Model 1           Model 2  Diff in MUE  CI Lower  CI Upper  p-value  \\\n",
+       "0        OpenFE      OpenFE_noisy    -0.299344 -0.511742 -0.088689    0.008   \n",
+       "1        OpenFE  OpenFE_perturbed     0.007630 -0.015944  0.030025    0.562   \n",
+       "2  OpenFE_noisy  OpenFE_perturbed     0.306974  0.096280  0.518630    0.008   \n",
+       "\n",
+       "   significant  p-value corrected  \n",
+       "0         True              0.024  \n",
+       "1        False              0.562  \n",
+       "2         True              0.024  "
       ]
      },
      "execution_count": 7,
@@ -503,10 +496,13 @@
      "output_type": "execute_result"
     }
    ],
-   "execution_count": 7
+   "source": [
+    "comparison_df"
+   ]
   },
   {
    "cell_type": "markdown",
+   "id": "46a1d694c36bee59",
    "metadata": {},
    "source": [
     "Because we have three sources here the ``p-value corrected`` column is present along with the ``p-value`` from the bootstrap test. ``OpenFE_noisy`` — with 12× the per-edge noise — should appear in a distinct CLD group from the other two, while ``OpenFE`` and ``OpenFE_perturbed`` may share a letter if their difference is not large enough to be resolved at this sample size.\n",
@@ -514,11 +510,11 @@
     "<div class=\"alert alert-block alert-info\">\n",
     "<b>Tip:</b> When comparing only <i>two</i> sources the <code>p-value corrected</code> column is absent (no correction needed) and the raw <code>p-value</code> drives the <code>significant</code> flag directly.\n",
     "</div>"
-   ],
-   "id": "46a1d694c36bee59"
+   ]
   },
   {
    "cell_type": "markdown",
+   "id": "4d0c3678b72cf1b7",
    "metadata": {},
    "source": [
     "## Additional options\n",
@@ -544,17 +540,19 @@
     "### Adjusting the confidence level\n",
     "\n",
     "The ``confidence_level`` parameter (default ``0.95``) controls the width of the reported CIs. Pass a different value to tighten or loosen the intervals:"
-   ],
-   "id": "4d0c3678b72cf1b7"
+   ]
   },
   {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "e5f167ac67af2e25",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2026-05-19T09:21:50.857180Z",
      "start_time": "2026-05-19T09:21:49.212873Z"
     }
    },
-   "cell_type": "code",
+   "outputs": [],
    "source": [
     "summary_df, comparison_df = compare_and_rank_results(\n",
     "    femap=femap,\n",
@@ -562,14 +560,12 @@
     "    metrics_to_compute=[\"MUE\", \"RMSE\"],\n",
     "    confidence_level=0.90,  # 90 % CIs — narrower, easier to detect differences\n",
     ")"
-   ],
-   "id": "e5f167ac67af2e25",
-   "outputs": [],
-   "execution_count": 8
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
+   "id": "3265c28aba157f15",
+   "metadata": {},
    "source": [
     "## Recap\n",
     "\n",
@@ -578,8 +574,7 @@
     "- The **CLD** encodes statistical groupings: sources sharing a letter are *not* significantly different; sources with no shared letter *are* significantly different.\n",
     "- With **more than two sources**, Holm-corrected p-values appear automatically in ``p-value corrected`` and drive the ``significant`` flags.\n",
     "- Use ``prediction_type=\"nodewise\"`` only for independent per-ligand ΔG estimates (e.g. ABFE), *not* for MLE-derived values from a relative network."
-   ],
-   "id": "3265c28aba157f15"
+   ]
   }
  ],
  "metadata": {

From 9af5c5a906900c33005703994ef67c31bfb8de21 Mon Sep 17 00:00:00 2001
From: Josh Horton <Josh.Horton@newcastle.ac.uk>
Date: Tue, 19 May 2026 11:32:10 +0100
Subject: [PATCH 06/17] fix alter box add nodewise test

---
 cinnabar/tests/test_compare.py    | 14 ++++++++++++++
 examples/results_comparison.ipynb | 21 +++++++--------------
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/cinnabar/tests/test_compare.py b/cinnabar/tests/test_compare.py
index 10165dde..fa5f8a00 100644
--- a/cinnabar/tests/test_compare.py
+++ b/cinnabar/tests/test_compare.py
@@ -58,6 +58,20 @@ def test_compare_and_rank_results(fe_map):
     assert "p-value corrected" in comparison_df.columns
 
 
+def test_nodewise_comparison(fe_map):
+    # a simple test to make sure it runs
+    summary_df, comparison_df = compare_and_rank_results(
+        fe_map,
+        prediction_type="nodewise",
+        rank_metric="PI",
+    )
+    for metric in ["MUE", "RMSE", "R2", "rho", "PI", "KTAU", "RAE"]:
+        assert metric in summary_df.columns
+        for ci in ["Upper", "Lower"]:
+            assert f"{metric}_CI_{ci}" in summary_df.columns
+    summary_df.to_csv("summary_df.csv")
+
+
 def test_invalid_prediction_type(fe_map):
     with pytest.raises(ValueError, match="Invalid prediction_type: pairwise"):
         compare_and_rank_results(
diff --git a/examples/results_comparison.ipynb b/examples/results_comparison.ipynb
index 9b58eaf2..ff36c0cd 100644
--- a/examples/results_comparison.ipynb
+++ b/examples/results_comparison.ipynb
@@ -11,9 +11,7 @@
     "\n",
     "This tutorial walks through the methodology and demonstrates how to use the ``compare_and_rank_results`` function with example RBFE data.\n",
     "\n",
-    "<div class=\"alert alert-block alert-info\">\n",
-    "<b>Reference:</b> The comparison methodology implemented here is inspired by the approach described in <a href=\"https://www.nature.com/articles/s42004-025-01428-y\">Valsson <i>et al.</i></a>\n",
-    "</div>"
+    "<div class=\"alert alert-block alert-info\">**Reference:** The comparison methodology implemented here is inspired by the approach described in [Valsson *et al.*](https://www.nature.com/articles/s42004-025-01428-y)</div>"
    ]
   },
   {
@@ -21,7 +19,7 @@
    "id": "f9e3ff514773b116",
    "metadata": {},
    "source": [
-    "## Understanding the comparison method\n",
+    "|## Understanding the comparison method\n",
     "\n",
     "The comparison is built around a **joint bootstrapping procedure** that generates a distribution of differences in the chosen evaluation metric. The full workflow can be broken down into the following steps:\n",
     "\n",
@@ -40,13 +38,9 @@
    "source": [
     "## Limitations\n",
     "\n",
-    "<div class=\"alert alert-block alert-warning\">\n",
-    "<b>Edgewise independence:</b> RBFE calculations are inherently correlated because a weakly connected network is required to estimate ΔG for each ligand. Bootstrapping edges therefore tends to <i>underestimate</i> confidence intervals as the effective sample size is closer to <code>N_ligands</code> than to <code>N_edges</code> as assumed by the method.\n",
-    "</div>\n",
+    "<div class=\"alert alert-block alert-warning\">**Edgewise independence:** RBFE calculations are inherently correlated because a weakly connected network is required to estimate ΔG for each ligand. Bootstrapping edges therefore tends to *underestimate* confidence intervals as the effective sample size is closer to ``N_ligands`` than to ``N_edges`` as assumed by the method.</div>\n",
     "\n",
-    "<div class=\"alert alert-block alert-warning\">\n",
-    "<b>Nodewise correlations:</b> ΔG estimates back-calculated from ΔΔGs via the MLE estimator are not independent across ligands as the estimator uses information from all edges simultaneously. Nodewise comparisons are most appropriate for <b>absolute</b> binding free energy (ABFE) predictions, where each ligand's estimate is obtained independently.\n",
-    "</div>\n",
+    "<div class=\"alert alert-block alert-warning\">**Nodewise correlations:** ΔG estimates back-calculated from ΔΔGs via the MLE estimator are not independent across ligands as the estimator uses information from all edges simultaneously. Nodewise comparisons are most appropriate for **absolute** binding free energy (ABFE) predictions, where each ligand's estimate is obtained independently.</div>\n",
     "\n",
     "### Requirements\n",
     "\n",
@@ -507,9 +501,7 @@
    "source": [
     "Because we have three sources here the ``p-value corrected`` column is present along with the ``p-value`` from the bootstrap test. ``OpenFE_noisy`` — with 12× the per-edge noise — should appear in a distinct CLD group from the other two, while ``OpenFE`` and ``OpenFE_perturbed`` may share a letter if their difference is not large enough to be resolved at this sample size.\n",
     "\n",
-    "<div class=\"alert alert-block alert-info\">\n",
-    "<b>Tip:</b> When comparing only <i>two</i> sources the <code>p-value corrected</code> column is absent (no correction needed) and the raw <code>p-value</code> drives the <code>significant</code> flag directly.\n",
-    "</div>"
+    "<div class=\"alert alert-block alert-info\">**Tip:** When comparing only *two* sources the ``p-value corrected`` column is absent (no correction needed) and the raw ``p-value`` drives the ``significant`` flag directly.</div>"
    ]
   },
   {
@@ -573,7 +565,8 @@
     "- Two tables are returned: a **summary** table (per-source metric values, CIs, and CLD letters) and a **comparison** table (pairwise differences, CIs, and p-values).\n",
     "- The **CLD** encodes statistical groupings: sources sharing a letter are *not* significantly different; sources with no shared letter *are* significantly different.\n",
     "- With **more than two sources**, Holm-corrected p-values appear automatically in ``p-value corrected`` and drive the ``significant`` flags.\n",
-    "- Use ``prediction_type=\"nodewise\"`` only for independent per-ligand ΔG estimates (e.g. ABFE), *not* for MLE-derived values from a relative network."
+    "- Use ``prediction_type=\"nodewise\"`` only for independent per-ligand ΔG estimates (e.g. ABFE), *not* for MLE-derived values from a relative network.\n",
+    "\n"
    ]
   }
  ],

From 054fea3a2dbb77df235aa78d4d12adda92a926d9 Mon Sep 17 00:00:00 2001
From: Josh Horton <Josh.Horton@newcastle.ac.uk>
Date: Tue, 19 May 2026 11:41:15 +0100
Subject: [PATCH 07/17] render output

---
 examples/results_comparison.ipynb | 56 ++++++++++++++++++-------------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/examples/results_comparison.ipynb b/examples/results_comparison.ipynb
index ff36c0cd..dc46fb3e 100644
--- a/examples/results_comparison.ipynb
+++ b/examples/results_comparison.ipynb
@@ -19,7 +19,7 @@
    "id": "f9e3ff514773b116",
    "metadata": {},
    "source": [
-    "|## Understanding the comparison method\n",
+    "## Understanding the comparison method\n",
     "\n",
     "The comparison is built around a **joint bootstrapping procedure** that generates a distribution of differences in the chosen evaluation metric. The full workflow can be broken down into the following steps:\n",
     "\n",
@@ -75,16 +75,16 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Ligand,expt_DG,expt_dDG\r\n",
-      "CAT-13a,-8.83,0.10\r\n",
-      "CAT-13b,-9.11,0.10\r\n",
-      "CAT-13c,-9.31,0.10\r\n",
-      "CAT-13d,-10.46,0.10\r\n",
-      "CAT-13e,-9.95,0.10\r\n",
-      "CAT-13f,-9.08,0.10\r\n",
-      "CAT-13g,-9.08,0.10\r\n",
-      "CAT-13h,-9.62,0.10\r\n",
-      "CAT-13i,-9.26,0.10\r\n"
+      "Ligand,expt_DG,expt_dDG\n",
+      "CAT-13a,-8.83,0.10\n",
+      "CAT-13b,-9.11,0.10\n",
+      "CAT-13c,-9.31,0.10\n",
+      "CAT-13d,-10.46,0.10\n",
+      "CAT-13e,-9.95,0.10\n",
+      "CAT-13f,-9.08,0.10\n",
+      "CAT-13g,-9.08,0.10\n",
+      "CAT-13h,-9.62,0.10\n",
+      "CAT-13i,-9.26,0.10\n"
      ]
     }
    ],
@@ -107,16 +107,16 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Ligand1,Ligand2,calc_DDG,calc_dDDG(MBAR),calc_dDDG(additional)\r\n",
-      "CAT-13b,CAT-17g,0.36,0.11,0.0\r\n",
-      "CAT-13a,CAT-17g,-0.02,0.1,0.0\r\n",
-      "CAT-13e,CAT-17g,1.5,0.11,0.0\r\n",
-      "CAT-4m,CAT-4c,0.78,0.1,0.0\r\n",
-      "CAT-13k,CAT-4d,-0.59,0.11,0.0\r\n",
-      "CAT-24,CAT-17e,1.98,0.08,0.0\r\n",
-      "CAT-13g,CAT-17g,0.86,0.15,0.0\r\n",
-      "CAT-13d,CAT-13h,1.46,0.1,0.0\r\n",
-      "CAT-13a,CAT-17i,-0.76,0.11,0.0\r\n"
+      "Ligand1,Ligand2,calc_DDG,calc_dDDG(MBAR),calc_dDDG(additional)\n",
+      "CAT-13b,CAT-17g,0.36,0.11,0.0\n",
+      "CAT-13a,CAT-17g,-0.02,0.1,0.0\n",
+      "CAT-13e,CAT-17g,1.5,0.11,0.0\n",
+      "CAT-4m,CAT-4c,0.78,0.1,0.0\n",
+      "CAT-13k,CAT-4d,-0.59,0.11,0.0\n",
+      "CAT-24,CAT-17e,1.98,0.08,0.0\n",
+      "CAT-13g,CAT-17g,0.86,0.15,0.0\n",
+      "CAT-13d,CAT-13h,1.46,0.1,0.0\n",
+      "CAT-13a,CAT-17i,-0.76,0.11,0.0\n"
      ]
     }
    ],
@@ -499,7 +499,7 @@
    "id": "46a1d694c36bee59",
    "metadata": {},
    "source": [
-    "Because we have three sources here the ``p-value corrected`` column is present along with the ``p-value`` from the bootstrap test. ``OpenFE_noisy`` — with 12× the per-edge noise — should appear in a distinct CLD group from the other two, while ``OpenFE`` and ``OpenFE_perturbed`` may share a letter if their difference is not large enough to be resolved at this sample size.\n",
+    "Because we have three sources here the ``p-value corrected`` column is present along with the ``p-value`` from the bootstrap test. ``OpenFE_noisy`` — with 12× the per-edge noise is in a distinct CLD group from the other two, while ``OpenFE`` and ``OpenFE_perturbed`` share a letter as their difference is not large enough to be resolved at this sample size.\n",
     "\n",
     "<div class=\"alert alert-block alert-info\">**Tip:** When comparing only *two* sources the ``p-value corrected`` column is absent (no correction needed) and the raw ``p-value`` drives the ``significant`` flag directly.</div>"
    ]
@@ -572,13 +572,21 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "cinnabar",
    "language": "python",
    "name": "python3"
   },
   "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
    "name": "python",
-   "version": "3.10.0"
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,

From 8b610e994ef99008b8ac0a555f53717ae603cf84 Mon Sep 17 00:00:00 2001
From: Josh Horton <Josh.Horton@newcastle.ac.uk>
Date: Tue, 19 May 2026 12:01:20 +0100
Subject: [PATCH 08/17] fix highlights

---
 examples/results_comparison.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/results_comparison.ipynb b/examples/results_comparison.ipynb
index dc46fb3e..133819bd 100644
--- a/examples/results_comparison.ipynb
+++ b/examples/results_comparison.ipynb
@@ -271,7 +271,7 @@
     "\n",
     "- the **sample value** (metric computed on all data points),\n",
     "- ``*_CI_Lower`` / ``*_CI_Upper``: the **bootstrap confidence interval** bounds, and\n",
-    "- **``CLD``**: the compact letter display assignment.\n",
+    "- ``CLD``: the compact letter display assignment.\n",
     "\n",
     "Sources that **share a CLD letter** are *not significantly different* from each other for the chosen ``rank_metric`` (after multiple testing correction). Sources with **no letters in common** *are* significantly different. For example, three methods labelled ``\"a\"``, ``\"b\"``, and ``\"b\"`` tell you: the first and second/third are significantly different from each other, but the second is not significantly different from the third method."
    ]

From fb486eb5fd058b8963182342421f503da2808bfa Mon Sep 17 00:00:00 2001
From: Josh Horton <Josh.Horton@newcastle.ac.uk>
Date: Tue, 19 May 2026 12:04:07 +0100
Subject: [PATCH 09/17] add news

---
 news/results_ranking.rst | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 news/results_ranking.rst

diff --git a/news/results_ranking.rst b/news/results_ranking.rst
new file mode 100644
index 00000000..116e852d
--- /dev/null
+++ b/news/results_ranking.rst
@@ -0,0 +1,23 @@
+**Added:**
+
+* ``compare_and_rank_results`` function to check for statistically significant differences between multiple sources in the same FEMap `PR#174 <https://github.com/OpenFreeEnergy/cinnabar/pull/174>`_.
+
+**Changed:**
+
+* <news item>
+
+**Deprecated:**
+
+* <news item>
+
+**Removed:**
+
+* <news item>
+
+**Fixed:**
+
+* <news item>
+
+**Security:**
+
+* <news item>

From 2269b75782615c83cf52eab5c44300ed24ce6bf6 Mon Sep 17 00:00:00 2001
From: Josh Horton <Josh.Horton@newcastle.ac.uk>
Date: Fri, 12 Jun 2026 13:24:37 +0100
Subject: [PATCH 10/17] expose alpha, update tests, dont generate abs values in
 call

---
 cinnabar/compare.py            | 44 +++++++++++++++++++++-------------
 cinnabar/tests/test_compare.py | 41 +++++++++++++++++++++++++++++--
 devtools/conda-envs/env.yaml   |  1 +
 3 files changed, 67 insertions(+), 19 deletions(-)

diff --git a/cinnabar/compare.py b/cinnabar/compare.py
index d5ae7003..88310e18 100644
--- a/cinnabar/compare.py
+++ b/cinnabar/compare.py
@@ -16,6 +16,7 @@ def compare_and_rank_results(
     metrics_to_compute: list[Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU", "PI"]] | None = None,
     num_bootstraps: int = 1_000,
     confidence_level: float = 0.95,
+    alpha: float = 0.05,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """
     Compare and rank multiple result sources on a single FEMap based on the chosen performance metric and return an ordered table of
@@ -25,23 +26,27 @@ def compare_and_rank_results(
     ----------
     femap : FEMap
         An FEMap instance with results from multiple sources to compare.
-    prediction_type: Literal["nodewise", "edgewise"], optional
-        The type of prediction to evaluate, by default "edgewise".
-    rank_metric : Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU", "PI"], optional
-        The metric used to rank the models, by default "MUE".
-    metrics_to_compute : list[Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU", "PI"]], optional
-        A list of metrics to compute for each model. If None, all metrics appropriate for the `prediction_type` will be computed.
-    num_bootstraps : int, optional
-        The number of bootstrap samples to use for estimating confidence intervals, by default 1000.
-    confidence_level : float, optional
-        The confidence level for the intervals, by default 0.95.
+    prediction_type: {"nodewise", "edgewise"}, default "edgewise"
+        The type of prediction in the FEMap to evaluate.
+    rank_metric : {"MUE", "RMSE", "RAE", "R2", "rho", "KTAU", "PI"}, default "MUE"
+        The metric used to rank the models.
+    metrics_to_compute : list[{"MUE", "RMSE", "RAE", "R2", "rho", "KTAU", "PI"}] | None, default None
+        A list of metrics to compute for each model. If ``None``, all metrics appropriate for the ``prediction_type`` will be computed.
+    num_bootstraps : int, default 1000
+        The number of bootstrap samples to use for estimating confidence intervals.
+    confidence_level : float, default 0.95
+        The confidence level for the intervals.
+    alpha : float, default 0.05
+        The significance level (Type I error probability) for determining statistical significance
+        in pairwise comparisons. Lower values are more conservative and require stronger evidence for significance.
 
     Note
     ----
     - The comparison method uses a joint bootstrapping procedure that generates a distribution of differences in the rank metric and checks for significant differences using a method inspired by. [1]_
     - Each source must be evaluated on the same set of edges.
     - Prediction types "nodewise" and "edgewise" correspond to DGs and edgewise DDGs respectively.
-    - When we have more than 2 models, we apply multiple testing correction to the pairwise comparisons using the ``Holm`` method by default.
+    - When we have more than 2 models, we apply multiple testing correction to the pairwise comparisons using the ``Holm``
+    method to control the family-wise error rate in a low number of comparisons. For more information see https://en.wikipedia.org/wiki/Holm%E2%80%93Bonferroni_method.
 
     Returns
     -------
@@ -58,11 +63,16 @@ def compare_and_rank_results(
     # get the predictions and experimental values from the FEMaps and align them into a DataFrame
     predictions_by_key = defaultdict(dict)
     if prediction_type == "nodewise":
-        # make sure we have absolute values
-        femap.generate_absolute_values()
         abs_df = femap.get_absolute_dataframe()
         # get the computational sources we want to compare
-        sources = abs_df[abs_df["computational"] == True]["source"].unique()
+        sources = abs_df[abs_df["computational"] == True]["source"].unique().tolist()
+
+        if not sources:
+            raise ValueError(
+                "The FEMap contains no computed absolute values. "
+                "Call generate_absolute_values() first or add calculated absolute measurements directly to run nodewise comparisons."
+            )
+        
         # for each row add the node prediction
         for _, row in abs_df.iterrows():
             node_label = row["label"]
@@ -173,7 +183,7 @@ def compare_and_rank_results(
                 "CI Lower": lower,
                 "CI Upper": upper,
                 "p-value": p_value,
-                "significant": p_value < 0.05,
+                "significant": p_value < alpha,
             }
         )
     comparison_df = pd.DataFrame(comparison_data)
@@ -183,10 +193,10 @@ def compare_and_rank_results(
         from statsmodels.stats.multitest import multipletests
 
         p_values = comparison_df["p-value"].values
-        reject, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method="holm")
+        reject, pvals_corrected, _, _ = multipletests(p_values, alpha=alpha, method="holm")
         comparison_df["p-value corrected"] = pvals_corrected
         # add corrected significance
-        comparison_df["significant"] = pvals_corrected < 0.05
+        comparison_df["significant"] = pvals_corrected < alpha
 
     # rank the models and apply the CLD
     # the order depends on whether lower or higher is better for the rank metric
diff --git a/cinnabar/tests/test_compare.py b/cinnabar/tests/test_compare.py
index fa5f8a00..8dc38222 100644
--- a/cinnabar/tests/test_compare.py
+++ b/cinnabar/tests/test_compare.py
@@ -59,9 +59,32 @@ def test_compare_and_rank_results(fe_map):
 
 
 def test_nodewise_comparison(fe_map):
+    np.random.seed(42)
+
+    compare_map = FEMap()
+    for m in fe_map:
+        if m.computational:
+            # add the result with a new source
+            compare_map.add_relative_calculation(
+                labelA=m.labelA, labelB=m.labelB, value=m.DG, uncertainty=m.uncertainty, source="original"
+            )
+            # add the data again under a second source with some noise added
+            compare_map.add_relative_calculation(
+                labelA=m.labelA,
+                labelB=m.labelB,
+                value=np.random.normal(m.DG.m, m.uncertainty.m) * unit.kilocalorie_per_mole,
+                uncertainty=m.uncertainty,
+                source="perturbed",
+            )
+        else:
+            # add the experimental data
+            compare_map.add_measurement(m)
+
+    # add the absolute values to the map
+    compare_map.generate_absolute_values()
     # a simple test to make sure it runs
     summary_df, comparison_df = compare_and_rank_results(
-        fe_map,
+        compare_map,
         prediction_type="nodewise",
         rank_metric="PI",
     )
@@ -69,7 +92,13 @@ def test_nodewise_comparison(fe_map):
         assert metric in summary_df.columns
         for ci in ["Upper", "Lower"]:
             assert f"{metric}_CI_{ci}" in summary_df.columns
-    summary_df.to_csv("summary_df.csv")
+        # check that original and perturbed are ranked the same
+        assert summary_df[summary_df["Model"] == "MLE(original)"]["CLD"].values[0] == "a"
+        assert summary_df[summary_df["Model"] == "MLE(perturbed)"]["CLD"].values[0] == "a"
+        # check that the comparison table has a single comparison between the two models
+        assert len(comparison_df) == 1
+        # as we have two methods we do not need to correct the p-values
+        assert "p-value corrected" not in comparison_df.columns
 
 
 def test_invalid_prediction_type(fe_map):
@@ -132,3 +161,11 @@ def test_missing_rank_metric(fe_map):
         rank_metric="MUE",
         metrics_to_compute=["RMSE"],  # miss the rank metric from the compute list and it should still work
     )
+
+
+def test_missing_node_values(fe_map):
+    with pytest.raises(ValueError, match="The FEMap contains no computed absolute values. "):
+        _, _ = compare_and_rank_results(
+            fe_map,
+            prediction_type="nodewise",
+        )
\ No newline at end of file
diff --git a/devtools/conda-envs/env.yaml b/devtools/conda-envs/env.yaml
index 3c2da9d5..d705d54c 100644
--- a/devtools/conda-envs/env.yaml
+++ b/devtools/conda-envs/env.yaml
@@ -14,6 +14,7 @@ dependencies:
   - adjusttext
   - openff-units
   - duecredit<0.10
+  - statsmodels
 
   # format
   - black

From d0360573efe38f07a41742132821f8da181ab915 Mon Sep 17 00:00:00 2001
From: Josh Horton <Josh.Horton@newcastle.ac.uk>
Date: Fri, 12 Jun 2026 13:48:36 +0100
Subject: [PATCH 11/17] fix notebook rendering

---
 cinnabar/compare.py               |  4 ++--
 examples/results_comparison.ipynb | 28 +++++++++++++++++++---------
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/cinnabar/compare.py b/cinnabar/compare.py
index 88310e18..72864a65 100644
--- a/cinnabar/compare.py
+++ b/cinnabar/compare.py
@@ -46,7 +46,7 @@ def compare_and_rank_results(
     - Each source must be evaluated on the same set of edges.
     - Prediction types "nodewise" and "edgewise" correspond to DGs and edgewise DDGs respectively.
     - When we have more than 2 models, we apply multiple testing correction to the pairwise comparisons using the ``Holm``
-    method to control the family-wise error rate in a low number of comparisons. For more information see https://en.wikipedia.org/wiki/Holm%E2%80%93Bonferroni_method.
+          method to control the family-wise error rate in a low number of comparisons. For more information see https://en.wikipedia.org/wiki/Holm%E2%80%93Bonferroni_method.
 
     Returns
     -------
@@ -72,7 +72,7 @@ def compare_and_rank_results(
                 "The FEMap contains no computed absolute values. "
                 "Call generate_absolute_values() first or add calculated absolute measurements directly to run nodewise comparisons."
             )
-        
+
         # for each row add the node prediction
         for _, row in abs_df.iterrows():
             node_label = row["label"]
diff --git a/examples/results_comparison.ipynb b/examples/results_comparison.ipynb
index 133819bd..dce2193f 100644
--- a/examples/results_comparison.ipynb
+++ b/examples/results_comparison.ipynb
@@ -11,7 +11,9 @@
     "\n",
     "This tutorial walks through the methodology and demonstrates how to use the ``compare_and_rank_results`` function with example RBFE data.\n",
     "\n",
-    "<div class=\"alert alert-block alert-info\">**Reference:** The comparison methodology implemented here is inspired by the approach described in [Valsson *et al.*](https://www.nature.com/articles/s42004-025-01428-y)</div>"
+    "<div class=\"alert alert-block alert-info\">\n",
+    "<b>Note</b> The comparison methodology implemented here is inspired by the approach described in <a href=\"https://www.nature.com/articles/s42004-025-01428-y\">Valsson *et al.*</a>.\n",
+    "</div>"
    ]
   },
   {
@@ -38,9 +40,13 @@
    "source": [
     "## Limitations\n",
     "\n",
-    "<div class=\"alert alert-block alert-warning\">**Edgewise independence:** RBFE calculations are inherently correlated because a weakly connected network is required to estimate ΔG for each ligand. Bootstrapping edges therefore tends to *underestimate* confidence intervals as the effective sample size is closer to ``N_ligands`` than to ``N_edges`` as assumed by the method.</div>\n",
+    "<div class=\"alert alert-block alert-warning\">\n",
+    "<b>Important</b> RBFE calculations are inherently correlated because a weakly connected network is required to estimate ΔG for each ligand. Bootstrapping edges therefore tends to <b>underestimate</b> confidence intervals as the effective sample size is closer to <b>N_ligands</b> than to <b>N_edges</b> as assumed by the method.\n",
+    "</div>\n",
     "\n",
-    "<div class=\"alert alert-block alert-warning\">**Nodewise correlations:** ΔG estimates back-calculated from ΔΔGs via the MLE estimator are not independent across ligands as the estimator uses information from all edges simultaneously. Nodewise comparisons are most appropriate for **absolute** binding free energy (ABFE) predictions, where each ligand's estimate is obtained independently.</div>\n",
+    "<div class=\"alert alert-block alert-warning\">\n",
+    "<b>Important</b> ΔG estimates back-calculated from ΔΔGs via the MLE estimator are not independent across ligands as the estimator uses information from all edges simultaneously. Nodewise comparisons are most appropriate for <b>absolute</b> binding free energy (ABFE) predictions, where each ligand's estimate is obtained independently.\n",
+    "</div>\n",
     "\n",
     "### Requirements\n",
     "\n",
@@ -55,7 +61,7 @@
    "source": [
     "## Loading example RBFE data\n",
     "\n",
-    "For this tutorial we use RBFE data generated with [OpenFE](https://docs.openfree.energy/en/latest/) which is included with ``cinnabar``'s test suite. You can replace these files with your own outputs. See the [FEMap tutorial](cinnabar-api.ipynb) for details on building an ``FEMap`` from different data formats.\n",
+    "For this tutorial we use RBFE data generated with [OpenFE](https://docs.openfree.energy/en/latest/) which is included with ``cinnabar``'s test suite. You can replace these files with your own outputs. See the {doc}`cinnabar API tutorial <cinnabar-api>` for details on building an ``FEMap`` from different data formats.\n",
     "\n",
     "The two data files have a simple tabular format:"
    ]
@@ -235,7 +241,8 @@
     "| ``rank_metric`` | The metric used to rank and compare methods.                                         |\n",
     "| ``metrics_to_compute`` | Metrics reported in the summary table. Defaults to ``[\"MUE\", \"RMSE\"]`` for edgewise. |\n",
     "| ``num_bootstraps`` | Number of joint bootstrap resamples (1 000 is a sensible default).                   |\n",
-    "| ``confidence_level`` | Width of the reported confidence intervals, default ``0.95`` (95 %).                 |\n"
+    "| ``confidence_level`` | Width of the reported confidence intervals, default ``0.95`` (95 %).                 |\n",
+    "| ``alpha``      | The significance level for determining statistical significance, default ``0.05``.       |\n"
    ]
   },
   {
@@ -257,6 +264,7 @@
     "    metrics_to_compute=[\"MUE\", \"RMSE\"],  # metrics to report in the summary table\n",
     "    num_bootstraps=1_000,\n",
     "    confidence_level=0.95,  # report 95 % confidence intervals\n",
+    "    alpha=0.05,  # for statistical significance testing (e.g. p-value threshold)\n",
     ")"
    ]
   },
@@ -390,7 +398,7 @@
     "- ``CI Lower`` / ``CI Upper``: bootstrap confidence interval around that difference,\n",
     "- ``p-value``: two-sided bootstrap p-value,\n",
     "- ``p-value corrected``: Holm-corrected p-value *(appears automatically when >2 sources are present)*,\n",
-    "- ``significant``: ``True`` if the (corrected) p-value is below 0.05."
+    "- ``significant``: ``True`` if the (corrected) p-value is below ``alpha``."
    ]
   },
   {
@@ -501,7 +509,9 @@
    "source": [
     "Because we have three sources here the ``p-value corrected`` column is present along with the ``p-value`` from the bootstrap test. ``OpenFE_noisy`` — with 12× the per-edge noise is in a distinct CLD group from the other two, while ``OpenFE`` and ``OpenFE_perturbed`` share a letter as their difference is not large enough to be resolved at this sample size.\n",
     "\n",
-    "<div class=\"alert alert-block alert-info\">**Tip:** When comparing only *two* sources the ``p-value corrected`` column is absent (no correction needed) and the raw ``p-value`` drives the ``significant`` flag directly.</div>"
+    "<div class=\"alert alert-block alert-info\">\n",
+    "<b>Tip</b> When comparing only <b>two</b> sources the <b>p-value corrected</b> column is absent (no correction needed) and the raw <b>p-value</b> drives the <b>significant</b> flag directly.\n",
+    "</div>"
    ]
   },
   {
@@ -572,7 +582,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "cinnabar",
+   "display_name": "openfe_dev",
    "language": "python",
    "name": "python3"
   },
@@ -586,7 +596,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.13.5"
+   "version": "3.12.9"
   }
  },
  "nbformat": 4,

From ef8d23043351ef6895087d96f6b5c18484bc2f08 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 12 Jun 2026 12:49:42 +0000
Subject: [PATCH 12/17] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 cinnabar/tests/test_compare.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cinnabar/tests/test_compare.py b/cinnabar/tests/test_compare.py
index 8dc38222..84df97e2 100644
--- a/cinnabar/tests/test_compare.py
+++ b/cinnabar/tests/test_compare.py
@@ -168,4 +168,4 @@ def test_missing_node_values(fe_map):
         _, _ = compare_and_rank_results(
             fe_map,
             prediction_type="nodewise",
-        )
\ No newline at end of file
+        )

From 404a5d3b0d297f070b30cd6b71f0f004adfffe05 Mon Sep 17 00:00:00 2001
From: Josh Horton <Josh.Horton@newcastle.ac.uk>
Date: Fri, 12 Jun 2026 17:14:32 +0100
Subject: [PATCH 13/17] fix mypy

---
 cinnabar/compare.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cinnabar/compare.py b/cinnabar/compare.py
index 72864a65..6fee9e31 100644
--- a/cinnabar/compare.py
+++ b/cinnabar/compare.py
@@ -61,7 +61,7 @@ def compare_and_rank_results(
     """
 
     # get the predictions and experimental values from the FEMaps and align them into a DataFrame
-    predictions_by_key = defaultdict(dict)
+    predictions_by_key:dict[tuple[str, str] | str, dict[str, float]] = defaultdict(dict)
     if prediction_type == "nodewise":
         abs_df = femap.get_absolute_dataframe()
         # get the computational sources we want to compare
@@ -252,7 +252,7 @@ def _apply_cld(comparison_df: pd.DataFrame, ordered_labels: list[str]) -> dict[s
     sig = _build_significance_lookup(comparison_df)
 
     # Each letter is represented as a set of methods which are not significantly different
-    letters = []
+    letters: list[set[str]] = []
 
     def is_compatible(meth, let):
         """Check if method can join this letter set"""

From 179126a42eb911cff5a11597cf4fe52135fdf1eb Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 15 Jun 2026 10:17:23 +0000
Subject: [PATCH 14/17] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 cinnabar/compare.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cinnabar/compare.py b/cinnabar/compare.py
index 6fee9e31..d734ad6f 100644
--- a/cinnabar/compare.py
+++ b/cinnabar/compare.py
@@ -61,7 +61,7 @@ def compare_and_rank_results(
     """
 
     # get the predictions and experimental values from the FEMaps and align them into a DataFrame
-    predictions_by_key:dict[tuple[str, str] | str, dict[str, float]] = defaultdict(dict)
+    predictions_by_key: dict[tuple[str, str] | str, dict[str, float]] = defaultdict(dict)
     if prediction_type == "nodewise":
         abs_df = femap.get_absolute_dataframe()
         # get the computational sources we want to compare

From d70c014db97d0e34228186fdf7b147ed06a22b79 Mon Sep 17 00:00:00 2001
From: Josh Horton <Josh.Horton@newcastle.ac.uk>
Date: Tue, 16 Jun 2026 11:49:49 +0100
Subject: [PATCH 15/17] PR feedback, add bounds check tests

---
 cinnabar/compare.py            | 15 ++++++++++++---
 cinnabar/tests/test_compare.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/cinnabar/compare.py b/cinnabar/compare.py
index d734ad6f..d6428abe 100644
--- a/cinnabar/compare.py
+++ b/cinnabar/compare.py
@@ -59,13 +59,20 @@ def compare_and_rank_results(
     ----------
     .. [1] https://www.nature.com/articles/s42004-025-01428-y
     """
+    # bounds checks on the input parameters
+    if not isinstance(num_bootstraps, int) or num_bootstraps < 1:
+        raise ValueError("num_bootstraps must be an integer >= 1.")
+    if not isinstance(confidence_level, (int, float)) or not (0 < confidence_level < 1):
+        raise ValueError("confidence_level must be a number between 0 and 1 (exclusive).")
+    if not isinstance(alpha, (int, float)) or not (0 < alpha < 1):
+        raise ValueError("alpha must be a number between 0 and 1 (exclusive).")
 
     # get the predictions and experimental values from the FEMaps and align them into a DataFrame
     predictions_by_key: dict[tuple[str, str] | str, dict[str, float]] = defaultdict(dict)
     if prediction_type == "nodewise":
         abs_df = femap.get_absolute_dataframe()
         # get the computational sources we want to compare
-        sources = abs_df[abs_df["computational"] == True]["source"].unique().tolist()
+        sources = abs_df[abs_df["computational"]]["source"].unique().tolist()
 
         if not sources:
             raise ValueError(
@@ -84,7 +91,7 @@ def compare_and_rank_results(
 
     elif prediction_type == "edgewise":
         rel_df = femap.get_relative_dataframe()
-        sources = rel_df[rel_df["computational"] == True]["source"].unique()
+        sources = rel_df[rel_df["computational"]]["source"].unique().tolist()
         for _, row in rel_df.iterrows():
             key = (row["labelA"], row["labelB"])
             if not row["computational"]:
@@ -118,6 +125,8 @@ def compare_and_rank_results(
 
     # we must compute the rank metric however it is possible to miss it
     if rank_metric not in metrics_to_compute:
+        # make a copy of the input list as we may need to mutate it - use shall as metrics are immutable
+        metrics_to_compute = list(metrics_to_compute)
         metrics_to_compute.append(rank_metric)
 
     # check we can compute all requested metrics
@@ -125,7 +134,7 @@ def compare_and_rank_results(
         if metric not in _AVAILABLE_STATS:
             raise ValueError(f"Metric {metric} is not available.")
 
-    metrics_by_source = dict((source, dict((metric, []) for metric in metrics_to_compute)) for source in sources)
+    metrics_by_source: dict[str, dict[str, list[float]]] = {source: {metric: [] for metric in metrics_to_compute} for source in sources}
     pairwise_metrics = {}
     # compute bootstrap metrics for each model using the same bootstrap samples, joint bootstrap
     for _ in range(num_bootstraps):
diff --git a/cinnabar/tests/test_compare.py b/cinnabar/tests/test_compare.py
index 84df97e2..13a1af68 100644
--- a/cinnabar/tests/test_compare.py
+++ b/cinnabar/tests/test_compare.py
@@ -169,3 +169,33 @@ def test_missing_node_values(fe_map):
             fe_map,
             prediction_type="nodewise",
         )
+
+
+def test_low_bootstrap_samples(fe_map):
+    with pytest.raises(ValueError, match="num_bootstraps must be an integer >= 1."):
+        _, _ = compare_and_rank_results(
+            fe_map,
+            num_bootstraps=0
+        )
+
+
+@pytest.mark.parametrize("ci", [
+    0, 1, "one"
+])
+def test_bad_ci(fe_map, ci):
+    with pytest.raises(ValueError, match="confidence_level must be a number between 0 and 1"):
+        _, _ = compare_and_rank_results(
+            fe_map,
+            confidence_level=ci,
+        )
+
+
+@pytest.mark.parametrize("alpha", [
+    0, 1, "one"
+])
+def test_bad_alpha(fe_map, alpha):
+    with pytest.raises(ValueError, match="alpha must be a number between 0 and 1"):
+        _, _ = compare_and_rank_results(
+            fe_map,
+            alpha=alpha,
+        )
\ No newline at end of file

From 5a861409737f37eeae1e1094fe6d920d52d4f4f2 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 17 Jun 2026 10:20:40 +0000
Subject: [PATCH 16/17] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 cinnabar/compare.py            |  4 +++-
 cinnabar/tests/test_compare.py | 15 ++++-----------
 2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/cinnabar/compare.py b/cinnabar/compare.py
index d6428abe..15a6a879 100644
--- a/cinnabar/compare.py
+++ b/cinnabar/compare.py
@@ -134,7 +134,9 @@ def compare_and_rank_results(
         if metric not in _AVAILABLE_STATS:
             raise ValueError(f"Metric {metric} is not available.")
 
-    metrics_by_source: dict[str, dict[str, list[float]]] = {source: {metric: [] for metric in metrics_to_compute} for source in sources}
+    metrics_by_source: dict[str, dict[str, list[float]]] = {
+        source: {metric: [] for metric in metrics_to_compute} for source in sources
+    }
     pairwise_metrics = {}
     # compute bootstrap metrics for each model using the same bootstrap samples, joint bootstrap
     for _ in range(num_bootstraps):
diff --git a/cinnabar/tests/test_compare.py b/cinnabar/tests/test_compare.py
index 13a1af68..1f7e95b2 100644
--- a/cinnabar/tests/test_compare.py
+++ b/cinnabar/tests/test_compare.py
@@ -173,15 +173,10 @@ def test_missing_node_values(fe_map):
 
 def test_low_bootstrap_samples(fe_map):
     with pytest.raises(ValueError, match="num_bootstraps must be an integer >= 1."):
-        _, _ = compare_and_rank_results(
-            fe_map,
-            num_bootstraps=0
-        )
+        _, _ = compare_and_rank_results(fe_map, num_bootstraps=0)
 
 
-@pytest.mark.parametrize("ci", [
-    0, 1, "one"
-])
+@pytest.mark.parametrize("ci", [0, 1, "one"])
 def test_bad_ci(fe_map, ci):
     with pytest.raises(ValueError, match="confidence_level must be a number between 0 and 1"):
         _, _ = compare_and_rank_results(
@@ -190,12 +185,10 @@ def test_bad_ci(fe_map, ci):
         )
 
 
-@pytest.mark.parametrize("alpha", [
-    0, 1, "one"
-])
+@pytest.mark.parametrize("alpha", [0, 1, "one"])
 def test_bad_alpha(fe_map, alpha):
     with pytest.raises(ValueError, match="alpha must be a number between 0 and 1"):
         _, _ = compare_and_rank_results(
             fe_map,
             alpha=alpha,
-        )
\ No newline at end of file
+        )

From 8354a91b1141fba004e27429580dd2e56e90ffb9 Mon Sep 17 00:00:00 2001
From: Josh Horton <Josh.Horton@newcastle.ac.uk>
Date: Wed, 17 Jun 2026 11:27:59 +0100
Subject: [PATCH 17/17] update info and warning boxes

---
 examples/results_comparison.ipynb | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/results_comparison.ipynb b/examples/results_comparison.ipynb
index dce2193f..06143c3c 100644
--- a/examples/results_comparison.ipynb
+++ b/examples/results_comparison.ipynb
@@ -12,7 +12,7 @@
     "This tutorial walks through the methodology and demonstrates how to use the ``compare_and_rank_results`` function with example RBFE data.\n",
     "\n",
     "<div class=\"alert alert-block alert-info\">\n",
-    "<b>Note</b> The comparison methodology implemented here is inspired by the approach described in <a href=\"https://www.nature.com/articles/s42004-025-01428-y\">Valsson *et al.*</a>.\n",
+    "<b>Note:</b> The comparison methodology implemented here is inspired by the approach described in <a href=\"https://www.nature.com/articles/s42004-025-01428-y\">Valsson *et al.*</a>.\n",
     "</div>"
    ]
   },
@@ -41,11 +41,11 @@
     "## Limitations\n",
     "\n",
     "<div class=\"alert alert-block alert-warning\">\n",
-    "<b>Important</b> RBFE calculations are inherently correlated because a weakly connected network is required to estimate ΔG for each ligand. Bootstrapping edges therefore tends to <b>underestimate</b> confidence intervals as the effective sample size is closer to <b>N_ligands</b> than to <b>N_edges</b> as assumed by the method.\n",
+    "<b>Important:</b> RBFE calculations are inherently correlated because a weakly connected network is required to estimate ΔG for each ligand. Bootstrapping edges therefore tends to <b>underestimate</b> confidence intervals as the effective sample size is closer to <b>N_ligands</b> than to <b>N_edges</b> as assumed by the method.\n",
     "</div>\n",
     "\n",
     "<div class=\"alert alert-block alert-warning\">\n",
-    "<b>Important</b> ΔG estimates back-calculated from ΔΔGs via the MLE estimator are not independent across ligands as the estimator uses information from all edges simultaneously. Nodewise comparisons are most appropriate for <b>absolute</b> binding free energy (ABFE) predictions, where each ligand's estimate is obtained independently.\n",
+    "<b>Important:</b> ΔG estimates back-calculated from ΔΔGs via the MLE estimator are not independent across ligands as the estimator uses information from all edges simultaneously. Nodewise comparisons are most appropriate for <b>absolute</b> binding free energy (ABFE) predictions, where each ligand's estimate is obtained independently.\n",
     "</div>\n",
     "\n",
     "### Requirements\n",
@@ -510,7 +510,7 @@
     "Because we have three sources here the ``p-value corrected`` column is present along with the ``p-value`` from the bootstrap test. ``OpenFE_noisy`` — with 12× the per-edge noise is in a distinct CLD group from the other two, while ``OpenFE`` and ``OpenFE_perturbed`` share a letter as their difference is not large enough to be resolved at this sample size.\n",
     "\n",
     "<div class=\"alert alert-block alert-info\">\n",
-    "<b>Tip</b> When comparing only <b>two</b> sources the <b>p-value corrected</b> column is absent (no correction needed) and the raw <b>p-value</b> drives the <b>significant</b> flag directly.\n",
+    "<b>Tip:</b> When comparing only <b>two</b> sources the <b>p-value corrected</b> column is absent (no correction needed) and the raw <b>p-value</b> drives the <b>significant</b> flag directly.\n",
     "</div>"
    ]
   },