From 639fbbe81e637d22c690655c8decbc19a391d817 Mon Sep 17 00:00:00 2001 From: Josh Horton Date: Fri, 16 Jan 2026 10:40:05 +0000 Subject: [PATCH 01/17] add compare and rank method for FEMaps --- cinnabar/compare.py | 275 +++++++++++++++++++++++++++++++++ cinnabar/stats.py | 160 ++++++++++++++++--- cinnabar/tests/test_compare.py | 48 ++++++ 3 files changed, 463 insertions(+), 20 deletions(-) create mode 100644 cinnabar/compare.py create mode 100644 cinnabar/tests/test_compare.py diff --git a/cinnabar/compare.py b/cinnabar/compare.py new file mode 100644 index 00000000..90e85ed0 --- /dev/null +++ b/cinnabar/compare.py @@ -0,0 +1,275 @@ +import string + +import numpy as np + +from cinnabar import FEMap +from typing import Literal +import pandas as pd +from collections import defaultdict +from cinnabar.stats import AVAILABLE_STATS +import itertools + + + +def compare_and_rank_femaps( + femaps: list[FEMap], + labels: list[str], + prediction_type: Literal["nodewise", "edgewise", "pairwise"] = "pairwise", + rank_metric: Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU"] = "MUE", + metrics_to_compute: list[Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU"]] | None = None, + num_bootstraps: int = 1000, + confidence_level: float = 0.95, +) -> tuple[pd.DataFrame, pd.DataFrame]: + """ + Compare and rank multiple FEMaps based on the chosen performance metric and return an ordered table of + results using the compact letter display (CLD) system. + + Parameters + ---------- + femaps : list[FEMap] + A list of FEMap instances to compare. + labels : list[str] + A list of labels corresponding to each FEMap used in the output tables. + prediction_type: Literal["nodewise", "edgewise", "pairwise"], optional + The type of prediction to evaluate, by default "pairwise". + rank_metric : Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU"], optional + The metric used to rank the models, by default "MUE". + metrics_to_compute : list[Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU"]], optional + A list of metrics to compute for each model. If None, all metrics appropriate for the `prediction_type` will be computed. + num_bootstraps : int, optional + The number of bootstrap samples to use for estimating confidence intervals, by default 1000. + confidence_level : float, optional + The confidence level for the intervals, by default 0.95. + + Note + ---- + - We assume that all FEMaps have been evaluated on the same test set. + - Prediction types "nodewise", "edgewise", and "pairwise" correspond to DGs, edgewise DDGs and pairwise DDGs (back calculated from nodewise DGs) respectively. + - When we have more than 2 models, we apply multiple testing correction to the pairwise comparisons using the ``Holm`` method by default. + + Returns + ------- + tuple[pd.DataFrame, pd.DataFrame] + A tuple containing two DataFrames: + - The first DataFrame contains the computed metrics for each model. + - The second DataFrame contains the pairwise comparison results between models. + """ + + + # get the predictions and experimental values from the FEMaps and align them into a DataFrame + predictions_by_key = defaultdict(dict) + for label, femap in zip(labels, femaps): + graph = femap.to_legacy_graph() + if prediction_type == "nodewise": + #TODO should we expose a shift or centering option here? + for node, data in graph.nodes(data=True): + predictions_by_key[node][f"{label}_calc"] = data["calc_DG"] + if "exp_DG" not in predictions_by_key[node]: + predictions_by_key[node]["exp"] = data["exp_DG"] + elif prediction_type == "edgewise": + for a, b, data in graph.edges(data=True): + # we assume all edges have been run in the same direction + key = (a, b) + predictions_by_key[key][f"{label}_calc"] = data["calc_DDG"] + if "exp_DDG" not in predictions_by_key[key]: + predictions_by_key[key]["exp"] = data["exp_DDG"] + elif prediction_type == "pairwise": + nodes = list(graph.nodes()) + for a, b in itertools.combinations(nodes, 2): + exp = graph.nodes[b]["exp_DG"] - graph.nodes[a]["exp_DG"] + calc = graph.nodes[b]["calc_DG"] - graph.nodes[a]["calc_DG"] + key = (a, b) + predictions_by_key[key][f"{label}_calc"] = calc + if "exp" not in predictions_by_key[key]: + predictions_by_key[key]["exp"] = exp + else: + raise ValueError(f"Invalid prediction_type: {prediction_type}") + + predictions_df = pd.DataFrame(predictions_by_key.values()) + + # set metrics to compute based on best practices if not provided + if metrics_to_compute is None: + if prediction_type in ["pairwise", "edgewise"]: + metrics_to_compute = ["MUE", "RMSE"] + elif prediction_type == "nodewise": + metrics_to_compute = ["MUE", "RMSE", "RAE", "R2", "rho", "KTAU"] + + # check we can compute all requested metrics + for metric in metrics_to_compute: + if metric not in AVAILABLE_STATS: + raise ValueError(f"Metric {metric} is not available.") + + metrics_by_label = dict((label, dict((metric, []) for metric in metrics_to_compute)) for label in labels) + pairwise_metrics = {} + # compute bootstrap metrics for each model using the same bootstrap samples, joint bootstrap + for _ in range(num_bootstraps): + bootstrap_sample = predictions_df.sample(frac=1.0, replace=True) + for label in labels: + y_true = bootstrap_sample["exp"].values + y_pred = bootstrap_sample[f"{label}_calc"].values + for metric in metrics_to_compute: + value = AVAILABLE_STATS[metric](y_true, y_pred) + metrics_by_label[label][metric].append(value) + + # compute pairwise differences for ranking metric + for i, label_i in enumerate(labels): + for j, label_j in enumerate(labels): + if j <= i: + continue + diffs = np.array(metrics_by_label[label_i][rank_metric]) - np.array(metrics_by_label[label_j][rank_metric]) + pairwise_metrics[(label_i, label_j)] = diffs + + # summarize all metrics + summary_data = [] + for label in labels: + row = {"Model": label} + for metric in metrics_to_compute: + # compute the sample metric + x = predictions_df[f"{label}_calc"].values + y = predictions_df["exp"].values + sample_value = AVAILABLE_STATS[metric](y, x) + bootstrap_values = np.array(metrics_by_label[label][metric]) + lower = np.percentile(bootstrap_values, (1 - confidence_level) / 2 * 100) + upper = np.percentile(bootstrap_values, (1 + confidence_level) / 2 * 100) + row[f"{metric}"] = sample_value + row[f"{metric}_CI_Lower"] = lower + row[f"{metric}_CI_Upper"] = upper + summary_data.append(row) + summary_df = pd.DataFrame(summary_data) + + # summarize pairwise comparisons with corrected p-values + comparison_data = [] + for (label_i, label_j), diffs in pairwise_metrics.items(): + lower = np.percentile(diffs, (1 - confidence_level) / 2 * 100) + upper = np.percentile(diffs, (1 + confidence_level) / 2 * 100) + # calculate the p-value as the fraction of bootstrap samples that cross zero + # use a 2-tailed test + # inspired by https://www.nature.com/articles/s42004-025-01428-y + p_value = 2 * min(np.mean(diffs < 0), np.mean(diffs > 0)) + comparison_data.append({ + "Model 1": label_i, + "Model 2": label_j, + f"Diff in {rank_metric}": summary_df[summary_df["Model"] == label_i][rank_metric].values[0] - summary_df[summary_df["Model"] == label_j][rank_metric].values[0], + "CI Lower": lower, + "CI Upper": upper, + "p-value": p_value, + "significant": p_value < 0.05 + }) + comparison_df = pd.DataFrame(comparison_data) + + # if we have more than 2 models, apply multiple testing correction + if len(labels) > 2: + from statsmodels.stats.multitest import multipletests + p_values = comparison_df["p-value"].values + reject, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method="holm") + comparison_df["p-value corrected"] = pvals_corrected + # add corrected significance + comparison_df["significant"] = pvals_corrected < 0.05 + + # rank the models and apply the CLD + # the order depends on whether lower or higher is better for the rank metric + if rank_metric in ["MUE", "RMSE", "RAE"]: + ascending = True # lower is better + else: + ascending = False # higher is better + ordered_labels = summary_df.sort_values(by=rank_metric, ascending=ascending).Model.tolist() + cld_assignment = _apply_cld(comparison_df, ordered_labels) + summary_df["CLD"] = summary_df["Model"].map(cld_assignment) + + return summary_df, comparison_df + + +def _build_significance_lookup(pairwise_df: pd.DataFrame): + """ + Build a lookup dictionary for significance between model pairs. + + Parameters + ---------- + pairwise_df : pd.DataFrame + DataFrame containing pairwise comparison results with a 'significant' column. + + Returns + ------- + dict[frozenset, bool] + A dictionary mapping frozensets of model pairs to their significance (True if significant, False otherwise). + """ + sig = {} + for _, row in pairwise_df.iterrows(): + m1, m2 = row["Model 1"], row["Model 2"] + sig[frozenset((m1, m2))] = row.significant + return sig + + +def _apply_cld(comparison_df: pd.DataFrame, ordered_labels: list[str]) -> dict[str, str]: + """ + Apply the Compact Letter Display (CLD) system to the pairwise comparison results using the insert-absorb method. + + Parameters + ---------- + comparison_df : pd.DataFrame + DataFrame containing pairwise comparison results with a 'significant' column. + ordered_labels : list[str] + List of model labels sorted by metric performance (best to worst). + + Returns + ------- + dict[str, str] + A dictionary mapping each model label to its assigned CLD letters. + """ + + sig = _build_significance_lookup(comparison_df) + + # Each letter is represented as a set of methods which are not significantly different + letters = [] + + def is_compatible(meth, let): + """Check if method can join this letter set""" + for other in let: + if sig.get(frozenset((meth, other)), False): + return False + return True + + def absorb(lets): + """Remove letters that are strict subsets of others""" + absorbed = [] + for i, a in enumerate(lets): + redundant = False + for j, b in enumerate(lets): + if i != j and a < b: # a is a strict subset of b + redundant = True + break + if not redundant: + absorbed.append(a) + return absorbed + + # For each method in order, try to insert into existing letters or create new letter + for method in ordered_labels: + inserted = False + + # Try inserting into existing letters + for letter in letters: + if is_compatible(method, letter): + letter.add(method) + inserted = True + + # If no insertion possible, create new letter + if not inserted: + letters.append({method}) + + # Absorb step + letters = absorb(letters) + + # Convert to method → string mapping + alphabet = string.ascii_lowercase + if len(letters) > len(alphabet): + raise ValueError("Too many letters required for CLD") + + result = {m: "" for m in ordered_labels} + for i, letter in enumerate(letters): + for m in letter: + result[m] += alphabet[i] + + return result + + + diff --git a/cinnabar/stats.py b/cinnabar/stats.py index 7a864c3c..6aa1e216 100644 --- a/cinnabar/stats.py +++ b/cinnabar/stats.py @@ -15,6 +15,144 @@ ) +def calculate_rmse( + y_true: np.ndarray, + y_pred: np.ndarray, +) -> float: + """Compute root mean squared error between true and predicted values. + + Parameters + ---------- + y_true : ndarray with shape (N,) + True values + y_pred : ndarray with shape (N,) + Predicted values + + Returns + ------- + rmse : float + RMSE between true and predicted values + """ + return np.sqrt(sklearn.metrics.mean_squared_error(y_true, y_pred)) + + +def calculate_mue( + y_true: np.ndarray, + y_pred: np.ndarray, +) -> float: + """Compute mean unsigned error between true and predicted values. + + Parameters + ---------- + y_true : ndarray with shape (N,) + True values + y_pred : ndarray with shape (N,) + Predicted values + + Returns + ------- + mue : float + MUE between true and predicted values + """ + return sklearn.metrics.mean_absolute_error(y_true, y_pred) + + +def calculate_rae( + y_true: np.ndarray, + y_pred: np.ndarray, +) -> float: + """Compute relative absolute error between true and predicted values. + + Parameters + ---------- + y_true : ndarray with shape (N,) + True values + y_pred : ndarray with shape (N,) + Predicted values + + Returns + ------- + rae : float + RAE between true and predicted values + """ + MAE = sklearn.metrics.mean_absolute_error(y_true, y_pred) + mean = np.mean(y_true) + MAD = np.sum([np.abs(mean - i) for i in y_true]) / float(len(y_true)) + return MAE / MAD + + +def calculate_r2( + y_true: np.ndarray, + y_pred: np.ndarray, +) -> float: + """Compute R^2 between true and predicted values. + + Parameters + ---------- + y_true : ndarray with shape (N,) + True values + y_pred : ndarray with shape (N,) + Predicted values + + Returns + ------- + r2 : float + R^2 between true and predicted values + """ + _, _, r_value, _, _ = scipy.stats.linregress(y_true, y_pred) + return r_value**2 + + +def calculate_pearson_r( + y_true: np.ndarray, + y_pred: np.ndarray, +) -> float: + """Compute Pearson's r between true and predicted values. + + Parameters + ---------- + y_true : ndarray with shape (N,) + True values + y_pred : ndarray with shape (N,) + Predicted values + + Returns + ------- + r : float + Pearson's r between true and predicted values + """ + return scipy.stats.pearsonr(y_true, y_pred)[0] + + +def calculate_kendalls_tau( + y_true: np.ndarray, + y_pred: np.ndarray, +) -> float: + """Compute Kendall's tau between true and predicted values. + + Parameters + ---------- + y_true : ndarray with shape (N,) + True values + y_pred : ndarray with shape (N,) + Predicted values + + Returns + ------- + tau : float + Kendall's tau between true and predicted values + """ + return scipy.stats.kendalltau(y_true, y_pred)[0] + +AVAILABLE_STATS = { + "RMSE": calculate_rmse, + "MUE": calculate_mue, + "RAE": calculate_rae, + "R2": calculate_r2, + "rho": calculate_pearson_r, + "KTAU": calculate_kendalls_tau, +} + def bootstrap_statistic( y_true: np.ndarray, y_pred: np.ndarray, @@ -75,12 +213,6 @@ def compute_statistic(y_true_sample: np.ndarray, y_pred_sample: np.ndarray, stat """ - def calc_RAE(y_true_sample: np.ndarray, y_pred_sample: np.ndarray): - MAE = sklearn.metrics.mean_absolute_error(y_true_sample, y_pred_sample) - mean = np.mean(y_true_sample) - MAD = np.sum([np.abs(mean - i) for i in y_true_sample]) / float(len(y_true_sample)) - return MAE / MAD - def calc_RRMSE(y_true_sample: np.ndarray, y_pred_sample: np.ndarray): rmse = np.sqrt(sklearn.metrics.mean_squared_error(y_true_sample, y_pred_sample)) mean_exp = np.mean(y_true_sample) @@ -88,21 +220,9 @@ def calc_RRMSE(y_true_sample: np.ndarray, y_pred_sample: np.ndarray): rrmse = np.sqrt(rmse**2 / mds) return rrmse - if statistic == "RMSE": - return np.sqrt(sklearn.metrics.mean_squared_error(y_true_sample, y_pred_sample)) - elif statistic == "MUE": - return sklearn.metrics.mean_absolute_error(y_true_sample, y_pred_sample) - elif statistic == "R2": - slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(y_true_sample, y_pred_sample) - return r_value**2 - elif statistic == "rho": - return scipy.stats.pearsonr(y_true_sample, y_pred_sample)[0] - elif statistic == "RAE": - return calc_RAE(y_true_sample, y_pred_sample) - elif statistic == "KTAU": - return scipy.stats.kendalltau(y_true_sample, y_pred_sample)[0] - else: + if statistic not in AVAILABLE_STATS: raise Exception("unknown statistic '{}'".format(statistic)) + return AVAILABLE_STATS[statistic](y_true_sample, y_pred_sample) # not used? def unique_differences(x): diff --git a/cinnabar/tests/test_compare.py b/cinnabar/tests/test_compare.py new file mode 100644 index 00000000..866f1733 --- /dev/null +++ b/cinnabar/tests/test_compare.py @@ -0,0 +1,48 @@ +from cinnabar.compare import compare_and_rank_femaps +import networkx as nx +import numpy as np +from cinnabar import FEMap + + +def test_compare_and_rank_femaps(fe_map): + graph1 = nx.MultiDiGraph() + graph2 = nx.MultiDiGraph() + for a, b, data in fe_map.to_networkx().edges(data=True): + new_data = data.copy() + if data["source"] != "reverse" and data["computational"]: + # add noise to the result + new_result = data["DG"] + np.random.normal(0, data["uncertainty"].m) * data["DG"].u + new_data["DG"] = new_result + graph1.add_edge(a, b, **new_data) + # and add the reverse edge + rev_data = new_data.copy() + rev_data["source"] = "reverse" + rev_data["DG"] = -new_data["DG"] + graph1.add_edge(b, a, **rev_data) + # add a large value to the second graph to simulate a bad prediction + new_result2 = data["DG"] + 1.5 * data["DG"].u + new_data["DG"] = new_result2 + graph2.add_edge(a, b, **new_data) + # add the reverse edge + rev_data2 = new_data.copy() + rev_data2["source"] = "reverse" + rev_data2["DG"] = -new_data["DG"] + graph2.add_edge(b, a, **rev_data2) + + else: + graph1.add_edge(a, b, **data) + graph2.add_edge(a, b, **data) + fe_map_2 = FEMap.from_networkx(graph1) + fe_map_3 = FEMap.from_networkx(graph2) + + t1, t2 = compare_and_rank_femaps([fe_map, fe_map_2, fe_map_3], ["FE Map 1", "FE Map 2", "FE Map 3"], prediction_type="nodewise", rank_metric="rho") + print(t1) + print(t2) + # check that FE Map 3 is ranked worst + assert t1[t1["Model"] == "FE Map 3"]["CLD"].values[0] == "b" + # check that FE Map 1 and FE Map 2 are ranked better + assert t1[t1["Model"] == "FE Map 1"]["CLD"].values[0] == "a" + assert t1[t1["Model"] == "FE Map 2"]["CLD"].values[0] == "a" + # check that the comparison table has all three models and corrected p-values + assert len(t2) == 3 + assert "p-value corrected" in t2.columns From 1ab43807334cfaaff0d338d6110c3e96b40b8f48 Mon Sep 17 00:00:00 2001 From: Josh Horton Date: Fri, 16 Jan 2026 11:13:01 +0000 Subject: [PATCH 02/17] fix pairwise comparisons, remove prints --- cinnabar/compare.py | 3 ++- cinnabar/tests/test_compare.py | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cinnabar/compare.py b/cinnabar/compare.py index 90e85ed0..b770e500 100644 --- a/cinnabar/compare.py +++ b/cinnabar/compare.py @@ -74,7 +74,8 @@ def compare_and_rank_femaps( if "exp_DDG" not in predictions_by_key[key]: predictions_by_key[key]["exp"] = data["exp_DDG"] elif prediction_type == "pairwise": - nodes = list(graph.nodes()) + # incase the networks nodes are in a different order, sort them first to get consistent pairs + nodes = sorted(list(graph.nodes())) for a, b in itertools.combinations(nodes, 2): exp = graph.nodes[b]["exp_DG"] - graph.nodes[a]["exp_DG"] calc = graph.nodes[b]["calc_DG"] - graph.nodes[a]["calc_DG"] diff --git a/cinnabar/tests/test_compare.py b/cinnabar/tests/test_compare.py index 866f1733..2dee2876 100644 --- a/cinnabar/tests/test_compare.py +++ b/cinnabar/tests/test_compare.py @@ -36,8 +36,7 @@ def test_compare_and_rank_femaps(fe_map): fe_map_3 = FEMap.from_networkx(graph2) t1, t2 = compare_and_rank_femaps([fe_map, fe_map_2, fe_map_3], ["FE Map 1", "FE Map 2", "FE Map 3"], prediction_type="nodewise", rank_metric="rho") - print(t1) - print(t2) + # check that FE Map 3 is ranked worst assert t1[t1["Model"] == "FE Map 3"]["CLD"].values[0] == "b" # check that FE Map 1 and FE Map 2 are ranked better From b0dbba7c119a10e6109a3935a9e5e1c294219011 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 16 Jan 2026 11:13:49 +0000 Subject: [PATCH 03/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- cinnabar/compare.py | 39 +++++++++++++++++----------------- cinnabar/stats.py | 2 ++ cinnabar/tests/test_compare.py | 10 +++++++-- 3 files changed, 29 insertions(+), 22 deletions(-) diff --git a/cinnabar/compare.py b/cinnabar/compare.py index 90e85ed0..b8338f49 100644 --- a/cinnabar/compare.py +++ b/cinnabar/compare.py @@ -1,14 +1,13 @@ +import itertools import string +from collections import defaultdict +from typing import Literal import numpy as np +import pandas as pd from cinnabar import FEMap -from typing import Literal -import pandas as pd -from collections import defaultdict from cinnabar.stats import AVAILABLE_STATS -import itertools - def compare_and_rank_femaps( @@ -55,13 +54,12 @@ def compare_and_rank_femaps( - The second DataFrame contains the pairwise comparison results between models. """ - # get the predictions and experimental values from the FEMaps and align them into a DataFrame predictions_by_key = defaultdict(dict) for label, femap in zip(labels, femaps): graph = femap.to_legacy_graph() if prediction_type == "nodewise": - #TODO should we expose a shift or centering option here? + # TODO should we expose a shift or centering option here? for node, data in graph.nodes(data=True): predictions_by_key[node][f"{label}_calc"] = data["calc_DG"] if "exp_DG" not in predictions_by_key[node]: @@ -99,7 +97,7 @@ def compare_and_rank_femaps( if metric not in AVAILABLE_STATS: raise ValueError(f"Metric {metric} is not available.") - metrics_by_label = dict((label, dict((metric, []) for metric in metrics_to_compute)) for label in labels) + metrics_by_label = dict((label, dict((metric, []) for metric in metrics_to_compute)) for label in labels) pairwise_metrics = {} # compute bootstrap metrics for each model using the same bootstrap samples, joint bootstrap for _ in range(num_bootstraps): @@ -146,20 +144,24 @@ def compare_and_rank_femaps( # use a 2-tailed test # inspired by https://www.nature.com/articles/s42004-025-01428-y p_value = 2 * min(np.mean(diffs < 0), np.mean(diffs > 0)) - comparison_data.append({ - "Model 1": label_i, - "Model 2": label_j, - f"Diff in {rank_metric}": summary_df[summary_df["Model"] == label_i][rank_metric].values[0] - summary_df[summary_df["Model"] == label_j][rank_metric].values[0], - "CI Lower": lower, - "CI Upper": upper, - "p-value": p_value, - "significant": p_value < 0.05 - }) + comparison_data.append( + { + "Model 1": label_i, + "Model 2": label_j, + f"Diff in {rank_metric}": summary_df[summary_df["Model"] == label_i][rank_metric].values[0] + - summary_df[summary_df["Model"] == label_j][rank_metric].values[0], + "CI Lower": lower, + "CI Upper": upper, + "p-value": p_value, + "significant": p_value < 0.05, + } + ) comparison_df = pd.DataFrame(comparison_data) # if we have more than 2 models, apply multiple testing correction if len(labels) > 2: from statsmodels.stats.multitest import multipletests + p_values = comparison_df["p-value"].values reject, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method="holm") comparison_df["p-value corrected"] = pvals_corrected @@ -270,6 +272,3 @@ def absorb(lets): result[m] += alphabet[i] return result - - - diff --git a/cinnabar/stats.py b/cinnabar/stats.py index 6aa1e216..a2346b08 100644 --- a/cinnabar/stats.py +++ b/cinnabar/stats.py @@ -144,6 +144,7 @@ def calculate_kendalls_tau( """ return scipy.stats.kendalltau(y_true, y_pred)[0] + AVAILABLE_STATS = { "RMSE": calculate_rmse, "MUE": calculate_mue, @@ -153,6 +154,7 @@ def calculate_kendalls_tau( "KTAU": calculate_kendalls_tau, } + def bootstrap_statistic( y_true: np.ndarray, y_pred: np.ndarray, diff --git a/cinnabar/tests/test_compare.py b/cinnabar/tests/test_compare.py index 866f1733..6d284efb 100644 --- a/cinnabar/tests/test_compare.py +++ b/cinnabar/tests/test_compare.py @@ -1,7 +1,8 @@ -from cinnabar.compare import compare_and_rank_femaps import networkx as nx import numpy as np + from cinnabar import FEMap +from cinnabar.compare import compare_and_rank_femaps def test_compare_and_rank_femaps(fe_map): @@ -35,7 +36,12 @@ def test_compare_and_rank_femaps(fe_map): fe_map_2 = FEMap.from_networkx(graph1) fe_map_3 = FEMap.from_networkx(graph2) - t1, t2 = compare_and_rank_femaps([fe_map, fe_map_2, fe_map_3], ["FE Map 1", "FE Map 2", "FE Map 3"], prediction_type="nodewise", rank_metric="rho") + t1, t2 = compare_and_rank_femaps( + [fe_map, fe_map_2, fe_map_3], + ["FE Map 1", "FE Map 2", "FE Map 3"], + prediction_type="nodewise", + rank_metric="rho", + ) print(t1) print(t2) # check that FE Map 3 is ranked worst From 739ba9907051e35e8a4808a023a66a1ddd30a303 Mon Sep 17 00:00:00 2001 From: Josh Horton Date: Tue, 19 May 2026 10:33:54 +0100 Subject: [PATCH 04/17] rework to use a single femap, remove pairwise, add more tests and a tutorial --- cinnabar/compare.py | 155 +++--- cinnabar/tests/test_compare.py | 167 +++++-- docs/api.rst | 1 + docs/tutorials/index.rst | 1 + docs/tutorials/results_comparison.ipynb | 1 + examples/results_comparison.ipynb | 598 ++++++++++++++++++++++++ 6 files changed, 808 insertions(+), 115 deletions(-) create mode 120000 docs/tutorials/results_comparison.ipynb create mode 100644 examples/results_comparison.ipynb diff --git a/cinnabar/compare.py b/cinnabar/compare.py index c2cd5dc7..c76e7e9f 100644 --- a/cinnabar/compare.py +++ b/cinnabar/compare.py @@ -1,4 +1,3 @@ -import itertools import string from collections import defaultdict from typing import Literal @@ -7,33 +6,30 @@ import pandas as pd from cinnabar import FEMap -from cinnabar.stats import AVAILABLE_STATS +from cinnabar.stats import _AVAILABLE_STATS -def compare_and_rank_femaps( - femaps: list[FEMap], - labels: list[str], - prediction_type: Literal["nodewise", "edgewise", "pairwise"] = "pairwise", - rank_metric: Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU"] = "MUE", - metrics_to_compute: list[Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU"]] | None = None, - num_bootstraps: int = 1000, +def compare_and_rank_results( + femap: FEMap, + prediction_type: Literal["nodewise", "edgewise"] = "edgewise", + rank_metric: Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU", "PI"] = "MUE", + metrics_to_compute: list[Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU","PI"]] | None = None, + num_bootstraps: int = 1_000, confidence_level: float = 0.95, ) -> tuple[pd.DataFrame, pd.DataFrame]: """ - Compare and rank multiple FEMaps based on the chosen performance metric and return an ordered table of + Compare and rank multiple result sources on a single FEMap based on the chosen performance metric and return an ordered table of results using the compact letter display (CLD) system. Parameters ---------- - femaps : list[FEMap] - A list of FEMap instances to compare. - labels : list[str] - A list of labels corresponding to each FEMap used in the output tables. - prediction_type: Literal["nodewise", "edgewise", "pairwise"], optional - The type of prediction to evaluate, by default "pairwise". - rank_metric : Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU"], optional + femap : FEMap + An FEMap instance with results from multiple sources to compare. + prediction_type: Literal["nodewise", "edgewise"], optional + The type of prediction to evaluate, by default "edgewise". + rank_metric : Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU", "PI"], optional The metric used to rank the models, by default "MUE". - metrics_to_compute : list[Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU"]], optional + metrics_to_compute : list[Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU", "PI"]], optional A list of metrics to compute for each model. If None, all metrics appropriate for the `prediction_type` will be computed. num_bootstraps : int, optional The number of bootstrap samples to use for estimating confidence intervals, by default 1000. @@ -42,8 +38,9 @@ def compare_and_rank_femaps( Note ---- - - We assume that all FEMaps have been evaluated on the same test set. - - Prediction types "nodewise", "edgewise", and "pairwise" correspond to DGs, edgewise DDGs and pairwise DDGs (back calculated from nodewise DGs) respectively. + - The comparison method uses a joint bootstrapping procedure that generates a distribution of differences in the rank metric and checks for significant differences using a method inspired by. [1]_ + - Each source must be evaluated on the same set of edges. + - Prediction types "nodewise" and "edgewise" correspond to DGs and edgewise DDGs respectively. - When we have more than 2 models, we apply multiple testing correction to the pairwise comparisons using the ``Holm`` method by default. Returns @@ -52,82 +49,100 @@ def compare_and_rank_femaps( A tuple containing two DataFrames: - The first DataFrame contains the computed metrics for each model. - The second DataFrame contains the pairwise comparison results between models. + + References + ---------- + .. [1] https://www.nature.com/articles/s42004-025-01428-y """ # get the predictions and experimental values from the FEMaps and align them into a DataFrame predictions_by_key = defaultdict(dict) - for label, femap in zip(labels, femaps): - graph = femap.to_legacy_graph() - if prediction_type == "nodewise": - # TODO should we expose a shift or centering option here? - for node, data in graph.nodes(data=True): - predictions_by_key[node][f"{label}_calc"] = data["calc_DG"] - if "exp_DG" not in predictions_by_key[node]: - predictions_by_key[node]["exp"] = data["exp_DG"] - elif prediction_type == "edgewise": - for a, b, data in graph.edges(data=True): - # we assume all edges have been run in the same direction - key = (a, b) - predictions_by_key[key][f"{label}_calc"] = data["calc_DDG"] - if "exp_DDG" not in predictions_by_key[key]: - predictions_by_key[key]["exp"] = data["exp_DDG"] - elif prediction_type == "pairwise": - # incase the networks nodes are in a different order, sort them first to get consistent pairs - nodes = sorted(list(graph.nodes())) - for a, b in itertools.combinations(nodes, 2): - exp = graph.nodes[b]["exp_DG"] - graph.nodes[a]["exp_DG"] - calc = graph.nodes[b]["calc_DG"] - graph.nodes[a]["calc_DG"] - key = (a, b) - predictions_by_key[key][f"{label}_calc"] = calc - if "exp" not in predictions_by_key[key]: - predictions_by_key[key]["exp"] = exp - else: - raise ValueError(f"Invalid prediction_type: {prediction_type}") + if prediction_type == "nodewise": + # make sure we have absolute values + femap.generate_absolute_values() + abs_df = femap.get_absolute_dataframe() + # get the computational sources we want to compare + sources = abs_df[abs_df["computational"] == True]["source"].unique() + # for each row add the node prediction + for _, row in abs_df.iterrows(): + node_label = row["label"] + if not row["computational"]: + source_label = "exp" + else: + source_label = f"{row["source"]}_calc" + predictions_by_key[node_label][source_label] = row["DG (kcal/mol)"] + + elif prediction_type == "edgewise": + rel_df = femap.get_relative_dataframe() + sources =rel_df[rel_df["computational"] == True]["source"].unique() + for _, row in rel_df.iterrows(): + key = (row["labelA"], row["labelB"]) + if not row["computational"]: + source_label = "exp" + else: + source_label = f"{row["source"]}_calc" + predictions_by_key[key][source_label] = row["DDG (kcal/mol)"] + else: + raise ValueError(f"Invalid prediction_type: {prediction_type}") predictions_df = pd.DataFrame(predictions_by_key.values()) + # check that we have experimental values for all edges + if "exp" not in predictions_df.columns: + raise ValueError("Experimental values are required to rank the results.") + + # check that we have the same number of values for all sources + for source in sources: + # check if any values for this source are missing + if any(predictions_df[f"{source}_calc"].isna()): + raise ValueError(f"Missing predictions for source {source}, all sources must have the same number of predictions.") + # set metrics to compute based on best practices if not provided if metrics_to_compute is None: - if prediction_type in ["pairwise", "edgewise"]: + if prediction_type == "edgewise": metrics_to_compute = ["MUE", "RMSE"] elif prediction_type == "nodewise": - metrics_to_compute = ["MUE", "RMSE", "RAE", "R2", "rho", "KTAU"] + metrics_to_compute = ["MUE", "RMSE", "RAE", "R2", "rho", "KTAU", "PI"] + + # we must compute the rank metric however it is possible to miss it + if rank_metric not in metrics_to_compute: + metrics_to_compute.append(rank_metric) # check we can compute all requested metrics for metric in metrics_to_compute: - if metric not in AVAILABLE_STATS: + if metric not in _AVAILABLE_STATS: raise ValueError(f"Metric {metric} is not available.") - metrics_by_label = dict((label, dict((metric, []) for metric in metrics_to_compute)) for label in labels) + metrics_by_source = dict((source, dict((metric, []) for metric in metrics_to_compute)) for source in sources) pairwise_metrics = {} # compute bootstrap metrics for each model using the same bootstrap samples, joint bootstrap for _ in range(num_bootstraps): bootstrap_sample = predictions_df.sample(frac=1.0, replace=True) - for label in labels: + for source in sources: y_true = bootstrap_sample["exp"].values - y_pred = bootstrap_sample[f"{label}_calc"].values + y_pred = bootstrap_sample[f"{source}_calc"].values for metric in metrics_to_compute: - value = AVAILABLE_STATS[metric](y_true, y_pred) - metrics_by_label[label][metric].append(value) + value = _AVAILABLE_STATS[metric](y_true, y_pred) + metrics_by_source[source][metric].append(value) # compute pairwise differences for ranking metric - for i, label_i in enumerate(labels): - for j, label_j in enumerate(labels): + for i, source_i in enumerate(sources): + for j, source_j in enumerate(sources): if j <= i: continue - diffs = np.array(metrics_by_label[label_i][rank_metric]) - np.array(metrics_by_label[label_j][rank_metric]) - pairwise_metrics[(label_i, label_j)] = diffs + diffs = np.array(metrics_by_source[source_i][rank_metric]) - np.array(metrics_by_source[source_j][rank_metric]) + pairwise_metrics[(source_i, source_j)] = diffs # summarize all metrics summary_data = [] - for label in labels: - row = {"Model": label} + for source in sources: + row = {"Model": source} for metric in metrics_to_compute: # compute the sample metric - x = predictions_df[f"{label}_calc"].values + x = predictions_df[f"{source}_calc"].values y = predictions_df["exp"].values - sample_value = AVAILABLE_STATS[metric](y, x) - bootstrap_values = np.array(metrics_by_label[label][metric]) + sample_value = _AVAILABLE_STATS[metric](y, x) + bootstrap_values = np.array(metrics_by_source[source][metric]) lower = np.percentile(bootstrap_values, (1 - confidence_level) / 2 * 100) upper = np.percentile(bootstrap_values, (1 + confidence_level) / 2 * 100) row[f"{metric}"] = sample_value @@ -138,7 +153,7 @@ def compare_and_rank_femaps( # summarize pairwise comparisons with corrected p-values comparison_data = [] - for (label_i, label_j), diffs in pairwise_metrics.items(): + for (source_i, source_j), diffs in pairwise_metrics.items(): lower = np.percentile(diffs, (1 - confidence_level) / 2 * 100) upper = np.percentile(diffs, (1 + confidence_level) / 2 * 100) # calculate the p-value as the fraction of bootstrap samples that cross zero @@ -147,10 +162,10 @@ def compare_and_rank_femaps( p_value = 2 * min(np.mean(diffs < 0), np.mean(diffs > 0)) comparison_data.append( { - "Model 1": label_i, - "Model 2": label_j, - f"Diff in {rank_metric}": summary_df[summary_df["Model"] == label_i][rank_metric].values[0] - - summary_df[summary_df["Model"] == label_j][rank_metric].values[0], + "Model 1": source_i, + "Model 2": source_j, + f"Diff in {rank_metric}": summary_df[summary_df["Model"] == source_i][rank_metric].values[0] + - summary_df[summary_df["Model"] == source_j][rank_metric].values[0], "CI Lower": lower, "CI Upper": upper, "p-value": p_value, @@ -160,7 +175,7 @@ def compare_and_rank_femaps( comparison_df = pd.DataFrame(comparison_data) # if we have more than 2 models, apply multiple testing correction - if len(labels) > 2: + if len(sources) > 2: from statsmodels.stats.multitest import multipletests p_values = comparison_df["p-value"].values diff --git a/cinnabar/tests/test_compare.py b/cinnabar/tests/test_compare.py index 8f55d3de..0e8c6094 100644 --- a/cinnabar/tests/test_compare.py +++ b/cinnabar/tests/test_compare.py @@ -1,53 +1,130 @@ -import networkx as nx import numpy as np +import pytest from cinnabar import FEMap -from cinnabar.compare import compare_and_rank_femaps - - -def test_compare_and_rank_femaps(fe_map): - graph1 = nx.MultiDiGraph() - graph2 = nx.MultiDiGraph() - for a, b, data in fe_map.to_networkx().edges(data=True): - new_data = data.copy() - if data["source"] != "reverse" and data["computational"]: - # add noise to the result - new_result = data["DG"] + np.random.normal(0, data["uncertainty"].m) * data["DG"].u - new_data["DG"] = new_result - graph1.add_edge(a, b, **new_data) - # and add the reverse edge - rev_data = new_data.copy() - rev_data["source"] = "reverse" - rev_data["DG"] = -new_data["DG"] - graph1.add_edge(b, a, **rev_data) - # add a large value to the second graph to simulate a bad prediction - new_result2 = data["DG"] + 1.5 * data["DG"].u - new_data["DG"] = new_result2 - graph2.add_edge(a, b, **new_data) - # add the reverse edge - rev_data2 = new_data.copy() - rev_data2["source"] = "reverse" - rev_data2["DG"] = -new_data["DG"] - graph2.add_edge(b, a, **rev_data2) +from cinnabar.compare import compare_and_rank_results +from openff.units import unit + + +def test_compare_and_rank_results(fe_map): + np.random.seed(42) + + compare_map = FEMap() + for m in fe_map: + if m.computational: + # add the result with a new source + compare_map.add_relative_calculation( + labelA=m.labelA, + labelB=m.labelB, + value=m.DG, + uncertainty=m.uncertainty, + source="original" + ) + # add the data again under a second source with some noise added + compare_map.add_relative_calculation( + labelA=m.labelA, + labelB=m.labelB, + value=np.random.normal(m.DG.m, m.uncertainty.m) * unit.kilocalorie_per_mole, + uncertainty=m.uncertainty, + source="perturbed" + ) + # add a third set of data with a lot of noise to get a significant difference + compare_map.add_relative_calculation( + labelA=m.labelA, + labelB=m.labelB, + value=np.random.normal(m.DG.m, 12.0 * m.uncertainty.m) * unit.kilocalorie_per_mole, + uncertainty=m.uncertainty, + source="noisy" + ) else: - graph1.add_edge(a, b, **data) - graph2.add_edge(a, b, **data) - fe_map_2 = FEMap.from_networkx(graph1) - fe_map_3 = FEMap.from_networkx(graph2) - - t1, t2 = compare_and_rank_femaps( - [fe_map, fe_map_2, fe_map_3], - ["FE Map 1", "FE Map 2", "FE Map 3"], - prediction_type="nodewise", - rank_metric="rho", + # add the experimental data + compare_map.add_measurement(m) + + summary_df, comparison_df = compare_and_rank_results( + compare_map, + prediction_type="edgewise", + rank_metric="MUE", + metrics_to_compute=["MUE", "RMSE"], ) - # check that FE Map 3 is ranked worst - assert t1[t1["Model"] == "FE Map 3"]["CLD"].values[0] == "b" - # check that FE Map 1 and FE Map 2 are ranked better - assert t1[t1["Model"] == "FE Map 1"]["CLD"].values[0] == "a" - assert t1[t1["Model"] == "FE Map 2"]["CLD"].values[0] == "a" + # make sure the MUE and RMSE have been calculated and recorded in the summary table + for metric in ["MUE", "RMSE"]: + assert metric in summary_df.columns + for ci in ["Upper", "Lower"]: + assert f"{metric}_CI_{ci}" in summary_df.columns + # check that noisy is ranked worst + assert summary_df[summary_df["Model"] == "noisy"]["CLD"].values[0] == "b" + # check that original and perturbed are ranked better + assert summary_df[summary_df["Model"] == "original"]["CLD"].values[0] == "a" + assert summary_df[summary_df["Model"] == "perturbed"]["CLD"].values[0] == "a" # check that the comparison table has all three models and corrected p-values - assert len(t2) == 3 - assert "p-value corrected" in t2.columns + assert len(comparison_df) == 3 + assert "p-value corrected" in comparison_df.columns + + +def test_invalid_prediction_type(fe_map): + with pytest.raises(ValueError, match="Invalid prediction_type: pairwise"): + compare_and_rank_results( + fe_map, + prediction_type="pairwise", + rank_metric="MUE", + metrics_to_compute=["MUE", "RMSE"], + ) + + +def test_missing_experimental_data(fe_map): + new_map = FEMap() + for m in fe_map: + if m.computational: + new_map.add_measurement(m) + + with pytest.raises(ValueError, match="Experimental values are required to rank the results."): + compare_and_rank_results( + new_map, + ) + + +def test_missing_source_data(fe_map): + new_map = FEMap() + for i, m in enumerate(fe_map): + if not m.computational: + new_map.add_measurement(m) + else: + new_map.add_relative_calculation( + labelA=m.labelA, + labelB=m.labelB, + value=m.DG, + uncertainty=m.uncertainty, + source="original" + ) + # for even I add the second source as well + if i % 2 == 0: + new_map.add_relative_calculation( + labelA=m.labelA, + labelB=m.labelB, + value=m.DG, + uncertainty=m.uncertainty, + source="perturbed" + ) + + with pytest.raises(ValueError, match="Missing predictions for source perturbed, all sources must have the same number of predictions."): + compare_and_rank_results( + new_map, + ) + + +def test_bad_metric(fe_map): + with pytest.raises(ValueError, match="Metric bad_metric is not available."): + compare_and_rank_results( + fe_map, + rank_metric="bad_metric", + ) + + +def test_missing_rank_metric(fe_map): + _, _ = compare_and_rank_results( + fe_map, + rank_metric="MUE", + metrics_to_compute=["RMSE"], # miss the rank metric from the compute list and it should still work + ) \ No newline at end of file diff --git a/docs/api.rst b/docs/api.rst index e1446938..0823b3cd 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -6,6 +6,7 @@ API Documentation cinnabar.classification_metrics cinnabar.cli + cinnabar.compare cinnabar.conversion cinnabar.estimators cinnabar.femap diff --git a/docs/tutorials/index.rst b/docs/tutorials/index.rst index 4880d7fb..39b4cd13 100644 --- a/docs/tutorials/index.rst +++ b/docs/tutorials/index.rst @@ -14,3 +14,4 @@ API Tutorials :maxdepth: 1 cinnabar-api + results_comparison diff --git a/docs/tutorials/results_comparison.ipynb b/docs/tutorials/results_comparison.ipynb new file mode 120000 index 00000000..adf9f0f3 --- /dev/null +++ b/docs/tutorials/results_comparison.ipynb @@ -0,0 +1 @@ +../../examples/results_comparison.ipynb \ No newline at end of file diff --git a/examples/results_comparison.ipynb b/examples/results_comparison.ipynb new file mode 100644 index 00000000..e755343a --- /dev/null +++ b/examples/results_comparison.ipynb @@ -0,0 +1,598 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Results comparison framework for benchmarking using ``Cinnabar``\n", + "\n", + "Comparing two sets of free energy predictions by eye is deceptively difficult. Two methods can show different RMSE or MUE values on a plot yet be statistically indistinguishable given the size of the dataset. ``Cinnabar`` addresses this with a rigorous comparison framework built on a **joint bootstrapping** method to build a distribution of metric differences and identify statistically significant differences between methods.\n", + "\n", + "This tutorial walks through the methodology and demonstrates how to use the ``compare_and_rank_results`` function with example RBFE data.\n", + "\n", + "
\n", + "Reference: The comparison methodology implemented here is inspired by the approach described in Valsson et al.\n", + "
" + ], + "id": "f07f14d1b152062e" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Understanding the comparison method\n", + "\n", + "The comparison is built around a **joint bootstrapping procedure** that generates a distribution of differences in the chosen evaluation metric. The full workflow can be broken down into the following steps:\n", + "\n", + "1. **Define** the evaluation metric (e.g. RMSE, MUE) and the sets of results to compare — either ``edgewise`` (ΔΔG) or ``nodewise`` (ΔG).\n", + "2. **Bootstrap jointly** draw paired resamples of the data and recompute the evaluation metric for every source on the *same* resample. This preserves the correlation structure between methods and avoids artificially inflating differences.\n", + "3. **Compute two-sided p-values** from the fraction of bootstrap differences that cross zero.\n", + "4. **Apply the Holm multiple testing correction** when comparing more than two sets of results.\n", + "5. **Report confidence intervals** for the metric differences directly from the bootstrap distribution.\n", + "6. **Rank and group** results using the compact letter display (CLD) via the insert-absorb algorithm. Methods that share a letter are *not significantly different*; methods that share no letters *are* significantly different." + ], + "id": "f9e3ff514773b116" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Limitations\n", + "\n", + "
\n", + "Edgewise independence: RBFE calculations are inherently correlated because a weakly connected network is required to estimate ΔG for each ligand. Bootstrapping edges therefore tends to underestimate confidence intervals as the effective sample size is closer to N_ligands than to N_edges as assumed by the method.\n", + "
\n", + "\n", + "
\n", + "Nodewise correlations: ΔG estimates back-calculated from ΔΔGs via the MLE estimator are not independent across ligands as the estimator uses information from all edges simultaneously. Nodewise comparisons are most appropriate for absolute binding free energy (ABFE) predictions, where each ligand's estimate is obtained independently.\n", + "
\n", + "\n", + "### Requirements\n", + "\n", + "- Experimental measurements must be available for every ligand in the ``FEMap``.\n", + "- All sources must provide predictions for exactly the **same** set of ligands or ligand pairs." + ], + "id": "88669d7f9af4abf8" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading example RBFE data\n", + "\n", + "For this tutorial we use RBFE data generated with [OpenFE](https://docs.openfree.energy/en/latest/) which is included with ``cinnabar``'s test suite. You can replace these files with your own outputs. See the [FEMap tutorial](cinnabar-api.ipynb) for details on building an ``FEMap`` from different data formats.\n", + "\n", + "The two data files have a simple tabular format:" + ], + "id": "35186bc4db8fa6ca" + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-19T09:21:46.574932Z", + "start_time": "2026-05-19T09:21:46.423124Z" + } + }, + "source": [ + "! head ../cinnabar/data/experimental_data.csv" + ], + "id": "1d4764ff8e34299c", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ligand,expt_DG,expt_dDG\r\n", + "CAT-13a,-8.83,0.10\r\n", + "CAT-13b,-9.11,0.10\r\n", + "CAT-13c,-9.31,0.10\r\n", + "CAT-13d,-10.46,0.10\r\n", + "CAT-13e,-9.95,0.10\r\n", + "CAT-13f,-9.08,0.10\r\n", + "CAT-13g,-9.08,0.10\r\n", + "CAT-13h,-9.62,0.10\r\n", + "CAT-13i,-9.26,0.10\r\n" + ] + } + ], + "execution_count": 1 + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-19T09:21:46.713070Z", + "start_time": "2026-05-19T09:21:46.575967Z" + } + }, + "source": [ + "! head ../cinnabar/data/computational_data.csv" + ], + "id": "68317fab7ba5d0c8", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ligand1,Ligand2,calc_DDG,calc_dDDG(MBAR),calc_dDDG(additional)\r\n", + "CAT-13b,CAT-17g,0.36,0.11,0.0\r\n", + "CAT-13a,CAT-17g,-0.02,0.1,0.0\r\n", + "CAT-13e,CAT-17g,1.5,0.11,0.0\r\n", + "CAT-4m,CAT-4c,0.78,0.1,0.0\r\n", + "CAT-13k,CAT-4d,-0.59,0.11,0.0\r\n", + "CAT-24,CAT-17e,1.98,0.08,0.0\r\n", + "CAT-13g,CAT-17g,0.86,0.15,0.0\r\n", + "CAT-13d,CAT-13h,1.46,0.1,0.0\r\n", + "CAT-13a,CAT-17i,-0.76,0.11,0.0\r\n" + ] + } + ], + "execution_count": 2 + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-19T09:21:47.429987Z", + "start_time": "2026-05-19T09:21:46.714383Z" + } + }, + "source": [ + "# fmt: off\n", + "from cinnabar.compare import compare_and_rank_results\n", + "from cinnabar.femap import FEMap\n", + "\n", + "%matplotlib inline\n", + "import numpy as np\n", + "import pandas as pd\n", + "from openff.units import unit\n", + "\n", + "femap = FEMap()\n", + "\n", + "# load the computational results (source 1)\n", + "rbfe_results = pd.read_csv(\"../cinnabar/data/computational_data.csv\")\n", + "\n", + "for _, result in rbfe_results.iterrows():\n", + " femap.add_relative_calculation(\n", + " labelA=result[\"Ligand1\"],\n", + " labelB=result[\"Ligand2\"],\n", + " value=result[\"calc_DDG\"] * unit.kilocalorie_per_mole,\n", + " uncertainty=result[\"calc_dDDG(MBAR)\"] * unit.kilocalorie_per_mole,\n", + " source=\"OpenFE\",\n", + " )\n", + "\n", + "# load the experimental values\n", + "experimental_results = pd.read_csv(\"../cinnabar/data/experimental_data.csv\")\n", + "\n", + "for _, exp_row in experimental_results.iterrows():\n", + " femap.add_experimental_measurement(\n", + " label=exp_row[\"Ligand\"],\n", + " value=exp_row[\"expt_DG\"] * unit.kilocalorie_per_mole,\n", + " uncertainty=exp_row[\"expt_dDG\"] * unit.kilocalorie_per_mole,\n", + " source=\"Experimental\",\n", + " )\n", + "# fmt: on" + ], + "id": "cd9c5ddc07ceeb9", + "outputs": [], + "execution_count": 3 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating additional result sets for comparison\n", + "\n", + "In a real benchmarking study you would load a second (and third) set of calculated values here from a different force field, a different simulation protocol, or a different software package. You just need to add them to the same ``FEMap`` with a distinct ``source`` string.\n", + "\n", + "For this tutorial we generate two synthetic alternatives by perturbing the original values with random noise, giving us full control over how similar or different the methods appear." + ], + "id": "7dc78019e981e2f9" + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-19T09:21:47.502421Z", + "start_time": "2026-05-19T09:21:47.468442Z" + } + }, + "source": [ + "np.random.seed(42) # for reproducibility\n", + "\n", + "# Source 2: slight perturbation (similar accuracy to source 1)\n", + "for _, result in rbfe_results.iterrows():\n", + " femap.add_relative_calculation(\n", + " labelA=result[\"Ligand1\"],\n", + " labelB=result[\"Ligand2\"],\n", + " value=np.random.normal(\n", + " loc=result[\"calc_DDG\"],\n", + " scale=result[\"calc_dDDG(MBAR)\"]\n", + " ) * unit.kilocalorie_per_mole,\n", + " uncertainty=result[\"calc_dDDG(MBAR)\"] * unit.kilocalorie_per_mole,\n", + " source=\"OpenFE_perturbed\",\n", + " )\n", + "\n", + "# Source 3: large perturbation (noticeably worse accuracy)\n", + "for _, result in rbfe_results.iterrows():\n", + " femap.add_relative_calculation(\n", + " labelA=result[\"Ligand1\"],\n", + " labelB=result[\"Ligand2\"],\n", + " value=np.random.normal(\n", + " loc=result[\"calc_DDG\"],\n", + " scale=12.0 * result[\"calc_dDDG(MBAR)\"]\n", + " ) * unit.kilocalorie_per_mole,\n", + " uncertainty=result[\"calc_dDDG(MBAR)\"] * unit.kilocalorie_per_mole,\n", + " source=\"OpenFE_noisy\",\n", + " )" + ], + "id": "71a8a6b537f85455", + "outputs": [], + "execution_count": 4 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Running the comparison\n", + "\n", + "We now call ``compare_and_rank_results`` with the ``FEMap`` containing all three sources. Key parameters:\n", + "\n", + "| Parameter | Description |\n", + "| --- |--------------------------------------------------------------------------------------|\n", + "| ``prediction_type`` | ``\"edgewise\"`` (ΔΔG) or ``\"nodewise\"`` (ΔG) |\n", + "| ``rank_metric`` | The metric used to rank and compare methods. |\n", + "| ``metrics_to_compute`` | Metrics reported in the summary table. Defaults to ``[\"MUE\", \"RMSE\"]`` for edgewise. |\n", + "| ``num_bootstraps`` | Number of joint bootstrap resamples (1 000 is a sensible default). |\n", + "| ``confidence_level`` | Width of the reported confidence intervals, default ``0.95`` (95 %). |\n" + ], + "id": "4df80ede3e08ddf0" + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-19T09:21:49.156781Z", + "start_time": "2026-05-19T09:21:47.504921Z" + } + }, + "source": [ + "summary_df, comparison_df = compare_and_rank_results(\n", + " femap=femap,\n", + " prediction_type=\"edgewise\",\n", + " rank_metric=\"MUE\", # the metric used for ranking the results\n", + " metrics_to_compute=[\"MUE\", \"RMSE\"], # metrics to report in the summary table\n", + " num_bootstraps=1_000,\n", + " confidence_level=0.95, # report 95 % confidence intervals\n", + ")" + ], + "id": "3d987cc6b1ddf37e", + "outputs": [], + "execution_count": 5 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### The summary table\n", + "\n", + "The summary table has one row per source. For each requested metric you get:\n", + "\n", + "- the **sample value** (metric computed on all data points),\n", + "- ``*_CI_Lower`` / ``*_CI_Upper``: the **bootstrap confidence interval** bounds, and\n", + "- **``CLD``**: the compact letter display assignment.\n", + "\n", + "Sources that **share a CLD letter** are *not significantly different* from each other for the chosen ``rank_metric`` (after multiple testing correction). Sources with **no letters in common** *are* significantly different. For example, three methods labelled ``\"a\"``, ``\"b\"``, and ``\"b\"`` tell you: the first and second/third are significantly different from each other, but the second is not significantly different from the third method." + ], + "id": "48fb1d4e6b173827" + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-19T09:21:49.183272Z", + "start_time": "2026-05-19T09:21:49.157518Z" + } + }, + "source": [ + "summary_df" + ], + "id": "cb590e7befe1d980", + "outputs": [ + { + "data": { + "text/plain": [ + " Model MUE MUE_CI_Lower MUE_CI_Upper RMSE \\\n", + "0 OpenFE 0.867586 0.710513 1.021414 1.053002 \n", + "1 OpenFE_noisy 1.166930 0.971467 1.361760 1.386575 \n", + "2 OpenFE_perturbed 0.859956 0.698599 1.023484 1.056510 \n", + "\n", + " RMSE_CI_Lower RMSE_CI_Upper CLD \n", + "0 0.869701 1.221763 a \n", + "1 1.175243 1.577053 b \n", + "2 0.867684 1.228928 a " + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ModelMUEMUE_CI_LowerMUE_CI_UpperRMSERMSE_CI_LowerRMSE_CI_UpperCLD
0OpenFE0.8675860.7105131.0214141.0530020.8697011.221763a
1OpenFE_noisy1.1669300.9714671.3617601.3865751.1752431.577053b
2OpenFE_perturbed0.8599560.6985991.0234841.0565100.8676841.228928a
\n", + "
" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 6 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### The comparison table\n", + "\n", + "The comparison table has one row per unique source pair and records:\n", + "\n", + "- ``Diff in ``: observed difference in the ranking metric (source 1 minus source 2, on all data),\n", + "- ``CI Lower`` / ``CI Upper``: bootstrap confidence interval around that difference,\n", + "- ``p-value``: two-sided bootstrap p-value,\n", + "- ``p-value corrected``: Holm-corrected p-value *(appears automatically when >2 sources are present)*,\n", + "- ``significant``: ``True`` if the (corrected) p-value is below 0.05." + ], + "id": "c9ec8f1dd33a19bb" + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-19T09:21:49.211626Z", + "start_time": "2026-05-19T09:21:49.190516Z" + } + }, + "source": [ + "comparison_df" + ], + "id": "330b38a6f59bf249", + "outputs": [ + { + "data": { + "text/plain": [ + " Model 1 Model 2 Diff in MUE CI Lower CI Upper p-value \\\n", + "0 OpenFE OpenFE_noisy -0.299344 -0.511742 -0.088689 0.008 \n", + "1 OpenFE OpenFE_perturbed 0.007630 -0.015944 0.030025 0.562 \n", + "2 OpenFE_noisy OpenFE_perturbed 0.306974 0.096280 0.518630 0.008 \n", + "\n", + " significant p-value corrected \n", + "0 True 0.024 \n", + "1 False 0.562 \n", + "2 True 0.024 " + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Model 1Model 2Diff in MUECI LowerCI Upperp-valuesignificantp-value corrected
0OpenFEOpenFE_noisy-0.299344-0.511742-0.0886890.008True0.024
1OpenFEOpenFE_perturbed0.007630-0.0159440.0300250.562False0.562
2OpenFE_noisyOpenFE_perturbed0.3069740.0962800.5186300.008True0.024
\n", + "
" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 7 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Because we have three sources here the ``p-value corrected`` column is present along with the ``p-value`` from the bootstrap test. ``OpenFE_noisy`` — with 12× the per-edge noise — should appear in a distinct CLD group from the other two, while ``OpenFE`` and ``OpenFE_perturbed`` may share a letter if their difference is not large enough to be resolved at this sample size.\n", + "\n", + "
\n", + "Tip: When comparing only two sources the p-value corrected column is absent (no correction needed) and the raw p-value drives the significant flag directly.\n", + "
" + ], + "id": "46a1d694c36bee59" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Additional options\n", + "\n", + "### Choosing the right metrics\n", + "\n", + "The available metrics depend on ``prediction_type``:\n", + "\n", + "| Metric | Edgewise | Nodewise | Notes |\n", + "| --- | :---: | :---: | --- |\n", + "| ``MUE`` | ✓ | ✓ | Mean unsigned error |\n", + "| ``RMSE`` | ✓ | ✓ | Root mean squared error |\n", + "| ``RAE`` | ✓ | ✓ | Relative absolute error vs. a naïve mean predictor |\n", + "| ``R2`` | — | ✓ | R² (Pearson r²); not meaningful for relative ΔΔG data |\n", + "| ``rho`` | — | ✓ | Pearson r |\n", + "| ``KTAU`` | — | ✓ | Kendall's τ |\n", + "| ``PI`` | — | ✓ | Predictive index (Pearlman *et al.*) |\n", + "\n", + "Correlation metrics (R², ρ, KTAU, PI) are only meaningful for nodewise ΔG comparisons where the data have an absolute scale. For edgewise comparisons the sign of a ΔΔG is arbitrary, so error metrics (MUE, RMSE) are the default.\n", + "\n", + "If ``metrics_to_compute=None`` (the default), cinnabar automatically selects ``[\"MUE\", \"RMSE\"]`` for edgewise and ``[\"MUE\", \"RMSE\", \"RAE\", \"R2\", \"rho\", \"KTAU\", \"PI\"]`` for nodewise comparisons.\n", + "\n", + "### Adjusting the confidence level\n", + "\n", + "The ``confidence_level`` parameter (default ``0.95``) controls the width of the reported CIs. Pass a different value to tighten or loosen the intervals:" + ], + "id": "4d0c3678b72cf1b7" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-19T09:21:50.857180Z", + "start_time": "2026-05-19T09:21:49.212873Z" + } + }, + "cell_type": "code", + "source": [ + "summary_df, comparison_df = compare_and_rank_results(\n", + " femap=femap,\n", + " rank_metric=\"MUE\",\n", + " metrics_to_compute=[\"MUE\", \"RMSE\"],\n", + " confidence_level=0.90, # 90 % CIs — narrower, easier to detect differences\n", + ")" + ], + "id": "e5f167ac67af2e25", + "outputs": [], + "execution_count": 8 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Recap\n", + "\n", + "- ``compare_and_rank_results`` performs **joint bootstrapping** across all sources, ensuring paired, fair comparisons on the same resampled data.\n", + "- Two tables are returned: a **summary** table (per-source metric values, CIs, and CLD letters) and a **comparison** table (pairwise differences, CIs, and p-values).\n", + "- The **CLD** encodes statistical groupings: sources sharing a letter are *not* significantly different; sources with no shared letter *are* significantly different.\n", + "- With **more than two sources**, Holm-corrected p-values appear automatically in ``p-value corrected`` and drive the ``significant`` flags.\n", + "- Use ``prediction_type=\"nodewise\"`` only for independent per-ligand ΔG estimates (e.g. ABFE), *not* for MLE-derived values from a relative network." + ], + "id": "3265c28aba157f15" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 7e383ce5ca04aecf034187c5f7cc112dfcf73958 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 May 2026 09:34:57 +0000 Subject: [PATCH 05/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- cinnabar/compare.py | 16 ++- cinnabar/tests/test_compare.py | 32 ++---- examples/results_comparison.ipynb | 175 +++++++++++++++--------------- 3 files changed, 106 insertions(+), 117 deletions(-) diff --git a/cinnabar/compare.py b/cinnabar/compare.py index c76e7e9f..d5ae7003 100644 --- a/cinnabar/compare.py +++ b/cinnabar/compare.py @@ -13,7 +13,7 @@ def compare_and_rank_results( femap: FEMap, prediction_type: Literal["nodewise", "edgewise"] = "edgewise", rank_metric: Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU", "PI"] = "MUE", - metrics_to_compute: list[Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU","PI"]] | None = None, + metrics_to_compute: list[Literal["MUE", "RMSE", "RAE", "R2", "rho", "KTAU", "PI"]] | None = None, num_bootstraps: int = 1_000, confidence_level: float = 0.95, ) -> tuple[pd.DataFrame, pd.DataFrame]: @@ -69,18 +69,18 @@ def compare_and_rank_results( if not row["computational"]: source_label = "exp" else: - source_label = f"{row["source"]}_calc" + source_label = f"{row['source']}_calc" predictions_by_key[node_label][source_label] = row["DG (kcal/mol)"] elif prediction_type == "edgewise": rel_df = femap.get_relative_dataframe() - sources =rel_df[rel_df["computational"] == True]["source"].unique() + sources = rel_df[rel_df["computational"] == True]["source"].unique() for _, row in rel_df.iterrows(): key = (row["labelA"], row["labelB"]) if not row["computational"]: source_label = "exp" else: - source_label = f"{row["source"]}_calc" + source_label = f"{row['source']}_calc" predictions_by_key[key][source_label] = row["DDG (kcal/mol)"] else: raise ValueError(f"Invalid prediction_type: {prediction_type}") @@ -95,7 +95,9 @@ def compare_and_rank_results( for source in sources: # check if any values for this source are missing if any(predictions_df[f"{source}_calc"].isna()): - raise ValueError(f"Missing predictions for source {source}, all sources must have the same number of predictions.") + raise ValueError( + f"Missing predictions for source {source}, all sources must have the same number of predictions." + ) # set metrics to compute based on best practices if not provided if metrics_to_compute is None: @@ -130,7 +132,9 @@ def compare_and_rank_results( for j, source_j in enumerate(sources): if j <= i: continue - diffs = np.array(metrics_by_source[source_i][rank_metric]) - np.array(metrics_by_source[source_j][rank_metric]) + diffs = np.array(metrics_by_source[source_i][rank_metric]) - np.array( + metrics_by_source[source_j][rank_metric] + ) pairwise_metrics[(source_i, source_j)] = diffs # summarize all metrics diff --git a/cinnabar/tests/test_compare.py b/cinnabar/tests/test_compare.py index 0e8c6094..10165dde 100644 --- a/cinnabar/tests/test_compare.py +++ b/cinnabar/tests/test_compare.py @@ -1,11 +1,10 @@ import numpy as np import pytest +from openff.units import unit from cinnabar import FEMap from cinnabar.compare import compare_and_rank_results -from openff.units import unit - def test_compare_and_rank_results(fe_map): np.random.seed(42) @@ -15,11 +14,7 @@ def test_compare_and_rank_results(fe_map): if m.computational: # add the result with a new source compare_map.add_relative_calculation( - labelA=m.labelA, - labelB=m.labelB, - value=m.DG, - uncertainty=m.uncertainty, - source="original" + labelA=m.labelA, labelB=m.labelB, value=m.DG, uncertainty=m.uncertainty, source="original" ) # add the data again under a second source with some noise added compare_map.add_relative_calculation( @@ -27,7 +22,7 @@ def test_compare_and_rank_results(fe_map): labelB=m.labelB, value=np.random.normal(m.DG.m, m.uncertainty.m) * unit.kilocalorie_per_mole, uncertainty=m.uncertainty, - source="perturbed" + source="perturbed", ) # add a third set of data with a lot of noise to get a significant difference compare_map.add_relative_calculation( @@ -35,7 +30,7 @@ def test_compare_and_rank_results(fe_map): labelB=m.labelB, value=np.random.normal(m.DG.m, 12.0 * m.uncertainty.m) * unit.kilocalorie_per_mole, uncertainty=m.uncertainty, - source="noisy" + source="noisy", ) else: # add the experimental data @@ -92,23 +87,18 @@ def test_missing_source_data(fe_map): new_map.add_measurement(m) else: new_map.add_relative_calculation( - labelA=m.labelA, - labelB=m.labelB, - value=m.DG, - uncertainty=m.uncertainty, - source="original" + labelA=m.labelA, labelB=m.labelB, value=m.DG, uncertainty=m.uncertainty, source="original" ) # for even I add the second source as well if i % 2 == 0: new_map.add_relative_calculation( - labelA=m.labelA, - labelB=m.labelB, - value=m.DG, - uncertainty=m.uncertainty, - source="perturbed" + labelA=m.labelA, labelB=m.labelB, value=m.DG, uncertainty=m.uncertainty, source="perturbed" ) - with pytest.raises(ValueError, match="Missing predictions for source perturbed, all sources must have the same number of predictions."): + with pytest.raises( + ValueError, + match="Missing predictions for source perturbed, all sources must have the same number of predictions.", + ): compare_and_rank_results( new_map, ) @@ -127,4 +117,4 @@ def test_missing_rank_metric(fe_map): fe_map, rank_metric="MUE", metrics_to_compute=["RMSE"], # miss the rank metric from the compute list and it should still work - ) \ No newline at end of file + ) diff --git a/examples/results_comparison.ipynb b/examples/results_comparison.ipynb index e755343a..9b58eaf2 100644 --- a/examples/results_comparison.ipynb +++ b/examples/results_comparison.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "f07f14d1b152062e", "metadata": {}, "source": [ "# Results comparison framework for benchmarking using ``Cinnabar``\n", @@ -13,11 +14,11 @@ "
\n", "Reference: The comparison methodology implemented here is inspired by the approach described in Valsson et al.\n", "
" - ], - "id": "f07f14d1b152062e" + ] }, { "cell_type": "markdown", + "id": "f9e3ff514773b116", "metadata": {}, "source": [ "## Understanding the comparison method\n", @@ -30,11 +31,11 @@ "4. **Apply the Holm multiple testing correction** when comparing more than two sets of results.\n", "5. **Report confidence intervals** for the metric differences directly from the bootstrap distribution.\n", "6. **Rank and group** results using the compact letter display (CLD) via the insert-absorb algorithm. Methods that share a letter are *not significantly different*; methods that share no letters *are* significantly different." - ], - "id": "f9e3ff514773b116" + ] }, { "cell_type": "markdown", + "id": "88669d7f9af4abf8", "metadata": {}, "source": [ "## Limitations\n", @@ -51,11 +52,11 @@ "\n", "- Experimental measurements must be available for every ligand in the ``FEMap``.\n", "- All sources must provide predictions for exactly the **same** set of ligands or ligand pairs." - ], - "id": "88669d7f9af4abf8" + ] }, { "cell_type": "markdown", + "id": "35186bc4db8fa6ca", "metadata": {}, "source": [ "## Loading example RBFE data\n", @@ -63,21 +64,18 @@ "For this tutorial we use RBFE data generated with [OpenFE](https://docs.openfree.energy/en/latest/) which is included with ``cinnabar``'s test suite. You can replace these files with your own outputs. See the [FEMap tutorial](cinnabar-api.ipynb) for details on building an ``FEMap`` from different data formats.\n", "\n", "The two data files have a simple tabular format:" - ], - "id": "35186bc4db8fa6ca" + ] }, { "cell_type": "code", + "execution_count": 1, + "id": "1d4764ff8e34299c", "metadata": { "ExecuteTime": { "end_time": "2026-05-19T09:21:46.574932Z", "start_time": "2026-05-19T09:21:46.423124Z" } }, - "source": [ - "! head ../cinnabar/data/experimental_data.csv" - ], - "id": "1d4764ff8e34299c", "outputs": [ { "name": "stdout", @@ -96,20 +94,20 @@ ] } ], - "execution_count": 1 + "source": [ + "! head ../cinnabar/data/experimental_data.csv" + ] }, { "cell_type": "code", + "execution_count": 2, + "id": "68317fab7ba5d0c8", "metadata": { "ExecuteTime": { "end_time": "2026-05-19T09:21:46.713070Z", "start_time": "2026-05-19T09:21:46.575967Z" } }, - "source": [ - "! head ../cinnabar/data/computational_data.csv" - ], - "id": "68317fab7ba5d0c8", "outputs": [ { "name": "stdout", @@ -128,16 +126,21 @@ ] } ], - "execution_count": 2 + "source": [ + "! head ../cinnabar/data/computational_data.csv" + ] }, { "cell_type": "code", + "execution_count": 3, + "id": "cd9c5ddc07ceeb9", "metadata": { "ExecuteTime": { "end_time": "2026-05-19T09:21:47.429987Z", "start_time": "2026-05-19T09:21:46.714383Z" } }, + "outputs": [], "source": [ "# fmt: off\n", "from cinnabar.compare import compare_and_rank_results\n", @@ -173,13 +176,11 @@ " source=\"Experimental\",\n", " )\n", "# fmt: on" - ], - "id": "cd9c5ddc07ceeb9", - "outputs": [], - "execution_count": 3 + ] }, { "cell_type": "markdown", + "id": "7dc78019e981e2f9", "metadata": {}, "source": [ "## Creating additional result sets for comparison\n", @@ -187,17 +188,19 @@ "In a real benchmarking study you would load a second (and third) set of calculated values here from a different force field, a different simulation protocol, or a different software package. You just need to add them to the same ``FEMap`` with a distinct ``source`` string.\n", "\n", "For this tutorial we generate two synthetic alternatives by perturbing the original values with random noise, giving us full control over how similar or different the methods appear." - ], - "id": "7dc78019e981e2f9" + ] }, { "cell_type": "code", + "execution_count": 4, + "id": "71a8a6b537f85455", "metadata": { "ExecuteTime": { "end_time": "2026-05-19T09:21:47.502421Z", "start_time": "2026-05-19T09:21:47.468442Z" } }, + "outputs": [], "source": [ "np.random.seed(42) # for reproducibility\n", "\n", @@ -206,10 +209,7 @@ " femap.add_relative_calculation(\n", " labelA=result[\"Ligand1\"],\n", " labelB=result[\"Ligand2\"],\n", - " value=np.random.normal(\n", - " loc=result[\"calc_DDG\"],\n", - " scale=result[\"calc_dDDG(MBAR)\"]\n", - " ) * unit.kilocalorie_per_mole,\n", + " value=np.random.normal(loc=result[\"calc_DDG\"], scale=result[\"calc_dDDG(MBAR)\"]) * unit.kilocalorie_per_mole,\n", " uncertainty=result[\"calc_dDDG(MBAR)\"] * unit.kilocalorie_per_mole,\n", " source=\"OpenFE_perturbed\",\n", " )\n", @@ -219,20 +219,16 @@ " femap.add_relative_calculation(\n", " labelA=result[\"Ligand1\"],\n", " labelB=result[\"Ligand2\"],\n", - " value=np.random.normal(\n", - " loc=result[\"calc_DDG\"],\n", - " scale=12.0 * result[\"calc_dDDG(MBAR)\"]\n", - " ) * unit.kilocalorie_per_mole,\n", + " value=np.random.normal(loc=result[\"calc_DDG\"], scale=12.0 * result[\"calc_dDDG(MBAR)\"])\n", + " * unit.kilocalorie_per_mole,\n", " uncertainty=result[\"calc_dDDG(MBAR)\"] * unit.kilocalorie_per_mole,\n", " source=\"OpenFE_noisy\",\n", " )" - ], - "id": "71a8a6b537f85455", - "outputs": [], - "execution_count": 4 + ] }, { "cell_type": "markdown", + "id": "4df80ede3e08ddf0", "metadata": {}, "source": [ "## Running the comparison\n", @@ -246,17 +242,19 @@ "| ``metrics_to_compute`` | Metrics reported in the summary table. Defaults to ``[\"MUE\", \"RMSE\"]`` for edgewise. |\n", "| ``num_bootstraps`` | Number of joint bootstrap resamples (1 000 is a sensible default). |\n", "| ``confidence_level`` | Width of the reported confidence intervals, default ``0.95`` (95 %). |\n" - ], - "id": "4df80ede3e08ddf0" + ] }, { "cell_type": "code", + "execution_count": 5, + "id": "3d987cc6b1ddf37e", "metadata": { "ExecuteTime": { "end_time": "2026-05-19T09:21:49.156781Z", "start_time": "2026-05-19T09:21:47.504921Z" } }, + "outputs": [], "source": [ "summary_df, comparison_df = compare_and_rank_results(\n", " femap=femap,\n", @@ -266,13 +264,11 @@ " num_bootstraps=1_000,\n", " confidence_level=0.95, # report 95 % confidence intervals\n", ")" - ], - "id": "3d987cc6b1ddf37e", - "outputs": [], - "execution_count": 5 + ] }, { "cell_type": "markdown", + "id": "48fb1d4e6b173827", "metadata": {}, "source": [ "### The summary table\n", @@ -284,35 +280,21 @@ "- **``CLD``**: the compact letter display assignment.\n", "\n", "Sources that **share a CLD letter** are *not significantly different* from each other for the chosen ``rank_metric`` (after multiple testing correction). Sources with **no letters in common** *are* significantly different. For example, three methods labelled ``\"a\"``, ``\"b\"``, and ``\"b\"`` tell you: the first and second/third are significantly different from each other, but the second is not significantly different from the third method." - ], - "id": "48fb1d4e6b173827" + ] }, { "cell_type": "code", + "execution_count": 6, + "id": "cb590e7befe1d980", "metadata": { "ExecuteTime": { "end_time": "2026-05-19T09:21:49.183272Z", "start_time": "2026-05-19T09:21:49.157518Z" } }, - "source": [ - "summary_df" - ], - "id": "cb590e7befe1d980", "outputs": [ { "data": { - "text/plain": [ - " Model MUE MUE_CI_Lower MUE_CI_Upper RMSE \\\n", - "0 OpenFE 0.867586 0.710513 1.021414 1.053002 \n", - "1 OpenFE_noisy 1.166930 0.971467 1.361760 1.386575 \n", - "2 OpenFE_perturbed 0.859956 0.698599 1.023484 1.056510 \n", - "\n", - " RMSE_CI_Lower RMSE_CI_Upper CLD \n", - "0 0.869701 1.221763 a \n", - "1 1.175243 1.577053 b \n", - "2 0.867684 1.228928 a " - ], "text/html": [ "
\n", "