bytedance · AllanChain · Jun 13, 2026 · Jun 17, 2026
diff --git a/src/jaqmc/utils/checkpoint.py b/src/jaqmc/utils/checkpoint.py
@@ -90,7 +90,9 @@ def restore_from_file[ValueT](
         logger.info("Restored checkpoint %s", restore_path)
         return step + 1, cast(ValueT, data)
 
-    def restore[ValueT](self, fallback: ValueT) -> tuple[int, ValueT]:
+    def restore[ValueT](
+        self, fallback: ValueT, *, strict: bool = False
+    ) -> tuple[int, ValueT]:
         """Restore the latest checkpoint from ``restore_path`` if available.
 
         The manager searches for the newest ``ckpt_*.npz`` file under
@@ -101,6 +103,8 @@ def restore[ValueT](self, fallback: ValueT) -> tuple[int, ValueT]:
         Args:
             fallback: Reference PyTree to use for structure and default values
                 when no checkpoint exists or all are unreadable.
+            strict: If ``True``, raise when no matching checkpoint can be
+                restored instead of returning ``fallback``.
 
         Returns:
             A tuple ``(step, restored)``:
@@ -111,8 +115,18 @@ def restore[ValueT](self, fallback: ValueT) -> tuple[int, ValueT]:
 
         Type Parameters:
             ValueT: Reference-tree type that is preserved in the restored value.
+
+        Raises:
+            FileNotFoundError: If ``strict`` is ``True`` and ``restore_path``
+                does not exist, or no matching checkpoints are found.
+            RuntimeError: If ``strict`` is ``True`` and every matching
+                checkpoint fails to restore.
         """
         if not self.restore_path.exists():
+            if strict:
+                raise FileNotFoundError(
+                    f"Checkpoint path does not exist: {self.restore_path}"
+                )
             logger.warning("No checkpoint to restore in: %s", self.restore_path)
             return 0, fallback
         if self.restore_path.is_file():
@@ -121,6 +135,11 @@ def restore[ValueT](self, fallback: ValueT) -> tuple[int, ValueT]:
             self.restore_path.glob(f"{self.prefix}ckpt_*.npz"), reverse=True
         )
         if not ckpt_files:
+            if strict:
+                raise FileNotFoundError(
+                    "No matching checkpoints found in "
+                    f"{self.restore_path}: {self.prefix}ckpt_*.npz"
+                )
             if self.restore_path != self.save_path:
                 logger.warning(
                     "Directory exists but no matching "
@@ -129,11 +148,18 @@ def restore[ValueT](self, fallback: ValueT) -> tuple[int, ValueT]:
                     self.prefix,
                 )
             return 0, fallback
+        last_error: Exception | None = None
         for ckpt_path in ckpt_files:
             try:
                 return self.restore_from_file(ckpt_path, fallback)
-            except (OSError, EOFError, BadZipFile):
+            except (OSError, EOFError, BadZipFile) as err:
+                last_error = err
                 logger.warning("Fail to restore checkpoint %s", ckpt_path)
+        if strict:
+            raise RuntimeError(
+                "Failed to restore any checkpoint from "
+                f"{self.restore_path}: {self.prefix}ckpt_*.npz"
+            ) from last_error
         return 0, fallback
 
     def save(self, step: int, data):

diff --git a/src/jaqmc/workflow/evaluation.py b/src/jaqmc/workflow/evaluation.py
@@ -103,7 +103,7 @@ def run(self) -> None:
         # train_ckpt_*.npz in the directory.
         prefix = "" if source_path.is_file() else "train"
         restored = self.evaluation_stage.restore_checkpoint(
-            source_path, wrapper, prefix=prefix
+            source_path, wrapper, prefix=prefix, strict=True
         )
         state = replace(
             state,

diff --git a/src/jaqmc/workflow/stage/base.py b/src/jaqmc/workflow/stage/base.py
@@ -215,13 +215,16 @@ def restore_checkpoint(
         template: Any,
         *,
         prefix: str = "",
+        strict: bool = False,
     ) -> Any:
         """Restore state from a checkpoint.
 
         Args:
             checkpoint_path: Path to checkpoint file or directory.
             template: Template state for deserialization.
             prefix: Checkpoint filename prefix to match.
+            strict: If ``True``, raise when no matching checkpoint can be
+                restored.
 
         Returns:
             Restored state.

diff --git a/src/jaqmc/workflow/stage/sampling.py b/src/jaqmc/workflow/stage/sampling.py
@@ -177,20 +177,23 @@ def restore_checkpoint(
         template: Any,
         *,
         prefix: str = "",
+        strict: bool = False,
     ):
         """Restore state from a checkpoint.
 
         Args:
             checkpoint_path: Path to checkpoint file or directory.
             template: Template state for deserialization.
             prefix: Checkpoint filename prefix to match.
+            strict: If ``True``, raise when no matching checkpoint can be
+                restored.
 
         Returns:
             Restored state.
         """
         checkpoint_path = UPath(checkpoint_path)
         ckpt = NumPyCheckpointManager(checkpoint_path, checkpoint_path, prefix=prefix)
-        _, state = ckpt.restore(template)
+        _, state = ckpt.restore(template, strict=strict)
         return state
 
     def create_state(  # type: ignore[override]

diff --git a/tests/hydrogen/atom_test.py b/tests/hydrogen/atom_test.py
@@ -89,3 +89,71 @@ def test_evaluation_writes_per_step_stats_and_digest(tmp_path):
     assert digest["total_energy"].ndim == 0
     assert "energy:kinetic" in digest
     assert digest["energy:kinetic"].ndim == 0
+
+
+def test_evaluation_fails_when_source_path_is_missing(tmp_path):
+    """Evaluation raises instead of running when ``source_path`` is absent."""
+    output_dir = tmp_path / "eval-run"
+    absent_source = tmp_path / "missing-train-run"
+    workflow_section = {
+        "save_path": str(output_dir),
+        "batch_size": 128,
+        "source_path": str(absent_source),
+    }
+    config = ConfigManager({"workflow": workflow_section, "run": {"iterations": 1}})
+
+    with pytest.raises(FileNotFoundError, match="Checkpoint path does not exist"):
+        hydrogen_atom_eval_workflow(config)()
+
+    # The failed run must not leave an evaluation digest behind.
+    digest_file = output_dir / "evaluation_digest.npz"
+    assert not digest_file.exists()
+
+
+def test_evaluation_fails_when_source_dir_has_no_train_checkpoints(tmp_path):
+    """Evaluation raises when the source directory holds no checkpoints."""
+    output_dir = tmp_path / "eval-run"
+    empty_source = tmp_path / "empty-train-run"
+    empty_source.mkdir()
+
+    workflow_section = {
+        "save_path": str(output_dir),
+        "batch_size": 128,
+        "source_path": str(empty_source),
+    }
+    config = ConfigManager({"workflow": workflow_section, "run": {"iterations": 1}})
+
+    with pytest.raises(FileNotFoundError, match="No matching checkpoints found"):
+        hydrogen_atom_eval_workflow(config)()
+
+    # The failed run must not leave an evaluation digest behind.
+    digest_file = output_dir / "evaluation_digest.npz"
+    assert not digest_file.exists()
+
+
+def test_evaluation_accepts_checkpoint_file_source_path(tmp_path):
+    """Evaluation runs when ``source_path`` names one checkpoint file."""
+    train_dir = tmp_path / "train-run"
+    eval_dir = tmp_path / "eval-run"
+
+    # Produce checkpoints with a short training run.
+    train_settings = {
+        "workflow": {"save_path": str(train_dir), "batch_size": 128},
+        "train": {"run": {"iterations": 3}},
+    }
+    hydrogen_atom_train_workflow(ConfigManager(train_settings))()
+
+    # max() selects the last matching path in sort order.
+    newest_ckpt = max(train_dir.glob("train_ckpt_*.npz"))
+    eval_settings = {
+        "workflow": {
+            "save_path": str(eval_dir),
+            "batch_size": 128,
+            "source_path": str(newest_ckpt),
+        },
+        "run": {"iterations": 3},
+    }
+    hydrogen_atom_eval_workflow(ConfigManager(eval_settings))()
+
+    digest_file = eval_dir / "evaluation_digest.npz"
+    assert digest_file.exists()
diff --git a/tests/utils/checkpoint_test.py b/tests/utils/checkpoint_test.py
@@ -0,0 +1,181 @@
+# Copyright (c) 2026 ByteDance Ltd. and/or its affiliates
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import pytest
+from jax import numpy as jnp
+
+from jaqmc.utils.checkpoint import NumPyCheckpointManager, tree_from_npz, tree_to_npz
+
+
+def _fallback_tree():
+    """Build the zero-valued PyTree used as the fallback in these tests."""
+    numpy_leaf = np.array([[0.0, 0.0]])
+    jax_leaf = jnp.array([0, 0])
+    tree = {"scalar": 0, "np": numpy_leaf, "nested": {"jax": jax_leaf}}
+    return tree
+
+
+def _restored_tree():
+    """Build the non-zero PyTree that tests save and expect to get back."""
+    numpy_leaf = np.array([[1.5, 2.5]])
+    jax_leaf = jnp.array([3, 4])
+    tree = {"scalar": 7, "np": numpy_leaf, "nested": {"jax": jax_leaf}}
+    return tree
+
+
+def _corrupt_file(path):  # test helper: truncate the file at ``path`` in place
+    path.write_bytes(path.read_bytes()[:-10])  # rewrite without the last 10 bytes
+
+
+def test_save_and_restore_round_trip_with_prefix(tmp_path):
+    """A tree saved at step 4 is restored intact, resuming at step 5."""
+    ckpt_manager = NumPyCheckpointManager(tmp_path, prefix="train")
+    template = _fallback_tree()
+    saved_tree = _restored_tree()
+
+    ckpt_manager.save(4, saved_tree)
+    # The prefix and zero-padded step number form the file name.
+    assert (tmp_path / "train_ckpt_000004.npz").exists()
+
+    next_step, loaded = ckpt_manager.restore(template)
+
+    assert next_step == 5
+    assert loaded["scalar"] == saved_tree["scalar"]
+    np.testing.assert_array_equal(loaded["np"], saved_tree["np"])
+    # Compare the JAX leaf after converting both sides to NumPy arrays.
+    loaded_jax = np.asarray(loaded["nested"]["jax"])
+    saved_jax = np.asarray(saved_tree["nested"]["jax"])
+    np.testing.assert_array_equal(loaded_jax, saved_jax)
+
+
+def test_restore_uses_restore_path_file(tmp_path):
+    """When ``restore_path`` is a file, that file is loaded directly."""
+    saved_tree = _restored_tree()
+    NumPyCheckpointManager(tmp_path, prefix="train").save(2, saved_tree)
+
+    # The prefix does not match the file name, yet the file is still restored.
+    ckpt_file = tmp_path / "train_ckpt_000002.npz"
+    file_manager = NumPyCheckpointManager(tmp_path, ckpt_file, prefix="ignored")
+    next_step, loaded = file_manager.restore(_fallback_tree())
+
+    assert next_step == 3
+    assert loaded["scalar"] == saved_tree["scalar"]
+    np.testing.assert_array_equal(loaded["np"], saved_tree["np"])
+
+
+def test_restore_from_file_rejects_non_file_path(tmp_path):  # tmp_path: a directory
+    with pytest.raises(ValueError, match="is not a file"):  # rejects non-file paths
+        NumPyCheckpointManager.restore_from_file(tmp_path, _fallback_tree())
+
+
+def test_restore_returns_fallback_when_path_missing(tmp_path):
+    """Non-strict restore from a missing path yields step 0 and the fallback."""
+    ckpt_manager = NumPyCheckpointManager(tmp_path / "missing", prefix="train")
+    template = _fallback_tree()
+
+    next_step, loaded = ckpt_manager.restore(template)
+    assert next_step == 0
+    assert loaded is template
+
+
+def test_restore_raises_when_path_missing_in_strict_mode(tmp_path):
+    """Strict restore from a missing path raises ``FileNotFoundError``."""
+    ckpt_manager = NumPyCheckpointManager(tmp_path / "missing", prefix="train")
+    with pytest.raises(FileNotFoundError, match="Checkpoint path does not exist"):
+        ckpt_manager.restore(_fallback_tree(), strict=True)
+
+
+def test_restore_returns_fallback_when_no_matching_checkpoint_files(tmp_path):
+    """Non-strict restore ignores files with another prefix and falls back."""
+    ckpt_manager = NumPyCheckpointManager(tmp_path, prefix="train")
+    template = _fallback_tree()
+    (tmp_path / "other_ckpt_000001.npz").write_bytes(b"placeholder")
+
+    next_step, loaded = ckpt_manager.restore(template)
+    assert next_step == 0
+    assert loaded is template
+
+
+def test_restore_raises_when_no_matching_checkpoint_files_in_strict_mode(tmp_path):
+    """Strict restore from a directory without checkpoints raises."""
+    ckpt_manager = NumPyCheckpointManager(tmp_path, prefix="train")
+    with pytest.raises(FileNotFoundError, match="No matching checkpoints found"):
+        ckpt_manager.restore(_fallback_tree(), strict=True)
+
+
+def test_restore_skips_bad_latest_checkpoint(tmp_path):
+    """Restore falls back to an older checkpoint when the newest is corrupt."""
+    ckpt_manager = NumPyCheckpointManager(tmp_path, prefix="train")
+    good_tree = _restored_tree()
+    ckpt_manager.save(2, good_tree)
+    ckpt_manager.save(4, _fallback_tree())
+    # Only the newest checkpoint (step 4) is damaged; step 2 stays readable.
+    _corrupt_file(tmp_path / "train_ckpt_000004.npz")
+
+    next_step, loaded = ckpt_manager.restore(_fallback_tree())
+    assert next_step == 3
+    assert loaded["scalar"] == good_tree["scalar"]
+    np.testing.assert_array_equal(loaded["np"], good_tree["np"])
+
+
+def test_restore_skips_bad_latest_checkpoint_in_strict_mode(tmp_path):
+    """Strict restore still succeeds when an older checkpoint is readable."""
+    ckpt_manager = NumPyCheckpointManager(tmp_path, prefix="train")
+    good_tree = _restored_tree()
+    ckpt_manager.save(2, good_tree)
+    ckpt_manager.save(4, _fallback_tree())
+    # Only the newest checkpoint (step 4) is damaged; step 2 stays readable.
+    _corrupt_file(tmp_path / "train_ckpt_000004.npz")
+
+    next_step, loaded = ckpt_manager.restore(_fallback_tree(), strict=True)
+    assert next_step == 3
+    assert loaded["scalar"] == good_tree["scalar"]
+    np.testing.assert_array_equal(loaded["np"], good_tree["np"])
+
+
+def test_restore_returns_fallback_when_all_matching_checkpoints_are_bad(tmp_path):
+    """Non-strict restore returns the fallback when every checkpoint is corrupt."""
+    ckpt_manager = NumPyCheckpointManager(tmp_path, prefix="train")
+    template = _fallback_tree()
+    ckpt_manager.save(2, _restored_tree())
+    ckpt_manager.save(4, _fallback_tree())
+    # Damage both checkpoints so no candidate can be read.
+    for ckpt_name in ("train_ckpt_000002.npz", "train_ckpt_000004.npz"):
+        _corrupt_file(tmp_path / ckpt_name)
+
+    next_step, loaded = ckpt_manager.restore(template)
+    assert next_step == 0
+    assert loaded is template
+
+
+def test_restore_raises_when_all_matching_checkpoints_are_bad_in_strict_mode(
+    tmp_path,
+):
+    """Strict restore raises ``RuntimeError`` when every checkpoint is corrupt."""
+    ckpt_manager = NumPyCheckpointManager(tmp_path, prefix="train")
+    ckpt_manager.save(2, _restored_tree())
+    ckpt_manager.save(4, _fallback_tree())
+    for ckpt_name in ("train_ckpt_000002.npz", "train_ckpt_000004.npz"):
+        _corrupt_file(tmp_path / ckpt_name)
+
+    with pytest.raises(RuntimeError, match="Failed to restore any checkpoint"):
+        ckpt_manager.restore(_fallback_tree(), strict=True)
+
+
+def test_tree_to_npz_and_tree_from_npz_round_trip():
+    """Flattening a tree to npz entries and rebuilding it preserves values."""
+    original = _restored_tree()
+    template = _fallback_tree()
+
+    flat = tree_to_npz(original)
+    # The nested leaf is expected under the slash-joined key "nested/jax".
+    expected_keys = {"scalar", "np", "nested/jax"}
+    assert expected_keys <= set(flat)
+
+    rebuilt = tree_from_npz(flat, template)
+    assert rebuilt["scalar"] == original["scalar"]
+    np.testing.assert_array_equal(rebuilt["np"], original["np"])
+    rebuilt_jax = np.asarray(rebuilt["nested"]["jax"])
+    original_jax = np.asarray(original["nested"]["jax"])
+    np.testing.assert_array_equal(rebuilt_jax, original_jax)