Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 28 additions & 2 deletions src/jaqmc/utils/checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,9 @@ def restore_from_file[ValueT](
logger.info("Restored checkpoint %s", restore_path)
return step + 1, cast(ValueT, data)

def restore[ValueT](self, fallback: ValueT) -> tuple[int, ValueT]:
def restore[ValueT](
self, fallback: ValueT, *, strict: bool = False
) -> tuple[int, ValueT]:
"""Restore the latest checkpoint from ``restore_path`` if available.

The manager searches for the newest ``ckpt_*.npz`` file under
Expand All @@ -101,6 +103,8 @@ def restore[ValueT](self, fallback: ValueT) -> tuple[int, ValueT]:
Args:
fallback: Reference PyTree to use for structure and default values
when no checkpoint exists or all are unreadable.
strict: If ``True``, raise when no matching checkpoint can be
restored instead of returning ``fallback``.

Returns:
A tuple ``(step, restored)``:
Expand All @@ -111,8 +115,18 @@ def restore[ValueT](self, fallback: ValueT) -> tuple[int, ValueT]:

Type Parameters:
ValueT: Reference-tree type that is preserved in the restored value.

Raises:
FileNotFoundError: If ``strict`` is ``True`` and ``restore_path``
does not exist, or no matching checkpoints are found.
RuntimeError: If ``strict`` is ``True`` and every matching
checkpoint fails to restore.
"""
if not self.restore_path.exists():
if strict:
raise FileNotFoundError(
f"Checkpoint path does not exist: {self.restore_path}"
)
logger.warning("No checkpoint to restore in: %s", self.restore_path)
return 0, fallback
if self.restore_path.is_file():
Expand All @@ -121,6 +135,11 @@ def restore[ValueT](self, fallback: ValueT) -> tuple[int, ValueT]:
self.restore_path.glob(f"{self.prefix}ckpt_*.npz"), reverse=True
)
if not ckpt_files:
if strict:
raise FileNotFoundError(
"No matching checkpoints found in "
f"{self.restore_path}: {self.prefix}ckpt_*.npz"
)
if self.restore_path != self.save_path:
logger.warning(
"Directory exists but no matching "
Expand All @@ -129,11 +148,18 @@ def restore[ValueT](self, fallback: ValueT) -> tuple[int, ValueT]:
self.prefix,
)
return 0, fallback
last_error: Exception | None = None
for ckpt_path in ckpt_files:
try:
return self.restore_from_file(ckpt_path, fallback)
except (OSError, EOFError, BadZipFile):
except (OSError, EOFError, BadZipFile) as err:
last_error = err
logger.warning("Fail to restore checkpoint %s", ckpt_path)
if strict:
raise RuntimeError(
"Failed to restore any checkpoint from "
f"{self.restore_path}: {self.prefix}ckpt_*.npz"
) from last_error
return 0, fallback

def save(self, step: int, data):
Expand Down
2 changes: 1 addition & 1 deletion src/jaqmc/workflow/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def run(self) -> None:
# train_ckpt_*.npz in the directory.
prefix = "" if source_path.is_file() else "train"
restored = self.evaluation_stage.restore_checkpoint(
source_path, wrapper, prefix=prefix
source_path, wrapper, prefix=prefix, strict=True
)
state = replace(
state,
Expand Down
3 changes: 3 additions & 0 deletions src/jaqmc/workflow/stage/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,13 +215,16 @@ def restore_checkpoint(
template: Any,
*,
prefix: str = "",
strict: bool = False,
) -> Any:
"""Restore state from a checkpoint.

Args:
checkpoint_path: Path to checkpoint file or directory.
template: Template state for deserialization.
prefix: Checkpoint filename prefix to match.
strict: If ``True``, raise when no matching checkpoint can be
restored.

Returns:
Restored state.
Expand Down
5 changes: 4 additions & 1 deletion src/jaqmc/workflow/stage/sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,20 +177,23 @@ def restore_checkpoint(
template: Any,
*,
prefix: str = "",
strict: bool = False,
):
"""Restore state from a checkpoint.

Args:
checkpoint_path: Path to checkpoint file or directory.
template: Template state for deserialization.
prefix: Checkpoint filename prefix to match.
strict: If ``True``, raise when no matching checkpoint can be
restored.

Returns:
Restored state.
"""
checkpoint_path = UPath(checkpoint_path)
ckpt = NumPyCheckpointManager(checkpoint_path, checkpoint_path, prefix=prefix)
_, state = ckpt.restore(template)
_, state = ckpt.restore(template, strict=strict)
return state

def create_state( # type: ignore[override]
Expand Down
68 changes: 68 additions & 0 deletions tests/hydrogen/atom_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,71 @@ def test_evaluation_writes_per_step_stats_and_digest(tmp_path):
assert digest["total_energy"].ndim == 0
assert "energy:kinetic" in digest
assert digest["energy:kinetic"].ndim == 0


def test_evaluation_fails_when_source_path_is_missing(tmp_path):
    """Evaluation must abort when ``source_path`` does not exist.

    The workflow is pointed at a directory that was never created; the run is
    expected to raise ``FileNotFoundError`` and to leave no
    ``evaluation_digest.npz`` behind in the evaluation save path.
    """
    output_dir = tmp_path / "eval-run"
    absent_source = tmp_path / "missing-train-run"
    workflow_section = {
        "save_path": str(output_dir),
        "batch_size": 128,
        "source_path": str(absent_source),
    }
    config = ConfigManager({"workflow": workflow_section, "run": {"iterations": 1}})

    with pytest.raises(FileNotFoundError, match="Checkpoint path does not exist"):
        hydrogen_atom_eval_workflow(config)()

    # The failed run must not have produced a digest.
    digest_file = output_dir / "evaluation_digest.npz"
    assert not digest_file.exists()


def test_evaluation_fails_when_source_dir_has_no_train_checkpoints(tmp_path):
    """Evaluation must abort when the source directory holds no checkpoints.

    The source directory exists but is empty, so the run is expected to raise
    ``FileNotFoundError`` and to leave no ``evaluation_digest.npz`` behind.
    """
    empty_source = tmp_path / "empty-train-run"
    empty_source.mkdir()
    output_dir = tmp_path / "eval-run"
    workflow_section = {
        "save_path": str(output_dir),
        "batch_size": 128,
        "source_path": str(empty_source),
    }
    config = ConfigManager({"workflow": workflow_section, "run": {"iterations": 1}})

    with pytest.raises(FileNotFoundError, match="No matching checkpoints found"):
        hydrogen_atom_eval_workflow(config)()

    # The failed run must not have produced a digest.
    digest_file = output_dir / "evaluation_digest.npz"
    assert not digest_file.exists()


def test_evaluation_accepts_checkpoint_file_source_path(tmp_path):
    """Evaluation runs when ``source_path`` names a single checkpoint file.

    A 3-iteration training run is executed first. The greatest-sorting
    ``train_ckpt_*.npz`` path found in its save directory is then passed to
    the evaluation workflow, which is expected to write
    ``evaluation_digest.npz``.
    """
    train_out = tmp_path / "train-run"
    eval_out = tmp_path / "eval-run"

    training_config = ConfigManager(
        {
            "workflow": {"save_path": str(train_out), "batch_size": 128},
            "train": {"run": {"iterations": 3}},
        }
    )
    hydrogen_atom_train_workflow(training_config)()

    # Hand the evaluation a file path rather than the training directory.
    chosen_ckpt = max(train_out.glob("train_ckpt_*.npz"))
    eval_workflow_section = {
        "save_path": str(eval_out),
        "batch_size": 128,
        "source_path": str(chosen_ckpt),
    }
    evaluation_config = ConfigManager(
        {"workflow": eval_workflow_section, "run": {"iterations": 3}}
    )
    hydrogen_atom_eval_workflow(evaluation_config)()

    digest_file = eval_out / "evaluation_digest.npz"
    assert digest_file.exists()
181 changes: 181 additions & 0 deletions tests/utils/checkpoint_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
# Copyright (c) 2026 ByteDance Ltd. and/or its affiliates
# SPDX-License-Identifier: Apache-2.0

import numpy as np
import pytest
from jax import numpy as jnp

from jaqmc.utils.checkpoint import NumPyCheckpointManager, tree_from_npz, tree_to_npz


def _fallback_tree():
    """Return a fresh all-zero reference tree.

    The tree mixes a Python scalar, a NumPy array and a nested JAX array.
    """
    tree = {"scalar": 0}
    tree["np"] = np.array([[0.0, 0.0]])
    tree["nested"] = {"jax": jnp.array([0, 0])}
    return tree


def _restored_tree():
    """Return a fresh tree of non-zero values.

    It has the same keys and nesting as ``_fallback_tree`` but different leaf
    values, so a restored checkpoint can be told apart from the fallback.
    """
    tree = {"scalar": 7}
    tree["np"] = np.array([[1.5, 2.5]])
    tree["nested"] = {"jax": jnp.array([3, 4])}
    return tree


def _corrupt_file(path):
    """Truncate ``path`` in place by dropping its final 10 bytes."""
    payload = path.read_bytes()
    truncated = payload[:-10]
    path.write_bytes(truncated)


def test_save_and_restore_round_trip_with_prefix(tmp_path):
    """Saving at step 4 with prefix ``train`` restores the same leaves.

    Checks the on-disk file name, that the returned step is one past the saved
    step, and that every leaf of the restored tree equals what was saved.
    """
    ckpt_manager = NumPyCheckpointManager(tmp_path, prefix="train")
    reference = _fallback_tree()
    saved_tree = _restored_tree()

    ckpt_manager.save(4, saved_tree)

    written_file = tmp_path / "train_ckpt_000004.npz"
    assert written_file.exists()

    next_step, loaded = ckpt_manager.restore(reference)

    assert next_step == 5
    assert loaded["scalar"] == saved_tree["scalar"]
    np.testing.assert_array_equal(loaded["np"], saved_tree["np"])
    # Convert the JAX leaves to NumPy before comparing.
    loaded_jax = np.asarray(loaded["nested"]["jax"])
    saved_jax = np.asarray(saved_tree["nested"]["jax"])
    np.testing.assert_array_equal(loaded_jax, saved_jax)


def test_restore_uses_restore_path_file(tmp_path):
    """A restore path that is a file is loaded directly.

    The second manager is given the checkpoint file as its restore path and a
    prefix that matches no file on disk; the restore is still expected to
    return the saved values and step 3.
    """
    saved_tree = _restored_tree()
    writer = NumPyCheckpointManager(tmp_path, prefix="train")
    writer.save(2, saved_tree)

    saved_file = tmp_path / "train_ckpt_000002.npz"
    reader = NumPyCheckpointManager(tmp_path, saved_file, prefix="ignored")

    next_step, loaded = reader.restore(_fallback_tree())

    assert next_step == 3
    assert loaded["scalar"] == saved_tree["scalar"]
    np.testing.assert_array_equal(loaded["np"], saved_tree["np"])


def test_restore_from_file_rejects_non_file_path(tmp_path):
    """``restore_from_file`` raises ``ValueError`` when given a directory."""
    directory = tmp_path
    with pytest.raises(ValueError, match="is not a file"):
        NumPyCheckpointManager.restore_from_file(directory, _fallback_tree())


def test_restore_returns_fallback_when_path_missing(tmp_path):
    """Non-strict restore from a missing path returns step 0 and the fallback.

    The fallback must come back as the very same object, not a copy.
    """
    absent_dir = tmp_path / "missing"
    reference = _fallback_tree()
    ckpt_manager = NumPyCheckpointManager(absent_dir, prefix="train")

    next_step, loaded = ckpt_manager.restore(reference)

    assert next_step == 0
    assert loaded is reference


def test_restore_raises_when_path_missing_in_strict_mode(tmp_path):
    """Strict restore from a missing path raises ``FileNotFoundError``."""
    absent_dir = tmp_path / "missing"
    ckpt_manager = NumPyCheckpointManager(absent_dir, prefix="train")

    with pytest.raises(FileNotFoundError, match="Checkpoint path does not exist"):
        ckpt_manager.restore(_fallback_tree(), strict=True)


def test_restore_returns_fallback_when_no_matching_checkpoint_files(tmp_path):
    """Non-strict restore ignores checkpoint files that carry another prefix.

    Only an ``other_ckpt_*`` file is present, so a manager with prefix
    ``train`` is expected to return step 0 and the fallback object itself.
    """
    ckpt_manager = NumPyCheckpointManager(tmp_path, prefix="train")
    reference = _fallback_tree()
    unrelated_file = tmp_path / "other_ckpt_000001.npz"
    unrelated_file.write_bytes(b"placeholder")

    next_step, loaded = ckpt_manager.restore(reference)

    assert next_step == 0
    assert loaded is reference


def test_restore_raises_when_no_matching_checkpoint_files_in_strict_mode(tmp_path):
    """Strict restore from a directory with no checkpoints raises.

    The directory exists but contains no matching file, so
    ``FileNotFoundError`` is expected.
    """
    ckpt_manager = NumPyCheckpointManager(tmp_path, prefix="train")
    reference = _fallback_tree()

    with pytest.raises(FileNotFoundError, match="No matching checkpoints found"):
        ckpt_manager.restore(reference, strict=True)


def test_restore_skips_bad_latest_checkpoint(tmp_path):
    """Restore falls back to an older checkpoint when the newest is truncated.

    Checkpoints are written at steps 2 and 4 and the step-4 file is then
    corrupted; the restore is expected to yield the step-2 data and step 3.
    """
    ckpt_manager = NumPyCheckpointManager(tmp_path, prefix="train")
    good_tree = _restored_tree()

    ckpt_manager.save(2, good_tree)
    ckpt_manager.save(4, _fallback_tree())
    newest_file = tmp_path / "train_ckpt_000004.npz"
    _corrupt_file(newest_file)

    next_step, loaded = ckpt_manager.restore(_fallback_tree())

    assert next_step == 3
    assert loaded["scalar"] == good_tree["scalar"]
    np.testing.assert_array_equal(loaded["np"], good_tree["np"])


def test_restore_skips_bad_latest_checkpoint_in_strict_mode(tmp_path):
    """Strict restore still succeeds while one readable checkpoint remains.

    The step-4 file is corrupted but the step-2 file is intact, so no
    exception is expected and the step-2 data is returned with step 3.
    """
    ckpt_manager = NumPyCheckpointManager(tmp_path, prefix="train")
    good_tree = _restored_tree()

    ckpt_manager.save(2, good_tree)
    ckpt_manager.save(4, _fallback_tree())
    newest_file = tmp_path / "train_ckpt_000004.npz"
    _corrupt_file(newest_file)

    next_step, loaded = ckpt_manager.restore(_fallback_tree(), strict=True)

    assert next_step == 3
    assert loaded["scalar"] == good_tree["scalar"]
    np.testing.assert_array_equal(loaded["np"], good_tree["np"])


def test_restore_returns_fallback_when_all_matching_checkpoints_are_bad(tmp_path):
    """Non-strict restore returns the fallback when every checkpoint is bad.

    Both saved files are truncated, so step 0 and the fallback object itself
    are expected.
    """
    ckpt_manager = NumPyCheckpointManager(tmp_path, prefix="train")
    reference = _fallback_tree()

    ckpt_manager.save(2, _restored_tree())
    ckpt_manager.save(4, _fallback_tree())
    for file_name in ("train_ckpt_000002.npz", "train_ckpt_000004.npz"):
        _corrupt_file(tmp_path / file_name)

    next_step, loaded = ckpt_manager.restore(reference)

    assert next_step == 0
    assert loaded is reference


def test_restore_raises_when_all_matching_checkpoints_are_bad_in_strict_mode(
    tmp_path,
):
    """Strict restore raises ``RuntimeError`` when every checkpoint is bad."""
    ckpt_manager = NumPyCheckpointManager(tmp_path, prefix="train")

    ckpt_manager.save(2, _restored_tree())
    ckpt_manager.save(4, _fallback_tree())
    for file_name in ("train_ckpt_000002.npz", "train_ckpt_000004.npz"):
        _corrupt_file(tmp_path / file_name)

    reference = _fallback_tree()
    with pytest.raises(RuntimeError, match="Failed to restore any checkpoint"):
        ckpt_manager.restore(reference, strict=True)


def test_tree_to_npz_and_tree_from_npz_round_trip():
    """``tree_to_npz`` and ``tree_from_npz`` invert each other.

    Also checks that the flattened mapping contains the keys ``scalar``,
    ``np`` and ``nested/jax``.
    """
    source_tree = _restored_tree()
    reference = _fallback_tree()

    flat = tree_to_npz(source_tree)

    required_keys = {"scalar", "np", "nested/jax"}
    assert required_keys <= set(flat)

    rebuilt = tree_from_npz(flat, reference)

    assert rebuilt["scalar"] == source_tree["scalar"]
    np.testing.assert_array_equal(rebuilt["np"], source_tree["np"])
    # Convert the JAX leaves to NumPy before comparing.
    rebuilt_jax = np.asarray(rebuilt["nested"]["jax"])
    source_jax = np.asarray(source_tree["nested"]["jax"])
    np.testing.assert_array_equal(rebuilt_jax, source_jax)
Loading