Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions arctic_training/config/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,11 @@ class TrainerConfig(BaseConfig):
loss_log_interval: HumanInt = Field(default=1, ge=0)
""" Number of steps between logging loss. """

train_log_iter_interval: Literal[0, 1] = 1
""" Iters between training metric log outputs. `0` is off, only intervals of `1` currently supported. """
train_log_iter_interval: HumanInt = Field(default=1, ge=0)
""" Iters between training metric log outputs. `0` disables metrics logging. """

metrics_display_order: List[str] = []
""" Optional display order for metrics in the log line. Unlisted metrics are appended in their default order. """

# XXX: fixme: the default output dir is broken
# train_log_metrics_path: Path = Field(
Expand Down
20 changes: 6 additions & 14 deletions arctic_training/debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,27 +134,19 @@ def see_memory_usage(message, force=False, ranks=[0]):
get_accelerator().reset_peak_memory_stats()


def get_mem_metrics():

def get_mem_metrics() -> tuple[float, float, float]:
    """Return current GPU memory readings as a 3-tuple of floats.

    Returns:
        (memory_allocated_gib, max_memory_allocated_gib, nvml_mem_gib) —
        each value in GiB (2**30 bytes), rounded to 2 decimal places.
        Named GiB rather than GB since the divisor is 2**30, not 10**9.
    """
    # collect garbage first so freed tensors don't inflate the reading
    gc.collect()
    # torch.cuda.empty_cache()

    nv_mem = get_nvml_mem()

    ma_gib = round(get_accelerator().memory_allocated() / 2**30, 2)
    max_ma_gib = round(get_accelerator().max_memory_allocated() / 2**30, 2)
    nv_gib = round(nv_mem / 2**30, 2)

    # get the peak memory to report correct data, so reset the counter for the next call
    # this will lead to wrong peak reports if `see_memory_usage` is also used during the run,
    # as it resets the peak counter and there is only one counter
    get_accelerator().reset_peak_memory_stats()

    return (ma_gib, max_ma_gib, nv_gib)


# fcntl.flock can be slow on shared fs, so if things are too slow especially when many ranks are
Expand Down
Loading
Loading