Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Continuous integration: run the CPU-only unit tests on every push to main
# and on every pull request.
name: CI

on:
  push:
    branches: [main]
  pull_request:

# Least privilege: this workflow only checks out and tests the code, so the
# token needs nothing beyond read access to the repository contents.
permissions:
  contents: read

jobs:
  tests:
    runs-on: ubuntu-latest
    strategy:
      # Let every matrix entry run to completion so a failure on one Python
      # version does not cancel the other.
      fail-fast: false
      matrix:
        python-version: ["3.10", "3.12"]
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install CPU dependencies
        run: |
          python -m pip install --upgrade pip
          # CPU-only torch keeps CI fast; the unit-tested modules need no CUDA/triton.
          # `python -m pip` guarantees the install targets the matrix interpreter.
          python -m pip install torch==2.9.1 --index-url https://download.pytorch.org/whl/cpu
          python -m pip install numpy PyYAML tqdm pytest

      - name: Run tests
        run: python -m pytest tests -v
3 changes: 3 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# A conftest.py at the repo root makes pytest (in its default "prepend" import
# mode) insert the project root into sys.path during test collection, so
# `import deepspec...` resolves without an editable install. Intentionally minimal.
16 changes: 16 additions & 0 deletions deepspec/data/jsonl_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,22 @@ def _build_all_line_starts(self):
continue

handle = open(path, "rb")
if os.fstat(handle.fileno()).st_size == 0:
# mmap cannot map a 0-byte file; treat an empty shard as 0 records.
handle.close()
starts = []
self.line_starts_per_file[idx] = starts
self.num_data_per_file.append(0)
if cache_path is not None:
self._atomic_pickle_dump(
{
"file_key": file_key,
"file_path": os.path.abspath(path),
"line_starts": starts,
},
cache_path,
)
continue
mm = mmap.mmap(handle.fileno(), 0, access=mmap.ACCESS_READ)
self.files[idx] = handle
self.mmaps[idx] = mm
Expand Down
189 changes: 189 additions & 0 deletions tests/test_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
"""Tests for deepspec.utils.config.

Characterization tests for the pure-Python config machinery: the attribute-dict
``ConfigNode``, the recursive (to|from)-config-node converters, ``jsonable``,
the ``CustomJSONEncoder``, and the ``--opts`` override parser with its error
paths. All CPU-only, no model or GPU required.
"""

from __future__ import annotations

import json
from argparse import Namespace
from pathlib import Path
from types import SimpleNamespace

import pytest
import torch

from deepspec.utils.config import (
ConfigNode,
CustomJSONEncoder,
config_to_plain_dict,
finalize_config,
jsonable,
load_config,
parse_opts_to_config,
to_config_node,
)


class TestConfigNode:
    """Characterize ``ConfigNode`` attribute-style access to its items."""

    def test_attribute_access_reads_items(self):
        # The same entry must be reachable both as an attribute and as a key.
        cfg = ConfigNode({"lr": 0.1})
        assert cfg.lr == 0.1
        assert cfg["lr"] == 0.1

    def test_attribute_assignment_sets_items(self):
        # Assigning an attribute must store the value under the matching key.
        cfg = ConfigNode()
        cfg.epochs = 5
        assert cfg["epochs"] == 5

    def test_missing_attribute_raises_attribute_error(self):
        cfg = ConfigNode({"a": 1})
        with pytest.raises(AttributeError):
            _ = cfg.does_not_exist

    def test_copy_returns_config_node(self):
        original = ConfigNode({"a": 1})
        duplicate = original.copy()
        assert isinstance(duplicate, ConfigNode)
        assert duplicate == original
        # Rebinding a top-level key on the copy must leave the source untouched.
        duplicate.a = 2
        assert original.a == 1


class TestToConfigNode:
    """Characterize the recursive conversion done by ``to_config_node``."""

    def test_nested_dict_becomes_config_node(self):
        converted = to_config_node({"outer": {"inner": 1}})
        assert isinstance(converted, ConfigNode)
        assert isinstance(converted["outer"], ConfigNode)
        assert converted.outer.inner == 1

    def test_list_is_preserved_with_converted_elements(self):
        # Key access on purpose: "items" is looked up as an entry, not as the
        # mapping method of the same name.
        converted = to_config_node({"items": [{"k": 1}, 2]})
        elements = converted["items"]
        assert isinstance(elements, list)
        assert isinstance(elements[0], ConfigNode)
        assert elements[1] == 2

    def test_tuple_is_preserved(self):
        converted = to_config_node({"pair": ({"k": 1}, 2)})
        pair = converted["pair"]
        assert isinstance(pair, tuple)
        assert isinstance(pair[0], ConfigNode)

    def test_scalars_pass_through(self):
        # Non-container values come back equal to what went in.
        assert to_config_node(7) == 7
        assert to_config_node("x") == "x"


class TestConfigToPlainDict:
    """Characterize ``config_to_plain_dict`` on converted config nodes."""

    def test_config_node_becomes_plain_dict(self):
        source = to_config_node({"a": {"b": 1}})
        result = config_to_plain_dict(source)
        # Exact-type checks on purpose: ``isinstance`` would also accept
        # subclasses of ``dict``.
        assert type(result) is dict
        assert type(result["a"]) is dict
        assert result == {"a": {"b": 1}}

    def test_tuple_becomes_list(self):
        source = to_config_node({"pair": (1, 2)})
        result = config_to_plain_dict(source)
        # A tuple never compares equal to a list, so this pins the list type.
        assert result["pair"] == [1, 2]


class TestJsonable:
    """Characterize ``jsonable`` on paths and nested containers."""

    def test_path_becomes_string(self):
        converted = jsonable(Path("/tmp/x"))
        assert converted == "/tmp/x"

    def test_nested_structures_are_converted(self):
        # Paths nested inside a ConfigNode and inside a tuple are both
        # expected as strings, with the tuple coming back as a list.
        source = ConfigNode({"p": Path("/a"), "items": (1, Path("/b"))})
        expected = {"p": "/a", "items": [1, "/b"]}
        assert jsonable(source) == expected


class TestCustomJSONEncoder:
    """Characterize how ``CustomJSONEncoder`` serializes non-JSON values."""

    def _dumps(self, obj):
        """Encode *obj* with ``CustomJSONEncoder`` and decode the result."""
        encoded = json.dumps(obj, cls=CustomJSONEncoder)
        return json.loads(encoded)

    def test_encodes_function(self):
        def my_fn():
            return None

        decoded = self._dumps({"f": my_fn})
        assert decoded == {"f": "<function my_fn>"}

    def test_encodes_type(self):
        decoded = self._dumps({"t": int})
        assert decoded == {"t": "<class 'int'>"}

    def test_encodes_torch_dtype(self):
        decoded = self._dumps({"dt": torch.float32})
        assert decoded == {"dt": "torch.float32"}

    def test_encodes_path(self):
        decoded = self._dumps({"p": Path("/a/b")})
        assert decoded == {"p": "/a/b"}

    def test_encodes_namespace(self):
        # Both namespace flavours are expected to encode as attribute dicts.
        argparse_ns = Namespace(a=1, b="x")
        simple_ns = SimpleNamespace(c=3)
        assert self._dumps(argparse_ns) == {"a": 1, "b": "x"}
        assert self._dumps(simple_ns) == {"c": 3}

    def test_encodes_config_node(self):
        node = to_config_node({"a": {"b": 1}})
        assert self._dumps(node) == {"a": {"b": 1}}


class TestParseOptsToConfig:
    """Characterize ``parse_opts_to_config`` overrides and their error paths."""

    def test_sets_nested_value(self):
        base = {"train": {"lr": 0.1}}
        result = parse_opts_to_config(["train.lr=0.5"], base)
        assert result.train.lr == 0.5

    def test_value_is_yaml_parsed(self):
        base = {"train": {"steps": 1}}
        result = parse_opts_to_config(["train.steps=10"], base)
        steps = result.train.steps
        # The override arrives as text but must come back as an int.
        assert steps == 10
        assert isinstance(steps, int)

    def test_empty_opts_returns_finalized_config_node(self):
        result = parse_opts_to_config([], {"a": {"b": 1}})
        assert isinstance(result, ConfigNode)
        assert result.a.b == 1

    def test_unknown_top_level_key_raises_key_error(self):
        overrides = ["missing=1"]
        with pytest.raises(KeyError):
            parse_opts_to_config(overrides, {"a": 1})

    def test_unknown_nested_key_raises_key_error(self):
        overrides = ["a.missing=1"]
        with pytest.raises(KeyError):
            parse_opts_to_config(overrides, {"a": {"b": 1}})

    def test_non_mapping_intermediate_raises_type_error(self):
        # "a" holds a scalar, so descending into "a.b" is expected to fail.
        overrides = ["a.b=1"]
        with pytest.raises(TypeError):
            parse_opts_to_config(overrides, {"a": 1})


class TestFinalizeConfig:
    """Characterize ``finalize_config`` with and without a ``finalize_cfg`` hook."""

    def test_returns_config_node_without_hook(self):
        finalized = finalize_config({"a": 1})
        assert isinstance(finalized, ConfigNode)
        assert finalized.a == 1

    def test_runs_finalize_cfg_hook(self):
        def finalize_cfg(cfg):
            cfg["added"] = cfg["base"] * 2
            return cfg

        # The hook is supplied under the "finalize_cfg" key of the raw config.
        raw = {"base": 3, "finalize_cfg": finalize_cfg}
        finalized = finalize_config(raw)
        assert finalized.added == 6


class TestLoadConfig:
    """Characterize which module-level names ``load_config`` exposes."""

    def test_loads_module_level_vars(self, tmp_path):
        """Load a small config module and check what is kept and what is filtered."""
        cfg_file = tmp_path / "cfg.py"
        cfg_file.write_text(
            "import os\n"  # modules must be skipped
            "lr = 0.01\n"
            "layers = [1, 2, 3]\n"
            "_private = 'kept' # leading single underscore is NOT skipped\n"
            "__dunder__ = 'skipped'\n",
            encoding="utf-8",  # do not depend on the platform default encoding
        )
        cfg = load_config(str(cfg_file))
        assert isinstance(cfg, ConfigNode)
        assert cfg.lr == 0.01
        assert cfg.layers == [1, 2, 3]
        # NOTE(review): pins the behaviour the fixture's inline comment
        # documents (single-underscore names are kept); confirm against
        # load_config if this assertion ever fails.
        assert cfg["_private"] == "kept"
        assert "os" not in cfg  # ModuleType filtered out
        assert "__dunder__" not in cfg  # dunder filtered out
122 changes: 122 additions & 0 deletions tests/test_jsonl_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""Tests for deepspec.data.jsonl_dataset.JsonLineDataset.

Exercises the mmap-based line indexing end to end on temporary .jsonl files:
length counting (with and without a trailing newline), per-record decoding,
multi-file global indexing, bounds checking, and the on-disk line-start cache.
CPU-only; ``CACHE_DIR`` is redirected to a temp dir so tests are hermetic.
"""

from __future__ import annotations

import json
import pickle

import pytest

import deepspec.data.jsonl_dataset as jsonl_mod
from deepspec.data.jsonl_dataset import JsonLineDataset


@pytest.fixture(autouse=True)
def isolated_cache(tmp_path, monkeypatch):
    """Point the module-level ``CACHE_DIR`` at a per-test temp directory.

    Applied automatically to every test in this module; yields the test's
    ``tmp_path`` as the fixture value.
    """
    cache_dir = tmp_path / "jsonl_cache"
    monkeypatch.setattr(jsonl_mod, "CACHE_DIR", str(cache_dir))
    return tmp_path


def _write_jsonl(path, records, trailing_newline=True):
body = "\n".join(json.dumps(r) for r in records)
if trailing_newline and records:
body += "\n"
path.write_text(body, encoding="utf-8")
return path


def test_len_with_trailing_newline(tmp_path):
    """Three newline-terminated records must be counted as three."""
    records = [{"i": 0}, {"i": 1}, {"i": 2}]
    path = _write_jsonl(tmp_path / "a.jsonl", records, trailing_newline=True)
    ds = JsonLineDataset([str(path)])
    try:
        assert len(ds) == 3
    finally:
        # Close the dataset even when the assertion fails.
        ds.close()


def test_len_without_trailing_newline(tmp_path):
    """A final record with no terminating newline must still be counted."""
    records = [{"i": 0}, {"i": 1}, {"i": 2}]
    path = _write_jsonl(tmp_path / "a.jsonl", records, trailing_newline=False)
    ds = JsonLineDataset([str(path)])
    try:
        assert len(ds) == 3
    finally:
        # Close the dataset even when the assertion fails.
        ds.close()


def test_getitem_roundtrips_records(tmp_path):
    """Indexing must return each record exactly as it was written."""
    records = [{"i": 0, "text": "alpha"}, {"i": 1, "text": "beta"}]
    path = _write_jsonl(tmp_path / "a.jsonl", records)
    ds = JsonLineDataset([str(path)])
    try:
        assert ds[0] == records[0]
        assert ds[1] == records[1]
    finally:
        # Close the dataset even when an assertion fails.
        ds.close()


def test_empty_file_has_zero_length(tmp_path):
    """A 0-byte shard must yield an empty dataset instead of crashing."""
    # A 0-byte .jsonl must be treated as 0 records rather than crashing on the
    # `mmap` of an empty file (regression test for the empty-shard guard).
    path = tmp_path / "empty.jsonl"
    path.write_text("", encoding="utf-8")
    ds = JsonLineDataset([str(path)])
    try:
        assert len(ds) == 0
    finally:
        # Close the dataset even when the assertion fails.
        ds.close()


def test_empty_shard_among_populated_files(tmp_path):
    """An empty shard listed before a populated one must not shift indices."""
    empty = tmp_path / "a_empty.jsonl"
    empty.write_text("", encoding="utf-8")
    populated = _write_jsonl(tmp_path / "b.jsonl", [{"i": 0}, {"i": 1}])
    ds = JsonLineDataset([str(empty), str(populated)])
    try:
        assert len(ds) == 2
        assert ds[0] == {"i": 0}
        assert ds[1] == {"i": 1}
    finally:
        # Close the dataset even when an assertion fails.
        ds.close()


def test_out_of_range_index_raises(tmp_path):
    """Both ``len(ds)`` and ``-1`` must be rejected with ``IndexError``."""
    path = _write_jsonl(tmp_path / "a.jsonl", [{"i": 0}])
    ds = JsonLineDataset([str(path)])
    try:
        with pytest.raises(IndexError):
            _ = ds[len(ds)]
        with pytest.raises(IndexError):
            _ = ds[-1]
    finally:
        # Close the dataset even when an expectation is not met.
        ds.close()


def test_multi_file_global_indexing(tmp_path):
    """Global indices must run through the files in sorted-path order."""
    # data_paths are sorted internally; name files so order is deterministic.
    f0 = _write_jsonl(tmp_path / "a.jsonl", [{"f": 0, "i": 0}, {"f": 0, "i": 1}])
    f1 = _write_jsonl(
        tmp_path / "b.jsonl",
        [{"f": 1, "i": 0}, {"f": 1, "i": 1}, {"f": 1, "i": 2}],
    )
    ds = JsonLineDataset([str(f1), str(f0)])  # pass unsorted on purpose
    try:
        assert len(ds) == 5
        # First two records come from a.jsonl, the next three from b.jsonl.
        assert ds[0] == {"f": 0, "i": 0}
        assert ds[1] == {"f": 0, "i": 1}
        assert ds[2] == {"f": 1, "i": 0}
        assert ds[4] == {"f": 1, "i": 2}
    finally:
        # Close the dataset even when an assertion fails.
        ds.close()


def test_line_start_cache_is_written_and_reused(tmp_path, monkeypatch):
    """The first load must write one index cache file; the second must still work."""
    cache_dir = tmp_path / "jsonl_cache"
    monkeypatch.setattr(jsonl_mod, "CACHE_DIR", str(cache_dir))
    path = _write_jsonl(tmp_path / "a.jsonl", [{"i": 0}, {"i": 1}, {"i": 2}])

    # Constructing the dataset is what populates the cache; nothing is read.
    ds1 = JsonLineDataset([str(path)])
    ds1.close()
    cache_files = list(cache_dir.glob("jsonlindex-*.pkl"))
    assert len(cache_files) == 1

    # The pickle was produced by this very test run, so loading it is safe.
    cached = pickle.loads(cache_files[0].read_bytes())
    record_len = len('{"i": 0}\n')  # every record serializes to the same width
    assert cached["line_starts"] == [0, record_len, 2 * record_len]

    # Second instance must reuse the cache and report the same length.
    ds2 = JsonLineDataset([str(path)])
    try:
        assert len(ds2) == 3
        assert ds2[2] == {"i": 2}
    finally:
        # Close the dataset even when an assertion fails.
        ds2.close()
Loading