diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..fddac22
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,31 @@
+name: CI
+
+on:
+  push:
+    branches: [main]
+  pull_request:  # no branch filter is given for pull requests
+
+jobs:
+  tests:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false  # keep the other matrix job running when one fails
+      matrix:
+        python-version: ["3.10", "3.12"]
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install CPU dependencies
+        run: |
+          python -m pip install --upgrade pip
+          # CPU-only torch keeps CI fast; the unit-tested modules need no CUDA/triton.
+          pip install torch==2.9.1 --index-url https://download.pytorch.org/whl/cpu
+          pip install numpy PyYAML tqdm pytest
+
+      - name: Run tests
+        run: python -m pytest tests -v
diff --git a/conftest.py b/conftest.py
new file mode 100644
index 0000000..9457674
--- /dev/null
+++ b/conftest.py
@@ -0,0 +1,3 @@
+# Presence of a conftest.py at the repo root ensures the project root is on
+# sys.path during test collection, so `import deepspec...` resolves without an
+# editable install. Intentionally minimal.
diff --git a/deepspec/data/jsonl_dataset.py b/deepspec/data/jsonl_dataset.py
index a3c2219..58e70b8 100644
--- a/deepspec/data/jsonl_dataset.py
+++ b/deepspec/data/jsonl_dataset.py
@@ -105,6 +105,22 @@ def _build_all_line_starts(self):
                 continue
 
             handle = open(path, "rb")
+            if os.fstat(handle.fileno()).st_size == 0:
+                # mmap cannot map a 0-byte file; treat an empty shard as 0 records.
+                handle.close()  # nothing will be mapped, so release the descriptor now
+                starts = []  # an empty file has no line-start offsets
+                self.line_starts_per_file[idx] = starts
+                self.num_data_per_file.append(0)
+                if cache_path is not None:
+                    self._atomic_pickle_dump(
+                        {
+                            "file_key": file_key,
+                            "file_path": os.path.abspath(path),
+                            "line_starts": starts,
+                        },
+                        cache_path,
+                    )
+                continue  # skip the mmap setup below for this file
             mm = mmap.mmap(handle.fileno(), 0, access=mmap.ACCESS_READ)
             self.files[idx] = handle
             self.mmaps[idx] = mm
diff --git a/tests/test_config.py b/tests/test_config.py
new file mode 100644
index 0000000..aa5e8ce
--- /dev/null
+++ b/tests/test_config.py
@@ -0,0 +1,189 @@
+"""Tests for deepspec.utils.config.
+
+Characterization tests for the pure-Python config machinery: the attribute-dict
+``ConfigNode``, the recursive (to|from)-config-node converters, ``jsonable``,
+the ``CustomJSONEncoder``, and the ``--opts`` override parser with its error
+paths. All CPU-only, no model or GPU required.
+"""
+
+from __future__ import annotations
+
+import json
+from argparse import Namespace
+from pathlib import Path
+from types import SimpleNamespace
+
+import pytest
+import torch
+
+from deepspec.utils.config import (
+    ConfigNode,
+    CustomJSONEncoder,
+    config_to_plain_dict,
+    finalize_config,
+    jsonable,
+    load_config,
+    parse_opts_to_config,
+    to_config_node,
+)
+
+
+class TestConfigNode:
+    def test_attribute_access_reads_items(self):
+        node = ConfigNode({"lr": 0.1})
+        assert node.lr == 0.1
+        assert node["lr"] == 0.1
+
+    def test_attribute_assignment_sets_items(self):
+        node = ConfigNode()
+        node.epochs = 5
+        assert node["epochs"] == 5
+
+    def test_missing_attribute_raises_attribute_error(self):
+        node = ConfigNode({"a": 1})
+        with pytest.raises(AttributeError):
+            _ = node.does_not_exist
+
+    def test_copy_returns_config_node(self):
+        node = ConfigNode({"a": 1})
+        clone = node.copy()
+        assert isinstance(clone, ConfigNode)
+        assert clone == node
+        clone.a = 2
+        assert node.a == 1  # copy is shallow but independent at the top level
+
+
+class TestToConfigNode:
+    def test_nested_dict_becomes_config_node(self):
+        out = to_config_node({"outer": {"inner": 1}})
+        assert isinstance(out, ConfigNode)
+        assert isinstance(out["outer"], ConfigNode)
+        assert out.outer.inner == 1
+
+    def test_list_is_preserved_with_converted_elements(self):
+        out = to_config_node({"items": [{"k": 1}, 2]})
+        assert isinstance(out["items"], list)
+        assert isinstance(out["items"][0], ConfigNode)
+        assert out["items"][1] == 2
+
+    def test_tuple_is_preserved(self):
+        out = to_config_node({"pair": ({"k": 1}, 2)})
+        assert isinstance(out["pair"], tuple)
+        assert isinstance(out["pair"][0], ConfigNode)
+
+    def test_scalars_pass_through(self):
+        assert to_config_node(7) == 7
+        assert to_config_node("x") == "x"
+
+
+class TestConfigToPlainDict:
+    def test_config_node_becomes_plain_dict(self):
+        node = to_config_node({"a": {"b": 1}})
+        plain = config_to_plain_dict(node)
+        assert type(plain) is dict  # exact-type check, stricter than isinstance
+        assert type(plain["a"]) is dict
+        assert plain == {"a": {"b": 1}}
+
+    def test_tuple_becomes_list(self):
+        node = to_config_node({"pair": (1, 2)})
+        plain = config_to_plain_dict(node)
+        assert plain["pair"] == [1, 2]
+
+
+class TestJsonable:
+    def test_path_becomes_string(self):
+        assert jsonable(Path("/tmp/x")) == "/tmp/x"
+
+    def test_nested_structures_are_converted(self):
+        node = ConfigNode({"p": Path("/a"), "items": (1, Path("/b"))})
+        out = jsonable(node)
+        assert out == {"p": "/a", "items": [1, "/b"]}
+
+
+class TestCustomJSONEncoder:
+    def _dumps(self, obj):  # encode with CustomJSONEncoder, then parse the JSON back
+        return json.loads(json.dumps(obj, cls=CustomJSONEncoder))
+
+    def test_encodes_function(self):
+        def my_fn():
+            return None
+
+        assert self._dumps({"f": my_fn}) == {"f": ""}  # NOTE(review): empty expected string - confirm encoder output
+
+    def test_encodes_type(self):
+        assert self._dumps({"t": int}) == {"t": ""}  # NOTE(review): empty expected string - confirm encoder output
+
+    def test_encodes_torch_dtype(self):
+        assert self._dumps({"dt": torch.float32}) == {"dt": "torch.float32"}
+
+    def test_encodes_path(self):
+        assert self._dumps({"p": Path("/a/b")}) == {"p": "/a/b"}
+
+    def test_encodes_namespace(self):
+        assert self._dumps(Namespace(a=1, b="x")) == {"a": 1, "b": "x"}
+        assert self._dumps(SimpleNamespace(c=3)) == {"c": 3}
+
+    def test_encodes_config_node(self):
+        assert self._dumps(to_config_node({"a": {"b": 1}})) == {"a": {"b": 1}}
+
+
+class TestParseOptsToConfig:
+    def test_sets_nested_value(self):
+        cfg = {"train": {"lr": 0.1}}
+        out = parse_opts_to_config(["train.lr=0.5"], cfg)
+        assert out.train.lr == 0.5
+
+    def test_value_is_yaml_parsed(self):
+        out = parse_opts_to_config(["train.steps=10"], {"train": {"steps": 1}})
+        assert out.train.steps == 10
+        assert isinstance(out.train.steps, int)  # "10" must arrive as an int, not a str
+
+    def test_empty_opts_returns_finalized_config_node(self):
+        out = parse_opts_to_config([], {"a": {"b": 1}})
+        assert isinstance(out, ConfigNode)
+        assert out.a.b == 1
+
+    def test_unknown_top_level_key_raises_key_error(self):
+        with pytest.raises(KeyError):
+            parse_opts_to_config(["missing=1"], {"a": 1})
+
+    def test_unknown_nested_key_raises_key_error(self):
+        with pytest.raises(KeyError):
+            parse_opts_to_config(["a.missing=1"], {"a": {"b": 1}})
+
+    def test_non_mapping_intermediate_raises_type_error(self):
+        with pytest.raises(TypeError):
+            parse_opts_to_config(["a.b=1"], {"a": 1})  # "a" is a scalar, so "a.b" cannot descend
+
+
+class TestFinalizeConfig:
+    def test_returns_config_node_without_hook(self):
+        out = finalize_config({"a": 1})
+        assert isinstance(out, ConfigNode)
+        assert out.a == 1
+
+    def test_runs_finalize_cfg_hook(self):
+        def finalize_cfg(cfg):
+            cfg["added"] = cfg["base"] * 2
+            return cfg
+
+        out = finalize_config({"base": 3, "finalize_cfg": finalize_cfg})
+        assert out.added == 6  # 3 * 2, written by the hook above
+
+
+class TestLoadConfig:
+    def test_loads_module_level_vars(self, tmp_path):
+        cfg_file = tmp_path / "cfg.py"
+        cfg_file.write_text(
+            "import os\n"  # modules must be skipped
+            "lr = 0.01\n"
+            "layers = [1, 2, 3]\n"
+            "_private = 'kept'  # leading single underscore is NOT skipped\n"
+            "__dunder__ = 'skipped'\n"
+        )
+        cfg = load_config(str(cfg_file))  # NOTE(review): `_private` is written above but never asserted below
+        assert isinstance(cfg, ConfigNode)
+        assert cfg.lr == 0.01
+        assert cfg.layers == [1, 2, 3]
+        assert "os" not in cfg  # ModuleType filtered out
+        assert "__dunder__" not in cfg  # dunder filtered out
diff --git a/tests/test_jsonl_dataset.py b/tests/test_jsonl_dataset.py
new file mode 100644
index 0000000..2772b8f
--- /dev/null
+++ b/tests/test_jsonl_dataset.py
@@ -0,0 +1,122 @@
+"""Tests for deepspec.data.jsonl_dataset.JsonLineDataset.
+
+Exercises the mmap-based line indexing end to end on temporary .jsonl files:
+length counting (with and without a trailing newline), per-record decoding,
+multi-file global indexing, bounds checking, and the on-disk line-start cache.
+CPU-only; ``CACHE_DIR`` is redirected to a temp dir so tests are hermetic.
+"""
+
+from __future__ import annotations
+
+import json
+import pickle
+
+import pytest
+
+import deepspec.data.jsonl_dataset as jsonl_mod
+from deepspec.data.jsonl_dataset import JsonLineDataset
+
+
+@pytest.fixture(autouse=True)
+def isolated_cache(tmp_path, monkeypatch):
+    """Redirect the module-level CACHE_DIR to a temp dir for each test."""
+    monkeypatch.setattr(jsonl_mod, "CACHE_DIR", str(tmp_path / "jsonl_cache"))
+    return tmp_path
+
+
+def _write_jsonl(path, records, trailing_newline=True):
+    body = "\n".join(json.dumps(r) for r in records)
+    if trailing_newline and records:  # an empty record list stays a 0-byte body
+        body += "\n"
+    path.write_text(body, encoding="utf-8")
+    return path
+
+
+def test_len_with_trailing_newline(tmp_path):
+    records = [{"i": 0}, {"i": 1}, {"i": 2}]
+    path = _write_jsonl(tmp_path / "a.jsonl", records, trailing_newline=True)
+    ds = JsonLineDataset([str(path)])
+    assert len(ds) == 3
+    ds.close()
+
+
+def test_len_without_trailing_newline(tmp_path):
+    records = [{"i": 0}, {"i": 1}, {"i": 2}]
+    path = _write_jsonl(tmp_path / "a.jsonl", records, trailing_newline=False)
+    ds = JsonLineDataset([str(path)])
+    assert len(ds) == 3  # the unterminated last line still counts as a record
+    ds.close()
+
+
+def test_getitem_roundtrips_records(tmp_path):
+    records = [{"i": 0, "text": "alpha"}, {"i": 1, "text": "beta"}]
+    path = _write_jsonl(tmp_path / "a.jsonl", records)
+    ds = JsonLineDataset([str(path)])
+    assert ds[0] == records[0]
+    assert ds[1] == records[1]
+    ds.close()
+
+
+def test_empty_file_has_zero_length(tmp_path):
+    # A 0-byte .jsonl must be treated as 0 records rather than crashing on the
+    # `mmap` of an empty file (regression test for the empty-shard guard).
+    path = tmp_path / "empty.jsonl"
+    path.write_text("", encoding="utf-8")
+    ds = JsonLineDataset([str(path)])
+    assert len(ds) == 0
+    ds.close()
+
+
+def test_empty_shard_among_populated_files(tmp_path):
+    empty = tmp_path / "a_empty.jsonl"
+    empty.write_text("", encoding="utf-8")
+    populated = _write_jsonl(tmp_path / "b.jsonl", [{"i": 0}, {"i": 1}])
+    ds = JsonLineDataset([str(empty), str(populated)])
+    assert len(ds) == 2  # the empty shard contributes no records
+    assert ds[0] == {"i": 0}
+    assert ds[1] == {"i": 1}
+    ds.close()
+
+
+def test_out_of_range_index_raises(tmp_path):
+    path = _write_jsonl(tmp_path / "a.jsonl", [{"i": 0}])
+    ds = JsonLineDataset([str(path)])
+    with pytest.raises(IndexError):
+        _ = ds[len(ds)]  # one past the last valid index
+    with pytest.raises(IndexError):
+        _ = ds[-1]  # negative indices are expected to raise, not wrap around
+    ds.close()
+
+
+def test_multi_file_global_indexing(tmp_path):
+    # data_paths are sorted internally; name files so order is deterministic.
+    f0 = _write_jsonl(tmp_path / "a.jsonl", [{"f": 0, "i": 0}, {"f": 0, "i": 1}])
+    f1 = _write_jsonl(tmp_path / "b.jsonl", [{"f": 1, "i": 0}, {"f": 1, "i": 1}, {"f": 1, "i": 2}])
+    ds = JsonLineDataset([str(f1), str(f0)])  # pass unsorted on purpose
+    assert len(ds) == 5
+    # First two records come from a.jsonl, the next three from b.jsonl.
+    assert ds[0] == {"f": 0, "i": 0}
+    assert ds[1] == {"f": 0, "i": 1}
+    assert ds[2] == {"f": 1, "i": 0}
+    assert ds[4] == {"f": 1, "i": 2}
+    ds.close()
+
+
+def test_line_start_cache_is_written_and_reused(tmp_path, monkeypatch):
+    cache_dir = tmp_path / "jsonl_cache"
+    monkeypatch.setattr(jsonl_mod, "CACHE_DIR", str(cache_dir))  # same path the autouse fixture already sets
+    path = _write_jsonl(tmp_path / "a.jsonl", [{"i": 0}, {"i": 1}, {"i": 2}])
+
+    ds1 = JsonLineDataset([str(path)])
+    ds1.close()
+    cache_files = list(cache_dir.glob("jsonlindex-*.pkl"))
+    assert len(cache_files) == 1
+
+    cached = pickle.loads(cache_files[0].read_bytes())  # file was written by ds1 in this test
+    assert cached["line_starts"] == [0, len('{"i": 0}\n'), 2 * len('{"i": 0}\n')]
+
+    # Second instance must reuse the cache and report the same length.
+    ds2 = JsonLineDataset([str(path)])
+    assert len(ds2) == 3
+    assert ds2[2] == {"i": 2}
+    ds2.close()
diff --git a/tests/test_sampling.py b/tests/test_sampling.py
new file mode 100644
index 0000000..dedd04f
--- /dev/null
+++ b/tests/test_sampling.py
@@ -0,0 +1,73 @@
+"""Tests for deepspec.utils.sampling.
+
+Covers the deterministic surfaces of the sampling helpers: greedy/temperature-0
+paths, the softmax path, probability gathering, and speculative-decoding residual
+sampling (including the equal-distribution fallback). Sampling calls use one-hot
+distributions so ``torch.multinomial`` is deterministic. CPU-only.
+"""
+
+from __future__ import annotations
+
+import torch
+
+from deepspec.utils.sampling import (
+    gather_token_probs,
+    logits_to_probs,
+    sample_from_probs,
+    sample_residual,
+    sample_tokens,
+)
+
+
+def test_logits_to_probs_greedy_is_one_hot_at_argmax():
+    logits = torch.tensor([[[1.0, 3.0, 2.0]]])
+    probs = logits_to_probs(logits, temperature=0.0)
+    assert torch.equal(probs, torch.tensor([[[0.0, 1.0, 0.0]]]))  # index 1 holds the largest logit
+    assert torch.allclose(probs.sum(-1), torch.ones(1, 1))
+
+
+def test_logits_to_probs_temperature_one_matches_softmax():
+    logits = torch.tensor([[[1.0, 2.0, 3.0]]])
+    probs = logits_to_probs(logits, temperature=1.0)
+    expected = torch.softmax(logits.float(), dim=-1)
+    assert torch.allclose(probs, expected)
+    assert torch.allclose(probs.sum(-1), torch.ones(1, 1))
+
+
+def test_sample_tokens_greedy_returns_argmax():
+    logits = torch.tensor([[[0.1, 0.9, 0.0], [5.0, 1.0, 2.0]]])
+    out = sample_tokens(logits, temperature=0.0)
+    assert torch.equal(out, logits.argmax(dim=-1))
+    assert out.shape == (1, 2)  # the last dimension of the (1, 2, 3) logits is reduced away
+
+
+def test_gather_token_probs_selects_indexed_values():
+    probs = torch.tensor([[[0.2, 0.3, 0.5], [0.6, 0.1, 0.3]]])
+    token_ids = torch.tensor([[2, 0]])
+    out = gather_token_probs(probs, token_ids)
+    assert torch.allclose(out, torch.tensor([[0.5, 0.6]]))  # probs[0, 0, 2] and probs[0, 1, 0]
+
+
+def test_sample_from_probs_is_deterministic_for_one_hot():
+    torch.manual_seed(0)
+    probs = torch.tensor([[[0.0, 1.0, 0.0], [1.0, 0.0, 0.0]]])
+    out = sample_from_probs(probs)
+    assert torch.equal(out, torch.tensor([[1, 0]]))
+
+
+def test_sample_residual_prefers_unmatched_target_mass():
+    torch.manual_seed(0)
+    target = torch.tensor([[0.0, 1.0, 0.0]])
+    draft = torch.tensor([[0.0, 0.0, 1.0]])
+    # residual = clamp(target - draft, 0) = [0, 1, 0] -> token 1
+    out = sample_residual(target, draft)
+    assert torch.equal(out, torch.tensor([1]))
+
+
+def test_sample_residual_falls_back_when_distributions_match():
+    torch.manual_seed(0)
+    target = torch.tensor([[0.0, 1.0, 0.0]])
+    draft = torch.tensor([[0.0, 1.0, 0.0]])
+    # residual mass is ~0 -> fall back to target_probs -> token 1
+    out = sample_residual(target, draft)
+    assert torch.equal(out, torch.tensor([1]))