deepseek-ai · ajinkyajawale14499 · Jun 28, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,31 @@
+name: CI
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+
+jobs:
+  tests:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.10", "3.12"]
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install CPU dependencies
+        run: |
+          python -m pip install --upgrade pip
+          # CPU-only torch keeps CI fast; the unit-tested modules need no CUDA/triton.
+          pip install torch==2.9.1 --index-url https://download.pytorch.org/whl/cpu
+          pip install numpy PyYAML tqdm pytest
+
+      - name: Run tests
+        run: python -m pytest tests -v
diff --git a/conftest.py b/conftest.py
@@ -0,0 +1,3 @@
+# A conftest.py at the repo root makes pytest (default "prepend" import mode)
+# insert the project root into sys.path during collection, so
+# `import deepspec...` resolves without an editable install. Intentionally minimal.
diff --git a/deepspec/data/jsonl_dataset.py b/deepspec/data/jsonl_dataset.py
@@ -105,6 +105,22 @@ def _build_all_line_starts(self):
                     continue
 
             handle = open(path, "rb")
+            if os.fstat(handle.fileno()).st_size == 0:
+                # mmap cannot map a 0-byte file; treat an empty shard as 0 records.
+                handle.close()
+                starts = []
+                self.line_starts_per_file[idx] = starts
+                self.num_data_per_file.append(0)
+                if cache_path is not None:
+                    self._atomic_pickle_dump(
+                        {
+                            "file_key": file_key,
+                            "file_path": os.path.abspath(path),
+                            "line_starts": starts,
+                        },
+                        cache_path,
+                    )
+                continue
             mm = mmap.mmap(handle.fileno(), 0, access=mmap.ACCESS_READ)
             self.files[idx] = handle
             self.mmaps[idx] = mm

diff --git a/tests/test_config.py b/tests/test_config.py
@@ -0,0 +1,189 @@
+"""Tests for deepspec.utils.config.
+
+Characterization tests for the pure-Python config machinery: the attribute-dict
+``ConfigNode``, the recursive (to|from)-config-node converters, ``jsonable``,
+the ``CustomJSONEncoder``, and the ``--opts`` override parser with its error
+paths. All CPU-only, no model or GPU required.
+"""
+
+from __future__ import annotations
+
+import json
+from argparse import Namespace
+from pathlib import Path
+from types import SimpleNamespace
+
+import pytest
+import torch
+
+from deepspec.utils.config import (
+    ConfigNode,
+    CustomJSONEncoder,
+    config_to_plain_dict,
+    finalize_config,
+    jsonable,
+    load_config,
+    parse_opts_to_config,
+    to_config_node,
+)
+
+
+class TestConfigNode:
+    def test_attribute_access_reads_items(self):
+        cfg = ConfigNode({"lr": 0.1})
+        assert cfg["lr"] == 0.1
+        assert cfg.lr == 0.1
+
+    def test_attribute_assignment_sets_items(self):
+        cfg = ConfigNode()
+        cfg.epochs = 5
+        assert cfg["epochs"] == 5
+
+    def test_missing_attribute_raises_attribute_error(self):
+        cfg = ConfigNode({"a": 1})
+        with pytest.raises(AttributeError):
+            _ = cfg.does_not_exist
+
+    def test_copy_returns_config_node(self):
+        original = ConfigNode({"a": 1})
+        duplicate = original.copy()
+        assert isinstance(duplicate, ConfigNode)
+        assert duplicate == original
+        duplicate.a = 2
+        assert original.a == 1  # rebinding a top-level key on the copy leaves the source alone
+
+
+class TestToConfigNode:
+    def test_nested_dict_becomes_config_node(self):
+        converted = to_config_node({"outer": {"inner": 1}})
+        assert isinstance(converted, ConfigNode)
+        assert isinstance(converted["outer"], ConfigNode)
+        assert converted.outer.inner == 1
+
+    def test_list_is_preserved_with_converted_elements(self):
+        items = to_config_node({"items": [{"k": 1}, 2]})["items"]
+        assert isinstance(items, list)
+        assert isinstance(items[0], ConfigNode)
+        assert items[1] == 2
+
+    def test_tuple_is_preserved(self):
+        pair = to_config_node({"pair": ({"k": 1}, 2)})["pair"]
+        assert isinstance(pair, tuple)
+        assert isinstance(pair[0], ConfigNode)
+
+    def test_scalars_pass_through(self):
+        for scalar in (7, "x"):
+            assert to_config_node(scalar) == scalar
+
+
+class TestConfigToPlainDict:
+    def test_config_node_becomes_plain_dict(self):
+        nested = to_config_node({"a": {"b": 1}})
+        result = config_to_plain_dict(nested)
+        assert type(result) is dict
+        assert type(result["a"]) is dict
+        assert result == {"a": {"b": 1}}
+
+    def test_tuple_becomes_list(self):
+        with_tuple = to_config_node({"pair": (1, 2)})
+        result = config_to_plain_dict(with_tuple)
+        assert result["pair"] == [1, 2]
+
+
+class TestJsonable:
+    def test_path_becomes_string(self):
+        assert jsonable(Path("/tmp/x")) == "/tmp/x"
+
+    def test_nested_structures_are_converted(self):
+        source = ConfigNode({"p": Path("/a"), "items": (1, Path("/b"))})
+        expected = {"p": "/a", "items": [1, "/b"]}
+        assert jsonable(source) == expected
+
+
+class TestCustomJSONEncoder:
+    def _roundtrip(self, payload):
+        return json.loads(json.dumps(payload, cls=CustomJSONEncoder))
+
+    def test_encodes_function(self):
+        def my_fn():
+            return None
+
+        assert self._roundtrip({"f": my_fn}) == {"f": "<function my_fn>"}
+
+    def test_encodes_type(self):
+        assert self._roundtrip({"t": int}) == {"t": "<class 'int'>"}
+
+    def test_encodes_torch_dtype(self):
+        assert self._roundtrip({"dt": torch.float32}) == {"dt": "torch.float32"}
+
+    def test_encodes_path(self):
+        assert self._roundtrip({"p": Path("/a/b")}) == {"p": "/a/b"}
+
+    def test_encodes_namespace(self):
+        assert self._roundtrip(SimpleNamespace(c=3)) == {"c": 3}
+        assert self._roundtrip(Namespace(a=1, b="x")) == {"a": 1, "b": "x"}
+
+    def test_encodes_config_node(self):
+        assert self._roundtrip(to_config_node({"a": {"b": 1}})) == {"a": {"b": 1}}
+
+
+class TestParseOptsToConfig:
+    def test_sets_nested_value(self):
+        base = {"train": {"lr": 0.1}}
+        result = parse_opts_to_config(["train.lr=0.5"], base)
+        assert result.train.lr == 0.5
+
+    def test_value_is_yaml_parsed(self):
+        steps = parse_opts_to_config(["train.steps=10"], {"train": {"steps": 1}}).train.steps
+        assert steps == 10
+        assert isinstance(steps, int)
+
+    def test_empty_opts_returns_finalized_config_node(self):
+        result = parse_opts_to_config([], {"a": {"b": 1}})
+        assert isinstance(result, ConfigNode)
+        assert result.a.b == 1
+
+    def test_unknown_top_level_key_raises_key_error(self):
+        with pytest.raises(KeyError):  # "missing" is not a key of the base config
+            parse_opts_to_config(["missing=1"], {"a": 1})
+
+    def test_unknown_nested_key_raises_key_error(self):
+        with pytest.raises(KeyError):  # "a" exists, "a.missing" does not
+            parse_opts_to_config(["a.missing=1"], {"a": {"b": 1}})
+
+    def test_non_mapping_intermediate_raises_type_error(self):
+        with pytest.raises(TypeError):  # "a" is a scalar, so "a.b" cannot be descended
+            parse_opts_to_config(["a.b=1"], {"a": 1})
+
+
+class TestFinalizeConfig:
+    def test_returns_config_node_without_hook(self):
+        finalized = finalize_config({"a": 1})
+        assert isinstance(finalized, ConfigNode)
+        assert finalized.a == 1
+
+    def test_runs_finalize_cfg_hook(self):
+        def finalize_cfg(cfg):
+            cfg["added"] = cfg["base"] * 2
+            return cfg
+
+        raw = {"base": 3, "finalize_cfg": finalize_cfg}
+        assert finalize_config(raw).added == 6
+
+
+class TestLoadConfig:
+    def test_loads_module_level_vars(self, tmp_path):
+        cfg_file = tmp_path / "cfg.py"
+        cfg_file.write_text(
+            "import os\n"  # modules must be skipped
+            "lr = 0.01\nlayers = [1, 2, 3]\n"
+            "_private = 'kept'  # leading single underscore is NOT skipped\n"
+            "__dunder__ = 'skipped'\n"
+        )
+        cfg = load_config(str(cfg_file))
+        assert isinstance(cfg, ConfigNode)
+        assert cfg.lr == 0.01
+        assert cfg.layers == [1, 2, 3]
+        assert cfg["_private"] == "kept"  # pins the single-underscore case the fixture sets up
+        assert "os" not in cfg  # ModuleType filtered out
+        assert "__dunder__" not in cfg  # dunder filtered out
diff --git a/tests/test_jsonl_dataset.py b/tests/test_jsonl_dataset.py
@@ -0,0 +1,122 @@
+"""Tests for deepspec.data.jsonl_dataset.JsonLineDataset.
+
+Exercises the mmap-based line indexing end to end on temporary .jsonl files:
+length counting (with and without a trailing newline), per-record decoding,
+multi-file global indexing, bounds checking, and the on-disk line-start cache.
+CPU-only; ``CACHE_DIR`` is redirected to a temp dir so tests are hermetic.
+"""
+
+from __future__ import annotations
+
+import json
+import pickle
+
+import pytest
+
+import deepspec.data.jsonl_dataset as jsonl_mod
+from deepspec.data.jsonl_dataset import JsonLineDataset
+
+
+@pytest.fixture(autouse=True)
+def isolated_cache(tmp_path, monkeypatch):
+    """Aim ``jsonl_mod.CACHE_DIR`` at a per-test temp directory; returns ``tmp_path``."""
+    monkeypatch.setattr(jsonl_mod, "CACHE_DIR", str(tmp_path.joinpath("jsonl_cache")))
+    return tmp_path
+
+
+def _write_jsonl(path, records, trailing_newline=True):
+    """Write one JSON document per line to ``path`` (UTF-8) and return ``path``."""
+    lines = [json.dumps(record) for record in records]
+    terminator = "\n" if trailing_newline and records else ""
+    path.write_text("\n".join(lines) + terminator, encoding="utf-8")
+    return path
+
+
+def test_len_with_trailing_newline(tmp_path):
+    rows = [{"i": n} for n in range(3)]
+    shard = _write_jsonl(tmp_path / "a.jsonl", rows, trailing_newline=True)
+    dataset = JsonLineDataset([str(shard)])
+    assert len(dataset) == 3
+    dataset.close()
+
+
+def test_len_without_trailing_newline(tmp_path):
+    rows = [{"i": n} for n in range(3)]
+    shard = _write_jsonl(tmp_path / "a.jsonl", rows, trailing_newline=False)
+    dataset = JsonLineDataset([str(shard)])
+    assert len(dataset) == 3
+    dataset.close()
+
+
+def test_getitem_roundtrips_records(tmp_path):
+    rows = [{"i": 0, "text": "alpha"}, {"i": 1, "text": "beta"}]
+    shard = _write_jsonl(tmp_path / "a.jsonl", rows)
+    dataset = JsonLineDataset([str(shard)])
+    assert dataset[0] == rows[0]
+    assert dataset[1] == rows[1]
+    dataset.close()
+
+
+def test_empty_file_has_zero_length(tmp_path):
+    # Regression test for the empty-shard guard: `mmap` cannot map a 0-byte
+    # file, so an empty .jsonl must load as 0 records instead of crashing.
+    shard = tmp_path / "empty.jsonl"
+    shard.write_bytes(b"")
+    dataset = JsonLineDataset([str(shard)])
+    assert len(dataset) == 0
+    dataset.close()
+
+
+def test_empty_shard_among_populated_files(tmp_path):
+    blank = tmp_path / "a_empty.jsonl"
+    blank.write_bytes(b"")
+    filled = _write_jsonl(tmp_path / "b.jsonl", [{"i": 0}, {"i": 1}])
+    dataset = JsonLineDataset([str(blank), str(filled)])
+    assert len(dataset) == 2
+    assert dataset[0] == {"i": 0}
+    assert dataset[1] == {"i": 1}
+    dataset.close()
+
+
+def test_out_of_range_index_raises(tmp_path):
+    shard = _write_jsonl(tmp_path / "a.jsonl", [{"i": 0}])
+    dataset = JsonLineDataset([str(shard)])
+    with pytest.raises(IndexError):
+        _ = dataset[-1]
+    with pytest.raises(IndexError):
+        _ = dataset[len(dataset)]
+    dataset.close()
+
+
+def test_multi_file_global_indexing(tmp_path):
+    # Paths get sorted internally, so the file names fix the global record order.
+    first = _write_jsonl(tmp_path / "a.jsonl", [{"f": 0, "i": i} for i in range(2)])
+    second = _write_jsonl(tmp_path / "b.jsonl", [{"f": 1, "i": i} for i in range(3)])
+    dataset = JsonLineDataset([str(second), str(first)])  # deliberately unsorted
+    assert len(dataset) == 5
+    # Global indices 0-1 map to a.jsonl, 2-4 to b.jsonl.
+    assert dataset[0] == {"f": 0, "i": 0}
+    assert dataset[1] == {"f": 0, "i": 1}
+    assert dataset[2] == {"f": 1, "i": 0}
+    assert dataset[4] == {"f": 1, "i": 2}
+    dataset.close()
+
+
+def test_line_start_cache_is_written_and_reused(tmp_path, monkeypatch):
+    cache_root = tmp_path / "jsonl_cache"
+    monkeypatch.setattr(jsonl_mod, "CACHE_DIR", str(cache_root))
+    shard = _write_jsonl(tmp_path / "a.jsonl", [{"i": 0}, {"i": 1}, {"i": 2}])
+
+    JsonLineDataset([str(shard)]).close()
+    index_files = list(cache_root.glob("jsonlindex-*.pkl"))
+    assert len(index_files) == 1
+
+    record_size = len('{"i": 0}\n')  # all three records serialize to the same width
+    payload = pickle.loads(index_files[0].read_bytes())
+    assert payload["line_starts"] == [0, record_size, 2 * record_size]
+
+    # With the index already on disk, a fresh instance must serve the same data.
+    reopened = JsonLineDataset([str(shard)])
+    assert len(reopened) == 3
+    assert reopened[2] == {"i": 2}
+    reopened.close()