From 9eac9bd3a0200fd65b064c537112428035f2be38 Mon Sep 17 00:00:00 2001 From: Steph Bench Date: Fri, 1 Dec 2023 16:35:28 +0100 Subject: [PATCH 1/2] Add build index cli --- cfgrib/__main__.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/cfgrib/__main__.py b/cfgrib/__main__.py index 0404c4e6..84718543 100644 --- a/cfgrib/__main__.py +++ b/cfgrib/__main__.py @@ -20,6 +20,7 @@ import json import os.path import typing as T +from pathlib import Path import click @@ -176,5 +177,35 @@ def dump(inpaths, variable, cdm, engine): print(ds_or_da) +@cfgrib_cli.command("build_index") +@click.argument("inpaths", nargs=-1, required=True) +@click.option("--index-basedir", default=None) +@click.option("--force", default=None) +def build_index(inpaths, index_basedir, force): + # type: (T.List[str], str, bool) -> None + from .messages import FileStream, FileIndex + from .dataset import compute_index_keys + + index_keys = compute_index_keys(("time", "step", "shortName"), {}) + indexpath = "{path}.idx" + if index_basedir: + indexpath = os.path.join(index_basedir, '{path}.idx') + + for fp in inpaths: + fp_idx = Path(indexpath.format(path=fp)) + if force: + fp_idx.unlink(missing_ok=True) + + print(f"{fp}: Creating index to {fp_idx}") + stream = FileStream(str(fp)) + index = FileIndex.from_indexpath_or_filestream( + filestream=stream, + index_keys=index_keys, + indexpath=indexpath + ) + + + + if __name__ == "__main__": # pragma: no cover cfgrib_cli() From d549e4c06a3421c13dc6b85c48cc199cbe73fa30 Mon Sep 17 00:00:00 2001 From: Steph Bench Date: Fri, 1 Dec 2023 22:58:45 +0100 Subject: [PATCH 2/2] Working on ... --- cfgrib/__main__.py | 28 +++++----------------------- cfgrib/dataset.py | 14 ++++++++++++++ cfgrib/messages.py | 9 +++++++-- tests/test_30_dataset.py | 8 ++++++++ 4 files changed, 34 insertions(+), 25 deletions(-) diff --git a/cfgrib/__main__.py b/cfgrib/__main__.py index 84718543..73b6c62f 100644 --- a/cfgrib/__main__.py +++ b/cfgrib/__main__.py @@ -20,7 +20,6 @@ import json import os.path import typing as T -from pathlib import Path import click @@ -180,31 +179,14 @@ def dump(inpaths, variable, cdm, engine): @cfgrib_cli.command("build_index") @click.argument("inpaths", nargs=-1, required=True) @click.option("--index-basedir", default=None) -@click.option("--force", default=None) -def build_index(inpaths, index_basedir, force): +@click.option("--force-index-creation", default=None) +def build_index(inpaths, index_basedir, force_index_creation): # type: (T.List[str], str, bool) -> None - from .messages import FileStream, FileIndex - from .dataset import compute_index_keys - - index_keys = compute_index_keys(("time", "step", "shortName"), {}) - indexpath = "{path}.idx" - if index_basedir: - indexpath = os.path.join(index_basedir, '{path}.idx') + from .dataset import get_or_create_index for fp in inpaths: - fp_idx = Path(indexpath.format(path=fp)) - if force: - fp_idx.unlink(missing_ok=True) - - print(f"{fp}: Creating index to {fp_idx}") - stream = FileStream(str(fp)) - index = FileIndex.from_indexpath_or_filestream( - filestream=stream, - index_keys=index_keys, - indexpath=indexpath - ) - - + print(f"{fp}: Creating index") + get_or_create_index(str(fp), index_basedir, force_index_creation) if __name__ == "__main__": # pragma: no cover diff --git a/cfgrib/dataset.py b/cfgrib/dataset.py index bf5eea9a..209dc459 100644 --- a/cfgrib/dataset.py +++ b/cfgrib/dataset.py @@ -23,6 +23,7 @@ import logging import os import typing as T +from pathlib import Path import attr import numpy as np @@ -797,3 +798,16 @@ def open_file( index = open_fileindex(stream, indexpath, index_keys, filter_by_keys=filter_by_keys) return open_from_index(index, read_keys, time_dims, extra_coords, errors=errors, **kwargs) + + +def get_or_create_index(fp: str | Path, index_basedir: str | Path, force_index_creation: bool=False) -> messages.FileIndex: + """ Create a pygrib index file """ + index_keys = compute_index_keys() + stream = messages.FileStream(str(fp)) + index = messages.FileIndex.from_indexpath_or_filestream( + filestream=stream, + index_keys=index_keys, + indexpath=str(os.path.join(index_basedir, '{path}.idx')), + force_index_creation=force_index_creation + ) + return index diff --git a/cfgrib/messages.py b/cfgrib/messages.py index f7d725fb..6aa365df 100644 --- a/cfgrib/messages.py +++ b/cfgrib/messages.py @@ -520,9 +520,10 @@ class FileIndex(FieldsetIndex): @classmethod def from_indexpath_or_filestream( - cls, filestream, index_keys, indexpath=DEFAULT_INDEXPATH, computed_keys={}, log=LOG + cls, filestream, index_keys, indexpath=DEFAULT_INDEXPATH, computed_keys={}, log=LOG, + force_index_creation=False ): - # type: (FileStream, T.Sequence[str], str, ComputedKeysType, logging.Logger) -> FileIndex + # type: (FileStream, T.Sequence[str], str, ComputedKeysType, logging.Logger, bool) -> FileIndex # Reading and writing the index can be explicitly suppressed by passing indexpath==''. if not indexpath: @@ -530,6 +531,10 @@ def from_indexpath_or_filestream( hash = hashlib.md5(repr(index_keys).encode("utf-8")).hexdigest() indexpath = indexpath.format(path=filestream.path, hash=hash, short_hash=hash[:5]) + + if force_index_creation and os.path.exists(indexpath): + os.unlink(indexpath) + try: with compat_create_exclusive(indexpath) as new_index_file: self = cls.from_fieldset(filestream, index_keys, computed_keys) diff --git a/tests/test_30_dataset.py b/tests/test_30_dataset.py index 5523d3ee..94acb316 100644 --- a/tests/test_30_dataset.py +++ b/tests/test_30_dataset.py @@ -324,3 +324,11 @@ def test_missing_field_values() -> None: t2 = res.variables["t2m"] assert np.isclose(np.nanmean(t2.data[0, :, :]), 268.375) assert np.isclose(np.nanmean(t2.data[1, :, :]), 270.716) + + +def test_get_or_create_index(tmpdir) -> None: + index = dataset.get_or_create_index(TEST_DATA, os.path.join(tmpdir, "indexes")) + assert isinstance(index, messages.FileIndex) + + index = dataset.get_or_create_index(TEST_DATA, os.path.join(tmpdir, "indexes"), force_index_creation=True) + assert isinstance(index, messages.FileIndex)