From 9eac9bd3a0200fd65b064c537112428035f2be38 Mon Sep 17 00:00:00 2001
From: Steph Bench <stephane.benchimol@mfi.fr>
Date: Fri, 1 Dec 2023 16:35:28 +0100
Subject: [PATCH 1/2] Add build index cli

---
 cfgrib/__main__.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/cfgrib/__main__.py b/cfgrib/__main__.py
index 0404c4e6..84718543 100644
--- a/cfgrib/__main__.py
+++ b/cfgrib/__main__.py
@@ -20,6 +20,7 @@
 import json
 import os.path
 import typing as T
+from pathlib import Path
 
 import click
 
@@ -176,5 +177,35 @@ def dump(inpaths, variable, cdm, engine):
     print(ds_or_da)
 
 
+@cfgrib_cli.command("build_index")
+@click.argument("inpaths", nargs=-1, required=True)
+@click.option("--index-basedir", default=None)
+@click.option("--force", default=None)
+def build_index(inpaths, index_basedir, force):
+    # type: (T.List[str], str, bool) -> None
+    from .messages import FileStream, FileIndex
+    from .dataset import compute_index_keys
+
+    index_keys = compute_index_keys(("time", "step", "shortName"), {})
+    indexpath = "{path}.idx"
+    if index_basedir:
+        indexpath = os.path.join(index_basedir, '{path}.idx')
+
+    for fp in inpaths:
+        fp_idx = Path(indexpath.format(path=fp))
+        if force:
+            fp_idx.unlink(missing_ok=True)
+
+        print(f"{fp}: Creating index to {fp_idx}")
+        stream = FileStream(str(fp))
+        index = FileIndex.from_indexpath_or_filestream(
+            filestream=stream,
+            index_keys=index_keys,
+            indexpath=indexpath
+        )
+
+
+
+
 if __name__ == "__main__":  # pragma: no cover
     cfgrib_cli()

From d549e4c06a3421c13dc6b85c48cc199cbe73fa30 Mon Sep 17 00:00:00 2001
From: Steph Bench <stephane.benchimol@mfi.fr>
Date: Fri, 1 Dec 2023 22:58:45 +0100
Subject: [PATCH 2/2] Move index creation into dataset.get_or_create_index

---
 cfgrib/__main__.py       | 28 +++++-----------------------
 cfgrib/dataset.py        | 14 ++++++++++++++
 cfgrib/messages.py       |  9 +++++++--
 tests/test_30_dataset.py |  8 ++++++++
 4 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/cfgrib/__main__.py b/cfgrib/__main__.py
index 84718543..73b6c62f 100644
--- a/cfgrib/__main__.py
+++ b/cfgrib/__main__.py
@@ -20,7 +20,6 @@
 import json
 import os.path
 import typing as T
-from pathlib import Path
 
 import click
 
@@ -180,31 +179,14 @@ def dump(inpaths, variable, cdm, engine):
 @cfgrib_cli.command("build_index")
 @click.argument("inpaths", nargs=-1, required=True)
 @click.option("--index-basedir", default=None)
-@click.option("--force", default=None)
-def build_index(inpaths, index_basedir, force):
+@click.option("--force-index-creation", is_flag=True, help="Delete any existing index file first.")
+def build_index(inpaths, index_basedir, force_index_creation):
     # type: (T.List[str], str, bool) -> None
-    from .messages import FileStream, FileIndex
-    from .dataset import compute_index_keys
-
-    index_keys = compute_index_keys(("time", "step", "shortName"), {})
-    indexpath = "{path}.idx"
-    if index_basedir:
-        indexpath = os.path.join(index_basedir, '{path}.idx')
+    from .dataset import get_or_create_index
 
     for fp in inpaths:
-        fp_idx = Path(indexpath.format(path=fp))
-        if force:
-            fp_idx.unlink(missing_ok=True)
-
-        print(f"{fp}: Creating index to {fp_idx}")
-        stream = FileStream(str(fp))
-        index = FileIndex.from_indexpath_or_filestream(
-            filestream=stream,
-            index_keys=index_keys,
-            indexpath=indexpath
-        )
-
-
+        print(f"{fp}: Creating index")
+        get_or_create_index(str(fp), index_basedir, force_index_creation)
 
 
 if __name__ == "__main__":  # pragma: no cover
diff --git a/cfgrib/dataset.py b/cfgrib/dataset.py
index bf5eea9a..209dc459 100644
--- a/cfgrib/dataset.py
+++ b/cfgrib/dataset.py
@@ -23,6 +23,7 @@
 import logging
 import os
 import typing as T
+from pathlib import Path
 
 import attr
 import numpy as np
@@ -797,3 +798,16 @@ def open_file(
     index = open_fileindex(stream, indexpath, index_keys, filter_by_keys=filter_by_keys)
 
     return open_from_index(index, read_keys, time_dims, extra_coords, errors=errors, **kwargs)
+
+
+def get_or_create_index(fp, index_basedir=None, force_index_creation=False):
+    # type: (T.Union[str, Path], T.Union[str, Path, None], bool) -> messages.FileIndex
+    """Return the index of ``fp``, stored as ``[index_basedir/]<fp>.idx`` (rebuilt if forced)."""
+    # No base directory: index next to the data file (os.path.join(None, ...) would raise).
+    indexpath = os.path.join(index_basedir, "{path}.idx") if index_basedir else "{path}.idx"
+    return messages.FileIndex.from_indexpath_or_filestream(
+        filestream=messages.FileStream(str(fp)),
+        index_keys=compute_index_keys(),
+        indexpath=indexpath,
+        force_index_creation=force_index_creation,
+    )
diff --git a/cfgrib/messages.py b/cfgrib/messages.py
index f7d725fb..6aa365df 100644
--- a/cfgrib/messages.py
+++ b/cfgrib/messages.py
@@ -520,9 +520,10 @@ class FileIndex(FieldsetIndex):
 
     @classmethod
     def from_indexpath_or_filestream(
-        cls, filestream, index_keys, indexpath=DEFAULT_INDEXPATH, computed_keys={}, log=LOG
+        cls, filestream, index_keys, indexpath=DEFAULT_INDEXPATH, computed_keys={}, log=LOG,
+            force_index_creation=False
     ):
-        # type: (FileStream, T.Sequence[str], str, ComputedKeysType, logging.Logger) -> FileIndex
+        # type: (FileStream, T.Sequence[str], str, ComputedKeysType, logging.Logger, bool) -> FileIndex
 
         # Reading and writing the index can be explicitly suppressed by passing indexpath==''.
         if not indexpath:
@@ -530,6 +531,10 @@ def from_indexpath_or_filestream(
 
         hash = hashlib.md5(repr(index_keys).encode("utf-8")).hexdigest()
         indexpath = indexpath.format(path=filestream.path, hash=hash, short_hash=hash[:5])
+
+        if force_index_creation and os.path.exists(indexpath):
+            os.unlink(indexpath)
+
         try:
             with compat_create_exclusive(indexpath) as new_index_file:
                 self = cls.from_fieldset(filestream, index_keys, computed_keys)
diff --git a/tests/test_30_dataset.py b/tests/test_30_dataset.py
index 5523d3ee..94acb316 100644
--- a/tests/test_30_dataset.py
+++ b/tests/test_30_dataset.py
@@ -324,3 +324,11 @@ def test_missing_field_values() -> None:
     t2 = res.variables["t2m"]
     assert np.isclose(np.nanmean(t2.data[0, :, :]), 268.375)
     assert np.isclose(np.nanmean(t2.data[1, :, :]), 270.716)
+
+
+def test_get_or_create_index(tmpdir) -> None:
+    """Smoke test: an index is returned both on first use and when re-creation is forced."""
+    index_basedir = os.path.join(tmpdir, "indexes")
+    for extra_kwargs in ({}, {"force_index_creation": True}):
+        index = dataset.get_or_create_index(TEST_DATA, index_basedir, **extra_kwargs)
+        assert isinstance(index, messages.FileIndex)