diff --git a/environment.yml b/environment.yml index 6c4949e25..9f6bab0a8 100644 --- a/environment.yml +++ b/environment.yml @@ -58,3 +58,4 @@ dependencies: - requests-mock >=1.8 - ruff >=0.9 - werkzeug + - xvec diff --git a/pyproject.toml b/pyproject.toml index 4cb896960..510aa8be5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,6 +56,7 @@ dependencies = [ "tornado>=6.0", "urllib3>=1.26", "xarray>=2024.7", + "xvec", "zarr>=2.11,<3" ] classifiers = [ diff --git a/rtd-environment.yml b/rtd-environment.yml index f345b8083..94139829a 100644 --- a/rtd-environment.yml +++ b/rtd-environment.yml @@ -44,6 +44,7 @@ dependencies: - urllib3 >=1.26 - werkzeug <2.2 # >=2.2 slows down S3 tests (deps: moto->flask->werkzeug) - xarray >=2022.6, <= 2024.6 + - xvec - zarr >=2.11 # Required by Coiled # These are very likely transitive deps anyway diff --git a/test/core/store/fs/impl/test_vectordatacube.py b/test/core/store/fs/impl/test_vectordatacube.py new file mode 100644 index 000000000..068ef4c8a --- /dev/null +++ b/test/core/store/fs/impl/test_vectordatacube.py @@ -0,0 +1,51 @@ +# Copyright (c) 2024 by xcube team and contributors +# Permissions are hereby granted under the terms of the MIT License: +# https://opensource.org/licenses/MIT. + +import unittest + +from xcube.core.new import new_vector_data_cube + +from xcube.core.mldataset.abc import VectorDataCube +from xcube.core.store import new_data_store +from xcube.core.store.datatype import VECTOR_DATA_CUBE_TYPE +from xcube.core.store.fs.impl.vectordatacube import VectorDataCubeZarrFsDataAccessor + + +class VectorDataCubeStoreTest(unittest.TestCase): + + def test_write_to_and_read_from_store(self): + store = new_data_store("file") + vdc = new_vector_data_cube( + variables=dict( + precipitation=0.5, + soilmoisture=1.0 + ) + ) + data_id = store.write_data( + vdc, data_id="vdc_test.zarr", writer_id="vectordatacube:zarr:file" + ) + self.assertIsNotNone(data_id) + ds = store.open_data(data_id, opener_id="vectordatacube:zarr:file") + self.assertIsNotNone(ds) + self.assertIsInstance(ds, VectorDataCube) + store.delete_data(data_id) + +class VectorDataCubeZarrFsDataAccessorTest(unittest.TestCase): + + @classmethod + def setUp(self): + self.accessor = VectorDataCubeZarrFsDataAccessor() + + def test_get_data_type(self): + self.assertEqual(VECTOR_DATA_CUBE_TYPE, self.accessor.get_data_type()) + + def test_get_format_id(self) -> str: + self.assertEqual("zarr", self.accessor.get_format_id()) + + def test_write_and_open_data(self): + vdc = new_vector_data_cube() + data_id = self.accessor.write_data( + vdc, data_id="vdc_test.zarr", protocol="file", root="/test_data/" + ) + self.assertIsNotNone(data_id) diff --git a/xcube/core/mldataset/abc.py b/xcube/core/mldataset/abc.py index 58a8c97f8..efe25af03 100644 --- a/xcube/core/mldataset/abc.py +++ b/xcube/core/mldataset/abc.py @@ -9,6 +9,8 @@ from typing import Any, Callable, Dict, Tuple import xarray as xr +import xvec +from jinja2.lexer import TOKEN_DOT from xcube.core.gridmapping import GridMapping from xcube.core.tilingscheme import TilingScheme @@ -147,3 +149,28 @@ def get_level_for_resolution(self, xy_res: ScalarOrPair[float]) -> int: if x_res > given_x_res and y_res > given_y_res: return max(0, level - 1) return self.num_levels - 1 + + +class VectorDataCube(xr.Dataset): + + def __init__( + self, geometry_column: str = "geometry", crs = None, *args, **kwargs + ): + super().__init__(*args, **kwargs) + # TODO add check that dataset qualifies as vector data cube + # self.assign_coords( + # dict( + # geometry_column=self.get(geometry_column) + # ) + # ) + # {geometry_column= self[geometry_name]}) + # self.xvec.set_geom_indexes(geometry_column, crs=crs) + # self.xvec.encode_cf() + + @classmethod + def from_dataset(cls, dataset): + return cls( + data_vars=dataset.data_vars, + coords=dataset.coords, + attrs=dataset.attrs + ) diff --git a/xcube/core/new.py b/xcube/core/new.py index 355102a50..af3a04206 100644 --- a/xcube/core/new.py +++ b/xcube/core/new.py @@ -7,8 +7,12 @@ import numpy as np import pandas as pd import pyproj +import random +from shapely import Point import xarray as xr +from xcube.core.mldataset.abc import VectorDataCube + def new_cube( title="Test Cube", @@ -266,3 +270,213 @@ def my_func(time: int, y: int, x: int) -> Union[bool, int, float] data_vars[crs_name] = xr.DataArray(0, attrs=crs.to_cf()) return xr.Dataset(data_vars=data_vars, coords=coords, attrs=attrs) + + +def new_vector_data_cube( + title="Test Vector Data Cube", + geometries=None, + num_geometries=20, + geometry_name="geometry", + bbox=(-180.0, -90.0, 180.0, 90.0), + time_name="time", + time_dtype="datetime64[ns]", + time_units="seconds since 1970-01-01T00:00:00", + time_calendar="proleptic_gregorian", + time_periods=5, + time_freq="D", + time_start="2010-01-01T00:00:00", + use_cftime=False, + drop_bounds=False, + variables=None, + crs="WGS 84", + crs_name=None, + time_encoding_dtype="int64", +): + """Create a new empty vector data cube. Useful for creating vector data cube + templates with predefined coordinate variables and metadata. The function is also + used by xcube's unit tests. + + The values of the *variables* dictionary can be either constants, + array-like objects, or functions that compute their return value from + passed coordinate indexes. The expected signature is::: + + def my_func(time: int, y: int, x: int) -> Union[bool, int, float] + + Args: + title: A title. Defaults to 'Test Cube'. + width: Horizontal number of grid cells. Defaults to 360. + height: Vertical number of grid cells. Defaults to 180. + x_name: Name of the x coordinate variable. Defaults to 'lon'. + y_name: Name of the y coordinate variable. Defaults to 'lat'. + x_dtype: Data type of x coordinates. Defaults to 'float64'. + y_dtype: Data type of y coordinates. Defaults to 'float64'. + x_units: Units of the x coordinates. Defaults to 'degrees_east'. + y_units: Units of the y coordinates. Defaults to + 'degrees_north'. + x_start: Minimum x value. Defaults to -180. + y_start: Minimum y value. Defaults to -90. + x_res: Spatial resolution in x-direction. Defaults to 1.0. + y_res: Spatial resolution in y-direction. Defaults to 1.0. + inverse_y: Whether to create an inverse y axis. Defaults to + False. + time_name: Name of the time coordinate variable. Defaults to + 'time'. + time_periods: Number of time steps. Defaults to 5. + time_freq: Duration of each time step. Defaults to `1D'. + time_start: First time value. Defaults to '2010-01-01T00:00:00'. + time_dtype: Numpy data type for time coordinates. Defaults to + 'datetime64[s]'. If used, parameter 'use_cftime' must be + False. + time_units: Units for time coordinates. Defaults to 'seconds + since 1970-01-01T00:00:00'. + time_calendar: Calender for time coordinates. Defaults to + 'proleptic_gregorian'. + use_cftime: If True, the time will be given as data types + according to the 'cftime' package. If used, the + time_calendar parameter must be also be given with an + appropriate value such as 'gregorian' or 'julian'. If used, + parameter 'time_dtype' must be None. + drop_bounds: If True, coordinate bounds variables are not + created. Defaults to False. + variables: Dictionary of data variables to be added. None by + default. + crs: pyproj-compatible CRS string or instance of pyproj.CRS or + None + crs_name: Name of the variable that will hold the CRS + information. Ignored, if *crs* is not given. + time_encoding_dtype: data type used to encode the time variable + when serializing the dataset + + Returns: + A cube instance + """ + # y_dtype = y_dtype if y_dtype is not None else y_dtype + # y_res = y_res if y_res is not None else x_res + # if width < 0 or height < 0 or x_res <= 0.0 or y_res <= 0.0: + # if num_geometries < 0: + # raise ValueError() + + if isinstance(crs, str): + crs = pyproj.CRS.from_string(crs) + + if isinstance(crs, pyproj.CRS): + crs_name = crs_name or "crs" + + if geometries: + num_geometries = len(geometries) + else: + if num_geometries < 0: + raise ValueError() + geometries = [] + for _ in range(num_geometries): + x = random.uniform(bbox[0], bbox[2]) + y = random.uniform(bbox[1], bbox[3]) + geometries.append(Point(x, y)) + np_array = np.asarray(geometries, dtype=object) + + if time_periods < 0: + raise ValueError() + + if use_cftime and time_dtype is not None: + raise ValueError('If "use_cftime" is True,' ' "time_dtype" must not be set.') + + geometry_var = xr.DataArray(np_array, dims=geometry_name) + + if use_cftime: + time_data_p1 = xr.cftime_range( + start=time_start, + periods=time_periods + 1, + freq=time_freq, + calendar=time_calendar, + ).values + else: + time_data_p1 = pd.date_range( + start=time_start, periods=time_periods + 1, freq=time_freq + ).values + time_data_p1 = time_data_p1.astype(dtype=time_dtype) + + time_delta = time_data_p1[1] - time_data_p1[0] + time_data = time_data_p1[0:-1] + time_delta // 2 + time_var = xr.DataArray(time_data, dims=time_name) + time_var.encoding["units"] = time_units + time_var.encoding["calendar"] = time_calendar + time_var.encoding["dtype"] = time_encoding_dtype + + coords = {geometry_name: geometry_var, time_name: time_var} + if not drop_bounds: + time_bnds_name = f"{time_name}_bnds" + + bnds_dim = "bnds" + + time_bnds_data = np.zeros((time_periods, 2), dtype=time_data_p1.dtype) + time_bnds_data[:, 0] = time_data_p1[:-1] + time_bnds_data[:, 1] = time_data_p1[1:] + time_bnds_var = xr.DataArray(time_bnds_data, dims=(time_name, bnds_dim)) + time_bnds_var.encoding["units"] = time_units + time_bnds_var.encoding["calendar"] = time_calendar + time_bnds_var.encoding["dtype"] = time_encoding_dtype + + time_var.attrs["bounds"] = time_bnds_name + + coords.update( + { + time_bnds_name: time_bnds_var, + } + ) + + attrs = dict( + Conventions="CF-1.7", + title=title, + time_coverage_start=str(time_data_p1[0]), + time_coverage_end=str(time_data_p1[-1]), + ) + + if crs.to_epsg() == 4326: + attrs.update( + dict( + geospatial_lon_min=bbox[0], + geospatial_lon_max=bbox[2], + geospatial_lat_min=bbox[1], + geospatial_lat_max=bbox[3], + geospatial_units="degree", + + ) + ) + + data_vars = {} + if variables: + dims = (time_name, geometry_name) + shape = (time_periods, num_geometries) + size = time_periods * num_geometries + for var_name, data in variables.items(): + if isinstance(data, xr.DataArray): + data_vars[var_name] = data + elif ( + isinstance(data, int) + or isinstance(data, float) + or isinstance(data, bool) + ): + data_vars[var_name] = xr.DataArray(np.full(shape, data), dims=dims) + elif callable(data): + func = data + data = np.zeros(shape) + for index in itertools.product(*map(range, shape)): + data[index] = func(*index) + data_vars[var_name] = xr.DataArray(data, dims=dims) + elif data is None: + data_vars[var_name] = xr.DataArray( + np.random.uniform(0.0, 1.0, size).reshape(shape), dims=dims + ) + else: + data_vars[var_name] = xr.DataArray(data, dims=dims) + + if isinstance(crs, pyproj.CRS): + for v in data_vars.values(): + v.attrs["grid_mapping"] = crs_name + data_vars[crs_name] = xr.DataArray(0, attrs=crs.to_cf()) + + ds = VectorDataCube( + geometry_column=geometry_name, crs=crs, data_vars=data_vars, + coords=coords, attrs=attrs + ) + return ds diff --git a/xcube/core/store/__init__.py b/xcube/core/store/__init__.py index 97704b7db..64c491a4e 100644 --- a/xcube/core/store/__init__.py +++ b/xcube/core/store/__init__.py @@ -20,6 +20,7 @@ DATASET_TYPE, GEO_DATA_FRAME_TYPE, MULTI_LEVEL_DATASET_TYPE, + VECTOR_DATA_CUBE_TYPE, DataType, DataTypeLike, ) diff --git a/xcube/core/store/datatype.py b/xcube/core/store/datatype.py index ea8332b38..b236ce675 100644 --- a/xcube/core/store/datatype.py +++ b/xcube/core/store/datatype.py @@ -195,6 +195,12 @@ def __next__(self) -> T: ["gdfiter", "DataIterator[geopandas.GeoDataFrame]"], ) +VECTOR_DATA_CUBE_TYPE = DataType(xarray.Dataset, ["vectordatacube", "xarray.Dataset"]) + +VECTOR_DATA_CUBE_ITERATOR_TYPE = DataType( + DataIterator, # Unfortunately we cannot use DataIterator[xarray.Dataset] + ["vdciter", "DataIterator[xarray.Dataset]"], +) def register_default_data_types(): for data_type in [ @@ -205,6 +211,8 @@ def register_default_data_types(): MULTI_LEVEL_DATASET_ITERATOR_TYPE, GEO_DATA_FRAME_TYPE, GEO_DATA_FRAME_ITERATOR_TYPE, + VECTOR_DATA_CUBE_TYPE, + VECTOR_DATA_CUBE_ITERATOR_TYPE, ]: DataType.register_data_type(data_type) diff --git a/xcube/core/store/fs/impl/vectordatacube.py b/xcube/core/store/fs/impl/vectordatacube.py new file mode 100644 index 000000000..f882037b8 --- /dev/null +++ b/xcube/core/store/fs/impl/vectordatacube.py @@ -0,0 +1,58 @@ +# Copyright (c) 2024 by xcube team and contributors +# Permissions are hereby granted under the terms of the MIT License: +# https://opensource.org/licenses/MIT. + +import xarray as xr +import xvec + +from xcube.core.mldataset.abc import VectorDataCube +from xcube.util.assertions import assert_instance +from ...datatype import DataType +from ...datatype import VECTOR_DATA_CUBE_TYPE + +from .dataset import DatasetZarrFsDataAccessor +from .dataset import DatasetNetcdfFsDataAccessor + + +class VectorDataCubeZarrFsDataAccessor(DatasetZarrFsDataAccessor): + """Opener/writer extension name: 'vectordatacube:zarr:'.""" + + @classmethod + def get_data_type(cls) -> DataType: + return VECTOR_DATA_CUBE_TYPE + + def open_data(self, data_id: str, **open_params) -> xr.Dataset: + dataset = super().open_data(data_id, **open_params) + dataset = dataset.xvec.decode_cf() + dataset = VectorDataCube.from_dataset(dataset) + return dataset + + def write_data( + self, data: xr.Dataset, data_id: str, replace=False, **write_params + ) -> str: + assert_instance(data, xr.Dataset, name="data") + data_id = super().write_data( + data.xvec.encode_cf(), data_id, replace, **write_params + ) + return data_id + + +class VectorDataCubeNetcdfFsDataAccessor(DatasetNetcdfFsDataAccessor): + """Opener/writer extension name: 'vectordatacube:netcdf:'.""" + + @classmethod + def get_data_type(cls) -> DataType: + return VECTOR_DATA_CUBE_TYPE + + def open_data(self, data_id: str, **open_params) -> VectorDataCube: + dataset = super().open_data(data_id, **open_params) + return dataset.xvec.decode_cf() + + def write_data( + self, data: xr.Dataset, data_id: str, replace=False, **write_params + ) -> str: + assert_instance(data, xr.Dataset, name="data") + data_id = super().write_data( + data.xvec.encode_cf(), data_id, replace, **write_params + ) + return data_id diff --git a/xcube/core/store/fs/registry.py b/xcube/core/store/fs/registry.py index ebfe804d1..6004116a1 100644 --- a/xcube/core/store/fs/registry.py +++ b/xcube/core/store/fs/registry.py @@ -32,6 +32,10 @@ DatasetLevelsFsDataAccessor, MultiLevelDatasetLevelsFsDataAccessor, ) +from .impl.vectordatacube import ( + VectorDataCubeNetcdfFsDataAccessor, + VectorDataCubeZarrFsDataAccessor +) from .store import FsDataStore ############################################ @@ -127,6 +131,8 @@ def register_fs_data_accessor_class(fs_data_accessor_class: type[FsDataAccessor] MultiLevelDatasetLevelsFsDataAccessor, GeoDataFrameShapefileFsDataAccessor, GeoDataFrameGeoJsonFsDataAccessor, + VectorDataCubeNetcdfFsDataAccessor, + VectorDataCubeZarrFsDataAccessor ): register_fs_data_accessor_class(cls) diff --git a/xcube/core/store/fs/store.py b/xcube/core/store/fs/store.py index 843a7311d..f0f0ab63a 100644 --- a/xcube/core/store/fs/store.py +++ b/xcube/core/store/fs/store.py @@ -45,6 +45,7 @@ DATASET_TYPE, GEO_DATA_FRAME_TYPE, MULTI_LEVEL_DATASET_TYPE, + VECTOR_DATA_CUBE_TYPE, DataType, DataTypeLike, ) @@ -443,7 +444,10 @@ def _guess_writer_id(self, data, data_id: str = None): data_type = accessor_id_parts[0] format_id = accessor_id_parts[1] if isinstance(data, xr.Dataset): - data_type = DATASET_TYPE.alias + if len(data.xvec.geom_coords) > 0: + data_type = VECTOR_DATA_CUBE_TYPE.alias + else: + data_type = DATASET_TYPE.alias format_id = format_id or "zarr" elif isinstance(data, MultiLevelDataset): data_type = MULTI_LEVEL_DATASET_TYPE.alias diff --git a/xcube/plugin.py b/xcube/plugin.py index 93af1ab8c..9ac854b22 100644 --- a/xcube/plugin.py +++ b/xcube/plugin.py @@ -106,6 +106,8 @@ def _register_dataset_ios(ext_registry: extension.ExtensionRegistry): ), ("geodataframe", "shapefile", "gpd.GeoDataFrame in ESRI Shapefile format"), ("geodataframe", "geojson", "gpd.GeoDataFrame in GeoJSON format"), + ("vectordatacube", "netcdf", "xarray.Dataset in NetCDF format"), + ("vectordatacube", "zarr", "xarray.Dataset in Zarr format"), ) _FS_DATA_OPENER_ITEMS = _FS_DATA_ACCESSOR_ITEMS