Skip to content
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,4 @@ dependencies:
- requests-mock >=1.8
- ruff >=0.9
- werkzeug
- xvec
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ dependencies = [
"tornado>=6.0",
"urllib3>=1.26",
"xarray>=2024.7",
"xvec",
"zarr>=2.11,<3"
]
classifiers = [
Expand Down
1 change: 1 addition & 0 deletions rtd-environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ dependencies:
- urllib3 >=1.26
- werkzeug <2.2 # >=2.2 slows down S3 tests (deps: moto->flask->werkzeug)
- xarray >=2022.6, <= 2024.6
- xvec
- zarr >=2.11
# Required by Coiled
# These are very likely transitive deps anyway
Expand Down
51 changes: 51 additions & 0 deletions test/core/store/fs/impl/test_vectordatacube.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Copyright (c) 2024 by xcube team and contributors
# Permissions are hereby granted under the terms of the MIT License:
# https://opensource.org/licenses/MIT.

import unittest

from xcube.core.new import new_vector_data_cube

from xcube.core.mldataset.abc import VectorDataCube
from xcube.core.store import new_data_store
from xcube.core.store.datatype import VECTOR_DATA_CUBE_TYPE
from xcube.core.store.fs.impl.vectordatacube import VectorDataCubeZarrFsDataAccessor


class VectorDataCubeStoreTest(unittest.TestCase):

def test_write_to_and_read_from_store(self):
store = new_data_store("file")
vdc = new_vector_data_cube(
variables=dict(
precipitation=0.5,
soilmoisture=1.0
)
)
data_id = store.write_data(
vdc, data_id="vdc_test.zarr", writer_id="vectordatacube:zarr:file"
)
self.assertIsNotNone(data_id)
ds = store.open_data(data_id, opener_id="vectordatacube:zarr:file")
self.assertIsNotNone(ds)
self.assertIsInstance(ds, VectorDataCube)
store.delete_data(data_id)

class VectorDataCubeZarrFsDataAccessorTest(unittest.TestCase):

@classmethod
def setUp(self):
self.accessor = VectorDataCubeZarrFsDataAccessor()

def test_get_data_type(self):
self.assertEqual(VECTOR_DATA_CUBE_TYPE, self.accessor.get_data_type())

def test_get_format_id(self) -> str:
self.assertEqual("zarr", self.accessor.get_format_id())

def test_write_and_open_data(self):
vdc = new_vector_data_cube()
data_id = self.accessor.write_data(
vdc, data_id="vdc_test.zarr", protocol="file", root="/test_data/"
)
self.assertIsNotNone(data_id)
27 changes: 27 additions & 0 deletions xcube/core/mldataset/abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from typing import Any, Callable, Dict, Tuple

import xarray as xr
import xvec
from jinja2.lexer import TOKEN_DOT

from xcube.core.gridmapping import GridMapping
from xcube.core.tilingscheme import TilingScheme
Expand Down Expand Up @@ -147,3 +149,28 @@ def get_level_for_resolution(self, xy_res: ScalarOrPair[float]) -> int:
if x_res > given_x_res and y_res > given_y_res:
return max(0, level - 1)
return self.num_levels - 1


class VectorDataCube(xr.Dataset):

def __init__(
self, geometry_column: str = "geometry", crs = None, *args, **kwargs
):
super().__init__(*args, **kwargs)
# TODO add check that dataset qualifies as vector data cube
# self.assign_coords(
# dict(
# geometry_column=self.get(geometry_column)
# )
# )
# {geometry_column= self[geometry_name]})
# self.xvec.set_geom_indexes(geometry_column, crs=crs)
# self.xvec.encode_cf()

@classmethod
def from_dataset(cls, dataset):
return cls(
data_vars=dataset.data_vars,
coords=dataset.coords,
attrs=dataset.attrs
)
214 changes: 214 additions & 0 deletions xcube/core/new.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,12 @@
import numpy as np
import pandas as pd
import pyproj
import random
from shapely import Point
import xarray as xr

from xcube.core.mldataset.abc import VectorDataCube


def new_cube(
title="Test Cube",
Expand Down Expand Up @@ -266,3 +270,213 @@ def my_func(time: int, y: int, x: int) -> Union[bool, int, float]
data_vars[crs_name] = xr.DataArray(0, attrs=crs.to_cf())

return xr.Dataset(data_vars=data_vars, coords=coords, attrs=attrs)


def new_vector_data_cube(
title="Test Vector Data Cube",
geometries=None,
num_geometries=20,
geometry_name="geometry",
bbox=(-180.0, -90.0, 180.0, 90.0),
time_name="time",
time_dtype="datetime64[ns]",
time_units="seconds since 1970-01-01T00:00:00",
time_calendar="proleptic_gregorian",
time_periods=5,
time_freq="D",
time_start="2010-01-01T00:00:00",
use_cftime=False,
drop_bounds=False,
variables=None,
crs="WGS 84",
crs_name=None,
time_encoding_dtype="int64",
):
"""Create a new empty vector data cube. Useful for creating vector data cube
templates with predefined coordinate variables and metadata. The function is also
used by xcube's unit tests.

The values of the *variables* dictionary can be either constants,
array-like objects, or functions that compute their return value from
passed coordinate indexes. The expected signature is:::

def my_func(time: int, y: int, x: int) -> Union[bool, int, float]

Args:
title: A title. Defaults to 'Test Cube'.
width: Horizontal number of grid cells. Defaults to 360.
height: Vertical number of grid cells. Defaults to 180.
x_name: Name of the x coordinate variable. Defaults to 'lon'.
y_name: Name of the y coordinate variable. Defaults to 'lat'.
x_dtype: Data type of x coordinates. Defaults to 'float64'.
y_dtype: Data type of y coordinates. Defaults to 'float64'.
x_units: Units of the x coordinates. Defaults to 'degrees_east'.
y_units: Units of the y coordinates. Defaults to
'degrees_north'.
x_start: Minimum x value. Defaults to -180.
y_start: Minimum y value. Defaults to -90.
x_res: Spatial resolution in x-direction. Defaults to 1.0.
y_res: Spatial resolution in y-direction. Defaults to 1.0.
inverse_y: Whether to create an inverse y axis. Defaults to
False.
time_name: Name of the time coordinate variable. Defaults to
'time'.
time_periods: Number of time steps. Defaults to 5.
time_freq: Duration of each time step. Defaults to `1D'.
time_start: First time value. Defaults to '2010-01-01T00:00:00'.
time_dtype: Numpy data type for time coordinates. Defaults to
'datetime64[s]'. If used, parameter 'use_cftime' must be
False.
time_units: Units for time coordinates. Defaults to 'seconds
since 1970-01-01T00:00:00'.
time_calendar: Calender for time coordinates. Defaults to
'proleptic_gregorian'.
use_cftime: If True, the time will be given as data types
according to the 'cftime' package. If used, the
time_calendar parameter must be also be given with an
appropriate value such as 'gregorian' or 'julian'. If used,
parameter 'time_dtype' must be None.
drop_bounds: If True, coordinate bounds variables are not
created. Defaults to False.
variables: Dictionary of data variables to be added. None by
default.
crs: pyproj-compatible CRS string or instance of pyproj.CRS or
None
crs_name: Name of the variable that will hold the CRS
information. Ignored, if *crs* is not given.
time_encoding_dtype: data type used to encode the time variable
when serializing the dataset

Returns:
A cube instance
"""
# y_dtype = y_dtype if y_dtype is not None else y_dtype
# y_res = y_res if y_res is not None else x_res
# if width < 0 or height < 0 or x_res <= 0.0 or y_res <= 0.0:
# if num_geometries < 0:
# raise ValueError()

if isinstance(crs, str):
crs = pyproj.CRS.from_string(crs)

if isinstance(crs, pyproj.CRS):
crs_name = crs_name or "crs"

if geometries:
num_geometries = len(geometries)
else:
if num_geometries < 0:
raise ValueError()
geometries = []
for _ in range(num_geometries):
x = random.uniform(bbox[0], bbox[2])
y = random.uniform(bbox[1], bbox[3])
geometries.append(Point(x, y))
np_array = np.asarray(geometries, dtype=object)

if time_periods < 0:
raise ValueError()

if use_cftime and time_dtype is not None:
raise ValueError('If "use_cftime" is True,' ' "time_dtype" must not be set.')

geometry_var = xr.DataArray(np_array, dims=geometry_name)

if use_cftime:
time_data_p1 = xr.cftime_range(
start=time_start,
periods=time_periods + 1,
freq=time_freq,
calendar=time_calendar,
).values
else:
time_data_p1 = pd.date_range(
start=time_start, periods=time_periods + 1, freq=time_freq
).values
time_data_p1 = time_data_p1.astype(dtype=time_dtype)

time_delta = time_data_p1[1] - time_data_p1[0]
time_data = time_data_p1[0:-1] + time_delta // 2
time_var = xr.DataArray(time_data, dims=time_name)
time_var.encoding["units"] = time_units
time_var.encoding["calendar"] = time_calendar
time_var.encoding["dtype"] = time_encoding_dtype

coords = {geometry_name: geometry_var, time_name: time_var}
if not drop_bounds:
time_bnds_name = f"{time_name}_bnds"

bnds_dim = "bnds"

time_bnds_data = np.zeros((time_periods, 2), dtype=time_data_p1.dtype)
time_bnds_data[:, 0] = time_data_p1[:-1]
time_bnds_data[:, 1] = time_data_p1[1:]
time_bnds_var = xr.DataArray(time_bnds_data, dims=(time_name, bnds_dim))
time_bnds_var.encoding["units"] = time_units
time_bnds_var.encoding["calendar"] = time_calendar
time_bnds_var.encoding["dtype"] = time_encoding_dtype

time_var.attrs["bounds"] = time_bnds_name

coords.update(
{
time_bnds_name: time_bnds_var,
}
)

attrs = dict(
Conventions="CF-1.7",
title=title,
time_coverage_start=str(time_data_p1[0]),
time_coverage_end=str(time_data_p1[-1]),
)

if crs.to_epsg() == 4326:
attrs.update(
dict(
geospatial_lon_min=bbox[0],
geospatial_lon_max=bbox[2],
geospatial_lat_min=bbox[1],
geospatial_lat_max=bbox[3],
geospatial_units="degree",

)
)

data_vars = {}
if variables:
dims = (time_name, geometry_name)
shape = (time_periods, num_geometries)
size = time_periods * num_geometries
for var_name, data in variables.items():
if isinstance(data, xr.DataArray):
data_vars[var_name] = data
elif (
isinstance(data, int)
or isinstance(data, float)
or isinstance(data, bool)
):
data_vars[var_name] = xr.DataArray(np.full(shape, data), dims=dims)
elif callable(data):
func = data
data = np.zeros(shape)
for index in itertools.product(*map(range, shape)):
data[index] = func(*index)
data_vars[var_name] = xr.DataArray(data, dims=dims)
elif data is None:
data_vars[var_name] = xr.DataArray(
np.random.uniform(0.0, 1.0, size).reshape(shape), dims=dims
)
else:
data_vars[var_name] = xr.DataArray(data, dims=dims)

if isinstance(crs, pyproj.CRS):
for v in data_vars.values():
v.attrs["grid_mapping"] = crs_name
data_vars[crs_name] = xr.DataArray(0, attrs=crs.to_cf())

ds = VectorDataCube(
geometry_column=geometry_name, crs=crs, data_vars=data_vars,
coords=coords, attrs=attrs
)
return ds
1 change: 1 addition & 0 deletions xcube/core/store/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
DATASET_TYPE,
GEO_DATA_FRAME_TYPE,
MULTI_LEVEL_DATASET_TYPE,
VECTOR_DATA_CUBE_TYPE,
DataType,
DataTypeLike,
)
Expand Down
8 changes: 8 additions & 0 deletions xcube/core/store/datatype.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,12 @@ def __next__(self) -> T:
["gdfiter", "DataIterator[geopandas.GeoDataFrame]"],
)

VECTOR_DATA_CUBE_TYPE = DataType(xarray.Dataset, ["vectordatacube", "xarray.Dataset"])

VECTOR_DATA_CUBE_ITERATOR_TYPE = DataType(
DataIterator, # Unfortunately we cannot use DataIterator[xarray.Dataset]
["vdciter", "DataIterator[xarray.Dataset]"],
)

def register_default_data_types():
for data_type in [
Expand All @@ -205,6 +211,8 @@ def register_default_data_types():
MULTI_LEVEL_DATASET_ITERATOR_TYPE,
GEO_DATA_FRAME_TYPE,
GEO_DATA_FRAME_ITERATOR_TYPE,
VECTOR_DATA_CUBE_TYPE,
VECTOR_DATA_CUBE_ITERATOR_TYPE,
]:
DataType.register_data_type(data_type)

Expand Down
Loading