diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index fc3e47e0a1..93e3652f0d 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -329,7 +329,7 @@ def detect_schema( # Handle name/empty for index, name in enumerate(names): - names[index] = name or f"field{index+1}" + names[index] = name or f"field{index + 1}" # Deduplicate names if len(names) != len(set(names)): @@ -354,16 +354,19 @@ def detect_schema( runner_fields: List[Field] = [] # we use shared fields for candidate in field_candidates: descriptor = candidate.copy() + + if descriptor["type"] == "boolean": + if self.field_true_values != settings.DEFAULT_TRUE_VALUES: + descriptor["true_values"] = self.field_true_values # type: ignore + if self.field_false_values != settings.DEFAULT_FALSE_VALUES: + descriptor["false_values"] = self.field_false_values # type: ignore + descriptor["name"] = "shared" field = Field.from_descriptor(descriptor) if field.type == "number" and self.field_float_numbers: field.float_number = True # type: ignore - elif field.type == "boolean": - if self.field_true_values != settings.DEFAULT_TRUE_VALUES: - field.true_values = self.field_true_values # type: ignore - if self.field_false_values != settings.DEFAULT_FALSE_VALUES: - field.false_values = self.field_false_values # type: ignore runner_fields.append(field) + for index, name in enumerate(names): runners.append([]) for field in runner_fields: diff --git a/frictionless/fields/__init__.py b/frictionless/fields/__init__.py index 550d07b079..3dfea7fd50 100644 --- a/frictionless/fields/__init__.py +++ b/frictionless/fields/__init__.py @@ -1,6 +1,6 @@ from .any import AnyField as AnyField from .array import ArrayField as ArrayField -from .boolean import BooleanField as BooleanField +from .boolean import BooleanField from .date import DateField as DateField from .datetime import DatetimeField as DatetimeField from .duration import DurationField as DurationField diff --git 
a/frictionless/fields/any_descriptor.py b/frictionless/fields/any_descriptor.py new file mode 100644 index 0000000000..87ffcf6939 --- /dev/null +++ b/frictionless/fields/any_descriptor.py @@ -0,0 +1,21 @@ +from typing import Any, Literal, Optional + +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import BaseConstraints + + +class AnyFieldDescriptor(BaseFieldDescriptor): + """The field contains values of a unspecified or mixed type.""" + + type: Literal["any"] = "any" + format: Optional[Literal["default"]] = None + constraints: Optional[BaseConstraints[str]] = None + + def read_value(self, cell: Any) -> Any: + # Any field accepts any value as-is + return cell + + def write_value(self, cell: Any) -> Any: + # Any field returns the value as-is + return cell + diff --git a/frictionless/fields/array.py b/frictionless/fields/array.py index fb47aa655c..bf6d1c2bf8 100644 --- a/frictionless/fields/array.py +++ b/frictionless/fields/array.py @@ -1,6 +1,5 @@ from __future__ import annotations -import json from typing import Any, Dict, Optional import attrs @@ -55,34 +54,6 @@ def cell_reader(cell: Any): return cell_reader - def create_value_reader(self): - # Create reader - def value_reader(cell: Any): # type: ignore - if not isinstance(cell, list): - if isinstance(cell, str): - try: - cell = json.loads(cell) - except Exception: - return None - if not isinstance(cell, list): - return None - elif isinstance(cell, tuple): - cell = list(cell) # type: ignore - else: - return None - return cell # type: ignore - - return value_reader - - # Write - - def create_value_writer(self): - # Create writer - def value_writer(cell: Any): - return json.dumps(cell) - - return value_writer - # Metadata metadata_profile_patch = { diff --git a/frictionless/fields/array_descriptor.py b/frictionless/fields/array_descriptor.py new file mode 100644 index 0000000000..a890c822d5 --- /dev/null +++ b/frictionless/fields/array_descriptor.py @@ -0,0 +1,40 @@ +from __future__ 
import annotations + +import json +from typing import Any, Literal, Optional + +from pydantic import Field as PydanticField + +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import JSONConstraints + + +class ArrayFieldDescriptor(BaseFieldDescriptor): + """The field contains a valid JSON array.""" + + type: Literal["array"] = "array" + format: Optional[Literal["default"]] = None + constraints: Optional[JSONConstraints] = None + # TODO: check later: + # arrayItem in Frictionless schemas is an unnamed field-like descriptor to prevent using a full FieldDescriptor with "name" (backward compatibility) + array_item: Optional[dict[str, Any]] = PydanticField(default=None, alias="arrayItem") + + def read_value(self, cell: Any) -> Optional[list[Any]]: + if not isinstance(cell, list): + if isinstance(cell, str): + try: + cell = json.loads(cell) + except Exception: + return None + if not isinstance(cell, list): + return None + elif isinstance(cell, tuple): + cell = list(cell) # type: ignore[arg-type] + else: + return None + return cell # type: ignore[return-value] + + def write_value(self, cell: Any) -> str: + return json.dumps(cell) + + diff --git a/frictionless/fields/base_field_descriptor.py b/frictionless/fields/base_field_descriptor.py new file mode 100644 index 0000000000..d89bbac610 --- /dev/null +++ b/frictionless/fields/base_field_descriptor.py @@ -0,0 +1,65 @@ +"""base_field_descriptor.py provides the base Pydantic model for all field descriptors""" + +from __future__ import annotations + +from pydantic import BaseModel, Field as PydanticField, model_validator +from typing import Any, Dict, List, Optional +from typing_extensions import Self + + +class BaseFieldDescriptor(BaseModel): + """Data model of a (unspecialised) field descriptor""" + + name: str + """ + The field descriptor MUST contain a name property. 
+ """ + + title: Optional[str] = None + """ + A human readable label or title for the field + """ + + description: Optional[str] = None + """ + A description for this field e.g. "The recipient of the funds" + """ + + missing_values: Optional[List[str]] = PydanticField( + default=None, alias="missingValues" + ) + """ + A list of field values to consider as null values + """ + + example: Optional[Any] = None + """ + An example of a value for the field. + """ + + @model_validator(mode="before") + @classmethod + def compat(cls, data: Dict[str, Any]) -> Dict[str, Any]: + # Backward compatibility for field.format + + format_ = data.get("format") + if format_: + if format_.startswith("fmt:"): + data["format"] = format_[4:] + + return data + + @model_validator(mode="after") + def validate_example(self) -> Self: + """Validate that the example value can be converted using read_value() if available""" + if self.example is not None: + if hasattr(self, "read_value"): + read_value_method = getattr(self, "read_value") + result = read_value_method(self.example) + if result is None: + raise ValueError( + f'example value "{self.example}" for field "{self.name}" is not valid' + ) + + return self + diff --git a/frictionless/fields/boolean.py b/frictionless/fields/boolean.py index 6d78984fe1..365f8a6eef 100644 --- a/frictionless/fields/boolean.py +++ b/frictionless/fields/boolean.py @@ -1,67 +1,9 @@ from __future__ import annotations +from ..schema.field import Field -from typing import Any, Dict, List - -import attrs - -from .. 
import settings -from ..schema import Field - - -@attrs.define(kw_only=True, repr=False) class BooleanField(Field): + ### TEMP Only required for Metadata compatibility + ### This is required because "metadata_import" makes a distinction based + ### on the "type" property (`is_typed_class`) type = "boolean" - builtin = True - supported_constraints = [ - "required", - "enum", - ] - - true_values: List[str] = attrs.field(factory=settings.DEFAULT_TRUE_VALUES.copy) - """ - It defines the values to be read as true values while reading data. The default - true values are ["true", "True", "TRUE", "1"]. - """ - - false_values: List[str] = attrs.field(factory=settings.DEFAULT_FALSE_VALUES.copy) - """ - It defines the values to be read as false values while reading data. The default - true values are ["false", "False", "FALSE", "0"]. - """ - - # Read - - def create_value_reader(self): - # Create mapping - mapping: Dict[str, bool] = {} - for value in self.true_values: - mapping[value] = True - for value in self.false_values: - mapping[value] = False - - # Create reader - def value_reader(cell: Any): - if cell is True or cell is False: - return cell - if isinstance(cell, str): - return mapping.get(cell) - - return value_reader - - # Write - - def create_value_writer(self): - # Create writer - def value_writer(cell: Any): - return self.true_values[0] if cell else self.false_values[0] - - return value_writer - - # Metadata - - metadata_profile_patch = { - "properties": { - "trueValues": {"type": "array", "items": {"type": "string"}}, - "falseValues": {"type": "array", "items": {"type": "string"}}, - } - } + \ No newline at end of file diff --git a/frictionless/fields/boolean_descriptor.py b/frictionless/fields/boolean_descriptor.py new file mode 100644 index 0000000000..031540bebe --- /dev/null +++ b/frictionless/fields/boolean_descriptor.py @@ -0,0 +1,50 @@ +from typing import Any, ClassVar, List, Literal, Optional + +from pydantic import Field as PydanticField, AliasChoices + 
+from .. import settings +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import BaseConstraints + +class BooleanFieldDescriptor(BaseFieldDescriptor): + """The field contains boolean (true/false) data.""" + + type: ClassVar[Literal["boolean"]] = "boolean" + + format: Optional[Literal["default"]] = None + constraints: Optional[BaseConstraints[bool]] = None + + true_values: Optional[List[str]] = PydanticField( + default=settings.DEFAULT_TRUE_VALUES, + alias="trueValues", + validation_alias=AliasChoices("trueValues", "true_values"), + ) + """ + Values to be interpreted as "true" for boolean fields + """ + + false_values: Optional[List[str]] = PydanticField( + default=settings.DEFAULT_FALSE_VALUES, + alias="falseValues", + validation_alias=AliasChoices("falseValues", "false_values"), + ) + """ + Values to be interpreted as "false" for boolean fields + """ + + def read_value(self, cell: Any) -> Optional[bool]: + if isinstance(cell, bool): + return cell + + if isinstance(cell, str): + if self.true_values and cell in self.true_values: + return True + if self.false_values and cell in self.false_values: + return False + + return None + + def write_value(self, cell: Optional[bool]) -> Optional[str]: + if self.true_values and self.false_values: + return self.true_values[0] if cell else self.false_values[0] + return None diff --git a/frictionless/fields/date.py b/frictionless/fields/date.py index 809f037ec1..b13521ca1b 100644 --- a/frictionless/fields/date.py +++ b/frictionless/fields/date.py @@ -1,15 +1,8 @@ from __future__ import annotations -from datetime import date, datetime -from typing import Any - import attrs - -from .. 
import settings -from ..platform import platform from ..schema import Field - @attrs.define(kw_only=True, repr=False) class DateField(Field): type = "date" @@ -21,49 +14,3 @@ class DateField(Field): "enum", ] - # Read - - # TODO: use different value_readers based on format (see string) - def create_value_reader(self): - # Create reader - def value_reader(cell: Any): - if isinstance(cell, datetime): - value_time = cell.time() - if ( - value_time.hour == 0 - and value_time.minute == 0 - and value_time.second == 0 - ): - return datetime(cell.year, cell.month, cell.day).date() - else: - return None - if isinstance(cell, date): - return cell - if not isinstance(cell, str): - return None - try: - if self.format == "default": - cell = datetime.strptime(cell, settings.DEFAULT_DATE_PATTERN).date() - elif self.format == "any": - cell = platform.dateutil_parser.parse(cell).date() - else: - cell = datetime.strptime(cell, self.format).date() - except Exception: - return None - return cell - - return value_reader - - # Write - - def create_value_writer(self): - # Create format - format = self.format - if format == settings.DEFAULT_FIELD_FORMAT: - format = settings.DEFAULT_DATE_PATTERN - - # Create writer - def value_writer(cell: Any): - return cell.strftime(format) - - return value_writer diff --git a/frictionless/fields/date_descriptor.py b/frictionless/fields/date_descriptor.py new file mode 100644 index 0000000000..b332c55508 --- /dev/null +++ b/frictionless/fields/date_descriptor.py @@ -0,0 +1,53 @@ +import datetime +from typing import Any, Literal, Optional + + +from .. 
import settings +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import ValueConstraints + + +class DateFieldDescriptor(BaseFieldDescriptor): + """The field contains a date without a time.""" + + type: Literal["date"] = "date" + format: Optional[str] = None + constraints: Optional[ValueConstraints[str]] = None + + def read_value(self, cell: Any) -> Optional[datetime.date]: + from datetime import date, datetime + from ..platform import platform + + if isinstance(cell, datetime): + value_time = cell.time() + if ( + value_time.hour == 0 + and value_time.minute == 0 + and value_time.second == 0 + ): + return datetime(cell.year, cell.month, cell.day).date() + else: + return None + if isinstance(cell, date): + return cell + if not isinstance(cell, str): + return None + try: + format_value = self.format or "default" + if format_value == "default": + cell = datetime.strptime(cell, settings.DEFAULT_DATE_PATTERN).date() + elif format_value == "any": + cell = platform.dateutil_parser.parse(cell).date() + else: + cell = datetime.strptime(cell, format_value).date() + except Exception: + return None + return cell + + def write_value(self, cell: Optional[datetime.date]) -> Optional[str]: + if cell is None: + return None + format_value = self.format or "default" + if format_value == settings.DEFAULT_FIELD_FORMAT: + format_value = settings.DEFAULT_DATE_PATTERN + return cell.strftime(format_value) diff --git a/frictionless/fields/datetime.py b/frictionless/fields/datetime.py index 6ac16d20b5..0795c8dfea 100644 --- a/frictionless/fields/datetime.py +++ b/frictionless/fields/datetime.py @@ -1,12 +1,7 @@ from __future__ import annotations -from datetime import datetime -from typing import Any - import attrs -from .. 
import settings -from ..platform import platform from ..schema import Field @@ -20,44 +15,3 @@ class DatetimeField(Field): "maximum", "enum", ] - - # Read - - # TODO: use different value_readers based on format (see string) - def create_value_reader(self): - # Create reader - def value_reader(cell: Any): - if not isinstance(cell, datetime): - if not isinstance(cell, str): - return None - try: - if self.format == "default": - # Guard against shorter formats supported by dateutil - assert cell[16] == ":" - assert len(cell) >= 19 - cell = platform.dateutil_parser.isoparse(cell) - elif self.format == "any": - cell = platform.dateutil_parser.parse(cell) - else: - cell = datetime.strptime(cell, self.format) - except Exception: - return None - return cell - - return value_reader - - # Write - - def create_value_writer(self): - # Create format - format = self.format - if format == settings.DEFAULT_FIELD_FORMAT: - format = settings.DEFAULT_DATETIME_PATTERN - - # Create writer - def value_writer(cell: Any): - cell = cell.strftime(format) - cell = cell.replace("+0000", "Z") - return cell - - return value_writer diff --git a/frictionless/fields/datetime_descriptor.py b/frictionless/fields/datetime_descriptor.py new file mode 100644 index 0000000000..c2fca77d99 --- /dev/null +++ b/frictionless/fields/datetime_descriptor.py @@ -0,0 +1,44 @@ +import datetime +from typing import Any, Literal, Optional + +from .. 
import settings +from ..platform import platform +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import ValueConstraints + + +class DatetimeFieldDescriptor(BaseFieldDescriptor): + """The field contains a date with a time.""" + + type: Literal["datetime"] = "datetime" + format: Optional[str] = None + constraints: Optional[ValueConstraints[datetime.datetime]] = None + + def read_value(self, cell: Any) -> Optional[datetime.datetime]: + if not isinstance(cell, datetime.datetime): + if not isinstance(cell, str): + return None + try: + format_value = self.format or "default" + if format_value == "default": + # Guard against shorter formats supported by dateutil + assert cell[16] == ":" + assert len(cell) >= 19 + cell = platform.dateutil_parser.isoparse(cell) + elif format_value == "any": + cell = platform.dateutil_parser.parse(cell) + else: + cell = datetime.datetime.strptime(cell, format_value) + except Exception: + return None + return cell + + def write_value(self, cell: Optional[datetime.datetime]) -> Optional[str]: + if cell is None: + return None + format_value = self.format or "default" + if format_value == settings.DEFAULT_FIELD_FORMAT: + format_value = settings.DEFAULT_DATETIME_PATTERN + result = cell.strftime(format_value) + result = result.replace("+0000", "Z") + return result diff --git a/frictionless/fields/duration.py b/frictionless/fields/duration.py index f0ddc61451..d641442c4b 100644 --- a/frictionless/fields/duration.py +++ b/frictionless/fields/duration.py @@ -1,11 +1,7 @@ from __future__ import annotations -import datetime -from typing import Any - import attrs -from ..platform import platform from ..schema import Field @@ -17,28 +13,3 @@ class DurationField(Field): "required", "enum", ] - - # Read - - def create_value_reader(self): - # Create reader - def value_reader(cell: Any): - if not isinstance(cell, (platform.isodate.Duration, datetime.timedelta)): # type: ignore - if not isinstance(cell, str): - return None - 
try: - cell = platform.isodate.parse_duration(cell) # type: ignore - except Exception: - return None - return cell - - return value_reader - - # Write - - def create_value_writer(self): - # Create writer - def value_writer(cell: Any): # type: ignore - return platform.isodate.duration_isoformat(cell) # type: ignore - - return value_writer diff --git a/frictionless/fields/duration_descriptor.py b/frictionless/fields/duration_descriptor.py new file mode 100644 index 0000000000..0215af3c6e --- /dev/null +++ b/frictionless/fields/duration_descriptor.py @@ -0,0 +1,31 @@ +import datetime +from typing import Any, Literal, Optional + +from ..platform import platform +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import ValueConstraints + + +class DurationFieldDescriptor(BaseFieldDescriptor): + """The field contains a duration of time.""" + + type: Literal["duration"] = "duration" + format: Optional[Literal["default"]] = None + constraints: Optional[ValueConstraints[str]] = None + + def read_value(self, cell: Any) -> Any: + if not isinstance(cell, (platform.isodate.Duration, datetime.timedelta)): # type: ignore + if not isinstance(cell, str): + return None + try: + cell = platform.isodate.parse_duration(cell) # type: ignore + except Exception: + return None + return cell + + def write_value(self, cell: Any) -> Optional[str]: + if cell is None: + return None + return platform.isodate.duration_isoformat(cell) # type: ignore + + diff --git a/frictionless/fields/field_constraints.py b/frictionless/fields/field_constraints.py new file mode 100644 index 0000000000..9323714d0f --- /dev/null +++ b/frictionless/fields/field_constraints.py @@ -0,0 +1,33 @@ +"""field_constraints.py provide pydantic Models for constraints""" + +from typing import Any, Dict, Generic, List, Optional, TypeVar, Union + +import pydantic + +T = TypeVar("T") + + +class BaseConstraints(pydantic.BaseModel, Generic[T]): + required: Optional[bool] = None + unique: Optional[bool] = 
None + enum: Optional[List[Union[str, T]]] = None + + +class CollectionConstraints(BaseConstraints[str]): + minLength: Optional[int] = None + maxLength: Optional[int] = None + + +class JSONConstraints(CollectionConstraints): + jsonSchema: Optional[Dict[str, Any]] = None + + +class StringConstraints(CollectionConstraints): + pattern: Optional[str] = None + + +class ValueConstraints(BaseConstraints[T], Generic[T]): + minimum: Optional[Union[str, T]] = None + maximum: Optional[Union[str, T]] = None + exclusiveMinimum: Optional[Union[str, T]] = None + exclusiveMaximum: Optional[Union[str, T]] = None diff --git a/frictionless/fields/field_descriptor.py b/frictionless/fields/field_descriptor.py new file mode 100644 index 0000000000..4841ad7ff5 --- /dev/null +++ b/frictionless/fields/field_descriptor.py @@ -0,0 +1,82 @@ +from __future__ import annotations + +from typing import Literal, Optional, Union + +from pydantic import Field as PydanticField + +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import CollectionConstraints + +from .any_descriptor import AnyFieldDescriptor +from .array_descriptor import ArrayFieldDescriptor +from .boolean_descriptor import BooleanFieldDescriptor +from .date_descriptor import DateFieldDescriptor +from .datetime_descriptor import DatetimeFieldDescriptor +from .duration_descriptor import DurationFieldDescriptor +from .geojson_descriptor import GeoJSONFieldDescriptor +from .geopoint_descriptor import GeoPointFieldDescriptor +from .integer_descriptor import IntegerFieldDescriptor +from .number_descriptor import NumberFieldDescriptor +from .object_descriptor import ObjectFieldDescriptor +from .string_descriptor import StringFieldDescriptor +from .time_descriptor import TimeFieldDescriptor +from .year_descriptor import YearFieldDescriptor +from .yearmonth_descriptor import YearmonthFieldDescriptor + + +IItemType = Literal[ + "boolean", + "date", + "datetime", + "integer", + "number", + "string", + "time", +] + 
+# TODO: why is this not implemented? +class ListFieldDescriptor(BaseFieldDescriptor): + """The field contains data that is an ordered + one-level depth collection of primitive values with a fixed item type. + """ + + type: Literal["list"] = "list" + format: Optional[Literal["default"]] = None + constraints: CollectionConstraints = PydanticField( + default_factory=CollectionConstraints + ) + + delimiter: Optional[str] = None + """ + Specifies the character sequence which separates lexically represented list items. + """ + + item_type: Optional[IItemType] = PydanticField(default=None, alias="itemType") + """ + Specifies the list item type in terms of existent Table Schema types. + """ + + +FieldDescriptorNoArrayOrList = Union[ + AnyFieldDescriptor, + BooleanFieldDescriptor, + DateFieldDescriptor, + DatetimeFieldDescriptor, + DurationFieldDescriptor, + GeoJSONFieldDescriptor, + GeoPointFieldDescriptor, + IntegerFieldDescriptor, + NumberFieldDescriptor, + ObjectFieldDescriptor, + StringFieldDescriptor, + TimeFieldDescriptor, + YearFieldDescriptor, + YearmonthFieldDescriptor, +] + +# Recursive field descriptors (reference FieldDescriptor itself) +FieldDescriptor = Union[ + FieldDescriptorNoArrayOrList, + ArrayFieldDescriptor, + ListFieldDescriptor, +] diff --git a/frictionless/fields/geojson.py b/frictionless/fields/geojson.py index 488421e2b3..12299ee23e 100644 --- a/frictionless/fields/geojson.py +++ b/frictionless/fields/geojson.py @@ -1,12 +1,7 @@ from __future__ import annotations -import json -from typing import Any, Dict, cast - import attrs -from .. 
import settings -from ..platform import platform from ..schema import Field @@ -18,53 +13,3 @@ class GeojsonField(Field): "required", "enum", ] - - # Read - - # TODO: use different value_readers based on format (see string) - def create_value_reader(self): - validator_for = platform.jsonschema_validators.validator_for # type: ignore - validators = { # type: ignore - "default": validator_for(settings.GEOJSON_PROFILE)(settings.GEOJSON_PROFILE), - "topojson": validator_for(settings.TOPOJSON_PROFILE)( - settings.TOPOJSON_PROFILE - ), - } - - # Create reader - def value_reader(cell: Any): - if isinstance(cell, str): - try: - cell = json.loads(cell) - except Exception: - return None - if not isinstance(cell, dict): - return None - if self.format in ["default", "topojson"]: - try: - validators[self.format].validate(cell) # type: ignore - except Exception: - return None - return cast(Dict[str, Any], cell) - - return value_reader - - # Write - - def create_value_writer(self): - # Create writer - def value_writer(cell: Any): - return json.dumps(cell) - - return value_writer - - # Metadata - - metadata_profile_patch = { - "properties": { - "format": { - "type": "string", - "enum": ["default", "topojson"], - }, - } - } diff --git a/frictionless/fields/geojson_descriptor.py b/frictionless/fields/geojson_descriptor.py new file mode 100644 index 0000000000..918b1a1466 --- /dev/null +++ b/frictionless/fields/geojson_descriptor.py @@ -0,0 +1,44 @@ +import json +from typing import Any, Dict, Literal, Optional, cast + +from .. 
import settings +from ..platform import platform +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import BaseConstraints + + +class GeoJSONFieldDescriptor(BaseFieldDescriptor): + """The field contains a JSON object according to GeoJSON or TopoJSON spec.""" + + type: Literal["geojson"] = "geojson" + format: Optional[Literal["default", "topojson"]] = None + constraints: Optional[BaseConstraints[str]] = None + + def read_value(self, cell: Any) -> Optional[Dict[str, Any]]: + validator_for = platform.jsonschema_validators.validator_for # type: ignore + validators = { # type: ignore + "default": validator_for(settings.GEOJSON_PROFILE)(settings.GEOJSON_PROFILE), + "topojson": validator_for(settings.TOPOJSON_PROFILE)( + settings.TOPOJSON_PROFILE + ), + } + + if isinstance(cell, str): + try: + cell = json.loads(cell) + except Exception: + return None + if not isinstance(cell, dict): + return None + if self.format in ["default", "topojson"]: + try: + validators[self.format].validate(cell) # type: ignore + except Exception: + return None + return cast(Dict[str, Any], cell) + + def write_value(self, cell: Any) -> Optional[str]: + if cell is None: + return None + return json.dumps(cell) + diff --git a/frictionless/fields/geopoint.py b/frictionless/fields/geopoint.py index 46a1435350..794de21fce 100644 --- a/frictionless/fields/geopoint.py +++ b/frictionless/fields/geopoint.py @@ -1,9 +1,5 @@ from __future__ import annotations -import json -from decimal import Decimal -from typing import Any, NamedTuple - import attrs from ..schema import Field @@ -17,65 +13,3 @@ class GeopointField(Field): "required", "enum", ] - - # Read - - def create_value_reader(self): - # Create reader - def value_reader(cell: Any): - # Parse - if isinstance(cell, str): - try: - if self.format == "default": - lon, lat = cell.split(",") - lon = lon.strip() - lat = lat.strip() - elif self.format == "array": - lon, lat = json.loads(cell) - elif self.format == "object": - cell = 
json.loads(cell) - if len(cell) != 2: - return None - lon = cell["lon"] - lat = cell["lat"] - cell = geopoint(Decimal(lon), Decimal(lat)) # type: ignore - except Exception: - return None - - # Validate - try: - cell = geopoint(*cell) - if cell.lon > 180 or cell.lon < -180: - return None - if cell.lat > 90 or cell.lat < -90: - return None - except Exception: - return None - - return cell - - return value_reader - - # Write - - def create_value_writer(self): - # Create writer - def value_writer(cell: Any): - if self.format == "array": - return json.dumps(list(cell)) - elif self.format == "object": - return json.dumps({"lon": cell.lon, "lat": cell.lat}) - return ",".join(map(str, cell)) - - return value_writer - - -# Internal - - -class geopoint(NamedTuple): - lon: int - lat: int - - def __repr__(self): - return str([float(self[0]), float(self[1])]) diff --git a/frictionless/fields/geopoint_descriptor.py b/frictionless/fields/geopoint_descriptor.py new file mode 100644 index 0000000000..872bdb3f5d --- /dev/null +++ b/frictionless/fields/geopoint_descriptor.py @@ -0,0 +1,66 @@ +import json +from decimal import Decimal +from typing import Any, Literal, NamedTuple, Optional + +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import BaseConstraints + + +class geopoint(NamedTuple): + """Internal representation of a geographic point""" + lon: Decimal + lat: Decimal + + def __repr__(self): + return str([float(self[0]), float(self[1])]) + + +class GeoPointFieldDescriptor(BaseFieldDescriptor): + """The field contains data describing a geographic point.""" + + type: Literal["geopoint"] = "geopoint" + format: Optional[Literal["default", "array", "object"]] = None + constraints: Optional[BaseConstraints[str]] = None + + def read_value(self, cell: Any) -> Optional[geopoint]: + # Parse + if isinstance(cell, str): + try: + if self.format == "default" or self.format is None: + lon, lat = cell.split(",") + lon = lon.strip() + lat = lat.strip() + elif 
self.format == "array": + lon, lat = json.loads(cell) + elif self.format == "object": + cell = json.loads(cell) + if len(cell) != 2: + return None + lon = cell["lon"] + lat = cell["lat"] + cell = geopoint(Decimal(lon), Decimal(lat)) # type: ignore + except Exception: + return None + + # Validate + try: + cell = geopoint(*cell) + if cell.lon > 180 or cell.lon < -180: + return None + if cell.lat > 90 or cell.lat < -90: + return None + except Exception: + return None + + return cell + + def write_value(self, cell: Any) -> Optional[str]: + if cell is None: + return None + format_value = self.format or "default" + if format_value == "array": + return json.dumps(list(cell)) + elif format_value == "object": + return json.dumps({"lon": cell.lon, "lat": cell.lat}) + return ",".join(map(str, cell)) + diff --git a/frictionless/fields/integer.py b/frictionless/fields/integer.py index 28586607fc..40951afbaa 100644 --- a/frictionless/fields/integer.py +++ b/frictionless/fields/integer.py @@ -1,17 +1,10 @@ from __future__ import annotations +from ..schema.field import Field -import re -from decimal import Decimal -from typing import Any - -import attrs - -from .. import settings -from ..schema import Field - - -@attrs.define(kw_only=True, repr=False) class IntegerField(Field): + ### TEMP Only required for Metadata compatibility + ### This is required because "metadata_import" makes a distinction based + ### on the "type" property (`is_typed_class`) type = "integer" builtin = True supported_constraints = [ @@ -20,62 +13,3 @@ class IntegerField(Field): "maximum", "enum", ] - - bare_number: bool = settings.DEFAULT_BARE_NUMBER - """ - It specifies that the value is a bare number. If true, the pattern to - remove non digit character does not get applied and vice versa. - The default value is True. 
- """ - - # Read - - def create_value_reader(self): - # Create pattern - pattern = None - if not self.bare_number: - pattern = re.compile(r"((^[^-\d]*)|(\D*$))") - - # Create reader - def value_reader(cell: Any): - if isinstance(cell, str): - cell = cell.strip() - - # Process the cell - if pattern: - cell = pattern.sub("", cell) - - # Cast the cell - try: - return int(cell) - except Exception: - return None - - elif cell is True or cell is False: - return None - elif isinstance(cell, int): - return cell - elif isinstance(cell, float) and cell.is_integer(): - return int(cell) - elif isinstance(cell, Decimal) and cell % 1 == 0: - return int(cell) - return None - - return value_reader - - # Write - - def create_value_writer(self): - # Create writer - def value_writer(cell: Any): - return str(cell) - - return value_writer - - # Metadata - - metadata_profile_patch = { - "properties": { - "bareNumber": {"type": "boolean"}, - } - } diff --git a/frictionless/fields/integer_descriptor.py b/frictionless/fields/integer_descriptor.py new file mode 100644 index 0000000000..675e336bcb --- /dev/null +++ b/frictionless/fields/integer_descriptor.py @@ -0,0 +1,87 @@ + +import re +from decimal import Decimal +from typing import Any, ClassVar, Literal, Optional, Pattern, Union, List + +from pydantic import Field as PydanticField, BaseModel + +from .. 
class CategoryDict(BaseModel):
    """Category dictionary for field categories."""

    value: str
    label: Optional[str] = None


ICategories = Union[
    List[str],
    List[CategoryDict],
]


class IntegerFieldDescriptor(BaseFieldDescriptor):
    """The field contains integers - that is whole numbers."""

    type: Literal["integer"] = "integer"
    format: Optional[Literal["default"]] = None
    constraints: Optional[ValueConstraints[int]] = None

    categories: Optional[ICategories] = None
    """
    Property to restrict the field to a finite set of possible values
    """

    categories_ordered: Optional[bool] = PydanticField(
        default=None, alias="categoriesOrdered"
    )
    """
    When categoriesOrdered is true, implementations SHOULD regard the order of
    appearance of the values in the categories property as their natural order.
    """

    group_char: Optional[str] = PydanticField(default=None, alias="groupChar")
    """
    String whose value is used to group digits for integer/number fields
    """

    bare_number: bool = PydanticField(
        default=settings.DEFAULT_BARE_NUMBER, alias="bareNumber"
    )
    """
    If false leading and trailing non numbers will be removed for integer/number fields
    """

    # Matches a leading run of characters that are neither "-" nor a digit,
    # or a trailing run of non-digit characters.
    pattern: ClassVar[Pattern[str]] = re.compile(r"((^[^-\d]*)|(\D*$))")

    def read_value(self, cell: Any) -> Optional[int]:
        """Parse a cell into an ``int``; return ``None`` when it is not one.

        Booleans are rejected even though ``bool`` subclasses ``int``.
        Strings are stripped, cleaned of non-numeric affixes when
        ``bare_number`` is false, cleaned of ``group_char`` when one is
        configured, and then cast. Integral floats and Decimals are converted.
        """
        if isinstance(cell, bool):
            return None

        if isinstance(cell, int):
            return cell

        if isinstance(cell, str):
            text = cell.strip()

            # Remove non-digit affixes (e.g. currency signs) if requested
            if not self.bare_number:
                text = self.pattern.sub("", text)

            # Honour the declared digit-grouping character ("1,000" -> "1000")
            if self.group_char:
                text = text.replace(self.group_char, "")

            try:
                return int(text)
            except ValueError:
                return None

        if isinstance(cell, float) and cell.is_integer():
            return int(cell)

        if isinstance(cell, Decimal):
            # "cell % 1" raises InvalidOperation for infinities and for values
            # whose quotient exceeds the context precision, so test
            # integrality without the remainder operation.
            if cell.is_finite() and cell == cell.to_integral_value():
                return int(cell)

        return None

    def write_value(self, cell: Optional[int]) -> Optional[str]:
        """Return ``str(cell)``, or ``None`` when the cell is ``None``."""
        if cell is None:
            return None
        return str(cell)
in cell: - return None - cell = cell.replace(self.decimal_char, ".") - return cell - - processor = processor_function - - # Create reader - def value_reader(cell: Any): - Primary = Decimal - Secondary = float - if self.float_number: - Primary = float - Secondary = Decimal - if isinstance(cell, str): - cell = cell.strip() - - # Process the cell - if processor: - cell = processor(cell) # type: ignore - if cell is None: - return None - - # Cast the cell - try: - return Primary(cell) # type: ignore - except Exception: - return None - - elif isinstance(cell, Primary): - return cell - elif cell is True or cell is False: - return None - elif isinstance(cell, int): - return cell - elif isinstance(cell, Secondary): - return Primary(str(cell) if Primary is Decimal else cell) - return None - - return value_reader - - # Write - - # TODO: optimize - def create_value_writer(self): - # Create writer - def value_writer(cell: Any): - if self.has_defined("group_char"): - cell = f"{cell:,}".replace(",", "g") - else: - cell = str(cell) - if self.has_defined("decimal_char"): - cell = cell.replace(".", self.decimal_char) - if self.has_defined("group_char"): - cell = cell.replace("g", self.group_char) - return cell - - return value_writer - - # Metadata - metadata_profile_patch = { - "properties": { - "bareNumber": {"type": "boolean"}, - "floatNumber": {"type": "boolean"}, - "decimalChar": {"type": "string"}, - "groupChar": {"type": "string"}, - } - } diff --git a/frictionless/fields/number_descriptor.py b/frictionless/fields/number_descriptor.py new file mode 100644 index 0000000000..e91cb958a6 --- /dev/null +++ b/frictionless/fields/number_descriptor.py @@ -0,0 +1,109 @@ +import re +from decimal import Decimal +from typing import Any, Callable, Literal, Optional, Pattern, Union + +from pydantic import Field as PydanticField + +from .. 
# Compiled once at import time rather than on every read_value() call.
# Matches a leading run of characters that are neither "-" nor a digit, or a
# trailing run of non-digit characters.
_NON_NUMERIC_AFFIXES: Pattern[str] = re.compile(r"((^[^-\d]*)|(\D*$))")


class NumberFieldDescriptor(BaseFieldDescriptor):
    """The field contains numbers of any kind including decimals."""

    type: Literal["number"] = "number"
    format: Optional[Literal["default"]] = None
    constraints: Optional[ValueConstraints[Union[int, float]]] = None

    decimal_char: Optional[str] = PydanticField(default=None, alias="decimalChar")
    """
    String whose value is used to represent a decimal point for number fields
    """

    group_char: Optional[str] = PydanticField(default=None, alias="groupChar")
    """
    String whose value is used to group digits for integer/number fields
    """

    bare_number: Optional[bool] = PydanticField(default=None, alias="bareNumber")
    """
    If false leading and trailing non numbers will be removed for integer/number fields
    """

    float_number: Optional[bool] = PydanticField(default=None, alias="floatNumber")
    """
    It specifies that the value is a float number.
    """

    def read_value(self, cell: Any) -> Optional[Union[float, Decimal]]:
        """Parse a cell into a number; return ``None`` when it is not one.

        Strings are cast to ``Decimal`` (or ``float`` when ``float_number`` is
        in effect). Values already of that type and plain ints are returned
        unchanged, values of the other numeric type are converted, and
        booleans are rejected.
        """
        # Resolve unset options against the library defaults
        bare_number = (
            self.bare_number
            if self.bare_number is not None
            else settings.DEFAULT_BARE_NUMBER
        )
        decimal_char = (
            self.decimal_char
            if self.decimal_char is not None
            else settings.DEFAULT_DECIMAL_CHAR
        )
        group_char = (
            self.group_char
            if self.group_char is not None
            else settings.DEFAULT_GROUP_CHAR
        )
        float_number = (
            self.float_number
            if self.float_number is not None
            else settings.DEFAULT_FLOAT_NUMBER
        )

        # String clean-up only runs when at least one lexical option was set
        # explicitly; otherwise the text is cast as-is.
        preprocess = (
            self.decimal_char is not None
            or self.group_char is not None
            or self.bare_number is not None
        )

        # Primary is the type produced; Secondary is the type converted from
        Primary, Secondary = (float, Decimal) if float_number else (Decimal, float)

        if isinstance(cell, str):
            cell = cell.strip()

            if preprocess:
                if not bare_number:
                    cell = _NON_NUMERIC_AFFIXES.sub("", cell)
                cell = cell.replace(group_char, "")
                # A literal "." is ambiguous once another decimal char is used
                if decimal_char != "." and "." in cell:
                    return None
                cell = cell.replace(decimal_char, ".")

            try:
                return Primary(cell)  # type: ignore
            except Exception:
                return None

        elif isinstance(cell, Primary):
            return cell
        elif cell is True or cell is False:
            return None
        elif isinstance(cell, int):
            return cell
        elif isinstance(cell, Secondary):
            # Go through str() for float -> Decimal to avoid binary noise
            return Primary(str(cell) if Primary is Decimal else cell)
        return None

    def write_value(self, cell: Any) -> Optional[str]:
        """Format a number using the configured group/decimal characters."""
        if cell is None:
            return None

        if self.group_char is not None:
            # "g" is a temporary stand-in for the thousands separator so the
            # decimal-char replacement below cannot collide with it.
            cell = f"{cell:,}".replace(",", "g")
        else:
            cell = str(cell)
        if self.decimal_char is not None:
            cell = cell.replace(".", self.decimal_char)
        if self.group_char is not None:
            cell = cell.replace("g", self.group_char)
        return cell
class ObjectFieldDescriptor(BaseFieldDescriptor):
    """The field contains a valid JSON object."""

    type: Literal["object"] = "object"
    format: Optional[Literal["default"]] = None
    constraints: Optional[JSONConstraints] = None

    def read_value(self, cell: Any) -> Optional[Dict[str, Any]]:
        """Return the cell as a dict, decoding JSON text when needed.

        Dicts pass through untouched. Strings are decoded as JSON and accepted
        only if the result is a dict. Anything else yields ``None``.
        """
        if isinstance(cell, dict):
            return cast(Dict[str, Any], cell)
        if not isinstance(cell, str):
            return None
        try:
            decoded = json.loads(cell)
        except Exception:
            return None
        if isinstance(decoded, dict):
            return cast(Dict[str, Any], decoded)
        return None

    def write_value(self, cell: Any) -> Optional[str]:
        """Return the JSON encoding of the cell, or ``None`` for ``None``."""
        return None if cell is None else json.dumps(cell)
- except platform.rfc3986.exceptions.ValidationError: # type: ignore - return None - return cell - - # Email - elif self.format == "email": - - def value_reader(cell: Any): - if not isinstance(cell, str): - return None - if not platform.validators.email(cell): # type: ignore - return None - return cell - - # Uuid - elif self.format == "uuid": - - def value_reader(cell: Any): - if not isinstance(cell, str): - return None - if not platform.validators.uuid(cell): # type: ignore - return None - return cell - - # Binary - elif self.format == "binary": - - def value_reader(cell: Any): - if not isinstance(cell, str): - return None - try: - base64.b64decode(cell) - except Exception: - return None - return cell - - # WKT - elif self.format == "wkt": - parser = platform.wkt.Parser() - - def value_reader(cell: Any): - if not isinstance(cell, str): - return None - try: - parser.parse(cell) - except Exception: - return None - return cell - - # Default - else: - - def value_reader(cell: Any): - if not isinstance(cell, str): - return None - - return cell - - return value_reader - - # Write - - def create_value_writer(self): - # Create writer - def value_writer(cell: Any): - return str(cell) - - return value_writer - - # Metadata - - metadata_profile_patch = { - "properties": { - "format": { - "type": "string", - "enum": ["default", "email", "uri", "binary", "uuid", "wkt"], - }, - } - } diff --git a/frictionless/fields/string_descriptor.py b/frictionless/fields/string_descriptor.py new file mode 100644 index 0000000000..f0034c8e72 --- /dev/null +++ b/frictionless/fields/string_descriptor.py @@ -0,0 +1,102 @@ +import base64 +from typing import Any, Literal, Optional, Union, List + +from pydantic import Field as PydanticField, BaseModel +from ..platform import platform +from .base_field_descriptor import BaseFieldDescriptor +from .field_constraints import StringConstraints + +class CategoryDict(BaseModel): + """Category dictionary for field categories.""" + value: str + label: 
ICategories = Union[
    List[str],
    List[CategoryDict],
]
"""Categories type used by IntegerFieldDescriptor and StringFieldDescriptor"""


class StringFieldDescriptor(BaseFieldDescriptor):
    """The field contains strings, that is, sequences of characters."""

    type: Literal["string"] = "string"
    format: Optional[Literal["default", "binary", "email", "uri", "uuid", "wkt"]] = None
    constraints: StringConstraints = PydanticField(default_factory=StringConstraints)

    categories: Optional[ICategories] = None
    """
    Property to restrict the field to a finite set of possible values
    """

    # NOTE(review): IntegerFieldDescriptor names this attribute
    # ``categories_ordered`` with the same alias; renaming here would change
    # the attribute callers access, so it is left as-is.
    categoriesOrdered: Optional[bool] = PydanticField(default=None, alias="categoriesOrdered")
    """
    When categoriesOrdered is true, implementations SHOULD regard the order of
    appearance of the values in the categories property as their natural order.
    """

    def read_value(self, cell: Any) -> Optional[str]:
        """Return the cell if it is a string valid for the configured format.

        Non-string cells are rejected for every format. The "uri", "email",
        "uuid", "binary" and "wkt" formats additionally validate the text;
        any other format accepts every string. Invalid cells yield ``None``.
        """
        # Every format only accepts strings, so reject anything else before
        # doing any format-specific work (e.g. building a WKT parser).
        if not isinstance(cell, str):
            return None

        format_value = self.format or "default"

        # Uri
        if format_value == "uri":
            uri_validator = platform.rfc3986.validators.Validator()  # type: ignore
            uri_validator.require_presence_of("scheme")  # type: ignore
            uri = platform.rfc3986.uri_reference(cell)  # type: ignore
            try:
                uri_validator.validate(uri)  # type: ignore
            except platform.rfc3986.exceptions.ValidationError:  # type: ignore
                return None
            return cell

        # Email
        if format_value == "email":
            result = platform.validators.email(cell)  # type: ignore
            # Only an explicit True counts as valid
            return cell if result is True else None

        # Uuid
        if format_value == "uuid":
            if not platform.validators.uuid(cell):  # type: ignore
                return None
            return cell

        # Binary
        if format_value == "binary":
            try:
                base64.b64decode(cell)
            except Exception:
                return None
            return cell

        # WKT
        if format_value == "wkt":
            parser = platform.wkt.Parser()
            try:
                parser.parse(cell)
            except Exception:
                return None
            return cell

        # Default
        return cell

    def write_value(self, cell: Any) -> Optional[str]:
        """Return ``str(cell)``, or ``None`` when the cell is ``None``."""
        if cell is None:
            return None
        return str(cell)
class TimeFieldDescriptor(BaseFieldDescriptor):
    """The field contains a time without a date."""

    type: Literal["time"] = "time"
    format: Optional[str] = None
    constraints: Optional[ValueConstraints[time]] = None

    def read_value(self, cell: Any) -> Optional[time]:
        """Parse a cell into a ``datetime.time``; return ``None`` on failure.

        ``time`` instances pass through. Strings are parsed according to
        ``format``: "default" expects an ISO time of at least 8 characters,
        "any" delegates to dateutil's generic parser, and any other value is
        used as a ``strptime`` pattern. Non-string cells yield ``None``.
        """
        if isinstance(cell, time):
            return cell
        if not isinstance(cell, str):
            return None

        format_value = self.format or "default"
        try:
            if format_value == "default":
                # Guard against shorter formats supported by dateutil. This is
                # an explicit check rather than ``assert`` so it still runs
                # under ``python -O``.
                if len(cell) < 8 or cell[5] != ":":
                    return None
                return platform.dateutil_parser.isoparse(
                    f"2000-01-01T{cell}"
                ).timetz()
            if format_value == "any":
                return platform.dateutil_parser.parse(cell).timetz()
            return datetime.datetime.strptime(cell, format_value).timetz()
        except Exception:
            return None

    def write_value(self, cell: Optional[time]) -> Optional[str]:
        """Format a time with the field's pattern; ``None`` stays ``None``.

        The default format maps to ``settings.DEFAULT_TIME_PATTERN`` and a
        "+0000" offset in the output is rewritten as "Z".
        """
        if cell is None:
            return None
        format_value = self.format or "default"
        if format_value == settings.DEFAULT_FIELD_FORMAT:
            format_value = settings.DEFAULT_TIME_PATTERN
        result = cell.strftime(format_value)
        result = result.replace("+0000", "Z")
        return result
class YearFieldDescriptor(BaseFieldDescriptor):
    """The field contains a calendar year."""

    type: Literal["year"] = "year"
    format: Optional[Literal["default"]] = None
    constraints: Optional[ValueConstraints[int]] = None

    def read_value(self, cell: Any) -> Optional[int]:
        """Parse a cell into a year in 0..9999; return ``None`` otherwise.

        Ints are range-checked directly. Strings must be exactly four
        characters long and parse as an integer. Booleans and all other types
        are rejected.
        """
        # bool subclasses int, so without this guard True would read as year 1
        if isinstance(cell, bool):
            return None

        if not isinstance(cell, int):
            if not isinstance(cell, str):
                return None
            if len(cell) != 4:
                return None
            try:
                cell = int(cell)
            except ValueError:
                return None

        if not 0 <= cell <= 9999:
            return None
        return cell

    def write_value(self, cell: Optional[int]) -> Optional[str]:
        """Return ``str(cell)``, or ``None`` when the cell is ``None``."""
        if cell is None:
            return None
        return str(cell)
class yearmonth(NamedTuple):
    """Internal representation of a year-month"""

    year: int
    month: int


class YearmonthFieldDescriptor(BaseFieldDescriptor):
    """The field contains a specific month of a specific year."""

    type: Literal["yearmonth"] = "yearmonth"
    format: Optional[Literal["default"]] = None
    constraints: Optional[ValueConstraints[str]] = None

    def read_value(self, cell: Any) -> Optional[yearmonth]:
        """Parse a cell into a ``yearmonth``; return ``None`` when invalid.

        Accepts a two-item tuple/list ``(year, month)`` or a "YEAR-MONTH"
        string. In both cases the month must lie in 1..12.
        """
        if isinstance(cell, (tuple, list)):
            if len(cell) != 2:  # type: ignore
                return None
            year, month = cell  # type: ignore
            # Apply the same month range rule as for textual input; a month
            # that cannot be compared with ints is not a valid month either.
            try:
                if month < 1 or month > 12:
                    return None
            except TypeError:
                return None
            return yearmonth(year=year, month=month)  # type: ignore

        if isinstance(cell, str):
            try:
                year_text, month_text = cell.split("-")
                year = int(year_text)
                month = int(month_text)
            except ValueError:
                # Wrong number of "-" separated parts or non-numeric parts
                return None
            if month < 1 or month > 12:
                return None
            return yearmonth(year, month)

        return None

    def write_value(self, cell: Any) -> Optional[str]:
        """Return "YEAR-MM" (month zero-padded), or ``None`` for ``None``."""
        if cell is None:
            return None
        return f"{cell.year}-{cell.month:02}"
a/frictionless/formats/jsonschema/mapper.py +++ b/frictionless/formats/jsonschema/mapper.py @@ -27,14 +27,16 @@ def read_schema(self, profile: Dict[str, Any]) -> Schema: # type: ignore # Field assert isinstance(name, str) assert isinstance(prop, dict) - field = Field.from_descriptor({"type": type, "name": name}) - schema.add_field(field) - + field_descriptor = {"type": type, "name": name} + # Description description = prop.get("description") # type: ignore if description: assert isinstance(description, str) - field.description = description + field_descriptor["description"] = description + + field = Field.from_descriptor(field_descriptor) + schema.add_field(field) # Required if name in required: diff --git a/frictionless/schema/field.py b/frictionless/schema/field.py index 5b9f8c3eb5..80c60958b5 100644 --- a/frictionless/schema/field.py +++ b/frictionless/schema/field.py @@ -1,14 +1,38 @@ from __future__ import annotations +import copy import decimal import re from functools import partial -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Dict, List, Optional, Pattern +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Dict, List, Optional, Pattern, Type import attrs +import pydantic +from pydantic import BaseModel from .. 
import errors, settings from ..exception import FrictionlessException +# from ..fields.boolean_descriptor import BooleanFieldDescriptor +# from ..fields.date_descriptor import DateFieldDescriptor +# from ..fields.integer_descriptor import IntegerFieldDescriptor +from ..fields.field_descriptor import ( + AnyFieldDescriptor, + ArrayFieldDescriptor, + BooleanFieldDescriptor, + DateFieldDescriptor, + DatetimeFieldDescriptor, + DurationFieldDescriptor, + FieldDescriptor, + GeoJSONFieldDescriptor, + GeoPointFieldDescriptor, + IntegerFieldDescriptor, + NumberFieldDescriptor, + ObjectFieldDescriptor, + StringFieldDescriptor, + TimeFieldDescriptor, + YearFieldDescriptor, + YearmonthFieldDescriptor, +) from ..metadata import Metadata from ..system import system @@ -17,11 +41,47 @@ from . import types from .schema import Schema +# Mapping from field type to its corresponding descriptor class +TYPE_TO_DESCRIPTOR: Dict[str, Type[BaseModel]] = { + "any": AnyFieldDescriptor, + "array": ArrayFieldDescriptor, + "boolean": BooleanFieldDescriptor, + "date": DateFieldDescriptor, + "datetime": DatetimeFieldDescriptor, + "duration": DurationFieldDescriptor, + "geojson": GeoJSONFieldDescriptor, + "geopoint": GeoPointFieldDescriptor, + "integer": IntegerFieldDescriptor, + "number": NumberFieldDescriptor, + "object": ObjectFieldDescriptor, + "string": StringFieldDescriptor, + "time": TimeFieldDescriptor, + "year": YearFieldDescriptor, + "yearmonth": YearmonthFieldDescriptor, +} + +# Descriptor integration (temporary, during Field refactor) +# Used at two points: +# - Sync (runtime): when a Field attribute changes, update the pydantic _descriptor so read_cell/write_cell use up-to-date parsing logic (e.g. format="email"). 
+# - Init (validation): when creating _descriptor, we pass a dict using Frictionless descriptor keys (camelCase aliases) +DESCRIPTOR_INIT_ALIASES: Dict[str, str] = { + "format": "format", + "decimal_char": "decimalChar", + "group_char": "groupChar", + "bare_number": "bareNumber", + "float_number": "floatNumber", + "true_values": "trueValues", + "false_values": "falseValues", +} + +DESCRIPTOR_SYNC_ATTRS: set[str] = { *DESCRIPTOR_INIT_ALIASES.keys() } @attrs.define(kw_only=True, repr=False) class Field(Metadata): """Field representation""" + _descriptor: Optional[ FieldDescriptor] = None + name: str """ A short url-usable (and preferably human-readable) name. @@ -50,9 +110,7 @@ class Field(Metadata): For example: "default","array" etc. """ - missing_values: List[str] = attrs.field( - factory=settings.DEFAULT_MISSING_VALUES.copy - ) + missing_values: List[str] = attrs.field(factory=settings.DEFAULT_MISSING_VALUES.copy) """ List of string values to be set as missing values in the field. If any of string in missing values is found in the field value then it is set as None. @@ -88,11 +146,83 @@ class Field(Metadata): List of supported constraints for a field. 
""" + # All optional fields for the field descriptor + decimal_char: Optional[str] = None + group_char: Optional[str] = None + bare_number: Optional[bool] = None + float_number: Optional[bool] = None + true_values: Optional[List[str]] = None + false_values: Optional[List[str]] = None + + def __attrs_post_init__(self): + self._init_descriptor_from_field() + def __setattr__(self, name: str, value: Any): # type: ignore if name == "type": note = 'Use "schema.set_field_type()" to update the type of the field' raise FrictionlessException(errors.FieldError(note=note)) - return super().__setattr__(name, value) # type: ignore + + result = super().__setattr__(name, value) # type: ignore + + self._sync_descriptor_property(name, value) + + return result + + def _sync_descriptor_property(self, name: str, value: Any) -> None: + """Keep the internal pydantic descriptor in sync with Field attribute assignments.""" + if name not in DESCRIPTOR_SYNC_ATTRS: + return + + if name == "format" and isinstance(value, str): + # Don't sync implicit default format into pydantic, so that it doesnt become "set" and get exported by "model_dump(exclude_unset=True)". 
def _init_descriptor_from_field(self) -> None:
    """Build ``self._descriptor`` from this Field's attributes if it is unset.

    The descriptor class is looked up in ``TYPE_TO_DESCRIPTOR`` by
    ``self.type`` and validated from a dict keyed by the camelCase aliases in
    ``DESCRIPTOR_INIT_ALIASES``. Nothing happens when a descriptor already
    exists, when the field has no type, or when the type has no mapped
    descriptor class.
    """
    # An existing descriptor is never rebuilt here
    if self._descriptor is not None:
        return

    # "type" may be missing or empty at the point this is called
    if not hasattr(self, "type") or not self.type:
        return

    descriptor_class = TYPE_TO_DESCRIPTOR.get(self.type)
    if not descriptor_class:
        return

    descriptor_dict: Dict[str, Any] = {
        "name": self.name,
        "type": self.type,
    }

    for attr, alias in DESCRIPTOR_INIT_ALIASES.items():
        if attr == "format":
            # "format" is only forwarded when _should_include_format() says so
            if self._should_include_format():
                descriptor_dict["format"] = self.format
            continue
        # Attributes that are absent or None are omitted from the init dict
        value = getattr(self, attr, None)
        if value is not None:
            descriptor_dict[alias] = value

    try:
        self._descriptor = descriptor_class.model_validate(descriptor_dict)  # type: ignore
    except pydantic.ValidationError:
        # NOTE(review): the validation error is discarded and the field is
        # left without a descriptor - confirm silent fallback is intended.
        self._descriptor = None
@classmethod
def metadata_import(
    cls,
    descriptor: IDescriptor,
    *,
    with_basepath: bool = False,
    **options: Any,
) -> "Field":
    """Import a field from a descriptor and attach its descriptor model.

    The import itself is delegated to the parent implementation. Afterwards,
    if ``TYPE_TO_DESCRIPTOR`` has a class for the imported field's type, a
    deep copy of the descriptor (taken before delegation) is validated with
    that class and stored on ``field._descriptor``.

    Args:
        descriptor: Field descriptor to import
        with_basepath: Forwarded unchanged to the parent import
        **options: Extra keyword options, forwarded unchanged to the parent import

    Returns:
        The imported field

    Raises:
        FrictionlessException: via ``handle_pydantic_error_for_import`` when
            the descriptor copy fails Pydantic validation
    """
    # Snapshot before delegating so the model is validated against the
    # descriptor exactly as received.
    # NOTE(review): presumably the parent import may modify `descriptor` in place - confirm
    descriptor_copy = copy.deepcopy(descriptor)

    # Forward **options too: the signature accepts them, so leaving them out
    # of this call would silently discard caller-supplied settings
    field = super().metadata_import(
        descriptor,
        with_basepath=with_basepath,
        **options,
    )

    # Get the descriptor class for this field type
    field_type = field.type
    DescriptorClass = TYPE_TO_DESCRIPTOR.get(field_type) if field_type else None

    if DescriptorClass:
        try:
            field._descriptor = DescriptorClass.model_validate(descriptor_copy)  # type: ignore
        except pydantic.ValidationError as ve:
            # Temporary: Handle Pydantic validation errors
            # TODO: Remove once Pydantic validation is properly integrated
            handle_pydantic_error_for_import(ve)

    return field
priority over + ## Field._descriptor properties + ## Merge descriptor_descr into base_descr to preserve base order + descr = {**base_descr, **descriptor_descr} + return descr + else: + return super().to_descriptor(validate=validate) + @classmethod def metadata_validate(cls, descriptor: IDescriptor): # type: ignore metadata_errors = list(super().metadata_validate(descriptor)) @@ -260,25 +440,31 @@ def metadata_validate(cls, descriptor: IDescriptor): # type: ignore # Examples example = descriptor.get("example") if example: + # Validate descriptor with Pydantic before continuing + # This catches Pydantic validation errors (e.g., invalid example values) type = descriptor.get("type") - Class = system.select_field_class(type) - - field = Class( - name=descriptor.get("name"), # type: ignore - format=descriptor.get("format", "default"), - ) - - if type == "boolean": - # 'example' value must be compared to customized 'trueValues' and 'falseValues' - if "trueValues" in descriptor.keys(): - field.true_values = descriptor["trueValues"] - if "falseValues" in descriptor.keys(): - field.false_values = descriptor["falseValues"] + DescriptorClass = TYPE_TO_DESCRIPTOR.get(type) if type else None + if DescriptorClass: + try: + DescriptorClass.model_validate(descriptor) + except pydantic.ValidationError as ve: + # Temporary: Handle Pydantic validation errors + # TODO: Remove once Pydantic validation is properly integrated + field_errors = handle_pydantic_error_for_validate(ve) + for field_error in field_errors: + yield field_error + return + + # Use metadata_select_class + metadata_import directly (without validation) to avoid recursion + # This properly initializes the field with all properties including + # type-specific ones like trueValues/falseValues for boolean + # We need to pass a copy of the descriptor to avoid modifying the original + Class = Field.metadata_select_class(type) + descriptor_copy = copy.deepcopy(descriptor) + field = Class.metadata_import(descriptor_copy) _, 
def parse_pydantic_errors(ve: pydantic.ValidationError) -> List[str]:
    """Parse Pydantic validation errors into clean error messages.

    This is a temporary function to handle Pydantic ValidationError objects
    and convert them to clean error messages by removing the Pydantic-specific
    "Value error, " prefix.

    Args:
        ve: A Pydantic ValidationError

    Returns:
        One message per reported error that carries a "msg" entry, in the
        order reported, with a leading "Value error, " prefix removed.
        Errors without a "msg" entry are skipped.
    """
    prefix = "Value error, "
    error_messages: List[str] = []
    for err in ve.errors():
        if "msg" not in err:
            continue
        note: str = str(err["msg"])
        # Strip the prefix only where it leads the message; a blanket
        # `str.replace` would also delete the same text inside the message body
        if note.startswith(prefix):
            note = note[len(prefix):]
        error_messages.append(note)
    return error_messages
def handle_pydantic_error_for_validate(ve: pydantic.ValidationError) -> List[errors.FieldError]:
    """Translate a Pydantic ValidationError into Frictionless field errors.

    Temporary helper for the metadata_validate context: each message
    produced by ``parse_pydantic_errors`` becomes the note of one FieldError.

    Args:
        ve: A Pydantic ValidationError

    Returns:
        A list of FieldError objects, one per parsed message, in the same order
    """
    field_errors: List[errors.FieldError] = []
    for message in parse_pydantic_errors(ve):
        field_errors.append(errors.FieldError(note=message))
    return field_errors