"""Set-based format whose valid values are read from one column of a CSV file."""

from pathlib import Path

from frformat import set_format
from frformat.versioned_set import VersionedSet
from frformat.versions import Millesime

name = "Nom de validateur"
description = "Vérification si la valeur donnée est bien valide selon des valeurs valides récupérées"
source = "source1"

# Resolve the data file from this module's location rather than from the
# current working directory: the values are loaded at import time, so a
# CWD-relative path makes `import frformat` raise ValueError whenever the
# interpreter is not started from the repository root.
# Layout: src/frformat/formats/validator.py -> parents[2] is `src/`.
# NOTE(review): the data lives in the test fixtures directory, which is
# presumably not shipped with the installed package - confirm and relocate.
_VALUES_CSV = (
    Path(__file__).resolve().parents[2] / "tests" / "test_files_data" / "values.csv"
)

validator_versioned_data = VersionedSet[Millesime]()

# Valid values for the 2023 vintage: the "Username" column of the CSV file.
valid_values_2023 = set_format.SingleSetFormat.get_values_from_csv(
    str(_VALUES_CSV), "Username"
)
validator_versioned_data.add_version(Millesime.M2023, valid_values_2023)


Validator = set_format.new(
    "Validator", name, description, source, validator_versioned_data
)
class RemoteReader:
    """Fetch a resource through ``urllib`` and expose it as an in-memory text stream."""

    # Seconds to wait for the server before giving up. Without a timeout a
    # stalled connection would block the caller indefinitely.
    timeout: float = 30.0

    def read_file(self, path: str) -> io.TextIOBase:
        """Download ``path`` and return its content as a UTF-8 decoded text stream.

        Args:
            path: URL handed to ``urllib.request.urlopen``.

        Returns:
            An ``io.StringIO`` holding the whole decoded payload.

        Raises:
            Whatever ``urlopen`` raises on failure, and ``UnicodeDecodeError``
            when the payload is not valid UTF-8.
        """
        # The context manager closes the response (and its socket) even when
        # reading fails; the payload is fully buffered before it is released.
        with urllib.request.urlopen(path, timeout=self.timeout) as response:
            payload = response.read()
        return io.StringIO(payload.decode("utf-8"))


class LocalReader:
    """Open a file from the local filesystem as a text stream."""

    def read_file(self, path: str) -> io.TextIOBase:
        """Open ``path`` for reading as UTF-8 text.

        The handle is returned open; the caller is responsible for closing it
        (for instance with a ``with`` statement).

        Args:
            path: Filesystem path of the file to open.

        Returns:
            The opened text file object.
        """
        # newline="" leaves line endings untouched, as the csv module
        # recommends for files it is going to parse.
        return open(path, newline="", encoding="utf-8")
@classmethod
def get_values_from_csv(cls, path: str, column: str) -> frozenset[str]:
    """Collect the values of one column of a CSV file, local or remote.

    Supported sources:
        - Paths of existing local files, read with ``cls.local_reader``.
        - URIs using the 'http', 'https' or 'file' scheme, read with
          ``cls.remote_reader``.

    On success the collected set is also stored in ``cls._valid_values``.
    NOTE(review): this write targets whichever class the method is called
    on - confirm that callers really want this side effect.

    Args:
        path: The path or URL of the CSV file.
        column: The header name of the column to extract.

    Raises:
        ValueError: If ``path`` uses none of the supported schemes and is
            not an existing local file.

    Returns:
        A frozenset of the values found in the column. An empty frozenset
        is returned, after logging an error, when the path cannot be
        parsed, the file cannot be read, the column is absent from the
        header, or the content is not valid CSV.
    """
    try:
        parsed_uri = urllib.parse.urlparse(path)
    except ValueError:
        logging.error(
            f"An error is occured while parsing url using this path: {path}"
        )
        return frozenset()

    is_valid_scheme = parsed_uri.scheme in ("http", "https", "file")

    if not is_valid_scheme and not os.path.isfile(path):
        raise ValueError(
            f"Invalid path: {path}.The URI must use one of the following schemes: http, https, or file or it must be existing csv file."
        )

    file_reader = cls.remote_reader if is_valid_scheme else cls.local_reader
    try:
        csvfile = file_reader.read_file(path)
    except Exception as e:
        # Deliberate best effort: any reader failure degrades to "no values".
        logging.error(f"Error while reading the file at {path}: {e}")
        return frozenset()

    with csvfile:
        try:
            reader = csv.DictReader(csvfile)
            # Check the header once instead of every row, so a file without
            # data rows (or without any header) is reported as well.
            header = reader.fieldnames
            if header is None or column not in header:
                logging.error(f"CSV file is missing the {column} column.")
                return frozenset()
            # Rows shorter than the header are padded with None by
            # DictReader; skip those so the result only contains strings.
            values = frozenset(
                row[column] for row in reader if row[column] is not None
            )
        except (csv.Error, ValueError):
            # csv.Error covers malformed CSV; ValueError covers decoding
            # errors raised while the stream is being consumed.
            logging.error(
                f"The file associated to this path: {path} is not well csv formatted"
            )
            return frozenset()

    cls._valid_values = values
    return values
def test_validator():
    """A username listed in values.csv is accepted by the Validator format."""
    isvalid = Validator(Millesime.M2023).is_valid("booker12")
    assert isvalid is True


class FakeFileReader(IFileReader):
    """In-memory reader: serves the same CSV text whatever path is requested."""

    def __init__(self, data: str):
        self._data = data

    def read_file(self, path: str) -> TextIOBase:
        return StringIO(self._data)


def test_get_values_from_local_file(monkeypatch):
    """The classmethod reads an existing local path through `local_reader`."""
    csv_data: str = (
        "Username,Email\nbooker1,booker12@example.com\ngrey7,grey07@example.com"
    )

    # Patch through monkeypatch so the class-level readers are restored after
    # the test instead of leaking the fake into every later test.
    monkeypatch.setattr(SingleSetFormat, "local_reader", FakeFileReader(csv_data))
    monkeypatch.setattr(SingleSetFormat, "remote_reader", RemoteReader())
    # get_values_from_csv also overwrites SingleSetFormat._valid_values;
    # registering the attribute here makes monkeypatch undo that write too.
    monkeypatch.setattr(
        SingleSetFormat,
        "_valid_values",
        getattr(SingleSetFormat, "_valid_values", frozenset()),
        raising=False,
    )

    valid_values = SingleSetFormat.get_values_from_csv(
        "src/tests/test_files_data/values.csv", "Username"
    )

    assert valid_values == frozenset({"booker1", "grey7"})

    # A column absent from the header yields an empty set.
    values = SingleSetFormat.get_values_from_csv(
        "src/tests/test_files_data/values.csv", "Link"
    )
    assert values == frozenset({})


def test_get_values_from_not_well_formatted_local_file():
    """Garbled content with no matching column yields an empty set."""
    csv_data: str = "xff�Name,Age\nJohn,30"

    local_reader = FakeFileReader(csv_data)

    remote_reader = RemoteReader()

    values = get_values_from_csv(
        "src/tests/test_files_data/values.csv",
        "coucou",
        remote_reader,
        local_reader,
    )
    assert values == frozenset({})


def test_invalid_path_file():
    """A path that is neither a supported URI nor an existing file raises."""
    remote_reader = RemoteReader()
    local_reader = LocalReader()

    with pytest.raises(
        ValueError,
        match="Invalid path: src/tests/test_files_data/non_existed_file.csv.The URI must use one of the following schemes: http, https, or file or it must be existing csv file.",
    ):
        get_values_from_csv(
            "src/tests/test_files_data/non_existed_file.csv",
            "DEP",
            remote_reader,
            local_reader,
        )


def test_get_values_from_remote_csv():
    """http(s) and file URIs are all served by the remote reader."""
    csv_data: str = "Age,RegionCode\n23,7653\n22,5498"

    remote_reader = FakeFileReader(csv_data)

    local_reader = LocalReader()

    valid_values = get_values_from_csv(
        "https://some.fake.url/values.csv",
        "Age",
        remote_reader,
        local_reader,
    )

    assert valid_values == frozenset({"23", "22"})

    # Unknown column: empty result rather than an exception.
    valid_values = get_values_from_csv(
        "https://some.fake.url/values.csv",
        "Name",
        remote_reader,
        local_reader,
    )

    assert valid_values == frozenset({})

    valid_values = get_values_from_csv(
        "file:///fakedata/values.csv",
        "RegionCode",
        remote_reader,
        local_reader,
    )
    assert valid_values == frozenset({"7653", "5498"})