From 3c02e10758d10c7932801b1487d220bf1a66aa32 Mon Sep 17 00:00:00 2001
From: Wauplin <11801849+Wauplin@users.noreply.github.com>
Date: Thu, 9 Apr 2026 03:55:01 +0000
Subject: [PATCH] Update inference types (automated commit)

---
 src/huggingface_hub/inference/_client.py      | 239 ++++++++++++------
 .../inference/_generated/_async_client.py     | 239 ++++++++++++------
 .../_generated/types/audio_classification.py  |   4 +-
 .../types/automatic_speech_recognition.py     |  42 +--
 .../_generated/types/depth_estimation.py      |   4 +-
 .../types/document_question_answering.py      |  20 +-
 .../_generated/types/feature_extraction.py    |  10 +-
 .../inference/_generated/types/fill_mask.py   |  10 +-
 .../_generated/types/image_classification.py  |   4 +-
 .../_generated/types/image_segmentation.py    |  10 +-
 .../_generated/types/image_text_to_image.py   |  18 +-
 .../_generated/types/image_text_to_video.py   |  20 +-
 .../_generated/types/image_to_image.py        |  14 +-
 .../_generated/types/image_to_text.py         |  42 +--
 .../_generated/types/image_to_video.py        |  18 +-
 .../_generated/types/object_detection.py      |   6 +-
 .../_generated/types/question_answering.py    |  18 +-
 .../_generated/types/sentence_similarity.py   |   4 +-
 .../_generated/types/summarization.py         |   6 +-
 .../types/table_question_answering.py         |   8 +-
 .../_generated/types/text_classification.py   |   4 +-
 .../_generated/types/text_generation.py       |  64 ++---
 .../_generated/types/text_to_audio.py         |  38 +--
 .../_generated/types/text_to_image.py         |  18 +-
 .../_generated/types/text_to_speech.py        |  40 +--
 .../_generated/types/text_to_video.py         |  14 +-
 .../_generated/types/token_classification.py  |  10 +-
 .../inference/_generated/types/translation.py |  10 +-
 .../_generated/types/video_classification.py  |   8 +-
 .../types/visual_question_answering.py        |   8 +-
 .../types/zero_shot_classification.py         |   6 +-
 .../types/zero_shot_image_classification.py   |   4 +-
 .../types/zero_shot_object_detection.py       |   1 +
 33 files changed, 562 insertions(+), 399 deletions(-)

diff --git a/src/huggingface_hub/inference/_client.py b/src/huggingface_hub/inference/_client.py
index a3f1c07706..06ad740d56 100644
--- a/src/huggingface_hub/inference/_client.py
+++ b/src/huggingface_hub/inference/_client.py
@@ -300,7 +300,7 @@ def audio_classification(
         audio: ContentT,
         *,
         model: str | None = None,
-        top_k: int | None = None,
+        top_k: Optional[int] = None,
         function_to_apply: Optional["AudioClassificationOutputTransform"] = None,
     ) -> list[AudioClassificationOutputElement]:
         """
@@ -939,14 +939,14 @@ def document_question_answering(
         question: str,
         *,
         model: str | None = None,
-        doc_stride: int | None = None,
-        handle_impossible_answer: bool | None = None,
-        lang: str | None = None,
-        max_answer_len: int | None = None,
-        max_question_len: int | None = None,
-        max_seq_len: int | None = None,
-        top_k: int | None = None,
-        word_boxes: list[list[float] | str] | None = None,
+        doc_stride: Optional[int] = None,
+        handle_impossible_answer: Optional[bool] = None,
+        lang: Optional[str] = None,
+        max_answer_len: Optional[int] = None,
+        max_question_len: Optional[int] = None,
+        max_seq_len: Optional[int] = None,
+        top_k: Optional[int] = None,
+        word_boxes: Optional[list[Union[list[float], str]]] = None,
     ) -> list[DocumentQuestionAnsweringOutputElement]:
         """
         Answer questions on document images.
@@ -980,6 +980,16 @@ def document_question_answering(
             word_boxes (`list[Union[list[float], str`, *optional*):
                 A list of words and bounding boxes (normalized 0->1000). If provided, the inference will skip the OCR
                 step and use the provided bounding boxes instead.
+            handle_impossible_answer (`bool`, *optional*):
+                Whether to accept impossible as an answer
+            max_answer_len (`int`, *optional*):
+                The maximum length of predicted answers (e.g., only answers with a shorter length are considered).
+            max_seq_len (`int`, *optional*):
+                The maximum length of the total sentence (context + question) in tokens of each chunk passed to the
+                model. The context will be split in several chunks (using doc_stride as overlap) if needed.
+            word_boxes (`list[Union[list[float], str]]`, *optional*):
+                A list of words and bounding boxes (normalized 0->1000). If provided, the inference will skip the OCR
+                step and use the provided bounding boxes instead.
         Returns:
             `list[DocumentQuestionAnsweringOutputElement]`: a list of [`DocumentQuestionAnsweringOutputElement`] items containing the predicted label, associated probability, word ids, and page number.
 
@@ -1108,8 +1118,8 @@ def fill_mask(
         text: str,
         *,
         model: str | None = None,
-        targets: list[str] | None = None,
-        top_k: int | None = None,
+        targets: Optional[list[str]] = None,
+        top_k: Optional[int] = None,
     ) -> list[FillMaskOutputElement]:
         """
         Fill in a hole with a missing word (token to be precise).
@@ -1120,12 +1130,14 @@ def fill_mask(
             model (`str`, *optional*):
                 The model to use for the fill mask task. Can be a model ID hosted on the Hugging Face Hub or a URL to
                 a deployed Inference Endpoint. If not provided, the default recommended fill mask model will be used.
-            targets (`list[str`, *optional*):
+            targets (`list[str]`, *optional*):
                 When passed, the model will limit the scores to the passed targets instead of looking up in the whole
                 vocabulary. If the provided targets are not in the model vocab, they will be tokenized and the first
                 resulting token will be used (with a warning, and that might be slower).
             top_k (`int`, *optional*):
                 When passed, overrides the number of predictions to return.
+            top_k (`int`, *optional*):
+                When passed, overrides the number of predictions to return.
         Returns:
             `list[FillMaskOutputElement]`: a list of [`FillMaskOutputElement`] items containing the predicted label, associated
             probability, token reference, and completed text.
@@ -1165,7 +1177,7 @@ def image_classification(
         *,
         model: str | None = None,
         function_to_apply: Optional["ImageClassificationOutputTransform"] = None,
-        top_k: int | None = None,
+        top_k: Optional[int] = None,
     ) -> list[ImageClassificationOutputElement]:
         """
         Perform image classification on the given image using the specified model.
@@ -1214,10 +1226,10 @@ def image_segmentation(
         image: ContentT,
         *,
         model: str | None = None,
-        mask_threshold: float | None = None,
-        overlap_mask_area_threshold: float | None = None,
+        mask_threshold: Optional[float] = None,
+        overlap_mask_area_threshold: Optional[float] = None,
         subtask: Optional["ImageSegmentationSubtask"] = None,
-        threshold: float | None = None,
+        threshold: Optional[float] = None,
     ) -> list[ImageSegmentationOutputElement]:
         """
         Perform image segmentation on the given image using the specified model.
@@ -1239,6 +1251,8 @@ def image_segmentation(
                 Segmentation task to be performed, depending on model capabilities.
             threshold (`float`, *optional*):
                 Probability threshold to filter out predicted masks.
+            overlap_mask_area_threshold (`float`, *optional*):
+                Mask overlap threshold to eliminate small, disconnected segments.
         Returns:
             `list[ImageSegmentationOutputElement]`: A list of [`ImageSegmentationOutputElement`] items containing the segmented masks and associated attributes.
 
@@ -1282,11 +1296,11 @@ def image_to_image(
         image: ContentT,
         prompt: str | None = None,
         *,
-        negative_prompt: str | None = None,
-        num_inference_steps: int | None = None,
-        guidance_scale: float | None = None,
+        negative_prompt: Optional[str] = None,
+        num_inference_steps: Optional[int] = None,
+        guidance_scale: Optional[float] = None,
         model: str | None = None,
-        target_size: ImageToImageTargetSize | None = None,
+        target_size: Optional[ImageToImageTargetSize] = None,
         **kwargs,
     ) -> "Image":
         """
@@ -1314,6 +1328,9 @@ def image_to_image(
             target_size (`ImageToImageTargetSize`, *optional*):
                 The size in pixels of the output image. This parameter is only supported by some providers and for
                 specific models. It will be ignored when unsupported.
+            num_inference_steps (`int`, *optional*):
+                For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher
+                quality image at the expense of slower inference.
 
         Returns:
             `Image`: The translated image.
@@ -1359,12 +1376,12 @@ def image_to_video(
         *,
         model: str | None = None,
         prompt: str | None = None,
-        negative_prompt: str | None = None,
-        num_frames: float | None = None,
-        num_inference_steps: int | None = None,
-        guidance_scale: float | None = None,
-        seed: int | None = None,
-        target_size: ImageToVideoTargetSize | None = None,
+        negative_prompt: Optional[str] = None,
+        num_frames: Optional[float] = None,
+        num_inference_steps: Optional[int] = None,
+        guidance_scale: Optional[float] = None,
+        seed: Optional[int] = None,
+        target_size: Optional[ImageToVideoTargetSize] = None,
         **kwargs,
     ) -> bytes:
         """
@@ -1383,13 +1400,13 @@ def image_to_video(
             num_frames (`float`, *optional*):
                 The num_frames parameter determines how many video frames are generated.
             num_inference_steps (`int`, *optional*):
-                For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher
-                quality image at the expense of slower inference.
+                The number of denoising steps. More denoising steps usually lead to a higher quality video at the
+                expense of slower inference.
             guidance_scale (`float`, *optional*):
                 For diffusion models. A higher guidance scale value encourages the model to generate videos closely
                 linked to the text prompt at the expense of lower image quality.
             seed (`int`, *optional*):
-                The seed to use for the video generation.
+                Seed for the random number generator.
             target_size (`ImageToVideoTargetSize`, *optional*):
                 The size in pixel of the output video frames.
             num_inference_steps (`int`, *optional*):
@@ -1397,6 +1414,13 @@ def image_to_video(
                 expense of slower inference.
             seed (`int`, *optional*):
                 Seed for the random number generator.
+            guidance_scale (`float`, *optional*):
+                For diffusion models. A higher guidance scale value encourages the model to generate videos closely
+                linked to the text prompt at the expense of lower image quality.
+            num_frames (`float`, *optional*):
+                The num_frames parameter determines how many video frames are generated.
+            target_size (`ImageToVideoTargetSize`, *optional*):
+                The size in pixel of the output video frames.
 
         Returns:
             `bytes`: The generated video.
@@ -1479,7 +1503,7 @@ def image_to_text(self, image: ContentT, *, model: str | None = None) -> ImageTo
         return output_list[0]
 
     def object_detection(
-        self, image: ContentT, *, model: str | None = None, threshold: float | None = None
+        self, image: ContentT, *, model: str | None = None, threshold: Optional[float] = None
     ) -> list[ObjectDetectionOutputElement]:
         """
         Perform object detection on the given image using the specified model.
@@ -1532,13 +1556,13 @@ def question_answering(
         context: str,
         *,
         model: str | None = None,
-        align_to_words: bool | None = None,
-        doc_stride: int | None = None,
-        handle_impossible_answer: bool | None = None,
-        max_answer_len: int | None = None,
-        max_question_len: int | None = None,
-        max_seq_len: int | None = None,
-        top_k: int | None = None,
+        align_to_words: Optional[bool] = None,
+        doc_stride: Optional[int] = None,
+        handle_impossible_answer: Optional[bool] = None,
+        max_answer_len: Optional[int] = None,
+        max_question_len: Optional[int] = None,
+        max_seq_len: Optional[int] = None,
+        top_k: Optional[int] = None,
     ) -> QuestionAnsweringOutputElement | list[QuestionAnsweringOutputElement]:
         """
         Retrieve the answer to a question from a given text.
@@ -1569,6 +1593,14 @@ def question_answering(
             top_k (`int`, *optional*):
                 The number of answers to return (will be chosen by order of likelihood). Note that we return less than
                 topk answers if there are not enough options available within the context.
+            doc_stride (`int`, *optional*):
+                If the context is too long to fit with the question for the model, it will be split in several chunks
+                with some overlap. This argument controls the size of that overlap.
+            max_answer_len (`int`, *optional*):
+                The maximum length of predicted answers (e.g., only answers with a shorter length are considered).
+            max_seq_len (`int`, *optional*):
+                The maximum length of the total sentence (context + question) in tokens of each chunk passed to the
+                model. The context will be split in several chunks (using doc_stride as overlap) if needed.
 
         Returns:
             Union[`QuestionAnsweringOutputElement`, list[`QuestionAnsweringOutputElement`]]:
@@ -1668,8 +1700,8 @@ def summarization(
         text: str,
         *,
         model: str | None = None,
-        clean_up_tokenization_spaces: bool | None = None,
-        generate_parameters: dict[str, Any] | None = None,
+        clean_up_tokenization_spaces: Optional[bool] = None,
+        generate_parameters: Optional[dict[str, Any]] = None,
         truncation: Optional["SummarizationTruncationStrategy"] = None,
     ) -> SummarizationOutput:
         """
@@ -1687,6 +1719,8 @@ def summarization(
                 Additional parametrization of the text generation algorithm.
             truncation (`"SummarizationTruncationStrategy"`, *optional*):
                 The truncation strategy to use.
+            generate_parameters (`dict[str, Any]`, *optional*):
+                Additional parametrization of the text generation algorithm.
         Returns:
             [`SummarizationOutput`]: The generated summary text.
 
@@ -1728,8 +1762,8 @@ def table_question_answering(
         *,
         model: str | None = None,
         padding: Optional["Padding"] = None,
-        sequential: bool | None = None,
-        truncation: bool | None = None,
+        sequential: Optional[bool] = None,
+        truncation: Optional[bool] = None,
     ) -> TableQuestionAnsweringOutputElement:
         """
         Retrieve the answer to a question from information given in a table.
@@ -1751,6 +1785,8 @@ def table_question_answering(
                 nature.
             truncation (`bool`, *optional*):
                 Activates and controls truncation.
+            truncation (`bool`, *optional*):
+                Activates and controls truncation.
 
         Returns:
             [`TableQuestionAnsweringOutputElement`]: a table question answering output containing the answer, coordinates, cells and the aggregator used.
@@ -1893,7 +1929,7 @@ def text_classification(
         text: str,
         *,
         model: str | None = None,
-        top_k: int | None = None,
+        top_k: Optional[int] = None,
         function_to_apply: Optional["TextClassificationOutputTransform"] = None,
     ) -> list[TextClassificationOutputElement]:
         """
@@ -2439,14 +2475,14 @@ def text_to_image(
         self,
         prompt: str,
         *,
-        negative_prompt: str | None = None,
-        height: int | None = None,
-        width: int | None = None,
-        num_inference_steps: int | None = None,
-        guidance_scale: float | None = None,
+        negative_prompt: Optional[str] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: Optional[int] = None,
+        guidance_scale: Optional[float] = None,
         model: str | None = None,
-        scheduler: str | None = None,
-        seed: int | None = None,
+        scheduler: Optional[str] = None,
+        seed: Optional[int] = None,
         extra_body: dict[str, Any] | None = None,
     ) -> "Image":
         """
@@ -2484,6 +2520,13 @@ def text_to_image(
             extra_body (`dict[str, Any]`, *optional*):
                 Additional provider-specific parameters to pass to the model. Refer to the provider's documentation
                 for supported parameters.
+            height (`int`, *optional*):
+                The height in pixels of the output image
+            num_inference_steps (`int`, *optional*):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            seed (`int`, *optional*):
+                Seed for the random number generator.
 
         Returns:
             `Image`: The generated image.
@@ -2580,11 +2623,11 @@ def text_to_video(
         prompt: str,
         *,
         model: str | None = None,
-        guidance_scale: float | None = None,
-        negative_prompt: list[str] | None = None,
-        num_frames: float | None = None,
-        num_inference_steps: int | None = None,
-        seed: int | None = None,
+        guidance_scale: Optional[float] = None,
+        negative_prompt: Optional[list[str]] = None,
+        num_frames: Optional[float] = None,
+        num_inference_steps: Optional[int] = None,
+        seed: Optional[int] = None,
         extra_body: dict[str, Any] | None = None,
     ) -> bytes:
         """
@@ -2615,6 +2658,11 @@ def text_to_video(
             extra_body (`dict[str, Any]`, *optional*):
                 Additional provider-specific parameters to pass to the model. Refer to the provider's documentation
                 for supported parameters.
+            negative_prompt (`list[str]`, *optional*):
+                One or several prompts to guide what NOT to include in video generation.
+            num_inference_steps (`int`, *optional*):
+                The number of denoising steps. More denoising steps usually lead to a higher quality video at the
+                expense of slower inference.
 
         Returns:
             `bytes`: The generated video.
@@ -2677,22 +2725,22 @@ def text_to_speech(
         text: str,
         *,
         model: str | None = None,
-        do_sample: bool | None = None,
-        early_stopping: Union[bool, "TextToSpeechEarlyStoppingEnum"] | None = None,
-        epsilon_cutoff: float | None = None,
-        eta_cutoff: float | None = None,
-        max_length: int | None = None,
-        max_new_tokens: int | None = None,
-        min_length: int | None = None,
-        min_new_tokens: int | None = None,
-        num_beam_groups: int | None = None,
-        num_beams: int | None = None,
-        penalty_alpha: float | None = None,
-        temperature: float | None = None,
-        top_k: int | None = None,
-        top_p: float | None = None,
-        typical_p: float | None = None,
-        use_cache: bool | None = None,
+        do_sample: Optional[bool] = None,
+        early_stopping: Optional[Union[bool, "TextToSpeechEarlyStoppingEnum"]] = None,
+        epsilon_cutoff: Optional[float] = None,
+        eta_cutoff: Optional[float] = None,
+        max_length: Optional[int] = None,
+        max_new_tokens: Optional[int] = None,
+        min_length: Optional[int] = None,
+        min_new_tokens: Optional[int] = None,
+        num_beam_groups: Optional[int] = None,
+        num_beams: Optional[int] = None,
+        penalty_alpha: Optional[float] = None,
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+        typical_p: Optional[float] = None,
+        use_cache: Optional[bool] = None,
         extra_body: dict[str, Any] | None = None,
     ) -> bytes:
         """
@@ -2757,6 +2805,28 @@ def text_to_speech(
             extra_body (`dict[str, Any]`, *optional*):
                 Additional provider-specific parameters to pass to the model. Refer to the provider's documentation
                 for supported parameters.
+            early_stopping (`Union[bool, "TextToSpeechEarlyStoppingEnum"]`, *optional*):
+                Controls the stopping condition for beam-based methods.
+            eta_cutoff (`float`, *optional*):
+                Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly
+                between 0 and 1, a token is only considered if it is greater than either eta_cutoff or sqrt(eta_cutoff)
+                * exp(-entropy(softmax(next_token_logits))). The latter term is intuitively the expected next token
+                probability, scaled by sqrt(eta_cutoff). In the paper, suggested values range from 3e-4 to 2e-3,
+                depending on the size of the model. See [Truncation Sampling as Language Model
+                Desmoothing](https://hf.co/papers/2210.15191) for more details.
+            max_new_tokens (`int`, *optional*):
+                The maximum number of tokens to generate. Takes precedence over max_length.
+            min_new_tokens (`int`, *optional*):
+                The minimum number of tokens to generate. Takes precedence over min_length.
+            num_beams (`int`, *optional*):
+                Number of beams to use for beam search.
+            temperature (`float`, *optional*):
+                The value used to modulate the next token probabilities.
+            top_p (`float`, *optional*):
+                If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to
+                top_p or higher are kept for generation.
+            use_cache (`bool`, *optional*):
+                Whether the model should use the past last key/values attentions to speed up decoding
         Returns:
             `bytes`: The generated audio.
 
@@ -2886,8 +2956,8 @@ def token_classification(
         *,
         model: str | None = None,
         aggregation_strategy: Optional["TokenClassificationAggregationStrategy"] = None,
-        ignore_labels: list[str] | None = None,
-        stride: int | None = None,
+        ignore_labels: Optional[list[str]] = None,
+        stride: Optional[int] = None,
     ) -> list[TokenClassificationOutputElement]:
         """
         Perform token classification on the given text.
@@ -2902,10 +2972,12 @@ def token_classification(
                 Defaults to None.
             aggregation_strategy (`"TokenClassificationAggregationStrategy"`, *optional*):
                 The strategy used to fuse tokens based on model predictions
-            ignore_labels (`list[str`, *optional*):
+            ignore_labels (`list[str]`, *optional*):
                 A list of labels to ignore
             stride (`int`, *optional*):
                 The number of overlapping tokens between chunks when splitting the input text.
+            stride (`int`, *optional*):
+                The number of overlapping tokens between chunks when splitting the input text.
 
         Returns:
             `list[TokenClassificationOutputElement]`: List of [`TokenClassificationOutputElement`] items containing the entity group, confidence score, word, start and end index.
@@ -2960,11 +3032,11 @@ def translation(
         text: str,
         *,
         model: str | None = None,
-        src_lang: str | None = None,
-        tgt_lang: str | None = None,
-        clean_up_tokenization_spaces: bool | None = None,
+        src_lang: Optional[str] = None,
+        tgt_lang: Optional[str] = None,
+        clean_up_tokenization_spaces: Optional[bool] = None,
         truncation: Optional["TranslationTruncationStrategy"] = None,
-        generate_parameters: dict[str, Any] | None = None,
+        generate_parameters: Optional[dict[str, Any]] = None,
     ) -> TranslationOutput:
         """
         Convert text from one language to another.
@@ -2991,6 +3063,8 @@ def translation(
                 The truncation strategy to use.
             generate_parameters (`dict[str, Any]`, *optional*):
                 Additional parametrization of the text generation algorithm.
+            tgt_lang (`str`, *optional*):
+                Target language to translate to. Required for models that can translate to multiple languages.
 
         Returns:
             [`TranslationOutput`]: The generated translated text.
@@ -3050,7 +3124,7 @@ def visual_question_answering(
         question: str,
         *,
         model: str | None = None,
-        top_k: int | None = None,
+        top_k: Optional[int] = None,
     ) -> list[VisualQuestionAnsweringOutputElement]:
         """
         Answering open-ended questions based on an image.
@@ -3108,8 +3182,8 @@ def zero_shot_classification(
         text: str,
         candidate_labels: list[str],
         *,
-        multi_label: bool | None = False,
-        hypothesis_template: str | None = None,
+        multi_label: Optional[bool] = False,
+        hypothesis_template: Optional[str] = None,
         model: str | None = None,
     ) -> list[ZeroShotClassificationOutputElement]:
         """
@@ -3132,6 +3206,9 @@ def zero_shot_classification(
             model (`str`, *optional*):
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
                 Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot classification model will be used.
+            hypothesis_template (`str`, *optional*):
+                The sentence used in conjunction with `candidate_labels` to attempt the text classification by
+                replacing the placeholder with the candidate labels.
 
 
         Returns:
@@ -3212,7 +3289,7 @@ def zero_shot_image_classification(
         candidate_labels: list[str],
         *,
         model: str | None = None,
-        hypothesis_template: str | None = None,
+        hypothesis_template: Optional[str] = None,
         # deprecated argument
         labels: list[str] = None,  # type: ignore
     ) -> list[ZeroShotImageClassificationOutputElement]:
diff --git a/src/huggingface_hub/inference/_generated/_async_client.py b/src/huggingface_hub/inference/_generated/_async_client.py
index 7d7e476139..071a28b1d5 100644
--- a/src/huggingface_hub/inference/_generated/_async_client.py
+++ b/src/huggingface_hub/inference/_generated/_async_client.py
@@ -320,7 +320,7 @@ async def audio_classification(
         audio: ContentT,
         *,
         model: str | None = None,
-        top_k: int | None = None,
+        top_k: Optional[int] = None,
         function_to_apply: Optional["AudioClassificationOutputTransform"] = None,
     ) -> list[AudioClassificationOutputElement]:
         """
@@ -968,14 +968,14 @@ async def document_question_answering(
         question: str,
         *,
         model: str | None = None,
-        doc_stride: int | None = None,
-        handle_impossible_answer: bool | None = None,
-        lang: str | None = None,
-        max_answer_len: int | None = None,
-        max_question_len: int | None = None,
-        max_seq_len: int | None = None,
-        top_k: int | None = None,
-        word_boxes: list[list[float] | str] | None = None,
+        doc_stride: Optional[int] = None,
+        handle_impossible_answer: Optional[bool] = None,
+        lang: Optional[str] = None,
+        max_answer_len: Optional[int] = None,
+        max_question_len: Optional[int] = None,
+        max_seq_len: Optional[int] = None,
+        top_k: Optional[int] = None,
+        word_boxes: Optional[list[Union[list[float], str]]] = None,
     ) -> list[DocumentQuestionAnsweringOutputElement]:
         """
         Answer questions on document images.
@@ -1009,6 +1009,16 @@ async def document_question_answering(
             word_boxes (`list[Union[list[float], str`, *optional*):
                 A list of words and bounding boxes (normalized 0->1000). If provided, the inference will skip the OCR
                 step and use the provided bounding boxes instead.
+            handle_impossible_answer (`bool`, *optional*):
+                Whether to accept impossible as an answer
+            max_answer_len (`int`, *optional*):
+                The maximum length of predicted answers (e.g., only answers with a shorter length are considered).
+            max_seq_len (`int`, *optional*):
+                The maximum length of the total sentence (context + question) in tokens of each chunk passed to the
+                model. The context will be split in several chunks (using doc_stride as overlap) if needed.
+            word_boxes (`list[Union[list[float], str]]`, *optional*):
+                A list of words and bounding boxes (normalized 0->1000). If provided, the inference will skip the OCR
+                step and use the provided bounding boxes instead.
         Returns:
             `list[DocumentQuestionAnsweringOutputElement]`: a list of [`DocumentQuestionAnsweringOutputElement`] items containing the predicted label, associated probability, word ids, and page number.
 
@@ -1139,8 +1149,8 @@ async def fill_mask(
         text: str,
         *,
         model: str | None = None,
-        targets: list[str] | None = None,
-        top_k: int | None = None,
+        targets: Optional[list[str]] = None,
+        top_k: Optional[int] = None,
     ) -> list[FillMaskOutputElement]:
         """
         Fill in a hole with a missing word (token to be precise).
@@ -1151,12 +1161,14 @@ async def fill_mask(
             model (`str`, *optional*):
                 The model to use for the fill mask task. Can be a model ID hosted on the Hugging Face Hub or a URL to
                 a deployed Inference Endpoint. If not provided, the default recommended fill mask model will be used.
-            targets (`list[str`, *optional*):
+            targets (`list[str]`, *optional*):
                 When passed, the model will limit the scores to the passed targets instead of looking up in the whole
                 vocabulary. If the provided targets are not in the model vocab, they will be tokenized and the first
                 resulting token will be used (with a warning, and that might be slower).
             top_k (`int`, *optional*):
                 When passed, overrides the number of predictions to return.
+            top_k (`int`, *optional*):
+                When passed, overrides the number of predictions to return.
         Returns:
             `list[FillMaskOutputElement]`: a list of [`FillMaskOutputElement`] items containing the predicted label, associated
             probability, token reference, and completed text.
@@ -1197,7 +1209,7 @@ async def image_classification(
         *,
         model: str | None = None,
         function_to_apply: Optional["ImageClassificationOutputTransform"] = None,
-        top_k: int | None = None,
+        top_k: Optional[int] = None,
     ) -> list[ImageClassificationOutputElement]:
         """
         Perform image classification on the given image using the specified model.
@@ -1247,10 +1259,10 @@ async def image_segmentation(
         image: ContentT,
         *,
         model: str | None = None,
-        mask_threshold: float | None = None,
-        overlap_mask_area_threshold: float | None = None,
+        mask_threshold: Optional[float] = None,
+        overlap_mask_area_threshold: Optional[float] = None,
         subtask: Optional["ImageSegmentationSubtask"] = None,
-        threshold: float | None = None,
+        threshold: Optional[float] = None,
     ) -> list[ImageSegmentationOutputElement]:
         """
         Perform image segmentation on the given image using the specified model.
@@ -1272,6 +1284,8 @@ async def image_segmentation(
                 Segmentation task to be performed, depending on model capabilities.
             threshold (`float`, *optional*):
                 Probability threshold to filter out predicted masks.
+            overlap_mask_area_threshold (`float`, *optional*):
+                Mask overlap threshold to eliminate small, disconnected segments.
         Returns:
             `list[ImageSegmentationOutputElement]`: A list of [`ImageSegmentationOutputElement`] items containing the segmented masks and associated attributes.
 
@@ -1316,11 +1330,11 @@ async def image_to_image(
         image: ContentT,
         prompt: str | None = None,
         *,
-        negative_prompt: str | None = None,
-        num_inference_steps: int | None = None,
-        guidance_scale: float | None = None,
+        negative_prompt: Optional[str] = None,
+        num_inference_steps: Optional[int] = None,
+        guidance_scale: Optional[float] = None,
         model: str | None = None,
-        target_size: ImageToImageTargetSize | None = None,
+        target_size: Optional[ImageToImageTargetSize] = None,
         **kwargs,
     ) -> "Image":
         """
@@ -1348,6 +1362,9 @@ async def image_to_image(
             target_size (`ImageToImageTargetSize`, *optional*):
                 The size in pixels of the output image. This parameter is only supported by some providers and for
                 specific models. It will be ignored when unsupported.
+            num_inference_steps (`int`, *optional*):
+                For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher
+                quality image at the expense of slower inference.
 
         Returns:
             `Image`: The translated image.
@@ -1394,12 +1411,12 @@ async def image_to_video(
         *,
         model: str | None = None,
         prompt: str | None = None,
-        negative_prompt: str | None = None,
-        num_frames: float | None = None,
-        num_inference_steps: int | None = None,
-        guidance_scale: float | None = None,
-        seed: int | None = None,
-        target_size: ImageToVideoTargetSize | None = None,
+        negative_prompt: Optional[str] = None,
+        num_frames: Optional[float] = None,
+        num_inference_steps: Optional[int] = None,
+        guidance_scale: Optional[float] = None,
+        seed: Optional[int] = None,
+        target_size: Optional[ImageToVideoTargetSize] = None,
         **kwargs,
     ) -> bytes:
         """
@@ -1418,13 +1435,13 @@ async def image_to_video(
             num_frames (`float`, *optional*):
                 The num_frames parameter determines how many video frames are generated.
             num_inference_steps (`int`, *optional*):
-                For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher
-                quality image at the expense of slower inference.
+                The number of denoising steps. More denoising steps usually lead to a higher quality video at the
+                expense of slower inference.
             guidance_scale (`float`, *optional*):
                 For diffusion models. A higher guidance scale value encourages the model to generate videos closely
                 linked to the text prompt at the expense of lower image quality.
             seed (`int`, *optional*):
-                The seed to use for the video generation.
+                Seed for the random number generator.
             target_size (`ImageToVideoTargetSize`, *optional*):
                 The size in pixel of the output video frames.
             num_inference_steps (`int`, *optional*):
@@ -1432,6 +1449,13 @@ async def image_to_video(
                 expense of slower inference.
             seed (`int`, *optional*):
                 Seed for the random number generator.
+            guidance_scale (`float`, *optional*):
+                For diffusion models. A higher guidance scale value encourages the model to generate videos closely
+                linked to the text prompt at the expense of lower image quality.
+            num_frames (`float`, *optional*):
+                The num_frames parameter determines how many video frames are generated.
+            target_size (`ImageToVideoTargetSize`, *optional*):
+                The size in pixel of the output video frames.
 
         Returns:
             `bytes`: The generated video.
@@ -1516,7 +1540,7 @@ async def image_to_text(self, image: ContentT, *, model: str | None = None) -> I
         return output_list[0]
 
     async def object_detection(
-        self, image: ContentT, *, model: str | None = None, threshold: float | None = None
+        self, image: ContentT, *, model: str | None = None, threshold: Optional[float] = None
     ) -> list[ObjectDetectionOutputElement]:
         """
         Perform object detection on the given image using the specified model.
@@ -1570,13 +1594,13 @@ async def question_answering(
         context: str,
         *,
         model: str | None = None,
-        align_to_words: bool | None = None,
-        doc_stride: int | None = None,
-        handle_impossible_answer: bool | None = None,
-        max_answer_len: int | None = None,
-        max_question_len: int | None = None,
-        max_seq_len: int | None = None,
-        top_k: int | None = None,
+        align_to_words: Optional[bool] = None,
+        doc_stride: Optional[int] = None,
+        handle_impossible_answer: Optional[bool] = None,
+        max_answer_len: Optional[int] = None,
+        max_question_len: Optional[int] = None,
+        max_seq_len: Optional[int] = None,
+        top_k: Optional[int] = None,
     ) -> QuestionAnsweringOutputElement | list[QuestionAnsweringOutputElement]:
         """
         Retrieve the answer to a question from a given text.
@@ -1607,6 +1631,14 @@ async def question_answering(
             top_k (`int`, *optional*):
                 The number of answers to return (will be chosen by order of likelihood). Note that we return less than
                 topk answers if there are not enough options available within the context.
+            doc_stride (`int`, *optional*):
+                If the context is too long to fit with the question for the model, it will be split in several chunks
+                with some overlap. This argument controls the size of that overlap.
+            max_answer_len (`int`, *optional*):
+                The maximum length of predicted answers (e.g., only answers with a shorter length are considered).
+            max_seq_len (`int`, *optional*):
+                The maximum length of the total sentence (context + question) in tokens of each chunk passed to the
+                model. The context will be split in several chunks (using `doc_stride` as overlap) if needed.
 
         Returns:
             Union[`QuestionAnsweringOutputElement`, list[`QuestionAnsweringOutputElement`]]:
@@ -1708,8 +1740,8 @@ async def summarization(
         text: str,
         *,
         model: str | None = None,
-        clean_up_tokenization_spaces: bool | None = None,
-        generate_parameters: dict[str, Any] | None = None,
+        clean_up_tokenization_spaces: Optional[bool] = None,
+        generate_parameters: Optional[dict[str, Any]] = None,
         truncation: Optional["SummarizationTruncationStrategy"] = None,
     ) -> SummarizationOutput:
         """
@@ -1727,6 +1759,8 @@ async def summarization(
                 Additional parametrization of the text generation algorithm.
             truncation (`"SummarizationTruncationStrategy"`, *optional*):
                 The truncation strategy to use.
+            generate_parameters (`dict[str, Any]`, *optional*):
+                Additional parametrization of the text generation algorithm.
         Returns:
             [`SummarizationOutput`]: The generated summary text.
 
@@ -1769,8 +1803,8 @@ async def table_question_answering(
         *,
         model: str | None = None,
         padding: Optional["Padding"] = None,
-        sequential: bool | None = None,
-        truncation: bool | None = None,
+        sequential: Optional[bool] = None,
+        truncation: Optional[bool] = None,
     ) -> TableQuestionAnsweringOutputElement:
         """
         Retrieve the answer to a question from information given in a table.
@@ -1792,6 +1826,8 @@ async def table_question_answering(
                 nature.
             truncation (`bool`, *optional*):
                 Activates and controls truncation.
+            truncation (`bool`, *optional*):
+                Activates and controls truncation.
 
         Returns:
             [`TableQuestionAnsweringOutputElement`]: a table question answering output containing the answer, coordinates, cells and the aggregator used.
@@ -1937,7 +1973,7 @@ async def text_classification(
         text: str,
         *,
         model: str | None = None,
-        top_k: int | None = None,
+        top_k: Optional[int] = None,
         function_to_apply: Optional["TextClassificationOutputTransform"] = None,
     ) -> list[TextClassificationOutputElement]:
         """
@@ -2485,14 +2521,14 @@ async def text_to_image(
         self,
         prompt: str,
         *,
-        negative_prompt: str | None = None,
-        height: int | None = None,
-        width: int | None = None,
-        num_inference_steps: int | None = None,
-        guidance_scale: float | None = None,
+        negative_prompt: Optional[str] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: Optional[int] = None,
+        guidance_scale: Optional[float] = None,
         model: str | None = None,
-        scheduler: str | None = None,
-        seed: int | None = None,
+        scheduler: Optional[str] = None,
+        seed: Optional[int] = None,
         extra_body: dict[str, Any] | None = None,
     ) -> "Image":
         """
@@ -2530,6 +2566,13 @@ async def text_to_image(
             extra_body (`dict[str, Any]`, *optional*):
                 Additional provider-specific parameters to pass to the model. Refer to the provider's documentation
                 for supported parameters.
+            height (`int`, *optional*):
+                The height in pixels of the output image.
+            num_inference_steps (`int`, *optional*):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            seed (`int`, *optional*):
+                Seed for the random number generator.
 
         Returns:
             `Image`: The generated image.
@@ -2627,11 +2670,11 @@ async def text_to_video(
         prompt: str,
         *,
         model: str | None = None,
-        guidance_scale: float | None = None,
-        negative_prompt: list[str] | None = None,
-        num_frames: float | None = None,
-        num_inference_steps: int | None = None,
-        seed: int | None = None,
+        guidance_scale: Optional[float] = None,
+        negative_prompt: Optional[list[str]] = None,
+        num_frames: Optional[float] = None,
+        num_inference_steps: Optional[int] = None,
+        seed: Optional[int] = None,
         extra_body: dict[str, Any] | None = None,
     ) -> bytes:
         """
@@ -2662,6 +2705,11 @@ async def text_to_video(
             extra_body (`dict[str, Any]`, *optional*):
                 Additional provider-specific parameters to pass to the model. Refer to the provider's documentation
                 for supported parameters.
+            negative_prompt (`list[str]`, *optional*):
+                One or several prompts to guide what NOT to include in video generation.
+            num_inference_steps (`int`, *optional*):
+                The number of denoising steps. More denoising steps usually lead to a higher quality video at the
+                expense of slower inference.
 
         Returns:
             `bytes`: The generated video.
@@ -2724,22 +2772,22 @@ async def text_to_speech(
         text: str,
         *,
         model: str | None = None,
-        do_sample: bool | None = None,
-        early_stopping: Union[bool, "TextToSpeechEarlyStoppingEnum"] | None = None,
-        epsilon_cutoff: float | None = None,
-        eta_cutoff: float | None = None,
-        max_length: int | None = None,
-        max_new_tokens: int | None = None,
-        min_length: int | None = None,
-        min_new_tokens: int | None = None,
-        num_beam_groups: int | None = None,
-        num_beams: int | None = None,
-        penalty_alpha: float | None = None,
-        temperature: float | None = None,
-        top_k: int | None = None,
-        top_p: float | None = None,
-        typical_p: float | None = None,
-        use_cache: bool | None = None,
+        do_sample: Optional[bool] = None,
+        early_stopping: Optional[Union[bool, "TextToSpeechEarlyStoppingEnum"]] = None,
+        epsilon_cutoff: Optional[float] = None,
+        eta_cutoff: Optional[float] = None,
+        max_length: Optional[int] = None,
+        max_new_tokens: Optional[int] = None,
+        min_length: Optional[int] = None,
+        min_new_tokens: Optional[int] = None,
+        num_beam_groups: Optional[int] = None,
+        num_beams: Optional[int] = None,
+        penalty_alpha: Optional[float] = None,
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+        typical_p: Optional[float] = None,
+        use_cache: Optional[bool] = None,
         extra_body: dict[str, Any] | None = None,
     ) -> bytes:
         """
@@ -2804,6 +2852,28 @@ async def text_to_speech(
             extra_body (`dict[str, Any]`, *optional*):
                 Additional provider-specific parameters to pass to the model. Refer to the provider's documentation
                 for supported parameters.
+            early_stopping (`Union[bool, "TextToSpeechEarlyStoppingEnum"]`, *optional*):
+                Controls the stopping condition for beam-based methods.
+            eta_cutoff (`float`, *optional*):
+                Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly
+                between 0 and 1, a token is only considered if it is greater than either eta_cutoff or sqrt(eta_cutoff)
+                * exp(-entropy(softmax(next_token_logits))). The latter term is intuitively the expected next token
+                probability, scaled by sqrt(eta_cutoff). In the paper, suggested values range from 3e-4 to 2e-3,
+                depending on the size of the model. See [Truncation Sampling as Language Model
+                Desmoothing](https://hf.co/papers/2210.15191) for more details.
+            max_new_tokens (`int`, *optional*):
+                The maximum number of tokens to generate. Takes precedence over max_length.
+            min_new_tokens (`int`, *optional*):
+                The minimum number of tokens to generate. Takes precedence over min_length.
+            num_beams (`int`, *optional*):
+                Number of beams to use for beam search.
+            temperature (`float`, *optional*):
+                The value used to modulate the next token probabilities.
+            top_p (`float`, *optional*):
+                If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to
+                top_p or higher are kept for generation.
+            use_cache (`bool`, *optional*):
+                Whether the model should use the past last key/values attentions to speed up decoding.
         Returns:
             `bytes`: The generated audio.
 
@@ -2934,8 +3004,8 @@ async def token_classification(
         *,
         model: str | None = None,
         aggregation_strategy: Optional["TokenClassificationAggregationStrategy"] = None,
-        ignore_labels: list[str] | None = None,
-        stride: int | None = None,
+        ignore_labels: Optional[list[str]] = None,
+        stride: Optional[int] = None,
     ) -> list[TokenClassificationOutputElement]:
         """
         Perform token classification on the given text.
@@ -2950,10 +3020,10 @@ async def token_classification(
                 Defaults to None.
             aggregation_strategy (`"TokenClassificationAggregationStrategy"`, *optional*):
                 The strategy used to fuse tokens based on model predictions
-            ignore_labels (`list[str`, *optional*):
+            ignore_labels (`list[str]`, *optional*):
                 A list of labels to ignore
             stride (`int`, *optional*):
                 The number of overlapping tokens between chunks when splitting the input text.
 
         Returns:
             `list[TokenClassificationOutputElement]`: List of [`TokenClassificationOutputElement`] items containing the entity group, confidence score, word, start and end index.
@@ -3009,11 +3081,11 @@ async def translation(
         text: str,
         *,
         model: str | None = None,
-        src_lang: str | None = None,
-        tgt_lang: str | None = None,
-        clean_up_tokenization_spaces: bool | None = None,
+        src_lang: Optional[str] = None,
+        tgt_lang: Optional[str] = None,
+        clean_up_tokenization_spaces: Optional[bool] = None,
         truncation: Optional["TranslationTruncationStrategy"] = None,
-        generate_parameters: dict[str, Any] | None = None,
+        generate_parameters: Optional[dict[str, Any]] = None,
     ) -> TranslationOutput:
         """
         Convert text from one language to another.
@@ -3040,6 +3112,8 @@ async def translation(
                 The truncation strategy to use.
             generate_parameters (`dict[str, Any]`, *optional*):
                 Additional parametrization of the text generation algorithm.
+            tgt_lang (`str`, *optional*):
+                Target language to translate to. Required for models that can translate to multiple languages.
 
         Returns:
             [`TranslationOutput`]: The generated translated text.
@@ -3100,7 +3174,7 @@ async def visual_question_answering(
         question: str,
         *,
         model: str | None = None,
-        top_k: int | None = None,
+        top_k: Optional[int] = None,
     ) -> list[VisualQuestionAnsweringOutputElement]:
         """
         Answering open-ended questions based on an image.
@@ -3159,8 +3233,8 @@ async def zero_shot_classification(
         text: str,
         candidate_labels: list[str],
         *,
-        multi_label: bool | None = False,
-        hypothesis_template: str | None = None,
+        multi_label: Optional[bool] = False,
+        hypothesis_template: Optional[str] = None,
         model: str | None = None,
     ) -> list[ZeroShotClassificationOutputElement]:
         """
@@ -3183,6 +3257,9 @@ async def zero_shot_classification(
             model (`str`, *optional*):
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
                 Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot classification model will be used.
+            hypothesis_template (`str`, *optional*):
+                The sentence used in conjunction with `candidate_labels` to attempt the text classification by
+                replacing the placeholder with the candidate labels.
 
 
         Returns:
@@ -3265,7 +3342,7 @@ async def zero_shot_image_classification(
         candidate_labels: list[str],
         *,
         model: str | None = None,
-        hypothesis_template: str | None = None,
+        hypothesis_template: Optional[str] = None,
         # deprecated argument
         labels: list[str] = None,  # type: ignore
     ) -> list[ZeroShotImageClassificationOutputElement]:
diff --git a/src/huggingface_hub/inference/_generated/types/audio_classification.py b/src/huggingface_hub/inference/_generated/types/audio_classification.py
index b99507c4a7..053055787b 100644
--- a/src/huggingface_hub/inference/_generated/types/audio_classification.py
+++ b/src/huggingface_hub/inference/_generated/types/audio_classification.py
@@ -17,7 +17,7 @@ class AudioClassificationParameters(BaseInferenceType):
 
     function_to_apply: Optional["AudioClassificationOutputTransform"] = None
     """The function to apply to the model outputs in order to retrieve the scores."""
-    top_k: int | None = None
+    top_k: Optional[int] = None
     """When specified, limits the output to the top K most probable classes."""
 
 
@@ -29,7 +29,7 @@ class AudioClassificationInput(BaseInferenceType):
     """The input audio data as a base64-encoded string. If no `parameters` are provided, you can
     also provide the audio data as a raw bytes payload.
     """
-    parameters: AudioClassificationParameters | None = None
+    parameters: Optional[AudioClassificationParameters] = None
     """Additional inference parameters for Audio Classification"""
 
 
diff --git a/src/huggingface_hub/inference/_generated/types/automatic_speech_recognition.py b/src/huggingface_hub/inference/_generated/types/automatic_speech_recognition.py
index 9d728bfdb8..2e6afc4411 100644
--- a/src/huggingface_hub/inference/_generated/types/automatic_speech_recognition.py
+++ b/src/huggingface_hub/inference/_generated/types/automatic_speech_recognition.py
@@ -3,7 +3,7 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
-from typing import Literal, Union
+from typing import Literal, Optional, Union
 
 from .base import BaseInferenceType, dataclass_with_extra
 
@@ -15,17 +15,17 @@
 class AutomaticSpeechRecognitionGenerationParameters(BaseInferenceType):
     """Parametrization of the text generation process"""
 
-    do_sample: bool | None = None
+    do_sample: Optional[bool] = None
     """Whether to use sampling instead of greedy decoding when generating new tokens."""
-    early_stopping: Union[bool, "AutomaticSpeechRecognitionEarlyStoppingEnum"] | None = None
+    early_stopping: Optional[Union[bool, "AutomaticSpeechRecognitionEarlyStoppingEnum"]] = None
     """Controls the stopping condition for beam-based methods."""
-    epsilon_cutoff: float | None = None
+    epsilon_cutoff: Optional[float] = None
     """If set to float strictly between 0 and 1, only tokens with a conditional probability
     greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
     3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
     Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
     """
-    eta_cutoff: float | None = None
+    eta_cutoff: Optional[float] = None
     """Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
     float strictly between 0 and 1, a token is only considered if it is greater than either
     eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
@@ -34,40 +34,40 @@ class AutomaticSpeechRecognitionGenerationParameters(BaseInferenceType):
     See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
     for more details.
     """
-    max_length: int | None = None
+    max_length: Optional[int] = None
     """The maximum length (in tokens) of the generated text, including the input."""
-    max_new_tokens: int | None = None
+    max_new_tokens: Optional[int] = None
     """The maximum number of tokens to generate. Takes precedence over max_length."""
-    min_length: int | None = None
+    min_length: Optional[int] = None
     """The minimum length (in tokens) of the generated text, including the input."""
-    min_new_tokens: int | None = None
+    min_new_tokens: Optional[int] = None
     """The minimum number of tokens to generate. Takes precedence over min_length."""
-    num_beam_groups: int | None = None
+    num_beam_groups: Optional[int] = None
     """Number of groups to divide num_beams into in order to ensure diversity among different
     groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
     """
-    num_beams: int | None = None
+    num_beams: Optional[int] = None
     """Number of beams to use for beam search."""
-    penalty_alpha: float | None = None
+    penalty_alpha: Optional[float] = None
     """The value balances the model confidence and the degeneration penalty in contrastive
     search decoding.
     """
-    temperature: float | None = None
+    temperature: Optional[float] = None
     """The value used to modulate the next token probabilities."""
-    top_k: int | None = None
+    top_k: Optional[int] = None
     """The number of highest probability vocabulary tokens to keep for top-k-filtering."""
-    top_p: float | None = None
+    top_p: Optional[float] = None
     """If set to float < 1, only the smallest set of most probable tokens with probabilities
     that add up to top_p or higher are kept for generation.
     """
-    typical_p: float | None = None
+    typical_p: Optional[float] = None
     """Local typicality measures how similar the conditional probability of predicting a target
     token next is to the expected conditional probability of predicting a random token next,
     given the partial text already generated. If set to float < 1, the smallest set of the
     most locally typical tokens with probabilities that add up to typical_p or higher are
     kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
     """
-    use_cache: bool | None = None
+    use_cache: Optional[bool] = None
     """Whether the model should use the past last key/values attentions to speed up decoding"""
 
 
@@ -75,9 +75,9 @@ class AutomaticSpeechRecognitionGenerationParameters(BaseInferenceType):
 class AutomaticSpeechRecognitionParameters(BaseInferenceType):
     """Additional inference parameters for Automatic Speech Recognition"""
 
-    generation_parameters: AutomaticSpeechRecognitionGenerationParameters | None = None
+    generation_parameters: Optional[AutomaticSpeechRecognitionGenerationParameters] = None
     """Parametrization of the text generation process"""
-    return_timestamps: bool | None = None
+    return_timestamps: Optional[bool] = None
     """Whether to output corresponding timestamps with the generated text"""
 
 
@@ -89,7 +89,7 @@ class AutomaticSpeechRecognitionInput(BaseInferenceType):
     """The input audio data as a base64-encoded string. If no `parameters` are provided, you can
     also provide the audio data as a raw bytes payload.
     """
-    parameters: AutomaticSpeechRecognitionParameters | None = None
+    parameters: Optional[AutomaticSpeechRecognitionParameters] = None
     """Additional inference parameters for Automatic Speech Recognition"""
 
 
@@ -107,7 +107,7 @@ class AutomaticSpeechRecognitionOutput(BaseInferenceType):
 
     text: str
     """The recognized text."""
-    chunks: list[AutomaticSpeechRecognitionOutputChunk] | None = None
+    chunks: Optional[list[AutomaticSpeechRecognitionOutputChunk]] = None
     """When returnTimestamps is enabled, chunks contains a list of audio chunks identified by
     the model.
     """
diff --git a/src/huggingface_hub/inference/_generated/types/depth_estimation.py b/src/huggingface_hub/inference/_generated/types/depth_estimation.py
index cf26998ed5..765c3635f9 100644
--- a/src/huggingface_hub/inference/_generated/types/depth_estimation.py
+++ b/src/huggingface_hub/inference/_generated/types/depth_estimation.py
@@ -3,7 +3,7 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
-from typing import Any
+from typing import Any, Optional
 
 from .base import BaseInferenceType, dataclass_with_extra
 
@@ -14,7 +14,7 @@ class DepthEstimationInput(BaseInferenceType):
 
     inputs: Any
     """The input image data"""
-    parameters: dict[str, Any] | None = None
+    parameters: Optional[dict[str, Any]] = None
     """Additional inference parameters for Depth Estimation"""
 
 
diff --git a/src/huggingface_hub/inference/_generated/types/document_question_answering.py b/src/huggingface_hub/inference/_generated/types/document_question_answering.py
index 0ec9c29e30..e3886041d6 100644
--- a/src/huggingface_hub/inference/_generated/types/document_question_answering.py
+++ b/src/huggingface_hub/inference/_generated/types/document_question_answering.py
@@ -3,7 +3,7 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
-from typing import Any
+from typing import Any, Optional, Union
 
 from .base import BaseInferenceType, dataclass_with_extra
 
@@ -22,31 +22,31 @@ class DocumentQuestionAnsweringInputData(BaseInferenceType):
 class DocumentQuestionAnsweringParameters(BaseInferenceType):
     """Additional inference parameters for Document Question Answering"""
 
-    doc_stride: int | None = None
+    doc_stride: Optional[int] = None
     """If the words in the document are too long to fit with the question for the model, it will
     be split in several chunks with some overlap. This argument controls the size of that
     overlap.
     """
-    handle_impossible_answer: bool | None = None
+    handle_impossible_answer: Optional[bool] = None
     """Whether to accept impossible as an answer"""
-    lang: str | None = None
+    lang: Optional[str] = None
     """Language to use while running OCR. Defaults to english."""
-    max_answer_len: int | None = None
+    max_answer_len: Optional[int] = None
     """The maximum length of predicted answers (e.g., only answers with a shorter length are
     considered).
     """
-    max_question_len: int | None = None
+    max_question_len: Optional[int] = None
     """The maximum length of the question after tokenization. It will be truncated if needed."""
-    max_seq_len: int | None = None
+    max_seq_len: Optional[int] = None
     """The maximum length of the total sentence (context + question) in tokens of each chunk
     passed to the model. The context will be split in several chunks (using doc_stride as
     overlap) if needed.
     """
-    top_k: int | None = None
+    top_k: Optional[int] = None
     """The number of answers to return (will be chosen by order of likelihood). Can return less
     than top_k answers if there are not enough options available within the context.
     """
-    word_boxes: list[list[float] | str] | None = None
+    word_boxes: Optional[list[Union[list[float], str]]] = None
     """A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
     skip the OCR step and use the provided bounding boxes instead.
     """
@@ -58,7 +58,7 @@ class DocumentQuestionAnsweringInput(BaseInferenceType):
 
     inputs: DocumentQuestionAnsweringInputData
     """One (document, question) pair to answer"""
-    parameters: DocumentQuestionAnsweringParameters | None = None
+    parameters: Optional[DocumentQuestionAnsweringParameters] = None
     """Additional inference parameters for Document Question Answering"""
 
 
diff --git a/src/huggingface_hub/inference/_generated/types/feature_extraction.py b/src/huggingface_hub/inference/_generated/types/feature_extraction.py
index e2868432b8..1c3cd6f70b 100644
--- a/src/huggingface_hub/inference/_generated/types/feature_extraction.py
+++ b/src/huggingface_hub/inference/_generated/types/feature_extraction.py
@@ -3,7 +3,7 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
-from typing import Literal, Optional
+from typing import Literal, Optional, Union
 
 from .base import BaseInferenceType, dataclass_with_extra
 
@@ -19,10 +19,10 @@ class FeatureExtractionInput(BaseInferenceType):
     https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.
     """
 
-    inputs: list[str] | str
+    inputs: Union[list[str], str]
     """The text or list of texts to embed."""
-    normalize: bool | None = None
-    prompt_name: str | None = None
+    normalize: Optional[bool] = None
+    prompt_name: Optional[str] = None
     """The name of the prompt that should be used by for encoding. If not set, no prompt
     will be applied.
     Must be a key in the `sentence-transformers` configuration `prompts` dictionary.
@@ -32,5 +32,5 @@ class FeatureExtractionInput(BaseInferenceType):
     "query: What is the capital of France?" because the prompt text will be prepended before
     any text to encode.
     """
-    truncate: bool | None = None
+    truncate: Optional[bool] = None
     truncation_direction: Optional["FeatureExtractionInputTruncationDirection"] = None
diff --git a/src/huggingface_hub/inference/_generated/types/fill_mask.py b/src/huggingface_hub/inference/_generated/types/fill_mask.py
index 84fcac730e..848421dc13 100644
--- a/src/huggingface_hub/inference/_generated/types/fill_mask.py
+++ b/src/huggingface_hub/inference/_generated/types/fill_mask.py
@@ -3,7 +3,7 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
-from typing import Any
+from typing import Any, Optional
 
 from .base import BaseInferenceType, dataclass_with_extra
 
@@ -12,13 +12,13 @@
 class FillMaskParameters(BaseInferenceType):
     """Additional inference parameters for Fill Mask"""
 
-    targets: list[str] | None = None
+    targets: Optional[list[str]] = None
     """When passed, the model will limit the scores to the passed targets instead of looking up
     in the whole vocabulary. If the provided targets are not in the model vocab, they will be
     tokenized and the first resulting token will be used (with a warning, and that might be
     slower).
     """
-    top_k: int | None = None
+    top_k: Optional[int] = None
     """When passed, overrides the number of predictions to return."""
 
 
@@ -28,7 +28,7 @@ class FillMaskInput(BaseInferenceType):
 
     inputs: str
     """The text with masked tokens"""
-    parameters: FillMaskParameters | None = None
+    parameters: Optional[FillMaskParameters] = None
     """Additional inference parameters for Fill Mask"""
 
 
@@ -43,5 +43,5 @@ class FillMaskOutputElement(BaseInferenceType):
     token: int
     """The predicted token id (to replace the masked one)."""
     token_str: Any
-    fill_mask_output_token_str: str | None = None
+    fill_mask_output_token_str: Optional[str] = None
     """The predicted token (to replace the masked one)."""
diff --git a/src/huggingface_hub/inference/_generated/types/image_classification.py b/src/huggingface_hub/inference/_generated/types/image_classification.py
index a0d2d564b7..0fdda6c83f 100644
--- a/src/huggingface_hub/inference/_generated/types/image_classification.py
+++ b/src/huggingface_hub/inference/_generated/types/image_classification.py
@@ -17,7 +17,7 @@ class ImageClassificationParameters(BaseInferenceType):
 
     function_to_apply: Optional["ImageClassificationOutputTransform"] = None
     """The function to apply to the model outputs in order to retrieve the scores."""
-    top_k: int | None = None
+    top_k: Optional[int] = None
     """When specified, limits the output to the top K most probable classes."""
 
 
@@ -29,7 +29,7 @@ class ImageClassificationInput(BaseInferenceType):
     """The input image data as a base64-encoded string. If no `parameters` are provided, you can
     also provide the image data as a raw bytes payload.
     """
-    parameters: ImageClassificationParameters | None = None
+    parameters: Optional[ImageClassificationParameters] = None
     """Additional inference parameters for Image Classification"""
 
 
diff --git a/src/huggingface_hub/inference/_generated/types/image_segmentation.py b/src/huggingface_hub/inference/_generated/types/image_segmentation.py
index d2938d89ca..3dbf61db83 100644
--- a/src/huggingface_hub/inference/_generated/types/image_segmentation.py
+++ b/src/huggingface_hub/inference/_generated/types/image_segmentation.py
@@ -15,13 +15,13 @@
 class ImageSegmentationParameters(BaseInferenceType):
     """Additional inference parameters for Image Segmentation"""
 
-    mask_threshold: float | None = None
+    mask_threshold: Optional[float] = None
     """Threshold to use when turning the predicted masks into binary values."""
-    overlap_mask_area_threshold: float | None = None
+    overlap_mask_area_threshold: Optional[float] = None
     """Mask overlap threshold to eliminate small, disconnected segments."""
     subtask: Optional["ImageSegmentationSubtask"] = None
     """Segmentation task to be performed, depending on model capabilities."""
-    threshold: float | None = None
+    threshold: Optional[float] = None
     """Probability threshold to filter out predicted masks."""
 
 
@@ -33,7 +33,7 @@ class ImageSegmentationInput(BaseInferenceType):
     """The input image data as a base64-encoded string. If no `parameters` are provided, you can
     also provide the image data as a raw bytes payload.
     """
-    parameters: ImageSegmentationParameters | None = None
+    parameters: Optional[ImageSegmentationParameters] = None
     """Additional inference parameters for Image Segmentation"""
 
 
@@ -47,5 +47,5 @@ class ImageSegmentationOutputElement(BaseInferenceType):
     """The label of the predicted segment."""
     mask: str
     """The corresponding mask as a black-and-white image (base64-encoded)."""
-    score: float | None = None
+    score: Optional[float] = None
     """The score or confidence degree the model has."""
diff --git a/src/huggingface_hub/inference/_generated/types/image_text_to_image.py b/src/huggingface_hub/inference/_generated/types/image_text_to_image.py
index d711f40ca0..1ddd15335a 100644
--- a/src/huggingface_hub/inference/_generated/types/image_text_to_image.py
+++ b/src/huggingface_hub/inference/_generated/types/image_text_to_image.py
@@ -3,7 +3,7 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
-from typing import Any
+from typing import Any, Optional
 
 from .base import BaseInferenceType, dataclass_with_extra
 
@@ -22,23 +22,23 @@ class ImageTextToImageTargetSize(BaseInferenceType):
 class ImageTextToImageParameters(BaseInferenceType):
     """Additional inference parameters for Image Text To Image"""
 
-    guidance_scale: float | None = None
+    guidance_scale: Optional[float] = None
     """For diffusion models. A higher guidance scale value encourages the model to generate
     images closely linked to the text prompt at the expense of lower image quality.
     """
-    negative_prompt: str | None = None
+    negative_prompt: Optional[str] = None
     """One prompt to guide what NOT to include in image generation."""
-    num_inference_steps: int | None = None
+    num_inference_steps: Optional[int] = None
     """For diffusion models. The number of denoising steps. More denoising steps usually lead to
     a higher quality image at the expense of slower inference.
     """
-    prompt: str | None = None
+    prompt: Optional[str] = None
     """The text prompt to guide the image generation. Either this or inputs (image) must be
     provided.
     """
-    seed: int | None = None
+    seed: Optional[int] = None
     """Seed for the random number generator."""
-    target_size: ImageTextToImageTargetSize | None = None
+    target_size: Optional[ImageTextToImageTargetSize] = None
     """The size in pixels of the output image. This parameter is only supported by some
     providers and for specific models. It will be ignored when unsupported.
     """
@@ -50,12 +50,12 @@ class ImageTextToImageInput(BaseInferenceType):
     must be provided, or both.
     """
 
-    inputs: str | None = None
+    inputs: Optional[str] = None
     """The input image data as a base64-encoded string. If no `parameters` are provided, you can
     also provide the image data as a raw bytes payload. Either this or prompt must be
     provided.
     """
-    parameters: ImageTextToImageParameters | None = None
+    parameters: Optional[ImageTextToImageParameters] = None
     """Additional inference parameters for Image Text To Image"""
 
 
diff --git a/src/huggingface_hub/inference/_generated/types/image_text_to_video.py b/src/huggingface_hub/inference/_generated/types/image_text_to_video.py
index 870bb16c04..58b3a4f24e 100644
--- a/src/huggingface_hub/inference/_generated/types/image_text_to_video.py
+++ b/src/huggingface_hub/inference/_generated/types/image_text_to_video.py
@@ -3,7 +3,7 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
-from typing import Any
+from typing import Any, Optional
 
 from .base import BaseInferenceType, dataclass_with_extra
 
@@ -20,25 +20,25 @@ class ImageTextToVideoTargetSize(BaseInferenceType):
 class ImageTextToVideoParameters(BaseInferenceType):
     """Additional inference parameters for Image Text To Video"""
 
-    guidance_scale: float | None = None
+    guidance_scale: Optional[float] = None
     """For diffusion models. A higher guidance scale value encourages the model to generate
     videos closely linked to the text prompt at the expense of lower image quality.
     """
-    negative_prompt: str | None = None
+    negative_prompt: Optional[str] = None
     """One prompt to guide what NOT to include in video generation."""
-    num_frames: float | None = None
+    num_frames: Optional[float] = None
     """The num_frames parameter determines how many video frames are generated."""
-    num_inference_steps: int | None = None
+    num_inference_steps: Optional[int] = None
     """The number of denoising steps. More denoising steps usually lead to a higher quality
     video at the expense of slower inference.
     """
-    prompt: str | None = None
+    prompt: Optional[str] = None
     """The text prompt to guide the video generation. Either this or inputs (image) must be
     provided.
     """
-    seed: int | None = None
+    seed: Optional[int] = None
     """Seed for the random number generator."""
-    target_size: ImageTextToVideoTargetSize | None = None
+    target_size: Optional[ImageTextToVideoTargetSize] = None
     """The size in pixel of the output video frames."""
 
 
@@ -48,12 +48,12 @@ class ImageTextToVideoInput(BaseInferenceType):
     must be provided, or both.
     """
 
-    inputs: str | None = None
+    inputs: Optional[str] = None
     """The input image data as a base64-encoded string. If no `parameters` are provided, you can
     also provide the image data as a raw bytes payload. Either this or prompt must be
     provided.
     """
-    parameters: ImageTextToVideoParameters | None = None
+    parameters: Optional[ImageTextToVideoParameters] = None
     """Additional inference parameters for Image Text To Video"""
 
 
diff --git a/src/huggingface_hub/inference/_generated/types/image_to_image.py b/src/huggingface_hub/inference/_generated/types/image_to_image.py
index 6e943d7391..b14c79fedf 100644
--- a/src/huggingface_hub/inference/_generated/types/image_to_image.py
+++ b/src/huggingface_hub/inference/_generated/types/image_to_image.py
@@ -3,7 +3,7 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
-from typing import Any
+from typing import Any, Optional
 
 from .base import BaseInferenceType, dataclass_with_extra
 
@@ -22,19 +22,19 @@ class ImageToImageTargetSize(BaseInferenceType):
 class ImageToImageParameters(BaseInferenceType):
     """Additional inference parameters for Image To Image"""
 
-    guidance_scale: float | None = None
+    guidance_scale: Optional[float] = None
     """For diffusion models. A higher guidance scale value encourages the model to generate
     images closely linked to the text prompt at the expense of lower image quality.
     """
-    negative_prompt: str | None = None
+    negative_prompt: Optional[str] = None
     """One prompt to guide what NOT to include in image generation."""
-    num_inference_steps: int | None = None
+    num_inference_steps: Optional[int] = None
     """For diffusion models. The number of denoising steps. More denoising steps usually lead to
     a higher quality image at the expense of slower inference.
     """
-    prompt: str | None = None
+    prompt: Optional[str] = None
     """The text prompt to guide the image generation."""
-    target_size: ImageToImageTargetSize | None = None
+    target_size: Optional[ImageToImageTargetSize] = None
     """The size in pixels of the output image. This parameter is only supported by some
     providers and for specific models. It will be ignored when unsupported.
     """
@@ -48,7 +48,7 @@ class ImageToImageInput(BaseInferenceType):
     """The input image data as a base64-encoded string. If no `parameters` are provided, you can
     also provide the image data as a raw bytes payload.
     """
-    parameters: ImageToImageParameters | None = None
+    parameters: Optional[ImageToImageParameters] = None
     """Additional inference parameters for Image To Image"""
 
 
diff --git a/src/huggingface_hub/inference/_generated/types/image_to_text.py b/src/huggingface_hub/inference/_generated/types/image_to_text.py
index 3924a6612c..b65e0e0068 100644
--- a/src/huggingface_hub/inference/_generated/types/image_to_text.py
+++ b/src/huggingface_hub/inference/_generated/types/image_to_text.py
@@ -3,7 +3,7 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
-from typing import Any, Literal, Union
+from typing import Any, Literal, Optional, Union
 
 from .base import BaseInferenceType, dataclass_with_extra
 
@@ -15,17 +15,17 @@
 class ImageToTextGenerationParameters(BaseInferenceType):
     """Parametrization of the text generation process"""
 
-    do_sample: bool | None = None
+    do_sample: Optional[bool] = None
     """Whether to use sampling instead of greedy decoding when generating new tokens."""
-    early_stopping: Union[bool, "ImageToTextEarlyStoppingEnum"] | None = None
+    early_stopping: Optional[Union[bool, "ImageToTextEarlyStoppingEnum"]] = None
     """Controls the stopping condition for beam-based methods."""
-    epsilon_cutoff: float | None = None
+    epsilon_cutoff: Optional[float] = None
     """If set to float strictly between 0 and 1, only tokens with a conditional probability
     greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
     3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
     Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
     """
-    eta_cutoff: float | None = None
+    eta_cutoff: Optional[float] = None
     """Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
     float strictly between 0 and 1, a token is only considered if it is greater than either
     eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
@@ -34,40 +34,40 @@ class ImageToTextGenerationParameters(BaseInferenceType):
     See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
     for more details.
     """
-    max_length: int | None = None
+    max_length: Optional[int] = None
     """The maximum length (in tokens) of the generated text, including the input."""
-    max_new_tokens: int | None = None
+    max_new_tokens: Optional[int] = None
     """The maximum number of tokens to generate. Takes precedence over max_length."""
-    min_length: int | None = None
+    min_length: Optional[int] = None
     """The minimum length (in tokens) of the generated text, including the input."""
-    min_new_tokens: int | None = None
+    min_new_tokens: Optional[int] = None
     """The minimum number of tokens to generate. Takes precedence over min_length."""
-    num_beam_groups: int | None = None
+    num_beam_groups: Optional[int] = None
     """Number of groups to divide num_beams into in order to ensure diversity among different
     groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
     """
-    num_beams: int | None = None
+    num_beams: Optional[int] = None
     """Number of beams to use for beam search."""
-    penalty_alpha: float | None = None
+    penalty_alpha: Optional[float] = None
     """The value balances the model confidence and the degeneration penalty in contrastive
     search decoding.
     """
-    temperature: float | None = None
+    temperature: Optional[float] = None
     """The value used to modulate the next token probabilities."""
-    top_k: int | None = None
+    top_k: Optional[int] = None
     """The number of highest probability vocabulary tokens to keep for top-k-filtering."""
-    top_p: float | None = None
+    top_p: Optional[float] = None
     """If set to float < 1, only the smallest set of most probable tokens with probabilities
     that add up to top_p or higher are kept for generation.
     """
-    typical_p: float | None = None
+    typical_p: Optional[float] = None
     """Local typicality measures how similar the conditional probability of predicting a target
     token next is to the expected conditional probability of predicting a random token next,
     given the partial text already generated. If set to float < 1, the smallest set of the
     most locally typical tokens with probabilities that add up to typical_p or higher are
     kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
     """
-    use_cache: bool | None = None
+    use_cache: Optional[bool] = None
     """Whether the model should use the past last key/values attentions to speed up decoding"""
 
 
@@ -75,9 +75,9 @@ class ImageToTextGenerationParameters(BaseInferenceType):
 class ImageToTextParameters(BaseInferenceType):
     """Additional inference parameters for Image To Text"""
 
-    generation_parameters: ImageToTextGenerationParameters | None = None
+    generation_parameters: Optional[ImageToTextGenerationParameters] = None
     """Parametrization of the text generation process"""
-    max_new_tokens: int | None = None
+    max_new_tokens: Optional[int] = None
     """The amount of maximum tokens to generate."""
 
 
@@ -87,7 +87,7 @@ class ImageToTextInput(BaseInferenceType):
 
     inputs: Any
     """The input image data"""
-    parameters: ImageToTextParameters | None = None
+    parameters: Optional[ImageToTextParameters] = None
     """Additional inference parameters for Image To Text"""
 
 
@@ -96,5 +96,5 @@ class ImageToTextOutput(BaseInferenceType):
     """Outputs of inference for the Image To Text task"""
 
     generated_text: Any
-    image_to_text_output_generated_text: str | None = None
+    image_to_text_output_generated_text: Optional[str] = None
     """The generated text."""
diff --git a/src/huggingface_hub/inference/_generated/types/image_to_video.py b/src/huggingface_hub/inference/_generated/types/image_to_video.py
index b14883f044..92192a2a05 100644
--- a/src/huggingface_hub/inference/_generated/types/image_to_video.py
+++ b/src/huggingface_hub/inference/_generated/types/image_to_video.py
@@ -3,7 +3,7 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
-from typing import Any
+from typing import Any, Optional
 
 from .base import BaseInferenceType, dataclass_with_extra
 
@@ -20,23 +20,23 @@ class ImageToVideoTargetSize(BaseInferenceType):
 class ImageToVideoParameters(BaseInferenceType):
     """Additional inference parameters for Image To Video"""
 
-    guidance_scale: float | None = None
+    guidance_scale: Optional[float] = None
     """For diffusion models. A higher guidance scale value encourages the model to generate
     videos closely linked to the text prompt at the expense of lower image quality.
     """
-    negative_prompt: str | None = None
+    negative_prompt: Optional[str] = None
     """One prompt to guide what NOT to include in video generation."""
-    num_frames: float | None = None
+    num_frames: Optional[float] = None
     """The num_frames parameter determines how many video frames are generated."""
-    num_inference_steps: int | None = None
+    num_inference_steps: Optional[int] = None
     """The number of denoising steps. More denoising steps usually lead to a higher quality
     video at the expense of slower inference.
     """
-    prompt: str | None = None
+    prompt: Optional[str] = None
     """The text prompt to guide the video generation."""
-    seed: int | None = None
+    seed: Optional[int] = None
     """Seed for the random number generator."""
-    target_size: ImageToVideoTargetSize | None = None
+    target_size: Optional[ImageToVideoTargetSize] = None
     """The size in pixel of the output video frames."""
 
 
@@ -48,7 +48,7 @@ class ImageToVideoInput(BaseInferenceType):
     """The input image data as a base64-encoded string. If no `parameters` are provided, you can
     also provide the image data as a raw bytes payload.
     """
-    parameters: ImageToVideoParameters | None = None
+    parameters: Optional[ImageToVideoParameters] = None
     """Additional inference parameters for Image To Video"""
 
 
diff --git a/src/huggingface_hub/inference/_generated/types/object_detection.py b/src/huggingface_hub/inference/_generated/types/object_detection.py
index 1c7ef78434..75f3ebcfe1 100644
--- a/src/huggingface_hub/inference/_generated/types/object_detection.py
+++ b/src/huggingface_hub/inference/_generated/types/object_detection.py
@@ -3,6 +3,8 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Optional
+
 from .base import BaseInferenceType, dataclass_with_extra
 
 
@@ -10,7 +12,7 @@
 class ObjectDetectionParameters(BaseInferenceType):
     """Additional inference parameters for Object Detection"""
 
-    threshold: float | None = None
+    threshold: Optional[float] = None
     """The probability necessary to make a prediction."""
 
 
@@ -22,7 +24,7 @@ class ObjectDetectionInput(BaseInferenceType):
     """The input image data as a base64-encoded string. If no `parameters` are provided, you can
     also provide the image data as a raw bytes payload.
     """
-    parameters: ObjectDetectionParameters | None = None
+    parameters: Optional[ObjectDetectionParameters] = None
     """Additional inference parameters for Object Detection"""
 
 
diff --git a/src/huggingface_hub/inference/_generated/types/question_answering.py b/src/huggingface_hub/inference/_generated/types/question_answering.py
index ee97c638d6..014ab41893 100644
--- a/src/huggingface_hub/inference/_generated/types/question_answering.py
+++ b/src/huggingface_hub/inference/_generated/types/question_answering.py
@@ -3,6 +3,8 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Optional
+
 from .base import BaseInferenceType, dataclass_with_extra
 
 
@@ -20,28 +22,28 @@ class QuestionAnsweringInputData(BaseInferenceType):
 class QuestionAnsweringParameters(BaseInferenceType):
     """Additional inference parameters for Question Answering"""
 
-    align_to_words: bool | None = None
+    align_to_words: Optional[bool] = None
     """Attempts to align the answer to real words. Improves quality on space separated
     languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
     """
-    doc_stride: int | None = None
+    doc_stride: Optional[int] = None
     """If the context is too long to fit with the question for the model, it will be split in
     several chunks with some overlap. This argument controls the size of that overlap.
     """
-    handle_impossible_answer: bool | None = None
+    handle_impossible_answer: Optional[bool] = None
     """Whether to accept impossible as an answer."""
-    max_answer_len: int | None = None
+    max_answer_len: Optional[int] = None
     """The maximum length of predicted answers (e.g., only answers with a shorter length are
     considered).
     """
-    max_question_len: int | None = None
+    max_question_len: Optional[int] = None
     """The maximum length of the question after tokenization. It will be truncated if needed."""
-    max_seq_len: int | None = None
+    max_seq_len: Optional[int] = None
     """The maximum length of the total sentence (context + question) in tokens of each chunk
     passed to the model. The context will be split in several chunks (using docStride as
     overlap) if needed.
     """
-    top_k: int | None = None
+    top_k: Optional[int] = None
     """The number of answers to return (will be chosen by order of likelihood). Note that we
     return less than topk answers if there are not enough options available within the
     context.
@@ -54,7 +56,7 @@ class QuestionAnsweringInput(BaseInferenceType):
 
     inputs: QuestionAnsweringInputData
     """One (context, question) pair to answer"""
-    parameters: QuestionAnsweringParameters | None = None
+    parameters: Optional[QuestionAnsweringParameters] = None
     """Additional inference parameters for Question Answering"""
 
 
diff --git a/src/huggingface_hub/inference/_generated/types/sentence_similarity.py b/src/huggingface_hub/inference/_generated/types/sentence_similarity.py
index a06c32d395..4dd42c0bd8 100644
--- a/src/huggingface_hub/inference/_generated/types/sentence_similarity.py
+++ b/src/huggingface_hub/inference/_generated/types/sentence_similarity.py
@@ -3,7 +3,7 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
-from typing import Any
+from typing import Any, Optional
 
 from .base import BaseInferenceType, dataclass_with_extra
 
@@ -23,5 +23,5 @@ class SentenceSimilarityInput(BaseInferenceType):
     """Inputs for Sentence similarity inference"""
 
     inputs: SentenceSimilarityInputData
-    parameters: dict[str, Any] | None = None
+    parameters: Optional[dict[str, Any]] = None
     """Additional inference parameters for Sentence Similarity"""
diff --git a/src/huggingface_hub/inference/_generated/types/summarization.py b/src/huggingface_hub/inference/_generated/types/summarization.py
index 35f2a86f30..0103853aa6 100644
--- a/src/huggingface_hub/inference/_generated/types/summarization.py
+++ b/src/huggingface_hub/inference/_generated/types/summarization.py
@@ -15,9 +15,9 @@
 class SummarizationParameters(BaseInferenceType):
     """Additional inference parameters for summarization."""
 
-    clean_up_tokenization_spaces: bool | None = None
+    clean_up_tokenization_spaces: Optional[bool] = None
     """Whether to clean up the potential extra spaces in the text output."""
-    generate_parameters: dict[str, Any] | None = None
+    generate_parameters: Optional[dict[str, Any]] = None
     """Additional parametrization of the text generation algorithm."""
     truncation: Optional["SummarizationTruncationStrategy"] = None
     """The truncation strategy to use."""
@@ -29,7 +29,7 @@ class SummarizationInput(BaseInferenceType):
 
     inputs: str
     """The input text to summarize."""
-    parameters: SummarizationParameters | None = None
+    parameters: Optional[SummarizationParameters] = None
     """Additional inference parameters for summarization."""
 
 
diff --git a/src/huggingface_hub/inference/_generated/types/table_question_answering.py b/src/huggingface_hub/inference/_generated/types/table_question_answering.py
index 1909f0d2b7..dfccb4fe43 100644
--- a/src/huggingface_hub/inference/_generated/types/table_question_answering.py
+++ b/src/huggingface_hub/inference/_generated/types/table_question_answering.py
@@ -27,12 +27,12 @@ class TableQuestionAnsweringParameters(BaseInferenceType):
 
     padding: Optional["Padding"] = None
     """Activates and controls padding."""
-    sequential: bool | None = None
+    sequential: Optional[bool] = None
     """Whether to do inference sequentially or as a batch. Batching is faster, but models like
     SQA require the inference to be done sequentially to extract relations within sequences,
     given their conversational nature.
     """
-    truncation: bool | None = None
+    truncation: Optional[bool] = None
     """Activates and controls truncation."""
 
 
@@ -42,7 +42,7 @@ class TableQuestionAnsweringInput(BaseInferenceType):
 
     inputs: TableQuestionAnsweringInputData
     """One (table, question) pair to answer"""
-    parameters: TableQuestionAnsweringParameters | None = None
+    parameters: Optional[TableQuestionAnsweringParameters] = None
     """Additional inference parameters for Table Question Answering"""
 
 
@@ -58,5 +58,5 @@ class TableQuestionAnsweringOutputElement(BaseInferenceType):
     """list of strings made up of the answer cell values."""
     coordinates: list[list[int]]
     """Coordinates of the cells of the answers."""
-    aggregator: str | None = None
+    aggregator: Optional[str] = None
     """If the model has an aggregator, this returns the aggregator."""
diff --git a/src/huggingface_hub/inference/_generated/types/text_classification.py b/src/huggingface_hub/inference/_generated/types/text_classification.py
index 9df576b4de..9a172b23f8 100644
--- a/src/huggingface_hub/inference/_generated/types/text_classification.py
+++ b/src/huggingface_hub/inference/_generated/types/text_classification.py
@@ -17,7 +17,7 @@ class TextClassificationParameters(BaseInferenceType):
 
     function_to_apply: Optional["TextClassificationOutputTransform"] = None
     """The function to apply to the model outputs in order to retrieve the scores."""
-    top_k: int | None = None
+    top_k: Optional[int] = None
     """When specified, limits the output to the top K most probable classes."""
 
 
@@ -27,7 +27,7 @@ class TextClassificationInput(BaseInferenceType):
 
     inputs: str
     """The text to classify"""
-    parameters: TextClassificationParameters | None = None
+    parameters: Optional[TextClassificationParameters] = None
     """Additional inference parameters for Text Classification"""
 
 
diff --git a/src/huggingface_hub/inference/_generated/types/text_generation.py b/src/huggingface_hub/inference/_generated/types/text_generation.py
index 1b2269955b..b470198b40 100644
--- a/src/huggingface_hub/inference/_generated/types/text_generation.py
+++ b/src/huggingface_hub/inference/_generated/types/text_generation.py
@@ -3,7 +3,7 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
-from typing import Any, Literal
+from typing import Any, Literal, Optional
 
 from .base import BaseInferenceType, dataclass_with_extra
 
@@ -23,50 +23,50 @@ class TextGenerationInputGrammarType(BaseInferenceType):
 
 @dataclass_with_extra
 class TextGenerationInputGenerateParameters(BaseInferenceType):
-    adapter_id: str | None = None
+    adapter_id: Optional[str] = None
     """Lora adapter id"""
-    best_of: int | None = None
+    best_of: Optional[int] = None
     """Generate best_of sequences and return the one if the highest token logprobs."""
-    decoder_input_details: bool | None = None
+    decoder_input_details: Optional[bool] = None
     """Whether to return decoder input token logprobs and ids."""
-    details: bool | None = None
+    details: Optional[bool] = None
     """Whether to return generation details."""
-    do_sample: bool | None = None
+    do_sample: Optional[bool] = None
     """Activate logits sampling."""
-    frequency_penalty: float | None = None
+    frequency_penalty: Optional[float] = None
     """The parameter for frequency penalty. 1.0 means no penalty
     Penalize new tokens based on their existing frequency in the text so far,
     decreasing the model's likelihood to repeat the same line verbatim.
     """
-    grammar: TextGenerationInputGrammarType | None = None
-    max_new_tokens: int | None = None
+    grammar: Optional[TextGenerationInputGrammarType] = None
+    max_new_tokens: Optional[int] = None
     """Maximum number of tokens to generate."""
-    repetition_penalty: float | None = None
+    repetition_penalty: Optional[float] = None
     """The parameter for repetition penalty. 1.0 means no penalty.
     See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
     """
-    return_full_text: bool | None = None
+    return_full_text: Optional[bool] = None
     """Whether to prepend the prompt to the generated text"""
-    seed: int | None = None
+    seed: Optional[int] = None
     """Random sampling seed."""
-    stop: list[str] | None = None
+    stop: Optional[list[str]] = None
     """Stop generating tokens if a member of `stop` is generated."""
-    temperature: float | None = None
+    temperature: Optional[float] = None
     """The value used to module the logits distribution."""
-    top_k: int | None = None
+    top_k: Optional[int] = None
     """The number of highest probability vocabulary tokens to keep for top-k-filtering."""
-    top_n_tokens: int | None = None
+    top_n_tokens: Optional[int] = None
     """The number of highest probability vocabulary tokens to keep for top-n-filtering."""
-    top_p: float | None = None
+    top_p: Optional[float] = None
     """Top-p value for nucleus sampling."""
-    truncate: int | None = None
+    truncate: Optional[int] = None
     """Truncate inputs tokens to the given size."""
-    typical_p: float | None = None
+    typical_p: Optional[float] = None
     """Typical Decoding mass
     See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666)
     for more information.
     """
-    watermark: bool | None = None
+    watermark: Optional[bool] = None
     """Watermarking with [A Watermark for Large Language
     Models](https://arxiv.org/abs/2301.10226).
     """
@@ -81,8 +81,8 @@ class TextGenerationInput(BaseInferenceType):
     """
 
     inputs: str
-    parameters: TextGenerationInputGenerateParameters | None = None
-    stream: bool | None = None
+    parameters: Optional[TextGenerationInputGenerateParameters] = None
+    stream: Optional[bool] = None
 
 
 TextGenerationOutputFinishReason = Literal["length", "eos_token", "stop_sequence"]
@@ -110,8 +110,8 @@ class TextGenerationOutputBestOfSequence(BaseInferenceType):
     generated_tokens: int
     prefill: list[TextGenerationOutputPrefillToken]
     tokens: list[TextGenerationOutputToken]
-    seed: int | None = None
-    top_tokens: list[list[TextGenerationOutputToken]] | None = None
+    seed: Optional[int] = None
+    top_tokens: Optional[list[list[TextGenerationOutputToken]]] = None
 
 
 @dataclass_with_extra
@@ -120,9 +120,9 @@ class TextGenerationOutputDetails(BaseInferenceType):
     generated_tokens: int
     prefill: list[TextGenerationOutputPrefillToken]
     tokens: list[TextGenerationOutputToken]
-    best_of_sequences: list[TextGenerationOutputBestOfSequence] | None = None
-    seed: int | None = None
-    top_tokens: list[list[TextGenerationOutputToken]] | None = None
+    best_of_sequences: Optional[list[TextGenerationOutputBestOfSequence]] = None
+    seed: Optional[int] = None
+    top_tokens: Optional[list[list[TextGenerationOutputToken]]] = None
 
 
 @dataclass_with_extra
@@ -134,7 +134,7 @@ class TextGenerationOutput(BaseInferenceType):
     """
 
     generated_text: str
-    details: TextGenerationOutputDetails | None = None
+    details: Optional[TextGenerationOutputDetails] = None
 
 
 @dataclass_with_extra
@@ -142,7 +142,7 @@ class TextGenerationStreamOutputStreamDetails(BaseInferenceType):
     finish_reason: "TextGenerationOutputFinishReason"
     generated_tokens: int
     input_length: int
-    seed: int | None = None
+    seed: Optional[int] = None
 
 
 @dataclass_with_extra
@@ -163,6 +163,6 @@ class TextGenerationStreamOutput(BaseInferenceType):
 
     index: int
     token: TextGenerationStreamOutputToken
-    details: TextGenerationStreamOutputStreamDetails | None = None
-    generated_text: str | None = None
-    top_tokens: list[TextGenerationStreamOutputToken] | None = None
+    details: Optional[TextGenerationStreamOutputStreamDetails] = None
+    generated_text: Optional[str] = None
+    top_tokens: Optional[list[TextGenerationStreamOutputToken]] = None
diff --git a/src/huggingface_hub/inference/_generated/types/text_to_audio.py b/src/huggingface_hub/inference/_generated/types/text_to_audio.py
index 35033f5129..87af80a598 100644
--- a/src/huggingface_hub/inference/_generated/types/text_to_audio.py
+++ b/src/huggingface_hub/inference/_generated/types/text_to_audio.py
@@ -3,7 +3,7 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
-from typing import Any, Literal, Union
+from typing import Any, Literal, Optional, Union
 
 from .base import BaseInferenceType, dataclass_with_extra
 
@@ -15,17 +15,17 @@
 class TextToAudioGenerationParameters(BaseInferenceType):
     """Parametrization of the text generation process"""
 
-    do_sample: bool | None = None
+    do_sample: Optional[bool] = None
     """Whether to use sampling instead of greedy decoding when generating new tokens."""
-    early_stopping: Union[bool, "TextToAudioEarlyStoppingEnum"] | None = None
+    early_stopping: Optional[Union[bool, "TextToAudioEarlyStoppingEnum"]] = None
     """Controls the stopping condition for beam-based methods."""
-    epsilon_cutoff: float | None = None
+    epsilon_cutoff: Optional[float] = None
     """If set to float strictly between 0 and 1, only tokens with a conditional probability
     greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
     3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
     Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
     """
-    eta_cutoff: float | None = None
+    eta_cutoff: Optional[float] = None
     """Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
     float strictly between 0 and 1, a token is only considered if it is greater than either
     eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
@@ -34,40 +34,40 @@ class TextToAudioGenerationParameters(BaseInferenceType):
     See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
     for more details.
     """
-    max_length: int | None = None
+    max_length: Optional[int] = None
     """The maximum length (in tokens) of the generated text, including the input."""
-    max_new_tokens: int | None = None
+    max_new_tokens: Optional[int] = None
     """The maximum number of tokens to generate. Takes precedence over max_length."""
-    min_length: int | None = None
+    min_length: Optional[int] = None
     """The minimum length (in tokens) of the generated text, including the input."""
-    min_new_tokens: int | None = None
+    min_new_tokens: Optional[int] = None
     """The minimum number of tokens to generate. Takes precedence over min_length."""
-    num_beam_groups: int | None = None
+    num_beam_groups: Optional[int] = None
     """Number of groups to divide num_beams into in order to ensure diversity among different
     groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
     """
-    num_beams: int | None = None
+    num_beams: Optional[int] = None
     """Number of beams to use for beam search."""
-    penalty_alpha: float | None = None
+    penalty_alpha: Optional[float] = None
     """The value balances the model confidence and the degeneration penalty in contrastive
     search decoding.
     """
-    temperature: float | None = None
+    temperature: Optional[float] = None
     """The value used to modulate the next token probabilities."""
-    top_k: int | None = None
+    top_k: Optional[int] = None
     """The number of highest probability vocabulary tokens to keep for top-k-filtering."""
-    top_p: float | None = None
+    top_p: Optional[float] = None
     """If set to float < 1, only the smallest set of most probable tokens with probabilities
     that add up to top_p or higher are kept for generation.
     """
-    typical_p: float | None = None
+    typical_p: Optional[float] = None
     """Local typicality measures how similar the conditional probability of predicting a target
     token next is to the expected conditional probability of predicting a random token next,
     given the partial text already generated. If set to float < 1, the smallest set of the
     most locally typical tokens with probabilities that add up to typical_p or higher are
     kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
     """
-    use_cache: bool | None = None
+    use_cache: Optional[bool] = None
     """Whether the model should use the past last key/values attentions to speed up decoding"""
 
 
@@ -75,7 +75,7 @@ class TextToAudioGenerationParameters(BaseInferenceType):
 class TextToAudioParameters(BaseInferenceType):
     """Additional inference parameters for Text To Audio"""
 
-    generation_parameters: TextToAudioGenerationParameters | None = None
+    generation_parameters: Optional[TextToAudioGenerationParameters] = None
     """Parametrization of the text generation process"""
 
 
@@ -85,7 +85,7 @@ class TextToAudioInput(BaseInferenceType):
 
     inputs: str
     """The input text data"""
-    parameters: TextToAudioParameters | None = None
+    parameters: Optional[TextToAudioParameters] = None
     """Additional inference parameters for Text To Audio"""
 
 
diff --git a/src/huggingface_hub/inference/_generated/types/text_to_image.py b/src/huggingface_hub/inference/_generated/types/text_to_image.py
index 716f240385..20c9637313 100644
--- a/src/huggingface_hub/inference/_generated/types/text_to_image.py
+++ b/src/huggingface_hub/inference/_generated/types/text_to_image.py
@@ -3,7 +3,7 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
-from typing import Any
+from typing import Any, Optional
 
 from .base import BaseInferenceType, dataclass_with_extra
 
@@ -12,23 +12,23 @@
 class TextToImageParameters(BaseInferenceType):
     """Additional inference parameters for Text To Image"""
 
-    guidance_scale: float | None = None
+    guidance_scale: Optional[float] = None
     """A higher guidance scale value encourages the model to generate images closely linked to
     the text prompt, but values too high may cause saturation and other artifacts.
     """
-    height: int | None = None
+    height: Optional[int] = None
     """The height in pixels of the output image"""
-    negative_prompt: str | None = None
+    negative_prompt: Optional[str] = None
     """One prompt to guide what NOT to include in image generation."""
-    num_inference_steps: int | None = None
+    num_inference_steps: Optional[int] = None
     """The number of denoising steps. More denoising steps usually lead to a higher quality
     image at the expense of slower inference.
     """
-    scheduler: str | None = None
+    scheduler: Optional[str] = None
     """Override the scheduler with a compatible one."""
-    seed: int | None = None
+    seed: Optional[int] = None
     """Seed for the random number generator."""
-    width: int | None = None
+    width: Optional[int] = None
     """The width in pixels of the output image"""
 
 
@@ -38,7 +38,7 @@ class TextToImageInput(BaseInferenceType):
 
     inputs: str
     """The input text data (sometimes called "prompt")"""
-    parameters: TextToImageParameters | None = None
+    parameters: Optional[TextToImageParameters] = None
     """Additional inference parameters for Text To Image"""
 
 
diff --git a/src/huggingface_hub/inference/_generated/types/text_to_speech.py b/src/huggingface_hub/inference/_generated/types/text_to_speech.py
index 588e0d1a56..ce2db8f3f9 100644
--- a/src/huggingface_hub/inference/_generated/types/text_to_speech.py
+++ b/src/huggingface_hub/inference/_generated/types/text_to_speech.py
@@ -3,7 +3,7 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
-from typing import Any, Literal, Union
+from typing import Any, Literal, Optional, Union
 
 from .base import BaseInferenceType, dataclass_with_extra
 
@@ -15,17 +15,17 @@
 class TextToSpeechGenerationParameters(BaseInferenceType):
     """Parametrization of the text generation process"""
 
-    do_sample: bool | None = None
+    do_sample: Optional[bool] = None
     """Whether to use sampling instead of greedy decoding when generating new tokens."""
-    early_stopping: Union[bool, "TextToSpeechEarlyStoppingEnum"] | None = None
+    early_stopping: Optional[Union[bool, "TextToSpeechEarlyStoppingEnum"]] = None
     """Controls the stopping condition for beam-based methods."""
-    epsilon_cutoff: float | None = None
+    epsilon_cutoff: Optional[float] = None
     """If set to float strictly between 0 and 1, only tokens with a conditional probability
     greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
     3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
     Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
     """
-    eta_cutoff: float | None = None
+    eta_cutoff: Optional[float] = None
     """Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
     float strictly between 0 and 1, a token is only considered if it is greater than either
     eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
@@ -34,40 +34,40 @@ class TextToSpeechGenerationParameters(BaseInferenceType):
     See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
     for more details.
     """
-    max_length: int | None = None
+    max_length: Optional[int] = None
     """The maximum length (in tokens) of the generated text, including the input."""
-    max_new_tokens: int | None = None
+    max_new_tokens: Optional[int] = None
     """The maximum number of tokens to generate. Takes precedence over max_length."""
-    min_length: int | None = None
+    min_length: Optional[int] = None
     """The minimum length (in tokens) of the generated text, including the input."""
-    min_new_tokens: int | None = None
+    min_new_tokens: Optional[int] = None
     """The minimum number of tokens to generate. Takes precedence over min_length."""
-    num_beam_groups: int | None = None
+    num_beam_groups: Optional[int] = None
     """Number of groups to divide num_beams into in order to ensure diversity among different
     groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
     """
-    num_beams: int | None = None
+    num_beams: Optional[int] = None
     """Number of beams to use for beam search."""
-    penalty_alpha: float | None = None
+    penalty_alpha: Optional[float] = None
     """The value balances the model confidence and the degeneration penalty in contrastive
     search decoding.
     """
-    temperature: float | None = None
+    temperature: Optional[float] = None
     """The value used to modulate the next token probabilities."""
-    top_k: int | None = None
+    top_k: Optional[int] = None
     """The number of highest probability vocabulary tokens to keep for top-k-filtering."""
-    top_p: float | None = None
+    top_p: Optional[float] = None
     """If set to float < 1, only the smallest set of most probable tokens with probabilities
     that add up to top_p or higher are kept for generation.
     """
-    typical_p: float | None = None
+    typical_p: Optional[float] = None
     """Local typicality measures how similar the conditional probability of predicting a target
     token next is to the expected conditional probability of predicting a random token next,
     given the partial text already generated. If set to float < 1, the smallest set of the
     most locally typical tokens with probabilities that add up to typical_p or higher are
     kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
     """
-    use_cache: bool | None = None
+    use_cache: Optional[bool] = None
     """Whether the model should use the past last key/values attentions to speed up decoding"""
 
 
@@ -75,7 +75,7 @@ class TextToSpeechGenerationParameters(BaseInferenceType):
 class TextToSpeechParameters(BaseInferenceType):
     """Additional inference parameters for Text To Speech"""
 
-    generation_parameters: TextToSpeechGenerationParameters | None = None
+    generation_parameters: Optional[TextToSpeechGenerationParameters] = None
     """Parametrization of the text generation process"""
 
 
@@ -85,7 +85,7 @@ class TextToSpeechInput(BaseInferenceType):
 
     inputs: str
     """The input text data"""
-    parameters: TextToSpeechParameters | None = None
+    parameters: Optional[TextToSpeechParameters] = None
     """Additional inference parameters for Text To Speech"""
 
 
@@ -95,5 +95,5 @@ class TextToSpeechOutput(BaseInferenceType):
 
     audio: Any
     """The generated audio"""
-    sampling_rate: float | None = None
+    sampling_rate: Optional[float] = None
     """The sampling rate of the generated audio waveform."""
diff --git a/src/huggingface_hub/inference/_generated/types/text_to_video.py b/src/huggingface_hub/inference/_generated/types/text_to_video.py
index 6e357113ce..a7e9637821 100644
--- a/src/huggingface_hub/inference/_generated/types/text_to_video.py
+++ b/src/huggingface_hub/inference/_generated/types/text_to_video.py
@@ -3,7 +3,7 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
-from typing import Any
+from typing import Any, Optional
 
 from .base import BaseInferenceType, dataclass_with_extra
 
@@ -12,19 +12,19 @@
 class TextToVideoParameters(BaseInferenceType):
     """Additional inference parameters for Text To Video"""
 
-    guidance_scale: float | None = None
+    guidance_scale: Optional[float] = None
     """A higher guidance scale value encourages the model to generate videos closely linked to
     the text prompt, but values too high may cause saturation and other artifacts.
     """
-    negative_prompt: list[str] | None = None
+    negative_prompt: Optional[list[str]] = None
     """One or several prompt to guide what NOT to include in video generation."""
-    num_frames: float | None = None
+    num_frames: Optional[float] = None
     """The num_frames parameter determines how many video frames are generated."""
-    num_inference_steps: int | None = None
+    num_inference_steps: Optional[int] = None
     """The number of denoising steps. More denoising steps usually lead to a higher quality
     video at the expense of slower inference.
     """
-    seed: int | None = None
+    seed: Optional[int] = None
     """Seed for the random number generator."""
 
 
@@ -34,7 +34,7 @@ class TextToVideoInput(BaseInferenceType):
 
     inputs: str
     """The input text data (sometimes called "prompt")"""
-    parameters: TextToVideoParameters | None = None
+    parameters: Optional[TextToVideoParameters] = None
     """Additional inference parameters for Text To Video"""
 
 
diff --git a/src/huggingface_hub/inference/_generated/types/token_classification.py b/src/huggingface_hub/inference/_generated/types/token_classification.py
index abf21d19a6..b40f4b5f6f 100644
--- a/src/huggingface_hub/inference/_generated/types/token_classification.py
+++ b/src/huggingface_hub/inference/_generated/types/token_classification.py
@@ -17,9 +17,9 @@ class TokenClassificationParameters(BaseInferenceType):
 
     aggregation_strategy: Optional["TokenClassificationAggregationStrategy"] = None
     """The strategy used to fuse tokens based on model predictions"""
-    ignore_labels: list[str] | None = None
+    ignore_labels: Optional[list[str]] = None
     """A list of labels to ignore"""
-    stride: int | None = None
+    stride: Optional[int] = None
     """The number of overlapping tokens between chunks when splitting the input text."""
 
 
@@ -29,7 +29,7 @@ class TokenClassificationInput(BaseInferenceType):
 
     inputs: str
     """The input text data"""
-    parameters: TokenClassificationParameters | None = None
+    parameters: Optional[TokenClassificationParameters] = None
     """Additional inference parameters for Token Classification"""
 
 
@@ -45,7 +45,7 @@ class TokenClassificationOutputElement(BaseInferenceType):
     """The character position in the input where this group begins."""
     word: str
     """The corresponding text"""
-    entity: str | None = None
+    entity: Optional[str] = None
     """The predicted label for a single token"""
-    entity_group: str | None = None
+    entity_group: Optional[str] = None
     """The predicted label for a group of one or more tokens"""
diff --git a/src/huggingface_hub/inference/_generated/types/translation.py b/src/huggingface_hub/inference/_generated/types/translation.py
index 58e0b9de29..59619e9a90 100644
--- a/src/huggingface_hub/inference/_generated/types/translation.py
+++ b/src/huggingface_hub/inference/_generated/types/translation.py
@@ -15,15 +15,15 @@
 class TranslationParameters(BaseInferenceType):
     """Additional inference parameters for Translation"""
 
-    clean_up_tokenization_spaces: bool | None = None
+    clean_up_tokenization_spaces: Optional[bool] = None
     """Whether to clean up the potential extra spaces in the text output."""
-    generate_parameters: dict[str, Any] | None = None
+    generate_parameters: Optional[dict[str, Any]] = None
     """Additional parametrization of the text generation algorithm."""
-    src_lang: str | None = None
+    src_lang: Optional[str] = None
     """The source language of the text. Required for models that can translate from multiple
     languages.
     """
-    tgt_lang: str | None = None
+    tgt_lang: Optional[str] = None
     """Target language to translate to. Required for models that can translate to multiple
     languages.
     """
@@ -37,7 +37,7 @@ class TranslationInput(BaseInferenceType):
 
     inputs: str
     """The text to translate."""
-    parameters: TranslationParameters | None = None
+    parameters: Optional[TranslationParameters] = None
     """Additional inference parameters for Translation"""
 
 
diff --git a/src/huggingface_hub/inference/_generated/types/video_classification.py b/src/huggingface_hub/inference/_generated/types/video_classification.py
index 489a602c17..e1d7a15bb4 100644
--- a/src/huggingface_hub/inference/_generated/types/video_classification.py
+++ b/src/huggingface_hub/inference/_generated/types/video_classification.py
@@ -15,13 +15,13 @@
 class VideoClassificationParameters(BaseInferenceType):
     """Additional inference parameters for Video Classification"""
 
-    frame_sampling_rate: int | None = None
+    frame_sampling_rate: Optional[int] = None
     """The sampling rate used to select frames from the video."""
     function_to_apply: Optional["VideoClassificationOutputTransform"] = None
     """The function to apply to the model outputs in order to retrieve the scores."""
-    num_frames: int | None = None
+    num_frames: Optional[int] = None
     """The number of sampled frames to consider for classification."""
-    top_k: int | None = None
+    top_k: Optional[int] = None
     """When specified, limits the output to the top K most probable classes."""
 
 
@@ -31,7 +31,7 @@ class VideoClassificationInput(BaseInferenceType):
 
     inputs: Any
     """The input video data"""
-    parameters: VideoClassificationParameters | None = None
+    parameters: Optional[VideoClassificationParameters] = None
     """Additional inference parameters for Video Classification"""
 
 
diff --git a/src/huggingface_hub/inference/_generated/types/visual_question_answering.py b/src/huggingface_hub/inference/_generated/types/visual_question_answering.py
index 73f532aa06..d368f16212 100644
--- a/src/huggingface_hub/inference/_generated/types/visual_question_answering.py
+++ b/src/huggingface_hub/inference/_generated/types/visual_question_answering.py
@@ -3,7 +3,7 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
-from typing import Any
+from typing import Any, Optional
 
 from .base import BaseInferenceType, dataclass_with_extra
 
@@ -22,7 +22,7 @@ class VisualQuestionAnsweringInputData(BaseInferenceType):
 class VisualQuestionAnsweringParameters(BaseInferenceType):
     """Additional inference parameters for Visual Question Answering"""
 
-    top_k: int | None = None
+    top_k: Optional[int] = None
     """The number of answers to return (will be chosen by order of likelihood). Note that we
     return less than topk answers if there are not enough options available within the
     context.
@@ -35,7 +35,7 @@ class VisualQuestionAnsweringInput(BaseInferenceType):
 
     inputs: VisualQuestionAnsweringInputData
     """One (image, question) pair to answer"""
-    parameters: VisualQuestionAnsweringParameters | None = None
+    parameters: Optional[VisualQuestionAnsweringParameters] = None
     """Additional inference parameters for Visual Question Answering"""
 
 
@@ -45,5 +45,5 @@ class VisualQuestionAnsweringOutputElement(BaseInferenceType):
 
     score: float
     """The associated score / probability"""
-    answer: str | None = None
+    answer: Optional[str] = None
     """The answer to the question"""
diff --git a/src/huggingface_hub/inference/_generated/types/zero_shot_classification.py b/src/huggingface_hub/inference/_generated/types/zero_shot_classification.py
index a04f1a5935..7b0dd13237 100644
--- a/src/huggingface_hub/inference/_generated/types/zero_shot_classification.py
+++ b/src/huggingface_hub/inference/_generated/types/zero_shot_classification.py
@@ -3,6 +3,8 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Optional
+
 from .base import BaseInferenceType, dataclass_with_extra
 
 
@@ -12,11 +14,11 @@ class ZeroShotClassificationParameters(BaseInferenceType):
 
     candidate_labels: list[str]
     """The set of possible class labels to classify the text into."""
-    hypothesis_template: str | None = None
+    hypothesis_template: Optional[str] = None
     """The sentence used in conjunction with `candidate_labels` to attempt the text
     classification by replacing the placeholder with the candidate labels.
     """
-    multi_label: bool | None = None
+    multi_label: Optional[bool] = None
     """Whether multiple candidate labels can be true. If false, the scores are normalized such
     that the sum of the label likelihoods for each sequence is 1. If true, the labels are
     considered independent and probabilities are normalized for each candidate.
diff --git a/src/huggingface_hub/inference/_generated/types/zero_shot_image_classification.py b/src/huggingface_hub/inference/_generated/types/zero_shot_image_classification.py
index 65c5cd2530..ed138eada5 100644
--- a/src/huggingface_hub/inference/_generated/types/zero_shot_image_classification.py
+++ b/src/huggingface_hub/inference/_generated/types/zero_shot_image_classification.py
@@ -3,6 +3,8 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from typing import Optional
+
 from .base import BaseInferenceType, dataclass_with_extra
 
 
@@ -12,7 +14,7 @@ class ZeroShotImageClassificationParameters(BaseInferenceType):
 
     candidate_labels: list[str]
     """The candidate labels for this image"""
-    hypothesis_template: str | None = None
+    hypothesis_template: Optional[str] = None
     """The sentence used in conjunction with `candidate_labels` to attempt the image
     classification by replacing the placeholder with the candidate labels.
     """
diff --git a/src/huggingface_hub/inference/_generated/types/zero_shot_object_detection.py b/src/huggingface_hub/inference/_generated/types/zero_shot_object_detection.py
index e981463b25..d9512c77fd 100644
--- a/src/huggingface_hub/inference/_generated/types/zero_shot_object_detection.py
+++ b/src/huggingface_hub/inference/_generated/types/zero_shot_object_detection.py
@@ -3,6 +3,7 @@
 # See:
 #   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 #   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+
 from .base import BaseInferenceType, dataclass_with_extra