# See:
# - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
# - specs: https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
from typing import Literal, Optional, Union

from .base import BaseInferenceType, dataclass_with_extra
class AutomaticSpeechRecognitionGenerationParameters(BaseInferenceType):
    """Parametrization of the text generation process"""

    do_sample: Optional[bool] = None
    """Whether to use sampling instead of greedy decoding when generating new tokens."""
    early_stopping: Optional[Union[bool, "AutomaticSpeechRecognitionEarlyStoppingEnum"]] = None
    """Controls the stopping condition for beam-based methods."""
    epsilon_cutoff: Optional[float] = None
    """If set to float strictly between 0 and 1, only tokens with a conditional probability
    greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
    3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
    Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
    """
    eta_cutoff: Optional[float] = None
    """Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
    float strictly between 0 and 1, a token is only considered if it is greater than either
    eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
    term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
    the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
    See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
    for more details.
    """
    max_length: Optional[int] = None
    """The maximum length (in tokens) of the generated text, including the input."""
    max_new_tokens: Optional[int] = None
    """The maximum number of tokens to generate. Takes precedence over max_length."""
    min_length: Optional[int] = None
    """The minimum length (in tokens) of the generated text, including the input."""
    min_new_tokens: Optional[int] = None
    """The minimum number of tokens to generate. Takes precedence over min_length."""
    num_beam_groups: Optional[int] = None
    """Number of groups to divide num_beams into in order to ensure diversity among different
    groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
    """
    num_beams: Optional[int] = None
    """Number of beams to use for beam search."""
    penalty_alpha: Optional[float] = None
    """The value balances the model confidence and the degeneration penalty in contrastive
    search decoding.
    """
    temperature: Optional[float] = None
    """The value used to modulate the next token probabilities."""
    top_k: Optional[int] = None
    """The number of highest probability vocabulary tokens to keep for top-k-filtering."""
    top_p: Optional[float] = None
    """If set to float < 1, only the smallest set of most probable tokens with probabilities
    that add up to top_p or higher are kept for generation.
    """
    typical_p: Optional[float] = None
    """Local typicality measures how similar the conditional probability of predicting a target
    token next is to the expected conditional probability of predicting a random token next,
    given the partial text already generated. If set to float < 1, the smallest set of the
    most locally typical tokens with probabilities that add up to typical_p or higher are
    kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
    """
    use_cache: Optional[bool] = None
    """Whether the model should use the past last key/values attentions to speed up decoding"""
7272
7373
@dataclass_with_extra
class AutomaticSpeechRecognitionParameters(BaseInferenceType):
    """Additional inference parameters for Automatic Speech Recognition"""

    generation_parameters: Optional[AutomaticSpeechRecognitionGenerationParameters] = None
    """Parametrization of the text generation process"""
    return_timestamps: Optional[bool] = None
    """Whether to output corresponding timestamps with the generated text"""
8282
8383
@@ -89,7 +89,7 @@ class AutomaticSpeechRecognitionInput(BaseInferenceType):
8989 """The input audio data as a base64-encoded string. If no `parameters` are provided, you can
9090 also provide the audio data as a raw bytes payload.
9191 """
92- parameters : AutomaticSpeechRecognitionParameters | None = None
92+ parameters : Optional [ AutomaticSpeechRecognitionParameters ] = None
9393 """Additional inference parameters for Automatic Speech Recognition"""
9494
9595
@@ -107,7 +107,7 @@ class AutomaticSpeechRecognitionOutput(BaseInferenceType):
107107
108108 text : str
109109 """The recognized text."""
110- chunks : list [AutomaticSpeechRecognitionOutputChunk ] | None = None
110+ chunks : Optional [ list [AutomaticSpeechRecognitionOutputChunk ]] = None
111111 """When returnTimestamps is enabled, chunks contains a list of audio chunks identified by
112112 the model.
113113 """