huggingface
diff --git a/docs/source/en/package_reference/inference_types.md b/docs/source/en/package_reference/inference_types.md
Lines changed: 2 additions & 6 deletions
diff --git a/docs/source/ko/package_reference/inference_types.md b/docs/source/ko/package_reference/inference_types.md
Lines changed: 2 additions & 6 deletions
diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py
Lines changed: 2 additions & 6 deletions
diff --git a/src/huggingface_hub/inference/_client.py b/src/huggingface_hub/inference/_client.py
Lines changed: 13 additions & 12 deletions
diff --git a/src/huggingface_hub/inference/_generated/_async_client.py b/src/huggingface_hub/inference/_generated/_async_client.py
Lines changed: 13 additions & 12 deletions
diff --git a/src/huggingface_hub/inference/_generated/types/__init__.py b/src/huggingface_hub/inference/_generated/types/__init__.py
Lines changed: 1 addition & 3 deletions
diff --git a/src/huggingface_hub/inference/_generated/types/audio_classification.py b/src/huggingface_hub/inference/_generated/types/audio_classification.py
Lines changed: 3 additions & 5 deletions
diff --git a/src/huggingface_hub/inference/_generated/types/automatic_speech_recognition.py b/src/huggingface_hub/inference/_generated/types/automatic_speech_recognition.py
Lines changed: 4 additions & 8 deletions
diff --git a/src/huggingface_hub/inference/_generated/types/depth_estimation.py b/src/huggingface_hub/inference/_generated/types/depth_estimation.py
Lines changed: 1 addition & 1 deletion
@@ -369,8 +369,6 @@ This part of the lib is still under development and will be improved in future r
 
 [[autodoc]] huggingface_hub.ZeroShotClassificationInput
 
-[[autodoc]] huggingface_hub.ZeroShotClassificationInputData
-
 [[autodoc]] huggingface_hub.ZeroShotClassificationOutputElement
 
 [[autodoc]] huggingface_hub.ZeroShotClassificationParameters
@@ -381,8 +379,6 @@ This part of the lib is still under development and will be improved in future r
 
 [[autodoc]] huggingface_hub.ZeroShotImageClassificationInput
 
-[[autodoc]] huggingface_hub.ZeroShotImageClassificationInputData
-
 [[autodoc]] huggingface_hub.ZeroShotImageClassificationOutputElement
 
 [[autodoc]] huggingface_hub.ZeroShotImageClassificationParameters
@@ -395,6 +391,6 @@ This part of the lib is still under development and will be improved in future r
 
 [[autodoc]] huggingface_hub.ZeroShotObjectDetectionInput
 
-[[autodoc]] huggingface_hub.ZeroShotObjectDetectionInputData
-
 [[autodoc]] huggingface_hub.ZeroShotObjectDetectionOutputElement
+
+[[autodoc]] huggingface_hub.ZeroShotObjectDetectionParameters
@@ -368,8 +368,6 @@ rendered properly in your Markdown viewer.
 
 [[autodoc]] huggingface_hub.ZeroShotClassificationInput
 
-[[autodoc]] huggingface_hub.ZeroShotClassificationInputData
-
 [[autodoc]] huggingface_hub.ZeroShotClassificationOutputElement
 
 [[autodoc]] huggingface_hub.ZeroShotClassificationParameters
@@ -380,8 +378,6 @@ rendered properly in your Markdown viewer.
 
 [[autodoc]] huggingface_hub.ZeroShotImageClassificationInput
 
-[[autodoc]] huggingface_hub.ZeroShotImageClassificationInputData
-
 [[autodoc]] huggingface_hub.ZeroShotImageClassificationOutputElement
 
 [[autodoc]] huggingface_hub.ZeroShotImageClassificationParameters
@@ -394,6 +390,6 @@ rendered properly in your Markdown viewer.
 
 [[autodoc]] huggingface_hub.ZeroShotObjectDetectionInput
 
-[[autodoc]] huggingface_hub.ZeroShotObjectDetectionInputData
-
 [[autodoc]] huggingface_hub.ZeroShotObjectDetectionOutputElement
+
+[[autodoc]] huggingface_hub.ZeroShotObjectDetectionParameters
@@ -416,17 +416,15 @@
         "VisualQuestionAnsweringOutputElement",
         "VisualQuestionAnsweringParameters",
         "ZeroShotClassificationInput",
-        "ZeroShotClassificationInputData",
         "ZeroShotClassificationOutputElement",
         "ZeroShotClassificationParameters",
         "ZeroShotImageClassificationInput",
-        "ZeroShotImageClassificationInputData",
         "ZeroShotImageClassificationOutputElement",
         "ZeroShotImageClassificationParameters",
         "ZeroShotObjectDetectionBoundingBox",
         "ZeroShotObjectDetectionInput",
-        "ZeroShotObjectDetectionInputData",
         "ZeroShotObjectDetectionOutputElement",
+        "ZeroShotObjectDetectionParameters",
     ],
     "inference_api": [
         "InferenceApi",
@@ -947,17 +945,15 @@ def __dir__():
         VisualQuestionAnsweringOutputElement,  # noqa: F401
         VisualQuestionAnsweringParameters,  # noqa: F401
         ZeroShotClassificationInput,  # noqa: F401
-        ZeroShotClassificationInputData,  # noqa: F401
         ZeroShotClassificationOutputElement,  # noqa: F401
         ZeroShotClassificationParameters,  # noqa: F401
         ZeroShotImageClassificationInput,  # noqa: F401
-        ZeroShotImageClassificationInputData,  # noqa: F401
         ZeroShotImageClassificationOutputElement,  # noqa: F401
         ZeroShotImageClassificationParameters,  # noqa: F401
         ZeroShotObjectDetectionBoundingBox,  # noqa: F401
         ZeroShotObjectDetectionInput,  # noqa: F401
-        ZeroShotObjectDetectionInputData,  # noqa: F401
         ZeroShotObjectDetectionOutputElement,  # noqa: F401
+        ZeroShotObjectDetectionParameters,  # noqa: F401
     )
     from .inference_api import InferenceApi  # noqa: F401
     from .keras_mixin import (
 
@@ -350,7 +350,7 @@ def audio_classification(
             top_k (`int`, *optional*):
                 When specified, limits the output to the top K most probable classes.
             function_to_apply (`"AudioClassificationOutputTransform"`, *optional*):
-                The function to apply to the output.
+                The function to apply to the model outputs in order to retrieve the scores.
 
         Returns:
             `List[AudioClassificationOutputElement]`: List of [`AudioClassificationOutputElement`] items containing the predicted labels and their confidence.
@@ -982,7 +982,7 @@ def document_question_answering(
         >>> from huggingface_hub import InferenceClient
         >>> client = InferenceClient()
         >>> client.document_question_answering(image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", question="What is the invoice number?")
-        [DocumentQuestionAnsweringOutputElement(answer='us-001', end=16, score=0.9999666213989258, start=16, words=None)]
+        [DocumentQuestionAnsweringOutputElement(answer='us-001', end=16, score=0.9999666213989258, start=16)]
         ```
         """
         inputs: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
@@ -1133,7 +1133,7 @@ def image_classification(
                 The model to use for image classification. Can be a model ID hosted on the Hugging Face Hub or a URL to a
                 deployed Inference Endpoint. If not provided, the default recommended model for image classification will be used.
             function_to_apply (`"ImageClassificationOutputTransform"`, *optional*):
-                The function to apply to the output.
+                The function to apply to the model outputs in order to retrieve the scores.
             top_k (`int`, *optional*):
                 When specified, limits the output to the top K most probable classes.
         Returns:
@@ -1812,7 +1812,7 @@ def text_classification(
             top_k (`int`, *optional*):
                 When specified, limits the output to the top K most probable classes.
             function_to_apply (`"TextClassificationOutputTransform"`, *optional*):
-                The function to apply to the output.
+                The function to apply to the model outputs in order to retrieve the scores.
 
         Returns:
             `List[TextClassificationOutputElement]`: a list of [`TextClassificationOutputElement`] items containing the predicted label and associated probability.
@@ -2484,11 +2484,11 @@ def text_to_speech(
             max_length (`int`, *optional*):
                 The maximum length (in tokens) of the generated text, including the input.
             max_new_tokens (`int`, *optional*):
-                The maximum number of tokens to generate. Takes precedence over maxLength.
+                The maximum number of tokens to generate. Takes precedence over max_length.
             min_length (`int`, *optional*):
                 The minimum length (in tokens) of the generated text, including the input.
             min_new_tokens (`int`, *optional*):
-                The minimum number of tokens to generate. Takes precedence over maxLength.
+                The minimum number of tokens to generate. Takes precedence over min_length.
             num_beam_groups (`int`, *optional*):
                 Number of groups to divide num_beams into in order to ensure diversity among different groups of beams.
                 See [this paper](https://hf.co/papers/1610.02424) for more details.
@@ -2791,12 +2791,13 @@ def zero_shot_classification(
                 the label likelihoods for each sequence is 1. If true, the labels are considered independent and
                 probabilities are normalized for each candidate.
             hypothesis_template (`str`, *optional*):
-                The sentence used in conjunction with candidateLabels to attempt the text classification by replacing
-                the placeholder with the candidate labels.
+                The sentence used in conjunction with `candidate_labels` to attempt the text classification by
+                replacing the placeholder with the candidate labels.
             model (`str`, *optional*):
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
                 Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot classification model will be used.
 
+
         Returns:
             `List[ZeroShotClassificationOutputElement]`: List of [`ZeroShotClassificationOutputElement`] items containing the predicted labels and their confidence.
 
@@ -2887,12 +2888,12 @@ def zero_shot_image_classification(
         self,
         image: ContentT,
         # temporarily keeping it optional for backward compatibility.
-        candidate_labels: Optional[List[str]] = None,
+        candidate_labels: List[str] = None,  # type: ignore
         *,
         model: Optional[str] = None,
         hypothesis_template: Optional[str] = None,
         # deprecated argument
-        labels: Optional[List[str]] = None,  # type: ignore
+        labels: List[str] = None,  # type: ignore
     ) -> List[ZeroShotImageClassificationOutputElement]:
         """
         Provide input image and text labels to predict text labels for the image.
@@ -2908,8 +2909,8 @@ def zero_shot_image_classification(
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
                 Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot image classification model will be used.
             hypothesis_template (`str`, *optional*):
-                The sentence used in conjunction with candidateLabels to attempt the text classification by replacing
-                the placeholder with the candidate labels.
+                The sentence used in conjunction with `candidate_labels` to attempt the image classification by
+                replacing the placeholder with the candidate labels.
 
         Returns:
             `List[ZeroShotImageClassificationOutputElement]`: List of [`ZeroShotImageClassificationOutputElement`] items containing the predicted labels and their confidence.
 
@@ -383,7 +383,7 @@ async def audio_classification(
             top_k (`int`, *optional*):
                 When specified, limits the output to the top K most probable classes.
             function_to_apply (`"AudioClassificationOutputTransform"`, *optional*):
-                The function to apply to the output.
+                The function to apply to the model outputs in order to retrieve the scores.
 
         Returns:
             `List[AudioClassificationOutputElement]`: List of [`AudioClassificationOutputElement`] items containing the predicted labels and their confidence.
@@ -1025,7 +1025,7 @@ async def document_question_answering(
         >>> from huggingface_hub import AsyncInferenceClient
         >>> client = AsyncInferenceClient()
         >>> await client.document_question_answering(image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", question="What is the invoice number?")
-        [DocumentQuestionAnsweringOutputElement(answer='us-001', end=16, score=0.9999666213989258, start=16, words=None)]
+        [DocumentQuestionAnsweringOutputElement(answer='us-001', end=16, score=0.9999666213989258, start=16)]
         ```
         """
         inputs: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
@@ -1178,7 +1178,7 @@ async def image_classification(
                 The model to use for image classification. Can be a model ID hosted on the Hugging Face Hub or a URL to a
                 deployed Inference Endpoint. If not provided, the default recommended model for image classification will be used.
             function_to_apply (`"ImageClassificationOutputTransform"`, *optional*):
-                The function to apply to the output.
+                The function to apply to the model outputs in order to retrieve the scores.
             top_k (`int`, *optional*):
                 When specified, limits the output to the top K most probable classes.
         Returns:
@@ -1874,7 +1874,7 @@ async def text_classification(
             top_k (`int`, *optional*):
                 When specified, limits the output to the top K most probable classes.
             function_to_apply (`"TextClassificationOutputTransform"`, *optional*):
-                The function to apply to the output.
+                The function to apply to the model outputs in order to retrieve the scores.
 
         Returns:
             `List[TextClassificationOutputElement]`: a list of [`TextClassificationOutputElement`] items containing the predicted label and associated probability.
@@ -2549,11 +2549,11 @@ async def text_to_speech(
             max_length (`int`, *optional*):
                 The maximum length (in tokens) of the generated text, including the input.
             max_new_tokens (`int`, *optional*):
-                The maximum number of tokens to generate. Takes precedence over maxLength.
+                The maximum number of tokens to generate. Takes precedence over max_length.
             min_length (`int`, *optional*):
                 The minimum length (in tokens) of the generated text, including the input.
             min_new_tokens (`int`, *optional*):
-                The minimum number of tokens to generate. Takes precedence over maxLength.
+                The minimum number of tokens to generate. Takes precedence over min_length.
             num_beam_groups (`int`, *optional*):
                 Number of groups to divide num_beams into in order to ensure diversity among different groups of beams.
                 See [this paper](https://hf.co/papers/1610.02424) for more details.
@@ -2860,12 +2860,13 @@ async def zero_shot_classification(
                 the label likelihoods for each sequence is 1. If true, the labels are considered independent and
                 probabilities are normalized for each candidate.
             hypothesis_template (`str`, *optional*):
-                The sentence used in conjunction with candidateLabels to attempt the text classification by replacing
-                the placeholder with the candidate labels.
+                The sentence used in conjunction with `candidate_labels` to attempt the text classification by
+                replacing the placeholder with the candidate labels.
             model (`str`, *optional*):
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
                 Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot classification model will be used.
 
+
         Returns:
             `List[ZeroShotClassificationOutputElement]`: List of [`ZeroShotClassificationOutputElement`] items containing the predicted labels and their confidence.
 
@@ -2958,12 +2959,12 @@ async def zero_shot_image_classification(
         self,
         image: ContentT,
         # temporarily keeping it optional for backward compatibility.
-        candidate_labels: Optional[List[str]] = None,
+        candidate_labels: List[str] = None,  # type: ignore
         *,
         model: Optional[str] = None,
         hypothesis_template: Optional[str] = None,
         # deprecated argument
-        labels: Optional[List[str]] = None,  # type: ignore
+        labels: List[str] = None,  # type: ignore
     ) -> List[ZeroShotImageClassificationOutputElement]:
         """
         Provide input image and text labels to predict text labels for the image.
@@ -2979,8 +2980,8 @@ async def zero_shot_image_classification(
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
                 Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot image classification model will be used.
             hypothesis_template (`str`, *optional*):
-                The sentence used in conjunction with candidateLabels to attempt the text classification by replacing
-                the placeholder with the candidate labels.
+                The sentence used in conjunction with `candidate_labels` to attempt the image classification by
+                replacing the placeholder with the candidate labels.
 
         Returns:
             `List[ZeroShotImageClassificationOutputElement]`: List of [`ZeroShotImageClassificationOutputElement`] items containing the predicted labels and their confidence.
 
@@ -168,19 +168,17 @@
 )
 from .zero_shot_classification import (
     ZeroShotClassificationInput,
-    ZeroShotClassificationInputData,
     ZeroShotClassificationOutputElement,
     ZeroShotClassificationParameters,
 )
 from .zero_shot_image_classification import (
     ZeroShotImageClassificationInput,
-    ZeroShotImageClassificationInputData,
     ZeroShotImageClassificationOutputElement,
     ZeroShotImageClassificationParameters,
 )
 from .zero_shot_object_detection import (
     ZeroShotObjectDetectionBoundingBox,
     ZeroShotObjectDetectionInput,
-    ZeroShotObjectDetectionInputData,
     ZeroShotObjectDetectionOutputElement,
+    ZeroShotObjectDetectionParameters,
 )
@@ -14,12 +14,10 @@
 
 @dataclass
 class AudioClassificationParameters(BaseInferenceType):
-    """Additional inference parameters
-    Additional inference parameters for Audio Classification
-    """
+    """Additional inference parameters for Audio Classification"""
 
     function_to_apply: Optional["AudioClassificationOutputTransform"] = None
-    """The function to apply to the output."""
+    """The function to apply to the model outputs in order to retrieve the scores."""
     top_k: Optional[int] = None
     """When specified, limits the output to the top K most probable classes."""
 
@@ -33,7 +31,7 @@ class AudioClassificationInput(BaseInferenceType):
     also provide the audio data as a raw bytes payload.
     """
     parameters: Optional[AudioClassificationParameters] = None
-    """Additional inference parameters"""
+    """Additional inference parameters for Audio Classification"""
 
 
 @dataclass
 
@@ -14,9 +14,7 @@
 
 @dataclass
 class AutomaticSpeechRecognitionGenerationParameters(BaseInferenceType):
-    """Parametrization of the text generation process
-    Ad-hoc parametrization of the text generation process
-    """
+    """Parametrization of the text generation process"""
 
     do_sample: Optional[bool] = None
     """Whether to use sampling instead of greedy decoding when generating new tokens."""
@@ -76,11 +74,9 @@ class AutomaticSpeechRecognitionGenerationParameters(BaseInferenceType):
 
 @dataclass
 class AutomaticSpeechRecognitionParameters(BaseInferenceType):
-    """Additional inference parameters
-    Additional inference parameters for Automatic Speech Recognition
-    """
+    """Additional inference parameters for Automatic Speech Recognition"""
 
-    generate: Optional[AutomaticSpeechRecognitionGenerationParameters] = None
+    generation_parameters: Optional[AutomaticSpeechRecognitionGenerationParameters] = None
     """Parametrization of the text generation process"""
     return_timestamps: Optional[bool] = None
     """Whether to output corresponding timestamps with the generated text"""
@@ -95,7 +91,7 @@ class AutomaticSpeechRecognitionInput(BaseInferenceType):
     also provide the audio data as a raw bytes payload.
     """
     parameters: Optional[AutomaticSpeechRecognitionParameters] = None
-    """Additional inference parameters"""
+    """Additional inference parameters for Automatic Speech Recognition"""
 
 
 @dataclass
 
@@ -16,7 +16,7 @@ class DepthEstimationInput(BaseInferenceType):
     inputs: Any
     """The input image data"""
     parameters: Optional[Dict[str, Any]] = None
-    """Additional inference parameters"""
+    """Additional inference parameters for Depth Estimation"""
 
 
 @dataclass
Original file line number	Diff line number	Diff line change
`@@ -168,19 +168,17 @@`
`168`	`168`	`)`
`169`	`169`	`from .zero_shot_classification import (`
`170`	`170`	`ZeroShotClassificationInput,`
`171`		`- ZeroShotClassificationInputData,`
`172`	`171`	`ZeroShotClassificationOutputElement,`
`173`	`172`	`ZeroShotClassificationParameters,`
`174`	`173`	`)`
`175`	`174`	`from .zero_shot_image_classification import (`
`176`	`175`	`ZeroShotImageClassificationInput,`
`177`		`- ZeroShotImageClassificationInputData,`
`178`	`176`	`ZeroShotImageClassificationOutputElement,`
`179`	`177`	`ZeroShotImageClassificationParameters,`
`180`	`178`	`)`
`181`	`179`	`from .zero_shot_object_detection import (`
`182`	`180`	`ZeroShotObjectDetectionBoundingBox,`
`183`	`181`	`ZeroShotObjectDetectionInput,`
`184`		`- ZeroShotObjectDetectionInputData,`
`185`	`182`	`ZeroShotObjectDetectionOutputElement,`
	`183`	`+ ZeroShotObjectDetectionParameters,`
`186`	`184`	`)`