From 6b0236d2c3aaed37973f170b34d7c709f98a5010 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 16 Oct 2025 12:45:44 +0000 Subject: [PATCH] feat(multimodal): add Video support for Gemini with structured outputs - Add Video and VideoWithGenaiFile classes to multimodal.py - Support video file upload and processing with Gemini API - Export Video classes from instructor package - Update multimodal-gemini blog post with current API - Add complete video extraction example with Gemini 2.5 Pro - Update autodetect_media to handle video files - Support multiple video formats (MP4, MOV, WebM, etc.) Resolves #1850 Co-Authored-By: Jason Liu --- docs/blog/posts/multimodal-gemini.md | 95 ++++++-- examples/video-extraction-gemini/README.md | 85 +++++++ examples/video-extraction-gemini/run.py | 110 +++++++++ instructor/__init__.py | 4 +- instructor/processing/multimodal.py | 251 +++++++++++++++++++-- 5 files changed, 518 insertions(+), 27 deletions(-) create mode 100644 examples/video-extraction-gemini/README.md create mode 100644 examples/video-extraction-gemini/run.py diff --git a/docs/blog/posts/multimodal-gemini.md b/docs/blog/posts/multimodal-gemini.md index cc40ee79d..c79d162d8 100644 --- a/docs/blog/posts/multimodal-gemini.md +++ b/docs/blog/posts/multimodal-gemini.md @@ -22,14 +22,20 @@ In this post, we'll explore how to use Google's Gemini model with Instructor to ## Setting Up the Environment -First, let's set up our environment with the necessary libraries: - -```python +First, install the required dependencies: +```bash +pip install "instructor[google-genai]" pydantic ``` +Make sure you have your Google API key set as an environment variable: + +```bash +export GOOGLE_API_KEY=your_api_key_here +``` + ## Defining Our Data Models We'll use Pydantic to define our data models for tourist destinations and recommendations: @@ -49,32 +55,37 @@ class Recommendations(BaseModel): ## Initializing the Gemini Client -Next, we'll set up our Gemini client using Instructor: +Next, set up the Gemini client using Instructor's unified provider interface: ```python -client = instructor.from_gemini( - client=genai.GenerativeModel( - model_name="models/gemini-1.5-flash-latest", - ), +import instructor + +client = instructor.from_provider( + "google/gemini-2.0-flash-exp", + async_client=False, ) ``` ## Uploading and Processing the Video -To analyze a video, we first need to upload it: +To analyze a video, first upload it using `VideoWithGenaiFile`: ```python -file = genai.upload_file("./takayama.mp4") +# Upload the video and wait for processing +video = instructor.VideoWithGenaiFile.from_new_genai_file("./takayama.mp4") ``` -Then, we can process the video and extract recommendations: +Then, process the video and extract recommendations: ```python -resp = client.chat.completions.create( +resp = client.messages.create( messages=[ { "role": "user", - "content": ["What places do they recommend in this video?", file], + "content": [ + "What places do they recommend in this video?", + video, + ], } ], response_model=Recommendations, @@ -217,9 +228,67 @@ To address these limitations and expand the capabilities of our video analysis s By addressing these challenges and exploring these new directions, we can create a more comprehensive and nuanced video analysis system, opening up even more possibilities for applications in travel, education, and beyond. 
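+
+## What Happens During Upload
+
+`VideoWithGenaiFile.from_new_genai_file` blocks until the Gemini Files API reports the upload as `ACTIVE`, so the video is ready to reference by URI as soon as the call returns. If you want to manage uploads yourself, the flow with the raw `google-genai` client looks roughly like this (a simplified sketch of what the helper does internally; the polling interval and file name are illustrative):
+
+```python
+import time
+
+from google.genai import Client
+from google.genai.types import FileState
+
+client = Client()
+
+# Upload the file, then poll until Gemini finishes processing it
+file = client.files.upload(file="./takayama.mp4")
+while file.state != FileState.ACTIVE:
+    time.sleep(10)  # illustrative polling interval
+    file = client.files.get(name=file.name)
+
+print(f"Ready to use: {file.uri}")
+```
+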
+## Complete Working Example
+
+Here's a complete, runnable example that you can use with your own videos:
+
+```python
+import instructor
+from pydantic import BaseModel
+
+
+class TouristDestination(BaseModel):
+    name: str
+    description: str
+    location: str
+
+
+class Recommendations(BaseModel):
+    chain_of_thought: str
+    description: str
+    destinations: list[TouristDestination]
+
+
+def analyze_video(video_path: str):
+    # Initialize the client
+    client = instructor.from_provider(
+        "google/gemini-2.0-flash-exp",
+        async_client=False,
+    )
+
+    # Upload the video
+    video = instructor.VideoWithGenaiFile.from_new_genai_file(video_path)
+
+    # Extract structured data
+    recommendations = client.messages.create(
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    "What tourist destinations do they recommend in this video?",
+                    video,
+                ],
+            }
+        ],
+        response_model=Recommendations,
+    )
+
+    return recommendations
+
+
+# Usage
+results = analyze_video("./travel_video.mp4")
+for dest in results.destinations:
+    print(f"{dest.name} - {dest.location}")
+    print(f"  {dest.description}\n")
+```
+
+For a more detailed example with proper error handling and formatting, check out our [video extraction example](https://github.com/instructor-ai/instructor/tree/main/examples/video-extraction-gemini).
+
 ## Related Documentation
 
 - [Multimodal Concepts](../../concepts/multimodal.md) - Working with images, video, and audio
 - [Google Integration](../../integrations/google.md) - Complete Gemini setup guide
+- [Video Extraction Example](https://github.com/instructor-ai/instructor/tree/main/examples/video-extraction-gemini) - Complete working example for video analysis
 
 ## See Also
 
 - [OpenAI Multimodal](openai-multimodal.md) - Compare multimodal approaches
diff --git a/examples/video-extraction-gemini/README.md b/examples/video-extraction-gemini/README.md
new file mode 100644
index 000000000..514b0cdb0
--- /dev/null
+++ b/examples/video-extraction-gemini/README.md
@@ -0,0 +1,85 @@
+# Video Analysis with Gemini 2.5 Pro
+
+This example demonstrates how to use Google's Gemini 2.5 Pro model with Instructor to analyze videos and extract structured information about tourist destinations.
+
+## Features
+
+- Upload videos to Gemini API using `VideoWithGenaiFile`
+- Extract structured recommendations using Pydantic models
+- Support for analyzing travel content and tourist destinations
+- Type-safe structured outputs
+
+## Requirements
+
+```bash
+pip install instructor google-genai pydantic
+```
+
+## Setup
+
+1. Get your Google API key from [Google AI Studio](https://makersuite.google.com/app/apikey)
+2. Set the environment variable:
+
+```bash
+export GOOGLE_API_KEY=your_api_key_here
+```
+
+## Usage
+
+Run the script with a path to your video file:
+
+```bash
+python run.py path/to/your/video.mp4
+```
+
+Example with a travel video:
+
+```bash
+python run.py takayama_travel.mp4
+```
+
+## How It Works
+
+The example:
+
+1. Uploads your video file to the Gemini API using `VideoWithGenaiFile.from_new_genai_file()`
+2. Sends a prompt asking for tourist destination recommendations
+3. Uses Instructor to parse the response into structured Pydantic models
+4. Returns a list of destinations with names, descriptions, and locations
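+
+In code, the whole flow is only a few lines (a minimal sketch; the wrapper model and prompt are illustrative, and the model name matches this example's default):
+
+```python
+import instructor
+from pydantic import BaseModel
+
+
+class Places(BaseModel):
+    destinations: list[str]
+
+
+client = instructor.from_provider("google/gemini-2.5-pro", async_client=False)
+
+# Upload the file and block until Gemini has finished processing it
+video = instructor.VideoWithGenaiFile.from_new_genai_file("video.mp4")
+
+# The response is parsed and validated into the Pydantic model
+places = client.messages.create(
+    messages=[{"role": "user", "content": ["List the places recommended in this video.", video]}],
+    response_model=Places,
+)
+print(places.destinations)
+```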
+
+## Output Structure
+
+The analysis returns:
+
+- **chain_of_thought**: Detailed reasoning about the video content
+- **description**: Overall summary of the video
+- **destinations**: List of tourist destinations, each with:
+  - name: Name of the destination
+  - description: What makes it interesting
+  - location: Where it's located
+
+## Supported Video Formats
+
+Gemini supports the following video formats:
+
+- MP4
+- MPEG
+- MOV (QuickTime)
+- AVI
+- FLV
+- MPG
+- WebM
+- WMV
+- 3GPP
+
+## Notes
+
+- Video files are uploaded to Google's servers for processing
+- Large videos may take longer to upload and process
+- The API automatically waits for the upload to complete before processing
+
+## Related Examples
+
+- [Multimodal Gemini Guide](../../docs/blog/posts/multimodal-gemini.md)
+- [Image Analysis with Gemini](../vision/)
+- [PDF Processing with Gemini](../../docs/blog/posts/chat-with-your-pdf-with-gemini.md)
diff --git a/examples/video-extraction-gemini/run.py b/examples/video-extraction-gemini/run.py
new file mode 100644
index 000000000..e5c2b8c54
--- /dev/null
+++ b/examples/video-extraction-gemini/run.py
@@ -0,0 +1,110 @@
+"""
+Video Analysis with Gemini 2.5 Pro
+
+This example demonstrates how to use Gemini 2.5 Pro with Instructor to analyze videos
+and extract structured information. We'll process a video and extract tourist destinations
+mentioned in it.
+
+Requirements:
+    pip install instructor google-genai pydantic
+
+Usage:
+    export GOOGLE_API_KEY=your_api_key_here
+
+    python run.py path/to/your/video.mp4
+"""
+
+import instructor
+from pydantic import BaseModel
+import sys
+
+
+class TouristDestination(BaseModel):
+    """Represents a tourist destination mentioned in the video."""
+
+    name: str
+    description: str
+    location: str
+
+
+class VideoRecommendations(BaseModel):
+    """Structured output containing recommendations from the video."""
+
+    chain_of_thought: str
+    description: str
+    destinations: list[TouristDestination]
+
+
+def analyze_video(video_path: str):
+    """
+    Analyze a video and extract tourist destination recommendations.
+
+    Args:
+        video_path: Path to the video file to analyze
+
+    Returns:
+        VideoRecommendations object containing structured data
+    """
+    client = instructor.from_provider(
+        "google/gemini-2.5-pro",
+        async_client=False,
+    )
+
+    print(f"Uploading video: {video_path}")
+    video = instructor.VideoWithGenaiFile.from_new_genai_file(video_path)
+    print(f"Video uploaded successfully: {video.source}")
+
+    print("Analyzing video content...")
+    recommendations = client.messages.create(
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    "What tourist destinations and places do they recommend in this video? "
+                    "Provide a detailed analysis including the name, description, and location of each place.",
+                    video,
+                ],
+            }
+        ],
+        response_model=VideoRecommendations,
+    )
+
+    return recommendations
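+
+
+def analyze_uploaded_video(file_name: str):
+    """Variant of analyze_video that reuses an already uploaded Gemini file.
+
+    Optional and illustrative: main() below does not call this. It shows
+    VideoWithGenaiFile.from_existing_genai_file, which skips re-uploading
+    when you already have a files/... name from a previous run.
+    """
+    client = instructor.from_provider(
+        "google/gemini-2.5-pro",
+        async_client=False,
+    )
+
+    # Reference the existing upload instead of uploading again
+    video = instructor.VideoWithGenaiFile.from_existing_genai_file(file_name)
+
+    return client.messages.create(
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    "What tourist destinations and places do they recommend in this video?",
+                    video,
+                ],
+            }
+        ],
+        response_model=VideoRecommendations,
+    )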
" + "Provide a detailed analysis including the name, description, and location of each place.", + video, + ], + } + ], + response_model=VideoRecommendations, + ) + + return recommendations + + +def main(): + """Main function to run the video analysis.""" + if len(sys.argv) < 2: + print("Usage: python run.py ") + print("Example: python run.py travel_video.mp4") + sys.exit(1) + + video_path = sys.argv[1] + + try: + results = analyze_video(video_path) + + print("\n" + "=" * 80) + print("VIDEO ANALYSIS RESULTS") + print("=" * 80) + + print(f"\nOverview: {results.description}") + print(f"\nAnalysis: {results.chain_of_thought}") + + print(f"\nDestinations Found: {len(results.destinations)}") + print("-" * 80) + + for i, dest in enumerate(results.destinations, 1): + print(f"\n{i}. {dest.name}") + print(f" Location: {dest.location}") + print(f" Description: {dest.description}") + + print("\n" + "=" * 80) + + except Exception as e: + print(f"Error analyzing video: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/instructor/__init__.py b/instructor/__init__.py index 2ca9f4f6e..db7e27306 100644 --- a/instructor/__init__.py +++ b/instructor/__init__.py @@ -1,7 +1,7 @@ import importlib.util from .mode import Mode -from .processing.multimodal import Image, Audio +from .processing.multimodal import Image, Audio, Video, VideoWithGenaiFile from .dsl import ( CitationMixin, @@ -38,6 +38,8 @@ "Instructor", "Image", "Audio", + "Video", + "VideoWithGenaiFile", "from_openai", "from_litellm", "from_provider", diff --git a/instructor/processing/multimodal.py b/instructor/processing/multimodal.py index fc907eba7..aec2d4331 100644 --- a/instructor/processing/multimodal.py +++ b/instructor/processing/multimodal.py @@ -41,6 +41,18 @@ "audio/webm", ] VALID_PDF_MIME_TYPES = ["application/pdf"] +VALID_VIDEO_MIME_TYPES = [ + "video/mp4", + "video/mpeg", + "video/mov", + "video/avi", + "video/x-flv", + "video/mpg", + "video/webm", + "video/wmv", + "video/3gpp", + "video/quicktime", +] CacheControlType = Mapping[str, str] OptionalCacheControlType = Optional[CacheControlType] @@ -458,6 +470,213 @@ def to_genai(self): ) +class Video(BaseModel): + """Represents a video that can be loaded from a URL or file path.""" + + source: Union[str, Path] = Field(description="URL or file path of the video") # noqa: UP007 + data: Union[str, None] = Field( # noqa: UP007 + None, description="Base64 encoded video data", repr=False + ) + media_type: str = Field(description="MIME type of the video") + + @classmethod + def autodetect(cls, source: str | Path) -> Video: + """Attempt to autodetect a video from a source string or Path.""" + if isinstance(source, str): + if cls.is_base64(source): + return cls.from_base64(source) + if source.startswith(("http://", "https://")): + return cls.from_url(source) + if source.startswith("gs://"): + return cls.from_gs_url(source) + try: + path = Path(source) + if path.is_file(): + return cls.from_path(path) + except OSError: + pass + + raise ValueError("Unable to determine video source") + + if isinstance(source, Path): + return cls.from_path(source) + + @classmethod + def autodetect_safely(cls, source: Union[str, Path]) -> Union[Video, str]: # noqa: UP007 + """Safely attempt to autodetect a video from a source string or path. + + Args: + source (Union[str,path]): The source string or path. + Returns: + A Video if the source is detected to be a valid video, otherwise + the source itself as a string. 
+ """ + try: + return cls.autodetect(source) + except ValueError: + return str(source) + + @classmethod + def is_base64(cls, s: str) -> bool: + return bool(re.match(r"^data:video/[a-zA-Z0-9+-]+;base64,", s)) + + @classmethod + def from_base64(cls, data_uri: str) -> Video: + header, encoded = data_uri.split(",", 1) + media_type = header.split(":")[1].split(";")[0] + if media_type not in VALID_VIDEO_MIME_TYPES: + raise ValueError(f"Unsupported video format: {media_type}") + return cls( + source=data_uri, + media_type=media_type, + data=encoded, + ) + + @classmethod + def from_url(cls, url: str) -> Video: + """Create a Video instance from a URL.""" + if url.startswith("gs://"): + return cls.from_gs_url(url) + response = requests.get(url) + content_type = response.headers.get("content-type") + assert content_type in VALID_VIDEO_MIME_TYPES, ( + f"Invalid video format. Must be one of: {', '.join(VALID_VIDEO_MIME_TYPES)}" + ) + + data = base64.b64encode(response.content).decode("utf-8") + return cls(source=url, data=data, media_type=content_type) + + @classmethod + def from_path(cls, path: Union[str, Path]) -> Video: # noqa: UP007 + """Create a Video instance from a file path.""" + path = Path(path) + assert path.is_file(), f"Video file not found: {path}" + + mime_type = mimetypes.guess_type(str(path))[0] + + assert mime_type in VALID_VIDEO_MIME_TYPES, ( + f"Invalid video format. Must be one of: {', '.join(VALID_VIDEO_MIME_TYPES)}" + ) + + data = base64.b64encode(path.read_bytes()).decode("utf-8") + return cls(source=str(path), data=data, media_type=mime_type) + + @classmethod + def from_gs_url(cls, data_uri: str, timeout: int = 30) -> Video: + """ + Create a Video instance from a Google Cloud Storage URL. + + Args: + data_uri: GCS URL starting with gs:// + timeout: Request timeout in seconds (default: 30) + """ + if not data_uri.startswith("gs://"): + raise ValueError("URL must start with gs://") + + public_url = f"https://storage.googleapis.com/{data_uri[5:]}" + + try: + response = requests.get(public_url, timeout=timeout) + response.raise_for_status() + media_type = response.headers.get("Content-Type") + if media_type not in VALID_VIDEO_MIME_TYPES: + raise ValueError(f"Unsupported video format: {media_type}") + + data = base64.b64encode(response.content).decode("utf-8") + + return cls(source=data_uri, media_type=media_type, data=data) + except requests.RequestException as e: + raise ValueError( + "Failed to access GCS video (must be publicly readable)" + ) from e + + def to_openai(self, mode: Mode) -> dict[str, Any]: + """Convert the Video instance to OpenAI's API format.""" + raise NotImplementedError("OpenAI does not support video inputs yet") + + def to_anthropic(self) -> dict[str, Any]: + raise NotImplementedError("Anthropic does not support video inputs yet") + + def to_genai(self): + """ + Convert the Video instance to Google GenAI's API format. + """ + try: + from google.genai import types + except ImportError as err: + raise ImportError( + "google-genai package is required for GenAI integration. 
Install with: pip install google-genai"
+            ) from err
+
+        return types.Part.from_bytes(
+            data=base64.b64decode(self.data),  # type: ignore
+            mime_type=self.media_type,
+        )
+
+
+class VideoWithGenaiFile(Video):
+    @classmethod
+    def from_new_genai_file(
+        cls, file_path: str, retry_delay: int = 10, max_retries: int = 20
+    ) -> VideoWithGenaiFile:
+        """Upload a local file to the Gemini Files API and poll until it is ACTIVE."""
+        from google.genai.types import FileState
+        import time
+        from google.genai import Client
+
+        client = Client()
+        file = client.files.upload(file=file_path)
+        while file.state != FileState.ACTIVE:
+            if file.state == FileState.FAILED:
+                raise RuntimeError(f"Gemini rejected the uploaded file: {file.name}")
+            if max_retries <= 0:
+                raise TimeoutError(
+                    "Max retries reached. File upload has been started but is still pending"
+                )
+            max_retries -= 1
+            time.sleep(retry_delay)
+            file = client.files.get(name=file.name)  # type: ignore
+
+        return cls(source=file.uri, media_type=file.mime_type, data=None)  # type: ignore
+
+    @classmethod
+    def from_existing_genai_file(cls, file_name: str) -> VideoWithGenaiFile:
+        """Create a VideoWithGenaiFile from an already uploaded file."""
+        from google.genai import types
+        from google.genai.types import FileState
+        from google.genai import Client
+
+        client = Client()
+        file = client.files.get(name=file_name)
+        if file.source == types.FileSource.UPLOADED and file.state == FileState.ACTIVE:
+            return cls(
+                source=file.uri,  # type: ignore
+                media_type=file.mime_type,  # type: ignore
+                data=None,
+            )
+        else:
+            raise ValueError("Only uploaded files in the ACTIVE state are supported for now")
+
+    def to_genai(self):
+        try:
+            from google.genai import types
+        except ImportError as err:
+            raise ImportError(
+                "google-genai package is required for GenAI integration. Install with: pip install google-genai"
+            ) from err
+
+        if (
+            self.source
+            and isinstance(self.source, str)
+            and "https://generativelanguage.googleapis.com/v1beta/files/" in self.source
+        ):
+            return types.Part.from_uri(
+                file_uri=self.source,
+                mime_type=self.media_type,
+            )
+
+        return super().to_genai()
+
+
 class ImageWithCacheControl(Image):
     """Image with Anthropic prompt caching support."""
 
@@ -841,14 +1060,15 @@ def convert_contents(
         dict[str, Any],
         Image,
         Audio,
-        list[Union[str, dict[str, Any], Image, Audio]],  # noqa: UP007
+        Video,
+        list[Union[str, dict[str, Any], Image, Audio, Video]],  # noqa: UP007
     ],
     mode: Mode,
 ) -> Union[str, list[dict[str, Any]]]:  # noqa: UP007
     """Convert content items to the appropriate format based on the specified mode."""
     if isinstance(contents, str):
         return contents
-    if isinstance(contents, (Image, Audio, PDF)) or isinstance(contents, dict):
+    if isinstance(contents, (Image, Audio, Video, PDF)) or isinstance(contents, dict):
         contents = [contents]
 
     converted_contents: list[dict[str, Union[str, Image]]] = []  # noqa: UP007
@@ -862,7 +1082,7 @@ def convert_contents(
             converted_contents.append({"type": text_file_type, "text": content})
         elif isinstance(content, dict):
             converted_contents.append(content)
-        elif isinstance(content, (Image, Audio, PDF)):
+        elif isinstance(content, (Image, Audio, Video, PDF)):
             if mode in {
                 Mode.ANTHROPIC_JSON,
                 Mode.ANTHROPIC_TOOLS,
@@ -884,18 +1104,18 @@ def convert_contents(
 
 
 def autodetect_media(
-    source: str | Path | Image | Audio | PDF,
-) -> Image | Audio | PDF | str:
-    """Autodetect images, audio, or PDFs from a given source.
+    source: str | Path | Image | Audio | Video | PDF,
+) -> Image | Audio | Video | PDF | str:
+    """Autodetect images, audio, videos, or PDFs from a given source.
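+
+    Detection order: data-URI prefixes are checked first, then a mimetype
+    guess on the source string, and finally each media class's own
+    ``autodetect_safely`` as a fallback.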
Args: source: URL, file path, Path, or data URI to inspect. Returns: - The detected :class:`Image`, :class:`Audio`, or :class:`PDF` instance. + The detected :class:`Image`, :class:`Audio`, :class:`Video`, or :class:`PDF` instance. If detection fails, the original source is returned. """ - if isinstance(source, (Image, Audio, PDF)): + if isinstance(source, (Image, Audio, Video, PDF)): return source # Normalize once for cheap checks and mimetype guess @@ -905,6 +1125,8 @@ def autodetect_media( return Image.autodetect_safely(source) if source.startswith("data:audio/"): return Audio.autodetect_safely(source) + if source.startswith("data:video/"): + return Video.autodetect_safely(source) if source.startswith("data:application/pdf"): return PDF.autodetect_safely(source) @@ -913,10 +1135,12 @@ def autodetect_media( return Image.autodetect_safely(source) if media_type in VALID_AUDIO_MIME_TYPES: return Audio.autodetect_safely(source) + if media_type in VALID_VIDEO_MIME_TYPES: + return Video.autodetect_safely(source) if media_type in VALID_PDF_MIME_TYPES: return PDF.autodetect_safely(source) - for cls in (Image, Audio, PDF): + for cls in (Image, Audio, Video, PDF): item = cls.autodetect_safely(source) # type: ignore[arg-type] if not isinstance(item, str): return item @@ -932,8 +1156,9 @@ def convert_messages( dict[str, Any], Image, Audio, + Video, PDF, - list[Union[str, dict[str, Any], Image, Audio, PDF]], # noqa: UP007 + list[Union[str, dict[str, Any], Image, Audio, Video, PDF]], # noqa: UP007 ], ] ], @@ -948,7 +1173,7 @@ def is_image_params(x: Any) -> bool: for message in messages: if "type" in message: - if message["type"] in {"audio", "image"}: + if message["type"] in {"audio", "image", "video"}: converted_messages.append(message) # type: ignore else: raise ValueError(f"Unsupported message type: {message['type']}") @@ -959,7 +1184,7 @@ def is_image_params(x: Any) -> bool: } if autodetect_images: if isinstance(content, list): - new_content: list[str | dict[str, Any] | Image | Audio | PDF] = [] # noqa: UP007 + new_content: list[str | dict[str, Any] | Image | Audio | Video | PDF] = [] # noqa: UP007 for item in content: if isinstance(item, str): new_content.append(autodetect_media(item)) @@ -1023,7 +1248,7 @@ def extract_genai_multimodal_content( if content_part.text and autodetect_images: converted_item = autodetect_media(content_part.text) - if isinstance(converted_item, (Image, Audio, PDF)): + if isinstance(converted_item, (Image, Audio, Video, PDF)): converted_contents.append(converted_item.to_genai()) continue
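
---

A minimal smoke test for the new video dispatch in `autodetect_media` (illustrative; `clip.mp4` is a placeholder and the local-file branch assumes the file exists on disk):

```python
import instructor
from instructor.processing.multimodal import autodetect_media

# Data URIs dispatch on the "data:video/" prefix; no file is needed
inline = autodetect_media("data:video/mp4;base64,AAAA")
assert isinstance(inline, instructor.Video)

# Local paths dispatch on the guessed MIME type (requires clip.mp4 to exist)
local = autodetect_media("clip.mp4")
assert isinstance(local, instructor.Video)
```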