From 6b0236d2c3aaed37973f170b34d7c709f98a5010 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 16 Oct 2025 12:45:44 +0000 Subject: [PATCH] feat(multimodal): add Video support for Gemini with structured outputs - Add Video and VideoWithGenaiFile classes to multimodal.py - Support video file upload and processing with Gemini API - Export Video classes from instructor package - Update multimodal-gemini blog post with current API - Add complete video extraction example with Gemini 2.5 Pro - Update autodetect_media to handle video files - Support multiple video formats (MP4, MOV, WebM, etc.) Resolves #1850 Co-Authored-By: Jason Liu --- docs/blog/posts/multimodal-gemini.md | 95 ++++++-- examples/video-extraction-gemini/README.md | 85 +++++++ examples/video-extraction-gemini/run.py | 110 +++++++++ instructor/__init__.py | 4 +- instructor/processing/multimodal.py | 251 +++++++++++++++++++-- 5 files changed, 518 insertions(+), 27 deletions(-) create mode 100644 examples/video-extraction-gemini/README.md create mode 100644 examples/video-extraction-gemini/run.py diff --git a/docs/blog/posts/multimodal-gemini.md b/docs/blog/posts/multimodal-gemini.md index cc40ee79d..c79d162d8 100644 --- a/docs/blog/posts/multimodal-gemini.md +++ b/docs/blog/posts/multimodal-gemini.md @@ -22,14 +22,20 @@ In this post, we'll explore how to use Google's Gemini model with Instructor to ## Setting Up the Environment -First, let's set up our environment with the necessary libraries: - -```python +First, install the required dependencies: +```bash +pip install "instructor[google-genai]" pydantic ``` +Make sure you have your Google API key set as an environment variable: + +```bash +export GOOGLE_API_KEY=your_api_key_here +``` + ## Defining Our Data Models We'll use Pydantic to define our data models for tourist destinations and recommendations: @@ -49,32 +55,37 @@ class Recommendations(BaseModel): ## Initializing the Gemini Client -Next, we'll set up our Gemini client using Instructor: +Next, set up the Gemini client using Instructor's unified provider interface: ```python -client = instructor.from_gemini( - client=genai.GenerativeModel( - model_name="models/gemini-1.5-flash-latest", - ), +import instructor + +client = instructor.from_provider( + "google/gemini-2.0-flash-exp", + async_client=False, ) ``` ## Uploading and Processing the Video -To analyze a video, we first need to upload it: +To analyze a video, first upload it using `VideoWithGenaiFile`: ```python -file = genai.upload_file("./takayama.mp4") +# Upload the video and wait for processing +video = instructor.VideoWithGenaiFile.from_new_genai_file("./takayama.mp4") ``` -Then, we can process the video and extract recommendations: +Then, process the video and extract recommendations: ```python -resp = client.chat.completions.create( +resp = client.messages.create( messages=[ { "role": "user", - "content": ["What places do they recommend in this video?", file], + "content": [ + "What places do they recommend in this video?", + video, + ], } ], response_model=Recommendations, @@ -217,9 +228,67 @@ To address these limitations and expand the capabilities of our video analysis s By addressing these challenges and exploring these new directions, we can create a more comprehensive and nuanced video analysis system, opening up even more possibilities for applications in travel, education, and beyond. 
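+
+## What Happens During Upload
+
+`VideoWithGenaiFile.from_new_genai_file` blocks until the Gemini Files API reports the upload as `ACTIVE`, so the video is ready to reference by URI as soon as the call returns. If you want to manage uploads yourself, the flow with the raw `google-genai` client looks roughly like this (a simplified sketch of what the helper does internally; the polling interval and file name are illustrative):
+
+```python
+import time
+
+from google.genai import Client
+from google.genai.types import FileState
+
+client = Client()
+
+# Upload the file, then poll until Gemini finishes processing it
+file = client.files.upload(file="./takayama.mp4")
+while file.state != FileState.ACTIVE:
+    time.sleep(10)  # illustrative polling interval
+    file = client.files.get(name=file.name)
+
+print(f"Ready to use: {file.uri}")
+```
+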
+## Complete Working Example
+
+Here's a complete, runnable example that you can use with your own videos:
+
+```python
+import instructor
+from pydantic import BaseModel
+
+
+class TouristDestination(BaseModel):
+    name: str
+    description: str
+    location: str
+
+
+class Recommendations(BaseModel):
+    chain_of_thought: str
+    description: str
+    destinations: list[TouristDestination]
+
+
+def analyze_video(video_path: str):
+    # Initialize the client
+    client = instructor.from_provider(
+        "google/gemini-2.0-flash-exp",
+        async_client=False,
+    )
+
+    # Upload the video
+    video = instructor.VideoWithGenaiFile.from_new_genai_file(video_path)
+
+    # Extract structured data
+    recommendations = client.messages.create(
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    "What tourist destinations do they recommend in this video?",
+                    video,
+                ],
+            }
+        ],
+        response_model=Recommendations,
+    )
+
+    return recommendations
+
+
+# Usage
+results = analyze_video("./travel_video.mp4")
+for dest in results.destinations:
+    print(f"{dest.name} - {dest.location}")
+    print(f"  {dest.description}\n")
+```
+
+For a more detailed example with proper error handling and formatting, check out our [video extraction example](https://github.com/instructor-ai/instructor/tree/main/examples/video-extraction-gemini).
+
 ## Related Documentation
 
 - [Multimodal Concepts](../../concepts/multimodal.md) - Working with images, video, and audio
 - [Google Integration](../../integrations/google.md) - Complete Gemini setup guide
+- [Video Extraction Example](https://github.com/instructor-ai/instructor/tree/main/examples/video-extraction-gemini) - Complete working example for video analysis
 
 ## See Also
 
 - [OpenAI Multimodal](openai-multimodal.md) - Compare multimodal approaches
diff --git a/examples/video-extraction-gemini/README.md b/examples/video-extraction-gemini/README.md
new file mode 100644
index 000000000..514b0cdb0
--- /dev/null
+++ b/examples/video-extraction-gemini/README.md
@@ -0,0 +1,85 @@
+# Video Analysis with Gemini 2.5 Pro
+
+This example demonstrates how to use Google's Gemini 2.5 Pro model with Instructor to analyze videos and extract structured information about tourist destinations.
+
+## Features
+
+- Upload videos to Gemini API using `VideoWithGenaiFile`
+- Extract structured recommendations using Pydantic models
+- Support for analyzing travel content and tourist destinations
+- Type-safe structured outputs
+
+## Requirements
+
+```bash
+pip install instructor google-genai pydantic
+```
+
+## Setup
+
+1. Get your Google API key from [Google AI Studio](https://makersuite.google.com/app/apikey)
+2. Set the environment variable:
+
+```bash
+export GOOGLE_API_KEY=your_api_key_here
+```
+
+## Usage
+
+Run the script with a path to your video file:
+
+```bash
+python run.py path/to/your/video.mp4
+```
+
+Example with a travel video:
+
+```bash
+python run.py takayama_travel.mp4
+```
+
+## How It Works
+
+The example:
+
+1. Uploads your video file to the Gemini API using `VideoWithGenaiFile.from_new_genai_file()`
+2. Sends a prompt asking for tourist destination recommendations
+3. Uses Instructor to parse the response into structured Pydantic models
+4. Returns a list of destinations with names, descriptions, and locations
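+
+In code, the whole flow is only a few lines (a minimal sketch; the wrapper model and prompt are illustrative, and the model name matches this example's default):
+
+```python
+import instructor
+from pydantic import BaseModel
+
+
+class Places(BaseModel):
+    destinations: list[str]
+
+
+client = instructor.from_provider("google/gemini-2.5-pro", async_client=False)
+
+# Upload the file and block until Gemini has finished processing it
+video = instructor.VideoWithGenaiFile.from_new_genai_file("video.mp4")
+
+# The response is parsed and validated into the Pydantic model
+places = client.messages.create(
+    messages=[{"role": "user", "content": ["List the places recommended in this video.", video]}],
+    response_model=Places,
+)
+print(places.destinations)
+```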
+
+## Output Structure
+
+The analysis returns:
+
+- **chain_of_thought**: Detailed reasoning about the video content
+- **description**: Overall summary of the video
+- **destinations**: List of tourist destinations, each with:
+  - name: Name of the destination
+  - description: What makes it interesting
+  - location: Where it's located
+
+## Supported Video Formats
+
+Gemini supports the following video formats:
+
+- MP4
+- MPEG
+- MOV (QuickTime)
+- AVI
+- FLV
+- MPG
+- WebM
+- WMV
+- 3GPP
+
+## Notes
+
+- Video files are uploaded to Google's servers for processing
+- Large videos may take longer to upload and process
+- The API automatically waits for the upload to complete before processing
+
+## Related Examples
+
+- [Multimodal Gemini Guide](../../docs/blog/posts/multimodal-gemini.md)
+- [Image Analysis with Gemini](../vision/)
+- [PDF Processing with Gemini](../../docs/blog/posts/chat-with-your-pdf-with-gemini.md)
diff --git a/examples/video-extraction-gemini/run.py b/examples/video-extraction-gemini/run.py
new file mode 100644
index 000000000..e5c2b8c54
--- /dev/null
+++ b/examples/video-extraction-gemini/run.py
@@ -0,0 +1,110 @@
+"""
+Video Analysis with Gemini 2.5 Pro
+
+This example demonstrates how to use Gemini 2.5 Pro with Instructor to analyze videos
+and extract structured information. We'll process a video and extract tourist destinations
+mentioned in it.
+
+Requirements:
+    pip install instructor google-genai pydantic
+
+Usage:
+    export GOOGLE_API_KEY=your_api_key_here
+
+    python run.py path/to/your/video.mp4
+"""
+
+import instructor
+from pydantic import BaseModel
+import sys
+
+
+class TouristDestination(BaseModel):
+    """Represents a tourist destination mentioned in the video."""
+
+    name: str
+    description: str
+    location: str
+
+
+class VideoRecommendations(BaseModel):
+    """Structured output containing recommendations from the video."""
+
+    chain_of_thought: str
+    description: str
+    destinations: list[TouristDestination]
+
+
+def analyze_video(video_path: str):
+    """
+    Analyze a video and extract tourist destination recommendations.
+
+    Args:
+        video_path: Path to the video file to analyze
+
+    Returns:
+        VideoRecommendations object containing structured data
+    """
+    client = instructor.from_provider(
+        "google/gemini-2.5-pro",
+        async_client=False,
+    )
+
+    print(f"Uploading video: {video_path}")
+    video = instructor.VideoWithGenaiFile.from_new_genai_file(video_path)
+    print(f"Video uploaded successfully: {video.source}")
+
+    print("Analyzing video content...")
+    recommendations = client.messages.create(
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    "What tourist destinations and places do they recommend in this video? "
+                    "Provide a detailed analysis including the name, description, and location of each place.",
+                    video,
+                ],
+            }
+        ],
+        response_model=VideoRecommendations,
+    )
+
+    return recommendations
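+
+
+def analyze_uploaded_video(file_name: str):
+    """Variant of analyze_video that reuses an already uploaded Gemini file.
+
+    Optional and illustrative: main() below does not call this. It shows
+    VideoWithGenaiFile.from_existing_genai_file, which skips re-uploading
+    when you already have a files/... name from a previous run.
+    """
+    client = instructor.from_provider(
+        "google/gemini-2.5-pro",
+        async_client=False,
+    )
+
+    # Reference the existing upload instead of uploading again
+    video = instructor.VideoWithGenaiFile.from_existing_genai_file(file_name)
+
+    return client.messages.create(
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    "What tourist destinations and places do they recommend in this video?",
+                    video,
+                ],
+            }
+        ],
+        response_model=VideoRecommendations,
+    )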
" + "Provide a detailed analysis including the name, description, and location of each place.", + video, + ], + } + ], + response_model=VideoRecommendations, + ) + + return recommendations + + +def main(): + """Main function to run the video analysis.""" + if len(sys.argv) < 2: + print("Usage: python run.py ") + print("Example: python run.py travel_video.mp4") + sys.exit(1) + + video_path = sys.argv[1] + + try: + results = analyze_video(video_path) + + print("\n" + "=" * 80) + print("VIDEO ANALYSIS RESULTS") + print("=" * 80) + + print(f"\nOverview: {results.description}") + print(f"\nAnalysis: {results.chain_of_thought}") + + print(f"\nDestinations Found: {len(results.destinations)}") + print("-" * 80) + + for i, dest in enumerate(results.destinations, 1): + print(f"\n{i}. {dest.name}") + print(f" Location: {dest.location}") + print(f" Description: {dest.description}") + + print("\n" + "=" * 80) + + except Exception as e: + print(f"Error analyzing video: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/instructor/__init__.py b/instructor/__init__.py index 2ca9f4f6e..db7e27306 100644 --- a/instructor/__init__.py +++ b/instructor/__init__.py @@ -1,7 +1,7 @@ import importlib.util from .mode import Mode -from .processing.multimodal import Image, Audio +from .processing.multimodal import Image, Audio, Video, VideoWithGenaiFile from .dsl import ( CitationMixin, @@ -38,6 +38,8 @@ "Instructor", "Image", "Audio", + "Video", + "VideoWithGenaiFile", "from_openai", "from_litellm", "from_provider", diff --git a/instructor/processing/multimodal.py b/instructor/processing/multimodal.py index fc907eba7..aec2d4331 100644 --- a/instructor/processing/multimodal.py +++ b/instructor/processing/multimodal.py @@ -41,6 +41,18 @@ "audio/webm", ] VALID_PDF_MIME_TYPES = ["application/pdf"] +VALID_VIDEO_MIME_TYPES = [ + "video/mp4", + "video/mpeg", + "video/mov", + "video/avi", + "video/x-flv", + "video/mpg", + "video/webm", + "video/wmv", + "video/3gpp", + "video/quicktime", +] CacheControlType = Mapping[str, str] OptionalCacheControlType = Optional[CacheControlType] @@ -458,6 +470,213 @@ def to_genai(self): ) +class Video(BaseModel): + """Represents a video that can be loaded from a URL or file path.""" + + source: Union[str, Path] = Field(description="URL or file path of the video") # noqa: UP007 + data: Union[str, None] = Field( # noqa: UP007 + None, description="Base64 encoded video data", repr=False + ) + media_type: str = Field(description="MIME type of the video") + + @classmethod + def autodetect(cls, source: str | Path) -> Video: + """Attempt to autodetect a video from a source string or Path.""" + if isinstance(source, str): + if cls.is_base64(source): + return cls.from_base64(source) + if source.startswith(("http://", "https://")): + return cls.from_url(source) + if source.startswith("gs://"): + return cls.from_gs_url(source) + try: + path = Path(source) + if path.is_file(): + return cls.from_path(path) + except OSError: + pass + + raise ValueError("Unable to determine video source") + + if isinstance(source, Path): + return cls.from_path(source) + + @classmethod + def autodetect_safely(cls, source: Union[str, Path]) -> Union[Video, str]: # noqa: UP007 + """Safely attempt to autodetect a video from a source string or path. + + Args: + source (Union[str,path]): The source string or path. + Returns: + A Video if the source is detected to be a valid video, otherwise + the source itself as a string. 
+ """ + try: + return cls.autodetect(source) + except ValueError: + return str(source) + + @classmethod + def is_base64(cls, s: str) -> bool: + return bool(re.match(r"^data:video/[a-zA-Z0-9+-]+;base64,", s)) + + @classmethod + def from_base64(cls, data_uri: str) -> Video: + header, encoded = data_uri.split(",", 1) + media_type = header.split(":")[1].split(";")[0] + if media_type not in VALID_VIDEO_MIME_TYPES: + raise ValueError(f"Unsupported video format: {media_type}") + return cls( + source=data_uri, + media_type=media_type, + data=encoded, + ) + + @classmethod + def from_url(cls, url: str) -> Video: + """Create a Video instance from a URL.""" + if url.startswith("gs://"): + return cls.from_gs_url(url) + response = requests.get(url) + content_type = response.headers.get("content-type") + assert content_type in VALID_VIDEO_MIME_TYPES, ( + f"Invalid video format. Must be one of: {', '.join(VALID_VIDEO_MIME_TYPES)}" + ) + + data = base64.b64encode(response.content).decode("utf-8") + return cls(source=url, data=data, media_type=content_type) + + @classmethod + def from_path(cls, path: Union[str, Path]) -> Video: # noqa: UP007 + """Create a Video instance from a file path.""" + path = Path(path) + assert path.is_file(), f"Video file not found: {path}" + + mime_type = mimetypes.guess_type(str(path))[0] + + assert mime_type in VALID_VIDEO_MIME_TYPES, ( + f"Invalid video format. Must be one of: {', '.join(VALID_VIDEO_MIME_TYPES)}" + ) + + data = base64.b64encode(path.read_bytes()).decode("utf-8") + return cls(source=str(path), data=data, media_type=mime_type) + + @classmethod + def from_gs_url(cls, data_uri: str, timeout: int = 30) -> Video: + """ + Create a Video instance from a Google Cloud Storage URL. + + Args: + data_uri: GCS URL starting with gs:// + timeout: Request timeout in seconds (default: 30) + """ + if not data_uri.startswith("gs://"): + raise ValueError("URL must start with gs://") + + public_url = f"https://storage.googleapis.com/{data_uri[5:]}" + + try: + response = requests.get(public_url, timeout=timeout) + response.raise_for_status() + media_type = response.headers.get("Content-Type") + if media_type not in VALID_VIDEO_MIME_TYPES: + raise ValueError(f"Unsupported video format: {media_type}") + + data = base64.b64encode(response.content).decode("utf-8") + + return cls(source=data_uri, media_type=media_type, data=data) + except requests.RequestException as e: + raise ValueError( + "Failed to access GCS video (must be publicly readable)" + ) from e + + def to_openai(self, mode: Mode) -> dict[str, Any]: + """Convert the Video instance to OpenAI's API format.""" + raise NotImplementedError("OpenAI does not support video inputs yet") + + def to_anthropic(self) -> dict[str, Any]: + raise NotImplementedError("Anthropic does not support video inputs yet") + + def to_genai(self): + """ + Convert the Video instance to Google GenAI's API format. + """ + try: + from google.genai import types + except ImportError as err: + raise ImportError( + "google-genai package is required for GenAI integration. 
Install with: pip install google-genai"
+            ) from err
+
+        return types.Part.from_bytes(
+            data=base64.b64decode(self.data),  # type: ignore
+            mime_type=self.media_type,
+        )
+
+
+class VideoWithGenaiFile(Video):
+    @classmethod
+    def from_new_genai_file(
+        cls, file_path: str, retry_delay: int = 10, max_retries: int = 20
+    ) -> VideoWithGenaiFile:
+        """Upload a local file to the Gemini Files API and poll until it is ACTIVE."""
+        from google.genai.types import FileState
+        import time
+        from google.genai import Client
+
+        client = Client()
+        file = client.files.upload(file=file_path)
+        while file.state != FileState.ACTIVE:
+            if file.state == FileState.FAILED:
+                raise RuntimeError(f"Gemini rejected the uploaded file: {file.name}")
+            if max_retries <= 0:
+                raise TimeoutError(
+                    "Max retries reached. File upload has been started but is still pending"
+                )
+            max_retries -= 1
+            time.sleep(retry_delay)
+            file = client.files.get(name=file.name)  # type: ignore
+
+        return cls(source=file.uri, media_type=file.mime_type, data=None)  # type: ignore
+
+    @classmethod
+    def from_existing_genai_file(cls, file_name: str) -> VideoWithGenaiFile:
+        """Create a VideoWithGenaiFile from an already uploaded file."""
+        from google.genai import types
+        from google.genai.types import FileState
+        from google.genai import Client
+
+        client = Client()
+        file = client.files.get(name=file_name)
+        if file.source == types.FileSource.UPLOADED and file.state == FileState.ACTIVE:
+            return cls(
+                source=file.uri,  # type: ignore
+                media_type=file.mime_type,  # type: ignore
+                data=None,
+            )
+        else:
+            raise ValueError("Only uploaded files in the ACTIVE state are supported for now")
+
+    def to_genai(self):
+        try:
+            from google.genai import types
+        except ImportError as err:
+            raise ImportError(
+                "google-genai package is required for GenAI integration. Install with: pip install google-genai"
+            ) from err
+
+        if (
+            self.source
+            and isinstance(self.source, str)
+            and "https://generativelanguage.googleapis.com/v1beta/files/" in self.source
+        ):
+            return types.Part.from_uri(
+                file_uri=self.source,
+                mime_type=self.media_type,
+            )
+
+        return super().to_genai()
+
+
 class ImageWithCacheControl(Image):
     """Image with Anthropic prompt caching support."""
 
@@ -841,14 +1060,15 @@ def convert_contents(
         dict[str, Any],
         Image,
         Audio,
-        list[Union[str, dict[str, Any], Image, Audio]],  # noqa: UP007
+        Video,
+        list[Union[str, dict[str, Any], Image, Audio, Video]],  # noqa: UP007
     ],
     mode: Mode,
 ) -> Union[str, list[dict[str, Any]]]:  # noqa: UP007
     """Convert content items to the appropriate format based on the specified mode."""
     if isinstance(contents, str):
         return contents
-    if isinstance(contents, (Image, Audio, PDF)) or isinstance(contents, dict):
+    if isinstance(contents, (Image, Audio, Video, PDF)) or isinstance(contents, dict):
         contents = [contents]
 
     converted_contents: list[dict[str, Union[str, Image]]] = []  # noqa: UP007
@@ -862,7 +1082,7 @@ def convert_contents(
             converted_contents.append({"type": text_file_type, "text": content})
         elif isinstance(content, dict):
             converted_contents.append(content)
-        elif isinstance(content, (Image, Audio, PDF)):
+        elif isinstance(content, (Image, Audio, Video, PDF)):
             if mode in {
                 Mode.ANTHROPIC_JSON,
                 Mode.ANTHROPIC_TOOLS,
@@ -884,18 +1104,18 @@ def convert_contents(
 
 
 def autodetect_media(
-    source: str | Path | Image | Audio | PDF,
-) -> Image | Audio | PDF | str:
-    """Autodetect images, audio, or PDFs from a given source.
+    source: str | Path | Image | Audio | Video | PDF,
+) -> Image | Audio | Video | PDF | str:
+    """Autodetect images, audio, videos, or PDFs from a given source.
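+
+    Detection order: data-URI prefixes are checked first, then a mimetype
+    guess on the source string, and finally each media class's own
+    ``autodetect_safely`` as a fallback.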
Args: source: URL, file path, Path, or data URI to inspect. Returns: - The detected :class:`Image`, :class:`Audio`, or :class:`PDF` instance. + The detected :class:`Image`, :class:`Audio`, :class:`Video`, or :class:`PDF` instance. If detection fails, the original source is returned. """ - if isinstance(source, (Image, Audio, PDF)): + if isinstance(source, (Image, Audio, Video, PDF)): return source # Normalize once for cheap checks and mimetype guess @@ -905,6 +1125,8 @@ def autodetect_media( return Image.autodetect_safely(source) if source.startswith("data:audio/"): return Audio.autodetect_safely(source) + if source.startswith("data:video/"): + return Video.autodetect_safely(source) if source.startswith("data:application/pdf"): return PDF.autodetect_safely(source) @@ -913,10 +1135,12 @@ def autodetect_media( return Image.autodetect_safely(source) if media_type in VALID_AUDIO_MIME_TYPES: return Audio.autodetect_safely(source) + if media_type in VALID_VIDEO_MIME_TYPES: + return Video.autodetect_safely(source) if media_type in VALID_PDF_MIME_TYPES: return PDF.autodetect_safely(source) - for cls in (Image, Audio, PDF): + for cls in (Image, Audio, Video, PDF): item = cls.autodetect_safely(source) # type: ignore[arg-type] if not isinstance(item, str): return item @@ -932,8 +1156,9 @@ def convert_messages( dict[str, Any], Image, Audio, + Video, PDF, - list[Union[str, dict[str, Any], Image, Audio, PDF]], # noqa: UP007 + list[Union[str, dict[str, Any], Image, Audio, Video, PDF]], # noqa: UP007 ], ] ], @@ -948,7 +1173,7 @@ def is_image_params(x: Any) -> bool: for message in messages: if "type" in message: - if message["type"] in {"audio", "image"}: + if message["type"] in {"audio", "image", "video"}: converted_messages.append(message) # type: ignore else: raise ValueError(f"Unsupported message type: {message['type']}") @@ -959,7 +1184,7 @@ def is_image_params(x: Any) -> bool: } if autodetect_images: if isinstance(content, list): - new_content: list[str | dict[str, Any] | Image | Audio | PDF] = [] # noqa: UP007 + new_content: list[str | dict[str, Any] | Image | Audio | Video | PDF] = [] # noqa: UP007 for item in content: if isinstance(item, str): new_content.append(autodetect_media(item)) @@ -1023,7 +1248,7 @@ def extract_genai_multimodal_content( if content_part.text and autodetect_images: converted_item = autodetect_media(content_part.text) - if isinstance(converted_item, (Image, Audio, PDF)): + if isinstance(converted_item, (Image, Audio, Video, PDF)): converted_contents.append(converted_item.to_genai()) continue
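
---

A minimal smoke test for the new video dispatch in `autodetect_media` (illustrative; `clip.mp4` is a placeholder and the local-file branch assumes the file exists on disk):

```python
import instructor
from instructor.processing.multimodal import autodetect_media

# Data URIs dispatch on the "data:video/" prefix; no file is needed
inline = autodetect_media("data:video/mp4;base64,AAAA")
assert isinstance(inline, instructor.Video)

# Local paths dispatch on the guessed MIME type (requires clip.mp4 to exist)
local = autodetect_media("clip.mp4")
assert isinstance(local, instructor.Video)
```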