|
| 1 | +"""S3 utility functions for uploading and managing digest files.""" |
| 2 | + |
| 3 | +from __future__ import annotations |
| 4 | + |
| 5 | +import hashlib |
| 6 | +import os |
| 7 | +from typing import Any |
| 8 | + |
| 9 | +import boto3 |
| 10 | +from botocore.exceptions import ClientError |
| 11 | + |
| 12 | + |
| 13 | +class S3UploadError(Exception): |
| 14 | + """Custom exception for S3 upload failures.""" |
| 15 | + |
| 16 | + |
| 17 | +def is_s3_enabled() -> bool: |
| 18 | + """Check if S3 is enabled via environment variables.""" |
| 19 | + return os.getenv("S3_ENABLED", "false").lower() == "true" |
| 20 | + |
| 21 | + |
| 22 | +def get_s3_config() -> dict[str, Any]: |
| 23 | + """Get S3 configuration from environment variables.""" |
| 24 | + return { |
| 25 | + "endpoint_url": os.getenv("S3_ENDPOINT"), |
| 26 | + "aws_access_key_id": os.getenv("S3_ACCESS_KEY"), |
| 27 | + "aws_secret_access_key": os.getenv("S3_SECRET_KEY"), |
| 28 | + "region_name": os.getenv("S3_REGION", "us-east-1"), |
| 29 | + } |
| 30 | + |
| 31 | + |
| 32 | +def get_s3_bucket_name() -> str: |
| 33 | + """Get S3 bucket name from environment variables.""" |
| 34 | + return os.getenv("S3_BUCKET_NAME", "gitingest-bucket") |
| 35 | + |
| 36 | + |
| 37 | +def get_s3_alias_host() -> str | None: |
| 38 | + """Get S3 alias host for public URLs.""" |
| 39 | + return os.getenv("S3_ALIAS_HOST") |
| 40 | + |
| 41 | + |
| 42 | +def generate_s3_file_path( |
| 43 | + source: str, |
| 44 | + user_name: str, |
| 45 | + repo_name: str, |
| 46 | + branch: str | None, |
| 47 | + commit: str | None, |
| 48 | + include_patterns: set[str] | None, |
| 49 | + ignore_patterns: set[str], |
| 50 | +) -> str: |
| 51 | + """Generate S3 file path with proper naming convention. |
| 52 | +
|
| 53 | + Format: /ingest/<provider>/<repo-owner>/<repo-name>/<branch>/<commit-ID>/<exclude&include hash>.txt |
| 54 | + The commit-ID is always included in the URL. If no specific commit is provided, |
| 55 | + the actual commit hash from the cloned repository is used. |
| 56 | +
|
| 57 | + Args: |
| 58 | + source: Git host (github, gitlab, etc.) |
| 59 | + user_name: Repository owner/user |
| 60 | + repo_name: Repository name |
| 61 | + branch: Branch name (if available) |
| 62 | + commit: Commit hash (should always be available now) |
| 63 | + include_patterns: Include patterns set |
| 64 | + ignore_patterns: Ignore patterns set |
| 65 | +
|
| 66 | + Returns: |
| 67 | + S3 file path string |
| 68 | +
|
| 69 | + """ |
| 70 | + # Extract source from URL or default to "unknown" |
| 71 | + if "github.com" in source: |
| 72 | + git_source = "github" |
| 73 | + elif "gitlab.com" in source: |
| 74 | + git_source = "gitlab" |
| 75 | + elif "bitbucket.org" in source: |
| 76 | + git_source = "bitbucket" |
| 77 | + else: |
| 78 | + git_source = "unknown" |
| 79 | + |
| 80 | + # Use branch, fallback to "main" if neither branch nor commit |
| 81 | + branch_name = branch or "main" |
| 82 | + |
| 83 | + # Create hash of exclude/include patterns for uniqueness |
| 84 | + patterns_str = f"include:{sorted(include_patterns) if include_patterns else []}" |
| 85 | + patterns_str += f"exclude:{sorted(ignore_patterns)}" |
| 86 | + |
| 87 | + patterns_hash = hashlib.sha256(patterns_str.encode()).hexdigest()[:16] |
| 88 | + |
| 89 | + # Commit should always be available now, but provide fallback just in case |
| 90 | + commit_id = commit or "HEAD" |
| 91 | + |
| 92 | + # Format: /ingest/<provider>/<repo-owner>/<repo-name>/<branch>/<commit-ID>/<hash>.txt |
| 93 | + return f"ingest/{git_source}/{user_name}/{repo_name}/{branch_name}/{commit_id}/{patterns_hash}.txt" |
| 94 | + |
| 95 | + |
| 96 | +def create_s3_client() -> boto3.client: |
| 97 | + """Create and return an S3 client with configuration from environment.""" |
| 98 | + config = get_s3_config() |
| 99 | + return boto3.client("s3", **config) |
| 100 | + |
| 101 | + |
| 102 | +def upload_to_s3(content: str, s3_file_path: str, ingest_id: str) -> str: |
| 103 | + """Upload content to S3 and return the public URL. |
| 104 | +
|
| 105 | + Args: |
| 106 | + content: The digest content to upload |
| 107 | + s3_file_path: The S3 file path |
| 108 | + ingest_id: The ingest ID to store as S3 object tag |
| 109 | +
|
| 110 | + Returns: |
| 111 | + Public URL to access the uploaded file |
| 112 | +
|
| 113 | + Raises: |
| 114 | + Exception: If upload fails |
| 115 | +
|
| 116 | + """ |
| 117 | + if not is_s3_enabled(): |
| 118 | + msg = "S3 is not enabled" |
| 119 | + raise ValueError(msg) |
| 120 | + |
| 121 | + try: |
| 122 | + s3_client = create_s3_client() |
| 123 | + bucket_name = get_s3_bucket_name() |
| 124 | + |
| 125 | + # Upload the content with ingest_id as tag |
| 126 | + s3_client.put_object( |
| 127 | + Bucket=bucket_name, |
| 128 | + Key=s3_file_path, |
| 129 | + Body=content.encode("utf-8"), |
| 130 | + ContentType="text/plain", |
| 131 | + Tagging=f"ingest_id={ingest_id}", |
| 132 | + ) |
| 133 | + |
| 134 | + # Generate public URL |
| 135 | + alias_host = get_s3_alias_host() |
| 136 | + if alias_host: |
| 137 | + # Use alias host if configured |
| 138 | + return f"{alias_host.rstrip('/')}/{s3_file_path}" |
| 139 | + # Fallback to direct S3 URL |
| 140 | + endpoint = get_s3_config()["endpoint_url"] |
| 141 | + if endpoint: |
| 142 | + return f"{endpoint.rstrip('/')}/{bucket_name}/{s3_file_path}" |
| 143 | + return f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{s3_file_path}" |
| 144 | + |
| 145 | + except ClientError as e: |
| 146 | + msg = f"Failed to upload to S3: {e}" |
| 147 | + raise S3UploadError(msg) from e |
| 148 | + |
| 149 | + |
| 150 | +def _build_s3_url(key: str) -> str: |
| 151 | + """Build S3 URL for a given key.""" |
| 152 | + alias_host = get_s3_alias_host() |
| 153 | + if alias_host: |
| 154 | + return f"{alias_host.rstrip('/')}/{key}" |
| 155 | + endpoint = get_s3_config()["endpoint_url"] |
| 156 | + if endpoint: |
| 157 | + bucket_name = get_s3_bucket_name() |
| 158 | + return f"{endpoint.rstrip('/')}/{bucket_name}/{key}" |
| 159 | + bucket_name = get_s3_bucket_name() |
| 160 | + return f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{key}" |
| 161 | + |
| 162 | + |
| 163 | +def _check_object_tags(s3_client: boto3.client, bucket_name: str, key: str, target_ingest_id: str) -> bool: |
| 164 | + """Check if an S3 object has the matching ingest_id tag.""" |
| 165 | + try: |
| 166 | + tags_response = s3_client.get_object_tagging(Bucket=bucket_name, Key=key) |
| 167 | + tags = {tag["Key"]: tag["Value"] for tag in tags_response.get("TagSet", [])} |
| 168 | + return tags.get("ingest_id") == target_ingest_id |
| 169 | + except ClientError: |
| 170 | + return False |
| 171 | + |
| 172 | + |
| 173 | +def get_s3_url_for_ingest_id(ingest_id: str) -> str | None: |
| 174 | + """Get S3 URL for a given ingest ID if it exists. |
| 175 | +
|
| 176 | + This is used by the download endpoint to redirect to S3 if available. |
| 177 | + Searches for files using S3 object tags to find the matching ingest_id. |
| 178 | +
|
| 179 | + Args: |
| 180 | + ingest_id: The ingest ID |
| 181 | +
|
| 182 | + Returns: |
| 183 | + S3 URL if file exists, None otherwise |
| 184 | +
|
| 185 | + """ |
| 186 | + if not is_s3_enabled(): |
| 187 | + return None |
| 188 | + |
| 189 | + try: |
| 190 | + s3_client = create_s3_client() |
| 191 | + bucket_name = get_s3_bucket_name() |
| 192 | + |
| 193 | + # List all objects in the ingest/ prefix and check their tags |
| 194 | + paginator = s3_client.get_paginator("list_objects_v2") |
| 195 | + page_iterator = paginator.paginate( |
| 196 | + Bucket=bucket_name, |
| 197 | + Prefix="ingest/", |
| 198 | + ) |
| 199 | + |
| 200 | + for page in page_iterator: |
| 201 | + if "Contents" not in page: |
| 202 | + continue |
| 203 | + |
| 204 | + for obj in page["Contents"]: |
| 205 | + key = obj["Key"] |
| 206 | + if _check_object_tags(s3_client, bucket_name, key, ingest_id): |
| 207 | + return _build_s3_url(key) |
| 208 | + |
| 209 | + except ClientError: |
| 210 | + pass |
| 211 | + |
| 212 | + return None |
0 commit comments