1 | 1 | import logging |
| 2 | +import re |
2 | 3 | from http import HTTPStatus |
3 | 4 | from typing import List, Optional |
| 5 | +from urllib.parse import urlparse, urlunparse, unquote, quote |
4 | 6 |
| 7 | +import httpx |
5 | 8 | from fastapi import APIRouter, Body, File, Form, Header, HTTPException, Path as PathParam, Query, UploadFile |
6 | 9 | from fastapi.responses import JSONResponse, RedirectResponse, StreamingResponse |
7 | 10 |
12 | 15 |
13 | 16 | logger = logging.getLogger("file_management_app") |
14 | 17 |
| 18 | + |
| 19 | +def build_content_disposition_header(filename: Optional[str]) -> str: |
| 20 | + """ |
| 21 | + Build a Content-Disposition header that keeps the original filename. |
| 22 | +
| 23 | + - ASCII filenames are returned directly. |
| 24 | +    - Non-ASCII filenames include both an ASCII fallback and an RFC 5987-encoded value |
| 25 | + so modern browsers keep the original name. |
| 26 | + """ |
| 27 | + safe_name = (filename or "download").strip() or "download" |
| 28 | + |
| 29 | + def _sanitize_ascii(value: str) -> str: |
| 30 | +        # Strip characters that would break the HTTP header value: |
| 31 | +        # first remove control characters (newlines, carriage returns, |
| 32 | +        # tabs, and the rest of 0x00-0x1F, plus 0x7F) |
| 33 | +        sanitized = re.sub(r'[\x00-\x1F\x7F]', '', value) |
| 34 | +        # then replace backslash and double quote, which would end the quoted value early |
| 35 | + sanitized = sanitized.replace("\\", "_").replace('"', "_") |
| 36 | + # Remove leading/trailing spaces and dots (Windows filename restrictions) |
| 37 | + sanitized = sanitized.strip(' .') |
| 38 | + return sanitized if sanitized else "download" |
| 39 | + |
| 40 | + try: |
| 41 | + safe_name.encode("ascii") |
| 42 | + return f'attachment; filename="{_sanitize_ascii(safe_name)}"' |
| 43 | + except UnicodeEncodeError: |
| 44 | + try: |
| 45 | + encoded = quote(safe_name, safe="") |
| 46 | + except Exception: |
| 47 | +            # quote() failed; fall back to a sanitized ASCII name only |
| 48 | + logger.warning("Failed to encode filename '%s', using fallback", safe_name) |
| 49 | + return f'attachment; filename="{_sanitize_ascii(safe_name)}"' |
| 50 | + |
| 51 | + fallback = _sanitize_ascii( |
| 52 | + safe_name.encode("ascii", "ignore").decode("ascii") or "download" |
| 53 | + ) |
| 54 | + return f'attachment; filename="{fallback}"; filename*=UTF-8\'\'{encoded}' |
| 55 | + except Exception as exc: # pragma: no cover |
| 56 | + logger.warning( |
| 57 | + "Failed to encode filename '%s': %s. Using fallback.", |
| 58 | + safe_name, |
| 59 | + exc, |
| 60 | + ) |
| 61 | + return 'attachment; filename="download"' |
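| | + |
| | +# Example behavior (illustrative; outputs derived from the logic above): |
| | +#   build_content_disposition_header("report.pdf") |
| | +#   -> 'attachment; filename="report.pdf"' |
| | +#   build_content_disposition_header("报告.pdf") |
| | +#   -> 'attachment; filename="pdf"; filename*=UTF-8''%E6%8A%A5%E5%91%8A.pdf' |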
| 62 | + |
15 | 63 | # Create API router |
16 | 64 | file_management_runtime_router = APIRouter(prefix="/file") |
17 | 65 | file_management_config_router = APIRouter(prefix="/file") |
@@ -98,6 +146,64 @@ async def process_files( |
98 | 146 | ) |
99 | 147 |
100 | 148 |
| 149 | +@file_management_config_router.get("/download/{object_name:path}") |
| 150 | +async def get_storage_file( |
| 151 | + object_name: str = PathParam(..., description="File object name"), |
| 152 | +    download: str = Query("ignore", description="Download mode: ignore (file info), stream, or redirect"), |
| 153 | + expires: int = Query(3600, description="URL validity period (seconds)"), |
| 154 | + filename: Optional[str] = Query(None, description="Original filename for download (optional)") |
| 155 | +): |
| 156 | + """ |
| 157 | + Get information, download link, or file stream for a single file |
| 158 | +
| 159 | + - **object_name**: File object name |
| 160 | + - **download**: Download mode: ignore (default, return file info), stream (return file stream), redirect (redirect to download URL) |
| 161 | + - **expires**: URL validity period in seconds (default 3600) |
| 162 | +    - **filename**: Original filename for the download (optional; defaults to the last path segment of object_name) |
| 163 | +
| 164 | + Returns file information, download link, or file content |
| 165 | + """ |
| 166 | + try: |
| 167 | + logger.info(f"[get_storage_file] Route matched! object_name={object_name}, download={download}, filename={filename}") |
| 168 | + if download == "redirect": |
| 169 | + # return a redirect download URL |
| 170 | + result = await get_file_url_impl(object_name=object_name, expires=expires) |
| 171 | + return RedirectResponse(url=result["url"]) |
| 172 | + elif download == "stream": |
| 173 | + # return a readable file stream |
| 174 | + file_stream, content_type = await get_file_stream_impl(object_name=object_name) |
| 175 | + logger.info(f"Streaming file: object_name={object_name}, content_type={content_type}") |
| 176 | + |
| 177 | + # Use provided filename or extract from object_name |
| 178 | + download_filename = filename |
| 179 | + if not download_filename: |
| 180 | + # Extract filename from object_name (get the last part after the last slash) |
| 181 | + download_filename = object_name.split("/")[-1] if "/" in object_name else object_name |
| 182 | + |
| 183 | + # Build Content-Disposition header with proper encoding for non-ASCII characters |
| 184 | + content_disposition = build_content_disposition_header(download_filename) |
| 185 | + |
| 186 | + return StreamingResponse( |
| 187 | + file_stream, |
| 188 | + media_type=content_type, |
| 189 | + headers={ |
| 190 | + "Content-Disposition": content_disposition, |
| 191 | + "Cache-Control": "public, max-age=3600", |
| 192 | + "ETag": f'"{object_name}"', |
| 193 | + } |
| 194 | + ) |
| 195 | + else: |
| 196 | + # return file metadata |
| 197 | + return await get_file_url_impl(object_name=object_name, expires=expires) |
| | +    except HTTPException: |
| | +        raise |
| 198 | +    except Exception as e: |
| 199 | + logger.error(f"Failed to get file: object_name={object_name}, error={str(e)}") |
| 200 | + raise HTTPException( |
| 201 | + status_code=HTTPStatus.INTERNAL_SERVER_ERROR, |
| 202 | + detail=f"Failed to get file information: {str(e)}" |
| 203 | + ) |
| 204 | + |
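| | +# Example requests (illustrative; object names are made up and any |
| | +# application-level mount prefix is omitted): |
| | +#   GET /file/download/uploads/2024/report.pdf?download=stream |
| | +#   GET /file/download/uploads/2024/report.pdf?download=redirect&expires=600 |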
| 205 | + |
| 206 | + |
101 | 207 | @file_management_runtime_router.post("/storage") |
102 | 208 | async def storage_upload_files( |
103 | 209 | files: List[UploadFile] = File(..., description="List of files to upload"), |
@@ -158,43 +264,204 @@ async def get_storage_files( |
158 | 264 | ) |
159 | 265 |
160 | 266 |
161 | | -@file_management_config_router.get("/storage/{path}/{object_name}") |
162 | | -async def get_storage_file( |
163 | | - object_name: str = PathParam(..., description="File object name"), |
164 | | - download: str = Query("ignore", description="How to get the file"), |
165 | | - expires: int = Query(3600, description="URL validity period (seconds)") |
| 267 | +def _ensure_http_scheme(raw_url: str) -> str: |
| 268 | + """ |
| 269 | + Ensure the provided Datamate URL has an explicit HTTP or HTTPS scheme. |
| 270 | + """ |
| 271 | + candidate = (raw_url or "").strip() |
| 272 | + if not candidate: |
| 273 | + raise HTTPException( |
| 274 | + status_code=HTTPStatus.BAD_REQUEST, |
| 275 | + detail="URL cannot be empty" |
| 276 | + ) |
| 277 | + |
| | +    # Note: urlparse() treats "host:8080" as scheme "host", so an explicit |
| | +    # scheme is detected by looking for "://" rather than parsed.scheme. |
| 278 | +    if "://" in candidate: |
| 279 | +        scheme = candidate.split("://", 1)[0].lower() |
| 280 | +        if scheme not in ("http", "https"): |
| 281 | +            raise HTTPException( |
| 282 | +                status_code=HTTPStatus.BAD_REQUEST, |
| 283 | +                detail="URL must start with http:// or https://" |
| 284 | +            ) |
| 285 | +        return candidate |
| 286 | + |
| 287 | +    if candidate.startswith("//"): |
| 288 | +        return f"http:{candidate}" |
| 289 | + |
| 290 | +    return f"http://{candidate}" |
| 291 | + |
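| | +# Illustrative results (hosts are made up): |
| | +#   "datamate.internal:8080/api"  -> "http://datamate.internal:8080/api" |
| | +#   "//datamate.internal/api"     -> "http://datamate.internal/api" |
| | +#   "https://datamate.internal"   -> returned unchanged |
| | + |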
| 292 | + |
| 293 | +def _normalize_datamate_download_url(raw_url: str) -> str: |
| 294 | + """ |
| 295 | + Normalize Datamate download URL to ensure it follows /data-management/datasets/{datasetId}/files/{fileId}/download |
| 296 | + """ |
| 297 | + normalized_source = _ensure_http_scheme(raw_url) |
| 298 | + parsed_url = urlparse(normalized_source) |
| 299 | + path_segments = [segment for segment in parsed_url.path.split("/") if segment] |
| 300 | + |
| 301 | + if "data-management" not in path_segments: |
| 302 | + raise HTTPException( |
| 303 | + status_code=HTTPStatus.BAD_REQUEST, |
| 304 | + detail="Invalid Datamate URL: missing 'data-management' segment" |
| 305 | + ) |
| 306 | + |
| 307 | + try: |
| 308 | + dm_index = path_segments.index("data-management") |
| 309 | + datasets_index = path_segments.index("datasets", dm_index) |
| 310 | + dataset_id = path_segments[datasets_index + 1] |
| 311 | + files_index = path_segments.index("files", datasets_index) |
| 312 | + file_id = path_segments[files_index + 1] |
| 313 | + except (ValueError, IndexError): |
| 314 | + raise HTTPException( |
| 315 | + status_code=HTTPStatus.BAD_REQUEST, |
| 316 | + detail="Invalid Datamate URL: unable to parse dataset_id or file_id" |
| 317 | + ) |
| 318 | + |
| 319 | + prefix_segments = path_segments[:dm_index] |
| 320 | + prefix_path = "/" + "/".join(prefix_segments) if prefix_segments else "" |
| 321 | + normalized_path = f"{prefix_path}/data-management/datasets/{dataset_id}/files/{file_id}/download" |
| 322 | + |
| 323 | + normalized_url = urlunparse(( |
| 324 | + parsed_url.scheme, |
| 325 | + parsed_url.netloc, |
| 326 | + normalized_path, |
| 327 | + "", |
| 328 | + "", |
| 329 | + "" |
| 330 | + )) |
| 331 | + |
| 332 | + return normalized_url |
| 333 | + |
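| | +# Illustrative normalization (host and IDs are made up); the query string |
| | +# is dropped and the /download suffix is enforced: |
| | +#   "datamate.internal/api/data-management/datasets/42/files/7?x=1" |
| | +#   -> "http://datamate.internal/api/data-management/datasets/42/files/7/download" |
| | + |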
| 334 | + |
| 335 | +def _build_datamate_url_from_parts(base_url: str, dataset_id: str, file_id: str) -> str: |
| 336 | + """ |
| 337 | + Build Datamate download URL from individual parts |
| 338 | + """ |
| 339 | + if not base_url: |
| 340 | + raise HTTPException( |
| 341 | + status_code=HTTPStatus.BAD_REQUEST, |
| 342 | + detail="base_url is required when dataset_id and file_id are provided" |
| 343 | + ) |
| 344 | + |
| 345 | + base_with_scheme = _ensure_http_scheme(base_url) |
| 346 | + parsed_base = urlparse(base_with_scheme) |
| 347 | + base_prefix = parsed_base.path.rstrip("/") |
| 348 | + |
| 349 | +    # base_prefix was rstrip("/")-ed above, so it never ends with "/"; |
| 350 | +    # appending "/api" is correct for both empty and non-empty prefixes. |
| 351 | +    if not base_prefix.endswith("/api"): |
| 352 | +        base_prefix = f"{base_prefix}/api" |
| 356 | + |
| 357 | + normalized_path = f"{base_prefix}/data-management/datasets/{dataset_id}/files/{file_id}/download" |
| 358 | + |
| 359 | + return urlunparse(( |
| 360 | + parsed_base.scheme, |
| 361 | + parsed_base.netloc, |
| 362 | + normalized_path, |
| 363 | + "", |
| 364 | + "", |
| 365 | + "" |
| 366 | + )) |
| 367 | + |
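| | +# Illustrative build (host and IDs are made up): |
| | +#   _build_datamate_url_from_parts("datamate.internal:8080", "42", "7") |
| | +#   -> "http://datamate.internal:8080/api/data-management/datasets/42/files/7/download" |
| | + |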
| 368 | + |
| 369 | +@file_management_config_router.get("/datamate/download") |
| 370 | +async def download_datamate_file( |
| 371 | + url: Optional[str] = Query(None, description="Datamate file URL to download"), |
| 372 | + base_url: Optional[str] = Query(None, description="Datamate base server URL (e.g., host:port)"), |
| 373 | + dataset_id: Optional[str] = Query(None, description="Datamate dataset ID"), |
| 374 | + file_id: Optional[str] = Query(None, description="Datamate file ID"), |
| 375 | + filename: Optional[str] = Query(None, description="Optional filename for download"), |
| 376 | + authorization: Optional[str] = Header(None, alias="Authorization") |
166 | 377 | ): |
167 | 378 | """ |
168 | | - Get information, download link, or file stream for a single file |
| 379 | +    Download a file from the Datamate knowledge base over HTTP |
169 | 380 |
170 | | - - **object_name**: File object name |
171 | | - - **download**: Download mode: ignore (default, return file info), stream (return file stream), redirect (redirect to download URL) |
172 | | - - **expires**: URL validity period in seconds (default 3600) |
| 381 | + - **url**: Full HTTP URL of the file to download (optional) |
| 382 | + - **base_url**: Base server URL (e.g., host:port) |
| 383 | + - **dataset_id**: Datamate dataset ID |
| 384 | + - **file_id**: Datamate file ID |
| 385 | + - **filename**: Optional filename for the download (extracted automatically if not provided) |
| 386 | +    - **authorization**: Optional Authorization header to forward to the target URL |
173 | 387 |
174 | | - Returns file information, download link, or file content |
| 388 | + Returns file stream for download |
175 | 389 | """ |
176 | 390 | try: |
177 | | - if download == "redirect": |
178 | | - # return a redirect download URL |
179 | | - result = await get_file_url_impl(object_name=object_name, expires=expires) |
180 | | - return RedirectResponse(url=result["url"]) |
181 | | - elif download == "stream": |
182 | | - # return a readable file stream |
183 | | - file_stream, content_type = await get_file_stream_impl(object_name=object_name) |
| 391 | + if url: |
| 392 | + logger.info(f"[download_datamate_file] Using full URL: {url}") |
| 393 | + normalized_url = _normalize_datamate_download_url(url) |
| 394 | + elif base_url and dataset_id and file_id: |
| 395 | + logger.info(f"[download_datamate_file] Building URL from parts: base_url={base_url}, dataset_id={dataset_id}, file_id={file_id}") |
| 396 | + normalized_url = _build_datamate_url_from_parts(base_url, dataset_id, file_id) |
| 397 | + else: |
| 398 | + raise HTTPException( |
| 399 | + status_code=HTTPStatus.BAD_REQUEST, |
| 400 | + detail="Either url or (base_url, dataset_id, file_id) must be provided" |
| 401 | + ) |
| 402 | + |
| 403 | + logger.info(f"[download_datamate_file] Normalized download URL: {normalized_url}") |
| 404 | + logger.info(f"[download_datamate_file] Authorization header present: {authorization is not None}") |
| 405 | + |
| 406 | + headers = {} |
| 407 | + if authorization: |
| 408 | + headers["Authorization"] = authorization |
| | +            # Do not log credential material, even truncated |
| 409 | +            logger.debug("[download_datamate_file] Authorization header attached") |
| 410 | + headers["User-Agent"] = "Nexent-File-Downloader/1.0" |
| 411 | + |
| 412 | + logger.info(f"[download_datamate_file] Request headers: {list(headers.keys())}") |
| 413 | + |
| 414 | + async with httpx.AsyncClient(timeout=30.0) as client: |
| 415 | + response = await client.get(normalized_url, headers=headers, follow_redirects=True) |
| 416 | + logger.info(f"[download_datamate_file] Response status: {response.status_code}") |
| 417 | + |
| 418 | +            if response.status_code == HTTPStatus.NOT_FOUND: |
| 419 | + logger.error(f"[download_datamate_file] File not found at URL: {normalized_url}") |
| 420 | + logger.error(f"[download_datamate_file] Response headers: {dict(response.headers)}") |
| 421 | + raise HTTPException( |
| 422 | + status_code=HTTPStatus.NOT_FOUND, |
| 423 | + detail="File not found. Please verify dataset_id and file_id." |
| 424 | + ) |
| 425 | + |
| 426 | + response.raise_for_status() |
| 427 | + |
| 428 | + content_type = response.headers.get("Content-Type", "application/octet-stream") |
| 429 | + |
| 430 | + download_filename = filename |
| 431 | + if not download_filename: |
| 432 | + content_disposition = response.headers.get("Content-Disposition", "") |
| 433 | + if content_disposition: |
| | +                # Stop at a quote or semicolon so a trailing filename*= parameter is not captured |
| 434 | +                filename_match = re.search(r'filename="?([^";]+)"?', content_disposition) |
| 435 | + if filename_match: |
| 436 | + download_filename = filename_match.group(1) |
| 437 | + |
| 438 | + if not download_filename: |
| 439 | + path = unquote(urlparse(normalized_url).path) |
| 440 | + download_filename = path.split('/')[-1] or "download" |
| 441 | + |
| 442 | + # Build Content-Disposition header with proper encoding for non-ASCII characters |
| 443 | + content_disposition = build_content_disposition_header(download_filename) |
| 444 | + |
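| | +        # Note: client.get() above buffered the entire body in memory, so a |
| | +        # single-chunk iterator re-serves it; acceptable for modest file sizes. |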
184 | 445 | return StreamingResponse( |
185 | | - file_stream, |
| 446 | + iter([response.content]), |
186 | 447 | media_type=content_type, |
187 | 448 | headers={ |
188 | | - "Content-Disposition": f'inline; filename="{object_name}"' |
| 449 | + "Content-Disposition": content_disposition |
189 | 450 | } |
190 | 451 | ) |
191 | | - else: |
192 | | - # return file metadata |
193 | | - return await get_file_url_impl(object_name=object_name, expires=expires) |
| 452 | + except httpx.HTTPError as e: |
| 453 | +        logger.error(f"Failed to download file from URL {normalized_url}: {str(e)}") |
| 454 | + raise HTTPException( |
| 455 | + status_code=HTTPStatus.BAD_GATEWAY, |
| 456 | + detail=f"Failed to download file from URL: {str(e)}" |
| 457 | + ) |
| 458 | + except HTTPException: |
| 459 | + raise |
194 | 460 | except Exception as e: |
| 461 | + logger.error(f"Failed to download datamate file: {str(e)}") |
195 | 462 | raise HTTPException( |
196 | 463 | status_code=HTTPStatus.INTERNAL_SERVER_ERROR, |
197 | | - detail=f"Failed to get file information: {str(e)}" |
| 464 | + detail=f"Failed to download file: {str(e)}" |
198 | 465 | ) |
199 | 466 |
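| | + |
| | +# Example requests (illustrative; host and IDs are made up): |
| | +#   GET /file/datamate/download?base_url=datamate.internal:8080&dataset_id=42&file_id=7 |
| | +#   GET /file/datamate/download?url=datamate.internal:8080/api/data-management/datasets/42/files/7 |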
200 | 467 |