Skip to content

Commit 6dc6ac1

Browse files
committed
Merge remote-tracking branch 'origin/develop' into xyc/kb_enforcement
# Conflicts: # frontend/public/locales/en/common.json # frontend/public/locales/zh/common.json
2 parents d73e40f + 958fee2 commit 6dc6ac1

36 files changed

+4427
-1313
lines changed

backend/apps/file_management_app.py

Lines changed: 290 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
import logging
2+
import re
23
from http import HTTPStatus
34
from typing import List, Optional
5+
from urllib.parse import urlparse, urlunparse, unquote, quote
46

7+
import httpx
58
from fastapi import APIRouter, Body, File, Form, Header, HTTPException, Path as PathParam, Query, UploadFile
69
from fastapi.responses import JSONResponse, RedirectResponse, StreamingResponse
710

@@ -12,6 +15,51 @@
1215

1316
logger = logging.getLogger("file_management_app")
1417

18+
19+
def build_content_disposition_header(filename: Optional[str]) -> str:
    """
    Build a Content-Disposition header value that preserves the original filename.

    - ASCII filenames are emitted directly (after sanitization).
    - Non-ASCII filenames get both an ASCII fallback (``filename=``) and an
      RFC 5987 encoded value (``filename*=UTF-8''...``) so modern browsers
      keep the original name.

    Args:
        filename: Desired download filename; ``None`` or blank falls back
            to ``"download"``.

    Returns:
        A value of the form ``attachment; filename="..."``, optionally
        followed by ``; filename*=UTF-8''<percent-encoded-name>``.
    """
    safe_name = (filename or "download").strip() or "download"

    def _sanitize_ascii(value: str) -> str:
        # Strip control characters (0x00-0x1F and 0x7F) that would break the
        # header or allow CR/LF header injection.
        sanitized = re.sub(r'[\x00-\x1F\x7F]', '', value)
        # Backslashes and double quotes break the quoted-string syntax.
        sanitized = sanitized.replace("\\", "_").replace('"', "_")
        # Leading/trailing spaces and dots are invalid on Windows filesystems.
        sanitized = sanitized.strip(' .')
        return sanitized if sanitized else "download"

    try:
        safe_name.encode("ascii")
        # Pure-ASCII name: a plain quoted filename is sufficient.
        return f'attachment; filename="{_sanitize_ascii(safe_name)}"'
    except UnicodeEncodeError:
        try:
            # RFC 5987 percent-encoding of the UTF-8 bytes.
            encoded = quote(safe_name, safe="")
        except Exception:
            # quote() failure: fall back to the sanitized ASCII-only name.
            logger.warning("Failed to encode filename '%s', using fallback", safe_name)
            return f'attachment; filename="{_sanitize_ascii(safe_name)}"'

        # ASCII fallback for legacy clients that ignore filename*.
        fallback = _sanitize_ascii(
            safe_name.encode("ascii", "ignore").decode("ascii") or "download"
        )
        return f'attachment; filename="{fallback}"; filename*=UTF-8\'\'{encoded}'
    except Exception as exc:  # pragma: no cover
        logger.warning(
            "Failed to encode filename '%s': %s. Using fallback.",
            safe_name,
            exc,
        )
        return 'attachment; filename="download"'
62+
1563
# Create API router
1664
file_management_runtime_router = APIRouter(prefix="/file")
1765
file_management_config_router = APIRouter(prefix="/file")
@@ -98,6 +146,64 @@ async def process_files(
98146
)
99147

100148

149+
@file_management_config_router.get("/download/{object_name:path}")
async def get_storage_file(
    object_name: str = PathParam(..., description="File object name"),
    download: str = Query("ignore", description="How to get the file"),
    expires: int = Query(3600, description="URL validity period (seconds)"),
    filename: Optional[str] = Query(None, description="Original filename for download (optional)")
):
    """
    Get information, download link, or file stream for a single file

    - **object_name**: File object name
    - **download**: Download mode: ignore (default, return file info), stream (return file stream), redirect (redirect to download URL)
    - **expires**: URL validity period in seconds (default 3600)
    - **filename**: Original filename for download (optional, if not provided, will use object_name)

    Returns file information, download link, or file content
    """
    try:
        logger.info(f"[get_storage_file] Route matched! object_name={object_name}, download={download}, filename={filename}")
        if download == "redirect":
            # Redirect the client to a pre-signed download URL.
            result = await get_file_url_impl(object_name=object_name, expires=expires)
            return RedirectResponse(url=result["url"])
        elif download == "stream":
            # Return the raw file content as a stream.
            file_stream, content_type = await get_file_stream_impl(object_name=object_name)
            logger.info(f"Streaming file: object_name={object_name}, content_type={content_type}")

            # Prefer the caller-supplied filename; otherwise fall back to the
            # last path segment of the object name (split handles both cases).
            download_filename = filename or object_name.split("/")[-1]

            # Encode non-ASCII filenames per RFC 5987.
            content_disposition = build_content_disposition_header(download_filename)

            return StreamingResponse(
                file_stream,
                media_type=content_type,
                headers={
                    "Content-Disposition": content_disposition,
                    "Cache-Control": "public, max-age=3600",
                    "ETag": f'"{object_name}"',
                }
            )
        else:
            # Default ("ignore"): return file metadata / URL info only.
            return await get_file_url_impl(object_name=object_name, expires=expires)
    except HTTPException:
        # Preserve intentional HTTP errors instead of masking them as 500s
        # (matches the handling in download_datamate_file).
        raise
    except Exception as e:
        logger.error(f"Failed to get file: object_name={object_name}, error={str(e)}")
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
            detail=f"Failed to get file information: {str(e)}"
        )
204+
205+
206+
101207
@file_management_runtime_router.post("/storage")
102208
async def storage_upload_files(
103209
files: List[UploadFile] = File(..., description="List of files to upload"),
@@ -158,43 +264,204 @@ async def get_storage_files(
158264
)
159265

160266

161-
@file_management_config_router.get("/storage/{path}/{object_name}")
162-
async def get_storage_file(
163-
object_name: str = PathParam(..., description="File object name"),
164-
download: str = Query("ignore", description="How to get the file"),
165-
expires: int = Query(3600, description="URL validity period (seconds)")
267+
def _ensure_http_scheme(raw_url: str) -> str:
268+
"""
269+
Ensure the provided Datamate URL has an explicit HTTP or HTTPS scheme.
270+
"""
271+
candidate = (raw_url or "").strip()
272+
if not candidate:
273+
raise HTTPException(
274+
status_code=HTTPStatus.BAD_REQUEST,
275+
detail="URL cannot be empty"
276+
)
277+
278+
parsed = urlparse(candidate)
279+
if parsed.scheme:
280+
if parsed.scheme not in ("http", "https"):
281+
raise HTTPException(
282+
status_code=HTTPStatus.BAD_REQUEST,
283+
detail="URL must start with http:// or https://"
284+
)
285+
return candidate
286+
287+
if candidate.startswith("//"):
288+
return f"http:{candidate}"
289+
290+
return f"http://{candidate}"
291+
292+
293+
def _normalize_datamate_download_url(raw_url: str) -> str:
    """
    Normalize a Datamate download URL to the canonical form
    ``/data-management/datasets/{datasetId}/files/{fileId}/download``.

    Any path prefix before the ``data-management`` segment is preserved;
    params, query string and fragment are discarded.

    Raises:
        HTTPException: 400 when the URL lacks the ``data-management`` segment
            or the dataset/file identifiers cannot be located.
    """
    source = _ensure_http_scheme(raw_url)
    parsed = urlparse(source)
    segments = [part for part in parsed.path.split("/") if part]

    if "data-management" not in segments:
        raise HTTPException(
            status_code=HTTPStatus.BAD_REQUEST,
            detail="Invalid Datamate URL: missing 'data-management' segment"
        )

    try:
        dm_pos = segments.index("data-management")
        datasets_pos = segments.index("datasets", dm_pos)
        dataset_id = segments[datasets_pos + 1]
        files_pos = segments.index("files", datasets_pos)
        file_id = segments[files_pos + 1]
    except (ValueError, IndexError):
        raise HTTPException(
            status_code=HTTPStatus.BAD_REQUEST,
            detail="Invalid Datamate URL: unable to parse dataset_id or file_id"
        )

    # Re-attach whatever path prefix preceded the data-management segment.
    prefix = segments[:dm_pos]
    prefix_path = "/" + "/".join(prefix) if prefix else ""
    canonical_path = (
        f"{prefix_path}/data-management/datasets/{dataset_id}"
        f"/files/{file_id}/download"
    )

    # Only scheme, host and the canonical path survive normalization.
    return urlunparse((parsed.scheme, parsed.netloc, canonical_path, "", "", ""))
333+
334+
335+
def _build_datamate_url_from_parts(base_url: str, dataset_id: str, file_id: str) -> str:
    """
    Build a Datamate download URL from its individual parts.

    Args:
        base_url: Base server URL (e.g. ``host:port`` or ``http://host/prefix``).
        dataset_id: Datamate dataset ID.
        file_id: Datamate file ID.

    Returns:
        Absolute URL of the form
        ``{base}/api/data-management/datasets/{dataset_id}/files/{file_id}/download``
        (the ``/api`` segment is appended only if not already present).

    Raises:
        HTTPException: 400 when ``base_url`` is missing.
    """
    if not base_url:
        raise HTTPException(
            status_code=HTTPStatus.BAD_REQUEST,
            detail="base_url is required when dataset_id and file_id are provided"
        )

    base_with_scheme = _ensure_http_scheme(base_url)
    parsed_base = urlparse(base_with_scheme)

    # rstrip("/") guarantees no trailing slash, so "/api" can be appended
    # directly (covers the empty-path case too: "" -> "/api").
    base_prefix = parsed_base.path.rstrip("/")
    if not base_prefix.endswith("/api"):
        base_prefix = f"{base_prefix}/api"

    normalized_path = f"{base_prefix}/data-management/datasets/{dataset_id}/files/{file_id}/download"

    return urlunparse((
        parsed_base.scheme,
        parsed_base.netloc,
        normalized_path,
        "",
        "",
        ""
    ))
367+
368+
369+
@file_management_config_router.get("/datamate/download")
async def download_datamate_file(
    url: Optional[str] = Query(None, description="Datamate file URL to download"),
    base_url: Optional[str] = Query(None, description="Datamate base server URL (e.g., host:port)"),
    dataset_id: Optional[str] = Query(None, description="Datamate dataset ID"),
    file_id: Optional[str] = Query(None, description="Datamate file ID"),
    filename: Optional[str] = Query(None, description="Optional filename for download"),
    authorization: Optional[str] = Header(None, alias="Authorization")
):
    """
    Download file from Datamate knowledge base via HTTP URL

    - **url**: Full HTTP URL of the file to download (optional)
    - **base_url**: Base server URL (e.g., host:port)
    - **dataset_id**: Datamate dataset ID
    - **file_id**: Datamate file ID
    - **filename**: Optional filename for the download (extracted automatically if not provided)
    - **authorization**: Optional authorization header to pass to the target URL

    Returns file stream for download
    """
    normalized_url = None
    try:
        if url:
            logger.info(f"[download_datamate_file] Using full URL: {url}")
            normalized_url = _normalize_datamate_download_url(url)
        elif base_url and dataset_id and file_id:
            logger.info(f"[download_datamate_file] Building URL from parts: base_url={base_url}, dataset_id={dataset_id}, file_id={file_id}")
            normalized_url = _build_datamate_url_from_parts(base_url, dataset_id, file_id)
        else:
            raise HTTPException(
                status_code=HTTPStatus.BAD_REQUEST,
                detail="Either url or (base_url, dataset_id, file_id) must be provided"
            )

        logger.info(f"[download_datamate_file] Normalized download URL: {normalized_url}")
        logger.info(f"[download_datamate_file] Authorization header present: {authorization is not None}")

        headers = {"User-Agent": "Nexent-File-Downloader/1.0"}
        if authorization:
            # Forward the caller's credentials to the Datamate server.
            headers["Authorization"] = authorization
            logger.debug(f"[download_datamate_file] Using authorization header: {authorization[:20]}...")

        logger.info(f"[download_datamate_file] Request headers: {list(headers.keys())}")

        # NOTE(review): the whole payload is buffered in memory via
        # response.content; switch to client.stream() if very large files
        # become a concern.
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.get(normalized_url, headers=headers, follow_redirects=True)
            logger.info(f"[download_datamate_file] Response status: {response.status_code}")

            if response.status_code == 404:
                logger.error(f"[download_datamate_file] File not found at URL: {normalized_url}")
                logger.error(f"[download_datamate_file] Response headers: {dict(response.headers)}")
                raise HTTPException(
                    status_code=HTTPStatus.NOT_FOUND,
                    detail="File not found. Please verify dataset_id and file_id."
                )

            response.raise_for_status()

            content_type = response.headers.get("Content-Type", "application/octet-stream")

            # Resolve the download filename: caller override first, then the
            # upstream Content-Disposition header, then the URL path segment.
            download_filename = filename
            if not download_filename:
                upstream_disposition = response.headers.get("Content-Disposition", "")
                if upstream_disposition:
                    filename_match = re.search(r'filename="?(.+?)"?$', upstream_disposition)
                    if filename_match:
                        download_filename = filename_match.group(1)

            if not download_filename:
                path = unquote(urlparse(normalized_url).path)
                download_filename = path.split('/')[-1] or "download"

            # Encode non-ASCII filenames per RFC 5987.
            content_disposition = build_content_disposition_header(download_filename)

            return StreamingResponse(
                iter([response.content]),
                media_type=content_type,
                headers={
                    "Content-Disposition": content_disposition
                }
            )
    except httpx.HTTPError as e:
        # Log the URL actually requested: `url` is None when the target was
        # built from (base_url, dataset_id, file_id).
        logger.error(f"Failed to download file from URL {normalized_url or url}: {str(e)}")
        raise HTTPException(
            status_code=HTTPStatus.BAD_GATEWAY,
            detail=f"Failed to download file from URL: {str(e)}"
        )
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Failed to download datamate file: {str(e)}")
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
            detail=f"Failed to download file: {str(e)}"
        )
199466

200467

0 commit comments

Comments
 (0)