Skip to content

Commit 6dc6ac1

Browse files
committed
Merge remote-tracking branch 'origin/develop' into xyc/kb_enforcement
# Conflicts: # frontend/public/locales/en/common.json # frontend/public/locales/zh/common.json
2 parents d73e40f + 958fee2 commit 6dc6ac1

36 files changed

+4427
-1313
lines changed

backend/apps/file_management_app.py

Lines changed: 290 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
import logging
2+
import re
23
from http import HTTPStatus
34
from typing import List, Optional
5+
from urllib.parse import urlparse, urlunparse, unquote, quote
46

7+
import httpx
58
from fastapi import APIRouter, Body, File, Form, Header, HTTPException, Path as PathParam, Query, UploadFile
69
from fastapi.responses import JSONResponse, RedirectResponse, StreamingResponse
710

@@ -12,6 +15,51 @@
1215

1316
logger = logging.getLogger("file_management_app")
1417

18+
19+
def build_content_disposition_header(filename: Optional[str]) -> str:
    """
    Build a Content-Disposition header value that preserves the original filename.

    - ASCII filenames are emitted directly (after sanitization).
    - Non-ASCII filenames get both an ASCII fallback (``filename=``) and an
      RFC 5987 encoded value (``filename*=UTF-8''...``) so modern browsers
      keep the original name.

    Args:
        filename: Desired download filename; ``None`` or blank falls back
            to ``"download"``.

    Returns:
        A value of the form ``attachment; filename="..."``, optionally
        followed by ``; filename*=UTF-8''<percent-encoded-name>``.
    """
    safe_name = (filename or "download").strip() or "download"

    def _sanitize_ascii(value: str) -> str:
        # Strip control characters (0x00-0x1F and 0x7F) that would break the
        # header or allow CR/LF header injection.
        sanitized = re.sub(r'[\x00-\x1F\x7F]', '', value)
        # Backslashes and double quotes break the quoted-string syntax.
        sanitized = sanitized.replace("\\", "_").replace('"', "_")
        # Leading/trailing spaces and dots are invalid on Windows filesystems.
        sanitized = sanitized.strip(' .')
        return sanitized if sanitized else "download"

    try:
        safe_name.encode("ascii")
        # Pure-ASCII name: a plain quoted filename is sufficient.
        return f'attachment; filename="{_sanitize_ascii(safe_name)}"'
    except UnicodeEncodeError:
        try:
            # RFC 5987 percent-encoding of the UTF-8 bytes.
            encoded = quote(safe_name, safe="")
        except Exception:
            # quote() failure: fall back to the sanitized ASCII-only name.
            logger.warning("Failed to encode filename '%s', using fallback", safe_name)
            return f'attachment; filename="{_sanitize_ascii(safe_name)}"'

        # ASCII fallback for legacy clients that ignore filename*.
        fallback = _sanitize_ascii(
            safe_name.encode("ascii", "ignore").decode("ascii") or "download"
        )
        return f'attachment; filename="{fallback}"; filename*=UTF-8\'\'{encoded}'
    except Exception as exc:  # pragma: no cover
        logger.warning(
            "Failed to encode filename '%s': %s. Using fallback.",
            safe_name,
            exc,
        )
        return 'attachment; filename="download"'
62+
1563
# Create API router
1664
file_management_runtime_router = APIRouter(prefix="/file")
1765
file_management_config_router = APIRouter(prefix="/file")
@@ -98,6 +146,64 @@ async def process_files(
98146
)
99147

100148

149+
@file_management_config_router.get("/download/{object_name:path}")
async def get_storage_file(
    object_name: str = PathParam(..., description="File object name"),
    download: str = Query("ignore", description="How to get the file"),
    expires: int = Query(3600, description="URL validity period (seconds)"),
    filename: Optional[str] = Query(None, description="Original filename for download (optional)")
):
    """
    Get information, download link, or file stream for a single file

    - **object_name**: File object name
    - **download**: Download mode: ignore (default, return file info), stream (return file stream), redirect (redirect to download URL)
    - **expires**: URL validity period in seconds (default 3600)
    - **filename**: Original filename for download (optional, if not provided, will use object_name)

    Returns file information, download link, or file content
    """
    try:
        logger.info(f"[get_storage_file] Route matched! object_name={object_name}, download={download}, filename={filename}")
        if download == "redirect":
            # Redirect the client to a pre-signed download URL.
            result = await get_file_url_impl(object_name=object_name, expires=expires)
            return RedirectResponse(url=result["url"])
        elif download == "stream":
            # Return the raw file content as a stream.
            file_stream, content_type = await get_file_stream_impl(object_name=object_name)
            logger.info(f"Streaming file: object_name={object_name}, content_type={content_type}")

            # Prefer the caller-supplied filename; otherwise fall back to the
            # last path segment of the object name (split handles both cases).
            download_filename = filename or object_name.split("/")[-1]

            # Encode non-ASCII filenames per RFC 5987.
            content_disposition = build_content_disposition_header(download_filename)

            return StreamingResponse(
                file_stream,
                media_type=content_type,
                headers={
                    "Content-Disposition": content_disposition,
                    "Cache-Control": "public, max-age=3600",
                    "ETag": f'"{object_name}"',
                }
            )
        else:
            # Default ("ignore"): return file metadata / URL info only.
            return await get_file_url_impl(object_name=object_name, expires=expires)
    except HTTPException:
        # Preserve intentional HTTP errors instead of masking them as 500s
        # (matches the handling in download_datamate_file).
        raise
    except Exception as e:
        logger.error(f"Failed to get file: object_name={object_name}, error={str(e)}")
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
            detail=f"Failed to get file information: {str(e)}"
        )
204+
205+
206+
101207
@file_management_runtime_router.post("/storage")
102208
async def storage_upload_files(
103209
files: List[UploadFile] = File(..., description="List of files to upload"),
@@ -158,43 +264,204 @@ async def get_storage_files(
158264
)
159265

160266

161-
@file_management_config_router.get("/storage/{path}/{object_name}")
162-
async def get_storage_file(
163-
object_name: str = PathParam(..., description="File object name"),
164-
download: str = Query("ignore", description="How to get the file"),
165-
expires: int = Query(3600, description="URL validity period (seconds)")
267+
def _ensure_http_scheme(raw_url: str) -> str:
268+
"""
269+
Ensure the provided Datamate URL has an explicit HTTP or HTTPS scheme.
270+
"""
271+
candidate = (raw_url or "").strip()
272+
if not candidate:
273+
raise HTTPException(
274+
status_code=HTTPStatus.BAD_REQUEST,
275+
detail="URL cannot be empty"
276+
)
277+
278+
parsed = urlparse(candidate)
279+
if parsed.scheme:
280+
if parsed.scheme not in ("http", "https"):
281+
raise HTTPException(
282+
status_code=HTTPStatus.BAD_REQUEST,
283+
detail="URL must start with http:// or https://"
284+
)
285+
return candidate
286+
287+
if candidate.startswith("//"):
288+
return f"http:{candidate}"
289+
290+
return f"http://{candidate}"
291+
292+
293+
def _normalize_datamate_download_url(raw_url: str) -> str:
    """
    Normalize a Datamate download URL to the canonical form
    ``/data-management/datasets/{datasetId}/files/{fileId}/download``.

    Any path prefix before the ``data-management`` segment is preserved;
    params, query string and fragment are discarded.

    Raises:
        HTTPException: 400 when the URL lacks the ``data-management`` segment
            or the dataset/file identifiers cannot be located.
    """
    source = _ensure_http_scheme(raw_url)
    parsed = urlparse(source)
    segments = [part for part in parsed.path.split("/") if part]

    if "data-management" not in segments:
        raise HTTPException(
            status_code=HTTPStatus.BAD_REQUEST,
            detail="Invalid Datamate URL: missing 'data-management' segment"
        )

    try:
        dm_pos = segments.index("data-management")
        datasets_pos = segments.index("datasets", dm_pos)
        dataset_id = segments[datasets_pos + 1]
        files_pos = segments.index("files", datasets_pos)
        file_id = segments[files_pos + 1]
    except (ValueError, IndexError):
        raise HTTPException(
            status_code=HTTPStatus.BAD_REQUEST,
            detail="Invalid Datamate URL: unable to parse dataset_id or file_id"
        )

    # Re-attach whatever path prefix preceded the data-management segment.
    prefix = segments[:dm_pos]
    prefix_path = "/" + "/".join(prefix) if prefix else ""
    canonical_path = (
        f"{prefix_path}/data-management/datasets/{dataset_id}"
        f"/files/{file_id}/download"
    )

    # Only scheme, host and the canonical path survive normalization.
    return urlunparse((parsed.scheme, parsed.netloc, canonical_path, "", "", ""))
333+
334+
335+
def _build_datamate_url_from_parts(base_url: str, dataset_id: str, file_id: str) -> str:
    """
    Build a Datamate download URL from its individual parts.

    Args:
        base_url: Base server URL (e.g. ``host:port`` or ``http://host/prefix``).
        dataset_id: Datamate dataset ID.
        file_id: Datamate file ID.

    Returns:
        Absolute URL of the form
        ``{base}/api/data-management/datasets/{dataset_id}/files/{file_id}/download``
        (the ``/api`` segment is appended only if not already present).

    Raises:
        HTTPException: 400 when ``base_url`` is missing.
    """
    if not base_url:
        raise HTTPException(
            status_code=HTTPStatus.BAD_REQUEST,
            detail="base_url is required when dataset_id and file_id are provided"
        )

    base_with_scheme = _ensure_http_scheme(base_url)
    parsed_base = urlparse(base_with_scheme)

    # rstrip("/") guarantees no trailing slash, so "/api" can be appended
    # directly (covers the empty-path case too: "" -> "/api").
    base_prefix = parsed_base.path.rstrip("/")
    if not base_prefix.endswith("/api"):
        base_prefix = f"{base_prefix}/api"

    normalized_path = f"{base_prefix}/data-management/datasets/{dataset_id}/files/{file_id}/download"

    return urlunparse((
        parsed_base.scheme,
        parsed_base.netloc,
        normalized_path,
        "",
        "",
        ""
    ))
367+
368+
369+
@file_management_config_router.get("/datamate/download")
async def download_datamate_file(
    url: Optional[str] = Query(None, description="Datamate file URL to download"),
    base_url: Optional[str] = Query(None, description="Datamate base server URL (e.g., host:port)"),
    dataset_id: Optional[str] = Query(None, description="Datamate dataset ID"),
    file_id: Optional[str] = Query(None, description="Datamate file ID"),
    filename: Optional[str] = Query(None, description="Optional filename for download"),
    authorization: Optional[str] = Header(None, alias="Authorization")
):
    """
    Download file from Datamate knowledge base via HTTP URL

    - **url**: Full HTTP URL of the file to download (optional)
    - **base_url**: Base server URL (e.g., host:port)
    - **dataset_id**: Datamate dataset ID
    - **file_id**: Datamate file ID
    - **filename**: Optional filename for the download (extracted automatically if not provided)
    - **authorization**: Optional authorization header to pass to the target URL

    Returns file stream for download
    """
    normalized_url = None
    try:
        if url:
            logger.info(f"[download_datamate_file] Using full URL: {url}")
            normalized_url = _normalize_datamate_download_url(url)
        elif base_url and dataset_id and file_id:
            logger.info(f"[download_datamate_file] Building URL from parts: base_url={base_url}, dataset_id={dataset_id}, file_id={file_id}")
            normalized_url = _build_datamate_url_from_parts(base_url, dataset_id, file_id)
        else:
            raise HTTPException(
                status_code=HTTPStatus.BAD_REQUEST,
                detail="Either url or (base_url, dataset_id, file_id) must be provided"
            )

        logger.info(f"[download_datamate_file] Normalized download URL: {normalized_url}")
        logger.info(f"[download_datamate_file] Authorization header present: {authorization is not None}")

        headers = {"User-Agent": "Nexent-File-Downloader/1.0"}
        if authorization:
            # Forward the caller's credentials to the Datamate server.
            headers["Authorization"] = authorization
            logger.debug(f"[download_datamate_file] Using authorization header: {authorization[:20]}...")

        logger.info(f"[download_datamate_file] Request headers: {list(headers.keys())}")

        # NOTE(review): the whole payload is buffered in memory via
        # response.content; switch to client.stream() if very large files
        # become a concern.
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.get(normalized_url, headers=headers, follow_redirects=True)
            logger.info(f"[download_datamate_file] Response status: {response.status_code}")

            if response.status_code == 404:
                logger.error(f"[download_datamate_file] File not found at URL: {normalized_url}")
                logger.error(f"[download_datamate_file] Response headers: {dict(response.headers)}")
                raise HTTPException(
                    status_code=HTTPStatus.NOT_FOUND,
                    detail="File not found. Please verify dataset_id and file_id."
                )

            response.raise_for_status()

            content_type = response.headers.get("Content-Type", "application/octet-stream")

            # Resolve the download filename: caller override first, then the
            # upstream Content-Disposition header, then the URL path segment.
            download_filename = filename
            if not download_filename:
                upstream_disposition = response.headers.get("Content-Disposition", "")
                if upstream_disposition:
                    filename_match = re.search(r'filename="?(.+?)"?$', upstream_disposition)
                    if filename_match:
                        download_filename = filename_match.group(1)

            if not download_filename:
                path = unquote(urlparse(normalized_url).path)
                download_filename = path.split('/')[-1] or "download"

            # Encode non-ASCII filenames per RFC 5987.
            content_disposition = build_content_disposition_header(download_filename)

            return StreamingResponse(
                iter([response.content]),
                media_type=content_type,
                headers={
                    "Content-Disposition": content_disposition
                }
            )
    except httpx.HTTPError as e:
        # Log the URL actually requested: `url` is None when the target was
        # built from (base_url, dataset_id, file_id).
        logger.error(f"Failed to download file from URL {normalized_url or url}: {str(e)}")
        raise HTTPException(
            status_code=HTTPStatus.BAD_GATEWAY,
            detail=f"Failed to download file from URL: {str(e)}"
        )
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Failed to download datamate file: {str(e)}")
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
            detail=f"Failed to download file: {str(e)}"
        )
199466

200467

0 commit comments

Comments
 (0)