Skip to content

Commit fc18980

Browse files
committed
⚡️ refactor(exceptions): Rename exceptions for clarity and update error handling in detection methods
1 parent cfe3e1a commit fc18980

File tree

5 files changed

+93
-77
lines changed

5 files changed

+93
-77
lines changed

README.md

Lines changed: 35 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,18 @@
1919
2020
> ### Memory note
2121
>
22-
> The lite model runs offline and is memory-friendly; the full model is larger and offers higher accuracy. Choose the model that best fits your constraints.
22+
> The lite model runs offline and is memory-friendly; the full model is larger and offers higher accuracy.
23+
>
24+
> Approximate memory usage (RSS after load):
25+
> - Lite: ~45–60 MB
26+
> - Full: ~170–210 MB
27+
> - Auto: tries full first, falls back to lite only on `MemoryError`.
28+
>
29+
> Notes:
30+
> - Measurements vary by Python version, OS, allocator, and import graph; treat these as practical ranges.
31+
> - Validate on your system if constrained; see `examples/memory_usage_check.py` (credit: script by github@JackyHe398).
32+
>
33+
> Choose the model that best fits your constraints.
2334
2435
## Installation 💻
2536

@@ -75,30 +86,35 @@ from fast_langdetect import LangDetectConfig, detect
7586

7687
cfg = LangDetectConfig(cache_dir="/custom/cache/path")
7788
print(detect("Hello", model='full', config=cfg))
89+
90+
# Set a default model via config and let calls omit model
91+
cfg_lite = LangDetectConfig(model="lite")
92+
print(detect("Hello", config=cfg_lite)) # uses lite by default
93+
print(detect("Bonjour", config=cfg_lite)) # uses lite by default
94+
print(detect("Hello", model='full', config=cfg_lite)) # per-call override to full
95+
7896
```
7997

8098
### Native API (Recommended)
8199

82100
```python
83-
from fast_langdetect import detect, LangDetector, LangDetectConfig, DetectError
101+
from fast_langdetect import detect, LangDetector, LangDetectConfig
84102

85-
# Simple detection (auto behavior)
86-
print(detect("Hello, world!", model='auto', k=1))
103+
# Simple detection (uses config default if not provided; defaults to 'auto')
104+
print(detect("Hello, world!", k=1))
87105
# Output: [{'lang': 'en', 'score': 0.98}]
88106

89107
# Using full model for better accuracy
90108
print(detect("Hello, world!", model='full', k=1))
91109
# Output: [{'lang': 'en', 'score': 0.99}]
92110

93111
# Custom configuration
94-
config = LangDetectConfig(cache_dir="/custom/cache/path") # Custom model cache directory
112+
config = LangDetectConfig(cache_dir="/custom/cache/path", model="auto") # Custom cache + default model
95113
detector = LangDetector(config)
96114

97-
try:
98-
result = detector.detect("Hello world", model='full', k=1)
99-
print(result) # [{'lang': 'en', 'score': 0.98}]
100-
except DetectError as e:
101-
print(f"Detection failed: {e}")
115+
# Omit model to use config.model; pass model to override
116+
result = detector.detect("Hello world", k=1)
117+
print(result) # [{'lang': 'en', 'score': 0.98}]
102118

103119
# Multiline text is handled automatically (newlines are replaced)
104120
multiline_text = "Hello, world!\nThis is a multiline text."
@@ -121,10 +137,16 @@ print(results)
121137

122138
#### Fallback Policy (Keep It Simple)
123139

124-
- Only MemoryError triggers fallback (in `model='auto'`): when loading the full model runs out of memory, it falls back to the lite model.
125-
- I/O/network/permission/path/integrity errors raise `DetectError` (with original exception) — no silent fallback.
140+
- Only `MemoryError` triggers fallback (in `model='auto'`): when loading the full model runs out of memory, it falls back to the lite model.
141+
- I/O/network/permission/path/integrity errors raise standard exceptions (e.g., `FileNotFoundError`, `PermissionError`) or library-specific errors where applicable — no silent fallback.
126142
- `model='lite'` and `model='full'` never fallback by design.
127143

144+
#### Errors
145+
146+
- Base error: `FastLangdetectError` (library-specific failures).
147+
- Model loading failures: `ModelLoadError`.
148+
- Standard Python exceptions (e.g., `ValueError`, `TypeError`, `FileNotFoundError`, `MemoryError`) propagate when they are not library-specific.
149+
128150
### Convenient `detect_language` Function
129151

130152
```python
@@ -177,7 +199,7 @@ print(detector.detect("Some very long text..."))
177199
### Cache Directory Behavior
178200

179201
- Default cache: if `cache_dir` is not set, models are stored under a system temp-based directory specified by `FTLANG_CACHE` or an internal default. This directory is created automatically when needed.
180-
- User-provided cache_dir: if you set `LangDetectConfig(cache_dir=...)` to a path that does not exist, the library raises `DetectError` instead of silently creating or using another location. Create the directory yourself if that’s intended.
202+
- User-provided cache_dir: if you set `LangDetectConfig(cache_dir=...)` to a path that does not exist, the library raises `FileNotFoundError` instead of silently creating or using another location. Create the directory yourself if that’s intended.
181203

182204
### Advanced Options (Optional)
183205

feature_test/__init__.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
detect_language,
77
LangDetector,
88
LangDetectConfig,
9-
DetectError,
109
)
1110

1211
# 多语言候选(使用 full 模型,返回前 5 个候选)
@@ -28,7 +27,7 @@
2827
)
2928
)
3029

31-
# 当离线或无网络时,使用 full 模型会抛出 DetectError;lite 模型离线可用
30+
# 当离线或无网络时,使用 full 模型可能抛出标准 I/O/网络异常或库内异常;lite 模型离线可用
3231
try:
3332
print(
3433
detect(
@@ -38,7 +37,7 @@
3837
config=LangDetectConfig(),
3938
)
4039
)
41-
except DetectError as e:
40+
except Exception as e:
4241
print(f"Detection failed: {e}")
4342

4443
# 使用自定义配置与实例化 Detector

src/fast_langdetect/__init__.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
11
# -*- coding: utf-8 -*-
22
# @Time : 2024/1/17 下午4:00
33

4-
from .infer import LangDetector, LangDetectConfig, DetectError # noqa: F401
5-
from .infer import detect
4+
from .infer import (
5+
LangDetector,
6+
LangDetectConfig,
7+
detect,
8+
FastLangdetectError,
9+
ModelLoadError,
10+
) # noqa: F401
611

712

813
def is_japanese(string):

src/fast_langdetect/infer.py

Lines changed: 48 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,13 @@
2727
_LOCAL_SMALL_MODEL_PATH = Path(__file__).parent / "resources" / "lid.176.ftz"
2828

2929

30-
class DetectError(Exception):
31-
"""Base exception for language detection errors."""
30+
class FastLangdetectError(Exception):
31+
"""Base exception for library-specific failures."""
32+
pass
33+
3234

35+
class ModelLoadError(FastLangdetectError):
36+
"""Raised when a FastText model fails to load."""
3337
pass
3438

3539

@@ -46,7 +50,8 @@ def download(url: str, save_path: Path, proxy: Optional[str] = None) -> None:
4650
:param proxy: Optional proxy URL
4751
4852
:raises:
49-
DetectError: If download fails
53+
FastLangdetectError: If download fails
54+
FileNotFoundError: If a user-provided cache directory does not exist
5055
"""
5156
if save_path.exists():
5257
logger.info(f"fast-langdetect: Model exists at {save_path}")
@@ -62,10 +67,12 @@ def download(url: str, save_path: Path, proxy: Optional[str] = None) -> None:
6267
try:
6368
parent_dir.mkdir(parents=True, exist_ok=True)
6469
except Exception as e:
65-
raise DetectError(f"fast-langdetect: Cannot create cache directory {parent_dir}: {e}") from e
70+
raise FastLangdetectError(
71+
f"fast-langdetect: Cannot create cache directory {parent_dir}: {e}"
72+
) from e
6673
else:
6774
# For user-specified cache_dir, do not fallback; raise
68-
raise DetectError(f"fast-langdetect: Cache directory not found: {parent_dir}")
75+
raise FileNotFoundError(f"fast-langdetect: Cache directory not found: {parent_dir}")
6976
try:
7077
download(
7178
url=url,
@@ -77,7 +84,8 @@ def download(url: str, save_path: Path, proxy: Optional[str] = None) -> None:
7784
timeout=7,
7885
)
7986
except Exception as e:
80-
raise DetectError(f"fast-langdetect: Download failed: {e}") from e
87+
# Download failures are library-specific
88+
raise FastLangdetectError(f"fast-langdetect: Download failed: {e}") from e
8189

8290

8391
class ModelLoader:
@@ -89,7 +97,8 @@ def __init__(self):
8997
def load_local(self, model_path: Path) -> Any:
9098
"""Load model from local file."""
9199
if not model_path.exists():
92-
raise DetectError(f"Model file not found: {model_path}")
100+
# Missing path is a standard I/O error
101+
raise FileNotFoundError(f"Model file not found: {model_path}")
93102

94103
if platform.system() == "Windows":
95104
return self._load_windows_compatible(model_path)
@@ -112,7 +121,7 @@ def _load_windows_compatible(self, model_path: Path) -> Any:
112121
113122
:param model_path: Path to the model file
114123
:return: Loaded FastText model
115-
:raises DetectError: If all loading strategies fail
124+
:raises ModelLoadError: If all loading strategies fail
116125
"""
117126
model_path_str = str(model_path.resolve())
118127

@@ -142,7 +151,7 @@ def _load_windows_compatible(self, model_path: Path) -> Any:
142151
shutil.copy2(model_path, tmp_path)
143152
return fasttext.load_model(tmp_path)
144153
except Exception as e:
145-
raise DetectError(f"Failed to load model using temporary file: {e}") from e
154+
raise ModelLoadError(f"Failed to load model using temporary file: {e}") from e
146155
finally:
147156
# Clean up temporary file
148157
if tmp_path and os.path.exists(tmp_path):
@@ -166,7 +175,7 @@ def _load_unix(self, model_path: Path) -> Any:
166175
# Let MemoryError propagate up to be handled by _get_model
167176
raise e
168177
except Exception as e:
169-
raise DetectError(f"fast-langdetect: Failed to load model: {e}") from e
178+
raise ModelLoadError(f"fast-langdetect: Failed to load model: {e}") from e
170179

171180

172181
class LangDetectConfig:
@@ -178,6 +187,7 @@ class LangDetectConfig:
178187
:param proxy: HTTP proxy for downloads
179188
:param normalize_input: Whether to normalize input text (e.g. lowercase for uppercase text)
180189
:param max_input_length: If set, truncate input to this many characters (always debug-log the change)
190+
:param model: Default model selection ('auto' | 'full' | 'lite') used when detect() is called without a model
181191
"""
182192

183193
def __init__(
@@ -187,13 +197,15 @@ def __init__(
187197
proxy: Optional[str] = None,
188198
normalize_input: bool = True,
189199
max_input_length: Optional[int] = 80,
200+
model: Literal["lite", "full", "auto"] = "auto",
190201
):
191202
self.cache_dir = cache_dir or CACHE_DIRECTORY
192203
self.custom_model_path = custom_model_path
193204
self.proxy = proxy
194205
self.normalize_input = normalize_input
195206
# Input handling
196207
self.max_input_length = max_input_length
208+
self.model: Literal["lite", "full", "auto"] = model
197209
if self.custom_model_path and not Path(self.custom_model_path).exists():
198210
raise FileNotFoundError(f"fast-langdetect: Target model file not found: {self.custom_model_path}")
199211

@@ -285,41 +297,17 @@ def _get_model(self, low_memory: bool = True, *, fallback_on_memory_error: bool
285297
self._models[cache_key] = model
286298
return model
287299
except MemoryError as e:
288-
if low_memory is not True and fallback_on_memory_error:
300+
if (not low_memory) and fallback_on_memory_error:
289301
logger.info("fast-langdetect: Falling back to low-memory model...")
290302
return self._get_model(low_memory=True, fallback_on_memory_error=False)
291-
raise DetectError("Failed to load model") from e
292-
293-
def detect(self, text: str) -> Dict[str, Union[str, float]]:
294-
"""
295-
Detect primary language of text.
296-
297-
:param text: Input text
298-
299-
:return: Dictionary with language and confidence score
300-
301-
:raises:
302-
DetectError: If detection fails
303-
"""
304-
# Default to smart behavior: try large, fallback on MemoryError
305-
model = self._get_model(low_memory=False, fallback_on_memory_error=True)
306-
text = self._preprocess_text(text)
307-
normalized_text = self._normalize_text(text, self.config.normalize_input)
308-
try:
309-
labels, scores = model.predict(normalized_text)
310-
return {
311-
"lang": labels[0].replace("__label__", ""),
312-
"score": min(float(scores[0]), 1.0),
313-
}
314-
except Exception as e:
315-
logger.error(f"fast-langdetect: Language detection error: {e}")
316-
raise DetectError("Language detection failed") from e
303+
# Preserve original MemoryError and traceback
304+
raise
317305

318306
def detect(
319307
self,
320308
text: str,
321309
*,
322-
model: Literal["lite", "full", "auto"] = "auto",
310+
model: Optional[Literal["lite", "full", "auto"]] = None,
323311
k: int = 1,
324312
threshold: float = 0.0,
325313
) -> List[Dict[str, Any]]:
@@ -330,34 +318,37 @@ def detect(
330318
:param model: 'lite' | 'full' | 'auto' (auto falls back on MemoryError)
331319
:param k: Number of top languages to return
332320
:param threshold: Minimum confidence threshold
333-
:raises DetectError: On detection failures
321+
:raises FastLangdetectError: For library-specific failures (e.g., invalid model)
322+
:raises Exception: Standard Python exceptions propagate, such as MemoryError, FileNotFoundError
334323
"""
335-
if model not in {"lite", "full", "auto"}:
336-
raise DetectError(f"Invalid model: {model}")
324+
# Determine model to use (config default if not provided)
325+
sel_model: Literal["lite", "full", "auto"]
326+
if model is None:
327+
sel_model = self.config.model
328+
else:
329+
if model not in {"lite", "full", "auto"}: # type: ignore[comparison-overlap]
330+
raise FastLangdetectError(f"Invalid model: {model}")
331+
sel_model = model
337332

338333
# Select model backend
339-
if model == "lite":
334+
if sel_model == "lite":
340335
ft_model = self._get_model(low_memory=True, fallback_on_memory_error=False)
341-
elif model == "full":
336+
elif sel_model == "full":
342337
ft_model = self._get_model(low_memory=False, fallback_on_memory_error=False)
343338
else:
344339
ft_model = self._get_model(low_memory=False, fallback_on_memory_error=True)
345340

346341
text = self._preprocess_text(text)
347342
normalized_text = self._normalize_text(text, self.config.normalize_input)
348-
try:
349-
labels, scores = ft_model.predict(normalized_text, k=k, threshold=threshold)
350-
results = [
351-
{
352-
"lang": label.replace("__label__", ""),
353-
"score": min(float(score), 1.0),
354-
}
355-
for label, score in zip(labels, scores)
356-
]
357-
return sorted(results, key=lambda x: x["score"], reverse=True)
358-
except Exception as e:
359-
logger.error(f"fast-langdetect: Detection error: {e}")
360-
raise DetectError("Language detection failed") from e
343+
labels, scores = ft_model.predict(normalized_text, k=k, threshold=threshold)
344+
results = [
345+
{
346+
"lang": label.replace("__label__", ""),
347+
"score": min(float(score), 1.0),
348+
}
349+
for label, score in zip(labels, scores)
350+
]
351+
return sorted(results, key=lambda x: x["score"], reverse=True)
361352

362353

363354
# Global instance for simple usage
@@ -367,7 +358,7 @@ def detect(
367358
def detect(
368359
text: str,
369360
*,
370-
model: Literal["lite", "full", "auto"] = "auto",
361+
model: Optional[Literal["lite", "full", "auto"]] = None,
371362
k: int = 1,
372363
threshold: float = 0.0,
373364
config: Optional[LangDetectConfig] = None,

tests/test_real_detection.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
detect,
66
LangDetector,
77
LangDetectConfig,
8-
DetectError,
98
)
109

1110
# Test samples with known languages
@@ -96,7 +95,7 @@ def test_not_found_model_without_fallback_on_io_error(self):
9695
cache_dir="/nonexistent/path",
9796
)
9897
detector = LangDetector(config)
99-
with pytest.raises(DetectError):
98+
with pytest.raises(FileNotFoundError):
10099
detector.detect("Hello world", model="full", k=1)
101100

102101
@pytest.mark.real

0 commit comments

Comments (0)