|
74 | 74 | except DetectError as e: |
75 | 75 | print(f"Detection failed: {e}") |
76 | 76 |
|
77 | | -# How to deal with multiline text |
78 | | -multiline_text = """ |
79 | | -Hello, world! |
80 | | -This is a multiline text. |
81 | | -""" |
82 | | -multiline_text = multiline_text.replace("\n", " ") |
| 77 | +# Multiline text is handled automatically (newlines are replaced) |
| 78 | +multiline_text = "Hello, world!\nThis is a multiline text." |
83 | 79 | print(detect(multiline_text)) |
84 | | -# Output: {'lang': 'en', 'score': 0.8509423136711121} |
| 80 | +# Output: {'lang': 'en', 'score': 0.85} |
85 | 81 |
|
86 | 82 | # Multi-language detection |
87 | 83 | results = detect_multilingual( |
@@ -151,6 +147,74 @@ result = detector.detect("Hello world") |
151 | 147 | For text splitting based on language, please refer to the [split-lang](https://github.com/DoodleBears/split-lang) |
152 | 148 | repository. |
153 | 149 |
|
| 150 | + |
| 151 | +### Input Handling |
| 152 | + |
| 153 | +You can control log verbosity and input normalization via `LangDetectConfig`: |
| 154 | + |
| 155 | +```python |
| 156 | +from fast_langdetect import LangDetectConfig, LangDetector |
| 157 | + |
| 158 | +config = LangDetectConfig( |
| 159 | + max_input_length=80, # default: auto-truncate long inputs for stable results |
| 160 | +) |
| 161 | +detector = LangDetector(config) |
| 162 | +print(detector.detect("Some very long text...")) |
| 163 | +``` |
| 164 | + |
| 165 | +- Newlines are always replaced with spaces to avoid FastText errors; this happens silently, without logging.
| 166 | +- When truncation happens, a WARNING is logged because it may reduce accuracy. |
| 167 | +- `max_input_length=80` truncates overly long inputs; set it to `None` to disable truncation entirely.
| 168 | + |
| 169 | +### Fallback Behavior |
| 170 | + |
| 171 | +- As of the latest change, the library falls back to the bundled small model only when a `MemoryError` occurs while loading the large model.
| 172 | +- For other errors (e.g., I/O/permission errors, corrupted files, invalid paths), the error is raised as `DetectError` so you can diagnose the root cause quickly. |
| 173 | +- This avoids silently masking real issues and prevents unnecessary re-downloads that can slow execution. |
| 174 | + |
| 175 | +### Language Codes → English Names |
| 176 | + |
| 177 | +The detector returns fastText language codes (e.g., `en`, `zh`, `ja`, `pt-br`). To present user-friendly names, you can map codes to English names using a third-party library. Example using `langcodes`: |
| 178 | + |
| 179 | +```python |
| 180 | +# pip install langcodes |
| 181 | +from langcodes import Language |
| 182 | + |
| 183 | +OVERRIDES = { |
| 184 | + # fastText-specific or variant tags commonly used |
| 185 | + "yue": "Cantonese", |
| 186 | + "wuu": "Wu Chinese", |
| 187 | + "arz": "Egyptian Arabic", |
| 188 | + "ckb": "Central Kurdish", |
| 189 | + "kab": "Kabyle", |
| 190 | + "zh-cn": "Chinese (China)", |
| 191 | + "zh-tw": "Chinese (Taiwan)", |
| 192 | + "pt-br": "Portuguese (Brazil)", |
| 193 | +} |
| 194 | + |
| 195 | +def code_to_english_name(code: str) -> str: |
| 196 | + code = code.replace("_", "-").lower() |
| 197 | + if code in OVERRIDES: |
| 198 | + return OVERRIDES[code] |
| 199 | + try: |
| 200 | + # Display name in English; e.g. 'Portuguese (Brazil)' |
| 201 | + return Language.get(code).display_name("en") |
| 202 | + except Exception: |
| 203 | + # Try the base language (e.g., 'pt' from 'pt-br') |
| 204 | + base = code.split("-")[0] |
| 205 | + try: |
| 206 | + return Language.get(base).display_name("en") |
| 207 | + except Exception: |
| 208 | + return code |
| 209 | + |
| 210 | +# Usage |
| 211 | +from fast_langdetect import detect |
| 212 | +result = detect("Olá mundo", low_memory=False) |
| 213 | +print(code_to_english_name(result["lang"])) # Portuguese (Brazil) or Portuguese |
| 214 | +``` |
| 215 | + |
| 216 | +Alternatively, use `pycountry` (install with `pip install pycountry`) for ISO 639 lookups, combined with a small override dict for non-standard tags such as `pt-br`, `zh-cn`, and `yue`.
| 217 | + |
154 | 218 | ## Benchmark 📊 |
155 | 219 |
|
156 | 220 | For detailed benchmark results, refer |
@@ -180,3 +244,12 @@ models |
180 | 244 | year={2016} |
181 | 245 | } |
182 | 246 | ``` |
| 247 | + |
| 248 | +## License 📄 |
| 249 | + |
| 250 | +- Code: Released under the MIT License (see `LICENSE`). |
| 251 | +- Models: This package uses the pre-trained fastText language identification models (`lid.176.ftz` bundled for offline use and `lid.176.bin` downloaded as needed). These models are licensed under the Creative Commons Attribution-ShareAlike 3.0 (CC BY-SA 3.0) license. |
| 252 | +- Attribution: fastText language identification models by Facebook AI Research. See the fastText docs and license for details: |
| 253 | + - https://fasttext.cc/docs/en/language-identification.html |
| 254 | + - https://creativecommons.org/licenses/by-sa/3.0/ |
| 255 | +- Note: If you redistribute or modify the model files, you must comply with CC BY-SA 3.0. Inference usage via this library does not change the license of the model files themselves. |
0 commit comments