🔧 refactor(infer.py): integrate max_input_length for auto-truncation and improve parameter deprecation warnings

sudoskys · sudoskys · commit d7d255c7ca14 · 2025-09-15T20:40:50.000+08:00
📚 docs(README.md): update usage examples and add input handling details
🔖 chore(pyproject.toml): bump version to 0.4.0 for new features and improvements
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,11 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+## [0.4.0] - 2025-09-15
+
+- Behavior: Always replace newline characters in input with spaces to prevent FastText errors. This adjustment is silent (nothing is logged).
+- Default input truncation: Truncate inputs to 80 characters by default for stable predictions. Configurable via `LangDetectConfig(max_input_length=...)`; set `None` to disable.
+- Simplified config: Removed previously proposed `verbose` and `replace_newlines` options; newline replacement is unconditional and logging of adjustments is controlled by global logger level.
+- Logging: Deprecated-parameter messages shortened to reduce noise; they are still emitted at WARNING level.
+- Documentation: README now includes language code → name mapping guidance and an explicit model license note (CC BY-SA 3.0) alongside MIT for code.
diff --git a/README.md b/README.md
@@ -74,14 +74,10 @@ try:
 except DetectError as e:
     print(f"Detection failed: {e}")
 
-# How to deal with multiline text
-multiline_text = """
-Hello, world!
-This is a multiline text.
-"""
-multiline_text = multiline_text.replace("\n", " ")  
+# Multiline text is handled automatically (newlines are replaced)
+multiline_text = "Hello, world!\nThis is a multiline text."
 print(detect(multiline_text))
-# Output: {'lang': 'en', 'score': 0.8509423136711121}
+# Output: {'lang': 'en', 'score': 0.85}
 
 # Multi-language detection
 results = detect_multilingual(
@@ -151,6 +147,25 @@ result = detector.detect("Hello world")
 For text splitting based on language, please refer to the [split-lang](https://github.com/DoodleBears/split-lang)
 repository.
 
+
+### Input Handling
+
+You can control input handling (such as automatic truncation of long inputs) via `LangDetectConfig`:
+
+```python
+from fast_langdetect import LangDetectConfig, LangDetector
+
+config = LangDetectConfig(
+    max_input_length=80,    # default: auto-truncate long inputs for stable results
+)
+detector = LangDetector(config)
+print(detector.detect("Some very long text..."))
+```
+
+- Newlines are always replaced with spaces to avoid FastText errors (silent, no log).
+- When truncation happens, a WARNING is logged because it may reduce accuracy.
+- `max_input_length=80` truncates overly long inputs; set `None` to disable if you prefer no truncation.
+
 ## Benchmark 📊
 
 For detailed benchmark results, refer
@@ -180,3 +195,12 @@ models
   year={2016}
 }
 ```
+
+## License 📄
+
+- Code: Released under the MIT License (see `LICENSE`).
+- Models: This package uses the pre-trained fastText language identification models (`lid.176.ftz` bundled for offline use and `lid.176.bin` downloaded as needed). These models are licensed under the Creative Commons Attribution-ShareAlike 3.0 (CC BY-SA 3.0) license.
+- Attribution: fastText language identification models by Facebook AI Research. See the fastText docs and license for details:
+  - https://fasttext.cc/docs/en/language-identification.html
+  - https://creativecommons.org/licenses/by-sa/3.0/
+- Note: If you redistribute or modify the model files, you must comply with CC BY-SA 3.0. Inference usage via this library does not change the license of the model files themselves.
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "fast-langdetect"
-version = "0.3.2"
+version = "0.4.0"
 description = "Quickly detect text language and segment language"
 authors = [
     { name = "sudoskys", email = "coldlando@hotmail.com" },
diff --git a/src/fast_langdetect/infer.py b/src/fast_langdetect/infer.py
@@ -207,6 +207,7 @@ class LangDetectConfig:
     :param allow_fallback: Whether to fallback to small model
     :param disable_verify: Whether to disable MD5 verification
     :param normalize_input: Whether to normalize input text (e.g. lowercase for uppercase text)
+    :param max_input_length: If set, truncate input to this many characters (a WARNING is logged when truncation occurs); None disables truncation
     """
 
     def __init__(
@@ -218,6 +219,7 @@ def __init__(
             disable_verify: bool = False,
             verify_hash: Optional[str] = None,
             normalize_input: bool = True,
+            max_input_length: Optional[int] = 80,
     ):
         self.cache_dir = cache_dir or CACHE_DIRECTORY
         self.custom_model_path = custom_model_path
@@ -227,6 +229,8 @@ def __init__(
         self.disable_verify = disable_verify
         self.verify_hash = verify_hash
         self.normalize_input = normalize_input
+        # Input handling
+        self.max_input_length = max_input_length
         if self.custom_model_path and not Path(self.custom_model_path).exists():
             raise FileNotFoundError(f"fast-langdetect: Target model file not found: {self.custom_model_path}")
 
@@ -245,25 +249,23 @@ def __init__(self, config: Optional[LangDetectConfig] = None):
         self.config = config or LangDetectConfig()
         self._model_loader = ModelLoader()
 
-    @staticmethod
-    def _preprocess_text(text: str) -> str:
+    def _preprocess_text(self, text: str) -> str:
         """
         Check text for newline characters and length.
 
         :param text: Input text
         :return: Processed text
         """
-        if len(text) > 100:
-            logger.warning(
-                "fast-langdetect: Text may be too long. "
-                "Consider passing only a single sentence for accurate prediction."
-            )
+        # Always replace newline characters to avoid FastText errors (silent)
         if "\n" in text:
+            text = text.replace("\n", " ")
+
+        # Auto-truncate overly long input if configured
+        if self.config.max_input_length is not None and len(text) > self.config.max_input_length:
             logger.warning(
-                "fast-langdetect: Newline characters will be removed. "
-                "Input should not contain newline characters. or FastText will raise an error."
+                f"fast-langdetect: Truncating input from {len(text)} to {self.config.max_input_length} characters; may reduce accuracy."
             )
-            text = text.replace("\n", " ")
+            text = text[: self.config.max_input_length]
         return text
 
     @staticmethod
@@ -427,11 +429,11 @@ def detect(
         use_strict_mode,
     ])
     if has_custom_params:
-        # Show warning if using individual parameters
+        # Warn on deprecated individual parameters
         logger.warning(
-            "fast-langdetect: Using individual parameters is deprecated. "
-            "Consider using LangDetectConfig for better configuration management. "
-            "Will be removed in next major release. see https://github.com/LlmKira/fast-langdetect/pull/16"
+            "fast-langdetect: Individual parameters are deprecated. "
+            "Use LangDetectConfig for configuration. "
+            "See https://github.com/LlmKira/fast-langdetect/pull/16"
         )
         custom_config = LangDetectConfig(
             proxy=model_download_proxy,
@@ -482,11 +484,11 @@ def detect_multilingual(
         use_strict_mode,
     ])
     if has_custom_params:
-        # Show warning if using individual parameters
+        # Warn on deprecated individual parameters
         logger.warning(
-            "fast-langdetect: Using individual parameters is deprecated. "
-            "Consider using LangDetectConfig for better configuration management. "
-            "Will be removed in next major release. see https://github.com/LlmKira/fast-langdetect/pull/16"
+            "fast-langdetect: Individual parameters are deprecated. "
+            "Use LangDetectConfig for configuration. "
+            "See https://github.com/LlmKira/fast-langdetect/pull/16"
         )
         custom_config = LangDetectConfig(
             proxy=model_download_proxy,