Skip to content

Commit d7d255c

Browse files
committed
🔧 refactor(infer.py): integrate max_input_length for auto-truncation and improve parameter deprecation warnings
📚 docs(README.md): update usage examples and add input handling details 🔖 chore(pyproject.toml): bump version to 0.4.0 for new features and improvements
1 parent 42ea53f commit d7d255c

File tree

4 files changed

+63
-26
lines changed

4 files changed

+63
-26
lines changed

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Changelog
2+
3+
All notable changes to this project will be documented in this file.
4+
5+
## [0.4.0] - 2025-09-15
6+
7+
- Behavior: Always replace newline characters in input with spaces to prevent FastText errors. The replacement is silent (nothing is logged).
8+
- Default input truncation: Truncate inputs to 80 characters by default for stable predictions. Configurable via `LangDetectConfig(max_input_length=...)`; set `None` to disable.
9+
- Simplified config: Removed previously proposed `verbose` and `replace_newlines` options; newline replacement is unconditional and logging of adjustments is controlled by global logger level.
10+
- Logging: Deprecated-parameter messages shortened to reduce noise; they are still emitted at WARNING level.
11+
- Documentation: README now includes language code → name mapping guidance and an explicit model license note (CC BY-SA 3.0) alongside MIT for code.

README.md

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -74,14 +74,10 @@ try:
7474
except DetectError as e:
7575
print(f"Detection failed: {e}")
7676

77-
# How to deal with multiline text
78-
multiline_text = """
79-
Hello, world!
80-
This is a multiline text.
81-
"""
82-
multiline_text = multiline_text.replace("\n", " ")
77+
# Multiline text is handled automatically (newlines are replaced)
78+
multiline_text = "Hello, world!\nThis is a multiline text."
8379
print(detect(multiline_text))
84-
# Output: {'lang': 'en', 'score': 0.8509423136711121}
80+
# Output: {'lang': 'en', 'score': 0.85}
8581

8682
# Multi-language detection
8783
results = detect_multilingual(
@@ -151,6 +147,25 @@ result = detector.detect("Hello world")
151147
For text splitting based on language, please refer to the [split-lang](https://github.com/DoodleBears/split-lang)
152148
repository.
153149

150+
151+
### Input Handling
152+
153+
You can control input truncation via `LangDetectConfig` (log output follows the global logger level):
154+
155+
```python
156+
from fast_langdetect import LangDetectConfig, LangDetector
157+
158+
config = LangDetectConfig(
159+
max_input_length=80, # default: auto-truncate long inputs for stable results
160+
)
161+
detector = LangDetector(config)
162+
print(detector.detect("Some very long text..."))
163+
```
164+
165+
- Newlines are always replaced with spaces to avoid FastText errors (silent, no log).
166+
- When truncation happens, a WARNING is logged because it may reduce accuracy.
167+
- `max_input_length=80` truncates overly long inputs; set `None` to disable if you prefer no truncation.
168+
154169
## Benchmark 📊
155170

156171
For detailed benchmark results, refer
@@ -180,3 +195,12 @@ models
180195
year={2016}
181196
}
182197
```
198+
199+
## License 📄
200+
201+
- Code: Released under the MIT License (see `LICENSE`).
202+
- Models: This package uses the pre-trained fastText language identification models (`lid.176.ftz` bundled for offline use and `lid.176.bin` downloaded as needed). These models are licensed under the Creative Commons Attribution-ShareAlike 3.0 (CC BY-SA 3.0) license.
203+
- Attribution: fastText language identification models by Facebook AI Research. See the fastText docs and license for details:
204+
- https://fasttext.cc/docs/en/language-identification.html
205+
- https://creativecommons.org/licenses/by-sa/3.0/
206+
- Note: If you redistribute or modify the model files, you must comply with CC BY-SA 3.0. Inference usage via this library does not change the license of the model files themselves.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "fast-langdetect"
3-
version = "0.3.2"
3+
version = "0.4.0"
44
description = "Quickly detect text language and segment language"
55
authors = [
66
{ name = "sudoskys", email = "coldlando@hotmail.com" },

src/fast_langdetect/infer.py

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,7 @@ class LangDetectConfig:
207207
:param allow_fallback: Whether to fallback to small model
208208
:param disable_verify: Whether to disable MD5 verification
209209
:param normalize_input: Whether to normalize input text (e.g. lowercase for uppercase text)
210+
:param max_input_length: If set, truncate input to this many characters (a WARNING is logged when truncation occurs); ``None`` disables truncation
210211
"""
211212

212213
def __init__(
@@ -218,6 +219,7 @@ def __init__(
218219
disable_verify: bool = False,
219220
verify_hash: Optional[str] = None,
220221
normalize_input: bool = True,
222+
max_input_length: Optional[int] = 80,
221223
):
222224
self.cache_dir = cache_dir or CACHE_DIRECTORY
223225
self.custom_model_path = custom_model_path
@@ -227,6 +229,8 @@ def __init__(
227229
self.disable_verify = disable_verify
228230
self.verify_hash = verify_hash
229231
self.normalize_input = normalize_input
232+
# Input handling
233+
self.max_input_length = max_input_length
230234
if self.custom_model_path and not Path(self.custom_model_path).exists():
231235
raise FileNotFoundError(f"fast-langdetect: Target model file not found: {self.custom_model_path}")
232236

@@ -245,25 +249,23 @@ def __init__(self, config: Optional[LangDetectConfig] = None):
245249
self.config = config or LangDetectConfig()
246250
self._model_loader = ModelLoader()
247251

248-
@staticmethod
249-
def _preprocess_text(text: str) -> str:
252+
def _preprocess_text(self, text: str) -> str:
250253
"""
251254
Replace newline characters with spaces and truncate overly long text.
252255
253256
:param text: Input text
254257
:return: Processed text
255258
"""
256-
if len(text) > 100:
257-
logger.warning(
258-
"fast-langdetect: Text may be too long. "
259-
"Consider passing only a single sentence for accurate prediction."
260-
)
259+
# Always replace newline characters to avoid FastText errors (silent)
261260
if "\n" in text:
261+
text = text.replace("\n", " ")
262+
263+
# Auto-truncate overly long input if configured
264+
if self.config.max_input_length is not None and len(text) > self.config.max_input_length:
262265
logger.warning(
263-
"fast-langdetect: Newline characters will be removed. "
264-
"Input should not contain newline characters. or FastText will raise an error."
266+
f"fast-langdetect: Truncating input from {len(text)} to {self.config.max_input_length} characters; may reduce accuracy."
265267
)
266-
text = text.replace("\n", " ")
268+
text = text[: self.config.max_input_length]
267269
return text
268270

269271
@staticmethod
@@ -427,11 +429,11 @@ def detect(
427429
use_strict_mode,
428430
])
429431
if has_custom_params:
430-
# Show warning if using individual parameters
432+
# Warn on deprecated individual parameters
431433
logger.warning(
432-
"fast-langdetect: Using individual parameters is deprecated. "
433-
"Consider using LangDetectConfig for better configuration management. "
434-
"Will be removed in next major release. see https://github.com/LlmKira/fast-langdetect/pull/16"
434+
"fast-langdetect: Individual parameters are deprecated. "
435+
"Use LangDetectConfig for configuration. "
436+
"See https://github.com/LlmKira/fast-langdetect/pull/16"
435437
)
436438
custom_config = LangDetectConfig(
437439
proxy=model_download_proxy,
@@ -482,11 +484,11 @@ def detect_multilingual(
482484
use_strict_mode,
483485
])
484486
if has_custom_params:
485-
# Show warning if using individual parameters
487+
# Warn on deprecated individual parameters
486488
logger.warning(
487-
"fast-langdetect: Using individual parameters is deprecated. "
488-
"Consider using LangDetectConfig for better configuration management. "
489-
"Will be removed in next major release. see https://github.com/LlmKira/fast-langdetect/pull/16"
489+
"fast-langdetect: Individual parameters are deprecated. "
490+
"Use LangDetectConfig for configuration. "
491+
"See https://github.com/LlmKira/fast-langdetect/pull/16"
490492
)
491493
custom_config = LangDetectConfig(
492494
proxy=model_download_proxy,

0 commit comments

Comments
 (0)