|
74 | 74 | except DetectError as e: |
75 | 75 | print(f"Detection failed: {e}") |
76 | 76 |
|
77 | | -# How to deal with multiline text |
78 | | -multiline_text = """ |
79 | | -Hello, world! |
80 | | -This is a multiline text. |
81 | | -""" |
82 | | -multiline_text = multiline_text.replace("\n", " ") |
| 77 | +# Multiline text is handled automatically (newlines are replaced) |
| 78 | +multiline_text = "Hello, world!\nThis is a multiline text." |
83 | 79 | print(detect(multiline_text)) |
84 | | -# Output: {'lang': 'en', 'score': 0.8509423136711121} |
| 80 | +# Output: {'lang': 'en', 'score': 0.85} |
85 | 81 |
|
86 | 82 | # Multi-language detection |
87 | 83 | results = detect_multilingual( |
@@ -151,6 +147,74 @@ result = detector.detect("Hello world") |
151 | 147 | For text splitting based on language, please refer to the [split-lang](https://github.com/DoodleBears/split-lang) |
152 | 148 | repository. |
153 | 149 |
|
| 150 | + |
| 151 | +### Input Handling |
| 152 | + |
| 153 | +You can control log verbosity and input normalization via `LangDetectConfig`: |
| 154 | + |
| 155 | +```python |
| 156 | +from fast_langdetect import LangDetectConfig, LangDetector |
| 157 | + |
| 158 | +config = LangDetectConfig( |
| 159 | + max_input_length=80, # default: auto-truncate long inputs for stable results |
| 160 | +) |
| 161 | +detector = LangDetector(config) |
| 162 | +print(detector.detect("Some very long text...")) |
| 163 | +``` |
| 164 | + |
| 165 | +- Newlines are always replaced with spaces to avoid FastText errors; this happens silently, without logging.
| 166 | +- When truncation happens, a WARNING is logged because it may reduce accuracy. |
| 167 | +- `max_input_length=80` truncates overly long inputs; set it to `None` to disable truncation entirely.
| 168 | + |
| 169 | +### Fallback Behavior |
| 170 | + |
| 171 | +- As of the latest change, the library falls back to the bundled small model only when a `MemoryError` occurs while loading the large model.
| 172 | +- For other errors (e.g., I/O/permission errors, corrupted files, invalid paths), the error is raised as `DetectError` so you can diagnose the root cause quickly. |
| 173 | +- This avoids silently masking real issues and prevents unnecessary re-downloads that can slow execution. |
| 174 | + |
| 175 | +### Language Codes → English Names |
| 176 | + |
| 177 | +The detector returns fastText language codes (e.g., `en`, `zh`, `ja`, `pt-br`). To present user-friendly names, you can map codes to English names using a third-party library. Example using `langcodes`: |
| 178 | + |
| 179 | +```python |
| 180 | +# pip install langcodes |
| 181 | +from langcodes import Language |
| 182 | + |
| 183 | +OVERRIDES = { |
| 184 | + # fastText-specific or variant tags commonly used |
| 185 | + "yue": "Cantonese", |
| 186 | + "wuu": "Wu Chinese", |
| 187 | + "arz": "Egyptian Arabic", |
| 188 | + "ckb": "Central Kurdish", |
| 189 | + "kab": "Kabyle", |
| 190 | + "zh-cn": "Chinese (China)", |
| 191 | + "zh-tw": "Chinese (Taiwan)", |
| 192 | + "pt-br": "Portuguese (Brazil)", |
| 193 | +} |
| 194 | + |
| 195 | +def code_to_english_name(code: str) -> str: |
| 196 | + code = code.replace("_", "-").lower() |
| 197 | + if code in OVERRIDES: |
| 198 | + return OVERRIDES[code] |
| 199 | + try: |
| 200 | + # Display name in English; e.g. 'Portuguese (Brazil)' |
| 201 | + return Language.get(code).display_name("en") |
| 202 | + except Exception: |
| 203 | + # Try the base language (e.g., 'pt' from 'pt-br') |
| 204 | + base = code.split("-")[0] |
| 205 | + try: |
| 206 | + return Language.get(base).display_name("en") |
| 207 | + except Exception: |
| 208 | + return code |
| 209 | + |
| 210 | +# Usage |
| 211 | +from fast_langdetect import detect |
| 212 | +result = detect("Olá mundo", low_memory=False) |
| 213 | +print(code_to_english_name(result["lang"])) # Portuguese (Brazil) or Portuguese |
| 214 | +``` |
| 215 | + |
| 216 | +Alternatively, use `pycountry` (install with `pip install pycountry`) for ISO 639 lookups, combined with a small override dict for non-standard tags such as `pt-br`, `zh-cn`, and `yue`.
| 217 | + |
154 | 218 | ## Benchmark 📊 |
155 | 219 |
|
156 | 220 | For detailed benchmark results, refer |
@@ -180,3 +244,12 @@ models |
180 | 244 | year={2016} |
181 | 245 | } |
182 | 246 | ``` |
| 247 | + |
| 248 | +## License 📄 |
| 249 | + |
| 250 | +- Code: Released under the MIT License (see `LICENSE`). |
| 251 | +- Models: This package uses the pre-trained fastText language identification models (`lid.176.ftz` bundled for offline use and `lid.176.bin` downloaded as needed). These models are licensed under the Creative Commons Attribution-ShareAlike 3.0 (CC BY-SA 3.0) license. |
| 252 | +- Attribution: fastText language identification models by Facebook AI Research. See the fastText docs and license for details: |
| 253 | + - https://fasttext.cc/docs/en/language-identification.html |
| 254 | + - https://creativecommons.org/licenses/by-sa/3.0/ |
| 255 | +- Note: If you redistribute or modify the model files, you must comply with CC BY-SA 3.0. Inference usage via this library does not change the license of the model files themselves. |
0 commit comments