Skip to content

Commit f4fc032

Browse files
authored
Merge pull request #16 from LlmKira/dev-20250304
✨ feat(app): [Compatibility changes] add input normalization to language detection
2 parents 2f3f5cd + 0cddac3 commit f4fc032

File tree

7 files changed

+164
-79
lines changed

7 files changed

+164
-79
lines changed

README.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ In scenarios **where accuracy is important**, you should not rely on the detecti
4343

4444
### Prerequisites
4545

46-
- The "\n" character in the argument string must be removed before calling the function.
4746
- If the sample is too long or too short, the accuracy will be reduced.
4847
- The model will be downloaded to system temporary directory by default. You can customize it by:
4948
- Setting `FTLANG_CACHE` environment variable
@@ -79,7 +78,6 @@ except DetectError as e:
7978
multiline_text = """
8079
Hello, world!
8180
This is a multiline text.
82-
But we need remove \n characters or it will raise a DetectError.
8381
"""
8482
multiline_text = multiline_text.replace("\n", " ")
8583
print(detect(multiline_text))

feature_test/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@
3030
# When offline, it raises an error
3131
print(
3232
detect_multilingual(
33-
"Hello, world!你好世界!Привет, мир!", low_memory=False, use_strict_mode=True
33+
"Hello, world!你好世界!Привет, мир!",
34+
low_memory=False,
35+
config=LangDetectConfig(allow_fallback=True)
3436
)
3537
)
3638

src/fast_langdetect/__init__.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
# -*- coding: utf-8 -*-
22
# @Time : 2024/1/17 下午4:00
33

4+
from .infer import LangDetector, LangDetectConfig, DetectError # noqa: F401
45
from .infer import detect
56
from .infer import detect_multilingual # noqa: F401
6-
from .infer import LangDetector, LangDetectConfig, DetectError # noqa: F401
7+
78

89
def is_japanese(string):
910
for ch in string:
@@ -19,7 +20,7 @@ def detect_language(sentence: str, *, low_memory: bool = True):
1920
:param low_memory: bool (default: True) whether to use low memory mode
2021
:return: ZH, EN, JA, KO, FR, DE, ES, .... (two uppercase letters)
2122
"""
22-
lang_code = detect(sentence.lower(), low_memory=low_memory).get("lang").upper()
23+
lang_code = detect(sentence, low_memory=low_memory).get("lang").upper()
2324
if lang_code == "JA" and not is_japanese(sentence):
2425
lang_code = "ZH"
2526
return lang_code

src/fast_langdetect/infer.py

Lines changed: 141 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66
import hashlib
77
import logging
88
import os
9-
import tempfile
109
import platform
1110
import re
1211
import shutil
12+
import tempfile
1313
from pathlib import Path
1414
from typing import Dict, List, Optional, Union, Any
1515

@@ -143,29 +143,29 @@ def _load_windows_compatible(self, model_path: Path) -> Any:
143143
:raises DetectError: If all loading strategies fail
144144
"""
145145
model_path_str = str(model_path.resolve())
146-
146+
147147
# Try to load model directly
148148
try:
149149
return fasttext.load_model(model_path_str)
150150
except Exception as e:
151151
logger.debug(f"fast-langdetect: Load model failed: {e}")
152-
152+
153153
# Try to load model using relative path
154154
try:
155155
cwd = Path.cwd()
156156
rel_path = os.path.relpath(model_path, cwd)
157157
return fasttext.load_model(rel_path)
158158
except Exception as e:
159159
logger.debug(f"fast-langdetect: Failed to load model using relative path: {e}")
160-
160+
161161
# Use temporary file as last resort
162162
logger.debug(f"fast-langdetect: Using temporary file to load model: {model_path}")
163163
tmp_path = None
164164
try:
165165
# Use NamedTemporaryFile to create a temporary file
166166
tmp_fd, tmp_path = tempfile.mkstemp(suffix='.bin')
167167
os.close(tmp_fd) # Close file descriptor
168-
168+
169169
# Copy model file to temporary location
170170
shutil.copy2(model_path, tmp_path)
171171
return fasttext.load_model(tmp_path)
@@ -203,16 +203,18 @@ class LangDetectConfig:
203203
:param proxy: HTTP proxy for downloads
204204
:param allow_fallback: Whether to fallback to small model
205205
:param disable_verify: Whether to disable MD5 verification
206+
:param normalize_input: Whether to normalize input text (e.g. lowercase for uppercase text)
206207
"""
207208

208209
def __init__(
209-
self,
210-
cache_dir: Optional[str] = None,
211-
custom_model_path: Optional[str] = None,
212-
proxy: Optional[str] = None,
213-
allow_fallback: bool = True,
214-
disable_verify: bool = False,
215-
verify_hash: Optional[str] = None,
210+
self,
211+
cache_dir: Optional[str] = None,
212+
custom_model_path: Optional[str] = None,
213+
proxy: Optional[str] = None,
214+
allow_fallback: bool = True,
215+
disable_verify: bool = False,
216+
verify_hash: Optional[str] = None,
217+
normalize_input: bool = True,
216218
):
217219
self.cache_dir = cache_dir or CACHE_DIRECTORY
218220
self.custom_model_path = custom_model_path
@@ -221,9 +223,11 @@ def __init__(
221223
# Only verify large model
222224
self.disable_verify = disable_verify
223225
self.verify_hash = verify_hash
226+
self.normalize_input = normalize_input
224227
if self.custom_model_path and not Path(self.custom_model_path).exists():
225228
raise FileNotFoundError(f"fast-langdetect: Target model file not found: {self.custom_model_path}")
226229

230+
227231
class LangDetector:
228232
"""Language detector using FastText models."""
229233
VERIFY_FASTTEXT_LARGE_MODEL = "01810bc59c6a3d2b79c79e6336612f65"
@@ -238,6 +242,54 @@ def __init__(self, config: Optional[LangDetectConfig] = None):
238242
self.config = config or LangDetectConfig()
239243
self._model_loader = ModelLoader()
240244

245+
@staticmethod
246+
def _preprocess_text(text: str) -> str:
247+
"""
248+
Check text for newline characters and length.
249+
250+
:param text: Input text
251+
:return: Processed text
252+
"""
253+
if len(text) > 100:
254+
logger.warning(
255+
"fast-langdetect: Text may be too long. "
256+
"Consider passing only a single sentence for accurate prediction."
257+
)
258+
if "\n" in text:
259+
logger.warning(
260+
"fast-langdetect: Newline characters will be removed. "
261+
"Input should not contain newline characters. or FastText will raise an error."
262+
)
263+
text = text.replace("\n", " ")
264+
return text
265+
266+
@staticmethod
267+
def _normalize_text(text: str, should_normalize: bool = False) -> str:
268+
"""
269+
Normalize text based on configuration.
270+
271+
Currently, handles:
272+
- Removing newline characters for better prediction
273+
- Lowercasing uppercase text to prevent misdetection as Japanese
274+
275+
:param text: Input text
276+
:param should_normalize: Whether normalization should be applied
277+
:return: Normalized text
278+
"""
279+
# If no normalization is needed, return the text unchanged
280+
if not should_normalize:
281+
return text
282+
283+
# Check if text is all uppercase or mostly uppercase
284+
# https://github.com/LlmKira/fast-langdetect/issues/14
285+
if text.isupper() or (
286+
len(re.findall(r'[A-Z]', text)) > 0.8 * len(re.findall(r'[A-Za-z]', text))
287+
and len(text) > 5
288+
):
289+
return text.lower()
290+
291+
return text
292+
241293
def _get_model(self, low_memory: bool = True) -> Any:
242294
"""Get or load appropriate model."""
243295
cache_key = "low_memory" if low_memory else "high_memory"
@@ -272,7 +324,7 @@ def _get_model(self, low_memory: bool = True) -> Any:
272324
raise DetectError("Failed to load model") from e
273325

274326
def detect(
275-
self, text: str, low_memory: bool = True
327+
self, text: str, low_memory: bool = True
276328
) -> Dict[str, Union[str, float]]:
277329
"""
278330
Detect primary language of text.
@@ -286,8 +338,10 @@ def detect(
286338
DetectError: If detection fails
287339
"""
288340
model = self._get_model(low_memory)
341+
text = self._preprocess_text(text)
342+
normalized_text = self._normalize_text(text, self.config.normalize_input)
289343
try:
290-
labels, scores = model.predict(text)
344+
labels, scores = model.predict(normalized_text)
291345
return {
292346
"lang": labels[0].replace("__label__", ""),
293347
"score": min(float(scores[0]), 1.0),
@@ -297,11 +351,11 @@ def detect(
297351
raise DetectError("Language detection failed") from e
298352

299353
def detect_multilingual(
300-
self,
301-
text: str,
302-
low_memory: bool = False,
303-
k: int = 5,
304-
threshold: float = 0.0,
354+
self,
355+
text: str,
356+
low_memory: bool = False,
357+
k: int = 5,
358+
threshold: float = 0.0,
305359
) -> List[Dict[str, Any]]:
306360
"""
307361
Detect multiple possible languages in text.
@@ -317,8 +371,10 @@ def detect_multilingual(
317371
DetectError: If detection fails
318372
"""
319373
model = self._get_model(low_memory)
374+
text = self._preprocess_text(text)
375+
normalized_text = self._normalize_text(text, self.config.normalize_input)
320376
try:
321-
labels, scores = model.predict(text, k=k, threshold=threshold)
377+
labels, scores = model.predict(normalized_text, k=k, threshold=threshold)
322378
results = [
323379
{
324380
"lang": label.replace("__label__", ""),
@@ -337,78 +393,108 @@ def detect_multilingual(
337393

338394

339395
def detect(
340-
text: str,
341-
*,
342-
low_memory: bool = True,
343-
model_download_proxy: Optional[str] = None,
344-
use_strict_mode: bool = False,
396+
text: str,
397+
*,
398+
low_memory: bool = True,
399+
model_download_proxy: Optional[str] = None,
400+
use_strict_mode: bool = False,
401+
config: Optional[LangDetectConfig] = None,
345402
) -> Dict[str, Union[str, float]]:
346403
"""
347404
Simple interface for language detection.
348-
349-
Before passing a text to this function, you remove all the newline characters.
350-
405+
351406
Text that is too long or too short will affect the accuracy of the prediction.
352407
353408
:param text: Input text without newline characters
354409
:param low_memory: Whether to use memory-efficient model
355-
:param model_download_proxy: Optional proxy for model download
356-
:param use_strict_mode: Disable fallback to small model
410+
:param model_download_proxy: [DEPRECATED] Optional proxy for model download
411+
:param use_strict_mode: [DEPRECATED] Disable fallback to small model
412+
:param config: Optional LangDetectConfig object for advanced configuration
357413
358414
:return: Dictionary with language and confidence score
359415
"""
360-
if "\n" in text or len(text) > 1000:
416+
# Provide config
417+
if config is not None:
418+
detector = LangDetector(config)
419+
return detector.detect(text, low_memory=low_memory)
420+
421+
# Check if any custom parameters are provided
422+
has_custom_params = any([
423+
model_download_proxy is not None,
424+
use_strict_mode,
425+
])
426+
if has_custom_params:
427+
# Show warning if using individual parameters
361428
logger.warning(
362-
"fast-langdetect: Text contains newline characters or is too long. "
363-
"You should only pass a single sentence for accurate prediction."
429+
"fast-langdetect: Using individual parameters is deprecated. "
430+
"Consider using LangDetectConfig for better configuration management. "
431+
"Will be removed in next major release. see https://github.com/LlmKira/fast-langdetect/pull/16"
364432
)
365-
if model_download_proxy or use_strict_mode:
366-
config = LangDetectConfig(
367-
proxy=model_download_proxy, allow_fallback=not use_strict_mode
433+
custom_config = LangDetectConfig(
434+
proxy=model_download_proxy,
435+
allow_fallback=not use_strict_mode,
368436
)
369-
detector = LangDetector(config)
437+
detector = LangDetector(custom_config)
370438
return detector.detect(text, low_memory=low_memory)
439+
440+
# Use default detector
371441
return _default_detector.detect(text, low_memory=low_memory)
372442

373443

374444
def detect_multilingual(
375-
text: str,
376-
*,
377-
low_memory: bool = False,
378-
model_download_proxy: Optional[str] = None,
379-
k: int = 5,
380-
threshold: float = 0.0,
381-
use_strict_mode: bool = False,
445+
text: str,
446+
*,
447+
low_memory: bool = False,
448+
model_download_proxy: Optional[str] = None,
449+
k: int = 5,
450+
threshold: float = 0.0,
451+
use_strict_mode: bool = False,
452+
config: Optional[LangDetectConfig] = None,
382453
) -> List[Dict[str, Any]]:
383454
"""
384455
Simple interface for multi-language detection.
385456
386-
Before passing a text to this function, you remove all the newline characters.
387-
388457
Text that is too long or too short will affect the accuracy of the prediction.
389458
390459
:param text: Input text without newline characters
391460
:param low_memory: Whether to use memory-efficient model
392-
:param model_download_proxy: Optional proxy for model download
393461
:param k: Number of top languages to return
394462
:param threshold: Minimum confidence threshold
395-
:param use_strict_mode: Disable fallback to small model
463+
:param model_download_proxy: [DEPRECATED] Optional proxy for model download
464+
:param use_strict_mode: [DEPRECATED] Disable fallback to small model
465+
:param config: Optional LangDetectConfig object for advanced configuration
396466
397467
:return: List of dictionaries with languages and scores
398468
"""
399-
if "\n" in text or len(text) > 100:
469+
# Use provided config or create new config
470+
if config is not None:
471+
detector = LangDetector(config)
472+
return detector.detect_multilingual(
473+
text, low_memory=low_memory, k=k, threshold=threshold
474+
)
475+
476+
# Check if any custom parameters are provided
477+
has_custom_params = any([
478+
model_download_proxy is not None,
479+
use_strict_mode,
480+
])
481+
if has_custom_params:
482+
# Show warning if using individual parameters
400483
logger.warning(
401-
"fast-langdetect: Text contains newline characters or is too long. "
402-
"You should only pass a single sentence for accurate prediction."
484+
"fast-langdetect: Using individual parameters is deprecated. "
485+
"Consider using LangDetectConfig for better configuration management. "
486+
"Will be removed in next major release. see https://github.com/LlmKira/fast-langdetect/pull/16"
403487
)
404-
if model_download_proxy or use_strict_mode:
405-
config = LangDetectConfig(
406-
proxy=model_download_proxy, allow_fallback=not use_strict_mode
488+
custom_config = LangDetectConfig(
489+
proxy=model_download_proxy,
490+
allow_fallback=not use_strict_mode,
407491
)
408-
detector = LangDetector(config)
492+
detector = LangDetector(custom_config)
409493
return detector.detect_multilingual(
410494
text, low_memory=low_memory, k=k, threshold=threshold
411495
)
496+
497+
# Use default detector
412498
return _default_detector.detect_multilingual(
413499
text, low_memory=low_memory, k=k, threshold=threshold
414500
)

tests/conftest.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@ def pytest_configure(config):
44
"""注册自定义标记。"""
55
config.addinivalue_line(
66
"markers",
7-
"slow: 标记需要较长时间运行的测试"
7+
"slow: Run in long progress"
88
)
99
config.addinivalue_line(
1010
"markers",
11-
"real: 标记使用真实模型的测试"
11+
"real: Test with real model"
1212
)

0 commit comments

Comments
 (0)