Skip to content

Commit f4fc032

Browse files
authored
Merge pull request #16 from LlmKira/dev-20250304
✨ feat(app): [Compatibility changes] add input normalization to language detection
2 parents 2f3f5cd + 0cddac3 commit f4fc032

File tree

7 files changed

+164
-79
lines changed

7 files changed

+164
-79
lines changed

README.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ In scenarios **where accuracy is important**, you should not rely on the detecti
4343

4444
### Prerequisites
4545

46-
- The "\n" character in the argument string must be removed before calling the function.
4746
- If the sample is too long or too short, the accuracy will be reduced.
4847
- The model will be downloaded to system temporary directory by default. You can customize it by:
4948
- Setting `FTLANG_CACHE` environment variable
@@ -79,7 +78,6 @@ except DetectError as e:
7978
multiline_text = """
8079
Hello, world!
8180
This is a multiline text.
82-
But we need remove \n characters or it will raise a DetectError.
8381
"""
8482
multiline_text = multiline_text.replace("\n", " ")
8583
print(detect(multiline_text))

feature_test/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@
3030
# When offline, it raises an error
3131
print(
3232
detect_multilingual(
33-
"Hello, world!你好世界!Привет, мир!", low_memory=False, use_strict_mode=True
33+
"Hello, world!你好世界!Привет, мир!",
34+
low_memory=False,
35+
config=LangDetectConfig(allow_fallback=True)
3436
)
3537
)
3638

src/fast_langdetect/__init__.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
# -*- coding: utf-8 -*-
22
# @Time : 2024/1/17 下午4:00
33

4+
from .infer import LangDetector, LangDetectConfig, DetectError # noqa: F401
45
from .infer import detect
56
from .infer import detect_multilingual # noqa: F401
6-
from .infer import LangDetector, LangDetectConfig, DetectError # noqa: F401
7+
78

89
def is_japanese(string):
910
for ch in string:
@@ -19,7 +20,7 @@ def detect_language(sentence: str, *, low_memory: bool = True):
1920
:param low_memory: bool (default: True) whether to use low memory mode
2021
:return: ZH, EN, JA, KO, FR, DE, ES, .... (two uppercase letters)
2122
"""
22-
lang_code = detect(sentence.lower(), low_memory=low_memory).get("lang").upper()
23+
lang_code = detect(sentence, low_memory=low_memory).get("lang").upper()
2324
if lang_code == "JA" and not is_japanese(sentence):
2425
lang_code = "ZH"
2526
return lang_code

src/fast_langdetect/infer.py

Lines changed: 141 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66
import hashlib
77
import logging
88
import os
9-
import tempfile
109
import platform
1110
import re
1211
import shutil
12+
import tempfile
1313
from pathlib import Path
1414
from typing import Dict, List, Optional, Union, Any
1515

@@ -143,29 +143,29 @@ def _load_windows_compatible(self, model_path: Path) -> Any:
143143
:raises DetectError: If all loading strategies fail
144144
"""
145145
model_path_str = str(model_path.resolve())
146-
146+
147147
# Try to load model directly
148148
try:
149149
return fasttext.load_model(model_path_str)
150150
except Exception as e:
151151
logger.debug(f"fast-langdetect: Load model failed: {e}")
152-
152+
153153
# Try to load model using relative path
154154
try:
155155
cwd = Path.cwd()
156156
rel_path = os.path.relpath(model_path, cwd)
157157
return fasttext.load_model(rel_path)
158158
except Exception as e:
159159
logger.debug(f"fast-langdetect: Failed to load model using relative path: {e}")
160-
160+
161161
# Use temporary file as last resort
162162
logger.debug(f"fast-langdetect: Using temporary file to load model: {model_path}")
163163
tmp_path = None
164164
try:
165165
# Use NamedTemporaryFile to create a temporary file
166166
tmp_fd, tmp_path = tempfile.mkstemp(suffix='.bin')
167167
os.close(tmp_fd) # Close file descriptor
168-
168+
169169
# Copy model file to temporary location
170170
shutil.copy2(model_path, tmp_path)
171171
return fasttext.load_model(tmp_path)
@@ -203,16 +203,18 @@ class LangDetectConfig:
203203
:param proxy: HTTP proxy for downloads
204204
:param allow_fallback: Whether to fallback to small model
205205
:param disable_verify: Whether to disable MD5 verification
206+
:param normalize_input: Whether to normalize input text (e.g. lowercase for uppercase text)
206207
"""
207208

208209
def __init__(
209-
self,
210-
cache_dir: Optional[str] = None,
211-
custom_model_path: Optional[str] = None,
212-
proxy: Optional[str] = None,
213-
allow_fallback: bool = True,
214-
disable_verify: bool = False,
215-
verify_hash: Optional[str] = None,
210+
self,
211+
cache_dir: Optional[str] = None,
212+
custom_model_path: Optional[str] = None,
213+
proxy: Optional[str] = None,
214+
allow_fallback: bool = True,
215+
disable_verify: bool = False,
216+
verify_hash: Optional[str] = None,
217+
normalize_input: bool = True,
216218
):
217219
self.cache_dir = cache_dir or CACHE_DIRECTORY
218220
self.custom_model_path = custom_model_path
@@ -221,9 +223,11 @@ def __init__(
221223
# Only verify large model
222224
self.disable_verify = disable_verify
223225
self.verify_hash = verify_hash
226+
self.normalize_input = normalize_input
224227
if self.custom_model_path and not Path(self.custom_model_path).exists():
225228
raise FileNotFoundError(f"fast-langdetect: Target model file not found: {self.custom_model_path}")
226229

230+
227231
class LangDetector:
228232
"""Language detector using FastText models."""
229233
VERIFY_FASTTEXT_LARGE_MODEL = "01810bc59c6a3d2b79c79e6336612f65"
@@ -238,6 +242,54 @@ def __init__(self, config: Optional[LangDetectConfig] = None):
238242
self.config = config or LangDetectConfig()
239243
self._model_loader = ModelLoader()
240244

245+
@staticmethod
246+
def _preprocess_text(text: str) -> str:
247+
"""
248+
Check text for newline characters and length.
249+
250+
:param text: Input text
251+
:return: Processed text
252+
"""
253+
if len(text) > 100:
254+
logger.warning(
255+
"fast-langdetect: Text may be too long. "
256+
"Consider passing only a single sentence for accurate prediction."
257+
)
258+
if "\n" in text:
259+
logger.warning(
260+
"fast-langdetect: Newline characters will be removed. "
261+
"Input should not contain newline characters. or FastText will raise an error."
262+
)
263+
text = text.replace("\n", " ")
264+
return text
265+
266+
@staticmethod
267+
def _normalize_text(text: str, should_normalize: bool = False) -> str:
268+
"""
269+
Normalize text based on configuration.
270+
271+
Currently, handles:
272+
- Removing newline characters for better prediction
273+
- Lowercasing uppercase text to prevent misdetection as Japanese
274+
275+
:param text: Input text
276+
:param should_normalize: Whether normalization should be applied
277+
:return: Normalized text
278+
"""
279+
# If no normalization is needed, return the text unchanged
280+
if not should_normalize:
281+
return text
282+
283+
# Check if text is all uppercase or mostly uppercase
284+
# https://github.com/LlmKira/fast-langdetect/issues/14
285+
if text.isupper() or (
286+
len(re.findall(r'[A-Z]', text)) > 0.8 * len(re.findall(r'[A-Za-z]', text))
287+
and len(text) > 5
288+
):
289+
return text.lower()
290+
291+
return text
292+
241293
def _get_model(self, low_memory: bool = True) -> Any:
242294
"""Get or load appropriate model."""
243295
cache_key = "low_memory" if low_memory else "high_memory"
@@ -272,7 +324,7 @@ def _get_model(self, low_memory: bool = True) -> Any:
272324
raise DetectError("Failed to load model") from e
273325

274326
def detect(
275-
self, text: str, low_memory: bool = True
327+
self, text: str, low_memory: bool = True
276328
) -> Dict[str, Union[str, float]]:
277329
"""
278330
Detect primary language of text.
@@ -286,8 +338,10 @@ def detect(
286338
DetectError: If detection fails
287339
"""
288340
model = self._get_model(low_memory)
341+
text = self._preprocess_text(text)
342+
normalized_text = self._normalize_text(text, self.config.normalize_input)
289343
try:
290-
labels, scores = model.predict(text)
344+
labels, scores = model.predict(normalized_text)
291345
return {
292346
"lang": labels[0].replace("__label__", ""),
293347
"score": min(float(scores[0]), 1.0),
@@ -297,11 +351,11 @@ def detect(
297351
raise DetectError("Language detection failed") from e
298352

299353
def detect_multilingual(
300-
self,
301-
text: str,
302-
low_memory: bool = False,
303-
k: int = 5,
304-
threshold: float = 0.0,
354+
self,
355+
text: str,
356+
low_memory: bool = False,
357+
k: int = 5,
358+
threshold: float = 0.0,
305359
) -> List[Dict[str, Any]]:
306360
"""
307361
Detect multiple possible languages in text.
@@ -317,8 +371,10 @@ def detect_multilingual(
317371
DetectError: If detection fails
318372
"""
319373
model = self._get_model(low_memory)
374+
text = self._preprocess_text(text)
375+
normalized_text = self._normalize_text(text, self.config.normalize_input)
320376
try:
321-
labels, scores = model.predict(text, k=k, threshold=threshold)
377+
labels, scores = model.predict(normalized_text, k=k, threshold=threshold)
322378
results = [
323379
{
324380
"lang": label.replace("__label__", ""),
@@ -337,78 +393,108 @@ def detect_multilingual(
337393

338394

339395
def detect(
340-
text: str,
341-
*,
342-
low_memory: bool = True,
343-
model_download_proxy: Optional[str] = None,
344-
use_strict_mode: bool = False,
396+
text: str,
397+
*,
398+
low_memory: bool = True,
399+
model_download_proxy: Optional[str] = None,
400+
use_strict_mode: bool = False,
401+
config: Optional[LangDetectConfig] = None,
345402
) -> Dict[str, Union[str, float]]:
346403
"""
347404
Simple interface for language detection.
348-
349-
Before passing a text to this function, you remove all the newline characters.
350-
405+
351406
Text that is too long or too short will affect the accuracy of the prediction.
352407
353408
:param text: Input text without newline characters
354409
:param low_memory: Whether to use memory-efficient model
355-
:param model_download_proxy: Optional proxy for model download
356-
:param use_strict_mode: Disable fallback to small model
410+
:param model_download_proxy: [DEPRECATED] Optional proxy for model download
411+
:param use_strict_mode: [DEPRECATED] Disable fallback to small model
412+
:param config: Optional LangDetectConfig object for advanced configuration
357413
358414
:return: Dictionary with language and confidence score
359415
"""
360-
if "\n" in text or len(text) > 1000:
416+
# Provide config
417+
if config is not None:
418+
detector = LangDetector(config)
419+
return detector.detect(text, low_memory=low_memory)
420+
421+
# Check if any custom parameters are provided
422+
has_custom_params = any([
423+
model_download_proxy is not None,
424+
use_strict_mode,
425+
])
426+
if has_custom_params:
427+
# Show warning if using individual parameters
361428
logger.warning(
362-
"fast-langdetect: Text contains newline characters or is too long. "
363-
"You should only pass a single sentence for accurate prediction."
429+
"fast-langdetect: Using individual parameters is deprecated. "
430+
"Consider using LangDetectConfig for better configuration management. "
431+
"Will be removed in next major release. see https://github.com/LlmKira/fast-langdetect/pull/16"
364432
)
365-
if model_download_proxy or use_strict_mode:
366-
config = LangDetectConfig(
367-
proxy=model_download_proxy, allow_fallback=not use_strict_mode
433+
custom_config = LangDetectConfig(
434+
proxy=model_download_proxy,
435+
allow_fallback=not use_strict_mode,
368436
)
369-
detector = LangDetector(config)
437+
detector = LangDetector(custom_config)
370438
return detector.detect(text, low_memory=low_memory)
439+
440+
# Use default detector
371441
return _default_detector.detect(text, low_memory=low_memory)
372442

373443

374444
def detect_multilingual(
375-
text: str,
376-
*,
377-
low_memory: bool = False,
378-
model_download_proxy: Optional[str] = None,
379-
k: int = 5,
380-
threshold: float = 0.0,
381-
use_strict_mode: bool = False,
445+
text: str,
446+
*,
447+
low_memory: bool = False,
448+
model_download_proxy: Optional[str] = None,
449+
k: int = 5,
450+
threshold: float = 0.0,
451+
use_strict_mode: bool = False,
452+
config: Optional[LangDetectConfig] = None,
382453
) -> List[Dict[str, Any]]:
383454
"""
384455
Simple interface for multi-language detection.
385456
386-
Before passing a text to this function, you remove all the newline characters.
387-
388457
Text that is too long or too short will affect the accuracy of the prediction.
389458
390459
:param text: Input text without newline characters
391460
:param low_memory: Whether to use memory-efficient model
392-
:param model_download_proxy: Optional proxy for model download
393461
:param k: Number of top languages to return
394462
:param threshold: Minimum confidence threshold
395-
:param use_strict_mode: Disable fallback to small model
463+
:param model_download_proxy: [DEPRECATED] Optional proxy for model download
464+
:param use_strict_mode: [DEPRECATED] Disable fallback to small model
465+
:param config: Optional LangDetectConfig object for advanced configuration
396466
397467
:return: List of dictionaries with languages and scores
398468
"""
399-
if "\n" in text or len(text) > 100:
469+
# Use provided config or create new config
470+
if config is not None:
471+
detector = LangDetector(config)
472+
return detector.detect_multilingual(
473+
text, low_memory=low_memory, k=k, threshold=threshold
474+
)
475+
476+
# Check if any custom parameters are provided
477+
has_custom_params = any([
478+
model_download_proxy is not None,
479+
use_strict_mode,
480+
])
481+
if has_custom_params:
482+
# Show warning if using individual parameters
400483
logger.warning(
401-
"fast-langdetect: Text contains newline characters or is too long. "
402-
"You should only pass a single sentence for accurate prediction."
484+
"fast-langdetect: Using individual parameters is deprecated. "
485+
"Consider using LangDetectConfig for better configuration management. "
486+
"Will be removed in next major release. see https://github.com/LlmKira/fast-langdetect/pull/16"
403487
)
404-
if model_download_proxy or use_strict_mode:
405-
config = LangDetectConfig(
406-
proxy=model_download_proxy, allow_fallback=not use_strict_mode
488+
custom_config = LangDetectConfig(
489+
proxy=model_download_proxy,
490+
allow_fallback=not use_strict_mode,
407491
)
408-
detector = LangDetector(config)
492+
detector = LangDetector(custom_config)
409493
return detector.detect_multilingual(
410494
text, low_memory=low_memory, k=k, threshold=threshold
411495
)
496+
497+
# Use default detector
412498
return _default_detector.detect_multilingual(
413499
text, low_memory=low_memory, k=k, threshold=threshold
414500
)

tests/conftest.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@ def pytest_configure(config):
44
"""注册自定义标记。"""
55
config.addinivalue_line(
66
"markers",
7-
"slow: 标记需要较长时间运行的测试"
7+
"slow: Run in long progress"
88
)
99
config.addinivalue_line(
1010
"markers",
11-
"real: 标记使用真实模型的测试"
11+
"real: Test with real model"
1212
)

0 commit comments

Comments
 (0)