Skip to content

Commit b3eb1db

Browse files
committed
✨ refactor(app): enhance text normalization and logging
Converted `_normalize_text` to a static method and refined the logging messages for clarity. This change improves text processing by explicitly handling newline characters and long inputs, and aligns the code with issue #14. 🛠️ The refactoring improves code maintainability and readability.
1 parent 45fbec5 commit b3eb1db

File tree

1 file changed

+11
-13
lines changed

1 file changed

+11
-13
lines changed

src/fast_langdetect/infer.py

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -242,11 +242,12 @@ def __init__(self, config: Optional[LangDetectConfig] = None):
242242
self.config = config or LangDetectConfig()
243243
self._model_loader = ModelLoader()
244244

245-
def _normalize_text(self, text: str, should_normalize: bool = False) -> str:
245+
@staticmethod
246+
def _normalize_text(text: str, should_normalize: bool = False) -> str:
246247
"""
247248
Normalize text based on configuration.
248249
249-
Currently handles:
250+
Currently, it handles:
250251
- Removing newline characters for better prediction
251252
- Lowercasing uppercase text to prevent misdetection as Japanese
252253
@@ -257,12 +258,9 @@ def _normalize_text(self, text: str, should_normalize: bool = False) -> str:
257258
# If no normalization is needed, return the processed text
258259
if not should_normalize:
259260
return text
260-
261-
# Check and record newline and long text
262-
if "\n" in text:
263-
text = text.replace("\n", " ")
264261

265262
# Check if text is all uppercase or mostly uppercase
263+
# https://github.com/LlmKira/fast-langdetect/issues/14
266264
if text.isupper() or (
267265
len(re.findall(r'[A-Z]', text)) > 0.8 * len(re.findall(r'[A-Za-z]', text))
268266
and len(text) > 5
@@ -322,13 +320,13 @@ def detect(
322320
normalized_text = self._normalize_text(text, self.config.normalize_input)
323321
if len(normalized_text) > 100:
324322
logger.warning(
325-
"fast-langdetect: Text is too long. "
323+
"fast-langdetect: Text may be too long. "
326324
"Consider passing only a single sentence for accurate prediction."
327325
)
328326
if "\n" in normalized_text:
329327
logger.warning(
330-
"fast-langdetect: Text contains newline characters. "
331-
"Removing newlines for better prediction accuracy."
328+
"fast-langdetect: Input should not contain newline characters. "
329+
"Removing them or FastText will raise an error."
332330
)
333331
normalized_text = normalized_text.replace("\n", " ")
334332
try:
@@ -407,7 +405,7 @@ def detect(
407405
if config is not None:
408406
detector = LangDetector(config)
409407
return detector.detect(text, low_memory=low_memory)
410-
408+
411409
# Check if any custom parameters are provided
412410
has_custom_params = any([
413411
model_download_proxy is not None,
@@ -426,7 +424,7 @@ def detect(
426424
)
427425
detector = LangDetector(custom_config)
428426
return detector.detect(text, low_memory=low_memory)
429-
427+
430428
# Use default detector
431429
return _default_detector.detect(text, low_memory=low_memory)
432430

@@ -462,7 +460,7 @@ def detect_multilingual(
462460
return detector.detect_multilingual(
463461
text, low_memory=low_memory, k=k, threshold=threshold
464462
)
465-
463+
466464
# Check if any custom parameters are provided
467465
has_custom_params = any([
468466
model_download_proxy is not None,
@@ -483,7 +481,7 @@ def detect_multilingual(
483481
return detector.detect_multilingual(
484482
text, low_memory=low_memory, k=k, threshold=threshold
485483
)
486-
484+
487485
# Use default detector
488486
return _default_detector.detect_multilingual(
489487
text, low_memory=low_memory, k=k, threshold=threshold

0 commit comments

Comments
 (0)