@@ -242,11 +242,12 @@ def __init__(self, config: Optional[LangDetectConfig] = None):
        self.config = config or LangDetectConfig()
        self._model_loader = ModelLoader()

-    def _normalize_text(self, text: str, should_normalize: bool = False) -> str:
+    @staticmethod
+    def _normalize_text(text: str, should_normalize: bool = False) -> str:
        """
        Normalize text based on configuration.

-        Currently handles:
+        Currently, handles:
        - Removing newline characters for better prediction
        - Lowercasing uppercase text to prevent misdetection as Japanese

@@ -257,12 +258,9 @@ def _normalize_text(self, text: str, should_normalize: bool = False) -> str:
        # If no normalization is needed, return the text as-is
        if not should_normalize:
            return text
-
-        # Check and record newline and long text
-        if "\n" in text:
-            text = text.replace("\n", " ")

        # Check if text is all uppercase or mostly uppercase
+        # https://github.com/LlmKira/fast-langdetect/issues/14
        if text.isupper() or (
            len(re.findall(r'[A-Z]', text)) > 0.8 * len(re.findall(r'[A-Za-z]', text))
            and len(text) > 5
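
For context, here is a standalone sketch of the uppercase heuristic kept above; the helper name is hypothetical, but the condition mirrors the patched code:

import re

def is_mostly_uppercase(text: str) -> bool:
    # Illustrative wrapper around the check in _normalize_text: all-caps text,
    # or >80% uppercase letters in a string longer than 5 characters, is
    # lowercased before prediction to avoid misdetection as Japanese.
    return text.isupper() or (
        len(re.findall(r'[A-Z]', text)) > 0.8 * len(re.findall(r'[A-Za-z]', text))
        and len(text) > 5
    )

print(is_mostly_uppercase("NOTICE: ALL CAPS INPUT"))  # True  -> lowercased
print(is_mostly_uppercase("Hello World"))             # False -> left unchanged
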
@@ -322,13 +320,13 @@ def detect(
        normalized_text = self._normalize_text(text, self.config.normalize_input)
        if len(normalized_text) > 100:
            logger.warning(
-                "fast-langdetect: Text is too long. "
+                "fast-langdetect: Text may be too long. "
                "Consider passing only a single sentence for accurate prediction."
            )
        if "\n" in normalized_text:
            logger.warning(
-                "fast-langdetect: Text contains newline characters. "
-                "Removing newlines for better prediction accuracy."
+                "fast-langdetect: Input should not contain newline characters. "
+                "Removing them or FastText will raise an error."
            )
            normalized_text = normalized_text.replace("\n", " ")
        try:
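
A minimal usage sketch of this normalization path, assuming LangDetectConfig accepts normalize_input as a keyword argument and that both classes are importable from fast_langdetect:

from fast_langdetect import LangDetectConfig, LangDetector  # assumed public exports

config = LangDetectConfig(normalize_input=True)  # assumed constructor kwarg
detector = LangDetector(config)

# A newline triggers the warning above and is replaced with a space before prediction.
result = detector.detect("Hello\nworld", low_memory=True)
print(result)  # e.g. {'lang': 'en', 'score': ...}
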
@@ -407,7 +405,7 @@ def detect(
    if config is not None:
        detector = LangDetector(config)
        return detector.detect(text, low_memory=low_memory)
-
+
    # Check if any custom parameters are provided
    has_custom_params = any([
        model_download_proxy is not None,
@@ -426,7 +424,7 @@ def detect(
        )
        detector = LangDetector(custom_config)
        return detector.detect(text, low_memory=low_memory)
-
+
    # Use default detector
    return _default_detector.detect(text, low_memory=low_memory)

@@ -462,7 +460,7 @@ def detect_multilingual(
        return detector.detect_multilingual(
            text, low_memory=low_memory, k=k, threshold=threshold
        )
-
+
    # Check if any custom parameters are provided
    has_custom_params = any([
        model_download_proxy is not None,
@@ -483,7 +481,7 @@ def detect_multilingual(
        return detector.detect_multilingual(
            text, low_memory=low_memory, k=k, threshold=threshold
        )
-
+
    # Use default detector
    return _default_detector.detect_multilingual(
        text, low_memory=low_memory, k=k, threshold=threshold
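
For reference, a short sketch of the module-level API these hunks adjust, assuming the package's usual exports:

from fast_langdetect import detect, detect_multilingual

# Single best guess; low_memory selects the compressed model.
print(detect("Hello, world!", low_memory=True))  # e.g. {'lang': 'en', 'score': 0.98}

# Top-k candidates; k and threshold are forwarded to detector.detect_multilingual
# as shown in the diff above.
print(detect_multilingual("Hello 世界", k=3, threshold=0.0))
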