@@ -242,6 +242,27 @@ def __init__(self, config: Optional[LangDetectConfig] = None):
242
242
self .config = config or LangDetectConfig ()
243
243
self ._model_loader = ModelLoader ()
244
244
245
@staticmethod
def _preprocess_text(text: str) -> str:
    """
    Warn about over-long input and replace newline characters with spaces.

    A warning is logged when the text is longer than 100 characters, and
    another when it contains a newline. Per the warning emitted below,
    FastText raises an error on input containing newlines, so each one is
    replaced with a single space before the text is returned.

    :param text: Input text
    :return: The text with every newline character replaced by a space;
        returned unchanged when it contains no newline
    """
    if len(text) > 100:
        logger.warning(
            "fast-langdetect: Text may be too long. "
            "Consider passing only a single sentence for accurate prediction."
        )
    # Match a bare "\n": testing for a newline followed by a space would let
    # newlines followed by any other character (or ending the text) through.
    if "\n" in text:
        logger.warning(
            "fast-langdetect: Newline characters will be removed. "
            "Input should not contain newline characters. or FastText will raise an error."
        )
        text = text.replace("\n", " ")
    return text
265
+
245
266
@staticmethod
246
267
def _normalize_text (text : str , should_normalize : bool = False ) -> str :
247
268
"""
@@ -258,7 +279,7 @@ def _normalize_text(text: str, should_normalize: bool = False) -> str:
258
279
# If no normalization is needed, return the processed text
259
280
if not should_normalize :
260
281
return text
261
-
282
+
262
283
# Check if text is all uppercase or mostly uppercase
263
284
# https://github.com/LlmKira/fast-langdetect/issues/14
264
285
if text .isupper () or (
@@ -317,18 +338,8 @@ def detect(
317
338
DetectError: If detection fails
318
339
"""
319
340
model = self ._get_model (low_memory )
341
+ text = self ._preprocess_text (text )
320
342
normalized_text = self ._normalize_text (text , self .config .normalize_input )
321
- if len (normalized_text ) > 100 :
322
- logger .warning (
323
- "fast-langdetect: Text may be too long. "
324
- "Consider passing only a single sentence for accurate prediction."
325
- )
326
- if "\n " in normalized_text :
327
- logger .warning (
328
- "fast-langdetect: Input should not contain newline characters. "
329
- "Removing them or FastText will raise an error."
330
- )
331
- normalized_text = normalized_text .replace ("\n " , " " )
332
343
try :
333
344
labels , scores = model .predict (normalized_text )
334
345
return {
@@ -360,6 +371,7 @@ def detect_multilingual(
360
371
DetectError: If detection fails
361
372
"""
362
373
model = self ._get_model (low_memory )
374
+ text = self ._preprocess_text (text )
363
375
normalized_text = self ._normalize_text (text , self .config .normalize_input )
364
376
try :
365
377
labels , scores = model .predict (normalized_text , k = k , threshold = threshold )
0 commit comments