6
6
import hashlib
7
7
import logging
8
8
import os
9
- import tempfile
10
9
import platform
11
10
import re
12
11
import shutil
12
+ import tempfile
13
13
from pathlib import Path
14
14
from typing import Dict , List , Optional , Union , Any
15
15
@@ -143,29 +143,29 @@ def _load_windows_compatible(self, model_path: Path) -> Any:
143
143
:raises DetectError: If all loading strategies fail
144
144
"""
145
145
model_path_str = str (model_path .resolve ())
146
-
146
+
147
147
# Try to load model directly
148
148
try :
149
149
return fasttext .load_model (model_path_str )
150
150
except Exception as e :
151
151
logger .debug (f"fast-langdetect: Load model failed: { e } " )
152
-
152
+
153
153
# Try to load model using relative path
154
154
try :
155
155
cwd = Path .cwd ()
156
156
rel_path = os .path .relpath (model_path , cwd )
157
157
return fasttext .load_model (rel_path )
158
158
except Exception as e :
159
159
logger .debug (f"fast-langdetect: Failed to load model using relative path: { e } " )
160
-
160
+
161
161
# Use temporary file as last resort
162
162
logger .debug (f"fast-langdetect: Using temporary file to load model: { model_path } " )
163
163
tmp_path = None
164
164
try :
165
165
# Use mkstemp to create a temporary file; only its path is needed
166
166
tmp_fd , tmp_path = tempfile .mkstemp (suffix = '.bin' )
167
167
os .close (tmp_fd ) # Close file descriptor
168
-
168
+
169
169
# Copy model file to temporary location
170
170
shutil .copy2 (model_path , tmp_path )
171
171
return fasttext .load_model (tmp_path )
@@ -203,16 +203,18 @@ class LangDetectConfig:
203
203
:param proxy: HTTP proxy for downloads
204
204
:param allow_fallback: Whether to fallback to small model
205
205
:param disable_verify: Whether to disable MD5 verification
206
+ :param normalize_input: Whether to normalize input text (e.g. lowercase for uppercase text)
206
207
"""
207
208
208
209
def __init__ (
209
- self ,
210
- cache_dir : Optional [str ] = None ,
211
- custom_model_path : Optional [str ] = None ,
212
- proxy : Optional [str ] = None ,
213
- allow_fallback : bool = True ,
214
- disable_verify : bool = False ,
215
- verify_hash : Optional [str ] = None ,
210
+ self ,
211
+ cache_dir : Optional [str ] = None ,
212
+ custom_model_path : Optional [str ] = None ,
213
+ proxy : Optional [str ] = None ,
214
+ allow_fallback : bool = True ,
215
+ disable_verify : bool = False ,
216
+ verify_hash : Optional [str ] = None ,
217
+ normalize_input : bool = True ,
216
218
):
217
219
self .cache_dir = cache_dir or CACHE_DIRECTORY
218
220
self .custom_model_path = custom_model_path
@@ -221,9 +223,11 @@ def __init__(
221
223
# Only verify large model
222
224
self .disable_verify = disable_verify
223
225
self .verify_hash = verify_hash
226
+ self .normalize_input = normalize_input
224
227
if self .custom_model_path and not Path (self .custom_model_path ).exists ():
225
228
raise FileNotFoundError (f"fast-langdetect: Target model file not found: { self .custom_model_path } " )
226
229
230
+
227
231
class LangDetector :
228
232
"""Language detector using FastText models."""
229
233
VERIFY_FASTTEXT_LARGE_MODEL = "01810bc59c6a3d2b79c79e6336612f65"
@@ -238,6 +242,54 @@ def __init__(self, config: Optional[LangDetectConfig] = None):
238
242
self .config = config or LangDetectConfig ()
239
243
self ._model_loader = ModelLoader ()
240
244
245
@staticmethod
def _preprocess_text(text: str) -> str:
    """
    Warn about overly long input and replace newline characters.

    A warning is logged when the text is longer than 100 characters.
    Every newline character is replaced with a single space; the warning
    emitted in that case notes that FastText raises an error on input
    that contains newlines.

    :param text: Input text
    :return: The text with each newline character replaced by a space
    """
    if len(text) > 100:
        logger.warning(
            "fast-langdetect: Text may be too long. "
            "Consider passing only a single sentence for accurate prediction."
        )
    # Match the bare newline character. Matching newline-plus-space would
    # let a newline followed by any other character slip through untouched.
    if "\n" in text:
        logger.warning(
            "fast-langdetect: Newline characters will be removed. "
            "Input should not contain newline characters. or FastText will raise an error."
        )
        text = text.replace("\n", " ")
    return text
265
+
266
+ @staticmethod
267
+ def _normalize_text (text : str , should_normalize : bool = False ) -> str :
268
+ """
269
+ Normalize text based on configuration.
270
+
271
+ Currently, handles:
272
+ - Removing newline characters for better prediction
273
+ - Lowercasing uppercase text to prevent misdetection as Japanese
274
+
275
+ :param text: Input text
276
+ :param should_normalize: Whether normalization should be applied
277
+ :return: Normalized text
278
+ """
279
+ # If not normalization is needed, return the processed text
280
+ if not should_normalize :
281
+ return text
282
+
283
+ # Check if text is all uppercase or mostly uppercase
284
+ # https://github.com/LlmKira/fast-langdetect/issues/14
285
+ if text .isupper () or (
286
+ len (re .findall (r'[A-Z]' , text )) > 0.8 * len (re .findall (r'[A-Za-z]' , text ))
287
+ and len (text ) > 5
288
+ ):
289
+ return text .lower ()
290
+
291
+ return text
292
+
241
293
def _get_model (self , low_memory : bool = True ) -> Any :
242
294
"""Get or load appropriate model."""
243
295
cache_key = "low_memory" if low_memory else "high_memory"
@@ -272,7 +324,7 @@ def _get_model(self, low_memory: bool = True) -> Any:
272
324
raise DetectError ("Failed to load model" ) from e
273
325
274
326
def detect (
275
- self , text : str , low_memory : bool = True
327
+ self , text : str , low_memory : bool = True
276
328
) -> Dict [str , Union [str , float ]]:
277
329
"""
278
330
Detect primary language of text.
@@ -286,8 +338,10 @@ def detect(
286
338
DetectError: If detection fails
287
339
"""
288
340
model = self ._get_model (low_memory )
341
+ text = self ._preprocess_text (text )
342
+ normalized_text = self ._normalize_text (text , self .config .normalize_input )
289
343
try :
290
- labels , scores = model .predict (text )
344
+ labels , scores = model .predict (normalized_text )
291
345
return {
292
346
"lang" : labels [0 ].replace ("__label__" , "" ),
293
347
"score" : min (float (scores [0 ]), 1.0 ),
@@ -297,11 +351,11 @@ def detect(
297
351
raise DetectError ("Language detection failed" ) from e
298
352
299
353
def detect_multilingual (
300
- self ,
301
- text : str ,
302
- low_memory : bool = False ,
303
- k : int = 5 ,
304
- threshold : float = 0.0 ,
354
+ self ,
355
+ text : str ,
356
+ low_memory : bool = False ,
357
+ k : int = 5 ,
358
+ threshold : float = 0.0 ,
305
359
) -> List [Dict [str , Any ]]:
306
360
"""
307
361
Detect multiple possible languages in text.
@@ -317,8 +371,10 @@ def detect_multilingual(
317
371
DetectError: If detection fails
318
372
"""
319
373
model = self ._get_model (low_memory )
374
+ text = self ._preprocess_text (text )
375
+ normalized_text = self ._normalize_text (text , self .config .normalize_input )
320
376
try :
321
- labels , scores = model .predict (text , k = k , threshold = threshold )
377
+ labels , scores = model .predict (normalized_text , k = k , threshold = threshold )
322
378
results = [
323
379
{
324
380
"lang" : label .replace ("__label__" , "" ),
@@ -337,78 +393,108 @@ def detect_multilingual(
337
393
338
394
339
395
def detect(
    text: str,
    *,
    low_memory: bool = True,
    model_download_proxy: Optional[str] = None,
    use_strict_mode: bool = False,
    config: Optional[LangDetectConfig] = None,
) -> Dict[str, Union[str, float]]:
    """
    Simple interface for language detection.

    Text that is too long or too short will affect the accuracy of the
    prediction.

    Detector selection, in order of precedence:

    1. ``config`` given: a new ``LangDetector`` is built from it and the
       deprecated parameters are ignored.
    2. ``model_download_proxy`` is not ``None`` or ``use_strict_mode`` is
       truthy: a deprecation warning is logged and a new ``LangDetector``
       is built from those values.
    3. Otherwise the module-level default detector is used.

    :param text: Input text
    :param low_memory: Whether to use memory-efficient model
    :param model_download_proxy: [DEPRECATED] Optional proxy for model download
    :param use_strict_mode: [DEPRECATED] Disable fallback to small model
    :param config: Optional LangDetectConfig object for advanced configuration

    :return: Dictionary with language and confidence score
    """
    if config is not None:
        # An explicit config takes precedence over everything else
        chosen = LangDetector(config)
    elif model_download_proxy is not None or use_strict_mode:
        # Legacy keyword arguments: warn first, then translate them into a config
        logger.warning(
            "fast-langdetect: Using individual parameters is deprecated. "
            "Consider using LangDetectConfig for better configuration management. "
            "Will be removed in next major release. see https://github.com/LlmKira/fast-langdetect/pull/16"
        )
        legacy_config = LangDetectConfig(
            proxy=model_download_proxy,
            allow_fallback=not use_strict_mode,
        )
        chosen = LangDetector(legacy_config)
    else:
        # Nothing customised: use the shared default detector
        chosen = _default_detector

    return chosen.detect(text, low_memory=low_memory)
372
442
373
443
374
444
def detect_multilingual(
    text: str,
    *,
    low_memory: bool = False,
    model_download_proxy: Optional[str] = None,
    k: int = 5,
    threshold: float = 0.0,
    use_strict_mode: bool = False,
    config: Optional[LangDetectConfig] = None,
) -> List[Dict[str, Any]]:
    """
    Simple interface for multi-language detection.

    Text that is too long or too short will affect the accuracy of the
    prediction.

    Detector selection, in order of precedence:

    1. ``config`` given: a new ``LangDetector`` is built from it and the
       deprecated parameters are ignored.
    2. ``model_download_proxy`` is not ``None`` or ``use_strict_mode`` is
       truthy: a deprecation warning is logged and a new ``LangDetector``
       is built from those values.
    3. Otherwise the module-level default detector is used.

    :param text: Input text
    :param low_memory: Whether to use memory-efficient model
    :param k: Number of top languages to return
    :param threshold: Minimum confidence threshold
    :param model_download_proxy: [DEPRECATED] Optional proxy for model download
    :param use_strict_mode: [DEPRECATED] Disable fallback to small model
    :param config: Optional LangDetectConfig object for advanced configuration

    :return: List of dictionaries with languages and scores
    """
    if config is not None:
        # An explicit config takes precedence over everything else
        chosen = LangDetector(config)
    elif model_download_proxy is not None or use_strict_mode:
        # Legacy keyword arguments: warn first, then translate them into a config
        logger.warning(
            "fast-langdetect: Using individual parameters is deprecated. "
            "Consider using LangDetectConfig for better configuration management. "
            "Will be removed in next major release. see https://github.com/LlmKira/fast-langdetect/pull/16"
        )
        legacy_config = LangDetectConfig(
            proxy=model_download_proxy,
            allow_fallback=not use_strict_mode,
        )
        chosen = LangDetector(legacy_config)
    else:
        # Nothing customised: use the shared default detector
        chosen = _default_detector

    return chosen.detect_multilingual(
        text, low_memory=low_memory, k=k, threshold=threshold
    )
0 commit comments