Skip to content

Commit 0cddac3

Browse files
committed
✨ feat(app): add text preprocessing for improved accuracy 🚀
Introduced a `_preprocess_text` method that cleans and validates text before detection. It replaces newline characters with spaces and logs a warning when the text is longer than 100 characters, which improves prediction accuracy and prevents FastText errors.
1 parent b3eb1db commit 0cddac3

File tree

1 file changed

+24
-12
lines changed

1 file changed

+24
-12
lines changed

src/fast_langdetect/infer.py

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,27 @@ def __init__(self, config: Optional[LangDetectConfig] = None):
242242
self.config = config or LangDetectConfig()
243243
self._model_loader = ModelLoader()
244244

245+
@staticmethod
def _preprocess_text(text: str) -> str:
    """
    Sanitize input text before it is handed to the model.

    Logs a warning when the text is longer than 100 characters, and
    replaces every newline character with a single space (logging a
    warning when it does so).

    :param text: Input text
    :return: The text with each newline character replaced by a space
    """
    if len(text) > 100:
        logger.warning(
            "fast-langdetect: Text may be too long. "
            "Consider passing only a single sentence for accurate prediction."
        )
    if "\n" in text:
        # The message states what actually happens: newlines are replaced
        # with spaces (not dropped), so adjacent words stay separated.
        logger.warning(
            "fast-langdetect: Input should not contain newline characters, "
            "or FastText will raise an error. "
            "Newline characters will be replaced with spaces."
        )
        text = text.replace("\n", " ")
    return text
265+
245266
@staticmethod
246267
def _normalize_text(text: str, should_normalize: bool = False) -> str:
247268
"""
@@ -258,7 +279,7 @@ def _normalize_text(text: str, should_normalize: bool = False) -> str:
258279
# If no normalization is needed, return the processed text
259280
if not should_normalize:
260281
return text
261-
282+
262283
# Check if text is all uppercase or mostly uppercase
263284
# https://github.com/LlmKira/fast-langdetect/issues/14
264285
if text.isupper() or (
@@ -317,18 +338,8 @@ def detect(
317338
DetectError: If detection fails
318339
"""
319340
model = self._get_model(low_memory)
341+
text = self._preprocess_text(text)
320342
normalized_text = self._normalize_text(text, self.config.normalize_input)
321-
if len(normalized_text) > 100:
322-
logger.warning(
323-
"fast-langdetect: Text may be too long. "
324-
"Consider passing only a single sentence for accurate prediction."
325-
)
326-
if "\n" in normalized_text:
327-
logger.warning(
328-
"fast-langdetect: Input should not contain newline characters. "
329-
"Removing them or FastText will raise an error."
330-
)
331-
normalized_text = normalized_text.replace("\n", " ")
332343
try:
333344
labels, scores = model.predict(normalized_text)
334345
return {
@@ -360,6 +371,7 @@ def detect_multilingual(
360371
DetectError: If detection fails
361372
"""
362373
model = self._get_model(low_memory)
374+
text = self._preprocess_text(text)
363375
normalized_text = self._normalize_text(text, self.config.normalize_input)
364376
try:
365377
labels, scores = model.predict(normalized_text, k=k, threshold=threshold)

0 commit comments

Comments
 (0)