From 591acc155877386e2e9cbe03a0ed613961b1c7bd Mon Sep 17 00:00:00 2001 From: intellild Date: Mon, 26 May 2025 09:58:27 +0800 Subject: [PATCH] make detect_language return Option --- src/bin/assess.rs | 2 +- src/lib.rs | 40 ++++++++++++++++++++-------------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/bin/assess.rs b/src/bin/assess.rs index 69efe20..f4a7d8e 100644 --- a/src/bin/assess.rs +++ b/src/bin/assess.rs @@ -25,7 +25,7 @@ fn main() -> io::Result<()> { if !label_codes.contains(&id_label_sentence[1]) { continue; } - let detected: Lang = whichlang::detect_language(id_label_sentence[2]); + let detected: Lang = whichlang::detect_language(id_label_sentence[2]).unwrap_or(Lang::Eng); total += 1; if detected.three_letter_code() != id_label_sentence[1] { error += 1; diff --git a/src/lib.rs b/src/lib.rs index a497e76..a141580 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,4 @@ +use std::cmp::Ordering; pub use crate::weights::{Lang, LANGUAGES}; #[allow(clippy::all)] @@ -45,7 +46,7 @@ impl Feature { } } -pub fn detect_language(text: &str) -> Lang { +pub fn detect_language(text: &str) -> Option<Lang> { let mut scores: [f32; NUM_LANGUAGES] = Default::default(); let mut num_features: u32 = 0; emit_tokens( @@ -62,8 +63,7 @@ pub fn detect_language(text: &str) -> Lang { }, ); if num_features == 0 { - // By default, we return English - return Lang::Eng; + return None; } let sqrt_inv_num_features = 1.0f32 / (num_features as f32).sqrt(); @@ -73,13 +73,13 @@ pub fn detect_language(text: &str) -> Lang { scores[i] = scores[i] * sqrt_inv_num_features + weights::INTERCEPTS[i]; } - let lang_id = scores + scores .iter() .enumerate() - .max_by(|(_, &score_left), (_, &score_right)| score_left.partial_cmp(&score_right).unwrap()) + .max_by(|(_, &score_left), (_, &score_right)| score_left.partial_cmp(&score_right).unwrap_or(Ordering::Equal)) .map(|(pos, _val)| pos) - .unwrap(); - weights::LANGUAGES[lang_id] + .map(|lang_id | weights::LANGUAGES[lang_id]) + } 
#[doc(hidden)] @@ -241,33 +241,33 @@ mod tests { #[test] fn test_empty_str() { - assert_eq!(detect_language(""), Lang::Eng); + assert_eq!(detect_language(""), None); } #[test] fn test_detect_language() { // English - assert_eq!(detect_language("Hello, happy tax payer"), Lang::Eng); + assert_eq!(detect_language("Hello, happy tax payer").unwrap(), Lang::Eng); // French - assert_eq!(detect_language("Bonjour joyeux contribuable"), Lang::Fra); + assert_eq!(detect_language("Bonjour joyeux contribuable").unwrap(), Lang::Fra); // German - assert_eq!(detect_language("Hallo glücklicher Steuerzahler"), Lang::Deu); + assert_eq!(detect_language("Hallo glücklicher Steuerzahler").unwrap(), Lang::Deu); // Japanese - assert_eq!(detect_language("こんにちは幸せな税金納め"), Lang::Jpn); + assert_eq!(detect_language("こんにちは幸せな税金納め").unwrap(), Lang::Jpn); // Mandarin chinese - assert_eq!(detect_language("你好幸福的纳税人"), Lang::Cmn); + assert_eq!(detect_language("你好幸福的纳税人").unwrap(), Lang::Cmn); // Turkish - assert_eq!(detect_language("Merhaba, mutlu vergi mükellefi"), Lang::Tur); + assert_eq!(detect_language("Merhaba, mutlu vergi mükellefi").unwrap(), Lang::Tur); // Dutch - assert_eq!(detect_language("Hallo, blije belastingbetaler"), Lang::Nld); + assert_eq!(detect_language("Hallo, blije belastingbetaler").unwrap(), Lang::Nld); // Korean - assert_eq!(detect_language("안녕하세요 행복한 납세자입니다"), Lang::Kor); + assert_eq!(detect_language("안녕하세요 행복한 납세자입니다").unwrap(), Lang::Kor); // Italian - assert_eq!(detect_language("Ciao, felice contribuente!"), Lang::Ita); + assert_eq!(detect_language("Ciao, felice contribuente!").unwrap(), Lang::Ita); // Spanish - assert_eq!(detect_language("Hola feliz contribuyente"), Lang::Spa); - assert_eq!(detect_language("¡Hola!"), Lang::Spa); + assert_eq!(detect_language("Hola feliz contribuyente").unwrap(), Lang::Spa); + assert_eq!(detect_language("¡Hola!").unwrap(), Lang::Spa); // Portuguese - assert_eq!(detect_language("Olá feliz contribuinte"), Lang::Por); + 
assert_eq!(detect_language("Olá feliz contribuinte").unwrap(), Lang::Por); } }