Skip to content

Commit 08010cc

Browse files
committed
support cymraeg (welsh)
1 parent ce16b11 commit 08010cc

File tree

10 files changed

+328
-7
lines changed

10 files changed

+328
-7
lines changed

SUPPORTED_LANGUAGES.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -77,3 +77,4 @@ and [documentation](https://docs.rs/whatlang/).
7777
| Catalan | cat | `Lang::Cat` |
7878
| Tagalog | tgl | `Lang::Tgl` |
7979
| Armenian | hye | `Lang::Hye` |
80+
| Welsh | cym | `Lang::Cym` |

misc/alphabets/latin.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ aka: abdefghiklmnoprstuwyɔɛ
44
aze: abcdefghijklmnopqrstuvxyzçöüğışə̇
55
cat: abcdefghijklmnopqrstuvwxyz·àçèéíïòóúü
66
ces: abcdefghijklmnopqrstuvwxyzáéíóúýčďěňřšťůž
7+
cym: abcdefghijklmnopqrstuvwxyzàáâäèéêëìíîïòóôöùúûüýÿŵŷẁẃẅỳ
78
dan: abcdefghijklmnopqrstuvwxyzåæø
89
deu: abcdefghijklmnopqrstuvwxyzßäöü
910
eng: abcdefghijklmnopqrstuvwxyz

misc/alphabets/raw_latin.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@ latin_based:
3535
est: "ÄÖÕÜäöõü"
3636
lat: ""
3737
tgl: "áéíñóú"
38+
cym: "ÂÊÎÔÛŴŶÁÉÍÏâêîôûŵŷáéíïÓÚẂÝÀÈÌÒÙẀỲÄËÖÜẄŸóúẃýàèìòùẁỳäëöüẅÿ"
3839
others:
3940
tuk: "ABÇDEÄFGHIJŽKLMNŇOÖPRSŞTUÜWYÝZ"
4041
epo: "ABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ"

misc/data.json

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -120,7 +120,8 @@
120120
"ndo": "na |oku|wa | na|a o|a n|ka |ntu| uu|tu |uth| om|e o|mba|ong|omu|ba | ok|uut| ne|he |the|ang|hem|emb|unt|o o|a u| wo|nge| iy|ehe|kal| no|a w|o n|no |nga|e n|ko |mun|oka|lo |o i|lon|we |ulu|a m|ala| ke|la |a k|u n|han|ku |gwa|osh|shi|ana|ngu|ilo|ano|ngo|keh| mo|ga |nen|man|ho |luk|tha|ge |gul|u k|eng|ha |a y|elo|uko|a e|ye |hil|uka|li |go |wan|ath|wo |thi|dhi|uun| pa|kwa| ta|a p|ya | sh| ko|nka|lwa| os|mwe|oma|ta |ema|sho| ka|e m| yo|sha|wok|ika|po |o w|onk|e p|pan|ith|a i|opa|gel|hik|iya|hi |aan|una|o g|kuk|alo|o e|nok|ndj|le |a a|men|yom|a s|i n| li|and| po|pam|lat|kan|ash|waa|aka|ame|gam|umb|a t|ond|yuu|o k|olo|ane|ing|igw|aa |ele|kul|mon| gw|ilw|gan|o y|iil|iyo| el|kut|nin|oko|ike|o m| ku|adh| ye|amw|ome|yeh|aye| ga| on| yi|a g|lyo|ne | ng|mbo|opo|kug|eko|yok|wom| oy|non|iye| go|ulo|e e| we| e |ina|ant|omo|ene| a |i k|mok|him| dh|und|ndu| me|eho|wen|nek| op|alu|e g|ima|kat|ota|oye|ila|ngw|yop|wat|ela|o u|a l| ii| ay| nd| th|o l|yon|ili|oon|okw|yaa|taa|lwe|omb| ni|aku|i m|mo |ula|ekw|enw|iyu|pok|epa|uki|ke | wu| mb|meh|e t|uni|nom|dho|pau|eta|yi | ly|o a|ono|lun|lak|ola|yo |lol|ank|bo |i o|awa|nwa|a h|naw|hok|nem|kom|ndo|o s|u t|vet|mbu|ani|uga|ndi|ukw|udh|lok|e k|alw|kwe|kun| ya",
121121
"quy": "chi|nch|hik|una| ka|anc|kun|man|ana|aq |cha|aku|pas|as |sqa|paq|nan|qa |apa|kan|ikp|ik |ech|spa| de|pa |cho|ere|der|rec|am | ru|an | ma| ch|kpa|asq|ta |na |nam|nak|taq|a k|qan|ina|run|lli|ach|nap|pi |mi | ll|yoq|asp|ima|hay|hin|aqa|nku|ant|ayn|oyo| hi| im|hoy|cio|nta|nas|q k|api|iw |wan|kuy|kay|liw|aci|ion|ipa|lla|oq |npa|ay |kas|a m|nac| na|inc|all|ama|ari|anp| ya|chu| hu|nin|pip|i k|qmi|hon|w r|ata|awa|a c|ota|in |yku|yna| wa|a h|has|a d|iku|a l| li|pan|ich|may| pi| ha|onc|a r|onk| ot|ku | qa|ank|aqm|mun|anm|hu |a p|nma| mu|qta|n h|pap|isq|yni|ikm|ma |wsa|aws|kaw|ibr|bre|lib|ayk|usp|nqa|e k| al|lin|n k|re |ara|nat|yac|kma|war|huk|uwa|yta|hwa|chw| sa|was|kus|yan|m d|kpi|q m|a i|q l|kin|tap|a a|kta|ikt|i c|a s|uy | ca|qaw|uku| tu| re|aqt|ask|qsi|sak|uch|q h|cas|tin|pak|ris|ski|sic|q d|nmi|s l|naq|tuk|mpa|a y|k c|uma|ien|ypi| am|qaq|qap|eqs|ayp|req|qpa|aqp|law|ayt|q c|pun| ni|a q|ruw|i h|haw|n c| pa|amp|par|k h| le|yma|ñun|ern|huñ|nni|n r|anq|map|aya|tar|s m|uñu|ten|val|ura|ita|arm|isu|s c|onn|igu| ri|qku|naw|k l|u l|his|ley|say|s y|rim|aru|rma|sun|ier|s o|qar|n p|a f|a t|esq|n a|oqm|s i|awk| va|w n|hap|lap|kup|i r|kam|uyk|sap| qe|ual|m p|ran|nya|gua| pe| go|gob|maq|sum|ast| su| ig",
122122
"rmn": "aj |en | te|te | sa| le|aka|pen| si| e |el |ipe|si |kaj|sar| th|and| o |sav|qe |les| ma|es | ha|j t|hak|ja |ar |ave| an|a s|ta |i l|ia |nas| aj|ne | so|imn|mna|sqe|esq|nd |tha|haj|e s|e t|e a|enq|asq|man| ja|kan|e m| i | ta|the|mes|cia|bar|as |isa|utn|qo |hem|o s|s s| me|vel|ark|i t| na|kas|est| ba|s h|avo| di|ard| bi| pe|rka|lo | ak|ika|e r|a a| pr|e k|qi |mat|ima|e p|a t| av|e d|r s|n s|anu|nuś|o t|avi|orr|o a| ka| re|n a|re |aja|e o|sqo|sti| ov|õl |l p|nqe|ere|d o|vor|so |no |dik|rel|ove|n t|ve |e b|res|tim|ren| de|àci|o m|i a|but|len|ali|ari|rre|de | pa|ver| va|sqi|ara|ana|vip|rak|ang|vi | ra|or |ker|i s|eme|e z|ata|e l|a e|rip|rim|akh|la |o p|kar|e h|a p|na |ane|rin|ste|j b|er |ind|ni |tne| ph|nip|r t| ke|ti |are|ndo| je|l a|uśi|e n|khi| bu|kon|lim|al |tar|ekh|jek|àlo|o k| ko|rde|rab|aba| zi|ri |aća|ćar|śik|dõl|dor|on |ano|ven| ni|śaj| śa|khe|ća |ast|j s|uti|uni|tni|naś|i d|mut| po|i p|a m| pu|a l|l s|som|n n|ikh|nik|del|ala|ris|pes|pe |j m|enć|e e|nća|ndi|rdõ|kri|erd|śka|emu|men|alo|nis|aśt|śti|amu|kh |tis|uj |j p|do |ani|ate|nda|o b|nge|o z|soc|a d|muj|o j|da |pri|rdo| as|cie|l t|ro |i r|kla|ing|a j| ze|zen|j e|ziv|hin|aśk| st|maś|ran|pal|khl|mam|i b|oci|rea|l o|nqo| vi|n e",
123-
"lat": "is |et |us |um | et|ae |tat|ati| co|que|ue |ion| qu|em |ent|oni|est| su| iu| in| po|tio|tes|tis|ate|bus|e i|ita|ibu|ium|ius|qui|nti|eri|es |s p|con|s e|per|end|pot|ote| ha|nis| pr|s i|abe|uis|am |uae|tem|hab|bet|m h|ndi| ho|sta| de|sua|isq|squ|ter|ici|min|iur|one| re|hom| di| om|omn|rum|s a|t c|rat|lib|ibe|m e| pe|gen| li|ert|ine|nte|nem|ri |ber|tia|e q|dis| ip|ips| ad|di |nes|e s|e c|m p|s c| ve|e p| pa|ili| ge|a e|i p|nt |omi|atu|tur|rit| si|ne |psi|in |ia |ra |ari| cu|vit|rta|mo |to |mni|s h|e e|int|siu|m c|qua|t p|ivi|ini|ut |re |ers|it |s s|iae| es|t s|and| ne|pro| nu|st | ex|nda|cie|nib|t a|ere|tri|nit| at|tiu|ta |ris| ci|civ|ni |uri|ur |rim| vi|par|ad |ess|lic|i i| so| pu| op|rae| fa|s v| ut|dem|se |ons|o e|ria| se|e a| mo|leg|atq|tqu|com|te |niu|ien|vel|el | ma|t e|iis|gni|equ|oci|cip|ura|unt|s d|t i|ali|quo|ect| te|a s|t d| do|tut|ant|isc|ina|men|sin|ua |pra|oru|omm|eta|s n|a p|tum|iam|io |i c|sti| au|ver| ae|ito|dic|imi|s l|e d|fic|cia|t o|pub|ubl|bli|mun|i s|soc|aru|lar|ull|ori|t h|i e|sse|omo|cto|itu|tus| ea|ea |aeq|gio|ui |m s|er |m r| ra| fi|ffi|cog|da | le|mod|a c|mqu|nul|e o|era|ten|ntu|spe|o n|emo|cri|s f| ca|de |a d|rel|ii |ene| tu|sui|rti|sci|nae|m q|m a|egi|ces"
123+
"lat": "is |et |us |um | et|ae |tat|ati| co|que|ue |ion| qu|em |ent|oni|est| su| iu| in| po|tio|tes|tis|ate|bus|e i|ita|ibu|ium|ius|qui|nti|eri|es |s p|con|s e|per|end|pot|ote| ha|nis| pr|s i|abe|uis|am |uae|tem|hab|bet|m h|ndi| ho|sta| de|sua|isq|squ|ter|ici|min|iur|one| re|hom| di| om|omn|rum|s a|t c|rat|lib|ibe|m e| pe|gen| li|ert|ine|nte|nem|ri |ber|tia|e q|dis| ip|ips| ad|di |nes|e s|e c|m p|s c| ve|e p| pa|ili| ge|a e|i p|nt |omi|atu|tur|rit| si|ne |psi|in |ia |ra |ari| cu|vit|rta|mo |to |mni|s h|e e|int|siu|m c|qua|t p|ivi|ini|ut |re |ers|it |s s|iae| es|t s|and| ne|pro| nu|st | ex|nda|cie|nib|t a|ere|tri|nit| at|tiu|ta |ris| ci|civ|ni |uri|ur |rim| vi|par|ad |ess|lic|i i| so| pu| op|rae| fa|s v| ut|dem|se |ons|o e|ria| se|e a| mo|leg|atq|tqu|com|te |niu|ien|vel|el | ma|t e|iis|gni|equ|oci|cip|ura|unt|s d|t i|ali|quo|ect| te|a s|t d| do|tut|ant|isc|ina|men|sin|ua |pra|oru|omm|eta|s n|a p|tum|iam|io |i c|sti| au|ver| ae|ito|dic|imi|s l|e d|fic|cia|t o|pub|ubl|bli|mun|i s|soc|aru|lar|ull|ori|t h|i e|sse|omo|cto|itu|tus| ea|ea |aeq|gio|ui |m s|er |m r| ra| fi|ffi|cog|da | le|mod|a c|mqu|nul|e o|era|ten|ntu|spe|o n|emo|cri|s f| ca|de |a d|rel|ii |ene| tu|sui|rti|sci|nae|m q|m a|egi|ces",
124+
"cym": "yn | yn|dd | ma|ae |mae|au | y |d y|edd| r | m|ydd| ar| i |n y| o | cy|th | gw|ddi|eth|oed|ol |ar | gy| dd|wyd| ei| n | a |yd |odd| ga|aet|an | rh|iad|io |n a|ei |yr |wn |n c| ll| ca|n g|di |wed| me|od |o |el |n d|edi|r y|a |ith| we|ad | fe|er |r a|dau| da| am|on |d a|ch |l y|ddo| he| ch|roe| hy|e r| di|ynn|i | yr|dda|r g|gan|ir |d |ewn| ro|en | dy|fod| ff|iau|ll |mew| ym| de|id | sy|yw |dia| d|hyn| r|fyd|i g| un|eu |i d|nol|lla|u a|eit| ac| y|dol|i r|dio|wy |cyn|e |fel| ni|y |o r|idd|rth| go|l a|ai |efy|dyn| bo|rha| dr|rwy|ed |ada|n f|wyr|fer|ac |n e|rdd|aid|ael|nt |all|ion| tr|nyd|ach|gyf|cyf|r d|ig |h y|chw|ell|n b|d e|n o| by| ne| c|da | be|han|nia| oe|d o|r c|d g|dde|r o|ni |af |ara|n s| pe|lwy|gwe|wr |i a| br|in |gol| ge| g|rch|hef| ad|nod|nna|gyd|un | fa|d h| ys|d i|y d|e n|ria|es |am | an|dwy|ysg|y g|n |wyn|u c|l e|i f|gwy|efn|ddy|y c|dig|wys| eu|yda|n h|ych|thi|ant| yw|wei| ba|d c|n n|na |s y|yst|ryd|o a|i n|n m|u g|d d|law|i w|n i|n r| fo|ys |w |iae| co|do |nd |lia|red|t |y n|hau| ha|neu|u y|rhy|u r|bod| pr| ce|rae|gor|enn|gwa| a| pa|i c| f| er|lyn|rai|rif|ian|lli|nau|r h|lan|nwy|yfe|tha|r e|d m|diw|os |lle|ang| se|ddw|al |lad|o g|cae|ann|oli|a r|r b|rio"
124125
},
125126
"Cyrillic": {
126127
"rus": " пр| и |рав|ств| на|пра|го |ени|ове|во | ка|ани|ть | в | по| об|ия |сво| св|лов|на | че|ело|о н| со|ост|чел|ие |ого|ет |ния|ест|аво|ый |ажд| им|ние|век| не|льн|ли |ова|име|ать|при|т п|и п|каж|или|обо| ра|ых |жды| до|дый|воб|ек |бод|ва |й ч|его|ся |и с|ии |аци|еет|но |мее|и и|лен|ой |тва|ных|то | ил|к и|енн| бы|ию | за|ми |тво|и н|о п|ван|о с|сто|аль| вс|ом |о в|ьно|их |ног|и в|нов|ако|про|ий |сти|и о|пол|олж|дол|ое |бра|я в| ос|ным|жен|раз|ти |нос|я и| во|тор|все| ег|ей |тел|не |и р|ред|ель|тве|оди| ко|общ|о и| де|има|а и|чес|ним|сно|как| ли|щес|вле|ься|нны|аст|тьс|нно|осу|е д| от|пре|шен|а с|бще|осн|одн|быт|сов|ыть|лжн|ран|нию|иче|ак |ым |ват|что|сту|чен|е в| ст|рес|оль| ни|ном|род|ля |нар|вен|ду |оже|ны |е и| то|вер|а о|зов|м и|нац|ден|рин|туп|ежд|стр| чт|я п|она|дос|х и|й и|тоя|есп|лич|бес|обр|ото|о б|ьны|ь в|нии|е м|ую | мо|ем | ме|аро| ре|ава|кот|ав | вы|ам |жно|ста|ая |под|и к|ное| к | та| го|гос|суд|еоб|я н|ен |и д|мож|еск|ели|авн|ве |ече|уще|печ|дно|о д|ход|ка | дл|для|ово|ате|льс|ю и|в к|нен|ции|ной|уда|вов| бе|оро|нст|ами|циа|кон|сем|е о|вно| эт|азо|х п|ни |жде|м п|ког|от |дст|вны|сть|ые |о о|пос|сре|тра|ейс|так|и б|дов|му |я к|нал|дру| др|кой|тер|ь п|арс|изн|соц|еди|олн",

misc/supported_languages.csv

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -68,3 +68,4 @@ slk,Slovak,Slovenčina,5
6868
cat,Catalan,Català,10
6969
tgl,Tagalog,Tagalog,
7070
hye,Armenian,Հայերեն,7
71+
cym,Welsh,Cymraeg,0.5

src/alphabets/latin.rs

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@ const AKA: &str = "abdefghiklmnoprstuwyɔɛ";
1010
const AZE: &str = "abcdefghijklmnopqrstuvxyzçöüğışə̇";
1111
const CAT: &str = "abcdefghijklmnopqrstuvwxyz·àçèéíïòóúü";
1212
const CES: &str = "abcdefghijklmnopqrstuvwxyzáéíóúýčďěňřšťůž";
13+
const CYM: &str = "abcdefghijklmnopqrstuvwxyzàáâäèéêëìíîïòóôöùúûüýÿŵŷẁẃẅỳ";
1314
const DAN: &str = "abcdefghijklmnopqrstuvwxyzåæø";
1415
const DEU: &str = "abcdefghijklmnopqrstuvwxyzßäöü";
1516
const ENG: &str = "abcdefghijklmnopqrstuvwxyz";
@@ -49,6 +50,7 @@ const LATIN_ALPHABETS: &[(Lang, &str)] = &[
4950
(Lang::Aze, AZE),
5051
(Lang::Cat, CAT),
5152
(Lang::Ces, CES),
53+
(Lang::Cym, CYM),
5254
(Lang::Dan, DAN),
5355
(Lang::Deu, DEU),
5456
(Lang::Eng, ENG),
@@ -163,8 +165,8 @@ mod tests {
163165

164166
let outcome = alphabet_calculate_scores(&text, &filter);
165167
assert_eq!(outcome.count, 50);
166-
assert_eq!(outcome.raw_scores.len(), 36);
167-
assert_eq!(outcome.scores.len(), 36);
168+
assert_eq!(outcome.raw_scores.len(), 37);
169+
assert_eq!(outcome.scores.len(), 37);
168170

169171
let raw_scores_for = |lang: Lang| {
170172
outcome

src/core/detect.rs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -140,6 +140,7 @@ mod tests {
140140
Lang::Swe,
141141
Lang::Nob,
142142
Lang::Tgl,
143+
Lang::Cym,
143144
]);
144145
let options = Options::new().set_filter_list(filter_list);
145146
let output = detect_with_options(text, &options);

src/lang.rs

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -223,9 +223,12 @@ pub enum Lang {
223223

224224
/// Հայերեն (Armenian)
225225
Hye = 68,
226+
227+
/// Cymraeg (Welsh)
228+
Cym = 69,
226229
}
227230

228-
const VALUES: [Lang; 69] = [
231+
const VALUES: [Lang; 70] = [
229232
Lang::Epo,
230233
Lang::Eng,
231234
Lang::Rus,
@@ -295,6 +298,7 @@ const VALUES: [Lang; 69] = [
295298
Lang::Cat,
296299
Lang::Tgl,
297300
Lang::Hye,
301+
Lang::Cym,
298302
];
299303

300304
fn lang_from_code<S: Into<String>>(code: S) -> Option<Lang> {
@@ -368,6 +372,7 @@ fn lang_from_code<S: Into<String>>(code: S) -> Option<Lang> {
368372
"cat" => Some(Lang::Cat),
369373
"tgl" => Some(Lang::Tgl),
370374
"hye" => Some(Lang::Hye),
375+
"cym" => Some(Lang::Cym),
371376
_ => None,
372377
}
373378
}
@@ -443,6 +448,7 @@ fn lang_to_code(lang: Lang) -> &'static str {
443448
Lang::Cat => "cat",
444449
Lang::Tgl => "tgl",
445450
Lang::Hye => "hye",
451+
Lang::Cym => "cym",
446452
}
447453
}
448454

@@ -517,6 +523,7 @@ fn lang_to_name(lang: Lang) -> &'static str {
517523
Lang::Cat => "Català",
518524
Lang::Tgl => "Tagalog",
519525
Lang::Hye => "Հայերեն",
526+
Lang::Cym => "Cymraeg",
520527
}
521528
}
522529

@@ -591,6 +598,7 @@ fn lang_to_eng_name(lang: Lang) -> &'static str {
591598
Lang::Cat => "Catalan",
592599
Lang::Tgl => "Tagalog",
593600
Lang::Hye => "Armenian",
601+
Lang::Cym => "Welsh",
594602
}
595603
}
596604

@@ -700,7 +708,7 @@ mod tests {
700708

701709
#[test]
702710
fn test_all() {
703-
assert_eq!(Lang::all().len(), 69);
711+
assert_eq!(Lang::all().len(), 70);
704712
let all = Lang::all();
705713
assert!(all.contains(&Lang::Ukr));
706714
assert!(all.contains(&Lang::Swe));
@@ -726,7 +734,6 @@ mod tests {
726734
assert_eq!(Lang::Deu.to_string(), "Deutsch");
727735
assert_eq!(Lang::Eng.to_string(), "English");
728736
}
729-
730737
#[cfg(feature = "serde")]
731738
#[test]
732739
fn test_serialize_and_deserialize() {

src/scripts/lang_mapping.rs

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
use super::Script;
22
use crate::Lang;
33

4-
const LATIN_LANGS: [Lang; 36] = [
4+
const LATIN_LANGS: [Lang; 37] = [
55
Lang::Spa,
66
Lang::Eng,
77
Lang::Por,
@@ -38,6 +38,7 @@ const LATIN_LANGS: [Lang; 36] = [
3838
Lang::Lav,
3939
Lang::Est,
4040
Lang::Lat,
41+
Lang::Cym,
4142
];
4243
const CYRILLIC_LANGS: [Lang; 6] = [
4344
Lang::Rus,

0 commit comments

Comments
 (0)