Collation key: keep every tone mark, not only the first one.

The key built by _thkey now appends all _RE_TONE matches in their original
order, so words that differ only in a later mark no longer share a key.
The "index" blob ids below are carried over from the original patch.

diff --git a/pythainlp/util/collate.py b/pythainlp/util/collate.py
index 7c2395d57..b92a75e5f 100644
--- a/pythainlp/util/collate.py
+++ b/pythainlp/util/collate.py
@@ -15,7 +15,16 @@
 def _thkey(word: str) -> str:
+    """Return the sort key used to collate *word*.
+
+    The key is *word* with every ``_RE_TONE`` match removed and the two
+    ``_RE_LV_C`` groups swapped, followed by all of the removed marks in
+    the order they appeared in *word*.
+    """
     cv = _RE_TONE.sub("", word)  # remove tone
     cv = _RE_LV_C.sub("\\2\\1", cv)  # switch lead vowel
-    tone = _RE_TONE.sub(" ", word)  # just tone
+
+    # Keep every mark, not just the first: words that differ only in a
+    # later mark must still get distinct keys to sort deterministically.
+    tone = "".join(match.group() for match in _RE_TONE.finditer(word))
     return cv + tone
 
 
diff --git a/tests/test_util.py b/tests/test_util.py
index 3fdcfee04..ce29784e5 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -79,6 +79,18 @@ def test_collate(self):
             collate(["ไก่", "เป็ด", "หมู", "วัว"]),
             ["ไก่", "เป็ด", "วัว", "หมู"],
         )
+        self.assertEqual(
+            collate(["ก้วย", "ก๋วย", "กวย", "ก่วย", "ก๊วย"]),
+            collate(["ก๋วย", "ก่วย", "ก้วย", "ก๊วย", "กวย"]),
+        )  # should guarantee same order
+        self.assertEqual(
+            collate(["ก้วย", "ก๋วย", "ก่วย", "กวย", "ก้วย", "ก่วย", "ก๊วย"]),
+            ["กวย", "ก่วย", "ก่วย", "ก้วย", "ก้วย", "ก๊วย", "ก๋วย"],
+        )
+        self.assertEqual(
+            collate(["ก่าก้า", "ก่าก่า"]),
+            collate(["ก่าก่า", "ก่าก้า"]),
+        )  # marks after the first one must also fix the order
 
     # ### pythainlp.util.numtoword
 