@@ -251,45 +251,45 @@ def normalize(text: str) -> str:
251251
252252
def expand_maiyamok(sent: Union[str, List[str]]) -> List[str]:
    """
    Expand Maiyamok.

    Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word
    repetition. This function preprocesses Thai text by replacing
    Maiyamok with a word being repeated.

    Each Maiyamok repeats the closest preceding non-whitespace token once
    more, so consecutive Maiyamok ("นกๆๆ") repeat the word several times.
    Whitespace tokens located between a word and a following Maiyamok are
    dropped. A Maiyamok that has no preceding word is discarded.

    :param Union[str, List[str]] sent: input sentence (list or str);
        a str is tokenized with ``word_tokenize`` first
    :return: list of words
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import expand_maiyamok

        expand_maiyamok("เด็กๆกิน")
        # output: ['เด็ก', 'เด็ก', 'กิน']

        expand_maiyamok(["นกๆ", " ", "ๆ"])
        # output: ['นก', 'นก', 'นก']
    """
    if isinstance(sent, str):
        sent = word_tokenize(sent)

    yamok = "ๆ"

    # Detach Maiyamok that is glued to other text, e.g. "นกๆๆ", "นกๆคน".
    # The capturing group makes re.split keep each "ๆ" as its own item;
    # empty strings produced by the split are discarded.
    tokens: List[str] = []
    for token in sent:
        tokens.extend(part for part in re.split(r"(ๆ)", token) if part)

    # Walk backward so that, when a word is reached, the number of
    # Maiyamok that follow it is already known.
    output_toks: List[str] = []
    yamok_count = 0
    for token in reversed(tokens):
        if token == yamok:
            yamok_count += 1
        elif yamok_count and token.isspace():
            # Whitespace sitting between a word and its Maiyamok: drop it
            # and keep looking for the word to repeat.
            continue
        else:
            output_toks.extend([token] * (yamok_count + 1))
            yamok_count = 0

    # Tokens were collected in reverse order; restore the original order.
    output_toks.reverse()
    return output_toks
293293
294294
295295def maiyamok (sent : Union [str , List [str ]]) -> List [str ]:
0 commit comments