Skip to content

Commit 22aa4c3

Browse files
committed
Fix expand_maiyamok
1 parent cd9c8b8 commit 22aa4c3

File tree

1 file changed

+35
-35
lines changed

1 file changed

+35
-35
lines changed

pythainlp/util/normalize.py

Lines changed: 35 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -251,45 +251,45 @@ def normalize(text: str) -> str:
251251

252252

253253
def expand_maiyamok(sent: Union[str, List[str]]) -> List[str]:
    """
    Expand Maiyamok.

    Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word
    repetition. This function preprocesses Thai text by replacing
    each Maiyamok with the word being repeated.

    A Maiyamok attached to other characters in the same token
    (e.g. "นกๆๆ", "นกๆ ๆ", "นกๆคน") is split off first, and whitespace
    between a word and the Maiyamok that follows it is dropped.
    A Maiyamok with no word before it is discarded.

    :param Union[str, List[str]] sent: input sentence (list or str);
        a str is tokenized with ``word_tokenize`` first
    :return: list of words
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import expand_maiyamok

        expand_maiyamok("เด็กๆกิน")
        # output: ['เด็ก', 'เด็ก', 'กิน']
    """
    if isinstance(sent, str):
        sent = word_tokenize(sent)

    yamok = "ๆ"

    # Break off any Maiyamok that is attached to other characters,
    # e.g. "นกๆๆ", "นกๆ ๆ", "นกๆคน". The capturing group keeps each
    # Maiyamok as its own piece; empty pieces from the split are dropped.
    pieces: List[str] = []
    for token in sent:
        pieces.extend(tok for tok in re.split(r"(ๆ)", token) if tok)

    # Walk backward so that, on reaching a word, we already know how many
    # Maiyamok follow it.
    output_toks: List[str] = []
    yamok_count = 0
    for token in reversed(pieces):
        if token == yamok:
            yamok_count += 1
        elif yamok_count and token.isspace():
            # Whitespace between a word and its Maiyamok: drop it and keep
            # the pending count for the word that precedes it.
            continue
        else:
            # Emit the token once, plus once more per pending Maiyamok.
            output_toks.extend([token] * (yamok_count + 1))
            yamok_count = 0

    # Tokens were collected back-to-front; restore the original order.
    output_toks.reverse()
    return output_toks
293293

294294

295295
def maiyamok(sent: Union[str, List[str]]) -> List[str]:

0 commit comments

Comments
 (0)