|
52 | 52 |
|
53 | 53 |
|
def tcc(w):
    """Split ``w`` into consecutive chunks and yield them in order.

    At each position the alternation of the module-level ``pat_tcc``
    patterns is matched against the remaining text. The matched span is
    yielded as one chunk; when nothing matches, a single character is
    yielded instead. Concatenating the yielded chunks reproduces ``w``.

    NOTE(review): the name suggests the chunks are Thai Character Clusters;
    the patterns themselves live outside this function - confirm there.

    :param w: text to split
    :return: generator of non-empty substrings of ``w``
    """
    p = 0
    pat = re.compile("|".join(pat_tcc))
    while p < len(w):
        m = pat.match(w[p:])
        # A match object is always truthy, even for a zero-length match
        # (e.g. an empty pattern list compiles to ""). Always advance by at
        # least one character so the loop cannot spin forever yielding "".
        n = m.end() if m else 0
        if n < 1:
            n = 1
        yield w[p:p + n]
        p += n
65 | 65 |
|
66 | 66 |
|
def tcc_pos(text):
    """Return the set of end offsets of the chunks that ``tcc`` yields.

    Each offset is the running total of chunk lengths, i.e. the index in
    ``text`` just past a chunk. Offset 0 is never included; for non-empty
    text the final offset equals ``len(text)``.

    :param text: text to analyse
    :return: set of int offsets
    """
    boundaries = set()
    offset = 0
    for size in map(len, tcc(text)):
        offset += size
        boundaries.add(offset)
    return boundaries
74 | 74 |
|
75 | 75 |
|
def serialize(words_at, p, p2):
    """Yield every word sequence that spans exactly from ``p`` to ``p2``.

    ``words_at`` maps a start position to the words that begin there. The
    search is depth first: each word at ``p`` is followed by every sequence
    that covers the rest of the range. Words that overshoot ``p2`` are
    dropped.

    :param words_at: mapping (or other indexable) of position -> words
    :param p: start position
    :param p2: end position the sequence must land on exactly
    :return: generator of lists of words
    """
    # Find all paths, depth first.
    for w in words_at[p]:
        p_ = p + len(w)
        if p_ == p2:
            yield [w]
        elif p < p_ < p2:
            # Recurse only when the word advances the position; a
            # zero-length word would otherwise recurse on the same
            # position until RecursionError.
            for path in serialize(words_at, p_, p2):
                yield [w] + path
85 | 85 |
|
86 | 86 |
|
def onecut(text, data=None):
    """Lazily segment ``text`` into words using a dictionary trie.

    Candidate words are the trie prefixes found at each reachable position,
    restricted to those ending on a boundary reported by ``tcc_pos``. Once
    all candidate paths converge on a single position, the path with the
    fewest words is emitted. Text not covered by the dictionary is emitted
    as one chunk, either the span matched by ``pat_eng`` or the shortest
    skip to a position where matching can resume.

    :param data: iterable of words for a custom dictionary. ``None`` (the
        default) or the legacy marker ``['']`` selects the module-level
        ``THAI_WORDS`` trie.
    :return: generator yielding the words of ``text`` in order
    """
    # [''] is the historical "no custom dictionary" marker; keep honouring
    # it so existing callers behave as before.
    if data is None or data == ['']:
        trie = THAI_WORDS
    else:
        trie = Trie(data)
    words_at = defaultdict(list)  # main data structure: start position -> words
    allow_pos = tcc_pos(text)  # cut positions must coincide with tcc boundaries

    q = [0]  # min-heap queue of positions still to explore
    last_p = 0  # last position for yield
    while q[0] < len(text):
        p = heappop(q)

        for w in trie.prefixes(text[p:]):
            p_ = p + len(w)
            if p_ in allow_pos:  # keep only candidates consistent with tcc
                words_at[p].append(w)
                if p_ not in q:
                    heappush(q, p_)

        # Queue length 1: no ambiguity left, so everything since last_p
        # can be emitted.
        if len(q) == 1:
            paths = serialize(words_at, last_p, q[0])
            for w in min(paths, key=len):
                yield w
            last_p = q[0]

        # Queue length 0: nothing in the dictionary starts at p.
        if not q:
            m = pat_eng.match(text[p:])
            if m:  # English, digits, whitespace (per the original comment)
                i = p + m.end()
            else:  # skip as little as possible
                for i in range(p + 1, len(text)):
                    if i in allow_pos:  # resume only on a tcc boundary
                        # NOTE(review): relies on prefixes() returning a
                        # sized container; a generator would always be
                        # truthy - confirm against the Trie implementation.
                        if trie.prefixes(text[i:]) or pat_eng.match(text[i:]):
                            break
                else:
                    # No resumable position found: consume the rest.
                    i = len(text)
            w = text[p:i]
            words_at[p].append(w)
            yield w
            last_p = i
            heappush(q, i)
133 | 133 |
|
134 | 134 | # ช่วยให้ไม่ต้องพิมพ์ยาวๆ |
135 | 135 |
|
136 | 136 |
|
def mmcut(text, data=None):
    """Return the words ``onecut`` yields for ``text`` as a list.

    Shorthand wrapper so callers need not consume the generator themselves.

    :param text: text to segment
    :param data: optional custom word list forwarded to ``onecut``.
        ``None`` (the default) means no custom dictionary.
    :return: list of words
    """
    if data is None:
        # onecut treats [''] as "use the built-in dictionary"; build the
        # marker per call rather than sharing a mutable default.
        data = ['']
    return list(onecut(text, data=data))
0 commit comments