We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent c016135 commit 7a42a23 (Copy full SHA for 7a42a23)
webstruct/text_tokenizers.py
@@ -110,6 +110,14 @@ class DefaultTokenizer(WordTokenizer):
110
def tokenize(self, text):
    """Tokenize ``text`` with the parent tokenizer, minus bare ',' and ';'.

    Standalone commas and semicolons are dropped because, when kept as
    tokens, they broke tag sets (e.g. PERSON->FUNCTION for input such as
    "PERSON, FUNCTION").

    Known downside of dropping them: with the punctuation gone, adjacent
    entities can be merged by the tagger, e.g.

        expected:  [PER-B, PER-I, FUNC-B]
        predicted: [PER-B, PER-I, PER-I ]

    :param text: the text to split into tokens.
    :return: list of the parent tokenizer's tokens, in order, excluding
        any token that is exactly ',' or ';'.
    """
    # FIXME: remove as token, but save as feature left/right_punct:","
    dropped = {',', ';'}
    kept = []
    for token in super(DefaultTokenizer, self).tokenize(text):
        if token in dropped:
            # Only tokens that are exactly ',' or ';' are discarded.
            continue
        kept.append(token)
    return kept
122
123
0 commit comments