Merged

31 commits
36d56f2
text tokenizer returns positions of tokens
whalebot-helmsman Sep 21, 2017
2d4d2ef
update tests
whalebot-helmsman Sep 21, 2017
80658ca
separate statement for every action
whalebot-helmsman Sep 21, 2017
c52e449
comma preserving test
whalebot-helmsman Sep 21, 2017
8178776
too many tokens around
whalebot-helmsman Sep 21, 2017
51c0932
encode in indices instead of entities
whalebot-helmsman Sep 21, 2017
1a667ec
handle empty lists
whalebot-helmsman Sep 21, 2017
24465b1
pass token length and position from TextToken to HtmlToken
whalebot-helmsman Sep 21, 2017
06befbb
letter perfect detokenization
whalebot-helmsman Sep 22, 2017
e5730b2
do not cleanup tokenized tree by default, separate method for tree cl…
Sep 25, 2017
e340444
update tests for separate tree cleaning
Sep 25, 2017
89673c1
update tests for correct punctuation positions
Sep 25, 2017
7c45984
correct length for replaced quotes
Sep 25, 2017
46fc4df
pep8
Sep 29, 2017
388170e
comma at line end, not start
Sep 29, 2017
71caf61
one join instead of many additions, don't be Shlemiel
Sep 29, 2017
37d7470
correct formatting
Sep 29, 2017
e93c6dc
add clarification
Sep 29, 2017
e02c275
fix typo
Sep 29, 2017
f26569f
pep8
Sep 29, 2017
d1aecbb
preserve tokenize method for compatibility
Sep 29, 2017
35a9d88
function to reduce code in tests
Sep 29, 2017
9033188
remove test for nltk tokenizer
Sep 29, 2017
c14f363
test our behaviour, which differs from the original treebank tokenizer
Sep 29, 2017
a071cd4
remove useless conversion
Sep 29, 2017
a33f564
rename method to avoid confusion with nltk tokenize_span method
Sep 29, 2017
75a9698
remove brittle tests
Sep 29, 2017
4729323
small benchmark for html tokenizer
Sep 29, 2017
943a44e
Revert "remove brittle tests"
whalebot-helmsman Oct 2, 2017
ba7d6fe
move brittle tests to pytest xfail
whalebot-helmsman Oct 2, 2017
b72bcc1
expect behaviour of nltk tokenizer
whalebot-helmsman Oct 2, 2017
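The "one join instead of many additions, don't be Shlemiel" commit refers to Shlemiel the painter's algorithm: building a string with repeated += recopies the accumulated result on every step, which is quadratic in the number of pieces, while collecting the pieces and joining once is linear. A minimal sketch of the pattern (illustrative only; the PR's actual detokenization code is not shown on this page):

def join_slow(pieces):
    # Shlemiel's way: each += copies everything accumulated so far,
    # so n pieces cost O(n^2) character copies overall.
    out = ''
    for piece in pieces:
        out += piece
    return out

def join_fast(pieces):
    # One join: collect the pieces, allocate the result once.
    return ''.join(pieces)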
42 changes: 42 additions & 0 deletions webstruct/tests/test_text_tokenizer.py
@@ -0,0 +1,42 @@
import unittest
import pytest

from webstruct.text_tokenizers import TextToken, WordTokenizer


class TestTokenizerTest(unittest.TestCase):
    def do_tokenize(self, text, result):
        self.assertEqual(result, WordTokenizer().segment_words(text))

    @pytest.mark.xfail
    def test_phone(self):
        return self.do_tokenize(
            "Phone:855-349-1914",
            [TextToken(chars='Phone:855-349-1914', position=0, length=18)]
[Member review comment] This is not the output we're expecting; the phone number should be separated (as it was in the old doctests).

        )

    @pytest.mark.xfail
    def test_hyphen_mid(self):
        return self.do_tokenize(
            "Powai Campus, Mumbai-400077",
            [TextToken(chars='Powai', position=0, length=5),
             TextToken(chars='Campus', position=6, length=6),
             TextToken(chars=',', position=12, length=1),
             TextToken(chars='Mumbai-400077', position=14, length=13)]
        )

    @pytest.mark.xfail
    def test_hyphen_end(self):
        return self.do_tokenize(
            "Saudi Arabia-",
            [TextToken(chars='Saudi', position=0, length=5),
             TextToken(chars='Arabia-', position=6, length=7)]
        )

    @pytest.mark.xfail
    def test_slash(self):
        return self.do_tokenize(
            "1 5858/ 1800",
            [TextToken(chars='1', position=0, length=1),
             TextToken(chars='5858/', position=2, length=5),
             TextToken(chars='1800', position=8, length=4)]
        )
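Throughout this PR, a token's position and length refer to offsets in the original string, which is what makes the "letter perfect detokenization" commit possible. A minimal sketch of how a caller can slice the source span back out (the round-trip check below is illustrative, not part of the library):

from webstruct.text_tokenizers import WordTokenizer

text = "Powai Campus, Mumbai-400077"
tokens = WordTokenizer().segment_words(text)

for token in tokens:
    # Each token records exactly where it came from in the input.
    span = text[token.position:token.position + token.length]
    # For most tokens span == token.chars; tokens whose characters were
    # normalized (e.g. the replaced quotes handled in this PR) may differ,
    # but position/length still point at the original span.
    assert len(span) == token.length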
18 changes: 0 additions & 18 deletions webstruct/text_tokenizers.py
@@ -103,9 +103,6 @@ class WordTokenizer(object):

Some issues:

>>> WordTokenizer().segment_words("Phone:855-349-1914")
[TextToken(chars='Phone:855-349-1914', position=0, length=18)]

>>> WordTokenizer().segment_words("Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.")
[TextToken(chars='Copyright', position=0, length=9),
TextToken(chars=u'\xa9', position=10, length=1),
@@ -120,21 +117,6 @@ class WordTokenizer(object):
TextToken(chars='Reserved', position=51, length=8),
TextToken(chars='.', position=59, length=1)]

>>> WordTokenizer().segment_words("Powai Campus, Mumbai-400077")
[TextToken(chars='Powai', position=0, length=5),
TextToken(chars='Campus', position=6, length=6),
TextToken(chars=',', position=12, length=1),
TextToken(chars='Mumbai-400077', position=14, length=13)]

>>> WordTokenizer().segment_words("1 5858/ 1800")
[TextToken(chars='1', position=0, length=1),
TextToken(chars='5858/', position=2, length=5),
TextToken(chars='1800', position=8, length=4)]

>>> WordTokenizer().segment_words("Saudi Arabia-")
[TextToken(chars='Saudi', position=0, length=5),
TextToken(chars='Arabia-', position=6, length=7)]

"""

# regex, token
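Per the "preserve tokenize method for compatibility" commit, existing callers keep the old plain-token API while new code can use segment_words. A sketch of the presumed relationship (the compatibility shim itself is not shown in this diff, so the exact return value of tokenize is an assumption):

from webstruct.text_tokenizers import WordTokenizer

tokenizer = WordTokenizer()
text = "Saudi Arabia-"

# New API from this PR: TextToken tuples carrying position and length.
tokens = tokenizer.segment_words(text)

# Old API, kept for backwards compatibility; assumed to return just the
# token strings, i.e. [t.chars for t in tokens].
words = tokenizer.tokenize(text)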