Skip to content

Commit 834ac42

Browse files
committed
add inside_bold_tag feature and also tokenize the comma at the end of a string
1 parent bbbd394 commit 834ac42

File tree

4 files changed

+17
-3
lines changed

4 files changed

+17
-3
lines changed

webstruct/features/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
DEFAULT = CombinedFeatures(
1313
parent_tag,
1414
inside_a_tag,
15+
inside_bold_tag,
1516
borders,
1617
block_length,
1718

webstruct/features/block_features.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
11
# -*- coding: utf-8 -*-
22
from __future__ import absolute_import
33

4-
__all__ = ['parent_tag', 'inside_a_tag', 'borders', 'block_length']
4+
__all__ = ['parent_tag', 'inside_a_tag', 'inside_bold_tag', 'borders', 'block_length']
55

66
def parent_tag(index, tokens, elem, is_tail):
77
return {'parent_tag': elem.tag if not is_tail else elem.getparent().tag}
88

99
def inside_a_tag(index, token, elem, is_tail):
1010
return {'inside_a_tag': any(e is not None for e in elem.iterancestors('a'))}
1111

12+
def inside_bold_tag(index, token, elem, is_tail):
13+
return {'inside_bold_tag': any(e is not None for e in elem.iterancestors('strong'))}
14+
1215
def borders(index, tokens, elem, is_tail):
1316
return {
1417
'border_at_left': index == 0,

webstruct/features/data_features.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,11 @@
3030
place pl
3131
ridgeway parkway highway
3232
park
33+
unit
34+
block
3335
'''.split())
3436

35-
COMMON_ADDRESS_PARTS = set('''suite floor p.o. center'''.split())
37+
COMMON_ADDRESS_PARTS = set('''suite floor p.o. po center'''.split())
3638
DIRECTIONS = set('''
3739
north south east west
3840
N S E W N. S. E. W.

webstruct/tokenize.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,14 @@ class WordTokenizer(object):
1515
>>> WordTokenizer().tokenize(s)
1616
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Email:', 'muffins@gmail.com']
1717
18+
>>> s = '''Shelbourne Road,'''
19+
>>> WordTokenizer().tokenize(s)
20+
['Shelbourne', 'Road', ',']
21+
22+
>>> s = '''Shelbourne Road,1000'''
23+
>>> WordTokenizer().tokenize(s)
24+
['Shelbourne', 'Road,1000']
25+
1826
"""
1927
def tokenize(self, text):
2028
#starting quotes
@@ -23,7 +31,7 @@ def tokenize(self, text):
2331
text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)
2432

2533
#punctuation
26-
text = re.sub(r'([,])([^\d])', r' \1 \2', text) # CHANGED :
34+
text = re.sub(r'([,])(?![\d])', r' \1 ', text) # CHANGED :
2735
text = re.sub(r'\.\.\.', r' ... ', text)
2836
text = re.sub(r'[;#$%&]', r' \g<0> ', text) # CHANGED @
2937

0 commit comments

Comments (0)