Skip to content

Commit 834ac42

Browse files
committed
add inside_bold_tag feature and also tokenize the comma at the end of a string
1 parent bbbd394 commit 834ac42

File tree

4 files changed

+17
-3
lines changed

4 files changed

+17
-3
lines changed

webstruct/features/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
DEFAULT = CombinedFeatures(
1313
parent_tag,
1414
inside_a_tag,
15+
inside_bold_tag,
1516
borders,
1617
block_length,
1718

webstruct/features/block_features.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
11
# -*- coding: utf-8 -*-
22
from __future__ import absolute_import
33

4-
__all__ = ['parent_tag', 'inside_a_tag', 'borders', 'block_length']
4+
__all__ = ['parent_tag', 'inside_a_tag', 'inside_bold_tag', 'borders', 'block_length']
55

66
def parent_tag(index, tokens, elem, is_tail):
77
return {'parent_tag': elem.tag if not is_tail else elem.getparent().tag}
88

99
def inside_a_tag(index, token, elem, is_tail):
1010
return {'inside_a_tag': any(e is not None for e in elem.iterancestors('a'))}
1111

12+
def inside_bold_tag(index, token, elem, is_tail):
13+
return {'inside_bold_tag': any(e is not None for e in elem.iterancestors('strong'))}
14+
1215
def borders(index, tokens, elem, is_tail):
1316
return {
1417
'border_at_left': index == 0,

webstruct/features/data_features.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,11 @@
3030
place pl
3131
ridgeway parkway highway
3232
park
33+
unit
34+
block
3335
'''.split())
3436

35-
COMMON_ADDRESS_PARTS = set('''suite floor p.o. center'''.split())
37+
COMMON_ADDRESS_PARTS = set('''suite floor p.o. po center'''.split())
3638
DIRECTIONS = set('''
3739
north south east west
3840
N S E W N. S. E. W.

webstruct/tokenize.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,14 @@ class WordTokenizer(object):
1515
>>> WordTokenizer().tokenize(s)
1616
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Email:', 'muffins@gmail.com']
1717
18+
>>> s = '''Shelbourne Road,'''
19+
>>> WordTokenizer().tokenize(s)
20+
['Shelbourne', 'Road', ',']
21+
22+
>>> s = '''Shelbourne Road,1000'''
23+
>>> WordTokenizer().tokenize(s)
24+
['Shelbourne', 'Road,1000']
25+
1826
"""
1927
def tokenize(self, text):
2028
#starting quotes
@@ -23,7 +31,7 @@ def tokenize(self, text):
2331
text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)
2432

2533
#punctuation
26-
text = re.sub(r'([,])([^\d])', r' \1 \2', text) # CHANGED :
34+
text = re.sub(r'([,])(?![\d])', r' \1 ', text) # CHANGED :
2735
text = re.sub(r'\.\.\.', r' ... ', text)
2836
text = re.sub(r'[;#$%&]', r' \g<0> ', text) # CHANGED @
2937

0 commit comments

Comments (0)