Skip to content

Commit 3556a57

Browse files
authored
Merge pull request #47 from whalebot-helmsman/master
Move annotations from GATE to WebAnnotator format
2 parents 7a42a23 + 2a7b013 commit 3556a57

File tree

376 files changed

+35426
-34890
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

376 files changed

+35426
-34890
lines changed

.gitignore

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,7 @@ notebooks/*.ipynb
5050
notebooks/*.marisa
5151
notebooks/*.wapiti
5252
notebooks/*.crfsuite
53-
webstruct_data/corpus/random_pages/wa/*.html
5453
webstruct_data/corpus/us_contact_pages/cleaned
5554
example/_data/*
5655
example/*.joblib
57-
example/*.html
56+
example/*.html

example/ner/data.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,14 @@ def load_webstruct_data() -> List:
4444
)
4545

4646
trees2 = webstruct.load_trees(
47-
str(WEBSTRUCT_DATA / "corpus/us_contact_pages/annotated/*.xml"),
48-
loader=gate_loader
47+
str(WEBSTRUCT_DATA / "corpus/us_contact_pages/wa/*.html"),
48+
loader=wa_loader
4949
)
5050
trees = chain(trees1, trees2)
51-
return list(pages_progress(trees, desc="Loading webstruct default annotated data"))
51+
return list(pages_progress(
52+
trees,
53+
desc="Loading webstruct default annotated data"
54+
))
5255

5356

5457
def load_countries() -> Set[str]:

webstruct/annotation_converter.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import logging
2+
import argparse
3+
4+
import webstruct.loaders
5+
from webstruct.webannotator import EntityColors, to_webannotator
6+
7+
8+
def main():
    """Convert one GATE-annotated file into WebAnnotator format.

    Command-line options:

    * ``--GATE`` -- path to the input file annotated in GATE format.
    * ``--sample`` -- path to a file already in WebAnnotator format; the
      entity names and colors read from it are reused for the output.
    * ``--WebAnnotator`` -- path the converted document is written to.
    * ``--loglevel`` -- name of a ``logging`` level (default ``INFO``).
    """
    cmdline = argparse.ArgumentParser(description=('utility '
                                                   'to convert annotations '
                                                   'from GATE format to '
                                                   'WebAnnotator format'))
    cmdline.add_argument('--GATE',
                         help='path to file annotated in GATE format',
                         type=str,
                         required=True)
    cmdline.add_argument('--sample',
                         help=('path to file annotated in WebAnnotator format '
                               'for colors and entities transfer'),
                         type=str,
                         required=True)
    cmdline.add_argument('--WebAnnotator',
                         help='path to result file in WebAnnotator format',
                         type=str,
                         required=True)
    cmdline.add_argument('--loglevel',
                         help='logging level',
                         type=str,
                         default='INFO')
    args = cmdline.parse_args()

    # The separating space after the level name keeps "[INFO]" from being
    # glued to the path in every log line.
    logging.basicConfig(level=getattr(logging, args.loglevel.upper()),
                        format=('%(asctime)s [%(levelname)s] '
                                '%(pathname)s:%(lineno)d %(message)s'))

    with open(args.sample, 'rb') as sample_reader:
        colors = EntityColors.from_htmlbytes(sample_reader.read())
    # Iterating the colors object yields the entity type names.
    entities = list(colors)

    logging.debug('Current entities %s', entities)
    logging.debug('Current colors %s', colors)

    gate = webstruct.loaders.GateLoader(known_entities=entities)
    tokenizer = webstruct.HtmlTokenizer(tagset=entities)
    with open(args.GATE, 'rb') as reader:
        data = reader.read()
    tree = gate.loadbytes(data)

    # NOTE(review): the tokenization result is not used here; the call is
    # kept because it presumably acts as a sanity check of the loaded
    # annotations -- confirm before removing it.
    tokenizer.tokenize_single(tree)

    tree = to_webannotator(tree, entity_colors=colors)
    with open(args.WebAnnotator, 'wb') as writer:
        tree.write(writer, method='html', pretty_print=True)


if __name__ == "__main__":
    main()

webstruct/annotation_verifier.py

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
import json
2+
import logging
3+
import argparse
4+
5+
import webstruct.loaders
6+
import webstruct.webannotator
7+
8+
DEFAULT_ENTITIES = [
9+
'ORG', 'TEL', 'FAX', 'HOURS',
10+
'STREET', 'CITY', 'STATE', 'ZIPCODE', 'COUNTRY',
11+
'EMAIL', 'PER', 'FUNC', 'SUBJ'
12+
]
13+
14+
15+
def nodes_difference(l, r):
    """Describe the first difference found between two element nodes.

    The nodes are compared, in this order, by tag, by attributes (sorted
    by attribute name), by whitespace-stripped text, by whitespace-stripped
    tail and by number of children. Children themselves are not compared.

    Neither node is modified.

    :param l: left element (needs ``tag``, ``attrib``, ``text``, ``tail``
        and ``len()``).
    :param r: right element, same requirements.
    :return: ``None`` when no difference is found, otherwise a one-item
        dict whose key names the differing aspect (``'tag'``,
        ``'attributes'``, ``'text'``, ``'tail'`` or ``'children count'``)
        and whose value is a human-readable description.
    """
    if l.tag != r.tag:
        return {'tag': '"{0}" != "{1}"'.format(l.tag, r.tag)}

    # Attribute names are unique per element, so sorting the (name, value)
    # pairs orders them by name.
    l_attrib = sorted(l.attrib.items())
    r_attrib = sorted(r.attrib.items())

    for l_attr, r_attr in zip(l_attrib, r_attrib):
        if l_attr != r_attr:
            return {'attributes': '"{0}" != "{1}"'.format(l_attr, r_attr)}

    # All shared positions match; report the first surplus attribute.
    common = min(len(l_attrib), len(r_attrib))
    if common < len(l_attrib):
        return {'attributes': "{0} != None".format(l_attrib[common])}

    if common < len(r_attrib):
        return {'attributes': "None != {0}".format(r_attrib[common])}

    # Missing text/tail is treated as an empty string. The stripped values
    # are kept in locals so the input trees stay untouched.
    l_text = (l.text or '').strip()
    r_text = (r.text or '').strip()
    if l_text != r_text:
        return {'text': "{0} != {1}".format(l_text, r_text)}

    l_tail = (l.tail or '').strip()
    r_tail = (r.tail or '').strip()
    if l_tail != r_tail:
        return {'tail': "{0} != {1}".format(l_tail, r_tail)}

    if len(l) != len(r):
        return {'children count': "{0} != {1}".format(len(l), len(r))}

    return None
65+
66+
67+
def node_path(node):
    """Build a path string locating ``node`` inside its tree.

    The path lists every ancestor from the root down to ``node``. Each
    step has the form ``<index>:<tag>`` followed by ``/``, where ``index``
    is the position of the element among its parent's children (``0`` for
    the root, which has no parent).

    :param node: element providing ``getparent()`` and ``tag``; parents
        must provide ``index(child)``. ``None`` yields an empty string.
    :return: the path, e.g. ``'0:html/1:body/'``.
    """
    steps = []
    current = node
    while current is not None:
        parent = current.getparent()
        # Compare against None explicitly: truth-testing an lxml element
        # reflects its child count and is deprecated.
        idx = parent.index(current) if parent is not None else 0
        steps.append('{0}:{1}'.format(idx, current.tag))
        current = parent

    # Steps were collected leaf-to-root; emit them root-to-leaf.
    return ''.join(step + '/' for step in reversed(steps))
80+
81+
82+
def tree_difference(l, r):
    """Find the first differing pair of nodes in two trees.

    Both trees are walked breadth-first in lockstep; each pair of
    corresponding nodes is checked with ``nodes_difference``.

    :param l: root element of the left tree.
    :param r: root element of the right tree.
    :return: ``None`` when no difference is found, otherwise a dict with
        the paths of the two nodes (``"l"``, ``"r"``) as produced by
        ``node_path`` and the difference description (``"diff"``).
    """
    from collections import deque

    # FIFO queue of node pairs still to compare; deque gives O(1) pops
    # from the front.
    pending = deque([(l, r)])
    while pending:
        l_node, r_node = pending.popleft()
        diff = nodes_difference(l_node, r_node)

        if diff:
            return {"l": node_path(l_node),
                    "r": node_path(r_node),
                    "diff": diff}

        # nodes_difference reports a child-count mismatch, so reaching
        # this point means both nodes have equally many children.
        pending.extend(zip(l_node, r_node))

    return None
97+
98+
99+
def main():
    """Verify that a WebAnnotator file matches its GATE original.

    Command-line options:

    * ``--GATE`` -- path to the file annotated in GATE format.
    * ``--WebAnnotator`` -- path to the file annotated in WebAnnotator
      format.
    * ``--entity`` -- entity type to verify against; may be repeated.
      ``DEFAULT_ENTITIES`` is used when the option is absent.
    * ``--loglevel`` -- name of a ``logging`` level (default ``INFO``).

    Both files are loaded and tokenized, then their trees and their
    annotation sequences are compared. Every difference found is logged
    as an error.

    :return: ``True`` when no difference was found, ``False`` otherwise.
    """
    cmdline = argparse.ArgumentParser(description=('utility to verify '
                                                   'annotation conversion '
                                                   'from GATE format '
                                                   'to WebAnnotator format'))
    cmdline.add_argument('--GATE',
                         help='path to file annotated in GATE format',
                         type=str,
                         required=True)
    cmdline.add_argument('--WebAnnotator',
                         help='path to file annotated in WebAnnotator format',
                         type=str,
                         required=True)
    cmdline.add_argument('--entity',
                         help='enitity type to verify against',
                         type=str,
                         action='append',
                         required=False)
    cmdline.add_argument('--loglevel',
                         help='logging level',
                         type=str,
                         default='INFO')
    args = cmdline.parse_args()

    logging.basicConfig(level=getattr(logging, args.loglevel.upper()),
                        format=('%(asctime)s [%(levelname)s] '
                                '%(pathname)s:%(lineno)d %(message)s'))

    entities = args.entity if args.entity else DEFAULT_ENTITIES

    logging.debug('Known entities %s', entities)

    gate = webstruct.loaders.GateLoader(known_entities=entities)
    wa = webstruct.loaders.WebAnnotatorLoader(known_entities=entities)

    tokenizer = webstruct.HtmlTokenizer(tagset=entities)
    with open(args.GATE, 'rb') as reader:
        data = reader.read()
    gate_tree = gate.loadbytes(data)
    # Only the annotations are compared below; the tokens are not needed.
    _, gate_annotations = tokenizer.tokenize_single(gate_tree)

    with open(args.WebAnnotator, 'rb') as reader:
        data = reader.read()
    wa_tree = wa.loadbytes(data)
    _, wa_annotations = tokenizer.tokenize_single(wa_tree)

    is_diff = False
    tree_diff = tree_difference(gate_tree, wa_tree)
    if tree_diff:
        logging.error('tree differs %s', json.dumps(tree_diff))
        is_diff = True

    # zip() below stops at the shorter sequence, so a length mismatch has
    # to be reported separately or surplus annotations would be ignored.
    if len(gate_annotations) != len(wa_annotations):
        logging.error('annotation count differs %d != %d',
                      len(gate_annotations), len(wa_annotations))
        is_diff = True

    annot_diff = [
        {'idx': idx, 'gate_a': gate_a, 'wa_a': wa_a}
        for idx, (gate_a, wa_a) in enumerate(zip(gate_annotations,
                                                 wa_annotations))
        if gate_a != wa_a
    ]

    if annot_diff:
        logging.error('annotation differs %s', json.dumps(annot_diff))
        is_diff = True

    return not is_diff


if __name__ == "__main__":
    import sys

    # Propagate the verification result: exit status 0 when the files
    # agree, 1 when any difference was logged.
    sys.exit(0 if main() else 1)

webstruct/tests/test_html_tools.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import unittest
2+
3+
import lxml
4+
5+
import webstruct.annotation_verifier
6+
7+
# HTML snippets used as fixtures by the tests below.
html_1 = '<span style="abc" class="span">aa</span>'
html_2 = '<span style="abd" class="span">aa</span>'
html_3 = '<span style="abd" class="span">aa<p>s</p></span>'
# NOTE(review): this fixture used to be a second assignment to ``html_3``,
# which silently discarded the one-child variant above. It is given its own
# name so both remain available -- confirm which one each test should use.
html_4 = '<span style="abd" class="span">aa<p>s</p><p>ss</p></span>'
11+
12+
13+
class SomethingTest(unittest.TestCase):
    """Checks for the node and tree comparison helpers of the verifier."""

    def test_is_node_equal_to_self(self):
        # A node compared with itself must report no difference.
        node = lxml.etree.fromstring(html_1)
        self.assertIsNone(
            webstruct.annotation_verifier.nodes_difference(node, node)
        )

    def test_is_different_nodes_are_diffirent(self):
        # html_1 and html_2 differ only in the value of ``style``.
        left = lxml.etree.fromstring(html_1)
        right = lxml.etree.fromstring(html_2)
        self.assertIsNotNone(
            webstruct.annotation_verifier.nodes_difference(left, right)
        )

    def test_is_tree_equal_to_self(self):
        # A tree with children compared with itself must report nothing.
        root = lxml.etree.fromstring(html_3)
        self.assertIsNone(
            webstruct.annotation_verifier.tree_difference(root, root)
        )

    def test_is_different_trees_are_diffirent(self):
        # html_2 has no child elements while html_3 does.
        left = lxml.etree.fromstring(html_2)
        right = lxml.etree.fromstring(html_3)
        self.assertIsNotNone(
            webstruct.annotation_verifier.tree_difference(left, right)
        )

webstruct/tests/test_webannotator.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,24 @@ def assertApplyWaTitle(self, source, result):
1515
webannotator.apply_wa_title(tree)
1616
self.assertHtmlTreeEqual(tree, html_document_fromstring(result))
1717

18+
def test_wa_title_no_attributes(self):
19+
self.assertApplyWaTitle(
20+
b"""
21+
<html>
22+
<head><title>Foo</title></head>
23+
<body>contents</body>
24+
<wa-title class="classy"><b>hello</b>, world</wa-title>
25+
</html>
26+
""",
27+
28+
b"""
29+
<html>
30+
<head><title><b>hello</b>, world</title></head>
31+
<body>contents</body>
32+
</html>
33+
"""
34+
)
35+
1836
def test_wa_title(self):
1937
self.assertApplyWaTitle(
2038
b"""

webstruct/webannotator.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,8 @@ def apply_wa_title(tree):
100100
head.insert(head.index(title), wa_title)
101101
title.drop_tree()
102102
wa_title.tag = 'title'
103+
for attr in wa_title.attrib:
104+
wa_title.attrib.pop(attr)
103105
return
104106

105107

@@ -254,7 +256,7 @@ def _ensure_head(tree):
254256

255257

256258
def _set_base(tree, baseurl):
257-
"""
259+
"""
258260
Add <base> tag to the tree. If <base> tag already exists do nothing.
259261
"""
260262
if tree.xpath('//base'):

0 commit comments

Comments
 (0)