1111 'EMAIL' , 'PER' , 'FUNC' , 'SUBJ'
1212]
1313
14+
1415def nodes_difference (l , r ):
1516 if l .tag != r .tag :
16- return {'tag' : '"{0}" != "{1}"' .format (l .tag , r .tag ) }
17+ return {'tag' : '"{0}" != "{1}"' .format (l .tag , r .tag )}
1718
1819 l_attrib = [(k , l .attrib [k ]) for k in l .attrib ]
19- l_attrib .sort (key = lambda x :x [0 ])
20+ l_attrib .sort (key = lambda x : x [0 ])
2021 r_attrib = [(k , r .attrib [k ]) for k in r .attrib ]
21- r_attrib .sort (key = lambda x :x [0 ])
22+ r_attrib .sort (key = lambda x : x [0 ])
2223
2324 idx = 0
2425 while idx < len (l_attrib ) and idx < len (r_attrib ):
@@ -27,13 +28,13 @@ def nodes_difference(l, r):
2728 idx = idx + 1
2829
2930 if l_attr != r_attr :
30- return {'attributes' : '"{0}" != "{1}"' .format (l_attr , r_attr )}
31+ return {'attributes' : '"{0}" != "{1}"' .format (l_attr , r_attr )}
3132
3233 if idx < len (l_attrib ):
33- return {'attributes' : "{0} != None" .format (l_attrib [idx ])}
34+ return {'attributes' : "{0} != None" .format (l_attrib [idx ])}
3435
3536 if idx < len (r_attrib ):
36- return {'attributes' : "None != {0}" .format (r_attrib [idx ])}
37+ return {'attributes' : "None != {0}" .format (r_attrib [idx ])}
3738
3839 l_text = ''
3940 if l .text :
@@ -44,7 +45,7 @@ def nodes_difference(l, r):
4445 r .text = r .text .strip ()
4546
4647 if l_text != r_text :
47- return {'text' : "{0} != {1}" .format (l_text , r_text )}
48+ return {'text' : "{0} != {1}" .format (l_text , r_text )}
4849
4950 l_tail = ''
5051 if l .tail :
@@ -55,17 +56,18 @@ def nodes_difference(l, r):
5556 r .tail = r .tail .strip ()
5657
5758 if l_tail != r_tail :
58- return {'tail' : "{0} != {1}" .format (l_tail , r_tail )}
59+ return {'tail' : "{0} != {1}" .format (l_tail , r_tail )}
5960
6061 if len (l ) != len (r ):
61- return {'children count' : "{0} != {1}" .format (len (l ), len (r ))}
62+ return {'children count' : "{0} != {1}" .format (len (l ), len (r ))}
6263
6364 return None
6465
66+
6567def node_path (node ):
6668 ret = ''
6769 current = node
68- while current != None :
70+ while current is not None :
6971 parent = current .getparent ()
7072 idx = 0
7173 if parent :
@@ -76,38 +78,50 @@ def node_path(node):
7678
7779 return ret
7880
81+
def tree_difference(l, r):
    """Find the first difference between two element trees.

    The trees are walked in parallel, breadth-first, comparing each pair of
    corresponding nodes with ``nodes_difference``.

    Args:
        l: root node of the left tree.
        r: root node of the right tree.

    Returns:
        ``None`` when no pair of nodes differs. Otherwise a dict with keys
        ``"l"`` and ``"r"`` (the ``node_path`` of the differing left and right
        nodes) and ``"diff"`` (the value returned by ``nodes_difference``).
    """
    # Local import keeps this block self-contained; deque gives O(1) pops
    # from the left, unlike list.pop(0), which shifts every remaining item.
    from collections import deque

    queue = deque([(l, r)])
    while queue:
        l_node, r_node = queue.popleft()
        diff = nodes_difference(l_node, r_node)

        if diff:
            return {"l": node_path(l_node),
                    "r": node_path(r_node),
                    "diff": diff}

        # nodes_difference reports a 'children count' mismatch, so reaching
        # this point means both nodes have the same number of children and
        # pairing them positionally loses nothing.
        queue.extend(zip(l_node, r_node))

    return None
9497
98+
9599def main ():
96100 cmdline = argparse .ArgumentParser ()
97- cmdline .add_argument ('--gate' , help = 'path to gate annotated file' , type = str , required = True )
98- cmdline .add_argument ('--wa' , help = 'path to wa annotated file' , type = str , required = True )
99- cmdline .add_argument ('--loglevel' , help = 'logging level' , type = str , default = 'INFO' )
101+ cmdline .add_argument ('--gate' ,
102+ help = 'path to gate annotated file' ,
103+ type = str ,
104+ required = True )
105+ cmdline .add_argument ('--wa' ,
106+ help = 'path to wa annotated file' ,
107+ type = str ,
108+ required = True )
109+ cmdline .add_argument ('--loglevel' ,
110+ help = 'logging level' ,
111+ type = str ,
112+ default = 'INFO' )
100113 args = cmdline .parse_args ()
101114
102- logging .basicConfig ( level = getattr (logging , args .loglevel .upper ())
103- , format = '%(asctime)s [%(levelname)s] %(pathname)s:%(lineno)d %(message)s' )
115+ logging .basicConfig (level = getattr (logging , args .loglevel .upper ()),
116+ format = ('%(asctime)s [%(levelname)s] '
117+ '%(pathname)s:%(lineno)d %(message)s' ))
104118
105119 entities = KNOWN_ENTITIES
106120
107- gate = webstruct .loaders .GateLoader (known_entities = entities )
108- wa = webstruct .loaders .WebAnnotatorLoader (known_entities = entities )
121+ gate = webstruct .loaders .GateLoader (known_entities = entities )
122+ wa = webstruct .loaders .WebAnnotatorLoader (known_entities = entities )
109123
110- tokenizer = webstruct .HtmlTokenizer (tagset = entities )
124+ tokenizer = webstruct .HtmlTokenizer (tagset = entities )
111125 with open (args .gate , 'rb' ) as reader :
112126 data = reader .read ()
113127 gate_tree = gate .loadbytes (data )
@@ -125,20 +139,20 @@ def main():
125139 is_diff = True
126140
127141 annot_diff = list ()
128- for idx , (gate_a , wa_a ) in enumerate (zip (gate_annotations , wa_annotations )):
142+ for idx , (gate_a , wa_a ) in enumerate (zip (gate_annotations ,
143+ wa_annotations )):
129144 if gate_a == wa_a :
130145 continue
131146
132- annot_diff .append ({ 'idx' : idx
133- , 'gate_a' : gate_a
134- , 'wa_a' : wa_a })
147+ annot_diff .append ({'idx' : idx ,
148+ 'gate_a' : gate_a ,
149+ 'wa_a' : wa_a })
135150
136151 if annot_diff :
137152 logging .error ('annotation differs %s' , json .dumps (annot_diff ))
138153 is_diff = True
139154
140- return is_diff == False
155+ return is_diff is False
141156
if __name__ == "__main__":
    # main() returns True when no difference was detected. Report that to the
    # caller as the process exit status (0 = no difference, 1 = difference)
    # instead of discarding it.
    raise SystemExit(0 if main() else 1)
144-
0 commit comments