@@ -10,20 +10,22 @@ def __init__(self, path):
         self.entity_mentions = []
         self.event_mentions = []
         self.sentences = []
+        self.sgm_text = ''
 
         self.entity_mentions, self.event_mentions = self.parse_xml(path + '.apf.xml')
         self.sents_with_pos = self.parse_sgm(path + '.sgm')
+        self.fix_wrong_position()
+
+    @staticmethod
+    def clean_text(text):
+        return text.replace('\n', ' ')
 
     def get_data(self):
         data = []
-
-        def clean_text(text):
-            return text.replace('\n', ' ')
-
         for sent in self.sents_with_pos:
             item = dict()
 
-            item['sentence'] = clean_text(sent['text'])
+            item['sentence'] = self.clean_text(sent['text'])
             item['position'] = sent['position']
             text_position = sent['position']
 
@@ -42,7 +44,7 @@ def clean_text(text):
                 entity_position = entity_mention['position']
                 if text_position[0] <= entity_position[0] and entity_position[1] <= text_position[1]:
                     item['golden-entity-mentions'].append({
-                        'text': clean_text(entity_mention['text']),
+                        'text': self.clean_text(entity_mention['text']),
                         'position': entity_position,
                         'entity-type': entity_mention['entity-type']
                     })
@@ -57,7 +59,7 @@ def clean_text(text):
                             'role': argument['role'],
                             'position': argument['position'],
                             'entity-type': entity_map[argument['entity-id']]['entity-type'],
-                            'text': clean_text(argument['text']),
+                            'text': self.clean_text(argument['text']),
                         })
 
                     item['golden-event-mentions'].append({
@@ -66,15 +68,51 @@ def clean_text(text):
                         'position': event_position,
                         'event_type': event_mention['event_type'],
                     })
-
             data.append(item)
         return data
 
     @staticmethod
-    def parse_sgm(sgm_path):
+    def find_correct_offset(sgm_text, start_index, text):
+        offset = 0
+        for i in range(0, 50):
+            for j in [-1, 1]:
+                offset = i * j
+                if sgm_text[start_index + offset:start_index + offset + len(text)] == text:
+                    return offset
+
+        print('[Warning] fail to find offset! (start_index: {}, text: {})'.format(start_index, text))
+        return offset
+
+    def fix_wrong_position(self):
+        for entity_mention in self.entity_mentions:
+            offset = self.find_correct_offset(
+                sgm_text=self.sgm_text,
+                start_index=entity_mention['position'][0],
+                text=entity_mention['text'])
+
+            entity_mention['position'][0] += offset
+            entity_mention['position'][1] += offset
+
+        for event_mention in self.event_mentions:
+            offset1 = self.find_correct_offset(
+                sgm_text=self.sgm_text,
+                start_index=event_mention['trigger']['position'][0],
+                text=event_mention['trigger']['text'])
+            event_mention['trigger']['position'][0] += offset1
+            event_mention['trigger']['position'][1] += offset1
+
+            for argument in event_mention['arguments']:
+                offset2 = self.find_correct_offset(
+                    sgm_text=self.sgm_text,
+                    start_index=argument['position'][0],
+                    text=argument['text'])
+                argument['position'][0] += offset2
+                argument['position'][1] += offset2
+
+    def parse_sgm(self, sgm_path):
         with open(sgm_path, 'r') as f:
             soup = BeautifulSoup(f.read(), features='html.parser')
-            sgm_text = soup.text
+            self.sgm_text = soup.text
 
             doc_type = soup.doc.doctype.text.strip()
 
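The new find_correct_offset searches outward from the annotated start index, trying shifts of 0, -1, +1, -2, +2, ... up to 49 characters in each direction, and returns the first shift at which the mention text lines up with the raw sgm text; fix_wrong_position then applies that shift to both ends of every entity, trigger, and argument span. A minimal standalone sketch of the same search (names here are illustrative, not part of this repo):

    def find_shift(haystack, start, needle, max_radius=50):
        # Try shifts 0, -1, +1, -2, +2, ... and return the first one at which
        # `needle` matches `haystack`; None if nothing matches within the radius.
        for radius in range(max_radius):
            for sign in (-1, 1):
                shift = radius * sign
                if haystack[start + shift:start + shift + len(needle)] == needle:
                    return shift
        return None

    print(find_shift('the two actors met', 5, 'two'))  # -> -1 (annotation one char late)

Because radius 0 is tried first, spans whose apf offsets already match the sgm text come back with a shift of 0 and are left untouched.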
@@ -92,27 +130,6 @@ def remove_tags(selector):
 
             sents = []
             converted_text = soup.text
-            # converted_text = converted_text.replace('Ltd.', 'Limited')
-            # converted_text = converted_text.replace('Co.', 'Company')
-            # converted_text = converted_text.replace('Corp.', 'Corporation')
-            # converted_text = converted_text.replace('Inc.', 'Incorporated')
-            # converted_text = converted_text.replace('p.m.', 'pm')
-            # converted_text = converted_text.replace('U.N.', 'UN')
-            # converted_text = converted_text.replace('U.S.', 'US')
-            # converted_text = converted_text.replace(' ill. ', ' ill ')
-            # converted_text = converted_text.replace(' pa. ', ' pa ')
-            #
-            # converted_text = converted_text.replace(".? ", "? ")
-            # converted_text = converted_text.replace("?). ", "? ")
-            #
-            # converted_text = converted_text.replace('. his', '. His')
-            # converted_text = converted_text.replace(". i'm", ". I'm")
-            # converted_text = converted_text.replace(". the", ". The")
-            # converted_text = converted_text.replace(". all", ". All")
-            # converted_text = converted_text.replace(". during", ". During")
-            # converted_text = converted_text.replace(". soon", ". Soon")
-            #
-            # converted_text = re.sub(r'(\d)\.\s', ' ', converted_text)
 
             for sent in nltk.sent_tokenize(converted_text):
                 sents.extend(sent.split('\n\n'))
@@ -121,7 +138,7 @@ def remove_tags(selector):
             sents_with_pos = []
             last_pos = 0
             for sent in sents:
-                pos = sgm_text.find(sent, last_pos)
+                pos = self.sgm_text.find(sent, last_pos)
                 last_pos = pos
                 sents_with_pos.append({
                     'text': sent,
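Sentence offsets are recovered here by searching for each tokenized sentence in the raw sgm text, starting each search at the previous match so sentences resolve in document order. A small illustration of the pattern (values are hypothetical):

    text = 'He left. She stayed. He left.'
    last_pos = 0
    for sent in ['He left.', 'She stayed.', 'He left.']:
        pos = text.find(sent, last_pos)
        last_pos = pos
        print(pos)  # prints 0, then 9, then 21

Note that last_pos is set to pos rather than pos + len(sent), so two identical back-to-back sentences would resolve to the same offset; advancing past the match would avoid that.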
@@ -191,7 +208,7 @@ def parse_event_tag(node):
                             'text': charset.text,
                             'position': [int(charset.attrib['START']), int(charset.attrib['END'])],
                             'role': child2.attrib['ROLE'],
-                            'entity-id': child2.attrib['REFID']
+                            'entity-id': child2.attrib['REFID'],
                         })
                 event_mentions.append(event_mention)
         return event_mentions
@@ -223,6 +240,11 @@ def parse_value_timex_tag(node):
 
 
 if __name__ == '__main__':
-    data = Parser('./data/ace_2005_td_v7/data/English/nw/timex2norm/AFP_ENG_20030304.0250.apf.xml').get_data()
-    with open('output/sample.json', 'w') as f:
-        json.dump(data[0], f, indent=2)
+    parser = Parser('./data/ace_2005_td_v7/data/English/un/fp2/alt.gossip.celebrities_20041118.2331')
+    # parser = Parser('./data/ace_2005_td_v7/data/English/un/adj/alt.atheism_20041104.2428')
+    data = parser.get_data()
+    with open('./output/debug.json', 'w') as f:
+        json.dump(data, f, indent=2)
+
+    index = parser.sgm_text.find("the two")
+    # print(parser.sgm_text[index:])
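For a quick sanity check of the dump, the records in debug.json can be inspected directly. A sketch assuming the output path from the __main__ block and the item keys assembled in get_data above:

    import json

    with open('./output/debug.json') as f:
        data = json.load(f)

    first = data[0]
    print(first['sentence'])
    print(first['position'])  # [start, end] character offsets into the sgm text
    for mention in first['golden-entity-mentions']:
        print(mention['entity-type'], mention['position'], mention['text'])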