Commit 63f5268

fix critical start, end position errors
1 parent 19e27a9 commit 63f5268

File tree

.gitignore
parser.py

2 files changed: +61 -36 lines changed

.gitignore

Lines changed: 3 additions & 0 deletions

@@ -7,6 +7,9 @@ data/ace_2005_td_v7
 output/dev.json
 output/train.json
 output/dev.json
+output/debug.json
+
+test.json
 
 stanford-corenlp-full-2018-10-05.zip
 stanford-corenlp-full-2018-10-05/

parser.py

Lines changed: 58 additions & 36 deletions

@@ -10,20 +10,22 @@ def __init__(self, path):
         self.entity_mentions = []
         self.event_mentions = []
         self.sentences = []
+        self.sgm_text = ''
 
         self.entity_mentions, self.event_mentions = self.parse_xml(path + '.apf.xml')
         self.sents_with_pos = self.parse_sgm(path + '.sgm')
+        self.fix_wrong_position()
+
+    @staticmethod
+    def clean_text(text):
+        return text.replace('\n', ' ')
 
     def get_data(self):
         data = []
-
-        def clean_text(text):
-            return text.replace('\n', ' ')
-
         for sent in self.sents_with_pos:
             item = dict()
 
-            item['sentence'] = clean_text(sent['text'])
+            item['sentence'] = self.clean_text(sent['text'])
             item['position'] = sent['position']
             text_position = sent['position']
 

@@ -42,7 +44,7 @@ def clean_text(text):
             entity_position = entity_mention['position']
             if text_position[0] <= entity_position[0] and entity_position[1] <= text_position[1]:
                 item['golden-entity-mentions'].append({
-                    'text': clean_text(entity_mention['text']),
+                    'text': self.clean_text(entity_mention['text']),
                     'position': entity_position,
                     'entity-type': entity_mention['entity-type']
                 })

@@ -57,7 +59,7 @@ def clean_text(text):
                         'role': argument['role'],
                         'position': argument['position'],
                         'entity-type': entity_map[argument['entity-id']]['entity-type'],
-                        'text': clean_text(argument['text']),
+                        'text': self.clean_text(argument['text']),
                     })
 
                 item['golden-event-mentions'].append({

@@ -66,15 +68,51 @@ def clean_text(text):
                     'position': event_position,
                     'event_type': event_mention['event_type'],
                 })
-
             data.append(item)
         return data
 
     @staticmethod
-    def parse_sgm(sgm_path):
+    def find_correct_offset(sgm_text, start_index, text):
+        offset = 0
+        for i in range(0, 50):
+            for j in [-1, 1]:
+                offset = i * j
+                if sgm_text[start_index + offset:start_index + offset + len(text)] == text:
+                    return offset
+
+        print('[Warning] fail to find offset! (start_index: {}, text: {})'.format(start_index, text))
+        return offset
+
+    def fix_wrong_position(self):
+        for entity_mention in self.entity_mentions:
+            offset = self.find_correct_offset(
+                sgm_text=self.sgm_text,
+                start_index=entity_mention['position'][0],
+                text=entity_mention['text'])
+
+            entity_mention['position'][0] += offset
+            entity_mention['position'][1] += offset
+
+        for event_mention in self.event_mentions:
+            offset1 = self.find_correct_offset(
+                sgm_text=self.sgm_text,
+                start_index=event_mention['trigger']['position'][0],
+                text=event_mention['trigger']['text'])
+            event_mention['trigger']['position'][0] += offset1
+            event_mention['trigger']['position'][1] += offset1
+
+            for argument in event_mention['arguments']:
+                offset2 = self.find_correct_offset(
+                    sgm_text=self.sgm_text,
+                    start_index=argument['position'][0],
+                    text=argument['text'])
+                argument['position'][0] += offset2
+                argument['position'][1] += offset2
+
+    def parse_sgm(self, sgm_path):
         with open(sgm_path, 'r') as f:
             soup = BeautifulSoup(f.read(), features='html.parser')
-            sgm_text = soup.text
+            self.sgm_text = soup.text
 
         doc_type = soup.doc.doctype.text.strip()
 
@@ -92,27 +130,6 @@ def remove_tags(selector):
 
         sents = []
         converted_text = soup.text
-        # converted_text = converted_text.replace('Ltd.', 'Limited')
-        # converted_text = converted_text.replace('Co.', 'Company')
-        # converted_text = converted_text.replace('Corp.', 'Corporation')
-        # converted_text = converted_text.replace('Inc.', 'Incorporated')
-        # converted_text = converted_text.replace('p.m.', 'pm')
-        # converted_text = converted_text.replace('U.N.', 'UN')
-        # converted_text = converted_text.replace('U.S.', 'US')
-        # converted_text = converted_text.replace(' ill. ', ' ill ')
-        # converted_text = converted_text.replace(' pa. ', ' pa ')
-        #
-        # converted_text = converted_text.replace(".? ", "? ")
-        # converted_text = converted_text.replace("?). ", "? ")
-        #
-        # converted_text = converted_text.replace('. his', '. His')
-        # converted_text = converted_text.replace(". i'm", ". I'm")
-        # converted_text = converted_text.replace(". the", ". The")
-        # converted_text = converted_text.replace(". all", ". All")
-        # converted_text = converted_text.replace(". during", ". During")
-        # converted_text = converted_text.replace(". soon", ". Soon")
-        #
-        # converted_text = re.sub(r'(\d)\.\s', ' ', converted_text)
 
         for sent in nltk.sent_tokenize(converted_text):
             sents.extend(sent.split('\n\n'))

@@ -121,7 +138,7 @@ def remove_tags(selector):
         sents_with_pos = []
         last_pos = 0
         for sent in sents:
-            pos = sgm_text.find(sent, last_pos)
+            pos = self.sgm_text.find(sent, last_pos)
             last_pos = pos
             sents_with_pos.append({
                 'text': sent,
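A note on the one-character change above: passing last_pos as the second argument to str.find makes each search resume at the previous hit, so a sentence that occurs twice in the document is not always mapped to its first occurrence. A toy illustration with invented strings:

text = "He said no. She left. He said no."
last_pos = 0
for sent in ["He said no.", "She left.", "He said no."]:
    pos = text.find(sent, last_pos)   # search resumes at the previous hit
    last_pos = pos
    print(sent, pos)
# He said no. 0
# She left. 12
# He said no. 22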
@@ -191,7 +208,7 @@ def parse_event_tag(node):
                     'text': charset.text,
                     'position': [int(charset.attrib['START']), int(charset.attrib['END'])],
                     'role': child2.attrib['ROLE'],
-                    'entity-id': child2.attrib['REFID']
+                    'entity-id': child2.attrib['REFID'],
                 })
         event_mentions.append(event_mention)
     return event_mentions

@@ -223,6 +240,11 @@ def parse_value_timex_tag(node):
 
 
 if __name__ == '__main__':
-    data = Parser('./data/ace_2005_td_v7/data/English/nw/timex2norm/AFP_ENG_20030304.0250.apf.xml').get_data()
-    with open('output/sample.json', 'w') as f:
-        json.dump(data[0], f, indent=2)
+    parser = Parser('./data/ace_2005_td_v7/data/English/un/fp2/alt.gossip.celebrities_20041118.2331')
+    # parser = Parser('./data/ace_2005_td_v7/data/English/un/adj/alt.atheism_20041104.2428')
+    data = parser.get_data()
+    with open('./output/debug.json', 'w') as f:
+        json.dump(data, f, indent=2)
+
+    index = parser.sgm_text.find("the two")
+    # print(parser.sgm_text[index:])
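One way to see the fix working end to end: after fix_wrong_position() runs in __init__, every stored span should slice its own mention text back out of sgm_text. A hypothetical sanity check, not part of the commit, that could be appended to the __main__ block above (it assumes the offset search succeeded for every mention):

# Hypothetical sanity check, reusing the parser object from __main__:
# every corrected entity span should now slice out exactly its mention text.
for mention in parser.entity_mentions:
    start = mention['position'][0]
    assert parser.sgm_text[start:start + len(mention['text'])] == mention['text']
print('all {} entity spans verified'.format(len(parser.entity_mentions)))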
