@@ -28,25 +28,22 @@ def get_data_paths(ace2005_path):
 
 
 def find_token_index(tokens, start_pos, end_pos, phrase):
-    start_idx, end_idx = -1, -1
+    start_idx = -1
     for idx, token in enumerate(tokens):
         if token['characterOffsetBegin'] <= start_pos:
             start_idx = idx
-        # if token['characterOffsetEnd'] == end_pos:
-        #     end_idx = idx - 1
 
     # Some of the ACE2005 data has annotation position errors.
-    if end_idx == -1:
-        end_idx = start_idx + len(phrase.split())
+    end_idx = start_idx + len(phrase.split())
 
     return start_idx, end_idx
 
 
 def preprocessing(data_type, files):
     result = []
-    event_count, entity_count, sent_count = 0, 0, 0
+    event_count, entity_count, sent_count, argument_count = 0, 0, 0, 0
 
-    print('-' * 20)
+    print('=' * 20)
     print('[preprocessing] type: ', data_type)
     for file in tqdm(files):
         parser = Parser(path=file)
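
The revised `find_token_index` above now always derives `end_idx` from the phrase's whitespace token count instead of trusting the annotated end offset. A minimal sketch of the behavior, with illustrative CoreNLP-style tokens (the sentence and offsets are made up for the example):

```python
# Illustrative only: CoreNLP-style tokens with character offsets (made up).
tokens = [
    {'word': 'Donald', 'characterOffsetBegin': 0,  'characterOffsetEnd': 6},
    {'word': 'Trump',  'characterOffsetBegin': 7,  'characterOffsetEnd': 12},
    {'word': 'spoke',  'characterOffsetBegin': 13, 'characterOffsetEnd': 18},
]

# "Donald Trump" spans characters [0, 12). end_idx comes from the phrase's
# token count, so a slightly wrong end_pos annotation is harmless.
start_idx, end_idx = find_token_index(tokens, start_pos=0, end_pos=12, phrase='Donald Trump')
print(start_idx, end_idx)  # -> 0 2, i.e. tokens[0:2]
```
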
@@ -65,15 +62,14 @@ def preprocessing(data_type, files):
                 nlp_text = nlp.annotate(item['sentence'], properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'})
                 nlp_res = json.loads(nlp_text)
             except Exception as e:
-                print('StanfordCore Exception ', e)
-                print('item["sentence"] :', item['sentence'])
-                print('nlp_text :', nlp_text)
+                print('[Warning] StanfordCore Exception: ', nlp_text, 'This sentence will be ignored.')
                 continue
 
             tokens = nlp_res['sentences'][0]['tokens']
 
             if len(nlp_res['sentences']) >= 2:
-                print('len >=2! Sentence :', data['sentence'])
+                # TODO: handle the case where the sentence segmentation of NLTK and StanfordCoreNLP does not match.
+                # This mismatch occurs so rarely (< 20 sentences) that it is ignored for now.
                 continue
 
             data['stanford-colcc'] = []
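
The `len(nlp_res['sentences']) >= 2` guard covers cases where CoreNLP's `ssplit` re-splits text that the upstream parser emitted as one sentence. A minimal sketch of how that manifests, assuming `nlp` is the same `StanfordCoreNLP` wrapper opened in the `__main__` block below:

```python
import json

# Sketch: CoreNLP's ssplit may cut one upstream "sentence" into several;
# such items are skipped above rather than re-aligned.
nlp_text = nlp.annotate('It rained. We stayed home.',
                        properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'})
nlp_res = json.loads(nlp_text)
print(len(nlp_res['sentences']))  # 2 -> the guard above would skip this item
```
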
@@ -104,7 +100,7 @@ def preprocessing(data_type, files):
                 data['golden-entity-mentions'].append(entity_mention)
 
             for event_mention in item['golden-event-mentions']:
-                # same event mention cab be shared
+                # same event mention can be shared
                 event_mention = copy.deepcopy(event_mention)
                 position = event_mention['trigger']['position']
                 start_idx, end_idx = find_token_index(
@@ -120,6 +116,7 @@ def preprocessing(data_type, files):
                 del event_mention['position']
 
                 arguments = []
+                argument_count += len(event_mention['arguments'])
                 for argument in event_mention['arguments']:
                     position = argument['position']
                     start_idx, end_idx = find_token_index(
@@ -139,9 +136,11 @@ def preprocessing(data_type, files):
 
             result.append(data)
 
-    print('sent_count :', sent_count)
-    print('event_count :', event_count)
-    print('entity_count :', entity_count)
+    print('======[Statistics]======')
+    print('sent :', sent_count)
+    print('event :', event_count)
+    print('entity :', entity_count)
+    print('argument:', argument_count)
 
     with open('output/{}.json'.format(data_type), 'w') as f:
         json.dump(result, f, indent=2)
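
For reference, a dumped split can be re-loaded and the new argument statistic recomputed; a quick sketch, assuming the `output/` directory and the keys written above:

```python
import json

# Sketch: re-load a dumped split and recompute the argument count, assuming
# the keys written above ('golden-event-mentions', 'arguments').
with open('output/train.json') as f:
    result = json.load(f)

argument_count = sum(len(ev['arguments'])
                     for sent in result
                     for ev in sent['golden-event-mentions'])
print(len(result), 'sentences /', argument_count, 'arguments')
```
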
@@ -156,6 +155,6 @@ def preprocessing(data_type, files):
     with StanfordCoreNLP('./stanford-corenlp-full-2018-10-05', memory='8g', timeout=60000) as nlp:
         # res = nlp.annotate('Donald John Trump is current president of the United States.', properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'})
         # print(res)
-        preprocessing('dev', dev_files)
         preprocessing('train', train_files)
         preprocessing('test', test_files)
+        preprocessing('dev', dev_files)
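
The commented-out `res` / `print(res)` lines above amount to a server smoke test; expanded into a runnable sketch (same local CoreNLP distribution path, illustrative sentence taken from the comment):

```python
import json
from stanfordcorenlp import StanfordCoreNLP

# Smoke test for the CoreNLP server before launching the full preprocessing
# run, based on the commented-out lines above.
with StanfordCoreNLP('./stanford-corenlp-full-2018-10-05', memory='8g', timeout=60000) as nlp:
    res = nlp.annotate('Donald John Trump is current president of the United States.',
                       properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'})
    sentences = json.loads(res)['sentences']
    print(len(sentences), [t['word'] for t in sentences[0]['tokens']][:3])
```
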