@@ -417,6 +417,55 @@ def convert(self, data):
        data_set.set_input("seq_len")
        return data_set

+
+class Conll2003Loader(DataSetLoader):
+    """Self-defined loader of conll2003 dataset
+
+    More information about the given dataset could be found on
+    https://sites.google.com/site/ermasoftware/getting-started/ne-tagging-conll2003-data
+
+    """
+
+    def __init__(self):
+        super(Conll2003Loader, self).__init__()
+
+    def load(self, dataset_path):
+        with open(dataset_path, "r", encoding="utf-8") as f:
+            lines = f.readlines()
+
+        # Parse the dataset line by line
+        parsed_data = []
+        sentence = []
+        tokens = []
+        for line in lines:
+            if '-DOCSTART- -X- -X- O' in line or line == '\n':
+                if sentence != []:
+                    parsed_data.append((sentence, tokens))
+                    sentence = []
+                    tokens = []
+                continue
+
+            temp = line.strip().split(" ")
+            sentence.append(temp[0])
+            tokens.append(temp[1:4])
+
+        return self.convert(parsed_data)
+
+    def convert(self, parsed_data):
+        dataset = DataSet()
+        for sample in parsed_data:
+            label0_list = list(map(
+                lambda labels: labels[0], sample[1]))
+            label1_list = list(map(
+                lambda labels: labels[1], sample[1]))
+            label2_list = list(map(
+                lambda labels: labels[2], sample[1]))
+            dataset.append(Instance(token_list=sample[0],
+                                    label0_list=label0_list,
+                                    label1_list=label1_list,
+                                    label2_list=label2_list))
+
+        return dataset


class SNLIDataSetLoader(DataSetLoader):
    """A data set loader for SNLI data set.
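For reference, a minimal usage sketch of the new loader, assuming the DataSet and Instance classes used above are importable as in the rest of this module; the import path and the data file path below are illustrative assumptions, not part of the diff.

from fastNLP.io.dataset_loader import Conll2003Loader  # import path is an assumption

loader = Conll2003Loader()
# load() reads a raw CoNLL-2003 file (one token plus three tag columns per
# line, sentences separated by blank lines) and returns a DataSet whose
# Instances carry token_list, label0_list, label1_list and label2_list.
data_set = loader.load("path/to/conll2003/train.txt")  # hypothetical path
print(len(data_set))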