Skip to content

Commit dfb62ec

Browse files
authored
Merge pull request #115 from hazelnutsgz/conll2003
[new] Add a loader for conll2003 dataset
2 parents 7371593 + 5f4ab13 commit dfb62ec

File tree

3 files changed

+514
-0
lines changed

3 files changed

+514
-0
lines changed

fastNLP/io/dataset_loader.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,55 @@ def convert(self, data):
417417
data_set.set_input("seq_len")
418418
return data_set
419419

420+
421+
class Conll2003Loader(DataSetLoader):
    """Loader for the CoNLL-2003 dataset.

    Every non-blank line of the file is expected to hold one token followed
    by its whitespace-separated label columns. Sentences are separated by
    blank lines and documents by ``-DOCSTART-`` marker lines.

    More information about the dataset can be found at
    https://sites.google.com/site/ermasoftware/getting-started/ne-tagging-conll2003-data
    """

    def __init__(self):
        super(Conll2003Loader, self).__init__()

    def load(self, dataset_path):
        """Read a CoNLL-2003 file and return its sentences as a data set.

        :param dataset_path: path of the UTF-8 encoded file to read.
        :return: whatever :meth:`convert` builds from the parsed sentences.
        """
        parsed_data = []
        sentence = []
        tokens = []
        with open(dataset_path, "r", encoding="utf-8") as f:
            # Stream the file instead of materialising every line up front.
            for line in f:
                columns = line.split()
                # A blank (or whitespace-only) line ends a sentence; a
                # document marker is not a token and also acts as a boundary.
                if not columns or columns[0] == "-DOCSTART-":
                    if sentence:
                        parsed_data.append((sentence, tokens))
                        sentence = []
                        tokens = []
                    continue

                sentence.append(columns[0])
                # Keep at most the three label columns that follow the token.
                tokens.append(columns[1:4])

        # The file may end without a trailing blank line; make sure the last
        # sentence is not silently dropped.
        if sentence:
            parsed_data.append((sentence, tokens))

        return self.convert(parsed_data)

    def convert(self, parsed_data):
        """Build a ``DataSet`` from parsed ``(tokens, labels)`` pairs.

        :param parsed_data: iterable of ``(token_list, label_rows)`` pairs,
            where ``label_rows`` holds one sequence of three labels per token
            (presumably the POS, chunk and NER columns of the standard
            CoNLL-2003 layout -- confirm against the data in use).
        :return: a ``DataSet`` with one ``Instance`` per sentence, exposing
            the fields ``token_list``, ``label0_list``, ``label1_list`` and
            ``label2_list``.
        :raises IndexError: if a label row has fewer than three entries.
        """
        dataset = DataSet()
        for token_list, label_rows in parsed_data:
            dataset.append(Instance(
                token_list=token_list,
                label0_list=[row[0] for row in label_rows],
                label1_list=[row[1] for row in label_rows],
                label2_list=[row[2] for row in label_rows]))

        return dataset
420469

421470
class SNLIDataSetLoader(DataSetLoader):
422471
"""A data set loader for SNLI data set.

0 commit comments

Comments
 (0)