From b4ff71efc1a2e7771189470ee81e9f9c72e3e84d Mon Sep 17 00:00:00 2001 From: Holy Lovenia Date: Fri, 8 Jul 2022 16:42:46 +0800 Subject: [PATCH 1/3] Create dataloader for PMC-Patients Task 1: Patient Note Recognition (PNR) --- .../biodatasets/pmc_patients_pnr/__init__.py | 0 .../pmc_patients_pnr/pmc_patients_pnr.py | 161 ++++++++++++++++++ 2 files changed, 161 insertions(+) create mode 100644 bigbio/biodatasets/pmc_patients_pnr/__init__.py create mode 100644 bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py diff --git a/bigbio/biodatasets/pmc_patients_pnr/__init__.py b/bigbio/biodatasets/pmc_patients_pnr/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py b/bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py new file mode 100644 index 00000000..a41224c6 --- /dev/null +++ b/bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py @@ -0,0 +1,161 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +PMC-Patients dataset consists of 4 tasks. One of the task is Patient Note Recognition (PNR). PMC-Patients PNR dataset is modeled as a paragraph-level sequential labeling task, similar to the named entity recognition (NER) task. 
For each article, given input as a sequence of texts p1, p2, ..., pn, where n is the number of paragraphs, the output is a sequence of BIO tags t1, t2, ..., tn. +""" + +import json +import os +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from bigbio.utils import schemas +from bigbio.utils.configs import BigBioConfig +from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.license import Licenses + +_LANGUAGES = [Lang.EN] +_PUBMED = True +_LOCAL = False +_CITATION = """\ +@misc{zhao2022pmcpatients, + title={PMC-Patients: A Large-scale Dataset of Patient Notes and Relations Extracted from Case + Reports in PubMed Central}, + author={Zhengyun Zhao and Qiao Jin and Sheng Yu}, + year={2022}, + eprint={2202.13876}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +}""" + +_DATASETNAME = "pmc_patients_pnr" +_DISPLAYNAME = "PMC-Patients Task 1: Patient Note Recognition (PNR)" + +_DESCRIPTION = """\ +PMC-Patients PNR is a paragraph-level sequential modeling to recognize patient notes. +""" + +_HOMEPAGE = "https://github.com/zhao-zy15/PMC-Patients" + +_LICENSE = Licenses.CC_BY_NC_SA_4p0 + +_URLS = { + _DATASETNAME: "https://drive.google.com/u/0/uc?id=1vFCLy_CF8fxPDZvDtHPR6Dl6x9l0TyvW&export=download", +} + +_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION] + +_SOURCE_VERSION = "1.2.0" + +_BIGBIO_VERSION = "1.0.0" + + +class PMCPatientsPNRDataset(datasets.GeneratorBasedBuilder): + """PMC-Patients dataset consists of 4 tasks. One of the task is Patient Note Recognition (PNR). PMC-Patients PNR dataset is modeled as a paragraph-level sequential labeling task, similar to the named entity recognition (NER) task. For each article, given input as a sequence of texts p1, p2, ..., pn, where n is the number of paragraphs, the output is a sequence of BIO tags t1, t2, ..., tn. 
+ """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + BUILDER_CONFIGS = [ + BigBioConfig( + name="pmc_patients_pnr_source", + version=SOURCE_VERSION, + description="pmc_patients_pnr source schema", + schema="source", + subset_id="pmc_patients_pnr", + ), + ] + + DEFAULT_CONFIG_NAME = "pmc_patients_pnr_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "texts": [datasets.Value("string")], + "tags": [datasets.Value("string")], + } + ) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=str(_LICENSE), + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": os.path.join( + data_dir, + "datasets/task_1_patient_note_recognition/PNR_train.json", + ), + "split": "train", + "data_dir": data_dir, + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": os.path.join( + data_dir, + "datasets/task_1_patient_note_recognition/PNR_test.json", + ), + "split": "test", + "data_dir": data_dir, + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": os.path.join( + data_dir, + "datasets/task_1_patient_note_recognition/PNR_dev.json", + ), + "split": "dev", + "data_dir": data_dir, + }, + ), + ] + + def _generate_examples( + self, filepath, split: str, data_dir: str + ) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + with open(filepath, "r") as j: + file = json.load(j) + + if self.config.schema == "source": + + for uid, article in enumerate(file): + feature_dict = { + "id": uid, + "texts": 
article["texts"], + "tags": article["tags"], + } + yield uid, feature_dict \ No newline at end of file From 366487d25af907fadf2f513e89c060f003d14e55 Mon Sep 17 00:00:00 2001 From: Holy Lovenia Date: Fri, 8 Jul 2022 16:50:40 +0800 Subject: [PATCH 2/3] Format code --- .../pmc_patients_pnr/pmc_patients_pnr.py | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py b/bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py index a41224c6..b5caedb1 100644 --- a/bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py +++ b/bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py @@ -14,7 +14,11 @@ # limitations under the License. """ -PMC-Patients dataset consists of 4 tasks. One of the task is Patient Note Recognition (PNR). PMC-Patients PNR dataset is modeled as a paragraph-level sequential labeling task, similar to the named entity recognition (NER) task. For each article, given input as a sequence of texts p1, p2, ..., pn, where n is the number of paragraphs, the output is a sequence of BIO tags t1, t2, ..., tn. +PMC-Patients dataset consists of 4 tasks. One of the tasks is Patient Note Recognition (PNR). +PMC-Patients PNR dataset is modeled as a paragraph-level sequential labeling task, +similar to the named entity recognition (NER) task. +For each article, given input as a sequence of texts p1, p2, ..., pn, where n is the number of paragraphs, +the output is a sequence of BIO tags t1, t2, ..., tn. """ import json @@ -22,9 +26,7 @@ from typing import Dict, List, Tuple import datasets -import pandas as pd -from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig from bigbio.utils.constants import Lang, Tasks from bigbio.utils.license import Licenses @@ -66,7 +68,12 @@ class PMCPatientsPNRDataset(datasets.GeneratorBasedBuilder): - """PMC-Patients dataset consists of 4 tasks. One of the task is Patient Note Recognition (PNR). 
PMC-Patients PNR dataset is modeled as a paragraph-level sequential labeling task, similar to the named entity recognition (NER) task. For each article, given input as a sequence of texts p1, p2, ..., pn, where n is the number of paragraphs, the output is a sequence of BIO tags t1, t2, ..., tn. + """ + PMC-Patients dataset consists of 4 tasks. One of the tasks is Patient Note Recognition (PNR). + PMC-Patients PNR dataset is modeled as a paragraph-level sequential labeling task, + similar to the named entity recognition (NER) task. + For each article, given input as a sequence of texts p1, p2, ..., pn, where n is the number of paragraphs, + the output is a sequence of BIO tags t1, t2, ..., tn. """ SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) @@ -142,9 +149,7 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: ), ] - def _generate_examples( - self, filepath, split: str, data_dir: str - ) -> Tuple[int, Dict]: + def _generate_examples(self, filepath, split: str, data_dir: str) -> Tuple[int, Dict]: """Yields examples as (key, example) tuples.""" with open(filepath, "r") as j: @@ -158,4 +163,4 @@ def _generate_examples( "texts": article["texts"], "tags": article["tags"], } - yield uid, feature_dict \ No newline at end of file + yield uid, feature_dict From f3520739693e55df2633b56a158b419d7c2290d3 Mon Sep 17 00:00:00 2001 From: Holy Lovenia Date: Fri, 8 Jul 2022 17:00:06 +0800 Subject: [PATCH 3/3] Remove support tasks --- bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py b/bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py index b5caedb1..aaf22311 100644 --- a/bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py +++ b/bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py @@ -60,7 +60,7 @@ _DATASETNAME: "https://drive.google.com/u/0/uc?id=1vFCLy_CF8fxPDZvDtHPR6Dl6x9l0TyvW&export=download", } 
-_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION] +_SUPPORTED_TASKS = [] _SOURCE_VERSION = "1.2.0"