From b4ff71efc1a2e7771189470ee81e9f9c72e3e84d Mon Sep 17 00:00:00 2001
From: Holy Lovenia <holy.lovenia@gmail.com>
Date: Fri, 8 Jul 2022 16:42:46 +0800
Subject: [PATCH 1/3] Create dataloader for PMC-Patients Task 1: Patient Note
 Recognition (PNR)

---
 .../biodatasets/pmc_patients_pnr/__init__.py  |   0
 .../pmc_patients_pnr/pmc_patients_pnr.py      | 161 ++++++++++++++++++
 2 files changed, 161 insertions(+)
 create mode 100644 bigbio/biodatasets/pmc_patients_pnr/__init__.py
 create mode 100644 bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py

diff --git a/bigbio/biodatasets/pmc_patients_pnr/__init__.py b/bigbio/biodatasets/pmc_patients_pnr/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py b/bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py
new file mode 100644
index 00000000..a41224c6
--- /dev/null
+++ b/bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py
@@ -0,0 +1,161 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+PMC-Patients dataset consists of 4 tasks. One of the task is Patient Note Recognition (PNR). PMC-Patients PNR dataset is modeled as a paragraph-level sequential labeling task, similar to the named entity recognition (NER) task. For each article, given input as a sequence of texts p1, p2, ..., pn, where n is the number of paragraphs, the output is a sequence of BIO tags t1, t2, ..., tn.
+"""
+
+import json
+import os
+from typing import Dict, List, Tuple
+
+import datasets
+import pandas as pd
+
+from bigbio.utils import schemas
+from bigbio.utils.configs import BigBioConfig
+from bigbio.utils.constants import Lang, Tasks
+from bigbio.utils.license import Licenses
+
+_LANGUAGES = [Lang.EN]
+_PUBMED = True
+_LOCAL = False
+_CITATION = """\
+@misc{zhao2022pmcpatients,
+      title={PMC-Patients: A Large-scale Dataset of Patient Notes and Relations Extracted from Case
+          Reports in PubMed Central},
+      author={Zhengyun Zhao and Qiao Jin and Sheng Yu},
+      year={2022},
+      eprint={2202.13876},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}"""
+
+_DATASETNAME = "pmc_patients_pnr"
+_DISPLAYNAME = "PMC-Patients Task 1: Patient Note Recognition (PNR)"
+
+_DESCRIPTION = """\
+PMC-Patients PNR is a paragraph-level sequential modeling to recognize patient notes.
+"""
+
+_HOMEPAGE = "https://github.com/zhao-zy15/PMC-Patients"
+
+_LICENSE = Licenses.CC_BY_NC_SA_4p0
+
+_URLS = {
+    _DATASETNAME: "https://drive.google.com/u/0/uc?id=1vFCLy_CF8fxPDZvDtHPR6Dl6x9l0TyvW&export=download",
+}
+
+_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION]
+
+_SOURCE_VERSION = "1.2.0"
+
+_BIGBIO_VERSION = "1.0.0"
+
+
+class PMCPatientsPNRDataset(datasets.GeneratorBasedBuilder):
+    """PMC-Patients dataset consists of 4 tasks. One of the task is Patient Note Recognition (PNR). PMC-Patients PNR dataset is modeled as a paragraph-level sequential labeling task, similar to the named entity recognition (NER) task. For each article, given input as a sequence of texts p1, p2, ..., pn, where n is the number of paragraphs, the output is a sequence of BIO tags t1, t2, ..., tn. 
+    """
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)
+
+    BUILDER_CONFIGS = [
+        BigBioConfig(
+            name="pmc_patients_pnr_source",
+            version=SOURCE_VERSION,
+            description="pmc_patients_pnr source schema",
+            schema="source",
+            subset_id="pmc_patients_pnr",
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = "pmc_patients_pnr_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "id": datasets.Value("string"),
+                    "texts": [datasets.Value("string")],
+                    "tags": [datasets.Value("string")],
+                }
+            )
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=str(_LICENSE),
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        urls = _URLS[_DATASETNAME]
+        data_dir = dl_manager.download_and_extract(urls)
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": os.path.join(
+                        data_dir,
+                        "datasets/task_1_patient_note_recognition/PNR_train.json",
+                    ),
+                    "split": "train",
+                    "data_dir": data_dir,
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": os.path.join(
+                        data_dir,
+                        "datasets/task_1_patient_note_recognition/PNR_test.json",
+                    ),
+                    "split": "test",
+                    "data_dir": data_dir,
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    "filepath": os.path.join(
+                        data_dir,
+                        "datasets/task_1_patient_note_recognition/PNR_dev.json",
+                    ),
+                    "split": "dev",
+                    "data_dir": data_dir,
+                },
+            ),
+        ]
+
+    def _generate_examples(
+        self, filepath, split: str, data_dir: str
+    ) -> Tuple[int, Dict]:
+        """Yields examples as (key, example) tuples."""
+
+        with open(filepath, "r") as j:
+            file = json.load(j)
+
+        if self.config.schema == "source":
+
+            for uid, article in enumerate(file):
+                feature_dict = {
+                    "id": uid,
+                    "texts": article["texts"],
+                    "tags": article["tags"],
+                }
+                yield uid, feature_dict
\ No newline at end of file

From 366487d25af907fadf2f513e89c060f003d14e55 Mon Sep 17 00:00:00 2001
From: Holy Lovenia <holy.lovenia@gmail.com>
Date: Fri, 8 Jul 2022 16:50:40 +0800
Subject: [PATCH 2/3] Format code

---
 .../pmc_patients_pnr/pmc_patients_pnr.py      | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py b/bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py
index a41224c6..b5caedb1 100644
--- a/bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py
+++ b/bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py
@@ -14,7 +14,11 @@
 # limitations under the License.
 
 """
-PMC-Patients dataset consists of 4 tasks. One of the task is Patient Note Recognition (PNR). PMC-Patients PNR dataset is modeled as a paragraph-level sequential labeling task, similar to the named entity recognition (NER) task. For each article, given input as a sequence of texts p1, p2, ..., pn, where n is the number of paragraphs, the output is a sequence of BIO tags t1, t2, ..., tn.
+PMC-Patients dataset consists of 4 tasks. One of the tasks is Patient Note Recognition (PNR).
+PMC-Patients PNR dataset is modeled as a paragraph-level sequential labeling task,
+similar to the named entity recognition (NER) task.
+For each article, given input as a sequence of texts p1, p2, ..., pn, where n is the number of paragraphs,
+the output is a sequence of BIO tags t1, t2, ..., tn.
 """
 
 import json
@@ -22,9 +26,7 @@
 from typing import Dict, List, Tuple
 
 import datasets
-import pandas as pd
 
-from bigbio.utils import schemas
 from bigbio.utils.configs import BigBioConfig
 from bigbio.utils.constants import Lang, Tasks
 from bigbio.utils.license import Licenses
@@ -66,7 +68,12 @@
 
 
 class PMCPatientsPNRDataset(datasets.GeneratorBasedBuilder):
-    """PMC-Patients dataset consists of 4 tasks. One of the task is Patient Note Recognition (PNR). PMC-Patients PNR dataset is modeled as a paragraph-level sequential labeling task, similar to the named entity recognition (NER) task. For each article, given input as a sequence of texts p1, p2, ..., pn, where n is the number of paragraphs, the output is a sequence of BIO tags t1, t2, ..., tn. 
+    """
+    PMC-Patients dataset consists of 4 tasks. One of the tasks is Patient Note Recognition (PNR).
+    PMC-Patients PNR dataset is modeled as a paragraph-level sequential labeling task,
+    similar to the named entity recognition (NER) task.
+    For each article, given input as a sequence of texts p1, p2, ..., pn, where n is the number of paragraphs,
+    the output is a sequence of BIO tags t1, t2, ..., tn.
     """
 
     SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
@@ -142,9 +149,7 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
             ),
         ]
 
-    def _generate_examples(
-        self, filepath, split: str, data_dir: str
-    ) -> Tuple[int, Dict]:
+    def _generate_examples(self, filepath, split: str, data_dir: str) -> Tuple[int, Dict]:
         """Yields examples as (key, example) tuples."""
 
         with open(filepath, "r") as j:
@@ -158,4 +163,4 @@ def _generate_examples(
                     "texts": article["texts"],
                     "tags": article["tags"],
                 }
-                yield uid, feature_dict
\ No newline at end of file
+                yield uid, feature_dict

From f3520739693e55df2633b56a158b419d7c2290d3 Mon Sep 17 00:00:00 2001
From: Holy Lovenia <holy.lovenia@gmail.com>
Date: Fri, 8 Jul 2022 17:00:06 +0800
Subject: [PATCH 3/3] Remove supported tasks

---
 bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py b/bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py
index b5caedb1..aaf22311 100644
--- a/bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py
+++ b/bigbio/biodatasets/pmc_patients_pnr/pmc_patients_pnr.py
@@ -60,7 +60,7 @@
     _DATASETNAME: "https://drive.google.com/u/0/uc?id=1vFCLy_CF8fxPDZvDtHPR6Dl6x9l0TyvW&export=download",
 }
 
-_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION]
+_SUPPORTED_TASKS = []
 
 _SOURCE_VERSION = "1.2.0"