From 69c9d7bd7ded56c1392f12dc16ce6bcee52e144a Mon Sep 17 00:00:00 2001
From: clancyoftheoverflow <32432020+clancyoftheoverflow@users.noreply.github.com>
Date: Mon, 6 Jun 2022 01:59:49 +0800
Subject: [PATCH 1/4] Closes #57

---
 bigbio/biodatasets/why_qa/why_qa.py | 217 ++++++++++++++++++++++++++++
 1 file changed, 217 insertions(+)
 create mode 100644 bigbio/biodatasets/why_qa/why_qa.py

diff --git a/bigbio/biodatasets/why_qa/why_qa.py b/bigbio/biodatasets/why_qa/why_qa.py
new file mode 100644
index 00000000..6dbd59bd
--- /dev/null
+++ b/bigbio/biodatasets/why_qa/why_qa.py
@@ -0,0 +1,217 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+A dataset loader for the n2c2 community-annotated Why Questions dataset.
+
+https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/
+
+The dataset consists of a single archive (no splits) and is available
+as a JSON file and as an XLSX file:
+
+  - relations_whyqa_ann-v7-share.json (in SQuAD 2.0 format)
+  - relations_whyqa_ann-v7-share.xlsx
+
+The dataset also includes TXT files with the full texts of the
+clinical notes.
+
+The files comprising this dataset must be on the user's local machine
+in a single directory that is passed to `datasets.load_dataset` via
+the `data_dir` kwarg. This loader script will read the archive files
+directly (i.e. the user should not uncompress, untar or unzip any of
+the files).
+
+Registration AND submission of a DUA are required to access the dataset.
+
+[bigbio_schema_name] = qa
+"""
+
+import os
+import zipfile
+import json
+from collections import defaultdict
+from typing import List, Tuple, Dict
+
+import datasets
+from utils import schemas
+from utils.configs import BigBioConfig
+from utils.constants import Tasks
+
+# TODO: Add BibTeX citation
+_CITATION = """\
+@inproceedings{fan-2019-annotating,
+    author = {Fan, Jungwei},
+    title = {Annotating and Characterizing Clinical Sentences with Explicit Why-{QA} Cues},
+    booktitle = {Proceedings of the 2nd Clinical Natural Language Processing Workshop},
+    month = {jun},
+    year = {2019},
+    address = {Minneapolis, Minnesota, USA},
+    publisher = {Association for Computational Linguistics},
+    url = {https://aclanthology.org/W19-1913},
+    doi = {10.18653/v1/W19-1913}
+
+}
+
+
+"""
+
+_DATASETNAME = "why_qa"
+
+# TODO: Add description of the dataset here
+# You can copy an official description
+_DESCRIPTION = """\
+
+This dataset is a collection of why-questions and their answers generated
+from a corpus of clinical notes. The corpus comes from the 2010 i2b2/VA NLP
+challenge and consists of 426 discharge summaries from Partners
+Healthcare and Beth Israel Deaconess Medical Center.
+
+"""
+_HOMEPAGE = "https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/"
+
+_LICENSE = "External Data User Agreement"
+
+_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING]
+
+_SOURCE_VERSION = "1.0.0"
+
+_BIGBIO_VERSION = "1.0.0"
+
+def read_zip_file(file_path):
+    with zipfile.ZipFile(file_path) as zf:
+        with zf.open("n2c2-community-annotations_2010-fan-why-QA/relations_whyqa_ann-v7-share.json") as f:
+            dataset = json.load(f)
+    return dataset
+
+def _get_samples(dataset):
+    samples = dataset['data'][0]['paragraphs']
+    return samples
+
+# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case
+# Append "Dataset" to the class name: BioASQ --> BioasqDataset
+class WhyQaDataset(datasets.GeneratorBasedBuilder):
+    """n2c2 community-annotated Why Questions dataset."""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)
+
+
+    BUILDER_CONFIGS = [
+        BigBioConfig(
+            name="why_qa_source",
+            version=SOURCE_VERSION,
+            description="why_qa source schema",
+            schema="source",
+            subset_id="why_qa",
+        ),
+        BigBioConfig(
+            name="why_qa_bigbio_qa",
+            version=BIGBIO_VERSION,
+            description="why_wa BigBio schema",
+            schema="bigbio_qa",
+            subset_id="why_qa",
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = "why_qa_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+
+        if self.config.schema == "source":
+            features = datasets.Features(
+
+                {
+                    "note_id": datasets.Value("string"),
+                    "qas": [
+                        {"question_template": datasets.Value("string"),
+                         "question": datasets.Value("string"),
+                         "id": datasets.Value("string"),
+                         "answers": [
+                             {"text": datasets.Value("string"),
+                              "answer_start": datasets.Value("int32"),
+                              },
+                         ],
+                         "is_impossible": datasets.Value("bool"),
+                         },
+                    ],
+                    "context": datasets.Value("string"),
+                },
+            )
+
+        elif self.config.schema == "bigbio_qa":
+            features = schemas.qa_features
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+
+        if self.config.data_dir is None:
+            raise ValueError("This is a local dataset. Please pass the data_dir kwarg to load_dataset.")
+        else:
+            data_dir = self.config.data_dir
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                # Whatever you put in gen_kwargs will be passed to _generate_examples
+                gen_kwargs={
+                    "data_dir": data_dir,
+                    "split": "train",
+                },
+            ),
+        ]
+
+    def _generate_examples(self, data_dir, split: str) -> Tuple[int, Dict]:
+        """Yields examples as (key, example) tuples."""
+        dataset = read_zip_file(data_dir)
+        samples = _get_samples(dataset)
+
+        if self.config.schema == "source":
+            _id = 0
+            for sample in samples:
+                yield _id, sample
+                _id += 1
+
+        elif self.config.schema == "bigbio_qa":
+            _id = 0
+            for sample in samples:
+                for qa in sample['qas']:
+                    ans_list = []
+                    for answer in qa["answers"]:
+                        ans = answer["text"]
+                        ans_list.append(ans)
+                    bigbio_sample = {
+                        "id": str(_id),
+                        "question_id": qa["id"],
+                        "document_id": sample["note_id"],
+                        "question": qa["question"],
+                        "type": qa["question_template"],
+                        "choices": [],
+                        "context": sample["context"],
+                        "answer": ans_list,
+                    }
+                    yield _id, bigbio_sample
+                    _id += 1
+
+
+# This template is based on the following template from the datasets package:
+# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py
\ No newline at end of file

From 0e6ffb62ddfc98dfca63d8a5c1954e7183862034 Mon Sep 17 00:00:00 2001
From: clancyoftheoverflow <32432020+clancyoftheoverflow@users.noreply.github.com>
Date: Mon, 6 Jun 2022 09:49:50 +0800
Subject: [PATCH 2/4] Update why_qa.py

---
 bigbio/biodatasets/why_qa/why_qa.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/bigbio/biodatasets/why_qa/why_qa.py b/bigbio/biodatasets/why_qa/why_qa.py
index 6dbd59bd..fab511ad 100644
--- a/bigbio/biodatasets/why_qa/why_qa.py
+++ b/bigbio/biodatasets/why_qa/why_qa.py
@@ -49,7 +49,9 @@
 from utils.configs import BigBioConfig
 from utils.constants import Tasks

-# TODO: Add BibTeX citation
+
+_LOCAL = True
+
 _CITATION = """\
 @inproceedings{fan-2019-annotating,
     author = {Fan, Jungwei},

From 377139f611a7a24e4d6d3f5afc0b5e672669b0f3 Mon Sep 17 00:00:00 2001
From: clancyoftheoverflow <32432020+clancyoftheoverflow@users.noreply.github.com>
Date: Mon, 6 Jun 2022 17:09:19 +0800
Subject: [PATCH 3/4] Update why_qa.py

---
 bigbio/biodatasets/why_qa/why_qa.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/bigbio/biodatasets/why_qa/why_qa.py b/bigbio/biodatasets/why_qa/why_qa.py
index fab511ad..22df7660 100644
--- a/bigbio/biodatasets/why_qa/why_qa.py
+++ b/bigbio/biodatasets/why_qa/why_qa.py
@@ -71,8 +71,6 @@

 _DATASETNAME = "why_qa"

-# TODO: Add description of the dataset here
-# You can copy an official description
 _DESCRIPTION = """\

 This dataset is a collection of why-questions and their answers generated
@@ -101,8 +99,6 @@ def _get_samples(dataset):
     samples = dataset['data'][0]['paragraphs']
     return samples

-# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case
-# Append "Dataset" to the class name: BioASQ --> BioasqDataset
 class WhyQaDataset(datasets.GeneratorBasedBuilder):
     """n2c2 community-annotated Why Questions dataset."""


From c91f098f15474eeecfa096f7b29c441d39218dbb Mon Sep 17 00:00:00 2001
From: clancyoftheoverflow <32432020+clancyoftheoverflow@users.noreply.github.com>
Date: Mon, 6 Jun 2022 17:37:35 +0800
Subject: [PATCH 4/4] Update why_qa.py

---
 bigbio/biodatasets/why_qa/why_qa.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bigbio/biodatasets/why_qa/why_qa.py b/bigbio/biodatasets/why_qa/why_qa.py
index 22df7660..34809ec0 100644
--- a/bigbio/biodatasets/why_qa/why_qa.py
+++ b/bigbio/biodatasets/why_qa/why_qa.py
@@ -117,7 +117,7 @@ class WhyQaDataset(datasets.GeneratorBasedBuilder):
         BigBioConfig(
             name="why_qa_bigbio_qa",
             version=BIGBIO_VERSION,
-            description="why_wa BigBio schema",
+            description="why_qa BigBio schema",
             schema="bigbio_qa",
             subset_id="why_qa",
         ),
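
Usage note (a minimal sketch, not part of the patches above): once the n2c2 why-QA archive has been obtained through the DUA process, the loader added in PATCH 1/4 can be exercised locally along these lines. Run it from the repository root so the `utils` imports resolve. Also note that `read_zip_file()` hands `data_dir` straight to `zipfile.ZipFile`, so `data_dir` appears to need the path to the zip archive itself; the archive path below is a placeholder.

    # Illustrative local smoke test for the why_qa loader (sketch under the assumptions above).
    import datasets

    # Placeholder path -- replace with the locally downloaded n2c2 why-QA zip archive.
    DATA_PATH = "/path/to/n2c2-community-annotations_2010-fan-why-QA.zip"

    # Source schema: yields the SQuAD-style paragraphs largely as-is.
    source = datasets.load_dataset(
        "bigbio/biodatasets/why_qa/why_qa.py",
        name="why_qa_source",
        data_dir=DATA_PATH,
    )

    # BigBio qa schema: one example per question, with answers flattened to a list of strings.
    bigbio = datasets.load_dataset(
        "bigbio/biodatasets/why_qa/why_qa.py",
        name="why_qa_bigbio_qa",
        data_dir=DATA_PATH,
    )

    # Both configs expose a single "train" split, per _split_generators.
    example = bigbio["train"][0]
    print(example["question"])
    print(example["answer"])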