diff --git a/bigbio/biodatasets/gad/gad.py b/bigbio/biodatasets/gad/gad.py index b1ac6164..a63dc519 100644 --- a/bigbio/biodatasets/gad/gad.py +++ b/bigbio/biodatasets/gad/gad.py @@ -34,22 +34,23 @@ annotation procedure based on the Genetic Association Database """ -_HOMEPAGE = "https://github.com/dmis-lab/biobert" # This data source is used by the BLURB benchmark +_PUBMED = True + +_HOMEPAGE = "https://github.com/dmis-lab/biobert" # This data source is used by the BLURB benchmark _LICENSE = "Creative Common Attribution 4.0 International" _URLs = { "source": "https://drive.google.com/uc?export=download&id=1-jDKGcXREb2X9xTFnuiJ36PvsqoyHWcw", - "bigbio_text": "https://drive.google.com/uc?export=download&id=1-jDKGcXREb2X9xTFnuiJ36PvsqoyHWcw" + "bigbio_text": "https://drive.google.com/uc?export=download&id=1-jDKGcXREb2X9xTFnuiJ36PvsqoyHWcw", } -_SUPPORTED_TASKS = [ - Tasks.TEXT_CLASSIFICATION -] +_SUPPORTED_TASKS = [Tasks.TEXT_CLASSIFICATION] _SOURCE_VERSION = "1.0.0" _BIGBIO_VERSION = "1.0.0" + class GAD(datasets.GeneratorBasedBuilder): """GAD is a weakly labeled dataset for Entity Relations (REL) task which is treated as a sentence classification task.""" @@ -61,7 +62,8 @@ class GAD(datasets.GeneratorBasedBuilder): description="GAD source schema", schema="source", subset_id=f"gad_fold{i}", - ) for i in range(10) + ) + for i in range(10) ] + [ # 10-fold bigbio schema BigBioConfig( @@ -70,7 +72,8 @@ class GAD(datasets.GeneratorBasedBuilder): description="GAD BigBio schema", schema="bigbio_text", subset_id=f"gad_fold{i}", - ) for i in range(10) + ) + for i in range(10) ] DEFAULT_CONFIG_NAME = "gad_fold0_source" @@ -81,7 +84,7 @@ def _info(self): { "index": datasets.Value("string"), "sentence": datasets.Value("string"), - "label": datasets.Value("int32") + "label": datasets.Value("int32"), } ) elif self.config.schema == "bigbio_text": @@ -99,12 +102,12 @@ def _split_generators( self, dl_manager: datasets.DownloadManager ) -> List[datasets.SplitGenerator]: fold_id = int(self.config.subset_id.split("_fold")[1][0]) + 1 - + my_urls = _URLs[self.config.schema] data_dir = Path(dl_manager.download_and_extract(my_urls)) data_files = { "train": data_dir / "GAD" / str(fold_id) / "train.tsv", - "test": data_dir / "GAD" / str(fold_id) / "test.tsv" + "test": data_dir / "GAD" / str(fold_id) / "test.tsv", } return [ @@ -119,19 +122,19 @@ def _split_generators( ] def _generate_examples(self, filepath: Path): - if 'train.tsv' in str(filepath): - df = pd.read_csv(filepath, sep='\t', header=None).reset_index() + if "train.tsv" in str(filepath): + df = pd.read_csv(filepath, sep="\t", header=None).reset_index() else: - df = pd.read_csv(filepath, sep='\t') - df.columns = ['id', 'sentence', 'label'] + df = pd.read_csv(filepath, sep="\t") + df.columns = ["id", "sentence", "label"] if self.config.schema == "source": for id, row in enumerate(df.itertuples()): ex = { "index": row.id, "sentence": row.sentence, - "label": int(row.label) - } + "label": int(row.label), + } yield id, ex elif self.config.schema == "bigbio_text": for id, row in enumerate(df.itertuples()): @@ -139,8 +142,8 @@ def _generate_examples(self, filepath: Path): "id": id, "document_id": row.id, "text": row.sentence, - "labels": [str(row.label)] + "labels": [str(row.label)], } - yield id, ex + yield id, ex else: raise ValueError(f"Invalid config: {self.config.name}") diff --git a/bigbio/biodatasets/genia_ptm_event_corpus/genia_ptm_event_corpus.py b/bigbio/biodatasets/genia_ptm_event_corpus/genia_ptm_event_corpus.py index 10a2e184..883ecdbc 100644 --- a/bigbio/biodatasets/genia_ptm_event_corpus/genia_ptm_event_corpus.py +++ b/bigbio/biodatasets/genia_ptm_event_corpus/genia_ptm_event_corpus.py @@ -61,6 +61,8 @@ multiple PTM types at once in a unified framework. """ +_PUBMED = True + _HOMEPAGE = "http://www.geniaproject.org/other-corpora/ptm-event-corpus" _LICENSE = "GENIA Project License for Annotated Corpora" @@ -69,7 +71,11 @@ _DATASETNAME: "http://www.geniaproject.org/other-corpora/ptm-event-corpus/post-translational_modifications_training_data.tar.gz?attredirects=0&d=1", } -_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, Tasks.COREFERENCE_RESOLUTION, Tasks.EVENT_EXTRACTION] +_SUPPORTED_TASKS = [ + Tasks.NAMED_ENTITY_RECOGNITION, + Tasks.COREFERENCE_RESOLUTION, + Tasks.EVENT_EXTRACTION, +] _SOURCE_VERSION = "1.0.0" @@ -119,7 +125,9 @@ def _info(self) -> datasets.DatasetInfo: "events": [ # E line in brat { "id": datasets.Value("string"), - "type": datasets.Value("string"), # refers to the text_bound_annotation of the trigger + "type": datasets.Value( + "string" + ), # refers to the text_bound_annotation of the trigger "trigger": datasets.Value("string"), "arguments": [ { @@ -183,12 +191,16 @@ def _generate_examples(self, data_dir) -> Tuple[int, Dict]: if filename.endswith(".txt"): txt_file_path = Path(dirpath, filename) if self.config.schema == "source": - example = parsing.parse_brat_file(txt_file_path, annotation_file_suffixes=[".a1", ".a2"]) + example = parsing.parse_brat_file( + txt_file_path, annotation_file_suffixes=[".a1", ".a2"] + ) example["id"] = str(guid) for key in ["attributes", "normalizations"]: del example[key] yield guid, example elif self.config.schema == "bigbio_kb": - example = parsing.brat_parse_to_bigbio_kb(parsing.parse_brat_file(txt_file_path)) + example = parsing.brat_parse_to_bigbio_kb( + parsing.parse_brat_file(txt_file_path) + ) example["id"] = str(guid) yield guid, example diff --git a/bigbio/biodatasets/medical_data/medical_data.py b/bigbio/biodatasets/medical_data/medical_data.py index 897ce924..5c9c1836 100644 --- a/bigbio/biodatasets/medical_data/medical_data.py +++ b/bigbio/biodatasets/medical_data/medical_data.py @@ -42,6 +42,8 @@ _LICENSE = "" +_PUBMED = False + _URLS = {} _SUPPORTED_TASKS = [Tasks.TEXTUAL_ENTAILMENT] diff --git a/bigbio/biodatasets/n2c2_2014_risk_factors/n2c2_2014_risk_factors.py b/bigbio/biodatasets/n2c2_2014_risk_factors/n2c2_2014_risk_factors.py index 69eabb0b..590febb6 100644 --- a/bigbio/biodatasets/n2c2_2014_risk_factors/n2c2_2014_risk_factors.py +++ b/bigbio/biodatasets/n2c2_2014_risk_factors/n2c2_2014_risk_factors.py @@ -78,6 +78,8 @@ } """ +_PUBMED = False + _DATASETNAME = "n2c2_2014_risk_factors" _DESCRIPTION = """\ @@ -268,4 +270,4 @@ def _read_task2_file(self, file_object, file_name): risk_factors.append(risk_factor) document = {"document_id": file_name, "text": text, "cardiac_risk_factors": risk_factors} - return document \ No newline at end of file + return document