Skip to content

Commit 81b425a

Browse files
pierrot0 authored and The TensorFlow Datasets Authors committed
add support for config-level freeform tags in TFDS.
PiperOrigin-RevId: 523952029
1 parent 877e481 commit 81b425a

File tree

7 files changed

+61
-9
lines changed

7 files changed

+61
-9
lines changed

CHANGELOG.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,21 @@ and this project adheres to
1010

1111
### Added
1212

13+
- [Experimental] A list of freeform text tags can now be attached to a
14+
`BuilderConfig`. For example:
15+
```py
16+
BUILDER_CONFIGS = [
17+
tfds.core.BuilderConfig(name="foo", tags=["foo", "live"]),
18+
tfds.core.BuilderConfig(name="bar", tags=["bar", "old"]),
19+
]
20+
```
21+
The tags are recorded with the dataset metadata and can later be retrieved
22+
using the info object:
23+
```py
24+
builder.info.config_tags # ["foo", "live"]
25+
```
26+
This feature is experimental, and there are currently no guidelines on the format of tags.
27+
1328
### Changed
1429

1530
### Deprecated

tensorflow_datasets/core/dataset_builder.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,16 @@ class BuilderConfig:
8383
8484
DatasetBuilder subclasses with data configuration options should subclass
8585
`BuilderConfig` and add their own properties.
86+
87+
Attributes:
88+
name: The name of the config.
89+
version: The version of the config.
90+
release_notes: A dictionary associating versions to changes.
91+
supported_versions: A list of versions which this Builder Config supports.
92+
description: a human description of the config.
93+
tags: [Experimental] a list of freeform tags applying to the config. This is
94+
not used by TFDS, but can be retrieved later from a BuilderConfig
95+
instance.
8696
"""
8797

8898
# TODO(py3.10): Should update dataclass to be:
@@ -96,6 +106,7 @@ class BuilderConfig:
96106
default_factory=list
97107
)
98108
description: Optional[str] = None
109+
tags: List[str] = dataclasses.field(default_factory=list)
99110

100111
@classmethod
101112
def from_dataset_info(
@@ -108,6 +119,7 @@ def from_dataset_info(
108119
description=info_proto.config_description,
109120
version=info_proto.version,
110121
release_notes=info_proto.release_notes or {},
122+
tags=info_proto.config_tags or [],
111123
)
112124

113125

tensorflow_datasets/core/dataset_builder_test.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,13 +57,15 @@ class DummyDatasetWithConfigs(dataset_builder.GeneratorBasedBuilder):
5757
name="plus1",
5858
version=utils.Version("0.0.1"),
5959
description="Add 1 to the records",
60+
tags=["foo:bar"],
6061
increment=1,
6162
),
6263
DummyBuilderConfig(
6364
name="plus2",
6465
version=utils.Version("0.0.2"),
6566
supported_versions=[utils.Version("0.0.1")],
6667
description="Add 2 to the records",
68+
tags=["foo:baz"],
6769
increment=2,
6870
),
6971
]

tensorflow_datasets/core/dataset_info.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
import posixpath
4040
import tempfile
4141
import time
42-
from typing import Any, Dict, Iterable, Optional, Tuple, Union
42+
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
4343

4444
from absl import logging
4545
from etils import epath
@@ -106,23 +106,27 @@ class DatasetIdentity:
106106
module_name: str
107107
config_name: Optional[str] = None
108108
config_description: Optional[str] = None
109+
config_tags: Optional[List[str]] = None
109110
release_notes: Optional[Dict[str, str]] = None
110111

111112
@classmethod
112113
def from_builder(cls, builder) -> "DatasetIdentity":
113114
if builder.builder_config:
114115
config_name = builder.builder_config.name
115116
config_description = builder.builder_config.description
117+
config_tags = builder.builder_config.tags
116118
else:
117119
config_name = None
118120
config_description = None
121+
config_tags = None
119122
return cls(
120123
name=builder.name,
121124
version=utils.Version(builder.version),
122125
data_dir=builder.data_dir,
123126
module_name=str(builder.__module__),
124127
config_name=config_name,
125128
config_description=config_description,
129+
config_tags=config_tags,
126130
release_notes=builder.release_notes,
127131
)
128132

@@ -139,6 +143,7 @@ def from_proto(
139143
module_name=info_proto.module_name,
140144
config_name=info_proto.config_name,
141145
config_description=info_proto.config_description,
146+
config_tags=info_proto.config_tags or [],
142147
release_notes={k: v for k, v in info_proto.release_notes.items()},
143148
)
144149

@@ -228,6 +233,7 @@ def __init__(
228233
disable_shuffling=disable_shuffling,
229234
config_name=self._identity.config_name,
230235
config_description=self._identity.config_description,
236+
config_tags=self._identity.config_tags,
231237
citation=utils.dedent(citation),
232238
module_name=self._identity.module_name,
233239
redistribution_info=dataset_info_pb2.RedistributionInfo(
@@ -320,6 +326,10 @@ def config_name(self) -> str:
320326
def config_description(self) -> str:
321327
return self._identity.config_description
322328

329+
@property
330+
def config_tags(self) -> List[str]:
331+
return self._identity.config_tags
332+
323333
@property
324334
def full_name(self):
325335
"""Full canonical name: (<dataset_name>/<config_name>/<version>)."""
@@ -650,6 +660,8 @@ def read_from_directory(self, dataset_info_dir: epath.PathLike) -> None:
650660
# Otherwise, we restore the dataset_info.json value
651661
if field.type == field.TYPE_MESSAGE:
652662
field_value.MergeFrom(field_value_restored)
663+
elif field.label == field.LABEL_REPEATED:
664+
field_value.extend(field_value_restored)
653665
else:
654666
setattr(self._info_proto, field_name, field_value_restored)
655667

@@ -754,6 +766,11 @@ def __repr__(self):
754766
else:
755767
config_description = SKIP
756768

769+
if self._info_proto.config_tags:
770+
config_tags = ", ".join(self.config_tags)
771+
else:
772+
config_tags = SKIP
773+
757774
file_format_str = (
758775
self.file_format.value
759776
if self.file_format
@@ -765,6 +782,7 @@ def __repr__(self):
765782
("full_name", repr(self.full_name)),
766783
("description", _indent(f'"""\n{self.description}\n"""')),
767784
("config_description", config_description),
785+
("config_tags", config_tags),
768786
("homepage", repr(self.homepage)),
769787
("data_path", repr(self.data_dir)),
770788
("file_format", file_format_str),

tensorflow_datasets/core/dataset_info_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ def test_restore_after_modification(self):
198198
citation="some citation",
199199
license="some license",
200200
)
201+
info.as_proto.config_tags.extend(["foo", "bar"])
201202
info.download_size = 456
202203
filepath_template = "{DATASET}-{SPLIT}.{FILEFORMAT}-{SHARD_X_OF_Y}"
203204
info.as_proto.splits.add(

tensorflow_datasets/core/proto/dataset_info.proto

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,9 @@ message DatasetInfo {
156156
// Config description of the dataset
157157
string config_description = 14;
158158

159+
// Config tags of the dataset
160+
repeated string config_tags = 21;
161+
159162
// The structure and characteristics of the features of this dataset.
160163
Feature features = 19;
161164

@@ -203,5 +206,5 @@ message DatasetInfo {
203206
// The data that was used to generate this dataset.
204207
repeated DataSourceAccess data_source_accesses = 20;
205208

206-
// Next available: 21
209+
// Next available: 22
207210
}

tensorflow_datasets/core/proto/dataset_info_generated_pb2.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -63,12 +63,13 @@
6363
b' \x01(\t\x12\x0f\n\x07version\x18\x03'
6464
b' \x01(\t\x12\x10\n\x08\x64\x61ta_dir\x18\x04'
6565
b' \x01(\t\x12\x14\n\x0c\x64s_namespace\x18\x05'
66-
b' \x01(\t"\x9f\x07\n\x0b\x44\x61tasetInfo\x12\x0c\n\x04name\x18\x01'
66+
b' \x01(\t"\xb4\x07\n\x0b\x44\x61tasetInfo\x12\x0c\n\x04name\x18\x01'
6767
b' \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x02'
6868
b' \x01(\t\x12\x0f\n\x07version\x18\t \x01(\t\x12I\n\rrelease_notes\x18\x12'
6969
b' \x03(\x0b\x32\x32.tensorflow_datasets.DatasetInfo.ReleaseNotesEntry\x12\x13\n\x0b\x63onfig_name\x18\r'
7070
b' \x01(\t\x12\x1a\n\x12\x63onfig_description\x18\x0e'
71-
b' \x01(\t\x12.\n\x08\x66\x65\x61tures\x18\x13'
71+
b' \x01(\t\x12\x13\n\x0b\x63onfig_tags\x18\x15'
72+
b' \x03(\t\x12.\n\x08\x66\x65\x61tures\x18\x13'
7273
b' \x01(\x0b\x32\x1c.tensorflow_datasets.Feature\x12\x10\n\x08\x63itation\x18\x03'
7374
b' \x01(\t\x12\x19\n\rsize_in_bytes\x18\x04'
7475
b' \x01(\x03\x42\x02\x18\x01\x12\x15\n\rdownload_size\x18\x0c'
@@ -138,9 +139,9 @@
138139
_TFDSDATASETREFERENCE._serialized_start = 1203
139140
_TFDSDATASETREFERENCE._serialized_end = 1312
140141
_DATASETINFO._serialized_start = 1315
141-
_DATASETINFO._serialized_end = 2242
142-
_DATASETINFO_RELEASENOTESENTRY._serialized_start = 2133
143-
_DATASETINFO_RELEASENOTESENTRY._serialized_end = 2184
144-
_DATASETINFO_DOWNLOADCHECKSUMSENTRY._serialized_start = 2186
145-
_DATASETINFO_DOWNLOADCHECKSUMSENTRY._serialized_end = 2242
142+
_DATASETINFO._serialized_end = 2263
143+
_DATASETINFO_RELEASENOTESENTRY._serialized_start = 2154
144+
_DATASETINFO_RELEASENOTESENTRY._serialized_end = 2205
145+
_DATASETINFO_DOWNLOADCHECKSUMSENTRY._serialized_start = 2207
146+
_DATASETINFO_DOWNLOADCHECKSUMSENTRY._serialized_end = 2263
146147
# @@protoc_insertion_point(module_scope)

0 commit comments

Comments
 (0)