|
22 | 22 | import json
|
23 | 23 | import logging
|
24 | 24 | import types as python_types
|
25 |
| -from typing import Dict, List, Optional, Text |
| 25 | +from typing import Dict, List, Optional, Text, Union |
26 | 26 |
|
27 | 27 | from tensorflow_data_validation import types
|
28 | 28 | from tensorflow_data_validation.statistics.generators import stats_generator
|
@@ -76,14 +76,17 @@ def __init__(
|
76 | 76 | types.FeatureName]] = None,
|
77 | 77 | vocab_paths: Optional[Dict[types.VocabName, types.VocabPath]] = None,
|
78 | 78 | add_default_generators: bool = True,
|
79 |
| - feature_allowlist: Optional[List[types.FeatureName]] = None, |
| 79 | + # TODO(b/255895499): Support "from schema" for feature_allowlist. |
| 80 | + feature_allowlist: Optional[Union[List[types.FeatureName], |
| 81 | + List[types.FeaturePath]]] = None, |
80 | 82 | experimental_use_sketch_based_topk_uniques: Optional[bool] = None,
|
81 | 83 | use_sketch_based_topk_uniques: Optional[bool] = None,
|
82 | 84 | experimental_slice_functions: Optional[List[types.SliceFunction]] = None,
|
83 | 85 | experimental_slice_sqls: Optional[List[Text]] = None,
|
84 | 86 | experimental_result_partitions: int = 1,
|
85 | 87 | experimental_num_feature_partitions: int = 1,
|
86 |
| - slicing_config: Optional[slicing_spec_pb2.SlicingConfig] = None): |
| 88 | + slicing_config: Optional[slicing_spec_pb2.SlicingConfig] = None, |
| 89 | + experimental_filter_read_paths: bool = False): |
87 | 90 | """Initializes statistics options.
|
88 | 91 |
|
89 | 92 | Args:
|
@@ -151,7 +154,7 @@ def __init__(
|
151 | 154 | (controlled by `enable_semantic_domain_stats`) and 4) schema-based
|
152 | 155 | generators that are enabled based on information provided in the schema.
|
153 | 156 | feature_allowlist: An optional list of names of the features to calculate
|
154 |
| - statistics for. |
| 157 | + statistics for, or a list of feature paths. |
155 | 158 | experimental_use_sketch_based_topk_uniques: Deprecated, prefer
|
156 | 159 | use_sketch_based_topk_uniques.
|
157 | 160 | use_sketch_based_topk_uniques: if True, use the sketch based
|
@@ -193,8 +196,11 @@ def __init__(
|
193 | 196 | number of features in a dataset, and never more than the available beam
|
194 | 197 | parallelism.
|
195 | 198 | slicing_config: an optional SlicingConfig. SlicingConfig includes
|
196 |
| - slicing_specs specified with feature keys, feature values or slicing |
197 |
| - SQL queries. |
| 199 | + slicing_specs specified with feature keys, feature values or slicing |
| 200 | + SQL queries. |
| 201 | + experimental_filter_read_paths: If True, tries to push down either |
| 202 | + paths passed via feature_allowlist or via the schema (in that priority) |
| 203 | + to the underlying read operation. Support depends on the file reader. |
198 | 204 | """
|
199 | 205 | self.generators = generators
|
200 | 206 | self.feature_allowlist = feature_allowlist
|
@@ -241,6 +247,7 @@ def __init__(
|
241 | 247 | self.experimental_num_feature_partitions = experimental_num_feature_partitions
|
242 | 248 | self.experimental_result_partitions = experimental_result_partitions
|
243 | 249 | self.slicing_config = slicing_config
|
| 250 | + self.experimental_filter_read_paths = experimental_filter_read_paths |
244 | 251 |
|
245 | 252 | def to_json(self) -> Text:
|
246 | 253 | """Convert from an object to JSON representation of the __dict__ attribute.
|
@@ -340,12 +347,16 @@ def generators(
|
340 | 347 | self._generators = generators
|
341 | 348 |
|
342 | 349 | @property
|
343 |
| - def feature_allowlist(self) -> Optional[List[types.FeatureName]]: |
| 350 | + def feature_allowlist( |
| 351 | + self |
| 352 | + ) -> Optional[Union[List[types.FeatureName], List[types.FeaturePath]]]: |
344 | 353 | return self._feature_allowlist
|
345 | 354 |
|
346 | 355 | @feature_allowlist.setter
|
347 | 356 | def feature_allowlist(
|
348 |
| - self, feature_allowlist: Optional[List[types.FeatureName]]) -> None: |
| 357 | + self, feature_allowlist: Optional[Union[List[types.FeatureName], |
| 358 | + List[types.FeaturePath]]] |
| 359 | + ) -> None: |
349 | 360 | if feature_allowlist is not None and not isinstance(feature_allowlist,
|
350 | 361 | list):
|
351 | 362 | raise TypeError('feature_allowlist is of type %s, should be a list.' %
|
@@ -554,6 +565,14 @@ def experimental_num_feature_partitions(self,
|
554 | 565 | raise ValueError('experimental_num_feature_partitions must be > 0.')
|
555 | 566 | self._experimental_num_feature_partitions = feature_partitions
|
556 | 567 |
|
| 568 | + @property |
| 569 | + def experimental_filter_read_paths(self) -> bool: |
| 570 | + return self._experimental_filter_read_paths |
| 571 | + |
| 572 | + @experimental_filter_read_paths.setter |
| 573 | + def experimental_filter_read_paths(self, filter_read: bool) -> None: |
| 574 | + self._experimental_filter_read_paths = filter_read |
| 575 | + |
557 | 576 |
|
558 | 577 | def _validate_sql(sql_query: Text, schema: schema_pb2.Schema):
|
559 | 578 | arrow_schema = example_coder.ExamplesToRecordBatchDecoder(
|
|
0 commit comments