Skip to content

Commit 905f2be

Browse files
committed
Identify ctime and unix days in TimeStatsGenerator.
PiperOrigin-RevId: 490495036
1 parent 86f528f commit 905f2be

File tree

2 files changed

+36
-28
lines changed

2 files changed

+36
-28
lines changed

tensorflow_data_validation/statistics/generators/time_stats_generator.py

Lines changed: 34 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -55,15 +55,15 @@
5555
_MATCH_RATIO = 0.8
5656
_VALUES_THRESHOLD = 100
5757

58-
59-
_UnixTime = collections.namedtuple(
60-
'_UnixTime', ['format_constant', 'begin', 'end']
61-
)
58+
_UnixTime = collections.namedtuple('_UnixTime',
59+
['format_constant', 'begin', 'end'])
6260

6361
# Named tuples containing values used to detect integer times.
6462
# The beginning times correspond to 01-Jan-1990 00:00:00 UTC.
6563
# The ending times correspond to 01-Jan-2030 00:00:00 UTC.
6664
_UNIX_TIMES = [
65+
_UnixTime(
66+
format_constant=schema_pb2.TimeDomain.UNIX_DAYS, begin=7305, end=21915),
6767
_UnixTime(
6868
format_constant=schema_pb2.TimeDomain.UNIX_SECONDS,
6969
begin=631152000,
@@ -92,7 +92,10 @@
9292
# This is consistent with Python's strptime()'s mapping of format directives to
9393
# regexes.
9494
_STRPTIME_TO_RE = {
95-
# Do not include month_name[0], since it's an empty string.
95+
# Do not include month_name[0] or month_abbr[0], since they are empty
96+
# strings.
97+
'%a': r'(?:' + r'|'.join(calendar.day_abbr) + ')',
98+
'%b': r'(?:' + r'|'.join(calendar.month_abbr[1:]) + ')',
9699
'%B': r'(?:' + r'|'.join(calendar.month_name[1:]) + ')',
97100
'%f': r'(?:[0-9]{1,6})',
98101
'%d': r'(?:3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])',
@@ -137,6 +140,10 @@
137140
'%H:%M:%S.%f' # 23:59:58[.123456]
138141
]
139142

143+
_COMBINED_FORMATS = [
144+
'%a %b %d %H:%M:%S %Y' # Fri Nov 30 10:47:02 2018
145+
]
146+
140147

141148
def _convert_strptime_to_regex(strptime_str: Text) -> Text:
142149
"""Converts a string that includes strptime directives to a regex.
@@ -159,11 +166,13 @@ def _get_replacement_regex(matchobj):
159166
def _build_all_formats() -> Iterable[Text]:
160167
"""Yields all valid date, time, and combination formats.
161168
162-
The valid formats are defined by _DATE_ONLY_FORMATS, _TIME_ONLY_FORMATS, and
169+
The valid formats are defined by _COMBINED_FORMATS, _DATE_ONLY_FORMATS,
170+
_TIME_ONLY_FORMATS, and
163171
_TIME_DELIMITERS. This function yields each date only and time only format.
164172
For combination formats, each date format from _DATE_ONLY_FORMATS is combined
165173
with each time format from _TIME_ONLY_FORMATS in two ways: one with the time
166-
delimiter and one with a space.
174+
delimiter and one with a space. Additionally, some combined formats are
175+
specified directly by _COMBINED_FORMATS and yielded.
167176
168177
Yields:
169178
All valid date, time, and combination date and time formats.
@@ -172,15 +181,16 @@ def _build_all_formats() -> Iterable[Text]:
172181
yield date_fmt
173182
for time_fmt in _TIME_ONLY_FORMATS:
174183
yield time_fmt
184+
for combined_fmt in _COMBINED_FORMATS:
185+
yield combined_fmt
175186
for date_fmt in _DATE_ONLY_FORMATS:
176187
for time_fmt in _TIME_ONLY_FORMATS:
177188
for time_delimiter in _TIME_DELIMITERS:
178189
yield ''.join([date_fmt, time_delimiter, time_fmt])
179190

180191

181192
def _build_all_formats_regexes(
182-
strptime_formats: Iterable[Text]
183-
) -> Iterable[Tuple[Text, Pattern[Text]]]:
193+
strptime_formats: Iterable[Text]) -> Iterable[Tuple[Text, Pattern[Text]]]:
184194
"""Yields compiled regexes corresponding to the input formats.
185195
186196
Args:
@@ -195,16 +205,13 @@ def _build_all_formats_regexes(
195205
yield (strptime_format, compiled_regex)
196206

197207

198-
_TIME_RE_LIST = list(
199-
_build_all_formats_regexes(_build_all_formats()))
208+
_TIME_RE_LIST = list(_build_all_formats_regexes(_build_all_formats()))
200209

201210

202211
class _PartialTimeStats(object):
203212
"""Partial feature stats for dates/times."""
204213

205-
def __init__(self,
206-
considered: int = 0,
207-
invalidated: bool = False) -> None:
214+
def __init__(self, considered: int = 0, invalidated: bool = False) -> None:
208215
# The total number of values considered for classification.
209216
self.considered = considered
210217
# True only if this feature should never be considered, e.g., some
@@ -243,10 +250,8 @@ def update(self, values: np.ndarray,
243250
self.matching_formats[
244251
unix_time.format_constant] += num_matching_values
245252
else:
246-
raise ValueError(
247-
'Attempt to update partial time stats with values of an '
248-
'unsupported type.'
249-
)
253+
raise ValueError('Attempt to update partial time stats with values of an '
254+
'unsupported type.')
250255

251256

252257
class TimeStatsGenerator(stats_generator.CombinerFeatureStatsGenerator):
@@ -268,12 +273,12 @@ def __init__(self,
268273
269274
Args:
270275
name: The unique name associated with this statistics generator.
271-
match_ratio: For a feature to be marked as a Time, the classifier
272-
match ratio must meet or exceed this ratio. This ratio must be in
273-
(0, 1]. The classifier match ratio is determined by comparing the most
274-
common valid matching format to the total number of values considered.
275-
values_threshold: For a feature to be marked as a Time, at least
276-
this many values must be considered.
276+
match_ratio: For a feature to be marked as a Time, the classifier match
277+
ratio must meet or exceed this ratio. This ratio must be in (0, 1]. The
278+
classifier match ratio is determined by comparing the most common valid
279+
matching format to the total number of values considered.
280+
values_threshold: For a feature to be marked as a Time, at least this many
281+
values must be considered.
277282
278283
Raises:
279284
ValueError: If values_threshold <= 0 or match_ratio not in (0, 1].
@@ -304,8 +309,8 @@ def add_input(self, accumulator: _PartialTimeStats,
304309
Args:
305310
accumulator: The current accumulator.
306311
feature_path: The path of the feature.
307-
feature_array: An arrow Array representing a batch of feature values
308-
which should be added to the accumulator.
312+
feature_array: An arrow Array representing a batch of feature values which
313+
should be added to the accumulator.
309314
310315
Returns:
311316
The accumulator after updating the statistics for the batch of inputs.
@@ -352,8 +357,9 @@ def merge_accumulators(
352357
result += acc
353358
return result
354359

355-
def extract_output(self, accumulator: _PartialTimeStats
356-
) -> statistics_pb2.FeatureNameStatistics:
360+
def extract_output(
361+
self,
362+
accumulator: _PartialTimeStats) -> statistics_pb2.FeatureNameStatistics:
357363
"""Returns the result of converting accumulator into the output value.
358364
359365
This method will add the time_domain custom stat to the proto if the match

tensorflow_data_validation/statistics/generators/time_stats_generator_test.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,12 @@
6868
'input_batch': pa.array([[
6969
'2018-11-30T23:59',
7070
'2018/11/30 23:59',
71+
'Fri Nov 30 10:47:02 2018'
7172
]]),
7273
'expected_matching_formats': {
7374
'%Y-%m-%dT%H:%M': 1,
7475
'%Y/%m/%d %H:%M': 1,
76+
'%a %b %d %H:%M:%S %Y': 1
7577
},
7678
},
7779
]

0 commit comments

Comments
 (0)