55
55
_MATCH_RATIO = 0.8
56
56
_VALUES_THRESHOLD = 100
57
57
58
-
59
- _UnixTime = collections .namedtuple (
60
- '_UnixTime' , ['format_constant' , 'begin' , 'end' ]
61
- )
58
+ _UnixTime = collections .namedtuple ('_UnixTime' ,
59
+ ['format_constant' , 'begin' , 'end' ])
62
60
63
61
# Named tuples containing values used to detect integer times.
64
62
# The beginning times correspond to 01-Jan-1990 00:00:00 UTC.
65
63
# The ending times correspond to 01-Jan-2030 00:00:00 UTC.
66
64
_UNIX_TIMES = [
65
+ _UnixTime (
66
+ format_constant = schema_pb2 .TimeDomain .UNIX_DAYS , begin = 7305 , end = 21915 ),
67
67
_UnixTime (
68
68
format_constant = schema_pb2 .TimeDomain .UNIX_SECONDS ,
69
69
begin = 631152000 ,
92
92
# This is consistent with Python's strptime()'s mapping of format directives to
93
93
# regexes.
94
94
_STRPTIME_TO_RE = {
95
- # Do not include month_name[0], since it's an empty string.
95
+ # Do not include month_name[0] or month_abbr[0], since they are empty
96
+ # strings.
97
+ '%a' : r'(?:' + r'|' .join (calendar .day_abbr ) + ')' ,
98
+ '%b' : r'(?:' + r'|' .join (calendar .month_abbr [1 :]) + ')' ,
96
99
'%B' : r'(?:' + r'|' .join (calendar .month_name [1 :]) + ')' ,
97
100
'%f' : r'(?:[0-9]{1,6})' ,
98
101
'%d' : r'(?:3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])' ,
137
140
'%H:%M:%S.%f' # 23:59:58[.123456]
138
141
]
139
142
143
+ _COMBINED_FORMATS = [
144
+ '%a %b %d %H:%M:%S %Y' # Fri Nov 30 10:47:02 2018
145
+ ]
146
+
140
147
141
148
def _convert_strptime_to_regex (strptime_str : Text ) -> Text :
142
149
"""Converts a string that includes strptime directives to a regex.
@@ -159,11 +166,13 @@ def _get_replacement_regex(matchobj):
159
166
def _build_all_formats () -> Iterable [Text ]:
160
167
"""Yields all valid date, time, and combination formats.
161
168
162
- The valid formats are defined by _DATE_ONLY_FORMATS, _TIME_ONLY_FORMATS, and
169
+ The valid formats are defined by _COMBINED_FORMATS, _DATE_ONLY_FORMATS,
170
+ _TIME_ONLY_FORMATS, and
163
171
_TIME_DELIMITERS. This function yields each date only and time only format.
164
172
For combination formats, each date format from _DATE_ONLY_FORMATS is combined
165
173
with each time format from _TIME_ONLY_FORMATS in two ways: one with the time
166
- delimiter and one with a space.
174
+ delimiter and one with a space. Additionally, some combined formats are
175
+ specified directly by _COMBINED_FORMATS and yielded.
167
176
168
177
Yields:
169
178
All valid date, time, and combination date and time formats.
@@ -172,15 +181,16 @@ def _build_all_formats() -> Iterable[Text]:
172
181
yield date_fmt
173
182
for time_fmt in _TIME_ONLY_FORMATS :
174
183
yield time_fmt
184
+ for combined_fmt in _COMBINED_FORMATS :
185
+ yield combined_fmt
175
186
for date_fmt in _DATE_ONLY_FORMATS :
176
187
for time_fmt in _TIME_ONLY_FORMATS :
177
188
for time_delimiter in _TIME_DELIMITERS :
178
189
yield '' .join ([date_fmt , time_delimiter , time_fmt ])
179
190
180
191
181
192
def _build_all_formats_regexes (
182
- strptime_formats : Iterable [Text ]
183
- ) -> Iterable [Tuple [Text , Pattern [Text ]]]:
193
+ strptime_formats : Iterable [Text ]) -> Iterable [Tuple [Text , Pattern [Text ]]]:
184
194
"""Yields compiled regexes corresponding to the input formats.
185
195
186
196
Args:
@@ -195,16 +205,13 @@ def _build_all_formats_regexes(
195
205
yield (strptime_format , compiled_regex )
196
206
197
207
198
- _TIME_RE_LIST = list (
199
- _build_all_formats_regexes (_build_all_formats ()))
208
+ _TIME_RE_LIST = list (_build_all_formats_regexes (_build_all_formats ()))
200
209
201
210
202
211
class _PartialTimeStats (object ):
203
212
"""Partial feature stats for dates/times."""
204
213
205
- def __init__ (self ,
206
- considered : int = 0 ,
207
- invalidated : bool = False ) -> None :
214
+ def __init__ (self , considered : int = 0 , invalidated : bool = False ) -> None :
208
215
# The total number of values considered for classification.
209
216
self .considered = considered
210
217
# True only if this feature should never be considered, e.g., some
@@ -243,10 +250,8 @@ def update(self, values: np.ndarray,
243
250
self .matching_formats [
244
251
unix_time .format_constant ] += num_matching_values
245
252
else :
246
- raise ValueError (
247
- 'Attempt to update partial time stats with values of an '
248
- 'unsupported type.'
249
- )
253
+ raise ValueError ('Attempt to update partial time stats with values of an '
254
+ 'unsupported type.' )
250
255
251
256
252
257
class TimeStatsGenerator (stats_generator .CombinerFeatureStatsGenerator ):
@@ -268,12 +273,12 @@ def __init__(self,
268
273
269
274
Args:
270
275
name: The unique name associated with this statistics generator.
271
- match_ratio: For a feature to be marked as a Time, the classifier
272
- match ratio must meet or exceed this ratio. This ratio must be in
273
- (0, 1]. The classifier match ratio is determined by comparing the most
274
- common valid matching format to the total number of values considered.
275
- values_threshold: For a feature to be marked as a Time, at least
276
- this many values must be considered.
276
+ match_ratio: For a feature to be marked as a Time, the classifier match
277
+ ratio must meet or exceed this ratio. This ratio must be in (0, 1]. The
278
+ classifier match ratio is determined by comparing the most common valid
279
+ matching format to the total number of values considered.
280
+ values_threshold: For a feature to be marked as a Time, at least this many
281
+ values must be considered.
277
282
278
283
Raises:
279
284
ValueError: If values_threshold <= 0 or match_ratio not in (0, 1].
@@ -304,8 +309,8 @@ def add_input(self, accumulator: _PartialTimeStats,
304
309
Args:
305
310
accumulator: The current accumulator.
306
311
feature_path: The path of the feature.
307
- feature_array: An arrow Array representing a batch of feature values
308
- which should be added to the accumulator.
312
+ feature_array: An arrow Array representing a batch of feature values which
313
+ should be added to the accumulator.
309
314
310
315
Returns:
311
316
The accumulator after updating the statistics for the batch of inputs.
@@ -352,8 +357,9 @@ def merge_accumulators(
352
357
result += acc
353
358
return result
354
359
355
- def extract_output (self , accumulator : _PartialTimeStats
356
- ) -> statistics_pb2 .FeatureNameStatistics :
360
+ def extract_output (
361
+ self ,
362
+ accumulator : _PartialTimeStats ) -> statistics_pb2 .FeatureNameStatistics :
357
363
"""Returns the result of converting accumulator into the output value.
358
364
359
365
This method will add the time_domain custom stat to the proto if the match
0 commit comments