Add support for running custom validations in validate_statistics().

caveness · tfx-copybara · commit 4d0b51d8b23d · 2022-11-15T12:02:12.000-08:00
PiperOrigin-RevId: 488721853
diff --git a/RELEASE.md b/RELEASE.md
@@ -9,8 +9,9 @@
 
 ## Major Features and Improvements
 
-*  Add a `custom_validate_statistics` function to the validation API. Note that
-   this function is not available on Windows.
+*  Add a `custom_validate_statistics` function to the validation API, and
+   support passing custom validations to `validate_statistics`. Note that
+   custom validation is not supported on Windows.
 
 ## Bug Fixes and Other Changes
 
diff --git a/tensorflow_data_validation/api/validation_api.py b/tensorflow_data_validation/api/validation_api.py
@@ -18,6 +18,7 @@
 from __future__ import division
 from __future__ import print_function
 
+import itertools
 import logging
 from typing import Callable, Iterable, List, Optional, Text, Tuple, Set
 import apache_beam as beam
@@ -58,6 +59,8 @@
     anomalies_pb2.AnomalyInfo.DATASET_HIGH_NUM_EXAMPLES,
 ])
 
+_MULTIPLE_ERRORS = 'Multiple errors'
+
 
 def infer_schema(
     statistics: statistics_pb2.DatasetFeatureStatisticsList,
@@ -189,6 +192,47 @@ def update_schema(schema: schema_pb2.Schema,
   return result
 
 
+def _merge_descriptions(
+    anomaly_info: anomalies_pb2.AnomalyInfo,
+    other_anomaly_info: Optional[anomalies_pb2.AnomalyInfo]) -> str:
+  """Joins reason descriptions of the given anomaly info(s) with spaces."""
+  descriptions = []
+  if other_anomaly_info is not None:
+    for reason in itertools.chain(anomaly_info.reason,
+                                  other_anomaly_info.reason):
+      descriptions.append(reason.description)
+  else:
+    descriptions = [reason.description for reason in anomaly_info.reason]
+  return ' '.join(descriptions)
+
+
+def _merge_custom_anomalies(
+    anomalies: anomalies_pb2.Anomalies,
+    custom_anomalies: anomalies_pb2.Anomalies) -> anomalies_pb2.Anomalies:
+  """Merges custom_anomalies into anomalies in place and returns anomalies."""
+  for key, custom_anomaly_info in custom_anomalies.anomaly_info.items():
+    if key in anomalies.anomaly_info:
+      # If the key is found in both inputs, we know it has multiple errors.
+      anomalies.anomaly_info[key].short_description = _MULTIPLE_ERRORS
+      anomalies.anomaly_info[key].description = _merge_descriptions(
+          anomalies.anomaly_info[key], custom_anomaly_info)
+      anomalies.anomaly_info[key].severity = max(
+          anomalies.anomaly_info[key].severity, custom_anomaly_info.severity)
+      anomalies.anomaly_info[key].reason.extend(custom_anomaly_info.reason)
+    else:
+      anomalies.anomaly_info[key].CopyFrom(custom_anomaly_info)
+      # Also populate top-level descriptions.
+      anomalies.anomaly_info[key].description = _merge_descriptions(
+          custom_anomaly_info, None)
+      if len(anomalies.anomaly_info[key].reason) > 1:
+        anomalies.anomaly_info[key].short_description = _MULTIPLE_ERRORS
+      else:
+        anomalies.anomaly_info[
+            key].short_description = custom_anomaly_info.reason[
+                0].short_description
+  return anomalies
+
+
 def validate_statistics(
     statistics: statistics_pb2.DatasetFeatureStatisticsList,
     schema: schema_pb2.Schema,
@@ -197,6 +241,8 @@ def validate_statistics(
         statistics_pb2.DatasetFeatureStatisticsList] = None,
     serving_statistics: Optional[
         statistics_pb2.DatasetFeatureStatisticsList] = None,
+    custom_validation_config: Optional[
+        custom_validation_config_pb2.CustomValidationConfig] = None
 ) -> anomalies_pb2.Anomalies:
   """Validates the input statistics against the provided input schema.
 
@@ -248,6 +294,14 @@ def validate_statistics(
         distribution skew between current data and serving data. Configuration
         for skew detection can be done by specifying a `skew_comparator` in the
         schema.
+    custom_validation_config: An optional config that can be used to specify
+        custom validations to perform. If doing single-feature validations,
+        the test feature will come from `statistics` and will be mapped to
+        `feature` in the SQL query. If doing feature pair validations, the test
+        feature will come from `statistics` and will be mapped to `feature_test`
+        in the SQL query, and the base feature will come from
+        `previous_statistics` and will be mapped to `feature_base` in the SQL
+        query. Custom validations are not supported on Windows.
 
   Returns:
     An Anomalies protocol buffer.
@@ -270,7 +324,9 @@ def validate_statistics(
           % type(previous_statistics).__name__)
 
   return validate_statistics_internal(statistics, schema, environment,
-                                      previous_statistics, serving_statistics)
+                                      previous_statistics, serving_statistics,
+                                      None, None, False,
+                                      custom_validation_config)
 
 
 def validate_statistics_internal(
@@ -284,7 +340,9 @@ def validate_statistics_internal(
     previous_version_statistics: Optional[
         statistics_pb2.DatasetFeatureStatisticsList] = None,
     validation_options: Optional[vo.ValidationOptions] = None,
-    enable_diff_regions: bool = False
+    enable_diff_regions: bool = False,
+    custom_validation_config: Optional[
+        custom_validation_config_pb2.CustomValidationConfig] = None
 ) -> anomalies_pb2.Anomalies:
   """Validates the input statistics against the provided input schema.
 
@@ -341,6 +399,14 @@ def validate_statistics_internal(
     enable_diff_regions: Specifies whether to include a comparison between the
         existing schema and the fixed schema in the Anomalies protocol buffer
         output.
+    custom_validation_config: An optional config that can be used to specify
+        custom validations to perform. If doing single-feature validations,
+        the test feature will come from `statistics` and will be mapped to
+        `feature` in the SQL query. If doing feature pair validations, the test
+        feature will come from `statistics` and will be mapped to `feature_test`
+        in the SQL query, and the base feature will come from
+        `previous_span_statistics` and will be mapped to `feature_base` in the
+        SQL query. Custom validations are not supported on Windows.
 
   Returns:
     An Anomalies protocol buffer.
@@ -449,10 +515,23 @@ def validate_statistics_internal(
   # Parse the serialized Anomalies proto.
   result = anomalies_pb2.Anomalies()
   result.ParseFromString(anomalies_proto_string)
+
+  if custom_validation_config is not None:
+    serialized_previous_statistics = previous_span_statistics.SerializeToString(
+    ) if previous_span_statistics is not None else ''
+    custom_anomalies_string = (
+        pywrap_tensorflow_data_validation.CustomValidateStatistics(
+            tf.compat.as_bytes(statistics.SerializeToString()),
+            tf.compat.as_bytes(serialized_previous_statistics),
+            tf.compat.as_bytes(custom_validation_config.SerializeToString()),
+            tf.compat.as_bytes(environment)))
+    custom_anomalies = anomalies_pb2.Anomalies()
+    custom_anomalies.ParseFromString(custom_anomalies_string)
+    result = _merge_custom_anomalies(result, custom_anomalies)
+
   return result
 
 
-# TODO(b/239095455): Also integrate with validate_statistics.
 def custom_validate_statistics(
     statistics: statistics_pb2.DatasetFeatureStatisticsList,
     validations: custom_validation_config_pb2.CustomValidationConfig,
diff --git a/tensorflow_data_validation/api/validation_api_test.py b/tensorflow_data_validation/api/validation_api_test.py
@@ -2087,6 +2087,82 @@ def test_validate_stats_invalid_previous_version_stats_multiple_datasets(
           schema,
           previous_version_statistics=previous_version_stats)
 
+  # Custom validation uses ZetaSQL, which cannot be compiled on Windows.
+  @unittest.skipIf(
+      sys.platform.startswith('win'),
+      'Custom validation is not supported on Windows.')
+  def test_validate_stats_with_custom_validations(self):
+    statistics = text_format.Parse(
+        """
+        datasets{
+          num_examples: 10
+          features {
+            path { step: 'annotated_enum' }
+            type: STRING
+            string_stats {
+              common_stats {
+                num_missing: 3
+                num_non_missing: 7
+                min_num_values: 1
+                max_num_values: 1
+              }
+              unique: 3
+              rank_histogram {
+                buckets {
+                  label: "D"
+                  sample_count: 1
+                }
+              }
+            }
+          }
+        }
+        """, statistics_pb2.DatasetFeatureStatisticsList())
+    schema = text_format.Parse(
+        """
+        feature {
+          name: 'annotated_enum'
+          type: BYTES
+          unique_constraints {
+            min: 4
+            max: 4
+          }
+        }
+        """, schema_pb2.Schema())
+    validation_config = text_format.Parse("""
+      feature_validations {
+       feature_path { step: 'annotated_enum' }
+       validations {
+         sql_expression: 'feature.string_stats.common_stats.num_missing < 3'
+         severity: WARNING
+         description: 'Feature has too many missing.'
+       }
+     }
+    """, custom_validation_config_pb2.CustomValidationConfig())
+    expected_anomalies = {
+        'annotated_enum':
+            text_format.Parse(
+                """
+               path { step: 'annotated_enum' }
+               short_description: 'Multiple errors'
+               description: 'Expected at least 4 unique values but found only 3. Custom validation triggered anomaly. Query: feature.string_stats.common_stats.num_missing < 3 Test dataset: default slice'
+               severity: ERROR
+               reason {
+                 type: FEATURE_TYPE_LOW_UNIQUE
+                 short_description: 'Low number of unique values'
+                 description: 'Expected at least 4 unique values but found only 3.'
+               }
+               reason {
+                 type: CUSTOM_VALIDATION
+                 short_description: 'Feature has too many missing.'
+                 description: 'Custom validation triggered anomaly. Query: feature.string_stats.common_stats.num_missing < 3 Test dataset: default slice'
+               }
+    """, anomalies_pb2.AnomalyInfo())
+    }
+    anomalies = validation_api.validate_statistics(statistics, schema, None,
+                                                   None, None,
+                                                   validation_config)
+    self._assert_equal_anomalies(anomalies, expected_anomalies)
+
   def test_validate_stats_internal_with_previous_version_stats(self):
     statistics = text_format.Parse(
         """