Add documentation for custom data validation.

caveness · tfx-copybara · commit 4e7ae8cff493 · 2022-11-29T14:24:12.000-08:00
PiperOrigin-RevId: 491737047
diff --git a/g3doc/custom_data_validation.md b/g3doc/custom_data_validation.md
@@ -0,0 +1,47 @@
+# Custom Data Validation
+
+<!--*
+freshness: { owner: 'caveness' reviewed: '2022-11-29' }
+*-->
+
+TFDV supports custom data validation using SQL. You can run custom data
+validation using
+[validate_statistics](https://github.com/tensorflow/data-validation/blob/master/tensorflow_data_validation/api/validation_api.py)
+or
+[custom_validate_statistics](https://github.com/tensorflow/data-validation/blob/master/tensorflow_data_validation/api/validation_api.py).
+Use `validate_statistics` to run standard, schema-based data validation along
+with custom validation. Use `custom_validate_statistics` to run only custom
+validation.
+
+## Configuring Custom Data Validation
+
+Use the
+[CustomValidationConfig](https://github.com/tensorflow/data-validation/blob/master/tensorflow_data_validation/anomalies/proto/custom_validation_config.proto)
+to define custom validations to run. For each validation, provide an
+SQL expression, which returns a boolean value. Each SQL expression is run
+against the summary statistics for the specified feature. If the expression
+returns false, TFDV generates a custom anomaly using the provided severity and
+anomaly description.
+
+You may configure custom validations that run against individual features or
+feature pairs. For each feature, specify both the dataset (i.e., slice) and the
+feature path to use, though you may leave the dataset name blank if you want to
+validate the default slice (i.e., all examples). For single feature validations,
+the feature statistics are bound to `feature`. For feature pair validations, the
+test feature statistics are bound to `feature_test` and the base feature
+statistics are bound to `feature_base`. See the `CustomValidationConfig` proto
+documentation linked at the end of this section for example queries.
+
+If a custom validation triggers an anomaly, TFDV will return an Anomalies proto
+with the reason(s) for the anomaly. Each reason will have a short description,
+which is user configured, and a description with the query that caused the
+anomaly, the dataset names on which the query was run, and the base feature path
+(if running a feature-pair validation). See the `CustomValidationConfig` proto
+documentation linked at the end of this section for example results of custom
+validation.
+
+See the
+[documentation](https://github.com/tensorflow/data-validation/blob/master/tensorflow_data_validation/anomalies/proto/custom_validation_config.proto)
+in the `CustomValidationConfig` proto for example
+configurations.
+
+
diff --git a/tensorflow_data_validation/anomalies/proto/custom_validation_config.proto b/tensorflow_data_validation/anomalies/proto/custom_validation_config.proto
@@ -20,6 +20,97 @@ package tensorflow.data_validation;
 import "tensorflow_metadata/proto/v0/anomalies.proto";
 import "tensorflow_metadata/proto/v0/path.proto";
 
+// Use this proto to configure custom validations in TFDV.
+// Example usages follow.
+// -----------------------------------------------------------------------------
+// Example Single-Feature Validation
+// Statistics
+  // datasets {
+  //   name: "All Examples"
+  //   num_examples: 10
+  //   features {
+  //     path { step: 'test_feature' }
+  //     type: INT
+  //     num_stats { num_zeros: 5 max: 25 }
+  //   }
+  // }
+// CustomValidationConfig
+  // feature_validations {
+  //   feature_path { step: 'test_feature' }
+  //   validations {
+  //     sql_expression: 'feature.num_stats.num_zeros < 3'
+  //     severity: ERROR
+  //     description: 'Feature has too many zeros.'
+  //   }
+  //   validations {
+  //     sql_expression: 'feature.num_stats.max > 10'
+  //     severity: ERROR
+  //     description: 'Maximum value is too low.'
+  //   }
+  // }
+// Anomalies
+  // anomaly_info {
+  //   key: 'test_feature'
+  //   value: {
+  //     path { step: 'test_feature' }
+  //     severity: ERROR
+  //     reason {
+  //       type: CUSTOM_VALIDATION
+  //       short_description: 'Feature has too many zeros.'
+  //       description: 'Custom validation triggered anomaly. Query: feature.num_stats.num_zeros < 3 Test dataset: default slice'
+  //     }
+  //   }
+  // }
+// -----------------------------------------------------------------------------
+// Example Feature Pair Validation
+// Statistics
+// Test statistics
+  // datasets {
+  //   name: "slice_1"
+  //   num_examples: 10
+  //   features {
+  //     path { step: 'test_feature' }
+  //     type: INT
+  //     num_stats { num_zeros: 5 max: 25 }
+  //   }
+  // }
+// Base statistics
+  // datasets {
+  //   name: "slice_2"
+  //   num_examples: 10
+  //   features {
+  //     path { step: 'test_feature' }
+  //     type: INT
+  //     num_stats { num_zeros: 1 max: 1 }
+  //   }
+  // }
+// CustomValidationConfig
+  // feature_pair_validations {
+  //   dataset_name: 'slice_1'
+  //   feature_test_path { step: 'test_feature' }
+  //   base_dataset_name: 'slice_2'
+  //   feature_base_path { step: 'test_feature' }
+  //   validations {
+  //     sql_expression: 'feature_test.num_stats.num_zeros < feature_base.num_stats.num_zeros'
+  //     severity: ERROR
+  //     description: 'Test feature has too many zeros.'
+  //   }
+  // }
+// Anomalies
+  // anomaly_info {
+  //   key: 'test_feature'
+  //   value: {
+  //     path { step: 'test_feature' }
+  //     severity: ERROR
+  //     reason {
+  //       type: CUSTOM_VALIDATION
+  //       short_description: 'Test feature has too many zeros.'
+  //       description: 'Custom validation triggered anomaly. Query: feature_test.num_stats.num_zeros < feature_base.num_stats.num_zeros Test dataset: slice_1 Base dataset: slice_2 Base path: test_feature'
+  //     }
+  //   }
+  // }
+// =============================================================================
+
 message Validation {
   // Expression to evaluate. If the expression returns false, the anomaly is
   // returned.