derrickaw commented on code in PR #35952: URL: https://github.com/apache/beam/pull/35952#discussion_r2322416027
########## sdks/python/apache_beam/yaml/yaml_transform.py: ########## @@ -522,6 +548,241 @@ def expand_leaf_transform(spec, scope): f'{type(outputs)}') +def expand_output_schema_transform(spec, outputs, error_handling_spec): + """Applies a `Validate` transform to the output of another transform. + + This function is called when an `output_schema` is defined on a transform. + It wraps the original transform's output(s) with a `Validate` transform + to ensure the data conforms to the specified schema. + + If the original transform has error handling configured, validation errors + will be routed to the specified error output. If not, validation failures + will cause the pipeline to fail. + + Args: + spec (dict): The `output_schema` specification from the YAML config. + outputs (beam.PCollection or dict[str, beam.PCollection]): The output(s) + from the transform to be validated. + error_handling_spec (dict): The `error_handling` configuration from the + original transform. + + Returns: + The validated PCollection(s). If error handling is enabled, this will be a + dictionary containing the 'good' output and any error outputs. + + Raises: + ValueError: If `error_handling` is incorrectly specified within the + `output_schema` spec itself, or if the main output of a multi-output + transform cannot be determined. + """ + if 'error_handling' in spec: + raise ValueError( + 'error_handling config is not supported directly in ' + 'the output_schema. Please use error_handling config in ' + 'the transform.') + + # Strip metadata such as __line__ and __uuid__ as these will interfere with + # the validation downstream. + clean_schema = SafeLineLoader.strip_metadata(spec) + + # If no error handling is specified for the main transform, warn the user + # that the pipeline may fail if any output data fails the output schema + # validation. + if not error_handling_spec: + _LOGGER.warning("Output_schema config is attached to a transform that has "\ + "no error_handling config specified. Any failures validating on output" \ + "schema will fail the pipeline unless the user specifies an" \ + "error_handling config on a capable transform or the user can remove the" \ + "output_schema config on this transform and add a ValidateWithSchema " \ + "transform downstream of the current transform.") Review Comment: This is inaccurate. I think the `_` throws it off. Ignoring. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@beam.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org