Hisoka-X commented on code in PR #9445: URL: https://github.com/apache/seatunnel/pull/9445#discussion_r2155997406
########## seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/validator/udf/DataValidatorUDF.java: ########## @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.transform.validator.udf; + +import org.apache.seatunnel.api.table.type.SeaTunnelDataType; +import org.apache.seatunnel.transform.validator.ValidationContext; +import org.apache.seatunnel.transform.validator.ValidationResult; + +import java.io.Serializable; + +public interface DataValidatorUDF extends Serializable { Review Comment: any docs for udf and email? ########## docs/en/transform-v2/data-validator.md: ########## @@ -0,0 +1,200 @@ +# DataValidator + +> Data validation transform plugin + +## Description + +The DataValidator transform validates field values according to configured rules and handles validation failures based on the specified error handling strategy. It supports multiple validation rule types including null checks, range validation, length validation, and regex pattern matching. 
+ +## Options + +| name | type | required | default value | +|-----------------|--------|----------|---------------| +| error_handle_way| enum | no | FAIL | +| error_table | string | no | | +| field_rules | array | yes | | + +### error_handle_way [enum] + +Error handling strategy when validation fails: +- `FAIL`: Fail the entire task when validation errors occur +- `SKIP`: Skip invalid rows and continue processing +- `ROUTE_TO_TABLE`: Route invalid data to a specified error table + +**Note**: `ROUTE_TO_TABLE` mode only works with sinks that support multiple tables. The sink must be capable of handling data routed to different table destinations. + +### error_table [string] + +Target table name for routing invalid data when `error_handle_way` is set to `ROUTE_TO_TABLE`. This parameter is required when using `ROUTE_TO_TABLE` mode. + +### field_rules [array] + +Array of field validation rules. Each rule defines validation criteria for a specific field. + +#### Field Rule Structure + +Each field rule contains: +- `field_name`: Name of the field to validate +- `rules`: Array of validation rules to apply (nested format), or individual rule properties (flat format) + +#### Validation Rule Types + +##### NOT_NULL +Validates that a field value is not null. + +Parameters: +- `rule_type`: "NOT_NULL" +- `custom_message` (optional): Custom error message + +##### RANGE +Validates that a numeric value is within a specified range. + +Parameters: +- `rule_type`: "RANGE" +- `min_value` (optional): Minimum allowed value +- `max_value` (optional): Maximum allowed value +- `min_inclusive` (optional): Whether minimum value is inclusive (default: true) +- `max_inclusive` (optional): Whether maximum value is inclusive (default: true) +- `custom_message` (optional): Custom error message + +##### LENGTH +Validates the length of string, array, or collection values. 
+ +Parameters: +- `rule_type`: "LENGTH" +- `min_length` (optional): Minimum allowed length +- `max_length` (optional): Maximum allowed length +- `exact_length` (optional): Exact required length +- `custom_message` (optional): Custom error message + +##### REGEX +Validates that a string value matches a regular expression pattern. + +Parameters: +- `rule_type`: "REGEX" +- `pattern`: Regular expression pattern (required) +- `case_sensitive` (optional): Whether pattern matching is case sensitive (default: true) +- `custom_message` (optional): Custom error message + +### common options [string] + +Transform plugin common parameters, please refer to [Transform Plugin](common-options.md) for details + +## Examples + +### Example 1: Basic Validation with FAIL Mode + +```hocon +transform { + DataValidator { + plugin_input = "source_table" + plugin_output = "validated_table" + error_handle_way = "FAIL" + field_rules = [ + { + field_name = "name" + rule_type = "NOT_NULL" + }, + { + field_name = "age" + rule_type = "RANGE" + min_value = 0 + max_value = 150 + }, + { + field_name = "email" + rule_type = "REGEX" + pattern = "^[\\w-\\.]+@([\\w-]+\\.)+[\\w-]{2,4}$" + } + ] + } +} +``` + +### Example 2: Validation with SKIP Mode + +```hocon +transform { + DataValidator { + plugin_input = "source_table" + plugin_output = "validated_table" + error_handle_way = "SKIP" + field_rules = [ + { + field_name = "name" + rule_type = "NOT_NULL" + }, + { + field_name = "name" + rule_type = "LENGTH" + min_length = 2 + max_length = 50 + } + ] + } +} +``` + +### Example 3: Validation with ROUTE_TO_TABLE Mode + +```hocon +transform { + DataValidator { + plugin_input = "source_table" + plugin_output = "validated_table" + error_handle_way = "ROUTE_TO_TABLE" + error_table = "error_data" + field_rules = [ + { + field_name = "name" + rule_type = "NOT_NULL" + }, + { + field_name = "age" + rule_type = "RANGE" + min_value = 0 + max_value = 150 + } + ] + } +} +``` + +**Note**: When using `ROUTE_TO_TABLE`, 
ensure your sink connector supports multiple tables. Valid data will be sent to the main output table, while invalid data will be routed to the specified error table. + +### Example 4: Nested Rules Format + +```hocon +transform { + DataValidator { + plugin_input = "source_table" + plugin_output = "validated_table" + error_handle_way = "FAIL" + field_rules = [ + { + field_name = "name" + rules = [ + { + rule_type = "NOT_NULL" + custom_message = "Name is required" + }, + { + rule_type = "LENGTH" + min_length = 2 + max_length = 50 + custom_message = "Name must be between 2 and 50 characters" + } + ] + } + ] + } +} +``` + +## Changelog + +### new version +- Add DataValidator Transform Connector +- Support NOT_NULL, RANGE, LENGTH, and REGEX validation rules +- Support FAIL, SKIP, and ROUTE_TO_TABLE error handling modes +- Support both flat and nested rule configuration formats Review Comment: We don't need changelog for transform now. It is hard to maintain. ########## seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/validator/DataValidatorTransformConfig.java: ########## @@ -0,0 +1,422 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.seatunnel.transform.validator; + +import org.apache.seatunnel.shade.com.fasterxml.jackson.annotation.JsonAlias; +import org.apache.seatunnel.shade.com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import org.apache.seatunnel.shade.com.fasterxml.jackson.core.type.TypeReference; +import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.seatunnel.shade.com.google.common.collect.Lists; + +import org.apache.seatunnel.api.configuration.Option; +import org.apache.seatunnel.api.configuration.Options; +import org.apache.seatunnel.api.configuration.ReadonlyConfig; +import org.apache.seatunnel.transform.validator.rule.LengthValidationRule; +import org.apache.seatunnel.transform.validator.rule.NotNullValidationRule; +import org.apache.seatunnel.transform.validator.rule.RangeValidationRule; +import org.apache.seatunnel.transform.validator.rule.RegexValidationRule; +import org.apache.seatunnel.transform.validator.rule.UDFValidationRule; +import org.apache.seatunnel.transform.validator.rule.ValidationRule; + +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; +import lombok.extern.slf4j.Slf4j; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +@Data +@NoArgsConstructor +@AllArgsConstructor +@Slf4j +public class DataValidatorTransformConfig implements Serializable { + + public static final Option<ValidationErrorHandleWay> ERROR_HANDLE_WAY = + Options.key("error_handle_way") + .enumType(ValidationErrorHandleWay.class) + .defaultValue(ValidationErrorHandleWay.FAIL) + .withDescription( + "Error handling strategy: FAIL - fail the task, SKIP - skip invalid data, ROUTE_TO_TABLE - route to specified table"); Review Comment: Please reuse 
https://github.com/apache/seatunnel/blob/2f96f2e46c8fe4cc527924d02340bfb991ed3f6d/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/common/TransformCommonOptions.java#L50-L68 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: users@infra.apache.org
