wangxianghu commented on a change in pull request #4987: URL: https://github.com/apache/hudi/pull/4987#discussion_r826611663
########## File path: hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/processor/maxwell/MaxwellJsonKafkaSourcePostProcessor.java ########## @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utilities.sources.processor.maxwell; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.DateTimeUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.utilities.exception.HoodieSourcePostProcessException; +import org.apache.hudi.utilities.sources.processor.JsonKafkaSourcePostProcessor; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaRDD; + +import java.util.Locale; +import java.util.Objects; +import java.util.regex.Pattern; + +import static org.apache.hudi.utilities.sources.processor.maxwell.PreCombineFieldType.DATE_STRING; +import static 
org.apache.hudi.utilities.sources.processor.maxwell.PreCombineFieldType.EPOCHMILLISECONDS; +import static org.apache.hudi.utilities.sources.processor.maxwell.PreCombineFieldType.NON_TIMESTAMP; +import static org.apache.hudi.utilities.sources.processor.maxwell.PreCombineFieldType.UNIX_TIMESTAMP; +import static org.apache.hudi.utilities.sources.processor.maxwell.PreCombineFieldType.valueOf; + +/** + * A {@link JsonKafkaSourcePostProcessor} that helps extract fresh data from a Maxwell JSON string and tags the record as + * a delete or not. + */ +public class MaxwellJsonKafkaSourcePostProcessor extends JsonKafkaSourcePostProcessor { + + private static final Logger LOG = LogManager.getLogger(MaxwellJsonKafkaSourcePostProcessor.class); + + private static final ObjectMapper MAPPER = new ObjectMapper(); + + public MaxwellJsonKafkaSourcePostProcessor(TypedProperties props) { + super(props); + } + + // ------------------------------------------------------------------------ + // Partial fields in maxwell json string + // ------------------------------------------------------------------------ + + private static final String DATABASE = "database"; + private static final String TABLE = "table"; + private static final String DATA = "data"; + private static final String OPERATION_TYPE = "type"; + private static final String TS = "ts"; + + // ------------------------------------------------------------------------ + // Operation types + // ------------------------------------------------------------------------ + + private static final String INSERT = "insert"; + private static final String UPDATE = "update"; + private static final String DELETE = "delete"; + + /** + * Configs to be passed for this processor. 
+ */ + public static class Config { + public static final ConfigProperty<String> DATABASE_NAME_REGEX_PROP = ConfigProperty + .key("hoodie.deltastreamer.source.json.kafka.post.processor.maxwell.database.regex") + .noDefaultValue() + .withDocumentation("Database name regex."); + + public static final ConfigProperty<String> TABLE_NAME_REGEX_PROP = ConfigProperty + .key("hoodie.deltastreamer.source.json.kafka.post.processor.maxwell.table.regex") + .noDefaultValue() + .withDocumentation("Table name regex."); + + public static final ConfigProperty<String> PRECOMBINE_FIELD_TYPE_PROP = ConfigProperty + .key("hoodie.deltastreamer.source.json.kafka.post.processor.maxwell.precombine.field.type") + .defaultValue("DATA_STRING") + .withDocumentation("Data type of the preCombine field. could be NON_TIMESTAMP, DATE_STRING," + + "UNIX_TIMESTAMP or EPOCHMILLISECONDS. DATA_STRING by default "); + + public static final ConfigProperty<String> PRECOMBINE_FIELD_FORMAT_PROP = ConfigProperty + .key("hoodie.deltastreamer.source.json.kafka.post.processor.maxwell.precombine.field.format") + .defaultValue("yyyy-MM-dd HH:mm:ss") + .withDocumentation("When the preCombine filed is in DATE_STRING format, use should tell hoodie" + + "what format it is. 'yyyy-MM-dd HH:mm:ss' by default"); + } + + @Override + public JavaRDD<String> process(JavaRDD<String> maxwellJsonRecords) { + return maxwellJsonRecords.map(record -> { + JsonNode inputJson = MAPPER.readTree(record); + String database = inputJson.get(DATABASE).textValue(); + String table = inputJson.get(TABLE).textValue(); + + // filter out target databases and tables + if (isTargetTable(database, table)) { + + LOG.info(String.format("Maxwell source processor starts process table : %s.%s", database, table)); + + ObjectNode result = (ObjectNode) inputJson.get(DATA); + String type = inputJson.get(OPERATION_TYPE).textValue(); + + // insert or update + if (INSERT.equals(type) || UPDATE.equals(type)) { + // tag this record not delete. 
+ result.put(HoodieRecord.HOODIE_IS_DELETED, false); + return result.toString(); + + // delete + } else if (DELETE.equals(type)) { + // tag this record as delete. + result.put(HoodieRecord.HOODIE_IS_DELETED, true); Review comment: > Can delete logic be put into a method? done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
