wgtmac commented on code in PR #1026: URL: https://github.com/apache/parquet-mr/pull/1026#discussion_r1108635269
########## parquet-hadoop/src/main/java/org/apache/parquet/hadoop/rewrite/RewriteOptions.java: ########## @@ -101,37 +103,121 @@ public static class Builder { private List<String> encryptColumns; private FileEncryptionProperties fileEncryptionProperties; + /** + * Create a builder to create a RewriterOptions. + * + * @param conf configuration for reading from input files and writing to output file + * @param inputFile input file path to read from + * @param outputFile output file path to rewrite to + */ public Builder(Configuration conf, Path inputFile, Path outputFile) { this.conf = conf; this.inputFiles = Arrays.asList(inputFile); this.outputFile = outputFile; } + /** + * Create a builder to create a RewriterOptions. + * <p> + * Please note that if merging more than one file, the schema of all files must be the same. + * Otherwise, the rewrite will fail. + * <p> + * The rewrite will keep original row groups from all input files. This may not be optimal Review Comment: I have added a comment to elaborate on the small row group problem. Please check, @gszadovszky -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@parquet.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org