wgtmac commented on code in PR #3036:
URL: https://github.com/apache/parquet-java/pull/3036#discussion_r1832063981
##########
parquet-hadoop/src/main/java/org/apache/parquet/hadoop/rewrite/RewriteOptions.java:
##########
@@ -192,6 +198,10 @@ public Map<String, MaskMode> getMaskColumns() {
return maskColumns;
}
+ public Map<String, String> gerRenameColumns() {
Review Comment:
```suggestion
public Map<String, String> getRenameColumns() {
```
##########
parquet-hadoop/src/main/java/org/apache/parquet/hadoop/rewrite/ParquetRewriter.java:
##########
@@ -149,27 +149,29 @@ public class ParquetRewriter implements Closeable {
private final IndexCache.CacheStrategy indexCacheStrategy;
private final boolean overwriteInputWithJoinColumns;
private final InternalFileEncryptor nullColumnEncryptor;
+ private final Map<String, String> renamedColumns;
public ParquetRewriter(RewriteOptions options) throws IOException {
this.newCodecName = options.getNewCodecName();
this.indexCacheStrategy = options.getIndexCacheStrategy();
this.overwriteInputWithJoinColumns =
options.getOverwriteInputWithJoinColumns();
+ this.renamedColumns = options.gerRenameColumns();
Review Comment:
```suggestion
this.renamedColumns = options.getRenameColumns();
```
##########
parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java:
##########
@@ -504,6 +504,29 @@ public boolean hasDictionaryPage() {
public boolean isEncrypted() {
return false;
}
+
+ /**
+ * Copies this ColumnChunkMetaData with path and type changed to provided
ones.
+ *
+ * @param path a new ColumnPath of a chunk
+ * @param type a new PrimitiveType of a chunk
+ * @return resulting chunk
+ */
+ public ColumnChunkMetaData copy(ColumnPath path, PrimitiveType type) {
Review Comment:
This method still looks weird since it changes some metadata after copying.
If we do not have a clean solution, I'm inclined to go back to the original
solution, which simply uses the constructor, and to add a comment saying that
we need to keep it up to date.
##########
parquet-hadoop/src/main/java/org/apache/parquet/hadoop/rewrite/ParquetRewriter.java:
##########
@@ -149,27 +149,29 @@ public class ParquetRewriter implements Closeable {
private final IndexCache.CacheStrategy indexCacheStrategy;
private final boolean overwriteInputWithJoinColumns;
private final InternalFileEncryptor nullColumnEncryptor;
+ private final Map<String, String> renamedColumns;
public ParquetRewriter(RewriteOptions options) throws IOException {
this.newCodecName = options.getNewCodecName();
this.indexCacheStrategy = options.getIndexCacheStrategy();
this.overwriteInputWithJoinColumns =
options.getOverwriteInputWithJoinColumns();
+ this.renamedColumns = options.gerRenameColumns();
ParquetConfiguration conf = options.getParquetConfiguration();
- OutputFile out = options.getParquetOutputFile();
- inputFiles.addAll(getFileReaders(options.getParquetInputFiles(), conf));
-
inputFilesToJoin.addAll(getFileReaders(options.getParquetInputFilesToJoin(),
conf));
+ this.inputFiles.addAll(getFileReaders(options.getParquetInputFiles(),
conf));
+
this.inputFilesToJoin.addAll(getFileReaders(options.getParquetInputFilesToJoin(),
conf));
+ this.outSchema = pruneColumnsInSchema(getSchema(),
options.getPruneColumns());
+ this.extraMetaData = getExtraMetadata(options);
ensureSameSchema(inputFiles);
ensureSameSchema(inputFilesToJoin);
ensureRowCount();
+ ensureRenamingCorrectness(outSchema, renamedColumns);
+ OutputFile out = options.getParquetOutputFile();
LOG.info(
"Start rewriting {} input file(s) {} to {}",
inputFiles.size() + inputFilesToJoin.size(),
Stream.concat(options.getParquetInputFiles().stream(),
options.getParquetInputFilesToJoin().stream())
.collect(Collectors.toList()),
- out);
-
- this.outSchema = pruneColumnsInSchema(getSchema(),
options.getPruneColumns());
- this.extraMetaData = getExtraMetadata(options);
+ options.getParquetOutputFile());
Review Comment:
```suggestion
out);
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]