[ https://issues.apache.org/jira/browse/PARQUET-2372?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17782894#comment-17782894 ]
ASF GitHub Bot commented on PARQUET-2372: ----------------------------------------- wgtmac commented on code in PR #1183: URL: https://github.com/apache/parquet-mr/pull/1183#discussion_r1382413034 ########## parquet-hadoop/src/main/java/org/apache/parquet/hadoop/rewrite/ParquetRewriter.java: ########## @@ -265,14 +265,9 @@ public void processBlocks() throws IOException { } private void processBlocksFromReader(IndexCache indexCache) throws IOException { - PageReadStore store = reader.readNextRowGroup(); - ColumnReadStoreImpl crStore = new ColumnReadStoreImpl(store, new DummyGroupConverter(), schema, originalCreatedBy); - - int blockId = 0; - while (store != null) { - writer.startBlock(store.getRowCount()); - + for (int blockId = 0; blockId < meta.getBlocks().size(); blockId ++) { BlockMetaData blockMetaData = meta.getBlocks().get(blockId); + writer.startBlock(blockMetaData.getRowCount()); Review Comment: Makes sense! > Avoid unnecessary reading of RowGroup data during rewriting > ----------------------------------------------------------- > > Key: PARQUET-2372 > URL: https://issues.apache.org/jira/browse/PARQUET-2372 > Project: Parquet > Issue Type: Improvement > Reporter: Xianyang Liu > Priority: Major > > This patch aims to avoid unnecessarily reading RowGroup data during > rewriting. -- This message was sent by Atlassian Jira (v8.20.10#820010)