[ https://issues.apache.org/jira/browse/PARQUET-2366?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17777534#comment-17777534 ]
ASF GitHub Bot commented on PARQUET-2366: ----------------------------------------- wgtmac commented on code in PR #1174: URL: https://github.com/apache/parquet-mr/pull/1174#discussion_r1366360839 ########## parquet-hadoop/src/main/java/org/apache/parquet/hadoop/PrefetchIndexCache.java: ########## @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.hadoop; + +import org.apache.parquet.Preconditions; +import org.apache.parquet.column.values.bloomfilter.BloomFilter; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +/** + * This index cache will prefetch those columns' indexes when calling {@link #setBlockMetadata(BlockMetaData)}. Review Comment: ```suggestion * This index cache will prefetch indexes of all columns when calling {@link #setBlockMetadata(BlockMetaData)}. 
``` ########## parquet-hadoop/src/main/java/org/apache/parquet/hadoop/PrefetchIndexCache.java: ########## @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.hadoop; + +import org.apache.parquet.Preconditions; +import org.apache.parquet.column.values.bloomfilter.BloomFilter; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +/** + * This index cache will prefetch those columns' indexes when calling {@link #setBlockMetadata(BlockMetaData)}. + * <p> + * + * Note: the given index will be freed from the cache after calling the related get method. 
+ */ +class PrefetchIndexCache implements IndexCache { + private final ParquetFileReader fileReader; + private final Set<ColumnPath> columns; + + private Map<ColumnPath, ColumnIndex> columnIndexCache; + private Map<ColumnPath, OffsetIndex> offsetIndexCache; + private Map<ColumnPath, BloomFilter> bloomIndexCache; + + PrefetchIndexCache( + ParquetFileReader fileReader, + Set<ColumnPath> columns) { + this.fileReader = fileReader; + this.columns = columns; + } + + @Override + public void setBlockMetadata(BlockMetaData currentBlockMetadata) throws IOException { + this.columnIndexCache = readAllColumnIndexes(currentBlockMetadata); + this.offsetIndexCache = readAllOffsetIndexes(currentBlockMetadata); + this.bloomIndexCache = readAllBloomFilters(currentBlockMetadata); + } + + @Override + public ColumnIndex getColumnIndex(ColumnChunkMetaData chunk) throws IOException { + ColumnPath columnPath = chunk.getPath(); + if (columns.contains(columnPath)) { + Preconditions.checkState( + columnIndexCache.containsKey(columnPath), + "Not found cached ColumnIndex for column: %s with cache strategy: %s", + columnPath.toDotString(), + CacheStrategy.PRECACHE_BLOCK); + } + + return columnIndexCache.remove(columnPath); Review Comment: This does not support repeated read, we should either document them in the comment or make it configurable. In some cases (e.g. filter pushdown), page index may be repeated accessed. ########## parquet-hadoop/src/main/java/org/apache/parquet/hadoop/IndexCache.java: ########## @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.hadoop; + +import org.apache.parquet.column.values.bloomfilter.BloomFilter; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; + +import java.io.IOException; +import java.util.Set; + +/** + * A cache for caching indexes(including: ColumnIndex, OffsetIndex and BloomFilter) + */ +public interface IndexCache { + + enum CacheStrategy { + NONE, /* No cache */ + PRECACHE_BLOCK /* Precache for block indexes */ + } + + /** + * Create an index cache for the given file reader + * + * @param fileReader the file reader + * @param columns the columns that need to do cache + * @param cacheStrategy the cache strategy, supports NONE and PRECACHE_BLOCK + * @return the index cache + */ + static IndexCache create( + ParquetFileReader fileReader, + Set<ColumnPath> columns, + CacheStrategy cacheStrategy) { + if (cacheStrategy == CacheStrategy.NONE) { + return new NoneIndexCache(fileReader); + } else if (cacheStrategy == CacheStrategy.PRECACHE_BLOCK) { + return new PrefetchIndexCache(fileReader, columns); + } else { + throw new UnsupportedOperationException("Unknown cache strategy: " + cacheStrategy); + } + } + + /** + * Set the current BlockMetadata Review Comment: Will calling this invalidate all previously cached items? 
> Optimize random seek during rewriting > ------------------------------------- > > Key: PARQUET-2366 > URL: https://issues.apache.org/jira/browse/PARQUET-2366 > Project: Parquet > Issue Type: Improvement > Reporter: Xianyang Liu > Priority: Major > > The `ColumnIndex`, `OffsetIndex`, and `BloomFilter` are stored at the end of > the file. We need to randomly seek 4 times when rewriting a column chunk. We > found this could impact the rewrite performance heavily for files with a > large number of columns (~1000). In this PR, we read the `ColumnIndex`, > `OffsetIndex`, and `BloomFilter` into a cache to avoid the random seek. We > saw about a 60-fold performance improvement in production environments for > files with about one thousand columns. -- This message was sent by Atlassian Jira (v8.20.10#820010)