alexeykudinkin commented on code in PR #5208:
URL: https://github.com/apache/hudi/pull/5208#discussion_r844105251
##########
hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java:
##########
@@ -44,97 +33,117 @@
import org.apache.hadoop.hbase.io.hfile.HFileInfo;
import org.apache.hadoop.hbase.io.hfile.HFileScanner;
import org.apache.hadoop.hbase.nio.ByteBuff;
-import org.apache.hadoop.hbase.util.Pair;
+import org.apache.hudi.avro.HoodieAvroUtils;
+import org.apache.hudi.common.bloom.BloomFilter;
+import org.apache.hudi.common.bloom.BloomFilterFactory;
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.util.ClosableIterator;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.io.ByteBufferBackedInputStream;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.exception.HoodieIOException;
+import org.apache.hudi.util.LazyRef;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.io.IOException;
-import java.util.ArrayList;
import java.util.Arrays;
-import java.util.HashMap;
+import java.util.Collections;
import java.util.Iterator;
-import java.util.LinkedList;
import java.util.List;
-import java.util.Map;
+import java.util.Objects;
import java.util.Set;
import java.util.TreeSet;
import java.util.stream.Collectors;
+import static org.apache.hudi.common.util.CollectionUtils.toStream;
+import static org.apache.hudi.common.util.ValidationUtils.checkState;
+
+/**
+ * NOTE: PLEASE READ DOCS & COMMENTS CAREFULLY BEFORE MAKING CHANGES
+ * <p>
+ * {@link HoodieFileReader} implementation allowing to read from {@link HFile}.
+ */
public class HoodieHFileReader<R extends IndexedRecord> implements
HoodieFileReader<R> {
- public static final String KEY_FIELD_NAME = "key";
- public static final String KEY_SCHEMA = "schema";
+
+ // TODO HoodieHFileReader right now tightly coupled to MT, we should break
that coupling
+ public static final String SCHEMA_KEY = "schema";
public static final String KEY_BLOOM_FILTER_META_BLOCK = "bloomFilter";
public static final String KEY_BLOOM_FILTER_TYPE_CODE =
"bloomFilterTypeCode";
+
+ public static final String KEY_FIELD_NAME = "key";
public static final String KEY_MIN_RECORD = "minRecordKey";
public static final String KEY_MAX_RECORD = "maxRecordKey";
private static final Logger LOG =
LogManager.getLogger(HoodieHFileReader.class);
- private Path path;
- private Configuration conf;
- private HFile.Reader reader;
- private FSDataInputStream fsDataInputStream;
- private Schema schema;
- // Scanner used to read individual keys. This is cached to prevent the
overhead of opening the scanner for each
- // key retrieval.
- private HFileScanner keyScanner;
-
- public HoodieHFileReader(Configuration configuration, Path path, CacheConfig
cacheConfig) throws IOException {
- this.conf = configuration;
- this.path = path;
- this.reader =
HoodieHFileUtils.createHFileReader(FSUtils.getFs(path.toString(),
configuration), path, cacheConfig, conf);
+ private final Path path;
+
+ private final LazyRef<Schema> schema;
+
+ // NOTE: Reader is ONLY THREAD-SAFE for {@code Scanner} operating in
Positional Read ("pread")
+ // mode (ie created w/ "pread = true")
+ private final HFile.Reader reader;
+ // NOTE: Scanner caches read blocks, therefore it's important to re-use
scanner
Review Comment:
I'm not sure I understand your point. Do you mean who is using
cachedScanner within the HFileReader itself, or who uses those flow APIs in turn?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]