[
https://issues.apache.org/jira/browse/TAJO-2073?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15143959#comment-15143959
]
ASF GitHub Bot commented on TAJO-2073:
--------------------------------------
Github user jihoonson commented on a diff in the pull request:
https://github.com/apache/tajo/pull/958#discussion_r52702202
--- Diff:
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/parquet/InternalParquetRecordReader.java
---
@@ -70,37 +81,50 @@
private long totalCountLoadedSoFar = 0;
private Path file;
+ private UnmaterializableRecordCounter unmaterializableRecordCounter;
+
+ /**
+ * @param readSupport Object which helps reads files of the given type,
e.g. Thrift, Avro.
+ * @param filter for filtering individual records
+ */
+ public InternalParquetRecordReader(ReadSupport<T> readSupport, Filter
filter) {
+ this.readSupport = readSupport;
+ this.filter = checkNotNull(filter, "filter");
+ }
/**
* @param readSupport Object which helps reads files of the given type,
e.g. Thrift, Avro.
*/
public InternalParquetRecordReader(ReadSupport<T> readSupport) {
- this(readSupport, null);
+ this(readSupport, FilterCompat.NOOP);
}
/**
* @param readSupport Object which helps reads files of the given type,
e.g. Thrift, Avro.
* @param filter Optional filter for only returning matching records.
+ * @deprecated use {@link #InternalParquetRecordReader(ReadSupport,
Filter)}
*/
- public InternalParquetRecordReader(ReadSupport<T> readSupport,
UnboundRecordFilter
- filter) {
- this.readSupport = readSupport;
- this.recordFilter = filter;
+ @Deprecated
+ public InternalParquetRecordReader(ReadSupport<T> readSupport,
UnboundRecordFilter filter) {
+ this(readSupport, FilterCompat.get(filter));
}
private void checkRead() throws IOException {
if (current == totalCountLoadedSoFar) {
if (current != 0) {
- long timeAssembling = System.currentTimeMillis() -
startedAssemblingCurrentBlockAt;
- totalTimeSpentProcessingRecords += timeAssembling;
- if (DEBUG) LOG.debug("Assembled and processed " +
totalCountLoadedSoFar + " records from " + columnCount + " columns in " +
totalTimeSpentProcessingRecords + " ms: " + ((float) totalCountLoadedSoFar /
totalTimeSpentProcessingRecords) + " rec/ms, " + ((float) totalCountLoadedSoFar
* columnCount / totalTimeSpentProcessingRecords) + " cell/ms");
- long totalTime = totalTimeSpentProcessingRecords +
totalTimeSpentReadingBytes;
- long percentReading = 100 * totalTimeSpentReadingBytes / totalTime;
- long percentProcessing = 100 * totalTimeSpentProcessingRecords /
totalTime;
- if (DEBUG) LOG.debug("time spent so far " + percentReading + "%
reading ("+totalTimeSpentReadingBytes+" ms) and " + percentProcessing + "%
processing ("+totalTimeSpentProcessingRecords+" ms)");
+ totalTimeSpentProcessingRecords += (System.currentTimeMillis() -
startedAssemblingCurrentBlockAt);
+ if (Log.INFO) {
--- End diff --
Even though these logs seem to be printed whenever a row group is fully
read, I'm concerned that there will be too many logs.
> Upgrade parquet-mr to 1.8.1
> ---------------------------
>
> Key: TAJO-2073
> URL: https://issues.apache.org/jira/browse/TAJO-2073
> Project: Tajo
> Issue Type: Improvement
> Components: Storage
> Affects Versions: 0.11.1
> Reporter: Jinho Kim
> Assignee: Jinho Kim
>
> There have been lots of changes since parquet's graduation.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)