[
https://issues.apache.org/jira/browse/ORC-21?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14997020#comment-14997020
]
ASF GitHub Bot commented on ORC-21:
-----------------------------------
Github user omalley commented on a diff in the pull request:
https://github.com/apache/orc/pull/12#discussion_r44307077
--- Diff: c++/src/Reader.cc ---
@@ -1364,6 +1365,111 @@ namespace orc {
int64_t getEpochOffset() const override;
};
+ uint64_t maxStreamsForType(const proto::Type& type) { // Returns a fixed per-kind stream count (0-4); going by the name, the most streams one column of this type can have.
+ switch (static_cast<int64_t>(type.kind())) { // Switches on the kind as a plain int64_t; presumably to sidestep enum-switch diagnostics -- TODO confirm.
+ case proto::Type_Kind_STRUCT:
+ return 1; // STRUCT is counted as a single stream.
+ case proto::Type_Kind_INT:
+ case proto::Type_Kind_LONG:
+ case proto::Type_Kind_SHORT:
+ case proto::Type_Kind_FLOAT:
+ case proto::Type_Kind_DOUBLE:
+ case proto::Type_Kind_BOOLEAN:
+ case proto::Type_Kind_BYTE:
+ case proto::Type_Kind_DATE:
+ case proto::Type_Kind_LIST:
+ case proto::Type_Kind_MAP:
+ case proto::Type_Kind_UNION:
+ return 2; // Numeric, boolean, date, list, map and union kinds are counted as two streams.
+ case proto::Type_Kind_BINARY:
+ case proto::Type_Kind_DECIMAL:
+ case proto::Type_Kind_TIMESTAMP:
+ return 3; // BINARY, DECIMAL and TIMESTAMP are counted as three streams.
+ case proto::Type_Kind_CHAR:
+ case proto::Type_Kind_STRING:
+ case proto::Type_Kind_VARCHAR:
+ return 4; // CHAR, STRING and VARCHAR are counted as four streams, the largest value returned.
+ default:
+ return 0; // Any kind not listed above contributes no streams to the caller's total.
+ }
+ }
+
+ uint64_t ReaderImpl::getMemoryUse(int stripeIx) {
+ uint64_t maxDataLength = 0;
+
+ if (stripeIx >= 0 && stripeIx < footer->stripes_size()) {
+ uint64_t stripe = footer->stripes(stripeIx).datalength();
+ if (maxDataLength < stripe) {
+ maxDataLength = stripe;
+ }
+ } else {
+ for (int i=0; i < footer->stripes_size(); i++) {
+ uint64_t stripe = footer->stripes(i).datalength();
+ if (maxDataLength < stripe) {
+ maxDataLength = stripe;
+ }
+ }
+ }
+
+ bool hasStringColumn = false;
+ uint64_t nSelectedStreams = 0;
+ for (int i=0; !hasStringColumn && i < footer->types_size(); i++) {
+ if (selectedColumns[i]) {
+ const proto::Type& type = footer->types(i);
+ nSelectedStreams += maxStreamsForType(type) ;
+ switch (static_cast<int64_t>(type.kind())) {
+ case proto::Type_Kind_CHAR:
+ case proto::Type_Kind_STRING:
+ case proto::Type_Kind_VARCHAR:
+ case proto::Type_Kind_BINARY: {
+ hasStringColumn = true;
+ break;
+ }
+ default: {
+ break;
+ }
+ }
+ }
+ }
+
+ /* If a string column is read, use stripe datalength as a memory estimate
+ * because we don't know the dictionary size. Multiply by 2 because
+ * a string column requires two buffers:
+ * in the input stream and in the seekable input stream.
+ * If no string column is read, estimate from the number of streams.
+ */
+ uint64_t memory = hasStringColumn ? 2 * maxDataLength :
+ std::min(uint64_t(maxDataLength),
+ nSelectedStreams * stream->getNaturalReadSize());
+
+ // Do we need even more memory to read the footer or the metadata?
+ if (memory < postscript->footerlength() + DIRECTORY_SIZE_GUESS) {
+ memory = postscript->footerlength() + DIRECTORY_SIZE_GUESS;
+ }
+ if (memory < postscript->metadatalength()) {
+ memory = postscript->metadatalength();
+ }
+
+ // Account for firstRowOfStripe.
+ memory += firstRowOfStripe.capacity() * sizeof(uint64_t);
+
+ // Decompressors need buffers for each stream
+ uint64_t decompressorMemory = 0;
+ if (compression != CompressionKind_NONE) {
+ for (int i=0; i < footer->types_size(); i++) {
+ if (selectedColumns[i]) {
--- End diff --
You need the static_cast to size_t here too.
> Add functionality to estimate memory footprint
> ----------------------------------------------
>
> Key: ORC-21
> URL: https://issues.apache.org/jira/browse/ORC-21
> Project: Orc
> Issue Type: Task
> Reporter: Aliaksei Sandryhaila
> Assignee: Aliaksei Sandryhaila
>
> The ORC library allocates multiple large buffers to read and materialize ORC
> files. For the stability of applications that use the library, it may be
> desirable to have an estimate (preferably a tight upper bound) of its memory
> footprint.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)