[jira] [Commented] (ORC-21) Add functionality to estimate memory footprint

ASF GitHub Bot (JIRA) Mon, 09 Nov 2015 10:05:16 -0800

    [ 
https://issues.apache.org/jira/browse/ORC-21?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14997020#comment-14997020
 ]


ASF GitHub Bot commented on ORC-21:
-----------------------------------

Github user omalley commented on a diff in the pull request:

    https://github.com/apache/orc/pull/12#discussion_r44307077
  
    --- Diff: c++/src/Reader.cc ---
    @@ -1364,6 +1365,111 @@ namespace orc {
         int64_t getEpochOffset() const override;
       };
     
    +  uint64_t maxStreamsForType(const proto::Type& type) {
    +    switch (static_cast<int64_t>(type.kind())) {
    +      case proto::Type_Kind_STRUCT:
    +        return 1;
    +      case proto::Type_Kind_INT:
    +      case proto::Type_Kind_LONG:
    +      case proto::Type_Kind_SHORT:
    +      case proto::Type_Kind_FLOAT:
    +      case proto::Type_Kind_DOUBLE:
    +      case proto::Type_Kind_BOOLEAN:
    +      case proto::Type_Kind_BYTE:
    +      case proto::Type_Kind_DATE:
    +      case proto::Type_Kind_LIST:
    +      case proto::Type_Kind_MAP:
    +      case proto::Type_Kind_UNION:
    +        return 2;
    +      case proto::Type_Kind_BINARY:
    +      case proto::Type_Kind_DECIMAL:
    +      case proto::Type_Kind_TIMESTAMP:
    +        return 3;
    +      case proto::Type_Kind_CHAR:
    +      case proto::Type_Kind_STRING:
    +      case proto::Type_Kind_VARCHAR:
    +        return 4;
    +      default:
    +          return 0;
    +      }
    +  }
    +
    +  uint64_t ReaderImpl::getMemoryUse(int stripeIx) {
    +    uint64_t maxDataLength = 0;
    +
    +    if (stripeIx >= 0 && stripeIx < footer->stripes_size()) {
    +      uint64_t stripe = footer->stripes(stripeIx).datalength();
    +      if (maxDataLength < stripe) {
    +        maxDataLength = stripe;
    +      }
    +    } else {
    +      for (int i=0; i < footer->stripes_size(); i++) {
    +        uint64_t stripe = footer->stripes(i).datalength();
    +        if (maxDataLength < stripe) {
    +          maxDataLength = stripe;
    +        }
    +      }
    +    }
    +
    +    bool hasStringColumn = false;
    +    uint64_t nSelectedStreams = 0;
    +    for (int i=0; !hasStringColumn && i < footer->types_size(); i++) {
    +      if (selectedColumns[i]) {
    +        const proto::Type& type = footer->types(i);
    +        nSelectedStreams += maxStreamsForType(type) ;
    +        switch (static_cast<int64_t>(type.kind())) {
    +          case proto::Type_Kind_CHAR:
    +          case proto::Type_Kind_STRING:
    +          case proto::Type_Kind_VARCHAR:
    +          case proto::Type_Kind_BINARY: {
    +            hasStringColumn = true;
    +            break;
    +          }
    +          default: {
    +            break;
    +          }
    +        }
    +      }
    +    }
    +
    +    /* If a string column is read, use stripe datalength as a memory estimate
    +     * because we don't know the dictionary size. Multiply by 2 because
    +     * a string column requires two buffers:
    +     * in the input stream and in the seekable input stream.
    +     * If no string column is read, estimate from the number of streams.
    +     */
    +    uint64_t memory = hasStringColumn ? 2 * maxDataLength :
    +        std::min(uint64_t(maxDataLength),
    +                 nSelectedStreams * stream->getNaturalReadSize());
    +
    +    // Do we need even more memory to read the footer or the metadata?
    +    if (memory < postscript->footerlength() + DIRECTORY_SIZE_GUESS) {
    +      memory =  postscript->footerlength() + DIRECTORY_SIZE_GUESS;
    +    }
    +    if (memory < postscript->metadatalength()) {
    +      memory =  postscript->metadatalength();
    +    }
    +
    +    // Account for firstRowOfStripe.
    +    memory += firstRowOfStripe.capacity() * sizeof(uint64_t);
    +
    +    // Decompressors need buffers for each stream
    +    uint64_t decompressorMemory = 0;
    +    if (compression != CompressionKind_NONE) {
    +      for (int i=0; i < footer->types_size(); i++) {
    +        if (selectedColumns[i]) {
    --- End diff --
    
    you need the static cast to size_t here too.


> Add functionality to estimate memory footprint
> ----------------------------------------------
>
>                 Key: ORC-21
>                 URL: https://issues.apache.org/jira/browse/ORC-21
>             Project: Orc
>          Issue Type: Task
>            Reporter: Aliaksei Sandryhaila
>            Assignee: Aliaksei Sandryhaila
>
> ORC library allocates multiple large buffers to read and materialize ORC 
> files. For stability of applications that use the library, it may be 
> desirable to have an estimate (preferably, a tight upper bound) of a memory 
> footprint.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

[jira] [Commented] (ORC-21) Add functionality to estimate memory footprint

Reply via email to