[ 
https://issues.apache.org/jira/browse/ORC-192?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16020025#comment-16020025
 ] 

ASF GitHub Bot commented on ORC-192:
------------------------------------

Github user xndai commented on a diff in the pull request:

    https://github.com/apache/orc/pull/122#discussion_r117822601
  
    --- Diff: c++/src/Compression.cc ---
    @@ -636,6 +884,33 @@ DIAGNOSTIC_POP
         return static_cast<uint64_t>(result);
       }
     
    +  std::unique_ptr<BufferedOutputStream>
    +     createCompressor(
    +                      CompressionKind kind,
    +                      OutputStream * outStream,
    +                      CompressionStrategy strategy,
    +                      uint64_t bufferCapacity,
    +                      uint64_t blockSize,
    +                      MemoryPool& pool) {
    +    switch (static_cast<int64_t>(kind)) {
    +    case CompressionKind_NONE: {
    +      return std::unique_ptr<BufferedOutputStream>
    +        (new BufferedOutputStream(pool, outStream, bufferCapacity, 
blockSize));
    +    }
    +    case CompressionKind_ZLIB: {
    +      int level = (strategy == CompressionStrategy_SPEED) ? -1 : 9;
    --- End diff --
    
    According to https://orc.apache.org/docs/hive-config.html, there are 
only two compression strategies defined: SPEED and COMPRESSION. I also checked 
the Java implementation: SPEED maps to zlib level Z_BEST_SPEED + 1, and 
COMPRESSION maps to Z_DEFAULT_COMPRESSION. I will do the same for C++.
    
    The Java implementation, for your reference:
    
    WriterImpl.java
    
    `
        CompressionCodec result = physicalWriter.getCompressionCodec();
        if (result != null) {
          switch (kind) {
            case BLOOM_FILTER:
            case DATA:
            case DICTIONARY_DATA:
            case BLOOM_FILTER_UTF8:
              if (compressionStrategy == OrcFile.CompressionStrategy.SPEED) {
                result = 
result.modify(EnumSet.of(CompressionCodec.Modifier.FAST,
                    CompressionCodec.Modifier.TEXT));
              } else {
                result = 
result.modify(EnumSet.of(CompressionCodec.Modifier.DEFAULT,
                    CompressionCodec.Modifier.TEXT));
              }
              break;
            case LENGTH:
            case DICTIONARY_COUNT:
            case PRESENT:
            case ROW_INDEX:
            case SECONDARY:
              // easily compressed using the fastest modes
              result = 
result.modify(EnumSet.of(CompressionCodec.Modifier.FASTEST,
                  CompressionCodec.Modifier.BINARY));
              break;
            default:
              LOG.info("Missing ORC compression modifiers for " + kind);
              break;
          }
        }
    
    `
    
    ZlibCodec.java
    
    `
    public CompressionCodec modify(/* @Nullable */ EnumSet<Modifier> modifiers) 
{
    
        if (modifiers == null) {
          return this;
        }
    
        int l = this.level;
        int s = this.strategy;
    
        for (Modifier m : modifiers) {
          switch (m) {
          case BINARY:
            /* filtered == less LZ77, more huffman */
            s = Deflater.FILTERED;
            break;
          case TEXT:
            s = Deflater.DEFAULT_STRATEGY;
            break;
          case FASTEST:
            // deflate_fast looking for 8 byte patterns
            l = Deflater.BEST_SPEED;
            break;
          case FAST:
            // deflate_fast looking for 16 byte patterns
            l = Deflater.BEST_SPEED + 1;
            break;
          case DEFAULT:
            // deflate_slow looking for 128 byte patterns
            l = Deflater.DEFAULT_COMPRESSION;
            break;
          default:
            break;
          }
        }
        return new ZlibCodec(l, s);
      }
    `


> Zlib compression stream
> -----------------------
>
>                 Key: ORC-192
>                 URL: https://issues.apache.org/jira/browse/ORC-192
>             Project: ORC
>          Issue Type: Sub-task
>          Components: C++
>            Reporter: Xiening Dai
>            Assignee: Xiening Dai
>




--
This message was sent by Atlassian JIRA
(v6.3.15#6346)

Reply via email to