Github user xndai commented on a diff in the pull request:

    https://github.com/apache/orc/pull/122#discussion_r117822601
  
    --- Diff: c++/src/Compression.cc ---
    @@ -636,6 +884,33 @@ DIAGNOSTIC_POP
         return static_cast<uint64_t>(result);
       }
     
    +  std::unique_ptr<BufferedOutputStream>
    +     createCompressor(
    +                      CompressionKind kind,
    +                      OutputStream * outStream,
    +                      CompressionStrategy strategy,
    +                      uint64_t bufferCapacity,
    +                      uint64_t blockSize,
    +                      MemoryPool& pool) {
    +    switch (static_cast<int64_t>(kind)) {
    +    case CompressionKind_NONE: {
    +      return std::unique_ptr<BufferedOutputStream>
    +        (new BufferedOutputStream(pool, outStream, bufferCapacity, 
blockSize));
    +    }
    +    case CompressionKind_ZLIB: {
    +      int level = (strategy == CompressionStrategy_SPEED) ? -1 : 9;
    --- End diff --
    
    According to https://orc.apache.org/docs/hive-config.html, there are 
only two compression strategies defined: SPEED and COMPRESSION. I also checked 
the Java implementation: SPEED maps to zlib level Z_BEST_SPEED + 1, and 
COMPRESSION maps to Z_DEFAULT_COMPRESSION. I will do the same for C++.
    
    Java implementation for your reference -
    
    WriterImpl.java
    
    `
        CompressionCodec result = physicalWriter.getCompressionCodec();
        if (result != null) {
          switch (kind) {
            case BLOOM_FILTER:
            case DATA:
            case DICTIONARY_DATA:
            case BLOOM_FILTER_UTF8:
              if (compressionStrategy == OrcFile.CompressionStrategy.SPEED) {
                result = 
result.modify(EnumSet.of(CompressionCodec.Modifier.FAST,
                    CompressionCodec.Modifier.TEXT));
              } else {
                result = 
result.modify(EnumSet.of(CompressionCodec.Modifier.DEFAULT,
                    CompressionCodec.Modifier.TEXT));
              }
              break;
            case LENGTH:
            case DICTIONARY_COUNT:
            case PRESENT:
            case ROW_INDEX:
            case SECONDARY:
              // easily compressed using the fastest modes
              result = 
result.modify(EnumSet.of(CompressionCodec.Modifier.FASTEST,
                  CompressionCodec.Modifier.BINARY));
              break;
            default:
              LOG.info("Missing ORC compression modifiers for " + kind);
              break;
          }
        }
    
    `
    
    ZlibCodec.java
    
    `
    public CompressionCodec modify(/* @Nullable */ EnumSet<Modifier> modifiers) 
{
    
        if (modifiers == null) {
          return this;
        }
    
        int l = this.level;
        int s = this.strategy;
    
        for (Modifier m : modifiers) {
          switch (m) {
          case BINARY:
            /* filtered == less LZ77, more huffman */
            s = Deflater.FILTERED;
            break;
          case TEXT:
            s = Deflater.DEFAULT_STRATEGY;
            break;
          case FASTEST:
            // deflate_fast looking for 8 byte patterns
            l = Deflater.BEST_SPEED;
            break;
          case FAST:
            // deflate_fast looking for 16 byte patterns
            l = Deflater.BEST_SPEED + 1;
            break;
          case DEFAULT:
            // deflate_slow looking for 128 byte patterns
            l = Deflater.DEFAULT_COMPRESSION;
            break;
          default:
            break;
          }
        }
        return new ZlibCodec(l, s);
      }
    `


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes to, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

Reply via email to