[protobuf] Protocol Buffers using Lzip

Jacob Rief Tue, 08 Dec 2009 09:06:34 -0800

Hello Brian, hello Kenton, hello list,
as an alternative to GzipInputStream and GzipOutputStream, I have
written a compression stream class and a decompression stream class that
can be stacked onto Protocol Buffers streams. They are named
LzipInputStream and LzipOutputStream and use the Lempel-Ziv-Markov
chain algorithm, as implemented by LZIP:
http://www.nongnu.org/lzip/lzip.html


An advantage of using Lzip instead of Gzip is that Lzip supports
multi-member compression, so one can jump into the stream at any
position, skip forward to the next synchronization boundary, and start
reading from there.
With the default compression level, Lzip achieves a better compression
ratio at the cost of being slower than Gzip; when Lzip is used
with a low compression level, its speed and output size are
comparable to those of Gzip.

I would like to donate these classes to the ProtoBuf software
repository. They will be released under an OSS license compatible with
LZIP's and Google's. Could someone please check them and tell me in what
kind of repository I can publish them? Google's license agreement
contains a passage stating: "Neither the name of Google Inc. nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission."
Since I have to use the name "google" in the C++ namespace of
LzipIn/OutputStream, I hereby ask for permission to do so.

Comments are appreciated,
Jacob

--

You received this message because you are subscribed to the Google Groups 
"Protocol Buffers" group.
To post to this group, send email to [email protected].
To unsubscribe from this group, send email to 
[email protected].
For more options, visit this group at 
http://groups.google.com/group/protobuf?hl=en.

// This file contains the implementation of the classes
// LzipInputStream and LzipOutputStream, used to compress
// and decompress Google's Protocol Buffer streams using
// the Lempel-Ziv-Markov chain algorithm (LZMA).
//
// Derived from
// http://protobuf.googlecode.com/svn/tags/2.2.0/src/google/protobuf/io/gzip_stream.cc
// Copyright 2009 by Jacob Rief <[email protected]>
// Evaluation copy - don't use in production code

#include <lzip_stream.h>
#include <google/protobuf/stubs/common.h>

namespace google {
namespace protobuf {
namespace io {

// Size in bytes of the internal buffers allocated by the constructors in this
// file: the decompressed-output buffer of LzipInputStream and the
// uncompressed-input buffer of LzipOutputStream.
static const int kDefaultBufferSize = 8192;

// === LzipInputStream ===

// Creates a decompressing stream that pulls lzip-compressed bytes from
// |sub_stream|. Allocates the kDefaultBufferSize-byte output buffer and opens
// the lzlib decoder; a GOOGLE_CHECK fails if the decoder reports an error.
LzipInputStream::LzipInputStream(ZeroCopyInputStream* sub_stream)
    : sub_stream_(sub_stream),
      finished_(false),
      output_buffer_length_(kDefaultBufferSize),
      output_buffer_(operator new(output_buffer_length_)),
      output_position_(NULL),
      next_out_(NULL),
      avail_out_(0),
      errno_(LZ_ok) {
  GOOGLE_CHECK(output_buffer_ != NULL);

  // Open the decoder, then record its status before verifying it.
  decoder_ = LZ_decompress_open();
  const LZ_errno open_status = LZ_decompress_errno(decoder_);
  errno_ = open_status;
  GOOGLE_CHECK(errno_ == LZ_ok);
}

// Closes the decoder if it is still open, then frees the output buffer.
LzipInputStream::~LzipInputStream() {
  const bool decoder_open = (decoder_ != NULL);
  if (decoder_open) Close();

  if (output_buffer_ == NULL) return;
  operator delete(output_buffer_);
}

// Records the decoder's last error code in errno_, closes the decoder and
// clears decoder_. Returns true if lzlib reports that closing succeeded.
bool LzipInputStream::Close() {
  errno_ = LZ_decompress_errno(decoder_);
  const int close_result = LZ_decompress_close(decoder_);
  decoder_ = NULL;
  return close_result == LZ_ok;
}

// --- implements ZeroCopyInputStream ---
// Hands out the decompressed bytes between output_position_ and next_out_.
// When that range is empty the output buffer is refilled through Decompress()
// first. Returns false once the sub-stream has been exhausted and the decoder
// reports that it has finished. The returned size is whatever Decompress()
// produced, which may be zero.
bool LzipInputStream::Next(const void** data, int* size) {
  GOOGLE_CHECK_GE(next_out_, output_position_);

  const bool buffer_drained = (output_position_ == next_out_);
  if (buffer_drained) {
    const bool at_end = finished_ && LZ_decompress_finished(decoder_);
    if (at_end) {
      return false;
    }
    // Rewind to the start of the buffer and decode a fresh batch into it.
    next_out_ = static_cast<uint8_t*>(output_buffer_);
    output_position_ = next_out_;
    avail_out_ = output_buffer_length_;
    Decompress();
  }

  uint8_t* const chunk_begin = output_position_;
  uint8_t* const chunk_end = next_out_;
  *data = chunk_begin;
  *size = chunk_end - chunk_begin;
  output_position_ = chunk_end;  // everything up to next_out_ is now consumed
  return true;
}

// Returns the last |count| bytes handed out by Next() to the stream so that
// the following Next() yields them again. A GOOGLE_CHECK fails if |count|
// would move the read position before the start of the output buffer.
void LzipInputStream::BackUp(int count) {
  GOOGLE_CHECK_GE(output_position_-static_cast<uint8_t*>(output_buffer_), count);
  output_position_ = output_position_ - count;
}

// Discards |count| decompressed bytes by repeatedly calling Next().
// Returns true if all |count| bytes were skipped, false if the stream ended
// first. If the last chunk obtained from Next() overshoots, the surplus is
// handed back with BackUp() so the next read starts right after the skipped
// range.
bool LzipInputStream::Skip(int count) {
  const void* data;
  int size = 0;  // stays defined even if the very first Next() fails
  bool ok = Next(&data, &size);
  while (ok && (size < count)) {
    count -= size;
    ok = Next(&data, &size);
  }
  // Only back up after a successful Next(): on failure |size| does not
  // describe a chunk that was actually handed out.
  if (ok && size > count) {
    BackUp(size - count);
  }
  return ok;
}

// Reports the decoder's total decompressed output size.
// NOTE(review): this value comes straight from lzlib and is not adjusted for
// bytes returned via BackUp() -- confirm it matches what callers expect from
// ZeroCopyInputStream::ByteCount().
int64 LzipInputStream::ByteCount() const {
  const int64 total_decompressed = LZ_decompress_total_out_size(decoder_);
  return total_decompressed;
}

// --- private ---
// Performs one decode step: unless the compressed input is already known to
// be exhausted, feeds the next chunk from sub_stream_ to the decoder, then
// reads decompressed bytes into the output buffer at next_out_ (at most
// avail_out_ bytes) and advances next_out_ / avail_out_ accordingly.
// Any decoder error trips a GOOGLE_CHECK.
void LzipInputStream::Decompress() {
  // The caller must have made room in the output buffer.
  GOOGLE_CHECK_GT(avail_out_, 0);
  if (!finished_) {
    int avail_in;
    const void* next_in;
    if (sub_stream_->Next(&next_in, &avail_in)) {
      int bytes_written = LZ_decompress_write(decoder_, static_cast<const 
uint8_t*>(next_in), avail_in);
      errno_ = LZ_decompress_errno(decoder_);
      GOOGLE_CHECK(errno_ == LZ_ok);
      GOOGLE_CHECK_GE(bytes_written, 0);
      // Give back whatever part of the chunk the decoder did not accept.
      sub_stream_->BackUp(avail_in - bytes_written);
    } else {
      // The sub-stream has no more data: tell the decoder that the input is
      // complete and remember not to ask the sub-stream again.
      GOOGLE_CHECK(LZ_decompress_finish(decoder_) == LZ_ok);
      finished_ = true;
    }
  }
  // Drain decompressed bytes from the decoder into the output buffer.
  int bytes_read = LZ_decompress_read(decoder_, next_out_, avail_out_);
  errno_ = LZ_decompress_errno(decoder_);
  GOOGLE_CHECK(errno_ == LZ_ok);
  GOOGLE_CHECK_GE(bytes_read, 0);
  next_out_ += bytes_read;
  avail_out_ -= bytes_read;
}

// === LzipOutputStream ===

// Creates a compressing stream that writes lzip data to |sub_stream|.
// |compression_level| must be in 1..9 and selects an entry of the options
// table (index compression_level - 1); |member_size| is forwarded to
// LZ_compress_open() as the member size limit. A GOOGLE_CHECK fails on an
// out-of-range level or if the encoder reports an error after opening.
LzipOutputStream::LzipOutputStream(ZeroCopyOutputStream* sub_stream,
                                   size_t compression_level,
                                   int64_t member_size)
    : input_buffer_length_(kDefaultBufferSize),
      input_buffer_(operator new(input_buffer_length_)),
      input_position_(static_cast<uint8_t*>(input_buffer_)),
      input_buffer_end_(input_position_ + input_buffer_length_),
      sub_stream_(sub_stream),
      finished_(false),
      member_size_(member_size) {
  GOOGLE_CHECK(input_buffer_ != NULL);

  // Convert the 1-based level into a 0-based index into the options table.
  GOOGLE_CHECK_GT(compression_level, 0);
  compression_level--;
  GOOGLE_CHECK_LT(compression_level, sizeof(options)/sizeof(Options));
  const Options& preset = options[compression_level];

  encoder_ = LZ_compress_open(preset.dictionary_size,
                              preset.match_len_limit,
                              member_size);
  errno_ = LZ_compress_errno(encoder_);
  GOOGLE_CHECK(errno_ == LZ_ok);
}

// Calls Close() if the encoder is still open, then frees the input buffer.
LzipOutputStream::~LzipOutputStream() {
  const bool encoder_open = (encoder_ != NULL);
  if (encoder_open) Close();

  if (input_buffer_ == NULL) return;
  operator delete(input_buffer_);
}

bool LzipOutputStream::Flush() {
  Compress(true);
  input_position_ = static_cast<uint8_t*>(input_buffer_);
  return true;
}

// Compresses any remaining buffered input, finishes the lzip stream, writes
// the trailing compressed bytes to the sub-stream and releases the encoder.
// Returns true only if everything was written and the encoder closed cleanly.
// Returns false if the stream was already closed, if the sub-stream had
// failed before, or if it fails while the remaining data is written. The
// encoder is released on every path, so a failed stream does not leak it.
bool LzipOutputStream::Close() {
  if (encoder_ == NULL)
    return false;  // already closed; nothing left to release

  bool ok = !finished_;
  if (ok) {
    Compress();
    GOOGLE_CHECK(LZ_compress_finish(encoder_) == LZ_ok);
    do {
      int avail_out;
      void* next_out;
      if (!sub_stream_->Next(&next_out, &avail_out)) {
        // disk full?
        ok = false;
        break;
      }
      int bytes_read = LZ_compress_read(encoder_,
                                        static_cast<uint8_t*>(next_out),
                                        avail_out);
      errno_ = LZ_compress_errno(encoder_);
      GOOGLE_CHECK(errno_ == LZ_ok);
      GOOGLE_CHECK_GE(bytes_read, 0);
      // Return the unused tail of the sub-stream's buffer.
      sub_stream_->BackUp(avail_out - bytes_read);
    } while (!LZ_compress_finished(encoder_));
  }

  // Always release the encoder, even after a write failure.
  if (LZ_compress_close(encoder_) != LZ_ok)
    ok = false;
  encoder_ = NULL;
  return ok;
}

// --- implements ZeroCopyOutputStream ---
// Hands out writable space in the input buffer. If part of the buffer is
// still unused, that remainder is returned; otherwise the full buffer is
// compressed via Compress() and handed out again from its start. Returns
// false when the buffer is full and an earlier sub-stream failure was
// recorded in finished_.
bool LzipOutputStream::Next(void** data, int* size) {
  GOOGLE_CHECK_LE(input_position_, input_buffer_end_);

  if (input_position_ != input_buffer_end_) {
    // Some room is left: expose the unused tail of the buffer.
    *data = input_position_;
    *size = input_buffer_end_ - input_position_;
  } else {
    if (finished_) {
      return false;
    }
    // Buffer is full: push its contents through the encoder and reuse it.
    Compress();
    *data = input_buffer_;
    *size = input_buffer_length_;
  }

  // The caller now owns everything up to the end of the buffer.
  input_position_ = input_buffer_end_;
  return true;
}

void LzipOutputStream::BackUp(int count) {
  GOOGLE_CHECK_LE(input_buffer_length_ - count, input_position_ - 
static_cast<uint8_t*>(input_buffer_));
  input_position_ -= count;
}

// Reports the encoder's total input size.
// NOTE(review): bytes still sitting in the input buffer (handed out by Next()
// but not yet passed to the encoder) are not included here -- confirm this
// matches what callers expect from ZeroCopyOutputStream::ByteCount().
int64 LzipOutputStream::ByteCount() const {
  const int64 total_uncompressed = LZ_compress_total_in_size(encoder_);
  return total_uncompressed;
}

// --- private ---
// Feeds the buffered input [input_buffer_, input_position_) to the encoder
// and copies the compressed bytes it produces to the sub-stream, looping
// until the encoder neither accepts input nor yields output any more.
// If |flush| is true, a sync flush is requested once all buffered input has
// been accepted by the encoder, so that the flush covers the whole buffer.
// A finished member is restarted with member_size_. If the sub-stream refuses
// to provide space, finished_ is set and the loop stops.
// The caller is responsible for resetting input_position_ afterwards.
void LzipOutputStream::Compress(bool flush) {
  uint8_t* next_in = static_cast<uint8_t*>(input_buffer_);
  int avail_in = input_position_ - next_in;
  int bytes_written, bytes_read;
  do {
    bytes_read = 0;  // defined even if the sub-stream fails below
    bytes_written = LZ_compress_write(encoder_, next_in, avail_in);
    errno_ = LZ_compress_errno(encoder_);
    GOOGLE_CHECK(errno_ == LZ_ok);
    GOOGLE_CHECK_GE(bytes_written, 0);
    next_in += bytes_written;
    avail_in -= bytes_written;
    // Request the flush only after the encoder has taken all the input;
    // flushing after a partial write would leave the rest unflushed.
    if (flush && avail_in == 0) {
      GOOGLE_CHECK(LZ_compress_sync_flush(encoder_) == LZ_ok);
      flush = false;
    }
    int avail_out;
    void* next_out;
    if (!sub_stream_->Next(&next_out, &avail_out)) {
      // disk full?
      finished_ = true;
      break;  // no place to put compressed data; stop instead of spinning
    }
    bytes_read = LZ_compress_read(encoder_, static_cast<uint8_t*>(next_out),
                                  avail_out);
    errno_ = LZ_compress_errno(encoder_);
    GOOGLE_CHECK(errno_ == LZ_ok);
    GOOGLE_CHECK_GE(bytes_read, 0);
    if (LZ_compress_member_finished(encoder_)==1) {
      LZ_compress_restart_member(encoder_, member_size_);
    }
    // Return the unused tail of the sub-stream's buffer.
    sub_stream_->BackUp(avail_out - bytes_read);
  } while (bytes_written>0 || bytes_read>0);
}

// Encoder presets, indexed by (compression_level - 1). Each entry is
// { dictionary_size in bytes, match_len_limit }; the trailing comment gives
// the corresponding level flag.
const LzipOutputStream::Options LzipOutputStream::options[9] = {
    {  1 * 1024 * 1024,  10 },  // -1
    {  1 * 1024 * 1024,  12 },  // -2
    {  1 * 1024 * 1024,  17 },  // -3
    {  2 * 1024 * 1024,  26 },  // -4
    {  4 * 1024 * 1024,  44 },  // -5
    {  8 * 1024 * 1024,  80 },  // -6
    { 16 * 1024 * 1024, 108 },  // -7
    { 16 * 1024 * 1024, 163 },  // -8
    { 32 * 1024 * 1024, 273 }   // -9
};

}  // namespace io
}  // namespace protobuf
}  // namespace google

// This file contains the declaration of the classes
// LzipInputStream and LzipOutputStream, used to compress
// and decompress Google's Protocol Buffer streams using
// the Lempel-Ziv-Markov chain algorithm (LZMA).
//
// Derived from
// http://protobuf.googlecode.com/svn/tags/2.2.0/src/google/protobuf/io/gzip_stream.h
// Copyright 2009 by Jacob Rief <[email protected]>
// Evaluation copy - don't use in production code

#ifndef GOOGLE_PROTOBUF_IO_LZIP_STREAM_H__
#define GOOGLE_PROTOBUF_IO_LZIP_STREAM_H__

#include <stdint.h>
#include <lzlib.h>
#include <limits>
#include <google/protobuf/io/zero_copy_stream.h>

namespace google {
namespace protobuf {
namespace io {

// A ZeroCopyInputStream that reads compressed data through lzlib.
// Compressed bytes are pulled from another ZeroCopyInputStream and the
// decompressed bytes are served from an internal output buffer.
class LIBPROTOBUF_EXPORT LzipInputStream : public ZeroCopyInputStream {
 public:
  // |sub_stream| supplies the compressed data. The pointer is stored as-is.
  explicit LzipInputStream(ZeroCopyInputStream* sub_stream);

  // Closes the decoder if it is still open and frees the output buffer.
  virtual ~LzipInputStream();

  // Releases the decoder. Returns true if lzlib reports success.
  bool Close();

  // --- implements ZeroCopyInputStream ---
  bool Next(const void** data, int* size);
  void BackUp(int count);
  bool Skip(int count);
  int64 ByteCount() const;

 private:
  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(LzipInputStream);

  // Feeds compressed input to the decoder and refills the output buffer.
  void Decompress();

  // compressed input stream
  ZeroCopyInputStream* sub_stream_;
  // Set once sub_stream_ has reported that it has no more data.
  bool finished_;

  // plain text output stream
  // NOTE: the declaration order below is also the initialization order used
  // by the constructor (output_buffer_ is sized from output_buffer_length_).
  const int output_buffer_length_;  // size of output_buffer_ in bytes
  void* const output_buffer_;       // buffer holding decompressed bytes
  uint8_t* output_position_;        // first byte not yet handed out by Next()
  uint8_t* next_out_;               // one past the last decompressed byte
  int avail_out_;                   // free bytes remaining after next_out_

  // Lzip decoder
  void* decoder_;   // lzlib decoder handle; NULL after Close()
  LZ_errno errno_;  // last error code read from the decoder
};

// A ZeroCopyOutputStream that compresses data through lzlib.
// Uncompressed bytes are collected in an internal input buffer, run through
// the encoder, and the compressed result is written to another
// ZeroCopyOutputStream.
class LIBPROTOBUF_EXPORT LzipOutputStream : public ZeroCopyOutputStream {
 public:
  // Create a LzipOutputStream with default options.
  // |compression_level| must be in 1..9 and selects one of the presets in
  // |options|. |member_size| is the member size limit passed to the encoder.
  explicit LzipOutputStream(
      ZeroCopyOutputStream* sub_stream,
      size_t compression_level = 5,
      int64_t member_size = std::numeric_limits<long long>::max());

  // Calls Close() if the encoder is still open and frees the input buffer.
  virtual ~LzipOutputStream();

  // Flushes data written so far to zipped data in the underlying stream.
  // It is the caller's responsibility to flush the underlying stream if
  // necessary.
  // Compression may be less efficient stopping and starting around flushes.
  // Returns true if no error.
  bool Flush();

  // Writes out all data and closes the lzip stream.
  // It is the caller's responsibility to close the underlying stream if
  // necessary.
  // Returns true if no error.
  bool Close();

  // --- implements ZeroCopyOutputStream ---
  bool Next(void** data, int* size);
  void BackUp(int count);
  int64 ByteCount() const;

 private:
  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(LzipOutputStream);

  // Passes the buffered input to the encoder and writes the compressed
  // output to sub_stream_; requests a sync flush when |flush| is true.
  void Compress(bool flush = false);

  // plain text input stream
  // NOTE: the declaration order below is also the initialization order used
  // by the constructor (each member is derived from the ones before it).
  const int input_buffer_length_;    // size of input_buffer_ in bytes
  void* const input_buffer_;         // buffer collecting uncompressed bytes
  uint8_t* input_position_;          // end of the bytes handed out by Next()
  uint8_t* const input_buffer_end_;  // one past the end of input_buffer_

  // compressed output stream
  ZeroCopyOutputStream* sub_stream_;
  // Set when sub_stream_ refused to provide space for compressed output.
  bool finished_;

  // Lzip encoder
  struct Options {
    int dictionary_size; // 4KiB..512MiB
    int match_len_limit; // 5..273
  };
  static const Options options[9];

  void* encoder_;  // lzlib encoder handle; NULL after Close()
  // Member size limit used when a finished member is restarted. Stored with
  // the same 64-bit width as the constructor argument so that large values
  // (including the default) are not truncated.
  int64_t member_size_;
  LZ_errno errno_;  // last error code read from the encoder
};

}  // namespace io
}  // namespace protobuf
}  // namespace google

#endif  // GOOGLE_PROTOBUF_IO_LZIP_STREAM_H__

[protobuf] Protocol Buffers using Lzip

Reply via email to