http://git-wip-us.apache.org/repos/asf/tajo/blob/4561711f/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/ByteBufLineReader.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/ByteBufLineReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/ByteBufLineReader.java
new file mode 100644
index 0000000..2f742c6
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/ByteBufLineReader.java
@@ -0,0 +1,182 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage.text;
+
+import io.netty.buffer.ByteBuf;
+import io.netty.util.CharsetUtil;
+import org.apache.tajo.storage.BufferPool;
+import org.apache.tajo.storage.ByteBufInputChannel;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.concurrent.atomic.AtomicInteger;
+
+public class ByteBufLineReader implements Closeable {
+  private static final int DEFAULT_BUFFER = 64 * 1024;
+
+  private int bufferSize;
+  private long readBytes;
+  private int startIndex;
+  private boolean eof = false;
+  private ByteBuf buffer;
+  private final ByteBufInputChannel channel;
+  private final AtomicInteger lineReadBytes = new AtomicInteger();
+  private final LineSplitProcessor processor = new LineSplitProcessor();
+
+  public ByteBufLineReader(ByteBufInputChannel channel) {
+    this(channel, BufferPool.directBuffer(DEFAULT_BUFFER));
+  }
+
+  public ByteBufLineReader(ByteBufInputChannel channel, ByteBuf buf) {
+    this.readBytes = 0;
+    this.channel = channel;
+    this.buffer = buf;
+    this.bufferSize = buf.capacity();
+  }
+
+  public long readBytes() {
+    return readBytes - buffer.readableBytes();
+  }
+
+  @Override
+  public void close() throws IOException {
+    if (this.buffer.refCnt() > 0) {
+      this.buffer.release();
+    }
+    this.channel.close();
+  }
+
+  public String readLine() throws IOException {
+    ByteBuf buf = readLineBuf(lineReadBytes);
+    if (buf != null) {
+      return buf.toString(CharsetUtil.UTF_8);
+    }
+    return null;
+  }
+
+  private void fillBuffer() throws IOException {
+
+    int tailBytes = 0;
+    if (this.readBytes > 0) {
+      // startIndex = 0, readIndex = tailBytes length, writable = (buffer capacity - tailBytes)
+      this.buffer.markReaderIndex();
+      this.buffer.discardReadBytes();  // compact the buffer
+      tailBytes = this.buffer.writerIndex();
+      if (!this.buffer.isWritable()) {
+        // the line is larger than the buffer, so grow it
+        BufferPool.ensureWritable(buffer, bufferSize * 2);
+        this.bufferSize = buffer.capacity();
+      }
+      this.startIndex = 0;
+    }
+
+    boolean release = true;
+    try {
+      int readBytes = tailBytes;
+      for (; ; ) {
+        int localReadBytes = buffer.writeBytes(channel, this.bufferSize - readBytes);
+        if (localReadBytes < 0) {
+          if (buffer.isWritable()) {
+            // if fewer bytes were read than the buffer capacity, there are no more bytes in the channel
+            eof = true;
+          }
+          break;
+        }
+        readBytes += localReadBytes;
+        if (readBytes == bufferSize) {
+          break;
+        }
+      }
+      this.readBytes += (readBytes - tailBytes);
+      release = false;
+
+      this.buffer.readerIndex(this.buffer.readerIndex() + tailBytes); // skip past the buffered tail
+    } finally {
+      if (release) {
+        buffer.release();
+      }
+    }
+  }
+
+  /**
+   * Read a line terminated by one of CR, LF, or CRLF.
+   */
+  public ByteBuf readLineBuf(AtomicInteger reads) throws IOException {
+    int readBytes = 0;     // newline + text line bytes
+    int newlineLength = 0; // length of the terminating newline
+    int readable;
+
+    this.startIndex = buffer.readerIndex();
+
+    loop:
+    while (true) {
+      readable = buffer.readableBytes();
+      if (readable <= 0) {
+        buffer.readerIndex(this.startIndex);
+        fillBuffer(); // compact and fill the buffer
+
+        // if buffer.writerIndex() is zero, there are no bytes in the buffer
+        if (!buffer.isReadable() && buffer.writerIndex() == 0) {
+          reads.set(0);
+          return null;
+        } else {
+          // skip the first newline
+          if (processor.isPrevCharCR() && buffer.getByte(buffer.readerIndex()) == LineSplitProcessor.LF) {
+            buffer.skipBytes(1);
+            if (eof && !buffer.isReadable()) {
+              reads.set(1);
+              return null;
+            }
+
+            newlineLength++;
+            readBytes++;
+            startIndex = buffer.readerIndex();
+          }
+        }
+        readable = buffer.readableBytes();
+      }
+
+      int endIndex = buffer.forEachByte(buffer.readerIndex(), readable, processor);
+      if (endIndex < 0) {
+        // no terminating newline appeared
+        buffer.readerIndex(buffer.writerIndex()); // move to the end of the buffer
+        if (eof) {
+          readBytes += (buffer.readerIndex() - startIndex);
+          break loop;
+        }
+      } else {
+        buffer.readerIndex(endIndex + 1);
+        readBytes += (buffer.readerIndex() - startIndex); // past the newline + text line
+
+        // a terminating CRLF appeared
+        if (processor.isPrevCharCR() && buffer.isReadable()
+            && buffer.getByte(buffer.readerIndex()) == LineSplitProcessor.LF) {
+          buffer.skipBytes(1);
+          readBytes++;
+          newlineLength += 2;
+        } else {
+          newlineLength += 1;
+        }
+        break loop;
+      }
+    }
+    reads.set(readBytes);
+    return buffer.slice(startIndex, readBytes - newlineLength);
+  }
+}
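For orientation, a minimal usage sketch of this reader (not part of the patch). It assumes only what the code above and DelimitedLineReader below show: ByteBufInputChannel wraps a java.io.InputStream, readLine() returns each line with its CR/LF/CRLF terminator stripped and null at end of stream, and close() releases the pooled direct buffer. The driver class and sample data are illustrative.

import org.apache.tajo.storage.ByteBufInputChannel;
import org.apache.tajo.storage.text.ByteBufLineReader;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

// Hypothetical driver class; not part of the patch.
public class ByteBufLineReaderExample {
  public static void main(String[] args) throws IOException {
    byte[] data = "a|b|c\r\nd|e|f\nlast line without newline".getBytes(StandardCharsets.UTF_8);
    ByteBufLineReader reader =
        new ByteBufLineReader(new ByteBufInputChannel(new ByteArrayInputStream(data)));
    try {
      String line;
      while ((line = reader.readLine()) != null) {
        System.out.println(line); // CR, LF, and CRLF terminators are already stripped
      }
    } finally {
      reader.close(); // releases the internal direct buffer and closes the channel
    }
  }
}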
http://git-wip-us.apache.org/repos/asf/tajo/blob/4561711f/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineDeserializer.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineDeserializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineDeserializer.java
new file mode 100644
index 0000000..1599f62
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineDeserializer.java
@@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage.text;
+
+import io.netty.buffer.ByteBuf;
+import org.apache.tajo.catalog.Schema;
+import org.apache.tajo.catalog.TableMeta;
+import org.apache.tajo.datum.Datum;
+import org.apache.tajo.storage.FieldSerializerDeserializer;
+import org.apache.tajo.storage.Tuple;
+
+import java.io.IOException;
+
+public class CSVLineDeserializer extends TextLineDeserializer {
+  private FieldSplitProcessor processor;
+  private FieldSerializerDeserializer fieldSerDer;
+  private ByteBuf nullChars;
+
+  public CSVLineDeserializer(Schema schema, TableMeta meta, int[] targetColumnIndexes) {
+    super(schema, meta, targetColumnIndexes);
+  }
+
+  @Override
+  public void init() {
+    this.processor = new FieldSplitProcessor(CSVLineSerDe.getFieldDelimiter(meta));
+
+    if (nullChars != null) {
+      nullChars.release();
+    }
+    nullChars = TextLineSerDe.getNullChars(meta);
+
+    fieldSerDer = new TextFieldSerializerDeserializer(meta);
+  }
+
+  public void deserialize(final ByteBuf lineBuf, Tuple output) throws IOException, TextLineParsingError {
+    int[] projection = targetColumnIndexes;
+    if (lineBuf == null || targetColumnIndexes == null || targetColumnIndexes.length == 0) {
+      return;
+    }
+
+    final int rowLength = lineBuf.readableBytes();
+    int start = 0, fieldLength = 0, end = 0;
+
+    // projection
+    int currentTarget = 0;
+    int currentIndex = 0;
+
+    while (end != -1) {
+      end = lineBuf.forEachByte(start, rowLength - start, processor);
+
+      if (end < 0) {
+        fieldLength = rowLength - start;
+      } else {
+        fieldLength = end - start;
+      }
+
+      if (projection.length > currentTarget && currentIndex == projection[currentTarget]) {
+        lineBuf.setIndex(start, start + fieldLength);
+        Datum datum = fieldSerDer.deserialize(lineBuf, schema.getColumn(currentIndex), currentIndex, nullChars);
+        output.put(currentIndex, datum);
+        currentTarget++;
+      }
+
+      if (projection.length == currentTarget) {
+        break;
+      }
+
+      start = end + 1;
+      currentIndex++;
+    }
+  }
+
+  @Override
+  public void release() {
+    if (nullChars != null) {
+      nullChars.release();
+      nullChars = null;
+    }
+  }
+}
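The projection loop above is the core trick: forEachByte() scans for the next field delimiter, and a field is only materialized when its ordinal matches the next entry of the target-column array (which the scanner in DelimitedTextFile, shown later, sorts before use). A standalone sketch of that scan using plain Netty 4.0 APIs; the delimiter, data, and class name are illustrative:

import io.netty.buffer.ByteBuf;
import io.netty.buffer.ByteBufProcessor;
import io.netty.buffer.Unpooled;
import io.netty.util.CharsetUtil;

public class ProjectionScanExample {
  public static void main(String[] args) {
    ByteBuf line = Unpooled.wrappedBuffer("alice|30|seoul|engineer".getBytes(CharsetUtil.UTF_8));
    // Same contract as FieldSplitProcessor: keep scanning while the byte is not the delimiter.
    ByteBufProcessor notDelimiter = value -> value != '|';

    int[] projection = {0, 2}; // materialize only columns 0 and 2
    int rowLength = line.readableBytes();
    int start = 0, end = 0, currentIndex = 0, currentTarget = 0;

    while (end != -1 && currentTarget < projection.length) {
      end = line.forEachByte(start, rowLength - start, notDelimiter);
      int fieldLength = (end < 0) ? rowLength - start : end - start;

      if (currentIndex == projection[currentTarget]) {
        System.out.println("column " + currentIndex + " = "
            + line.toString(start, fieldLength, CharsetUtil.UTF_8));
        currentTarget++;
      }
      start = end + 1; // move past the delimiter
      currentIndex++;
    }
  }
}

Skipped fields cost only the delimiter scan; no bytes are copied or decoded for columns outside the projection.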
http://git-wip-us.apache.org/repos/asf/tajo/blob/4561711f/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineSerDe.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineSerDe.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineSerDe.java
new file mode 100644
index 0000000..2fe7f23
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineSerDe.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage.text;
+
+import org.apache.commons.lang.StringEscapeUtils;
+import org.apache.tajo.catalog.Schema;
+import org.apache.tajo.catalog.TableMeta;
+import org.apache.tajo.storage.StorageConstants;
+
+public class CSVLineSerDe extends TextLineSerDe {
+  @Override
+  public TextLineDeserializer createDeserializer(Schema schema, TableMeta meta, int[] targetColumnIndexes) {
+    return new CSVLineDeserializer(schema, meta, targetColumnIndexes);
+  }
+
+  @Override
+  public TextLineSerializer createSerializer(Schema schema, TableMeta meta) {
+    return new CSVLineSerializer(schema, meta);
+  }
+
+  public static char getFieldDelimiter(TableMeta meta) {
+    return StringEscapeUtils.unescapeJava(meta.getOption(StorageConstants.TEXT_DELIMITER,
+        StorageConstants.DEFAULT_FIELD_DELIMITER)).charAt(0);
+  }
+}

http://git-wip-us.apache.org/repos/asf/tajo/blob/4561711f/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineSerializer.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineSerializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineSerializer.java
new file mode 100644
index 0000000..53a0ef3
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineSerializer.java
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage.text;
+
+import org.apache.tajo.catalog.Schema;
+import org.apache.tajo.catalog.TableMeta;
+import org.apache.tajo.datum.Datum;
+import org.apache.tajo.storage.FieldSerializerDeserializer;
+import org.apache.tajo.storage.Tuple;
+
+import java.io.IOException;
+import java.io.OutputStream;
+
+public class CSVLineSerializer extends TextLineSerializer {
+  private FieldSerializerDeserializer serde;
+
+  private byte[] nullChars;
+  private char delimiter;
+  private int columnNum;
+
+  public CSVLineSerializer(Schema schema, TableMeta meta) {
+    super(schema, meta);
+  }
+
+  @Override
+  public void init() {
+    nullChars = TextLineSerDe.getNullCharsAsBytes(meta);
+    delimiter = CSVLineSerDe.getFieldDelimiter(meta);
+    columnNum = schema.size();
+
+    serde = new TextFieldSerializerDeserializer(meta);
+  }
+
+  @Override
+  public int serialize(OutputStream out, Tuple input) throws IOException {
+    int writtenBytes = 0;
+
+    for (int i = 0; i < columnNum; i++) {
+      Datum datum = input.get(i);
+      writtenBytes += serde.serialize(out, datum, schema.getColumn(i), i, nullChars);
+
+      if (columnNum - 1 > i) {
+        out.write((byte) delimiter);
+        writtenBytes += 1;
+      }
+    }
+
+    return writtenBytes;
+  }
+
+  @Override
+  public void release() {
+  }
+}

http://git-wip-us.apache.org/repos/asf/tajo/blob/4561711f/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/DelimitedLineReader.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/DelimitedLineReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/DelimitedLineReader.java
new file mode 100644
index 0000000..1b433b5
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/DelimitedLineReader.java
@@ -0,0 +1,156 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage.text;
+
+import io.netty.buffer.ByteBuf;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.apache.hadoop.io.compress.Decompressor;
+import org.apache.hadoop.io.compress.SplittableCompressionCodec;
+import org.apache.tajo.common.exception.NotImplementedException;
+import org.apache.tajo.conf.TajoConf;
+import org.apache.tajo.storage.BufferPool;
+import org.apache.tajo.storage.ByteBufInputChannel;
+import org.apache.tajo.storage.FileScanner;
+import org.apache.tajo.storage.compress.CodecPool;
+import org.apache.tajo.storage.fragment.FileFragment;
+
+import java.io.Closeable;
+import java.io.DataInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.concurrent.atomic.AtomicInteger;
+
+public class DelimitedLineReader implements Closeable {
+  private static final Log LOG = LogFactory.getLog(DelimitedLineReader.class);
+  private final static int DEFAULT_PAGE_SIZE = 128 * 1024;
+
+  private FileSystem fs;
+  private FSDataInputStream fis;
+  private InputStream is; // decompressed stream
+  private CompressionCodecFactory factory;
+  private CompressionCodec codec;
+  private Decompressor decompressor;
+
+  private long startOffset, end, pos;
+  private boolean eof = true;
+  private ByteBufLineReader lineReader;
+  private AtomicInteger lineReadBytes = new AtomicInteger();
+  private FileFragment fragment;
+  private Configuration conf;
+
+  public DelimitedLineReader(Configuration conf, final FileFragment fragment) throws IOException {
+    this.fragment = fragment;
+    this.conf = conf;
+    this.factory = new CompressionCodecFactory(conf);
+    this.codec = factory.getCodec(fragment.getPath());
+    if (this.codec instanceof SplittableCompressionCodec) {
+      throw new NotImplementedException(); // bzip2 does not support the multi-threaded model
+    }
+  }
+
+  public void init() throws IOException {
+    if (fs == null) {
+      fs = FileScanner.getFileSystem((TajoConf) conf, fragment.getPath());
+    }
+    if (fis == null) fis = fs.open(fragment.getPath());
+    pos = startOffset = fragment.getStartKey();
+    end = startOffset + fragment.getLength();
+
+    if (codec != null) {
+      decompressor = CodecPool.getDecompressor(codec);
+      is = new DataInputStream(codec.createInputStream(fis, decompressor));
+      ByteBufInputChannel channel = new ByteBufInputChannel(is);
+      lineReader = new ByteBufLineReader(channel, BufferPool.directBuffer(DEFAULT_PAGE_SIZE));
+    } else {
+      fis.seek(startOffset);
+      is = fis;
+
+      ByteBufInputChannel channel = new ByteBufInputChannel(is);
+      lineReader = new ByteBufLineReader(channel,
+          BufferPool.directBuffer((int) Math.min(DEFAULT_PAGE_SIZE, end)));
+    }
+    eof = false;
+  }
+
+  public long getCompressedPosition() throws IOException {
+    long retVal;
+    if (isCompressed()) {
+      retVal = fis.getPos();
+    } else {
+      retVal = pos;
+    }
+    return retVal;
+  }
+
+  public long getUnCompressedPosition() throws IOException {
+    return pos;
+  }
+
+  public long getReadBytes() {
+    return pos - startOffset;
+  }
+
+  public boolean isReadable() {
+    return !eof;
+  }
+
+  public ByteBuf readLine() throws IOException {
+    if (eof) {
+      return null;
+    }
+
+    ByteBuf buf = lineReader.readLineBuf(lineReadBytes);
+    pos += lineReadBytes.get();
+    if (buf == null) {
+      eof = true;
+    }
+
+    if (!isCompressed() && getCompressedPosition() > end) {
+      eof = true;
+    }
+    return buf;
+  }
+
+  public boolean isCompressed() {
+    return codec != null;
+  }
+
+  @Override
+  public void close() throws IOException {
+    try {
+      IOUtils.cleanup(LOG, lineReader, is, fis);
+      fs = null;
+      is = null;
+      fis = null;
+      lineReader = null;
+    } finally {
+      if (decompressor != null) {
+        CodecPool.returnDecompressor(decompressor);
+        decompressor = null;
+      }
+    }
+  }
+}
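A sketch of driving this reader against a local file (not part of the patch). It follows the call sequence used by DelimitedTextFileScanner below: construct, init(), loop over readLine(), close(). The FileFragment(tableName, path, startOffset, length) constructor is assumed from other Tajo code and is not shown in this patch; the path and class name are illustrative.

import io.netty.buffer.ByteBuf;
import io.netty.util.CharsetUtil;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.tajo.conf.TajoConf;
import org.apache.tajo.storage.fragment.FileFragment;
import org.apache.tajo.storage.text.DelimitedLineReader;

import java.io.IOException;

// Hypothetical driver; assumes a FileFragment(tableName, path, start, length) constructor.
public class DelimitedLineReaderExample {
  public static void main(String[] args) throws IOException {
    TajoConf conf = new TajoConf();
    Path path = new Path("file:///tmp/table1/data.csv"); // illustrative path
    long length = FileSystem.getLocal(conf).getFileStatus(path).getLen();

    DelimitedLineReader reader =
        new DelimitedLineReader(conf, new FileFragment("table1", path, 0, length));
    reader.init(); // opens the file and, if a codec matches the extension, a decompressor
    try {
      ByteBuf line;
      while ((line = reader.readLine()) != null) {
        System.out.println(line.toString(CharsetUtil.UTF_8));
      }
    } finally {
      reader.close(); // also returns any pooled decompressor
    }
  }
}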
http://git-wip-us.apache.org/repos/asf/tajo/blob/4561711f/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/DelimitedTextFile.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/DelimitedTextFile.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/DelimitedTextFile.java
new file mode 100644
index 0000000..8824e3e
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/DelimitedTextFile.java
@@ -0,0 +1,481 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage.text;
+
+import io.netty.buffer.ByteBuf;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.apache.hadoop.io.compress.CompressionOutputStream;
+import org.apache.hadoop.io.compress.Compressor;
+import org.apache.tajo.QueryUnitAttemptId;
+import org.apache.tajo.catalog.Schema;
+import org.apache.tajo.catalog.TableMeta;
+import org.apache.tajo.catalog.statistics.TableStats;
+import org.apache.tajo.storage.*;
+import org.apache.tajo.storage.compress.CodecPool;
+import org.apache.tajo.storage.exception.AlreadyExistsStorageException;
+import org.apache.tajo.storage.fragment.FileFragment;
+import org.apache.tajo.storage.fragment.Fragment;
+import org.apache.tajo.storage.rcfile.NonSyncByteArrayOutputStream;
+import org.apache.tajo.util.ReflectionUtil;
+
+import java.io.BufferedOutputStream;
+import java.io.DataOutputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+import static org.apache.tajo.storage.StorageConstants.DEFAULT_TEXT_ERROR_TOLERANCE_MAXNUM;
+import static org.apache.tajo.storage.StorageConstants.TEXT_ERROR_TOLERANCE_MAXNUM;
+
+public class DelimitedTextFile {
+
+  public static final byte LF = '\n';
+
+  private static final Log LOG = LogFactory.getLog(DelimitedTextFile.class);
+
+  /** Caches line serde classes. */
+  private static final Map<String, Class<? extends TextLineSerDe>> serdeClassCache =
+      new ConcurrentHashMap<String, Class<? extends TextLineSerDe>>();
+
+  /**
+   * By default, DelimitedTextFileScanner uses CSVLineSerDe. If a table property 'text.serde.class' is given,
+   * it will use the specified serde class.
+   *
+   * @return TextLineSerDe
+   */
+  public static TextLineSerDe getLineSerde(TableMeta meta) {
+    TextLineSerDe lineSerder;
+
+    String serDeClassName;
+
+    // if no serde class is given, it will use the CSV line serde.
+    serDeClassName = meta.getOption(StorageConstants.TEXT_SERDE_CLASS, StorageConstants.DEFAULT_TEXT_SERDE_CLASS);
+
+    try {
+      Class<? extends TextLineSerDe> serdeClass;
+
+      if (serdeClassCache.containsKey(serDeClassName)) {
+        serdeClass = serdeClassCache.get(serDeClassName);
+      } else {
+        serdeClass = (Class<? extends TextLineSerDe>) Class.forName(serDeClassName);
+        serdeClassCache.put(serDeClassName, serdeClass);
+      }
+      lineSerder = (TextLineSerDe) ReflectionUtil.newInstance(serdeClass);
+    } catch (Throwable e) {
+      throw new RuntimeException("TextLineSerde class cannot be initialized.", e);
+    }
+
+    return lineSerder;
+  }
+
+  public static class DelimitedTextFileAppender extends FileAppender {
+    private final TableMeta meta;
+    private final Schema schema;
+    private final FileSystem fs;
+    private FSDataOutputStream fos;
+    private DataOutputStream outputStream;
+    private CompressionOutputStream deflateFilter;
+    private TableStatistics stats = null;
+    private Compressor compressor;
+    private CompressionCodecFactory codecFactory;
+    private CompressionCodec codec;
+    private Path compressedPath;
+    private byte[] nullChars;
+    private int BUFFER_SIZE = 128 * 1024;
+    private int bufferedBytes = 0;
+    private long pos = 0;
+
+    private NonSyncByteArrayOutputStream os;
+    private TextLineSerializer serializer;
+
+    public DelimitedTextFileAppender(Configuration conf, QueryUnitAttemptId taskAttemptId,
+                                     final Schema schema, final TableMeta meta, final Path path)
+        throws IOException {
+      super(conf, taskAttemptId, schema, meta, path);
+      this.fs = path.getFileSystem(conf);
+      this.meta = meta;
+      this.schema = schema;
+    }
+
+    public TextLineSerDe getLineSerde() {
+      return DelimitedTextFile.getLineSerde(meta);
+    }
+
+    @Override
+    public void init() throws IOException {
+      if (!fs.exists(path.getParent())) {
+        throw new FileNotFoundException(path.toString());
+      }
+
+      if (this.meta.containsOption(StorageConstants.COMPRESSION_CODEC)) {
+        String codecName = this.meta.getOption(StorageConstants.COMPRESSION_CODEC);
+        codecFactory = new CompressionCodecFactory(conf);
+        codec = codecFactory.getCodecByClassName(codecName);
+        compressor = CodecPool.getCompressor(codec);
+        if (compressor != null) compressor.reset(); // the built-in gzip codec yields a null compressor
+
+        String extension = codec.getDefaultExtension();
+        compressedPath = path.suffix(extension);
+
+        if (fs.exists(compressedPath)) {
+          throw new AlreadyExistsStorageException(compressedPath);
+        }
+
+        fos = fs.create(compressedPath);
+        deflateFilter = codec.createOutputStream(fos, compressor);
+        outputStream = new DataOutputStream(deflateFilter);
+
+      } else {
+        if (fs.exists(path)) {
+          throw new AlreadyExistsStorageException(path);
+        }
+        fos = fs.create(path);
+        outputStream = new DataOutputStream(new BufferedOutputStream(fos));
+      }
+
+      if (enabledStats) {
+        this.stats = new TableStatistics(this.schema);
+      }
+
+      serializer = getLineSerde().createSerializer(schema, meta);
+      serializer.init();
+
+      if (os == null) {
+        os = new NonSyncByteArrayOutputStream(BUFFER_SIZE);
+      }
+
+      os.reset();
+      pos = fos.getPos();
+      bufferedBytes = 0;
+      super.init();
+    }
+
+    @Override
+    public void addTuple(Tuple tuple) throws IOException {
+      // write the row
+      int rowBytes = serializer.serialize(os, tuple);
+
+      // new line
+      os.write(LF);
+      rowBytes += 1;
+
+      // update positions
+      pos += rowBytes;
+      bufferedBytes += rowBytes;
+
+      // flush the buffer if necessary
+      if (bufferedBytes > BUFFER_SIZE) {
+        flushBuffer();
+      }
+      // statistics section
+      if (enabledStats) {
+        stats.incrementRow();
+      }
+    }
+
+    private void flushBuffer() throws IOException {
+      if (os.getLength() > 0) {
+        os.writeTo(outputStream);
+        os.reset();
+        bufferedBytes = 0;
+      }
+    }
+
+    @Override
+    public long getOffset() throws IOException {
+      return pos;
+    }
+
+    @Override
+    public void flush() throws IOException {
+      flushBuffer();
+      outputStream.flush();
+    }
+
+    @Override
+    public void close() throws IOException {
+
+      try {
+        serializer.release();
+
+        if (outputStream != null) {
+          flush();
+        }
+
+        // statistics section
+        if (enabledStats) {
+          stats.setNumBytes(getOffset());
+        }
+
+        if (deflateFilter != null) {
+          deflateFilter.finish();
+          deflateFilter.resetState();
+          deflateFilter = null;
+        }
+
+        os.close();
+      } finally {
+        IOUtils.cleanup(LOG, fos);
+        if (compressor != null) {
+          CodecPool.returnCompressor(compressor);
+          compressor = null;
+        }
+      }
+    }
+
+    @Override
+    public TableStats getStats() {
+      if (enabledStats) {
+        return stats.getTableStat();
+      } else {
+        return null;
+      }
+    }
+
+    public boolean isCompress() {
+      return compressor != null;
+    }
+
+    public String getExtension() {
+      return codec != null ? codec.getDefaultExtension() : "";
+    }
+  }
+
+  public static class DelimitedTextFileScanner extends FileScanner {
+    private boolean splittable = false;
+    private final long startOffset;
+
+    private final long endOffset;
+    /** The number of actually read records */
+    private int recordCount = 0;
+    private int[] targetColumnIndexes;
+
+    private DelimitedLineReader reader;
+    private TextLineDeserializer deserializer;
+
+    private int errorPrintOutMaxNum = 5;
+    /** Maximum number of permissible errors */
+    private int errorTorrenceMaxNum;
+    /** How many errors have occurred? */
+    private int errorNum;
+
+    public DelimitedTextFileScanner(Configuration conf, final Schema schema, final TableMeta meta,
+                                    final Fragment fragment)
+        throws IOException {
+      super(conf, schema, meta, fragment);
+      reader = new DelimitedLineReader(conf, this.fragment);
+      if (!reader.isCompressed()) {
+        splittable = true;
+      }
+
+      startOffset = this.fragment.getStartKey();
+      endOffset = startOffset + fragment.getLength();
+
+      errorTorrenceMaxNum =
+          Integer.parseInt(meta.getOption(TEXT_ERROR_TOLERANCE_MAXNUM, DEFAULT_TEXT_ERROR_TOLERANCE_MAXNUM));
+    }
+
+
+    @Override
+    public void init() throws IOException {
+      if (reader != null) {
+        reader.close();
+      }
+
+      reader = new DelimitedLineReader(conf, fragment);
+      reader.init();
+      recordCount = 0;
+
+      if (targets == null) {
+        targets = schema.toArray();
+      }
+
+      targetColumnIndexes = new int[targets.length];
+      for (int i = 0; i < targets.length; i++) {
+        targetColumnIndexes[i] = schema.getColumnId(targets[i].getQualifiedName());
+      }
+
+      super.init();
+      Arrays.sort(targetColumnIndexes);
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("DelimitedTextFileScanner open:" + fragment.getPath() + "," + startOffset + "," + endOffset);
+      }
+
+      if (startOffset > 0) {
+        reader.readLine(); // skip the first line
+      }
+
+      deserializer = getLineSerde().createDeserializer(schema, meta, targetColumnIndexes);
+      deserializer.init();
+    }
+
+    public TextLineSerDe getLineSerde() {
+      return DelimitedTextFile.getLineSerde(meta);
+    }
+
+    @Override
+    public float getProgress() {
+      try {
+        if (!reader.isReadable()) {
+          return 1.0f;
+        }
+        long filePos = reader.getCompressedPosition();
+        if (startOffset == filePos) {
+          return 0.0f;
+        } else {
+          long readBytes = filePos - startOffset;
+          long remainingBytes = Math.max(endOffset - filePos, 0);
+          return Math.min(1.0f, (float) (readBytes) / (float) (readBytes + remainingBytes));
+        }
+      } catch (IOException e) {
+        LOG.error(e.getMessage(), e);
+        return 0.0f;
+      }
+    }
+
+    @Override
+    public Tuple next() throws IOException {
+      VTuple tuple;
+
+      if (!reader.isReadable()) {
+        return null;
+      }
+
+      try {
+
+        // this loop will continue until one tuple is built or EOS (end of stream) is reached.
+        do {
+
+          ByteBuf buf = reader.readLine();
+
+          // if there are no more lines, then return EOT (end of tuple)
+          if (buf == null) {
+            return null;
+          }
+
+          // If there is no required column, we just read each line
+          // and then return an empty tuple without parsing the line.
+          if (targets.length == 0) {
+            recordCount++;
+            return EmptyTuple.get();
+          }
+
+          tuple = new VTuple(schema.size());
+
+          try {
+            deserializer.deserialize(buf, tuple);
+            // if a line is read normally, it exits this loop.
+            break;
+
+          } catch (TextLineParsingError tae) {
+
+            errorNum++;
+
+            // suppress too many log prints, which would probably cause performance degradation
+            if (errorNum < errorPrintOutMaxNum) {
+              LOG.warn("Ignore Parse Error (" + errorNum + "): ", tae);
+            }
+
+            // Only when the maximum error tolerance limit is set (i.e., errorTorrenceMaxNum >= 0),
+            // it checks whether the number of parsing errors exceeds the max limit.
+            // Otherwise, it will ignore all parsing errors.
+            if (errorTorrenceMaxNum >= 0 && errorNum > errorTorrenceMaxNum) {
+              throw tae;
+            }
+            continue;
+          }
+
+        } while (reader.isReadable()); // continue until EOS
+
+        // recordCount means the number of actually read records. We increment the count here.
+        recordCount++;
+
+        return tuple;
+
+      } catch (Throwable t) {
+        LOG.error(t);
+        throw new IOException(t);
+      }
+    }
+
+    @Override
+    public void reset() throws IOException {
+      init();
+    }
+
+    @Override
+    public void close() throws IOException {
+      try {
+        if (deserializer != null) {
+          deserializer.release();
+        }
+
+        if (tableStats != null && reader != null) {
+          tableStats.setReadBytes(reader.getReadBytes()); // actual processed bytes (decompressed bytes + overhead)
+          tableStats.setNumRows(recordCount);
+        }
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("DelimitedTextFileScanner processed record:" + recordCount);
+        }
+      } finally {
+        IOUtils.cleanup(LOG, reader);
+        reader = null;
+      }
+    }
+
+    @Override
+    public boolean isProjectable() {
+      return true;
+    }
+
+    @Override
+    public boolean isSelectable() {
+      return false;
+    }
+
+    @Override
+    public void setSearchCondition(Object expr) {
+    }
+
+    @Override
+    public boolean isSplittable() {
+      return splittable;
+    }
+
+    @Override
+    public TableStats getInputStats() {
+      if (tableStats != null && reader != null) {
+        tableStats.setReadBytes(reader.getReadBytes()); // actual processed bytes (decompressed bytes + overhead)
+        tableStats.setNumRows(recordCount);
+        tableStats.setNumBytes(fragment.getLength());
+      }
+      return tableStats;
+    }
+  }
+}
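Because getLineSerde() above instantiates whatever class the 'text.serde.class' table property names (falling back to the CSV serde), a new line format plugs in with nothing more than a TextLineSerDe subclass that has a no-arg constructor. A minimal sketch of the registration mechanics (not part of the patch): it simply delegates to the CSV implementations, and the class name and package are illustrative.

package org.example.tajo; // illustrative package

import org.apache.tajo.catalog.Schema;
import org.apache.tajo.catalog.TableMeta;
import org.apache.tajo.storage.text.CSVLineDeserializer;
import org.apache.tajo.storage.text.CSVLineSerializer;
import org.apache.tajo.storage.text.TextLineDeserializer;
import org.apache.tajo.storage.text.TextLineSerDe;
import org.apache.tajo.storage.text.TextLineSerializer;

// Selected per table via the 'text.serde.class' property; DelimitedTextFile caches
// the Class object and instantiates it reflectively through ReflectionUtil.
public class DelegatingLineSerDe extends TextLineSerDe {
  @Override
  public TextLineDeserializer createDeserializer(Schema schema, TableMeta meta, int[] targetColumnIndexes) {
    return new CSVLineDeserializer(schema, meta, targetColumnIndexes); // replace with a custom parser
  }

  @Override
  public TextLineSerializer createSerializer(Schema schema, TableMeta meta) {
    return new CSVLineSerializer(schema, meta); // replace with a custom writer
  }
}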
http://git-wip-us.apache.org/repos/asf/tajo/blob/4561711f/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/FieldSplitProcessor.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/FieldSplitProcessor.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/FieldSplitProcessor.java
new file mode 100644
index 0000000..a5ac142
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/FieldSplitProcessor.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage.text;
+
+import io.netty.buffer.ByteBufProcessor;
+
+public class FieldSplitProcessor implements ByteBufProcessor {
+  private char delimiter; // the ASCII separator character
+
+  public FieldSplitProcessor(char recordDelimiterByte) {
+    this.delimiter = recordDelimiterByte;
+  }
+
+  @Override
+  public boolean process(byte value) throws Exception {
+    return delimiter != value;
+  }
+
+  public char getDelimiter() {
+    return delimiter;
+  }
+}

http://git-wip-us.apache.org/repos/asf/tajo/blob/4561711f/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/LineSplitProcessor.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/LineSplitProcessor.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/LineSplitProcessor.java
new file mode 100644
index 0000000..a130527
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/LineSplitProcessor.java
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage.text;
+
+import io.netty.buffer.ByteBufProcessor;
+
+public class LineSplitProcessor implements ByteBufProcessor {
+  public static final byte CR = '\r';
+  public static final byte LF = '\n';
+  private boolean prevCharCR = false; // true if the previous char was CR
+
+  @Override
+  public boolean process(byte value) throws Exception {
+    switch (value) {
+      case LF:
+        return false;
+      case CR:
+        prevCharCR = true;
+        return false;
+      default:
+        prevCharCR = false;
+        return true;
+    }
+  }
+
+  public boolean isPrevCharCR() {
+    return prevCharCR;
+  }
+}
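LineSplitProcessor is deliberately stateful across buffer refills: a CR at the very end of one buffer sets prevCharCR, so the LF arriving at the head of the next buffer can be recognized as the second half of a CRLF rather than an empty line. A small sketch of that behavior (not part of the patch; the data and class name are illustrative):

import io.netty.buffer.ByteBuf;
import io.netty.buffer.Unpooled;
import io.netty.util.CharsetUtil;
import org.apache.tajo.storage.text.LineSplitProcessor;

public class LineSplitProcessorExample {
  public static void main(String[] args) {
    LineSplitProcessor processor = new LineSplitProcessor();

    // First buffer ends exactly on the CR of a CRLF pair.
    ByteBuf first = Unpooled.copiedBuffer("abc\r", CharsetUtil.UTF_8);
    int stop = first.forEachByte(0, first.readableBytes(), processor);
    System.out.println(stop);                     // 3: the scan stops on the CR
    System.out.println(processor.isPrevCharCR()); // true: remembered across buffers

    // The next buffer starts with the matching LF; ByteBufLineReader.readLineBuf()
    // consults isPrevCharCR() to skip it instead of emitting an empty line.
    ByteBuf second = Unpooled.copiedBuffer("\ndef\n", CharsetUtil.UTF_8);
    System.out.println(second.getByte(0) == LineSplitProcessor.LF); // true
  }
}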
http://git-wip-us.apache.org/repos/asf/tajo/blob/4561711f/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java
new file mode 100644
index 0000000..ae7565d
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java
@@ -0,0 +1,253 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage.text;
+
+import com.google.protobuf.Message;
+import io.netty.buffer.ByteBuf;
+import io.netty.util.CharsetUtil;
+import org.apache.commons.codec.binary.Base64;
+import org.apache.tajo.TajoConstants;
+import org.apache.tajo.catalog.Column;
+import org.apache.tajo.catalog.TableMeta;
+import org.apache.tajo.common.TajoDataTypes;
+import org.apache.tajo.conf.TajoConf;
+import org.apache.tajo.datum.*;
+import org.apache.tajo.datum.protobuf.ProtobufJsonFormat;
+import org.apache.tajo.storage.FieldSerializerDeserializer;
+import org.apache.tajo.storage.StorageConstants;
+import org.apache.tajo.util.NumberUtil;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.charset.CharsetDecoder;
+import java.util.TimeZone;
+
+public class TextFieldSerializerDeserializer implements FieldSerializerDeserializer {
+  public static final byte[] trueBytes = "true".getBytes();
+  public static final byte[] falseBytes = "false".getBytes();
+  private static ProtobufJsonFormat protobufJsonFormat = ProtobufJsonFormat.getInstance();
+  private final CharsetDecoder decoder = CharsetUtil.getDecoder(CharsetUtil.UTF_8);
+
+  private final boolean hasTimezone;
+  private final TimeZone timezone;
+
+  public TextFieldSerializerDeserializer(TableMeta meta) {
+    hasTimezone = meta.containsOption(StorageConstants.TIMEZONE);
+    timezone = TimeZone.getTimeZone(meta.getOption(StorageConstants.TIMEZONE, TajoConstants.DEFAULT_SYSTEM_TIMEZONE));
+  }
+
+  private static boolean isNull(ByteBuf val, ByteBuf nullBytes) {
+    return !val.isReadable() || nullBytes.equals(val);
+  }
+
+  private static boolean isNullText(ByteBuf val, ByteBuf nullBytes) {
+    return val.readableBytes() > 0 && nullBytes.equals(val);
+  }
+
+  @Override
+  public int serialize(OutputStream out, Datum datum, Column col, int columnIndex, byte[] nullChars)
+      throws IOException {
+    byte[] bytes;
+    int length = 0;
+    TajoDataTypes.DataType dataType = col.getDataType();
+
+    if (datum == null || datum instanceof NullDatum) {
+      switch (dataType.getType()) {
+        case CHAR:
+        case TEXT:
+          length = nullChars.length;
+          out.write(nullChars);
+          break;
+        default:
+          break;
+      }
+      return length;
+    }
+
+    switch (dataType.getType()) {
+      case BOOLEAN:
+        bytes = datum.asBool() ? trueBytes : falseBytes;
+        out.write(bytes);
+        length = bytes.length; // count the bytes actually written ("true" and "false" differ in length)
+        break;
+      case CHAR:
+        byte[] pad = new byte[dataType.getLength() - datum.size()];
+        bytes = datum.asTextBytes();
+        out.write(bytes);
+        out.write(pad);
+        length = bytes.length + pad.length;
+        break;
+      case TEXT:
+      case BIT:
+      case INT2:
+      case INT4:
+      case INT8:
+      case FLOAT4:
+      case FLOAT8:
+      case INET4:
+      case DATE:
+      case INTERVAL:
+        bytes = datum.asTextBytes();
+        length = bytes.length;
+        out.write(bytes);
+        break;
+      case TIME:
+        if (hasTimezone) {
+          bytes = ((TimeDatum) datum).asChars(timezone, true).getBytes();
+        } else {
+          bytes = datum.asTextBytes();
+        }
+        length = bytes.length;
+        out.write(bytes);
+        break;
+      case TIMESTAMP:
+        if (hasTimezone) {
+          bytes = ((TimestampDatum) datum).asChars(timezone, true).getBytes();
+        } else {
+          bytes = datum.asTextBytes();
+        }
+        length = bytes.length;
+        out.write(bytes);
+        break;
+      case INET6:
+      case BLOB:
+        bytes = Base64.encodeBase64(datum.asByteArray(), false);
+        length = bytes.length;
+        out.write(bytes, 0, length);
+        break;
+      case PROTOBUF:
+        ProtobufDatum protobuf = (ProtobufDatum) datum;
+        byte[] protoBytes = protobufJsonFormat.printToString(protobuf.get()).getBytes();
+        length = protoBytes.length;
+        out.write(protoBytes, 0, protoBytes.length);
+        break;
+      case NULL_TYPE:
+      default:
+        break;
+    }
+    return length;
+  }
+
+  @Override
+  public Datum deserialize(ByteBuf buf, Column col, int columnIndex, ByteBuf nullChars) throws IOException {
+    Datum datum;
+    TajoDataTypes.Type type = col.getDataType().getType();
+    boolean nullField;
+    if (type == TajoDataTypes.Type.TEXT || type == TajoDataTypes.Type.CHAR) {
+      nullField = isNullText(buf, nullChars);
+    } else {
+      nullField = isNull(buf, nullChars);
+    }
+
+    if (nullField) {
+      datum = NullDatum.get();
+    } else {
+      switch (type) {
+        case BOOLEAN:
+          byte bool = buf.readByte();
+          datum = DatumFactory.createBool(bool == 't' || bool == 'T');
+          break;
+        case BIT:
+          datum = DatumFactory.createBit(Byte.parseByte(
+              decoder.decode(buf.nioBuffer(buf.readerIndex(), buf.readableBytes())).toString()));
+          break;
+        case CHAR:
+          datum = DatumFactory.createChar(
+              decoder.decode(buf.nioBuffer(buf.readerIndex(), buf.readableBytes())).toString().trim());
+          break;
+        case INT1:
+        case INT2:
+          datum = DatumFactory.createInt2((short) NumberUtil.parseInt(buf));
+          break;
+        case INT4:
+          datum = DatumFactory.createInt4(NumberUtil.parseInt(buf));
+          break;
+        case INT8:
+          datum = DatumFactory.createInt8(NumberUtil.parseLong(buf));
+          break;
+        case FLOAT4:
+          datum = DatumFactory.createFloat4(
+              decoder.decode(buf.nioBuffer(buf.readerIndex(), buf.readableBytes())).toString());
+          break;
+        case FLOAT8:
+          datum = DatumFactory.createFloat8(NumberUtil.parseDouble(buf));
+          break;
+        case TEXT: {
+          byte[] bytes = new byte[buf.readableBytes()];
+          buf.readBytes(bytes);
+          datum = DatumFactory.createText(bytes);
+          break;
+        }
+        case DATE:
+          datum = DatumFactory.createDate(
+              decoder.decode(buf.nioBuffer(buf.readerIndex(), buf.readableBytes())).toString());
+          break;
+        case TIME:
+          if (hasTimezone) {
+            datum = DatumFactory.createTime(
+                decoder.decode(buf.nioBuffer(buf.readerIndex(), buf.readableBytes())).toString(), timezone);
+          } else {
+            datum = DatumFactory.createTime(
+                decoder.decode(buf.nioBuffer(buf.readerIndex(), buf.readableBytes())).toString());
+          }
+          break;
+        case TIMESTAMP:
+          if (hasTimezone) {
+            datum = DatumFactory.createTimestamp(
+                decoder.decode(buf.nioBuffer(buf.readerIndex(), buf.readableBytes())).toString(), timezone);
+          } else {
+            datum = DatumFactory.createTimestamp(
+                decoder.decode(buf.nioBuffer(buf.readerIndex(), buf.readableBytes())).toString());
+          }
+          break;
+        case INTERVAL:
+          datum = DatumFactory.createInterval(
+              decoder.decode(buf.nioBuffer(buf.readerIndex(), buf.readableBytes())).toString());
+          break;
+        case PROTOBUF: {
+          ProtobufDatumFactory factory = ProtobufDatumFactory.get(col.getDataType());
+          Message.Builder builder = factory.newBuilder();
+          try {
+            byte[] bytes = new byte[buf.readableBytes()];
+            buf.readBytes(bytes);
+            protobufJsonFormat.merge(bytes, builder);
+            datum = factory.createDatum(builder.build());
+          } catch (IOException e) {
+            e.printStackTrace();
+            throw new RuntimeException(e);
+          }
+          break;
+        }
+        case INET4:
+          datum = DatumFactory.createInet4(
+              decoder.decode(buf.nioBuffer(buf.readerIndex(), buf.readableBytes())).toString());
+          break;
+        case BLOB: {
+          byte[] bytes = new byte[buf.readableBytes()];
+          buf.readBytes(bytes);
+          datum = DatumFactory.createBlob(Base64.decodeBase64(bytes));
+          break;
+        }
+        default:
+          datum = NullDatum.get();
+          break;
+      }
+    }
+    return datum;
+  }
+}
http://git-wip-us.apache.org/repos/asf/tajo/blob/4561711f/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineDeserializer.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineDeserializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineDeserializer.java
new file mode 100644
index 0000000..7ebfa79
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineDeserializer.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage.text;
+
+import io.netty.buffer.ByteBuf;
+import org.apache.tajo.catalog.Schema;
+import org.apache.tajo.catalog.TableMeta;
+import org.apache.tajo.storage.Tuple;
+
+import java.io.IOException;
+
+/**
+ * Reads a text line and fills a Tuple with values
+ */
+public abstract class TextLineDeserializer {
+  protected Schema schema;
+  protected TableMeta meta;
+  protected int[] targetColumnIndexes;
+
+  public TextLineDeserializer(Schema schema, TableMeta meta, int[] targetColumnIndexes) {
+    this.schema = schema;
+    this.meta = meta;
+    this.targetColumnIndexes = targetColumnIndexes;
+  }
+
+  /**
+   * Initialize the SerDe
+   */
+  public abstract void init();
+
+  /**
+   * Fills a tuple with the fields read from a given line.
+   *
+   * @param buf    the read line
+   * @param output the Tuple to be filled with the read fields
+   * @throws java.io.IOException
+   */
+  public abstract void deserialize(final ByteBuf buf, Tuple output) throws IOException, TextLineParsingError;
+
+  /**
+   * Release external resources
+   */
+  public abstract void release();
+}

http://git-wip-us.apache.org/repos/asf/tajo/blob/4561711f/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineParsingError.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineParsingError.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineParsingError.java
new file mode 100644
index 0000000..f0bae5e
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineParsingError.java
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage.text;
+
+public class TextLineParsingError extends Exception {
+
+  public TextLineParsingError(Throwable t) {
+    super(t);
+  }
+
+  public TextLineParsingError(String message, Throwable t) {
+    super(t.getMessage() + ", Error line: " + message);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/tajo/blob/4561711f/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineSerDe.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineSerDe.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineSerDe.java
new file mode 100644
index 0000000..e81e289
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineSerDe.java
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage.text;
+
+import io.netty.buffer.ByteBuf;
+import org.apache.commons.lang.StringEscapeUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.tajo.catalog.Schema;
+import org.apache.tajo.catalog.TableMeta;
+import org.apache.tajo.datum.NullDatum;
+import org.apache.tajo.storage.BufferPool;
+import org.apache.tajo.storage.StorageConstants;
+
+/**
+ * Pluggable Text Line SerDe class
+ */
+public abstract class TextLineSerDe {
+
+  public TextLineSerDe() {
+  }
+
+  public abstract TextLineDeserializer createDeserializer(Schema schema, TableMeta meta, int[] targetColumnIndexes);
+
+  public abstract TextLineSerializer createSerializer(Schema schema, TableMeta meta);
+
+  public static ByteBuf getNullChars(TableMeta meta) {
+    byte[] nullCharByteArray = getNullCharsAsBytes(meta);
+
+    ByteBuf nullChars = BufferPool.directBuffer(nullCharByteArray.length, nullCharByteArray.length);
+    nullChars.writeBytes(nullCharByteArray);
+
+    return nullChars;
+  }
+
+  public static byte[] getNullCharsAsBytes(TableMeta meta) {
+    byte[] nullChars;
+
+    String nullCharacters = StringEscapeUtils.unescapeJava(meta.getOption(StorageConstants.TEXT_NULL,
+        NullDatum.DEFAULT_TEXT));
+    if (StringUtils.isEmpty(nullCharacters)) {
+      nullChars = NullDatum.get().asTextBytes();
+    } else {
+      nullChars = nullCharacters.getBytes();
+    }
+
+    return nullChars;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/tajo/blob/4561711f/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineSerializer.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineSerializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineSerializer.java
new file mode 100644
index 0000000..0c2761f
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineSerializer.java
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage.text;
+
+import org.apache.tajo.catalog.Schema;
+import org.apache.tajo.catalog.TableMeta;
+import org.apache.tajo.storage.Tuple;
+
+import java.io.IOException;
+import java.io.OutputStream;
+
+/**
+ * Writes a Tuple into a single text-formatted line
+ */
+public abstract class TextLineSerializer {
+  protected Schema schema;
+  protected TableMeta meta;
+
+  public TextLineSerializer(Schema schema, TableMeta meta) {
+    this.schema = schema;
+    this.meta = meta;
+  }
+
+  public abstract void init();
+
+  public abstract int serialize(OutputStream out, Tuple input) throws IOException;
+
+  public abstract void release();
+}
+ */
+
+package org.apache.tajo.storage.thirdparty.parquet;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.compress.*;
+import org.apache.hadoop.util.ReflectionUtils;
+import parquet.bytes.BytesInput;
+import parquet.hadoop.BadConfigurationException;
+import parquet.hadoop.metadata.CompressionCodecName;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+class CodecFactory {
+
+  public class BytesDecompressor {
+
+    private final CompressionCodec codec;
+    private final Decompressor decompressor;
+
+    public BytesDecompressor(CompressionCodec codec) {
+      this.codec = codec;
+      if (codec != null) {
+        decompressor = CodecPool.getDecompressor(codec);
+      } else {
+        decompressor = null;
+      }
+    }
+
+    public BytesInput decompress(BytesInput bytes, int uncompressedSize) throws IOException {
+      final BytesInput decompressed;
+      if (codec != null) {
+        decompressor.reset();
+        InputStream is = codec.createInputStream(new ByteArrayInputStream(bytes.toByteArray()), decompressor);
+        decompressed = BytesInput.from(is, uncompressedSize);
+      } else {
+        decompressed = bytes;
+      }
+      return decompressed;
+    }
+
+    private void release() {
+      if (decompressor != null) {
+        CodecPool.returnDecompressor(decompressor);
+      }
+    }
+  }
+
+  /**
+   * Encapsulates the logic around hadoop compression
+   *
+   * @author Julien Le Dem
+   *
+   */
+  public static class BytesCompressor {
+
+    private final CompressionCodec codec;
+    private final Compressor compressor;
+    private final ByteArrayOutputStream compressedOutBuffer;
+    private final CompressionCodecName codecName;
+
+    public BytesCompressor(CompressionCodecName codecName, CompressionCodec codec, int pageSize) {
+      this.codecName = codecName;
+      this.codec = codec;
+      if (codec != null) {
+        this.compressor = CodecPool.getCompressor(codec);
+        this.compressedOutBuffer = new ByteArrayOutputStream(pageSize);
+      } else {
+        this.compressor = null;
+        this.compressedOutBuffer = null;
+      }
+    }
+
+    public BytesInput compress(BytesInput bytes) throws IOException {
+      final BytesInput compressedBytes;
+      if (codec == null) {
+        compressedBytes = bytes;
+      } else {
+        compressedOutBuffer.reset();
+        if (compressor != null) {
+          // null compressor for non-native gzip
+          compressor.reset();
+        }
+        CompressionOutputStream cos = codec.createOutputStream(compressedOutBuffer, compressor);
+        bytes.writeAllTo(cos);
+        cos.finish();
+        cos.close();
+        compressedBytes = BytesInput.from(compressedOutBuffer);
+      }
+      return compressedBytes;
+    }
+
+    private void release() {
+      if (compressor != null) {
+        CodecPool.returnCompressor(compressor);
+      }
+    }
+
+    public CompressionCodecName getCodecName() {
+      return codecName;
+    }
+
+  }
+
+  private final Map<CompressionCodecName, BytesCompressor> compressors = new HashMap<CompressionCodecName, BytesCompressor>();
+  private final Map<CompressionCodecName, BytesDecompressor> decompressors = new HashMap<CompressionCodecName, BytesDecompressor>();
+  private final Map<String, CompressionCodec> codecByName = new HashMap<String, CompressionCodec>();
+  private final Configuration configuration;
+
+  public CodecFactory(Configuration configuration) {
+    this.configuration = configuration;
+  }
+
+  /**
+   *
+   * @param codecName the requested codec
+   * @return the corresponding hadoop codec. null if UNCOMPRESSED
+   */
+  private CompressionCodec getCodec(CompressionCodecName codecName) {
+    String codecClassName = codecName.getHadoopCompressionCodecClassName();
+    if (codecClassName == null) {
+      return null;
+    }
+    CompressionCodec codec = codecByName.get(codecClassName);
+    if (codec != null) {
+      return codec;
+    }
+
+    try {
+      Class<?> codecClass = Class.forName(codecClassName);
+      codec = (CompressionCodec)ReflectionUtils.newInstance(codecClass, configuration);
+      codecByName.put(codecClassName, codec);
+      return codec;
+    } catch (ClassNotFoundException e) {
+      throw new BadConfigurationException("Class " + codecClassName + " was not found", e);
+    }
+  }
+
+  public BytesCompressor getCompressor(CompressionCodecName codecName, int pageSize) {
+    BytesCompressor comp = compressors.get(codecName);
+    if (comp == null) {
+      CompressionCodec codec = getCodec(codecName);
+      comp = new BytesCompressor(codecName, codec, pageSize);
+      compressors.put(codecName, comp);
+    }
+    return comp;
+  }
+
+  public BytesDecompressor getDecompressor(CompressionCodecName codecName) {
+    BytesDecompressor decomp = decompressors.get(codecName);
+    if (decomp == null) {
+      CompressionCodec codec = getCodec(codecName);
+      decomp = new BytesDecompressor(codec);
+      decompressors.put(codecName, decomp);
+    }
+    return decomp;
+  }
+
+  public void release() {
+    for (BytesCompressor compressor : compressors.values()) {
+      compressor.release();
+    }
+    compressors.clear();
+    for (BytesDecompressor decompressor : decompressors.values()) {
+      decompressor.release();
+    }
+    decompressors.clear();
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tajo/blob/4561711f/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/parquet/ColumnChunkPageWriteStore.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/parquet/ColumnChunkPageWriteStore.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/parquet/ColumnChunkPageWriteStore.java
new file mode 100644
index 0000000..0dedd9b
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/parquet/ColumnChunkPageWriteStore.java
@@ -0,0 +1,206 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage.thirdparty.parquet;
+
+import parquet.Log;
+import parquet.bytes.BytesInput;
+import parquet.bytes.CapacityByteArrayOutputStream;
+import parquet.column.ColumnDescriptor;
+import parquet.column.Encoding;
+import parquet.column.page.DictionaryPage;
+import parquet.column.page.PageWriteStore;
+import parquet.column.page.PageWriter;
+import parquet.column.statistics.BooleanStatistics;
+import parquet.column.statistics.Statistics;
+import parquet.format.converter.ParquetMetadataConverter;
+import parquet.io.ParquetEncodingException;
+import parquet.schema.MessageType;
+
+import java.io.IOException;
+import java.util.*;
+
+import static org.apache.tajo.storage.thirdparty.parquet.CodecFactory.BytesCompressor;
+import static parquet.Log.INFO;
+
+class ColumnChunkPageWriteStore implements PageWriteStore {
+  private static final Log LOG = Log.getLog(ColumnChunkPageWriteStore.class);
+
+  private static ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
+
+  private static final class ColumnChunkPageWriter implements PageWriter {
+
+    private final ColumnDescriptor path;
+    private final BytesCompressor compressor;
+
+    private final CapacityByteArrayOutputStream buf;
+    private DictionaryPage dictionaryPage;
+
+    private long uncompressedLength;
+    private long compressedLength;
+    private long totalValueCount;
+    private int pageCount;
+
+    private Set<Encoding> encodings = new HashSet<Encoding>();
+
+    private Statistics totalStatistics;
+
+    private ColumnChunkPageWriter(ColumnDescriptor path, BytesCompressor compressor, int initialSize) {
+      this.path = path;
+      this.compressor = compressor;
+      this.buf = new CapacityByteArrayOutputStream(initialSize);
+      this.totalStatistics = Statistics.getStatsBasedOnType(this.path.getType());
+    }
+
+    @Deprecated
+    @Override
+    public void writePage(BytesInput bytes,
+                          int valueCount,
+                          Encoding rlEncoding,
+                          Encoding dlEncoding,
+                          Encoding valuesEncoding) throws IOException {
+      long uncompressedSize = bytes.size();
+      BytesInput compressedBytes = compressor.compress(bytes);
+      long compressedSize = compressedBytes.size();
+      BooleanStatistics statistics = new BooleanStatistics(); // dummy stats object
+      parquetMetadataConverter.writeDataPageHeader(
+          (int)uncompressedSize,
+          (int)compressedSize,
+          valueCount,
+          statistics,
+          rlEncoding,
+          dlEncoding,
+          valuesEncoding,
+          buf);
+      this.uncompressedLength += uncompressedSize;
+      this.compressedLength += compressedSize;
+      this.totalValueCount += valueCount;
+      this.pageCount += 1;
+      compressedBytes.writeAllTo(buf);
+      encodings.add(rlEncoding);
+      encodings.add(dlEncoding);
+      encodings.add(valuesEncoding);
+    }
+
+    @Override
+    public void writePage(BytesInput bytes,
+                          int valueCount,
+                          Statistics statistics,
+                          Encoding rlEncoding,
+                          Encoding dlEncoding,
+                          Encoding valuesEncoding) throws IOException {
+      long uncompressedSize = bytes.size();
+      BytesInput compressedBytes = compressor.compress(bytes);
+      long compressedSize = compressedBytes.size();
+      parquetMetadataConverter.writeDataPageHeader(
+          (int)uncompressedSize,
+          (int)compressedSize,
+          valueCount,
+          statistics,
+          rlEncoding,
+          dlEncoding,
+          valuesEncoding,
+          buf);
+      this.uncompressedLength += uncompressedSize;
+      this.compressedLength += compressedSize;
+      this.totalValueCount += valueCount;
+      this.pageCount += 1;
+      this.totalStatistics.mergeStatistics(statistics);
+      compressedBytes.writeAllTo(buf);
+      encodings.add(rlEncoding);
+      encodings.add(dlEncoding);
+      encodings.add(valuesEncoding);
+    }
+
+    @Override
+    public long getMemSize() {
+      return buf.size();
+    }
+
+    public void writeToFileWriter(ParquetFileWriter writer) throws IOException {
+      writer.startColumn(path, totalValueCount, compressor.getCodecName());
+      if (dictionaryPage != null) {
+        writer.writeDictionaryPage(dictionaryPage);
+        encodings.add(dictionaryPage.getEncoding());
+      }
+      writer.writeDataPages(BytesInput.from(buf), uncompressedLength, compressedLength, totalStatistics, new ArrayList<Encoding>(encodings));
+      writer.endColumn();
+      if (INFO) {
+        LOG.info(
+            String.format(
+                "written %,dB for %s: %,d values, %,dB raw, %,dB comp, %d pages, encodings: %s",
+                buf.size(), path, totalValueCount, uncompressedLength, compressedLength, pageCount, encodings)
+            + (dictionaryPage != null ? String.format(
+                ", dic { %,d entries, %,dB raw, %,dB comp}",
+                dictionaryPage.getDictionarySize(), dictionaryPage.getUncompressedSize(), dictionaryPage.getBytes().size())
+                : ""));
+      }
+      encodings.clear();
+      pageCount = 0;
+    }
+
+    @Override
+    public long allocatedSize() {
+      return buf.getCapacity();
+    }
+
+    @Override
+    public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException {
+      if (this.dictionaryPage != null) {
+        throw new ParquetEncodingException("Only one dictionary page is allowed");
+      }
+      BytesInput dictionaryBytes = dictionaryPage.getBytes();
+      int uncompressedSize = (int)dictionaryBytes.size();
+      BytesInput compressedBytes = compressor.compress(dictionaryBytes);
+      this.dictionaryPage = new DictionaryPage(BytesInput.copy(compressedBytes), uncompressedSize, dictionaryPage.getDictionarySize(), dictionaryPage.getEncoding());
+    }
+
+    @Override
+    public String memUsageString(String prefix) {
+      return buf.memUsageString(prefix + " ColumnChunkPageWriter");
+    }
+  }
+
+  private final Map<ColumnDescriptor, ColumnChunkPageWriter> writers = new HashMap<ColumnDescriptor, ColumnChunkPageWriter>();
+  private final MessageType schema;
+  private final BytesCompressor compressor;
+  private final int initialSize;
+
+  public ColumnChunkPageWriteStore(BytesCompressor compressor, MessageType schema, int initialSize) {
+    this.compressor = compressor;
+    this.schema = schema;
+    this.initialSize = initialSize;
+  }
+
+  @Override
+  public PageWriter getPageWriter(ColumnDescriptor path) {
+    if (!writers.containsKey(path)) {
+      writers.put(path, new ColumnChunkPageWriter(path, compressor, initialSize));
+    }
+    return writers.get(path);
+  }
+
+  public void flushToFileWriter(ParquetFileWriter writer) throws IOException {
+    List<ColumnDescriptor> columns = schema.getColumns();
+    for (ColumnDescriptor columnDescriptor : columns) {
+      ColumnChunkPageWriter pageWriter = writers.get(columnDescriptor);
+      pageWriter.writeToFileWriter(writer);
+    }
+  }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tajo/blob/4561711f/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/parquet/InternalParquetRecordReader.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/parquet/InternalParquetRecordReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/parquet/InternalParquetRecordReader.java
new file mode 100644
index 0000000..6bbd7b5
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/parquet/InternalParquetRecordReader.java
@@ -0,0 +1,188 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage.thirdparty.parquet;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import parquet.Log;
+import parquet.column.ColumnDescriptor;
+import parquet.column.page.PageReadStore;
+import parquet.filter.UnboundRecordFilter;
+import parquet.hadoop.ParquetFileReader;
+import parquet.hadoop.api.ReadSupport;
+import parquet.hadoop.metadata.BlockMetaData;
+import parquet.hadoop.util.counters.BenchmarkCounter;
+import parquet.io.ColumnIOFactory;
+import parquet.io.MessageColumnIO;
+import parquet.io.ParquetDecodingException;
+import parquet.io.api.RecordMaterializer;
+import parquet.schema.GroupType;
+import parquet.schema.MessageType;
+import parquet.schema.Type;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import static java.lang.String.format;
+import static parquet.Log.DEBUG;
+
+class InternalParquetRecordReader<T> {
+  private static final Log LOG = Log.getLog(InternalParquetRecordReader.class);
+
+  private final ColumnIOFactory columnIOFactory = new ColumnIOFactory();
+
+  private MessageType requestedSchema;
+  private MessageType fileSchema;
+  private int columnCount;
+  private final ReadSupport<T> readSupport;
+
+  private RecordMaterializer<T> recordConverter;
+
+  private T currentValue;
+  private long total;
+  private int current = 0;
+  private int currentBlock = -1;
+  private ParquetFileReader reader;
+  private parquet.io.RecordReader<T> recordReader;
+  private UnboundRecordFilter recordFilter;
+
+  private long totalTimeSpentReadingBytes;
+  private long totalTimeSpentProcessingRecords;
+  private long startedAssemblingCurrentBlockAt;
+
+  private long totalCountLoadedSoFar = 0;
+
+  private Path file;
+
+  /**
+   * @param readSupport Object which helps read files of the given type, e.g. Thrift, Avro.
+   */
+  public InternalParquetRecordReader(ReadSupport<T> readSupport) {
+    this(readSupport, null);
+  }
+
+  /**
+   * @param readSupport Object which helps read files of the given type, e.g. Thrift, Avro.
+   * @param filter Optional filter for only returning matching records.
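+   *               (applied during record assembly in checkRead())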
+   */
+  public InternalParquetRecordReader(ReadSupport<T> readSupport, UnboundRecordFilter filter) {
+    this.readSupport = readSupport;
+    this.recordFilter = filter;
+  }
+
+  private void checkRead() throws IOException {
+    if (current == totalCountLoadedSoFar) {
+      if (current != 0) {
+        long timeAssembling = System.currentTimeMillis() - startedAssemblingCurrentBlockAt;
+        totalTimeSpentProcessingRecords += timeAssembling;
+        LOG.info("Assembled and processed " + totalCountLoadedSoFar + " records from " + columnCount + " columns in " + totalTimeSpentProcessingRecords + " ms: " + ((float)totalCountLoadedSoFar / totalTimeSpentProcessingRecords) + " rec/ms, " + ((float)totalCountLoadedSoFar * columnCount / totalTimeSpentProcessingRecords) + " cell/ms");
+        long totalTime = totalTimeSpentProcessingRecords + totalTimeSpentReadingBytes;
+        long percentReading = 100 * totalTimeSpentReadingBytes / totalTime;
+        long percentProcessing = 100 * totalTimeSpentProcessingRecords / totalTime;
+        LOG.info("time spent so far " + percentReading + "% reading (" + totalTimeSpentReadingBytes + " ms) and " + percentProcessing + "% processing (" + totalTimeSpentProcessingRecords + " ms)");
+      }
+
+      LOG.info("at row " + current + ". reading next block");
+      long t0 = System.currentTimeMillis();
+      PageReadStore pages = reader.readNextRowGroup();
+      if (pages == null) {
+        throw new IOException("expecting more rows but reached last block. Read " + current + " out of " + total);
+      }
+      long timeSpentReading = System.currentTimeMillis() - t0;
+      totalTimeSpentReadingBytes += timeSpentReading;
+      BenchmarkCounter.incrementTime(timeSpentReading);
+      LOG.info("block read in memory in " + timeSpentReading + " ms. row count = " + pages.getRowCount());
+      if (Log.DEBUG) LOG.debug("initializing Record assembly with requested schema " + requestedSchema);
+      MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema);
+      recordReader = columnIO.getRecordReader(pages, recordConverter, recordFilter);
+      startedAssemblingCurrentBlockAt = System.currentTimeMillis();
+      totalCountLoadedSoFar += pages.getRowCount();
+      ++currentBlock;
+    }
+  }
+
+  public void close() throws IOException {
+    reader.close();
+  }
+
+  public Void getCurrentKey() throws IOException, InterruptedException {
+    return null;
+  }
+
+  public T getCurrentValue() throws IOException, InterruptedException {
+    return currentValue;
+  }
+
+  public float getProgress() throws IOException, InterruptedException {
+    return (float) current / total;
+  }
+
+  public void initialize(MessageType requestedSchema, MessageType fileSchema,
+                         Map<String, String> extraMetadata, Map<String, String> readSupportMetadata,
+                         Path file, List<BlockMetaData> blocks, Configuration configuration)
+      throws IOException {
+    this.requestedSchema = requestedSchema;
+    this.fileSchema = fileSchema;
+    this.file = file;
+    this.columnCount = this.requestedSchema.getPaths().size();
+    this.recordConverter = readSupport.prepareForRead(
+        configuration, extraMetadata, fileSchema,
+        new ReadSupport.ReadContext(requestedSchema, readSupportMetadata));
+
+    List<ColumnDescriptor> columns = requestedSchema.getColumns();
+    reader = new ParquetFileReader(configuration, file, blocks, columns);
+    for (BlockMetaData block : blocks) {
+      total += block.getRowCount();
+    }
+    LOG.info("RecordReader initialized; will read a total of " + total + " records.");
+  }
+
+  private boolean contains(GroupType group, String[] path, int index) {
+    if (index == path.length) {
+      return false;
+    }
+    if (group.containsField(path[index])) {
+      Type type = group.getType(path[index]);
+      if (type.isPrimitive()) {
+        return index + 1 == path.length;
+      } else {
+        return contains(type.asGroupType(), path, index + 1);
+      }
+    }
+    return false;
+  }
+
+  public boolean nextKeyValue() throws IOException, InterruptedException {
+    if (current < total) {
+      try {
+        checkRead();
+        currentValue = recordReader.read();
+        if (DEBUG) LOG.debug("read value: " + currentValue);
+        current++;
+      } catch (RuntimeException e) {
+        throw new ParquetDecodingException(format("Can not read value at %d in block %d in file %s", current, currentBlock, file), e);
+      }
+      return true;
+    }
+    return false;
+  }
+}
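InternalParquetRecordReader above follows Hadoop's RecordReader life cycle: initialize() opens a ParquetFileReader over the footer's row-group metadata, nextKeyValue() lazily loads the next row group through checkRead(), and getCurrentValue() returns the last materialized record. A minimal driver loop, sketched here with an assumed parquet.example GroupReadSupport materializer and footer-derived arguments (requestedSchema, fileSchema, metadata maps, blocks, path, conf) that are not part of this commit, and with exception handling elided, might look like:

    InternalParquetRecordReader<Group> reader =
        new InternalParquetRecordReader<Group>(new GroupReadSupport());
    // requestedSchema, fileSchema, extraMetadata, readSupportMetadata, path,
    // blocks, and conf are assumed to have been read from the file footer.
    reader.initialize(requestedSchema, fileSchema, extraMetadata,
        readSupportMetadata, path, blocks, conf);
    while (reader.nextKeyValue()) {   // checkRead() pulls the next row group on demand
      Group record = reader.getCurrentValue();
      // ... consume record ...
    }
    reader.close();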

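For reference, the CodecFactory earlier in this commit caches one BytesCompressor and one BytesDecompressor per codec name and hands the pooled Hadoop (de)compressors back on release(). A minimal round trip, sketched under the assumptions that the caller sits in the same package (the class and its members are package-private) and that the Snappy codec is available at runtime, would be:

    // Hypothetical helper, not part of this commit.
    static byte[] roundTrip(byte[] raw) throws IOException {
      CodecFactory codecs = new CodecFactory(new Configuration());
      CodecFactory.BytesCompressor compressor =
          codecs.getCompressor(CompressionCodecName.SNAPPY, 64 * 1024); // pageSize hint
      BytesInput compressed = compressor.compress(BytesInput.from(raw));
      // Decompression needs the expected uncompressed size, which Parquet
      // records in each page header.
      CodecFactory.BytesDecompressor decompressor =
          codecs.getDecompressor(CompressionCodecName.SNAPPY);
      byte[] restored = decompressor.decompress(compressed, raw.length).toByteArray();
      codecs.release(); // return pooled (de)compressors to Hadoop's CodecPool
      return restored;
    }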