Modified: hadoop/core/trunk/src/contrib/hive/serde/src/java/org/apache/hadoop/hive/serde2/thrift/TCTLSeparatedProtocol.java URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hive/serde/src/java/org/apache/hadoop/hive/serde2/thrift/TCTLSeparatedProtocol.java?rev=712905&r1=712904&r2=712905&view=diff ============================================================================== --- hadoop/core/trunk/src/contrib/hive/serde/src/java/org/apache/hadoop/hive/serde2/thrift/TCTLSeparatedProtocol.java (original) +++ hadoop/core/trunk/src/contrib/hive/serde/src/java/org/apache/hadoop/hive/serde2/thrift/TCTLSeparatedProtocol.java Mon Nov 10 17:50:06 2008 @@ -28,6 +28,7 @@ import com.facebook.thrift.protocol.*; import java.util.*; import java.util.regex.Pattern; +import java.util.regex.Matcher; import java.io.*; import org.apache.hadoop.conf.Configuration; import java.util.Properties; @@ -39,7 +40,8 @@ * This is not thrift compliant in that it doesn't write out field ids * so things cannot actually be versioned. */ -public class TCTLSeparatedProtocol extends TProtocol implements ConfigurableTProtocol { +public class TCTLSeparatedProtocol extends TProtocol + implements ConfigurableTProtocol, WriteNullsProtocol, SkippableTProtocol { final static Log LOG = LogFactory.getLog(TCTLSeparatedProtocol.class.getName()); @@ -57,29 +59,36 @@ /** * These are defaults, but for now leaving them like this */ - final static protected byte defaultPrimarySeparatorByte = 1; - final static protected byte defaultSecondarySeparatorByte = 2; - final static protected byte defaultRowSeparatorByte = (byte)'\n'; - final static protected byte defaultMapSeparatorByte = 3; + final static protected String defaultPrimarySeparator = "\001"; + final static protected String defaultSecondarySeparator = "\002"; + final static protected String defaultRowSeparator = "\n"; + final static protected String defaultMapSeparator = "\003"; /** * The separators for this instance */ - protected byte primarySeparatorByte; - protected byte secondarySeparatorByte; - protected byte rowSeparatorByte; - protected byte mapSeparatorByte; + protected String primarySeparator; + protected String secondarySeparator; + protected String rowSeparator; + protected String mapSeparator; protected Pattern primaryPattern; protected Pattern secondaryPattern; protected Pattern mapPattern; /** + * The quote character when supporting quotes with ability to not split across quoted entries. Like csv. + * Note that escaping the quote is not currently supported. + */ + protected String quote; + + + /** * Inspect the separators this instance is configured with. */ - public byte getPrimarySeparator() { return primarySeparatorByte; } - public byte getSecondarySeparator() { return secondarySeparatorByte; } - public byte getRowSeparator() { return rowSeparatorByte; } - public byte getMapSeparator() { return mapSeparatorByte; } + public String getPrimarySeparator() { return primarySeparator; } + public String getSecondarySeparator() { return secondarySeparator; } + public String getRowSeparator() { return rowSeparator; } + public String getMapSeparator() { return mapSeparator; } /** @@ -164,6 +173,17 @@ protected int bufferSize; /** + * The string representing nulls in the serialized data. e.g., \N as in mysql + */ + protected String nullString; + + /** + * The nullString in bytes + */ + protected byte nullBuf[]; + + + /** * A convenience class for tokenizing a TTransport */ @@ -174,12 +194,12 @@ final String separator; byte buf[]; - public SimpleTransportTokenizer(TTransport trans, byte separator, int buffer_length) { + public SimpleTransportTokenizer(TTransport trans, String separator, int buffer_length) { this.trans = trans; - byte [] separators = new byte[1]; - separators[0] = separator; - this.separator = new String(separators); + this.separator = separator; buf = new byte[buffer_length]; + // do not fill tokenizer until user requests since filling it could read in data + // not meant for this instantiation. fillTokenizer(); } @@ -191,7 +211,7 @@ return false; } String row = new String(buf, 0, length); - tokenizer = new StringTokenizer(row, new String(separator), true); + tokenizer = new StringTokenizer(row, separator, true); } catch(TTransportException e) { e.printStackTrace(); tokenizer = null; @@ -204,6 +224,10 @@ StringBuffer ret = null; boolean done = false; + if(tokenizer == null) { + fillTokenizer(); + } + while(! done) { if(! tokenizer.hasMoreTokens()) { @@ -211,7 +235,6 @@ break; } } - try { final String nextToken = tokenizer.nextToken(); @@ -229,7 +252,8 @@ done = true; } } // while ! done - return ret == null ? null : ret.toString(); + final String theRet = ret == null ? null : ret.toString(); + return theRet; } }; @@ -242,23 +266,23 @@ */ public TCTLSeparatedProtocol(TTransport trans) { - this(trans, defaultPrimarySeparatorByte, defaultSecondarySeparatorByte, defaultMapSeparatorByte, defaultRowSeparatorByte, false, 4096); + this(trans, defaultPrimarySeparator, defaultSecondarySeparator, defaultMapSeparator, defaultRowSeparator, false, 4096); } public TCTLSeparatedProtocol(TTransport trans, int buffer_size) { - this(trans, defaultPrimarySeparatorByte, defaultSecondarySeparatorByte, defaultMapSeparatorByte, defaultRowSeparatorByte, false, buffer_size); + this(trans, defaultPrimarySeparator, defaultSecondarySeparator, defaultMapSeparator, defaultRowSeparator, false, buffer_size); } /** * @param trans - the ttransport to use as input or output - * @param primarySeparatorByte the separator between columns (aka fields) - * @param secondarySeparatorByte the separator within a field for things like sets and maps and lists - * @param mapSeparatorByte - the key/value separator - * @param rowSeparatorByte - the record separator + * @param primarySeparator the separator between columns (aka fields) + * @param secondarySeparator the separator within a field for things like sets and maps and lists + * @param mapSeparator - the key/value separator + * @param rowSeparator - the record separator * @param returnNulls - whether to return a null or an empty string for fields that seem empty (ie two primary separators back to back) */ - public TCTLSeparatedProtocol(TTransport trans, byte primarySeparatorByte, byte secondarySeparatorByte, byte mapSeparatorByte, byte rowSeparatorByte, + public TCTLSeparatedProtocol(TTransport trans, String primarySeparator, String secondarySeparator, String mapSeparator, String rowSeparator, boolean returnNulls, int bufferSize) { super(trans); @@ -266,34 +290,103 @@ returnNulls = returnNulls; - this.primarySeparatorByte = primarySeparatorByte; - this.secondarySeparatorByte = secondarySeparatorByte; - this.rowSeparatorByte = rowSeparatorByte; - this.mapSeparatorByte = mapSeparatorByte; + this.primarySeparator = primarySeparator; + this.secondarySeparator = secondarySeparator; + this.rowSeparator = rowSeparator; + this.mapSeparator = mapSeparator; this.innerTransport = trans; this.bufferSize = bufferSize; + this.nullString = "\\N"; internalInitialize(); } + /** * Sets the internal separator patterns and creates the internal tokenizer. */ protected void internalInitialize() { - byte []primarySeparator = new byte[1]; - byte []secondarySeparator = new byte[1]; - primarySeparator[0] = primarySeparatorByte; - secondarySeparator[0] = secondarySeparatorByte; - - primaryPattern = Pattern.compile(new String(primarySeparator)); - secondaryPattern = Pattern.compile(new String(secondarySeparator)); - mapPattern = Pattern.compile("\\0" + secondarySeparatorByte + "|\\0" + mapSeparatorByte); - transportTokenizer = new SimpleTransportTokenizer(innerTransport, rowSeparatorByte, bufferSize); + // in the future could allow users to specify a quote character that doesn't need escaping but for now ... + final String primaryPatternString = + quote == null ? primarySeparator : + "(?:^|" + primarySeparator + ")(" + quote + "(?:[^" + quote + "]+|" + quote + quote + ")*" + quote + "|[^" + primarySeparator + "]*)"; + + if (quote != null) { + stripSeparatorPrefix = Pattern.compile("^" + primarySeparator); + stripQuotePrefix = Pattern.compile("^" + quote); + stripQuotePostfix = Pattern.compile(quote + "$"); + } + + primaryPattern = Pattern.compile(primaryPatternString); + secondaryPattern = Pattern.compile(secondarySeparator); + mapPattern = Pattern.compile(secondarySeparator + "|" + mapSeparator); + nullBuf = nullString.getBytes(); + transportTokenizer = new SimpleTransportTokenizer(innerTransport, rowSeparator, bufferSize); + } + + /** + * For quoted fields, strip away the quotes and also need something to strip away the control separator when using + * complex split method defined here. + */ + protected Pattern stripSeparatorPrefix; + protected Pattern stripQuotePrefix; + protected Pattern stripQuotePostfix; + + + /** + * + * Split the line based on a complex regex pattern + * + * @param line the current row + * @param p the pattern for matching fields in the row + * @return List of Strings - not including the separator in them + */ + protected String[] complexSplit(String line, Pattern p) { + + ArrayList<String> list = new ArrayList<String>(); + Matcher m = p.matcher(line); + // For each field + while (m.find()) { + String match = m.group(); + if (match == null) + break; + if (match.length() == 0) + match = null; + else { + if(stripSeparatorPrefix.matcher(match).find()) { + match = match.substring(1); + } + if(stripQuotePrefix.matcher(match).find()) { + match = match.substring(1); + } + if(stripQuotePostfix.matcher(match).find()) { + match = match.substring(0,match.length() - 1); + } + } + list.add(match); + } + return (String[])list.toArray(new String[1]); } + + + protected String getByteValue(String altValue, String defaultVal) { + if (altValue != null && altValue.length() > 0) { + try { + byte b [] = new byte[1]; + b[0] = Byte.valueOf(altValue).byteValue(); + return new String(b); + } catch(NumberFormatException e) { + return altValue; + } + } + return defaultVal; + } + + /** * Initialize the TProtocol * @param conf System properties @@ -301,13 +394,16 @@ * @throws TException */ public void initialize(Configuration conf, Properties tbl) throws TException { - primarySeparatorByte = Byte.valueOf(tbl.getProperty(Constants.FIELD_DELIM, String.valueOf(primarySeparatorByte))).byteValue(); - LOG.debug("collections delim=<" + tbl.getProperty(Constants.COLLECTION_DELIM) + ">" ); - secondarySeparatorByte = Byte.valueOf(tbl.getProperty(Constants.COLLECTION_DELIM, String.valueOf(secondarySeparatorByte))).byteValue(); - rowSeparatorByte = Byte.valueOf(tbl.getProperty(Constants.LINE_DELIM, String.valueOf(rowSeparatorByte))).byteValue(); - mapSeparatorByte = Byte.valueOf(tbl.getProperty(Constants.MAPKEY_DELIM, String.valueOf(mapSeparatorByte))).byteValue(); + + + primarySeparator = getByteValue(tbl.getProperty(Constants.FIELD_DELIM), primarySeparator); + secondarySeparator = getByteValue(tbl.getProperty(Constants.COLLECTION_DELIM), secondarySeparator); + rowSeparator = getByteValue(tbl.getProperty(Constants.LINE_DELIM), rowSeparator); + mapSeparator = getByteValue(tbl.getProperty(Constants.MAPKEY_DELIM), mapSeparator); returnNulls = Boolean.valueOf(tbl.getProperty(ReturnNullsKey, String.valueOf(returnNulls))).booleanValue(); bufferSize = Integer.valueOf(tbl.getProperty(BufferSizeKey, String.valueOf(bufferSize))).intValue(); + nullString = tbl.getProperty(Constants.SERIALIZATION_NULL_FORMAT, "\\N"); + quote = tbl.getProperty(Constants.QUOTE_CHAR, null); internalInitialize(); @@ -329,7 +425,7 @@ public void writeFieldBegin(TField field) throws TException { if(! firstField) { - writeByte(primarySeparatorByte); + internalWriteString(primarySeparator); } firstField = false; } @@ -424,21 +520,34 @@ writeString(String.valueOf(dub)); } + public void internalWriteString(String str) throws TException { + if(str != null) { + final byte buf[] = str.getBytes(); + trans_.write(buf, 0, buf.length); + } else { + trans_.write(nullBuf, 0, nullBuf.length); + } + } + public void writeString(String str) throws TException { if(inner) { if(!firstInnerField) { // super hack city notice the mod plus only happens after firstfield hit, so == 0 is right. if(isMap && elemIndex++ % 2 == 0) { - writeByte(mapSeparatorByte); + internalWriteString(mapSeparator); } else { - writeByte(secondarySeparatorByte); + internalWriteString(secondarySeparator); } } else { firstInnerField = false; } } - final byte buf[] = str.getBytes(); - trans_.write(buf, 0, buf.length); + if(str != null) { + final byte buf[] = str.getBytes(); + trans_.write(buf, 0, buf.length); + } else { + trans_.write(nullBuf, 0, nullBuf.length); + } } public void writeBinary(byte[] bin) throws TException { @@ -456,7 +565,7 @@ assert(!inner); try { final String tmp = transportTokenizer.nextToken(); - columns = primaryPattern.split(tmp); + columns = quote == null ? primaryPattern.split(tmp) : complexSplit(tmp, primaryPattern); index = 0; return new TStruct(); } catch(EOFException e) { @@ -468,6 +577,20 @@ columns = null; } + + /** + * Skip past the current field + * Just increments the field index counter. + */ + public void skip(byte type) { + if( inner) { + innerIndex++; + } else { + index++; + } + } + + public TField readFieldBegin() throws TException { assert( !inner); TField f = new TField(); @@ -483,11 +606,19 @@ public TMap readMapBegin() throws TException { assert( !inner); TMap map = new TMap(); - fields = mapPattern.split(columns[index++]); - if(fields != null) { - map.size = fields.length/2; - } else { + if(columns[index] == null || + columns[index].equals(nullString)) { + index++; + if(returnNulls) { + return null; + } + map.size = 0; + } else if(columns[index].isEmpty()) { map.size = 0; + index++; + } else { + fields = mapPattern.split(columns[index++]); + map.size = fields.length/2; } innerIndex = 0; inner = true; @@ -503,11 +634,19 @@ public TList readListBegin() throws TException { assert( !inner); TList list = new TList(); - fields = secondaryPattern.split(columns[index++]); - if(fields != null) { - list.size = fields.length ; - } else { + if(columns[index] == null || + columns[index].equals(nullString)) { + index++; + if(returnNulls) { + return null; + } + list.size = 0; + } else if(columns[index].isEmpty()) { list.size = 0; + index++; + } else { + fields = secondaryPattern.split(columns[index++]); + list.size = fields.length ; } innerIndex = 0; inner = true; @@ -521,53 +660,88 @@ public TSet readSetBegin() throws TException { assert( !inner); TSet set = new TSet(); - fields = secondaryPattern.split(columns[index++]); - if(fields != null) { - set.size = fields.length ; - } else { + if(columns[index] == null || + columns[index].equals(nullString)) { + index++; + if(returnNulls) { + return null; + } set.size = 0; + } else if(columns[index].isEmpty()) { + set.size = 0; + index++; + } else { + fields = secondaryPattern.split(columns[index++]); + set.size = fields.length ; } inner = true; innerIndex = 0; return set; } + protected boolean lastPrimitiveWasNullFlag; + + public boolean lastPrimitiveWasNull() throws TException { + return lastPrimitiveWasNullFlag; + } + + public void writeNull() throws TException { + writeString(null); + } + public void readSetEnd() throws TException { inner = false; } + public boolean readBool() throws TException { - return Boolean.valueOf(readString()).booleanValue(); + String val = readString(); + lastPrimitiveWasNullFlag = val == null; + return val == null || val.isEmpty() ? false : Boolean.valueOf(val).booleanValue(); } public byte readByte() throws TException { - return Byte.valueOf(readString()).byteValue(); + String val = readString(); + lastPrimitiveWasNullFlag = val == null; + return val == null || val.isEmpty() ? 0 : Byte.valueOf(val).byteValue(); } public short readI16() throws TException { - return Short.valueOf(readString()).shortValue(); + String val = readString(); + lastPrimitiveWasNullFlag = val == null; + return val == null || val.isEmpty() ? 0 : Short.valueOf(val).shortValue(); } public int readI32() throws TException { - return Integer.valueOf(readString()).intValue(); + String val = readString(); + lastPrimitiveWasNullFlag = val == null; + return val == null || val.isEmpty() ? 0 : Integer.valueOf(val).intValue(); } public long readI64() throws TException { - return Long.valueOf(readString()).longValue(); + String val = readString(); + lastPrimitiveWasNullFlag = val == null; + return val == null || val.isEmpty() ? 0 : Long.valueOf(val).longValue(); } public double readDouble() throws TException { - return Double.valueOf(readString()).doubleValue(); + String val = readString(); + lastPrimitiveWasNullFlag = val == null; + return val == null || val.isEmpty() ? 0 :Double.valueOf(val).doubleValue(); } - protected String [] curMapPair; public String readString() throws TException { String ret; if(!inner) { - ret = columns != null && index < columns.length ? columns[index++] : null; + ret = columns != null && index < columns.length ? columns[index] : null; + index++; } else { - ret = fields != null && innerIndex < fields.length ? fields[innerIndex++] : null; + ret = fields != null && innerIndex < fields.length ? fields[innerIndex] : null; + innerIndex++; } - return ret == null && ! returnNulls ? "" : ret; + if(ret == null || ret.equals(nullString)) + return returnNulls ? null : ""; + else + return ret; } public byte[] readBinary() throws TException {
Added: hadoop/core/trunk/src/contrib/hive/serde/src/java/org/apache/hadoop/hive/serde2/thrift/WriteNullsProtocol.java URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hive/serde/src/java/org/apache/hadoop/hive/serde2/thrift/WriteNullsProtocol.java?rev=712905&view=auto ============================================================================== --- hadoop/core/trunk/src/contrib/hive/serde/src/java/org/apache/hadoop/hive/serde2/thrift/WriteNullsProtocol.java (added) +++ hadoop/core/trunk/src/contrib/hive/serde/src/java/org/apache/hadoop/hive/serde2/thrift/WriteNullsProtocol.java Mon Nov 10 17:50:06 2008 @@ -0,0 +1,44 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.thrift; + +import com.facebook.thrift.TException; + +/** + * An interface for TProtocols that actually write out nulls - + * This should be for all those that don't actually use + * fieldids in the written data like TCTLSeparatedProtocol. + * + */ +public interface WriteNullsProtocol { + /** + * Was the last primitive read really a NULL. Need + * only be called when the value of the primitive + * was 0. ie the protocol should return 0 on nulls + * and the caller will then check if it was actually null + * For boolean this is false. + */ + public boolean lastPrimitiveWasNull() throws TException; + + /** + * Write a null + */ + public void writeNull() throws TException; + +} Modified: hadoop/core/trunk/src/contrib/hive/serde/src/test/org/apache/hadoop/hive/serde2/TestTCTLSeparatedProtocol.java URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hive/serde/src/test/org/apache/hadoop/hive/serde2/TestTCTLSeparatedProtocol.java?rev=712905&r1=712904&r2=712905&view=diff ============================================================================== --- hadoop/core/trunk/src/contrib/hive/serde/src/test/org/apache/hadoop/hive/serde2/TestTCTLSeparatedProtocol.java (original) +++ hadoop/core/trunk/src/contrib/hive/serde/src/test/org/apache/hadoop/hive/serde2/TestTCTLSeparatedProtocol.java Mon Nov 10 17:50:06 2008 @@ -18,6 +18,9 @@ package org.apache.hadoop.hive.serde2; + +import org.apache.hadoop.hive.serde.Constants; + import junit.framework.TestCase; import java.io.*; import org.apache.hadoop.hive.serde2.*; @@ -27,6 +30,7 @@ import com.facebook.thrift.transport.*; import com.facebook.thrift.*; import com.facebook.thrift.protocol.*; +import org.apache.hadoop.conf.Configuration; public class TestTCTLSeparatedProtocol extends TestCase { @@ -71,7 +75,7 @@ // use 3 as the row buffer size to force lots of re-buffering. - TCTLSeparatedProtocol prot = new TCTLSeparatedProtocol(trans, 3); + TCTLSeparatedProtocol prot = new TCTLSeparatedProtocol(trans, 1024); prot.readStructBegin(); @@ -116,7 +120,7 @@ public void testWrites() throws Exception { try { TMemoryBuffer trans = new TMemoryBuffer(1024); - TCTLSeparatedProtocol prot = new TCTLSeparatedProtocol(trans, 3); + TCTLSeparatedProtocol prot = new TCTLSeparatedProtocol(trans, 1024); prot.writeStructBegin(new TStruct()); prot.writeFieldBegin(new TField()); @@ -164,8 +168,6 @@ String test = new String(b, 0, len); String testRef = "100348.55234.22hello world!key1val1key2val2key3val3elem1elem2bye!"; - // System.err.println("test=" + test + ">"); - // System.err.println(" ref=" + testRef + ">"); assertTrue(test.equals(testRef)); trans = new TMemoryBuffer(1023); @@ -242,4 +244,263 @@ } } + public void testQuotedWrites() throws Exception { + try { + TMemoryBuffer trans = new TMemoryBuffer(4096); + TCTLSeparatedProtocol prot = new TCTLSeparatedProtocol(trans, 4096); + Properties schema = new Properties(); + schema.setProperty(Constants.QUOTE_CHAR, "\""); + schema.setProperty(Constants.FIELD_DELIM, ","); + prot.initialize(new Configuration(), schema); + + String testStr = "\"hello, world!\""; + + prot.writeStructBegin(new TStruct()); + + prot.writeFieldBegin(new TField()); + prot.writeString(testStr); + prot.writeFieldEnd(); + + prot.writeFieldBegin(new TField()); + prot.writeListBegin(new TList()); + prot.writeString("elem1"); + prot.writeString("elem2"); + prot.writeListEnd(); + prot.writeFieldEnd(); + + prot.writeStructEnd(); + prot.writeString("\n"); + + trans.flush(); + + byte b[] = new byte[4096]; + int len = trans.read(b,0,b.length); + + + trans = new TMemoryBuffer(4096); + trans.write(b,0,len); + prot = new TCTLSeparatedProtocol(trans, 1024); + prot.initialize(new Configuration(), schema); + + prot.readStructBegin(); + prot.readFieldBegin(); + final String firstRead = prot.readString(); + prot.readFieldEnd(); + + testStr = testStr.replace("\"",""); + + assertEquals(testStr, firstRead); + + + // the 2 element list + prot.readFieldBegin(); + TList l = prot.readListBegin(); + assertTrue(l.size == 2); + assertTrue(prot.readString().equals("elem1")); + assertTrue(prot.readString().equals("elem2")); + prot.readListEnd(); + prot.readFieldEnd(); + + // shouldl return nulls at end + prot.readFieldBegin(); + assertTrue(prot.readString().equals("")); + prot.readFieldEnd(); + + // shouldl return nulls at end + prot.readFieldBegin(); + assertTrue(prot.readString().equals("")); + prot.readFieldEnd(); + + prot.readStructEnd(); + + + } catch(Exception e) { + e.printStackTrace(); + } + } + + + /** + * Tests a sample apache log format. This is actually better done in general with a more TRegexLike protocol, but for this + * case, TCTLSeparatedProtocol can do it. + */ + public void test1ApacheLogFormat() throws Exception { + try { + final String sample = "127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] \"GET /apache_pb.gif HTTP/1.0\" 200 2326"; + + TMemoryBuffer trans = new TMemoryBuffer(4096); + trans.write(sample.getBytes(), 0, sample.getBytes().length); + trans.flush(); + + TCTLSeparatedProtocol prot = new TCTLSeparatedProtocol(trans, 4096); + Properties schema = new Properties(); + + // this is a hacky way of doing the quotes since it will match any 2 of these, so + // "[ hello this is something to split [" would be considered to be quoted. + schema.setProperty(Constants.QUOTE_CHAR, "(\"|\\[|\\])"); + + schema.setProperty(Constants.FIELD_DELIM, " "); + schema.setProperty(Constants.SERIALIZATION_NULL_FORMAT, "-"); + prot.initialize(new Configuration(), schema); + + prot.readStructBegin(); + + // ip address + prot.readFieldBegin(); + final String ip = prot.readString(); + prot.readFieldEnd(); + + assertEquals("127.0.0.1", ip); + + // identd + prot.readFieldBegin(); + final String identd = prot.readString(); + prot.readFieldEnd(); + + assertEquals("", identd); + + // user + prot.readFieldBegin(); + final String user = prot.readString(); + prot.readFieldEnd(); + + assertEquals("frank",user); + + // finishTime + prot.readFieldBegin(); + final String finishTime = prot.readString(); + prot.readFieldEnd(); + + assertEquals("10/Oct/2000:13:55:36 -0700",finishTime); + + // requestLine + prot.readFieldBegin(); + final String requestLine = prot.readString(); + prot.readFieldEnd(); + + assertEquals("GET /apache_pb.gif HTTP/1.0",requestLine); + + // returncode + prot.readFieldBegin(); + final int returnCode = prot.readI32(); + prot.readFieldEnd(); + + assertEquals(200, returnCode); + + // return size + prot.readFieldBegin(); + final int returnSize = prot.readI32(); + prot.readFieldEnd(); + + assertEquals(2326, returnSize); + + prot.readStructEnd(); + + } catch(Exception e) { + e.printStackTrace(); + } + } + + + + public void testNulls() throws Exception { + try { + TMemoryBuffer trans = new TMemoryBuffer(1024); + TCTLSeparatedProtocol prot = new TCTLSeparatedProtocol(trans, 10); + + prot.writeStructBegin(new TStruct()); + + prot.writeFieldBegin(new TField()); + prot.writeString(null); + prot.writeFieldEnd(); + + prot.writeFieldBegin(new TField()); + prot.writeString(null); + prot.writeFieldEnd(); + + prot.writeFieldBegin(new TField()); + prot.writeI32(100); + prot.writeFieldEnd(); + + prot.writeFieldBegin(new TField()); + prot.writeString(null); + prot.writeFieldEnd(); + + prot.writeFieldBegin(new TField()); + prot.writeMapBegin(new TMap()); + prot.writeString(null); + prot.writeString(null); + prot.writeString("key2"); + prot.writeString(null); + prot.writeString(null); + prot.writeString("val3"); + prot.writeMapEnd(); + prot.writeFieldEnd(); + + prot.writeStructEnd(); + + byte b[] = new byte[3*1024]; + int len = trans.read(b,0,b.length); + String written = new String(b,0,len); + + String testRef = "\\N\\N100\\N\\N\\Nkey2\\N\\Nval3"; + + assertTrue(testRef.equals(written)); + + trans = new TMemoryBuffer(1023); + trans.write(b, 0, len); + + prot = new TCTLSeparatedProtocol(trans, 3); + + prot.readStructBegin(); + + prot.readFieldBegin(); + String ret = prot.readString(); + prot.readFieldEnd(); + + assertTrue(ret.equals("")); + + prot.readFieldBegin(); + ret = prot.readString(); + prot.readFieldEnd(); + + assertTrue(ret.equals("")); + + prot.readFieldBegin(); + int ret1 = prot.readI32(); + prot.readFieldEnd(); + + assertTrue(ret1 == 100); + + + prot.readFieldBegin(); + ret1 = prot.readI32(); + prot.readFieldEnd(); + + prot.readFieldBegin(); + TMap map = prot.readMapBegin(); + + assertTrue(map.size == 3); + + assertTrue(prot.readString().isEmpty()); + assertTrue(prot.readString().isEmpty()); + + assertTrue(prot.readString().equals("key2")); + assertTrue(prot.readString().isEmpty()); + + assertTrue(prot.readString().isEmpty()); + assertTrue(prot.readString().equals("val3")); + + prot.readMapEnd(); + prot.readFieldEnd(); + + assertTrue(ret1 == 0); + + } catch(Exception e) { + e.printStackTrace(); + } + } + + + } Modified: hadoop/core/trunk/src/contrib/hive/serde/src/test/org/apache/hadoop/hive/serde2/dynamic_type/TestDynamicSerDe.java URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hive/serde/src/test/org/apache/hadoop/hive/serde2/dynamic_type/TestDynamicSerDe.java?rev=712905&r1=712904&r2=712905&view=diff ============================================================================== --- hadoop/core/trunk/src/contrib/hive/serde/src/test/org/apache/hadoop/hive/serde2/dynamic_type/TestDynamicSerDe.java (original) +++ hadoop/core/trunk/src/contrib/hive/serde/src/test/org/apache/hadoop/hive/serde2/dynamic_type/TestDynamicSerDe.java Mon Nov 10 17:50:06 2008 @@ -20,8 +20,11 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; +import java.util.Map; import java.util.List; import java.util.Properties; +import java.util.Random; +import java.util.Map.Entry; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.serde.Constants; @@ -35,6 +38,13 @@ public class TestDynamicSerDe extends TestCase { + public static HashMap<String,String> makeHashMap(String... params) { + HashMap<String,String> r = new HashMap<String,String>(); + for(int i=0; i<params.length; i+=2) { + r.put(params[i], params[i+1]); + } + return r; + } public void testDynamicSerDe() throws Throwable { try { @@ -49,24 +59,41 @@ struct.add(Integer.valueOf(234)); struct.add(bye); struct.add(another); + struct.add(Integer.valueOf(-234)); + struct.add(Double.valueOf(1.0)); + struct.add(Double.valueOf(-2.5)); + // All protocols ArrayList<String> protocols = new ArrayList<String>(); ArrayList<Boolean> isBinaries = new ArrayList<Boolean>(); - + ArrayList<HashMap<String,String>> additionalParams = new ArrayList<HashMap<String,String>>(); + + protocols.add(org.apache.hadoop.hive.serde2.thrift.TBinarySortableProtocol.class.getName()); + isBinaries.add(true); + additionalParams.add(makeHashMap("serialization.sort.order", "++++++")); + protocols.add(org.apache.hadoop.hive.serde2.thrift.TBinarySortableProtocol.class.getName()); + isBinaries.add(true); + additionalParams.add(makeHashMap("serialization.sort.order", "------")); + + protocols.add(com.facebook.thrift.protocol.TBinaryProtocol.class.getName()); isBinaries.add(true); + additionalParams.add(null); protocols.add(com.facebook.thrift.protocol.TJSONProtocol.class.getName()); isBinaries.add(false); + additionalParams.add(null); // TSimpleJSONProtocol does not support deserialization. // protocols.add(com.facebook.thrift.protocol.TSimpleJSONProtocol.class.getName()); // isBinaries.add(false); + // additionalParams.add(null); // TCTLSeparatedProtocol is not done yet. protocols.add(org.apache.hadoop.hive.serde2.thrift.TCTLSeparatedProtocol.class.getName()); isBinaries.add(false); + additionalParams.add(null); System.out.println("input struct = " + struct); @@ -80,8 +107,14 @@ schema.setProperty(Constants.SERIALIZATION_FORMAT, protocol); schema.setProperty(org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_NAME, "test"); schema.setProperty(Constants.SERIALIZATION_DDL, - "struct test { i32 hello, list<string> bye, map<string,i32> another}"); + "struct test { i32 hello, list<string> bye, map<string,i32> another, i32 nhello, double d, double nd}"); schema.setProperty(Constants.SERIALIZATION_LIB, new DynamicSerDe().getClass().toString()); + HashMap<String, String> p = additionalParams.get(pp); + if (p != null) { + for(Entry<String, String> e: p.entrySet()) { + schema.setProperty(e.getKey(), e.getValue()); + } + } DynamicSerDe serde = new DynamicSerDe(); serde.initialize(new Configuration(), schema); @@ -93,15 +126,8 @@ // Try to serialize BytesWritable bytes = (BytesWritable) serde.serialize(struct, oi); - - StringBuilder sb = new StringBuilder(); - for (int i=0; i<bytes.getSize(); i++) { - byte b = bytes.get()[i]; - int v = (b<0 ? 256 + b : b); - sb.append(String.format("x%02x", v)); - } - System.out.println("bytes =" + sb); - + System.out.println("bytes =" + hexString(bytes)); + if (!isBinary) { System.out.println("bytes in text =" + new String(bytes.get(), 0, bytes.getSize())); } @@ -116,7 +142,7 @@ System.out.println("o[2] class = " + olist.get(2).getClass()); System.out.println("o = " + o); - assertEquals(o, struct); + assertEquals(struct, o); } } catch (Throwable e) { @@ -124,9 +150,216 @@ throw e; } + } + public String hexString(BytesWritable bytes) { + StringBuilder sb = new StringBuilder(); + for (int i=0; i<bytes.getSize(); i++) { + byte b = bytes.get()[i]; + int v = (b<0 ? 256 + b : b); + sb.append(String.format("x%02x", v)); + } + return sb.toString(); + } + + + private void testTBinarySortableProtocol(Object[] structs, String ddl, boolean ascending) throws Throwable{ + int fields = ((List)structs[structs.length-1]).size(); + String order = ""; + for(int i=0; i<fields; i++) { + order = order + (ascending ? "+" : "-"); + } + + Properties schema = new Properties(); + schema.setProperty(Constants.SERIALIZATION_FORMAT, org.apache.hadoop.hive.serde2.thrift.TBinarySortableProtocol.class.getName()); + schema.setProperty(org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_NAME, "test"); + schema.setProperty(Constants.SERIALIZATION_DDL, ddl); + schema.setProperty(Constants.SERIALIZATION_LIB, DynamicSerDe.class.getName()); + schema.setProperty(Constants.SERIALIZATION_SORT_ORDER, order); + + DynamicSerDe serde = new DynamicSerDe(); + serde.initialize(new Configuration(), schema); + + ObjectInspector oi = serde.getObjectInspector(); + + // Try to serialize + BytesWritable bytes[] = new BytesWritable[structs.length]; + for (int i=0; i<structs.length; i++) { + bytes[i] = new BytesWritable(); + BytesWritable s = (BytesWritable)serde.serialize(structs[i], oi); + bytes[i].set(s); + if (i>0) { + int compareResult = bytes[i-1].compareTo(bytes[i]); + if ( (compareResult<0 && !ascending) || (compareResult>0 && ascending) ) { + System.out.println("Test failed in " + (ascending ? "ascending" : "descending") + " order."); + System.out.println("serialized data of " + structs[i-1] + " = " + hexString(bytes[i-1])); + System.out.println("serialized data of " + structs[i] + " = " + hexString(bytes[i])); + fail("Sort order of serialized " + structs[i-1] + " and " + structs[i] + " are reversed!"); + } + } + } + + // Try to deserialize + Object[] deserialized = new Object[structs.length]; + for (int i=0; i<structs.length; i++) { + deserialized[i] = serde.deserialize(bytes[i]); + if (!structs[i].equals(deserialized[i])) { + System.out.println("structs[i] = " + structs[i]); + System.out.println("deserialized[i] = " + deserialized[i]); + System.out.println("serialized[i] = " + hexString(bytes[i])); + assertEquals(structs[i], deserialized[i]); + } + } + } + + static int compare(Object a, Object b) { + if (a == null && b == null) return 0; + if (a == null) return -1; + if (b == null) return 1; + if (a instanceof List) { + List la = (List)a; + List lb = (List)b; + assert(la.size() == lb.size()); + for (int i=0; i<la.size(); i++) { + int r = compare(la.get(i), lb.get(i)); + if (r != 0) return r; + } + return 0; + } else if (a instanceof Number) { + Number na = (Number) a; + Number nb = (Number) b; + if (na.doubleValue() < nb.doubleValue()) return -1; + if (na.doubleValue() > nb.doubleValue()) return 1; + return 0; + } else if (a instanceof String) { + String sa = (String) a; + String sb = (String) b; + return sa.compareTo(sb); + } + return 0; + } + + private void sort(Object[] structs) { + for (int i=0; i<structs.length; i++) for (int j=i+1; j<structs.length; j++) + if (compare(structs[i], structs[j])>0) { + Object t = structs[i]; + structs[i] = structs[j]; + structs[j] = t; + } } + public void testTBinarySortableProtocol() throws Throwable { + try { + System.out.println("Beginning Test testTBinarySortableProtocol:"); + + int num = 100; + Random r = new Random(1234); + Object structs[] = new Object[num]; + String ddl; + + // Test double + for (int i=0; i<num; i++) { + ArrayList<Object> struct = new ArrayList<Object>(); + if (i==0) { + struct.add(null); + } else { + struct.add(Double.valueOf((r.nextDouble()-0.5)*10)); + } + structs[i] = struct; + } + sort(structs); + ddl = "struct test { double hello}"; + System.out.println("Testing " + ddl); + testTBinarySortableProtocol(structs, ddl, true); + testTBinarySortableProtocol(structs, ddl, false); + + // Test integer + for (int i=0; i<num; i++) { + ArrayList<Object> struct = new ArrayList<Object>(); + if (i==0) { + struct.add(null); + } else { + struct.add((int)((r.nextDouble()-0.5)*1.5*Integer.MAX_VALUE)); + } + structs[i] = struct; + } + sort(structs); + // Null should be smaller than any other value, so put a null at the front end + // to test whether that is held. + ((List)structs[0]).set(0, null); + ddl = "struct test { i32 hello}"; + System.out.println("Testing " + ddl); + testTBinarySortableProtocol(structs, ddl, true); + testTBinarySortableProtocol(structs, ddl, false); + + // Test long + for (int i=0; i<num; i++) { + ArrayList<Object> struct = new ArrayList<Object>(); + if (i==0) { + struct.add(null); + } else { + struct.add((long)((r.nextDouble()-0.5)*1.5*Long.MAX_VALUE)); + } + structs[i] = struct; + } + sort(structs); + // Null should be smaller than any other value, so put a null at the front end + // to test whether that is held. + ((List)structs[0]).set(0, null); + ddl = "struct test { i64 hello}"; + System.out.println("Testing " + ddl); + testTBinarySortableProtocol(structs, ddl, true); + testTBinarySortableProtocol(structs, ddl, false); + + // Test string + for (int i=0; i<num; i++) { + ArrayList<Object> struct = new ArrayList<Object>(); + if (i==0) { + struct.add(null); + } else { + struct.add(String.valueOf((r.nextDouble()-0.5)*1000)); + } + structs[i] = struct; + } + sort(structs); + // Null should be smaller than any other value, so put a null at the front end + // to test whether that is held. + ((List)structs[0]).set(0, null); + ddl = "struct test { string hello}"; + System.out.println("Testing " + ddl); + testTBinarySortableProtocol(structs, ddl, true); + testTBinarySortableProtocol(structs, ddl, false); + + // Test string + double + for (int i=0; i<num; i++) { + ArrayList<Object> struct = new ArrayList<Object>(); + if (i%9==0) { + struct.add(null); + } else { + struct.add("str" + (i/5)); + } + if (i%7==0) { + struct.add(null); + } else { + struct.add(Double.valueOf((r.nextDouble()-0.5)*10)); + } + structs[i] = struct; + } + sort(structs); + // Null should be smaller than any other value, so put a null at the front end + // to test whether that is held. + ((List)structs[0]).set(0, null); + ddl = "struct test { string hello, double another}"; + System.out.println("Testing " + ddl); + testTBinarySortableProtocol(structs, ddl, true); + testTBinarySortableProtocol(structs, ddl, false); + + System.out.println("Test testTBinarySortableProtocol passed!"); + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + } public void testConfigurableTCTLSeparated() throws Throwable { @@ -161,20 +394,14 @@ serde.initialize(new Configuration(), schema); TCTLSeparatedProtocol prot = (TCTLSeparatedProtocol)serde.oprot_; - assertTrue(prot.getPrimarySeparator() == 9); + assertTrue(prot.getPrimarySeparator().equals("\u0009")); ObjectInspector oi = serde.getObjectInspector(); // Try to serialize BytesWritable bytes = (BytesWritable) serde.serialize(struct, oi); - StringBuilder sb = new StringBuilder(); - for (int i=0; i<bytes.getSize(); i++) { - byte b = bytes.get()[i]; - int v = (b<0 ? 256 + b : b); - sb.append(String.format("x%02x", v)); - } - System.out.println("bytes =" + sb); + hexString(bytes); String compare = "234" + "\u0009" + "firstString" + "\u0001" + "secondString" + "\u0009" + "firstKey" + "\u0004" + "1" + "\u0001" + "secondKey" + "\u0004" + "2"; @@ -201,4 +428,356 @@ } } + + + /** + * Tests a single null list within a struct with return nulls on + */ + + public void testNulls1() throws Throwable { + try { + + + // Try to construct an object + ArrayList<String> bye = null; + HashMap<String, Integer> another = new HashMap<String, Integer>(); + another.put("firstKey", 1); + another.put("secondKey", 2); + ArrayList<Object> struct = new ArrayList<Object>(); + struct.add(Integer.valueOf(234)); + struct.add(bye); + struct.add(another); + + Properties schema = new Properties(); + schema.setProperty(Constants.SERIALIZATION_FORMAT, org.apache.hadoop.hive.serde2.thrift.TCTLSeparatedProtocol.class.getName()); + schema.setProperty(org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_NAME, "test"); + schema.setProperty(Constants.SERIALIZATION_DDL, + "struct test { i32 hello, list<string> bye, map<string,i32> another}"); + schema.setProperty(Constants.SERIALIZATION_LIB, new DynamicSerDe().getClass().toString()); + schema.setProperty(TCTLSeparatedProtocol.ReturnNullsKey, "true"); + + DynamicSerDe serde = new DynamicSerDe(); + serde.initialize(new Configuration(), schema); + + ObjectInspector oi = serde.getObjectInspector(); + + // Try to serialize + BytesWritable bytes = (BytesWritable) serde.serialize(struct, oi); + + hexString(bytes); + + // Try to deserialize + Object o = serde.deserialize(bytes); + assertEquals(struct, o); + + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + + } + + /** + * Tests all elements of a struct being null with return nulls on + */ + + public void testNulls2() throws Throwable { + try { + + + // Try to construct an object + ArrayList<String> bye = null; + HashMap<String, Integer> another = null; + ArrayList<Object> struct = new ArrayList<Object>(); + struct.add(null); + struct.add(bye); + struct.add(another); + + Properties schema = new Properties(); + schema.setProperty(Constants.SERIALIZATION_FORMAT, org.apache.hadoop.hive.serde2.thrift.TCTLSeparatedProtocol.class.getName()); + schema.setProperty(org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_NAME, "test"); + schema.setProperty(Constants.SERIALIZATION_DDL, + "struct test { i32 hello, list<string> bye, map<string,i32> another}"); + schema.setProperty(Constants.SERIALIZATION_LIB, new DynamicSerDe().getClass().toString()); + schema.setProperty(TCTLSeparatedProtocol.ReturnNullsKey, "true"); + + DynamicSerDe serde = new DynamicSerDe(); + serde.initialize(new Configuration(), schema); + + ObjectInspector oi = serde.getObjectInspector(); + + // Try to serialize + BytesWritable bytes = (BytesWritable) serde.serialize(struct, oi); + + hexString(bytes); + + // Try to deserialize + Object o = serde.deserialize(bytes); + List<?> olist = (List<?>)o; + + assertTrue(olist.size() == 3); + assertEquals(null, olist.get(0)); + assertEquals(null, olist.get(1)); + assertEquals(null, olist.get(2)); + + // assertEquals(o, struct); Cannot do this because types of null lists are wrong. + + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + + } + + /** + * Tests map and list being empty with return nulls on + */ + + public void testNulls3() throws Throwable { + try { + + + // Try to construct an object + ArrayList<String> bye = new ArrayList<String> (); + HashMap<String, Integer> another = null; + ArrayList<Object> struct = new ArrayList<Object>(); + struct.add(null); + struct.add(bye); + struct.add(another); + + Properties schema = new Properties(); + schema.setProperty(Constants.SERIALIZATION_FORMAT, org.apache.hadoop.hive.serde2.thrift.TCTLSeparatedProtocol.class.getName()); + schema.setProperty(org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_NAME, "test"); + schema.setProperty(Constants.SERIALIZATION_DDL, + "struct test { i32 hello, list<string> bye, map<string,i32> another}"); + schema.setProperty(Constants.SERIALIZATION_LIB, new DynamicSerDe().getClass().toString()); + + schema.setProperty(TCTLSeparatedProtocol.ReturnNullsKey, "true"); + DynamicSerDe serde = new DynamicSerDe(); + serde.initialize(new Configuration(), schema); + + ObjectInspector oi = serde.getObjectInspector(); + + // Try to serialize + BytesWritable bytes = (BytesWritable) serde.serialize(struct, oi); + + hexString(bytes); + + // Try to deserialize + Object o = serde.deserialize(bytes); + List<?> olist = (List<?>)o; + + assertTrue(olist.size() == 3); + assertEquals(null, olist.get(0)); + assertEquals(0, ((List<?>)olist.get(1)).size()); + assertEquals(null, olist.get(2)); + + // assertEquals(o, struct); Cannot do this because types of null lists are wrong. + + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + + } + + + /** + * Tests map and list null/empty with return nulls *off* + */ + + public void testNulls4() throws Throwable { + try { + + + // Try to construct an object + ArrayList<String> bye = new ArrayList<String> (); + HashMap<String, Integer> another = null; + ArrayList<Object> struct = new ArrayList<Object>(); + struct.add(null); + struct.add(bye); + struct.add(another); + + Properties schema = new Properties(); + schema.setProperty(Constants.SERIALIZATION_FORMAT, org.apache.hadoop.hive.serde2.thrift.TCTLSeparatedProtocol.class.getName()); + schema.setProperty(org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_NAME, "test"); + schema.setProperty(Constants.SERIALIZATION_DDL, + "struct test { i32 hello, list<string> bye, map<string,i32> another}"); + schema.setProperty(Constants.SERIALIZATION_LIB, new DynamicSerDe().getClass().toString()); + + schema.setProperty(TCTLSeparatedProtocol.ReturnNullsKey, "false"); + DynamicSerDe serde = new DynamicSerDe(); + serde.initialize(new Configuration(), schema); + + ObjectInspector oi = serde.getObjectInspector(); + + // Try to serialize + BytesWritable bytes = (BytesWritable) serde.serialize(struct, oi); + + hexString(bytes); + + // Try to deserialize + Object o = serde.deserialize(bytes); + List<?> olist = (List<?>)o; + + assertTrue(olist.size() == 3); + assertEquals(new Integer(0), (Integer)olist.get(0)); + List<?> num1 = (List<?>)olist.get(1); + assertTrue(num1.size() == 0); + Map<?,?> num2 = (Map<?,?>)olist.get(2); + assertTrue(num2.size() == 0); + + // assertEquals(o, struct); Cannot do this because types of null lists are wrong. + + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + + } + + + /** + * Tests map and list null/empty with return nulls *off* + */ + + public void testStructsinStructs() throws Throwable { + try { + + + Properties schema = new Properties(); + // schema.setProperty(Constants.SERIALIZATION_FORMAT, com.facebook.thrift.protocol.TJSONProtocol.class.getName()); + schema.setProperty(Constants.SERIALIZATION_FORMAT, com.facebook.thrift.protocol.TBinaryProtocol.class.getName()); + schema.setProperty(org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_NAME, "test"); + schema.setProperty(Constants.SERIALIZATION_DDL, + "struct inner { i32 field1, string field2 },struct test {inner foo, i32 hello, list<string> bye, map<string,i32> another}"); + schema.setProperty(Constants.SERIALIZATION_LIB, new DynamicSerDe().getClass().toString()); + + + // + // construct object of above type + // + + // construct the inner struct + ArrayList<Object> innerStruct = new ArrayList<Object>(); + innerStruct.add(new Integer(22)); + innerStruct.add(new String("hello world")); + + // construct outer struct + ArrayList<String> bye = new ArrayList<String> (); + bye.add("firstString"); + bye.add("secondString"); + HashMap<String, Integer> another = new HashMap<String, Integer>(); + another.put("firstKey", 1); + another.put("secondKey", 2); + + ArrayList<Object> struct = new ArrayList<Object>(); + + struct.add(innerStruct); + struct.add(Integer.valueOf(234)); + struct.add(bye); + struct.add(another); + + DynamicSerDe serde = new DynamicSerDe(); + serde.initialize(new Configuration(), schema); + + ObjectInspector oi = serde.getObjectInspector(); + + // Try to serialize + BytesWritable bytes = (BytesWritable) serde.serialize(struct, oi); + + // Try to deserialize + Object o = serde.deserialize(bytes); + List<?> olist = (List<?>)o; + + + assertEquals(4, olist.size()); + assertEquals(innerStruct, olist.get(0)); + assertEquals(new Integer(234), olist.get(1)); + assertEquals(bye, olist.get(2)); + assertEquals(another, olist.get(3)); + + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + + } + + + + + public void testSkip() throws Throwable { + try { + + // Try to construct an object + ArrayList<String> bye = new ArrayList<String>(); + bye.add("firstString"); + bye.add("secondString"); + HashMap<String, Integer> another = new HashMap<String, Integer>(); + another.put("firstKey", 1); + another.put("secondKey", 2); + ArrayList<Object> struct = new ArrayList<Object>(); + struct.add(Integer.valueOf(234)); + struct.add(bye); + struct.add(another); + + Properties schema = new Properties(); + schema.setProperty(Constants.SERIALIZATION_FORMAT, org.apache.hadoop.hive.serde2.thrift.TCTLSeparatedProtocol.class.getName()); + schema.setProperty(org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_NAME, "test"); + schema.setProperty(Constants.SERIALIZATION_DDL, + "struct test { i32 hello, list<string> bye, map<string,i32> another}"); + schema.setProperty(Constants.SERIALIZATION_LIB, new DynamicSerDe().getClass().toString()); + + schema.setProperty(Constants.FIELD_DELIM, "9"); + schema.setProperty(Constants.COLLECTION_DELIM, "1"); + schema.setProperty(Constants.LINE_DELIM, "2"); + schema.setProperty(Constants.MAPKEY_DELIM, "4"); + + DynamicSerDe serde = new DynamicSerDe(); + serde.initialize(new Configuration(), schema); + + TCTLSeparatedProtocol prot = (TCTLSeparatedProtocol)serde.oprot_; + assertTrue(prot.getPrimarySeparator().equals("\u0009")); + + ObjectInspector oi = serde.getObjectInspector(); + + // Try to serialize + BytesWritable bytes = (BytesWritable) serde.serialize(struct, oi); + + hexString(bytes); + + String compare = "234" + "\u0009" + "firstString" + "\u0001" + "secondString" + "\u0009" + "firstKey" + "\u0004" + "1" + "\u0001" + "secondKey" + "\u0004" + "2"; + + System.out.println("bytes in text =" + new String(bytes.get(), 0, bytes.getSize()) + ">"); + System.out.println("compare to =" + compare + ">"); + + assertTrue(compare.equals( new String(bytes.get(), 0, bytes.getSize()))); + + schema.setProperty(Constants.SERIALIZATION_DDL, + "struct test { i32 hello, skip list<string> bye, map<string,i32> another}"); + + serde.initialize(new Configuration(), schema); + + // Try to deserialize + Object o = serde.deserialize(bytes); + System.out.println("o class = " + o.getClass()); + List<?> olist = (List<?>)o; + System.out.println("o size = " + olist.size()); + System.out.println("o = " + o); + + assertEquals(null, olist.get(1)); + + // set the skipped field to null + struct.set(1,null); + + assertEquals(o, struct); + + } catch (Throwable e) { + e.printStackTrace(); + throw e; + } + + } + } Modified: hadoop/core/trunk/src/contrib/hive/serde/src/test/org/apache/hadoop/hive/serde2/objectinspector/TestStandardObjectInspectors.java URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hive/serde/src/test/org/apache/hadoop/hive/serde2/objectinspector/TestStandardObjectInspectors.java?rev=712905&r1=712904&r2=712905&view=diff ============================================================================== --- hadoop/core/trunk/src/contrib/hive/serde/src/test/org/apache/hadoop/hive/serde2/objectinspector/TestStandardObjectInspectors.java (original) +++ hadoop/core/trunk/src/contrib/hive/serde/src/test/org/apache/hadoop/hive/serde2/objectinspector/TestStandardObjectInspectors.java Mon Nov 10 17:50:06 2008 @@ -34,7 +34,7 @@ assertEquals(oi1, oi2); assertEquals(Category.PRIMITIVE, oi1.getCategory()); assertEquals(c, oi1.getPrimitiveClass()); - assertEquals(ObjectInspectorUtils.getClassShortName(c.getName()), + assertEquals(ObjectInspectorUtils.getClassShortName(c), oi1.getTypeName()); } catch (Throwable e) { e.printStackTrace();
