Author: xuefu
Date: Wed Sep 24 16:39:04 2014
New Revision: 1627364
URL: http://svn.apache.org/r1627364
Log:
HIVE-8224: Support Char, Varchar in AvroSerDe (Mohit via Xuefu)
Added:
hive/trunk/data/files/avro_charvarchar.txt
hive/trunk/ql/src/test/queries/clientpositive/avro_charvarchar.q
hive/trunk/ql/src/test/results/clientpositive/avro_charvarchar.q.out
Modified:
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroDeserializer.java
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerDe.java
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerializer.java
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/SchemaToTypeInfo.java
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java
hive/trunk/serde/src/test/org/apache/hadoop/hive/serde2/avro/TestTypeInfoToSchema.java
hive/trunk/serde/src/test/resources/avro-struct.avsc
Added: hive/trunk/data/files/avro_charvarchar.txt
URL:
http://svn.apache.org/viewvc/hive/trunk/data/files/avro_charvarchar.txt?rev=1627364&view=auto
==============================================================================
--- hive/trunk/data/files/avro_charvarchar.txt (added)
+++ hive/trunk/data/files/avro_charvarchar.txt Wed Sep 24 16:39:04 2014
@@ -0,0 +1,4 @@
+a |a |k1:v1|101,x200|10,abcdef
+ab|ab |k2:v123456|102,y200|10,abc
+abc|abc|k3:v1234|103,200|10,a
+abcdefghijklm|abcdefghijklmnop|k9:v12|109,200|10, abcdef
Added: hive/trunk/ql/src/test/queries/clientpositive/avro_charvarchar.q
URL:
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/avro_charvarchar.q?rev=1627364&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/avro_charvarchar.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/avro_charvarchar.q Wed Sep 24
16:39:04 2014
@@ -0,0 +1,27 @@
+DROP TABLE avro_charvarchar_staging;
+DROP TABLE avro_charvarchar;
+
+CREATE TABLE avro_charvarchar_staging (
+ cchar char(5),
+ cvarchar varchar(10),
+ m1 map<string, char(2)>,
+ l1 array<string>,
+ st1 struct<c1:int, c2:varchar(4)>
+) ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|'
+COLLECTION ITEMS TERMINATED BY ','
+MAP KEYS TERMINATED BY ':';
+
+CREATE TABLE avro_charvarchar (
+ cchar char(5),
+ cvarchar varchar(10),
+ m1 map<string, char(2)>,
+ l1 array<string>,
+ st1 struct<c1:int, c2:varchar(4)>
+) STORED AS AVRO;
+
+LOAD DATA LOCAL INPATH '../../data/files/avro_charvarchar.txt' OVERWRITE INTO TABLE avro_charvarchar_staging;
+
+INSERT OVERWRITE TABLE avro_charvarchar SELECT * FROM avro_charvarchar_staging;
+
+SELECT * FROM avro_charvarchar;
Added: hive/trunk/ql/src/test/results/clientpositive/avro_charvarchar.q.out
URL:
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/avro_charvarchar.q.out?rev=1627364&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/avro_charvarchar.q.out (added)
+++ hive/trunk/ql/src/test/results/clientpositive/avro_charvarchar.q.out Wed
Sep 24 16:39:04 2014
@@ -0,0 +1,87 @@
+PREHOOK: query: DROP TABLE avro_charvarchar_staging
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE avro_charvarchar_staging
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: DROP TABLE avro_charvarchar
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE avro_charvarchar
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE avro_charvarchar_staging (
+ cchar char(5),
+ cvarchar varchar(10),
+ m1 map<string, char(2)>,
+ l1 array<string>,
+ st1 struct<c1:int, c2:varchar(4)>
+) ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|'
+COLLECTION ITEMS TERMINATED BY ','
+MAP KEYS TERMINATED BY ':'
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@avro_charvarchar_staging
+POSTHOOK: query: CREATE TABLE avro_charvarchar_staging (
+ cchar char(5),
+ cvarchar varchar(10),
+ m1 map<string, char(2)>,
+ l1 array<string>,
+ st1 struct<c1:int, c2:varchar(4)>
+) ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|'
+COLLECTION ITEMS TERMINATED BY ','
+MAP KEYS TERMINATED BY ':'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@avro_charvarchar_staging
+PREHOOK: query: CREATE TABLE avro_charvarchar (
+ cchar char(5),
+ cvarchar varchar(10),
+ m1 map<string, char(2)>,
+ l1 array<string>,
+ st1 struct<c1:int, c2:varchar(4)>
+) STORED AS AVRO
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@avro_charvarchar
+POSTHOOK: query: CREATE TABLE avro_charvarchar (
+ cchar char(5),
+ cvarchar varchar(10),
+ m1 map<string, char(2)>,
+ l1 array<string>,
+ st1 struct<c1:int, c2:varchar(4)>
+) STORED AS AVRO
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@avro_charvarchar
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/avro_charvarchar.txt' OVERWRITE INTO TABLE avro_charvarchar_staging
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@avro_charvarchar_staging
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/avro_charvarchar.txt' OVERWRITE INTO TABLE avro_charvarchar_staging
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@avro_charvarchar_staging
+PREHOOK: query: INSERT OVERWRITE TABLE avro_charvarchar SELECT * FROM avro_charvarchar_staging
+PREHOOK: type: QUERY
+PREHOOK: Input: default@avro_charvarchar_staging
+PREHOOK: Output: default@avro_charvarchar
+POSTHOOK: query: INSERT OVERWRITE TABLE avro_charvarchar SELECT * FROM avro_charvarchar_staging
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@avro_charvarchar_staging
+POSTHOOK: Output: default@avro_charvarchar
+POSTHOOK: Lineage: avro_charvarchar.cchar SIMPLE [(avro_charvarchar_staging)avro_charvarchar_staging.FieldSchema(name:cchar, type:char(5), comment:null), ]
+POSTHOOK: Lineage: avro_charvarchar.cvarchar SIMPLE [(avro_charvarchar_staging)avro_charvarchar_staging.FieldSchema(name:cvarchar, type:varchar(10), comment:null), ]
+POSTHOOK: Lineage: avro_charvarchar.l1 SIMPLE [(avro_charvarchar_staging)avro_charvarchar_staging.FieldSchema(name:l1, type:array<string>, comment:null), ]
+POSTHOOK: Lineage: avro_charvarchar.m1 SIMPLE [(avro_charvarchar_staging)avro_charvarchar_staging.FieldSchema(name:m1, type:map<string,char(2)>, comment:null), ]
+POSTHOOK: Lineage: avro_charvarchar.st1 SIMPLE [(avro_charvarchar_staging)avro_charvarchar_staging.FieldSchema(name:st1, type:struct<c1:int,c2:varchar(4)>, comment:null), ]
+PREHOOK: query: SELECT * FROM avro_charvarchar
+PREHOOK: type: QUERY
+PREHOOK: Input: default@avro_charvarchar
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM avro_charvarchar
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@avro_charvarchar
+#### A masked pattern was here ####
+a a {"k1":"v1"} ["101","x200"] {"c1":10,"c2":"abcd"}
+ab ab {"k2":"v1"} ["102","y200"] {"c1":10,"c2":"abc"}
+abc abc {"k3":"v1"} ["103","200"] {"c1":10,"c2":"a "}
+abcde abcdefghij {"k9":"v1"} ["109","200"] {"c1":10,"c2":" a"}
Modified:
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroDeserializer.java
URL:
http://svn.apache.org/viewvc/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroDeserializer.java?rev=1627364&r1=1627363&r2=1627364&view=diff
==============================================================================
---
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroDeserializer.java
(original)
+++
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroDeserializer.java
Wed Sep 24 16:39:04 2014
@@ -42,7 +42,9 @@ import org.apache.avro.io.EncoderFactory
import org.apache.avro.util.Utf8;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.common.type.HiveChar;
import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.common.type.HiveVarchar;
import
org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector;
import
org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaHiveDecimalObjectInspector;
import
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
@@ -200,7 +202,6 @@ class AvroDeserializer {
return deserializeNullableUnion(datum, fileSchema, recordSchema,
columnType);
}
-
switch(columnType.getCategory()) {
case STRUCT:
return deserializeStruct((GenericData.Record) datum, fileSchema,
(StructTypeInfo) columnType);
@@ -249,6 +250,36 @@ class AvroDeserializer {
JavaHiveDecimalObjectInspector oi = (JavaHiveDecimalObjectInspector)
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector((DecimalTypeInfo)columnType);
return oi.set(null, dec);
+ case CHAR:
+ if (fileSchema == null) {
+ throw new AvroSerdeException("File schema is missing for char field. Reader schema is " + columnType);
+ }
+
+ int maxLength = 0;
+ try {
+ maxLength = fileSchema.getJsonProp(AvroSerDe.AVRO_PROP_MAX_LENGTH).getValueAsInt();
+ } catch (Exception ex) {
+ throw new AvroSerdeException("Failed to obtain maxLength value for char field from file schema: " + fileSchema, ex);
+ }
+
+ String str = datum.toString();
+ HiveChar hc = new HiveChar(str, maxLength);
+ return hc;
+ case VARCHAR:
+ if (fileSchema == null) {
+ throw new AvroSerdeException("File schema is missing for varchar field. Reader schema is " + columnType);
+ }
+
+ maxLength = 0;
+ try {
+ maxLength = fileSchema.getJsonProp(AvroSerDe.AVRO_PROP_MAX_LENGTH).getValueAsInt();
+ } catch (Exception ex) {
+ throw new AvroSerdeException("Failed to obtain maxLength value for varchar field from file schema: " + fileSchema, ex);
+ }
+
+ str = datum.toString();
+ HiveVarchar hvc = new HiveVarchar(str, maxLength);
+ return hvc;
default:
return datum;
}
Modified:
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerDe.java
URL:
http://svn.apache.org/viewvc/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerDe.java?rev=1627364&r1=1627363&r2=1627364&view=diff
==============================================================================
--- hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerDe.java
(original)
+++ hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerDe.java
Wed Sep 24 16:39:04 2014
@@ -41,9 +41,13 @@ public class AvroSerDe extends AbstractS
private static final Log LOG = LogFactory.getLog(AvroSerDe.class);
public static final String DECIMAL_TYPE_NAME = "decimal";
+ public static final String CHAR_TYPE_NAME = "char";
+ public static final String VARCHAR_TYPE_NAME = "varchar";
public static final String AVRO_PROP_LOGICAL_TYPE = "logicalType";
public static final String AVRO_PROP_PRECISION = "precision";
public static final String AVRO_PROP_SCALE = "scale";
+ public static final String AVRO_PROP_MAX_LENGTH = "maxLength";
+ public static final String AVRO_STRING_TYPE_NAME = "string";
private ObjectInspector oi;
private List<String> columnNames;
Modified:
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerializer.java
URL:
http://svn.apache.org/viewvc/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerializer.java?rev=1627364&r1=1627363&r2=1627364&view=diff
==============================================================================
---
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerializer.java
(original)
+++
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroSerializer.java
Wed Sep 24 16:39:04 2014
@@ -30,7 +30,9 @@ import org.apache.avro.generic.GenericDa
import org.apache.avro.generic.GenericEnumSymbol;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.common.type.HiveChar;
import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.common.type.HiveVarchar;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
@@ -193,6 +195,12 @@ class AvroSerializer {
case DECIMAL:
HiveDecimal dec =
(HiveDecimal)fieldOI.getPrimitiveJavaObject(structFieldData);
return AvroSerdeUtils.getBufferFromDecimal(dec,
((DecimalTypeInfo)typeInfo).scale());
+ case CHAR:
+ HiveChar ch = (HiveChar)fieldOI.getPrimitiveJavaObject(structFieldData);
+ return ch.getStrippedValue();
+ case VARCHAR:
+ HiveVarchar vc = (HiveVarchar)fieldOI.getPrimitiveJavaObject(structFieldData);
+ return vc.getValue();
case UNKNOWN:
throw new AvroSerdeException("Received UNKNOWN primitive category.");
case VOID:
Modified:
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/SchemaToTypeInfo.java
URL:
http://svn.apache.org/viewvc/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/SchemaToTypeInfo.java?rev=1627364&r1=1627363&r2=1627364&view=diff
==============================================================================
---
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/SchemaToTypeInfo.java
(original)
+++
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/SchemaToTypeInfo.java
Wed Sep 24 16:39:04 2014
@@ -128,6 +128,28 @@ class SchemaToTypeInfo {
return TypeInfoFactory.getDecimalTypeInfo(precision, scale);
}
+ if (type == Schema.Type.STRING &&
+ AvroSerDe.CHAR_TYPE_NAME.equalsIgnoreCase(schema.getProp(AvroSerDe.AVRO_PROP_LOGICAL_TYPE))) {
+ int maxLength = 0;
+ try {
+ maxLength = schema.getJsonProp(AvroSerDe.AVRO_PROP_MAX_LENGTH).getValueAsInt();
+ } catch (Exception ex) {
+ throw new AvroSerdeException("Failed to obtain maxLength value from file schema: " + schema, ex);
+ }
+ return TypeInfoFactory.getCharTypeInfo(maxLength);
+ }
+
+ if (type == Schema.Type.STRING &&
+ AvroSerDe.VARCHAR_TYPE_NAME.equalsIgnoreCase(schema.getProp(AvroSerDe.AVRO_PROP_LOGICAL_TYPE))) {
+ int maxLength = 0;
+ try {
+ maxLength = schema.getJsonProp(AvroSerDe.AVRO_PROP_MAX_LENGTH).getValueAsInt();
+ } catch (Exception ex) {
+ throw new AvroSerdeException("Failed to obtain maxLength value from file schema: " + schema, ex);
+ }
+ return TypeInfoFactory.getVarcharTypeInfo(maxLength);
+ }
+
return typeInfoCache.retrieve(schema);
}
Modified:
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java
URL:
http://svn.apache.org/viewvc/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java?rev=1627364&r1=1627363&r2=1627364&view=diff
==============================================================================
---
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java
(original)
+++
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/avro/TypeInfoToSchema.java
Wed Sep 24 16:39:04 2014
@@ -19,6 +19,7 @@ package org.apache.hadoop.hive.serde2.av
import org.apache.avro.Schema;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
@@ -26,6 +27,7 @@ import org.apache.hadoop.hive.serde2.typ
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.node.JsonNodeFactory;
@@ -105,10 +107,16 @@ public class TypeInfoToSchema {
schema = Schema.create(Schema.Type.STRING);
break;
case CHAR:
- schema = Schema.create(Schema.Type.STRING);
+ schema = AvroSerdeUtils.getSchemaFor("{" +
+ "\"type\":\"" + AvroSerDe.AVRO_STRING_TYPE_NAME + "\"," +
+ "\"logicalType\":\"" + AvroSerDe.CHAR_TYPE_NAME + "\"," +
+ "\"maxLength\":" + ((CharTypeInfo) typeInfo).getLength() + "}");
break;
case VARCHAR:
- schema = Schema.create(Schema.Type.STRING);
+ schema = AvroSerdeUtils.getSchemaFor("{" +
+ "\"type\":\"" + AvroSerDe.AVRO_STRING_TYPE_NAME + "\"," +
+ "\"logicalType\":\"" + AvroSerDe.VARCHAR_TYPE_NAME + "\"," +
+ "\"maxLength\":" + ((VarcharTypeInfo) typeInfo).getLength() + "}");
break;
case BINARY:
schema = Schema.create(Schema.Type.BYTES);
Modified:
hive/trunk/serde/src/test/org/apache/hadoop/hive/serde2/avro/TestTypeInfoToSchema.java
URL:
http://svn.apache.org/viewvc/hive/trunk/serde/src/test/org/apache/hadoop/hive/serde2/avro/TestTypeInfoToSchema.java?rev=1627364&r1=1627363&r2=1627364&view=diff
==============================================================================
---
hive/trunk/serde/src/test/org/apache/hadoop/hive/serde2/avro/TestTypeInfoToSchema.java
(original)
+++
hive/trunk/serde/src/test/org/apache/hadoop/hive/serde2/avro/TestTypeInfoToSchema.java
Wed Sep 24 16:39:04 2014
@@ -205,6 +205,30 @@ public class TestTypeInfoToSchema {
}
@Test
+ public void createAvroCharSchema() {
+ final String specificSchema = "{" +
+ "\"type\":\"string\"," +
+ "\"logicalType\":\"char\"," +
+ "\"maxLength\":" + CHAR_LEN + "}";
+ String expectedSchema = genSchema(specificSchema);
+
+ Assert.assertEquals("Test for char's avro schema failed",
+ expectedSchema, getAvroSchemaString(CHAR));
+ }
+
+ @Test
+ public void createAvroVarcharSchema() {
+ final String specificSchema = "{" +
+ "\"type\":\"string\"," +
+ "\"logicalType\":\"varchar\"," +
+ "\"maxLength\":" + CHAR_LEN + "}";
+ String expectedSchema = genSchema(specificSchema);
+
+ Assert.assertEquals("Test for varchar's avro schema failed",
+ expectedSchema, getAvroSchemaString(VARCHAR));
+ }
+
+ @Test
public void createAvroListSchema() {
ListTypeInfo listTypeInfo = new ListTypeInfo();
listTypeInfo.setListElementTypeInfo(STRING);
Modified: hive/trunk/serde/src/test/resources/avro-struct.avsc
URL:
http://svn.apache.org/viewvc/hive/trunk/serde/src/test/resources/avro-struct.avsc?rev=1627364&r1=1627363&r2=1627364&view=diff
==============================================================================
--- hive/trunk/serde/src/test/resources/avro-struct.avsc (original)
+++ hive/trunk/serde/src/test/resources/avro-struct.avsc Wed Sep 24 16:39:04
2014
@@ -7,8 +7,8 @@ field6:smallint,field7:int,field8:bigint
field12:decimal(4,2),field13:void>",
"fields":[
{"name":"field1","type":["null","string"],"doc":"string","default":null},
-{"name":"field2","type":["null","string"],"doc":"char(5)","default":null},
-{"name":"field3","type":["null","string"],"doc":"varchar(5)","default":null},
+{"name":"field2","type":["null",{"type":"string","logicalType":"char","maxLength":5}],"doc":"char(5)","default":null},
+{"name":"field3","type":["null",{"type":"string","logicalType":"varchar","maxLength":5}],"doc":"varchar(5)","default":null},
{"name":"field4","type":["null","bytes"],"doc":"binary","default":null},
{"name":"field5","type":["null","int"],"doc":"tinyint","default":null},
{"name":"field6","type":["null","int"],"doc":"smallint","default":null},
@@ -21,4 +21,4 @@ field12:decimal(4,2),field13:void>",
"scale":2}],"doc":"decimal(4,2)","default":null},
{"name":"field13","type":"null","doc":"void","default":null}
]
-}
\ No newline at end of file
+}