Luis Gonzalez created HIVE-12955:
------------------------------------
Summary: avro.schema.literal doesn't support more than 50 fields.
Key: HIVE-12955
URL: https://issues.apache.org/jira/browse/HIVE-12955
Project: Hive
Issue Type: Bug
Components: Hive
Affects Versions: 1.0.0
Reporter: Luis Gonzalez
Priority: Minor
Hi!
We think we have hit a bug. We have tested this in many ways for a couple of
hours now, and there is apparently a problem with avro.schema.literal when you
specify more than 50 fields. In our tests we have found that, regardless of the
Avro file (table) we want to load, it fails with 60 fields. Some tables we use
have more than 400 fields and have the same problem.
For instance, if we run the following commands, the table is created and the
query succeeds:
```
hive> drop table tableName;
OK
Time taken: 0.162 seconds
hive>
> CREATE EXTERNAL TABLE tableName
> ROW FORMAT
> SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
> WITH SERDEPROPERTIES ('avro.schema.literal'='
> { "namespace": "cdr.avro", "type": "record", "name": "Account", "fields":
[
> {"name": "Id", "type":["null", "string"],"default":null},
> {"name": "IsDeleted", "type":["null", "boolean"],"default":null},
> {"name": "MasterRecordId", "type":["null", "string"],"default":null},
> {"name": "Name", "type":["null", "string"],"default":null},
> {"name": "Type", "type":["null", "string"],"default":null},
> {"name": "RecordTypeId", "type":["null", "string"],"default":null},
> {"name": "ParentId", "type":["null", "string"],"default":null},
> {"name": "Phone", "type":["null", "string"],"default":null},
> {"name": "Fax", "type":["null", "string"],"default":null},
> {"name": "AccountNumber", "type":["null", "string"],"default":null},
> {"name": "Website", "type":["null", "string"],"default":null},
> {"name": "Industry", "type":["null", "string"],"default":null},
> {"name": "AnnualRevenue", "type":["null", "double"],"default":null},
> {"name": "NumberOfEmployees", "type":["null", "int"],"default":null},
> {"name": "Description", "type":["null", "string"],"default":null},
> {"name": "OwnerId", "type":["null", "string"],"default":null},
> {"name": "CreatedDate", "type":["null", "string"],"default":null},
> {"name": "CreatedById", "type":["null", "string"],"default":null},
> {"name": "LastModifiedDate", "type":["null", "string"],"default":null},
> {"name": "LastModifiedById", "type":["null", "string"],"default":null},
> {"name": "SystemModstamp", "type":["null", "string"],"default":null},
> {"name": "LastActivityDate", "type":["null", "string"],"default":null},
> {"name": "IsPartner", "type":["null", "boolean"],"default":null},
> {"name": "IsCustomerPortal", "type":["null", "boolean"],"default":null},
> {"name": "JigsawCompanyId", "type":["null", "string"],"default":null},
> {"name": "Invoice_Level__c", "type":["null", "string"],"default":null},
> {"name": "IT_Developer_Fee__c", "type":["null",
"boolean"],"default":null},
> {"name": "Customer_Type__c", "type":["null", "string"],"default":null},
> {"name": "Shortname__c", "type":["null", "string"],"default":null},
> {"name": "Excluir_compensacion_por_desvio__c", "type":["null",
"boolean"],"default":null},
> {"name": "Commercial_Area__c", "type":["null", "string"],"default":null},
> {"name": "Account_Status__c", "type":["null", "string"],"default":null},
> {"name": "Active_Fiscal_Details__c", "type":["null",
"boolean"],"default":null},
> {"name": "Office_Code__c", "type":["null", "string"],"default":null},
> {"name": "Commercial_Brand__c", "type":["null",
"string"],"default":null},
> {"name": "Agreed_payment_method__c", "type":["null",
"string"],"default":null},
> {"name": "Division__c", "type":["null", "string"],"default":null},
> {"name": "Inactive_Date__c", "type":["null", "string"],"default":null},
> {"name": "SAP_Code__c", "type":["null", "string"],"default":null},
> {"name": "Country_fiscal__c", "type":["null", "string"],"default":null},
> {"name": "Fiscal_Number_1__c", "type":["null", "string"],"default":null},
> {"name": "Fiscal_Number_2__c", "type":["null", "string"],"default":null},
> {"name": "Street_Fiscal__c", "type":["null", "string"],"default":null},
> {"name": "City_fiscal__c", "type":["null", "string"],"default":null},
> {"name": "Post_Code_fiscal__c", "type":["null",
"string"],"default":null},
> {"name": "Web_Prepayment__c", "type":["null", "boolean"],"default":null},
> {"name": "Customer_Subtype__c", "type":["null",
"string"],"default":null},
> {"name": "Relationship__c", "type":["null", "string"],"default":null},
> {"name": "Market_Country__c", "type":["null", "string"],"default":null}
> ] }
> ')
> STORED AS
> INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
> OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
> LOCATION 's3://bucket.../path/to/avro';
OK
Time taken: 0.412 seconds
hive>
> select * from tableName limit 10;
OK
```
But when we use the same Avro file and add more fields to the schema literal,
the query fails:
```
hive> drop table tableName;
OK
Time taken: 0.146 seconds
hive>
> CREATE EXTERNAL TABLE tableName
> ROW FORMAT
> SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
> WITH SERDEPROPERTIES ('avro.schema.literal'='
> { "namespace": "cdr.avro", "type": "record", "name": "Account", "fields":
[
> {"name": "Id", "type":["null", "string"],"default":null},
> {"name": "IsDeleted", "type":["null", "boolean"],"default":null},
> {"name": "MasterRecordId", "type":["null", "string"],"default":null},
> {"name": "Name", "type":["null", "string"],"default":null},
> {"name": "Type", "type":["null", "string"],"default":null},
> {"name": "RecordTypeId", "type":["null", "string"],"default":null},
> {"name": "ParentId", "type":["null", "string"],"default":null},
> {"name": "Phone", "type":["null", "string"],"default":null},
> {"name": "Fax", "type":["null", "string"],"default":null},
> {"name": "AccountNumber", "type":["null", "string"],"default":null},
> {"name": "Website", "type":["null", "string"],"default":null},
> {"name": "Industry", "type":["null", "string"],"default":null},
> {"name": "AnnualRevenue", "type":["null", "double"],"default":null},
> {"name": "NumberOfEmployees", "type":["null", "int"],"default":null},
> {"name": "Description", "type":["null", "string"],"default":null},
> {"name": "OwnerId", "type":["null", "string"],"default":null},
> {"name": "CreatedDate", "type":["null", "string"],"default":null},
> {"name": "CreatedById", "type":["null", "string"],"default":null},
> {"name": "LastModifiedDate", "type":["null", "string"],"default":null},
> {"name": "LastModifiedById", "type":["null", "string"],"default":null},
> {"name": "SystemModstamp", "type":["null", "string"],"default":null},
> {"name": "LastActivityDate", "type":["null", "string"],"default":null},
> {"name": "IsPartner", "type":["null", "boolean"],"default":null},
> {"name": "IsCustomerPortal", "type":["null", "boolean"],"default":null},
> {"name": "JigsawCompanyId", "type":["null", "string"],"default":null},
> {"name": "Invoice_Level__c", "type":["null", "string"],"default":null},
> {"name": "IT_Developer_Fee__c", "type":["null",
"boolean"],"default":null},
> {"name": "Customer_Type__c", "type":["null", "string"],"default":null},
> {"name": "Shortname__c", "type":["null", "string"],"default":null},
> {"name": "Excluir_compensacion_por_desvio__c", "type":["null",
"boolean"],"default":null},
> {"name": "Commercial_Area__c", "type":["null", "string"],"default":null},
> {"name": "Account_Status__c", "type":["null", "string"],"default":null},
> {"name": "Active_Fiscal_Details__c", "type":["null",
"boolean"],"default":null},
> {"name": "Office_Code__c", "type":["null", "string"],"default":null},
> {"name": "Commercial_Brand__c", "type":["null",
"string"],"default":null},
> {"name": "Agreed_payment_method__c", "type":["null",
"string"],"default":null},
> {"name": "Division__c", "type":["null", "string"],"default":null},
> {"name": "Inactive_Date__c", "type":["null", "string"],"default":null},
> {"name": "SAP_Code__c", "type":["null", "string"],"default":null},
> {"name": "Country_fiscal__c", "type":["null", "string"],"default":null},
> {"name": "Fiscal_Number_1__c", "type":["null", "string"],"default":null},
> {"name": "Fiscal_Number_2__c", "type":["null", "string"],"default":null},
> {"name": "Street_Fiscal__c", "type":["null", "string"],"default":null},
> {"name": "City_fiscal__c", "type":["null", "string"],"default":null},
> {"name": "Post_Code_fiscal__c", "type":["null",
"string"],"default":null},
> {"name": "Web_Prepayment__c", "type":["null", "boolean"],"default":null},
> {"name": "Customer_Subtype__c", "type":["null",
"string"],"default":null},
> {"name": "Relationship__c", "type":["null", "string"],"default":null},
> {"name": "Market_Country__c", "type":["null", "string"],"default":null},
> {"name": "Customer_Service_Centre__c", "type":["null",
"string"],"default":null},
> {"name": "Acquisition_Channel_Type__c", "type":["null",
"string"],"default":null},
> {"name": "Acquisition_Channel_Description__c", "type":["null",
"string"],"default":null},
> {"name": "Comments__c", "type":["null", "string"],"default":null},
> {"name": "Street_Commercial__c", "type":["null",
"string"],"default":null},
> {"name": "Country_Commercial__c", "type":["null",
"string"],"default":null},
> {"name": "City_Commercial__c", "type":["null", "string"],"default":null},
> {"name": "Post_Code_Commercial__c", "type":["null",
"string"],"default":null},
> {"name": "Atlas_Branch_Number__c", "type":["null",
"double"],"default":null},
> {"name": "Timezone__c", "type":["null", "string"],"default":null},
> {"name": "Billing_Language__c", "type":["null",
"string"],"default":null},
> {"name": "Fiscal_Name__c", "type":["null", "string"],"default":null}
> ] }
> ')
> STORED AS
> INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
> OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
> LOCATION 's3://bucket/path/to/avro';
OK
Time taken: 0.48 seconds
hive>
> select * from tableName limit 10;
OK
Failed with exception java.io.IOException:org.apache.avro.AvroTypeException:
Found cdr.avro.Account, expecting
org.apache.hadoop.hive.CannotDetermineSchemaSentinel
Time taken: 0.028 seconds
```
This problem does not occur when we store the Avro schema with 400 fields in a
file in S3 and reference it through the avro.schema.url property instead.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)