This is an automated email from the ASF dual-hosted git repository.
suvasude pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-gobblin.git
The following commit(s) were added to refs/heads/master by this push:
new f60409e
[GOBBLIN-924][GOBBLIN-922][GOBBLIN-909][GOBBLIN-923][GOBBLIN-906][GOBBLIN-921]
Get rid of orc.schema.literal in ORC-ingestion and registration
f60409e is described below
commit f60409ef0b6768bf46ddd137333d8d56981798fc
Author: Zihan Li <[email protected]>
AuthorDate: Thu Oct 31 13:30:34 2019 -0700
[GOBBLIN-924][GOBBLIN-922][GOBBLIN-909][GOBBLIN-923][GOBBLIN-906][GOBBLIN-921]
Get rid of orc.schema.literal in ORC-ingestion and registration
Closes #2780 from ZihanLi58/GOBBLIN-924
---
.../gobblin/hive/orc/HiveOrcSerDeManager.java | 22 ++++++++++------------
.../gobblin/hive/orc/HiveOrcSerDeManagerTest.java | 17 +++++++++++------
2 files changed, 21 insertions(+), 18 deletions(-)
diff --git
a/gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManager.java
b/gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManager.java
index 7fdceba..f20f962 100644
---
a/gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManager.java
+++
b/gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManager.java
@@ -61,8 +61,6 @@ import org.apache.gobblin.util.HadoopUtils;
*/
@Slf4j
public class HiveOrcSerDeManager extends HiveSerDeManager {
- // Schema is in the format of TypeDescriptor
- public static final String SCHEMA_LITERAL = "orc.schema.literal";
// Extensions of files containing ORC data
public static final String FILE_EXTENSIONS_KEY =
"hiveOrcSerdeManager.fileExtensions";
@@ -114,10 +112,13 @@ public class HiveOrcSerDeManager extends HiveSerDeManager
{
}
@Override
+ //Using LIST_COLUMNS and LIST_COLUMN_TYPES to compare schema
public boolean haveSameSchema(HiveRegistrationUnit unit1,
HiveRegistrationUnit unit2)
throws IOException {
- if (unit1.getSerDeProps().contains(SCHEMA_LITERAL) &&
unit2.getSerDeProps().contains(SCHEMA_LITERAL)) {
- return
unit1.getSerDeProps().getProp(SCHEMA_LITERAL).equals(unit2.getSerDeProps().getProp(SCHEMA_LITERAL));
+ if (unit1.getSerDeProps().contains(serdeConstants.LIST_COLUMNS) &&
unit2.getSerDeProps().contains(serdeConstants.LIST_COLUMNS)
+ && unit1.getSerDeProps().contains(serdeConstants.LIST_COLUMN_TYPES) &&
unit2.getSerDeProps().contains(serdeConstants.LIST_COLUMN_TYPES)) {
+ return
unit1.getSerDeProps().getProp(serdeConstants.LIST_COLUMNS).equals(unit2.getSerDeProps().getProp(serdeConstants.LIST_COLUMNS))
+ &&
unit1.getSerDeProps().getProp(serdeConstants.LIST_COLUMN_TYPES).equals(unit2.getSerDeProps().getProp(serdeConstants.LIST_COLUMN_TYPES));
} else {
return false;
}
@@ -152,18 +153,18 @@ public class HiveOrcSerDeManager extends HiveSerDeManager
{
if (source.getOutputFormat().isPresent()) {
target.setOutputFormat(source.getOutputFormat().get());
}
- if (source.getSerDeProps().contains(SCHEMA_LITERAL)) {
- target.setSerDeProp(SCHEMA_LITERAL,
source.getSerDeProps().getProp(SCHEMA_LITERAL));
- }
}
@Override
public void updateSchema(HiveRegistrationUnit existingUnit,
HiveRegistrationUnit newUnit)
throws IOException {
Preconditions.checkArgument(
- newUnit.getSerDeProps().contains(SCHEMA_LITERAL));
+ newUnit.getSerDeProps().contains(serdeConstants.LIST_COLUMNS));
+ Preconditions.checkArgument(
+ newUnit.getSerDeProps().contains(serdeConstants.LIST_COLUMN_TYPES));
- existingUnit.setSerDeProp(SCHEMA_LITERAL,
newUnit.getSerDeProps().getProp(SCHEMA_LITERAL));
+ existingUnit.setSerDeProp(serdeConstants.LIST_COLUMNS,
newUnit.getSerDeProps().getProp(serdeConstants.LIST_COLUMNS));
+ existingUnit.setSerDeProp(serdeConstants.LIST_COLUMN_TYPES,
newUnit.getSerDeProps().getProp(serdeConstants.LIST_COLUMN_TYPES));
}
/**
@@ -261,14 +262,11 @@ public class HiveOrcSerDeManager extends HiveSerDeManager
{
* org.apache.hadoop.hive.serde.serdeConstants#LIST_COLUMNS and
* org.apache.hadoop.hive.serde.serdeConstants#LIST_COLUMN_TYPES
*
- * Keeping {@link #SCHEMA_LITERAL} will be a nice-to-have thing but not
actually necessary in terms of functionality.
*/
protected void addSchemaPropertiesHelper(Path path, HiveRegistrationUnit
hiveUnit) throws IOException {
TypeInfo schema = getSchemaFromLatestFile(path, this.fs);
if (schema instanceof StructTypeInfo) {
StructTypeInfo structTypeInfo = (StructTypeInfo) schema;
-
- hiveUnit.setSerDeProp(SCHEMA_LITERAL, schema);
hiveUnit.setSerDeProp(serdeConstants.LIST_COLUMNS,
Joiner.on(",").join(structTypeInfo.getAllStructFieldNames()));
hiveUnit.setSerDeProp(serdeConstants.LIST_COLUMN_TYPES,
diff --git
a/gobblin-hive-registration/src/test/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManagerTest.java
b/gobblin-hive-registration/src/test/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManagerTest.java
index 782469e..a5b2ee5 100644
---
a/gobblin-hive-registration/src/test/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManagerTest.java
+++
b/gobblin-hive-registration/src/test/java/org/apache/gobblin/hive/orc/HiveOrcSerDeManagerTest.java
@@ -81,8 +81,6 @@ public class HiveOrcSerDeManagerTest {
manager.addSerDeProperties(this.testRegisterPath, registrationUnit);
-
Assert.assertTrue(registrationUnit.getSerDeProps().getProp(HiveOrcSerDeManager.SCHEMA_LITERAL).contains(
- "name:string,timestamp:bigint"));
List<String> columns =
Arrays.asList(registrationUnit.getSerDeProps().getProp(serdeConstants.LIST_COLUMNS).split(","));
Assert.assertTrue(columns.get(0).equals("name"));
@@ -105,8 +103,7 @@ public class HiveOrcSerDeManagerTest {
manager.addSerDeProperties(this.testRegisterPath, registrationUnit);
-
Assert.assertTrue(registrationUnit.getSerDeProps().getProp(HiveOrcSerDeManager.SCHEMA_LITERAL).contains(
- "name:string,timestamp:bigint"));
+ examineSchema(registrationUnit);
}
/**
@@ -124,8 +121,7 @@ public class HiveOrcSerDeManagerTest {
manager.addSerDeProperties(this.testRegisterPath, registrationUnit);
-
Assert.assertTrue(registrationUnit.getSerDeProps().getProp(HiveOrcSerDeManager.SCHEMA_LITERAL).contains(
- "name:string,timestamp:bigint"));
+ examineSchema(registrationUnit);
Assert.assertEquals(registrationUnit.getSerDeType().get(),
OrcSerde.class.getName());
Assert.assertEquals(registrationUnit.getInputFormat().get(),
"customInputFormat");
Assert.assertEquals(registrationUnit.getOutputFormat().get(),
"customOutputFormat");
@@ -158,6 +154,15 @@ public class HiveOrcSerDeManagerTest {
manager.addSerDeProperties(this.testRegisterPath, registrationUnit);
}
+ public void examineSchema(HiveRegistrationUnit registrationUnit) {
+ List<String> columns =
Arrays.asList(registrationUnit.getSerDeProps().getProp(serdeConstants.LIST_COLUMNS).split(","));
+ Assert.assertTrue(columns.get(0).equals("name"));
+ Assert.assertTrue(columns.get(1).equals("timestamp"));
+ List<String> columnTypes =
Arrays.asList(registrationUnit.getSerDeProps().getProp(serdeConstants.LIST_COLUMN_TYPES).split(","));
+ Assert.assertTrue(columnTypes.get(0).equals("string"));
+ Assert.assertTrue(columnTypes.get(1).equals("bigint"));
+ }
+
@AfterClass
public void tearDown() throws IOException {
FileSystem fs = FileSystem.getLocal(new Configuration());