Repository: sqoop Updated Branches: refs/heads/trunk 0c8b10548 -> 1dd50cfb2
SQOOP-2561: Special Character removal from Column name as avro data results in duplicate column and fails the import (VISHNU S NAIR via Jarek Jarcec Cecho) Project: http://git-wip-us.apache.org/repos/asf/sqoop/repo Commit: http://git-wip-us.apache.org/repos/asf/sqoop/commit/1dd50cfb Tree: http://git-wip-us.apache.org/repos/asf/sqoop/tree/1dd50cfb Diff: http://git-wip-us.apache.org/repos/asf/sqoop/diff/1dd50cfb Branch: refs/heads/trunk Commit: 1dd50cfb2ae327b0df8393dd96d1adb86bb2f65f Parents: 0c8b105 Author: Jarek Jarcec Cecho <[email protected]> Authored: Tue Mar 29 09:26:47 2016 -0700 Committer: Jarek Jarcec Cecho <[email protected]> Committed: Tue Mar 29 09:26:47 2016 -0700 ---------------------------------------------------------------------- src/java/org/apache/sqoop/avro/AvroUtil.java | 2 +- src/test/com/cloudera/sqoop/TestAvroImport.java | 36 ++++++++++++++++++-- 2 files changed, 35 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/sqoop/blob/1dd50cfb/src/java/org/apache/sqoop/avro/AvroUtil.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/sqoop/avro/AvroUtil.java b/src/java/org/apache/sqoop/avro/AvroUtil.java index 90cc9d0..319be0f 100644 --- a/src/java/org/apache/sqoop/avro/AvroUtil.java +++ b/src/java/org/apache/sqoop/avro/AvroUtil.java @@ -114,7 +114,7 @@ public final class AvroUtil { * Format candidate to avro specifics */ public static String toAvroIdentifier(String candidate) { - String formattedCandidate = candidate.replaceAll("\\W+", ""); + String formattedCandidate = candidate.replaceAll("\\W+", "_"); if (formattedCandidate.substring(0,1).matches("[a-zA-Z_]")) { return formattedCandidate; } else { http://git-wip-us.apache.org/repos/asf/sqoop/blob/1dd50cfb/src/test/com/cloudera/sqoop/TestAvroImport.java ---------------------------------------------------------------------- diff --git a/src/test/com/cloudera/sqoop/TestAvroImport.java b/src/test/com/cloudera/sqoop/TestAvroImport.java index 00d7a95..b611627 100644 --- a/src/test/com/cloudera/sqoop/TestAvroImport.java +++ b/src/test/com/cloudera/sqoop/TestAvroImport.java @@ -85,20 +85,24 @@ public class TestAvroImport extends ImportJobTestCase { } public void testAvroImport() throws IOException { + this.setCurTableName("Avro_Import_Test"); avroImportTestHelper(null, null); } public void testDeflateCompressedAvroImport() throws IOException { + this.setCurTableName("Deflate_Compressed_Avro_Import_Test_1"); avroImportTestHelper(new String[] {"--compression-codec", "org.apache.hadoop.io.compress.DefaultCodec", }, "deflate"); } public void testDefaultCompressedAvroImport() throws IOException { + this.setCurTableName("Deflate_Compressed_Avro_Import_Test_2"); avroImportTestHelper(new String[] {"--compress", }, "deflate"); } public void testUnsupportedCodec() throws IOException { try { + this.setCurTableName("Deflate_Compressed_Avro_Import_Test_3"); avroImportTestHelper(new String[] {"--compression-codec", "foobar", }, null); fail("Expected IOException"); @@ -212,6 +216,7 @@ public class TestAvroImport extends ImportJobTestCase { String [] names = { "avro\uC3A11" }; String [] types = { "INT" }; String [] vals = { "1987" }; + this.setCurTableName("Non_Std_Character_Test"); createTableWithColTypesAndNames(names, types, vals); runImport(getOutputArgv(true, null)); @@ -223,10 +228,10 @@ public class TestAvroImport extends ImportJobTestCase { List<Field> fields = schema.getFields(); assertEquals(types.length, fields.size()); - checkField(fields.get(0), "AVRO1", Type.INT); + checkField(fields.get(0), "AVRO_1", Type.INT); GenericRecord record1 = reader.next(); - assertEquals("AVRO1", 1987, record1.get("AVRO1")); + assertEquals("AVRO_1", 1987, record1.get("AVRO_1")); } public void testNonIdentCharactersInColumnName() throws IOException { @@ -250,6 +255,33 @@ public class TestAvroImport extends ImportJobTestCase { assertEquals("TEST_A_V_R_O", 2015, record1.get("TEST_A_V_R_O")); } + /* + * Test Case For checking multiple columns having non standard characters in multiple columns + */ + public void testNonstandardCharactersInMultipleColumns() throws IOException { + String[] names = { "id$1", "id1$" }; + String[] types = { "INT", "INT" }; + String[] vals = { "1987", "1988" }; + this.setCurTableName("Non_Std_Character_Test_For_Multiple_Columns"); + createTableWithColTypesAndNames(names, types, vals); + + runImport(getOutputArgv(true, null)); + + Path outputFile = new Path(getTablePath(), "part-m-00000.avro"); + DataFileReader<GenericRecord> reader = read(outputFile); + Schema schema = reader.getSchema(); + assertEquals(Schema.Type.RECORD, schema.getType()); + List<Field> fields = schema.getFields(); + assertEquals(types.length, fields.size()); + + checkField(fields.get(0), "ID_1", Type.INT); + + GenericRecord record1 = reader.next(); + assertEquals("ID_1", 1987, record1.get("ID_1")); + checkField(fields.get(1), "ID1_", Type.INT); + assertEquals("ID1_", 1988, record1.get("ID1_")); + } + protected void checkField(Field field, String name, Type type) { assertEquals(name, field.name()); assertEquals(Schema.Type.UNION, field.schema().getType());
