Updated Branches: refs/heads/sqoop2 ae23cb26d -> adef39bbb
SQOOP-693: Intermediate data format support for export (Bilung Lee via Jarek Jarcec Cecho) Project: http://git-wip-us.apache.org/repos/asf/sqoop/repo Commit: http://git-wip-us.apache.org/repos/asf/sqoop/commit/adef39bb Tree: http://git-wip-us.apache.org/repos/asf/sqoop/tree/adef39bb Diff: http://git-wip-us.apache.org/repos/asf/sqoop/diff/adef39bb Branch: refs/heads/sqoop2 Commit: adef39bbb58e70bdcaf028a183d62feaacb2e916 Parents: ae23cb2 Author: Jarek Jarcec Cecho <[email protected]> Authored: Thu Nov 15 16:03:17 2012 -0800 Committer: Jarek Jarcec Cecho <[email protected]> Committed: Thu Nov 15 16:03:17 2012 -0800 ---------------------------------------------------------------------- .../main/java/org/apache/sqoop/job/io/Data.java | 153 ++++++++++++++- .../java/org/apache/sqoop/job/io/TestData.java | 59 +++++- 2 files changed, 203 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/sqoop/blob/adef39bb/execution/mapreduce/src/main/java/org/apache/sqoop/job/io/Data.java ---------------------------------------------------------------------- diff --git a/execution/mapreduce/src/main/java/org/apache/sqoop/job/io/Data.java b/execution/mapreduce/src/main/java/org/apache/sqoop/job/io/Data.java index f6fff0b..41fceb8 100644 --- a/execution/mapreduce/src/main/java/org/apache/sqoop/job/io/Data.java +++ b/execution/mapreduce/src/main/java/org/apache/sqoop/job/io/Data.java @@ -57,10 +57,16 @@ public class Data implements WritableComparable<Data> { stringEscape, stringDelimiter }); + private int[] fieldTypes = null; + public void setFieldDelimiter(char fieldDelimiter) { this.fieldDelimiter = fieldDelimiter; } + public void setFieldTypes(int[] fieldTypes) { + this.fieldTypes = fieldTypes; + } + public void setContent(Object content, int type) { switch (type) { case EMPTY_DATA: @@ -356,7 +362,37 @@ public class Data implements WritableComparable<Data> { case CSV_RECORD: ArrayList<Object> list = new ArrayList<Object>(); - // todo: need to parse CSV into Array + char[] record = ((String)content).toCharArray(); + int start = 0; + int position = start; + boolean stringDelimited = false; + boolean arrayDelimited = false; + int index = 0; + while (position < record.length) { + if (record[position] == fieldDelimiter) { + if (!stringDelimited && !arrayDelimited) { + index = parseField(list, record, start, position, index); + start = position + 1; + } + } else if (record[position] == stringDelimiter) { + if (!stringDelimited) { + stringDelimited = true; + } + else if (position > 0 && record[position-1] != stringEscape) { + stringDelimited = false; + } + } else if (record[position] == '[') { + if (!stringDelimited) { + arrayDelimited = true; + } + } else if (record[position] == ']') { + if (!stringDelimited) { + arrayDelimited = false; + } + } + position++; + } + parseField(list, record, start, position, index); return list.toArray(); case ARRAY_RECORD: @@ -367,6 +403,114 @@ public class Data implements WritableComparable<Data> { } } + private int parseField(ArrayList<Object> list, char[] record, + int start, int end, int index) { + String field = String.valueOf(record, start, end-start).trim(); + + int fieldType; + if (fieldTypes == null) { + fieldType = guessType(field); + } else { + fieldType = fieldTypes[index]; + } + + switch (fieldType) { + case FieldTypes.UTF: + if (field.charAt(0) != stringDelimiter || + field.charAt(field.length()-1) != stringDelimiter) { + throw new SqoopException(MapreduceExecutionError.MAPRED_EXEC_0022); + } + list.add(index, unescape(field.substring(1, field.length()-1))); + break; + + case FieldTypes.BIN: + if (field.charAt(0) != '[' || + field.charAt(field.length()-1) != ']') { + throw new SqoopException(MapreduceExecutionError.MAPRED_EXEC_0022); + } + String[] splits = + field.substring(1, field.length()-1).split(String.valueOf(',')); + byte[] bytes = new byte[splits.length]; + for (int i=0; i<bytes.length; i++) { + bytes[i] = Byte.parseByte(splits[i].trim()); + } + list.add(index, bytes); + break; + + case FieldTypes.DOUBLE: + list.add(index, Double.parseDouble(field)); + break; + + case FieldTypes.FLOAT: + list.add(index, Float.parseFloat(field)); + break; + + case FieldTypes.LONG: + list.add(index, Long.parseLong(field)); + break; + + case FieldTypes.INT: + list.add(index, Integer.parseInt(field)); + break; + + case FieldTypes.SHORT: + list.add(index, Short.parseShort(field)); + break; + + case FieldTypes.CHAR: + list.add(index, Character.valueOf(field.charAt(0))); + break; + + case FieldTypes.BYTE: + list.add(index, Byte.parseByte(field)); + break; + + case FieldTypes.BOOLEAN: + list.add(index, Boolean.parseBoolean(field)); + break; + + case FieldTypes.NULL: + list.add(index, null); + break; + + default: + throw new SqoopException(MapreduceExecutionError.MAPRED_EXEC_0012, String.valueOf(fieldType)); + } + + return ++index; + } + + private int guessType(String field) { + char[] value = field.toCharArray(); + + if (value[0] == stringDelimiter) { + return FieldTypes.UTF; + } + + switch (value[0]) { + case 'n': + case 'N': + return FieldTypes.NULL; + case '[': + return FieldTypes.BIN; + case 't': + case 'f': + case 'T': + case 'F': + return FieldTypes.BOOLEAN; + } + + int position = 1; + while (position < value.length) { + switch (value[position++]) { + case '.': + return FieldTypes.DOUBLE; + } + } + + return FieldTypes.LONG; + } + private String escape(String string) { // TODO: Also need to escape those special characters as documented in: // https://cwiki.apache.org/confluence/display/SQOOP/Sqoop2+Intermediate+representation#Sqoop2Intermediaterepresentation-Intermediateformatrepresentationproposal @@ -375,4 +519,11 @@ public class Data implements WritableComparable<Data> { return string.replaceAll(regex, replacement); } + private String unescape(String string) { + // TODO: Also need to unescape those special characters as documented in: + // https://cwiki.apache.org/confluence/display/SQOOP/Sqoop2+Intermediate+representation#Sqoop2Intermediaterepresentation-Intermediateformatrepresentationproposal + String regex = Matcher.quoteReplacement(escapedStringDelimiter); + String replacement = String.valueOf(stringDelimiter); + return string.replaceAll(regex, replacement); + } } http://git-wip-us.apache.org/repos/asf/sqoop/blob/adef39bb/execution/mapreduce/src/test/java/org/apache/sqoop/job/io/TestData.java ---------------------------------------------------------------------- diff --git a/execution/mapreduce/src/test/java/org/apache/sqoop/job/io/TestData.java b/execution/mapreduce/src/test/java/org/apache/sqoop/job/io/TestData.java index ea7ac70..91df426 100644 --- a/execution/mapreduce/src/test/java/org/apache/sqoop/job/io/TestData.java +++ b/execution/mapreduce/src/test/java/org/apache/sqoop/job/io/TestData.java @@ -34,13 +34,13 @@ public class TestData extends TestCase { // with special characters: expected = - (long) TEST_NUMBER + "," + - TEST_NUMBER + "," + + Long.valueOf((long)TEST_NUMBER) + "," + + Double.valueOf(TEST_NUMBER) + "," + "'" + String.valueOf(TEST_NUMBER) + "\\',s'" + "," + Arrays.toString(new byte[] {1, 2, 3, 4, 5}); data.setContent(new Object[] { - (long) TEST_NUMBER, - TEST_NUMBER, + Long.valueOf((long)TEST_NUMBER), + Double.valueOf(TEST_NUMBER), String.valueOf(TEST_NUMBER) + "',s", new byte[] {1, 2, 3, 4, 5} }, Data.ARRAY_RECORD); @@ -49,13 +49,13 @@ public class TestData extends TestCase { // with null characters: expected = - (long) TEST_NUMBER + "," + - TEST_NUMBER + "," + + Long.valueOf((long)TEST_NUMBER) + "," + + Double.valueOf(TEST_NUMBER) + "," + "null" + "," + Arrays.toString(new byte[] {1, 2, 3, 4, 5}); data.setContent(new Object[] { - (long) TEST_NUMBER, - TEST_NUMBER, + Long.valueOf((long)TEST_NUMBER), + Double.valueOf(TEST_NUMBER), null, new byte[] {1, 2, 3, 4, 5} }, Data.ARRAY_RECORD); @@ -63,6 +63,49 @@ public class TestData extends TestCase { assertEquals(expected, actual); } + @Test + public void testCsvToArray() throws Exception { + Data data = new Data(); + Object[] expected; + Object[] actual; + + // with special characters: + expected = new Object[] { + Long.valueOf((long)TEST_NUMBER), + Double.valueOf(TEST_NUMBER), + String.valueOf(TEST_NUMBER) + "',s", + new byte[] {1, 2, 3, 4, 5} }; + data.setContent( + Long.valueOf((long)TEST_NUMBER) + "," + + Double.valueOf(TEST_NUMBER) + "," + + "'" + String.valueOf(TEST_NUMBER) + "\\',s'" + "," + + Arrays.toString(new byte[] {1, 2, 3, 4, 5}), + Data.CSV_RECORD); + actual = (Object[])data.getContent(Data.ARRAY_RECORD); + assertEquals(expected.length, actual.length); + for (int c=0; c<expected.length; c++) { + assertEquals(expected[c], actual[c]); + } + + // with null characters: + expected = new Object[] { + Long.valueOf((long)TEST_NUMBER), + Double.valueOf(TEST_NUMBER), + null, + new byte[] {1, 2, 3, 4, 5} }; + data.setContent( + Long.valueOf((long)TEST_NUMBER) + "," + + Double.valueOf(TEST_NUMBER) + "," + + "null" + "," + + Arrays.toString(new byte[] {1, 2, 3, 4, 5}), + Data.CSV_RECORD); + actual = (Object[])data.getContent(Data.ARRAY_RECORD); + assertEquals(expected.length, actual.length); + for (int c=0; c<expected.length; c++) { + assertEquals(expected[c], actual[c]); + } + } + public static void assertEquals(Object expected, Object actual) { if (expected instanceof byte[]) { assertEquals(Arrays.toString((byte[])expected),
