Repository: sqoop Updated Branches: refs/heads/trunk 44e05df21 -> 5771a2da5
SQOOP-3074: Fix Avro import not to fail with Javac errors in case of non UTF-8 locale (Attila Szabo) Project: http://git-wip-us.apache.org/repos/asf/sqoop/repo Commit: http://git-wip-us.apache.org/repos/asf/sqoop/commit/5771a2da Tree: http://git-wip-us.apache.org/repos/asf/sqoop/tree/5771a2da Diff: http://git-wip-us.apache.org/repos/asf/sqoop/diff/5771a2da Branch: refs/heads/trunk Commit: 5771a2da5fc071ca8f80f222e8468a29419e845e Parents: 44e05df Author: Attila Szabo <[email protected]> Authored: Thu Dec 8 23:07:31 2016 +0100 Committer: Attila Szabo <[email protected]> Committed: Thu Dec 8 23:07:31 2016 +0100 ---------------------------------------------------------------------- src/java/org/apache/sqoop/avro/AvroUtil.java | 6 +++++- src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java | 5 ++++- src/java/org/apache/sqoop/orm/ClassWriter.java | 11 ++++++++++- 3 files changed, 19 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/sqoop/blob/5771a2da/src/java/org/apache/sqoop/avro/AvroUtil.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/sqoop/avro/AvroUtil.java b/src/java/org/apache/sqoop/avro/AvroUtil.java index ee29f14..8d90130 100644 --- a/src/java/org/apache/sqoop/avro/AvroUtil.java +++ b/src/java/org/apache/sqoop/avro/AvroUtil.java @@ -28,6 +28,7 @@ import org.apache.avro.generic.GenericFixed; import org.apache.avro.generic.GenericRecord; import org.apache.avro.io.DatumReader; import org.apache.avro.mapred.FsInput; +import org.apache.commons.lang.StringEscapeUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -106,7 +107,10 @@ public final class AvroUtil { * Convert Column name into Avro column name. 
*/ public static String toAvroColumn(String column) { - String candidate = ClassWriter.toJavaIdentifier(column); + // We're unescaping identifiers to get the real Unicode characters + // back, and not the escaped versions. + String candidate = StringEscapeUtils.unescapeJava( + ClassWriter.toJavaIdentifier(column)); return toAvroIdentifier(candidate); } http://git-wip-us.apache.org/repos/asf/sqoop/blob/5771a2da/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java b/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java index 3c31c43..5b1c745 100644 --- a/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java +++ b/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java @@ -29,6 +29,7 @@ import org.apache.avro.LogicalType; import org.apache.avro.Schema; import org.apache.avro.Schema.Field; import org.apache.avro.Schema.Type; +import org.apache.commons.lang.StringEscapeUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -88,7 +89,9 @@ public class AvroSchemaGenerator { List<Field> fields = new ArrayList<Field>(); for (String columnName : columnNames) { - String cleanedCol = AvroUtil.toAvroIdentifier(ClassWriter.toJavaIdentifier(columnName)); + // We're unescaping identifiers to get the real Unicode characters + // back, and not the escaped versions. 
+ String cleanedCol = AvroUtil.toAvroIdentifier(StringEscapeUtils.unescapeJava(ClassWriter.toJavaIdentifier(columnName))); List<Integer> columnInfoList = columnInfo.get(columnName); int sqlType = columnInfoList.get(0); Integer precision = columnInfoList.get(1); http://git-wip-us.apache.org/repos/asf/sqoop/blob/5771a2da/src/java/org/apache/sqoop/orm/ClassWriter.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/sqoop/orm/ClassWriter.java b/src/java/org/apache/sqoop/orm/ClassWriter.java index 6f6e66b..0c8d86d 100644 --- a/src/java/org/apache/sqoop/orm/ClassWriter.java +++ b/src/java/org/apache/sqoop/orm/ClassWriter.java @@ -284,7 +284,16 @@ public class ClassWriter { return "_" + output; } - return output; + // Calling StringEscapeUtils#escapeJava is required because we'd like to + // support Unicode characters in identifiers even if the locale of the host + // system is not supporting UTF-8, or by any reason the locale is different + // from that. Good example: if a column name would contain a \uC3A1 char + // in its name, though the locale would not support Unicode characters + // then the generated java file would contain unrecognizable characters + // for the compiler, and javac would fail with a compile error. If the name + // of the column would be Alm\uC3A1a then it would be Alm\uC3A1a after the + // escaping, and this holds in every place where it's used. + return StringEscapeUtils.escapeJava(output); } private String toJavaType(String columnName, int sqlType) {
