Repository: sqoop Updated Branches: refs/heads/trunk 5771a2da5 -> be30a344e
SQOOP-3075: Simplify Unicode character support in source files (introduced by SQOOP-3074) by defining explicit locales instead of using EscapeUtils (Attila Szabo) Project: http://git-wip-us.apache.org/repos/asf/sqoop/repo Commit: http://git-wip-us.apache.org/repos/asf/sqoop/commit/be30a344 Tree: http://git-wip-us.apache.org/repos/asf/sqoop/tree/be30a344 Diff: http://git-wip-us.apache.org/repos/asf/sqoop/diff/be30a344 Branch: refs/heads/trunk Commit: be30a344ee28ae60fcce9e9e45a0ec73c93209a7 Parents: 5771a2d Author: Attila Szabo <[email protected]> Authored: Fri Dec 16 11:48:52 2016 +0100 Committer: Attila Szabo <[email protected]> Committed: Fri Dec 16 11:48:52 2016 +0100 ---------------------------------------------------------------------- src/java/org/apache/sqoop/avro/AvroUtil.java | 6 +----- .../org/apache/sqoop/orm/AvroSchemaGenerator.java | 5 +---- src/java/org/apache/sqoop/orm/ClassWriter.java | 14 +++----------- src/java/org/apache/sqoop/orm/CompilationManager.java | 4 ++++ 4 files changed, 9 insertions(+), 20 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/sqoop/blob/be30a344/src/java/org/apache/sqoop/avro/AvroUtil.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/sqoop/avro/AvroUtil.java b/src/java/org/apache/sqoop/avro/AvroUtil.java index 8d90130..ee29f14 100644 --- a/src/java/org/apache/sqoop/avro/AvroUtil.java +++ b/src/java/org/apache/sqoop/avro/AvroUtil.java @@ -28,7 +28,6 @@ import org.apache.avro.generic.GenericFixed; import org.apache.avro.generic.GenericRecord; import org.apache.avro.io.DatumReader; import org.apache.avro.mapred.FsInput; -import org.apache.commons.lang.StringEscapeUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -107,10 +106,7 @@ public final class AvroUtil { * Convert Column name into Avro column name. */ public static String toAvroColumn(String column) { - // We're unescaping identifiers to get the real Unicode characters - // back, and not the escaped versions. - String candidate = StringEscapeUtils.unescapeJava( - ClassWriter.toJavaIdentifier(column)); + String candidate = ClassWriter.toJavaIdentifier(column); return toAvroIdentifier(candidate); } http://git-wip-us.apache.org/repos/asf/sqoop/blob/be30a344/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java b/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java index 5b1c745..3c31c43 100644 --- a/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java +++ b/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java @@ -29,7 +29,6 @@ import org.apache.avro.LogicalType; import org.apache.avro.Schema; import org.apache.avro.Schema.Field; import org.apache.avro.Schema.Type; -import org.apache.commons.lang.StringEscapeUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -89,9 +88,7 @@ public class AvroSchemaGenerator { List<Field> fields = new ArrayList<Field>(); for (String columnName : columnNames) { - // We're unescaping identifiers to get the real Unicode characters - // back, and not the escaped versions. - String cleanedCol = AvroUtil.toAvroIdentifier(StringEscapeUtils.unescapeJava(ClassWriter.toJavaIdentifier(columnName))); + String cleanedCol = AvroUtil.toAvroIdentifier(ClassWriter.toJavaIdentifier(columnName)); List<Integer> columnInfoList = columnInfo.get(columnName); int sqlType = columnInfoList.get(0); Integer precision = columnInfoList.get(1); http://git-wip-us.apache.org/repos/asf/sqoop/blob/be30a344/src/java/org/apache/sqoop/orm/ClassWriter.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/sqoop/orm/ClassWriter.java b/src/java/org/apache/sqoop/orm/ClassWriter.java index 0c8d86d..c18a36f 100644 --- a/src/java/org/apache/sqoop/orm/ClassWriter.java +++ b/src/java/org/apache/sqoop/orm/ClassWriter.java @@ -24,6 +24,7 @@ import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Writer; +import java.nio.charset.StandardCharsets; import java.util.Date; import java.util.HashSet; import java.util.List; @@ -284,16 +285,7 @@ public class ClassWriter { return "_" + output; } - // Calling StringEscapeUtils#escapeJava is required because we'd like to - // support Unicode characters in identifiers even if the locale of the host - // system is not supporting UTF-8, or by any reason the locale is different - // from that. Good example: if a column name would contain a \uC3A1 char - // in it's name, though the locale would not support Unicode characters - // then the generated java file would contain unrecognizable characters - // for the compiler, and javac would fail with a compile error. If the name - // of the column would be Alm\uC3A1a then it would be Alm\uC3A1a after the - // escaping, and this every places where it's used/ - return StringEscapeUtils.escapeJava(output); + return output; } private String toJavaType(String columnName, int sqlType) { @@ -1796,7 +1788,7 @@ public class ClassWriter { Writer writer = null; try { ostream = new FileOutputStream(filename); - writer = new OutputStreamWriter(ostream); + writer = new OutputStreamWriter(ostream, StandardCharsets.UTF_8); writer.append(sb.toString()); } finally { if (null != writer) { http://git-wip-us.apache.org/repos/asf/sqoop/blob/be30a344/src/java/org/apache/sqoop/orm/CompilationManager.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/sqoop/orm/CompilationManager.java b/src/java/org/apache/sqoop/orm/CompilationManager.java index 0a2a87f..c1a656b 100644 --- a/src/java/org/apache/sqoop/orm/CompilationManager.java +++ b/src/java/org/apache/sqoop/orm/CompilationManager.java @@ -23,6 +23,7 @@ import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Iterator; import java.util.List; @@ -170,6 +171,9 @@ public class CompilationManager { String curClasspath = System.getProperty("java.class.path"); LOG.debug("Current sqoop classpath = " + curClasspath); + args.add("-encoding"); + args.add(StandardCharsets.UTF_8.toString()); + args.add("-sourcepath"); args.add(jarOutDir);
