fhueske commented on a change in pull request #6710: [FLINK-10134] UTF-16 
support for TextInputFormat bug fixed
URL: https://github.com/apache/flink/pull/6710#discussion_r221892225
 
 

 ##########
 File path: 
flink-core/src/main/java/org/apache/flink/api/common/io/FileInputFormat.java
 ##########
 @@ -862,35 +865,87 @@ public void close() throws IOException {
                        stream = null;
                }
        }
-       
+
        /**
         * Override this method to supports multiple paths.
         * When this method will be removed, all FileInputFormats have to 
support multiple paths.
         *
         * @return True if the FileInputFormat supports multiple paths, false 
otherwise.
-        *
         * @deprecated Will be removed for Flink 2.0.
         */
        @Deprecated
        public boolean supportsMultiPaths() {
                return false;
        }
 
+       @Override
        public String toString() {
                return getFilePaths() == null || getFilePaths().length == 0 ?
                        "File Input (unknown file)" :
-                       "File Input (" +  Arrays.toString(this.getFilePaths()) 
+ ')';
+                       "File Input (" + Arrays.toString(this.getFilePaths()) + 
')';
+       }
+
+       /**
+        * Get file bom encoding
+        *
+        * @param fs
+        * @return
+        */
+       public String getBomCharset(FileStatus fs) {
+               FSDataInputStream inStream = null;
+               String charset, testFileSystem = "TestFileSystem";
+               byte[] bom = new byte[4];
+               byte[] bytes = new byte[]{(byte) 0x00, (byte) 0xFE, (byte) 
0xFF, (byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
+               try {
+                       /*
+                        * int read() Reads a data byte from this input stream. 
Int read(byte[] b) will be at most b.length from this input stream
+                        * Bytes of data are read into a byte array. Int 
read(byte[] b, int off, int len)
+                        * Reads up to len bytes of data from this input stream 
into a byte array.
+                        */
+                       FileSystem fileSystem = fs.getPath().getFileSystem();
+                       if 
(testFileSystem.equals(fileSystem.getClass().getSimpleName())) {
+                               fileSystem = new LocalFileSystem();
+                       }
+
+                       inStream = fileSystem.open(fs.getPath());
+                       inStream.read(bom, 0, bom.length);
+
+                       if ((bom[0] == bytes[0]) && (bom[1] == bytes[0]) && 
(bom[2] == bytes[1]) && (bom[3] == bytes[2])) {
+                               charset = "UTF-32BE";
+                       } else if ((bom[0] == bytes[2]) && (bom[1] == bytes[1]) 
&& (bom[2] == bytes[0]) && (bom[3] == bytes[0])) {
+                               charset = "UTF-32LE";
+                       } else if ((bom[0] == bytes[3]) && (bom[1] == bytes[4]) 
&& (bom[2] == bytes[5])) {
+                               charset = "UTF-8";
+                       } else if ((bom[0] == bytes[1]) && (bom[1] == 
bytes[2])) {
+                               charset = "UTF-16BE";
+                       } else if ((bom[0] == bytes[2]) && (bom[1] == 
bytes[1])) {
+                               charset = "UTF-16LE";
+                       } else {
+                               charset = null;
+                       }
+               } catch (Exception e) {
+                       throw new IllegalArgumentException("Failed to get file 
bom encoding.");
 
 Review comment:
   We should not fail if there is an empty file in a directory.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

Reply via email to