[GitHub] fhueske commented on a change in pull request #6823: [FLINK-10134] UTF-16 support for TextInputFormat bug refixed

GitBox Tue, 16 Oct 2018 08:08:42 -0700

fhueske commented on a change in pull request #6823: [FLINK-10134] UTF-16 
support for TextInputFormat bug refixed
URL: https://github.com/apache/flink/pull/6823#discussion_r225581316


 ##########
 File path: 
flink-java/src/test/java/org/apache/flink/api/java/io/TextInputFormatTest.java
 ##########
 @@ -207,12 +207,212 @@ private void testRemovingTrailingCR(String lineBreaker, 
String delimiter) {
                                assertEquals(content, result);
                        }
 
+               } catch (Throwable t) {
+                       System.err.println("test failed with exception: " + 
t.getMessage());
+                       t.printStackTrace(System.err);
+                       fail("Test erroneous");
                }
-               catch (Throwable t) {
+       }
+
+       /**
+        * Test different file encodings,for example: UTF-8, UTF-8 with bom, 
UTF-16LE, UTF-16BE, UTF-32LE, UTF-32BE.
+        */
+       @Test
+       public void testFileCharset() {
+               String first = "First line";
+
+               // Test special different languages
+               for (final String data : new String[]{"Hello", "ハロー", "привет", 
"Bonjour", "Сайн байна уу", "안녕하세요."}) {
+                       testAllFileCharsetNoDelimiter(data);
+               }
+
+               // Test special symbol
+               for (final String delimiterStr : new String[]{"\\", "^", "|", 
"[", ".", "*"}) {
+                       first = "Fir" + delimiterStr + "st li" + delimiterStr + 
"ne";
+                       testAllFileCharsetWithDelimiter(first, delimiterStr);
+               }
+       }
+
+       private void testAllFileCharsetNoDelimiter(String first) {
+               testAllFileCharsetWithDelimiter(first, "");
+       }
+
+       private void testAllFileCharsetWithDelimiter(String first, String 
delimiter) {
+               try {
+                       final byte[] noBom = new byte[]{};
+                       final byte[] utf8Bom = new byte[]{(byte) 0xEF, (byte) 
0xBB, (byte) 0xBF};
+                       final byte[] utf16LEBom = new byte[]{(byte) 0xFF, 
(byte) 0xFE};
+                       final byte[] utf16BEBom = new byte[]{(byte) 0xFE, 
(byte) 0xFF};
+                       final byte[] utf32LEBom = new byte[]{(byte) 0xFF, 
(byte) 0xFE, (byte) 0x00, (byte) 0x00};
+                       final byte[] utf32BEBom = new byte[]{(byte) 0x00, 
(byte) 0x00, (byte) 0xFE, (byte) 0xFF};
+
+                       // test UTF-8 have bom
+                       testFileCharset(first, "UTF-8", "UTF-32", 1, utf8Bom, 
delimiter.getBytes("UTF-8"));
+                       // test UTF-8 without bom
+                       testFileCharset(first, "UTF-8", "UTF-8", 0, noBom, 
delimiter.getBytes("UTF-8"));
+                       // test UTF-16LE without bom
+                       testFileCharset(first, "UTF-16LE", "UTF-16LE", 0, 
noBom, delimiter.getBytes("UTF-16LE"));
+                       // test UTF-16BE without bom
+                       testFileCharset(first, "UTF-16BE", "UTF-16BE", 0, 
noBom, delimiter.getBytes("UTF-16BE"));
+                       // test UTF-16LE have bom
+                       testFileCharset(first, "UTF-16LE", "UTF-16LE", 1, 
utf16LEBom, delimiter.getBytes("UTF-16LE"));
+                       // test UTF-16BE have bom
+                       testFileCharset(first, "UTF-16BE", "UTF-16", 1, 
utf16BEBom, delimiter.getBytes("UTF-16BE"));
+                       // test UTF-32LE without bom
+                       testFileCharset(first, "UTF-32LE", "UTF-32LE", 0, 
noBom, delimiter.getBytes("UTF-32LE"));
+                       // test UTF-32BE without bom
+                       testFileCharset(first, "UTF-32BE", "UTF-32BE", 0, 
noBom, delimiter.getBytes("UTF-32BE"));
+                       // test UTF-32LE have bom
+                       testFileCharset(first, "UTF-32LE", "UTF-32LE", 0, 
utf32LEBom, delimiter.getBytes("UTF-32LE"));
+                       // test UTF-32BE have bom
+                       testFileCharset(first, "UTF-32BE", "UTF-32", 0, 
utf32BEBom, delimiter.getBytes("UTF-32BE"));
+               } catch (Throwable t) {
                        System.err.println("test failed with exception: " + 
t.getMessage());
                        t.printStackTrace(System.err);
                        fail("Test erroneous");
                }
        }
 
+       /**
+        * Test different file encodings.
+        *
+        * @param data
+        * @param fileCharset   File itself encoding
+        * @param targetCharset User specified code
+        * @param offset        Return result offset
+        * @param bom           Bom content
+        * @param delimiter
+        */
+       private void testFileCharset(String data, String fileCharset, String 
targetCharset, int offset, byte[] bom, byte[] delimiter) {
+               BufferedWriter bw = null;
+               OutputStreamWriter osw = null;
+               FileOutputStream fos = null;
+               try {
+                       // create input file
+                       File tempFile = 
File.createTempFile("TextInputFormatTest", "tmp");
+                       tempFile.deleteOnExit();
+                       tempFile.setWritable(true);
+//                     System.out.println(tempFile.toString());
+                       fos = new FileOutputStream(tempFile, true);
+
+                       // write UTF8 BOM mark if file is empty
+                       if (tempFile.length() < 1) {
+                               if (bom.length > 0) {
+                                       fos.write(bom);
+                               }
+                       }
+
+                       osw = new OutputStreamWriter(fos, fileCharset);
+                       bw = new BufferedWriter(osw);
+                       bw.write(data);
+                       bw.newLine();
+
+                       bw.close();
+                       fos.close();
+
+                       TextInputFormat inputFormat = new TextInputFormat(new 
Path(tempFile.toURI().toString()));
+                       inputFormat.setCharsetName(targetCharset);
+//                     inputFormat.setCharset(targetCharset);
+
+                       if (delimiter.length > 0) {
+                               inputFormat.setDelimiter(delimiter);
+                       }
+
+                       Configuration parameters = new Configuration();
+                       inputFormat.configure(parameters);
+
+                       FileInputSplit[] splits = 
inputFormat.createInputSplits(1);
+                       assertTrue("expected at least one input split", 
splits.length >= 1);
+                       inputFormat.open(splits[0]);
+
+                       String result = "";
+
+                       if (delimiter.length <= 0) {
+//                             System.out.println("bomCharsetName:" + 
inputFormat.getCharset());
+
+                               assertFalse(inputFormat.reachedEnd());
+                               result = inputFormat.nextRecord("");
+//                             System.out.println(result);
+                               assertNotNull("Expecting first record here", 
result);
+                               assertEquals(data, result.substring(offset));
+
+//                             assertTrue(inputFormat.reachedEnd() || null == 
inputFormat.nextRecord(result));
+                       } else {
+//                             System.out.println("bomCharsetName:" + 
inputFormat.getCharset());
+                               int i = 0;
+                               data = data + 
java.security.AccessController.doPrivileged(
+                                       new 
sun.security.action.GetPropertyAction("line.separator"));
+                               String delimiterStr = new String(delimiter, 0, 
delimiter.length, fileCharset);
+                               String[] strArr = data.split(delimiterStr
 
 Review comment:
   Use only the actual delimiter here, instead of all of the delimiters that 
you are calling this function with.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

[GitHub] fhueske commented on a change in pull request #6823: [FLINK-10134] UTF-16 support for TextInputFormat bug refixed

Reply via email to