This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new a43d3a2  TIKA-3282 -- small fixes to legacy utf16le extraction in dumpstrings
a43d3a2 is described below

commit a43d3a26ee1d64a4740f69d13573eb4842f644c8
Author: tallison <[email protected]>
AuthorDate: Wed Feb 3 14:19:46 2021 -0500

    TIKA-3282 -- small fixes to legacy utf16le extraction in dumpstrings
---
 .../parser/microsoft/onenote/OneNoteLegacyDumpStrings.java   | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteLegacyDumpStrings.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteLegacyDumpStrings.java
index bdcff02..1dbd4ba 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteLegacyDumpStrings.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteLegacyDumpStrings.java
@@ -116,13 +116,13 @@ class OneNoteLegacyDumpStrings {
                 }
                 ByteBuffer byteBuffer = 
ByteBuffer.allocate((int)nextBufferSize);
                 oneNoteDirectFileResource.read(byteBuffer);
-
-                for (long i = 0; i < nextBufferSize - 1; ++i) {
-                    int c1 = byteBuffer.get((int)i);
+                for (long i = 0; i < nextBufferSize - 1; i++) {
+                    int c1 = byteBuffer.get((int)i) & 0xff;
                     int c2 = byteBuffer.get((int)i+1);
-                    if (c1 == 0x00 && c2 >= 0x20 && c2 < 0x7F) {
+
+                    if (c2 == 0x00 && c1 >= 0x20) {// add this back? && c1 < 
0x7F) {
                         ++i;
-                        os.write(c2);
+                        os.write(c1);
                     } else {
                         if (os.size() >= MIN_STRING_LENGTH) {
                             writeIfUseful(os);
@@ -144,7 +144,7 @@ class OneNoteLegacyDumpStrings {
      * @param os Byte array output stream containing the buffer.
      */
     private void writeIfUseful(ByteArrayOutputStream os) throws SAXException {
-        String str = new String(os.toByteArray(), StandardCharsets.US_ASCII);
+        String str = new String(os.toByteArray(), StandardCharsets.ISO_8859_1);
         String [] spl = str.split(" ");
         if (spl.length > 1) {
             int numAlpha = 0;

Reply via email to