This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new a43d3a2 TIKA-3282 -- small fixes to legacy utf16le extraction in dumpstrings
a43d3a2 is described below
commit a43d3a26ee1d64a4740f69d13573eb4842f644c8
Author: tallison <[email protected]>
AuthorDate: Wed Feb 3 14:19:46 2021 -0500
TIKA-3282 -- small fixes to legacy utf16le extraction in dumpstrings
---
.../parser/microsoft/onenote/OneNoteLegacyDumpStrings.java | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteLegacyDumpStrings.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteLegacyDumpStrings.java
index bdcff02..1dbd4ba 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteLegacyDumpStrings.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteLegacyDumpStrings.java
@@ -116,13 +116,13 @@ class OneNoteLegacyDumpStrings {
}
ByteBuffer byteBuffer =
ByteBuffer.allocate((int)nextBufferSize);
oneNoteDirectFileResource.read(byteBuffer);
-
- for (long i = 0; i < nextBufferSize - 1; ++i) {
- int c1 = byteBuffer.get((int)i);
+ for (long i = 0; i < nextBufferSize - 1; i++) {
+ int c1 = byteBuffer.get((int)i) & 0xff;
int c2 = byteBuffer.get((int)i+1);
- if (c1 == 0x00 && c2 >= 0x20 && c2 < 0x7F) {
+
+ if (c2 == 0x00 && c1 >= 0x20) {// add this back? && c1 < 0x7F) {
++i;
- os.write(c2);
+ os.write(c1);
} else {
if (os.size() >= MIN_STRING_LENGTH) {
writeIfUseful(os);
@@ -144,7 +144,7 @@ class OneNoteLegacyDumpStrings {
* @param os Byte array output stream containing the buffer.
*/
private void writeIfUseful(ByteArrayOutputStream os) throws SAXException {
- String str = new String(os.toByteArray(), StandardCharsets.US_ASCII);
+ String str = new String(os.toByteArray(), StandardCharsets.ISO_8859_1);
String [] spl = str.split(" ");
if (spl.length > 1) {
int numAlpha = 0;