This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 4a65796943 TIKA-4710-rtf-attachments-in-html-decapsulation (#2744)
4a65796943 is described below
commit 4a657969439a4fb382a95fde98d461f991875e52
Author: Tim Allison <[email protected]>
AuthorDate: Mon Apr 6 20:14:34 2026 -0400
TIKA-4710-rtf-attachments-in-html-decapsulation (#2744)
---
.../tika-parser-microsoft-module/pom.xml | 24 +
.../tika/parser/microsoft/OfficeParserConfig.java | 20 +
.../tika/parser/microsoft/OutlookExtractor.java | 9 +-
.../parser/microsoft/rtf/jflex/RTFCharsetMaps.java | 180 ++++++++
.../microsoft/rtf/jflex/RTFEmbeddedHandler.java | 254 ++++++++++
.../parser/microsoft/rtf/jflex/RTFGroupState.java | 76 +++
.../microsoft/rtf/jflex/RTFHtmlDecapsulator.java | 239 ++++++++++
.../rtf/jflex/RTFObjDataStreamParser.java | 510 +++++++++++++++++++++
.../microsoft/rtf/jflex/RTFPictStreamParser.java | 101 ++++
.../tika/parser/microsoft/rtf/jflex/RTFState.java | 342 ++++++++++++++
.../tika/parser/microsoft/rtf/jflex/RTFToken.java | 116 +++++
.../parser/microsoft/rtf/jflex/RTFTokenType.java | 30 ++
.../parser/microsoft/rtf/jflex/RTFTokenizer.jflex | 153 +++++++
.../rtf/jflex/RTFEmbeddedHandlerTest.java | 132 ++++++
.../rtf/jflex/RTFHtmlDecapsulatorTest.java | 260 +++++++++++
.../parser/microsoft/rtf/jflex/RTFStateTest.java | 250 ++++++++++
.../microsoft/rtf/jflex/RTFTokenizerTest.java | 191 ++++++++
17 files changed, 2884 insertions(+), 3 deletions(-)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
index 63cc9605cd..a3c7e1f1c8 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
@@ -125,6 +125,30 @@
</dependencies>
<build>
<plugins>
+ <!-- Runs the JFlex "generate" goal over the lexer definitions under src/main/jflex. -->
+ <!-- NOTE(review): presumably this is what produces RTFTokenizer.java from RTFTokenizer.jflex; confirm. -->
+ <plugin>
+ <groupId>de.jflex</groupId>
+ <artifactId>jflex-maven-plugin</artifactId>
+ <version>1.9.1</version>
+ <executions>
+ <execution>
+ <goals>
+ <goal>generate</goal>
+ </goals>
+ <configuration>
+ <lexDefinitions>
+ <lexDefinition>src/main/jflex</lexDefinition>
+ </lexDefinitions>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <!-- Excludes RTFTokenizer.java from Checkstyle. -->
+ <!-- NOTE(review): presumably because that file is generated by JFlex rather than hand-written; confirm. -->
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-checkstyle-plugin</artifactId>
+ <configuration>
+ <excludes>**/rtf/jflex/RTFTokenizer.java</excludes>
+ </configuration>
+ </plugin>
<plugin>
<groupId>org.apache.rat</groupId>
<artifactId>apache-rat-plugin</artifactId>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index 9f21b0b798..363b0a0773 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -39,6 +39,13 @@ public class OfficeParserConfig implements Serializable {
private boolean writeSelectHeadersInBody = false;
+ /**
+ * Maximum size, in kilobytes (not bytes), of each embedded object/pict
+ * extracted from RTF within MSG files. Since embedded data is streamed to
+ * disk (not held in memory), the default is 2 GB, i.e. 2 * 1024 * 1024 KB.
+ * Set to -1 for unlimited.
+ */
+ private int rtfEmbeddedMaxBytesInKb = 2 * 1024 * 1024; // 2 GB expressed in KB
+
private boolean includeGlossary = true;
private String dateOverrideFormat = null;
private int maxOverride = 0;//ignore
@@ -319,4 +326,17 @@ public class OfficeParserConfig implements Serializable {
public void setWriteSelectHeadersInBody(boolean writeSelectHeadersInBody) {
this.writeSelectHeadersInBody = writeSelectHeadersInBody;
}
+
+ /**
+ * Returns the maximum size, in kilobytes, allowed per embedded object/pict
+ * when extracting from RTF within MSG files. Data is streamed to disk, so
+ * the default is 2 GB (2 * 1024 * 1024 KB). Set to -1 for unlimited.
+ *
+ * @return the per-object limit in kilobytes
+ */
+ public int getRtfEmbeddedMaxBytesInKb() {
+ return rtfEmbeddedMaxBytesInKb;
+ }
+
+ /**
+ * Sets the maximum size, in kilobytes, allowed per embedded object/pict
+ * when extracting from RTF within MSG files.
+ *
+ * @param rtfEmbeddedMaxBytesInKb the per-object limit in kilobytes; the
+ * getter's documentation gives -1 as the value for unlimited
+ */
+ public void setRtfEmbeddedMaxBytesInKb(int rtfEmbeddedMaxBytesInKb) {
+ this.rtfEmbeddedMaxBytesInKb = rtfEmbeddedMaxBytesInKb;
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index a2ef6de04f..01b357bddf 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -84,8 +84,8 @@ import org.apache.tika.parser.html.HtmlEncodingDetector;
import org.apache.tika.parser.html.JSoupParser;
import org.apache.tika.parser.mailcommons.MailDateParser;
import org.apache.tika.parser.microsoft.msg.ExtendedMetadataExtractor;
-import org.apache.tika.parser.microsoft.msg.RTFEncapsulatedHTMLExtractor;
import org.apache.tika.parser.microsoft.rtf.RTFParser;
+import org.apache.tika.parser.microsoft.rtf.jflex.RTFHtmlDecapsulator;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.apache.tika.sax.BodyContentHandler;
@@ -600,8 +600,11 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED,
Types.BINARY.getId(),
chunk.getValue());
byte[] rtfData = rtf.getData();
- // Try to extract encapsulated HTML — returns null if not
present
- String html = RTFEncapsulatedHTMLExtractor.extract(rtfData);
+ // Try to extract encapsulated HTML + embedded objects in one
pass
+ RTFHtmlDecapsulator decapsulator =
+ new RTFHtmlDecapsulator(xhtml, parseContext,
+
officeParserConfig.getRtfEmbeddedMaxBytesInKb());
+ String html = decapsulator.extract(rtfData);
if (html != null) {
parseHtmlString(html, xhtml, contentIdNames);
parentMetadata.add(MAPI.BODY_TYPES_PROCESSED,
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFCharsetMaps.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFCharsetMaps.java
new file mode 100644
index 0000000000..aaac2552ac
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFCharsetMaps.java
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.tika.utils.CharsetUtils;
+
+/**
+ * Shared charset maps for RTF parsing. Maps RTF {@code \fcharsetN} and
+ * {@code \ansicpgN} values to Java {@link Charset} instances.
+ *
+ * <p>Extracted from the original {@code TextExtractor} so both the JFlex-based
+ * parser and decapsulator can reuse them.</p>
+ */
+public final class RTFCharsetMaps {
+
+ public static final Charset WINDOWS_1252 = Charset.forName("windows-1252");
+
+ /**
+ * Maps {@code \fcharsetN} values to Java charsets.
+ * The RTF font table uses these to declare per-font character encodings.
+ */
+ public static final Map<Integer, Charset> FCHARSET_MAP;
+
+ /**
+ * Maps {@code \ansicpgN} values to Java charsets.
+ * This is the global ANSI code page declared in the RTF header.
+ */
+ public static final Map<Integer, Charset> ANSICPG_MAP;
+
+ // Builds FCHARSET_MAP. Every name except the code-0 entry is resolved through
+ // getCharset, which substitutes US-ASCII when the lookup throws
+ // IllegalArgumentException, so each listed code always gets a non-null value.
+ // The map is published as an unmodifiable view.
+ static {
+ Map<Integer, Charset> fcharset = new HashMap<>();
+
+ fcharset.put(0, WINDOWS_1252); // ANSI
+ // charset 1 = Default, charset 2 = Symbol (neither has an entry in this map)
+
+ fcharset.put(77, getCharset("MacRoman")); // Mac Roman
+ fcharset.put(78, getCharset("Shift_JIS")); // Mac Shift Jis
+ fcharset.put(79, getCharset("ms949")); // Mac Hangul
+ fcharset.put(80, getCharset("GB2312")); // Mac GB2312
+ fcharset.put(81, getCharset("Big5")); // Mac Big5
+ fcharset.put(82, getCharset("johab")); // Mac Johab (old)
+ fcharset.put(83, getCharset("MacHebrew")); // Mac Hebrew
+ fcharset.put(84, getCharset("MacArabic")); // Mac Arabic
+ fcharset.put(85, getCharset("MacGreek")); // Mac Greek
+ fcharset.put(86, getCharset("MacTurkish")); // Mac Turkish
+ fcharset.put(87, getCharset("MacThai")); // Mac Thai
+ fcharset.put(88, getCharset("cp1250")); // Mac East Europe
+ fcharset.put(89, getCharset("cp1251")); // Mac Russian
+
+ fcharset.put(128, getCharset("MS932")); // Shift JIS
+ fcharset.put(129, getCharset("ms949")); // Hangul
+ fcharset.put(130, getCharset("ms1361")); // Johab
+ fcharset.put(134, getCharset("ms936")); // GB2312
+ fcharset.put(136, getCharset("ms950")); // Big5
+ fcharset.put(161, getCharset("cp1253")); // Greek
+ fcharset.put(162, getCharset("cp1254")); // Turkish
+ fcharset.put(163, getCharset("cp1258")); // Vietnamese
+ fcharset.put(177, getCharset("cp1255")); // Hebrew
+ fcharset.put(178, getCharset("cp1256")); // Arabic
+ fcharset.put(186, getCharset("cp1257")); // Baltic
+
+ fcharset.put(204, getCharset("cp1251")); // Russian
+ fcharset.put(222, getCharset("ms874")); // Thai
+ fcharset.put(238, getCharset("cp1250")); // Eastern European
+ fcharset.put(254, getCharset("cp437")); // PC 437
+ fcharset.put(255, getCharset("cp850")); // OEM
+
+ FCHARSET_MAP = Collections.unmodifiableMap(fcharset);
+ }
+
+ // Builds ANSICPG_MAP, keyed by the numeric code page. Every name is resolved
+ // through getCharset, which substitutes US-ASCII when the lookup throws
+ // IllegalArgumentException, so each listed code page always gets a non-null value.
+ // NOTE(review): because such entries are stored as US-ASCII rather than omitted,
+ // resolveCodePage returns US-ASCII for them and never reaches its windows-N / cpN /
+ // windows-1252 fallbacks. Which names are unsupported depends on the JVM; confirm
+ // this is the intended result for names like "windows-709".
+ static {
+ Map<Integer, Charset> ansicpg = new HashMap<>();
+
+ ansicpg.put(437, getCharset("CP437")); // US IBM
+ ansicpg.put(708, getCharset("ISO-8859-6")); // Arabic (ASMO 708)
+ ansicpg.put(709, getCharset("windows-709")); // Arabic (ASMO 449+)
+ ansicpg.put(710, getCharset("windows-710")); // Arabic
(transparent)
+ ansicpg.put(711, getCharset("windows-711")); // Arabic (Nafitha)
+ ansicpg.put(720, getCharset("windows-720")); // Arabic
(transparent ASMO)
+ ansicpg.put(819, getCharset("CP819")); // Windows 3.1
(US/Western)
+ ansicpg.put(850, getCharset("CP850")); // IBM Multilingual
+ ansicpg.put(852, getCharset("CP852")); // Eastern European
+ ansicpg.put(860, getCharset("CP860")); // Portuguese
+ ansicpg.put(862, getCharset("CP862")); // Hebrew
+ ansicpg.put(863, getCharset("CP863")); // French Canadian
+ ansicpg.put(864, getCharset("CP864")); // Arabic
+ ansicpg.put(865, getCharset("CP865")); // Norwegian
+ ansicpg.put(866, getCharset("CP866")); // Soviet Union
+ ansicpg.put(874, getCharset("MS874")); // Thai
+ ansicpg.put(932, getCharset("MS932")); // Japanese
+ ansicpg.put(936, getCharset("MS936")); // Simplified Chinese
+ ansicpg.put(949, getCharset("CP949")); // Korean
+ ansicpg.put(950, getCharset("CP950")); // Traditional
Chinese
+ ansicpg.put(1250, getCharset("CP1250")); // Eastern European
+ ansicpg.put(1251, getCharset("CP1251")); // Cyrillic
+ ansicpg.put(1252, getCharset("CP1252")); // Western European
+ ansicpg.put(1253, getCharset("CP1253")); // Greek
+ ansicpg.put(1254, getCharset("CP1254")); // Turkish
+ ansicpg.put(1255, getCharset("CP1255")); // Hebrew
+ ansicpg.put(1256, getCharset("CP1256")); // Arabic
+ ansicpg.put(1257, getCharset("CP1257")); // Baltic
+ ansicpg.put(1258, getCharset("CP1258")); // Vietnamese
+ ansicpg.put(1361, getCharset("x-Johab")); // Johab
+ ansicpg.put(10000, getCharset("MacRoman")); // Mac Roman
+ ansicpg.put(10001, getCharset("Shift_JIS")); // Mac Japan
+ ansicpg.put(10004, getCharset("MacArabic")); // Mac Arabic
+ ansicpg.put(10005, getCharset("MacHebrew")); // Mac Hebrew
+ ansicpg.put(10006, getCharset("MacGreek")); // Mac Greek
+ ansicpg.put(10007, getCharset("MacCyrillic")); // Mac Cyrillic
+ ansicpg.put(10029, getCharset("x-MacCentralEurope")); // Mac Latin2
+ ansicpg.put(10081, getCharset("MacTurkish")); // Mac Turkish
+ ansicpg.put(57002, getCharset("x-ISCII91")); // Devanagari
+ ansicpg.put(57003, getCharset("windows-57003")); // Bengali
+ ansicpg.put(57004, getCharset("windows-57004")); // Tamil
+ ansicpg.put(57005, getCharset("windows-57005")); // Telugu
+ ansicpg.put(57006, getCharset("windows-57006")); // Assamese
+ ansicpg.put(57007, getCharset("windows-57007")); // Oriya
+ ansicpg.put(57008, getCharset("windows-57008")); // Kannada
+ ansicpg.put(57009, getCharset("windows-57009")); // Malayalam
+ ansicpg.put(57010, getCharset("windows-57010")); // Gujarati
+ ansicpg.put(57011, getCharset("windows-57011")); // Punjabi
+
+ ANSICPG_MAP = Collections.unmodifiableMap(ansicpg);
+ }
+
+ /** Private constructor: this class only exposes static constants and static methods. */
+ private RTFCharsetMaps() {
+ }
+
+ /**
+ * Looks up a charset by name via {@link CharsetUtils#forName(String)}.
+ * If that lookup throws {@link IllegalArgumentException}, US-ASCII is
+ * returned instead, so callers always receive a non-null charset.
+ *
+ * @param name the charset name to look up
+ * @return the resolved charset, or US-ASCII when the lookup fails
+ */
+ static Charset getCharset(String name) {
+     Charset resolved;
+     try {
+         resolved = CharsetUtils.forName(name);
+     } catch (IllegalArgumentException unsupported) {
+         resolved = StandardCharsets.US_ASCII;
+     }
+     return resolved;
+ }
+
+ /**
+ * Resolve an ANSI code page number to a Java Charset.
+ * Tries the ANSICPG_MAP first, then falls back to {@code windows-N} and
+ * {@code cpN}, in that order. Returns {@code WINDOWS_1252} if nothing matches.
+ *
+ * @param cpNumber the numeric code page, e.g. the parameter of {@code \ansicpgN}
+ * @return the first charset that resolves; never {@code null}
+ */
+ public static Charset resolveCodePage(int cpNumber) {
+     Charset mapped = ANSICPG_MAP.get(cpNumber);
+     if (mapped != null) {
+         return mapped;
+     }
+     for (String prefix : new String[] {"windows-", "cp"}) {
+         try {
+             return Charset.forName(prefix + cpNumber);
+         } catch (IllegalArgumentException ignored) {
+             // Charset.forName signals both an illegal and an unsupported name with
+             // IllegalArgumentException subclasses; either way, try the next candidate.
+         }
+     }
+     return WINDOWS_1252;
+ }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandler.java
new file mode 100644
index 0000000000..dda6d98dff
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandler.java
@@ -0,0 +1,254 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import java.io.IOException;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.io.FilenameUtils;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.RTFMetadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+
+/**
+ * Handles embedded objects and pictures within the JFlex-based RTF token
stream.
+ *
+ * <p>Uses streaming parsers ({@link RTFObjDataStreamParser} and
+ * {@link RTFPictStreamParser}) so that large embedded objects are written
+ * to temp files rather than buffered entirely in memory.</p>
+ */
+public class RTFEmbeddedHandler {
+
+ private final ContentHandler handler;
+ private final ParseContext context;
+ private final EmbeddedDocumentUtil embeddedDocumentUtil;
+ private final long maxBytes;
+
+ private boolean inObject;
+ private boolean isPictBitmap;
+ private int hi = -1;
+ private int thumbCount;
+ private final AtomicInteger unknownFilenameCount = new AtomicInteger();
+
+ private String sn = "";
+ private String sv = "";
+ private final StringBuilder metadataBuffer = new StringBuilder();
+
+ private Metadata metadata;
+
+ // Streaming parsers -- one active at a time
+ private RTFObjDataStreamParser objParser;
+ private RTFPictStreamParser pictParser;
+
+ /**
+ * Creates a handler that reports embedded documents to {@code handler}.
+ *
+ * @param handler receives the output of parsed embedded documents
+ * @param context parse context used for the embedded-document utilities and
+ * for creating metadata instances
+ * @param maxBytesInKb per-object limit in kilobytes; values of zero or less
+ * are stored as a byte limit of -1
+ */
+ public RTFEmbeddedHandler(ContentHandler handler, ParseContext context,
+         int maxBytesInKb) {
+     this.handler = handler;
+     this.context = context;
+     this.embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
+     if (maxBytesInKb > 0) {
+         // widen before multiplying so large KB values cannot overflow int
+         this.maxBytes = 1024L * maxBytesInKb;
+     } else {
+         this.maxBytes = -1;
+     }
+     this.metadata = Metadata.newInstance(context);
+ }
+
+ /**
+ * Process a token for embedded object/pict handling.
+ * Call this AFTER {@link RTFState#processToken(RTFToken)} has run.
+ *
+ * @param tok the token to handle
+ * @param rtfState shared parser state; its current group decides where TEXT
+ * and HEX_ESCAPE tokens are routed
+ * @param closingGroup the group that was just closed; only dereferenced for
+ * GROUP_CLOSE tokens, so it may be null for every other token type
+ * @throws IOException if a streaming parser fails while receiving a byte
+ * @throws SAXException if handling a completed object or pict raises one
+ * @throws TikaException if a streaming parser or embedded extraction raises one
+ */
+ public void processToken(RTFToken tok, RTFState rtfState, RTFGroupState
closingGroup)
+ throws IOException, SAXException, TikaException {
+ RTFGroupState group = rtfState.getCurrentGroup();
+
+ switch (tok.getType()) {
+ case GROUP_CLOSE:
+ // Exactly one of these completion actions runs, chosen by the flags of the
+ // group that just closed: finish an object, finish a pict, capture a shape
+ // property name (sn) or value (sv), or store the name/value pair (sp).
+ if (closingGroup.objdata) {
+ handleCompletedObjData();
+ } else if (closingGroup.pictDepth == 1) {
+ handleCompletedPict();
+ } else if (closingGroup.sn) {
+ sn = metadataBuffer.toString();
+ } else if (closingGroup.sv) {
+ sv = metadataBuffer.toString();
+ } else if (closingGroup.sp) {
+ metadata.add(sn, sv);
+ }
+ // Independent of the chain above: leaving an \object group ends "in object" mode.
+ if (closingGroup.object) {
+ inObject = false;
+ }
+ break;
+
+ case CONTROL_WORD:
+ switch (tok.getName()) {
+ case "object":
+ inObject = true;
+ break;
+ case "objdata":
+ // start a fresh metadata holder and streaming parser for this object
+ metadata = Metadata.newInstance(context);
+ objParser = new RTFObjDataStreamParser(maxBytes);
+ break;
+ case "pict":
+ // start a fresh metadata holder and streaming parser for this picture
+ metadata = Metadata.newInstance(context);
+ pictParser = new RTFPictStreamParser(maxBytes);
+ break;
+ case "sn":
+ // property names are accumulated with the pict metadata prefix in front
+ metadataBuffer.setLength(0);
+
metadataBuffer.append(RTFMetadata.RTF_PICT_META_PREFIX);
+ break;
+ case "sv":
+ metadataBuffer.setLength(0);
+ break;
+ case "wbitmap":
+ isPictBitmap = true;
+ break;
+ }
+ break;
+
+ case TEXT:
+ // Inside objdata, or directly inside the pict group, text characters are
+ // hex digits of the payload; inside sn/sv they are property text.
+ if (group.objdata || group.pictDepth == 1) {
+ writeHexChar(tok.getChar());
+ } else if (group.sn || group.sv) {
+ metadataBuffer.append(tok.getChar());
+ }
+ break;
+
+ case HEX_ESCAPE:
+ // the escaped value is appended as a single char with that numeric value
+ if (group.sn || group.sv) {
+ metadataBuffer.append((char) tok.getHexValue());
+ }
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ /**
+ * Finishes the current {@code \objdata} payload: asks the streaming parser
+ * for the resulting stream, hands it to {@link #extractObj}, then closes
+ * the parser and clears per-object state.
+ *
+ * <p>An {@link IOException} raised while completing or extracting is
+ * recorded on the metadata instead of being propagated. The parser
+ * reference and per-object state are always cleared, even when closing the
+ * parser throws; otherwise a stale parser would keep receiving the hex
+ * bytes of later pictures, because {@link #writeHexChar} prefers it.</p>
+ *
+ * @throws IOException if closing the streaming parser fails
+ * @throws SAXException if embedded extraction raises one
+ * @throws TikaException if the streaming parser or embedded extraction raises one
+ */
+ private void handleCompletedObjData() throws IOException, SAXException, TikaException {
+     if (objParser == null) {
+         // No parser is active (e.g. its creation failed earlier or the group was
+         // already completed); there is nothing to extract, so just clear state
+         // rather than fail with a NullPointerException.
+         reset();
+         return;
+     }
+     try (TikaInputStream tis = objParser.onComplete(metadata, unknownFilenameCount)) {
+         if (tis != null) {
+             extractObj(tis, metadata);
+         }
+     } catch (IOException e) {
+         EmbeddedDocumentUtil.recordException(e, metadata);
+     } finally {
+         try {
+             objParser.close();
+         } finally {
+             objParser = null;
+             reset();
+         }
+     }
+ }
+
+ /**
+ * Finishes the current {@code \pict} payload: derives name metadata from the
+ * {@code wzDescription} shape property when present, marks whether the
+ * picture was seen inside an object, and hands the resulting stream to
+ * {@link #extractObj}.
+ *
+ * <p>An {@link IOException} is recorded on the metadata instead of being
+ * propagated. Per-picture state is always cleared afterwards.</p>
+ *
+ * <p>NOTE(review): unlike the objdata path, the pict parser itself is not
+ * closed here; confirm that {@code onComplete} releases whatever it holds.</p>
+ *
+ * @throws IOException declared for symmetry with the objdata path
+ * @throws SAXException if embedded extraction raises one
+ * @throws TikaException if the streaming parser or embedded extraction raises one
+ */
+ private void handleCompletedPict() throws IOException, SAXException, TikaException {
+     if (pictParser == null) {
+         // No parser is active (e.g. its creation failed earlier or a nested pict
+         // already completed it); clear state rather than fail with a
+         // NullPointerException.
+         reset();
+         return;
+     }
+     try {
+         String filePath =
+                 metadata.get(RTFMetadata.RTF_PICT_META_PREFIX + "wzDescription");
+         if (filePath != null && !filePath.isEmpty()) {
+             metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, filePath);
+             metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
+                     FilenameUtils.getName(filePath));
+             metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, filePath);
+         }
+         // a picture inside an \object group is flagged as that object's thumbnail
+         metadata.set(RTFMetadata.THUMBNAIL, Boolean.toString(inObject));
+         if (isPictBitmap) {
+             metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
+                     "image/x-rtf-raw-bitmap");
+         }
+         try (TikaInputStream tis = pictParser.onComplete(metadata)) {
+             if (tis != null) {
+                 extractObj(tis, metadata);
+             }
+         }
+     } catch (IOException e) {
+         EmbeddedDocumentUtil.recordException(e, metadata);
+     } finally {
+         pictParser = null;
+         reset();
+     }
+ }
+
+ /**
+ * Feeds one character of a hex-encoded payload. Non-hex characters are
+ * ignored. The first digit of a pair is held in {@code hi}; the second
+ * completes the byte, which goes to the object parser if one is active,
+ * otherwise to the pict parser if one is active.
+ *
+ * @param b the character to consume
+ */
+ private void writeHexChar(int b) throws IOException, TikaException {
+     if (!isHexChar(b)) {
+         return;
+     }
+     int nibble = hexValue(b);
+     if (hi == -1) {
+         // first digit of the pair: remember the high nibble
+         hi = 16 * nibble;
+         return;
+     }
+     int completed = hi + nibble;
+     hi = -1;
+     if (objParser != null) {
+         objParser.onByte(completed);
+     } else if (pictParser != null) {
+         pictParser.onByte(completed);
+     }
+ }
+
+ /**
+ * Records the stream length on the metadata and, if the embedded-document
+ * utility agrees to parse it, parses the stream as an embedded document.
+ * When no resource name is set, one is generated from a thumbnail or
+ * embedded prefix, a running counter and the detected extension.
+ *
+ * @param tis the embedded content
+ * @param meta metadata for the embedded content; updated in place
+ */
+ private void extractObj(TikaInputStream tis, Metadata meta)
+         throws SAXException, IOException, TikaException {
+     meta.set(Metadata.CONTENT_LENGTH, Long.toString(tis.getLength()));
+
+     if (!embeddedDocumentUtil.shouldParseEmbedded(meta)) {
+         return;
+     }
+
+     if (meta.get(TikaCoreProperties.RESOURCE_NAME_KEY) == null) {
+         String extension = embeddedDocumentUtil.getExtension(tis, meta);
+         boolean thumbnail = inObject && pictParser != null;
+         String generatedName;
+         if (thumbnail) {
+             generatedName =
+                     EmbeddedDocumentUtil.EmbeddedResourcePrefix.THUMBNAIL.getPrefix()
+                             + "-" + thumbCount++ + extension;
+         } else {
+             generatedName =
+                     EmbeddedDocumentUtil.EmbeddedResourcePrefix.EMBEDDED.getPrefix()
+                             + "-" + unknownFilenameCount.getAndIncrement()
+                             + extension;
+         }
+         meta.set(TikaCoreProperties.RESOURCE_NAME_KEY, generatedName);
+         if (thumbnail) {
+             meta.set(RTFMetadata.THUMBNAIL, "true");
+         }
+         meta.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
+     }
+
+     try {
+         embeddedDocumentUtil.parseEmbedded(
+                 tis, new EmbeddedContentHandler(handler), meta, true);
+     } catch (IOException e) {
+         EmbeddedDocumentUtil.recordEmbeddedStreamException(e, meta);
+     }
+ }
+
+ /**
+ * Clears per-object state: pending hex nibble, bitmap flag, shape property
+ * name/value and buffer, and swaps in a fresh metadata instance.
+ */
+ private void reset() {
+     hi = -1;
+     isPictBitmap = false;
+     sn = "";
+     sv = "";
+     metadataBuffer.setLength(0);
+     metadata = Metadata.newInstance(context);
+ }
+
+ /** Returns true when {@code ch} is one of 0-9, a-f or A-F. */
+ private static boolean isHexChar(int ch) {
+     boolean decimalDigit = ch >= '0' && ch <= '9';
+     boolean lowerHex = ch >= 'a' && ch <= 'f';
+     boolean upperHex = ch >= 'A' && ch <= 'F';
+     return decimalDigit || lowerHex || upperHex;
+ }
+
+ /**
+ * Converts a hex digit to its numeric value (0-15).
+ *
+ * <p>Callers are expected to check {@link #isHexChar(int)} first; the result
+ * for any other character is not meaningful. The lowercase range is
+ * {@code 'a'..'f'} so that it matches {@code isHexChar}.</p>
+ *
+ * @param ch a character in 0-9, a-f or A-F
+ * @return the value of the digit
+ */
+ private static int hexValue(int ch) {
+     if (ch >= '0' && ch <= '9') {
+         return ch - '0';
+     } else if (ch >= 'a' && ch <= 'f') {
+         return 10 + (ch - 'a');
+     } else {
+         return 10 + (ch - 'A');
+     }
+ }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFGroupState.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFGroupState.java
new file mode 100644
index 0000000000..c5f9f8c444
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFGroupState.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import java.nio.charset.Charset;
+
+/**
+ * State associated with a single RTF group ({@code \{ ... \}}).
+ * <p>
+ * When a new group opens, the current state is pushed onto the stack and a
+ * child state is created that inherits the parent's properties. When the group
+ * closes, the state is popped.
+ */
+public class RTFGroupState {
+
+ /** Nesting depth (0 = root). */
+ int depth;
+
+ /** Current font charset, set by {@code \fN} if the font table maps it.
May be null. */
+ Charset fontCharset;
+
+ /** Current font ID, set by {@code \fN}. -1 if unset. */
+ int fontId = -1;
+
+ /** Number of ANSI chars to skip after a unicode escape (ucN control
word). Default 1. */
+ int ucSkip = 1;
+
+ /** True if this group's content should be ignored (e.g. {@code \*}
destination). */
+ boolean ignore;
+
+ /** True if bold. */
+ boolean bold;
+
+ /** True if italic. */
+ boolean italic;
+
+ // Embedded object / picture state
+ boolean objdata;
+ int pictDepth;
+ boolean sp;
+ boolean sn;
+ boolean sv;
+ boolean object;
+ boolean annotation;
+
+ /** Create a root group state with defaults. */
+ public RTFGroupState() {
+ }
+
+ /** Create a child group state inheriting from the parent. */
+ public RTFGroupState(RTFGroupState parent) {
+ this.depth = parent.depth + 1;
+ this.fontCharset = parent.fontCharset;
+ this.fontId = parent.fontId;
+ this.ucSkip = parent.ucSkip;
+ this.ignore = parent.ignore;
+ this.bold = parent.bold;
+ this.italic = parent.italic;
+ this.pictDepth = parent.pictDepth > 0 ? parent.pictDepth + 1 : 0;
+ // objdata, sp, sn, sv, object, annotation are NOT inherited
+ }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulator.java
new file mode 100644
index 0000000000..7ef06f1ebe
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulator.java
@@ -0,0 +1,239 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Extracts the original HTML from an RTF document that contains encapsulated
HTML
+ * (as indicated by the {@code \fromhtml1} control word), using a JFlex-based
tokenizer
+ * and shared {@link RTFState} for font/codepage tracking.
+ *
+ * <p>Embedded objects and pictures are extracted in the same pass via
+ * {@link RTFEmbeddedHandler}.</p>
+ */
+public class RTFHtmlDecapsulator {
+
+ private static final int DEFAULT_MAX_BYTES_KB = 2 * 1024 * 1024; // 2 GB
+
+ private final RTFEmbeddedHandler embHandler;
+
+ /**
+ * Creates a decapsulator whose embedded objects and pictures are handled
+ * by a new {@link RTFEmbeddedHandler}.
+ *
+ * @param handler passed to the embedded handler
+ * @param context passed to the embedded handler
+ * @param maxBytesInKb per-object size limit in kilobytes, passed to the embedded handler
+ */
+ public RTFHtmlDecapsulator(ContentHandler handler, ParseContext context,
+ int maxBytesInKb) {
+ this.embHandler = new RTFEmbeddedHandler(handler, context,
maxBytesInKb);
+ }
+
+ /**
+ * Creates a decapsulator using the default per-object limit
+ * ({@code DEFAULT_MAX_BYTES_KB}, 2 * 1024 * 1024 KB).
+ *
+ * @param handler passed to the embedded handler
+ * @param context passed to the embedded handler
+ */
+ public RTFHtmlDecapsulator(ContentHandler handler, ParseContext context) {
+ this(handler, context, DEFAULT_MAX_BYTES_KB);
+ }
+
+ /**
+ * Tokenizes the RTF once, rebuilding the encapsulated HTML while passing
+ * tokens to the embedded handler for object/pict extraction.
+ *
+ * @param rtfBytes the raw RTF bytes; may be null or empty
+ * @return the recovered HTML, or {@code null} when the input is null/empty,
+ * no {@code fromhtml} control word was seen, or no HTML text was produced
+ * @throws IOException if the tokenizer fails to read
+ * @throws SAXException if the embedded handler raises one (it is not caught here)
+ * @throws TikaException declared; those raised by the embedded handler are caught below
+ */
+ public String extract(byte[] rtfBytes) throws IOException, SAXException,
TikaException {
+ if (rtfBytes == null || rtfBytes.length == 0) {
+ return null;
+ }
+ // Wrap byte[] in a Reader directly — RTF is 7-bit ASCII, so
+ // US_ASCII decoding is a 1:1 byte-to-char mapping with no
+ // intermediate String allocation.
+ // NOTE(review): a raw byte >= 0x80 would not survive US_ASCII decoding; this
+ // relies on the input escaping such bytes (e.g. as \'hh). Confirm.
+ Reader reader = new InputStreamReader(
+ new ByteArrayInputStream(rtfBytes), StandardCharsets.US_ASCII);
+ RTFTokenizer tokenizer = new RTFTokenizer(reader);
+ RTFState state = new RTFState();
+ StringBuilder html = new StringBuilder(rtfBytes.length / 2);
+ // hex-escaped bytes are collected here and decoded together with the current charset
+ ByteArrayOutputStream pendingBytes = new ByteArrayOutputStream();
+
+ boolean foundFromHtml = false;
+ boolean foundHtmlTag = false;
+ // true while inside an \htmlrtf ... \htmlrtf0 span, whose content is not emitted
+ boolean inHtmlRtfSkip = false;
+ // true once a \* symbol has been seen since the last group opened
+ boolean sawIgnorable = false;
+ // depth at which the current \htmltag was seen; -1 when none is being tracked
+ int htmlTagDepth = -1;
+ boolean inHtmlTag = false;
+
+ RTFToken tok;
+ while ((tok = tokenizer.yylex()) != null) {
+ RTFTokenType type = tok.getType();
+ if (type == RTFTokenType.EOF) {
+ break;
+ }
+
+ // Flush pending bytes before charset-changing events
+ if (type == RTFTokenType.GROUP_CLOSE
+ || (type == RTFTokenType.CONTROL_WORD &&
"f".equals(tok.getName())
+ && tok.hasParameter())) {
+ flushPendingBytes(pendingBytes, html, state);
+ }
+
+ boolean consumed = state.processToken(tok);
+
+ // Embedded handler processes objdata/pict/sp in the same pass
+ if (!consumed) {
+ // the handler only needs the closed group for GROUP_CLOSE tokens
+ RTFGroupState closingGroup =
+ (type == RTFTokenType.GROUP_CLOSE) ?
state.getLastClosedGroup() : null;
+ try {
+ embHandler.processToken(tok, state, closingGroup);
+ } catch (TikaException | IOException e) {
+ // don't let a bad embedded object kill decapsulation
+ // NOTE(review): the exception is dropped without being recorded anywhere;
+ // SAXException and runtime exceptions are not caught and still propagate.
+ }
+ }
+
+ RTFGroupState group = state.getCurrentGroup();
+
+ // Skip tokens that are part of objdata/pict hex streams
+ if (!consumed && (group.objdata || group.pictDepth > 0)) {
+ continue;
+ }
+
+ switch (type) {
+ case GROUP_OPEN:
+ // a \* only applies to the group it appears in
+ sawIgnorable = false;
+ break;
+
+ case GROUP_CLOSE:
+ // state has already processed the close, so a depth below the recorded one
+ // means the \htmltag group has ended
+ if (inHtmlTag && state.getDepth() < htmlTagDepth) {
+ flushPendingBytes(pendingBytes, html, state);
+ inHtmlTag = false;
+ htmlTagDepth = -1;
+ }
+ break;
+
+ case CONTROL_SYMBOL:
+ if (tok.getChar() == '*') {
+ sawIgnorable = true;
+ }
+ if (!foundHtmlTag || inHtmlRtfSkip) {
+ break;
+ }
+ if (inHtmlTag || htmlTagDepth == -1) {
+ // escaped braces and backslashes are literal characters of the HTML
+ char sym = tok.getChar();
+ if (sym == '{' || sym == '}' || sym == '\\') {
+ flushPendingBytes(pendingBytes, html, state);
+ html.append(sym);
+ }
+ }
+ break;
+
+ case CONTROL_WORD:
+ if (consumed) {
+ break;
+ }
+ String name = tok.getName();
+
+ // any \fromhtml control word counts; its parameter is not inspected
+ if ("fromhtml".equals(name)) {
+ foundFromHtml = true;
+ break;
+ }
+ // \htmltag only counts when preceded by \* in the same group and after \fromhtml
+ if ("htmltag".equals(name) && sawIgnorable) {
+ if (!foundFromHtml) {
+ break;
+ }
+ foundHtmlTag = true;
+ flushPendingBytes(pendingBytes, html, state);
+ inHtmlTag = true;
+ htmlTagDepth = state.getDepth();
+ break;
+ }
+ // \htmlrtf (no parameter or non-zero) starts skipping; \htmlrtf0 stops it
+ if ("htmlrtf".equals(name)) {
+ flushPendingBytes(pendingBytes, html, state);
+ inHtmlRtfSkip = !(tok.hasParameter() &&
tok.getParameter() == 0);
+ break;
+ }
+ if (!foundHtmlTag || inHtmlRtfSkip) {
+ break;
+ }
+ if (inHtmlTag || htmlTagDepth == -1) {
+ flushPendingBytes(pendingBytes, html, state);
+ // a few formatting words map to literal output; all others emit nothing
+ switch (name) {
+ case "par":
+ case "pard":
+ html.append('\n');
+ break;
+ case "tab":
+ html.append('\t');
+ break;
+ case "line":
+ html.append("<br>");
+ break;
+ default:
+ break;
+ }
+ }
+ break;
+
+ case HEX_ESCAPE:
+ if (consumed || !foundHtmlTag || inHtmlRtfSkip) {
+ break;
+ }
+ if (inHtmlTag || htmlTagDepth == -1) {
+ // buffered, not appended: consecutive bytes may form one multi-byte character
+ pendingBytes.write(tok.getHexValue());
+ }
+ break;
+
+ case UNICODE_ESCAPE:
+ if (!foundHtmlTag || inHtmlRtfSkip) {
+ break;
+ }
+ if (inHtmlTag || htmlTagDepth == -1) {
+ flushPendingBytes(pendingBytes, html, state);
+ int cp = tok.getParameter();
+ // NOTE(review): a negative parameter fails this check and is dropped. RTF
+ // writes \u values above 32767 as negative numbers, so confirm that the
+ // tokenizer/RTFToken normalizes them before they reach this point.
+ if (Character.isValidCodePoint(cp)) {
+ html.appendCodePoint(cp);
+ }
+ }
+ break;
+
+ case TEXT:
+ if (consumed || !foundHtmlTag || inHtmlRtfSkip) {
+ break;
+ }
+ if (inHtmlTag || htmlTagDepth == -1) {
+ flushPendingBytes(pendingBytes, html, state);
+ html.append(tok.getChar());
+ }
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ // decode anything still buffered when the input ends
+ flushPendingBytes(pendingBytes, html, state);
+ if (!foundFromHtml || html.length() == 0) {
+ return null;
+ }
+ return html.toString();
+ }
+
+ /**
+ * Decodes any buffered bytes with the state's current charset, appends the
+ * text to {@code out} and empties the buffer. Does nothing when the buffer
+ * is empty.
+ *
+ * @param pending buffer of not-yet-decoded bytes; emptied on flush
+ * @param out destination for the decoded text
+ * @param state supplies the charset to decode with
+ */
+ private static void flushPendingBytes(ByteArrayOutputStream pending, StringBuilder out,
+         RTFState state) {
+     if (pending.size() <= 0) {
+         return;
+     }
+     Charset decodeWith = state.getCurrentCharset();
+     String decoded = new String(pending.toByteArray(), decodeWith);
+     out.append(decoded);
+     pending.reset();
+ }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFObjDataStreamParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFObjDataStreamParser.java
new file mode 100644
index 0000000000..8f7801f790
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFObjDataStreamParser.java
@@ -0,0 +1,510 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.Closeable;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Locale;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.FileMagic;
+import org.apache.poi.poifs.filesystem.Ole10Native;
+import org.apache.poi.poifs.filesystem.Ole10NativeException;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
+import org.apache.tika.io.BoundedInputStream;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.RTFMetadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
+
+/**
+ * Parses OLE objdata from an RTF stream inline, byte by byte.
+ *
+ * <p>The OLE objdata structure is:
+ * <pre>
+ * [4 bytes version][4 bytes formatId]
+ * [4 bytes classNameLen][classNameLen bytes className]
+ * [4 bytes topicNameLen][topicNameLen bytes topicName]
+ * [4 bytes itemNameLen][itemNameLen bytes itemName]
+ * [4 bytes dataSz][dataSz bytes payload]
+ * </pre>
+ * The small header fields are parsed byte-by-byte via a state machine.
+ * Once the header is complete and {@code dataSz} is known, the payload
+ * bytes stream directly to a temp file -- never buffered in memory.</p>
+ *
+ * <p>On {@link #onComplete(Metadata, AtomicInteger)}, the payload is
+ * interpreted based on {@code className} (Package, PBrush, POIFS, etc.)
+ * and the extracted content is returned as a {@link TikaInputStream} whose
+ * close will clean up all temp files via {@link TemporaryResources}.</p>
+ */
+public class RTFObjDataStreamParser implements Closeable {
+
+ private static final String WIN_ASCII = "WINDOWS-1252";
+
+ private final long maxBytes;
+ private final TemporaryResources tmp = new TemporaryResources();
+
+ // State machine
+ private Field currentField = Field.VERSION;
+ private byte[] fieldBuf = new byte[4];
+ private int fieldPos;
+ private int fieldTarget = 4;
+
+ // Parsed header values
+ private long version;
+ private long formatId;
+ private String className;
+ private String topicName;
+ private String itemName;
+ private long dataSz;
+
+ // String accumulator for length-prefixed ANSI strings
+ private byte[] stringBuf;
+ private int stringPos;
+
+ // Payload streaming
+ private Path tempFile;
+ private OutputStream dataOut;
+ private long dataWritten;
+
+ /**
+ * Creates a parser with the given payload size limit.
+ *
+ * @param maxBytes maximum payload bytes to accept. The limit is only enforced
+ * when this value is greater than zero, so zero or any negative value
+ * (e.g. -1) means unlimited.
+ */
+ public RTFObjDataStreamParser(long maxBytes) {
+ this.maxBytes = maxBytes;
+ }
+
+ /**
+ * Receive a single decoded byte from the objdata hex stream.
+ *
+ * <p>The byte is routed by {@code currentField}: 4-byte fields are collected
+ * in {@code fieldBuf} and decoded as little-endian unsigned values, string
+ * fields are collected in {@code stringBuf}, and payload bytes are written to
+ * the temp file. Bytes that arrive in the {@code DONE} or {@code SKIP} states
+ * are dropped.</p>
+ *
+ * @param b the decoded byte; only its low 8 bits are stored
+ * @throws TikaMemoryLimitException if a payload byte would exceed a positive
+ * {@code maxBytes}
+ * @throws IOException if the temp file cannot be created or written
+ */
+ public void onByte(int b) throws IOException, TikaException {
+ switch (currentField) {
+ case VERSION:
+ fieldBuf[fieldPos++] = (byte) b;
+ if (fieldPos >= fieldTarget) {
+ version = readLE32(fieldBuf);
+ initUint32Field(Field.FORMAT_ID);
+ }
+ break;
+
+ case FORMAT_ID:
+ fieldBuf[fieldPos++] = (byte) b;
+ if (fieldPos >= fieldTarget) {
+ formatId = readLE32(fieldBuf);
+ if (formatId != 2L) {
+ // Not an embedded object (1 = link). Skip everything.
+ currentField = Field.SKIP;
+ } else {
+ initUint32Field(Field.CLASS_LEN);
+ }
+ }
+ break;
+
+ case CLASS_LEN:
+ fieldBuf[fieldPos++] = (byte) b;
+ if (fieldPos >= fieldTarget) {
+ int len = (int) readLE32(fieldBuf); // narrowing: values >= 2^31 become negative
+ initStringField(Field.CLASS_NAME, len);
+ }
+ break;
+
+ case CLASS_NAME:
+ stringBuf[stringPos++] = (byte) b;
+ if (stringPos >= fieldTarget) { // fieldTarget holds the string length here
+ className = decodeString(stringBuf, fieldTarget);
+ initUint32Field(Field.TOPIC_LEN);
+ }
+ break;
+
+ case TOPIC_LEN:
+ fieldBuf[fieldPos++] = (byte) b;
+ if (fieldPos >= fieldTarget) {
+ int len = (int) readLE32(fieldBuf);
+ initStringField(Field.TOPIC_NAME, len);
+ }
+ break;
+
+ case TOPIC_NAME:
+ stringBuf[stringPos++] = (byte) b;
+ if (stringPos >= fieldTarget) {
+ topicName = decodeString(stringBuf, fieldTarget);
+ initUint32Field(Field.ITEM_LEN);
+ }
+ break;
+
+ case ITEM_LEN:
+ fieldBuf[fieldPos++] = (byte) b;
+ if (fieldPos >= fieldTarget) {
+ int len = (int) readLE32(fieldBuf);
+ initStringField(Field.ITEM_NAME, len);
+ }
+ break;
+
+ case ITEM_NAME:
+ stringBuf[stringPos++] = (byte) b;
+ if (stringPos >= fieldTarget) {
+ itemName = decodeString(stringBuf, fieldTarget);
+ initUint32Field(Field.DATA_SIZE);
+ }
+ break;
+
+ case DATA_SIZE:
+ fieldBuf[fieldPos++] = (byte) b;
+ if (fieldPos >= fieldTarget) {
+ dataSz = readLE32(fieldBuf);
+ if (dataSz <= 0) { // no payload declared: no temp file is created
+ currentField = Field.DONE;
+ } else {
+ currentField = Field.DATA;
+ tempFile = tmp.createTempFile(".bin");
+ dataOut = new
BufferedOutputStream(Files.newOutputStream(tempFile));
+ }
+ }
+ break;
+
+ case DATA:
+ if (maxBytes > 0 && dataWritten >= maxBytes) { // limit applies only when maxBytes is positive
+ throw new TikaMemoryLimitException(dataWritten + 1,
maxBytes);
+ }
+ dataOut.write(b);
+ dataWritten++;
+ if (dataWritten >= dataSz) { // declared payload fully received
+ dataOut.close();
+ dataOut = null;
+ currentField = Field.DONE;
+ }
+ break;
+
+ case DONE:
+ case SKIP:
+ break;
+ }
+ }
+
+ /**
+  * Called when the objdata group closes. Populates metadata and returns
+  * a TikaInputStream with the extracted embedded content, or null if
+  * the object couldn't be parsed.
+  *
+  * <p>The payload is interpreted by lower-cased class name: {@code package}
+  * is unpacked, {@code pbrush} is returned as-is, and everything else is
+  * probed for an OLE2 container.</p>
+  *
+  * <p>The caller owns the returned TikaInputStream -- closing it will
+  * clean up all temp files via TemporaryResources. When null is returned,
+  * {@link #close()} must still be called to release any temp files.</p>
+  *
+  * @param metadata receives the app version, class, topic and item values
+  * @param unknownFilenameCount passed through to the generic/POIFS handler
+  * @return the embedded content, or null if the header was skipped, no
+  *         payload was declared, or the payload could not be interpreted
+  * @throws IOException if the temp file cannot be flushed or read
+  */
+ public TikaInputStream onComplete(Metadata metadata, AtomicInteger unknownFilenameCount)
+         throws IOException, TikaException {
+     if (currentField == Field.SKIP || tempFile == null) {
+         return null;
+     }
+
+     // If the group ended before dataSz bytes arrived, the payload stream is
+     // still open and its buffer unflushed. Close it first so the readers
+     // below see every byte that was received.
+     if (dataOut != null) {
+         dataOut.close();
+         dataOut = null;
+     }
+
+     metadata.add(RTFMetadata.EMB_APP_VERSION, Long.toString(version));
+     if (className != null && !className.isEmpty()) {
+         metadata.add(RTFMetadata.EMB_CLASS, className);
+     }
+     if (topicName != null && !topicName.isEmpty()) {
+         metadata.add(RTFMetadata.EMB_TOPIC, topicName);
+     }
+     if (itemName != null && !itemName.isEmpty()) {
+         metadata.add(RTFMetadata.EMB_ITEM, itemName);
+     }
+
+     String cn = className != null ? className.toLowerCase(Locale.ROOT) : "";
+
+     if ("package".equals(cn)) {
+         return handlePackage(metadata);
+     } else if ("pbrush".equals(cn)) {
+         return TikaInputStream.get(tempFile, metadata, tmp);
+     } else {
+         return handleGenericOrPOIFS(metadata, unknownFilenameCount);
+     }
+ }
+
+ /**
+  * Closes the payload stream if it is still open and releases all temp
+  * files held by this parser.
+  *
+  * @throws IOException if closing the stream or the temp resources fails
+  */
+ @Override
+ public void close() throws IOException {
+     try {
+         if (dataOut != null) {
+             dataOut.close();
+         }
+     } finally {
+         // Always release temp files, even when closing the stream failed.
+         dataOut = null;
+         tmp.close();
+     }
+ }
+
+ // --- Package handling ---
+
+ /**
+ * Unpacks a "Package" payload from the temp file.
+ *
+ * <p>Reads, in order: a 2-byte type, the display name, the icon path, the
+ * icon index, a second 2-byte type, a path length, the ANSI file path, the
+ * content length and the content bytes, then an optional UTF-16LE file path.
+ * The content is copied to a second temp file, which backs the returned
+ * stream.</p>
+ *
+ * @param metadata receives the original resource name, resource name and
+ * embedded relationship id
+ * @return a stream over the embedded file, or null if the second type field is not 3
+ * @throws IOException if the mandatory part of the structure is truncated
+ */
+ private TikaInputStream handlePackage(Metadata metadata) throws
IOException, TikaException {
+ try (InputStream is = new
BufferedInputStream(Files.newInputStream(tempFile))) {
+ readUShortLE(is); // type
+
+ String displayName = readNullTerminatedString(is);
+ readNullTerminatedString(is); // iconFilePath
+ readUShortBE(is); // iconIndex
+ int type2 = readUShortLE(is);
+
+ if (type2 != 3) { // any other value is not extracted
+ return null;
+ }
+
+ readUIntLE(is); // filePathLen
+ String ansiFilePath = readNullTerminatedString(is);
+ long bytesLen = readUIntLE(is);
+
+ // Write the embedded file content to a new temp file
+ Path contentFile = tmp.createTempFile(".bin");
+ try (OutputStream contentOut = new BufferedOutputStream(
+ Files.newOutputStream(contentFile))) {
+ copyBounded(is, contentOut, bytesLen); // copies fewer bytes if the stream ends early
+ }
+
+ // Try to read unicode file path (optional)
+ StringBuilder unicodePath = new StringBuilder();
+ try {
+ long unicodeLen = readUIntLE(is);
+ for (int i = 0; i < unicodeLen; i++) {
+ int lo = is.read();
+ int hi = is.read();
+ if (lo == -1 || hi == -1) { // truncated path: discard it and use the ANSI names
+ unicodePath.setLength(0);
+ break;
+ }
+ unicodePath.append((char) (lo + 256 * hi)); // little-endian 16-bit code unit
+ }
+ } catch (IOException e) {
+ unicodePath.setLength(0);
+ }
+
+ String fileNameToUse;
+ String pathToUse;
+ if (unicodePath.length() > 0) {
+ fileNameToUse = unicodePath.toString();
+ pathToUse = unicodePath.toString();
+ } else {
+ fileNameToUse = displayName != null ? displayName : "";
+ pathToUse = ansiFilePath != null ? ansiFilePath : "";
+ }
+ metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME,
fileNameToUse);
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
+ FilenameUtils.getName(fileNameToUse));
+ metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID,
pathToUse);
+
+ // Return TikaInputStream backed by contentFile; closing it cleans
up
+ // both contentFile and the original tempFile via
TemporaryResources
+ return TikaInputStream.get(contentFile, metadata, tmp);
+ }
+ }
+
+ // --- Generic / POIFS handling ---
+
+ /**
+  * Interprets the payload as either an opaque blob or an OLE2 (POIFS) container.
+  *
+  * <p>A payload that is not OLE2 is returned unchanged. For OLE2, the content
+  * is taken from the {@code Package} entry, an Ole10Native object, or the
+  * {@code CONTENTS}/{@code Contents} entry of a CompObj container. Any other
+  * POIFS type is returned whole with its detected content type set.</p>
+  *
+  * @param metadata metadata attached to the returned stream
+  * @param unknownFilenameCount currently unused by this method
+  * @return a stream over the extracted content, or null if nothing usable was found
+  * @throws TikaMemoryLimitException if extracted content would exceed a positive
+  *         {@code maxBytes}
+  * @throws IOException if the temp file or the container cannot be read
+  */
+ private TikaInputStream handleGenericOrPOIFS(Metadata metadata,
+                                              AtomicInteger unknownFilenameCount)
+         throws IOException, TikaException {
+     try (InputStream probe = new BufferedInputStream(Files.newInputStream(tempFile))) {
+         boolean isOLE2 = FileMagic.valueOf(probe) == FileMagic.OLE2;
+         if (!isOLE2) {
+             return TikaInputStream.get(tempFile, metadata, tmp);
+         }
+     }
+
+     // It's POIFS -- parse it
+     try (InputStream poifsIn = new BufferedInputStream(Files.newInputStream(tempFile));
+             POIFSFileSystem fs = new POIFSFileSystem(poifsIn)) {
+         DirectoryNode root = fs.getRoot();
+         if (root == null) {
+             return null;
+         }
+
+         byte[] content = null;
+
+         if (root.hasEntry("Package")) {
+             Entry pkg = root.getEntry("Package");
+             // A crafted container may use this name for a directory. Treat
+             // that as "no content" instead of failing with ClassCastException.
+             if (pkg instanceof DocumentEntry) {
+                 try (BoundedInputStream bis = new BoundedInputStream(
+                         maxBytes > 0 ? maxBytes : Long.MAX_VALUE,
+                         new DocumentInputStream((DocumentEntry) pkg))) {
+                     content = IOUtils.toByteArray(bis);
+                     if (bis.hasHitBound()) {
+                         throw new TikaMemoryLimitException(maxBytes + 1, maxBytes);
+                     }
+                 }
+             }
+         } else {
+             POIFSDocumentType type = POIFSDocumentType.detectType(root);
+             if (type == POIFSDocumentType.OLE10_NATIVE) {
+                 try {
+                     Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root);
+                     content = ole.getDataBuffer();
+                 } catch (Ole10NativeException ex) {
+                     // Not valid OLE10Native
+                 }
+             } else if (type == POIFSDocumentType.COMP_OBJ) {
+                 Entry contentsEntry;
+                 try {
+                     contentsEntry = root.getEntry("CONTENTS");
+                 } catch (FileNotFoundException e) {
+                     contentsEntry = root.getEntry("Contents");
+                 }
+                 if (contentsEntry instanceof DocumentEntry) {
+                     DocumentEntry contentsDoc = (DocumentEntry) contentsEntry;
+                     int size = contentsDoc.getSize();
+                     // Apply the same limit as the Package branch before allocating.
+                     if (maxBytes > 0 && size > maxBytes) {
+                         throw new TikaMemoryLimitException(size, maxBytes);
+                     }
+                     try (DocumentInputStream inp = new DocumentInputStream(contentsDoc)) {
+                         content = new byte[size];
+                         inp.readFully(content);
+                     }
+                 }
+             } else {
+                 // Unknown POIFS type -- return the whole thing
+                 metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
+                 return TikaInputStream.get(tempFile, metadata, tmp);
+             }
+         }
+
+         if (content != null) {
+             Path contentFile = tmp.createTempFile(".bin");
+             Files.write(contentFile, content);
+             return TikaInputStream.get(contentFile, metadata, tmp);
+         }
+     }
+     return null;
+ }
+
+ // --- Helper methods ---
+
+ /**
+ * Switches the state machine to {@code next} and resets the 4-byte field
+ * cursor, so the next four bytes passed to {@code onByte} fill {@code fieldBuf}.
+ */
+ private void initUint32Field(Field next) {
+ currentField = next;
+ fieldPos = 0;
+ fieldTarget = 4;
+ }
+
+ private static final int MAX_HEADER_STRING_LENGTH = 4096;
+
+ /**
+  * Prepares the state machine to read a length-prefixed header string.
+  *
+  * <p>A zero length records an empty string and moves straight on to the
+  * following length field. A length that is negative or larger than
+  * {@code MAX_HEADER_STRING_LENGTH} is treated as a corrupt header and puts
+  * the parser into {@code SKIP}.</p>
+  *
+  * @param next the string field to read (CLASS_NAME, TOPIC_NAME or ITEM_NAME)
+  * @param len the declared length, narrowed from an unsigned 32-bit value
+  */
+ private void initStringField(Field next, int len) {
+     currentField = next;
+     // The caller narrows an unsigned 32-bit length to int, so declared
+     // lengths of 2^31 and above arrive here as negative numbers. Those are
+     // as implausible as any other oversized length.
+     if (len < 0 || len > MAX_HEADER_STRING_LENGTH) {
+         // Corrupt or crafted header -- bail out
+         currentField = Field.SKIP;
+         return;
+     }
+     if (len == 0) {
+         switch (next) {
+             case CLASS_NAME:
+                 className = "";
+                 initUint32Field(Field.TOPIC_LEN);
+                 break;
+             case TOPIC_NAME:
+                 topicName = "";
+                 initUint32Field(Field.ITEM_LEN);
+                 break;
+             case ITEM_NAME:
+                 itemName = "";
+                 initUint32Field(Field.DATA_SIZE);
+                 break;
+             default:
+                 break;
+         }
+         return;
+     }
+     stringBuf = new byte[len];
+     stringPos = 0;
+     fieldTarget = len;
+ }
+
+ /**
+  * Decodes the first four bytes of {@code buf} as an unsigned little-endian
+  * 32-bit value.
+  */
+ private static long readLE32(byte[] buf) {
+     long value = 0L;
+     // Fold from the most significant byte (index 3) down to index 0.
+     for (int i = 3; i >= 0; i--) {
+         value = (value << 8) | (buf[i] & 0xFFL);
+     }
+     return value;
+ }
+
+ /**
+  * Decodes the first {@code len} bytes of {@code buf} using the charset named
+  * by {@code WIN_ASCII} and trims the result. Falls back to US-ASCII if that
+  * charset name is not supported by the runtime.
+  */
+ private static String decodeString(byte[] buf, int len) {
+     String decoded;
+     try {
+         decoded = new String(buf, 0, len, WIN_ASCII);
+     } catch (java.io.UnsupportedEncodingException e) {
+         decoded = new String(buf, 0, len, java.nio.charset.StandardCharsets.US_ASCII);
+     }
+     return decoded.trim();
+ }
+
+ private static int readUShortLE(InputStream is) throws IOException {
+ try {
+ return EndianUtils.readUShortLE(is);
+ } catch (EndianUtils.BufferUnderrunException e) {
+ throw new IOException(e);
+ }
+ }
+
+ private static int readUShortBE(InputStream is) throws IOException {
+ try {
+ return EndianUtils.readUShortBE(is);
+ } catch (EndianUtils.BufferUnderrunException e) {
+ throw new IOException(e);
+ }
+ }
+
+ private static long readUIntLE(InputStream is) throws IOException {
+ try {
+ return EndianUtils.readUIntLE(is);
+ } catch (EndianUtils.BufferUnderrunException e) {
+ throw new IOException(e);
+ }
+ }
+
+ /**
+  * Reads bytes up to (and consuming) the next zero byte and returns them as a
+  * string, mapping each byte value directly to the char with the same value.
+  *
+  * @throws IOException if the stream ends before a zero byte is found
+  */
+ private static String readNullTerminatedString(InputStream is) throws IOException {
+     StringBuilder text = new StringBuilder();
+     int next;
+     while ((next = is.read()) > 0) {
+         text.append((char) next);
+     }
+     if (next == -1) {
+         throw new IOException("hit end of stream before null terminator");
+     }
+     return text.toString();
+ }
+
+ private static long copyBounded(InputStream in, OutputStream out, long
maxLen)
+ throws IOException {
+ byte[] buf = new byte[8192];
+ long total = 0;
+ while (total < maxLen) {
+ int toRead = (int) Math.min(buf.length, maxLen - total);
+ int read = in.read(buf, 0, toRead);
+ if (read == -1) {
+ break;
+ }
+ out.write(buf, 0, read);
+ total += read;
+ }
+ return total;
+ }
+
+ /** States of the byte-by-byte header/payload state machine driven by {@code onByte}. */
+ private enum Field {
+ VERSION, FORMAT_ID, // two leading 4-byte fields
+ CLASS_LEN, CLASS_NAME, // length-prefixed class name
+ TOPIC_LEN, TOPIC_NAME, // length-prefixed topic name
+ ITEM_LEN, ITEM_NAME, // length-prefixed item name
+ DATA_SIZE, DATA, // payload length, then payload bytes written to the temp file
+ DONE, SKIP // terminal states: further bytes are ignored
+ }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFPictStreamParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFPictStreamParser.java
new file mode 100644
index 0000000000..906d351e26
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFPictStreamParser.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import java.io.BufferedOutputStream;
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Streams decoded bytes from an RTF {@code \pict} group to a temp file.
+ *
+ * <p>Pict data is raw image bytes (after hex-pair decoding). There is no
+ * header to parse -- bytes are written directly to a temp file. On
+ * {@link #onComplete(Metadata)}, a {@link TikaInputStream} is returned
+ * whose close will clean up the temp file via {@link TemporaryResources}.</p>
+ */
+public class RTFPictStreamParser implements Closeable {
+
+ private final long maxBytes;
+ private final TemporaryResources tmp = new TemporaryResources();
+ private Path tempFile;
+ private OutputStream out;
+ private long bytesWritten;
+
+ /**
+ * @param maxBytes maximum number of bytes to accept (-1 for unlimited)
+ */
+ public RTFPictStreamParser(long maxBytes) throws IOException {
+ this.maxBytes = maxBytes;
+ this.tempFile = tmp.createTempFile(".bin");
+ this.out = new BufferedOutputStream(Files.newOutputStream(tempFile));
+ }
+
+ /**
+ * Receive a single decoded byte from the pict hex stream.
+ */
+ public void onByte(int b) throws IOException, TikaException {
+ if (maxBytes > 0 && bytesWritten >= maxBytes) {
+ throw new TikaMemoryLimitException(bytesWritten + 1, maxBytes);
+ }
+ out.write(b);
+ bytesWritten++;
+ }
+
+ /**
+ * Called when the pict group closes. Returns a TikaInputStream backed
+ * by the temp file. The caller owns the TikaInputStream -- closing it
+ * will delete the temp file.
+ *
+ * @return a TikaInputStream, or null if no bytes were written
+ */
+ public TikaInputStream onComplete(Metadata metadata) throws IOException {
+ out.close();
+ out = null;
+ if (bytesWritten == 0) {
+ tmp.close();
+ return null;
+ }
+ // Hand ownership of the temp file to the TikaInputStream.
+ // TikaInputStream.close() will close the TemporaryResources,
+ // which deletes the temp file.
+ return TikaInputStream.get(tempFile, metadata, tmp);
+ }
+
+ /** Returns the number of bytes written so far. */
+ public long getBytesWritten() {
+ return bytesWritten;
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (out != null) {
+ out.close();
+ out = null;
+ }
+ tmp.close();
+ }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFState.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFState.java
new file mode 100644
index 0000000000..00ecd4d70d
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFState.java
@@ -0,0 +1,342 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import java.nio.charset.Charset;
+import java.util.ArrayDeque;
+import java.util.Deque;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Shared RTF parsing state: group stack, font table, codepage tracking,
+ * and unicode skip handling.
+ *
+ * <p>Both the HTML decapsulator and the full RTF parser use this class
+ * to manage the stateful parts of RTF processing.</p>
+ *
+ * <p>Typical usage: feed every token to {@link #processToken(RTFToken)}
+ * and query the current charset via {@link #getCurrentCharset()}.</p>
+ */
+public class RTFState {
+
+ /** Global charset from {@code \ansicpgN} or charset family selectors. */
+ private Charset globalCharset = RTFCharsetMaps.WINDOWS_1252;
+
+ /** Default font ID from {@code \deffN}. */
+ private int globalDefaultFont = -1;
+
+ /** Font table: maps font number ({@code \fN}) to charset ({@code
\fcharsetN}). */
+ private final Map<Integer, Charset> fontToCharset = new HashMap<>();
+
+ private static final int MAX_GROUP_DEPTH = 10_000;
+
+ /** Group state stack. */
+ private final Deque<RTFGroupState> stack = new ArrayDeque<>();
+
+ /** Current (active) group state. */
+ private RTFGroupState current = new RTFGroupState();
+
+ /** Number of ANSI chars remaining to skip after a unicode escape. */
+ private int ansiSkip = 0;
+
+ /** The group state that was just closed (before popGroup). Set on
GROUP_CLOSE. */
+ private RTFGroupState lastClosedGroup;
+
+ // Font table parsing state
+ // 0 = not yet seen, 1 = inside fonttbl, 2 = finished fonttbl
+ private int fontTableState = 0;
+ private int fontTableDepth = -1;
+ private int currentFontId = -1;
+
+ private boolean inHeader = true;
+
+ /**
+  * Feeds one token through the shared state machine.
+  * <p>
+  * Group delimiters push/pop the group stack, {@code \*} marks the current
+  * group as ignorable, control words are delegated to
+  * {@code processControlWord}, a unicode escape arms the ANSI skip counter
+  * from the current group's {@code ucSkip}, and hex escapes or text chars
+  * are swallowed while that counter is positive.
+  *
+  * @return true if the token was consumed by state management (caller
+  *         should skip it), false if the caller should also process it
+  */
+ public boolean processToken(RTFToken tok) {
+     switch (tok.getType()) {
+         case CONTROL_WORD:
+             return processControlWord(tok);
+
+         case GROUP_OPEN:
+             pushGroup();
+             break;
+
+         case GROUP_CLOSE:
+             lastClosedGroup = current;
+             popGroup();
+             // Leaving the group that held the font table ends font-table parsing.
+             if (fontTableState == 1 && current.depth < fontTableDepth) {
+                 fontTableState = 2;
+             }
+             break;
+
+         case CONTROL_SYMBOL:
+             if (tok.getChar() == '*') {
+                 current.ignore = true;
+             }
+             break;
+
+         case UNICODE_ESCAPE:
+             // The next ucSkip ANSI chars are the fallback for this escape.
+             ansiSkip = current.ucSkip;
+             break;
+
+         case HEX_ESCAPE:
+         case TEXT:
+             // Each hex escape or TEXT token counts as one skipped ANSI char.
+             if (ansiSkip > 0) {
+                 ansiSkip--;
+                 return true;
+             }
+             break;
+
+         default:
+             break;
+     }
+     return false;
+ }
+
+ /**
+ * Applies the state changes implied by one control word.
+ *
+ * <p>Checks run in a fixed order: global charset selectors and {@code deff},
+ * the start of the font table, font-table entries, {@code uc}, font changes,
+ * header-ending words, object/picture flags, and finally the
+ * {@code colortbl}/{@code stylesheet} destinations, which are marked
+ * ignorable only while still in the header.</p>
+ *
+ * @param tok a CONTROL_WORD token
+ * @return true if the word was fully handled here, false if the caller
+ * should also process it
+ */
+ private boolean processControlWord(RTFToken tok) {
+ String name = tok.getName();
+ boolean hasParam = tok.hasParameter();
+ int param = tok.getParameter();
+
+ // Global charset selectors (header)
+ switch (name) {
+ case "ansi":
+ globalCharset = RTFCharsetMaps.WINDOWS_1252;
+ return true;
+ case "pca":
+ globalCharset = RTFCharsetMaps.getCharset("cp850");
+ return true;
+ case "pc":
+ globalCharset = RTFCharsetMaps.getCharset("cp437");
+ return true;
+ case "mac":
+ globalCharset = RTFCharsetMaps.getCharset("MacRoman");
+ return true;
+ case "ansicpg":
+ if (hasParam) {
+ Charset cs = RTFCharsetMaps.ANSICPG_MAP.get(param);
+ if (cs != null) {
+ globalCharset = cs;
+ } else {
+ globalCharset = RTFCharsetMaps.resolveCodePage(param); // code page not in the map
+ }
+ }
+ return true;
+ case "deff":
+ if (hasParam) {
+ globalDefaultFont = param;
+ }
+ return true;
+ }
+
+ // Font table management
+ if ("fonttbl".equals(name)) {
+ fontTableState = 1;
+ fontTableDepth = current.depth; // remembered so leaving this depth ends the table
+ current.ignore = true;
+ return true;
+ }
+
+ if (fontTableState == 1) {
+ // Inside font table
+ if (current.depth < fontTableDepth) {
+ fontTableState = 2;
+ } else {
+ if ("f".equals(name) && hasParam) {
+ currentFontId = param;
+ return true;
+ } else if ("fcharset".equals(name) && hasParam) {
+ Charset cs = RTFCharsetMaps.FCHARSET_MAP.get(param);
+ if (cs != null) { // unmapped fcharset values leave the font without an entry
+ fontToCharset.put(currentFontId, cs);
+ }
+ return true;
+ }
+ }
+ }
+
+ // Unicode skip count
+ if ("uc".equals(name) && hasParam) {
+ current.ucSkip = param;
+ return true;
+ }
+
+ // Font change in body
+ if ("f".equals(name) && hasParam) {
+ current.fontId = param;
+ Charset fontCs = fontToCharset.get(param);
+ current.fontCharset = fontCs; // may be null
+ // If we've seen the font table and this is a body font change,
+ // we're out of the header
+ if (fontTableState == 2 && !current.ignore) {
+ inHeader = false;
+ }
+ return false; // caller may also want to know about font changes
+ }
+
+ // Header-ending control words
+ if (inHeader && !current.ignore) {
+ switch (name) {
+ case "par":
+ case "pard":
+ case "sect":
+ case "sectd":
+ case "plain":
+ case "ltrch":
+ case "rtlch":
+ case "htmlrtf":
+ case "line":
+ inHeader = false;
+ break;
+ }
+ }
+
+ // Embedded object / picture control words
+ switch (name) {
+ case "object":
+ current.object = true;
+ return false; // caller may want to know
+ case "objdata":
+ current.objdata = true;
+ return false;
+ case "pict":
+ current.pictDepth = 1;
+ return false;
+ case "sp":
+ current.sp = true;
+ return false;
+ case "sn":
+ current.sn = true;
+ return false;
+ case "sv":
+ current.sv = true;
+ return false;
+ case "wbitmap":
+ return false; // caller handles
+ }
+
+ // Ignorable destinations
+ if (inHeader) {
+ switch (name) {
+ case "colortbl":
+ case "stylesheet":
+ current.ignore = true;
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ /**
+  * Opens a new group: the current state is saved on the stack and a child
+  * state derived from it becomes current. Once {@code MAX_GROUP_DEPTH}
+  * groups are already stacked, the call is silently ignored and further
+  * opening braces are treated as flat content.
+  */
+ public void pushGroup() {
+     if (stack.size() < MAX_GROUP_DEPTH) {
+         stack.push(current);
+         current = new RTFGroupState(current);
+     }
+ }
+
+ /**
+  * Closes the current group by restoring the most recently saved parent
+  * state. Does nothing if no group is open.
+  */
+ public void popGroup() {
+     if (stack.isEmpty()) {
+         return;
+     }
+     current = stack.pop();
+ }
+
+ /**
+  * Picks the charset for decoding the current hex escape or text byte.
+  * Candidates are tried in this order:
+  * <ol>
+  * <li>the current group's font charset (from {@code \fN → \fcharsetN})</li>
+  * <li>the charset of the default font ({@code \deffN}), once the header
+  * has ended and that font has a mapping</li>
+  * <li>the global charset (from {@code \ansicpgN} or family selector)</li>
+  * </ol>
+  */
+ public Charset getCurrentCharset() {
+     Charset chosen = current.fontCharset;
+     if (chosen == null && globalDefaultFont != -1 && !inHeader) {
+         chosen = fontToCharset.get(globalDefaultFont);
+     }
+     return chosen != null ? chosen : globalCharset;
+ }
+
+ /**
+ * Returns the global charset: WINDOWS_1252 unless changed by {@code \ansi},
+ * {@code \pca}, {@code \pc}, {@code \mac} or {@code \ansicpgN}.
+ */
+ public Charset getGlobalCharset() {
+ return globalCharset;
+ }
+
+ /** Returns the current group state (the live object, not a copy). */
+ public RTFGroupState getCurrentGroup() {
+ return current;
+ }
+
+ /**
+ * Returns true if we're still in the RTF header (before body content).
+ * The flag starts as true and is cleared by the first header-ending control
+ * word, or the first font change after the font table, seen in a
+ * non-ignored group.
+ */
+ public boolean isInHeader() {
+ return inHeader;
+ }
+
+ /** Returns the nesting depth recorded in the current group state. */
+ public int getDepth() {
+ return current.depth;
+ }
+
+ /**
+ * Returns the font-to-charset mapping table. This is the internal map
+ * itself, not a copy, so changes made by the caller affect this state.
+ */
+ public Map<Integer, Charset> getFontToCharset() {
+ return fontToCharset;
+ }
+
+ /**
+ * Returns the number of ANSI chars remaining to skip after the most
+ * recent unicode escape.
+ */
+ public int getAnsiSkip() {
+ return ansiSkip;
+ }
+
+ /**
+ * Returns the group state that was just closed on the most recent GROUP_CLOSE.
+ * This is the child group's state before it was popped.
+ * Useful for checking flags like objdata, pictDepth, sn, sv, sp, object
+ * to trigger completion handlers.
+ * Returns null until the first GROUP_CLOSE token has been processed.
+ */
+ public RTFGroupState getLastClosedGroup() {
+ return lastClosedGroup;
+ }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFToken.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFToken.java
new file mode 100644
index 0000000000..3278a9a1a4
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFToken.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+/**
+ * A single token produced by the RTF tokenizer.
+ * <p>
+ * Mutable and reused by the tokenizer to avoid allocation in the hot loop.
+ * Consumers must copy any data they need before requesting the next token.
+ * <p>
+ * For TEXT and CONTROL_SYMBOL tokens (single character), use {@link
#getChar()}
+ * to avoid String allocation. For CONTROL_WORD tokens, use {@link #getName()}.
+ */
+public class RTFToken {
+
+ private RTFTokenType type;
+ private String name;
+ private char ch;
+ private int parameter;
+ private boolean hasParameter;
+
+ public void reset(RTFTokenType type) {
+ this.type = type;
+ this.name = null;
+ this.ch = 0;
+ this.parameter = -1;
+ this.hasParameter = false;
+ }
+
+ public void set(RTFTokenType type, String name, int parameter, boolean
hasParameter) {
+ this.type = type;
+ this.name = name;
+ this.ch = 0;
+ this.parameter = parameter;
+ this.hasParameter = hasParameter;
+ }
+
+ public void setChar(RTFTokenType type, char ch) {
+ this.type = type;
+ this.name = null;
+ this.ch = ch;
+ this.parameter = -1;
+ this.hasParameter = false;
+ }
+
+ public RTFTokenType getType() {
+ return type;
+ }
+
+ /** For CONTROL_WORD tokens: the control word name. */
+ public String getName() {
+ return name;
+ }
+
+ /**
+ * For TEXT and CONTROL_SYMBOL tokens: the single character, without
+ * allocating a String.
+ */
+ public char getChar() {
+ return ch;
+ }
+
+ public int getParameter() {
+ return parameter;
+ }
+
+ public boolean hasParameter() {
+ return hasParameter;
+ }
+
+ public int getHexValue() {
+ return parameter;
+ }
+
+ @Override
+ public String toString() {
+ switch (type) {
+ case GROUP_OPEN:
+ return "{";
+ case GROUP_CLOSE:
+ return "}";
+ case CONTROL_WORD:
+ return "\\" + name + (hasParameter ? String.valueOf(parameter)
: "");
+ case CONTROL_SYMBOL:
+ return "\\" + ch;
+ case HEX_ESCAPE:
+ return String.format(java.util.Locale.ROOT, "\\'%02x",
parameter);
+ case UNICODE_ESCAPE:
+ return "\\u" + parameter;
+ case TEXT:
+ return "TEXT[" + ch + "]";
+ case BIN:
+ return "\\bin" + parameter;
+ case CRLF:
+ return "CRLF";
+ case EOF:
+ return "EOF";
+ default:
+ return type.name();
+ }
+ }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenType.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenType.java
new file mode 100644
index 0000000000..dcdcf511f9
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenType.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+/**
+ * Kinds of tokens produced by the JFlex-based RTF tokenizer.
+ */
+public enum RTFTokenType {
+    /** An opening brace. */
+    GROUP_OPEN,
+    /** A closing brace. */
+    GROUP_CLOSE,
+    /** A backslash followed by letters, optionally with a numeric parameter. */
+    CONTROL_WORD,
+    /** A backslash followed by a single non-alphanumeric character. */
+    CONTROL_SYMBOL,
+    /** A backslash, an apostrophe and two hexadecimal digits. */
+    HEX_ESCAPE,
+    /** A backslash, the letter {@code u} and a (possibly negative) decimal number. */
+    UNICODE_ESCAPE,
+    /** A single character of plain text. */
+    TEXT,
+    /** The {@code bin} control word together with its decimal count. */
+    BIN,
+    /** A bare CR, LF or CR LF sequence in the input. */
+    CRLF,
+    /** End of input. */
+    EOF
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/jflex/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizer.jflex
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/jflex/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizer.jflex
new file mode 100644
index 0000000000..2f5baff0f4
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/jflex/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizer.jflex
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+%%
+
+%public
+%class RTFTokenizer
+%unicode
+%type RTFToken
+%char
+
+%{
+    /** Single token instance that every lexer action fills in and returns. */
+    private final RTFToken token = new RTFToken();
+
+    /**
+     * Gives access to the shared, reusable token. Its contents are overwritten
+     * by the next call to {@link #yylex()}, so callers must copy whatever
+     * they need before advancing.
+     */
+    public RTFToken getToken() {
+        return this.token;
+    }
+
+    /**
+     * Builds a CONTROL_WORD token from a match of the form
+     * backslash, letters, optional minus sign, digits, optional space.
+     */
+    private RTFToken controlWordWithParam() {
+        // Exclude the optional delimiter space swallowed by the pattern.
+        final int matched = yylength();
+        final int end = yycharat(matched - 1) == ' ' ? matched - 1 : matched;
+
+        // Index 0 is the backslash; the name runs from index 1 to the first non-letter.
+        int paramStart = 1;
+        while (paramStart < end && Character.isLetter(yycharat(paramStart))) {
+            paramStart++;
+        }
+
+        final String word = new String(zzBuffer, zzStartRead + 1, paramStart - 1);
+        final int value = parseIntFromBuffer(paramStart, end);
+        token.set(RTFTokenType.CONTROL_WORD, word, value, true);
+        return token;
+    }
+
+    /**
+     * Builds a parameterless CONTROL_WORD token from a match of the form
+     * backslash, letters, optional space. The parameter is reported as -1.
+     */
+    private RTFToken controlWord() {
+        // Name length = match length minus the backslash, minus the delimiter space if present.
+        int nameLength = yylength() - 1;
+        if (yycharat(yylength() - 1) == ' ') {
+            nameLength--;
+        }
+        token.set(RTFTokenType.CONTROL_WORD,
+                new String(zzBuffer, zzStartRead + 1, nameLength), -1, false);
+        return token;
+    }
+
+    /**
+     * Builds a HEX_ESCAPE token. The match is always four characters:
+     * backslash, apostrophe and two hex digits; the byte value is stored
+     * as the token parameter.
+     */
+    private RTFToken hexEscape() {
+        final int highNibble = Character.digit(yycharat(2), 16);
+        final int lowNibble = Character.digit(yycharat(3), 16);
+        final int byteValue = (highNibble << 4) | lowNibble;
+        token.set(RTFTokenType.HEX_ESCAPE, null, byteValue, true);
+        return token;
+    }
+
+    /**
+     * Builds a UNICODE_ESCAPE token from a match of the form
+     * backslash u, optional minus sign, digits, optional space.
+     */
+    private RTFToken unicodeEscape() {
+        final int matched = yylength();
+        // Exclude the optional delimiter space swallowed by the pattern.
+        final int end = yycharat(matched - 1) == ' ' ? matched - 1 : matched;
+        final int raw = parseIntFromBuffer(2, end);
+        // RTF writes the value as a signed 16-bit number: negatives map to 65536 + value.
+        final int codePoint = raw < 0 ? 65536 + raw : raw;
+        token.set(RTFTokenType.UNICODE_ESCAPE, null, codePoint, true);
+        return token;
+    }
+
+    /**
+     * Builds a BIN token from a match of the form
+     * backslash bin, digits, optional space. The count is stored as the parameter.
+     */
+    private RTFToken binToken() {
+        final int matched = yylength();
+        // Exclude the optional delimiter space swallowed by the pattern.
+        final int end = yycharat(matched - 1) == ' ' ? matched - 1 : matched;
+        // The digits start right after the four characters of the control word itself.
+        final int byteCount = parseIntFromBuffer(4, end);
+        token.set(RTFTokenType.BIN, null, byteCount, true);
+        return token;
+    }
+
+    /**
+     * Parse an integer from JFlex's internal char buffer between positions
+     * start (inclusive) and end (exclusive), relative to the current match.
+     * Handles optional leading '-'.
+     *
+     * <p>The digits come from untrusted input and may be arbitrarily long, so
+     * the value is accumulated in a {@code long} and saturates at
+     * {@link Integer#MAX_VALUE} (or its negation) instead of silently wrapping
+     * around to an unrelated, possibly negative, number.
+     *
+     * @param start index of the first character to read, relative to the match
+     * @param end   index one past the last digit, relative to the match
+     * @return the parsed value; 0 if the range is empty
+     */
+    private int parseIntFromBuffer(int start, int end) {
+        int pos = start;
+        boolean neg = false;
+        if (pos < end && yycharat(pos) == '-') {
+            neg = true;
+            pos++;
+        }
+        long result = 0;
+        while (pos < end) {
+            result = result * 10 + (yycharat(pos) - '0');
+            if (result > Integer.MAX_VALUE) {
+                // further digits can only push the value further out of range
+                result = Integer.MAX_VALUE;
+                break;
+            }
+            pos++;
+        }
+        return (int) (neg ? -result : result);
+    }
+%}
+
+/* RTF is 7-bit ASCII; bytes above 127 are escaped. We read as Latin1/byte stream. */
+
+/* RTF spec: a control word's delimiter space is consumed and not part of the output.
+   We include the optional trailing space in each pattern so the tokenizer eats it. */
+ControlWordWithParam = "\\" [a-zA-Z]+ "-"? [0-9]+ " "?
+ControlWord = "\\" [a-zA-Z]+ " "?
+HexEscape = "\\'" [0-9a-fA-F]{2}
+UnicodeEscape = "\\u" "-"? [0-9]+ " "?
+BinControl = "\\bin" [0-9]+ " "?
+ControlSymbol = "\\" [^a-zA-Z0-9\r\n]
+GroupOpen = "{"
+GroupClose = "}"
+CrLf = \r\n | \r | \n
+/* RTF spec: a CR or LF preceded by a backslash is treated as a paragraph break (par). */
+EscapedNewline = "\\" (\r\n | \r | \n)
+
+%%
+
+/* Order matters: more specific rules first */
+
+{BinControl} { return binToken(); }
+{UnicodeEscape} { return unicodeEscape(); }
+{HexEscape} { return hexEscape(); }
+{ControlWordWithParam} { return controlWordWithParam(); }
+{ControlWord} { return controlWord(); }
+{EscapedNewline} { token.set(RTFTokenType.CONTROL_WORD, "par", -1, false); return token; }
+{ControlSymbol} { token.setChar(RTFTokenType.CONTROL_SYMBOL, yycharat(1)); return token; }
+{GroupOpen} { token.reset(RTFTokenType.GROUP_OPEN); return token; }
+{GroupClose} { token.reset(RTFTokenType.GROUP_CLOSE); return token; }
+{CrLf} { token.reset(RTFTokenType.CRLF); return token; }
+
+/* Text: one char at a time. Uses yycharat(0) to avoid String allocation. */
+[^\\\{\}\r\n] { token.setChar(RTFTokenType.TEXT, yycharat(0)); return token; }
+
+/* Malformed input: a backslash that starts no valid control (for example one that is
+   followed by a digit, or is the last character of the stream). Without a rule for
+   it the generated scanner aborts with java.lang.Error, so skip the stray backslash
+   and keep scanning. Longest-match guarantees this only fires when nothing else does. */
+"\\" { /* ignore stray backslash */ }
+
+<<EOF>> { token.reset(RTFTokenType.EOF); return token; }
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandlerTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandlerTest.java
new file mode 100644
index 0000000000..32b8ae58f9
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandlerTest.java
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Tests for {@link RTFEmbeddedHandler} driven by the JFlex tokenizer,
+ * both standalone and integrated into the decapsulator.
+ */
+public class RTFEmbeddedHandlerTest {
+
+ private static ParseContext buildContext(List<Metadata> extracted) {
+ ParseContext context = new ParseContext();
+ context.set(EmbeddedDocumentExtractor.class, new
EmbeddedDocumentExtractor() {
+ @Override
+ public boolean shouldParseEmbedded(Metadata metadata) {
+ return true;
+ }
+
+ @Override
+ public void parseEmbedded(TikaInputStream stream, ContentHandler
handler,
+ Metadata metadata, ParseContext
parseContext,
+ boolean outputHtml) {
+ Metadata copy = new Metadata();
+ for (String name : metadata.names()) {
+ for (String val : metadata.getValues(name)) {
+ copy.add(name, val);
+ }
+ }
+ extracted.add(copy);
+ }
+ });
+ return context;
+ }
+
+ /**
+ * Process an RTF file through the tokenizer + state + embedded handler
directly.
+ */
+ private List<Metadata> extractEmbeddedDirect(String resourceName)
+ throws IOException, SAXException, TikaException {
+ List<Metadata> extracted = new ArrayList<>();
+ ParseContext context = buildContext(extracted);
+ ContentHandler handler = new DefaultHandler();
+ RTFEmbeddedHandler embHandler = new RTFEmbeddedHandler(handler,
context, 20 * 1024);
+ RTFState state = new RTFState();
+
+ try (InputStream is =
getClass().getResourceAsStream("/test-documents/" + resourceName);
+ Reader reader = new InputStreamReader(is,
StandardCharsets.US_ASCII)) {
+
+ RTFTokenizer tokenizer = new RTFTokenizer(reader);
+ RTFToken tok;
+
+ while ((tok = tokenizer.yylex()) != null) {
+ if (tok.getType() == RTFTokenType.EOF) {
+ break;
+ }
+ boolean consumed = state.processToken(tok);
+ if (!consumed) {
+ RTFGroupState closingGroup =
+ (tok.getType() == RTFTokenType.GROUP_CLOSE)
+ ? state.getLastClosedGroup() : null;
+ embHandler.processToken(tok, state, closingGroup);
+ }
+ }
+ }
+ return extracted;
+ }
+
+ @Test
+ public void testEmbeddedFiles() throws Exception {
+ List<Metadata> embedded =
extractEmbeddedDirect("testRTFEmbeddedFiles.rtf");
+ assertTrue(embedded.size() > 0,
+ "should extract at least one embedded object from
testRTFEmbeddedFiles.rtf");
+ }
+
+ @Test
+ public void testPictExtraction() throws Exception {
+ // Verifies the handler doesn't crash on a typical RTF file
+ extractEmbeddedDirect("testRTF.rtf");
+ }
+
+ @Test
+ public void testEmbeddedObjectMetadata() throws Exception {
+ List<Metadata> embedded =
extractEmbeddedDirect("testRTFEmbeddedFiles.rtf");
+ if (embedded.size() > 0) {
+ boolean hasName = false;
+ for (Metadata m : embedded) {
+ String name = m.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+ if (name != null && !name.isEmpty()) {
+ hasName = true;
+ break;
+ }
+ }
+ assertTrue(hasName, "at least one embedded should have a resource
name");
+ }
+ }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulatorTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulatorTest.java
new file mode 100644
index 0000000000..72235f36a3
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulatorTest.java
@@ -0,0 +1,260 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.IOException;
+
+import org.junit.jupiter.api.Test;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Tests for {@link RTFHtmlDecapsulator}, mirroring the original
+ * RTFEncapsulatedHTMLExtractorTest to verify parity.
+ */
+public class RTFHtmlDecapsulatorTest {
+
+ private static String extract(byte[] rtfBytes)
+ throws IOException, SAXException, TikaException {
+ return new RTFHtmlDecapsulator(new DefaultHandler(), new
ParseContext())
+ .extract(rtfBytes);
+ }
+
+ @Test
+ public void testNullAndEmpty() throws Exception {
+ assertNull(extract(null));
+ assertNull(extract(new byte[0]));
+ }
+
+ @Test
+ public void testNonEncapsulatedRtf() throws Exception {
+ String rtf = "{\\rtf1\\ansi\\deff0 Hello world}";
+ assertNull(extract(rtf.getBytes(US_ASCII)));
+ }
+
+ @Test
+ public void testSimpleEncapsulatedHtml() throws Exception {
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag19 <html>}\n" +
+ "{\\*\\htmltag34 <head>}\n" +
+ "{\\*\\htmltag41 </head>}\n" +
+ "{\\*\\htmltag50 <body>}\n" +
+ "\\htmlrtf {\\htmlrtf0\n" +
+ "{\\*\\htmltag64 <p>}\n" +
+ "{\\*\\htmltag84 Hello world}\n" +
+ "{\\*\\htmltag72 </p>}\n" +
+ "\\htmlrtf }\\htmlrtf0\n" +
+ "{\\*\\htmltag58 </body>}\n" +
+ "{\\*\\htmltag27 </html>}\n" +
+ "}";
+ String html = extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertTrue(html.contains("<html>"));
+ assertTrue(html.contains("<p>"));
+ assertTrue(html.contains("Hello world"));
+ assertTrue(html.contains("</html>"));
+ }
+
+ @Test
+ public void testImgCidExtraction() throws Exception {
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag19 <html>}\n" +
+ "{\\*\\htmltag50 <body>}\n" +
+ "{\\*\\htmltag84 <img
src=\"cid:[email protected]\">}\n" +
+ "{\\*\\htmltag58 </body>}\n" +
+ "{\\*\\htmltag27 </html>}\n" +
+ "}";
+ String html = extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertTrue(html.contains("cid:[email protected]"),
+ "CID reference should be preserved in extracted HTML");
+ }
+
+ @Test
+ public void testParAndTabDecoding() throws Exception {
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag241 <style>}\n" +
+ "{\\*\\htmltag241 body \\{\\par \\tab color: red;\\par \\}}\n"
+
+ "{\\*\\htmltag249 </style>}\n" +
+ "}";
+ String html = extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertTrue(html.contains("<style>"));
+ assertTrue(html.contains("body {"));
+ assertTrue(html.contains("\tcolor: red;"));
+ assertTrue(html.contains("</style>"));
+ }
+
+ @Test
+ public void testHexEscapeDecoding() throws Exception {
+ // \'e9 = 0xE9 = 'e' in windows-1252
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag84 caf\\'e9}\n" +
+ "}";
+ String html = extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertEquals("caf\u00e9", html);
+ }
+
+ @Test
+ public void testMultiByteHexEscape() throws Exception {
+ // \'fc = 'u' and \'df = 'ss' in windows-1252
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag84 gr\\'fc\\'dfe}\n" +
+ "}";
+ String html = extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertEquals("gr\u00fc\u00dfe", html);
+ }
+
+ @Test
+ public void testCodePage1254Turkish() throws Exception {
+ // \'fd in windows-1254 = 0xFD, decoded by Java's windows-1254 charset
+ String rtf = "{\\rtf1\\ansi\\ansicpg1254\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag84 Say\\'fdn}\n" +
+ "}";
+ String html = extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ // Verify the byte 0xFD is decoded through windows-1254
+ byte[] expected = new byte[] { 'S', 'a', 'y', (byte) 0xFD, 'n' };
+ assertEquals(new String(expected,
java.nio.charset.Charset.forName("windows-1254")), html);
+ }
+
+ @Test
+ public void testHtmlrtfSkipping() throws Exception {
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag84 Hello}\n" +
+ "\\htmlrtf {\\b bold rtf only}\\htmlrtf0\n" +
+ "{\\*\\htmltag84 World}\n" +
+ "}";
+ String html = extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertEquals("Hello World", html);
+ }
+
+ @Test
+ public void testEscapedBracesAndBackslash() throws Exception {
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag241 a \\{ b \\} c \\\\d}\n" +
+ "}";
+ String html = extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertEquals("a { b } c \\d", html);
+ }
+
+ @Test
+ public void testEmptyHtmltag() throws Exception {
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag72}\n" +
+ "{\\*\\htmltag84 text}\n" +
+ "}";
+ String html = extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertEquals("text", html);
+ }
+
+ @Test
+ public void testInterTagTextContent() throws Exception {
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag19 <html>}\n" +
+ "{\\*\\htmltag50 <body>}\n" +
+ "{\\*\\htmltag64 <p>}\n" +
+ "\\htmlrtf {\\htmlrtf0\n" +
+ "Hello from the message body\n" +
+ "\\htmlrtf\\par}\\htmlrtf0\n" +
+ "{\\*\\htmltag72 </p>}\n" +
+ "{\\*\\htmltag64 <p>}\n" +
+ "\\htmlrtf {\\htmlrtf0\n" +
+ "Second paragraph\n" +
+ "\\htmlrtf\\par}\\htmlrtf0\n" +
+ "{\\*\\htmltag72 </p>}\n" +
+ "{\\*\\htmltag58 </body>}\n" +
+ "{\\*\\htmltag27 </html>}\n" +
+ "}";
+ String html = extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertTrue(html.contains("<p>"), "should contain HTML tags");
+ assertTrue(html.contains("Hello from the message body"),
+ "should contain inter-tag text content");
+ assertTrue(html.contains("Second paragraph"),
+ "should contain second paragraph text");
+ assertTrue(html.contains("</html>"), "should contain closing tag");
+ }
+
+ @Test
+ public void testInterTagHexEscapes() throws Exception {
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag64 <p>}\n" +
+ "\\htmlrtf {\\htmlrtf0\n" +
+ "caf\\'e9\n" +
+ "\\htmlrtf }\\htmlrtf0\n" +
+ "{\\*\\htmltag72 </p>}\n" +
+ "}";
+ String html = extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertTrue(html.contains("caf\u00e9"), "hex escapes in inter-tag text
should be decoded");
+ }
+
+ @Test
+ public void testLineControlWord() throws Exception {
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\*\\htmltag84 line1\\line line2}\n" +
+ "}";
+ String html = extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertEquals("line1<br>line2", html);
+ }
+
+ @Test
+ public void testFontAwareCodePageDecoding() throws Exception {
+ // f0 = ANSI (fcharset 0 = windows-1252), f1 = Greek (fcharset 161 =
cp1253)
+ // \'e1 in windows-1252 = U+00E1 (a with acute)
+ // \'e1 in cp1253 = U+03B1 (GREEK SMALL LETTER ALPHA)
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\fonttbl{\\f0\\fcharset0 Times;}{\\f1\\fcharset161
Greek;}}\n" +
+ "{\\*\\htmltag84 \\f0 caf\\'e9}\n" +
+ "{\\*\\htmltag84 \\f1 \\'e1}\n" +
+ "}";
+ String html = extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ // f0: \'e9 in windows-1252 = e with acute
+ assertTrue(html.contains("caf\u00e9"), "f0 should decode as
windows-1252");
+ // f1: \'e1 in cp1253 = Greek alpha
+ assertTrue(html.contains("\u03b1"), "f1 should decode as cp1253
(Greek)");
+ }
+
+ @Test
+ public void testUnicodeEscapeWithAnsiShadow() throws Exception {
+ // \u8212 is em dash (U+2014). The \'97 is the ANSI shadow and should
be skipped.
+ String rtf = "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n" +
+ "{\\fonttbl{\\f0\\fcharset0 Times;}}\n" +
+ "{\\*\\htmltag84 A\\u8212\\'97B}\n" +
+ "}";
+ String html = extract(rtf.getBytes(US_ASCII));
+ assertNotNull(html);
+ assertEquals("A\u2014B", html);
+ }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFStateTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFStateTest.java
new file mode 100644
index 0000000000..7595c8342e
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFStateTest.java
@@ -0,0 +1,250 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import java.io.StringReader;
+import java.nio.charset.Charset;
+
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests for {@link RTFState}, feeding it tokens produced by {@link RTFTokenizer}.
+ */
+public class RTFStateTest {
+
+    /** Runs every token of {@code rtf} (up to EOF) through a fresh state and returns it. */
+    private RTFState processRtf(String rtf) throws Exception {
+        RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(rtf));
+        RTFState state = new RTFState();
+        for (RTFToken tok = tokenizer.yylex();
+                tok != null && tok.getType() != RTFTokenType.EOF;
+                tok = tokenizer.yylex()) {
+            state.processToken(tok);
+        }
+        return state;
+    }
+
+    @Test
+    public void testGlobalCharsetFromAnsicpg() throws Exception {
+        RTFState state = processRtf("{\\rtf1\\ansi\\ansicpg1251}");
+        assertEquals(Charset.forName("CP1251"), state.getGlobalCharset());
+    }
+
+    @Test
+    public void testGlobalCharsetDefaultWindows1252() throws Exception {
+        RTFState state = processRtf("{\\rtf1\\ansi}");
+        assertEquals(RTFCharsetMaps.WINDOWS_1252, state.getGlobalCharset());
+    }
+
+    @Test
+    public void testGlobalCharsetPca() throws Exception {
+        RTFState state = processRtf("{\\rtf1\\pca}");
+        assertEquals(Charset.forName("cp850"), state.getGlobalCharset());
+    }
+
+    @Test
+    public void testGlobalCharsetPc() throws Exception {
+        RTFState state = processRtf("{\\rtf1\\pc}");
+        assertEquals(Charset.forName("cp437"), state.getGlobalCharset());
+    }
+
+    @Test
+    public void testGlobalCharsetMac() throws Exception {
+        RTFState state = processRtf("{\\rtf1\\mac}");
+        assertEquals(Charset.forName("MacRoman"), state.getGlobalCharset());
+    }
+
+    @Test
+    public void testFontTableParsing() throws Exception {
+        // Realistic font table: f0=Times New Roman (ANSI), f1=MS Mincho (Shift_JIS)
+        String rtf = "{\\rtf1\\ansi\\deff0" +
+                "{\\fonttbl" +
+                "{\\f0\\froman\\fcharset0 Times New Roman;}" +
+                "{\\f1\\fnil\\fcharset128 MS Mincho;}" +
+                "}" +
+                "\\f0 Hello}";
+        RTFState state = processRtf(rtf);
+
+        // fcharset 0 = ANSI = WINDOWS-1252
+        assertEquals(RTFCharsetMaps.WINDOWS_1252, state.getFontToCharset().get(0));
+        // fcharset 128 = Shift JIS = MS932
+        assertEquals(Charset.forName("MS932"), state.getFontToCharset().get(1));
+    }
+
+    @Test
+    public void testCurrentCharsetFollowsFont() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\deff0" +
+                "{\\fonttbl" +
+                "{\\f0\\froman\\fcharset0 Times;}" +
+                "{\\f1\\fnil\\fcharset161 Greek;}" +
+                "}" +
+                "\\f1 text}";
+        RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(rtf));
+        RTFState state = new RTFState();
+        Charset charsetAtText = null;
+
+        for (RTFToken tok = tokenizer.yylex();
+                tok != null && tok.getType() != RTFTokenType.EOF;
+                tok = tokenizer.yylex()) {
+            state.processToken(tok);
+            // Capture charset when we see the first body text char
+            if (charsetAtText == null && tok.getType() == RTFTokenType.TEXT
+                    && tok.getChar() == 't') {
+                charsetAtText = state.getCurrentCharset();
+            }
+        }
+
+        // Verify font table was populated
+        assertEquals(2, state.getFontToCharset().size());
+        assertEquals(Charset.forName("cp1253"), state.getFontToCharset().get(1));
+
+        // After \f1, charset should be cp1253 (Greek)
+        assertNotNull(charsetAtText);
+        assertEquals(Charset.forName("cp1253"), charsetAtText);
+    }
+
+    @Test
+    public void testCurrentCharsetFallsBackToGlobal() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1254\\deff0" +
+                "{\\fonttbl" +
+                "{\\f0\\froman\\fcharset0 Times;}" +
+                "}" +
+                "\\f0 text}";
+        RTFState state = processRtf(rtf);
+
+        // fcharset 0 = WINDOWS-1252 (ANSI)
+        assertEquals(RTFCharsetMaps.WINDOWS_1252, state.getCurrentCharset());
+    }
+
+    @Test
+    public void testDefaultFontCharset() throws Exception {
+        // \deff1 sets default font to f1, which maps to fcharset 162 (Turkish = cp1254)
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\deff1" +
+                "{\\fonttbl" +
+                "{\\f0\\froman\\fcharset0 Times;}" +
+                "{\\f1\\fnil\\fcharset162 Arial;}" +
+                "}" +
+                "\\pard text}";
+        RTFState state = processRtf(rtf);
+
+        // No explicit \fN in body, so should fall back to deff1 -> fcharset 162 -> cp1254
+        assertEquals(Charset.forName("cp1254"), state.getCurrentCharset());
+    }
+
+    @Test
+    public void testUcSkipInherited() throws Exception {
+        // RTF uc control word sets skip count to 2, inherited by child groups
+        // We process token-by-token and check inside the inner group
+        String rtf = "{\\rtf1\\ansi\\uc2{inner}}";
+        RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(rtf));
+        RTFState state = new RTFState();
+
+        int ucSkipInInnerGroup = -1;
+        boolean seenInnerText = false;
+        for (RTFToken tok = tokenizer.yylex();
+                tok != null && tok.getType() != RTFTokenType.EOF;
+                tok = tokenizer.yylex()) {
+            state.processToken(tok);
+            // Check ucSkip when we see the first char of "inner"
+            if (!seenInnerText && tok.getType() == RTFTokenType.TEXT
+                    && tok.getChar() == 'i') {
+                ucSkipInInnerGroup = state.getCurrentGroup().ucSkip;
+                seenInnerText = true;
+            }
+        }
+        // Inside {inner}, ucSkip should be inherited as 2 from parent
+        assertEquals(2, ucSkipInInnerGroup);
+    }
+
+    @Test
+    public void testAnsiSkipAfterUnicode() throws Exception {
+        // After the unicode escape 8212, the next ucSkip (default 1) ANSI chars
+        // should be skipped
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252" +
+                "{\\fonttbl{\\f0\\fcharset0 Times;}}" +
+                "\\f0 A\\u8212\\'97B}";
+        RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(rtf));
+        RTFState state = new RTFState();
+        StringBuilder textOutput = new StringBuilder();
+
+        for (RTFToken tok = tokenizer.yylex();
+                tok != null && tok.getType() != RTFTokenType.EOF;
+                tok = tokenizer.yylex()) {
+            boolean consumed = state.processToken(tok);
+            if (consumed || state.getCurrentGroup().ignore) {
+                continue;
+            }
+            if (tok.getType() == RTFTokenType.TEXT) {
+                textOutput.append(tok.getChar());
+            } else if (tok.getType() == RTFTokenType.UNICODE_ESCAPE) {
+                int cp = tok.getParameter();
+                if (Character.isValidCodePoint(cp)) {
+                    textOutput.appendCodePoint(cp);
+                }
+            }
+        }
+        // A + em dash + B. The \'97 should be skipped as unicode shadow.
+        assertEquals("A\u2014B", textOutput.toString());
+    }
+
+    @Test
+    public void testGroupStateRestored() throws Exception {
+        // Font change inside a group should be reverted when group closes
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\deff0" +
+                "{\\fonttbl" +
+                "{\\f0\\fcharset0 Times;}" +
+                "{\\f1\\fcharset161 Greek;}" +
+                "}" +
+                "\\f0 {\\f1 greek}{back to times}}";
+        RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(rtf));
+        RTFState state = new RTFState();
+
+        Charset charsetInsideGroup = null;
+        Charset charsetAfterGroup = null;
+        boolean seenGreekGroup = false;
+
+        for (RTFToken tok = tokenizer.yylex();
+                tok != null && tok.getType() != RTFTokenType.EOF;
+                tok = tokenizer.yylex()) {
+            state.processToken(tok);
+            if (tok.getType() != RTFTokenType.TEXT) {
+                continue;
+            }
+            char ch = tok.getChar();
+            if (ch == 'g' && !seenGreekGroup) {
+                charsetInsideGroup = state.getCurrentCharset();
+                seenGreekGroup = true;
+            } else if (ch == 'b') {
+                charsetAfterGroup = state.getCurrentCharset();
+            }
+        }
+
+        assertNotNull(charsetInsideGroup);
+        assertNotNull(charsetAfterGroup);
+        // Inside the {\f1 ...} group, charset should be Greek (cp1253)
+        assertEquals(Charset.forName("cp1253"), charsetInsideGroup);
+        // After the group closes, should be back to f0 (WINDOWS-1252)
+        assertEquals(RTFCharsetMaps.WINDOWS_1252, charsetAfterGroup);
+    }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizerTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizerTest.java
new file mode 100644
index 0000000000..b5d96178f4
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizerTest.java
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+public class RTFTokenizerTest {
+
+ /**
+  * Runs an {@link RTFTokenizer} over {@code input} and collects the tokens it emits.
+  * Collection stops when {@code yylex()} returns {@code null} or an EOF token; the
+  * EOF token itself is not added to the result.
+  *
+  * @param input RTF source text to tokenize
+  * @return independent copies of the emitted tokens, in emission order
+  */
+ private List<RTFToken> tokenize(String input) throws Exception {
+     RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(input));
+     List<RTFToken> tokens = new ArrayList<>();
+     RTFToken tok;
+     while ((tok = tokenizer.yylex()) != null) {
+         RTFTokenType type = tok.getType();
+         if (type == RTFTokenType.EOF) {
+             break;
+         }
+         // copy token since it's reused
+         RTFToken copy = new RTFToken();
+         if (type == RTFTokenType.TEXT || type == RTFTokenType.CONTROL_SYMBOL) {
+             // Character-carrying tokens are copied through the char setter.
+             copy.setChar(type, tok.getChar());
+         } else {
+             copy.set(type, tok.getName(), tok.getParameter(), tok.hasParameter());
+         }
+         tokens.add(copy);
+     }
+     return tokens;
+ }
+
+ /** Expects a bare pair of braces to produce a GROUP_OPEN then a GROUP_CLOSE token. */
+ @Test
+ public void testGroupOpenClose() throws Exception {
+     List<RTFToken> braces = tokenize("{}");
+     assertEquals(2, braces.size());
+
+     RTFToken opening = braces.get(0);
+     assertEquals(RTFTokenType.GROUP_OPEN, opening.getType());
+     RTFToken closing = braces.get(1);
+     assertEquals(RTFTokenType.GROUP_CLOSE, closing.getType());
+ }
+
+ /** Expects a control word with a numeric suffix to expose its name and parameter. */
+ @Test
+ public void testControlWord() throws Exception {
+     List<RTFToken> parsed = tokenize("\\rtf1");
+     assertEquals(1, parsed.size());
+
+     RTFToken word = parsed.get(0);
+     assertEquals(RTFTokenType.CONTROL_WORD, word.getType());
+     assertEquals("rtf", word.getName());
+     assertEquals(1, word.getParameter());
+     assertTrue(word.hasParameter());
+ }
+
+ /** Expects a control word without a numeric suffix to report that it has no parameter. */
+ @Test
+ public void testControlWordNoParam() throws Exception {
+     List<RTFToken> parsed = tokenize("\\ansi");
+     assertEquals(1, parsed.size());
+
+     RTFToken word = parsed.get(0);
+     assertEquals(RTFTokenType.CONTROL_WORD, word.getType());
+     assertEquals("ansi", word.getName());
+     assertFalse(word.hasParameter());
+ }
+
+ /**
+  * Expects a unicode escape written with a negative parameter to be reported as a
+  * UNICODE_ESCAPE token whose parameter is the value wrapped into 0..65535.
+  */
+ @Test
+ public void testControlWordNegativeParam() throws Exception {
+     List<RTFToken> parsed = tokenize("\\u-4321");
+     assertEquals(1, parsed.size());
+
+     RTFToken escape = parsed.get(0);
+     assertEquals(RTFTokenType.UNICODE_ESCAPE, escape.getType());
+     // -4321 → 65536 - 4321 = 61215
+     assertEquals(61215, escape.getParameter());
+ }
+
+ /** Expects a backslash-apostrophe hex escape to yield one HEX_ESCAPE token with its byte value. */
+ @Test
+ public void testHexEscape() throws Exception {
+     List<RTFToken> parsed = tokenize("\\'e9");
+     assertEquals(1, parsed.size());
+
+     RTFToken hex = parsed.get(0);
+     assertEquals(RTFTokenType.HEX_ESCAPE, hex.getType());
+     assertEquals(0xe9, hex.getHexValue());
+ }
+
+ /** Expects a positive unicode escape to yield one UNICODE_ESCAPE token carrying the code unit. */
+ @Test
+ public void testUnicodeEscape() throws Exception {
+     List<RTFToken> parsed = tokenize("\\u8212");
+     assertEquals(1, parsed.size());
+
+     RTFToken escape = parsed.get(0);
+     assertEquals(RTFTokenType.UNICODE_ESCAPE, escape.getType());
+     assertEquals(8212, escape.getParameter());
+ }
+
+ /** Expects the bin control word to be reported as a BIN token with its byte count as parameter. */
+ @Test
+ public void testBinControl() throws Exception {
+     List<RTFToken> parsed = tokenize("\\bin1024");
+     assertEquals(1, parsed.size());
+
+     RTFToken bin = parsed.get(0);
+     assertEquals(RTFTokenType.BIN, bin.getType());
+     assertEquals(1024, bin.getParameter());
+ }
+
+ /** Expects a backslash followed by a tilde to yield one CONTROL_SYMBOL token holding the tilde. */
+ @Test
+ public void testControlSymbol() throws Exception {
+     List<RTFToken> parsed = tokenize("\\~");
+     assertEquals(1, parsed.size());
+
+     RTFToken symbol = parsed.get(0);
+     assertEquals(RTFTokenType.CONTROL_SYMBOL, symbol.getType());
+     assertEquals('~', symbol.getChar());
+ }
+
+ /**
+  * Expects escaped open brace, close brace and backslash to each come back as a
+  * CONTROL_SYMBOL token carrying the escaped character, in input order.
+  */
+ @Test
+ public void testEscapedBraces() throws Exception {
+     List<RTFToken> parsed = tokenize("\\{\\}\\\\");
+     assertEquals(3, parsed.size());
+
+     // Escaped characters in the order they appear in the input.
+     char[] escaped = {'{', '}', '\\'};
+     for (int i = 0; i < escaped.length; i++) {
+         RTFToken symbol = parsed.get(i);
+         assertEquals(RTFTokenType.CONTROL_SYMBOL, symbol.getType());
+         assertEquals(escaped[i], symbol.getChar());
+     }
+ }
+
+ /**
+  * Expects plain text to be emitted as one TEXT token per character, and the
+  * characters to reassemble into the original string.
+  */
+ @Test
+ public void testText() throws Exception {
+     List<RTFToken> parsed = tokenize("Hello");
+     assertEquals(5, parsed.size()); // one char at a time
+
+     StringBuilder reassembled = new StringBuilder();
+     for (int i = 0; i < parsed.size(); i++) {
+         RTFToken current = parsed.get(i);
+         assertEquals(RTFTokenType.TEXT, current.getType());
+         reassembled.append(current.getChar());
+     }
+     assertEquals("Hello", reassembled.toString());
+ }
+
+ /** Expects a CR LF pair between two characters to be reported as a single CRLF token. */
+ @Test
+ public void testCrLf() throws Exception {
+     List<RTFToken> parsed = tokenize("a\r\nb");
+
+     RTFTokenType[] expectedTypes = {
+             RTFTokenType.TEXT, RTFTokenType.CRLF, RTFTokenType.TEXT
+     };
+     assertEquals(3, parsed.size());
+     for (int i = 0; i < expectedTypes.length; i++) {
+         assertEquals(expectedTypes[i], parsed.get(i).getType());
+     }
+ }
+
+ /**
+  * Checks tokenization of a group that starts with the ignorable-destination marker
+  * followed by an htmltag control word and literal markup text.
+  */
+ @Test
+ public void testIgnorableDestination() throws Exception {
+     // { \* \htmltag84_ < p > }
+     // The space after \htmltag84 is consumed as the control word delimiter
+     List<RTFToken> tokens = tokenize("{\\*\\htmltag84 <p>}");
+     // Check the count before indexing so a short token list fails as a readable
+     // assertion rather than as an IndexOutOfBoundsException from get().
+     assertEquals(7, tokens.size());
+
+     assertEquals(RTFTokenType.GROUP_OPEN, tokens.get(0).getType());
+     assertEquals(RTFTokenType.CONTROL_SYMBOL, tokens.get(1).getType());
+     assertEquals('*', tokens.get(1).getChar());
+     assertEquals(RTFTokenType.CONTROL_WORD, tokens.get(2).getType());
+     assertEquals("htmltag", tokens.get(2).getName());
+     assertEquals(84, tokens.get(2).getParameter());
+     // remaining tokens are < p > }
+     assertEquals(RTFTokenType.TEXT, tokens.get(3).getType());
+     assertEquals('<', tokens.get(3).getChar());
+     assertEquals(RTFTokenType.TEXT, tokens.get(4).getType());
+     assertEquals('p', tokens.get(4).getChar());
+     assertEquals(RTFTokenType.TEXT, tokens.get(5).getType());
+     assertEquals('>', tokens.get(5).getChar());
+     assertEquals(RTFTokenType.GROUP_CLOSE, tokens.get(6).getType());
+ }
+
+ /**
+  * Checks a small but complete RTF group end to end: the three header control words,
+  * the body text emitted one character at a time, and the closing brace.
+  */
+ @Test
+ public void testMixedRtf() throws Exception {
+     String rtf = "{\\rtf1\\ansi\\ansicpg1252 Hello}";
+     List<RTFToken> tokens = tokenize(rtf);
+     // { \rtf1 \ansi \ansicpg1252 H e l l o }
+     // The space after \ansicpg1252 is the control word delimiter and is not emitted
+     // as a token, so the group is 1 + 3 + 5 + 1 = 10 tokens long.
+     assertEquals(10, tokens.size());
+
+     assertEquals(RTFTokenType.GROUP_OPEN, tokens.get(0).getType());
+     assertEquals(RTFTokenType.CONTROL_WORD, tokens.get(1).getType());
+     assertEquals("rtf", tokens.get(1).getName());
+     assertEquals(1, tokens.get(1).getParameter());
+     assertEquals(RTFTokenType.CONTROL_WORD, tokens.get(2).getType());
+     assertEquals("ansi", tokens.get(2).getName());
+     assertEquals(RTFTokenType.CONTROL_WORD, tokens.get(3).getType());
+     assertEquals("ansicpg", tokens.get(3).getName());
+     assertEquals(1252, tokens.get(3).getParameter());
+
+     // Body text follows the header directly, one TEXT token per character.
+     String body = "Hello";
+     int firstTextIndex = 4;
+     for (int i = 0; i < body.length(); i++) {
+         RTFToken textToken = tokens.get(firstTextIndex + i);
+         assertEquals(RTFTokenType.TEXT, textToken.getType());
+         assertEquals(body.charAt(i), textToken.getChar());
+     }
+     assertEquals(RTFTokenType.GROUP_CLOSE, tokens.get(9).getType());
+ }
+}