(tika) branch main updated: TIKA-4710-rtf-attachments-in-html-decapsulation (#2744)

tallison Mon, 06 Apr 2026 17:16:43 -0700

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git



The following commit(s) were added to refs/heads/main by this push:
     new 4a65796943 TIKA-4710-rtf-attachments-in-html-decapsulation (#2744)
4a65796943 is described below

commit 4a657969439a4fb382a95fde98d461f991875e52
Author: Tim Allison <[email protected]>
AuthorDate: Mon Apr 6 20:14:34 2026 -0400

    TIKA-4710-rtf-attachments-in-html-decapsulation (#2744)
---
 .../tika-parser-microsoft-module/pom.xml           |  24 +
 .../tika/parser/microsoft/OfficeParserConfig.java  |  20 +
 .../tika/parser/microsoft/OutlookExtractor.java    |   9 +-
 .../parser/microsoft/rtf/jflex/RTFCharsetMaps.java | 180 ++++++++
 .../microsoft/rtf/jflex/RTFEmbeddedHandler.java    | 254 ++++++++++
 .../parser/microsoft/rtf/jflex/RTFGroupState.java  |  76 +++
 .../microsoft/rtf/jflex/RTFHtmlDecapsulator.java   | 239 ++++++++++
 .../rtf/jflex/RTFObjDataStreamParser.java          | 510 +++++++++++++++++++++
 .../microsoft/rtf/jflex/RTFPictStreamParser.java   | 101 ++++
 .../tika/parser/microsoft/rtf/jflex/RTFState.java  | 342 ++++++++++++++
 .../tika/parser/microsoft/rtf/jflex/RTFToken.java  | 116 +++++
 .../parser/microsoft/rtf/jflex/RTFTokenType.java   |  30 ++
 .../parser/microsoft/rtf/jflex/RTFTokenizer.jflex  | 153 +++++++
 .../rtf/jflex/RTFEmbeddedHandlerTest.java          | 132 ++++++
 .../rtf/jflex/RTFHtmlDecapsulatorTest.java         | 260 +++++++++++
 .../parser/microsoft/rtf/jflex/RTFStateTest.java   | 250 ++++++++++
 .../microsoft/rtf/jflex/RTFTokenizerTest.java      | 191 ++++++++
 17 files changed, 2884 insertions(+), 3 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
index 63cc9605cd..a3c7e1f1c8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
@@ -125,6 +125,30 @@
   </dependencies>
   <build>
     <plugins>
+      <plugin>
+        <groupId>de.jflex</groupId>
+        <artifactId>jflex-maven-plugin</artifactId>
+        <version>1.9.1</version>
+        <executions>
+          <execution>
+            <goals>
+              <goal>generate</goal>
+            </goals>
+            <configuration>
+              <lexDefinitions>
+                <lexDefinition>src/main/jflex</lexDefinition>
+              </lexDefinitions>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-checkstyle-plugin</artifactId>
+        <configuration>
+          <excludes>**/rtf/jflex/RTFTokenizer.java</excludes>
+        </configuration>
+      </plugin>
       <plugin>
         <groupId>org.apache.rat</groupId>
         <artifactId>apache-rat-plugin</artifactId>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index 9f21b0b798..363b0a0773 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -39,6 +39,13 @@ public class OfficeParserConfig implements Serializable {
 
     private boolean writeSelectHeadersInBody = false;
 
+    /**
+     * Maximum size, in kilobytes, of each embedded object/pict extracted from RTF
+     * within MSG files.  Since embedded data is streamed to disk (not held in memory),
+     * the default is 2 * 1024 * 1024 KB (2 GB).  Set to -1 for unlimited.
+     */
+    private int rtfEmbeddedMaxBytesInKb = 2 * 1024 * 1024; // 2 GB
+
     private boolean includeGlossary = true;
     private String dateOverrideFormat = null;
     private int maxOverride = 0;//ignore
@@ -319,4 +326,17 @@ public class OfficeParserConfig implements Serializable {
     public void setWriteSelectHeadersInBody(boolean writeSelectHeadersInBody) {
         this.writeSelectHeadersInBody = writeSelectHeadersInBody;
     }
+
+    /**
+     * Maximum size, in kilobytes, of each embedded object/pict extracted from RTF
+     * within MSG files.  Data is streamed to disk, so the default is 2 GB
+     * (2 * 1024 * 1024 KB).  Set to -1 for unlimited.
+     */
+    public int getRtfEmbeddedMaxBytesInKb() {
+        return rtfEmbeddedMaxBytesInKb;
+    }
+
+    public void setRtfEmbeddedMaxBytesInKb(int rtfEmbeddedMaxBytesInKb) {
+        this.rtfEmbeddedMaxBytesInKb = rtfEmbeddedMaxBytesInKb;
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index a2ef6de04f..01b357bddf 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -84,8 +84,8 @@ import org.apache.tika.parser.html.HtmlEncodingDetector;
 import org.apache.tika.parser.html.JSoupParser;
 import org.apache.tika.parser.mailcommons.MailDateParser;
 import org.apache.tika.parser.microsoft.msg.ExtendedMetadataExtractor;
-import org.apache.tika.parser.microsoft.msg.RTFEncapsulatedHTMLExtractor;
 import org.apache.tika.parser.microsoft.rtf.RTFParser;
+import org.apache.tika.parser.microsoft.rtf.jflex.RTFHtmlDecapsulator;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
 import org.apache.tika.sax.BodyContentHandler;
@@ -600,8 +600,11 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
                         new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, 
Types.BINARY.getId(),
                                 chunk.getValue());
                 byte[] rtfData = rtf.getData();
-                // Try to extract encapsulated HTML — returns null if not 
present
-                String html = RTFEncapsulatedHTMLExtractor.extract(rtfData);
+                // Try to extract encapsulated HTML + embedded objects in one 
pass
+                RTFHtmlDecapsulator decapsulator =
+                        new RTFHtmlDecapsulator(xhtml, parseContext,
+                                
officeParserConfig.getRtfEmbeddedMaxBytesInKb());
+                String html = decapsulator.extract(rtfData);
                 if (html != null) {
                     parseHtmlString(html, xhtml, contentIdNames);
                     parentMetadata.add(MAPI.BODY_TYPES_PROCESSED,
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFCharsetMaps.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFCharsetMaps.java
new file mode 100644
index 0000000000..aaac2552ac
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFCharsetMaps.java
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.tika.utils.CharsetUtils;
+
+/**
+ * Shared charset maps for RTF parsing. Maps RTF {@code \fcharsetN} and
+ * {@code \ansicpgN} values to Java {@link Charset} instances.
+ *
+ * <p>Extracted from the original {@code TextExtractor} so both the JFlex-based
+ * parser and decapsulator can reuse them.</p>
+ *
+ * <p>Both maps are populated once in static initializers and published as
+ * unmodifiable views. Every entry is resolved through {@link #getCharset(String)},
+ * so a charset name that cannot be resolved is stored as US-ASCII rather than
+ * being left out of the map.</p>
+ */
+public final class RTFCharsetMaps {
+
+    /** Last-resort charset returned by {@link #resolveCodePage(int)}; also used for {@code \fcharset0}. */
+    public static final Charset WINDOWS_1252 = Charset.forName("windows-1252");
+
+    /**
+     * Maps {@code \fcharsetN} values to Java charsets.
+     * The RTF font table uses these to declare per-font character encodings.
+     */
+    public static final Map<Integer, Charset> FCHARSET_MAP;
+
+    /**
+     * Maps {@code \ansicpgN} values to Java charsets.
+     * This is the global ANSI code page declared in the RTF header.
+     */
+    public static final Map<Integer, Charset> ANSICPG_MAP;
+
+    // Build FCHARSET_MAP. Values 1 (Default) and 2 (Symbol) are deliberately absent.
+    static {
+        Map<Integer, Charset> fcharset = new HashMap<>();
+
+        fcharset.put(0, WINDOWS_1252);                   // ANSI
+        // charset 1 = Default, charset 2 = Symbol
+
+        fcharset.put(77, getCharset("MacRoman"));        // Mac Roman
+        fcharset.put(78, getCharset("Shift_JIS"));       // Mac Shift Jis
+        fcharset.put(79, getCharset("ms949"));            // Mac Hangul
+        fcharset.put(80, getCharset("GB2312"));           // Mac GB2312
+        fcharset.put(81, getCharset("Big5"));             // Mac Big5
+        fcharset.put(82, getCharset("johab"));            // Mac Johab (old)
+        fcharset.put(83, getCharset("MacHebrew"));        // Mac Hebrew
+        fcharset.put(84, getCharset("MacArabic"));        // Mac Arabic
+        fcharset.put(85, getCharset("MacGreek"));         // Mac Greek
+        fcharset.put(86, getCharset("MacTurkish"));       // Mac Turkish
+        fcharset.put(87, getCharset("MacThai"));          // Mac Thai
+        fcharset.put(88, getCharset("cp1250"));           // Mac East Europe
+        fcharset.put(89, getCharset("cp1251"));           // Mac Russian
+
+        fcharset.put(128, getCharset("MS932"));           // Shift JIS
+        fcharset.put(129, getCharset("ms949"));           // Hangul
+        fcharset.put(130, getCharset("ms1361"));          // Johab
+        fcharset.put(134, getCharset("ms936"));           // GB2312
+        fcharset.put(136, getCharset("ms950"));           // Big5
+        fcharset.put(161, getCharset("cp1253"));          // Greek
+        fcharset.put(162, getCharset("cp1254"));          // Turkish
+        fcharset.put(163, getCharset("cp1258"));          // Vietnamese
+        fcharset.put(177, getCharset("cp1255"));          // Hebrew
+        fcharset.put(178, getCharset("cp1256"));          // Arabic
+        fcharset.put(186, getCharset("cp1257"));          // Baltic
+
+        fcharset.put(204, getCharset("cp1251"));          // Russian
+        fcharset.put(222, getCharset("ms874"));           // Thai
+        fcharset.put(238, getCharset("cp1250"));          // Eastern European
+        fcharset.put(254, getCharset("cp437"));           // PC 437
+        fcharset.put(255, getCharset("cp850"));           // OEM
+
+        FCHARSET_MAP = Collections.unmodifiableMap(fcharset);
+    }
+
+    // Build ANSICPG_MAP, keyed by the numeric code page from \ansicpgN.
+    static {
+        Map<Integer, Charset> ansicpg = new HashMap<>();
+
+        ansicpg.put(437, getCharset("CP437"));            // US IBM
+        ansicpg.put(708, getCharset("ISO-8859-6"));       // Arabic (ASMO 708)
+        ansicpg.put(709, getCharset("windows-709"));      // Arabic (ASMO 449+)
+        ansicpg.put(710, getCharset("windows-710"));      // Arabic (transparent)
+        ansicpg.put(711, getCharset("windows-711"));      // Arabic (Nafitha)
+        ansicpg.put(720, getCharset("windows-720"));      // Arabic (transparent ASMO)
+        ansicpg.put(819, getCharset("CP819"));            // Windows 3.1 (US/Western)
+        ansicpg.put(850, getCharset("CP850"));            // IBM Multilingual
+        ansicpg.put(852, getCharset("CP852"));            // Eastern European
+        ansicpg.put(860, getCharset("CP860"));            // Portuguese
+        ansicpg.put(862, getCharset("CP862"));            // Hebrew
+        ansicpg.put(863, getCharset("CP863"));            // French Canadian
+        ansicpg.put(864, getCharset("CP864"));            // Arabic
+        ansicpg.put(865, getCharset("CP865"));            // Norwegian
+        ansicpg.put(866, getCharset("CP866"));            // Soviet Union
+        ansicpg.put(874, getCharset("MS874"));            // Thai
+        ansicpg.put(932, getCharset("MS932"));            // Japanese
+        ansicpg.put(936, getCharset("MS936"));            // Simplified Chinese
+        ansicpg.put(949, getCharset("CP949"));            // Korean
+        ansicpg.put(950, getCharset("CP950"));            // Traditional Chinese
+        ansicpg.put(1250, getCharset("CP1250"));          // Eastern European
+        ansicpg.put(1251, getCharset("CP1251"));          // Cyrillic
+        ansicpg.put(1252, getCharset("CP1252"));          // Western European
+        ansicpg.put(1253, getCharset("CP1253"));          // Greek
+        ansicpg.put(1254, getCharset("CP1254"));          // Turkish
+        ansicpg.put(1255, getCharset("CP1255"));          // Hebrew
+        ansicpg.put(1256, getCharset("CP1256"));          // Arabic
+        ansicpg.put(1257, getCharset("CP1257"));          // Baltic
+        ansicpg.put(1258, getCharset("CP1258"));          // Vietnamese
+        ansicpg.put(1361, getCharset("x-Johab"));         // Johab
+        ansicpg.put(10000, getCharset("MacRoman"));       // Mac Roman
+        ansicpg.put(10001, getCharset("Shift_JIS"));      // Mac Japan
+        ansicpg.put(10004, getCharset("MacArabic"));      // Mac Arabic
+        ansicpg.put(10005, getCharset("MacHebrew"));      // Mac Hebrew
+        ansicpg.put(10006, getCharset("MacGreek"));       // Mac Greek
+        ansicpg.put(10007, getCharset("MacCyrillic"));    // Mac Cyrillic
+        ansicpg.put(10029, getCharset("x-MacCentralEurope")); // Mac Latin2
+        ansicpg.put(10081, getCharset("MacTurkish"));     // Mac Turkish
+        ansicpg.put(57002, getCharset("x-ISCII91"));      // Devanagari
+        ansicpg.put(57003, getCharset("windows-57003"));  // Bengali
+        ansicpg.put(57004, getCharset("windows-57004"));  // Tamil
+        ansicpg.put(57005, getCharset("windows-57005"));  // Telugu
+        ansicpg.put(57006, getCharset("windows-57006"));  // Assamese
+        ansicpg.put(57007, getCharset("windows-57007"));  // Oriya
+        ansicpg.put(57008, getCharset("windows-57008"));  // Kannada
+        ansicpg.put(57009, getCharset("windows-57009"));  // Malayalam
+        ansicpg.put(57010, getCharset("windows-57010"));  // Gujarati
+        ansicpg.put(57011, getCharset("windows-57011"));  // Punjabi
+
+        ANSICPG_MAP = Collections.unmodifiableMap(ansicpg);
+    }
+
+    // Static holder only; never instantiated.
+    private RTFCharsetMaps() {
+    }
+
+    /**
+     * Resolve a charset by name, falling back to US-ASCII if unavailable.
+     *
+     * @param name charset name handed to {@code CharsetUtils.forName}
+     * @return the resolved charset, or {@link StandardCharsets#US_ASCII} when the
+     *         lookup throws an {@link IllegalArgumentException}
+     */
+    static Charset getCharset(String name) {
+        try {
+            return CharsetUtils.forName(name);
+        } catch (IllegalArgumentException e) {
+            return StandardCharsets.US_ASCII;
+        }
+    }
+
+    /**
+     * Resolve an ANSI code page number to a Java Charset.
+     * Tries the ANSICPG_MAP first, then falls back to {@code windows-N} and {@code cpN}.
+     * Returns {@code WINDOWS_1252} if nothing matches.
+     *
+     * <p>A code page that is present in ANSICPG_MAP is returned as stored there,
+     * which may be the US-ASCII placeholder from {@link #getCharset(String)};
+     * the {@code windows-N}/{@code cpN} probes only run for unmapped numbers.</p>
+     *
+     * @param cpNumber numeric code page, e.g. the argument of {@code \ansicpgN}
+     * @return a charset for the code page; never {@code null}
+     */
+    public static Charset resolveCodePage(int cpNumber) {
+        Charset cs = ANSICPG_MAP.get(cpNumber);
+        if (cs != null) {
+            return cs;
+        }
+        try {
+            return Charset.forName("windows-" + cpNumber);
+        } catch (Exception e) {
+            try {
+                return Charset.forName("cp" + cpNumber);
+            } catch (Exception e2) {
+                return WINDOWS_1252;
+            }
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandler.java
new file mode 100644
index 0000000000..dda6d98dff
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandler.java
@@ -0,0 +1,254 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import java.io.IOException;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.io.FilenameUtils;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.RTFMetadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+
+/**
+ * Handles embedded objects and pictures within the JFlex-based RTF token stream.
+ *
+ * <p>Uses streaming parsers ({@link RTFObjDataStreamParser} and
+ * {@link RTFPictStreamParser}) so that large embedded objects are written
+ * to temp files rather than buffered entirely in memory.</p>
+ *
+ * <p>An instance keeps mutable per-document state (the active stream parser, a
+ * pending hex nibble, shape-property buffers and counters), so it should be fed
+ * the tokens of one document in order.</p>
+ */
+public class RTFEmbeddedHandler {
+
+    private final ContentHandler handler;
+    private final ParseContext context;
+    private final EmbeddedDocumentUtil embeddedDocumentUtil;
+    /** Limit handed to the stream parsers, in bytes; -1 when the configured KB value is not positive. */
+    private final long maxBytes;
+
+    /** Set by {@code \object}; cleared when a group flagged {@code object} closes. */
+    private boolean inObject;
+    /** Set by {@code \wbitmap}; cleared by {@link #reset()}. */
+    private boolean isPictBitmap;
+    /** High nibble (already multiplied by 16) of a half-read hex pair, or -1 if none is pending. */
+    private int hi = -1;
+    private int thumbCount;
+    private final AtomicInteger unknownFilenameCount = new AtomicInteger();
+
+    // Most recent shape-property name/value ({\sp{\sn ...}{\sv ...}}) and their scratch buffer
+    private String sn = "";
+    private String sv = "";
+    private final StringBuilder metadataBuffer = new StringBuilder();
+
+    private Metadata metadata;
+
+    // Streaming parsers -- one active at a time
+    private RTFObjDataStreamParser objParser;
+    private RTFPictStreamParser pictParser;
+
+    /**
+     * @param handler      receives the SAX events of parsed embedded documents
+     * @param context      parse context used to create metadata and the embedded-document helper
+     * @param maxBytesInKb per-object limit in kilobytes; values of 0 or less are
+     *                     passed to the stream parsers as -1
+     */
+    public RTFEmbeddedHandler(ContentHandler handler, ParseContext context,
+                              int maxBytesInKb) {
+        this.handler = handler;
+        this.context = context;
+        this.embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
+        // widen before multiplying so large KB values cannot overflow int
+        this.maxBytes = maxBytesInKb > 0 ? (long) maxBytesInKb * 1024 : -1;
+        this.metadata = Metadata.newInstance(context);
+    }
+
+    /**
+     * Process a token for embedded object/pict handling.
+     * Call this AFTER {@link RTFState#processToken(RTFToken)} has run.
+     *
+     * @param tok          the token just consumed
+     * @param rtfState     shared state; its current group decides where text is routed
+     * @param closingGroup the group that was closed by {@code tok}; only read for
+     *                     {@code GROUP_CLOSE} tokens, where it must not be {@code null}
+     */
+    public void processToken(RTFToken tok, RTFState rtfState, RTFGroupState closingGroup)
+            throws IOException, SAXException, TikaException {
+        RTFGroupState group = rtfState.getCurrentGroup();
+
+        switch (tok.getType()) {
+            case GROUP_CLOSE:
+                if (closingGroup.objdata) {
+                    handleCompletedObjData();
+                } else if (closingGroup.pictDepth == 1) {
+                    handleCompletedPict();
+                } else if (closingGroup.sn) {
+                    sn = metadataBuffer.toString();
+                } else if (closingGroup.sv) {
+                    sv = metadataBuffer.toString();
+                } else if (closingGroup.sp) {
+                    metadata.add(sn, sv);
+                }
+                if (closingGroup.object) {
+                    inObject = false;
+                }
+                break;
+
+            case CONTROL_WORD:
+                switch (tok.getName()) {
+                    case "object":
+                        inObject = true;
+                        break;
+                    case "objdata":
+                        // Malformed input can start a second \objdata before the first
+                        // group closes; release the earlier parser instead of orphaning it.
+                        discardObjParser();
+                        metadata = Metadata.newInstance(context);
+                        hi = -1; // never carry a half-read hex pair into a new object
+                        objParser = new RTFObjDataStreamParser(maxBytes);
+                        break;
+                    case "pict":
+                        metadata = Metadata.newInstance(context);
+                        hi = -1; // never carry a half-read hex pair into a new picture
+                        pictParser = new RTFPictStreamParser(maxBytes);
+                        break;
+                    case "sn":
+                        metadataBuffer.setLength(0);
+                        metadataBuffer.append(RTFMetadata.RTF_PICT_META_PREFIX);
+                        break;
+                    case "sv":
+                        metadataBuffer.setLength(0);
+                        break;
+                    case "wbitmap":
+                        isPictBitmap = true;
+                        break;
+                }
+                break;
+
+            case TEXT:
+                if (group.objdata || group.pictDepth == 1) {
+                    writeHexChar(tok.getChar());
+                } else if (group.sn || group.sv) {
+                    metadataBuffer.append(tok.getChar());
+                }
+                break;
+
+            case HEX_ESCAPE:
+                if (group.sn || group.sv) {
+                    metadataBuffer.append((char) tok.getHexValue());
+                }
+                break;
+
+            default:
+                break;
+        }
+    }
+
+    /**
+     * Finish the current {@code \objdata} payload: hand the decoded stream to the
+     * embedded-document machinery, then always release the parser and reset
+     * per-object state. IOExceptions from completion/extraction are recorded on
+     * the metadata rather than propagated.
+     */
+    private void handleCompletedObjData() throws IOException, SAXException, TikaException {
+        if (objParser == null) {
+            // group was flagged objdata but no parser was started; nothing to extract
+            reset();
+            return;
+        }
+        try (TikaInputStream tis = objParser.onComplete(metadata, unknownFilenameCount)) {
+            if (tis != null) {
+                extractObj(tis, metadata);
+            }
+        } catch (IOException e) {
+            EmbeddedDocumentUtil.recordException(e, metadata);
+        } finally {
+            try {
+                discardObjParser();
+            } finally {
+                // must run even if closing the parser fails
+                reset();
+            }
+        }
+    }
+
+    /**
+     * Finish the current {@code \pict} payload: derive name metadata from the
+     * {@code wzDescription} shape property when present, mark thumbnails and raw
+     * bitmaps, then extract the picture. IOExceptions are recorded on the
+     * metadata rather than propagated.
+     */
+    private void handleCompletedPict() throws IOException, SAXException, TikaException {
+        if (pictParser == null) {
+            // group was flagged as a pict but no parser was started; nothing to extract
+            reset();
+            return;
+        }
+        try {
+            String filePath =
+                    metadata.get(RTFMetadata.RTF_PICT_META_PREFIX + "wzDescription");
+            if (filePath != null && !filePath.isEmpty()) {
+                metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, filePath);
+                metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
+                        FilenameUtils.getName(filePath));
+                metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, filePath);
+            }
+            // a picture inside an \object group is treated as that object's thumbnail
+            metadata.set(RTFMetadata.THUMBNAIL, Boolean.toString(inObject));
+            if (isPictBitmap) {
+                metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
+                        "image/x-rtf-raw-bitmap");
+            }
+            try (TikaInputStream tis = pictParser.onComplete(metadata)) {
+                if (tis != null) {
+                    extractObj(tis, metadata);
+                }
+            }
+        } catch (IOException e) {
+            EmbeddedDocumentUtil.recordException(e, metadata);
+        } finally {
+            pictParser = null;
+            reset();
+        }
+    }
+
+    /** Close and forget the active objdata parser, if any. */
+    private void discardObjParser() throws IOException, SAXException, TikaException {
+        if (objParser != null) {
+            try {
+                objParser.close();
+            } finally {
+                objParser = null;
+            }
+        }
+    }
+
+    /**
+     * Feed one character of hex-encoded payload. Non-hex characters are ignored;
+     * every second hex digit completes a byte that is forwarded to the active
+     * parser (objdata takes precedence over pict).
+     */
+    private void writeHexChar(int b) throws IOException, TikaException {
+        if (isHexChar(b)) {
+            if (hi == -1) {
+                hi = 16 * hexValue(b);
+            } else {
+                int decoded = hi + hexValue(b);
+                hi = -1;
+                if (objParser != null) {
+                    objParser.onByte(decoded);
+                } else if (pictParser != null) {
+                    pictParser.onByte(decoded);
+                }
+            }
+        }
+    }
+
+    /**
+     * Record the stream length, synthesize a resource name when none was found,
+     * and parse the embedded stream if the embedded-document policy allows it.
+     */
+    private void extractObj(TikaInputStream tis, Metadata meta)
+            throws SAXException, IOException, TikaException {
+        meta.set(Metadata.CONTENT_LENGTH, Long.toString(tis.getLength()));
+
+        if (embeddedDocumentUtil.shouldParseEmbedded(meta)) {
+            if (meta.get(TikaCoreProperties.RESOURCE_NAME_KEY) == null) {
+                String extension = embeddedDocumentUtil.getExtension(tis, meta);
+                if (inObject && pictParser != null) {
+                    meta.set(TikaCoreProperties.RESOURCE_NAME_KEY,
+                            EmbeddedDocumentUtil.EmbeddedResourcePrefix.THUMBNAIL.getPrefix()
+                                    + "-" + thumbCount++ + extension);
+                    meta.set(RTFMetadata.THUMBNAIL, "true");
+                } else {
+                    meta.set(TikaCoreProperties.RESOURCE_NAME_KEY,
+                            EmbeddedDocumentUtil.EmbeddedResourcePrefix.EMBEDDED.getPrefix()
+                                    + "-" + unknownFilenameCount.getAndIncrement()
+                                    + extension);
+                }
+                meta.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
+            }
+            try {
+                embeddedDocumentUtil.parseEmbedded(
+                        tis, new EmbeddedContentHandler(handler), meta, true);
+            } catch (IOException e) {
+                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, meta);
+            }
+        }
+    }
+
+    /** Clear per-object state so the next object/pict starts from a clean slate. */
+    private void reset() {
+        metadata = Metadata.newInstance(context);
+        hi = -1;
+        sn = "";
+        sv = "";
+        metadataBuffer.setLength(0);
+        isPictBitmap = false;
+    }
+
+    private static boolean isHexChar(int ch) {
+        return (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f')
+                || (ch >= 'A' && ch <= 'F');
+    }
+
+    /** Numeric value of a hex digit; callers must check {@link #isHexChar(int)} first. */
+    private static int hexValue(int ch) {
+        if (ch >= '0' && ch <= '9') {
+            return ch - '0';
+        } else if (ch >= 'a' && ch <= 'f') {
+            return 10 + (ch - 'a');
+        } else {
+            return 10 + (ch - 'A');
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFGroupState.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFGroupState.java
new file mode 100644
index 0000000000..c5f9f8c444
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFGroupState.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import java.nio.charset.Charset;
+
+/**
+ * State associated with a single RTF group ({@code \{ ... \}}).
+ * <p>
+ * When a new group opens, the current state is pushed onto the stack and a
+ * child state is created that inherits the parent's properties. When the group
+ * closes, the state is popped.
+ * <p>
+ * Fields are package-private and mutable; this class only holds them and
+ * implements the parent-to-child inheritance rules in its copy constructor.
+ */
+public class RTFGroupState {
+
+    /** Nesting depth (0 = root). */
+    int depth;
+
+    /**
+     * Current font charset, set by {@code \fN} if the font table maps it.
+     * May be null.
+     */
+    Charset fontCharset;
+
+    /** Current font ID, set by {@code \fN}. -1 if unset. */
+    int fontId = -1;
+
+    /**
+     * Number of ANSI chars to skip after a unicode escape (ucN control word).
+     * Default 1.
+     */
+    int ucSkip = 1;
+
+    /**
+     * True if this group's content should be ignored
+     * (e.g. {@code \*} destination).
+     */
+    boolean ignore;
+
+    /** True if bold. */
+    boolean bold;
+
+    /** True if italic. */
+    boolean italic;
+
+    // Embedded object / picture state
+    // NOTE(review): these flags are not assigned in this class; presumably RTFState
+    // sets them when it sees the matching control word -- confirm there.
+
+    /** Marks the group holding {@code \objdata} hex payload. Not inherited. */
+    boolean objdata;
+    /**
+     * 0 outside a picture; otherwise the nesting level inside the picture group.
+     * A child of a group with a positive value gets parent + 1, so 1 identifies
+     * the picture group itself.
+     */
+    int pictDepth;
+    /** Marks a shape-property ({@code \sp}) group. Not inherited. */
+    boolean sp;
+    /** Marks a shape-property name ({@code \sn}) group. Not inherited. */
+    boolean sn;
+    /** Marks a shape-property value ({@code \sv}) group. Not inherited. */
+    boolean sv;
+    /** Marks an {@code \object} group. Not inherited. */
+    boolean object;
+    /** Not inherited. NOTE(review): presumably marks an annotation destination -- confirm usage. */
+    boolean annotation;
+
+    /** Create a root group state with defaults. */
+    public RTFGroupState() {
+    }
+
+    /**
+     * Create a child group state inheriting from the parent.
+     *
+     * @param parent the enclosing group's state; must not be null
+     */
+    public RTFGroupState(RTFGroupState parent) {
+        this.depth = parent.depth + 1;
+        this.fontCharset = parent.fontCharset;
+        this.fontId = parent.fontId;
+        this.ucSkip = parent.ucSkip;
+        this.ignore = parent.ignore;
+        this.bold = parent.bold;
+        this.italic = parent.italic;
+        // only count depth while inside a picture; stays 0 everywhere else
+        this.pictDepth = parent.pictDepth > 0 ? parent.pictDepth + 1 : 0;
+        // objdata, sp, sn, sv, object, annotation are NOT inherited
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulator.java
new file mode 100644
index 0000000000..7ef06f1ebe
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulator.java
@@ -0,0 +1,239 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Extracts the original HTML from an RTF document that contains encapsulated HTML
+ * (as indicated by the {@code \fromhtml1} control word), using a JFlex-based tokenizer
+ * and shared {@link RTFState} for font/codepage tracking.
+ *
+ * <p>Embedded objects and pictures are extracted in the same pass via
+ * {@link RTFEmbeddedHandler}.</p>
+ */
+public class RTFHtmlDecapsulator {
+
+    private static final int DEFAULT_MAX_BYTES_KB = 2 * 1024 * 1024; // 2 GB
+
+    private final RTFEmbeddedHandler embHandler;
+
+    /**
+     * Creates a decapsulator whose embedded objects and pictures are routed to a new
+     * {@link RTFEmbeddedHandler}.
+     *
+     * @param handler      content handler passed through to the {@link RTFEmbeddedHandler}
+     * @param context      parse context passed through to the {@link RTFEmbeddedHandler}
+     * @param maxBytesInKb size limit handed to the {@link RTFEmbeddedHandler}
+     *                     (kilobytes, going by the parameter name)
+     */
+    public RTFHtmlDecapsulator(ContentHandler handler, ParseContext context,
+                               int maxBytesInKb) {
+        this.embHandler = new RTFEmbeddedHandler(handler, context, 
maxBytesInKb);
+    }
+
+    /**
+     * Creates a decapsulator that uses {@code DEFAULT_MAX_BYTES_KB} as the size limit
+     * for the embedded handler.
+     *
+     * @param handler content handler passed through to the embedded handler
+     * @param context parse context passed through to the embedded handler
+     */
+    public RTFHtmlDecapsulator(ContentHandler handler, ParseContext context) {
+        this(handler, context, DEFAULT_MAX_BYTES_KB);
+    }
+
+    /**
+     * Tokenizes {@code rtfBytes} in a single pass and rebuilds the encapsulated HTML.
+     *
+     * <p>Every token is first offered to the shared {@link RTFState}; tokens the state
+     * does not consume are then offered to the {@link RTFEmbeddedHandler}. HTML output
+     * only starts once a {@code fromhtml} control word and at least one
+     * {@code htmltag} destination have been seen, and is suppressed while an
+     * {@code htmlrtf} run is active. Hex-escaped bytes are buffered and decoded with
+     * the state's current charset whenever something other than a hex escape has to
+     * be emitted or the charset may change.</p>
+     *
+     * @param rtfBytes the raw RTF document; may be null or empty
+     * @return the decapsulated HTML, or {@code null} if the input is null/empty, no
+     *         {@code fromhtml} control word was found, or no HTML text was produced
+     * @throws IOException   declared by the tokenizer / state processing
+     * @throws SAXException  declared for callers; not thrown directly in this method body
+     * @throws TikaException declared for callers; exceptions from the embedded handler
+     *                       are caught and ignored here
+     */
+    public String extract(byte[] rtfBytes) throws IOException, SAXException, 
TikaException {
+        if (rtfBytes == null || rtfBytes.length == 0) {
+            return null;
+        }
+        // Wrap byte[] in a Reader directly — RTF is 7-bit ASCII, so
+        // US_ASCII decoding is a 1:1 byte-to-char mapping with no
+        // intermediate String allocation.
+        Reader reader = new InputStreamReader(
+                new ByteArrayInputStream(rtfBytes), StandardCharsets.US_ASCII);
+        RTFTokenizer tokenizer = new RTFTokenizer(reader);
+        RTFState state = new RTFState();
+        StringBuilder html = new StringBuilder(rtfBytes.length / 2);
+        // Hex-escaped bytes waiting to be decoded with the current charset.
+        ByteArrayOutputStream pendingBytes = new ByteArrayOutputStream();
+
+        boolean foundFromHtml = false;   // a fromhtml control word was seen
+        boolean foundHtmlTag = false;    // at least one htmltag destination was seen
+        boolean inHtmlRtfSkip = false;   // inside an htmlrtf ... htmlrtf0 run
+        boolean sawIgnorable = false;    // a \* symbol was seen since the last group open
+        int htmlTagDepth = -1;           // depth recorded at the current htmltag, -1 if none
+        boolean inHtmlTag = false;       // currently inside an htmltag group
+
+        RTFToken tok;
+        while ((tok = tokenizer.yylex()) != null) {
+            RTFTokenType type = tok.getType();
+            if (type == RTFTokenType.EOF) {
+                break;
+            }
+
+            // Flush pending bytes before charset-changing events
+            if (type == RTFTokenType.GROUP_CLOSE
+                    || (type == RTFTokenType.CONTROL_WORD && 
"f".equals(tok.getName())
+                        && tok.hasParameter())) {
+                flushPendingBytes(pendingBytes, html, state);
+            }
+
+            boolean consumed = state.processToken(tok);
+
+            // Embedded handler processes objdata/pict/sp in the same pass
+            if (!consumed) {
+                RTFGroupState closingGroup =
+                        (type == RTFTokenType.GROUP_CLOSE) ? 
state.getLastClosedGroup() : null;
+                try {
+                    embHandler.processToken(tok, state, closingGroup);
+                } catch (TikaException | IOException e) {
+                    // don't let a bad embedded object kill decapsulation
+                }
+            }
+
+            RTFGroupState group = state.getCurrentGroup();
+
+            // Skip tokens that are part of objdata/pict hex streams
+            if (!consumed && (group.objdata || group.pictDepth > 0)) {
+                continue;
+            }
+
+            switch (type) {
+                case GROUP_OPEN:
+                    // The \* marker only applies to the group it appears in.
+                    sawIgnorable = false;
+                    break;
+
+                case GROUP_CLOSE:
+                    // Leaving the group that held the htmltag destination.
+                    if (inHtmlTag && state.getDepth() < htmlTagDepth) {
+                        flushPendingBytes(pendingBytes, html, state);
+                        inHtmlTag = false;
+                        htmlTagDepth = -1;
+                    }
+                    break;
+
+                case CONTROL_SYMBOL:
+                    if (tok.getChar() == '*') {
+                        sawIgnorable = true;
+                    }
+                    if (!foundHtmlTag || inHtmlRtfSkip) {
+                        break;
+                    }
+                    // Emit either inside an htmltag group or when no htmltag group is open.
+                    if (inHtmlTag || htmlTagDepth == -1) {
+                        char sym = tok.getChar();
+                        // Escaped literal brace/backslash characters become plain text.
+                        if (sym == '{' || sym == '}' || sym == '\\') {
+                            flushPendingBytes(pendingBytes, html, state);
+                            html.append(sym);
+                        }
+                    }
+                    break;
+
+                case CONTROL_WORD:
+                    if (consumed) {
+                        break;
+                    }
+                    String name = tok.getName();
+
+                    if ("fromhtml".equals(name)) {
+                        foundFromHtml = true;
+                        break;
+                    }
+                    // htmltag only counts when it follows a \* in the same group.
+                    if ("htmltag".equals(name) && sawIgnorable) {
+                        if (!foundFromHtml) {
+                            break;
+                        }
+                        foundHtmlTag = true;
+                        flushPendingBytes(pendingBytes, html, state);
+                        inHtmlTag = true;
+                        htmlTagDepth = state.getDepth();
+                        break;
+                    }
+                    if ("htmlrtf".equals(name)) {
+                        flushPendingBytes(pendingBytes, html, state);
+                        // htmlrtf0 ends the skipped run; htmlrtf with no parameter or any
+                        // other parameter starts (or continues) it.
+                        inHtmlRtfSkip = !(tok.hasParameter() && 
tok.getParameter() == 0);
+                        break;
+                    }
+                    if (!foundHtmlTag || inHtmlRtfSkip) {
+                        break;
+                    }
+                    if (inHtmlTag || htmlTagDepth == -1) {
+                        flushPendingBytes(pendingBytes, html, state);
+                        switch (name) {
+                            case "par":
+                            case "pard":
+                                html.append('\n');
+                                break;
+                            case "tab":
+                                html.append('\t');
+                                break;
+                            case "line":
+                                html.append("<br>");
+                                break;
+                            default:
+                                break;
+                        }
+                    }
+                    break;
+
+                case HEX_ESCAPE:
+                    if (consumed || !foundHtmlTag || inHtmlRtfSkip) {
+                        break;
+                    }
+                    if (inHtmlTag || htmlTagDepth == -1) {
+                        // Buffer the raw byte; it is decoded later with the current charset.
+                        pendingBytes.write(tok.getHexValue());
+                    }
+                    break;
+
+                case UNICODE_ESCAPE:
+                    if (!foundHtmlTag || inHtmlRtfSkip) {
+                        break;
+                    }
+                    if (inHtmlTag || htmlTagDepth == -1) {
+                        flushPendingBytes(pendingBytes, html, state);
+                        int cp = tok.getParameter();
+                        // NOTE(review): a negative parameter fails isValidCodePoint and is
+                        // silently dropped -- confirm the tokenizer already maps negative
+                        // (signed 16-bit) unicode-escape values into the 0..65535 range.
+                        if (Character.isValidCodePoint(cp)) {
+                            html.appendCodePoint(cp);
+                        }
+                    }
+                    break;
+
+                case TEXT:
+                    if (consumed || !foundHtmlTag || inHtmlRtfSkip) {
+                        break;
+                    }
+                    if (inHtmlTag || htmlTagDepth == -1) {
+                        flushPendingBytes(pendingBytes, html, state);
+                        html.append(tok.getChar());
+                    }
+                    break;
+
+                default:
+                    break;
+            }
+        }
+
+        // Decode whatever hex-escaped bytes are still buffered at end of input.
+        flushPendingBytes(pendingBytes, html, state);
+        if (!foundFromHtml || html.length() == 0) {
+            return null;
+        }
+        return html.toString();
+    }
+
+    /**
+     * Decodes any bytes buffered in {@code pending} with the charset currently reported
+     * by {@code state}, appends the decoded text to {@code out}, and empties the buffer.
+     * Does nothing when the buffer is empty.
+     */
+    private static void flushPendingBytes(ByteArrayOutputStream pending, StringBuilder out,
+                                          RTFState state) {
+        if (pending.size() == 0) {
+            return;
+        }
+        Charset charset = state.getCurrentCharset();
+        byte[] raw = pending.toByteArray();
+        out.append(new String(raw, charset));
+        pending.reset();
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFObjDataStreamParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFObjDataStreamParser.java
new file mode 100644
index 0000000000..8f7801f790
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFObjDataStreamParser.java
@@ -0,0 +1,510 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.Closeable;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Locale;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.FileMagic;
+import org.apache.poi.poifs.filesystem.Ole10Native;
+import org.apache.poi.poifs.filesystem.Ole10NativeException;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
+import org.apache.tika.io.BoundedInputStream;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.RTFMetadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
+
+/**
+ * Parses OLE objdata from an RTF stream inline, byte by byte.
+ *
+ * <p>The OLE objdata structure is:
+ * <pre>
+ *   [4 bytes version][4 bytes formatId]
+ *   [4 bytes classNameLen][classNameLen bytes className]
+ *   [4 bytes topicNameLen][topicNameLen bytes topicName]
+ *   [4 bytes itemNameLen][itemNameLen bytes itemName]
+ *   [4 bytes dataSz][dataSz bytes payload]
+ * </pre>
+ * The small header fields are parsed byte-by-byte via a state machine.
+ * Once the header is complete and {@code dataSz} is known, the payload
+ * bytes stream directly to a temp file -- never buffered in memory.</p>
+ *
+ * <p>On {@link #onComplete(Metadata, AtomicInteger)}, the payload is
+ * interpreted based on {@code className} (Package, PBrush, POIFS, etc.)
+ * and the extracted content is returned as a {@link TikaInputStream} whose
+ * close will clean up all temp files via {@link TemporaryResources}.</p>
+ */
+public class RTFObjDataStreamParser implements Closeable {
+
+    private static final String WIN_ASCII = "WINDOWS-1252";
+
+    private final long maxBytes;
+    private final TemporaryResources tmp = new TemporaryResources();
+
+    // State machine
+    private Field currentField = Field.VERSION;
+    private byte[] fieldBuf = new byte[4];
+    private int fieldPos;
+    private int fieldTarget = 4;
+
+    // Parsed header values
+    private long version;
+    private long formatId;
+    private String className;
+    private String topicName;
+    private String itemName;
+    private long dataSz;
+
+    // String accumulator for length-prefixed ANSI strings
+    private byte[] stringBuf;
+    private int stringPos;
+
+    // Payload streaming
+    private Path tempFile;
+    private OutputStream dataOut;
+    private long dataWritten;
+
+    /**
+     * @param maxBytes maximum payload bytes to accept; the limit is only enforced
+     *                 when this value is greater than 0, so -1 (or any other
+     *                 non-positive value) means unlimited
+     */
+    public RTFObjDataStreamParser(long maxBytes) {
+        this.maxBytes = maxBytes;
+    }
+
+    /**
+     * Receive a single decoded byte from the objdata hex stream and advance the
+     * header/payload state machine by one step.
+     *
+     * <p>Header fields are accumulated into small buffers; once the payload size is
+     * known, payload bytes are written straight to a temp file. Bytes arriving in the
+     * {@code DONE} or {@code SKIP} states are ignored.</p>
+     *
+     * @param b the decoded byte (only the low 8 bits are used)
+     * @throws IOException   if the temp file cannot be created or written
+     * @throws TikaException specifically a {@link TikaMemoryLimitException} when the
+     *                       payload would exceed a positive {@code maxBytes}
+     */
+    public void onByte(int b) throws IOException, TikaException {
+        switch (currentField) {
+            case VERSION:
+                if (accumulateUint32(b)) {
+                    version = readLE32(fieldBuf);
+                    initUint32Field(Field.FORMAT_ID);
+                }
+                break;
+
+            case FORMAT_ID:
+                if (accumulateUint32(b)) {
+                    formatId = readLE32(fieldBuf);
+                    if (formatId != 2L) {
+                        // Not an embedded object (1 = link). Skip everything.
+                        currentField = Field.SKIP;
+                    } else {
+                        initUint32Field(Field.CLASS_LEN);
+                    }
+                }
+                break;
+
+            case CLASS_LEN:
+                if (accumulateUint32(b)) {
+                    initStringField(Field.CLASS_NAME, readHeaderLength());
+                }
+                break;
+
+            case CLASS_NAME:
+                if (accumulateString(b)) {
+                    className = decodeString(stringBuf, fieldTarget);
+                    initUint32Field(Field.TOPIC_LEN);
+                }
+                break;
+
+            case TOPIC_LEN:
+                if (accumulateUint32(b)) {
+                    initStringField(Field.TOPIC_NAME, readHeaderLength());
+                }
+                break;
+
+            case TOPIC_NAME:
+                if (accumulateString(b)) {
+                    topicName = decodeString(stringBuf, fieldTarget);
+                    initUint32Field(Field.ITEM_LEN);
+                }
+                break;
+
+            case ITEM_LEN:
+                if (accumulateUint32(b)) {
+                    initStringField(Field.ITEM_NAME, readHeaderLength());
+                }
+                break;
+
+            case ITEM_NAME:
+                if (accumulateString(b)) {
+                    itemName = decodeString(stringBuf, fieldTarget);
+                    initUint32Field(Field.DATA_SIZE);
+                }
+                break;
+
+            case DATA_SIZE:
+                if (accumulateUint32(b)) {
+                    dataSz = readLE32(fieldBuf);
+                    if (dataSz <= 0) {
+                        currentField = Field.DONE;
+                    } else {
+                        currentField = Field.DATA;
+                        tempFile = tmp.createTempFile(".bin");
+                        dataOut = new BufferedOutputStream(Files.newOutputStream(tempFile));
+                    }
+                }
+                break;
+
+            case DATA:
+                if (maxBytes > 0 && dataWritten >= maxBytes) {
+                    throw new TikaMemoryLimitException(dataWritten + 1, maxBytes);
+                }
+                dataOut.write(b);
+                dataWritten++;
+                if (dataWritten >= dataSz) {
+                    // Declared payload fully received: close now so the file is complete.
+                    dataOut.close();
+                    dataOut = null;
+                    currentField = Field.DONE;
+                }
+                break;
+
+            case DONE:
+            case SKIP:
+                break;
+        }
+    }
+
+    /** Stores {@code b} in the 4-byte field buffer; returns true once the field is full. */
+    private boolean accumulateUint32(int b) {
+        fieldBuf[fieldPos++] = (byte) b;
+        return fieldPos >= fieldTarget;
+    }
+
+    /** Stores {@code b} in the string buffer; returns true once the string is complete. */
+    private boolean accumulateString(int b) {
+        stringBuf[stringPos++] = (byte) b;
+        return stringPos >= fieldTarget;
+    }
+
+    /**
+     * Reads the unsigned 32-bit length in {@code fieldBuf} as an int, clamping values
+     * above {@link Integer#MAX_VALUE}. A plain {@code (int)} cast would turn such
+     * lengths negative, and {@code initStringField} would then treat a corrupt or
+     * crafted length as an empty string instead of bailing out.
+     */
+    private int readHeaderLength() {
+        return (int) Math.min(readLE32(fieldBuf), (long) Integer.MAX_VALUE);
+    }
+
+    /**
+     * Called when the objdata group closes. Populates metadata and returns
+     * a TikaInputStream with the extracted embedded content, or null if
+     * the object couldn't be parsed.
+     *
+     * <p>Returns null without touching {@code metadata} when the header was skipped
+     * or no payload temp file was ever created. Otherwise the version and any
+     * non-empty class/topic/item names are added to {@code metadata}, and the payload
+     * is interpreted according to the lower-cased class name: {@code "package"},
+     * {@code "pbrush"} (payload returned as-is), or anything else (generic/POIFS).</p>
+     *
+     * <p>The caller owns the returned TikaInputStream -- closing it will
+     * clean up all temp files via TemporaryResources.</p>
+     *
+     * @param metadata             metadata object to populate
+     * @param unknownFilenameCount counter forwarded to the generic/POIFS handler
+     * @return the embedded content, or null if nothing usable was extracted
+     * @throws IOException   if the temp file cannot be flushed or read
+     * @throws TikaException if interpreting the payload fails
+     */
+    public TikaInputStream onComplete(Metadata metadata, AtomicInteger unknownFilenameCount)
+            throws IOException, TikaException {
+        if (currentField == Field.SKIP || tempFile == null) {
+            return null;
+        }
+
+        // If the group closed before dataSz bytes arrived, the buffered stream is still
+        // open. Close it first so the bytes received so far are actually on disk before
+        // the temp file is read back; otherwise the buffered tail would be lost.
+        if (dataOut != null) {
+            OutputStream unfinished = dataOut;
+            dataOut = null;
+            unfinished.close();
+        }
+
+        metadata.add(RTFMetadata.EMB_APP_VERSION, Long.toString(version));
+        if (className != null && !className.isEmpty()) {
+            metadata.add(RTFMetadata.EMB_CLASS, className);
+        }
+        if (topicName != null && !topicName.isEmpty()) {
+            metadata.add(RTFMetadata.EMB_TOPIC, topicName);
+        }
+        if (itemName != null && !itemName.isEmpty()) {
+            metadata.add(RTFMetadata.EMB_ITEM, itemName);
+        }
+
+        String cn = className != null ? className.toLowerCase(Locale.ROOT) : "";
+
+        if ("package".equals(cn)) {
+            return handlePackage(metadata);
+        } else if ("pbrush".equals(cn)) {
+            return TikaInputStream.get(tempFile, metadata, tmp);
+        } else {
+            return handleGenericOrPOIFS(metadata, unknownFilenameCount);
+        }
+    }
+
+    /**
+     * Closes the payload stream if it is still open and releases all temp files held
+     * by this parser's {@link TemporaryResources}.
+     *
+     * <p>The temporary resources are released even if closing the payload stream
+     * fails, so a write error cannot leak temp files.</p>
+     *
+     * @throws IOException if closing the stream or the temporary resources fails
+     */
+    @Override
+    public void close() throws IOException {
+        try {
+            if (dataOut != null) {
+                dataOut.close();
+            }
+        } finally {
+            dataOut = null;
+            tmp.close();
+        }
+    }
+
+    // --- Package handling ---
+
+    /**
+     * Interprets the payload temp file as a "Package" object and extracts the file
+     * embedded in it.
+     *
+     * <p>Fields are read strictly in this order: a 16-bit type, a null-terminated
+     * display name, a null-terminated icon path, a 16-bit icon index, a second 16-bit
+     * type, a 32-bit path length, a null-terminated ANSI file path, a 32-bit content
+     * length, the content bytes, and finally an optional length-prefixed UTF-16LE
+     * file path. The content is copied to a second temp file.</p>
+     *
+     * @param metadata receives the resource name, original resource name and
+     *                 embedded relationship id
+     * @return a stream over the extracted content, or null when the second type
+     *         field is not 3
+     * @throws IOException   if the payload is truncated before the content length
+     *                       has been read, or on any other read/write failure
+     * @throws TikaException declared for callers
+     */
+    private TikaInputStream handlePackage(Metadata metadata) throws 
IOException, TikaException {
+        try (InputStream is = new 
BufferedInputStream(Files.newInputStream(tempFile))) {
+            readUShortLE(is); // type
+
+            String displayName = readNullTerminatedString(is);
+            readNullTerminatedString(is); // iconFilePath
+            readUShortBE(is); // iconIndex
+            int type2 = readUShortLE(is);
+
+            // Only type2 == 3 is extracted here; anything else yields no content.
+            if (type2 != 3) {
+                return null;
+            }
+
+            readUIntLE(is); // filePathLen
+            String ansiFilePath = readNullTerminatedString(is);
+            long bytesLen = readUIntLE(is);
+
+            // Write the embedded file content to a new temp file
+            Path contentFile = tmp.createTempFile(".bin");
+            try (OutputStream contentOut = new BufferedOutputStream(
+                    Files.newOutputStream(contentFile))) {
+                // Copies at most bytesLen bytes; a short payload is copied as far as it goes.
+                copyBounded(is, contentOut, bytesLen);
+            }
+
+            // Try to read unicode file path (optional)
+            StringBuilder unicodePath = new StringBuilder();
+            try {
+                long unicodeLen = readUIntLE(is);
+                // unicodeLen counts 16-bit units; each is stored low byte first.
+                for (int i = 0; i < unicodeLen; i++) {
+                    int lo = is.read();
+                    int hi = is.read();
+                    if (lo == -1 || hi == -1) {
+                        // Truncated path: discard it and fall back to the ANSI names.
+                        unicodePath.setLength(0);
+                        break;
+                    }
+                    unicodePath.append((char) (lo + 256 * hi));
+                }
+            } catch (IOException e) {
+                // The unicode path is optional; a missing length field is not an error.
+                unicodePath.setLength(0);
+            }
+
+            String fileNameToUse;
+            String pathToUse;
+            if (unicodePath.length() > 0) {
+                fileNameToUse = unicodePath.toString();
+                pathToUse = unicodePath.toString();
+            } else {
+                fileNameToUse = displayName != null ? displayName : "";
+                pathToUse = ansiFilePath != null ? ansiFilePath : "";
+            }
+            metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, 
fileNameToUse);
+            metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
+                    FilenameUtils.getName(fileNameToUse));
+            metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, 
pathToUse);
+
+            // Return TikaInputStream backed by contentFile; closing it cleans up
+            // both contentFile and the original tempFile via TemporaryResources
+            return TikaInputStream.get(contentFile, metadata, tmp);
+        }
+    }
+
+    // --- Generic / POIFS handling ---
+
+    /**
+     * Handles every payload whose class name is neither "package" nor "pbrush".
+     *
+     * <p>If the payload is not an OLE2 container it is returned unchanged. Otherwise
+     * it is opened as a POIFS file system and the embedded content is taken from, in
+     * order of preference: a root "Package" entry, an Ole10Native object, or the
+     * "CONTENTS"/"Contents" entry of a CompObj document. Any other POIFS type is
+     * returned whole, with its detected content type set in {@code metadata}.
+     * Extracted content is held in a byte array before being written to a new
+     * temp file.</p>
+     *
+     * @param metadata             metadata attached to the returned stream
+     * @param unknownFilenameCount not used in this method body
+     * @return a stream over the payload or the extracted content, or null when the
+     *         POIFS root is missing or no content could be extracted
+     * @throws IOException   on read/write failure
+     * @throws TikaException specifically a {@link TikaMemoryLimitException} when the
+     *                       "Package" entry exceeds a positive {@code maxBytes}
+     */
+    private TikaInputStream handleGenericOrPOIFS(Metadata metadata,
+                                                  AtomicInteger 
unknownFilenameCount)
+            throws IOException, TikaException {
+        // First pass: sniff the magic bytes only.
+        try (InputStream probe = new 
BufferedInputStream(Files.newInputStream(tempFile))) {
+            boolean isOLE2 = FileMagic.valueOf(probe) == FileMagic.OLE2;
+            if (!isOLE2) {
+                return TikaInputStream.get(tempFile, metadata, tmp);
+            }
+        }
+
+        // It's POIFS -- parse it
+        try (InputStream poifsIn = new 
BufferedInputStream(Files.newInputStream(tempFile));
+             POIFSFileSystem fs = new POIFSFileSystem(poifsIn)) {
+            DirectoryNode root = fs.getRoot();
+            if (root == null) {
+                return null;
+            }
+
+            byte[] content = null;
+
+            if (root.hasEntry("Package")) {
+                Entry pkg = root.getEntry("Package");
+                // Bound the read only when a positive limit is configured.
+                try (BoundedInputStream bis = new BoundedInputStream(
+                        maxBytes > 0 ? maxBytes : Long.MAX_VALUE,
+                        new DocumentInputStream((DocumentEntry) pkg))) {
+                    content = IOUtils.toByteArray(bis);
+                    if (bis.hasHitBound()) {
+                        throw new TikaMemoryLimitException(maxBytes + 1, 
maxBytes);
+                    }
+                }
+            } else {
+                POIFSDocumentType type = POIFSDocumentType.detectType(root);
+                if (type == POIFSDocumentType.OLE10_NATIVE) {
+                    try {
+                        Ole10Native ole = 
Ole10Native.createFromEmbeddedOleObject(root);
+                        content = ole.getDataBuffer();
+                    } catch (Ole10NativeException ex) {
+                        // Not valid OLE10Native
+                    }
+                } else if (type == POIFSDocumentType.COMP_OBJ) {
+                    DocumentEntry contentsEntry;
+                    // The entry name appears in either casing; try upper case first.
+                    try {
+                        contentsEntry = (DocumentEntry) 
root.getEntry("CONTENTS");
+                    } catch (FileNotFoundException e) {
+                        contentsEntry = (DocumentEntry) 
root.getEntry("Contents");
+                    }
+                    try (DocumentInputStream inp = new 
DocumentInputStream(contentsEntry)) {
+                        content = new byte[contentsEntry.getSize()];
+                        inp.readFully(content);
+                    }
+                } else {
+                    // Unknown POIFS type -- return the whole thing
+                    metadata.set(Metadata.CONTENT_TYPE, 
type.getType().toString());
+                    return TikaInputStream.get(tempFile, metadata, tmp);
+                }
+            }
+
+            if (content != null) {
+                Path contentFile = tmp.createTempFile(".bin");
+                Files.write(contentFile, content);
+                return TikaInputStream.get(contentFile, metadata, tmp);
+            }
+        }
+        // Reached when an Ole10Native object could not be decoded (content stayed null).
+        return null;
+    }
+
+    // --- Helper methods ---
+
+    /** Switches to {@code next} and resets the 4-byte accumulator used for uint32 fields. */
+    private void initUint32Field(Field next) {
+        fieldTarget = 4;
+        fieldPos = 0;
+        currentField = next;
+    }
+
+    private static final int MAX_HEADER_STRING_LENGTH = 4096;
+
+    /**
+     * Prepares the state machine to read a length-prefixed header string.
+     *
+     * <p>A length above {@code MAX_HEADER_STRING_LENGTH} switches to {@code SKIP}.
+     * A length of zero or less records an empty string for the matching name field
+     * and moves straight on to the following uint32 field. Otherwise a buffer of
+     * {@code len} bytes is allocated and {@code next} becomes the current field.</p>
+     *
+     * @param next the string field about to be read
+     * @param len  the declared string length in bytes
+     */
+    private void initStringField(Field next, int len) {
+        if (len > MAX_HEADER_STRING_LENGTH) {
+            // Corrupt or crafted header — bail out
+            currentField = Field.SKIP;
+            return;
+        }
+
+        if (len > 0) {
+            currentField = next;
+            stringBuf = new byte[len];
+            stringPos = 0;
+            fieldTarget = len;
+            return;
+        }
+
+        // Empty string: nothing to accumulate, so advance to the next length/size field.
+        if (next == Field.CLASS_NAME) {
+            className = "";
+            initUint32Field(Field.TOPIC_LEN);
+        } else if (next == Field.TOPIC_NAME) {
+            topicName = "";
+            initUint32Field(Field.ITEM_LEN);
+        } else if (next == Field.ITEM_NAME) {
+            itemName = "";
+            initUint32Field(Field.DATA_SIZE);
+        } else {
+            currentField = next;
+        }
+    }
+
+    /**
+     * Combines the first four bytes of {@code buf} into an unsigned 32-bit value,
+     * least-significant byte first.
+     */
+    private static long readLE32(byte[] buf) {
+        long value = 0L;
+        // Walk from the most significant byte (index 3) down to the least (index 0).
+        for (int i = 3; i >= 0; i--) {
+            value = (value << 8) | (buf[i] & 0xFFL);
+        }
+        return value;
+    }
+
+    /**
+     * Decodes the first {@code len} bytes of {@code buf} as {@code WIN_ASCII}, falling
+     * back to US-ASCII if that charset name is not supported, and trims the result.
+     * {@link String#trim()} removes leading and trailing characters up to U+0020,
+     * which includes NUL bytes.
+     */
+    private static String decodeString(byte[] buf, int len) {
+        String decoded;
+        try {
+            decoded = new String(buf, 0, len, WIN_ASCII);
+        } catch (java.io.UnsupportedEncodingException e) {
+            decoded = new String(buf, 0, len, java.nio.charset.StandardCharsets.US_ASCII);
+        }
+        return decoded.trim();
+    }
+
+    /** Reads a little-endian unsigned short; a buffer underrun is rethrown as IOException. */
+    private static int readUShortLE(InputStream is) throws IOException {
+        final int value;
+        try {
+            value = EndianUtils.readUShortLE(is);
+        } catch (EndianUtils.BufferUnderrunException underrun) {
+            throw new IOException(underrun);
+        }
+        return value;
+    }
+
+    /** Reads a big-endian unsigned short; a buffer underrun is rethrown as IOException. */
+    private static int readUShortBE(InputStream is) throws IOException {
+        final int value;
+        try {
+            value = EndianUtils.readUShortBE(is);
+        } catch (EndianUtils.BufferUnderrunException underrun) {
+            throw new IOException(underrun);
+        }
+        return value;
+    }
+
+    /** Reads a little-endian unsigned int; a buffer underrun is rethrown as IOException. */
+    private static long readUIntLE(InputStream is) throws IOException {
+        final long value;
+        try {
+            value = EndianUtils.readUIntLE(is);
+        } catch (EndianUtils.BufferUnderrunException underrun) {
+            throw new IOException(underrun);
+        }
+        return value;
+    }
+
+    /**
+     * Reads single-byte characters from {@code is} up to, and consuming, the first
+     * zero byte, and returns them as a string (terminator excluded).
+     *
+     * @throws IOException if the stream ends before a zero byte is found
+     */
+    private static String readNullTerminatedString(InputStream is) throws IOException {
+        StringBuilder text = new StringBuilder();
+        while (true) {
+            int next = is.read();
+            if (next == -1) {
+                throw new IOException("hit end of stream before null terminator");
+            }
+            if (next <= 0) {
+                // Terminator reached.
+                return text.toString();
+            }
+            text.append((char) next);
+        }
+    }
+
+    /**
+     * Copies up to {@code maxLen} bytes from {@code in} to {@code out}, stopping early
+     * if {@code in} reaches end of stream.
+     *
+     * @return the number of bytes actually copied
+     */
+    private static long copyBounded(InputStream in, OutputStream out, long maxLen)
+            throws IOException {
+        byte[] chunk = new byte[8192];
+        long copied = 0;
+        for (long remaining = maxLen; remaining > 0; remaining = maxLen - copied) {
+            int n = in.read(chunk, 0, (int) Math.min(chunk.length, remaining));
+            if (n == -1) {
+                break;
+            }
+            out.write(chunk, 0, n);
+            copied += n;
+        }
+        return copied;
+    }
+
+    /**
+     * States of the {@code onByte} state machine, listed in the order the header
+     * fields are read. Each {@code *_LEN} state reads a 4-byte length for the
+     * {@code *_NAME} state that follows it. {@code DONE} and {@code SKIP} both ignore
+     * further bytes; {@code SKIP} additionally makes {@code onComplete} return null.
+     */
+    private enum Field {
+        VERSION, FORMAT_ID,
+        CLASS_LEN, CLASS_NAME,
+        TOPIC_LEN, TOPIC_NAME,
+        ITEM_LEN, ITEM_NAME,
+        DATA_SIZE, DATA,
+        DONE, SKIP
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFPictStreamParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFPictStreamParser.java
new file mode 100644
index 0000000000..906d351e26
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFPictStreamParser.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import java.io.BufferedOutputStream;
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Streams decoded bytes from an RTF {@code \pict} group to a temp file.
+ *
+ * <p>Pict data is raw image bytes (after hex-pair decoding). There is no
+ * header to parse -- bytes are written directly to a temp file. On
+ * {@link #onComplete(Metadata)}, a {@link TikaInputStream} is returned
+ * whose close will clean up the temp file via {@link TemporaryResources}.</p>
+ */
+public class RTFPictStreamParser implements Closeable {
+
+    private final long maxBytes;
+    private final TemporaryResources tmp = new TemporaryResources();
+    private Path tempFile;
+    private OutputStream out;
+    private long bytesWritten;
+
+    /**
+     * @param maxBytes maximum number of bytes to accept (-1 for unlimited)
+     */
+    public RTFPictStreamParser(long maxBytes) throws IOException {
+        this.maxBytes = maxBytes;
+        this.tempFile = tmp.createTempFile(".bin");
+        this.out = new BufferedOutputStream(Files.newOutputStream(tempFile));
+    }
+
+    /**
+     * Receive a single decoded byte from the pict hex stream.
+     */
+    public void onByte(int b) throws IOException, TikaException {
+        if (maxBytes > 0 && bytesWritten >= maxBytes) {
+            throw new TikaMemoryLimitException(bytesWritten + 1, maxBytes);
+        }
+        out.write(b);
+        bytesWritten++;
+    }
+
+    /**
+     * Called when the pict group closes. Returns a TikaInputStream backed
+     * by the temp file. The caller owns the TikaInputStream -- closing it
+     * will delete the temp file.
+     *
+     * @return a TikaInputStream, or null if no bytes were written
+     */
+    public TikaInputStream onComplete(Metadata metadata) throws IOException {
+        out.close();
+        out = null;
+        if (bytesWritten == 0) {
+            tmp.close();
+            return null;
+        }
+        // Hand ownership of the temp file to the TikaInputStream.
+        // TikaInputStream.close() will close the TemporaryResources,
+        // which deletes the temp file.
+        return TikaInputStream.get(tempFile, metadata, tmp);
+    }
+
+    /** Returns the number of bytes written so far. */
+    public long getBytesWritten() {
+        return bytesWritten;
+    }
+
+    @Override
+    public void close() throws IOException {
+        if (out != null) {
+            out.close();
+            out = null;
+        }
+        tmp.close();
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFState.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFState.java
new file mode 100644
index 0000000000..00ecd4d70d
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFState.java
@@ -0,0 +1,342 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import java.nio.charset.Charset;
+import java.util.ArrayDeque;
+import java.util.Deque;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Shared RTF parsing state: group stack, font table, codepage tracking,
+ * and unicode skip handling.
+ *
+ * <p>Both the HTML decapsulator and the full RTF parser use this class
+ * to manage the stateful parts of RTF processing.</p>
+ *
+ * <p>Typical usage: feed every token to {@link #processToken(RTFToken)}
+ * and query the current charset via {@link #getCurrentCharset()}.</p>
+ */
+public class RTFState {
+
+    /** Global charset from {@code \ansicpgN} or charset family selectors. */
+    private Charset globalCharset = RTFCharsetMaps.WINDOWS_1252;
+
+    /** Default font ID from {@code \deffN}. */
+    private int globalDefaultFont = -1;
+
+    /** Font table: maps font number ({@code \fN}) to charset ({@code \fcharsetN}). */
+    private final Map<Integer, Charset> fontToCharset = new HashMap<>();
+
+    private static final int MAX_GROUP_DEPTH = 10_000;
+
+    /** Group state stack. */
+    private final Deque<RTFGroupState> stack = new ArrayDeque<>();
+
+    /**
+     * Number of currently open groups that were not pushed because the stack
+     * had already reached {@link #MAX_GROUP_DEPTH}. Their closing braces must
+     * not pop real groups, otherwise the stack gets out of step with the
+     * document for everything that follows.
+     */
+    private long unpushedGroups = 0;
+
+    /** Current (active) group state. */
+    private RTFGroupState current = new RTFGroupState();
+
+    /** Number of ANSI chars remaining to skip after a unicode escape. */
+    private int ansiSkip = 0;
+
+    /** The group state that was just closed (before popGroup). Set on GROUP_CLOSE. */
+    private RTFGroupState lastClosedGroup;
+
+    // Font table parsing state
+    // 0 = not yet seen, 1 = inside fonttbl, 2 = finished fonttbl
+    private int fontTableState = 0;
+    private int fontTableDepth = -1;
+    private int currentFontId = -1;
+
+    private boolean inHeader = true;
+
+    /**
+     * Process a single token to update internal state.
+     * <p>
+     * This handles: group open/close, charset selectors (ansi, ansicpg,
+     * deff), font table parsing (fonttbl, f, fcharset),
+     * unicode skip tracking (uc), and font changes (f in body).
+     * <p>
+     * A group open or close always ends any pending ANSI skip: per the RTF
+     * specification a scope delimiter terminates the skippable data that
+     * follows a unicode escape, so characters after the brace are real text.
+     *
+     * @return true if the token was consumed by state management (caller should skip it),
+     *         false if the caller should also process it
+     */
+    public boolean processToken(RTFToken tok) {
+        switch (tok.getType()) {
+            case GROUP_OPEN:
+                ansiSkip = 0;
+                pushGroup();
+                return false;
+
+            case GROUP_CLOSE:
+                ansiSkip = 0;
+                lastClosedGroup = current;
+                popGroup();
+                // Check if we've exited the font table
+                if (fontTableState == 1 && current.depth < fontTableDepth) {
+                    fontTableState = 2;
+                }
+                return false;
+
+            case CONTROL_SYMBOL:
+                if (tok.getChar() == '*') {
+                    current.ignore = true;
+                }
+                return false;
+
+            case CONTROL_WORD:
+                return processControlWord(tok);
+
+            case UNICODE_ESCAPE:
+                // After a unicode escape, skip the next ucSkip ANSI chars
+                ansiSkip = current.ucSkip;
+                return false;
+
+            case HEX_ESCAPE:
+                // If we're in the ANSI shadow of a unicode escape, skip this byte
+                if (ansiSkip > 0) {
+                    ansiSkip--;
+                    return true; // consumed — caller should ignore
+                }
+                return false;
+
+            case TEXT:
+                // If we're in the ANSI shadow, skip text chars
+                if (ansiSkip > 0) {
+                    // Each TEXT token is one char
+                    ansiSkip--;
+                    return true;
+                }
+                return false;
+
+            default:
+                return false;
+        }
+    }
+
+    /**
+     * Applies a control word to the state.
+     *
+     * @return true if the word was fully handled here, false if the caller
+     *         may also want to act on it
+     */
+    private boolean processControlWord(RTFToken tok) {
+        String name = tok.getName();
+        boolean hasParam = tok.hasParameter();
+        int param = tok.getParameter();
+
+        // Global charset selectors (header)
+        switch (name) {
+            case "ansi":
+                globalCharset = RTFCharsetMaps.WINDOWS_1252;
+                return true;
+            case "pca":
+                globalCharset = RTFCharsetMaps.getCharset("cp850");
+                return true;
+            case "pc":
+                globalCharset = RTFCharsetMaps.getCharset("cp437");
+                return true;
+            case "mac":
+                globalCharset = RTFCharsetMaps.getCharset("MacRoman");
+                return true;
+            case "ansicpg":
+                if (hasParam) {
+                    Charset cs = RTFCharsetMaps.ANSICPG_MAP.get(param);
+                    if (cs != null) {
+                        globalCharset = cs;
+                    } else {
+                        globalCharset = RTFCharsetMaps.resolveCodePage(param);
+                    }
+                }
+                return true;
+            case "deff":
+                if (hasParam) {
+                    globalDefaultFont = param;
+                }
+                return true;
+        }
+
+        // Font table management
+        if ("fonttbl".equals(name)) {
+            fontTableState = 1;
+            fontTableDepth = current.depth;
+            current.ignore = true;
+            return true;
+        }
+
+        if (fontTableState == 1) {
+            // Inside font table
+            if (current.depth < fontTableDepth) {
+                fontTableState = 2;
+            } else {
+                if ("f".equals(name) && hasParam) {
+                    currentFontId = param;
+                    return true;
+                } else if ("fcharset".equals(name) && hasParam) {
+                    Charset cs = RTFCharsetMaps.FCHARSET_MAP.get(param);
+                    if (cs != null) {
+                        fontToCharset.put(currentFontId, cs);
+                    }
+                    return true;
+                }
+            }
+        }
+
+        // Unicode skip count
+        if ("uc".equals(name) && hasParam) {
+            current.ucSkip = param;
+            return true;
+        }
+
+        // Font change in body
+        if ("f".equals(name) && hasParam) {
+            current.fontId = param;
+            Charset fontCs = fontToCharset.get(param);
+            current.fontCharset = fontCs; // may be null
+            // If we've seen the font table and this is a body font change,
+            // we're out of the header
+            if (fontTableState == 2 && !current.ignore) {
+                inHeader = false;
+            }
+            return false; // caller may also want to know about font changes
+        }
+
+        // Header-ending control words
+        if (inHeader && !current.ignore) {
+            switch (name) {
+                case "par":
+                case "pard":
+                case "sect":
+                case "sectd":
+                case "plain":
+                case "ltrch":
+                case "rtlch":
+                case "htmlrtf":
+                case "line":
+                    inHeader = false;
+                    break;
+            }
+        }
+
+        // Embedded object / picture control words
+        switch (name) {
+            case "object":
+                current.object = true;
+                return false; // caller may want to know
+            case "objdata":
+                current.objdata = true;
+                return false;
+            case "pict":
+                current.pictDepth = 1;
+                return false;
+            case "sp":
+                current.sp = true;
+                return false;
+            case "sn":
+                current.sn = true;
+                return false;
+            case "sv":
+                current.sv = true;
+                return false;
+            case "wbitmap":
+                return false; // caller handles
+        }
+
+        // Ignorable destinations
+        if (inHeader) {
+            switch (name) {
+                case "colortbl":
+                case "stylesheet":
+                    current.ignore = true;
+                    return true;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Open a new group: push current state and create a child.
+     * <p>
+     * Once {@link #MAX_GROUP_DEPTH} is reached, further opens are treated as
+     * flat content: nothing is pushed, but the open is counted so that the
+     * matching close in {@link #popGroup()} does not pop a real group.
+     */
+    public void pushGroup() {
+        if (stack.size() >= MAX_GROUP_DEPTH) {
+            unpushedGroups++;
+            return;
+        }
+        stack.push(current);
+        current = new RTFGroupState(current);
+    }
+
+    /**
+     * Close the current group: pop and restore the parent state.
+     * <p>
+     * Closes that match an open which was ignored by {@link #pushGroup()}
+     * only decrement the overflow counter. A close with nothing open is a no-op.
+     */
+    public void popGroup() {
+        if (unpushedGroups > 0) {
+            unpushedGroups--;
+            return;
+        }
+        if (!stack.isEmpty()) {
+            current = stack.pop();
+        }
+    }
+
+    /**
+     * Returns the charset that should be used to decode the current hex escape
+     * or text byte. Priority:
+     * <ol>
+     *   <li>Font-specific charset (from {@code \fN → \fcharsetN})</li>
+     *   <li>Global default font's charset (from {@code \deffN})</li>
+     *   <li>Global charset (from {@code \ansicpgN} or family selector)</li>
+     * </ol>
+     */
+    public Charset getCurrentCharset() {
+        if (current.fontCharset != null) {
+            return current.fontCharset;
+        }
+        if (globalDefaultFont != -1 && !inHeader) {
+            Charset cs = fontToCharset.get(globalDefaultFont);
+            if (cs != null) {
+                return cs;
+            }
+        }
+        return globalCharset;
+    }
+
+    /** Returns the global charset ({@code \ansicpgN}). */
+    public Charset getGlobalCharset() {
+        return globalCharset;
+    }
+
+    /** Returns the current group state. */
+    public RTFGroupState getCurrentGroup() {
+        return current;
+    }
+
+    /** Returns true if we're still in the RTF header (before body content). */
+    public boolean isInHeader() {
+        return inHeader;
+    }
+
+    /** Returns the current group nesting depth. */
+    public int getDepth() {
+        return current.depth;
+    }
+
+    /** Returns the font-to-charset mapping table. */
+    public Map<Integer, Charset> getFontToCharset() {
+        return fontToCharset;
+    }
+
+    /** Returns the number of ANSI chars remaining to skip. */
+    public int getAnsiSkip() {
+        return ansiSkip;
+    }
+
+    /**
+     * Returns the group state that was just closed on the most recent GROUP_CLOSE.
+     * This is the child group's state before it was popped.
+     * Useful for checking flags like objdata, pictDepth, sn, sv, sp, object
+     * to trigger completion handlers.
+     */
+    public RTFGroupState getLastClosedGroup() {
+        return lastClosedGroup;
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFToken.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFToken.java
new file mode 100644
index 0000000000..3278a9a1a4
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFToken.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+/**
+ * A single token produced by the RTF tokenizer.
+ * <p>
+ * Mutable and reused by the tokenizer to avoid allocation in the hot loop.
+ * Consumers must copy any data they need before requesting the next token.
+ * <p>
+ * For TEXT and CONTROL_SYMBOL tokens (single character), use {@link 
#getChar()}
+ * to avoid String allocation. For CONTROL_WORD tokens, use {@link #getName()}.
+ */
+public class RTFToken {
+
+    private RTFTokenType type;
+    private String name;
+    private char ch;
+    private int parameter;
+    private boolean hasParameter;
+
+    public void reset(RTFTokenType type) {
+        this.type = type;
+        this.name = null;
+        this.ch = 0;
+        this.parameter = -1;
+        this.hasParameter = false;
+    }
+
+    public void set(RTFTokenType type, String name, int parameter, boolean 
hasParameter) {
+        this.type = type;
+        this.name = name;
+        this.ch = 0;
+        this.parameter = parameter;
+        this.hasParameter = hasParameter;
+    }
+
+    public void setChar(RTFTokenType type, char ch) {
+        this.type = type;
+        this.name = null;
+        this.ch = ch;
+        this.parameter = -1;
+        this.hasParameter = false;
+    }
+
+    public RTFTokenType getType() {
+        return type;
+    }
+
+    /** For CONTROL_WORD tokens: the control word name. */
+    public String getName() {
+        return name;
+    }
+
+    /**
+     * For TEXT and CONTROL_SYMBOL tokens: the single character, without
+     * allocating a String.
+     */
+    public char getChar() {
+        return ch;
+    }
+
+    public int getParameter() {
+        return parameter;
+    }
+
+    public boolean hasParameter() {
+        return hasParameter;
+    }
+
+    public int getHexValue() {
+        return parameter;
+    }
+
+    @Override
+    public String toString() {
+        switch (type) {
+            case GROUP_OPEN:
+                return "{";
+            case GROUP_CLOSE:
+                return "}";
+            case CONTROL_WORD:
+                return "\\" + name + (hasParameter ? String.valueOf(parameter) 
: "");
+            case CONTROL_SYMBOL:
+                return "\\" + ch;
+            case HEX_ESCAPE:
+                return String.format(java.util.Locale.ROOT, "\\'%02x", 
parameter);
+            case UNICODE_ESCAPE:
+                return "\\u" + parameter;
+            case TEXT:
+                return "TEXT[" + ch + "]";
+            case BIN:
+                return "\\bin" + parameter;
+            case CRLF:
+                return "CRLF";
+            case EOF:
+                return "EOF";
+            default:
+                return type.name();
+        }
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenType.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenType.java
new file mode 100644
index 0000000000..dcdcf511f9
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenType.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+public enum RTFTokenType { // lexical categories emitted by the RTF tokenizer
+    GROUP_OPEN,     // opening brace
+    GROUP_CLOSE,    // closing brace
+    CONTROL_WORD,   // backslash + letters, with an optional numeric parameter
+    CONTROL_SYMBOL, // backslash + a single non-alphanumeric character
+    HEX_ESCAPE,     // backslash + apostrophe + two hex digits
+    UNICODE_ESCAPE, // backslash + 'u' + a (possibly negative) decimal number
+    TEXT,           // one plain text character
+    BIN,            // the bin control word with its byte count
+    CRLF,           // a CR, LF or CR+LF line break in the source
+    EOF             // end of input
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/jflex/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizer.jflex
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/jflex/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizer.jflex
new file mode 100644
index 0000000000..2f5baff0f4
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/jflex/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizer.jflex
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+%%
+
+%public
+%class RTFTokenizer
+%unicode
+%type RTFToken
+%char
+
+%{
+    private final RTFToken token = new RTFToken();
+
+    /**
+     * Returns the reusable token instance. Callers must copy data
+     * before the next call to {@link #yylex()}.
+     */
+    public RTFToken getToken() {
+        return token;
+    }
+
+    /** Control word with parameter: \ letters [-] digits [space] */
+    private RTFToken controlWordWithParam() {
+        int len = yylength();
+        if (yycharat(len - 1) == ' ') {
+            len--;
+        }
+        // find where letters end
+        int nameEnd = 1;
+        while (nameEnd < len && Character.isLetter(yycharat(nameEnd))) {
+            nameEnd++;
+        }
+        String name = new String(zzBuffer, zzStartRead + 1, nameEnd - 1);
+        int param = parseIntFromBuffer(nameEnd, len);
+        token.set(RTFTokenType.CONTROL_WORD, name, param, true);
+        return token;
+    }
+
+    /** Control word without parameter: \ letters [space] */
+    private RTFToken controlWord() {
+        int len = yylength();
+        if (yycharat(len - 1) == ' ') {
+            len--;
+        }
+        String name = new String(zzBuffer, zzStartRead + 1, len - 1);
+        token.set(RTFTokenType.CONTROL_WORD, name, -1, false);
+        return token;
+    }
+
+    /** Hex escape: the two hex digits are combined into one byte value (0-255). */
+    private RTFToken hexEscape() {
+        // layout: \' hex hex  (4 chars)
+        int hi = Character.digit(yycharat(2), 16);
+        int lo = Character.digit(yycharat(3), 16);
+        token.set(RTFTokenType.HEX_ESCAPE, null, (hi << 4) | lo, true);
+        return token;
+    }
+
+    /**
+     * Unicode escape. The token parameter is always a non-negative value:
+     * negative inputs are mapped into 0..65535, and anything that is still
+     * negative after that (out-of-range input) becomes U+FFFD.
+     */
+    private RTFToken unicodeEscape() {
+        // layout: backslash u [-] digits [space]
+        int len = yylength();
+        if (yycharat(len - 1) == ' ') {
+            len--;
+        }
+        int codePoint = parseIntFromBuffer(2, len);
+        // RTF uses signed 16-bit: negative values map to 65536 + value
+        if (codePoint < 0) {
+            codePoint = 65536 + codePoint;
+        }
+        if (codePoint < 0) {
+            // Below -65536: not representable, hand out the replacement character
+            codePoint = 0xFFFD;
+        }
+        token.set(RTFTokenType.UNICODE_ESCAPE, null, codePoint, true);
+        return token;
+    }
+
+    /** Binary-data control word: the parameter is the announced byte count. */
+    private RTFToken binToken() {
+        // layout: \bin digits [space]
+        int len = yylength();
+        if (yycharat(len - 1) == ' ') {
+            len--;
+        }
+        int count = parseIntFromBuffer(4, len);
+        token.set(RTFTokenType.BIN, null, count, true);
+        return token;
+    }
+
+    /**
+     * Parse an integer from JFlex's internal char buffer between positions
+     * start (inclusive) and end (exclusive), relative to the current match.
+     * Handles optional leading '-'.
+     * <p>
+     * The magnitude saturates at Integer.MAX_VALUE instead of wrapping, so an
+     * over-long digit run can never flip the sign of the result.
+     */
+    private int parseIntFromBuffer(int start, int end) {
+        boolean neg = false;
+        int pos = start;
+        if (yycharat(pos) == '-') {
+            neg = true;
+            pos++;
+        }
+        long magnitude = 0;
+        while (pos < end) {
+            magnitude = magnitude * 10 + (yycharat(pos) - '0');
+            if (magnitude > Integer.MAX_VALUE) {
+                magnitude = Integer.MAX_VALUE;
+                break;
+            }
+            pos++;
+        }
+        return (int) (neg ? -magnitude : magnitude);
+    }
+%}
+
+/* RTF is 7-bit ASCII; bytes above 127 are escaped. We read as Latin1/byte stream. */
+
+/* RTF spec: a control word's delimiter space is consumed and not part of the output.
+   We include the optional trailing space in each pattern so the tokenizer eats it. */
+ControlWordWithParam = "\\" [a-zA-Z]+ "-"? [0-9]+ " "?
+ControlWord = "\\" [a-zA-Z]+ " "?
+HexEscape = "\\'" [0-9a-fA-F]{2}
+UnicodeEscape = "\\u" "-"? [0-9]+ " "?
+BinControl = "\\bin" [0-9]+ " "?
+/* A backslash followed by anything that is not a letter is a control symbol.
+   This deliberately includes digits and CR / LF / CR+LF: backslash + newline is
+   valid RTF, and leaving these out would mean no rule matches at that backslash,
+   which makes the generated scanner throw java.lang.Error. */
+ControlSymbol = "\\" ( \r\n | [^a-zA-Z] )
+GroupOpen = "{"
+GroupClose = "}"
+CrLf = \r\n | \r | \n
+
+%%
+
+/* Order matters: more specific rules first */
+
+{BinControl}             { return binToken(); }
+{UnicodeEscape}          { return unicodeEscape(); }
+{HexEscape}              { return hexEscape(); }
+{ControlWordWithParam}   { return controlWordWithParam(); }
+{ControlWord}            { return controlWord(); }
+{ControlSymbol}          { token.setChar(RTFTokenType.CONTROL_SYMBOL, yycharat(1)); return token; }
+{GroupOpen}              { token.reset(RTFTokenType.GROUP_OPEN); return token; }
+{GroupClose}             { token.reset(RTFTokenType.GROUP_CLOSE); return token; }
+{CrLf}                   { token.reset(RTFTokenType.CRLF); return token; }
+
+/* Text: one char at a time. Uses yycharat(0) to avoid String allocation. */
+[^\\\{\}\r\n]            { token.setChar(RTFTokenType.TEXT, yycharat(0)); return token; }
+
+/* Fallback: a lone backslash that starts none of the rules above, i.e. the very
+   last character of truncated input. There is nothing to escape, so drop it and
+   keep scanning (the next match is EOF) instead of failing with "could not match input". */
+"\\"                     { /* dangling backslash at end of input: ignored */ }
+
+<<EOF>>                  { token.reset(RTFTokenType.EOF); return token; }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandlerTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandlerTest.java
new file mode 100644
index 0000000000..32b8ae58f9
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFEmbeddedHandlerTest.java
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Tests for {@link RTFEmbeddedHandler} driven by the JFlex tokenizer,
+ * both standalone and integrated into the decapsulator.
+ */
+public class RTFEmbeddedHandlerTest {
+
+    /**
+     * Builds a ParseContext whose EmbeddedDocumentExtractor accepts every
+     * embedded document and records a copy of its metadata in {@code extracted}.
+     */
+    private static ParseContext buildContext(List<Metadata> extracted) {
+        ParseContext context = new ParseContext();
+        context.set(EmbeddedDocumentExtractor.class, new EmbeddedDocumentExtractor() {
+            @Override
+            public boolean shouldParseEmbedded(Metadata metadata) {
+                return true;
+            }
+
+            @Override
+            public void parseEmbedded(TikaInputStream stream, ContentHandler handler,
+                                      Metadata metadata, ParseContext parseContext,
+                                      boolean outputHtml) {
+                // Copy, because the handler may keep mutating the original instance.
+                Metadata copy = new Metadata();
+                for (String name : metadata.names()) {
+                    for (String val : metadata.getValues(name)) {
+                        copy.add(name, val);
+                    }
+                }
+                extracted.add(copy);
+            }
+        });
+        return context;
+    }
+
+    /**
+     * Process an RTF file through the tokenizer + state + embedded handler directly.
+     *
+     * @return the metadata of every embedded document handed to the extractor
+     * @throws IOException if the resource is missing from the classpath or cannot be read
+     */
+    private List<Metadata> extractEmbeddedDirect(String resourceName)
+            throws IOException, SAXException, TikaException {
+        List<Metadata> extracted = new ArrayList<>();
+        ParseContext context = buildContext(extracted);
+        ContentHandler handler = new DefaultHandler();
+        RTFEmbeddedHandler embHandler = new RTFEmbeddedHandler(handler, context, 20 * 1024);
+        RTFState state = new RTFState();
+
+        String path = "/test-documents/" + resourceName;
+        InputStream is = getClass().getResourceAsStream(path);
+        if (is == null) {
+            // Fail with a readable message instead of an NPE inside InputStreamReader.
+            throw new IOException("test resource not found on classpath: " + path);
+        }
+        // Closing the reader also closes the underlying stream.
+        try (Reader reader = new InputStreamReader(is, StandardCharsets.US_ASCII)) {
+
+            RTFTokenizer tokenizer = new RTFTokenizer(reader);
+            RTFToken tok;
+
+            while ((tok = tokenizer.yylex()) != null) {
+                if (tok.getType() == RTFTokenType.EOF) {
+                    break;
+                }
+                boolean consumed = state.processToken(tok);
+                if (!consumed) {
+                    RTFGroupState closingGroup =
+                            (tok.getType() == RTFTokenType.GROUP_CLOSE)
+                                    ? state.getLastClosedGroup() : null;
+                    embHandler.processToken(tok, state, closingGroup);
+                }
+            }
+        }
+        return extracted;
+    }
+
+    @Test
+    public void testEmbeddedFiles() throws Exception {
+        List<Metadata> embedded = extractEmbeddedDirect("testRTFEmbeddedFiles.rtf");
+        assertTrue(embedded.size() > 0,
+                "should extract at least one embedded object from testRTFEmbeddedFiles.rtf");
+    }
+
+    @Test
+    public void testPictExtraction() throws Exception {
+        // Verifies the handler doesn't crash on a typical RTF file
+        extractEmbeddedDirect("testRTF.rtf");
+    }
+
+    @Test
+    public void testEmbeddedObjectMetadata() throws Exception {
+        List<Metadata> embedded = extractEmbeddedDirect("testRTFEmbeddedFiles.rtf");
+        // Assert rather than guard: with nothing extracted the name check below
+        // would otherwise be skipped and the test would pass without testing anything.
+        assertTrue(!embedded.isEmpty(),
+                "should extract at least one embedded object from testRTFEmbeddedFiles.rtf");
+        boolean hasName = false;
+        for (Metadata m : embedded) {
+            String name = m.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+            if (name != null && !name.isEmpty()) {
+                hasName = true;
+                break;
+            }
+        }
+        assertTrue(hasName, "at least one embedded should have a resource name");
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulatorTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulatorTest.java
new file mode 100644
index 0000000000..72235f36a3
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFHtmlDecapsulatorTest.java
@@ -0,0 +1,260 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.IOException;
+
+import org.junit.jupiter.api.Test;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Tests for {@link RTFHtmlDecapsulator}, mirroring the original
+ * RTFEncapsulatedHTMLExtractorTest to verify parity.
+ *
+ * <p>Every test feeds a small hand-written RTF document to the
+ * decapsulator and checks the HTML string that comes back.
+ */
+public class RTFHtmlDecapsulatorTest {
+
+    /** Opening of an encapsulated-HTML fixture that declares code page 1252. */
+    private static final String CP1252_HEADER =
+            "{\\rtf1\\ansi\\ansicpg1252\\fromhtml1 \\deff0\n";
+
+    /**
+     * Runs a fresh {@link RTFHtmlDecapsulator} over raw RTF bytes.
+     *
+     * @param rtfBytes the RTF document; the tests also pass {@code null}
+     *                 and an empty array
+     * @return the decapsulator's result, which the tests below expect to be
+     *         {@code null} for null, empty and non-encapsulated input
+     */
+    private static String extract(byte[] rtfBytes)
+            throws IOException, SAXException, TikaException {
+        return new RTFHtmlDecapsulator(new DefaultHandler(), new ParseContext())
+                .extract(rtfBytes);
+    }
+
+    /**
+     * Encodes an all-ASCII RTF fixture as bytes and decapsulates it.
+     * Deliberately not an overload of {@code extract}, so that
+     * {@code extract(null)} stays unambiguous.
+     */
+    private static String extractAscii(String rtf)
+            throws IOException, SAXException, TikaException {
+        return extract(rtf.getBytes(US_ASCII));
+    }
+
+    @Test
+    public void testNullAndEmpty() throws Exception {
+        assertNull(extract(null));
+        assertNull(extract(new byte[0]));
+    }
+
+    @Test
+    public void testNonEncapsulatedRtf() throws Exception {
+        // No fromhtml marker, so there is no HTML to recover.
+        String rtf = "{\\rtf1\\ansi\\deff0 Hello world}";
+        assertNull(extractAscii(rtf));
+    }
+
+    @Test
+    public void testSimpleEncapsulatedHtml() throws Exception {
+        String rtf = CP1252_HEADER +
+                "{\\*\\htmltag19 <html>}\n" +
+                "{\\*\\htmltag34 <head>}\n" +
+                "{\\*\\htmltag41 </head>}\n" +
+                "{\\*\\htmltag50 <body>}\n" +
+                "\\htmlrtf {\\htmlrtf0\n" +
+                "{\\*\\htmltag64 <p>}\n" +
+                "{\\*\\htmltag84 Hello world}\n" +
+                "{\\*\\htmltag72 </p>}\n" +
+                "\\htmlrtf }\\htmlrtf0\n" +
+                "{\\*\\htmltag58 </body>}\n" +
+                "{\\*\\htmltag27 </html>}\n" +
+                "}";
+        String html = extractAscii(rtf);
+        assertNotNull(html);
+        assertTrue(html.contains("<html>"));
+        assertTrue(html.contains("<p>"));
+        assertTrue(html.contains("Hello world"));
+        assertTrue(html.contains("</html>"));
+    }
+
+    @Test
+    public void testImgCidExtraction() throws Exception {
+        // Representative content ID; it contains no RTF-special characters,
+        // so it must come through the decapsulator unchanged.
+        String cid = "cid:image001.png@01DA1234.5678ABCD";
+        String rtf = CP1252_HEADER +
+                "{\\*\\htmltag19 <html>}\n" +
+                "{\\*\\htmltag50 <body>}\n" +
+                "{\\*\\htmltag84 <img src=\"" + cid + "\">}\n" +
+                "{\\*\\htmltag58 </body>}\n" +
+                "{\\*\\htmltag27 </html>}\n" +
+                "}";
+        String html = extractAscii(rtf);
+        assertNotNull(html);
+        assertTrue(html.contains(cid),
+                "CID reference should be preserved in extracted HTML");
+    }
+
+    @Test
+    public void testParAndTabDecoding() throws Exception {
+        String rtf = CP1252_HEADER +
+                "{\\*\\htmltag241 <style>}\n" +
+                "{\\*\\htmltag241 body \\{\\par \\tab color: red;\\par \\}}\n" +
+                "{\\*\\htmltag249 </style>}\n" +
+                "}";
+        String html = extractAscii(rtf);
+        assertNotNull(html);
+        assertTrue(html.contains("<style>"));
+        assertTrue(html.contains("body {"));
+        assertTrue(html.contains("\tcolor: red;"));
+        assertTrue(html.contains("</style>"));
+    }
+
+    @Test
+    public void testHexEscapeDecoding() throws Exception {
+        // \'e9 = 0xE9 = e with acute (U+00E9) in windows-1252
+        String rtf = CP1252_HEADER +
+                "{\\*\\htmltag84 caf\\'e9}\n" +
+                "}";
+        String html = extractAscii(rtf);
+        assertNotNull(html);
+        assertEquals("caf\u00e9", html);
+    }
+
+    @Test
+    public void testMultiByteHexEscape() throws Exception {
+        // Two consecutive hex escapes: \'fc = u with diaeresis (U+00FC) and
+        // \'df = sharp s (U+00DF) in windows-1252
+        String rtf = CP1252_HEADER +
+                "{\\*\\htmltag84 gr\\'fc\\'dfe}\n" +
+                "}";
+        String html = extractAscii(rtf);
+        assertNotNull(html);
+        assertEquals("gr\u00fc\u00dfe", html);
+    }
+
+    @Test
+    public void testCodePage1254Turkish() throws Exception {
+        // \'fd in windows-1254 = 0xFD, decoded by Java's windows-1254 charset
+        // (dotless i, U+0131, rather than the y with acute of windows-1252)
+        String rtf = "{\\rtf1\\ansi\\ansicpg1254\\fromhtml1 \\deff0\n" +
+                "{\\*\\htmltag84 Say\\'fdn}\n" +
+                "}";
+        String html = extractAscii(rtf);
+        assertNotNull(html);
+        // Verify the byte 0xFD is decoded through windows-1254
+        byte[] expected = new byte[] { 'S', 'a', 'y', (byte) 0xFD, 'n' };
+        assertEquals(new String(expected, java.nio.charset.Charset.forName("windows-1254")),
+                html);
+    }
+
+    @Test
+    public void testHtmlrtfSkipping() throws Exception {
+        // Text between the htmlrtf on/off switches is RTF-only and must not
+        // appear in the recovered HTML.
+        String rtf = CP1252_HEADER +
+                "{\\*\\htmltag84 Hello}\n" +
+                "\\htmlrtf {\\b bold rtf only}\\htmlrtf0\n" +
+                "{\\*\\htmltag84  World}\n" +
+                "}";
+        String html = extractAscii(rtf);
+        assertNotNull(html);
+        assertEquals("Hello World", html);
+    }
+
+    @Test
+    public void testEscapedBracesAndBackslash() throws Exception {
+        String rtf = CP1252_HEADER +
+                "{\\*\\htmltag241 a \\{ b \\} c \\\\d}\n" +
+                "}";
+        String html = extractAscii(rtf);
+        assertNotNull(html);
+        assertEquals("a { b } c \\d", html);
+    }
+
+    @Test
+    public void testEmptyHtmltag() throws Exception {
+        // An htmltag group with no payload contributes nothing to the output.
+        String rtf = CP1252_HEADER +
+                "{\\*\\htmltag72}\n" +
+                "{\\*\\htmltag84 text}\n" +
+                "}";
+        String html = extractAscii(rtf);
+        assertNotNull(html);
+        assertEquals("text", html);
+    }
+
+    @Test
+    public void testInterTagTextContent() throws Exception {
+        String rtf = CP1252_HEADER +
+                "{\\*\\htmltag19 <html>}\n" +
+                "{\\*\\htmltag50 <body>}\n" +
+                "{\\*\\htmltag64 <p>}\n" +
+                "\\htmlrtf {\\htmlrtf0\n" +
+                "Hello from the message body\n" +
+                "\\htmlrtf\\par}\\htmlrtf0\n" +
+                "{\\*\\htmltag72 </p>}\n" +
+                "{\\*\\htmltag64 <p>}\n" +
+                "\\htmlrtf {\\htmlrtf0\n" +
+                "Second paragraph\n" +
+                "\\htmlrtf\\par}\\htmlrtf0\n" +
+                "{\\*\\htmltag72 </p>}\n" +
+                "{\\*\\htmltag58 </body>}\n" +
+                "{\\*\\htmltag27 </html>}\n" +
+                "}";
+        String html = extractAscii(rtf);
+        assertNotNull(html);
+        assertTrue(html.contains("<p>"), "should contain HTML tags");
+        assertTrue(html.contains("Hello from the message body"),
+                "should contain inter-tag text content");
+        assertTrue(html.contains("Second paragraph"),
+                "should contain second paragraph text");
+        assertTrue(html.contains("</html>"), "should contain closing tag");
+    }
+
+    @Test
+    public void testInterTagHexEscapes() throws Exception {
+        String rtf = CP1252_HEADER +
+                "{\\*\\htmltag64 <p>}\n" +
+                "\\htmlrtf {\\htmlrtf0\n" +
+                "caf\\'e9\n" +
+                "\\htmlrtf }\\htmlrtf0\n" +
+                "{\\*\\htmltag72 </p>}\n" +
+                "}";
+        String html = extractAscii(rtf);
+        assertNotNull(html);
+        assertTrue(html.contains("caf\u00e9"),
+                "hex escapes in inter-tag text should be decoded");
+    }
+
+    @Test
+    public void testLineControlWord() throws Exception {
+        String rtf = CP1252_HEADER +
+                "{\\*\\htmltag84 line1\\line line2}\n" +
+                "}";
+        String html = extractAscii(rtf);
+        assertNotNull(html);
+        assertEquals("line1<br>line2", html);
+    }
+
+    @Test
+    public void testFontAwareCodePageDecoding() throws Exception {
+        // f0 = ANSI (fcharset 0 = windows-1252), f1 = Greek (fcharset 161 = cp1253)
+        // \'e1 in windows-1252 = U+00E1 (a with acute)
+        // \'e1 in cp1253 = U+03B1 (GREEK SMALL LETTER ALPHA)
+        String rtf = CP1252_HEADER +
+                "{\\fonttbl{\\f0\\fcharset0 Times;}{\\f1\\fcharset161 Greek;}}\n" +
+                "{\\*\\htmltag84 \\f0 caf\\'e9}\n" +
+                "{\\*\\htmltag84 \\f1 \\'e1}\n" +
+                "}";
+        String html = extractAscii(rtf);
+        assertNotNull(html);
+        // f0: \'e9 in windows-1252 = e with acute
+        assertTrue(html.contains("caf\u00e9"), "f0 should decode as windows-1252");
+        // f1: \'e1 in cp1253 = Greek alpha
+        assertTrue(html.contains("\u03b1"), "f1 should decode as cp1253 (Greek)");
+    }
+
+    @Test
+    public void testUnicodeEscapeWithAnsiShadow() throws Exception {
+        // \u8212 is em dash (U+2014). The \'97 is the ANSI shadow and should be skipped.
+        String rtf = CP1252_HEADER +
+                "{\\fonttbl{\\f0\\fcharset0 Times;}}\n" +
+                "{\\*\\htmltag84 A\\u8212\\'97B}\n" +
+                "}";
+        String html = extractAscii(rtf);
+        assertNotNull(html);
+        assertEquals("A\u2014B", html);
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFStateTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFStateTest.java
new file mode 100644
index 0000000000..7595c8342e
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFStateTest.java
@@ -0,0 +1,250 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import java.io.StringReader;
+import java.nio.charset.Charset;
+
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests for {@link RTFState}, driven by feeding it tokens produced by
+ * {@link RTFTokenizer} from small inline RTF documents.
+ */
+public class RTFStateTest {
+
+    /**
+     * Tokenizes {@code rtf} and pushes every token before EOF through a
+     * fresh {@link RTFState}.
+     *
+     * @param rtf the RTF source text
+     * @return the state as it stands after the last token was processed
+     */
+    private RTFState processRtf(String rtf) throws Exception {
+        RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(rtf));
+        RTFState state = new RTFState();
+        RTFToken tok;
+        while ((tok = tokenizer.yylex()) != null) {
+            if (tok.getType() == RTFTokenType.EOF) {
+                break;
+            }
+            state.processToken(tok);
+        }
+        return state;
+    }
+
+    @Test
+    public void testGlobalCharsetFromAnsicpg() throws Exception {
+        RTFState state = processRtf("{\\rtf1\\ansi\\ansicpg1251}");
+        assertEquals(Charset.forName("CP1251"), state.getGlobalCharset());
+    }
+
+    @Test
+    public void testGlobalCharsetDefaultWindows1252() throws Exception {
+        RTFState state = processRtf("{\\rtf1\\ansi}");
+        assertEquals(RTFCharsetMaps.WINDOWS_1252, state.getGlobalCharset());
+    }
+
+    @Test
+    public void testGlobalCharsetPca() throws Exception {
+        RTFState state = processRtf("{\\rtf1\\pca}");
+        assertEquals(Charset.forName("cp850"), state.getGlobalCharset());
+    }
+
+    @Test
+    public void testGlobalCharsetPc() throws Exception {
+        RTFState state = processRtf("{\\rtf1\\pc}");
+        assertEquals(Charset.forName("cp437"), state.getGlobalCharset());
+    }
+
+    @Test
+    public void testGlobalCharsetMac() throws Exception {
+        RTFState state = processRtf("{\\rtf1\\mac}");
+        assertEquals(Charset.forName("MacRoman"), state.getGlobalCharset());
+    }
+
+    @Test
+    public void testFontTableParsing() throws Exception {
+        // Realistic font table: f0=Times New Roman (ANSI), f1=MS Mincho (Shift_JIS)
+        String rtf = "{\\rtf1\\ansi\\deff0" +
+                "{\\fonttbl" +
+                "{\\f0\\froman\\fcharset0 Times New Roman;}" +
+                "{\\f1\\fnil\\fcharset128 MS Mincho;}" +
+                "}" +
+                "\\f0 Hello}";
+        RTFState state = processRtf(rtf);
+
+        // fcharset 0 = ANSI = WINDOWS-1252
+        assertEquals(RTFCharsetMaps.WINDOWS_1252, state.getFontToCharset().get(0));
+        // fcharset 128 = Shift JIS = MS932
+        assertEquals(Charset.forName("MS932"), state.getFontToCharset().get(1));
+    }
+
+    @Test
+    public void testCurrentCharsetFollowsFont() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\deff0" +
+                "{\\fonttbl" +
+                "{\\f0\\froman\\fcharset0 Times;}" +
+                "{\\f1\\fnil\\fcharset161 Greek;}" +
+                "}" +
+                "\\f1 text}";
+        RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(rtf));
+        RTFState state = new RTFState();
+        Charset charsetAtText = null;
+
+        RTFToken tok;
+        while ((tok = tokenizer.yylex()) != null) {
+            if (tok.getType() == RTFTokenType.EOF) {
+                break;
+            }
+            state.processToken(tok);
+            // Capture charset when we see the first body text char
+            if (tok.getType() == RTFTokenType.TEXT && tok.getChar() == 't'
+                    && charsetAtText == null) {
+                charsetAtText = state.getCurrentCharset();
+            }
+        }
+
+        // Verify font table was populated
+        assertEquals(2, state.getFontToCharset().size());
+        assertEquals(Charset.forName("cp1253"), state.getFontToCharset().get(1));
+
+        // After \f1, charset should be cp1253 (Greek)
+        assertNotNull(charsetAtText);
+        assertEquals(Charset.forName("cp1253"), charsetAtText);
+    }
+
+    @Test
+    public void testCurrentCharsetFallsBackToGlobal() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1254\\deff0" +
+                "{\\fonttbl" +
+                "{\\f0\\froman\\fcharset0 Times;}" +
+                "}" +
+                "\\f0 text}";
+        RTFState state = processRtf(rtf);
+
+        // fcharset 0 = WINDOWS-1252 (ANSI)
+        // NOTE(review): the expected value is the charset mapped from f0, not
+        // the cp1254 declared by ansicpg1254, so the method name looks
+        // misleading -- confirm the intent.
+        assertEquals(RTFCharsetMaps.WINDOWS_1252, state.getCurrentCharset());
+    }
+
+    @Test
+    public void testDefaultFontCharset() throws Exception {
+        // \deff1 sets default font to f1, which maps to fcharset 162 (Turkish = cp1254)
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\deff1" +
+                "{\\fonttbl" +
+                "{\\f0\\froman\\fcharset0 Times;}" +
+                "{\\f1\\fnil\\fcharset162 Arial;}" +
+                "}" +
+                "\\pard text}";
+        RTFState state = processRtf(rtf);
+
+        // No explicit \fN in body, so should fall back to deff1 -> fcharset 162 -> cp1254
+        assertEquals(Charset.forName("cp1254"), state.getCurrentCharset());
+    }
+
+    @Test
+    public void testUcSkipInherited() throws Exception {
+        // RTF uc control word sets skip count to 2, inherited by child groups
+        // We process token-by-token and check inside the inner group
+        String rtf = "{\\rtf1\\ansi\\uc2{inner}}";
+        RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(rtf));
+        RTFState state = new RTFState();
+
+        int ucSkipInInnerGroup = -1;
+        boolean seenInnerText = false;
+        RTFToken tok;
+        while ((tok = tokenizer.yylex()) != null) {
+            if (tok.getType() == RTFTokenType.EOF) {
+                break;
+            }
+            state.processToken(tok);
+            // Check ucSkip when we see the first char of "inner"
+            if (tok.getType() == RTFTokenType.TEXT && tok.getChar() == 'i'
+                    && !seenInnerText) {
+                ucSkipInInnerGroup = state.getCurrentGroup().ucSkip;
+                seenInnerText = true;
+            }
+        }
+        // Inside {inner}, ucSkip should be inherited as 2 from parent
+        assertEquals(2, ucSkipInInnerGroup);
+    }
+
+    @Test
+    public void testAnsiSkipAfterUnicode() throws Exception {
+        // After \u8212, the next ucSkip (default 1) ANSI chars should be skipped
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252" +
+                "{\\fonttbl{\\f0\\fcharset0 Times;}}" +
+                "\\f0 A\\u8212\\'97B}";
+        RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(rtf));
+        RTFState state = new RTFState();
+        StringBuilder textOutput = new StringBuilder();
+
+        RTFToken tok;
+        while ((tok = tokenizer.yylex()) != null) {
+            if (tok.getType() == RTFTokenType.EOF) {
+                break;
+            }
+            // Only tokens the state did not consume, outside ignored groups,
+            // contribute to the reconstructed text.
+            boolean consumed = state.processToken(tok);
+            if (!consumed && !state.getCurrentGroup().ignore) {
+                if (tok.getType() == RTFTokenType.TEXT) {
+                    textOutput.append(tok.getChar());
+                } else if (tok.getType() == RTFTokenType.UNICODE_ESCAPE) {
+                    int cp = tok.getParameter();
+                    if (Character.isValidCodePoint(cp)) {
+                        textOutput.appendCodePoint(cp);
+                    }
+                }
+            }
+        }
+        // A + \u8212 (em dash) + B.  The \'97 should be skipped as unicode shadow.
+        assertEquals("A\u2014B", textOutput.toString());
+    }
+
+    @Test
+    public void testGroupStateRestored() throws Exception {
+        // Font change inside a group should be reverted when group closes
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252\\deff0" +
+                "{\\fonttbl" +
+                "{\\f0\\fcharset0 Times;}" +
+                "{\\f1\\fcharset161 Greek;}" +
+                "}" +
+                "\\f0 {\\f1 greek}{back to times}}";
+        RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(rtf));
+        RTFState state = new RTFState();
+
+        Charset charsetInsideGroup = null;
+        Charset charsetAfterGroup = null;
+        boolean seenGreekGroup = false;
+
+        RTFToken tok;
+        while ((tok = tokenizer.yylex()) != null) {
+            if (tok.getType() == RTFTokenType.EOF) {
+                break;
+            }
+            state.processToken(tok);
+
+            if (tok.getType() == RTFTokenType.TEXT) {
+                char ch = tok.getChar();
+                // 'g' first occurs in "greek" (inside the f1 group); 'b' only
+                // occurs in "back to times" (after that group has closed).
+                if (ch == 'g' && !seenGreekGroup) {
+                    charsetInsideGroup = state.getCurrentCharset();
+                    seenGreekGroup = true;
+                } else if (ch == 'b') {
+                    charsetAfterGroup = state.getCurrentCharset();
+                }
+            }
+        }
+
+        assertNotNull(charsetInsideGroup);
+        assertNotNull(charsetAfterGroup);
+        // Inside the {\f1 ...} group, charset should be Greek (cp1253)
+        assertEquals(Charset.forName("cp1253"), charsetInsideGroup);
+        // After the group closes, should be back to f0 (WINDOWS-1252)
+        assertEquals(RTFCharsetMaps.WINDOWS_1252, charsetAfterGroup);
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizerTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizerTest.java
new file mode 100644
index 0000000000..b5d96178f4
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/jflex/RTFTokenizerTest.java
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.rtf.jflex;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests for {@link RTFTokenizer}: each test tokenizes a short RTF fragment
+ * and checks the type, name, parameter or character of the tokens produced.
+ */
+public class RTFTokenizerTest {
+
+    /**
+     * Collects every token before EOF from {@code input}.
+     *
+     * @param input the RTF fragment to tokenize
+     * @return independent copies of the tokens, in order
+     */
+    private List<RTFToken> tokenize(String input) throws Exception {
+        RTFTokenizer tokenizer = new RTFTokenizer(new StringReader(input));
+        List<RTFToken> tokens = new ArrayList<>();
+        RTFToken tok;
+        while ((tok = tokenizer.yylex()) != null) {
+            RTFTokenType type = tok.getType();
+            if (type == RTFTokenType.EOF) {
+                break;
+            }
+            // copy token since it's reused
+            RTFToken copy = new RTFToken();
+            if (type == RTFTokenType.TEXT || type == RTFTokenType.CONTROL_SYMBOL) {
+                copy.setChar(type, tok.getChar());
+            } else {
+                copy.set(type, tok.getName(), tok.getParameter(), tok.hasParameter());
+            }
+            tokens.add(copy);
+        }
+        return tokens;
+    }
+
+    @Test
+    public void testGroupOpenClose() throws Exception {
+        List<RTFToken> tokens = tokenize("{}");
+        assertEquals(2, tokens.size());
+        assertEquals(RTFTokenType.GROUP_OPEN, tokens.get(0).getType());
+        assertEquals(RTFTokenType.GROUP_CLOSE, tokens.get(1).getType());
+    }
+
+    @Test
+    public void testControlWord() throws Exception {
+        List<RTFToken> tokens = tokenize("\\rtf1");
+        assertEquals(1, tokens.size());
+        assertEquals(RTFTokenType.CONTROL_WORD, tokens.get(0).getType());
+        assertEquals("rtf", tokens.get(0).getName());
+        assertEquals(1, tokens.get(0).getParameter());
+        assertTrue(tokens.get(0).hasParameter());
+    }
+
+    @Test
+    public void testControlWordNoParam() throws Exception {
+        List<RTFToken> tokens = tokenize("\\ansi");
+        assertEquals(1, tokens.size());
+        assertEquals(RTFTokenType.CONTROL_WORD, tokens.get(0).getType());
+        assertEquals("ansi", tokens.get(0).getName());
+        assertFalse(tokens.get(0).hasParameter());
+    }
+
+    @Test
+    public void testControlWordNegativeParam() throws Exception {
+        // A negative unicode-escape parameter is reported as an unsigned
+        // 16-bit value.
+        List<RTFToken> tokens = tokenize("\\u-4321");
+        assertEquals(1, tokens.size());
+        assertEquals(RTFTokenType.UNICODE_ESCAPE, tokens.get(0).getType());
+        // -4321 → 65536 - 4321 = 61215
+        assertEquals(61215, tokens.get(0).getParameter());
+    }
+
+    @Test
+    public void testHexEscape() throws Exception {
+        List<RTFToken> tokens = tokenize("\\'e9");
+        assertEquals(1, tokens.size());
+        assertEquals(RTFTokenType.HEX_ESCAPE, tokens.get(0).getType());
+        assertEquals(0xe9, tokens.get(0).getHexValue());
+    }
+
+    @Test
+    public void testUnicodeEscape() throws Exception {
+        List<RTFToken> tokens = tokenize("\\u8212");
+        assertEquals(1, tokens.size());
+        assertEquals(RTFTokenType.UNICODE_ESCAPE, tokens.get(0).getType());
+        assertEquals(8212, tokens.get(0).getParameter());
+    }
+
+    @Test
+    public void testBinControl() throws Exception {
+        List<RTFToken> tokens = tokenize("\\bin1024");
+        assertEquals(1, tokens.size());
+        assertEquals(RTFTokenType.BIN, tokens.get(0).getType());
+        assertEquals(1024, tokens.get(0).getParameter());
+    }
+
+    @Test
+    public void testControlSymbol() throws Exception {
+        List<RTFToken> tokens = tokenize("\\~");
+        assertEquals(1, tokens.size());
+        assertEquals(RTFTokenType.CONTROL_SYMBOL, tokens.get(0).getType());
+        assertEquals('~', tokens.get(0).getChar());
+    }
+
+    @Test
+    public void testEscapedBraces() throws Exception {
+        List<RTFToken> tokens = tokenize("\\{\\}\\\\");
+        assertEquals(3, tokens.size());
+        assertEquals(RTFTokenType.CONTROL_SYMBOL, tokens.get(0).getType());
+        assertEquals('{', tokens.get(0).getChar());
+        assertEquals(RTFTokenType.CONTROL_SYMBOL, tokens.get(1).getType());
+        assertEquals('}', tokens.get(1).getChar());
+        assertEquals(RTFTokenType.CONTROL_SYMBOL, tokens.get(2).getType());
+        assertEquals('\\', tokens.get(2).getChar());
+    }
+
+    @Test
+    public void testText() throws Exception {
+        List<RTFToken> tokens = tokenize("Hello");
+        assertEquals(5, tokens.size()); // one char at a time
+        StringBuilder sb = new StringBuilder();
+        for (RTFToken t : tokens) {
+            assertEquals(RTFTokenType.TEXT, t.getType());
+            sb.append(t.getChar());
+        }
+        assertEquals("Hello", sb.toString());
+    }
+
+    @Test
+    public void testCrLf() throws Exception {
+        // The CR LF pair between the two letters comes back as one CRLF token.
+        List<RTFToken> tokens = tokenize("a\r\nb");
+        assertEquals(3, tokens.size());
+        assertEquals(RTFTokenType.TEXT, tokens.get(0).getType());
+        assertEquals(RTFTokenType.CRLF, tokens.get(1).getType());
+        assertEquals(RTFTokenType.TEXT, tokens.get(2).getType());
+    }
+
+    @Test
+    public void testIgnorableDestination() throws Exception {
+        // {  \*  \htmltag84_  <  p  >  }
+        // The space after \htmltag84 is consumed as the control word delimiter
+        List<RTFToken> tokens = tokenize("{\\*\\htmltag84 <p>}");
+        assertEquals(RTFTokenType.GROUP_OPEN, tokens.get(0).getType());
+        assertEquals(RTFTokenType.CONTROL_SYMBOL, tokens.get(1).getType());
+        assertEquals('*', tokens.get(1).getChar());
+        assertEquals(RTFTokenType.CONTROL_WORD, tokens.get(2).getType());
+        assertEquals("htmltag", tokens.get(2).getName());
+        assertEquals(84, tokens.get(2).getParameter());
+        // remaining tokens are < p > }
+        assertEquals(RTFTokenType.TEXT, tokens.get(3).getType());
+        assertEquals('<', tokens.get(3).getChar());
+        assertEquals(RTFTokenType.TEXT, tokens.get(4).getType());
+        assertEquals('p', tokens.get(4).getChar());
+        assertEquals(RTFTokenType.TEXT, tokens.get(5).getType());
+        assertEquals('>', tokens.get(5).getChar());
+        assertEquals(RTFTokenType.GROUP_CLOSE, tokens.get(6).getType());
+        assertEquals(7, tokens.size());
+    }
+
+    @Test
+    public void testMixedRtf() throws Exception {
+        String rtf = "{\\rtf1\\ansi\\ansicpg1252 Hello}";
+        List<RTFToken> tokens = tokenize(rtf);
+        // { \rtf1 \ansi \ansicpg1252_ H e l l o }
+        // The space after \ansicpg1252 is consumed as the control word
+        // delimiter, so no separate token is produced for it.
+        assertEquals(RTFTokenType.GROUP_OPEN, tokens.get(0).getType());
+        assertEquals(RTFTokenType.CONTROL_WORD, tokens.get(1).getType());
+        assertEquals("rtf", tokens.get(1).getName());
+        assertEquals(1, tokens.get(1).getParameter());
+        assertEquals(RTFTokenType.CONTROL_WORD, tokens.get(2).getType());
+        assertEquals("ansi", tokens.get(2).getName());
+        assertEquals(RTFTokenType.CONTROL_WORD, tokens.get(3).getType());
+        assertEquals("ansicpg", tokens.get(3).getName());
+        assertEquals(1252, tokens.get(3).getParameter());
+        // Body text starts right after the control words and the group
+        // close is the last token.
+        assertEquals(RTFTokenType.TEXT, tokens.get(4).getType());
+        assertEquals('H', tokens.get(4).getChar());
+        assertEquals(RTFTokenType.GROUP_CLOSE, tokens.get(tokens.size() - 1).getType());
+        assertEquals(10, tokens.size());
+    }
+}

(tika) branch main updated: TIKA-4710-rtf-attachments-in-html-decapsulation (#2744)

Reply via email to