janhoy commented on code in PR #3670:
URL: https://github.com/apache/solr/pull/3670#discussion_r2410800305


##########
solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java:
##########
@@ -131,169 +105,192 @@ public void load(
       ContentStream stream,
       UpdateRequestProcessor processor)
       throws Exception {
-    Parser parser = null;
     String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
-    if (streamType != null) {
-      // Cache?  Parsers are lightweight to construct and thread-safe, so I'm told
-      MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
-      parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
-    } else {
-      parser = autoDetectParser;
-    }
-    if (parser != null) {
-      Metadata metadata = new Metadata();
-
-      // If you specify the resource name (the filename, roughly) with this parameter,
-      // then Tika can make use of it in guessing the appropriate MIME type:
-      String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
-      if (resourceName != null) {
-        metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
-      }
-      // Provide stream's content type as hint for auto detection
-      if (stream.getContentType() != null) {
-        metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
-      }
+    String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
 
-      try (InputStream inputStream = stream.getStream()) {
-        metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
-        metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
-        metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
-        metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
-        // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
-        String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
-        if (charset != null) {
-          metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
+    try (InputStream inputStream = stream.getStream()) {
+      String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
+
+      String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
+      boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
+      boolean recursive = params.getBool(ExtractingParams.RECURSIVE, false);
+      String extractFormat =
+          params.get(ExtractingParams.EXTRACT_FORMAT, extractOnly ? XML_FORMAT : TEXT_FORMAT);
+
+      // Parse optional passwords file into a map
+      LinkedHashMap<Pattern, String> pwMap = null;
+      String passwordsFile = params.get("passwordsFile");
+      if (passwordsFile != null) {
+        try (java.io.InputStream is = core.getResourceLoader().openResource(passwordsFile)) {
+          pwMap = RegexRulesPasswordProvider.parseRulesFile(is);
         }
+      }
 
-        String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
-        boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
-        SolrContentHandler handler =
-            factory.createSolrContentHandler(metadata, params, req.getSchema());
-        ContentHandler parsingHandler = handler;
-
-        StringWriter writer = null;
-        BaseMarkupSerializer serializer = null;
-        if (extractOnly == true) {
-          String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
-          writer = new StringWriter();
-          if (extractFormat.equals(TEXT_FORMAT)) {
-            serializer = new TextSerializer();
-            serializer.setOutputCharStream(writer);
-            serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
-          } else {
-            serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
-          }
-          if (xpathExpr != null) {
-            Matcher matcher = PARSER.parse(xpathExpr);
-            serializer
-                .startDocument(); // The MatchingContentHandler does not invoke startDocument.  See
-            // https://lists.apache.org/thread.html/5ec63e104e564a2363e45f74d5aced6520b7d32b4b625762ef56cb86%401226775505%40%3Cdev.tika.apache.org%3E
-            parsingHandler = new MatchingContentHandler(serializer, matcher);
-          } else {
-            parsingHandler = serializer;
-          }
-        } else if (xpathExpr != null) {
-          Matcher matcher = PARSER.parse(xpathExpr);
-          parsingHandler = new MatchingContentHandler(handler, matcher);
-        } // else leave it as is
+      ExtractionRequest extractionRequest =
+          new ExtractionRequest(
+              streamType,
+              resourceName,
+              stream.getContentType(),
+              charset,
+              stream.getName(),
+              stream.getSourceInfo(),
+              stream.getSize(),
+              params.get(ExtractingParams.RESOURCE_PASSWORD, null),
+              pwMap,
+              extractFormat,
+              recursive,
+              Collections.emptyMap());
+
+      boolean captureAttr = params.getBool(ExtractingParams.CAPTURE_ATTRIBUTES, false);
+      String[] captureElems = params.getParams(ExtractingParams.CAPTURE_ELEMENTS);
+      boolean needsSaxParsing =
+          extractOnly
+              || xpathExpr != null
+              || captureAttr
+              || (captureElems != null && captureElems.length > 0)
+              || (params.get(ExtractingParams.RESOURCE_PASSWORD) != null)
+              || (passwordsFile != null);
 
+      if (extractOnly) {
         try {
-          // potentially use a wrapper handler for parsing, but we still need the SolrContentHandler
-          // for getting the document.
-          ParseContext context = parseContextConfig.create();
-
-          context.set(Parser.class, parser);
-          context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
-
-          // Password handling
-          RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
-          String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
-          if (pwMapFile != null && pwMapFile.length() > 0) {
-            InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
-            if (is != null) {
-              log.debug("Password file supplied: {}", pwMapFile);
-              epp.parse(is);
+          ExtractionMetadata md = backend.buildMetadataFromRequest(extractionRequest);
+          String content;
+          if (ExtractingDocumentLoader.TEXT_FORMAT.equals(extractionRequest.extractFormat)
+              || xpathExpr != null) {
+            content =
+                extractWithHandler(
+                    inputStream, xpathExpr, extractionRequest, md, new ToTextContentHandler());
+          } else { // XML format
+            content =
+                extractWithHandler(
+                    inputStream, xpathExpr, extractionRequest, md, new ToXMLContentHandler());
+            if (!content.startsWith("<?xml")) {
+              content = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + 
content;
             }
           }
-          context.set(PasswordProvider.class, epp);
-          String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
-          if (resourcePassword != null) {
-            epp.setExplicitPassword(resourcePassword);
-            log.debug("Literal password supplied for file {}", resourceName);
+
+          appendBackCompatTikaMetadata(md);
+
+          rsp.add(stream.getName(), content);
+          NamedList<String[]> metadataNL = new NamedList<>();
+          for (String name : md.keySet()) {
+            metadataNL.add(name, md.get(name).toArray(new String[0]));
+          }
+          rsp.add(stream.getName() + "_metadata", metadataNL);
+        } catch (Exception e) {
+          if (ignoreTikaException) {
+            if (log.isWarnEnabled())
+              log.warn("skip extracting text due to {}.", 
e.getLocalizedMessage(), e);
+            return;
           }
-          parser.parse(inputStream, parsingHandler, metadata, context);
-        } catch (TikaException e) {
+          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+        }
+        return;
+      }
+
+      if (needsSaxParsing) {
+        ExtractionMetadata metadata = backend.buildMetadataFromRequest(extractionRequest);
+        SolrContentHandler handler =
+            factory.createSolrContentHandler(metadata, params, req.getSchema());
+        try {
+          backend.extractWithSaxHandler(inputStream, extractionRequest, metadata, handler);
+        } catch (Exception e) {
           if (ignoreTikaException) {
             if (log.isWarnEnabled()) {
-              log.warn(
-                  "skip extracting text due to {}. metadata={}",
-                  e.getLocalizedMessage(),
-                  metadata,
-                  e);
+              log.warn("skip extracting text due to {}.", 
e.getLocalizedMessage(), e);
+              return;
             }
-          } else {
-            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
           }
+          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
         }
-        if (extractOnly == false) {
-          addDoc(handler);
-        } else {
-          // serializer is not null, so we need to call endDoc on it if using xpath
-          if (xpathExpr != null) {
-            serializer.endDocument();
-          }
-          rsp.add(stream.getName(), writer.toString());
-          writer.close();
-          String[] names = metadata.names();
-          NamedList<String[]> metadataNL = new NamedList<>();
-          for (int i = 0; i < names.length; i++) {
-            String[] vals = metadata.getValues(names[i]);
-            metadataNL.add(names[i], vals);
-          }
-          rsp.add(stream.getName() + "_metadata", metadataNL);
+        appendBackCompatTikaMetadata(handler.metadata);
+
+        addDoc(handler);
+        return;
+      }
+
+      ExtractionResult result;
+      try {
+        result = backend.extract(inputStream, extractionRequest);
+      } catch (Exception e) {
+        if (ignoreTikaException) {
+          if (log.isWarnEnabled())
+            log.warn("skip extracting text due to {}.", 
e.getLocalizedMessage(), e);
+          return;
         }
-      } catch (SAXException e) {
         throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
       }
-    } else {
-      throw new SolrException(
-          SolrException.ErrorCode.BAD_REQUEST,
-          "Stream type of "
-              + streamType
-              + " didn't match any known parsers.  Please supply the "
-              + ExtractingParams.STREAM_TYPE
-              + " parameter.");
+
+      ExtractionMetadata metadata = result.getMetadata();
+
+      appendBackCompatTikaMetadata(metadata);
+
+      SolrContentHandler handler =
+          factory.createSolrContentHandler(metadata, params, req.getSchema());
+      handler.appendToContent(result.getContent());
+      addDoc(handler);
     }
   }
 
-  public static class MostlyPassthroughHtmlMapper implements HtmlMapper {
-    public static final HtmlMapper INSTANCE = new MostlyPassthroughHtmlMapper();
-
-    /**
-     * Keep all elements and their content.
-     *
-     * <p>Apparently &lt;SCRIPT&gt; and &lt;STYLE&gt; elements are blocked elsewhere
-     */
-    @Override
-    public boolean isDiscardElement(String name) {
-      return false;
+  /*
+   * Extracts content from the given input stream using an optional XPath expression
+   * and a SAX content handler. The extraction process may filter content based on
+   * the XPath expression, if provided.
+   */
+  private String extractWithHandler(
+      InputStream inputStream,
+      String xpathExpr,
+      ExtractionRequest extractionRequest,
+      ExtractionMetadata md,
+      DefaultHandler ch)
+      throws Exception {
+    if (xpathExpr != null) {
+      org.apache.tika.sax.xpath.XPathParser xparser =
+          new org.apache.tika.sax.xpath.XPathParser(
+              "xhtml", org.apache.tika.sax.XHTMLContentHandler.XHTML);
+      org.apache.tika.sax.xpath.Matcher matcher = xparser.parse(xpathExpr);
+      ch = new org.apache.tika.sax.xpath.MatchingContentHandler(ch, matcher);
     }
+    backend.extractWithSaxHandler(inputStream, extractionRequest, md, ch);
+    return ch.toString();
+  }
+
+  private final Map<String, String> fieldMappings = new LinkedHashMap<>();
 
-    /** Lowercases the attribute name */
-    @Override
-    public String mapSafeAttribute(String elementName, String attributeName) {
-      return attributeName.toLowerCase(Locale.ENGLISH);
+  // TODO: Improve backward compatibility by adding more mappings

Review Comment:
   I guess there may be other mappings one could add for back-compat, but this was added mostly to get the tests passing easily without modifying the test itself. Do we believe a back-compat mode is useful?
   
   NB: Currently the `extraction.metadataBackCompat` param is not parsed properly when it is configured in solrconfig.xml rather than on the update request.
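   
   For illustration, a minimal sketch (not this PR's code) of what a mapping-driven back-compat shim could look like. It assumes `ExtractionMetadata` exposes `get`/`add` in line with the `keySet()`/`get()` usage in the diff above, and the mapping contents are purely hypothetical:
   
   ```java
   // Hypothetical sketch: copy values stored under new-style metadata keys to
   // the legacy Tika key names that existing schemas/tests may still expect.
   // ExtractionMetadata#add is assumed here; the mappings are illustrative.
   private void appendBackCompatTikaMetadata(ExtractionMetadata md) {
     for (Map.Entry<String, String> mapping : fieldMappings.entrySet()) {
       String newKey = mapping.getKey();
       String legacyKey = mapping.getValue();
       List<String> values = md.get(newKey);
       if (values != null && md.get(legacyKey) == null) {
         for (String value : values) {
           md.add(legacyKey, value); // expose the value under the old name too
         }
       }
     }
   }
   ```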

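   Re the NB: one possible way to honor the flag from solrconfig.xml as well would be to capture a handler-level default from the init args and let per-request params override it. A rough sketch, with illustrative names:
   
   ```java
   // Hypothetical sketch, not this PR's code: a default captured at init time
   // lets solrconfig.xml enable the mode when the update request does not
   // pass the parameter explicitly. Field and param names are illustrative.
   private boolean metadataBackCompatDefault;
   
   @Override
   public void init(NamedList<?> args) {
     super.init(args);
     Object v = args.get("extraction.metadataBackCompat");
     metadataBackCompatDefault = v != null && Boolean.parseBoolean(v.toString());
   }
   
   // ... then at load() time:
   boolean backCompat =
       params.getBool("extraction.metadataBackCompat", metadataBackCompatDefault);
   ```
   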


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
