Copilot commented on code in PR #3670:
URL: https://github.com/apache/solr/pull/3670#discussion_r2424992757
##########
solr/solrj/src/java/org/apache/solr/common/SolrException.java:
##########
@@ -45,6 +45,7 @@ public enum ErrorCode {
TOO_MANY_REQUESTS(429),
SERVER_ERROR(500),
SERVICE_UNAVAILABLE(503),
Review Comment:
The new GATEWAY_TIMEOUT error code should be documented with a brief comment
explaining when it's used, similar to other error codes in this enum.
```suggestion
SERVICE_UNAVAILABLE(503),
/** Used when a gateway or proxy times out waiting for a response from
an upstream server (HTTP 504). */
```
##########
solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java:
##########
@@ -0,0 +1,336 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.handler.extraction;
+
+import java.io.InputStream;
+import java.net.ConnectException;
+import java.net.SocketTimeoutException;
+import java.nio.channels.ClosedChannelException;
+import java.time.Duration;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.ThreadFactory;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.util.ExecutorUtil;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.SolrNamedThreadFactory;
+import org.apache.tika.sax.BodyContentHandler;
+import org.eclipse.jetty.client.HttpClient;
+import org.eclipse.jetty.client.InputStreamRequestContent;
+import org.eclipse.jetty.client.InputStreamResponseListener;
+import org.eclipse.jetty.client.Request;
+import org.eclipse.jetty.client.Response;
+import org.eclipse.jetty.io.EofException;
+import org.eclipse.jetty.util.thread.ScheduledExecutorScheduler;
+import org.xml.sax.helpers.DefaultHandler;
+
+/** Extraction backend using the Tika Server. It uses a shared Jetty
HttpClient. */
+public class TikaServerExtractionBackend implements ExtractionBackend {
+ private static volatile HttpClient SHARED_CLIENT;
+ private static volatile ExecutorService SHARED_EXECUTOR;
+ private static final Object INIT_LOCK = new Object();
+ private static volatile boolean INITIALIZED = false;
+ private static volatile boolean SHUTDOWN = false;
+ private final String baseUrl;
+ private static final int DEFAULT_TIMEOUT_SECONDS = 3 * 60;
+ private final Duration defaultTimeout;
+ private final TikaServerParser tikaServerResponseParser = new
TikaServerParser();
+ private boolean tikaMetadataCompatibility;
+ private HashMap<String, Object> initArgsMap = new HashMap<>();
+
+ public TikaServerExtractionBackend(String baseUrl) {
+ this(baseUrl, DEFAULT_TIMEOUT_SECONDS, null);
+ }
+
+ public TikaServerExtractionBackend(String baseUrl, int timeoutSeconds,
NamedList<?> initArgs) {
+ if (initArgs != null) {
+ initArgs.toMap(this.initArgsMap);
+ }
+ Object metaCompatObh =
this.initArgsMap.get(ExtractingParams.TIKASERVER_METADATA_COMPATIBILITY);
+ if (metaCompatObh != null) {
+ this.tikaMetadataCompatibility =
Boolean.parseBoolean(metaCompatObh.toString());
+ }
+ if (timeoutSeconds <= 0) {
+ timeoutSeconds = DEFAULT_TIMEOUT_SECONDS;
+ }
+ if (baseUrl.endsWith("/")) {
+ this.baseUrl = baseUrl.substring(0, baseUrl.length() - 1);
+ } else {
+ this.baseUrl = baseUrl;
+ }
+ this.defaultTimeout =
+ Duration.ofSeconds(timeoutSeconds > 0 ? timeoutSeconds :
DEFAULT_TIMEOUT_SECONDS);
+ }
+
+ public static final String NAME = "tikaserver";
+
+ @Override
+ public String name() {
+ return NAME;
+ }
+
+ @Override
+ public ExtractionResult extract(InputStream inputStream, ExtractionRequest
request)
+ throws Exception {
+ try (InputStream tikaResponse = callTikaServer(inputStream, request)) {
+ ExtractionMetadata md = buildMetadataFromRequest(request);
+ BodyContentHandler bodyContentHandler = new BodyContentHandler(-1);
+ if (request.tikaServerRecursive) {
+ tikaServerResponseParser.parseRmetaJson(tikaResponse,
bodyContentHandler, md);
+ } else {
+ tikaServerResponseParser.parseXml(tikaResponse, bodyContentHandler,
md);
+ }
+ if (tikaMetadataCompatibility) {
+ appendBackCompatTikaMetadata(md);
+ }
+ return new ExtractionResult(bodyContentHandler.toString(), md);
+ }
+ }
+
+ @Override
+ public void extractWithSaxHandler(
+ InputStream inputStream,
+ ExtractionRequest request,
+ ExtractionMetadata md,
+ DefaultHandler saxContentHandler)
+ throws Exception {
+ try (InputStream tikaResponse = callTikaServer(inputStream, request)) {
+ if (request.tikaServerRecursive) {
+ tikaServerResponseParser.parseRmetaJson(tikaResponse,
saxContentHandler, md);
+ } else {
+ tikaServerResponseParser.parseXml(tikaResponse, saxContentHandler, md);
+ }
+ if (tikaMetadataCompatibility) {
+ appendBackCompatTikaMetadata(md);
+ }
+ }
+ }
+
+ /**
+ * Call the Tika Server to extract text and metadata. Depending on
<code>request.recursive</code>,
+ * will either return XML (false) or JSON array (true). <b>The recursive
mode consumes more memory
+ * both on the TikaServer side and on the Solr side</b>
+ *
+ * @return InputStream of the response body, either XML or JSON depending on
<code>
+ * request.tikaserverRecursive</code>
+ */
+ InputStream callTikaServer(InputStream inputStream, ExtractionRequest
request) throws Exception {
+ String url = baseUrl + (request.tikaServerRecursive ? "/rmeta" : "/tika");
+
+ ensureClientInitialized();
+ HttpClient client = SHARED_CLIENT;
+
+ Request req = client.newRequest(url).method("PUT");
+ Duration effectiveTimeout =
+ (request.tikaServerTimeoutSeconds != null &&
request.tikaServerTimeoutSeconds > 0)
+ ? Duration.ofSeconds(request.tikaServerTimeoutSeconds)
+ : defaultTimeout;
+ req.timeout(effectiveTimeout.toMillis(), TimeUnit.MILLISECONDS);
+
+ // Headers
+ String accept = (request.tikaServerRecursive ? "application/json" :
"text/xml");
+ req.headers(h -> h.add("Accept", accept));
+ String contentType = (request.streamType != null) ? request.streamType :
request.contentType;
+ if (contentType != null) {
+ req.headers(h -> h.add("Content-Type", contentType));
+ }
+ if (!request.tikaServerRequestHeaders.isEmpty()) {
+ req.headers(
+ h ->
+ request.tikaServerRequestHeaders.forEach(
+ (k, v) -> {
+ if (k != null && v != null) h.add(k, v);
+ }));
+ }
+
+ ExtractionMetadata md = buildMetadataFromRequest(request);
+ if (request.resourcePassword != null || request.passwordsMap != null) {
+ RegexRulesPasswordProvider passwordProvider = new
RegexRulesPasswordProvider();
+ if (request.resourcePassword != null) {
+ passwordProvider.setExplicitPassword(request.resourcePassword);
+ }
+ if (request.passwordsMap != null) {
+ passwordProvider.setPasswordMap(request.passwordsMap);
+ }
+ String pwd = passwordProvider.getPassword(md);
+ if (pwd != null) {
+ req.headers(h -> h.add("Password", pwd)); // Tika Server expects this
header if provided
+ }
+ }
+ if (request.resourceName != null) {
+ req.headers(
+ h ->
+ h.add(
+ "Content-Disposition", "attachment; filename=\"" +
request.resourceName + "\""));
+ }
+
+ if (contentType != null) {
+ req.body(new InputStreamRequestContent(contentType, inputStream));
+ } else {
+ req.body(new InputStreamRequestContent(inputStream));
+ }
+
+ InputStreamResponseListener listener = new InputStreamResponseListener();
+ req.send(listener);
+
+ final Response response;
+ try {
+ response = listener.get(effectiveTimeout.toMillis(),
TimeUnit.MILLISECONDS);
+ } catch (TimeoutException te) {
+ throw new SolrException(
+ SolrException.ErrorCode.GATEWAY_TIMEOUT,
+ "Timeout after "
+ + effectiveTimeout.toMillis()
+ + " ms while waiting for response from TikaServer "
+ + url,
+ te);
+ } catch (InterruptedException ie) {
+ Thread.currentThread().interrupt();
+ throw new SolrException(
+ SolrException.ErrorCode.SERVER_ERROR,
+ "Interrupted while waiting for response from TikaServer " + url,
+ ie);
+ } catch (ExecutionException ee) {
+ Throwable cause = ee.getCause();
+ if (cause instanceof ConnectException
+ || cause instanceof SocketTimeoutException
+ || cause instanceof EofException
+ || cause instanceof ClosedChannelException) {
+ throw new SolrException(
+ SolrException.ErrorCode.SERVICE_UNAVAILABLE,
+ "Error communicating with TikaServer "
+ + url
+ + ": "
+ + cause.getClass().getSimpleName()
+ + ": "
+ + cause.getMessage(),
+ cause);
+ }
+ throw new SolrException(
+ SolrException.ErrorCode.SERVER_ERROR,
+ "Unexpected error while calling TikaServer " + url,
+ ee);
+ }
+
+ int code = response.getStatus();
+ if (code < 200 || code >= 300) {
+ SolrException.ErrorCode errorCode =
SolrException.ErrorCode.getErrorCode(code);
+ String reason = response.getReason();
+ String msg =
+ "TikaServer "
+ + url
+ + " returned status "
+ + code
+ + (reason != null ? " (" + reason + ")" : "");
+ throw new SolrException(errorCode, msg);
+ }
+
+ return listener.getInputStream();
+ }
+
+ private static void ensureClientInitialized() {
+ if (INITIALIZED) return;
+ synchronized (INIT_LOCK) {
+ if (INITIALIZED) return;
+ ThreadFactory tf = new SolrNamedThreadFactory("TikaServerHttpClient");
+ ExecutorService exec = ExecutorUtil.newMDCAwareCachedThreadPool(tf);
+ HttpClient client = new HttpClient();
+ client.setExecutor(exec);
+ client.setScheduler(new
ScheduledExecutorScheduler("TikaServerHttpClient-scheduler", true));
+ try {
+ client.start();
+ } catch (Exception e) {
+ try {
+ exec.shutdownNow();
+ } catch (Throwable ignore) {
+ }
+ throw new SolrException(
+ SolrException.ErrorCode.SERVER_ERROR, "Failed to start shared
Jetty HttpClient", e);
+ }
+ SHARED_EXECUTOR = exec;
+ SHARED_CLIENT = client;
+ INITIALIZED = true;
+ SHUTDOWN = false;
+ }
+ }
+
+ private final Map<String, String> fieldMappings = new LinkedHashMap<>();
+
+ // TODO: Improve backward compatibility by adding more mappings
+ {
+ fieldMappings.put("dc:title", "title");
+ fieldMappings.put("dc:creator", "author");
+ fieldMappings.put("dc:description", "description");
+ fieldMappings.put("dc:subject", "subject");
+ fieldMappings.put("dc:language", "language");
+ fieldMappings.put("dc:publisher", "publisher");
+ fieldMappings.put("dcterms:created", "created");
+ fieldMappings.put("dcterms:modified", "modified");
+ fieldMappings.put("meta:author", "Author");
+ fieldMappings.put("meta:creation-date", "Creation-Date");
+ fieldMappings.put("meta:save-date", "Last-Save-Date");
+ fieldMappings.put("meta:keyword", "Keywords");
+ fieldMappings.put("pdf:docinfo:keywords", "Keywords");
+ }
Review Comment:
[nitpick] The field mappings are hardcoded in an instance initializer block.
Consider moving this to a static final Map or making it configurable to improve
maintainability and allow customization.
##########
solr/modules/extraction/src/test-files/extraction/simple.html:
##########
@@ -10,7 +10,7 @@
Here is some text
</p>
<p>distinct<br/>words</p>
-<div>Here is some text in a div</div>
+<h1>Here is some text in a h1</h1>
Review Comment:
Corrected article usage from 'a h1' to 'an h1' since 'h1' starts with a
vowel sound.
```suggestion
<h1>Here is some text in an h1</h1>
```
##########
solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java:
##########
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.handler.extraction;
+
+import java.io.FilterReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Filters out null character entities (&#0;, &#x0;, etc.) from XML content.
+ *
+ * <p>Removes numeric character entities that resolve to code point 0, such as
<code>&#0;</code> or
+ * <code>&#x0;</code>. Everything else is passed through unchanged.
+ */
+final class XmlSanitizingReader extends FilterReader {
+ private static final Pattern NULL_ENTITY_PATTERN =
+ Pattern.compile("&#(0+|x0+);", Pattern.CASE_INSENSITIVE);
Review Comment:
The regex pattern may not handle all null entity variations correctly. The
pattern `&#(0+|x0+);` would match `&#0+;` but not `&#x0;` or `&#x00;`.
Consider using `&#(0+|x0*);` to properly match zero or more zeros after 'x'.
```suggestion
Pattern.compile("&#(0+|x0*);", Pattern.CASE_INSENSITIVE);
```
##########
solr/modules/extraction/src/test-files/extraction/example.html:
##########
@@ -6,8 +6,8 @@
<p>
Here is some text
</p>
-<div>Here is some text in a div</div>
-<div>This has a <a href="http://www.apache.org">link</a>.</div>
+<h1>a h1 tag</h1>
Review Comment:
Corrected article usage from 'a h1' to 'an h1' since 'h1' starts with a
vowel sound.
```suggestion
<h1>an h1 tag</h1>
```
##########
solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java:
##########
@@ -0,0 +1,336 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.handler.extraction;
+
+import java.io.InputStream;
+import java.net.ConnectException;
+import java.net.SocketTimeoutException;
+import java.nio.channels.ClosedChannelException;
+import java.time.Duration;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.ThreadFactory;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.util.ExecutorUtil;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.SolrNamedThreadFactory;
+import org.apache.tika.sax.BodyContentHandler;
+import org.eclipse.jetty.client.HttpClient;
+import org.eclipse.jetty.client.InputStreamRequestContent;
+import org.eclipse.jetty.client.InputStreamResponseListener;
+import org.eclipse.jetty.client.Request;
+import org.eclipse.jetty.client.Response;
+import org.eclipse.jetty.io.EofException;
+import org.eclipse.jetty.util.thread.ScheduledExecutorScheduler;
+import org.xml.sax.helpers.DefaultHandler;
+
+/** Extraction backend using the Tika Server. It uses a shared Jetty
HttpClient. */
+public class TikaServerExtractionBackend implements ExtractionBackend {
+ private static volatile HttpClient SHARED_CLIENT;
+ private static volatile ExecutorService SHARED_EXECUTOR;
+ private static final Object INIT_LOCK = new Object();
+ private static volatile boolean INITIALIZED = false;
+ private static volatile boolean SHUTDOWN = false;
+ private final String baseUrl;
+ private static final int DEFAULT_TIMEOUT_SECONDS = 3 * 60;
+ private final Duration defaultTimeout;
+ private final TikaServerParser tikaServerResponseParser = new
TikaServerParser();
+ private boolean tikaMetadataCompatibility;
+ private HashMap<String, Object> initArgsMap = new HashMap<>();
+
+ public TikaServerExtractionBackend(String baseUrl) {
+ this(baseUrl, DEFAULT_TIMEOUT_SECONDS, null);
+ }
+
+ public TikaServerExtractionBackend(String baseUrl, int timeoutSeconds,
NamedList<?> initArgs) {
+ if (initArgs != null) {
+ initArgs.toMap(this.initArgsMap);
+ }
+ Object metaCompatObh =
this.initArgsMap.get(ExtractingParams.TIKASERVER_METADATA_COMPATIBILITY);
+ if (metaCompatObh != null) {
+ this.tikaMetadataCompatibility =
Boolean.parseBoolean(metaCompatObh.toString());
+ }
+ if (timeoutSeconds <= 0) {
+ timeoutSeconds = DEFAULT_TIMEOUT_SECONDS;
+ }
+ if (baseUrl.endsWith("/")) {
+ this.baseUrl = baseUrl.substring(0, baseUrl.length() - 1);
+ } else {
+ this.baseUrl = baseUrl;
+ }
+ this.defaultTimeout =
+ Duration.ofSeconds(timeoutSeconds > 0 ? timeoutSeconds :
DEFAULT_TIMEOUT_SECONDS);
+ }
+
+ public static final String NAME = "tikaserver";
+
+ @Override
+ public String name() {
+ return NAME;
+ }
+
+ @Override
+ public ExtractionResult extract(InputStream inputStream, ExtractionRequest
request)
+ throws Exception {
+ try (InputStream tikaResponse = callTikaServer(inputStream, request)) {
+ ExtractionMetadata md = buildMetadataFromRequest(request);
+ BodyContentHandler bodyContentHandler = new BodyContentHandler(-1);
+ if (request.tikaServerRecursive) {
+ tikaServerResponseParser.parseRmetaJson(tikaResponse,
bodyContentHandler, md);
+ } else {
+ tikaServerResponseParser.parseXml(tikaResponse, bodyContentHandler,
md);
+ }
+ if (tikaMetadataCompatibility) {
+ appendBackCompatTikaMetadata(md);
+ }
+ return new ExtractionResult(bodyContentHandler.toString(), md);
+ }
+ }
+
+ @Override
+ public void extractWithSaxHandler(
+ InputStream inputStream,
+ ExtractionRequest request,
+ ExtractionMetadata md,
+ DefaultHandler saxContentHandler)
+ throws Exception {
+ try (InputStream tikaResponse = callTikaServer(inputStream, request)) {
+ if (request.tikaServerRecursive) {
+ tikaServerResponseParser.parseRmetaJson(tikaResponse,
saxContentHandler, md);
+ } else {
+ tikaServerResponseParser.parseXml(tikaResponse, saxContentHandler, md);
+ }
+ if (tikaMetadataCompatibility) {
+ appendBackCompatTikaMetadata(md);
+ }
+ }
+ }
+
+ /**
+ * Call the Tika Server to extract text and metadata. Depending on
<code>request.recursive</code>,
+ * will either return XML (false) or JSON array (true). <b>The recursive
mode consumes more memory
+ * both on the TikaServer side and on the Solr side</b>
+ *
+ * @return InputStream of the response body, either XML or JSON depending on
<code>
+ * request.tikaserverRecursive</code>
+ */
+ InputStream callTikaServer(InputStream inputStream, ExtractionRequest
request) throws Exception {
+ String url = baseUrl + (request.tikaServerRecursive ? "/rmeta" : "/tika");
+
+ ensureClientInitialized();
+ HttpClient client = SHARED_CLIENT;
+
+ Request req = client.newRequest(url).method("PUT");
+ Duration effectiveTimeout =
+ (request.tikaServerTimeoutSeconds != null &&
request.tikaServerTimeoutSeconds > 0)
+ ? Duration.ofSeconds(request.tikaServerTimeoutSeconds)
+ : defaultTimeout;
+ req.timeout(effectiveTimeout.toMillis(), TimeUnit.MILLISECONDS);
+
+ // Headers
+ String accept = (request.tikaServerRecursive ? "application/json" :
"text/xml");
+ req.headers(h -> h.add("Accept", accept));
+ String contentType = (request.streamType != null) ? request.streamType :
request.contentType;
+ if (contentType != null) {
+ req.headers(h -> h.add("Content-Type", contentType));
+ }
+ if (!request.tikaServerRequestHeaders.isEmpty()) {
+ req.headers(
+ h ->
+ request.tikaServerRequestHeaders.forEach(
+ (k, v) -> {
+ if (k != null && v != null) h.add(k, v);
+ }));
+ }
+
+ ExtractionMetadata md = buildMetadataFromRequest(request);
+ if (request.resourcePassword != null || request.passwordsMap != null) {
+ RegexRulesPasswordProvider passwordProvider = new
RegexRulesPasswordProvider();
+ if (request.resourcePassword != null) {
+ passwordProvider.setExplicitPassword(request.resourcePassword);
+ }
+ if (request.passwordsMap != null) {
+ passwordProvider.setPasswordMap(request.passwordsMap);
+ }
+ String pwd = passwordProvider.getPassword(md);
+ if (pwd != null) {
+ req.headers(h -> h.add("Password", pwd)); // Tika Server expects this
header if provided
+ }
+ }
+ if (request.resourceName != null) {
+ req.headers(
+ h ->
+ h.add(
+ "Content-Disposition", "attachment; filename=\"" +
request.resourceName + "\""));
+ }
+
+ if (contentType != null) {
+ req.body(new InputStreamRequestContent(contentType, inputStream));
+ } else {
+ req.body(new InputStreamRequestContent(inputStream));
+ }
+
+ InputStreamResponseListener listener = new InputStreamResponseListener();
+ req.send(listener);
+
+ final Response response;
+ try {
+ response = listener.get(effectiveTimeout.toMillis(),
TimeUnit.MILLISECONDS);
+ } catch (TimeoutException te) {
+ throw new SolrException(
+ SolrException.ErrorCode.GATEWAY_TIMEOUT,
+ "Timeout after "
+ + effectiveTimeout.toMillis()
+ + " ms while waiting for response from TikaServer "
+ + url,
+ te);
+ } catch (InterruptedException ie) {
+ Thread.currentThread().interrupt();
+ throw new SolrException(
+ SolrException.ErrorCode.SERVER_ERROR,
+ "Interrupted while waiting for response from TikaServer " + url,
+ ie);
+ } catch (ExecutionException ee) {
+ Throwable cause = ee.getCause();
+ if (cause instanceof ConnectException
+ || cause instanceof SocketTimeoutException
+ || cause instanceof EofException
+ || cause instanceof ClosedChannelException) {
+ throw new SolrException(
+ SolrException.ErrorCode.SERVICE_UNAVAILABLE,
+ "Error communicating with TikaServer "
+ + url
+ + ": "
+ + cause.getClass().getSimpleName()
+ + ": "
+ + cause.getMessage(),
+ cause);
+ }
+ throw new SolrException(
+ SolrException.ErrorCode.SERVER_ERROR,
+ "Unexpected error while calling TikaServer " + url,
+ ee);
+ }
+
+ int code = response.getStatus();
+ if (code < 200 || code >= 300) {
+ SolrException.ErrorCode errorCode =
SolrException.ErrorCode.getErrorCode(code);
+ String reason = response.getReason();
+ String msg =
+ "TikaServer "
+ + url
+ + " returned status "
+ + code
+ + (reason != null ? " (" + reason + ")" : "");
+ throw new SolrException(errorCode, msg);
+ }
+
+ return listener.getInputStream();
+ }
+
+ private static void ensureClientInitialized() {
+ if (INITIALIZED) return;
+ synchronized (INIT_LOCK) {
+ if (INITIALIZED) return;
+ ThreadFactory tf = new SolrNamedThreadFactory("TikaServerHttpClient");
+ ExecutorService exec = ExecutorUtil.newMDCAwareCachedThreadPool(tf);
+ HttpClient client = new HttpClient();
+ client.setExecutor(exec);
+ client.setScheduler(new
ScheduledExecutorScheduler("TikaServerHttpClient-scheduler", true));
+ try {
+ client.start();
+ } catch (Exception e) {
+ try {
+ exec.shutdownNow();
+ } catch (Throwable ignore) {
+ }
+ throw new SolrException(
+ SolrException.ErrorCode.SERVER_ERROR, "Failed to start shared
Jetty HttpClient", e);
+ }
+ SHARED_EXECUTOR = exec;
+ SHARED_CLIENT = client;
+ INITIALIZED = true;
+ SHUTDOWN = false;
+ }
+ }
+
+ private final Map<String, String> fieldMappings = new LinkedHashMap<>();
+
+ // TODO: Improve backward compatibility by adding more mappings
+ {
+ fieldMappings.put("dc:title", "title");
+ fieldMappings.put("dc:creator", "author");
+ fieldMappings.put("dc:description", "description");
+ fieldMappings.put("dc:subject", "subject");
+ fieldMappings.put("dc:language", "language");
+ fieldMappings.put("dc:publisher", "publisher");
+ fieldMappings.put("dcterms:created", "created");
+ fieldMappings.put("dcterms:modified", "modified");
+ fieldMappings.put("meta:author", "Author");
+ fieldMappings.put("meta:creation-date", "Creation-Date");
+ fieldMappings.put("meta:save-date", "Last-Save-Date");
+ fieldMappings.put("meta:keyword", "Keywords");
+ fieldMappings.put("pdf:docinfo:keywords", "Keywords");
+ }
+
+ /*
+ * Appends back-compatible metadata into the given {@code
ExtractionMetadata} instance by mapping
+ * source fields to target fields, provided that backward compatibility is
enabled. If a source
+ * field exists and the target field is not yet populated, the values from
the source field will
+ * be added to the target field.
+ */
+ private void appendBackCompatTikaMetadata(ExtractionMetadata md) {
+ for (Map.Entry<String, String> mapping : fieldMappings.entrySet()) {
+ String sourceField = mapping.getKey();
+ String targetField = mapping.getValue();
+ if (md.getFirst(sourceField) != null && md.getFirst(targetField) ==
null) {
+ md.add(targetField, md.get(sourceField));
Review Comment:
This line copies a List<String> directly instead of individual values. It
should iterate through the source values: `for (String value :
md.get(sourceField)) { md.add(targetField, value); }`
```suggestion
for (String value : md.get(sourceField)) {
md.add(targetField, value);
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]