This is an automated email from the ASF dual-hosted git repository.
krisden pushed a commit to branch branch_9_0
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/branch_9_0 by this push:
new 874cda5 SOLR-16027: Enable spotless on extraction module
874cda5 is described below
commit 874cda5a40350c70ac5f99488971434dcc98bd9f
Author: Kevin Risden <[email protected]>
AuthorDate: Sat Feb 19 10:29:58 2022 -0500
SOLR-16027: Enable spotless on extraction module
---
gradle/validation/spotless.gradle | 1 -
.../extraction/ExtractingDocumentLoader.java | 129 +-
.../extraction/ExtractingMetadataConstants.java | 6 +-
.../solr/handler/extraction/ExtractingParams.java | 108 +-
.../extraction/ExtractingRequestHandler.java | 13 +-
.../handler/extraction/ParseContextConfig.java | 54 +-
.../extraction/RegexRulesPasswordProvider.java | 69 +-
.../handler/extraction/SolrContentHandler.java | 102 +-
.../extraction/SolrContentHandlerFactory.java | 13 +-
.../handler/extraction/XLSXResponseWriter.java | 58 +-
.../solr/handler/extraction/package-info.java | 10 +-
.../extraction/ExtractingRequestHandlerTest.java | 1292 +++++++++++++-------
.../handler/extraction/ParseContextConfigTest.java | 10 +-
.../handler/extraction/TestXLSXResponseWriter.java | 302 +++--
14 files changed, 1305 insertions(+), 862 deletions(-)
diff --git a/gradle/validation/spotless.gradle b/gradle/validation/spotless.gradle
index 651f2ed..71b06cc 100644
--- a/gradle/validation/spotless.gradle
+++ b/gradle/validation/spotless.gradle
@@ -44,7 +44,6 @@ configure(project(":solr").subprojects) { prj ->
// Exclude certain files (generated ones, mostly).
switch (project.path) {
- case ":solr:modules:extraction":
case ":solr:modules:gcs-repository":
case ":solr:modules:hadoop-auth":
case ":solr:modules:hdfs":
diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
index 5be32e0..615986b 100644
--- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
+++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
@@ -21,7 +21,6 @@ import java.io.InputStream;
import java.io.StringWriter;
import java.lang.invoke.MethodHandles;
import java.util.Locale;
-
import org.apache.commons.io.IOUtils;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.SolrParams;
@@ -60,28 +59,17 @@ import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-
-/**
- * The class responsible for loading extracted content into Solr.
- *
- **/
+/** The class responsible for loading extracted content into Solr. */
public class ExtractingDocumentLoader extends ContentStreamLoader {
private static final Logger log =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
- /**
- * Extract Only supported format
- */
+ /** Extract Only supported format */
public static final String TEXT_FORMAT = "text";
- /**
- * Extract Only supported format. Default
- */
+ /** Extract Only supported format. Default */
public static final String XML_FORMAT = "xml";
- /**
- * XHTML XPath parser.
- */
- private static final XPathParser PARSER =
- new XPathParser("xhtml", XHTMLContentHandler.XHTML);
+ /** XHTML XPath parser. */
+ private static final XPathParser PARSER = new XPathParser("xhtml",
XHTMLContentHandler.XHTML);
final SolrCore core;
final SolrParams params;
@@ -95,9 +83,12 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
protected ParseContextConfig parseContextConfig;
protected SolrContentHandlerFactory factory;
- public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor
processor,
- TikaConfig config, ParseContextConfig
parseContextConfig,
- SolrContentHandlerFactory factory) {
+ public ExtractingDocumentLoader(
+ SolrQueryRequest req,
+ UpdateRequestProcessor processor,
+ TikaConfig config,
+ ParseContextConfig parseContextConfig,
+ SolrContentHandlerFactory factory) {
this.params = req.getParams();
this.core = req.getCore();
this.config = config;
@@ -108,20 +99,15 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
templateAdd.overwrite = params.getBool(UpdateParams.OVERWRITE, true);
templateAdd.commitWithin = params.getInt(UpdateParams.COMMIT_WITHIN, -1);
- //this is lightweight
+ // this is lightweight
autoDetectParser = new AutoDetectParser(config);
this.factory = factory;
-
+
ignoreTikaException =
params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);
}
-
- /**
- * this must be MT safe... may be called concurrently from multiple threads.
- *
- */
- void doAdd(SolrContentHandler handler, AddUpdateCommand template)
- throws IOException {
+ /** this must be MT safe... may be called concurrently from multiple
threads. */
+ void doAdd(SolrContentHandler handler, AddUpdateCommand template) throws
IOException {
template.solrDoc = handler.newDocument();
processor.processAdd(template);
}
@@ -132,12 +118,16 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
}
@Override
- public void load(SolrQueryRequest req, SolrQueryResponse rsp,
- ContentStream stream, UpdateRequestProcessor processor) throws Exception
{
+ public void load(
+ SolrQueryRequest req,
+ SolrQueryResponse rsp,
+ ContentStream stream,
+ UpdateRequestProcessor processor)
+ throws Exception {
Parser parser = null;
String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE,
null);
if (streamType != null) {
- //Cache? Parsers are lightweight to construct and thread-safe, so I'm
told
+ // Cache? Parsers are lightweight to construct and thread-safe, so I'm
told
MediaType mt =
MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
parser = new
DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
} else {
@@ -153,7 +143,7 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
}
// Provide stream's content type as hint for auto detection
- if(stream.getContentType() != null) {
+ if (stream.getContentType() != null) {
metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
}
@@ -166,13 +156,14 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE,
stream.getContentType());
// HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in
metadata
String charset =
ContentStreamBase.getCharsetFromContentType(stream.getContentType());
- if(charset != null){
+ if (charset != null) {
metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
}
String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY,
false);
- SolrContentHandler handler =
factory.createSolrContentHandler(metadata, params, req.getSchema());
+ SolrContentHandler handler =
+ factory.createSolrContentHandler(metadata, params,
req.getSchema());
ContentHandler parsingHandler = handler;
StringWriter writer = null;
@@ -188,56 +179,59 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
serializer = new XMLSerializer(writer, new OutputFormat("XML",
"UTF-8", true));
}
if (xpathExpr != null) {
- Matcher matcher =
- PARSER.parse(xpathExpr);
- serializer.startDocument();//The MatchingContentHandler does not
invoke startDocument. See
https://lists.apache.org/thread.html/5ec63e104e564a2363e45f74d5aced6520b7d32b4b625762ef56cb86%401226775505%40%3Cdev.tika.apache.org%3E
+ Matcher matcher = PARSER.parse(xpathExpr);
+ serializer
+ .startDocument(); // The MatchingContentHandler does not
invoke startDocument. See
+ //
https://lists.apache.org/thread.html/5ec63e104e564a2363e45f74d5aced6520b7d32b4b625762ef56cb86%401226775505%40%3Cdev.tika.apache.org%3E
parsingHandler = new MatchingContentHandler(serializer, matcher);
} else {
parsingHandler = serializer;
}
} else if (xpathExpr != null) {
- Matcher matcher =
- PARSER.parse(xpathExpr);
+ Matcher matcher = PARSER.parse(xpathExpr);
parsingHandler = new MatchingContentHandler(handler, matcher);
- } //else leave it as is
+ } // else leave it as is
- try{
- //potentially use a wrapper handler for parsing, but we still need
the SolrContentHandler for getting the document.
+ try {
+ // potentially use a wrapper handler for parsing, but we still need
the SolrContentHandler
+ // for getting the document.
ParseContext context = parseContextConfig.create();
-
context.set(Parser.class, parser);
context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
// Password handling
RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
- if(pwMapFile != null && pwMapFile.length() > 0) {
+ if (pwMapFile != null && pwMapFile.length() > 0) {
InputStream is =
req.getCore().getResourceLoader().openResource(pwMapFile);
- if(is != null) {
+ if (is != null) {
log.debug("Password file supplied: {}", pwMapFile);
epp.parse(is);
}
}
context.set(PasswordProvider.class, epp);
String resourcePassword =
params.get(ExtractingParams.RESOURCE_PASSWORD);
- if(resourcePassword != null) {
+ if (resourcePassword != null) {
epp.setExplicitPassword(resourcePassword);
log.debug("Literal password supplied for file {}", resourceName);
}
parser.parse(inputStream, parsingHandler, metadata, context);
} catch (TikaException e) {
- if(ignoreTikaException)
- log.warn(new StringBuilder("skip extracting text due to
").append(e.getLocalizedMessage())
- .append(".
metadata=").append(metadata.toString()).toString()); // nowarn
- else
- throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+ if (ignoreTikaException)
+ log.warn(
+ new StringBuilder("skip extracting text due to ")
+ .append(e.getLocalizedMessage())
+ .append(". metadata=")
+ .append(metadata.toString())
+ .toString()); // nowarn
+ else throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
e);
}
if (extractOnly == false) {
addDoc(handler);
} else {
- //serializer is not null, so we need to call endDoc on it if using
xpath
- if (xpathExpr != null){
+ // serializer is not null, so we need to call endDoc on it if using
xpath
+ if (xpathExpr != null) {
serializer.endDocument();
}
rsp.add(stream.getName(), writer.toString());
@@ -256,20 +250,26 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
IOUtils.closeQuietly(inputStream);
}
} else {
- throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Stream
type of " + streamType + " didn't match any known parsers. Please supply the "
+ ExtractingParams.STREAM_TYPE + " parameter.");
+ throw new SolrException(
+ SolrException.ErrorCode.BAD_REQUEST,
+ "Stream type of "
+ + streamType
+ + " didn't match any known parsers. Please supply the "
+ + ExtractingParams.STREAM_TYPE
+ + " parameter.");
}
}
public static class MostlyPassthroughHtmlMapper implements HtmlMapper {
public static final HtmlMapper INSTANCE = new
MostlyPassthroughHtmlMapper();
- /**
+ /**
* Keep all elements and their content.
- *
- * Apparently <SCRIPT> and <STYLE> elements are blocked
elsewhere
+ *
+ * <p>Apparently <SCRIPT> and <STYLE> elements are blocked
elsewhere
*/
@Override
- public boolean isDiscardElement(String name) {
+ public boolean isDiscardElement(String name) {
return false;
}
@@ -280,15 +280,14 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
}
/**
- * Lowercases the element name, but returns null for <BR>,
- * which suppresses the start-element event for lt;BR> tags.
- * This also suppresses the <BODY> tags because those
- * are handled internally by Tika's XHTMLContentHandler.
+ * Lowercases the element name, but returns null for <BR>, which
suppresses the
+ * start-element event for lt;BR> tags. This also suppresses the
<BODY> tags because
+ * those are handled internally by Tika's XHTMLContentHandler.
*/
@Override
public String mapSafeElement(String name) {
String lowerName = name.toLowerCase(Locale.ROOT);
return (lowerName.equals("br") || lowerName.equals("body")) ? null :
lowerName;
}
- }
- }
\ No newline at end of file
+ }
+}
diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingMetadataConstants.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingMetadataConstants.java
index 71aced1..0a72edc 100644
--- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingMetadataConstants.java
+++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingMetadataConstants.java
@@ -16,11 +16,7 @@
*/
package org.apache.solr.handler.extraction;
-
-/**
- * Constants used internally by the {@link ExtractingRequestHandler}.
- *
- **/
+/** Constants used internally by the {@link ExtractingRequestHandler}. */
public interface ExtractingMetadataConstants {
String STREAM_NAME = "stream_name";
String STREAM_SOURCE_INFO = "stream_source_info";
diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java
index f7917bb..a7d1596 100644
--- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java
+++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java
@@ -16,86 +16,75 @@
*/
package org.apache.solr.handler.extraction;
-
-/**
- * The various Solr Parameters names to use when extracting content.
- *
- **/
+/** The various Solr Parameters names to use when extracting content. */
public interface ExtractingParams {
- /**
- * Map all generated attribute names to field names with lowercase and
underscores.
- */
+ /** Map all generated attribute names to field names with lowercase and
underscores. */
public static final String LOWERNAMES = "lowernames";
- /**
- * if true, ignore TikaException (give up to extract text but index meta
data)
- */
+ /** if true, ignore TikaException (give up to extract text but index meta
data) */
public static final String IGNORE_TIKA_EXCEPTION = "ignoreTikaException";
-
/**
* The param prefix for mapping Tika metadata to Solr fields.
- * <p>
- * To map a field, add a name like:
- * <pre>fmap.title=solr.title</pre>
*
- * In this example, the tika "title" metadata value will be added to a Solr
field named "solr.title"
+ * <p>To map a field, add a name like:
*
+ * <pre>fmap.title=solr.title</pre>
*
+ * In this example, the tika "title" metadata value will be added to a Solr
field named
+ * "solr.title"
*/
public static final String MAP_PREFIX = "fmap.";
/**
* Pass in literal values to be added to the document, as in
+ *
* <pre>
- * literal.myField=Foo
+ * literal.myField=Foo
* </pre>
- *
*/
public static final String LITERALS_PREFIX = "literal.";
-
/**
- * Restrict the extracted parts of a document to be indexed
- * by passing in an XPath expression. All content that satisfies the XPath
expr.
- * will be passed to the {@link SolrContentHandler}.
- * <p>
- * See Tika's docs for what the extracted document looks like.
+ * Restrict the extracted parts of a document to be indexed by passing in an
XPath expression. All
+ * content that satisfies the XPath expr. will be passed to the {@link
SolrContentHandler}.
+ *
+ * <p>See Tika's docs for what the extracted document looks like.
+ *
* @see #CAPTURE_ELEMENTS
*/
public static final String XPATH_EXPRESSION = "xpath";
-
- /**
- * Only extract and return the content, do not index it.
- */
+ /** Only extract and return the content, do not index it. */
public static final String EXTRACT_ONLY = "extractOnly";
- /**
- * Content output format if extractOnly is true. Default is "xml",
alternative is "text".
- */
+ /** Content output format if extractOnly is true. Default is "xml",
alternative is "text". */
public static final String EXTRACT_FORMAT = "extractFormat";
/**
- * Capture attributes separately according to the name of the element,
instead of just adding them to the string buffer
+ * Capture attributes separately according to the name of the element,
instead of just adding them
+ * to the string buffer
*/
public static final String CAPTURE_ATTRIBUTES = "captureAttr";
/**
- * Literal field values will by default override other values such as
metadata and content. Set this to false to revert to pre-4.0 behaviour
+ * Literal field values will by default override other values such as
metadata and content. Set
+ * this to false to revert to pre-4.0 behaviour
*/
public static final String LITERALS_OVERRIDE = "literalsOverride";
/**
- * Capture the specified fields (and everything included below it that isn't
capture by some other capture field) separately from the default. This is
different
- * then the case of passing in an XPath expression.
- * <p>
- * The Capture field is based on the localName returned to the {@link
SolrContentHandler}
- * by Tika, not to be confused by the mapped field. The field name can then
- * be mapped into the index schema.
- * <p>
- * For instance, a Tika document may look like:
+ * Capture the specified fields (and everything included below it that isn't
capture by some other
+ * capture field) separately from the default. This is different then the
case of passing in an
+ * XPath expression.
+ *
+ * <p>The Capture field is based on the localName returned to the {@link
SolrContentHandler} by
+ * Tika, not to be confused by the mapped field. The field name can then be
mapped into the index
+ * schema.
+ *
+ * <p>For instance, a Tika document may look like:
+ *
* <pre>
* <html>
* ...
@@ -104,48 +93,47 @@ public interface ExtractingParams {
* Some more text
* </body>
* </pre>
- * By passing in the p tag, you could capture all P tags separately from the
rest of the t
- * Thus, in the example, the capture of the P tag would be: "some text here.
more text"
*
+ * By passing in the p tag, you could capture all P tags separately from the
rest of the t Thus,
+ * in the example, the capture of the P tag would be: "some text here. more
text"
*/
public static final String CAPTURE_ELEMENTS = "capture";
- /**
- * The type of the stream. If not specified, Tika will use mime type
detection.
- */
+ /** The type of the stream. If not specified, Tika will use mime type
detection. */
public static final String STREAM_TYPE = "stream.type";
-
/**
- * Optional. The file name. If specified, Tika can take this into account
while
- * guessing the MIME type.
+ * Optional. The file name. If specified, Tika can take this into account
while guessing the MIME
+ * type.
*/
public static final String RESOURCE_NAME = "resource.name";
/**
- * Optional. The password for this resource. Will be used instead of the
rule based password lookup mechanisms
+ * Optional. The password for this resource. Will be used instead of the
rule based password
+ * lookup mechanisms
*/
public static final String RESOURCE_PASSWORD = "resource.password";
/**
- * Optional. If specified, the prefix will be prepended to all Metadata,
such that it would be possible
- * to setup a dynamic field to automatically capture it
+ * Optional. If specified, the prefix will be prepended to all Metadata,
such that it would be
+ * possible to setup a dynamic field to automatically capture it
*/
public static final String UNKNOWN_FIELD_PREFIX = "uprefix";
/**
- * Optional. If specified and the name of a potential field cannot be
determined, the default Field specified
- * will be used instead.
+ * Optional. If specified and the name of a potential field cannot be
determined, the default
+ * Field specified will be used instead.
*/
public static final String DEFAULT_FIELD = "defaultField";
/**
- * Optional. If specified, loads the file as a source for password lookups
for Tika encrypted documents.
- * <p>
- * File format is Java properties format with one key=value per line.
- * The key is evaluated as a regex against the file name, and the value is
the password
- * The rules are evaluated top-bottom, i.e. the first match will be used
- * If you want a fallback password to be always used, supply a
.*=<defaultmypassword> at the end
+ * Optional. If specified, loads the file as a source for password lookups
for Tika encrypted
+ * documents.
+ *
+ * <p>File format is Java properties format with one key=value per line. The
key is evaluated as a
+ * regex against the file name, and the value is the password The rules are
evaluated top-bottom,
+ * i.e. the first match will be used If you want a fallback password to be
always used, supply a
+ * .*=<defaultmypassword> at the end
*/
public static final String PASSWORD_MAP_FILE = "passwordsFile";
}
diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java
index 340ca8f..99222c0 100644
--- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java
+++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java
@@ -18,7 +18,6 @@ package org.apache.solr.handler.extraction;
import java.io.File;
import java.io.InputStream;
-
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.core.SolrCore;
@@ -32,10 +31,11 @@ import org.apache.solr.util.plugin.SolrCoreAware;
import org.apache.tika.config.TikaConfig;
/**
- * Handler for rich documents like PDF or Word or any other file format that
Tika handles that need the text to be extracted
- * first from the document.
+ * Handler for rich documents like PDF or Word or any other file format that
Tika handles that need
+ * the text to be extracted first from the document.
*/
-public class ExtractingRequestHandler extends ContentStreamHandlerBase
implements SolrCoreAware , PermissionNameProvider {
+public class ExtractingRequestHandler extends ContentStreamHandlerBase
+ implements SolrCoreAware, PermissionNameProvider {
public static final String PARSE_CONTEXT_CONFIG = "parseContext.config";
public static final String CONFIG_LOCATION = "tika.config";
@@ -74,7 +74,8 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
if (parseContextConfigLoc == null) { // default:
parseContextConfig = new ParseContextConfig();
} else {
- parseContextConfig = new ParseContextConfig(core.getResourceLoader(),
parseContextConfigLoc);
+ parseContextConfig =
+ new ParseContextConfig(core.getResourceLoader(),
parseContextConfigLoc);
}
} catch (Exception e) {
throw new SolrException(ErrorCode.SERVER_ERROR, "Unable to load Tika
Config", e);
@@ -97,4 +98,4 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
public String getDescription() {
return "Add/Update Rich document";
}
-}
\ No newline at end of file
+}
diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java
index 4fdba823..327fe3e 100644
--- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java
+++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java
@@ -26,7 +26,6 @@ import java.lang.reflect.Method;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
-
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.util.SafeXMLParsing;
import org.apache.tika.parser.ParseContext;
@@ -40,24 +39,29 @@ import org.w3c.dom.NodeList;
public class ParseContextConfig {
private static final Logger log =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
-
+
private final Map<Class<?>, Object> entries = new HashMap<>();
/** Creates an empty Config without any settings (used as placeholder). */
- public ParseContextConfig() {
- }
+ public ParseContextConfig() {}
/** Creates a {@code ParseContextConfig} from the given XML DOM element. */
public ParseContextConfig(SolrResourceLoader resourceLoader, Element
element) throws Exception {
extract(element, resourceLoader);
}
- /** Creates a {@code ParseContextConfig} from the given XML file, loaded
from the given {@link SolrResourceLoader}. */
- public ParseContextConfig(SolrResourceLoader resourceLoader, String
parseContextConfigLoc) throws Exception {
- this(resourceLoader, loadConfigFile(resourceLoader,
parseContextConfigLoc).getDocumentElement());
+ /**
+ * Creates a {@code ParseContextConfig} from the given XML file, loaded from
the given {@link
+ * SolrResourceLoader}.
+ */
+ public ParseContextConfig(SolrResourceLoader resourceLoader, String
parseContextConfigLoc)
+ throws Exception {
+ this(
+ resourceLoader, loadConfigFile(resourceLoader,
parseContextConfigLoc).getDocumentElement());
}
-
- private static Document loadConfigFile(SolrResourceLoader resourceLoader,
String parseContextConfigLoc) throws Exception {
+
+ private static Document loadConfigFile(
+ SolrResourceLoader resourceLoader, String parseContextConfigLoc) throws
Exception {
return SafeXMLParsing.parseConfigXML(log, resourceLoader,
parseContextConfigLoc);
}
@@ -68,11 +72,13 @@ public class ParseContextConfig {
final String className =
xmlEntryAttributes.getNamedItem("class").getNodeValue();
final String implementationName =
xmlEntryAttributes.getNamedItem("impl").getNodeValue();
- final NodeList xmlProperties =
((Element)xmlEntries.item(i)).getElementsByTagName("property");
+ final NodeList xmlProperties =
+ ((Element) xmlEntries.item(i)).getElementsByTagName("property");
final Class<?> interfaceClass = loader.findClass(className,
Object.class);
- final BeanInfo beanInfo = Introspector.getBeanInfo(interfaceClass,
Introspector.IGNORE_ALL_BEANINFO);
-
+ final BeanInfo beanInfo =
+ Introspector.getBeanInfo(interfaceClass,
Introspector.IGNORE_ALL_BEANINFO);
+
final HashMap<String, PropertyDescriptor> descriptorMap = new
HashMap<>();
for (final PropertyDescriptor pd : beanInfo.getPropertyDescriptors()) {
descriptorMap.put(pd.getName(), pd);
@@ -80,7 +86,8 @@ public class ParseContextConfig {
final Object instance = loader.newInstance(implementationName,
Object.class);
if (!interfaceClass.isInstance(instance)) {
- throw new IllegalArgumentException("Implementation class does not
extend " + interfaceClass.getName());
+ throw new IllegalArgumentException(
+ "Implementation class does not extend " +
interfaceClass.getName());
}
for (int j = 0, c2 = xmlProperties.getLength(); j < c2; j++) {
@@ -92,15 +99,24 @@ public class ParseContextConfig {
final PropertyDescriptor propertyDescriptor =
descriptorMap.get(propertyName);
if (propertyDescriptor == null) {
- throw new IllegalArgumentException(String.format(Locale.ENGLISH,
"Unknown bean property %s in class %s",
- propertyName, interfaceClass.getName()));
+ throw new IllegalArgumentException(
+ String.format(
+ Locale.ENGLISH,
+ "Unknown bean property %s in class %s",
+ propertyName,
+ interfaceClass.getName()));
}
final Method method = propertyDescriptor.getWriteMethod();
if (method == null) {
- throw new IllegalArgumentException(String.format(Locale.ENGLISH,
"Cannot set bean property %s in class %s (no write method available)",
- propertyName, interfaceClass.getName()));
+ throw new IllegalArgumentException(
+ String.format(
+ Locale.ENGLISH,
+ "Cannot set bean property %s in class %s (no write method
available)",
+ propertyName,
+ interfaceClass.getName()));
}
- method.invoke(instance,
getValueFromString(propertyDescriptor.getPropertyType(), propertyValue));
+ method.invoke(
+ instance, getValueFromString(propertyDescriptor.getPropertyType(),
propertyValue));
}
entries.put(interfaceClass, instance);
@@ -120,7 +136,7 @@ public class ParseContextConfig {
public ParseContext create() {
final ParseContext result = new ParseContext();
- for (Map.Entry<Class<?>, Object> entry : entries.entrySet()){
+ for (Map.Entry<Class<?>, Object> entry : entries.entrySet()) {
result.set((Class) entry.getKey(), entry.getValue());
}
diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java
index 5ef5d3f..b1f1bc5 100644
--- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java
+++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java
@@ -25,7 +25,6 @@ import java.util.LinkedHashMap;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
-
import org.apache.lucene.util.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
@@ -34,69 +33,67 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
- * Password provider for Extracting request handler which finds correct
- * password based on file name matching against a list of regular expressions.
- * The list of passwords is supplied in an optional Map.
- * If an explicit password is set, it will be used.
+ * Password provider for Extracting request handler which finds correct
password based on file name
+ * matching against a list of regular expressions. The list of passwords is
supplied in an optional
+ * Map. If an explicit password is set, it will be used.
*/
public class RegexRulesPasswordProvider implements PasswordProvider {
private static final Logger log =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
-
- private LinkedHashMap<Pattern,String> passwordMap = new LinkedHashMap<>();
- private String explicitPassword;
-
+
+ private LinkedHashMap<Pattern, String> passwordMap = new LinkedHashMap<>();
+ private String explicitPassword;
+
@Override
public String getPassword(Metadata meta) {
- if(getExplicitPassword() != null) {
+ if (getExplicitPassword() != null) {
return getExplicitPassword();
}
-
- if(passwordMap.size() > 0)
+
+ if (passwordMap.size() > 0)
return
lookupPasswordFromMap(meta.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
-
+
return null;
}
private String lookupPasswordFromMap(String fileName) {
- if(fileName != null && fileName.length() > 0) {
- for(Entry<Pattern,String> e : passwordMap.entrySet()) {
- if(e.getKey().matcher(fileName).matches()) {
+ if (fileName != null && fileName.length() > 0) {
+ for (Entry<Pattern, String> e : passwordMap.entrySet()) {
+ if (e.getKey().matcher(fileName).matches()) {
return e.getValue();
}
}
}
return null;
}
-
+
/**
* Parses rule file from stream and returns a Map of all rules found
+ *
* @param is input stream for the file
*/
- public static LinkedHashMap<Pattern,String> parseRulesFile(InputStream is) {
- LinkedHashMap<Pattern,String> rules = new LinkedHashMap<>();
+ public static LinkedHashMap<Pattern, String> parseRulesFile(InputStream is) {
+ LinkedHashMap<Pattern, String> rules = new LinkedHashMap<>();
BufferedReader br = new BufferedReader(IOUtils.getDecodingReader(is,
StandardCharsets.UTF_8));
String line;
try {
int linenum = 0;
- while ((line = br.readLine()) != null) {
+ while ((line = br.readLine()) != null) {
linenum++;
// Remove comments
String[] arr = line.split("#");
- if(arr.length > 0)
- line = arr[0].trim();
- if(line.length() == 0)
- continue;
+ if (arr.length > 0) line = arr[0].trim();
+ if (line.length() == 0) continue;
int sep = line.indexOf("=");
- if(sep <= 0) {
+ if (sep <= 0) {
log.warn("Wrong format of password line {}", linenum);
continue;
}
- String pass = line.substring(sep+1).trim();
+ String pass = line.substring(sep + 1).trim();
String regex = line.substring(0, sep).trim();
try {
Pattern pattern = Pattern.compile(regex);
- rules.put(pattern, pass);
- } catch(PatternSyntaxException pse) {
+ rules.put(pattern, pass);
+ } catch (PatternSyntaxException pse) {
log.warn("Key of line {} was not a valid regex pattern{}", linenum,
pse);
continue;
}
@@ -111,22 +108,24 @@ public class RegexRulesPasswordProvider implements
PasswordProvider {
/**
* Initialize rules through file input stream. This is a convenience for
first calling
* setPasswordMap(parseRulesFile(is)).
+ *
* @param is the input stream with rules file, one line per rule on format
regex=password
*/
public void parse(InputStream is) {
setPasswordMap(parseRulesFile(is));
}
-
- public LinkedHashMap<Pattern,String> getPasswordMap() {
+
+ public LinkedHashMap<Pattern, String> getPasswordMap() {
return passwordMap;
}
- public void setPasswordMap(LinkedHashMap<Pattern,String> linkedHashMap) {
+ public void setPasswordMap(LinkedHashMap<Pattern, String> linkedHashMap) {
this.passwordMap = linkedHashMap;
}
/**
* Gets the explicit password, if set
+ *
* @return the password, or null if not set
*/
public String getExplicitPassword() {
@@ -135,17 +134,15 @@ public class RegexRulesPasswordProvider implements
PasswordProvider {
/**
* Sets an explicit password which will be used instead of password map
+ *
* @param explicitPassword the password to use
*/
public void setExplicitPassword(String explicitPassword) {
this.explicitPassword = explicitPassword;
}
-
- /**
- * Resets explicit password, so that map will be used for lookups
- */
+
+ /** Resets explicit password, so that map will be used for lookups */
public void resetExplicitPassword() {
this.explicitPassword = null;
}
-
}
diff --git
a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java
b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java
index 8d871a4..43d0de9 100644
---
a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java
+++
b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java
@@ -25,7 +25,6 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
-
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.schema.IndexSchema;
@@ -38,14 +37,13 @@ import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-
/**
- * The class responsible for handling Tika events and translating them into
{@link org.apache.solr.common.SolrInputDocument}s.
- * <B>This class is not thread-safe.</B>
- * <p>
- * This class cannot be reused, you have to create a new instance per document!
- * <p>
- * User's may wish to override this class to provide their own functionality.
+ * The class responsible for handling Tika events and translating them into
{@link
+ * org.apache.solr.common.SolrInputDocument}s. <B>This class is not
thread-safe.</B>
+ *
+ * <p>This class cannot be reused, you have to create a new instance per
document!
+ *
+ * <p>User's may wish to override this class to provide their own
functionality.
*
* @see org.apache.solr.handler.extraction.SolrContentHandlerFactory
* @see org.apache.solr.handler.extraction.ExtractingRequestHandler
@@ -67,14 +65,13 @@ public class SolrContentHandler extends DefaultHandler
implements ExtractingPara
protected final boolean captureAttribs;
protected final boolean lowerNames;
-
+
protected final String unknownFieldPrefix;
protected final String defaultField;
private final boolean literalsOverride;
-
- private Set<String> literalFieldNames = null;
+ private Set<String> literalFieldNames = null;
public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema
schema) {
this.document = new SolrInputDocument();
@@ -87,7 +84,7 @@ public class SolrContentHandler extends DefaultHandler
implements ExtractingPara
this.literalsOverride = params.getBool(LITERALS_OVERRIDE, true);
this.unknownFieldPrefix = params.get(UNKNOWN_FIELD_PREFIX, "");
this.defaultField = params.get(DEFAULT_FIELD, "");
-
+
String[] captureFields = params.getParams(CAPTURE_ELEMENTS);
if (captureFields != null && captureFields.length > 0) {
fieldBuilders = new HashMap<>();
@@ -100,30 +97,29 @@ public class SolrContentHandler extends DefaultHandler
implements ExtractingPara
bldrStack.add(catchAllBuilder);
}
-
/**
- * This is called by a consumer when it is ready to deal with a new
SolrInputDocument. Overriding
- * classes can use this hook to add in or change whatever they deem fit for
the document at that time.
- * The base implementation adds the metadata as fields, allowing for
potential remapping.
+ * This is called by a consumer when it is ready to deal with a new
SolrInputDocument. Overriding
+ * classes can use this hook to add in or change whatever they deem fit for
the document at that
+ * time. The base implementation adds the metadata as fields, allowing for
potential remapping.
*
* @return The {@link org.apache.solr.common.SolrInputDocument}.
- *
* @see #addMetadata()
* @see #addCapturedContent()
* @see #addContent()
* @see #addLiterals()
*/
public SolrInputDocument newDocument() {
- //handle the literals from the params. NOTE: This MUST be called before
the others in order for literals to override other values
+ // handle the literals from the params. NOTE: This MUST be called before
the others in order for
+ // literals to override other values
addLiterals();
- //handle the metadata extracted from the document
+ // handle the metadata extracted from the document
addMetadata();
- //add in the content
+ // add in the content
addContent();
- //add in the captured content
+ // add in the captured content
addCapturedContent();
if (log.isDebugEnabled()) {
@@ -133,26 +129,25 @@ public class SolrContentHandler extends DefaultHandler
implements ExtractingPara
}
/**
- * Add the per field captured content to the Solr Document. Default
implementation uses the
- * {@link #fieldBuilders} info
+ * Add the per field captured content to the Solr Document. Default
implementation uses the {@link
+ * #fieldBuilders} info
*/
protected void addCapturedContent() {
for (Map.Entry<String, StringBuilder> entry : fieldBuilders.entrySet()) {
if (entry.getValue().length() > 0) {
String fieldName = entry.getKey();
- if (literalsOverride && literalFieldNames.contains(fieldName))
- continue;
- addField(fieldName, entry.getValue().toString(), null); }
+ if (literalsOverride && literalFieldNames.contains(fieldName))
continue;
+ addField(fieldName, entry.getValue().toString(), null);
+ }
}
}
/**
- * Add in the catch all content to the field. Default impl. uses the {@link
#contentFieldName}
- * and the {@link #catchAllBuilder}
+ * Add in the catch all content to the field. Default impl. uses the {@link
#contentFieldName} and
+ * the {@link #catchAllBuilder}
*/
protected void addContent() {
- if (literalsOverride && literalFieldNames.contains(contentFieldName))
- return;
+ if (literalsOverride && literalFieldNames.contains(contentFieldName))
return;
addField(contentFieldName, catchAllBuilder.toString(), null);
}
@@ -172,13 +167,10 @@ public class SolrContentHandler extends DefaultHandler
implements ExtractingPara
}
}
- /**
- * Add in any metadata using {@link #metadata} as the source.
- */
+ /** Add in any metadata using {@link #metadata} as the source. */
protected void addMetadata() {
for (String name : metadata.names()) {
- if (literalsOverride && literalFieldNames.contains(name))
- continue;
+ if (literalsOverride && literalFieldNames.contains(name)) continue;
String[] vals = metadata.getValues(name);
addField(name, null, vals);
}
@@ -191,21 +183,24 @@ public class SolrContentHandler extends DefaultHandler
implements ExtractingPara
protected void addField(String fname, String fval, String[] vals) {
if (lowerNames) {
StringBuilder sb = new StringBuilder();
- for (int i=0; i<fname.length(); i++) {
+ for (int i = 0; i < fname.length(); i++) {
char ch = fname.charAt(i);
- if (!Character.isLetterOrDigit(ch)) ch='_';
- else ch=Character.toLowerCase(ch);
+ if (!Character.isLetterOrDigit(ch)) ch = '_';
+ else ch = Character.toLowerCase(ch);
sb.append(ch);
}
fname = sb.toString();
- }
+ }
String name = findMappedName(fname);
SchemaField sf = schema.getFieldOrNull(name);
- if (sf==null && unknownFieldPrefix.length() > 0) {
+ if (sf == null && unknownFieldPrefix.length() > 0) {
name = unknownFieldPrefix + name;
sf = schema.getFieldOrNull(name);
- } else if (sf == null && defaultField.length() > 0 &&
name.equals(TikaMetadataKeys.RESOURCE_NAME_KEY) == false /*let the fall through
below handle this*/){
+ } else if (sf == null
+ && defaultField.length() > 0
+ && name.equals(TikaMetadataKeys.RESOURCE_NAME_KEY)
+ == false /*let the fall through below handle this*/) {
name = defaultField;
sf = schema.getFieldOrNull(name);
}
@@ -215,12 +210,14 @@ public class SolrContentHandler extends DefaultHandler
implements ExtractingPara
// ExtractingDocumentLoader.load(). You shouldn't have to define a mapping
for this
// field just because you specified a resource.name parameter to the
handler, should
// you?
- if (sf == null && unknownFieldPrefix.length()==0 && name ==
TikaMetadataKeys.RESOURCE_NAME_KEY) {
+ if (sf == null
+ && unknownFieldPrefix.length() == 0
+ && name == TikaMetadataKeys.RESOURCE_NAME_KEY) {
return;
}
// normalize val params so vals.length>1
- if (vals != null && vals.length==1) {
+ if (vals != null && vals.length == 1) {
fval = vals[0];
vals = null;
}
@@ -228,17 +225,17 @@ public class SolrContentHandler extends DefaultHandler
implements ExtractingPara
// single valued field with multiple values... catenate them.
if (sf != null && !sf.multiValued() && vals != null) {
StringBuilder builder = new StringBuilder();
- boolean first=true;
+ boolean first = true;
for (String val : vals) {
if (first) {
- first=false;
+ first = false;
} else {
builder.append(' ');
}
builder.append(val);
}
fval = builder.toString();
- vals=null;
+ vals = null;
}
if (fval != null) {
@@ -256,10 +253,11 @@ public class SolrContentHandler extends DefaultHandler
implements ExtractingPara
}
@Override
- public void startElement(String uri, String localName, String qName,
Attributes attributes) throws SAXException {
+ public void startElement(String uri, String localName, String qName,
Attributes attributes)
+ throws SAXException {
StringBuilder theBldr = fieldBuilders.get(localName);
if (theBldr != null) {
- //we need to switch the currentBuilder
+ // we need to switch the currentBuilder
bldrStack.add(theBldr);
}
if (captureAttribs == true) {
@@ -278,22 +276,19 @@ public class SolrContentHandler extends DefaultHandler
implements ExtractingPara
public void endElement(String uri, String localName, String qName) throws
SAXException {
StringBuilder theBldr = fieldBuilders.get(localName);
if (theBldr != null) {
- //pop the stack
+ // pop the stack
bldrStack.removeLast();
assert (bldrStack.size() >= 1);
}
bldrStack.getLast().append(' ');
}
-
@Override
public void characters(char[] chars, int offset, int length) throws
SAXException {
bldrStack.getLast().append(chars, offset, length);
}
- /**
- * Treat the same as any other characters
- */
+ /** Treat the same as any other characters */
@Override
public void ignorableWhitespace(char[] chars, int offset, int length) throws
SAXException {
characters(chars, offset, length);
@@ -308,5 +303,4 @@ public class SolrContentHandler extends DefaultHandler
implements ExtractingPara
protected String findMappedName(String name) {
return params.get(MAP_PREFIX + name, name);
}
-
}
diff --git
a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java
b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java
index f95125e..1070e74 100644
---
a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java
+++
b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java
@@ -16,20 +16,17 @@
*/
package org.apache.solr.handler.extraction;
-import org.apache.tika.metadata.Metadata;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.schema.IndexSchema;
+import org.apache.tika.metadata.Metadata;
-
-/**
- *
- *
- **/
+/** */
public class SolrContentHandlerFactory {
- public SolrContentHandlerFactory() { }
+ public SolrContentHandlerFactory() {}
- public SolrContentHandler createSolrContentHandler(Metadata metadata,
SolrParams params, IndexSchema schema) {
+ public SolrContentHandler createSolrContentHandler(
+ Metadata metadata, SolrParams params, IndexSchema schema) {
return new SolrContentHandler(metadata, params, schema);
}
}
diff --git
a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XLSXResponseWriter.java
b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XLSXResponseWriter.java
index d1e5dda..21dee63 100644
---
a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XLSXResponseWriter.java
+++
b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XLSXResponseWriter.java
@@ -29,7 +29,6 @@ import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
-
import org.apache.lucene.index.IndexableField;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.FillPatternType;
@@ -52,13 +51,14 @@ import org.apache.solr.search.ReturnFields;
public class XLSXResponseWriter extends RawResponseWriter {
@Override
- public void write(OutputStream out, SolrQueryRequest req, SolrQueryResponse
rsp) throws IOException {
+ public void write(OutputStream out, SolrQueryRequest req, SolrQueryResponse
rsp)
+ throws IOException {
// throw away arraywriter just to satisfy super requirements; we're
grabbing
// all writes before they go to it anyway
XLSXWriter w = new XLSXWriter(new CharArrayWriter(), req, rsp);
- LinkedHashMap<String,String> reqNamesMap = new LinkedHashMap<>();
- LinkedHashMap<String,Integer> reqWidthsMap = new LinkedHashMap<>();
+ LinkedHashMap<String, String> reqNamesMap = new LinkedHashMap<>();
+ LinkedHashMap<String, Integer> reqWidthsMap = new LinkedHashMap<>();
Iterator<String> paramNamesIter =
req.getParams().getParameterNamesIterator();
while (paramNamesIter.hasNext()) {
@@ -102,12 +102,12 @@ class XLSXWriter extends TabularResponseWriter {
this.rowIndex = 0;
- this.headerStyle = (XSSFCellStyle)swb.createCellStyle();
+ this.headerStyle = (XSSFCellStyle) swb.createCellStyle();
this.headerStyle.setFillBackgroundColor(IndexedColors.BLACK.getIndex());
- //solid fill
+ // solid fill
this.headerStyle.setFillPattern(FillPatternType.SOLID_FOREGROUND);
Font headerFont = swb.createFont();
- headerFont.setFontHeightInPoints((short)14);
+ headerFont.setFontHeightInPoints((short) 14);
headerFont.setBold(true);
headerFont.setColor(IndexedColors.WHITE.getIndex());
this.headerStyle.setFont(headerFont);
@@ -119,18 +119,18 @@ class XLSXWriter extends TabularResponseWriter {
}
void setHeaderRow() {
- curRow.setHeightInPoints((short)21);
+ curRow.setHeightInPoints((short) 21);
}
- //sets last created cell to have header style
+ // sets last created cell to have header style
void setHeaderCell() {
curRow.getCell(cellIndex - 1).setCellStyle(this.headerStyle);
}
- //set the width of the most recently created column
+ // set the width of the most recently created column
void setColWidth(int charWidth) {
- //width in poi is units of 1/256th of a character width for some reason
- this.sh.setColumnWidth(cellIndex - 1, 256*charWidth);
+ // width in poi is units of 1/256th of a character width for some reason
+ this.sh.setColumnWidth(cellIndex - 1, 256 * charWidth);
}
void writeCell(String value) {
@@ -145,7 +145,7 @@ class XLSXWriter extends TabularResponseWriter {
StringWriter sw = new StringWriter();
e.printStackTrace(new PrintWriter(sw));
String stacktrace = sw.toString();
- }finally {
+ } finally {
swb.dispose();
}
}
@@ -158,14 +158,17 @@ class XLSXWriter extends TabularResponseWriter {
SchemaField sf;
}
- private Map<String,XLField> xlFields = new LinkedHashMap<String,XLField>();
+ private Map<String, XLField> xlFields = new LinkedHashMap<String, XLField>();
public XLSXWriter(Writer writer, SolrQueryRequest req, SolrQueryResponse
rsp) {
super(writer, req, rsp);
}
- public void writeResponse(OutputStream out, LinkedHashMap<String, String>
colNamesMap,
- LinkedHashMap<String, Integer> colWidthsMap)
throws IOException {
+ public void writeResponse(
+ OutputStream out,
+ LinkedHashMap<String, String> colNamesMap,
+ LinkedHashMap<String, Integer> colWidthsMap)
+ throws IOException {
Collection<String> fields = getFields();
for (String field : fields) {
@@ -194,10 +197,8 @@ class XLSXWriter extends TabularResponseWriter {
xlFields.put(field, xlField);
}
-
-
wb.addRow();
- //write header
+ // write header
for (XLField xlField : xlFields.values()) {
String printName = xlField.name;
int colWidth = 14;
@@ -230,11 +231,12 @@ class XLSXWriter extends TabularResponseWriter {
super.close();
}
- //NOTE: a document cannot currently contain another document
+ // NOTE: a document cannot currently contain another document
List<Object> tmpList;
@Override
- public void writeSolrDocument(String name, SolrDocument doc, ReturnFields
returnFields, int idx ) throws IOException {
+ public void writeSolrDocument(String name, SolrDocument doc, ReturnFields
returnFields, int idx)
+ throws IOException {
if (tmpList == null) {
tmpList = new ArrayList<>(1);
tmpList.add(null);
@@ -242,7 +244,7 @@ class XLSXWriter extends TabularResponseWriter {
for (XLField xlField : xlFields.values()) {
Object val = doc.getFieldValue(xlField.name);
- int nVals = val instanceof Collection ? ((Collection)val).size() :
(val==null ? 0 : 1);
+ int nVals = val instanceof Collection ? ((Collection) val).size() : (val
== null ? 0 : 1);
if (nVals == 0) {
writeNull(xlField.name);
continue;
@@ -252,7 +254,7 @@ class XLSXWriter extends TabularResponseWriter {
Collection<?> values;
// normalize to a collection
if (val instanceof Collection) {
- values = (Collection<?>)val;
+ values = (Collection<?>) val;
} else {
tmpList.set(0, val);
values = tmpList;
@@ -263,7 +265,7 @@ class XLSXWriter extends TabularResponseWriter {
} else {
// normalize to first value
if (val instanceof Collection) {
- Collection<?> values = (Collection<?>)val;
+ Collection<?> values = (Collection<?>) val;
val = values.iterator().next();
}
writeVal(xlField.name, val);
@@ -284,7 +286,7 @@ class XLSXWriter extends TabularResponseWriter {
while (val.hasNext()) {
Object v = val.next();
if (v instanceof IndexableField) {
- IndexableField f = (IndexableField)v;
+ IndexableField f = (IndexableField) v;
if (v instanceof Date) {
output.append(((Date) val).toInstant().toString()).append("; ");
} else {
@@ -295,8 +297,8 @@ class XLSXWriter extends TabularResponseWriter {
}
}
if (output.length() > 0) {
- output.deleteCharAt(output.length()-1);
- output.deleteCharAt(output.length()-1);
+ output.deleteCharAt(output.length() - 1);
+ output.deleteCharAt(output.length() - 1);
}
writeStr(name, output.toString(), false);
}
@@ -335,4 +337,4 @@ class XLSXWriter extends TabularResponseWriter {
public void writeDate(String name, String val) throws IOException {
wb.writeCell(val);
}
-}
\ No newline at end of file
+}
diff --git
a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/package-info.java
b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/package-info.java
index 729a276..48d84cb 100644
---
a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/package-info.java
+++
b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/package-info.java
@@ -14,12 +14,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
-/**
- * {@link org.apache.solr.handler.extraction.ExtractingRequestHandler} and
related code.
- */
-package org.apache.solr.handler.extraction;
-
-
-
+/** {@link org.apache.solr.handler.extraction.ExtractingRequestHandler} and
related code. */
+package org.apache.solr.handler.extraction;
diff --git
a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
index 91ebc82..7b7f924 100644
---
a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
+++
b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
@@ -20,7 +20,6 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.TimeZone;
-
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.ContentStream;
@@ -35,21 +34,21 @@ import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
-
-/**
- *
- *
- **/
+/** */
public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
@BeforeClass
public static void beforeClass() throws Exception {
// Is the JDK/env affected by a known bug?
- final String tzDisplayName = TimeZone.getDefault().getDisplayName(false,
TimeZone.SHORT, Locale.US);
+ final String tzDisplayName =
+ TimeZone.getDefault().getDisplayName(false, TimeZone.SHORT, Locale.US);
if (!tzDisplayName.matches("[A-Za-z]{3,}([+-]\\d\\d(:\\d\\d)?)?")) {
- assertTrue("Is some other JVM affected? Or bad regex? TzDisplayName: "
+ tzDisplayName,
+ assertTrue(
+ "Is some other JVM affected? Or bad regex? TzDisplayName: " +
tzDisplayName,
System.getProperty("java.version").startsWith("11"));
- assumeTrue("SOLR-12759 JDK 11 (1st release) and Tika 1.x can result in
extracting dates in a bad format.", false);
+ assumeTrue(
+ "SOLR-12759 JDK 11 (1st release) and Tika 1.x can result in
extracting dates in a bad format.",
+ false);
}
initCore("solrconfig.xml", "schema.xml",
getFile("extraction/solr").getAbsolutePath());
@@ -65,159 +64,265 @@ public class ExtractingRequestHandlerTest extends
SolrTestCaseJ4 {
@Test
public void testExtraction() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
+ ExtractingRequestHandler handler =
+ (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
- loadLocal("extraction/solr-word.pdf",
- "fmap.created", "extractedDate",
- "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator", "fmap.Keywords",
"extractedKeywords",
- "fmap.Creation-Date", "extractedDate",
- "uprefix", "ignored_",
- "fmap.Author", "extractedAuthor",
- "fmap.content", "extractedContent",
- "literal.id", "one",
- "fmap.Last-Modified", "extractedDate"
- );
+ loadLocal(
+ "extraction/solr-word.pdf",
+ "fmap.created",
+ "extractedDate",
+ "fmap.producer",
+ "extractedProducer",
+ "fmap.creator",
+ "extractedCreator",
+ "fmap.Keywords",
+ "extractedKeywords",
+ "fmap.Creation-Date",
+ "extractedDate",
+ "uprefix",
+ "ignored_",
+ "fmap.Author",
+ "extractedAuthor",
+ "fmap.content",
+ "extractedContent",
+ "literal.id",
+ "one",
+ "fmap.Last-Modified",
+ "extractedDate");
assertQ(req("title:solr-word"), "//*[@numFound='0']");
assertU(commit());
assertQ(req("title:solr-word"), "//*[@numFound='1']");
-
- loadLocal("extraction/simple.html", "fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator", "fmap.Keywords",
"extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "fmap.language", "extractedLanguage",
- "literal.id", "two",
- "uprefix", "ignored_",
- "fmap.content", "extractedContent",
- "fmap.Last-Modified", "extractedDate"
- );
+ loadLocal(
+ "extraction/simple.html",
+ "fmap.created",
+ "extractedDate",
+ "fmap.producer",
+ "extractedProducer",
+ "fmap.creator",
+ "extractedCreator",
+ "fmap.Keywords",
+ "extractedKeywords",
+ "fmap.Author",
+ "extractedAuthor",
+ "fmap.language",
+ "extractedLanguage",
+ "literal.id",
+ "two",
+ "uprefix",
+ "ignored_",
+ "fmap.content",
+ "extractedContent",
+ "fmap.Last-Modified",
+ "extractedDate");
assertQ(req("title:Welcome"), "//*[@numFound='0']");
assertU(commit());
assertQ(req("title:Welcome"), "//*[@numFound='1']");
- assertQ(req("extractedContent:distinctwords"), "//*[@numFound='0']");
- assertQ(req("extractedContent:distinct"), "//*[@numFound='1']");
- assertQ(req("extractedContent:words"), "//*[@numFound='2']");
+ assertQ(req("extractedContent:distinctwords"), "//*[@numFound='0']");
+ assertQ(req("extractedContent:distinct"), "//*[@numFound='1']");
+ assertQ(req("extractedContent:words"), "//*[@numFound='2']");
assertQ(req("extractedContent:\"distinct words\""), "//*[@numFound='1']");
- loadLocal("extraction/simple.html",
- "literal.id","simple2",
- "uprefix", "t_",
- "lowernames", "true",
- "captureAttr", "true",
- "fmap.a","t_href",
- "fmap.content_type", "abcxyz", // test that lowernames is applied
before mapping, and uprefix is applied after mapping
- "commit", "true" // test immediate commit
- );
+ loadLocal(
+ "extraction/simple.html",
+ "literal.id",
+ "simple2",
+ "uprefix",
+ "t_",
+ "lowernames",
+ "true",
+ "captureAttr",
+ "true",
+ "fmap.a",
+ "t_href",
+ "fmap.content_type",
+ "abcxyz", // test that lowernames is applied before mapping, and
uprefix is applied after
+ // mapping
+ "commit",
+ "true" // test immediate commit
+ );
// test that purposely causes a failure to print out the doc for test
debugging
// assertQ(req("q","id:simple2","indent","true"), "//*[@numFound='0']");
// test both lowernames and unknown field mapping
- //assertQ(req("+id:simple2 +t_content_type:[* TO *]"),
"//*[@numFound='1']");
+ // assertQ(req("+id:simple2 +t_content_type:[* TO *]"),
"//*[@numFound='1']");
assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']");
assertQ(req("+id:simple2 +t_abcxyz:[* TO *]"), "//*[@numFound='1']");
- assertQ(req("+id:simple2 +t_content:serif"), "//*[@numFound='0']"); //
make sure <style> content is excluded
- assertQ(req("+id:simple2 +t_content:blur"), "//*[@numFound='0']"); // make
sure <script> content is excluded
+ assertQ(
+ req("+id:simple2 +t_content:serif"),
+ "//*[@numFound='0']"); // make sure <style> content is excluded
+ assertQ(
+ req("+id:simple2 +t_content:blur"),
+ "//*[@numFound='0']"); // make sure <script> content is excluded
// make sure the fact there is an index-time boost does not fail the
parsing
- loadLocal("extraction/simple.html",
- "literal.id","simple3",
- "uprefix", "t_",
- "lowernames", "true",
- "captureAttr", "true", "fmap.a","t_href",
- "commit", "true"
-
- ,"boost.t_href", "100.0"
- );
+ loadLocal(
+ "extraction/simple.html",
+ "literal.id",
+ "simple3",
+ "uprefix",
+ "t_",
+ "lowernames",
+ "true",
+ "captureAttr",
+ "true",
+ "fmap.a",
+ "t_href",
+ "commit",
+ "true",
+ "boost.t_href",
+ "100.0");
assertQ(req("t_href:http"), "//*[@numFound='2']");
assertQ(req("t_href:http"), "//doc[2]/str[.='simple3']");
- assertQ(req("+id:simple3 +t_content_type:[* TO *]"),
"//*[@numFound='1']");//test lowercase and then uprefix
-
- loadLocal("extraction/version_control.xml", "fmap.created",
"extractedDate", "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator", "fmap.Keywords",
"extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "literal.id", "three",
- "uprefix", "ignored_",
- "fmap.content", "extractedContent",
- "fmap.language", "extractedLanguage",
- "fmap.Last-Modified", "extractedDate"
- );
+ assertQ(
+ req("+id:simple3 +t_content_type:[* TO *]"),
+ "//*[@numFound='1']"); // test lowercase and then uprefix
+
+ loadLocal(
+ "extraction/version_control.xml",
+ "fmap.created",
+ "extractedDate",
+ "fmap.producer",
+ "extractedProducer",
+ "fmap.creator",
+ "extractedCreator",
+ "fmap.Keywords",
+ "extractedKeywords",
+ "fmap.Author",
+ "extractedAuthor",
+ "literal.id",
+ "three",
+ "uprefix",
+ "ignored_",
+ "fmap.content",
+ "extractedContent",
+ "fmap.language",
+ "extractedLanguage",
+ "fmap.Last-Modified",
+ "extractedDate");
assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']");
assertU(commit());
assertQ(req("stream_name:version_control.xml"), "//*[@numFound='1']");
- loadLocal("extraction/word2003.doc", "fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator", "fmap.Keywords",
"extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "literal.id", "four",
- "uprefix", "ignored_",
- "fmap.content", "extractedContent",
- "fmap.language", "extractedLanguage",
- "fmap.Last-Modified", "extractedDate"
- );
+ loadLocal(
+ "extraction/word2003.doc",
+ "fmap.created",
+ "extractedDate",
+ "fmap.producer",
+ "extractedProducer",
+ "fmap.creator",
+ "extractedCreator",
+ "fmap.Keywords",
+ "extractedKeywords",
+ "fmap.Author",
+ "extractedAuthor",
+ "literal.id",
+ "four",
+ "uprefix",
+ "ignored_",
+ "fmap.content",
+ "extractedContent",
+ "fmap.language",
+ "extractedLanguage",
+ "fmap.Last-Modified",
+ "extractedDate");
assertQ(req("title:\"Word 2003 Title\""), "//*[@numFound='0']");
// There is already a PDF file with this content:
- assertQ(req("extractedContent:\"This is a test of PDF and Word extraction
in Solr, it is only a test\""), "//*[@numFound='1']");
+ assertQ(
+ req(
+ "extractedContent:\"This is a test of PDF and Word extraction in
Solr, it is only a test\""),
+ "//*[@numFound='1']");
assertU(commit());
assertQ(req("title:\"Word 2003 Title\""), "//*[@numFound='1']");
// now 2 of them:
- assertQ(req("extractedContent:\"This is a test of PDF and Word extraction
in Solr, it is only a test\""), "//*[@numFound='2']");
+ assertQ(
+ req(
+ "extractedContent:\"This is a test of PDF and Word extraction in
Solr, it is only a test\""),
+ "//*[@numFound='2']");
// compressed file
- loadLocal("extraction/tiny.txt.gz",
- "fmap.created", "extractedDate",
- "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator",
- "fmap.Keywords", "extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "uprefix", "ignored_",
- "fmap.content", "extractedContent",
- "fmap.language", "extractedLanguage",
- "fmap.Last-Modified", "extractedDate",
- "literal.id", "tiny.txt.gz");
+ loadLocal(
+ "extraction/tiny.txt.gz",
+ "fmap.created",
+ "extractedDate",
+ "fmap.producer",
+ "extractedProducer",
+ "fmap.creator",
+ "extractedCreator",
+ "fmap.Keywords",
+ "extractedKeywords",
+ "fmap.Author",
+ "extractedAuthor",
+ "uprefix",
+ "ignored_",
+ "fmap.content",
+ "extractedContent",
+ "fmap.language",
+ "extractedLanguage",
+ "fmap.Last-Modified",
+ "extractedDate",
+ "literal.id",
+ "tiny.txt.gz");
assertU(commit());
- assertQ(req("id:tiny.txt.gz")
- , "//*[@numFound='1']"
- , "//*/arr[@name='stream_name']/str[.='tiny.txt.gz']"
- );
+ assertQ(
+ req("id:tiny.txt.gz"),
+ "//*[@numFound='1']",
+ "//*/arr[@name='stream_name']/str[.='tiny.txt.gz']");
// compressed file
- loadLocal("extraction/open-document.odt",
- "uprefix", "ignored_",
- "fmap.content", "extractedContent",
- "literal.id", "open-document");
+ loadLocal(
+ "extraction/open-document.odt",
+ "uprefix",
+ "ignored_",
+ "fmap.content",
+ "extractedContent",
+ "literal.id",
+ "open-document");
assertU(commit());
- assertQ(req("extractedContent:\"Práctica sobre GnuPG\"")
- , "//*[@numFound='1']"
- , "//*/arr[@name='stream_name']/str[.='open-document.odt']"
- );
+ assertQ(
+ req("extractedContent:\"Práctica sobre GnuPG\""),
+ "//*[@numFound='1']",
+ "//*/arr[@name='stream_name']/str[.='open-document.odt']");
}
@Test
public void testCapture() throws Exception {
- loadLocal("extraction/simple.html",
- "literal.id","capture1",
- "uprefix","t_",
- "capture","div",
- "fmap.div", "foo_t",
- "commit", "true"
- );
+ loadLocal(
+ "extraction/simple.html",
+ "literal.id",
+ "capture1",
+ "uprefix",
+ "t_",
+ "capture",
+ "div",
+ "fmap.div",
+ "foo_t",
+ "commit",
+ "true");
assertQ(req("+id:capture1 +t_content:Solr"), "//*[@numFound='1']");
assertQ(req("+id:capture1 +foo_t:\"here is some text in a div\""),
"//*[@numFound='1']");
- loadLocal("extraction/simple.html",
- "literal.id", "capture2",
- "captureAttr", "true",
- "defaultField", "text",
- "fmap.div", "div_t",
- "fmap.a", "anchor_t",
- "capture", "div",
- "capture", "a",
- "commit", "true"
- );
+ loadLocal(
+ "extraction/simple.html",
+ "literal.id",
+ "capture2",
+ "captureAttr",
+ "true",
+ "defaultField",
+ "text",
+ "fmap.div",
+ "div_t",
+ "fmap.a",
+ "anchor_t",
+ "capture",
+ "div",
+ "capture",
+ "a",
+ "commit",
+ "true");
assertQ(req("+id:capture2 +text:Solr"), "//*[@numFound='1']");
assertQ(req("+id:capture2 +div_t:\"here is some text in a div\""),
"//*[@numFound='1']");
assertQ(req("+id:capture2 +anchor_t:http\\://www.apache.org"),
"//*[@numFound='1']");
@@ -226,68 +331,106 @@ public class ExtractingRequestHandlerTest extends
SolrTestCaseJ4 {
@Test
public void testDefaultField() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
+ ExtractingRequestHandler handler =
+ (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
assertNotNull("handler is null and it shouldn't be", handler);
try {
ignoreException("unknown field 'a'");
- ignoreException("unknown field 'meta'"); // TODO: should this exception
be happening?
- expectThrows(SolrException.class, () -> {
- loadLocal("extraction/simple.html",
- "literal.id", "simple2",
- "lowernames", "true",
- "captureAttr", "true",
- //"fmap.content_type", "abcxyz",
- "commit", "true" // test immediate commit
- );
- });
+ ignoreException("unknown field 'meta'"); // TODO: should this exception
be happening?
+ expectThrows(
+ SolrException.class,
+ () -> {
+ loadLocal(
+ "extraction/simple.html",
+ "literal.id",
+ "simple2",
+ "lowernames",
+ "true",
+ "captureAttr",
+ "true",
+ // "fmap.content_type", "abcxyz",
+ "commit",
+ "true" // test immediate commit
+ );
+ });
} finally {
resetExceptionIgnores();
}
-
-
- loadLocal("extraction/simple.html",
- "literal.id","simple2",
- ExtractingParams.DEFAULT_FIELD, "defaultExtr",//test that unmapped
fields go to the text field when no uprefix is specified
- "lowernames", "true",
- "captureAttr", "true",
- //"fmap.content_type", "abcxyz",
- "commit", "true" // test immediate commit
- );
+
+ loadLocal(
+ "extraction/simple.html",
+ "literal.id",
+ "simple2",
+ ExtractingParams.DEFAULT_FIELD,
+ "defaultExtr", // test that unmapped fields go to the text field when
no uprefix is
+ // specified
+ "lowernames",
+ "true",
+ "captureAttr",
+ "true",
+ // "fmap.content_type", "abcxyz",
+ "commit",
+ "true" // test immediate commit
+ );
assertQ(req("id:simple2"), "//*[@numFound='1']");
assertQ(req("defaultExtr:http\\:\\/\\/www.apache.org"),
"//*[@numFound='1']");
- //Test when both uprefix and default are specified.
- loadLocal("extraction/simple.html",
- "literal.id","simple2",
- ExtractingParams.DEFAULT_FIELD, "defaultExtr",//test that unmapped
fields go to the text field when no uprefix is specified
- ExtractingParams.UNKNOWN_FIELD_PREFIX, "t_",
- "lowernames", "true",
- "captureAttr", "true",
- "fmap.a","t_href",
- //"fmap.content_type", "abcxyz",
- "commit", "true" // test immediate commit
- );
+ // Test when both uprefix and default are specified.
+ loadLocal(
+ "extraction/simple.html",
+ "literal.id",
+ "simple2",
+ ExtractingParams.DEFAULT_FIELD,
+ "defaultExtr", // test that unmapped fields go to the text field when
no uprefix is
+ // specified
+ ExtractingParams.UNKNOWN_FIELD_PREFIX,
+ "t_",
+ "lowernames",
+ "true",
+ "captureAttr",
+ "true",
+ "fmap.a",
+ "t_href",
+ // "fmap.content_type", "abcxyz",
+ "commit",
+ "true" // test immediate commit
+ );
assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']");
}
@Test
public void testLiterals() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
+ ExtractingRequestHandler handler =
+ (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
- //test literal
- loadLocal("extraction/version_control.xml", "fmap.created",
"extractedDate", "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator", "fmap.Keywords",
"extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "fmap.content", "extractedContent",
- "literal.id", "one",
- "uprefix", "ignored_",
- "fmap.language", "extractedLanguage",
- "literal.extractionLiteralMV", "one",
- "literal.extractionLiteralMV", "two",
- "fmap.Last-Modified", "extractedDate"
-
- );
+ // test literal
+ loadLocal(
+ "extraction/version_control.xml",
+ "fmap.created",
+ "extractedDate",
+ "fmap.producer",
+ "extractedProducer",
+ "fmap.creator",
+ "extractedCreator",
+ "fmap.Keywords",
+ "extractedKeywords",
+ "fmap.Author",
+ "extractedAuthor",
+ "fmap.content",
+ "extractedContent",
+ "literal.id",
+ "one",
+ "uprefix",
+ "ignored_",
+ "fmap.language",
+ "extractedLanguage",
+ "literal.extractionLiteralMV",
+ "one",
+ "literal.extractionLiteralMV",
+ "two",
+ "fmap.Last-Modified",
+ "extractedDate");
assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']");
assertU(commit());
assertQ(req("stream_name:version_control.xml"), "//*[@numFound='1']");
@@ -297,121 +440,170 @@ public class ExtractingRequestHandlerTest extends
SolrTestCaseJ4 {
try {
// TODO: original author did not specify why an exception should be
thrown... how to fix?
- loadLocal("extraction/version_control.xml", "fmap.created",
"extractedDate", "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator", "fmap.Keywords",
"extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "fmap.content", "extractedContent",
- "literal.id", "two",
- "fmap.language", "extractedLanguage",
- "literal.extractionLiteral", "one",
- "literal.extractionLiteral", "two",
- "fmap.X-Parsed-By", "ignored_parser",
- "fmap.Last-Modified", "extractedDate"
- );
+ loadLocal(
+ "extraction/version_control.xml",
+ "fmap.created",
+ "extractedDate",
+ "fmap.producer",
+ "extractedProducer",
+ "fmap.creator",
+ "extractedCreator",
+ "fmap.Keywords",
+ "extractedKeywords",
+ "fmap.Author",
+ "extractedAuthor",
+ "fmap.content",
+ "extractedContent",
+ "literal.id",
+ "two",
+ "fmap.language",
+ "extractedLanguage",
+ "literal.extractionLiteral",
+ "one",
+ "literal.extractionLiteral",
+ "two",
+ "fmap.X-Parsed-By",
+ "ignored_parser",
+ "fmap.Last-Modified",
+ "extractedDate");
// TODO: original author did not specify why an exception should be
thrown... how to fix?
// assertTrue("Exception should have been thrown", false);
} catch (SolrException e) {
- //nothing to see here, move along
+ // nothing to see here, move along
}
- loadLocal("extraction/version_control.xml", "fmap.created",
"extractedDate", "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator", "fmap.Keywords",
"extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "fmap.content", "extractedContent",
- "literal.id", "three",
- "fmap.language", "extractedLanguage",
- "literal.extractionLiteral", "one",
- "fmap.X-Parsed-By", "ignored_parser",
- "fmap.Last-Modified", "extractedDate"
- );
+ loadLocal(
+ "extraction/version_control.xml",
+ "fmap.created",
+ "extractedDate",
+ "fmap.producer",
+ "extractedProducer",
+ "fmap.creator",
+ "extractedCreator",
+ "fmap.Keywords",
+ "extractedKeywords",
+ "fmap.Author",
+ "extractedAuthor",
+ "fmap.content",
+ "extractedContent",
+ "literal.id",
+ "three",
+ "fmap.language",
+ "extractedLanguage",
+ "literal.extractionLiteral",
+ "one",
+ "fmap.X-Parsed-By",
+ "ignored_parser",
+ "fmap.Last-Modified",
+ "extractedDate");
assertU(commit());
assertQ(req("extractionLiteral:one"), "//*[@numFound='1']");
-
}
public void testLiteralDefaults() throws Exception {
// sanity check config
- loadLocalFromHandler("/update/extract/lit-def",
- "extraction/simple.html",
- "literal.id", "lit-def-simple");
+ loadLocalFromHandler(
+ "/update/extract/lit-def", "extraction/simple.html", "literal.id",
"lit-def-simple");
assertU(commit());
- assertQ(req("q", "id:lit-def-simple")
- , "//*[@numFound='1']"
- , "count(//arr[@name='foo_s']/str)=1"
- , "//arr[@name='foo_s']/str[.='x']"
- , "count(//arr[@name='bar_s']/str)=1"
- , "//arr[@name='bar_s']/str[.='y']"
- , "count(//arr[@name='zot_s']/str)=1"
- , "//arr[@name='zot_s']/str[.='z']"
- );
-
+ assertQ(
+ req("q", "id:lit-def-simple"),
+ "//*[@numFound='1']",
+ "count(//arr[@name='foo_s']/str)=1",
+ "//arr[@name='foo_s']/str[.='x']",
+ "count(//arr[@name='bar_s']/str)=1",
+ "//arr[@name='bar_s']/str[.='y']",
+ "count(//arr[@name='zot_s']/str)=1",
+ "//arr[@name='zot_s']/str[.='z']");
+
// override the default foo_s
- loadLocalFromHandler("/update/extract/lit-def",
- "extraction/simple.html",
- "literal.foo_s", "1111",
- "literal.id", "lit-def-simple");
+ loadLocalFromHandler(
+ "/update/extract/lit-def",
+ "extraction/simple.html",
+ "literal.foo_s",
+ "1111",
+ "literal.id",
+ "lit-def-simple");
assertU(commit());
- assertQ(req("q", "id:lit-def-simple")
- , "//*[@numFound='1']"
- , "count(//arr[@name='foo_s']/str)=1"
- , "//arr[@name='foo_s']/str[.='1111']"
- , "count(//arr[@name='bar_s']/str)=1"
- , "//arr[@name='bar_s']/str[.='y']"
- , "count(//arr[@name='zot_s']/str)=1"
- , "//arr[@name='zot_s']/str[.='z']"
- );
+ assertQ(
+ req("q", "id:lit-def-simple"),
+ "//*[@numFound='1']",
+ "count(//arr[@name='foo_s']/str)=1",
+ "//arr[@name='foo_s']/str[.='1111']",
+ "count(//arr[@name='bar_s']/str)=1",
+ "//arr[@name='bar_s']/str[.='y']",
+ "count(//arr[@name='zot_s']/str)=1",
+ "//arr[@name='zot_s']/str[.='z']");
// pre-pend the bar_s
- loadLocalFromHandler("/update/extract/lit-def",
- "extraction/simple.html",
- "literal.bar_s", "2222",
- "literal.id", "lit-def-simple");
+ loadLocalFromHandler(
+ "/update/extract/lit-def",
+ "extraction/simple.html",
+ "literal.bar_s",
+ "2222",
+ "literal.id",
+ "lit-def-simple");
assertU(commit());
- assertQ(req("q", "id:lit-def-simple")
- , "//*[@numFound='1']"
- , "count(//arr[@name='foo_s']/str)=1"
- , "//arr[@name='foo_s']/str[.='x']"
- , "count(//arr[@name='bar_s']/str)=2"
- , "//arr[@name='bar_s']/str[.='2222']"
- , "//arr[@name='bar_s']/str[.='y']"
- , "count(//arr[@name='zot_s']/str)=1"
- , "//arr[@name='zot_s']/str[.='z']"
- );
+ assertQ(
+ req("q", "id:lit-def-simple"),
+ "//*[@numFound='1']",
+ "count(//arr[@name='foo_s']/str)=1",
+ "//arr[@name='foo_s']/str[.='x']",
+ "count(//arr[@name='bar_s']/str)=2",
+ "//arr[@name='bar_s']/str[.='2222']",
+ "//arr[@name='bar_s']/str[.='y']",
+ "count(//arr[@name='zot_s']/str)=1",
+ "//arr[@name='zot_s']/str[.='z']");
// invariant zot_s can not be changed
- loadLocalFromHandler("/update/extract/lit-def",
- "extraction/simple.html",
- "literal.zot_s", "3333",
- "literal.id", "lit-def-simple");
+ loadLocalFromHandler(
+ "/update/extract/lit-def",
+ "extraction/simple.html",
+ "literal.zot_s",
+ "3333",
+ "literal.id",
+ "lit-def-simple");
assertU(commit());
- assertQ(req("q", "id:lit-def-simple")
- , "//*[@numFound='1']"
- , "count(//arr[@name='foo_s']/str)=1"
- , "//arr[@name='foo_s']/str[.='x']"
- , "count(//arr[@name='bar_s']/str)=1"
- , "//arr[@name='bar_s']/str[.='y']"
- , "count(//arr[@name='zot_s']/str)=1"
- , "//arr[@name='zot_s']/str[.='z']"
- );
-
+ assertQ(
+ req("q", "id:lit-def-simple"),
+ "//*[@numFound='1']",
+ "count(//arr[@name='foo_s']/str)=1",
+ "//arr[@name='foo_s']/str[.='x']",
+ "count(//arr[@name='bar_s']/str)=1",
+ "//arr[@name='bar_s']/str[.='y']",
+ "count(//arr[@name='zot_s']/str)=1",
+ "//arr[@name='zot_s']/str[.='z']");
}
@Test
public void testPlainTextSpecifyingMimeType() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
+ ExtractingRequestHandler handler =
+ (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
// Load plain text specifying MIME type:
- loadLocal("extraction/version_control.txt", "fmap.created",
"extractedDate", "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator", "fmap.Keywords",
"extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "literal.id", "one",
- "fmap.language", "extractedLanguage",
- "fmap.X-Parsed-By", "ignored_parser",
- "fmap.content", "extractedContent",
- ExtractingParams.STREAM_TYPE, "text/plain"
- );
+ loadLocal(
+ "extraction/version_control.txt",
+ "fmap.created",
+ "extractedDate",
+ "fmap.producer",
+ "extractedProducer",
+ "fmap.creator",
+ "extractedCreator",
+ "fmap.Keywords",
+ "extractedKeywords",
+ "fmap.Author",
+ "extractedAuthor",
+ "literal.id",
+ "one",
+ "fmap.language",
+ "extractedLanguage",
+ "fmap.X-Parsed-By",
+ "ignored_parser",
+ "fmap.content",
+ "extractedContent",
+ ExtractingParams.STREAM_TYPE,
+ "text/plain");
assertQ(req("extractedContent:Apache"), "//*[@numFound='0']");
assertU(commit());
assertQ(req("extractedContent:Apache"), "//*[@numFound='1']");
@@ -419,19 +611,33 @@ public class ExtractingRequestHandlerTest extends
SolrTestCaseJ4 {
@Test
public void testPlainTextSpecifyingResourceName() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
+ ExtractingRequestHandler handler =
+ (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
// Load plain text specifying filename
- loadLocal("extraction/version_control.txt", "fmap.created",
"extractedDate", "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator", "fmap.Keywords",
"extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "literal.id", "one",
- "fmap.language", "extractedLanguage",
- "fmap.X-Parsed-By", "ignored_parser",
- "fmap.content", "extractedContent",
- ExtractingParams.RESOURCE_NAME, "extraction/version_control.txt"
- );
+ loadLocal(
+ "extraction/version_control.txt",
+ "fmap.created",
+ "extractedDate",
+ "fmap.producer",
+ "extractedProducer",
+ "fmap.creator",
+ "extractedCreator",
+ "fmap.Keywords",
+ "extractedKeywords",
+ "fmap.Author",
+ "extractedAuthor",
+ "literal.id",
+ "one",
+ "fmap.language",
+ "extractedLanguage",
+ "fmap.X-Parsed-By",
+ "ignored_parser",
+ "fmap.content",
+ "extractedContent",
+ ExtractingParams.RESOURCE_NAME,
+ "extraction/version_control.txt");
assertQ(req("extractedContent:Apache"), "//*[@numFound='0']");
assertU(commit());
assertQ(req("extractedContent:Apache"), "//*[@numFound='1']");
@@ -439,18 +645,24 @@ public class ExtractingRequestHandlerTest extends
SolrTestCaseJ4 {
@Test
public void testCommitWithin() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
+ ExtractingRequestHandler handler =
+ (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
-
- SolrQueryRequest req = req("literal.id", "one",
- ExtractingParams.RESOURCE_NAME,
"extraction/version_control.txt",
- "commitWithin", "200"
- );
+
+ SolrQueryRequest req =
+ req(
+ "literal.id",
+ "one",
+ ExtractingParams.RESOURCE_NAME,
+ "extraction/version_control.txt",
+ "commitWithin",
+ "200");
SolrQueryResponse rsp = new SolrQueryResponse();
BufferingRequestProcessor p = new BufferingRequestProcessor(null);
ExtractingDocumentLoader loader = (ExtractingDocumentLoader)
handler.newLoader(req, p);
- loader.load(req, rsp, new
ContentStreamBase.FileStream(getFile("extraction/version_control.txt")),p);
+ loader.load(
+ req, rsp, new
ContentStreamBase.FileStream(getFile("extraction/version_control.txt")), p);
AddUpdateCommand add = p.addCommands.get(0);
assertEquals(200, add.commitWithin);
@@ -458,20 +670,25 @@ public class ExtractingRequestHandlerTest extends
SolrTestCaseJ4 {
req.close();
}
- // Note: If you load a plain text file specifying neither MIME type nor
filename, extraction will silently fail. This is because Tika's
- // automatic MIME type detection will fail, and it will default to using an
empty-string-returning default parser
+ // Note: If you load a plain text file specifying neither MIME type nor
filename, extraction will
+ // silently fail. This is because Tika's
+ // automatic MIME type detection will fail, and it will default to using an
empty-string-returning
+ // default parser
@Test
public void testExtractOnly() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
+ ExtractingRequestHandler handler =
+ (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
- SolrQueryResponse rsp = loadLocal("extraction/solr-word.pdf",
ExtractingParams.EXTRACT_ONLY, "true");
+ SolrQueryResponse rsp =
+ loadLocal("extraction/solr-word.pdf", ExtractingParams.EXTRACT_ONLY,
"true");
assertTrue("rsp is null and it shouldn't be", rsp != null);
NamedList<?> list = rsp.getValues();
String extraction = (String) list.get("solr-word.pdf");
assertTrue("extraction is null and it shouldn't be", extraction != null);
- assertTrue(extraction + " does not contain " + "solr-word",
extraction.indexOf("solr-word") != -1);
+ assertTrue(
+ extraction + " does not contain " + "solr-word",
extraction.indexOf("solr-word") != -1);
NamedList<?> nl = (NamedList<?>) list.get("solr-word.pdf_metadata");
assertTrue("nl is null and it shouldn't be", nl != null);
@@ -479,66 +696,95 @@ public class ExtractingRequestHandlerTest extends
SolrTestCaseJ4 {
assertTrue("title is null and it shouldn't be", title != null);
assertTrue(extraction.indexOf("<?xml") != -1);
- rsp = loadLocal("extraction/solr-word.pdf", ExtractingParams.EXTRACT_ONLY,
"true",
- ExtractingParams.EXTRACT_FORMAT,
ExtractingDocumentLoader.TEXT_FORMAT);
+ rsp =
+ loadLocal(
+ "extraction/solr-word.pdf",
+ ExtractingParams.EXTRACT_ONLY,
+ "true",
+ ExtractingParams.EXTRACT_FORMAT,
+ ExtractingDocumentLoader.TEXT_FORMAT);
assertTrue("rsp is null and it shouldn't be", rsp != null);
list = rsp.getValues();
extraction = (String) list.get("solr-word.pdf");
assertTrue("extraction is null and it shouldn't be", extraction != null);
- assertTrue(extraction + " does not contain " + "solr-word",
extraction.indexOf("solr-word") != -1);
+ assertTrue(
+ extraction + " does not contain " + "solr-word",
extraction.indexOf("solr-word") != -1);
assertTrue(extraction.indexOf("<?xml") == -1);
nl = (NamedList<?>) list.get("solr-word.pdf_metadata");
assertTrue("nl is null and it shouldn't be", nl != null);
title = nl.get("title");
assertTrue("title is null and it shouldn't be", title != null);
-
-
-
}
@Test
public void testXPath() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
+ ExtractingRequestHandler handler =
+ (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
- SolrQueryResponse rsp = loadLocal("extraction/example.html",
- ExtractingParams.XPATH_EXPRESSION,
"/xhtml:html/xhtml:body/xhtml:a/descendant::node()",
- ExtractingParams.EXTRACT_ONLY, "true"
- );
+ SolrQueryResponse rsp =
+ loadLocal(
+ "extraction/example.html",
+ ExtractingParams.XPATH_EXPRESSION,
+ "/xhtml:html/xhtml:body/xhtml:a/descendant::node()",
+ ExtractingParams.EXTRACT_ONLY,
+ "true");
assertTrue("rsp is null and it shouldn't be", rsp != null);
NamedList<?> list = rsp.getValues();
String val = (String) list.get("example.html");
- assertEquals("News", val.trim()); //there is only one matching <a> tag
-
- loadLocal("extraction/example.html",
- "literal.id", "example1",
- "captureAttr", "true",
- "defaultField", "text",
- "capture", "div",
- "fmap.div", "foo_t",
- "boost.foo_t", "3",
- "xpath", "/xhtml:html/xhtml:body/xhtml:div//node()",
- "commit", "true"
- );
+ assertEquals("News", val.trim()); // there is only one matching <a> tag
+
+ loadLocal(
+ "extraction/example.html",
+ "literal.id",
+ "example1",
+ "captureAttr",
+ "true",
+ "defaultField",
+ "text",
+ "capture",
+ "div",
+ "fmap.div",
+ "foo_t",
+ "boost.foo_t",
+ "3",
+ "xpath",
+ "/xhtml:html/xhtml:body/xhtml:div//node()",
+ "commit",
+ "true");
assertQ(req("+id:example1 +foo_t:\"here is some text in a div\""),
"//*[@numFound='1']");
}
/** test arabic PDF extraction is functional */
@Test
public void testArabicPDF() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler)
- h.getCore().getRequestHandler("/update/extract");
+ ExtractingRequestHandler handler =
+ (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
- loadLocal("extraction/arabic.pdf", "fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator", "fmap.Keywords",
"extractedKeywords",
- "fmap.Creation-Date", "extractedDate",
- "fmap.Author", "extractedAuthor",
- "uprefix", "ignored_",
- "fmap.content", "wdf_nocase",
- "literal.id", "one",
- "fmap.Last-Modified", "extractedDate");
+ loadLocal(
+ "extraction/arabic.pdf",
+ "fmap.created",
+ "extractedDate",
+ "fmap.producer",
+ "extractedProducer",
+ "fmap.creator",
+ "extractedCreator",
+ "fmap.Keywords",
+ "extractedKeywords",
+ "fmap.Creation-Date",
+ "extractedDate",
+ "fmap.Author",
+ "extractedAuthor",
+ "uprefix",
+ "ignored_",
+ "fmap.content",
+ "wdf_nocase",
+ "literal.id",
+ "one",
+ "fmap.Last-Modified",
+ "extractedDate");
assertQ(req("wdf_nocase:السلم"), "//result[@numFound=0]");
assertU(commit());
assertQ(req("wdf_nocase:السلم"), "//result[@numFound=1]");
@@ -546,133 +792,215 @@ public class ExtractingRequestHandlerTest extends
SolrTestCaseJ4 {
@Test
public void testTikaExceptionHandling() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler)
- h.getCore().getRequestHandler("/update/extract");
+ ExtractingRequestHandler handler =
+ (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
- expectThrows(Exception.class, () -> {
- loadLocal("extraction/password-is-solrcell.docx", "literal.id", "one");
- });
+ expectThrows(
+ Exception.class,
+ () -> {
+ loadLocal("extraction/password-is-solrcell.docx", "literal.id",
"one");
+ });
assertU(commit());
assertQ(req("*:*"), "//result[@numFound=0]");
- try{
- loadLocal("extraction/password-is-solrcell.docx", "fmap.created",
"extractedDate", "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator", "fmap.Keywords",
"extractedKeywords",
- "fmap.Creation-Date", "extractedDate",
- "uprefix", "ignored_",
- "fmap.Author", "extractedAuthor",
- "fmap.content", "wdf_nocase",
- "literal.id", "one",
- "ignoreTikaException", "true", // set ignore flag
- "fmap.Last-Modified", "extractedDate");
- }
- catch(Exception e){
+ try {
+ loadLocal(
+ "extraction/password-is-solrcell.docx",
+ "fmap.created",
+ "extractedDate",
+ "fmap.producer",
+ "extractedProducer",
+ "fmap.creator",
+ "extractedCreator",
+ "fmap.Keywords",
+ "extractedKeywords",
+ "fmap.Creation-Date",
+ "extractedDate",
+ "uprefix",
+ "ignored_",
+ "fmap.Author",
+ "extractedAuthor",
+ "fmap.content",
+ "wdf_nocase",
+ "literal.id",
+ "one",
+ "ignoreTikaException",
+ "true", // set ignore flag
+ "fmap.Last-Modified",
+ "extractedDate");
+ } catch (Exception e) {
fail("TikaException should be ignored.");
}
assertU(commit());
assertQ(req("*:*"), "//result[@numFound=1]");
}
-
+
@Test
public void testWrongStreamType() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
+ ExtractingRequestHandler handler =
+ (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
- expectThrows(Exception.class, () -> {
- // Load plain text specifying another mime type, should fail
- loadLocal("extraction/version_control.txt",
- "literal.id", "one",
- ExtractingParams.STREAM_TYPE, "application/pdf"
- );
- });
-
- expectThrows(Exception.class, () -> {
- // Load plain text specifying non existing mimetype, should fail
- loadLocal("extraction/version_control.txt",
- "literal.id", "one",
- ExtractingParams.STREAM_TYPE, "foo/bar"
- );
- });
+ expectThrows(
+ Exception.class,
+ () -> {
+ // Load plain text specifying another mime type, should fail
+ loadLocal(
+ "extraction/version_control.txt",
+ "literal.id",
+ "one",
+ ExtractingParams.STREAM_TYPE,
+ "application/pdf");
+ });
+
+ expectThrows(
+ Exception.class,
+ () -> {
+ // Load plain text specifying non existing mimetype, should fail
+ loadLocal(
+ "extraction/version_control.txt",
+ "literal.id",
+ "one",
+ ExtractingParams.STREAM_TYPE,
+ "foo/bar");
+ });
}
public void testLiteralsOverride() throws Exception {
- ExtractingRequestHandler handler = (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
+ ExtractingRequestHandler handler =
+ (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
-
+
assertQ(req("*:*"), "//*[@numFound='0']");
// Here Tika should parse out a title for this document:
- loadLocal("extraction/solr-word.pdf",
- "fmap.created", "extractedDate",
- "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator",
- "fmap.Keywords", "extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "literal.id", "three",
- "fmap.content", "extractedContent",
- "fmap.language", "extractedLanguage",
- "fmap.Creation-Date", "extractedDate",
- "uprefix", "ignored_",
- "fmap.Last-Modified", "extractedDate");
+ loadLocal(
+ "extraction/solr-word.pdf",
+ "fmap.created",
+ "extractedDate",
+ "fmap.producer",
+ "extractedProducer",
+ "fmap.creator",
+ "extractedCreator",
+ "fmap.Keywords",
+ "extractedKeywords",
+ "fmap.Author",
+ "extractedAuthor",
+ "literal.id",
+ "three",
+ "fmap.content",
+ "extractedContent",
+ "fmap.language",
+ "extractedLanguage",
+ "fmap.Creation-Date",
+ "extractedDate",
+ "uprefix",
+ "ignored_",
+ "fmap.Last-Modified",
+ "extractedDate");
// Here the literal value should override the Tika-parsed title:
- loadLocal("extraction/solr-word.pdf",
- "literal.title", "wolf-man",
- "fmap.created", "extractedDate",
- "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator",
- "fmap.Keywords", "extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "literal.id", "four",
- "fmap.content", "extractedContent",
- "fmap.language", "extractedLanguage",
- "fmap.Creation-Date", "extractedDate",
- "uprefix", "ignored_",
- "fmap.Last-Modified", "extractedDate");
+ loadLocal(
+ "extraction/solr-word.pdf",
+ "literal.title",
+ "wolf-man",
+ "fmap.created",
+ "extractedDate",
+ "fmap.producer",
+ "extractedProducer",
+ "fmap.creator",
+ "extractedCreator",
+ "fmap.Keywords",
+ "extractedKeywords",
+ "fmap.Author",
+ "extractedAuthor",
+ "literal.id",
+ "four",
+ "fmap.content",
+ "extractedContent",
+ "fmap.language",
+ "extractedLanguage",
+ "fmap.Creation-Date",
+ "extractedDate",
+ "uprefix",
+ "ignored_",
+ "fmap.Last-Modified",
+ "extractedDate");
// Here we mimic the old behaviour where literals are added, not overridden
- loadLocal("extraction/solr-word.pdf",
- "literalsOverride", "false",
- // Trick - we first map the metadata-title to an ignored field
before we replace with literal title
- "fmap.title", "ignored_a",
- "literal.title", "old-behaviour",
- "literal.extractedKeywords", "literalkeyword",
- "fmap.created", "extractedDate",
- "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator",
- "fmap.Keywords", "extractedKeywords",
- "fmap.Author", "extractedAuthor",
- "literal.id", "five",
- "fmap.content", "extractedContent",
- "fmap.language", "extractedLanguage",
- "fmap.Creation-Date", "extractedDate",
- "uprefix", "ignored_",
- "fmap.Last-Modified", "extractedDate");
+ loadLocal(
+ "extraction/solr-word.pdf",
+ "literalsOverride",
+ "false",
+ // Trick - we first map the metadata-title to an ignored field before
we replace with
+ // literal title
+ "fmap.title",
+ "ignored_a",
+ "literal.title",
+ "old-behaviour",
+ "literal.extractedKeywords",
+ "literalkeyword",
+ "fmap.created",
+ "extractedDate",
+ "fmap.producer",
+ "extractedProducer",
+ "fmap.creator",
+ "extractedCreator",
+ "fmap.Keywords",
+ "extractedKeywords",
+ "fmap.Author",
+ "extractedAuthor",
+ "literal.id",
+ "five",
+ "fmap.content",
+ "extractedContent",
+ "fmap.language",
+ "extractedLanguage",
+ "fmap.Creation-Date",
+ "extractedDate",
+ "uprefix",
+ "ignored_",
+ "fmap.Last-Modified",
+ "extractedDate");
assertU(commit());
assertQ(req("title:solr-word"), "//*[@numFound='1']");
assertQ(req("title:wolf-man"), "//*[@numFound='1']");
- assertQ(req("extractedKeywords:(solr AND word AND pdf AND
literalkeyword)"), "//*[@numFound='1']");
+ assertQ(
+ req("extractedKeywords:(solr AND word AND pdf AND literalkeyword)"),
"//*[@numFound='1']");
}
@Test
public void testPdfWithImages() throws Exception {
- //Tests possibility to configure ParseContext (by example to extract
embedded images from pdf)
- loadLocal("extraction/pdf-with-image.pdf",
- "fmap.created", "extractedDate",
- "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator",
- "fmap.Keywords", "extractedKeywords",
- "fmap.Creation-Date", "extractedDate",
- "uprefix", "ignored_",
- "fmap.Author", "extractedAuthor",
- "fmap.content", "wdf_nocase",
- "literal.id", "pdfWithImage",
- "resource.name", "pdf-with-image.pdf",
- "resource.password", "solrRules",
- "fmap.Last-Modified", "extractedDate");
+ // Tests possibility to configure ParseContext (by example to extract
embedded images from pdf)
+ loadLocal(
+ "extraction/pdf-with-image.pdf",
+ "fmap.created",
+ "extractedDate",
+ "fmap.producer",
+ "extractedProducer",
+ "fmap.creator",
+ "extractedCreator",
+ "fmap.Keywords",
+ "extractedKeywords",
+ "fmap.Creation-Date",
+ "extractedDate",
+ "uprefix",
+ "ignored_",
+ "fmap.Author",
+ "extractedAuthor",
+ "fmap.content",
+ "wdf_nocase",
+ "literal.id",
+ "pdfWithImage",
+ "resource.name",
+ "pdf-with-image.pdf",
+ "resource.password",
+ "solrRules",
+ "fmap.Last-Modified",
+ "extractedDate");
assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='0']");
assertU(commit());
@@ -682,74 +1010,126 @@ public class ExtractingRequestHandlerTest extends
SolrTestCaseJ4 {
@Test
public void testPasswordProtected() throws Exception {
// PDF, Passwords from resource.password
- loadLocal("extraction/encrypted-password-is-solrRules.pdf",
- "fmap.created", "extractedDate",
- "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator",
- "fmap.Keywords", "extractedKeywords",
- "fmap.Creation-Date", "extractedDate",
- "uprefix", "ignored_",
- "fmap.Author", "extractedAuthor",
- "fmap.content", "wdf_nocase",
- "literal.id", "pdfpwliteral",
- "resource.name", "encrypted-password-is-solrRules.pdf",
- "resource.password", "solrRules",
- "fmap.Last-Modified", "extractedDate");
+ loadLocal(
+ "extraction/encrypted-password-is-solrRules.pdf",
+ "fmap.created",
+ "extractedDate",
+ "fmap.producer",
+ "extractedProducer",
+ "fmap.creator",
+ "extractedCreator",
+ "fmap.Keywords",
+ "extractedKeywords",
+ "fmap.Creation-Date",
+ "extractedDate",
+ "uprefix",
+ "ignored_",
+ "fmap.Author",
+ "extractedAuthor",
+ "fmap.content",
+ "wdf_nocase",
+ "literal.id",
+ "pdfpwliteral",
+ "resource.name",
+ "encrypted-password-is-solrRules.pdf",
+ "resource.password",
+ "solrRules",
+ "fmap.Last-Modified",
+ "extractedDate");
// PDF, Passwords from passwords property file
- loadLocal("extraction/encrypted-password-is-solrRules.pdf",
- "fmap.created", "extractedDate",
- "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator",
- "fmap.Keywords", "extractedKeywords",
- "fmap.Creation-Date", "extractedDate",
- "uprefix", "ignored_",
- "fmap.Author", "extractedAuthor",
- "fmap.content", "wdf_nocase",
- "literal.id", "pdfpwfile",
- "resource.name", "encrypted-password-is-solrRules.pdf",
- "passwordsFile", "passwordRegex.properties", // Passwords-file
- "fmap.Last-Modified", "extractedDate");
+ loadLocal(
+ "extraction/encrypted-password-is-solrRules.pdf",
+ "fmap.created",
+ "extractedDate",
+ "fmap.producer",
+ "extractedProducer",
+ "fmap.creator",
+ "extractedCreator",
+ "fmap.Keywords",
+ "extractedKeywords",
+ "fmap.Creation-Date",
+ "extractedDate",
+ "uprefix",
+ "ignored_",
+ "fmap.Author",
+ "extractedAuthor",
+ "fmap.content",
+ "wdf_nocase",
+ "literal.id",
+ "pdfpwfile",
+ "resource.name",
+ "encrypted-password-is-solrRules.pdf",
+ "passwordsFile",
+ "passwordRegex.properties", // Passwords-file
+ "fmap.Last-Modified",
+ "extractedDate");
// DOCX, Explicit password
- loadLocal("extraction/password-is-Word2010.docx",
- "fmap.created", "extractedDate",
- "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator",
- "fmap.Keywords", "extractedKeywords",
- "fmap.Creation-Date", "extractedDate",
- "fmap.Author", "extractedAuthor",
- "fmap.content", "wdf_nocase",
- "uprefix", "ignored_",
- "literal.id", "docxpwliteral",
- "resource.name", "password-is-Word2010.docx",
- "resource.password", "Word2010", // Explicit password
- "fmap.Last-Modified", "extractedDate");
+ loadLocal(
+ "extraction/password-is-Word2010.docx",
+ "fmap.created",
+ "extractedDate",
+ "fmap.producer",
+ "extractedProducer",
+ "fmap.creator",
+ "extractedCreator",
+ "fmap.Keywords",
+ "extractedKeywords",
+ "fmap.Creation-Date",
+ "extractedDate",
+ "fmap.Author",
+ "extractedAuthor",
+ "fmap.content",
+ "wdf_nocase",
+ "uprefix",
+ "ignored_",
+ "literal.id",
+ "docxpwliteral",
+ "resource.name",
+ "password-is-Word2010.docx",
+ "resource.password",
+ "Word2010", // Explicit password
+ "fmap.Last-Modified",
+ "extractedDate");
// DOCX, Passwords from file
- loadLocal("extraction/password-is-Word2010.docx",
- "fmap.created", "extractedDate",
- "fmap.producer", "extractedProducer",
- "fmap.creator", "extractedCreator",
- "fmap.Keywords", "extractedKeywords",
- "fmap.Creation-Date", "extractedDate",
- "uprefix", "ignored_",
- "fmap.Author", "extractedAuthor",
- "fmap.content", "wdf_nocase",
- "literal.id", "docxpwfile",
- "resource.name", "password-is-Word2010.docx",
- "passwordsFile", "passwordRegex.properties", // Passwords-file
- "fmap.Last-Modified", "extractedDate");
-
+ loadLocal(
+ "extraction/password-is-Word2010.docx",
+ "fmap.created",
+ "extractedDate",
+ "fmap.producer",
+ "extractedProducer",
+ "fmap.creator",
+ "extractedCreator",
+ "fmap.Keywords",
+ "extractedKeywords",
+ "fmap.Creation-Date",
+ "extractedDate",
+ "uprefix",
+ "ignored_",
+ "fmap.Author",
+ "extractedAuthor",
+ "fmap.content",
+ "wdf_nocase",
+ "literal.id",
+ "docxpwfile",
+ "resource.name",
+ "password-is-Word2010.docx",
+ "passwordsFile",
+ "passwordRegex.properties", // Passwords-file
+ "fmap.Last-Modified",
+ "extractedDate");
+
assertU(commit());
Thread.sleep(100);
assertQ(req("wdf_nocase:\"This is a test of PDF\""), "//*[@numFound='2']");
assertQ(req("wdf_nocase:\"Test password protected word doc\""),
"//*[@numFound='2']");
}
-
- SolrQueryResponse loadLocalFromHandler(String handler, String filename,
- String... args) throws Exception {
-
+
+ SolrQueryResponse loadLocalFromHandler(String handler, String filename,
String... args)
+ throws Exception {
+
LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
try {
// TODO: stop using locally defined streams once stream.file and
@@ -766,6 +1146,4 @@ public class ExtractingRequestHandlerTest extends
SolrTestCaseJ4 {
SolrQueryResponse loadLocal(String filename, String... args) throws
Exception {
return loadLocalFromHandler("/update/extract", filename, args);
}
-
-
}
diff --git
a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java
b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java
index 8aeeaad..680ba14 100644
---
a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java
+++
b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java
@@ -16,9 +16,8 @@
*/
package org.apache.solr.handler.extraction;
-import javax.xml.parsers.DocumentBuilderFactory;
import java.nio.file.Paths;
-
+import javax.xml.parsers.DocumentBuilderFactory;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.tika.parser.ParseContext;
@@ -28,12 +27,11 @@ import org.w3c.dom.Element;
public class ParseContextConfigTest extends SolrTestCaseJ4 {
- public void testAll() throws Exception {
+ public void testAll() throws Exception {
Document document =
DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
Element entries = document.createElement("entries");
Element entry = document.createElement("entry");
-
entry.setAttribute("class", "org.apache.tika.parser.pdf.PDFParserConfig");
entry.setAttribute("impl", "org.apache.tika.parser.pdf.PDFParserConfig");
@@ -44,11 +42,11 @@ public class ParseContextConfigTest extends SolrTestCaseJ4 {
entry.appendChild(property);
entries.appendChild(entry);
- ParseContext parseContext = new ParseContextConfig(new
SolrResourceLoader(Paths.get(".")), entries).create();
+ ParseContext parseContext =
+ new ParseContextConfig(new SolrResourceLoader(Paths.get(".")),
entries).create();
PDFParserConfig pdfParserConfig = parseContext.get(PDFParserConfig.class);
assertEquals(true, pdfParserConfig.getExtractInlineImages());
}
-
}
diff --git
a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TestXLSXResponseWriter.java
b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TestXLSXResponseWriter.java
index 78f2df3..56b8945 100644
---
a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TestXLSXResponseWriter.java
+++
b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TestXLSXResponseWriter.java
@@ -23,7 +23,6 @@ import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.stream.Collectors;
-
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
@@ -48,9 +47,9 @@ public class TestXLSXResponseWriter extends SolrTestCaseJ4 {
@BeforeClass
public static void beforeClass() throws Exception {
System.setProperty("enable.update.log", "false");
-
initCore("solrconfig.xml","schema.xml",getFile("extraction/solr").getAbsolutePath());
+ initCore("solrconfig.xml", "schema.xml",
getFile("extraction/solr").getAbsolutePath());
createIndex();
- //find a reference to the default response writer so we can redirect its
output later
+ // find a reference to the default response writer so we can redirect its
output later
SolrCore testCore = h.getCore();
QueryResponseWriter writer = testCore.getQueryResponseWriter("xlsx");
if (writer instanceof XLSXResponseWriter) {
@@ -61,11 +60,41 @@ public class TestXLSXResponseWriter extends SolrTestCaseJ4 {
}
public static void createIndex() {
- assertU(adoc("id","1", "foo_i","-1", "foo_s","hi",
"foo_l","12345678987654321", "foo_b","false",
"foo_f","1.414","foo_d","-1.0E300","foo_dt1","2000-01-02T03:04:05Z"));
- assertU(adoc("id","2", "v_ss","hi", "v_ss","there", "v2_ss","nice",
"v2_ss","output", "shouldbeunstored","foo"));
- assertU(adoc("id","3", "shouldbeunstored","foo"));
- assertU(adoc("id","4", "foo_s1","foo"));
- assertU(adoc("id","5", "pubyear_ii", "123", "store_iis", "12", "price_ff",
"1.3"));
+ assertU(
+ adoc(
+ "id",
+ "1",
+ "foo_i",
+ "-1",
+ "foo_s",
+ "hi",
+ "foo_l",
+ "12345678987654321",
+ "foo_b",
+ "false",
+ "foo_f",
+ "1.414",
+ "foo_d",
+ "-1.0E300",
+ "foo_dt1",
+ "2000-01-02T03:04:05Z"));
+ assertU(
+ adoc(
+ "id",
+ "2",
+ "v_ss",
+ "hi",
+ "v_ss",
+ "there",
+ "v2_ss",
+ "nice",
+ "v2_ss",
+ "output",
+ "shouldbeunstored",
+ "foo"));
+ assertU(adoc("id", "3", "shouldbeunstored", "foo"));
+ assertU(adoc("id", "4", "foo_s1", "foo"));
+ assertU(adoc("id", "5", "pubyear_ii", "123", "store_iis", "12",
"price_ff", "1.3"));
assertU(commit());
}
@@ -85,212 +114,266 @@ public class TestXLSXResponseWriter extends
SolrTestCaseJ4 {
SolrQueryRequest r = req();
// check Content-Type
-
assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
writerXlsx.getContentType(r, rsp));
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+ writerXlsx.getContentType(r, rsp));
// test our basic types,and that fields come back in the requested order
- XSSFSheet resultSheet = getWSResultForQuery(req("q","id:1", "wt","xlsx",
"fl","id,foo_s,foo_i,foo_l,foo_b,foo_f,foo_d,foo_dt1"));
+ XSSFSheet resultSheet =
+ getWSResultForQuery(
+ req("q", "id:1", "wt", "xlsx", "fl",
"id,foo_s,foo_i,foo_l,foo_b,foo_f,foo_d,foo_dt1"));
-
assertEquals("id,foo_s,foo_i,foo_l,foo_b,foo_f,foo_d,foo_dt1\n1,hi,-1,12345678987654321,F,1.414,-1.0E300,2000-01-02T03:04:05Z\n"
- , getStringFromSheet(resultSheet));
+ assertEquals(
+
"id,foo_s,foo_i,foo_l,foo_b,foo_f,foo_d,foo_dt1\n1,hi,-1,12345678987654321,F,1.414,-1.0E300,2000-01-02T03:04:05Z\n",
+ getStringFromSheet(resultSheet));
- resultSheet = getWSResultForQuery(req("q","id:1^0", "wt","xlsx",
"fl","id,score,foo_s"));
+ resultSheet = getWSResultForQuery(req("q", "id:1^0", "wt", "xlsx", "fl",
"id,score,foo_s"));
// test retrieving score
assertEquals("id,score,foo_s\n1,0.0,hi\n",
getStringFromSheet(resultSheet));
- resultSheet = getWSResultForQuery(req("q","id:1^0", "wt","xlsx",
"colname.id", "I.D.", "colwidth.id", "10",
- "fl","id,score,foo_s"));
+ resultSheet =
+ getWSResultForQuery(
+ req(
+ "q",
+ "id:1^0",
+ "wt",
+ "xlsx",
+ "colname.id",
+ "I.D.",
+ "colwidth.id",
+ "10",
+ "fl",
+ "id,score,foo_s"));
// test override colname/width
assertEquals("I.D.,score,foo_s\n1,0.0,hi\n",
getStringFromSheet(resultSheet));
// test colwidth (value returned is in 256ths of a character as per excel
standard)
- assertEquals(10*256, resultSheet.getColumnWidth(0));
+ assertEquals(10 * 256, resultSheet.getColumnWidth(0));
- resultSheet = getWSResultForQuery(req("q","id:2", "wt","xlsx",
"fl","id,v_ss"));
+ resultSheet = getWSResultForQuery(req("q", "id:2", "wt", "xlsx", "fl",
"id,v_ss"));
// test multivalued
assertEquals("id,v_ss\n2,hi; there\n", getStringFromSheet(resultSheet));
// test retrieving fields from index
- resultSheet = getWSResultForQuery(req("q","*:*", "wt","xslx",
"fl","*,score"));
+ resultSheet = getWSResultForQuery(req("q", "*:*", "wt", "xslx", "fl",
"*,score"));
String result = getStringFromSheet(resultSheet);
- for (String field :
"id,foo_s,foo_i,foo_l,foo_b,foo_f,foo_d,foo_dt1,v_ss,v2_ss,score".split(",")) {
+ for (String field :
+
"id,foo_s,foo_i,foo_l,foo_b,foo_f,foo_d,foo_dt1,v_ss,v2_ss,score".split(",")) {
assertTrue(result.indexOf(field) >= 0);
}
// test null values
- resultSheet = getWSResultForQuery(req("q","id:2", "wt","xlsx",
"fl","id,foo_s,v_ss"));
+ resultSheet = getWSResultForQuery(req("q", "id:2", "wt", "xlsx", "fl",
"id,foo_s,v_ss"));
assertEquals("id,foo_s,v_ss\n2,,hi; there\n",
getStringFromSheet(resultSheet));
// now test SolrDocumentList
SolrDocument d = new SolrDocument();
SolrDocument d1 = d;
- d.addField("id","1");
- d.addField("foo_i",-1);
- d.addField("foo_s","hi");
- d.addField("foo_l","12345678987654321L");
- d.addField("foo_b",false);
- d.addField("foo_f",1.414f);
- d.addField("foo_d",-1.0E300);
+ d.addField("id", "1");
+ d.addField("foo_i", -1);
+ d.addField("foo_s", "hi");
+ d.addField("foo_l", "12345678987654321L");
+ d.addField("foo_b", false);
+ d.addField("foo_f", 1.414f);
+ d.addField("foo_d", -1.0E300);
d.addField("foo_dt1", new
Date(Instant.parse("2000-01-02T03:04:05Z").toEpochMilli()));
d.addField("score", "2.718");
d = new SolrDocument();
SolrDocument d2 = d;
- d.addField("id","2");
- d.addField("v_ss","hi");
- d.addField("v_ss","there");
- d.addField("v2_ss","nice");
- d.addField("v2_ss","output");
+ d.addField("id", "2");
+ d.addField("v_ss", "hi");
+ d.addField("v_ss", "there");
+ d.addField("v2_ss", "nice");
+ d.addField("v2_ss", "output");
d.addField("score", "89.83");
- d.addField("shouldbeunstored","foo");
+ d.addField("shouldbeunstored", "foo");
SolrDocumentList sdl = new SolrDocumentList();
sdl.add(d1);
sdl.add(d2);
-
- SolrQueryRequest req = req("q","*:*");
+
+ SolrQueryRequest req = req("q", "*:*");
rsp = new SolrQueryResponse();
rsp.addResponse(sdl);
- rsp.setReturnFields( new SolrReturnFields("id,foo_s", req) );
+ rsp.setReturnFields(new SolrReturnFields("id,foo_s", req));
resultSheet = getWSResultForQuery(req, rsp);
assertEquals("id,foo_s\n1,hi\n2,\n", getStringFromSheet(resultSheet));
// try scores
- rsp.setReturnFields( new SolrReturnFields("id,score,foo_s", req) );
+ rsp.setReturnFields(new SolrReturnFields("id,score,foo_s", req));
resultSheet = getWSResultForQuery(req, rsp);
assertEquals("id,score,foo_s\n1,2.718,hi\n2,89.83,\n",
getStringFromSheet(resultSheet));
// get field values from docs... should be ordered and not include score
unless requested
- rsp.setReturnFields( new SolrReturnFields("*", req) );
+ rsp.setReturnFields(new SolrReturnFields("*", req));
resultSheet = getWSResultForQuery(req, rsp);
- assertEquals("id,foo_i,foo_s,foo_l,foo_b,foo_f,foo_d,foo_dt1,v_ss,v2_ss\n"
+
-
"1,-1,hi,12345678987654321L,false,1.414,-1.0E300,2000-01-02T03:04:05Z,,\n" +
- "2,,,,,,,,hi; there,nice; output\n", getStringFromSheet(resultSheet));
-
- // get field values and scores - just check that the scores are there...
we don't guarantee where
- rsp.setReturnFields( new SolrReturnFields("*,score", req) );
+ assertEquals(
+ "id,foo_i,foo_s,foo_l,foo_b,foo_f,foo_d,foo_dt1,v_ss,v2_ss\n"
+ +
"1,-1,hi,12345678987654321L,false,1.414,-1.0E300,2000-01-02T03:04:05Z,,\n"
+ + "2,,,,,,,,hi; there,nice; output\n",
+ getStringFromSheet(resultSheet));
+
+ // get field values and scores - just check that the scores are there...
we don't guarantee
+ // where
+ rsp.setReturnFields(new SolrReturnFields("*,score", req));
resultSheet = getWSResultForQuery(req, rsp);
String s = getStringFromSheet(resultSheet);
- assertTrue(s.indexOf("score") >=0 && s.indexOf("2.718") > 0 &&
s.indexOf("89.83") > 0 );
-
+ assertTrue(s.indexOf("score") >= 0 && s.indexOf("2.718") > 0 &&
s.indexOf("89.83") > 0);
+
// Test field globs
- rsp.setReturnFields( new SolrReturnFields("id,foo*", req) );
+ rsp.setReturnFields(new SolrReturnFields("id,foo*", req));
resultSheet = getWSResultForQuery(req, rsp);
- assertEquals("id,foo_i,foo_s,foo_l,foo_b,foo_f,foo_d,foo_dt1\n" +
-
"1,-1,hi,12345678987654321L,false,1.414,-1.0E300,2000-01-02T03:04:05Z\n" +
- "2,,,,,,,\n", getStringFromSheet(resultSheet));
+ assertEquals(
+ "id,foo_i,foo_s,foo_l,foo_b,foo_f,foo_d,foo_dt1\n"
+ +
"1,-1,hi,12345678987654321L,false,1.414,-1.0E300,2000-01-02T03:04:05Z\n"
+ + "2,,,,,,,\n",
+ getStringFromSheet(resultSheet));
- rsp.setReturnFields( new SolrReturnFields("id,*_d*", req) );
+ rsp.setReturnFields(new SolrReturnFields("id,*_d*", req));
resultSheet = getWSResultForQuery(req, rsp);
- assertEquals("id,foo_d,foo_dt1\n" +
- "1,-1.0E300,2000-01-02T03:04:05Z\n" +
- "2,,\n", getStringFromSheet(resultSheet));
+ assertEquals(
+ "id,foo_d,foo_dt1\n" + "1,-1.0E300,2000-01-02T03:04:05Z\n" + "2,,\n",
+ getStringFromSheet(resultSheet));
// Test function queries
- rsp.setReturnFields( new
SolrReturnFields("sum(1,1),id,exists(foo_s1),div(9,1),foo_f", req) );
+ rsp.setReturnFields(new
SolrReturnFields("sum(1,1),id,exists(foo_s1),div(9,1),foo_f", req));
resultSheet = getWSResultForQuery(req, rsp);
- assertEquals("sum(1,1),id,exists(foo_s1),div(9,1),foo_f\n" +
- ",1,,,1.414\n" +
- ",2,,,\n", getStringFromSheet(resultSheet));
+ assertEquals(
+ "sum(1,1),id,exists(foo_s1),div(9,1),foo_f\n" + ",1,,,1.414\n" +
",2,,,\n",
+ getStringFromSheet(resultSheet));
// Test transformers
- rsp.setReturnFields( new SolrReturnFields("mydocid:[docid],[explain]",
req) );
+ rsp.setReturnFields(new SolrReturnFields("mydocid:[docid],[explain]",
req));
resultSheet = getWSResultForQuery(req, rsp);
- assertEquals("mydocid,[explain]\n" +
- ",\n" +
- ",\n", getStringFromSheet(resultSheet));
+ assertEquals("mydocid,[explain]\n" + ",\n" + ",\n",
getStringFromSheet(resultSheet));
req.close();
}
-
@Test
public void testPseudoFields() throws Exception {
// Use Pseudo Field
- SolrQueryRequest req = req("q","id:1", "wt","xlsx", "fl","XXX:id,foo_s");
+ SolrQueryRequest req = req("q", "id:1", "wt", "xlsx", "fl",
"XXX:id,foo_s");
XSSFSheet resultSheet = getWSResultForQuery(req);
assertEquals("XXX,foo_s\n1,hi\n", getStringFromSheet(resultSheet));
-
- String txt = getStringFromSheet(getWSResultForQuery(req("q","id:1",
"wt","xlsx", "fl","XXX:id,YYY:[docid],FOO:foo_s")));
+
+ String txt =
+ getStringFromSheet(
+ getWSResultForQuery(
+ req("q", "id:1", "wt", "xlsx", "fl",
"XXX:id,YYY:[docid],FOO:foo_s")));
String[] lines = txt.split("\n");
assertEquals(2, lines.length);
- assertEquals("XXX,YYY,FOO", lines[0] );
- assertEquals("1,0,hi", lines[1] );
-
- //assertions specific to multiple pseudofields functions like abs, div,
exists, etc.. (SOLR-5423)
- String funcText = getStringFromSheet(getWSResultForQuery(req("df", "text",
"q","*", "wt","xlsx", "fl","XXX:id,YYY:exists(foo_s1)")));
+ assertEquals("XXX,YYY,FOO", lines[0]);
+ assertEquals("1,0,hi", lines[1]);
+
+ // assertions specific to multiple pseudofields functions like abs, div,
exists, etc..
+ // (SOLR-5423)
+ String funcText =
+ getStringFromSheet(
+ getWSResultForQuery(
+ req("df", "text", "q", "*", "wt", "xlsx", "fl",
"XXX:id,YYY:exists(foo_s1)")));
String[] funcLines = funcText.split("\n");
assertEquals(6, funcLines.length);
- assertEquals("XXX,YYY", funcLines[0] );
- assertEquals("1,false", funcLines[1] );
- assertEquals("3,false", funcLines[3] );
-
- //assertions specific to single function without alias (SOLR-5423)
- String singleFuncText = getStringFromSheet(
- getWSResultForQuery(req("df", "text", "q","*", "wt","xlsx",
"fl","exists(foo_s1),XXX:id")));
+ assertEquals("XXX,YYY", funcLines[0]);
+ assertEquals("1,false", funcLines[1]);
+ assertEquals("3,false", funcLines[3]);
+
+ // assertions specific to single function without alias (SOLR-5423)
+ String singleFuncText =
+ getStringFromSheet(
+ getWSResultForQuery(
+ req("df", "text", "q", "*", "wt", "xlsx", "fl",
"exists(foo_s1),XXX:id")));
String[] singleFuncLines = singleFuncText.split("\n");
assertEquals(6, singleFuncLines.length);
- assertEquals("exists(foo_s1),XXX", singleFuncLines[0] );
- assertEquals("false,1", singleFuncLines[1] );
- assertEquals("false,3", singleFuncLines[3] );
+ assertEquals("exists(foo_s1),XXX", singleFuncLines[0]);
+ assertEquals("false,1", singleFuncLines[1]);
+ assertEquals("false,3", singleFuncLines[3]);
// pseudo-fields with * in fl
- txt = getStringFromSheet(
- getWSResultForQuery(req("df", "text", "q","id:4", "wt","xlsx",
"fl","*,YYY:[docid],FOO:foo_s1")));
+ txt =
+ getStringFromSheet(
+ getWSResultForQuery(
+ req("df", "text", "q", "id:4", "wt", "xlsx", "fl",
"*,YYY:[docid],FOO:foo_s1")));
lines = txt.split("\n");
assertEquals(2, lines.length);
- assertEquals(sortHeader("foo_i,foo_l,FOO,foo_s,pubyear_ii,store_iis," +
-
"v2_ss,multiDefault,timestamp,foo_dt1,foo_b,YYY,foo_d,id,foo_f,v_ss,foo_s1,intDefault"),
sortHeader(lines[0]));
+ assertEquals(
+ sortHeader(
+ "foo_i,foo_l,FOO,foo_s,pubyear_ii,store_iis,"
+ +
"v2_ss,multiDefault,timestamp,foo_dt1,foo_b,YYY,foo_d,id,foo_f,v_ss,foo_s1,intDefault"),
+ sortHeader(lines[0]));
}
@Test
public void testForDVEnabledFields() throws Exception {
// for dv enabled and useDocValueAsStored=true
// returns pubyear_ii, store_iis but not price_ff
- String singleFuncText = getStringFromSheet(
- getWSResultForQuery(req("df", "text", "q","id:5", "wt","xlsx")));
- String sortedHeader = sortHeader("foo_i,foo_l,foo_s,pubyear_ii,store_iis,"
+
-
"v2_ss,multiDefault,timestamp,foo_dt1,foo_b,foo_d,id,foo_f,v_ss,foo_s1,intDefault");
+ String singleFuncText =
+ getStringFromSheet(getWSResultForQuery(req("df", "text", "q", "id:5",
"wt", "xlsx")));
+ String sortedHeader =
+ sortHeader(
+ "foo_i,foo_l,foo_s,pubyear_ii,store_iis,"
+ +
"v2_ss,multiDefault,timestamp,foo_dt1,foo_b,foo_d,id,foo_f,v_ss,foo_s1,intDefault");
String[] singleFuncLines = singleFuncText.split("\n");
assertEquals(2, singleFuncLines.length);
assertEquals(sortedHeader, sortHeader(singleFuncLines[0]));
- List<String> actualVal =
Arrays.stream(singleFuncLines[1].trim().split(","))
- .filter(val -> !val.trim().isEmpty() && !val.trim().equals("\"\""))
- .collect(Collectors.toList());
+ List<String> actualVal =
+ Arrays.stream(singleFuncLines[1].trim().split(","))
+ .filter(val -> !val.trim().isEmpty() && !val.trim().equals("\"\""))
+ .collect(Collectors.toList());
assertTrue(actualVal.containsAll(Arrays.asList("5", "123", "12")));
// explicit fl=*
- singleFuncText = getStringFromSheet(
- getWSResultForQuery(req("df", "text", "q","id:5", "wt","xlsx", "fl",
"*")));
+ singleFuncText =
+ getStringFromSheet(
+ getWSResultForQuery(req("df", "text", "q", "id:5", "wt", "xlsx",
"fl", "*")));
singleFuncLines = singleFuncText.split("\n");
assertEquals(2, singleFuncLines.length);
assertEquals(sortedHeader, sortHeader(singleFuncLines[0]));
- actualVal = Arrays.stream(singleFuncLines[1].trim().split(","))
- .filter(val -> !val.trim().isEmpty() && !val.trim().equals("\"\""))
- .collect(Collectors.toList());
+ actualVal =
+ Arrays.stream(singleFuncLines[1].trim().split(","))
+ .filter(val -> !val.trim().isEmpty() && !val.trim().equals("\"\""))
+ .collect(Collectors.toList());
assertTrue(actualVal.containsAll(Arrays.asList("5", "123", "12")));
// explicit price_ff
- singleFuncText = getStringFromSheet(
- getWSResultForQuery(req("df", "text", "q","id:5", "wt","xlsx", "fl",
"price_ff")));
+ singleFuncText =
+ getStringFromSheet(
+ getWSResultForQuery(req("df", "text", "q", "id:5", "wt", "xlsx",
"fl", "price_ff")));
singleFuncLines = singleFuncText.split("\n");
assertEquals(2, singleFuncLines.length);
assertEquals("price_ff", singleFuncLines[0]);
assertEquals("1.3", singleFuncLines[1]);
// explicit price_ff with fl=*
- singleFuncText = getStringFromSheet(
- getWSResultForQuery(req("df", "text", "q","id:5", "wt","xlsx",
"csv.header","true", "fl", "*,price_ff")));
- sortedHeader = sortHeader("foo_i,foo_l,foo_b,foo_s,pubyear_ii,store_iis," +
-
"v2_ss,multiDefault,timestamp,foo_dt1,id,foo_d,foo_f,v_ss,foo_s1,intDefault,price_ff");
+ singleFuncText =
+ getStringFromSheet(
+ getWSResultForQuery(
+ req(
+ "df",
+ "text",
+ "q",
+ "id:5",
+ "wt",
+ "xlsx",
+ "csv.header",
+ "true",
+ "fl",
+ "*,price_ff")));
+ sortedHeader =
+ sortHeader(
+ "foo_i,foo_l,foo_b,foo_s,pubyear_ii,store_iis,"
+ +
"v2_ss,multiDefault,timestamp,foo_dt1,id,foo_d,foo_f,v_ss,foo_s1,intDefault,price_ff");
singleFuncLines = singleFuncText.split("\n");
assertEquals(2, singleFuncLines.length);
assertEquals(sortedHeader, sortHeader(singleFuncLines[0]));
- actualVal = Arrays.stream(singleFuncLines[1].trim().split(","))
- .filter(val -> !val.trim().isEmpty() && !val.trim().equals("\"\""))
- .collect(Collectors.toList());
+ actualVal =
+ Arrays.stream(singleFuncLines[1].trim().split(","))
+ .filter(val -> !val.trim().isEmpty() && !val.trim().equals("\"\""))
+ .collect(Collectors.toList());
assertTrue(actualVal.containsAll(Arrays.asList("5", "123", "12", "1.3")));
}
@@ -300,7 +383,8 @@ public class TestXLSXResponseWriter extends SolrTestCaseJ4 {
return getWSResultForQuery(req, rsp);
}
- private XSSFSheet getWSResultForQuery(SolrQueryRequest req,
SolrQueryResponse rsp) throws Exception {
+ private XSSFSheet getWSResultForQuery(SolrQueryRequest req,
SolrQueryResponse rsp)
+ throws Exception {
ByteArrayOutputStream xmlBout = new ByteArrayOutputStream();
writerXlsx.write(xmlBout, req, rsp);
XSSFWorkbook output = new XSSFWorkbook(new
ByteArrayInputStream(xmlBout.toByteArray()));
@@ -312,8 +396,8 @@ public class TestXLSXResponseWriter extends SolrTestCaseJ4 {
private String getStringFromSheet(XSSFSheet sheet) {
StringBuilder output = new StringBuilder();
- for (Row row: sheet) {
- for (Cell cell: row) {
+ for (Row row : sheet) {
+ for (Cell cell : row) {
output.append(cell.getStringCellValue());
output.append(",");
}