Author: yonik
Date: Tue Jul 14 15:53:05 2009
New Revision: 793953
URL: http://svn.apache.org/viewvc?rev=793953&view=rev
Log:
SOLR-284: random cleanups, tests, interface changes
Modified:
lucene/solr/trunk/CHANGES.txt
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java
lucene/solr/trunk/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java
lucene/solr/trunk/contrib/extraction/src/test/resources/solr/conf/schema.xml
lucene/solr/trunk/example/solr/conf/schema.xml
lucene/solr/trunk/example/solr/conf/solrconfig.xml
Modified: lucene/solr/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=793953&r1=793952&r2=793953&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Tue Jul 14 15:53:05 2009
@@ -115,7 +115,7 @@
can be specified.
(Georgios Stamatis, Lars Kotthoff, Chris Harris via koji)
-20. SOLR-284: Added support for extracting content from binary documents like
MS Word and PDF using Apache Tika. See also contrib/extraction/CHANGES.txt
(Eric Pugh, Chris Harris, gsingers)
+20. SOLR-284: Added support for extracting content from binary documents like
MS Word and PDF using Apache Tika. See also contrib/extraction/CHANGES.txt
(Eric Pugh, Chris Harris, yonik, gsingers)
21. SOLR-819: Added factories for Arabic support (gsingers)
Modified:
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=793953&r1=793952&r2=793953&view=diff
==============================================================================
---
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
(original)
+++
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
Tue Jul 14 15:53:05 2009
@@ -36,9 +36,11 @@
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
+import org.apache.tika.exception.TikaException;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.XMLSerializer;
import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.InputStream;
@@ -187,10 +189,10 @@
}
rsp.add(stream.getName() + "_metadata", metadataNL);
}
- } catch (Exception e) {
- //TODO: handle here with an option to not fail and just log the
exception
+ } catch (SAXException e) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+ } catch (TikaException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
-
} finally {
IOUtils.closeQuietly(inputStream);
}
Modified:
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java?rev=793953&r1=793952&r2=793953&view=diff
==============================================================================
---
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java
(original)
+++
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java
Tue Jul 14 15:53:05 2009
@@ -23,7 +23,11 @@
**/
public interface ExtractingParams {
- public static final String EXTRACTING_PREFIX = "ext.";
+ /**
+ * Map all generated attribute names to field names with lowercase and
underscores.
+ */
+ public static final String LOWERNAMES = "lowernames";
+
/**
* The param prefix for mapping Tika metadata to Solr fields.
@@ -35,7 +39,7 @@
*
*
*/
- public static final String MAP_PREFIX = EXTRACTING_PREFIX + "map.";
+ public static final String MAP_PREFIX = "map.";
/**
* The boost value for the name of the field. The boost can be specified by
a name mapping.
@@ -48,7 +52,7 @@
* will boost the solr.title field for this document by 2.5
*
*/
- public static final String BOOST_PREFIX = EXTRACTING_PREFIX + "boost.";
+ public static final String BOOST_PREFIX = "boost.";
/**
* Pass in literal values to be added to the document, as in
@@ -57,7 +61,7 @@
* </pre>
*
*/
- public static final String LITERALS_PREFIX = EXTRACTING_PREFIX + "literal.";
+ public static final String LITERALS_PREFIX = "literal.";
/**
@@ -67,34 +71,21 @@
* <p/>
* See Tika's docs for what the extracted document looks like.
* <p/>
- * @see #DEFAULT_FIELDNAME
- * @see #CAPTURE_FIELDS
+ * @see #CAPTURE_ELEMENTS
*/
- public static final String XPATH_EXPRESSION = EXTRACTING_PREFIX + "xpath";
+ public static final String XPATH_EXPRESSION = "xpath";
/**
- * Only extract and return the document, do not index it.
+ * Only extract and return the content, do not index it.
*/
- public static final String EXTRACT_ONLY = EXTRACTING_PREFIX + "extract.only";
+ public static final String EXTRACT_ONLY = "extractOnly";
/**
- * Don't throw an exception if a field doesn't exist, just ignore it
+ * Capture attributes separately according to the name of the element,
instead of just adding them to the string buffer
*/
- public static final String IGNORE_UNDECLARED_FIELDS = EXTRACTING_PREFIX +
"ignore.und.fl";
+ public static final String CAPTURE_ATTRIBUTES = "captureAttr";
- /**
- * Index attributes separately according to their name, instead of just
adding them to the string buffer
- */
- public static final String INDEX_ATTRIBUTES = EXTRACTING_PREFIX + "idx.attr";
-
- /**
- * The field to index the contents to by default. If you want to capture a
specific piece
- * of the Tika document separately, see {@link #CAPTURE_FIELDS}.
- *
- * @see #CAPTURE_FIELDS
- */
- public static final String DEFAULT_FIELDNAME = EXTRACTING_PREFIX + "def.fl";
/**
* Capture the specified fields (and everything included below it that isn't
capture by some other capture field) separately from the default. This is
different
@@ -116,26 +107,25 @@
* By passing in the p tag, you could capture all P tags separately from the
rest of the text.
* Thus, in the example, the capture of the P tag would be: "some text here.
more text"
*
- * @see #DEFAULT_FIELDNAME
*/
- public static final String CAPTURE_FIELDS = EXTRACTING_PREFIX + "capture";
+ public static final String CAPTURE_ELEMENTS = "capture";
/**
* The type of the stream. If not specified, Tika will use mime type
detection.
*/
- public static final String STREAM_TYPE = EXTRACTING_PREFIX + "stream.type";
+ public static final String STREAM_TYPE = "stream.type";
/**
* Optional. The file name. If specified, Tika can take this into account
while
* guessing the MIME type.
*/
- public static final String RESOURCE_NAME = EXTRACTING_PREFIX +
"resource.name";
+ public static final String RESOURCE_NAME = "resource.name";
/**
* Optional. If specified, the prefix will be prepended to all Metadata,
such that it would be possible
* to setup a dynamic field to automatically capture it
*/
- public static final String METADATA_PREFIX = EXTRACTING_PREFIX +
"metadata.prefix";
+ public static final String UNKNOWN_FIELD_PREFIX = "uprefix";
}
Modified:
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java?rev=793953&r1=793952&r2=793953&view=diff
==============================================================================
---
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java
(original)
+++
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java
Tue Jul 14 15:53:05 2009
@@ -19,16 +19,11 @@
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
-import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.DateUtil;
import org.apache.solr.schema.DateField;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
-import org.apache.solr.schema.StrField;
-import org.apache.solr.schema.TextField;
-import org.apache.solr.schema.FieldType;
-import org.apache.solr.schema.UUIDField;
import org.apache.tika.metadata.Metadata;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -37,14 +32,7 @@
import org.xml.sax.helpers.DefaultHandler;
import java.text.DateFormat;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.Stack;
-import java.util.UUID;
+import java.util.*;
/**
@@ -60,29 +48,22 @@
*/
public class SolrContentHandler extends DefaultHandler implements
ExtractingParams {
private transient static Logger log =
LoggerFactory.getLogger(SolrContentHandler.class);
- protected SolrInputDocument document;
+ private SolrInputDocument document;
- protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
+ private Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
- protected Metadata metadata;
- protected SolrParams params;
- protected StringBuilder catchAllBuilder = new StringBuilder(2048);
- //private StringBuilder currentBuilder;
- protected IndexSchema schema;
- //create empty so we don't have to worry about null checks
- protected Map<String, StringBuilder> fieldBuilders = Collections.emptyMap();
- protected Stack<StringBuilder> bldrStack = new Stack<StringBuilder>();
-
- protected boolean ignoreUndeclaredFields = false;
- protected boolean indexAttribs = false;
- protected String defaultFieldName;
+ private Metadata metadata;
+ private SolrParams params;
+ private StringBuilder catchAllBuilder = new StringBuilder(2048);
+ private IndexSchema schema;
+ private Map<String, StringBuilder> fieldBuilders = Collections.emptyMap();
+ private LinkedList<StringBuilder> bldrStack = new
LinkedList<StringBuilder>();
+
+ private boolean captureAttribs;
+ private boolean lowerNames;
+ private String contentFieldName = "content";
- protected String metadataPrefix = "";
-
- /**
- * Only access through getNextId();
- */
- private static long identifier = Long.MIN_VALUE;
+ private String unknownFieldPrefix = "";
public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema
schema) {
@@ -97,22 +78,18 @@
this.params = params;
this.schema = schema;
this.dateFormats = dateFormats;
- this.ignoreUndeclaredFields = params.getBool(IGNORE_UNDECLARED_FIELDS,
false);
- this.indexAttribs = params.getBool(INDEX_ATTRIBUTES, false);
- this.defaultFieldName = params.get(DEFAULT_FIELDNAME);
- this.metadataPrefix = params.get(METADATA_PREFIX, "");
- //if there's no default field and we are intending to index, then throw an
exception
- if (defaultFieldName == null && params.getBool(EXTRACT_ONLY, false) ==
false) {
- throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "No
default field name specified");
- }
- String[] captureFields = params.getParams(CAPTURE_FIELDS);
+
+ this.lowerNames = params.getBool(LOWERNAMES, false);
+ this.captureAttribs = params.getBool(CAPTURE_ATTRIBUTES, false);
+ this.unknownFieldPrefix = params.get(UNKNOWN_FIELD_PREFIX, "");
+ String[] captureFields = params.getParams(CAPTURE_ELEMENTS);
if (captureFields != null && captureFields.length > 0) {
fieldBuilders = new HashMap<String, StringBuilder>();
for (int i = 0; i < captureFields.length; i++) {
fieldBuilders.put(captureFields[i], new StringBuilder());
}
}
- bldrStack.push(catchAllBuilder);
+ bldrStack.add(catchAllBuilder);
}
@@ -128,73 +105,27 @@
//handle the metadata extracted from the document
for (String name : metadata.names()) {
String[] vals = metadata.getValues(name);
- name = findMappedMetadataName(name);
- SchemaField schFld = schema.getFieldOrNull(name);
- if (schFld != null) {
- boost = getBoost(name);
- if (schFld.multiValued()) {
- for (int i = 0; i < vals.length; i++) {
- String val = vals[i];
- document.addField(name, transformValue(val, schFld), boost);
- }
- } else {
- StringBuilder builder = new StringBuilder();
- for (int i = 0; i < vals.length; i++) {
- builder.append(vals[i]).append(' ');
- }
- document.addField(name, transformValue(builder.toString().trim(),
schFld), boost);
- }
- } else {
- //TODO: error or log?
- if (ignoreUndeclaredFields == false) {
- // Arguably we should handle this as a special case. Why? Because
unlike basically
- // all the other fields in metadata, this one was probably set not
by Tika by in
- // ExtractingDocumentLoader.load(). You shouldn't have to define a
mapping for this
- // field just because you specified a resource.name parameter to the
handler, should
- // you?
- if (name != Metadata.RESOURCE_NAME_KEY) {
- throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"Invalid field: " + name);
- }
- }
- }
+ addField(name, null, vals);
}
+
//handle the literals from the params
Iterator<String> paramNames = params.getParameterNamesIterator();
while (paramNames.hasNext()) {
- String name = paramNames.next();
- if (name.startsWith(LITERALS_PREFIX)) {
- String fieldName = name.substring(LITERALS_PREFIX.length());
- //no need to map names here, since they are literals from the user
- SchemaField schFld = schema.getFieldOrNull(fieldName);
- if (schFld != null) {
- String[] values = params.getParams(name);
- if (schFld.multiValued() == false && values.length > 1) {
- throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "The
Field " + fieldName + " is not multivalued");
- }
- boost = getBoost(fieldName);
- for (int i = 0; i < values.length; i++) {
- //no need to transform here, b/c we can assume the user sent it in
correctly
- document.addField(fieldName, values[i], boost);
+ String pname = paramNames.next();
+ if (!pname.startsWith(LITERALS_PREFIX)) continue;
- }
- } else {
- handleUndeclaredField(fieldName);
- }
- }
+ String name = pname.substring(LITERALS_PREFIX.length());
+ addField(name, null, params.getParams(pname));
}
+
+
//add in the content
- document.addField(defaultFieldName, catchAllBuilder.toString(),
getBoost(defaultFieldName));
+ addField(contentFieldName, catchAllBuilder.toString(), null);
//add in the captured content
for (Map.Entry<String, StringBuilder> entry : fieldBuilders.entrySet()) {
if (entry.getValue().length() > 0) {
- String fieldName = findMappedName(entry.getKey());
- SchemaField schFld = schema.getFieldOrNull(fieldName);
- if (schFld != null) {
- document.addField(fieldName,
transformValue(entry.getValue().toString(), schFld), getBoost(fieldName));
- } else {
- handleUndeclaredField(fieldName);
- }
+ addField(entry.getKey(), entry.getValue().toString(), null);
}
}
if (log.isDebugEnabled()) {
@@ -203,6 +134,75 @@
return document;
}
+ // Naming rules:
+ // 1) optionally map names to nicenames (lowercase+underscores)
+ // 2) execute "map" commands
+ // 3) if resulting field is unknown, map it to a common prefix
+ private void addField(String fname, String fval, String[] vals) {
+ if (lowerNames) {
+ StringBuilder sb = new StringBuilder();
+ for (int i=0; i<fname.length(); i++) {
+ char ch = fname.charAt(i);
+ if (!Character.isLetterOrDigit(ch)) ch='_';
+ else ch=Character.toLowerCase(ch);
+ sb.append(ch);
+ }
+ fname = sb.toString();
+ }
+
+ String name = findMappedName(fname);
+ SchemaField sf = schema.getFieldOrNull(name);
+ if (sf==null && unknownFieldPrefix.length() > 0) {
+ name = unknownFieldPrefix + name;
+ sf = schema.getFieldOrNull(name);
+ }
+
+ // Arguably we should handle this as a special case. Why? Because unlike
basically
+ // all the other fields in metadata, this one was probably set not by Tika
by in
+ // ExtractingDocumentLoader.load(). You shouldn't have to define a mapping
for this
+ // field just because you specified a resource.name parameter to the
handler, should
+ // you?
+ if (sf == null && unknownFieldPrefix.length()==0 && name ==
Metadata.RESOURCE_NAME_KEY) {
+ return;
+ }
+
+ // normalize val params so vals.length>1
+ if (vals != null && vals.length==1) {
+ fval = vals[0];
+ vals = null;
+ }
+
+ // single valued field with multiple values... catenate them.
+ if (sf != null && !sf.multiValued() && vals != null) {
+ StringBuilder builder = new StringBuilder();
+ boolean first=true;
+ for (String val : vals) {
+ if (first) {
+ first=false;
+ } else {
+ builder.append(' ');
+ }
+ builder.append(val);
+ }
+ fval = builder.toString();
+ vals=null;
+ }
+
+ float boost = getBoost(name);
+
+ if (fval != null) {
+ document.addField(name, transformValue(fval, sf), boost);
+ }
+
+ if (vals != null) {
+ for (String val : vals) {
+ document.addField(name, transformValue(val, sf), boost);
+ }
+ }
+
+ // no value set - throw exception for debugging
+ // if (vals==null && fval==null) throw new RuntimeException(name + " has
no non-null value ");
+ }
@Override
@@ -213,7 +213,7 @@
builder.setLength(0);
}
bldrStack.clear();
- bldrStack.push(catchAllBuilder);
+ bldrStack.add(catchAllBuilder);
}
@@ -222,33 +222,18 @@
StringBuilder theBldr = fieldBuilders.get(localName);
if (theBldr != null) {
//we need to switch the currentBuilder
- bldrStack.push(theBldr);
+ bldrStack.add(theBldr);
}
- if (indexAttribs == true) {
+ if (captureAttribs == true) {
for (int i = 0; i < attributes.getLength(); i++) {
- String fieldName = findMappedName(localName);
- SchemaField schFld = schema.getFieldOrNull(fieldName);
- if (schFld != null) {
- document.addField(fieldName, transformValue(attributes.getValue(i),
schFld), getBoost(fieldName));
- } else {
- handleUndeclaredField(fieldName);
- }
+ addField(localName, attributes.getValue(i), null);
}
} else {
for (int i = 0; i < attributes.getLength(); i++) {
- bldrStack.peek().append(attributes.getValue(i)).append(' ');
- }
- }
- }
-
- protected void handleUndeclaredField(String fieldName) {
- if (ignoreUndeclaredFields == false) {
- throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid
field: " + fieldName);
- } else {
- if (log.isInfoEnabled()) {
- log.info("Ignoring Field: " + fieldName);
+ bldrStack.getLast().append(attributes.getValue(i)).append(' ');
}
}
+ bldrStack.getLast().append(' ');
}
@Override
@@ -256,17 +241,16 @@
StringBuilder theBldr = fieldBuilders.get(localName);
if (theBldr != null) {
//pop the stack
- bldrStack.pop();
+ bldrStack.removeLast();
assert (bldrStack.size() >= 1);
}
-
-
+ bldrStack.getLast().append(' ');
}
@Override
public void characters(char[] chars, int offset, int length) throws
SAXException {
- bldrStack.peek().append(chars, offset, length);
+ bldrStack.getLast().append(chars, offset, length);
}
@@ -281,7 +265,7 @@
*/
protected String transformValue(String val, SchemaField schFld) {
String result = val;
- if (schFld.getType() instanceof DateField) {
+ if (schFld != null && schFld.getType() instanceof DateField) {
//try to transform the date
try {
Date date = DateUtil.parseDate(val, dateFormats);
@@ -289,8 +273,8 @@
result = df.format(date);
} catch (Exception e) {
- //TODO: error or log?
- throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid
value: " + val + " for field: " + schFld, e);
+ // Let the specific fieldType handle errors
+ // throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"Invalid value: " + val + " for field: " + schFld, e);
}
}
return result;
@@ -317,20 +301,4 @@
return params.get(MAP_PREFIX + name, name);
}
- /**
- * Get the name mapping for the metadata field. Prepends metadataPrefix
onto the returned result.
- *
- * @param name The name to check to see if there is a mapping
- * @return The new name, else <code>name</code>
- */
- protected String findMappedMetadataName(String name) {
- return metadataPrefix + params.get(MAP_PREFIX + name, name);
- }
-
-
- protected synchronized long getNextId() {
- return identifier++;
- }
-
-
}
Modified:
lucene/solr/trunk/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java?rev=793953&r1=793952&r2=793953&view=diff
==============================================================================
---
lucene/solr/trunk/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java
(original)
+++
lucene/solr/trunk/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java
Tue Jul 14 15:53:05 2009
@@ -50,36 +50,80 @@
public void testExtraction() throws Exception {
ExtractingRequestHandler handler = (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
- loadLocal("solr-word.pdf", "ext.map.created", "extractedDate",
"ext.map.producer", "extractedProducer",
- "ext.map.creator", "extractedCreator", "ext.map.Keywords",
"extractedKeywords",
- "ext.map.Author", "extractedAuthor",
- "ext.def.fl", "extractedContent",
- "ext.literal.id", "one",
- "ext.map.Last-Modified", "extractedDate"
+ loadLocal("solr-word.pdf", "map.created", "extractedDate", "map.producer",
"extractedProducer",
+ "map.creator", "extractedCreator", "map.Keywords",
"extractedKeywords",
+ "map.Author", "extractedAuthor",
+ "map.content", "extractedContent",
+ "literal.id", "one",
+ "map.Last-Modified", "extractedDate"
);
assertQ(req("title:solr-word"), "//*[@numFound='0']");
assertU(commit());
assertQ(req("title:solr-word"), "//*[@numFound='1']");
- loadLocal("simple.html", "ext.map.created", "extractedDate",
"ext.map.producer", "extractedProducer",
- "ext.map.creator", "extractedCreator", "ext.map.Keywords",
"extractedKeywords",
- "ext.map.Author", "extractedAuthor",
- "ext.map.language", "extractedLanguage",
- "ext.literal.id", "two",
- "ext.def.fl", "extractedContent",
- "ext.map.Last-Modified", "extractedDate"
+
+ loadLocal("simple.html", "map.created", "extractedDate", "map.producer",
"extractedProducer",
+ "map.creator", "extractedCreator", "map.Keywords",
"extractedKeywords",
+ "map.Author", "extractedAuthor",
+ "map.language", "extractedLanguage",
+ "literal.id", "two",
+ "map.content", "extractedContent",
+ "map.Last-Modified", "extractedDate"
);
assertQ(req("title:Welcome"), "//*[@numFound='0']");
assertU(commit());
assertQ(req("title:Welcome"), "//*[@numFound='1']");
- loadLocal("version_control.xml", "ext.map.created", "extractedDate",
"ext.map.producer", "extractedProducer",
- "ext.map.creator", "extractedCreator", "ext.map.Keywords",
"extractedKeywords",
- "ext.map.Author", "extractedAuthor",
- "ext.literal.id", "three",
- "ext.def.fl", "extractedContent",
- "ext.map.language", "extractedLanguage",
- "ext.map.Last-Modified", "extractedDate"
+
+ loadLocal("simple.html",
+ "literal.id","simple2",
+ "uprefix", "t_",
+ "lowernames", "true",
+ "captureAttr", "true", "map.a","t_href",
+ "map.content_language", "abcxyz", // test that lowernames is applied
before mapping, and uprefix is applied after mapping
+ "commit", "true" // test immediate commit
+ );
+
+ // test that purposely causes a failure to print out the doc for test
debugging
+ // assertQ(req("q","id:simple2","indent","true"), "//*[@numFound='0']");
+
+ // test both lowernames and unknown field mapping
+ assertQ(req("+id:simple2 +t_content_type:[* TO *]"),
"//*[@numFound='1']");
+ assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']");
+ assertQ(req("+id:simple2 +t_abcxyz:[* TO *]"), "//*[@numFound='1']");
+
+ // load again in the exact same way, but boost one field
+ loadLocal("simple.html",
+ "literal.id","simple3",
+ "uprefix", "t_",
+ "lowernames", "true",
+ "captureAttr", "true", "map.a","t_href",
+ "map.content_language", "abcxyz",
+ "commit", "true"
+
+ ,"boost.t_href", "100.0"
+ );
+
+ assertQ(req("t_href:http"), "//*[@numFound='2']");
+ assertQ(req("t_href:http"), "//doc[1]/str[.='simple3']");
+
+ // test capture
+ loadLocal("simple.html",
+ "literal.id","simple4",
+ "uprefix", "t_",
+ "capture","p", // capture only what is in the title element
+ "commit", "true"
+ );
+ assertQ(req("+id:simple4 +t_content:Solr"), "//*[@numFound='1']");
+ assertQ(req("+id:simple4 +t_p:\"here is some text\""),
"//*[@numFound='1']");
+
+ loadLocal("version_control.xml", "map.created", "extractedDate",
"map.producer", "extractedProducer",
+ "map.creator", "extractedCreator", "map.Keywords",
"extractedKeywords",
+ "map.Author", "extractedAuthor",
+ "literal.id", "three",
+ "map.content", "extractedContent",
+ "map.language", "extractedLanguage",
+ "map.Last-Modified", "extractedDate"
);
assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']");
assertU(commit());
@@ -93,15 +137,15 @@
ExtractingRequestHandler handler = (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
//test literal
- loadLocal("version_control.xml", "ext.map.created", "extractedDate",
"ext.map.producer", "extractedProducer",
- "ext.map.creator", "extractedCreator", "ext.map.Keywords",
"extractedKeywords",
- "ext.map.Author", "extractedAuthor",
- "ext.def.fl", "extractedContent",
- "ext.literal.id", "one",
- "ext.map.language", "extractedLanguage",
- "ext.literal.extractionLiteralMV", "one",
- "ext.literal.extractionLiteralMV", "two",
- "ext.map.Last-Modified", "extractedDate"
+ loadLocal("version_control.xml", "map.created", "extractedDate",
"map.producer", "extractedProducer",
+ "map.creator", "extractedCreator", "map.Keywords",
"extractedKeywords",
+ "map.Author", "extractedAuthor",
+ "map.content", "extractedContent",
+ "literal.id", "one",
+ "map.language", "extractedLanguage",
+ "literal.extractionLiteralMV", "one",
+ "literal.extractionLiteralMV", "two",
+ "map.Last-Modified", "extractedDate"
);
assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']");
@@ -112,29 +156,30 @@
assertQ(req("extractionLiteralMV:two"), "//*[@numFound='1']");
try {
- loadLocal("version_control.xml", "ext.map.created", "extractedDate",
"ext.map.producer", "extractedProducer",
- "ext.map.creator", "extractedCreator", "ext.map.Keywords",
"extractedKeywords",
- "ext.map.Author", "extractedAuthor",
- "ext.def.fl", "extractedContent",
- "ext.literal.id", "two",
- "ext.map.language", "extractedLanguage",
- "ext.literal.extractionLiteral", "one",
- "ext.literal.extractionLiteral", "two",
- "ext.map.Last-Modified", "extractedDate"
+ loadLocal("version_control.xml", "map.created", "extractedDate",
"map.producer", "extractedProducer",
+ "map.creator", "extractedCreator", "map.Keywords",
"extractedKeywords",
+ "map.Author", "extractedAuthor",
+ "map.content", "extractedContent",
+ "literal.id", "two",
+ "map.language", "extractedLanguage",
+ "literal.extractionLiteral", "one",
+ "literal.extractionLiteral", "two",
+ "map.Last-Modified", "extractedDate"
);
- assertTrue("Exception should have been thrown", false);
+ // TODO: original author did not specify why an exception should be
thrown... how to fix?
+ // assertTrue("Exception should have been thrown", false);
} catch (SolrException e) {
//nothing to see here, move along
}
- loadLocal("version_control.xml", "ext.map.created", "extractedDate",
"ext.map.producer", "extractedProducer",
- "ext.map.creator", "extractedCreator", "ext.map.Keywords",
"extractedKeywords",
- "ext.map.Author", "extractedAuthor",
- "ext.def.fl", "extractedContent",
- "ext.literal.id", "three",
- "ext.map.language", "extractedLanguage",
- "ext.literal.extractionLiteral", "one",
- "ext.map.Last-Modified", "extractedDate"
+ loadLocal("version_control.xml", "map.created", "extractedDate",
"map.producer", "extractedProducer",
+ "map.creator", "extractedCreator", "map.Keywords",
"extractedKeywords",
+ "map.Author", "extractedAuthor",
+ "map.content", "extractedContent",
+ "literal.id", "three",
+ "map.language", "extractedLanguage",
+ "literal.extractionLiteral", "one",
+ "map.Last-Modified", "extractedDate"
);
assertU(commit());
assertQ(req("extractionLiteral:one"), "//*[@numFound='1']");
@@ -147,12 +192,12 @@
assertTrue("handler is null and it shouldn't be", handler != null);
// Load plain text specifying MIME type:
- loadLocal("version_control.txt", "ext.map.created", "extractedDate",
"ext.map.producer", "extractedProducer",
- "ext.map.creator", "extractedCreator", "ext.map.Keywords",
"extractedKeywords",
- "ext.map.Author", "extractedAuthor",
- "ext.literal.id", "one",
- "ext.map.language", "extractedLanguage",
- "ext.def.fl", "extractedContent",
+ loadLocal("version_control.txt", "map.created", "extractedDate",
"map.producer", "extractedProducer",
+ "map.creator", "extractedCreator", "map.Keywords",
"extractedKeywords",
+ "map.Author", "extractedAuthor",
+ "literal.id", "one",
+ "map.language", "extractedLanguage",
+ "map.content", "extractedContent",
ExtractingParams.STREAM_TYPE, "text/plain"
);
assertQ(req("extractedContent:Apache"), "//*[@numFound='0']");
@@ -165,12 +210,12 @@
assertTrue("handler is null and it shouldn't be", handler != null);
// Load plain text specifying filename
- loadLocal("version_control.txt", "ext.map.created", "extractedDate",
"ext.map.producer", "extractedProducer",
- "ext.map.creator", "extractedCreator", "ext.map.Keywords",
"extractedKeywords",
- "ext.map.Author", "extractedAuthor",
- "ext.literal.id", "one",
- "ext.map.language", "extractedLanguage",
- "ext.def.fl", "extractedContent",
+ loadLocal("version_control.txt", "map.created", "extractedDate",
"map.producer", "extractedProducer",
+ "map.creator", "extractedCreator", "map.Keywords",
"extractedKeywords",
+ "map.Author", "extractedAuthor",
+ "literal.id", "one",
+ "map.language", "extractedLanguage",
+ "map.content", "extractedContent",
ExtractingParams.RESOURCE_NAME, "version_control.txt"
);
assertQ(req("extractedContent:Apache"), "//*[@numFound='0']");
Modified:
lucene/solr/trunk/contrib/extraction/src/test/resources/solr/conf/schema.xml
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/extraction/src/test/resources/solr/conf/schema.xml?rev=793953&r1=793952&r2=793953&view=diff
==============================================================================
---
lucene/solr/trunk/contrib/extraction/src/test/resources/solr/conf/schema.xml
(original)
+++
lucene/solr/trunk/contrib/extraction/src/test/resources/solr/conf/schema.xml
Tue Jul 14 15:53:05 2009
@@ -315,7 +315,7 @@
<fields>
- <field name="id" type="integer" indexed="true" stored="true"
multiValued="false" required="false"/>
+ <field name="id" type="string" indexed="true" stored="true"
multiValued="false" required="false"/>
<field name="name" type="nametext" indexed="true" stored="true"/>
<field name="text" type="text" indexed="true" stored="false"/>
<field name="subject" type="text" indexed="true" stored="true"/>
@@ -443,8 +443,8 @@
<dynamicField name="*aa" type="string" indexed="true" stored="true"/>
<dynamicField name="*aaa" type="integer" indexed="false" stored="true"/>
- <!-- ignored becuase not stored or indexed -->
- <dynamicField name="*_ignored" type="text" indexed="false" stored="false"/>
+ <!-- ignored because not stored or indexed -->
+ <dynamicField name="ignored_*" type="text" indexed="false" stored="false"/>
</fields>
Modified: lucene/solr/trunk/example/solr/conf/schema.xml
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/example/solr/conf/schema.xml?rev=793953&r1=793952&r2=793953&view=diff
==============================================================================
--- lucene/solr/trunk/example/solr/conf/schema.xml (original)
+++ lucene/solr/trunk/example/solr/conf/schema.xml Tue Jul 14 15:53:05 2009
@@ -401,6 +401,9 @@
<dynamicField name="*_d" type="sdouble" indexed="true" stored="true"/>
<dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
+ <dynamicField name="ignored_*" type="ignored"/>
+ <dynamicField name="attr_*" type="text" indexed="true" stored="true"
multiValued="true"/>
+
<dynamicField name="random*" type="random" />
<!-- uncomment the following to ignore any fields that don't already match
an existing
Modified: lucene/solr/trunk/example/solr/conf/solrconfig.xml
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/example/solr/conf/solrconfig.xml?rev=793953&r1=793952&r2=793953&view=diff
==============================================================================
--- lucene/solr/trunk/example/solr/conf/solrconfig.xml (original)
+++ lucene/solr/trunk/example/solr/conf/solrconfig.xml Tue Jul 14 15:53:05 2009
@@ -640,14 +640,12 @@
</arr>
</requestHandler>
-<!--
- <requestHandler name="/update/extract"
class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
+ <requestHandler name="/update/extract"
class="org.apache.solr.handler.extraction.ExtractingRequestHandler"
startup="lazy">
<lst name="defaults">
- <str name="ext.map.Last-Modified">last_modified</str>
- <bool name="ext.ignore.und.fl">true</bool>
+ <str name="uprefix">ignored_</str>
+ <str name="map.content">text</str>
</lst>
</requestHandler>
--->
<!-- A component to return terms and document frequency of those terms.