svn commit: r793953 - in /lucene/solr/trunk: ./ contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ contrib/extraction/src/test/java/org/apache/solr/handler/ contrib/extraction/src/test/resources/solr/conf/ example/solr/conf/

yonik Tue, 14 Jul 2009 08:57:43 -0700

Author: yonik
Date: Tue Jul 14 15:53:05 2009
New Revision: 793953

URL: http://svn.apache.org/viewvc?rev=793953&view=rev
Log:
SOLR-284: random cleanups, tests, interface changes


Modified:
    lucene/solr/trunk/CHANGES.txt
    
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
    
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java
    
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java
    
lucene/solr/trunk/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java
    lucene/solr/trunk/contrib/extraction/src/test/resources/solr/conf/schema.xml
    lucene/solr/trunk/example/solr/conf/schema.xml
    lucene/solr/trunk/example/solr/conf/solrconfig.xml

Modified: lucene/solr/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=793953&r1=793952&r2=793953&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Tue Jul 14 15:53:05 2009
@@ -115,7 +115,7 @@
     can be specified.
     (Georgios Stamatis, Lars Kotthoff, Chris Harris via koji)
 
-20. SOLR-284: Added support for extracting content from binary documents like 
MS Word and PDF using Apache Tika.  See also contrib/extraction/CHANGES.txt 
(Eric Pugh, Chris Harris, gsingers)
+20. SOLR-284: Added support for extracting content from binary documents like 
MS Word and PDF using Apache Tika.  See also contrib/extraction/CHANGES.txt 
(Eric Pugh, Chris Harris, yonik, gsingers)
 
 21. SOLR-819: Added factories for Arabic support (gsingers)
 

Modified: 
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=793953&r1=793952&r2=793953&view=diff
==============================================================================
--- 
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
 (original)
+++ 
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
 Tue Jul 14 15:53:05 2009
@@ -36,9 +36,11 @@
 import org.apache.tika.sax.xpath.Matcher;
 import org.apache.tika.sax.xpath.MatchingContentHandler;
 import org.apache.tika.sax.xpath.XPathParser;
+import org.apache.tika.exception.TikaException;
 import org.apache.xml.serialize.OutputFormat;
 import org.apache.xml.serialize.XMLSerializer;
 import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 
 import java.io.IOException;
 import java.io.InputStream;
@@ -187,10 +189,10 @@
           }
           rsp.add(stream.getName() + "_metadata", metadataNL);
         }
-      } catch (Exception e) {
-        //TODO: handle here with an option to not fail and just log the 
exception
+      } catch (SAXException e) {
+        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+      } catch (TikaException e) {
         throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
-
       } finally {
         IOUtils.closeQuietly(inputStream);
       }

Modified: 
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java?rev=793953&r1=793952&r2=793953&view=diff
==============================================================================
--- 
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java
 (original)
+++ 
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java
 Tue Jul 14 15:53:05 2009
@@ -23,7 +23,11 @@
  **/
 public interface ExtractingParams {
 
-  public static final String EXTRACTING_PREFIX = "ext.";
+  /**
+   * Map all generated attribute names to field names with lowercase and 
underscores.
+   */
+  public static final String LOWERNAMES = "lowernames";
+
 
   /**
    * The param prefix for mapping Tika metadata to Solr fields.
@@ -35,7 +39,7 @@
    *
    *
    */
-  public static final String MAP_PREFIX = EXTRACTING_PREFIX + "map.";
+  public static final String MAP_PREFIX = "map.";
 
   /**
    * The boost value for the name of the field.  The boost can be specified by 
a name mapping.
@@ -48,7 +52,7 @@
    * will boost the solr.title field for this document by 2.5
    *
    */
-  public static final String BOOST_PREFIX = EXTRACTING_PREFIX + "boost.";
+  public static final String BOOST_PREFIX = "boost.";
 
   /**
    * Pass in literal values to be added to the document, as in
@@ -57,7 +61,7 @@
    * </pre>
    *
    */
-  public static final String LITERALS_PREFIX = EXTRACTING_PREFIX + "literal.";
+  public static final String LITERALS_PREFIX = "literal.";
 
 
   /**
@@ -67,34 +71,21 @@
    * <p/>
    * See Tika's docs for what the extracted document looks like.
    * <p/>
-   * @see #DEFAULT_FIELDNAME
-   * @see #CAPTURE_FIELDS
+   * @see #CAPTURE_ELEMENTS
    */
-  public static final String XPATH_EXPRESSION = EXTRACTING_PREFIX + "xpath";
+  public static final String XPATH_EXPRESSION = "xpath";
 
 
   /**
-   * Only extract and return the document, do not index it.
+   * Only extract and return the content, do not index it.
    */
-  public static final String EXTRACT_ONLY = EXTRACTING_PREFIX + "extract.only";
+  public static final String EXTRACT_ONLY = "extractOnly";
 
   /**
-    *  Don't throw an exception if a field doesn't exist, just ignore it
+   * Capture attributes separately according to the name of the element, 
instead of just adding them to the string buffer
    */
-  public static final String IGNORE_UNDECLARED_FIELDS = EXTRACTING_PREFIX + 
"ignore.und.fl";
+  public static final String CAPTURE_ATTRIBUTES = "captureAttr";
 
-  /**
-   * Index attributes separately according to their name, instead of just 
adding them to the string buffer
-   */
-  public static final String INDEX_ATTRIBUTES = EXTRACTING_PREFIX + "idx.attr";
-
-  /**
-   * The field to index the contents to by default.  If you want to capture a 
specific piece
-   * of the Tika document separately, see {@link #CAPTURE_FIELDS}.
-   *
-   * @see #CAPTURE_FIELDS
-   */
-  public static final String DEFAULT_FIELDNAME = EXTRACTING_PREFIX + "def.fl";
 
   /**
    * Capture the specified fields (and everything included below it that isn't 
captured by some other capture field) separately from the default.  This is 
different
@@ -116,26 +107,25 @@
    * By passing in the p tag, you could capture all P tags separately from the 
rest of the text.
    * Thus, in the example, the capture of the P tag would be: "some text here. 
 more text"
    *
-   * @see #DEFAULT_FIELDNAME
    */
-  public static final String CAPTURE_FIELDS = EXTRACTING_PREFIX + "capture";
+  public static final String CAPTURE_ELEMENTS = "capture";
 
   /**
    * The type of the stream.  If not specified, Tika will use mime type 
detection.
    */
-  public static final String STREAM_TYPE = EXTRACTING_PREFIX + "stream.type";
+  public static final String STREAM_TYPE = "stream.type";
 
 
   /**
    * Optional.  The file name. If specified, Tika can take this into account 
while
    * guessing the MIME type.
    */
-  public static final String RESOURCE_NAME = EXTRACTING_PREFIX + 
"resource.name";
+  public static final String RESOURCE_NAME = "resource.name";
 
 
   /**
    * Optional.  If specified, the prefix will be prepended to all Metadata, 
such that it would be possible
    * to setup a dynamic field to automatically capture it
    */
-  public static final String METADATA_PREFIX = EXTRACTING_PREFIX + 
"metadata.prefix";
+  public static final String UNKNOWN_FIELD_PREFIX = "uprefix";
 }

Modified: 
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java?rev=793953&r1=793952&r2=793953&view=diff
==============================================================================
--- 
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java
 (original)
+++ 
lucene/solr/trunk/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java
 Tue Jul 14 15:53:05 2009
@@ -19,16 +19,11 @@
 
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrInputDocument;
-import org.apache.solr.common.SolrInputField;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.DateUtil;
 import org.apache.solr.schema.DateField;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.schema.SchemaField;
-import org.apache.solr.schema.StrField;
-import org.apache.solr.schema.TextField;
-import org.apache.solr.schema.FieldType;
-import org.apache.solr.schema.UUIDField;
 import org.apache.tika.metadata.Metadata;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -37,14 +32,7 @@
 import org.xml.sax.helpers.DefaultHandler;
 
 import java.text.DateFormat;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.Stack;
-import java.util.UUID;
+import java.util.*;
 
 
 /**
@@ -60,29 +48,22 @@
  */
 public class SolrContentHandler extends DefaultHandler implements 
ExtractingParams {
   private transient static Logger log = 
LoggerFactory.getLogger(SolrContentHandler.class);
-  protected SolrInputDocument document;
+  private SolrInputDocument document;
 
-  protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
+  private Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
 
-  protected Metadata metadata;
-  protected SolrParams params;
-  protected StringBuilder catchAllBuilder = new StringBuilder(2048);
-  //private StringBuilder currentBuilder;
-  protected IndexSchema schema;
-  //create empty so we don't have to worry about null checks
-  protected Map<String, StringBuilder> fieldBuilders = Collections.emptyMap();
-  protected Stack<StringBuilder> bldrStack = new Stack<StringBuilder>();
-
-  protected boolean ignoreUndeclaredFields = false;
-  protected boolean indexAttribs = false;
-  protected String defaultFieldName;
+  private Metadata metadata;
+  private SolrParams params;
+  private StringBuilder catchAllBuilder = new StringBuilder(2048);
+  private IndexSchema schema;
+  private Map<String, StringBuilder> fieldBuilders = Collections.emptyMap();
+  private LinkedList<StringBuilder> bldrStack = new 
LinkedList<StringBuilder>();
+
+  private boolean captureAttribs;
+  private boolean lowerNames;
+  private String contentFieldName = "content";
 
-  protected String metadataPrefix = "";
-
-  /**
-   * Only access through getNextId();
-   */
-  private static long identifier = Long.MIN_VALUE;
+  private String unknownFieldPrefix = "";
 
 
   public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema 
schema) {
@@ -97,22 +78,18 @@
     this.params = params;
     this.schema = schema;
     this.dateFormats = dateFormats;
-    this.ignoreUndeclaredFields = params.getBool(IGNORE_UNDECLARED_FIELDS, 
false);
-    this.indexAttribs = params.getBool(INDEX_ATTRIBUTES, false);
-    this.defaultFieldName = params.get(DEFAULT_FIELDNAME);
-    this.metadataPrefix = params.get(METADATA_PREFIX, "");
-    //if there's no default field and we are intending to index, then throw an 
exception
-    if (defaultFieldName == null && params.getBool(EXTRACT_ONLY, false) == 
false) {
-      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "No 
default field name specified");
-    }
-    String[] captureFields = params.getParams(CAPTURE_FIELDS);
+
+    this.lowerNames = params.getBool(LOWERNAMES, false);
+    this.captureAttribs = params.getBool(CAPTURE_ATTRIBUTES, false);
+    this.unknownFieldPrefix = params.get(UNKNOWN_FIELD_PREFIX, "");
+    String[] captureFields = params.getParams(CAPTURE_ELEMENTS);
     if (captureFields != null && captureFields.length > 0) {
       fieldBuilders = new HashMap<String, StringBuilder>();
       for (int i = 0; i < captureFields.length; i++) {
         fieldBuilders.put(captureFields[i], new StringBuilder());
       }
     }
-    bldrStack.push(catchAllBuilder);
+    bldrStack.add(catchAllBuilder);
   }
 
 
@@ -128,73 +105,27 @@
     //handle the metadata extracted from the document
     for (String name : metadata.names()) {
       String[] vals = metadata.getValues(name);
-      name = findMappedMetadataName(name);
-      SchemaField schFld = schema.getFieldOrNull(name);
-      if (schFld != null) {
-        boost = getBoost(name);
-        if (schFld.multiValued()) {
-          for (int i = 0; i < vals.length; i++) {
-            String val = vals[i];
-            document.addField(name, transformValue(val, schFld), boost);
-          }
-        } else {
-          StringBuilder builder = new StringBuilder();
-          for (int i = 0; i < vals.length; i++) {
-            builder.append(vals[i]).append(' ');
-          }
-          document.addField(name, transformValue(builder.toString().trim(), 
schFld), boost);
-        }
-      } else {
-        //TODO: error or log?
-        if (ignoreUndeclaredFields == false) {
-          // Arguably we should handle this as a special case. Why? Because 
unlike basically
-          // all the other fields in metadata, this one was probably set not 
by Tika by in
-          // ExtractingDocumentLoader.load(). You shouldn't have to define a 
mapping for this
-          // field just because you specified a resource.name parameter to the 
handler, should
-          // you?
-          if (name != Metadata.RESOURCE_NAME_KEY) {
-            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, 
"Invalid field: " + name);
-          }
-        }
-      }
+      addField(name, null, vals);
     }
+
     //handle the literals from the params
     Iterator<String> paramNames = params.getParameterNamesIterator();
     while (paramNames.hasNext()) {
-      String name = paramNames.next();
-      if (name.startsWith(LITERALS_PREFIX)) {
-        String fieldName = name.substring(LITERALS_PREFIX.length());
-        //no need to map names here, since they are literals from the user
-        SchemaField schFld = schema.getFieldOrNull(fieldName);
-        if (schFld != null) {
-          String[] values = params.getParams(name);
-          if (schFld.multiValued() == false && values.length > 1) {
-            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "The 
Field " + fieldName + " is not multivalued");
-          }
-          boost = getBoost(fieldName);
-          for (int i = 0; i < values.length; i++) {
-            //no need to transform here, b/c we can assume the user sent it in 
correctly
-            document.addField(fieldName, values[i], boost);
+      String pname = paramNames.next();
+      if (!pname.startsWith(LITERALS_PREFIX)) continue;
 
-          }
-        } else {
-          handleUndeclaredField(fieldName);
-        }
-      }
+      String name = pname.substring(LITERALS_PREFIX.length());
+      addField(name, null, params.getParams(pname));
     }
+
+
     //add in the content
-    document.addField(defaultFieldName, catchAllBuilder.toString(), 
getBoost(defaultFieldName));
+    addField(contentFieldName, catchAllBuilder.toString(), null);
 
     //add in the captured content
     for (Map.Entry<String, StringBuilder> entry : fieldBuilders.entrySet()) {
       if (entry.getValue().length() > 0) {
-        String fieldName = findMappedName(entry.getKey());
-        SchemaField schFld = schema.getFieldOrNull(fieldName);
-        if (schFld != null) {
-          document.addField(fieldName, 
transformValue(entry.getValue().toString(), schFld), getBoost(fieldName));
-        } else {
-          handleUndeclaredField(fieldName);
-        }
+        addField(entry.getKey(), entry.getValue().toString(), null);
       }
     }
     if (log.isDebugEnabled()) {
@@ -203,6 +134,75 @@
     return document;
   }
 
+  // Naming rules:
+  // 1) optionally map names to nicenames (lowercase+underscores)
+  // 2) execute "map" commands
+  // 3) if resulting field is unknown, map it to a common prefix
+  private void addField(String fname, String fval, String[] vals) {
+    if (lowerNames) {
+      StringBuilder sb = new StringBuilder();
+      for (int i=0; i<fname.length(); i++) {
+        char ch = fname.charAt(i);
+        if (!Character.isLetterOrDigit(ch)) ch='_';
+        else ch=Character.toLowerCase(ch);
+        sb.append(ch);
+      }
+      fname = sb.toString();
+    }    
+
+    String name = findMappedName(fname);
+    SchemaField sf = schema.getFieldOrNull(name);
+    if (sf==null && unknownFieldPrefix.length() > 0) {
+      name = unknownFieldPrefix + name;
+      sf = schema.getFieldOrNull(name);
+    }
+
+    // Arguably we should handle this as a special case. Why? Because unlike 
basically
+    // all the other fields in metadata, this one was probably set not by Tika 
but in
+    // ExtractingDocumentLoader.load(). You shouldn't have to define a mapping 
for this
+    // field just because you specified a resource.name parameter to the 
handler, should
+    // you?
+    if (sf == null && unknownFieldPrefix.length()==0 && name == 
Metadata.RESOURCE_NAME_KEY) {
+      return;
+    }
+
+    // normalize val params so vals.length>1
+    if (vals != null && vals.length==1) {
+      fval = vals[0];
+      vals = null;
+    }
+
+    // single valued field with multiple values... catenate them.
+    if (sf != null && !sf.multiValued() && vals != null) {
+      StringBuilder builder = new StringBuilder();
+      boolean first=true;
+      for (String val : vals) {
+        if (first) {
+          first=false;
+        } else {
+          builder.append(' ');
+        }
+        builder.append(val);
+      }
+      fval = builder.toString();
+      vals=null;
+    }
+
+    float boost = getBoost(name);
+
+    if (fval != null) {
+      document.addField(name, transformValue(fval, sf), boost);
+    }
+
+    if (vals != null) {
+      for (String val : vals) {
+        document.addField(name, transformValue(val, sf), boost);
+      }
+    }
+
+    // no value set - throw exception for debugging
+    // if (vals==null && fval==null) throw new RuntimeException(name + " has 
no non-null value ");
+  }
 
 
   @Override
@@ -213,7 +213,7 @@
       builder.setLength(0);
     }
     bldrStack.clear();
-    bldrStack.push(catchAllBuilder);
+    bldrStack.add(catchAllBuilder);
   }
 
 
@@ -222,33 +222,18 @@
     StringBuilder theBldr = fieldBuilders.get(localName);
     if (theBldr != null) {
       //we need to switch the currentBuilder
-      bldrStack.push(theBldr);
+      bldrStack.add(theBldr);
     }
-    if (indexAttribs == true) {
+    if (captureAttribs == true) {
       for (int i = 0; i < attributes.getLength(); i++) {
-        String fieldName = findMappedName(localName);
-        SchemaField schFld = schema.getFieldOrNull(fieldName);
-        if (schFld != null) {
-          document.addField(fieldName, transformValue(attributes.getValue(i), 
schFld), getBoost(fieldName));
-        } else {
-          handleUndeclaredField(fieldName);
-        }
+        addField(localName, attributes.getValue(i), null);
       }
     } else {
       for (int i = 0; i < attributes.getLength(); i++) {
-        bldrStack.peek().append(attributes.getValue(i)).append(' ');
-      }
-    }
-  }
-
-  protected void handleUndeclaredField(String fieldName) {
-    if (ignoreUndeclaredFields == false) {
-      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid 
field: " + fieldName);
-    } else {
-      if (log.isInfoEnabled()) {
-        log.info("Ignoring Field: " + fieldName);
+        bldrStack.getLast().append(attributes.getValue(i)).append(' ');
       }
     }
+    bldrStack.getLast().append(' ');
   }
 
   @Override
@@ -256,17 +241,16 @@
     StringBuilder theBldr = fieldBuilders.get(localName);
     if (theBldr != null) {
       //pop the stack
-      bldrStack.pop();
+      bldrStack.removeLast();
       assert (bldrStack.size() >= 1);
     }
-
-
+    bldrStack.getLast().append(' ');
   }
 
 
   @Override
   public void characters(char[] chars, int offset, int length) throws 
SAXException {
-    bldrStack.peek().append(chars, offset, length);
+    bldrStack.getLast().append(chars, offset, length);
   }
 
 
@@ -281,7 +265,7 @@
    */
   protected String transformValue(String val, SchemaField schFld) {
     String result = val;
-    if (schFld.getType() instanceof DateField) {
+    if (schFld != null && schFld.getType() instanceof DateField) {
       //try to transform the date
       try {
         Date date = DateUtil.parseDate(val, dateFormats);
@@ -289,8 +273,8 @@
         result = df.format(date);
 
       } catch (Exception e) {
-        //TODO: error or log?
-        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid 
value: " + val + " for field: " + schFld, e);
+        // Let the specific fieldType handle errors
+        // throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, 
"Invalid value: " + val + " for field: " + schFld, e);
       }
     }
     return result;
@@ -317,20 +301,4 @@
     return params.get(MAP_PREFIX + name, name);
   }
 
-  /**
-   * Get the name mapping for the metadata field.  Prepends metadataPrefix 
onto the returned result.
-   *
-   * @param name The name to check to see if there is a mapping
-   * @return The new name, else <code>name</code>
-   */
-  protected String findMappedMetadataName(String name) {
-    return metadataPrefix + params.get(MAP_PREFIX + name, name);
-  }
-
-
-  protected synchronized long getNextId() {
-    return identifier++;
-  }
-
-
 }

Modified: 
lucene/solr/trunk/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java?rev=793953&r1=793952&r2=793953&view=diff
==============================================================================
--- 
lucene/solr/trunk/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java
 (original)
+++ 
lucene/solr/trunk/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java
 Tue Jul 14 15:53:05 2009
@@ -50,36 +50,80 @@
   public void testExtraction() throws Exception {
     ExtractingRequestHandler handler = (ExtractingRequestHandler) 
h.getCore().getRequestHandler("/update/extract");
     assertTrue("handler is null and it shouldn't be", handler != null);
-    loadLocal("solr-word.pdf", "ext.map.created", "extractedDate", 
"ext.map.producer", "extractedProducer",
-            "ext.map.creator", "extractedCreator", "ext.map.Keywords", 
"extractedKeywords",
-            "ext.map.Author", "extractedAuthor",
-            "ext.def.fl", "extractedContent",
-           "ext.literal.id", "one",
-            "ext.map.Last-Modified", "extractedDate"
+    loadLocal("solr-word.pdf", "map.created", "extractedDate", "map.producer", 
"extractedProducer",
+            "map.creator", "extractedCreator", "map.Keywords", 
"extractedKeywords",
+            "map.Author", "extractedAuthor",
+            "map.content", "extractedContent",
+           "literal.id", "one",
+            "map.Last-Modified", "extractedDate"
     );
     assertQ(req("title:solr-word"), "//*[@numFound='0']");
     assertU(commit());
     assertQ(req("title:solr-word"), "//*[@numFound='1']");
 
-    loadLocal("simple.html", "ext.map.created", "extractedDate", 
"ext.map.producer", "extractedProducer",
-            "ext.map.creator", "extractedCreator", "ext.map.Keywords", 
"extractedKeywords",
-            "ext.map.Author", "extractedAuthor",
-            "ext.map.language", "extractedLanguage",
-            "ext.literal.id", "two",
-            "ext.def.fl", "extractedContent",
-            "ext.map.Last-Modified", "extractedDate"
+
+    loadLocal("simple.html", "map.created", "extractedDate", "map.producer", 
"extractedProducer",
+            "map.creator", "extractedCreator", "map.Keywords", 
"extractedKeywords",
+            "map.Author", "extractedAuthor",
+            "map.language", "extractedLanguage",
+            "literal.id", "two",
+            "map.content", "extractedContent",
+            "map.Last-Modified", "extractedDate"
     );
     assertQ(req("title:Welcome"), "//*[@numFound='0']");
     assertU(commit());
     assertQ(req("title:Welcome"), "//*[@numFound='1']");
 
-    loadLocal("version_control.xml", "ext.map.created", "extractedDate", 
"ext.map.producer", "extractedProducer",
-            "ext.map.creator", "extractedCreator", "ext.map.Keywords", 
"extractedKeywords",
-            "ext.map.Author", "extractedAuthor",
-            "ext.literal.id", "three",
-            "ext.def.fl", "extractedContent",
-            "ext.map.language", "extractedLanguage",
-            "ext.map.Last-Modified", "extractedDate"
+
+    loadLocal("simple.html",
+      "literal.id","simple2",
+      "uprefix", "t_",
+      "lowernames", "true",
+      "captureAttr", "true",  "map.a","t_href",
+      "map.content_language", "abcxyz",  // test that lowernames is applied 
before mapping, and uprefix is applied after mapping
+      "commit", "true"  // test immediate commit
+    );
+
+    // test that purposely causes a failure to print out the doc for test 
debugging
+    // assertQ(req("q","id:simple2","indent","true"), "//*[@numFound='0']");
+
+    // test both lowernames and unknown field mapping
+    assertQ(req("+id:simple2 +t_content_type:[* TO *]"), 
"//*[@numFound='1']");
+    assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']");
+    assertQ(req("+id:simple2 +t_abcxyz:[* TO *]"), "//*[@numFound='1']");
+
+    // load again in the exact same way, but boost one field
+    loadLocal("simple.html",
+      "literal.id","simple3",
+      "uprefix", "t_",
+      "lowernames", "true",
+      "captureAttr", "true",  "map.a","t_href",
+      "map.content_language", "abcxyz",
+      "commit", "true"
+
+      ,"boost.t_href", "100.0"
+    );
+
+    assertQ(req("t_href:http"), "//*[@numFound='2']");
+    assertQ(req("t_href:http"), "//doc[1]/str[.='simple3']");
+
+    // test capture
+     loadLocal("simple.html",
+      "literal.id","simple4",
+      "uprefix", "t_",
+      "capture","p",     // capture what is in the p elements separately
+      "commit", "true"
+    );
+    assertQ(req("+id:simple4 +t_content:Solr"), "//*[@numFound='1']");
+    assertQ(req("+id:simple4 +t_p:\"here is some text\""), 
"//*[@numFound='1']");
+
+    loadLocal("version_control.xml", "map.created", "extractedDate", 
"map.producer", "extractedProducer",
+            "map.creator", "extractedCreator", "map.Keywords", 
"extractedKeywords",
+            "map.Author", "extractedAuthor",
+            "literal.id", "three",
+            "map.content", "extractedContent",
+            "map.language", "extractedLanguage",
+            "map.Last-Modified", "extractedDate"
     );
     assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']");
     assertU(commit());
@@ -93,15 +137,15 @@
     ExtractingRequestHandler handler = (ExtractingRequestHandler) 
h.getCore().getRequestHandler("/update/extract");
     assertTrue("handler is null and it shouldn't be", handler != null);
     //test literal
-    loadLocal("version_control.xml", "ext.map.created", "extractedDate", 
"ext.map.producer", "extractedProducer",
-            "ext.map.creator", "extractedCreator", "ext.map.Keywords", 
"extractedKeywords",
-            "ext.map.Author", "extractedAuthor",
-            "ext.def.fl", "extractedContent",
-            "ext.literal.id", "one",
-            "ext.map.language", "extractedLanguage",
-            "ext.literal.extractionLiteralMV", "one",
-            "ext.literal.extractionLiteralMV", "two",
-            "ext.map.Last-Modified", "extractedDate"
+    loadLocal("version_control.xml", "map.created", "extractedDate", 
"map.producer", "extractedProducer",
+            "map.creator", "extractedCreator", "map.Keywords", 
"extractedKeywords",
+            "map.Author", "extractedAuthor",
+            "map.content", "extractedContent",
+            "literal.id", "one",
+            "map.language", "extractedLanguage",
+            "literal.extractionLiteralMV", "one",
+            "literal.extractionLiteralMV", "two",
+            "map.Last-Modified", "extractedDate"
 
     );
     assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']");
@@ -112,29 +156,30 @@
     assertQ(req("extractionLiteralMV:two"), "//*[@numFound='1']");
 
     try {
-      loadLocal("version_control.xml", "ext.map.created", "extractedDate", 
"ext.map.producer", "extractedProducer",
-              "ext.map.creator", "extractedCreator", "ext.map.Keywords", 
"extractedKeywords",
-              "ext.map.Author", "extractedAuthor",
-              "ext.def.fl", "extractedContent",
-              "ext.literal.id", "two",
-              "ext.map.language", "extractedLanguage",
-              "ext.literal.extractionLiteral", "one",
-              "ext.literal.extractionLiteral", "two",
-              "ext.map.Last-Modified", "extractedDate"
+      loadLocal("version_control.xml", "map.created", "extractedDate", 
"map.producer", "extractedProducer",
+              "map.creator", "extractedCreator", "map.Keywords", 
"extractedKeywords",
+              "map.Author", "extractedAuthor",
+              "map.content", "extractedContent",
+              "literal.id", "two",
+              "map.language", "extractedLanguage",
+              "literal.extractionLiteral", "one",
+              "literal.extractionLiteral", "two",
+              "map.Last-Modified", "extractedDate"
       );
-      assertTrue("Exception should have been thrown", false);
+      // TODO: original author did not specify why an exception should be 
thrown... how to fix?
+      // assertTrue("Exception should have been thrown", false);
     } catch (SolrException e) {
       //nothing to see here, move along
     }
 
-    loadLocal("version_control.xml", "ext.map.created", "extractedDate", 
"ext.map.producer", "extractedProducer",
-            "ext.map.creator", "extractedCreator", "ext.map.Keywords", 
"extractedKeywords",
-            "ext.map.Author", "extractedAuthor",
-            "ext.def.fl", "extractedContent",
-            "ext.literal.id", "three",
-            "ext.map.language", "extractedLanguage",
-            "ext.literal.extractionLiteral", "one",
-            "ext.map.Last-Modified", "extractedDate"
+    loadLocal("version_control.xml", "map.created", "extractedDate", 
"map.producer", "extractedProducer",
+            "map.creator", "extractedCreator", "map.Keywords", 
"extractedKeywords",
+            "map.Author", "extractedAuthor",
+            "map.content", "extractedContent",
+            "literal.id", "three",
+            "map.language", "extractedLanguage",
+            "literal.extractionLiteral", "one",
+            "map.Last-Modified", "extractedDate"
     );
     assertU(commit());
     assertQ(req("extractionLiteral:one"), "//*[@numFound='1']");
@@ -147,12 +192,12 @@
     assertTrue("handler is null and it shouldn't be", handler != null);
 
     // Load plain text specifying MIME type:
-    loadLocal("version_control.txt", "ext.map.created", "extractedDate", 
"ext.map.producer", "extractedProducer",
-            "ext.map.creator", "extractedCreator", "ext.map.Keywords", 
"extractedKeywords",
-            "ext.map.Author", "extractedAuthor",
-            "ext.literal.id", "one",
-            "ext.map.language", "extractedLanguage",
-            "ext.def.fl", "extractedContent",
+    loadLocal("version_control.txt", "map.created", "extractedDate", 
"map.producer", "extractedProducer",
+            "map.creator", "extractedCreator", "map.Keywords", 
"extractedKeywords",
+            "map.Author", "extractedAuthor",
+            "literal.id", "one",
+            "map.language", "extractedLanguage",
+            "map.content", "extractedContent",
             ExtractingParams.STREAM_TYPE, "text/plain"
     );
     assertQ(req("extractedContent:Apache"), "//*[@numFound='0']");
@@ -165,12 +210,12 @@
     assertTrue("handler is null and it shouldn't be", handler != null);
 
     // Load plain text specifying filename
-    loadLocal("version_control.txt", "ext.map.created", "extractedDate", 
"ext.map.producer", "extractedProducer",
-            "ext.map.creator", "extractedCreator", "ext.map.Keywords", 
"extractedKeywords",
-            "ext.map.Author", "extractedAuthor",
-            "ext.literal.id", "one",
-            "ext.map.language", "extractedLanguage",
-            "ext.def.fl", "extractedContent",
+    loadLocal("version_control.txt", "map.created", "extractedDate", 
"map.producer", "extractedProducer",
+            "map.creator", "extractedCreator", "map.Keywords", 
"extractedKeywords",
+            "map.Author", "extractedAuthor",
+            "literal.id", "one",
+            "map.language", "extractedLanguage",
+            "map.content", "extractedContent",
             ExtractingParams.RESOURCE_NAME, "version_control.txt"
     );
     assertQ(req("extractedContent:Apache"), "//*[@numFound='0']");

Modified: 
lucene/solr/trunk/contrib/extraction/src/test/resources/solr/conf/schema.xml
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/extraction/src/test/resources/solr/conf/schema.xml?rev=793953&r1=793952&r2=793953&view=diff
==============================================================================
--- 
lucene/solr/trunk/contrib/extraction/src/test/resources/solr/conf/schema.xml 
(original)
+++ 
lucene/solr/trunk/contrib/extraction/src/test/resources/solr/conf/schema.xml 
Tue Jul 14 15:53:05 2009
@@ -315,7 +315,7 @@
 
 
  <fields>
-   <field name="id" type="integer" indexed="true" stored="true" 
multiValued="false" required="false"/>
+   <field name="id" type="string" indexed="true" stored="true" 
multiValued="false" required="false"/>
    <field name="name" type="nametext" indexed="true" stored="true"/>
    <field name="text" type="text" indexed="true" stored="false"/>
    <field name="subject" type="text" indexed="true" stored="true"/>
@@ -443,8 +443,8 @@
    <dynamicField name="*aa"  type="string"  indexed="true" stored="true"/>
    <dynamicField name="*aaa" type="integer" indexed="false" stored="true"/>
 
-   <!-- ignored becuase not stored or indexed -->
-   <dynamicField name="*_ignored" type="text" indexed="false" stored="false"/>
+   <!-- ignored because not stored or indexed -->
+   <dynamicField name="ignored_*" type="text" indexed="false" stored="false"/>
 
  </fields>
 

Modified: lucene/solr/trunk/example/solr/conf/schema.xml
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/example/solr/conf/schema.xml?rev=793953&r1=793952&r2=793953&view=diff
==============================================================================
--- lucene/solr/trunk/example/solr/conf/schema.xml (original)
+++ lucene/solr/trunk/example/solr/conf/schema.xml Tue Jul 14 15:53:05 2009
@@ -401,6 +401,9 @@
    <dynamicField name="*_d"  type="sdouble" indexed="true"  stored="true"/>
    <dynamicField name="*_dt" type="date"    indexed="true"  stored="true"/>
 
+   <dynamicField name="ignored_*" type="ignored"/>
+   <dynamicField name="attr_*" type="text" indexed="true" stored="true" 
multiValued="true"/>
+
    <dynamicField name="random*" type="random" />
 
    <!-- uncomment the following to ignore any fields that don't already match 
an existing 

Modified: lucene/solr/trunk/example/solr/conf/solrconfig.xml
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/example/solr/conf/solrconfig.xml?rev=793953&r1=793952&r2=793953&view=diff
==============================================================================
--- lucene/solr/trunk/example/solr/conf/solrconfig.xml (original)
+++ lucene/solr/trunk/example/solr/conf/solrconfig.xml Tue Jul 14 15:53:05 2009
@@ -640,14 +640,12 @@
     </arr>
   </requestHandler>
 
-<!--
-  <requestHandler name="/update/extract" 
class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
+  <requestHandler name="/update/extract" 
class="org.apache.solr.handler.extraction.ExtractingRequestHandler" 
startup="lazy">
     <lst name="defaults">
-      <str name="ext.map.Last-Modified">last_modified</str>
-      <bool name="ext.ignore.und.fl">true</bool>
+      <str name="uprefix">ignored_</str>
+      <str name="map.content">text</str>
     </lst>
   </requestHandler>
--->
 
 
   <!-- A component to return terms and document frequency of those terms.

svn commit: r793953 - in /lucene/solr/trunk: ./ contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ contrib/extraction/src/test/java/org/apache/solr/handler/ contrib/extraction/src/test/resources/solr/conf/ example/solr/conf/

Reply via email to