luetzkendorf    2005/04/04 06:46:26

  Modified:    src/share/org/apache/slide/extractor OfficeExtractor.java
  Log:
  reworked
   - labeled (userdefined) properties now can be extracted
   - extracted properties now can have namespaces
  
  Revision  Changes    Path
  1.5       +152 -24   
jakarta-slide/src/share/org/apache/slide/extractor/OfficeExtractor.java
  
  Index: OfficeExtractor.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-slide/src/share/org/apache/slide/extractor/OfficeExtractor.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- OfficeExtractor.java      14 Jan 2005 18:34:13 -0000      1.4
  +++ OfficeExtractor.java      4 Apr 2005 13:46:25 -0000       1.5
  @@ -1,7 +1,7 @@
   package org.apache.slide.extractor;
   
   import java.io.InputStream;
  -import java.util.ArrayList;
  +import java.util.Collections;
   import java.util.Enumeration;
   import java.util.HashMap;
   import java.util.Iterator;
  @@ -15,17 +15,86 @@
   import org.apache.poi.poifs.eventfilesystem.POIFSReader;
   import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
   import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
  +import org.apache.slide.common.PropertyName;
   import org.apache.slide.util.conf.Configurable;
   import org.apache.slide.util.conf.Configuration;
   import org.apache.slide.util.conf.ConfigurationException;
   
   /**
    * Property extractor for Microsoft office documents.
  + * 
  + * <p>This property extractor extracts properties from 
<code>SummaryInformation</code> and
  + * <code>DocumentSummaryInformation</code> headers of office documents.
  + * 
  + * <p>Sample configuration:
  + * <pre>
  + *   &lt;extractor classname="org.apache.slide.extractor.OfficeExtractor" 
uri="/files/docs/">
  + *     &lt;configuration>
  + *       &lt;instruction property="author" 
namespace="http://mycomp.com/namepsaces/webdav" summary-information="4" />
  + *       &lt;instruction property="application" 
namespace="http://mycomp.com/namepsaces/webdav" summary-information="18" />
  + *       &lt;instruction property="title" 
namespace="http://mycomp.com/namepsaces/webdav" summary-information="2" />
  + *       &lt;instruction property="category" 
namespace="http://mycomp.com/namepsaces/webdav" 
document-summary-information="2" />
  + *       &lt;instruction property="docid" 
namespace="http://mycomp.com/namepsaces/webdav" label="Document-ID" />
  + *     &lt;/configuration>
  + *   &lt;/extractor>
  + * </pre>
  + * The sample configuration
  + * <ul> 
  + * <li>maps the <em>author</em> info of office documents to the 
<code>author</code> 
  + * property. The author info can be found in the 
<code>SummaryInformation</code> header and
  + * has the <code>id</code> 4.
  + * <li>and maps the <em>category</em> entry of the 
<code>DocumentSummaryInformation</code> header, 
  + * which has the <code>id</code> 2 to the WebDAV property 
<code>category</code>.
  + * <li><code>SummaryInformation</code> headers can also contain "labeled" 
entries, e.g. for user
  + * defined metadata. In the sample the labeled entries with the label 
<code>Document-ID</code>
  + * will be mapped to the WebDAV-Property <code>docid</code>.
  + * </ul>
  + * All WebDAV properties in the sample will have the namespace 
  + * <code>http://mycomp.com/namepsaces/webdav</code>.
  + * 
  + * <p>The IDs in the <code>DocumentSummaryInformation</code> and 
<code>SummaryInformation</code>
  + * headers are somewhat mystical. Samples for 
<code>SummaryInformation</code> are:
  + * <pre>
  + *    1: codepage
  + *    2: title
  + *    3: subject
  + *    4: author
  + *    5: keywords
  + *    6: comments
  + *    7: template (e.g. "Normal.dot")
  + *    8: last author
  + *    9: revision number
  + *   11: last printing date
  + *   12: creation date
  + *   13: last saved date
  + *   14: number of pages
  + *   15: number of words
  + *   16: number of characters
  + *   18: application name (e.g. "Microsoft Word 9.0")
  + *   19: security
  + * </pre>
  + * Samples for <code>DocumentSummaryInformation</code> are:
  + * <pre>
  + *    1: codepage
  + *    2: category
  + *    5: number of lines
  + *    6: number of paragraphs
  + *   14: manager
  + *   15: company
  + * </pre>
    */
   public class OfficeExtractor extends AbstractPropertyExtractor implements 
Configurable {
  -     protected List instructions = new ArrayList();
  -     protected Map propertyMap = new HashMap();
  -     static final String CONTENT_TYPE_MS_OFFICE_ALL_CSV = 
MSWordExtractor.CONTENT_TYPE_WORD_ALL_CSV+","+MSExcelExtractor.CONTENT_TYPE_EXCEL_ALL_CSV+","+MSPowerPointExtractor.CONTENT_TYPE_POWERPOINT_ALL_CSV;
  +     // maps SummaryInformation IDs to PropertyNames 
  +     protected Map propertyMapSI = new HashMap();
  +     // maps DocumentSummaryInformation IDs to PropertyNames
  +     protected Map propertyMapDSI = new HashMap();
  +     // maps labeled properties to PropertyNames
  +     protected Map propertyMapLbl = new HashMap();
  +     
  +     static final String CONTENT_TYPE_MS_OFFICE_ALL_CSV = 
  +             MSWordExtractor.CONTENT_TYPE_WORD_ALL_CSV + "," +
  +             MSExcelExtractor.CONTENT_TYPE_EXCEL_ALL_CSV + "," +
  +             MSPowerPointExtractor.CONTENT_TYPE_POWERPOINT_ALL_CSV;
        
        public OfficeExtractor(String uri, String contentType, String 
namespace) {
                super(uri, contentType, namespace);
  @@ -38,17 +107,17 @@
                        r.registerListener(listener);
                        r.read(content);
                } catch (Exception e) {
  -                     throw new ExtractorException("Exception while 
extracting properties in OfficeExtractor");
  +                     throw new ExtractorException("Exception while 
extracting properties in OfficeExtractor: " + e);
                }
                return listener.getProperties();
        }
   
        class OfficePropertiesListener implements POIFSReaderListener {
   
  -             private HashMap properties = new HashMap();
  +             private HashMap extractedProperties = new HashMap();
   
                public Map getProperties() {
  -                             return properties;
  +                     return extractedProperties;
                }
   
                public void processPOIFSReaderEvent(POIFSReaderEvent event) {
  @@ -60,22 +129,46 @@
                        } catch (Exception ex) {
                                throw new RuntimeException("Property set stream 
\"" + event.getPath() + event.getName() + "\": " + ex);
                        }
  -                     String eventName = event.getName().trim();
  -                     final long sectionCount = ps.getSectionCount();
  +
  +                     Map idMap = null;
  +                     
  +                     if (ps.isDocumentSummaryInformation()) {
  +                             idMap = propertyMapDSI;
  +                     } else if (ps.isSummaryInformation()) {
  +                             idMap = propertyMapSI;
  +                     } else {
  +                             // can this happen?
  +                             idMap = Collections.EMPTY_MAP;
  +                     }
  +                     
                        List sections = ps.getSections();
  -                     int nr = 0;
  +
                        for (Iterator i = sections.iterator(); i.hasNext();) {
                                Section sec = (Section) i.next();
  -                             int propertyCount = sec.getPropertyCount();
  -                             Property[] props = sec.getProperties();
  -                             for (int i2 = 0; i2 < props.length; i2++) {
  -                                     Property p = props[i2];
  -                                     int id = p.getID();
  -                                     long type = p.getType();
  -                                     Object value = p.getValue();
  -                                     String key = eventName + "-" + nr + "-" 
+ id; 
  -                                     if ( propertyMap.containsKey(key) ) {
  -                                             
properties.put(propertyMap.get(key), value);
  +                             System.out.println("section: " + sec);
  +                             
  +                             if (sec.getProperty(0) == null) {
  +                                     for(Iterator j = 
idMap.entrySet().iterator(); j.hasNext();) {
  +                                             Map.Entry e = 
(Map.Entry)j.next();
  +                                             
  +                                             Object propertyValue = 
sec.getProperty(((Integer)e.getKey()).intValue());
  +                                             if (propertyValue != null) {
  +                                                     
//System.out.println("\t" + e.getValue() + "=" + propertyValue);
  +                                                     
extractedProperties.put(e.getValue(), propertyValue);
  +                                             }
  +                                     }
  +                             } else {
  +                                     Map dict = (Map)sec.getProperty(0);
  +                                     // this section has a dictionary
  +                                     Property property[] = 
sec.getProperties();
  +                                     for(int j = 0; j < property.length; 
j++) {
  +                                             //String label = 
sec.getPIDString(property[j].getID()); TODO why doesn't this work
  +                                             String label = 
(String)dict.get(new Long(property[j].getID()));
  +                                             PropertyName slideProperty = 
(PropertyName)propertyMapLbl.get(label);
  +                                             if (slideProperty != null) {
  +                                                     
//System.out.println("\t" + slideProperty + "=" + property[j].getValue());
  +                                                     
extractedProperties.put(slideProperty, property[j].getValue());
  +                                             }
                                        }
                                }
                        }
  @@ -85,10 +178,45 @@
        public void configure(Configuration configuration) throws 
ConfigurationException {
           Enumeration instructions = 
configuration.getConfigurations("instruction");
           while (instructions.hasMoreElements()) {
  -            Configuration extract = 
(Configuration)instructions.nextElement();
  -            String property = extract.getAttribute("property");
  -            String id = extract.getAttribute("id");
  -                     propertyMap.put(id, property);
  +            Configuration instruction = 
(Configuration)instructions.nextElement();
  +            PropertyName propertyName = PropertyName.getPropertyName(
  +                     instruction.getAttribute("property"),
  +                                     instruction.getAttribute("namespace", 
"DAV:"));
  +            
  +            try {
  +                 String id = instruction.getAttribute("summary-information", 
null);
  +                 if (id != null) {
  +                     this.propertyMapSI.put(Integer.valueOf(id), 
propertyName);
  +                     continue;
  +                 }
  +                 
  +                 id = 
instruction.getAttribute("document-summary-information", null);
  +                 if (id != null) {
  +                     this.propertyMapDSI.put(Integer.valueOf(id), 
propertyName);
  +                     continue;
  +                 }
  +                 
  +                 id = instruction.getAttribute("label", null);
  +                 if (id != null) {
  +                     this.propertyMapLbl.put(id, propertyName);
  +                     continue;
  +                 }
  +                 
  +                 // for backward compatibility
  +                 // old style id attributes like SummaryInformation-0-4
  +                 id = instruction.getAttribute("id", null);
  +                 if (id != null) {
  +                     Integer intId = 
Integer.valueOf(id.substring(id.lastIndexOf('-')+1));
  +                     if (id.startsWith("SummaryInformation")) {
  +                             this.propertyMapSI.put(intId, propertyName);
  +                     }
  +                     if (id.startsWith("DocumentSummaryInformation")) {
  +                             this.propertyMapDSI.put(intId, propertyName);
  +                     }
  +                 }
  +            } catch(NumberFormatException e) {
  +             throw new ConfigurationException("Invalid instruction: " + e, 
instruction);
  +            }
           }
        }
        
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to