Author: jukka
Date: Thu Jun 25 14:00:09 2009
New Revision: 788360
URL: http://svn.apache.org/viewvc?rev=788360&view=rev
Log:
TIKA-247: parse language and category from MS Office properties
Patch contributed by Daan de Wit.
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=788360&r1=788359&r2=788360&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
Thu Jun 25 14:00:09 2009
@@ -22,6 +22,7 @@
import java.util.Iterator;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
+import org.apache.poi.hpsf.CustomProperties;
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.MarkUnsupportedException;
import org.apache.poi.hpsf.NoPropertySetStreamException;
@@ -147,6 +148,19 @@
private void parse(DocumentSummaryInformation summary, Metadata metadata) {
set(metadata, Metadata.COMPANY, summary.getCompany());
set(metadata, Metadata.MANAGER, summary.getManager());
+ set(metadata, Metadata.LANGUAGE, getLanguage(summary));
+ set(metadata, Metadata.CATEGORY, summary.getCategory());
+ }
+
+ private String getLanguage(DocumentSummaryInformation summary) {
+ CustomProperties customProperties = summary.getCustomProperties();
+ if (customProperties != null) {
+ Object value = customProperties.get("Language");
+ if (value instanceof String) {
+ return (String) value;
+ }
+ }
+ return null;
}
private void setType(Metadata metadata, String type) {