Author: jukka
Date: Wed Mar 26 11:23:53 2008
New Revision: 641457
URL: http://svn.apache.org/viewvc?rev=641457&view=rev
Log:
TIKA-132: Refactor Excel extractor to parse per sheet and add hyperlink support
- Replace TikaExcelCell with a modular/extensible set of classes that
encapsulate the functionality of rendering the cell content to XHTML
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java?rev=641457&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java Wed Mar 26 11:23:53 2008
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Cell of content. Classes that implement this interface are used by
+ * Tika parsers (currently just the MS Excel parser) to keep track of
+ * individual pieces of content before they are rendered to the XHTML
+ * SAX event stream.
+ */
+public interface Cell {
+
+ /**
+ * Renders the content to the given XHTML SAX event stream.
+ *
+ * @param handler XHTML content handler that receives the rendered content
+ * @throws SAXException if rendering the content to the handler fails
+ */
+ void render(XHTMLContentHandler handler) throws SAXException;
+
+}
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java?rev=641457&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java Wed Mar 26 11:23:53 2008
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Base class for cell decorators: wraps another cell and delegates rendering to it.
+ */
+public class CellDecorator implements Cell {
+
+ private final Cell cell; // the wrapped cell that render() delegates to
+
+ public CellDecorator(Cell cell) {
+ this.cell = cell;
+ }
+
+ public void render(XHTMLContentHandler handler) throws SAXException {
+ cell.render(handler); // plain pass-through; subclasses such as LinkedCell add output around it
+ }
+
+}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=641457&r1=641456&r2=641457&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Wed Mar 26 11:23:53 2008
@@ -165,8 +165,8 @@
private boolean insideWorksheet = false;
- private SortedMap<Point, TikaExcelCell> currentSheet =
- new TreeMap<Point, TikaExcelCell>(new Comparator<Point> () {
+ private SortedMap<Point, Cell> currentSheet =
+ new TreeMap<Point, Cell>(new Comparator<Point> () {
public int compare(Point a, Point b) {
int diff = a.y - b.y;
if (diff == 0) {
@@ -256,9 +256,11 @@
// if (insideWorksheet) {
// int row = hyperlinkRecord.getFirstRow();
// short column = hyperlinkRecord.getFirstColumn();
- // TikaExcelCell cell = currentSheet.findCell(row, column);
+ // Point point = new Point(column, row);
+ // Cell cell = currentSheet.get(point);
// if (cell != null) {
- // cell.setHyperlink(hyperlinkRecord.getAddress());
+ // cell = new LinkedCell(cell, hyperlinkRecord.getAddress());
+ // currentSheet.put(point, cell);
// }
// }
// break;
@@ -323,7 +325,7 @@
if (text != null && text.length() > 0) {
currentSheet.put(
new Point(record.getColumn(), record.getRow()),
- new TikaExcelCell(text));
+ new TextCell(text));
}
}
@@ -347,7 +349,7 @@
int currentColumn = 1;
handler.startElement("tr");
handler.startElement("td");
- for (Map.Entry<Point, TikaExcelCell> entry : currentSheet.entrySet()) {
+ for (Map.Entry<Point, Cell> entry : currentSheet.entrySet()) {
while (currentRow < entry.getKey().y) {
handler.endElement("td");
handler.endElement("tr");
@@ -365,14 +367,7 @@
currentColumn++;
}
- TikaExcelCell cell = entry.getValue();
- if (cell.getHyperlink() != null) {
- handler.startElement("a", "href", cell.getHyperlink());
- handler.characters(cell.getText());
- handler.endElement("a");
- } else {
- handler.characters(cell.getText());
- }
+ entry.getValue().render(handler);
}
handler.endElement("td");
handler.endElement("tr");
@@ -383,54 +378,6 @@
handler.endElement("div");
handler.characters("\n");
}
- }
-
- // ======================================================================
-
- /**
- * Tika's excel cell representation.
- */
- private static class TikaExcelCell {
- private String text;
- private String hyperlink;
-
- /**
- * Construct a new cell.
- *
- * @param column The cell's column number
- * @param text The cell's text
- */
- TikaExcelCell(String text) {
- this.text = text;
- }
-
- /**
- * Return the cell's text.
- *
- * @return the cell's text
- */
- String getText() {
- return text;
- }
-
- /**
- * Return hyperlink address, if any
- *
- * @return the hyperlink address
- */
- String getHyperlink() {
- return hyperlink;
- }
-
- /**
- * Set the hyperlink address
- *
- * @param hyperlink the hyperlink address to set
- */
- void setHyperlink(String hyperlink) {
- this.hyperlink = hyperlink;
- }
-
}
}
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java?rev=641457&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java Wed Mar 26 11:23:53 2008
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Linked cell. Decorates another content cell by wrapping its output in an "a" element.
+ */
+public class LinkedCell extends CellDecorator {
+
+ private final String link; // value emitted as the "href" attribute
+
+ public LinkedCell(Cell cell, String link) {
+ super(cell);
+ this.link = link;
+ }
+
+ public void render(XHTMLContentHandler handler) throws SAXException {
+ handler.startElement("a", "href", link);
+ super.render(handler); // the decorated cell's content goes inside the link
+ handler.endElement("a");
+ }
+
+}
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java?rev=641457&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java Wed Mar 26 11:23:53 2008
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Text cell. Holds a string and renders it as character content.
+ */
+public class TextCell implements Cell {
+
+ private final String text; // the string passed to handler.characters()
+
+ public TextCell(String text) {
+ this.text = text;
+ }
+
+ public void render(XHTMLContentHandler handler) throws SAXException {
+ handler.characters(text);
+ }
+
+}