Author: jukka
Date: Tue Dec 2 15:34:44 2008
New Revision: 722673
URL: http://svn.apache.org/viewvc?rev=722673&view=rev
Log:
TIKA-172: New Open Document Parser that emits structured XHTML content
Use spaces instead of tabs for indentation.
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/NSNormalizerContentHandler.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeContentParser.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeMetaParser.java
lucene/tika/trunk/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java
lucene/tika/trunk/src/main/java/org/apache/tika/sax/TextContentHandler.java
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/NSNormalizerContentHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/NSNormalizerContentHandler.java?rev=722673&r1=722672&r2=722673&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/NSNormalizerContentHandler.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/NSNormalizerContentHandler.java Tue Dec 2 15:34:44 2008
@@ -37,50 +37,50 @@
*/
public class NSNormalizerContentHandler extends ContentHandlerDecorator {
- public NSNormalizerContentHandler(ContentHandler handler) {
- super(handler);
- }
-
- private final String mapOldNS(String ns) {
- if (ns==null) return null;
- if (ns.startsWith("http://openoffice.org/2000/"))
- ns="urn:oasis:names:tc:opendocument:xmlns:"+ns.substring(27)+":1.0";
- return ns;
- }
-
- @Override
- public void startElement(String namespaceURI, String localName, String
qName, Attributes atts) throws SAXException {
- AttributesImpl natts = new AttributesImpl();
- for (int i = 0; i < atts.getLength(); i++) {
- natts.addAttribute(
- mapOldNS(atts.getURI(i)), atts.getLocalName(i),
atts.getQName(i),
- atts.getType(i), atts.getValue(i)
- );
- }
- super.startElement(mapOldNS(namespaceURI),localName,qName,atts);
- }
-
- @Override
- public void endElement(String namespaceURI, String localName, String
qName) throws SAXException {
- super.endElement(mapOldNS(namespaceURI),localName,qName);
- }
-
- @Override
- public void startPrefixMapping(String prefix, String uri) throws
SAXException {
- super.startPrefixMapping(prefix,mapOldNS(uri));
- }
-
- /** do not load any DTDs (may be requested by parser). Fake the DTD by
returning a empty string as InputSource */
- @Override
- public InputSource resolveEntity(String publicId, String systemId)
throws IOException,SAXException {
- if (
- "-//OpenOffice.org//DTD OfficeDocument 1.0//EN".equals(publicId) ||
- (systemId!=null && systemId.toLowerCase().endsWith(".dtd"))
- ) {
- return new InputSource(new StringReader(""));
- } else {
- return super.resolveEntity(publicId,systemId);
- }
- }
-
+ public NSNormalizerContentHandler(ContentHandler handler) {
+ super(handler);
+ }
+
+ private final String mapOldNS(String ns) {
+ if (ns==null) return null;
+ if (ns.startsWith("http://openoffice.org/2000/"))
+ ns="urn:oasis:names:tc:opendocument:xmlns:"+ns.substring(27)+":1.0";
+ return ns;
+ }
+
+ @Override
+ public void startElement(String namespaceURI, String localName, String
qName, Attributes atts) throws SAXException {
+ AttributesImpl natts = new AttributesImpl();
+ for (int i = 0; i < atts.getLength(); i++) {
+ natts.addAttribute(
+ mapOldNS(atts.getURI(i)), atts.getLocalName(i),
atts.getQName(i),
+ atts.getType(i), atts.getValue(i)
+ );
+ }
+ super.startElement(mapOldNS(namespaceURI),localName,qName,atts);
+ }
+
+ @Override
+ public void endElement(String namespaceURI, String localName, String
qName) throws SAXException {
+ super.endElement(mapOldNS(namespaceURI),localName,qName);
+ }
+
+ @Override
+ public void startPrefixMapping(String prefix, String uri) throws
SAXException {
+ super.startPrefixMapping(prefix,mapOldNS(uri));
+ }
+
+ /** do not load any DTDs (may be requested by parser). Fake the DTD by
returning a empty string as InputSource */
+ @Override
+ public InputSource resolveEntity(String publicId, String systemId) throws
IOException,SAXException {
+ if (
+ "-//OpenOffice.org//DTD OfficeDocument 1.0//EN".equals(publicId) ||
+ (systemId!=null && systemId.toLowerCase().endsWith(".dtd"))
+ ) {
+ return new InputSource(new StringReader(""));
+ } else {
+ return super.resolveEntity(publicId,systemId);
+ }
+ }
+
}
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeContentParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeContentParser.java?rev=722673&r1=722672&r2=722673&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeContentParser.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeContentParser.java Tue Dec 2 15:34:44 2008
@@ -47,146 +47,146 @@
*/
public class OpenOfficeContentParser implements Parser {
- public static final String
TEXT_NS="urn:oasis:names:tc:opendocument:xmlns:text:1.0";
- public static final String
TABLE_NS="urn:oasis:names:tc:opendocument:xmlns:table:1.0";
- public static final String XLINK_NS="http://www.w3.org/1999/xlink";
-
- protected static final char[] TAB=new char[]{'\t'};
-
- /**
- * Mappings between OpenDocument tag names and XHTML tag names
(including attributes).
- * All other tag names/attributes are ignored and left out from event
stream.
- */
- private static final HashMap<QName,TargetElement> MAPPINGS=new
HashMap<QName,TargetElement>();
- static {
- // general mappings of text:-tags
- MAPPINGS.put(new QName(TEXT_NS,"p"), new
TargetElement(XHTML,"p"));
- // text:h-tags are mapped specifically in
startElement/endElement
- MAPPINGS.put(new QName(TEXT_NS,"line-break"), new
TargetElement(XHTML,"br"));
- MAPPINGS.put(new QName(TEXT_NS,"list"), new
TargetElement(XHTML,"ul"));
- MAPPINGS.put(new QName(TEXT_NS,"list-item"), new
TargetElement(XHTML,"li"));
- MAPPINGS.put(new QName(TEXT_NS,"note"), new
TargetElement(XHTML,"div"));
- MAPPINGS.put(new QName(TEXT_NS,"span"), new
TargetElement(XHTML,"span"));
- MAPPINGS.put(new QName(TEXT_NS,"a"),new TargetElement(XHTML,"a",
- Collections.singletonMap(new QName(XLINK_NS,"href"),
new QName("href"))
- ));
-
- // create HTML tables from table:-tags
- MAPPINGS.put(new QName(TABLE_NS,"table"), new
TargetElement(XHTML,"table"));
- // repeating of rows is ignored; for columns, see below!
- MAPPINGS.put(new QName(TABLE_NS,"table-row"), new
TargetElement(XHTML,"tr"));
- // special mapping for rowspan/colspan attributes
- final HashMap<QName,QName> tableCellAttsMapping=new
HashMap<QName,QName>();
- tableCellAttsMapping.put(new
QName(TABLE_NS,"number-columns-spanned"),new QName("colspan"));
- tableCellAttsMapping.put(new
QName(TABLE_NS,"number-rows-spanned"),new QName("rowspan"));
- /* TODO: The following is not correct, the cell should be
repeated not spanned!
- * Code generates a HTML cell, spanning all repeated columns,
to make the cell look correct.
- * Problems may occur when both spanning and repeating is
given, which is not allowed by spec.
- * Cell spanning instead of repeating is not a problem,
because OpenOffice uses it
- * only for empty cells.
- */
- tableCellAttsMapping.put(new
QName(TABLE_NS,"number-columns-repeated"),new QName("colspan"));
- MAPPINGS.put(new QName(TABLE_NS,"table-cell"), new
TargetElement(XHTML,"td",tableCellAttsMapping));
- }
-
- public void parse(InputStream stream, ContentHandler handler, Metadata
metadata)
- throws IOException, SAXException, TikaException {
-
- final XHTMLContentHandler xhtml = new
XHTMLContentHandler(handler,metadata);
- final DefaultHandler dh = new
ElementMappingContentHandler(xhtml, MAPPINGS) {
- private final BitSet textNodeStack=new BitSet();
- private int nodeDepth=0,completelyFiltered=0;
- private Stack<String> headingStack=new Stack<String>();
-
- @Override
- public void characters(char[] ch, int start, int
length) throws SAXException {
- // only forward content of tags from
text:-namespace
- if (completelyFiltered==0 && nodeDepth>0 &&
textNodeStack.get(nodeDepth-1))
- super.characters(ch,start,length);
- }
-
- // helper for checking tags which need complete
filtering (with sub-tags)
- private final boolean needsCompleteFiltering(String
namespaceURI, String localName) {
- return (
- (TEXT_NS.equals(namespaceURI) &&
(localName.endsWith("-template") || localName.endsWith("-style"))) ||
- (TABLE_NS.equals(namespaceURI) &&
"covered-table-cell".equals(localName))
- );
- }
-
- // map the heading level to <hX> HTML tags
- private final String getXHTMLHeaderTagName(Attributes
atts) {
- final String
depthStr=atts.getValue(TEXT_NS,"outline-level");
- if (depthStr==null) return "h1";
- int depth=Integer.parseInt(depthStr);
- if (depth>6) depth=6;
- if (depth<1) depth=1;
- return "h"+depth;
- }
-
- @Override
- public void startElement(String namespaceURI, String
localName, String qName, Attributes atts) throws SAXException {
- // keep track of current node type. If it is a text node, a bit at the current depth ist set in textNodeStack.
- // characters() checks the top bit to determine, if the actual node is a text node to print out
- // nodeDepth contains the depth of the current node and also marks top of stack.
- assert nodeDepth>=0;
- textNodeStack.set(nodeDepth++,
TEXT_NS.equals(namespaceURI));
- // filter *all* content of some tags
- assert completelyFiltered>=0;
- if
(needsCompleteFiltering(namespaceURI,localName)) completelyFiltered++;
- // call next handler if no filtering
- if (completelyFiltered==0) {
- // special handling of text:h, that are
directly passed to xhtml handler
- if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
- xhtml.startElement(headingStack.push(getXHTMLHeaderTagName(atts)));
- } else {
- super.startElement(namespaceURI,localName,qName,atts);
- }
- }
- }
-
- @Override
- public void endElement(String namespaceURI, String
localName, String qName) throws SAXException {
- // call next handler if no filtering
- if (completelyFiltered==0) {
- // special handling of text:h, that are
directly passed to xhtml handler
- if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
- xhtml.endElement(headingStack.pop());
- } else {
- super.endElement(namespaceURI,localName,qName);
- }
- // special handling of tabulators
- if (TEXT_NS.equals(namespaceURI) && ("tab-stop".equals(localName) || "tab".equals(localName)))
- this.characters(TAB,0,TAB.length);
- }
- // revert filter for *all* content of some tags
- if
(needsCompleteFiltering(namespaceURI,localName)) completelyFiltered--;
- assert completelyFiltered>=0;
- // reduce current node depth
- nodeDepth--;
- assert nodeDepth>=0;
- }
-
- @Override
- public void startPrefixMapping(String prefix, String
uri) throws SAXException {
- // remove prefix mappings as they should not
occur in XHTML
- }
-
- @Override
- public void endPrefixMapping(String prefix) throws
SAXException {
- // remove prefix mappings as they should not
occur in XHTML
- }
-
- };
-
- try {
- SAXParserFactory factory =
SAXParserFactory.newInstance();
- factory.setValidating(false);
- factory.setNamespaceAware(true);
- SAXParser parser = factory.newSAXParser();
- parser.parse(new CloseShieldInputStream(stream),new
NSNormalizerContentHandler(dh));
- } catch (ParserConfigurationException e) {
- throw new TikaException("XML parser configuration error", e);
- }
- }
+ public static final String
TEXT_NS="urn:oasis:names:tc:opendocument:xmlns:text:1.0";
+ public static final String
TABLE_NS="urn:oasis:names:tc:opendocument:xmlns:table:1.0";
+ public static final String XLINK_NS="http://www.w3.org/1999/xlink";
+
+ protected static final char[] TAB=new char[]{'\t'};
+
+ /**
+ * Mappings between OpenDocument tag names and XHTML tag names (including
attributes).
+ * All other tag names/attributes are ignored and left out from event
stream.
+ */
+ private static final HashMap<QName,TargetElement> MAPPINGS=new
HashMap<QName,TargetElement>();
+ static {
+ // general mappings of text:-tags
+ MAPPINGS.put(new QName(TEXT_NS,"p"), new TargetElement(XHTML,"p"));
+ // text:h-tags are mapped specifically in startElement/endElement
+ MAPPINGS.put(new QName(TEXT_NS,"line-break"), new
TargetElement(XHTML,"br"));
+ MAPPINGS.put(new QName(TEXT_NS,"list"), new TargetElement(XHTML,"ul"));
+ MAPPINGS.put(new QName(TEXT_NS,"list-item"), new
TargetElement(XHTML,"li"));
+ MAPPINGS.put(new QName(TEXT_NS,"note"), new
TargetElement(XHTML,"div"));
+ MAPPINGS.put(new QName(TEXT_NS,"span"), new
TargetElement(XHTML,"span"));
+ MAPPINGS.put(new QName(TEXT_NS,"a"),new TargetElement(XHTML,"a",
+ Collections.singletonMap(new QName(XLINK_NS,"href"), new
QName("href"))
+ ));
+
+ // create HTML tables from table:-tags
+ MAPPINGS.put(new QName(TABLE_NS,"table"), new
TargetElement(XHTML,"table"));
+ // repeating of rows is ignored; for columns, see below!
+ MAPPINGS.put(new QName(TABLE_NS,"table-row"), new
TargetElement(XHTML,"tr"));
+ // special mapping for rowspan/colspan attributes
+ final HashMap<QName,QName> tableCellAttsMapping=new
HashMap<QName,QName>();
+ tableCellAttsMapping.put(new
QName(TABLE_NS,"number-columns-spanned"),new QName("colspan"));
+ tableCellAttsMapping.put(new QName(TABLE_NS,"number-rows-spanned"),new
QName("rowspan"));
+ /* TODO: The following is not correct, the cell should be repeated not
spanned!
+ * Code generates a HTML cell, spanning all repeated columns, to make
the cell look correct.
+ * Problems may occur when both spanning and repeating is given, which
is not allowed by spec.
+ * Cell spanning instead of repeating is not a problem, because
OpenOffice uses it
+ * only for empty cells.
+ */
+ tableCellAttsMapping.put(new
QName(TABLE_NS,"number-columns-repeated"),new QName("colspan"));
+ MAPPINGS.put(new QName(TABLE_NS,"table-cell"), new
TargetElement(XHTML,"td",tableCellAttsMapping));
+ }
+
+ public void parse(InputStream stream, ContentHandler handler, Metadata
metadata)
+ throws IOException, SAXException, TikaException {
+
+ final XHTMLContentHandler xhtml = new
XHTMLContentHandler(handler,metadata);
+ final DefaultHandler dh = new ElementMappingContentHandler(xhtml,
MAPPINGS) {
+ private final BitSet textNodeStack=new BitSet();
+ private int nodeDepth=0,completelyFiltered=0;
+ private Stack<String> headingStack=new Stack<String>();
+
+ @Override
+ public void characters(char[] ch, int start, int length) throws
SAXException {
+ // only forward content of tags from text:-namespace
+ if (completelyFiltered==0 && nodeDepth>0 &&
textNodeStack.get(nodeDepth-1))
+ super.characters(ch,start,length);
+ }
+
+ // helper for checking tags which need complete filtering (with
sub-tags)
+ private final boolean needsCompleteFiltering(String namespaceURI,
String localName) {
+ return (
+ (TEXT_NS.equals(namespaceURI) &&
(localName.endsWith("-template") || localName.endsWith("-style"))) ||
+ (TABLE_NS.equals(namespaceURI) &&
"covered-table-cell".equals(localName))
+ );
+ }
+
+ // map the heading level to <hX> HTML tags
+ private final String getXHTMLHeaderTagName(Attributes atts) {
+ final String depthStr=atts.getValue(TEXT_NS,"outline-level");
+ if (depthStr==null) return "h1";
+ int depth=Integer.parseInt(depthStr);
+ if (depth>6) depth=6;
+ if (depth<1) depth=1;
+ return "h"+depth;
+ }
+
+ @Override
+ public void startElement(String namespaceURI, String localName,
String qName, Attributes atts) throws SAXException {
+ // keep track of current node type. If it is a text node, a bit at the current depth ist set in textNodeStack.
+ // characters() checks the top bit to determine, if the actual node is a text node to print out
+ // nodeDepth contains the depth of the current node and also marks top of stack.
+ assert nodeDepth>=0;
+ textNodeStack.set(nodeDepth++, TEXT_NS.equals(namespaceURI));
+ // filter *all* content of some tags
+ assert completelyFiltered>=0;
+ if (needsCompleteFiltering(namespaceURI,localName))
completelyFiltered++;
+ // call next handler if no filtering
+ if (completelyFiltered==0) {
+ // special handling of text:h, that are directly passed to
xhtml handler
+ if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
+ xhtml.startElement(headingStack.push(getXHTMLHeaderTagName(atts)));
+ } else {
+ super.startElement(namespaceURI,localName,qName,atts);
+ }
+ }
+ }
+
+ @Override
+ public void endElement(String namespaceURI, String localName,
String qName) throws SAXException {
+ // call next handler if no filtering
+ if (completelyFiltered==0) {
+ // special handling of text:h, that are directly passed to
xhtml handler
+ if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
+ xhtml.endElement(headingStack.pop());
+ } else {
+ super.endElement(namespaceURI,localName,qName);
+ }
+ // special handling of tabulators
+ if (TEXT_NS.equals(namespaceURI) &&
("tab-stop".equals(localName) || "tab".equals(localName)))
+ this.characters(TAB,0,TAB.length);
+ }
+ // revert filter for *all* content of some tags
+ if (needsCompleteFiltering(namespaceURI,localName))
completelyFiltered--;
+ assert completelyFiltered>=0;
+ // reduce current node depth
+ nodeDepth--;
+ assert nodeDepth>=0;
+ }
+
+ @Override
+ public void startPrefixMapping(String prefix, String uri) throws
SAXException {
+ // remove prefix mappings as they should not occur in XHTML
+ }
+
+ @Override
+ public void endPrefixMapping(String prefix) throws SAXException {
+ // remove prefix mappings as they should not occur in XHTML
+ }
+
+ };
+
+ try {
+ SAXParserFactory factory = SAXParserFactory.newInstance();
+ factory.setValidating(false);
+ factory.setNamespaceAware(true);
+ SAXParser parser = factory.newSAXParser();
+ parser.parse(new CloseShieldInputStream(stream),new
NSNormalizerContentHandler(dh));
+ } catch (ParserConfigurationException e) {
+ throw new TikaException("XML parser configuration error", e);
+ }
+ }
}
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeMetaParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeMetaParser.java?rev=722673&r1=722672&r2=722673&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeMetaParser.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeMetaParser.java Tue Dec 2 15:34:44 2008
@@ -65,7 +65,7 @@
dh = getStatistic(dh, md, "nbPara", "paragraph-count");
dh = getStatistic(dh, md, "nbWord", "word-count");
dh = getStatistic(dh, md, "nbCharacter", "character-count");
- dh = new NSNormalizerContentHandler(dh);
+ dh = new NSNormalizerContentHandler(dh);
return dh;
}
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java?rev=722673&r1=722672&r2=722673&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java Tue Dec 2 15:34:44 2008
@@ -34,82 +34,82 @@
*/
public class ElementMappingContentHandler extends ContentHandlerDecorator {
- private final Map<QName,TargetElement> mappings;
+ private final Map<QName,TargetElement> mappings;
+
+ public ElementMappingContentHandler(ContentHandler handler,
Map<QName,TargetElement> mappings) {
+ super(handler);
+ this.mappings=mappings;
+ }
+
+ @Override
+ public void startElement(String namespaceURI, String localName, String
qName, Attributes atts) throws SAXException {
+ final TargetElement mapping=mappings.get(new
QName(namespaceURI,localName));
+ if (mapping!=null) {
+ final QName tag=mapping.getMappedTagName();
+ super.startElement(tag.getNamespaceURI(),tag.getLocalPart(),getQNameAsString(tag),mapping.mapAttributes(atts));
+ }
+ }
+
+ @Override
+ public void endElement(String namespaceURI, String localName, String
qName) throws SAXException {
+ final TargetElement mapping=mappings.get(new
QName(namespaceURI,localName));
+ if (mapping!=null) {
+ final QName tag=mapping.getMappedTagName();
+ super.endElement(tag.getNamespaceURI(),tag.getLocalPart(),getQNameAsString(tag));
+ }
+ }
+
+ protected static final String getQNameAsString(final QName qname) {
+ final StringBuilder qn=new StringBuilder(qname.getPrefix());
+ if (qn.length()>0) qn.append(':');
+ return qn.append(qname.getLocalPart()).toString();
+ }
+
+ public static class TargetElement {
+
+ /** Creates an TargetElement, attributes of this element will be
mapped as specified */
+ public TargetElement(QName mappedTagName, Map<QName,QName>
attributesMapping) {
+ this.mappedTagName=mappedTagName;
+ this.attributesMapping=attributesMapping;
+ }
+
+ /** A shortcut that automatically creates the QName object */
+ public TargetElement(String mappedTagURI, String mappedTagLocalName,
Map<QName,QName> attributesMapping) {
+ this(new QName(mappedTagURI,mappedTagLocalName),
attributesMapping);
+ }
+
+ /** Creates an TargetElement with no attributes, all attributes will
be deleted from SAX stream */
+ public TargetElement(QName mappedTagName) {
+ this(mappedTagName, Collections.<QName,QName>emptyMap());
+ }
+
+ /** A shortcut that automatically creates the QName object */
+ public TargetElement(String mappedTagURI, String mappedTagLocalName) {
+ this(mappedTagURI, mappedTagLocalName,
Collections.<QName,QName>emptyMap());
+ }
+
+ public QName getMappedTagName() {
+ return mappedTagName;
+ }
+
+ public Map<QName,QName> getAttributesMapping() {
+ return attributesMapping;
+ }
+
+ public Attributes mapAttributes(final Attributes atts) {
+ final AttributesImpl natts = new AttributesImpl();
+ for (int i = 0; i < atts.getLength(); i++) {
+ QName name=attributesMapping.get(new QName(atts.getURI(i),
atts.getLocalName(i)));
+ if (name!=null) natts.addAttribute(
+ name.getNamespaceURI(), name.getLocalPart(),
getQNameAsString(name),
+ atts.getType(i), atts.getValue(i)
+ );
+ }
+ return natts;
+ }
+
+ private final QName mappedTagName;
+ private final Map<QName,QName> attributesMapping;
+ }
- public ElementMappingContentHandler(ContentHandler handler,
Map<QName,TargetElement> mappings) {
- super(handler);
- this.mappings=mappings;
- }
-
- @Override
- public void startElement(String namespaceURI, String localName, String
qName, Attributes atts) throws SAXException {
- final TargetElement mapping=mappings.get(new
QName(namespaceURI,localName));
- if (mapping!=null) {
- final QName tag=mapping.getMappedTagName();
- super.startElement(tag.getNamespaceURI(),tag.getLocalPart(),getQNameAsString(tag),mapping.mapAttributes(atts));
- }
- }
-
- @Override
- public void endElement(String namespaceURI, String localName, String
qName) throws SAXException {
- final TargetElement mapping=mappings.get(new
QName(namespaceURI,localName));
- if (mapping!=null) {
- final QName tag=mapping.getMappedTagName();
- super.endElement(tag.getNamespaceURI(),tag.getLocalPart(),getQNameAsString(tag));
- }
- }
-
- protected static final String getQNameAsString(final QName qname) {
- final StringBuilder qn=new StringBuilder(qname.getPrefix());
- if (qn.length()>0) qn.append(':');
- return qn.append(qname.getLocalPart()).toString();
- }
-
- public static class TargetElement {
-
- /** Creates an TargetElement, attributes of this element will
be mapped as specified */
- public TargetElement(QName mappedTagName, Map<QName,QName>
attributesMapping) {
- this.mappedTagName=mappedTagName;
- this.attributesMapping=attributesMapping;
- }
-
- /** A shortcut that automatically creates the QName object */
- public TargetElement(String mappedTagURI, String
mappedTagLocalName, Map<QName,QName> attributesMapping) {
- this(new QName(mappedTagURI,mappedTagLocalName),
attributesMapping);
- }
-
- /** Creates an TargetElement with no attributes, all attributes
will be deleted from SAX stream */
- public TargetElement(QName mappedTagName) {
- this(mappedTagName,
Collections.<QName,QName>emptyMap());
- }
-
- /** A shortcut that automatically creates the QName object */
- public TargetElement(String mappedTagURI, String
mappedTagLocalName) {
- this(mappedTagURI, mappedTagLocalName,
Collections.<QName,QName>emptyMap());
- }
-
- public QName getMappedTagName() {
- return mappedTagName;
- }
-
- public Map<QName,QName> getAttributesMapping() {
- return attributesMapping;
- }
-
- public Attributes mapAttributes(final Attributes atts) {
- final AttributesImpl natts = new AttributesImpl();
- for (int i = 0; i < atts.getLength(); i++) {
- QName name=attributesMapping.get(new
QName(atts.getURI(i), atts.getLocalName(i)));
- if (name!=null) natts.addAttribute(
- name.getNamespaceURI(),
name.getLocalPart(), getQNameAsString(name),
- atts.getType(i), atts.getValue(i)
- );
- }
- return natts;
- }
-
- private final QName mappedTagName;
- private final Map<QName,QName> attributesMapping;
- }
-
}
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/sax/TextContentHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/TextContentHandler.java?rev=722673&r1=722672&r2=722673&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/sax/TextContentHandler.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/sax/TextContentHandler.java Tue Dec 2 15:34:44 2008
@@ -47,7 +47,7 @@
}
@Override
- public String toString() {
+ public String toString() {
return delegate.toString();
}