Hi all, I don't have much time to work on Droids right now, but I saw that Ryan committed the Tika patch, and I have been working on a better integration with Tika. Attached is the .java file with the new version of the LinkExtractor class. I changed a lot of things, so I think it is better to send it this way instead of as a diff.
I hope to have time next year to work more actively on this project. For now, I will help with whatever I can. Salu10.
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.droids.tika; import java.net.URI; import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.Set; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.droids.LinkTask; import org.apache.droids.api.Link; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; public class LinkExtractor extends DefaultHandler { protected final Log log = LogFactory.getLog(this.getClass()); /** * List of links */ private Collection<Link> links = new ArrayList<Link>(); /** * Map with the pair label-attribute for the accepted items */ private Map<String, String> elements; /** * Base url for host reference */ private Link base = null; /** * Set of URIs visited yet */ private Set<String> history = null; /** * The parsed link */ private URI link = null; @Override public void startDocument() throws SAXException { history = new HashSet<String>(); history.add(base.getURI().toString()); } @Override public void startElement(String uri, String loc, String raw, Attributes att) throws SAXException { Iterator<String> it = 
elements.keySet().iterator(); String elem, linkAtt; while (it.hasNext()) { elem = it.next(); linkAtt = elements.get(elem); if (elem.equalsIgnoreCase(loc) && att.getValue(linkAtt) != null) { link = getURI(att.getValue(linkAtt)); log.debug("Found element: " + elem + " with link: " + link); } } } @Override public void characters(char[] chars, int start, int length) throws SAXException { if (link != null) { addOutlinkURI(new StringBuilder().append(chars, start, length).toString()); link = null; } } @Override public void endDocument() throws SAXException { history = null; log.debug("Found " + links.size() + " outliks"); } public void addOutlinkURI(String value) { if (history == null) history = new HashSet<String>(); if (links == null) links = new ArrayList<Link>(); if (history.add(link.toString())) { links.add(new LinkTask(base, link, base.getDepth() + 1)); log.debug("Added outlink: " + link + " with depth: " + base.getDepth() + 1); } } public void setBase(Link base) { this.base = base; } public Collection<Link> getLinks() { return links; } public Map<String, String> getElements() { return elements; } public void setElements(Map<String, String> elements) { this.elements = elements; } private URI getURI(String target) { try { if (!target.toLowerCase().startsWith("javascript") && !target.contains(":/")) { return base.getURI().resolve(target.split("#")[0]); } else if (!target.toLowerCase().startsWith("javascript")) { return new URI(target.split("#")[0]); } } catch (Exception e) { log.error("URI not valid: " + target); } return null; } }
