Author: jerome Date: Fri Aug 19 14:15:02 2005 New Revision: 233559 URL: http://svn.apache.org/viewcvs?rev=233559&view=rev Log: * Add utility to extract urls from plain text (Stephan Strittmatter) * Uses the OutlinkExtractor in parse plugins PDF, MSWord, Text, RTF, Ext
Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java (with props) lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java (with props) Modified: lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?rev=233559&view=auto ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java Fri Aug 19 14:15:02 2005 @@ -0,0 +1,227 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.parse; + +import java.util.ArrayList; +import java.util.List; +import java.util.logging.Logger; + +import org.apache.nutch.util.LogFormatter; +import org.apache.oro.text.regex.MatchResult; +import org.apache.oro.text.regex.Pattern; +import org.apache.oro.text.regex.PatternCompiler; +import org.apache.oro.text.regex.PatternMatcher; +import org.apache.oro.text.regex.PatternMatcherInput; +import org.apache.oro.text.regex.Perl5Compiler; +import org.apache.oro.text.regex.Perl5Matcher; + +/** + * Extractor to extract {@link org.apache.nutch.parse.Outlink}s + * / URLs from plain text using Regular Expressions. + * + * @see <a + * href="http://wiki.java.net/bin/view/Javapedia/RegularExpressions">Comparison + * of different regexp-Implementations </a> + * @see <a href="http://regex.info/java.html">Overview about Java Regexp APIs + * </a> + * + * @author Stephan Strittmatter - http://www.sybit.de + * @version 1.0 + * @since 0.7 + */ +public class OutlinkExtractor { + private static final Logger LOG = LogFormatter + .getLogger(OutlinkExtractor.class.getName()); + + /** + * Regex pattern to get URLs within a plain text. + * + * @see <a + * href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html + * </a> + */ + private static final String URL_PATTERN = + "([A-Za-z][A-Za-z0-9+.-]+:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?)"; + + /** + * Extracts <code>Outlink</code> from given plain text. + * + * @param plainText the plain text from wich URLs should be extracted. 
+ * + * @return Array of <code>Outlink</code>s within found in plainText + */ + public static Outlink[] getOutlinks(final String plainText) { + return OutlinkExtractor.getOutlinks(plainText, ""); + } + + /** + * Extracts <code>Outlink</code> from given plain text and adds anchor + * to the extracted <code>Outlink</code>s + * + * @param plainText the plain text from wich URLs should be extracted. + * @param anchor the anchor of the url + * + * @return Array of <code>Outlink</code>s within found in plainText + */ + public static Outlink[] getOutlinks(final String plainText, String anchor) { + + final List outlinks = new ArrayList(); + + try { + final PatternCompiler cp = new Perl5Compiler(); + final Pattern pattern = cp.compile(URL_PATTERN, + Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK + | Perl5Compiler.MULTILINE_MASK); + final PatternMatcher matcher = new Perl5Matcher(); + + final PatternMatcherInput input = new PatternMatcherInput(plainText); + + MatchResult result; + String url; + + //loop the matches + while (matcher.contains(input, pattern)) { + result = matcher.getMatch(); + url = result.group(0); + outlinks.add(new Outlink(url, anchor)); + } + } catch (Exception ex) { + // if it is a malformed URL we just throw it away and continue with + // extraction. + LOG.throwing(OutlinkExtractor.class.getName(), "getOutlinks", ex); + } + + final Outlink[] retval; + + //create array of the Outlinks + if (outlinks != null && outlinks.size() > 0) { + retval = (Outlink[]) outlinks.toArray(new Outlink[0]); + } else { + retval = new Outlink[0]; + } + + return retval; + } + + + /** + * Extracts outlinks from a plain text. <br /> + * This Method takes the Jakarta Regexp API. 
+ * + * @param plainText + * + * @return Array of <code>Outlink</code> s within found in plainText + * @deprecated only for tests + */ + private Outlink[] getOutlinksJakartaRegexpImpl(final String plainText) { + + throw new UnsupportedOperationException( + "Implementation commented out. Please uncomment to use it."); + + // final List outlinks = new ArrayList(); + // String url; + // Outlink link; + // + // RE re = new RE(URL_PATTERN); + // + // int pos = 0; + // + // while (re.match(plainText, pos)) { + // + // url = re.getParen(0); + // + // LOG.finest("Extracted url: " + url); + // + // try { + // + // link = new Outlink(url, null); + // outlinks.add(link); + // + // } catch (MalformedURLException ex) { + // // if it is a malformed URL we just throw it away and continue with + // // extraction. + // LOG.throwing(this.getClass().getName(), "getOutlinks", ex); + // } + // + // pos = re.getParenEnd(0); + // } + // + // final Outlink[] retval; + // + // if (pos > 0) { + // retval = (Outlink[]) outlinks.toArray(new Outlink[0]); + // } else { + // retval = new Outlink[0]; + // } + // + // return retval; + + } + + /** + * Extracts outlinks from a plain text. + * </p> + * This Method takes the JDK5 Regexp API. + * + * @param plainText + * + * @return Array of <code>Outlink</code> s within found in plainText + * @deprecated only for tests + */ + private Outlink[] getOutlinksJDK5Impl(final String plainText) { + + throw new UnsupportedOperationException( + "Implementation commented out. 
Please uncomment to use it."); + + // final List outlinks = new ArrayList(); + // String url; + // Outlink link; + // + // final Pattern urlPattern = Pattern.compile(URL_PATTERN); + // final RE re = new RE(urlPattern); + // + // int pos = 0; + // + // while (re.match(plainText, pos)) { + // + // url = re.getParen(0); + // + // try { + // + // link = new Outlink(url, null); + // outlinks.add(link); + // } catch (MalformedURLException ex) { + // // if it is a malformed URL we just throw it away and continue with + // // extraction. + // LOG.throwing(this.getClass().getName(), "getOutlinks", ex); + // } + // + // pos = re.getParenEnd(0); + // } + // + // final Outlink[] retval; + // + // if (pos > 0) { + // retval = (Outlink[]) outlinks.toArray(new Outlink[0]); + // } else { + // retval = new Outlink[0]; + // } + // + // return retval; + } + +} Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?rev=233559&r1=233558&r2=233559&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java Fri Aug 19 14:15:02 2005 @@ -23,6 +23,7 @@ import org.apache.nutch.parse.ParseData; import org.apache.nutch.parse.ParseImpl; import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.OutlinkExtractor; import org.apache.nutch.util.LogFormatter; import org.apache.nutch.util.CommandRunner; @@ -151,7 +152,7 @@ title = ""; // collect outlink - Outlink[] outlinks = new Outlink[0]; + Outlink[] outlinks 
= OutlinkExtractor.getOutlinks(text); // collect meta data Properties metaData = new Properties(); Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java?rev=233559&r1=233558&r2=233559&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java Fri Aug 19 14:15:02 2005 @@ -24,6 +24,7 @@ import org.apache.nutch.parse.ParseData; import org.apache.nutch.parse.ParseImpl; import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.OutlinkExtractor; import org.apache.nutch.parse.ParseException; import java.util.Properties; @@ -117,7 +118,7 @@ title = ""; // collect outlink - Outlink[] outlinks = new Outlink[0]; + Outlink[] outlinks = OutlinkExtractor.getOutlinks(text); ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metadata); return new ParseImpl(text, parseData); Modified: lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java?rev=233559&r1=233558&r2=233559&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java Fri Aug 19 14:15:02 2005 @@ -33,6 +33,7 @@ import org.apache.nutch.parse.ParseData; import org.apache.nutch.parse.ParseImpl; import org.apache.nutch.parse.Outlink; +import 
org.apache.nutch.parse.OutlinkExtractor; import org.apache.nutch.parse.ParseException; import java.text.SimpleDateFormat; @@ -161,7 +162,7 @@ title = ""; // collect outlink - Outlink[] outlinks = new Outlink[0]; + Outlink[] outlinks = OutlinkExtractor.getOutlinks(text); // collect meta data Properties metadata = new Properties(); Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java?rev=233559&r1=233558&r2=233559&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java (original) +++ lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java Fri Aug 19 14:15:02 2005 @@ -59,9 +59,12 @@ title = ""; } - ParseData parseData = new ParseData(title, new Outlink[0], metadata); + String text = delegate.getText(); - return new ParseImpl(delegate.getText(), parseData); + return new ParseImpl(text, + new ParseData(title, + OutlinkExtractor.getOutlinks(text), + metadata)); } Modified: lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?rev=233559&r1=233558&r2=233559&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Fri Aug 19 14:15:02 2005 @@ -28,7 +28,7 @@ Properties metadata = new Properties(); metadata.putAll(content.getMetadata()); - ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new Outlink[0], 
metadata); + //ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new Outlink[0], metadata); String encoding = StringUtil.parseCharacterEncoding(content.getContentType()); @@ -45,6 +45,9 @@ text = new String(content.getContent()); // use default encoding } - return new ParseImpl(text, parseData); + return new ParseImpl(text, + new ParseData(ParseStatus.STATUS_SUCCESS, "", + OutlinkExtractor.getOutlinks(text), + metadata)); } } Added: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java?rev=233559&view=auto ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java (added) +++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java Fri Aug 19 14:15:02 2005 @@ -0,0 +1,77 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse; + +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.OutlinkExtractor; + +import junit.framework.TestCase; + +/** + * TestCase to check regExp extraction of URLs. 
+ * + * @author Stephan Strittmatter - http://www.sybit.de + * + * @version 1.0 + */ +public class TestOutlinkExtractor extends TestCase { + + public void testGetNoOutlinks() { + Outlink[] outlinks = null; + + outlinks = OutlinkExtractor.getOutlinks(null); + assertNotNull(outlinks); + assertEquals(0, outlinks.length); + + outlinks = OutlinkExtractor.getOutlinks(""); + assertNotNull(outlinks); + assertEquals(0, outlinks.length); + } + + public void testGetOutlinksHttp() { + Outlink[] outlinks = OutlinkExtractor.getOutlinks( + "Test with http://www.nutch.org/index.html is it found? " + + "What about www.google.com at http://www.google.de " + + "A longer URL could be http://www.sybit.com/solutions/portals.html"); + + assertTrue("Url not found!", outlinks.length == 3); + assertEquals("Wrong URL", "http://www.nutch.org/index.html", outlinks[0].getToUrl()); + assertEquals("Wrong URL", "http://www.google.de/", outlinks[1].getToUrl()); + assertEquals("Wrong URL", "http://www.sybit.com/solutions/portals.html", outlinks[2].getToUrl()); + } + + public void testGetOutlinksHttp2() { + Outlink[] outlinks = OutlinkExtractor.getOutlinks( + "Test with http://www.nutch.org/index.html is it found? " + + "What about www.google.com at http://www.google.de " + + "A longer URL could be http://www.sybit.com/solutions/portals.html", "http://www.sybit.de"); + + assertTrue("Url not found!", outlinks.length == 3); + assertEquals("Wrong URL", "http://www.nutch.org/index.html", outlinks[0].getToUrl()); + assertEquals("Wrong URL", "http://www.google.de/", outlinks[1].getToUrl()); + assertEquals("Wrong URL", "http://www.sybit.com/solutions/portals.html", outlinks[2].getToUrl()); + } + public void testGetOutlinksFtp() { + Outlink[] outlinks = OutlinkExtractor.getOutlinks( + "Test with ftp://www.nutch.org is it found? 
" + + "What about www.google.com at ftp://www.google.de"); + + assertTrue("Url not found!", outlinks.length >1); + assertEquals("Wrong URL", "ftp://www.nutch.org/", outlinks[0].getToUrl()); + assertEquals("Wrong URL", "ftp://www.google.de/", outlinks[1].getToUrl()); + } +} Propchange: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java ------------------------------------------------------------------------------ svn:eol-style = native