Author: jerome
Date: Fri Aug 19 14:15:02 2005
New Revision: 233559
URL: http://svn.apache.org/viewcvs?rev=233559&view=rev
Log:
* Add utility to extract urls from plain text (Stephan Strittmatter)
* Uses the OutlinkExtractor in parse plugins PDF, MSWord, Text, RTF, Ext
Added:
lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
(with props)
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java
(with props)
Modified:
lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?rev=233559&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
(added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
Fri Aug 19 14:15:02 2005
@@ -0,0 +1,227 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import org.apache.nutch.util.LogFormatter;
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.Pattern;
+import org.apache.oro.text.regex.PatternCompiler;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.PatternMatcherInput;
+import org.apache.oro.text.regex.Perl5Compiler;
+import org.apache.oro.text.regex.Perl5Matcher;
+
+/**
+ * Extractor to extract {@link org.apache.nutch.parse.Outlink}s
+ * / URLs from plain text using Regular Expressions.
+ *
+ * @see <a
+ * href="http://wiki.java.net/bin/view/Javapedia/RegularExpressions">Comparison
+ * of different regexp-Implementations </a>
+ * @see <a href="http://regex.info/java.html">Overview about Java Regexp APIs
+ * </a>
+ *
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * @version 1.0
+ * @since 0.7
+ */
+public class OutlinkExtractor {
+  private static final Logger LOG = LogFormatter
+    .getLogger(OutlinkExtractor.class.getName());
+
+  /**
+   * Regex pattern to get URLs within a plain text.
+   *
+   * @see <a href="http://www.truerwords.net/articles/ut/urlactivation.html">
+   *      http://www.truerwords.net/articles/ut/urlactivation.html</a>
+   */
+  private static final String URL_PATTERN =
+    "([A-Za-z][A-Za-z0-9+.-]+:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?)";
+
+  /**
+   * Extracts <code>Outlink</code>s from the given plain text, using an empty
+   * anchor for every extracted link.
+   *
+   * @param plainText the plain text from which URLs should be extracted;
+   *        may be <code>null</code>.
+   *
+   * @return array of the <code>Outlink</code>s found in plainText; never
+   *         <code>null</code>, empty if nothing was found.
+   */
+  public static Outlink[] getOutlinks(final String plainText) {
+    return getOutlinks(plainText, "");
+  }
+
+  /**
+   * Extracts <code>Outlink</code>s from the given plain text and attaches the
+   * given anchor to each extracted <code>Outlink</code>.
+   * <p>
+   * A candidate URL for which the <code>Outlink</code> constructor fails
+   * (e.g. a malformed URL) is logged and skipped; extraction continues with
+   * the remaining text. This method does not propagate exceptions.
+   *
+   * @param plainText the plain text from which URLs should be extracted;
+   *        may be <code>null</code>.
+   * @param anchor the anchor assigned to every extracted url
+   *
+   * @return array of the <code>Outlink</code>s found in plainText; never
+   *         <code>null</code>, empty if nothing was found.
+   */
+  public static Outlink[] getOutlinks(final String plainText, String anchor) {
+
+    final List outlinks = new ArrayList();
+
+    // Nothing to scan. Checked explicitly so that a null text no longer
+    // depends on an exception being thrown and swallowed below.
+    if (plainText == null || plainText.length() == 0) {
+      return new Outlink[0];
+    }
+
+    try {
+      final PatternCompiler cp = new Perl5Compiler();
+      final Pattern pattern = cp.compile(URL_PATTERN,
+          Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+              | Perl5Compiler.MULTILINE_MASK);
+      final PatternMatcher matcher = new Perl5Matcher();
+      final PatternMatcherInput input = new PatternMatcherInput(plainText);
+
+      // loop the matches
+      while (matcher.contains(input, pattern)) {
+        final MatchResult result = matcher.getMatch();
+        final String url = result.group(0);
+        try {
+          outlinks.add(new Outlink(url, anchor));
+        } catch (Exception ex) {
+          // This single candidate was rejected: throw it away and continue
+          // with the extraction of the remaining URLs.
+          LOG.throwing(OutlinkExtractor.class.getName(), "getOutlinks", ex);
+        }
+      }
+    } catch (Exception ex) {
+      // Pattern compilation or matching itself failed; return whatever was
+      // collected up to this point instead of propagating the failure.
+      LOG.throwing(OutlinkExtractor.class.getName(), "getOutlinks", ex);
+    }
+
+    // create array of the Outlinks (zero-length when the list is empty)
+    return (Outlink[]) outlinks.toArray(new Outlink[outlinks.size()]);
+  }
+
+
+  /**
+   * Extracts outlinks from a plain text. <br />
+   * This method takes the Jakarta Regexp API. The implementation is
+   * commented out, so calling it always throws
+   * <code>UnsupportedOperationException</code>.
+   *
+   * @param plainText the plain text from which URLs should be extracted.
+   *
+   * @return Array of <code>Outlink</code>s found in plainText
+   * @deprecated only for tests
+   */
+  private Outlink[] getOutlinksJakartaRegexpImpl(final String plainText) {
+
+    throw new UnsupportedOperationException(
+        "Implementation commented out. Please uncomment to use it.");
+
+    // final List outlinks = new ArrayList();
+    // String url;
+    // Outlink link;
+    //
+    // RE re = new RE(URL_PATTERN);
+    //
+    // int pos = 0;
+    //
+    // while (re.match(plainText, pos)) {
+    //
+    // url = re.getParen(0);
+    //
+    // LOG.finest("Extracted url: " + url);
+    //
+    // try {
+    //
+    // link = new Outlink(url, null);
+    // outlinks.add(link);
+    //
+    // } catch (MalformedURLException ex) {
+    // // if it is a malformed URL we just throw it away and continue with
+    // // extraction.
+    // LOG.throwing(this.getClass().getName(), "getOutlinks", ex);
+    // }
+    //
+    // pos = re.getParenEnd(0);
+    // }
+    //
+    // final Outlink[] retval;
+    //
+    // if (pos > 0) {
+    // retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
+    // } else {
+    // retval = new Outlink[0];
+    // }
+    //
+    // return retval;
+
+  }
+
+  /**
+   * Extracts outlinks from a plain text.
+   * <p>
+   * This method takes the JDK5 Regexp API. The implementation is commented
+   * out, so calling it always throws
+   * <code>UnsupportedOperationException</code>.
+   *
+   * @param plainText the plain text from which URLs should be extracted.
+   *
+   * @return Array of <code>Outlink</code>s found in plainText
+   * @deprecated only for tests
+   */
+  private Outlink[] getOutlinksJDK5Impl(final String plainText) {
+
+    throw new UnsupportedOperationException(
+        "Implementation commented out. Please uncomment to use it.");
+
+    // final List outlinks = new ArrayList();
+    // String url;
+    // Outlink link;
+    //
+    // final Pattern urlPattern = Pattern.compile(URL_PATTERN);
+    // final RE re = new RE(urlPattern);
+    //
+    // int pos = 0;
+    //
+    // while (re.match(plainText, pos)) {
+    //
+    // url = re.getParen(0);
+    //
+    // try {
+    //
+    // link = new Outlink(url, null);
+    // outlinks.add(link);
+    // } catch (MalformedURLException ex) {
+    // // if it is a malformed URL we just throw it away and continue with
+    // // extraction.
+    // LOG.throwing(this.getClass().getName(), "getOutlinks", ex);
+    // }
+    //
+    // pos = re.getParenEnd(0);
+    // }
+    //
+    // final Outlink[] retval;
+    //
+    // if (pos > 0) {
+    // retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
+    // } else {
+    // retval = new Outlink[0];
+    // }
+    //
+    // return retval;
+  }
+
+}
Propchange:
lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified:
lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?rev=233559&r1=233558&r2=233559&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
(original)
+++
lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
Fri Aug 19 14:15:02 2005
@@ -23,6 +23,7 @@
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.CommandRunner;
@@ -151,7 +152,7 @@
title = "";
// collect outlink
- Outlink[] outlinks = new Outlink[0];
+ Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
// collect meta data
Properties metaData = new Properties();
Modified:
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java?rev=233559&r1=233558&r2=233559&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
(original)
+++
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
Fri Aug 19 14:15:02 2005
@@ -24,6 +24,7 @@
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
import org.apache.nutch.parse.ParseException;
import java.util.Properties;
@@ -117,7 +118,7 @@
title = "";
// collect outlink
- Outlink[] outlinks = new Outlink[0];
+ Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
outlinks, metadata);
return new ParseImpl(text, parseData);
Modified:
lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java?rev=233559&r1=233558&r2=233559&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
(original)
+++
lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
Fri Aug 19 14:15:02 2005
@@ -33,6 +33,7 @@
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
import org.apache.nutch.parse.ParseException;
import java.text.SimpleDateFormat;
@@ -161,7 +162,7 @@
title = "";
// collect outlink
- Outlink[] outlinks = new Outlink[0];
+ Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
// collect meta data
Properties metadata = new Properties();
Modified:
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java?rev=233559&r1=233558&r2=233559&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
(original)
+++
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
Fri Aug 19 14:15:02 2005
@@ -59,9 +59,12 @@
title = "";
}
- ParseData parseData = new ParseData(title, new Outlink[0], metadata);
+ String text = delegate.getText();
- return new ParseImpl(delegate.getText(), parseData);
+ return new ParseImpl(text,
+ new ParseData(title,
+ OutlinkExtractor.getOutlinks(text),
+ metadata));
}
Modified:
lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?rev=233559&r1=233558&r2=233559&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
(original)
+++
lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
Fri Aug 19 14:15:02 2005
@@ -28,7 +28,7 @@
Properties metadata = new Properties();
metadata.putAll(content.getMetadata());
- ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new
Outlink[0], metadata);
+ //ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new
Outlink[0], metadata);
String encoding =
StringUtil.parseCharacterEncoding(content.getContentType());
@@ -45,6 +45,9 @@
text = new String(content.getContent()); // use default encoding
}
- return new ParseImpl(text, parseData);
+ return new ParseImpl(text,
+ new ParseData(ParseStatus.STATUS_SUCCESS, "",
+ OutlinkExtractor.getOutlinks(text),
+ metadata));
}
}
Added:
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java?rev=233559&view=auto
==============================================================================
---
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java
(added)
+++
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java
Fri Aug 19 14:15:02 2005
@@ -0,0 +1,77 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
+
+import junit.framework.TestCase;
+
+/**
+ * TestCase to check regExp extraction of URLs.
+ *
+ * @author Stephan Strittmatter - http://www.sybit.de
+ *
+ * @version 1.0
+ */
+public class TestOutlinkExtractor extends TestCase {
+
+  /**
+   * Sample text holding three absolute http URLs plus one scheme-less host
+   * name (www.google.com) that the tests expect not to be extracted.
+   */
+  private static final String HTTP_TEXT =
+    "Test with http://www.nutch.org/index.html is it found? " +
+    "What about www.google.com at http://www.google.de " +
+    "A longer URL could be http://www.sybit.com/solutions/portals.html";
+
+  /** A null or empty text must yield an empty, non-null array. */
+  public void testGetNoOutlinks() {
+    Outlink[] outlinks = null;
+
+    outlinks = OutlinkExtractor.getOutlinks(null);
+    assertNotNull(outlinks);
+    assertEquals(0, outlinks.length);
+
+    outlinks = OutlinkExtractor.getOutlinks("");
+    assertNotNull(outlinks);
+    assertEquals(0, outlinks.length);
+  }
+
+  /** Extraction of http URLs through the single-argument overload. */
+  public void testGetOutlinksHttp() {
+    assertHttpOutlinks(OutlinkExtractor.getOutlinks(HTTP_TEXT));
+  }
+
+  /** Extraction of http URLs through the overload taking an anchor. */
+  public void testGetOutlinksHttp2() {
+    assertHttpOutlinks(
+        OutlinkExtractor.getOutlinks(HTTP_TEXT, "http://www.sybit.de"));
+  }
+
+  /** Extraction of ftp URLs; the scheme-less host name must be ignored. */
+  public void testGetOutlinksFtp() {
+    Outlink[] outlinks = OutlinkExtractor.getOutlinks(
+        "Test with ftp://www.nutch.org is it found? " +
+        "What about www.google.com at ftp://www.google.de");
+
+    // Exactly two: the text contains only two scheme-qualified URLs, so a
+    // larger count would mean a spurious match.
+    assertEquals("Url not found!", 2, outlinks.length);
+    assertEquals("Wrong URL", "ftp://www.nutch.org/", outlinks[0].getToUrl());
+    assertEquals("Wrong URL", "ftp://www.google.de/", outlinks[1].getToUrl());
+  }
+
+  /**
+   * Checks that the given outlinks are exactly the three http URLs of
+   * {@link #HTTP_TEXT}, in order of appearance.
+   */
+  private static void assertHttpOutlinks(Outlink[] outlinks) {
+    // assertEquals reports expected and actual count on failure.
+    assertEquals("Url not found!", 3, outlinks.length);
+    assertEquals("Wrong URL", "http://www.nutch.org/index.html",
+        outlinks[0].getToUrl());
+    assertEquals("Wrong URL", "http://www.google.de/",
+        outlinks[1].getToUrl());
+    assertEquals("Wrong URL", "http://www.sybit.com/solutions/portals.html",
+        outlinks[2].getToUrl());
+  }
+}
Propchange:
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native