plugi...

jerome Fri, 19 Aug 2005 14:15:26 -0700

Author: jerome
Date: Fri Aug 19 14:15:02 2005
New Revision: 233559

URL: http://svn.apache.org/viewcvs?rev=233559&view=rev
Log:
* Add utility to extract urls from plain text (Stephan Strittmatter)
* Uses the OutlinkExtractor in parse plugins PDF, MSWord, Text, RTF, Ext


Added:
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java   
(with props)
    
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java   
(with props)
Modified:
    
lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
    
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
    
lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
    
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
    
lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java

Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?rev=233559&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java 
(added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java 
Fri Aug 19 14:15:02 2005
@@ -0,0 +1,227 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import org.apache.nutch.util.LogFormatter;
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.Pattern;
+import org.apache.oro.text.regex.PatternCompiler;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.PatternMatcherInput;
+import org.apache.oro.text.regex.Perl5Compiler;
+import org.apache.oro.text.regex.Perl5Matcher;
+
+/**
+ * Extractor to extract {@link org.apache.nutch.parse.Outlink}s
+ * / URLs from plain text using Regular Expressions.
+ * 
+ * @see <a
+ *      href="http://wiki.java.net/bin/view/Javapedia/RegularExpressions">Comparison
+ *      of different regexp-Implementations </a>
+ * @see <a href="http://regex.info/java.html">Overview about Java Regexp APIs
+ *      </a>
+ * 
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * @version 1.0
+ * @since 0.7
+ */
+public class OutlinkExtractor {
+  private static final Logger LOG = LogFormatter
+      .getLogger(OutlinkExtractor.class.getName());
+
+  /**
+   * Regex pattern to get URLs within a plain text.
+   * 
+   * @see <a
+   *      href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html
+   *      </a>
+   */
+  private static final String URL_PATTERN = 
+    "([A-Za-z][A-Za-z0-9+.-]+:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?)";
+
+  /**
+   * Extracts <code>Outlink</code>s from the given plain text, using an empty
+   * string as the anchor of every extracted link.
+   * 
+   * @param plainText  the plain text from which URLs should be extracted;
+   *                   <code>null</code> or empty text yields an empty array.
+   * 
+   * @return Array of <code>Outlink</code>s found in plainText, never
+   *         <code>null</code>
+   */
+  public static Outlink[] getOutlinks(final String plainText) {
+    return OutlinkExtractor.getOutlinks(plainText, "");
+  }
+
+  /**
+   * Extracts <code>Outlink</code>s from the given plain text and attaches the
+   * given anchor to each extracted <code>Outlink</code>.
+   * <p>
+   * Every substring matching {@link #URL_PATTERN} is turned into an
+   * <code>Outlink</code>. A match for which the <code>Outlink</code> cannot be
+   * created is logged and skipped; extraction then continues with the next
+   * match. A failure of the pattern compiler or matcher itself is logged and
+   * ends the extraction, returning whatever was collected so far.
+   * 
+   * @param plainText the plain text from which URLs should be extracted;
+   *                  <code>null</code> or empty text yields an empty array.
+   * @param anchor    the anchor passed to every created <code>Outlink</code>
+   * 
+   * @return Array of <code>Outlink</code>s found in plainText, never
+   *         <code>null</code>
+   */
+  public static Outlink[] getOutlinks(final String plainText, String anchor) {
+
+    // Nothing to scan: answer directly instead of relying on an exception
+    // being raised by the matcher input and swallowed further down.
+    if (plainText == null || plainText.length() == 0) {
+      return new Outlink[0];
+    }
+
+    final List outlinks = new ArrayList();
+
+    try {
+      final PatternCompiler cp = new Perl5Compiler();
+      final Pattern pattern = cp.compile(URL_PATTERN,
+          Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+              | Perl5Compiler.MULTILINE_MASK);
+      final PatternMatcher matcher = new Perl5Matcher();
+
+      final PatternMatcherInput input = new PatternMatcherInput(plainText);
+
+      // loop the matches; the matcher advances the input offset on each hit
+      while (matcher.contains(input, pattern)) {
+        final MatchResult result = matcher.getMatch();
+        final String url = result.group(0);
+        try {
+          outlinks.add(new Outlink(url, anchor));
+        } catch (Exception ex) {
+          // if it is a malformed URL we just throw it away and continue with
+          // extraction. The catch sits inside the loop so that one bad match
+          // does not discard every URL that follows it.
+          LOG.throwing(OutlinkExtractor.class.getName(), "getOutlinks", ex);
+        }
+      }
+    } catch (Exception ex) {
+      // the pattern could not be compiled or the matcher failed: keep the
+      // links collected so far and report the problem.
+      LOG.throwing(OutlinkExtractor.class.getName(), "getOutlinks", ex);
+    }
+
+    // toArray copes with the empty list as well, so no special case is needed
+    return (Outlink[]) outlinks.toArray(new Outlink[outlinks.size()]);
+  }
+  
+
+  /**
+   * Placeholder for an extraction variant based on the Jakarta Regexp API.
+   * <p>
+   * The implementation is commented out below, so calling this method always
+   * fails.
+   * 
+   * @param plainText the plain text from which URLs should be extracted
+   * 
+   * @return Array of <code>Outlink</code>s found in plainText
+   * @throws UnsupportedOperationException always, as long as the
+   *         implementation stays commented out
+   * @deprecated only for tests
+   */
+  private Outlink[] getOutlinksJakartaRegexpImpl(final String plainText) {
+
+    throw new UnsupportedOperationException(
+        "Implementation commented out. Please uncomment to use it.");
+
+    // final List outlinks = new ArrayList();
+    // String url;
+    // Outlink link;
+    //
+    // RE re = new RE(URL_PATTERN);
+    //
+    // int pos = 0;
+    //
+    // while (re.match(plainText, pos)) {
+    //
+    // url = re.getParen(0);
+    //
+    // LOG.finest("Extracted url: " + url);
+    //
+    // try {
+    //
+    // link = new Outlink(url, null);
+    // outlinks.add(link);
+    //
+    // } catch (MalformedURLException ex) {
+    // // if it is a malformed URL we just throw it away and continue with
+    // // extraction.
+    // LOG.throwing(this.getClass().getName(), "getOutlinks", ex);
+    // }
+    //
+    // pos = re.getParenEnd(0);
+    // }
+    //
+    // final Outlink[] retval;
+    //
+    // if (pos > 0) {
+    // retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
+    // } else {
+    // retval = new Outlink[0];
+    // }
+    //
+    // return retval;
+
+  }
+
+  /**
+   * Placeholder for an extraction variant based on the JDK5 Regexp API.
+   * <p>
+   * The implementation is commented out below, so calling this method always
+   * fails.
+   * 
+   * @param plainText the plain text from which URLs should be extracted
+   * 
+   * @return Array of <code>Outlink</code>s found in plainText
+   * @throws UnsupportedOperationException always, as long as the
+   *         implementation stays commented out
+   * @deprecated only for tests
+   */
+  private Outlink[] getOutlinksJDK5Impl(final String plainText) {
+
+    throw new UnsupportedOperationException(
+        "Implementation commented out. Please uncomment to use it.");
+
+    // NOTE(review): the commented-out body below still drives the match
+    // through an RE object (as the Jakarta variant does) -- it presumably
+    // needs reworking before it can be enabled; verify before uncommenting.
+    //
+    // final List outlinks = new ArrayList();
+    // String url;
+    // Outlink link;
+    //
+    // final Pattern urlPattern = Pattern.compile(URL_PATTERN);
+    // final RE re = new RE(urlPattern);
+    //
+    // int pos = 0;
+    //
+    // while (re.match(plainText, pos)) {
+    //
+    // url = re.getParen(0);
+    //
+    // try {
+    //
+    // link = new Outlink(url, null);
+    // outlinks.add(link);
+    // } catch (MalformedURLException ex) {
+    // // if it is a malformed URL we just throw it away and continue with
+    // // extraction.
+    // LOG.throwing(this.getClass().getName(), "getOutlinks", ex);
+    // }
+    //
+    // pos = re.getParenEnd(0);
+    // }
+    //
+    // final Outlink[] retval;
+    //
+    // if (pos > 0) {
+    // retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
+    // } else {
+    // retval = new Outlink[0];
+    // }
+    //
+    // return retval;
+  }
+ 
+}

Propchange: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: 
lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?rev=233559&r1=233558&r2=233559&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
 Fri Aug 19 14:15:02 2005
@@ -23,6 +23,7 @@
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseImpl;
 import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
 
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.util.CommandRunner;
@@ -151,7 +152,7 @@
       title = "";
 
     // collect outlink
-    Outlink[] outlinks = new Outlink[0];
+    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
 
     // collect meta data
     Properties metaData = new Properties();

Modified: 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java?rev=233559&r1=233558&r2=233559&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
 Fri Aug 19 14:15:02 2005
@@ -24,6 +24,7 @@
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseImpl;
 import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
 import org.apache.nutch.parse.ParseException;
 
 import java.util.Properties;
@@ -117,7 +118,7 @@
       title = "";
 
     // collect outlink
-    Outlink[] outlinks = new Outlink[0];
+    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
 
     ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, 
outlinks, metadata);
     return new ParseImpl(text, parseData);

Modified: 
lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java?rev=233559&r1=233558&r2=233559&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
 Fri Aug 19 14:15:02 2005
@@ -33,6 +33,7 @@
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseImpl;
 import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
 import org.apache.nutch.parse.ParseException;
 
 import java.text.SimpleDateFormat;
@@ -161,7 +162,7 @@
       title = "";
 
     // collect outlink
-    Outlink[] outlinks = new Outlink[0];
+    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
 
     // collect meta data
     Properties metadata = new Properties();

Modified: 
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java?rev=233559&r1=233558&r2=233559&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
 Fri Aug 19 14:15:02 2005
@@ -59,9 +59,12 @@
       title = "";
     }
 
-    ParseData parseData = new ParseData(title, new Outlink[0], metadata);
+    String text = delegate.getText();
 
-    return new ParseImpl(delegate.getText(), parseData);
+    return new ParseImpl(text, 
+                         new ParseData(title,
+                                       OutlinkExtractor.getOutlinks(text),
+                                       metadata));
   }
 
 

Modified: 
lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?rev=233559&r1=233558&r2=233559&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
 Fri Aug 19 14:15:02 2005
@@ -28,7 +28,7 @@
     Properties metadata = new Properties();
     metadata.putAll(content.getMetadata());
 
-    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new 
Outlink[0], metadata);
+    //ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new 
Outlink[0], metadata);
 
     String encoding =
       StringUtil.parseCharacterEncoding(content.getContentType());
@@ -45,6 +45,9 @@
       text = new String(content.getContent());    // use default encoding
     }
 
-    return new ParseImpl(text, parseData);
+    return new ParseImpl(text,
+                         new ParseData(ParseStatus.STATUS_SUCCESS, "",
+                                       OutlinkExtractor.getOutlinks(text),
+                                       metadata));
   }
 }

Added: 
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java?rev=233559&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java 
(added)
+++ 
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java 
Fri Aug 19 14:15:02 2005
@@ -0,0 +1,77 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ 
+package org.apache.nutch.parse;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
+
+import junit.framework.TestCase;
+
+/**
+ * TestCase to check regExp extraction of URLs.
+ * 
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * 
+ * @version 1.0
+ */
+public class TestOutlinkExtractor extends TestCase {
+
+  /** Null and empty input must both produce a non-null, empty array. */
+  public void testGetNoOutlinks() {
+    Outlink[]  outlinks = null;
+            
+    outlinks = OutlinkExtractor.getOutlinks(null);
+    assertNotNull(outlinks);
+    assertEquals(0, outlinks.length);
+    
+    outlinks = OutlinkExtractor.getOutlinks("");
+    assertNotNull(outlinks);
+    assertEquals(0, outlinks.length);
+  }
+  
+  /**
+   * Three http URLs are embedded in the text; the bare host name
+   * "www.google.com" has no scheme and is expected to be ignored.
+   */
+  public void testGetOutlinksHttp() {
+    Outlink[] outlinks = OutlinkExtractor.getOutlinks(
+        "Test with http://www.nutch.org/index.html is it found? " +
+        "What about www.google.com at http://www.google.de " +
+        "A longer URL could be http://www.sybit.com/solutions/portals.html");
+    
+    assertEquals("Url not found!", 3, outlinks.length);
+    assertEquals("Wrong URL", "http://www.nutch.org/index.html",
+        outlinks[0].getToUrl());
+    assertEquals("Wrong URL", "http://www.google.de/", outlinks[1].getToUrl());
+    assertEquals("Wrong URL", "http://www.sybit.com/solutions/portals.html",
+        outlinks[2].getToUrl());
+  }
+  
+  /** Same text as above, extracted through the overload taking an anchor. */
+  public void testGetOutlinksHttp2() {
+    Outlink[] outlinks = OutlinkExtractor.getOutlinks(
+        "Test with http://www.nutch.org/index.html is it found? " +
+        "What about www.google.com at http://www.google.de " +
+        "A longer URL could be http://www.sybit.com/solutions/portals.html",
+        "http://www.sybit.de");
+    
+    assertEquals("Url not found!", 3, outlinks.length);
+    assertEquals("Wrong URL", "http://www.nutch.org/index.html",
+        outlinks[0].getToUrl());
+    assertEquals("Wrong URL", "http://www.google.de/", outlinks[1].getToUrl());
+    assertEquals("Wrong URL", "http://www.sybit.com/solutions/portals.html",
+        outlinks[2].getToUrl());
+  }
+
+  /** ftp URLs must be picked up as well. */
+  public void testGetOutlinksFtp() {
+    Outlink[] outlinks = OutlinkExtractor.getOutlinks(
+        "Test with ftp://www.nutch.org is it found? " +
+        "What about www.google.com at ftp://www.google.de");
+    
+    assertTrue("Url not found!", outlinks.length >1);
+    assertEquals("Wrong URL", "ftp://www.nutch.org/", outlinks[0].getToUrl());
+    assertEquals("Wrong URL", "ftp://www.google.de/", outlinks[1].getToUrl());
+  }
+}

Propchange: 
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

svn commit: r233559 - in /lucene/nutch/trunk/src: java/org/apache/nutch/parse/ plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ plugin/parse-msword/src/java/org/apache/nutch/parse/msword/ plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/ plugi...

Reply via email to