Author: siren
Date: Sat Mar 10 12:11:43 2007
New Revision: 516778

URL: http://svn.apache.org/viewvc?view=rev&rev=516778
Log:
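Change MoreIndexingFilter to use the JRE's built-in regular expressions (java.util.regex) instead of the ORO library (org.apache.oro), and add a unit test for file name extraction.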

Added:
    lucene/nutch/trunk/src/plugin/index-more/src/test/
    lucene/nutch/trunk/src/plugin/index-more/src/test/org/
    lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/
    lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/
    
    lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/
    lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/MoreIndexingFilterTest.java
Modified:
    
    lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

Modified: 
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?view=diff&rev=516778&r1=516777&r2=516778
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 Sat Mar 10 12:11:43 2007
@@ -16,14 +16,6 @@
  */
 package org.apache.nutch.indexer.more;
 
-
-import org.apache.oro.text.regex.Perl5Compiler;
-import org.apache.oro.text.regex.Perl5Matcher;
-import org.apache.oro.text.regex.Perl5Pattern;
-import org.apache.oro.text.regex.PatternMatcher;
-import org.apache.oro.text.regex.MatchResult;
-import org.apache.oro.text.regex.MalformedPatternException;
-
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
@@ -56,6 +48,8 @@
 
 import java.util.Date;
 import java.util.TimeZone;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.commons.lang.time.DateUtils;
 
@@ -244,21 +238,15 @@
   // Patterns used to extract filename from possible non-standard
   // HTTP header "Content-Disposition". Typically it looks like:
   // Content-Disposition: inline; filename="foo.ppt"
-  private PatternMatcher matcher = new Perl5Matcher();
 
   private Configuration conf;
-  static Perl5Pattern patterns[] = {null, null};
+  static Pattern patterns[] = new Pattern[2];
   static {
-    Perl5Compiler compiler = new Perl5Compiler();
-    try {
       // order here is important
       patterns[0] =
-        (Perl5Pattern) compiler.compile("\\bfilename=['\"](.+)['\"]");
+        Pattern.compile("\\bfilename=['\"](.+)['\"]");
       patterns[1] =
-        (Perl5Pattern) compiler.compile("\\bfilename=(\\S+)\\b");
-    } catch (MalformedPatternException e) {
-      // just ignore
-    }
+        Pattern.compile("\\bfilename=(\\S+)\\b");
   }
 
   private Document resetTitle(Document doc, ParseData data, String url) {
@@ -266,16 +254,28 @@
     if (contentDisposition == null)
       return doc;
 
-    MatchResult result;
-    for (int i=0; i<patterns.length; i++) {
-      if (matcher.contains(contentDisposition,patterns[i])) {
-        result = matcher.getMatch();
-        doc.add(new Field("title", result.group(1), Field.Store.YES, 
Field.Index.NO));
+    String filename = getFileName(contentDisposition);
+
+    if (filename != null) {
+      doc.add(new Field("title", filename, Field.Store.YES, Field.Index.NO));
+    }
+
+    return doc;
+  }
+  
+  String getFileName(String value) {
+
+    String filename = null;
+
+    for (int i = 0; i < patterns.length; i++) {
+      Matcher matcher = patterns[i].matcher(value);
+      if(matcher.find()) {
+        filename = matcher.group(1);
         break;
       }
     }
+    return filename;
 
-    return doc;
   }
 
   public void setConf(Configuration conf) {

Added: 
lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/MoreIndexingFilterTest.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/MoreIndexingFilterTest.java?view=auto&rev=516778
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/MoreIndexingFilterTest.java
 (added)
+++ 
lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/MoreIndexingFilterTest.java
 Sat Mar 10 12:11:43 2007
@@ -0,0 +1,36 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.nutch.indexer.more;
+
+import junit.framework.TestCase;
+
+public class MoreIndexingFilterTest extends TestCase {
+
+  public void testGetFileNamePlain() {
+    assertMatches("attachment; filename=genome.jpeg;", "genome.jpeg");
+    assertMatches("attachment; filename=\"genome.jpeg\";", "genome.jpeg");
+    assertMatches("attachment; filename=\'genome.jpeg\';", "genome.jpeg");
+  }
+
+  private void assertMatches(String headerValue, String expected) {
+    MoreIndexingFilter mif = new MoreIndexingFilter();
+    String filename = mif.getFileName(headerValue);
+    assertEquals("Did not match '" + expected + "'!='" + filename + "'",
+        expected, filename);
+  }
+
+}



-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to