Author: siren Date: Sat Mar 10 12:11:43 2007 New Revision: 516778 URL: http://svn.apache.org/viewvc?view=rev&rev=516778 Log: Change MoreIndexingFilter to use the JRE's built-in regular expressions (java.util.regex) instead of the ORO library (org.apache.oro)
Added: lucene/nutch/trunk/src/plugin/index-more/src/test/ lucene/nutch/trunk/src/plugin/index-more/src/test/org/ lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/ lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/ lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/ lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/ lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/MoreIndexingFilterTest.java Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?view=diff&rev=516778&r1=516777&r2=516778 ============================================================================== --- lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original) +++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Sat Mar 10 12:11:43 2007 @@ -16,14 +16,6 @@ */ package org.apache.nutch.indexer.more; - -import org.apache.oro.text.regex.Perl5Compiler; -import org.apache.oro.text.regex.Perl5Matcher; -import org.apache.oro.text.regex.Perl5Pattern; -import org.apache.oro.text.regex.PatternMatcher; -import org.apache.oro.text.regex.MatchResult; -import org.apache.oro.text.regex.MalformedPatternException; - import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -56,6 +48,8 @@ import java.util.Date; import java.util.TimeZone; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.commons.lang.time.DateUtils; @@ -244,21 +238,15 @@ // Patterns used to extract filename from possible non-standard // HTTP header 
"Content-Disposition". Typically it looks like: // Content-Disposition: inline; filename="foo.ppt" - private PatternMatcher matcher = new Perl5Matcher(); private Configuration conf; - static Perl5Pattern patterns[] = {null, null}; + static Pattern patterns[] = new Pattern[2]; static { - Perl5Compiler compiler = new Perl5Compiler(); - try { // order here is important patterns[0] = - (Perl5Pattern) compiler.compile("\\bfilename=['\"](.+)['\"]"); + Pattern.compile("\\bfilename=['\"](.+)['\"]"); patterns[1] = - (Perl5Pattern) compiler.compile("\\bfilename=(\\S+)\\b"); - } catch (MalformedPatternException e) { - // just ignore - } + Pattern.compile("\\bfilename=(\\S+)\\b"); } private Document resetTitle(Document doc, ParseData data, String url) { @@ -266,16 +254,28 @@ if (contentDisposition == null) return doc; - MatchResult result; - for (int i=0; i<patterns.length; i++) { - if (matcher.contains(contentDisposition,patterns[i])) { - result = matcher.getMatch(); - doc.add(new Field("title", result.group(1), Field.Store.YES, Field.Index.NO)); + String filename = getFileName(contentDisposition); + + if (filename != null) { + doc.add(new Field("title", filename, Field.Store.YES, Field.Index.NO)); + } + + return doc; + } + + String getFileName(String value) { + + String filename = null; + + for (int i = 0; i < patterns.length; i++) { + Matcher matcher = patterns[i].matcher(value); + if(matcher.find()) { + filename = matcher.group(1); break; } } + return filename; - return doc; } public void setConf(Configuration conf) { Added: lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/MoreIndexingFilterTest.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/MoreIndexingFilterTest.java?view=auto&rev=516778 ============================================================================== --- 
lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/MoreIndexingFilterTest.java (added) +++ lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/MoreIndexingFilterTest.java Sat Mar 10 12:11:43 2007 @@ -0,0 +1,36 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +package org.apache.nutch.indexer.more; + +import junit.framework.TestCase; + +public class MoreIndexingFilterTest extends TestCase { + + public void testGetFileNamePlain() { + assertMatches("attachment; filename=genome.jpeg;", "genome.jpeg"); + assertMatches("attachment; filename=\"genome.jpeg\";", "genome.jpeg"); + assertMatches("attachment; filename=\'genome.jpeg\';", "genome.jpeg"); + } + + private void assertMatches(String headerValue, String expected) { + MoreIndexingFilter mif = new MoreIndexingFilter(); + String filename = mif.getFileName(headerValue); + assertEquals("Did not match '" + expected + "'!='" + filename + "'", + expected, filename); + } + +} ------------------------------------------------------------------------- Take Surveys. Earn Cash. 
Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys-and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs