Author: jerome Date: Sat Dec 10 16:36:57 2005 New Revision: 355828 URL: http://svn.apache.org/viewcvs?rev=355828&view=rev Log: NUTCH-135 : Content metadata are now case insensitive (thanks to S. Groschupf)
Added: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java (with props) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java 
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java lucene/nutch/trunk/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java lucene/nutch/trunk/src/web/jsp/cached.jsp Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Sat Dec 10 16:36:57 2005 @@ -234,7 +234,7 @@ MD5Hash hash = null; String url = fle.getPage().getURL().toString(); if (content == null) { - content = new Content(url, url, new byte[0], "", new Properties()); + content = new Content(url, url, new byte[0], "", new ContentProperties()); hash = MD5Hash.digest(url); } else { hash = MD5Hash.digest(content.getContent()); @@ -263,7 +263,7 @@ + status.toString()); outputPage(new FetcherOutput(fle, hash, protocolStatus), content, new ParseText(""), - new ParseData(status, "", new Outlink[0], new Properties())); + new ParseData(status, "", new Outlink[0], new ContentProperties())); } return status; } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java Sat Dec 10 16:36:57 2005 @@ -21,6 +21,7 @@ import org.apache.nutch.io.*; import org.apache.nutch.fs.*; +import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.tools.UpdateDatabaseTool; @@ -34,12 +35,12 @@ private String title; private Outlink[] outlinks; - private Properties metadata; + private ContentProperties metadata; private ParseStatus status; public ParseData() {} - public ParseData(ParseStatus status, String title, Outlink[] outlinks, Properties metadata) { + public ParseData(ParseStatus status, String title, Outlink[] outlinks, ContentProperties metadata) { this.status = status; this.title = title; this.outlinks = outlinks; @@ -62,7 +63,7 @@ /** Other page properties. This is the place to find format-specific * properties. Different parser implementations for different content types * will populate this differently. */ - public Properties getMetadata() { return metadata; } + public ContentProperties getMetadata() { return metadata; } /** Return the value of a metadata property. 
*/ public String get(String name) { return getMetadata().getProperty(name); } @@ -94,7 +95,7 @@ } int propertyCount = in.readInt(); // read metadata - metadata = new Properties(); + metadata = new ContentProperties(); for (int i = 0; i < propertyCount; i++) { metadata.put(UTF8.readString(in), UTF8.readString(in)); } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java Sat Dec 10 16:36:57 2005 @@ -12,6 +12,7 @@ import org.apache.nutch.io.VersionedWritable; import org.apache.nutch.io.WritableUtils; +import org.apache.nutch.protocol.ContentProperties; /** * @author Andrzej Bialecki <[EMAIL PROTECTED]> @@ -230,7 +231,7 @@ private ParseData data = null; public EmptyParseImpl(ParseStatus status) { - data = new ParseData(status, "", new Outlink[0], new Properties()); + data = new ParseData(status, "", new Outlink[0], new ContentProperties()); } public ParseData getData() { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Sat Dec 10 16:36:57 2005 @@ -45,12 +45,12 @@ private String base; private byte[] content; private String contentType; - private Properties metadata; + private ContentProperties metadata; public Content() {} public Content(String url, String base, byte[] content, 
String contentType, - Properties metadata) { + ContentProperties metadata) { if (url == null) throw new IllegalArgumentException("null url"); if (base == null) throw new IllegalArgumentException("null base"); @@ -77,7 +77,7 @@ contentType = UTF8.readString(in); // read contentType int propertyCount = in.readInt(); // read metadata - metadata = new Properties(); + metadata = new ContentProperties(); for (int i = 0; i < propertyCount; i++) { metadata.put(UTF8.readString(in), UTF8.readString(in)); } @@ -134,7 +134,7 @@ } /** Other protocol-specific data. */ - public Properties getMetadata() { return metadata; } + public ContentProperties getMetadata() { return metadata; } /** Return the value of a metadata property. */ public String get(String name) { return getMetadata().getProperty(name); } Added: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java?rev=355828&view=auto ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java Sat Dec 10 16:36:57 2005 @@ -0,0 +1,88 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.protocol; + +import java.util.Enumeration; +import java.util.Iterator; +import java.util.Properties; +import java.util.TreeMap; + +/** + * case insensitive properties + */ +public class ContentProperties extends TreeMap { + + /** + * construct the TreeMap with a case insensitive comparator + */ + public ContentProperties() { + super(String.CASE_INSENSITIVE_ORDER); + } + + /** + * initialize with default values + * + * @param defaults + */ + public ContentProperties(Properties defaults) { + super(String.CASE_INSENSITIVE_ORDER); + putAll(defaults); + } + + /** + * @param key + * @return the property value or null + */ + public String getProperty(String key) { + return (String) get(key); + } + + /** + * sets the key value tuple + * + * @param key + * @param value + */ + public void setProperty(String key, String value) { + put(key, value); + + } + + public Enumeration propertyNames() { + return new KeyEnumeration(keySet().iterator()); + } + + class KeyEnumeration implements Enumeration { + + private Iterator fIterator; + + public KeyEnumeration(Iterator iterator) { + fIterator = iterator; + } + + public boolean hasMoreElements() { + return fIterator.hasNext(); + + } + + public Object nextElement() { + return fIterator.next(); + } + + } + +} Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java Sat Dec 10 16:36:57 2005 @@ -16,6 +16,7 @@ package 
org.apache.nutch.servlet; +import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.searcher.NutchBean; import org.apache.nutch.searcher.Hit; import org.apache.nutch.searcher.HitDetails; @@ -76,7 +77,7 @@ byte[] bytes = bean.getContent(details); // pass all original headers? only these for now. - Properties metaData = bean.getParseData(details).getMetadata(); + ContentProperties metaData = bean.getParseData(details).getMetadata(); String contentType = (String) metaData.get("Content-Type"); //String lastModified = (String) metaData.get("Last-Modified"); //String contentLength = (String) metaData.get("Content-Length"); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java Sat Dec 10 16:36:57 2005 @@ -240,7 +240,7 @@ } outputPage(new ParseText(""), new ParseData(new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_MISSING_CONTENT), - "", new Outlink[0], new Properties())); + "", new Outlink[0], new ContentProperties())); } } @@ -250,7 +250,7 @@ return; } outputPage(new ParseText(""), - new ParseData(status, "", new Outlink[0], new Properties())); + new ParseData(status, "", new Outlink[0], new ContentProperties())); } private void outputPage Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- 
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java (original) +++ lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java Sat Dec 10 16:36:57 2005 @@ -18,6 +18,7 @@ import org.apache.nutch.parse.*; import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.util.NutchConf; import java.util.*; @@ -51,7 +52,7 @@ } /** Scan the document adding attributes to metadata.*/ - public static void walk(Node doc, URL base, Properties metadata) + public static void walk(Node doc, URL base, ContentProperties metadata) throws ParseException { // walk the DOM tree, scanning for license data Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original) +++ lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Sat Dec 10 16:36:57 2005 @@ -19,6 +19,7 @@ import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ContentProperties; import java.util.Properties; import java.io.*; @@ -56,10 +57,10 @@ byte[] bytes = out.toByteArray(); Content content = - new Content(url, url, bytes, contentType, new Properties()); + new Content(url, url, bytes, contentType, new ContentProperties()); Parse parse = ParseUtil.parseByParserId("parse-html",content); - Properties metadata = parse.getData().getMetadata(); + ContentProperties metadata = parse.getData().getMetadata(); 
assertEquals(license, metadata.get("License-Url")); assertEquals(location, metadata.get("License-Location")); assertEquals(type, metadata.get("Work-Type")); Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original) +++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Sat Dec 10 16:36:57 2005 @@ -29,6 +29,7 @@ import org.apache.nutch.net.protocols.HttpDateFormat; import org.apache.nutch.parse.Parse; +import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.indexer.IndexingFilter; import org.apache.nutch.indexer.IndexingException; @@ -86,7 +87,7 @@ String url = fo.getUrl().toString(); // normalize metaData (see note in the method below). - Properties metaData = normalizeMeta(parse.getData().getMetadata()); + ContentProperties metaData = normalizeMeta(parse.getData().getMetadata()); addTime(doc, metaData, url, fo); @@ -101,7 +102,7 @@ // Add time related meta info. Add last-modified if present. Index date as // last-modified, or, if that's not present, use fetch time. 
- private Document addTime(Document doc, Properties metaData, String url, + private Document addTime(Document doc, ContentProperties metaData, String url, FetcherOutput fo) { long time = -1; @@ -169,7 +170,7 @@ } // Add Content-Length - private Document addLength(Document doc, Properties metaData, String url) { + private Document addLength(Document doc, ContentProperties metaData, String url) { String contentLength = metaData.getProperty("content-length"); if (contentLength != null) @@ -179,7 +180,7 @@ } // Add Content-Type and its primaryType and subType - private Document addType(Document doc, Properties metaData, String url) { + private Document addType(Document doc, ContentProperties metaData, String url) { MimeType mimeType = null; String contentType = metaData.getProperty("content-type"); if (contentType == null) { @@ -259,7 +260,7 @@ } } - private Document resetTitle(Document doc, Properties metaData, String url) { + private Document resetTitle(Document doc, ContentProperties metaData, String url) { String contentDisposition = metaData.getProperty("content-disposition"); if (contentDisposition == null) return doc; @@ -284,8 +285,8 @@ // (*) empty header value // Note: the original metaData should be kept intact, // because there is a benefit to preserve whatever comes from server. 
- private Properties normalizeMeta(Properties old) { - Properties normalized = new Properties(); + private ContentProperties normalizeMeta(ContentProperties old) { + ContentProperties normalized = new ContentProperties(); for (Enumeration e = old.propertyNames(); e.hasMoreElements();) { String key = (String) e.nextElement(); Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original) +++ lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Sat Dec 10 16:36:57 2005 @@ -15,8 +15,7 @@ */ package org.apache.nutch.analysis.lang; -// JDK imports -import java.util.Properties; + // JUnit imports import junit.framework.TestCase; @@ -26,6 +25,7 @@ import org.apache.nutch.parse.Parser; import org.apache.nutch.parse.ParserFactory; import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ContentProperties; public class TestHTMLLanguageParser extends TestCase { @@ -122,7 +122,7 @@ private Content getContent(String text) { - Properties p = new Properties(); + ContentProperties p = new ContentProperties(); p.put("Content-Type", "text/html"); Content content = new Content(URL, BASE, text.getBytes(), "text/html", p); Modified: lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?rev=355828&r1=355827&r2=355828&view=diff 
============================================================================== --- lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java Sat Dec 10 16:36:57 2005 @@ -17,6 +17,7 @@ package org.apache.nutch.parse.ext; import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.parse.ParseStatus; import org.apache.nutch.parse.Parser; import org.apache.nutch.parse.Parse; @@ -155,7 +156,7 @@ Outlink[] outlinks = OutlinkExtractor.getOutlinks(text); // collect meta data - Properties metaData = new Properties(); + ContentProperties metaData = new ContentProperties(); metaData.putAll(content.getMetadata()); // copy through ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData); Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Sat Dec 10 16:36:57 2005 @@ -31,6 +31,7 @@ import org.apache.html.dom.*; import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.util.*; import org.apache.nutch.parse.*; @@ -106,7 +107,7 @@ String text = ""; String title = ""; Outlink[] outlinks = new Outlink[0]; - Properties metadata = new Properties(); + ContentProperties metadata = new ContentProperties(); // check that contentType is one we can handle String contentType = content.getContentType(); @@ 
-271,7 +272,7 @@ in.readFully(bytes); Parse parse = new HtmlParser().getParse(new Content(url,url, bytes,"text/html", - new Properties())); + new ContentProperties())); System.out.println("data: "+parse.getData()); System.out.println("text: "+parse.getText()); Modified: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original) +++ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Sat Dec 10 16:36:57 2005 @@ -22,6 +22,7 @@ import org.apache.nutch.parse.ParseStatus; import org.apache.nutch.parse.Parser; import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.util.LogFormatter; import org.apache.oro.text.regex.MatchResult; import org.apache.oro.text.regex.Pattern; @@ -56,7 +57,7 @@ walk(doc, parse, metaTags, url, outlinks); if (outlinks.size() > 0) { Outlink[] old = parse.getData().getOutlinks(); - Properties metadata = parse.getData().getMetadata(); + ContentProperties metadata = parse.getData().getMetadata(); String title = parse.getData().getTitle(); List list = Arrays.asList(old); outlinks.addAll(list); @@ -136,7 +137,7 @@ idx = Math.min(MAX_TITLE_LEN, script.length()); title = script.substring(0, idx); } - Properties metadata = new Properties(); + ContentProperties metadata = new ContentProperties(); metadata.putAll(c.getMetadata()); ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metadata); Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java Sat Dec 10 16:36:57 2005 @@ -30,6 +30,7 @@ import org.apache.nutch.parse.ParseStatus; import org.apache.nutch.parse.Parser; import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.util.LogFormatter; /** @@ -73,7 +74,7 @@ byte[] raw = getRawBytes(new File(file)); - Properties prop = new Properties(); + ContentProperties prop = new ContentProperties(); prop.setProperty("Content-Length", "" + raw.length); Content content = new Content(file, file, raw, MIME_TYPE, prop); @@ -130,7 +131,7 @@ } // collect meta data - final Properties metadata = new Properties(); + final ContentProperties metadata = new ContentProperties(); metadata.putAll(content.getMetadata()); // copy through if (properties != null) { Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java Sat Dec 10 16:36:57 2005 @@ -17,6 +17,7 @@ package org.apache.nutch.parse.msword; import org.apache.nutch.protocol.Content; 
+import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.util.LogFormatter; import org.apache.nutch.parse.ParseStatus; import org.apache.nutch.parse.Parser; @@ -102,7 +103,7 @@ } // collect meta data - Properties metadata = new Properties(); + ContentProperties metadata = new ContentProperties(); metadata.putAll(content.getMetadata()); // copy through if(properties != null) { Modified: lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java Sat Dec 10 16:36:57 2005 @@ -26,6 +26,7 @@ import org.pdfbox.exceptions.InvalidPasswordException; import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.util.LogFormatter; import org.apache.nutch.parse.ParseStatus; import org.apache.nutch.parse.Parser; @@ -165,7 +166,7 @@ Outlink[] outlinks = OutlinkExtractor.getOutlinks(text); // collect meta data - Properties metadata = new Properties(); + ContentProperties metadata = new ContentProperties(); metadata.putAll(content.getMetadata()); // copy through ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metadata); Modified: lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- 
lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Sat Dec 10 16:36:57 2005 @@ -19,13 +19,14 @@ import java.util.Properties; import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.parse.*; import org.apache.nutch.util.*; public class TextParser implements Parser { public Parse getParse(Content content) { // copy content meta data through - Properties metadata = new Properties(); + ContentProperties metadata = new ContentProperties(); metadata.putAll(content.getMetadata()); //ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new Outlink[0], metadata); Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java Sat Dec 10 16:36:57 2005 @@ -31,6 +31,7 @@ import org.apache.nutch.parse.ParseStatus; import org.apache.nutch.parse.Parser; import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.util.LogFormatter; /** @@ -87,7 +88,7 @@ } // collect meta data - final Properties metadata = new Properties(); + final ContentProperties metadata = new ContentProperties(); metadata.putAll(content.getMetadata()); // copy through if (resultText == null) { Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (original) +++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Sat Dec 10 16:36:57 2005 @@ -33,6 +33,7 @@ import org.apache.nutch.parse.ParseException; import org.apache.nutch.parse.Outlink; import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.util.LogFormatter; import org.apache.nutch.util.NutchConf; import org.apache.nutch.util.mime.MimeTypes; @@ -84,7 +85,7 @@ // Trying to resolve the Mime-Type String contentType = MIME.getMimeType(fname).getName(); try { - Properties metadata = new Properties(); + ContentProperties metadata = new ContentProperties(); metadata.setProperty("Content-Length", Long.toString(entry.getSize())); metadata.setProperty("Content-Type", contentType); Content content = new Content(newurl, base, b, contentType, metadata); Modified: lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Sat Dec 10 16:36:57 2005 @@ -25,6 +25,7 @@ // Nutch imports import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ContentProperties; 
/************************************ @@ -59,7 +60,7 @@ private String base; private byte[] content; private int code; - private Properties headers = new Properties(); + private ContentProperties headers = new ContentProperties(); private final File file; Modified: lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java Sat Dec 10 16:36:57 2005 @@ -25,6 +25,7 @@ import org.apache.commons.net.ftp.parser.ParserInitializationException; import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ContentProperties; import java.net.InetAddress; import java.net.URL; @@ -59,7 +60,7 @@ private String base; private byte[] content; private int code; - private Properties headers = new Properties(); + private ContentProperties headers = new ContentProperties(); private final Ftp ftp; Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Sat Dec 10 16:36:57 2005 @@ -32,6 +32,7 @@ import java.util.logging.Level; import 
org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.protocol.ProtocolException; import org.apache.nutch.util.GZIPUtils; @@ -44,7 +45,7 @@ private String base; private byte[] content; private int code; - private Properties headers = new Properties(); + private ContentProperties headers = new ContentProperties(); /** Returns the response code. */ public int getCode() { return code; } Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java Sat Dec 10 16:36:57 2005 @@ -11,6 +11,8 @@ import java.util.TreeMap; import java.util.logging.Level; import java.util.logging.Logger; + +import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.util.LogFormatter; import org.apache.nutch.util.NutchConf; @@ -44,7 +46,7 @@ private HttpAuthenticationFactory() { } - public static HttpAuthentication findAuthentication(Properties header) { + public static HttpAuthentication findAuthentication(ContentProperties header) { if (header == null) return null; try { Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java?rev=355828&r1=355827&r2=355828&view=diff 
============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java Sat Dec 10 16:36:57 2005 @@ -10,17 +10,18 @@ import java.util.Iterator; import java.util.logging.Logger; +import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.util.LogFormatter; /** - * An extension to {@link Properties} which allows multiple values for a single key. + * An extension to {@link ContentProperties} which allows multiple values for a single key. * The {@link #get(Object)} method may return a single value or a * {@link java.util.Collection} of values. * * @author Matt Tencati */ -public class MultiProperties extends Properties { +public class MultiProperties extends ContentProperties { public static final Logger LOG = LogFormatter .getLogger("net.nutch.protocol.http.MultiProperties"); @@ -31,7 +32,7 @@ */ public MultiProperties() { super(); - multiMap = new TreeMap(); + multiMap = new TreeMap(String.CASE_INSENSITIVE_ORDER); } /** @@ -41,7 +42,7 @@ */ public MultiProperties(Properties defaults) { super(defaults); - multiMap = new TreeMap(); + multiMap = new TreeMap(String.CASE_INSENSITIVE_ORDER); } /** Modified: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java?rev=355828&r1=355827&r2=355828&view=diff 
java.util.Properties; import org.apache.nutch.io.*; -import org.apache.nutch.pagedb.*; +import org.apache.nutch.protocol.ContentProperties; import junit.framework.TestCase; /** Unit tests for ParseData. */ @@ -36,7 +34,7 @@ new Outlink("http://bar.com/", "Bar") }; - Properties metaData = new Properties(); + ContentProperties metaData = new ContentProperties(); metaData.put("Language", "en/us"); metaData.put("Charset", "UTF-8"); Modified: lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Sat Dec 10 16:36:57 2005 @@ -16,10 +16,7 @@ package org.apache.nutch.protocol; -import java.io.*; -import java.util.Properties; import org.apache.nutch.io.*; -import org.apache.nutch.pagedb.*; import junit.framework.TestCase; /** Unit tests for Content. */ @@ -33,7 +30,7 @@ String url = "http://www.foo.com/"; - Properties metaData = new Properties(); + ContentProperties metaData = new ContentProperties(); metaData.put("Host", "www.foo.com"); metaData.put("Content-Type", "text/html"); @@ -41,12 +38,14 @@ metaData); TestWritable.testWritable(r); + assertEquals("text/html", r.getMetadata().get("Content-Type")); + assertEquals("text/html", r.getMetadata().get("content-type")); } /** Unit tests for getContentType(String, String, byte[]) method. 
*/ public void testGetContentType() throws Exception { Content c = null; - Properties p = new Properties(); + ContentProperties p = new ContentProperties(); c = new Content("http://www.foo.com/", "http://www.foo.com/", Modified: lucene/nutch/trunk/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java Sat Dec 10 16:36:57 2005 @@ -32,6 +32,7 @@ import org.apache.nutch.parse.ParseStatus; import org.apache.nutch.parse.ParseText; import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.protocol.ProtocolStatus; import junit.framework.TestCase; @@ -90,7 +91,7 @@ content.append("<p>" + k + " lines of text in the queue, " + k + " lines of text...</p>\n"); } content.append("</body></html>"); - Properties meta = new Properties(); + ContentProperties meta = new ContentProperties(); meta.setProperty("Content-Type", "text/html"); meta.setProperty("Host", "http://localhost"); meta.setProperty("Connection", "Keep-alive, close"); Modified: lucene/nutch/trunk/src/web/jsp/cached.jsp URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/web/jsp/cached.jsp?rev=355828&r1=355827&r2=355828&view=diff ============================================================================== --- lucene/nutch/trunk/src/web/jsp/cached.jsp (original) +++ lucene/nutch/trunk/src/web/jsp/cached.jsp Sat Dec 10 16:36:57 2005 @@ -7,6 +7,7 @@ import="org.apache.nutch.searcher.*" import="org.apache.nutch.parse.ParseData" + import="org.apache.nutch.protocol.ContentProperties" %><% NutchBean bean = NutchBean.get(application); bean.LOG.info("cache 
request from " + request.getRemoteAddr()); @@ -19,7 +20,7 @@ ResourceBundle.getBundle("org.nutch.jsp.cached", request.getLocale()) .getLocale().getLanguage(); - Properties metaData = bean.getParseData(details).getMetadata(); + ContentProperties metaData = bean.getParseData(details).getMetadata(); String content = null; String contentType = (String) metaData.get("Content-Type");