Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java Wed Feb 8 13:48:52 2006 @@ -20,6 +20,7 @@ import org.apache.poi.poifs.eventfilesystem.*; import org.apache.poi.poifs.filesystem.*; import org.apache.poi.util.LittleEndian; +import org.apache.nutch.metadata.Metadata; import java.util.*; import java.io.*; @@ -33,8 +34,7 @@ * code to extract all msword properties. * */ -public class WordExtractor -{ +public class WordExtractor { /** * Constructor @@ -276,39 +276,40 @@ /*Dates are being stored in millis since the epoch to aid localization*/ if(title != null) - properties.setProperty("Title", title); + properties.setProperty(Metadata.TITLE, title); if(applicationName != null) - properties.setProperty("Application-Name", applicationName); + properties.setProperty(Metadata.APPLICATION_NAME, applicationName); if(author != null) - properties.setProperty("Author", author); + properties.setProperty(Metadata.AUTHOR, author); if(charCount != 0) - properties.setProperty("Character Count", charCount + ""); + properties.setProperty(Metadata.CHARACTER_COUNT, charCount + ""); if(comments != null) - properties.setProperty("Comments", comments); + properties.setProperty(Metadata.COMMENTS, comments); if(createDateTime != null) - properties.setProperty("Creation-Date", createDateTime.getTime() + ""); + properties.setProperty(Metadata.DATE, + Metadata.DATE_FORMAT.format(createDateTime)); if(editTime != 0) - properties.setProperty("Edit-Time", editTime + ""); + 
properties.setProperty(Metadata.LAST_MODIFIED, editTime + ""); if(keywords != null) - properties.setProperty("Keywords", keywords); + properties.setProperty(Metadata.KEYWORDS, keywords); if(lastAuthor != null) - properties.setProperty("Last-Author", lastAuthor); + properties.setProperty(Metadata.LAST_AUTHOR, lastAuthor); if(lastPrinted != null) - properties.setProperty("Last-Printed", lastPrinted.getTime() + ""); + properties.setProperty(Metadata.LAST_PRINTED, lastPrinted.getTime() + ""); if(lastSaveDateTime != null) - properties.setProperty("Last-Save-Date", lastSaveDateTime.getTime() + ""); + properties.setProperty(Metadata.LAST_SAVED, lastSaveDateTime.getTime() + ""); if(pageCount != 0) - properties.setProperty("Page-Count", pageCount + ""); + properties.setProperty(Metadata.PAGE_COUNT, pageCount + ""); if(revNumber != null) - properties.setProperty("Revision-Number", revNumber); + properties.setProperty(Metadata.REVISION_NUMBER, revNumber); if(security != 0) - properties.setProperty("Security", security + ""); + properties.setProperty(Metadata.RIGHTS, security + ""); if(subject != null) - properties.setProperty("Subject", subject); + properties.setProperty(Metadata.SUBJECT, subject); if(template != null) - properties.setProperty("Template", template); + properties.setProperty(Metadata.TEMPLATE, template); if(wordCount != 0) - properties.setProperty("Word-Count", wordCount + ""); + properties.setProperty(Metadata.WORD_COUNT, wordCount + ""); propertiesBroker.setProperties(properties); //si.getThumbnail(); // can't think of a sensible way of turning this into a string.
Modified: lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java Wed Feb 8 13:48:52 2006 @@ -26,7 +26,8 @@ import org.pdfbox.exceptions.InvalidPasswordException; import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ContentProperties; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; import org.apache.hadoop.util.LogFormatter; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.parse.ParseStatus; @@ -89,12 +90,13 @@ String text = null; String title = null; + Metadata metadata = new Metadata(); try { byte[] raw = content.getContent(); - String contentLength = content.get("Content-Length"); + String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH); if (contentLength != null && raw.length != Integer.parseInt(contentLength)) { return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, @@ -102,8 +104,7 @@ +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse(getConf()); } - PDFParser parser = new PDFParser( - new ByteArrayInputStream(raw)); + PDFParser parser = new PDFParser(new ByteArrayInputStream(raw)); parser.parse(); pdf = parser.getPDDocument(); @@ -122,15 +123,18 @@ PDDocumentInformation info = pdf.getDocumentInformation(); title = info.getTitle(); // more useful info, currently not used. please keep them for future use. 
- // pdf.getPageCount(); - // info.getAuthor() - // info.getSubject() - // info.getKeywords() - // info.getCreator() - // info.getProducer() - // info.getTrapped() - // formatDate(info.getCreationDate()) - // formatDate(info.getModificationDate()) + metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getPageCount())); + metadata.add(Metadata.AUTHOR, info.getAuthor()); + metadata.add(Metadata.SUBJECT, info.getSubject()); + metadata.add(Metadata.KEYWORDS, info.getKeywords()); + metadata.add(Metadata.CREATOR, info.getCreator()); + metadata.add(Metadata.PUBLISHER, info.getProducer()); + + //TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM + //error here + + //metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime())); + //metadata.put(LAST_MODIFIED, dcDateFormatter.format(info.getModificationDate().getTime())); } catch (CryptographyException e) { return new ParseStatus(ParseStatus.FAILED, @@ -139,6 +143,8 @@ return new ParseStatus(ParseStatus.FAILED, "Can't decrypt document - invalid password. " + e).getEmptyParse(getConf()); } catch (Exception e) { // run time exception + LOG.warning("General exception in PDF parser: "+e.getMessage()); + e.printStackTrace(); return new ParseStatus(ParseStatus.FAILED, "Can't be handled as pdf document. " + e).getEmptyParse(getConf()); } finally { @@ -159,11 +165,9 @@ // collect outlink Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf()); - // collect meta data - ContentProperties metadata = new ContentProperties(); - metadata.putAll(content.getMetadata()); // copy through - - ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metadata); + ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, + outlinks, content.getMetadata(), + metadata); parseData.setConf(this.conf); return new ParseImpl(text, parseData); // any filter? 
Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java (original) +++ lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java Wed Feb 8 13:48:52 2006 @@ -18,7 +18,16 @@ import org.apache.nutch.parse.*; import org.apache.nutch.protocol.Content; +<<<<<<< .mine +<<<<<<< .mine +import org.apache.nutch.util.MetadataNames; + +======= +import org.apache.nutch.util.NutchConf; +======= import org.apache.hadoop.conf.Configuration; +>>>>>>> .r374853 +>>>>>>> .r373941 import java.io.ByteArrayInputStream; import java.io.InputStreamReader; import java.io.Reader; @@ -31,7 +40,7 @@ * * @author Andy Hedges */ -public class RTFParseFactory implements Parser { +public class RTFParseFactory implements Parser, MetadataNames { private Configuration conf; @@ -53,10 +62,13 @@ Properties metadata = new Properties(); metadata.putAll(content.getMetadata()); metadata.putAll(delegate.getMetaData()); - String title = metadata.getProperty("title"); + String title = metadata.getProperty(TITLE); if (title != null) { - metadata.remove(title); + //(CM): Why remove the title metadata property here? Even + //though it's stored in the ParseData, it still might be useful + //to have via this properties object? 
+ //metadata.remove(title); } else { title = ""; } Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java (original) +++ lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java Wed Feb 8 13:48:52 2006 @@ -22,18 +22,20 @@ import java.util.List; import java.util.Properties; +import org.apache.nutch.util.MetadataNames; + /** * A parser delegate for handling rtf events. * @author Andy Hedges */ -public class RTFParserDelegateImpl implements RTFParserDelegate { +public class RTFParserDelegateImpl implements RTFParserDelegate, MetadataNames { String tabs = ""; Properties metadata = new Properties(); - String[] META_NAMES_TEXT = {"title", "subject", "author", "manager", - "company", "operator", "category", "keywords", - "comment", "doccomm", "hlinkbase"}; + String[] META_NAMES_TEXT = {TITLE, SUBJECT, AUTHOR, "manager", + "company", "operator", "category", KEYWORDS, + COMMENTS, "doccomm", "hlinkbase"}; String[] META_NAMES_DATE = {"creatim", "creatim", "printim", "buptim"}; String metaName = ""; Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java (original) +++ 
lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java Wed Feb 8 13:48:52 2006 @@ -25,7 +25,15 @@ import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.ProtocolException; import org.apache.nutch.protocol.ProtocolFactory; +<<<<<<< .mine +<<<<<<< .mine +import org.apache.nutch.util.MetadataNames; +======= +import org.apache.nutch.util.NutchConf; +======= import org.apache.hadoop.conf.Configuration; +>>>>>>> .r374853 +>>>>>>> .r373941 import java.util.Properties; @@ -34,7 +42,7 @@ * * @author Andy Hedges */ -public class TestRTFParser extends TestCase { +public class TestRTFParser extends TestCase implements MetadataNames { private String fileSeparator = System.getProperty("file.separator"); // This system property is defined in ./src/plugin/build-plugin.xml @@ -73,7 +81,7 @@ String title = parse.getData().getTitle(); Properties meta = parse.getData().getMetadata(); assertEquals("test rft document", title); - assertEquals("tests", meta.getProperty("subject")); + assertEquals("tests", meta.getProperty(SUBJECT)); Modified: lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java Wed Feb 8 13:48:52 2006 @@ -21,9 +21,10 @@ import java.util.*; import java.util.logging.Logger; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; import org.apache.nutch.parse.*; import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ContentProperties; import org.apache.hadoop.util.LogFormatter; 
import org.apache.hadoop.conf.Configuration; @@ -60,19 +61,17 @@ public Parse getParse(Content content) { String text = null; - // collect meta data - ContentProperties metadata = new ContentProperties(); - metadata.putAll(content.getMetadata()); // copy through Vector outlinks = new Vector(); try { byte[] raw = content.getContent(); - String contentLength = content.get("Content-Length"); + String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH); if (contentLength != null && raw.length != Integer.parseInt(contentLength)) { - return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length - + " bytes. Parser can't handle incomplete files.").getEmptyParse(conf); + return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, + "Content truncated at " + raw.length + + " bytes. Parser can't handle incomplete files.").getEmptyParse(conf); } ExtractText extractor = new ExtractText(); @@ -106,7 +105,8 @@ if (text == null) text = ""; Outlink[] links = (Outlink[]) outlinks.toArray(new Outlink[outlinks.size()]); - ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", links, metadata); + ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", links, + content.getMetadata()); return new ParseImpl(text, parseData); } @@ -119,8 +119,10 @@ byte[] buf = new byte[in.available()]; in.read(buf); SWFParser parser = new SWFParser(); - Parse p = parser.getParse(new Content("file:" + args[0], "file:" + args[0], buf, "application/x-shockwave-flash", - new ContentProperties(), NutchConfiguration.create())); + Parse p = parser.getParse(new Content("file:" + args[0], "file:" + args[0], + buf, "application/x-shockwave-flash", + new Metadata(), + NutchConfiguration.create())); System.out.println("Parse Text:"); System.out.println(p.getText()); System.out.println("Parse Data:"); Modified: lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Wed Feb 8 13:48:52 2006 @@ -17,7 +17,6 @@ package org.apache.nutch.parse.text; import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.parse.*; import org.apache.nutch.util.*; @@ -27,9 +26,6 @@ private Configuration conf; public Parse getParse(Content content) { - // copy content meta data through - ContentProperties metadata = new ContentProperties(); - metadata.putAll(content.getMetadata()); // ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new // Outlink[0], metadata); @@ -49,7 +45,7 @@ text = new String(content.getContent()); // use default encoding } ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", - OutlinkExtractor.getOutlinks(text, getConf()), metadata); + OutlinkExtractor.getOutlinks(text, getConf()), content.getMetadata()); parseData.setConf(this.conf); return new ParseImpl(text, parseData); Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java Wed Feb 8 13:48:52 2006 @@ -23,6 +23,8 @@ import java.util.ArrayList; import 
java.util.List; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; import org.apache.nutch.parse.Outlink; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseData; @@ -30,7 +32,6 @@ import org.apache.nutch.parse.ParseStatus; import org.apache.nutch.parse.Parser; import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ContentProperties; import org.apache.hadoop.util.LogFormatter; import org.apache.hadoop.conf.Configuration; @@ -59,7 +60,7 @@ Properties properties = null; try { - final String contentLen = content.get("Content-Length"); + final String contentLen = content.getMetadata().get(Response.CONTENT_LENGTH); final int len = Integer.parseInt(contentLen); System.out.println("ziplen: " + len); final byte[] contentInBytes = content.getContent(); @@ -86,10 +87,6 @@ "Can't be handled as Zip document. " + e).getEmptyParse(getConf()); } - // collect meta data - final ContentProperties metadata = new ContentProperties(); - metadata.putAll(content.getMetadata()); // copy through - if (resultText == null) { resultText = ""; } @@ -100,7 +97,8 @@ outlinks = (Outlink[]) outLinksList.toArray(new Outlink[0]); final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, - resultTitle, outlinks, metadata); + resultTitle, outlinks, + content.getMetadata()); parseData.setConf(this.conf); LOG.finest("Zip file parsed sucessfully !!"); Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (original) +++ 
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Wed Feb 8 13:48:52 2006 @@ -26,13 +26,14 @@ import java.net.URL; // Nutch imports +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.parse.ParseException; import org.apache.nutch.parse.Outlink; import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ContentProperties; import org.apache.hadoop.util.LogFormatter; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.util.mime.MimeTypes; @@ -87,9 +88,9 @@ // Trying to resolve the Mime-Type String contentType = MIME.getMimeType(fname).getName(); try { - ContentProperties metadata = new ContentProperties(); - metadata.setProperty("Content-Length", Long.toString(entry.getSize())); - metadata.setProperty("Content-Type", contentType); + Metadata metadata = new Metadata(); + metadata.set(Response.CONTENT_LENGTH, Long.toString(entry.getSize())); + metadata.set(Response.CONTENT_TYPE, contentType); Content content = new Content(newurl, base, b, contentType, metadata, this.conf); Parse parse = new ParseUtil(this.conf).parse(content); ParseData theParseData = parse.getData(); Modified: lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java Wed Feb 8 13:48:52 2006 @@ -19,7 +19,9 @@ import 
org.apache.nutch.crawl.CrawlDatum; import org.apache.hadoop.io.UTF8; +import org.apache.nutch.metadata.Metadata; import org.apache.nutch.net.protocols.HttpDateFormat; +import org.apache.nutch.net.protocols.Response; import org.apache.hadoop.util.LogFormatter; import org.apache.hadoop.conf.Configuration; @@ -144,8 +146,10 @@ Content content = file.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent(); System.err.println("Content-Type: " + content.getContentType()); - System.err.println("Content-Length: " + content.get("Content-Length")); - System.err.println("Last-Modified: " + content.get("Last-Modified")); + System.err.println("Content-Length: " + + content.getMetadata().get(Response.CONTENT_LENGTH)); + System.err.println("Last-Modified: " + + content.getMetadata().get(Response.LAST_MODIFIED)); if (dumpContent) { System.out.print(new String(content.getContent())); } Modified: lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Wed Feb 8 13:48:52 2006 @@ -18,6 +18,7 @@ // JDK imports import java.net.URL; +import java.util.Date; import java.util.TreeMap; import java.util.logging.Level; import java.io.IOException; @@ -25,7 +26,10 @@ // Nutch imports import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ContentProperties; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; + +// Hadoop imports import 
org.apache.hadoop.conf.Configuration; @@ -61,7 +65,7 @@ private String base; private byte[] content; private int code; - private ContentProperties headers = new ContentProperties(); + private Metadata headers = new Metadata(); private final File file; private Configuration conf; @@ -71,17 +75,17 @@ /** Returns the value of a named header. */ public String getHeader(String name) { - return (String)headers.get(name); + return headers.get(name); } public byte[] getContent() { return content; } public Content toContent() { return new Content(orig, base, content, - getHeader("Content-Type"), + getHeader(Response.CONTENT_TYPE), headers, this.conf); } - + public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf) throws FileException, IOException { @@ -124,10 +128,8 @@ // where case is insensitive if (!f.equals(f.getCanonicalFile())) { // set headers - TreeMap hdrs = new TreeMap(String.CASE_INSENSITIVE_ORDER); //hdrs.put("Location", f.getCanonicalFile().toURI()); - hdrs.put("Location", f.getCanonicalFile().toURL().toString()); - this.headers.putAll(hdrs); + headers.set(Response.LOCATION, f.getCanonicalFile().toURL().toString()); this.code = 300; // http redirect return; @@ -181,16 +183,10 @@ is.close(); // set headers - TreeMap hdrs = new TreeMap(String.CASE_INSENSITIVE_ORDER); - - hdrs.put("Content-Length", new Long(size).toString()); - - hdrs.put("Last-Modified", + headers.set(Response.CONTENT_LENGTH, new Long(size).toString()); + headers.set(Response.LAST_MODIFIED, this.file.httpDateFormat.toString(f.lastModified())); - - hdrs.put("Content-Type", ""); // No Content-Type at file protocol level - - this.headers.putAll(hdrs); + headers.set(Response.CONTENT_TYPE, ""); // No Content-Type at file protocol level // response code this.code = 200; // http OK @@ -204,17 +200,11 @@ this.content = list2html(f.listFiles(), path, "/".equals(path) ? 
false : true); // set headers - TreeMap hdrs = new TreeMap(String.CASE_INSENSITIVE_ORDER); - - hdrs.put("Content-Length", + headers.set(Response.CONTENT_LENGTH, new Integer(this.content.length).toString()); - - hdrs.put("Content-Type", "text/html"); - - hdrs.put("Last-Modified", + headers.set(Response.CONTENT_TYPE, "text/html"); + headers.set(Response.LAST_MODIFIED, this.file.httpDateFormat.toString(f.lastModified())); - - this.headers.putAll(hdrs); // response code this.code = 200; // http OK Modified: lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java Wed Feb 8 13:48:52 2006 @@ -22,6 +22,7 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.hadoop.io.UTF8; import org.apache.nutch.net.protocols.HttpDateFormat; +import org.apache.nutch.net.protocols.Response; import org.apache.hadoop.util.LogFormatter; import org.apache.hadoop.conf.Configuration; @@ -212,8 +213,10 @@ Content content = ftp.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent(); System.err.println("Content-Type: " + content.getContentType()); - System.err.println("Content-Length: " + content.get("Content-Length")); - System.err.println("Last-Modified: " + content.get("Last-Modified")); + System.err.println("Content-Length: " + + content.getMetadata().get(Response.CONTENT_LENGTH)); + System.err.println("Last-Modified: " + + content.getMetadata().get(Response.LAST_MODIFIED)); if (dumpContent) { System.out.print(new String(content.getContent())); } Modified: 
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java Wed Feb 8 13:48:52 2006 @@ -26,7 +26,9 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ContentProperties; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; + import org.apache.hadoop.conf.Configuration; import java.net.InetAddress; @@ -34,11 +36,9 @@ import java.util.List; import java.util.LinkedList; - import java.util.logging.Level; import java.io.ByteArrayOutputStream; -//import java.io.InputStream; import java.io.IOException; @@ -61,7 +61,7 @@ private String base; private byte[] content; private int code; - private ContentProperties headers = new ContentProperties(); + private Metadata headers = new Metadata(); private final Ftp ftp; private Configuration conf; @@ -71,14 +71,14 @@ /** Returns the value of a named header. 
*/ public String getHeader(String name) { - return (String)headers.get(name); + return headers.get(name); } public byte[] getContent() { return content; } public Content toContent() { return new Content(orig, base, content, - getHeader("Content-Type"), + getHeader(Response.CONTENT_TYPE), headers, this.conf); } @@ -294,11 +294,11 @@ ftp.client.retrieveFile(path, os, ftp.maxContentLength); FTPFile ftpFile = (FTPFile) list.get(0); - this.headers.put("Content-Length", - new Long(ftpFile.getSize()).toString()); + this.headers.set(Response.CONTENT_LENGTH, + new Long(ftpFile.getSize()).toString()); //this.headers.put("content-type", "text/html"); - this.headers.put("Last-Modified", - ftp.httpDateFormat.toString(ftpFile.getTimestamp())); + this.headers.set(Response.LAST_MODIFIED, + ftp.httpDateFormat.toString(ftpFile.getTimestamp())); this.content = os.toByteArray(); // // approximate bytes sent and read @@ -330,11 +330,11 @@ } FTPFile ftpFile = (FTPFile) list.get(0); - this.headers.put("Content-Length", - new Long(ftpFile.getSize()).toString()); + this.headers.set(Response.CONTENT_LENGTH, + new Long(ftpFile.getSize()).toString()); //this.headers.put("content-type", "text/html"); - this.headers.put("Last-Modified", - ftp.httpDateFormat.toString(ftpFile.getTimestamp())); + this.headers.set(Response.LAST_MODIFIED, + ftp.httpDateFormat.toString(ftpFile.getTimestamp())); this.content = os.toByteArray(); // // approximate bytes sent and read @@ -349,7 +349,7 @@ if (FTPReply.isPositiveCompletion(ftp.client.cwd(path))) { // it is not a file, but dir, so redirect as a dir - this.headers.put("Location", path + "/"); + this.headers.set(Response.LOCATION, path + "/"); this.code = 300; // http redirect // fixme, should we do ftp.client.cwd("/"), back to top dir? } else { @@ -386,9 +386,9 @@ ftp.client.retrieveList(null, list, ftp.maxContentLength, ftp.parser); this.content = list2html(list, path, "/".equals(path) ? 
false : true); - this.headers.put("Content-Length", - new Integer(this.content.length).toString()); - this.headers.put("Content-Type", "text/html"); + this.headers.set(Response.CONTENT_LENGTH, + new Integer(this.content.length).toString()); + this.headers.set(Response.CONTENT_TYPE, "text/html"); // this.headers.put("Last-Modified", null); // // approximate bytes sent and read @@ -408,9 +408,9 @@ ftp.client = null; this.content = list2html(list, path, "/".equals(path) ? false : true); - this.headers.put("Content-Length", - new Integer(this.content.length).toString()); - this.headers.put("Content-Type", "text/html"); + this.headers.set(Response.CONTENT_LENGTH, + new Integer(this.content.length).toString()); + this.headers.set(Response.CONTENT_TYPE, "text/html"); // this.headers.put("Last-Modified", null); // // approximate bytes sent and read Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Wed Feb 8 13:48:52 2006 @@ -28,12 +28,13 @@ import java.net.URL; import java.util.Map; import java.util.TreeMap; +import java.util.Date; import java.util.logging.Level; // Nutch imports import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Metadata; import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.protocol.ProtocolException; import org.apache.nutch.protocol.http.api.HttpBase; import org.apache.nutch.protocol.http.api.HttpException; @@ -49,7 +50,7 @@ 
private String base; private byte[] content; private int code; - private ContentProperties headers = new ContentProperties(); + private Metadata headers = new Metadata(); public HttpResponse(HttpBase http, URL url, CrawlDatum datum) @@ -141,13 +142,13 @@ // parse status code line this.code = parseStatusLine(in, line); // parse headers - headers.putAll(parseHeaders(in, line)); + parseHeaders(in, line); haveSeenNonContinueStatus= code != 100; // 100 is "Continue" } readPlainContent(in); - String contentEncoding= getHeader("Content-Encoding"); + String contentEncoding = getHeader(Response.CONTENT_ENCODING); if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) { Http.LOG.fine("uncompressing...."); byte[] compressed = content; @@ -187,10 +188,10 @@ } public String getHeader(String name) { - return (String) headers.get(name); + return headers.get(name); } - public ContentProperties getHeaders() { + public Metadata getHeaders() { return headers; } @@ -207,7 +208,7 @@ throws HttpException, IOException { int contentLength = Integer.MAX_VALUE; // get content length - String contentLengthString = (String)headers.get("Content-Length"); + String contentLengthString = headers.get(Response.CONTENT_LENGTH); if (contentLengthString != null) { contentLengthString = contentLengthString.trim(); try { @@ -333,8 +334,9 @@ } - private void processHeaderLine(StringBuffer line, TreeMap headers) + private void processHeaderLine(StringBuffer line) throws IOException, HttpException { + int colonIndex = line.indexOf(":"); // key is up to colon if (colonIndex == -1) { int i; @@ -355,20 +357,14 @@ valueStart++; } String value = line.substring(valueStart); - - headers.put(key, value); + headers.set(key, value); } - private Map parseHeaders(PushbackInputStream in, StringBuffer line) - throws IOException, HttpException { - TreeMap headers = new TreeMap(String.CASE_INSENSITIVE_ORDER); - return parseHeaders(in, line, headers); - } - // Adds headers to an existing TreeMap - private 
Map parseHeaders(PushbackInputStream in, StringBuffer line, - TreeMap headers) + // Adds headers to our headers Metadata + private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException { + while (readLine(in, line, true) != 0) { // handle HTTP responses with missing blank line after headers @@ -381,18 +377,21 @@ line.setLength(pos); try { - processHeaderLine(line, headers); + //TODO: (CM) We don't know the header names here + //since we're just handling them generically. It would + //be nice to provide some sort of mapping function here + //for the returned header names to the standard metadata + //names in the ParseData class + processHeaderLine(line); } catch (Exception e) { // fixme: e.printStackTrace(); } - - return headers; + return; } - processHeaderLine(line, headers); + processHeaderLine(line); } - return headers; } private static int readLine(PushbackInputStream in, StringBuffer line, Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java Wed Feb 8 13:48:52 2006 @@ -11,7 +11,8 @@ import java.util.logging.Level; import java.util.logging.Logger; -import org.apache.nutch.protocol.ContentProperties; +import org.apache.nutch.metadata.Metadata; + import org.apache.hadoop.util.LogFormatter; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configurable; @@ -35,7 +36,7 @@ * 
The HTTP Authentication (WWW-Authenticate) header which is returned * by a webserver requiring authentication. */ - public static final String AUTH_HEADER = "WWW-Authenticate"; + public static final String WWW_AUTHENTICATE = "WWW-Authenticate"; public static final Logger LOG = LogFormatter.getLogger(HttpAuthenticationFactory.class.getName()); @@ -72,13 +73,14 @@ * ---------------------------------- */ - public HttpAuthentication findAuthentication(ContentProperties header) { + public HttpAuthentication findAuthentication(Metadata header) { + if (header == null) return null; try { Collection challenge = null; - if (header instanceof ContentProperties) { - Object o = header.get(AUTH_HEADER); + if (header instanceof Metadata) { + Object o = header.get(WWW_AUTHENTICATE); if (o instanceof Collection) { challenge = (Collection) o; } else { @@ -86,7 +88,7 @@ challenge.add(o.toString()); } } else { - String challengeString = header.getProperty(AUTH_HEADER); + String challengeString = header.get(WWW_AUTHENTICATE); if (challengeString != null) { challenge = new ArrayList(); challenge.add(challengeString); Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Wed Feb 8 13:48:52 2006 @@ -20,6 +20,7 @@ import java.io.IOException; import java.io.InputStream; import java.net.URL; +import java.util.Date; // HTTP Client imports import org.apache.commons.httpclient.Header; @@ -30,8 +31,8 @@ // Nutch imports 
import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Metadata; import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.protocol.http.api.HttpBase; @@ -52,7 +53,7 @@ private int code; - private ContentProperties headers = new ContentProperties(); + private Metadata headers = new Metadata(); public HttpResponse(HttpBase http, URL url, CrawlDatum datum) throws IOException { @@ -85,8 +86,9 @@ Header[] heads = get.getResponseHeaders(); for (int i = 0; i < heads.length; i++) { - headers.setProperty(heads[i].getName(), heads[i].getValue()); + headers.set(heads[i].getName(), heads[i].getValue()); } + // always read content. Sometimes content is useful to find a cause // for error. try { @@ -131,10 +133,10 @@ } public String getHeader(String name) { - return (String) headers.get(name); + return headers.get(name); } - public ContentProperties getHeaders() { + public Metadata getHeaders() { return headers; } Added: lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java?rev=376089&view=auto ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java (added) +++ lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java Wed Feb 8 13:48:52 2006 @@ -0,0 +1,268 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.metadata; + +// JDK imports +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.Properties; +import junit.framework.Test; + +// JUnit imports +import junit.framework.TestCase; +import junit.framework.TestSuite; +import junit.textui.TestRunner; + +// Nutch imports +import org.apache.nutch.metadata.Metadata; + + +/** + * JUnit based tests of class {@link org.apache.nutch.metadata.Metadata}. + * + * @author Chris Mattmann + * @author Jérôme Charron + */ +public class TestMetadata extends TestCase { + + + public TestMetadata(String testName) { + super(testName); + } + + public static Test suite() { + return new TestSuite(TestMetadata.class); + } + + public static void main(String[] args) { + TestRunner.run(suite()); + } + + + /** Test for the <code>getNormalizedName(String)</code> method. */ + public void testGetNormalizedName() { + assertEquals("Content-Type", Metadata.getNormalizedName("Content-Type")); + assertEquals("Content-Type", Metadata.getNormalizedName("ContentType")); + assertEquals("Content-Type", Metadata.getNormalizedName("Content-type")); + assertEquals("Content-Type", Metadata.getNormalizedName("contenttype")); + assertEquals("Content-Type", Metadata.getNormalizedName("contentype")); + assertEquals("Content-Type", Metadata.getNormalizedName("contntype")); + } + + /** Test for the <code>add(String, String)</code> method.
*/ + public void testAdd() { + String[] values = null; + Metadata meta = new Metadata(); + + values = meta.getValues("contentype"); + assertEquals(0, values.length); + + meta.add("contentype", "value1"); + values = meta.getValues("contentype"); + assertEquals(1, values.length); + assertEquals("value1", values[0]); + + meta.add("Content-Type", "value2"); + values = meta.getValues("contentype"); + assertEquals(2, values.length); + assertEquals("value1", values[0]); + assertEquals("value2", values[1]); + + // NOTE : For now, the same value can be added many times. + // Should it be changed? + meta.add("ContentType", "value1"); + values = meta.getValues("Content-Type"); + assertEquals(3, values.length); + assertEquals("value1", values[0]); + assertEquals("value2", values[1]); + assertEquals("value1", values[2]); + } + + /** Test for the <code>set(String, String)</code> method. */ + public void testSet() { + String[] values = null; + Metadata meta = new Metadata(); + + values = meta.getValues("contentype"); + assertEquals(0, values.length); + + meta.set("contentype", "value1"); + values = meta.getValues("contentype"); + assertEquals(1, values.length); + assertEquals("value1", values[0]); + + meta.set("Content-Type", "value2"); + values = meta.getValues("contentype"); + assertEquals(1, values.length); + assertEquals("value2", values[0]); + + meta.set("contenttype", "new value 1"); + meta.add("contenttype", "new value 2"); + values = meta.getValues("contentype"); + assertEquals(2, values.length); + assertEquals("new value 1", values[0]); + assertEquals("new value 2", values[1]); + } + + /** Test for <code>setAll(Properties)</code> method */ + public void testSetProperties() { + String[] values = null; + Metadata meta = new Metadata(); + Properties props = new Properties(); + + meta.setAll(props); + assertEquals(0, meta.size()); + + props.setProperty("name-one", "value1.1"); + meta.setAll(props); + assertEquals(1, meta.size()); + values = meta.getValues("name-one"); + 
assertEquals(1, values.length); + assertEquals("value1.1", values[0]); + + props.setProperty("name-two", "value2.1"); + meta.setAll(props); + assertEquals(2, meta.size()); + values = meta.getValues("name-one"); + assertEquals(1, values.length); + assertEquals("value1.1", values[0]); + values = meta.getValues("name-two"); + assertEquals(1, values.length); + assertEquals("value2.1", values[0]); + } + + /** Test for <code>get(String)</code> method */ + public void testGet() { + String[] values = null; + Metadata meta = new Metadata(); + assertNull(meta.get("a-name")); + + meta.add("a-name", "value-1"); + assertEquals("value-1", meta.get("a-name")); + meta.add("a-name", "value-2"); + assertEquals("value-1", meta.get("a-name")); + } + + /** Test for <code>isMultiValued()</code> method */ + public void testIsMultiValued() { + Metadata meta = new Metadata(); + assertFalse(meta.isMultiValued("key")); + meta.add("key", "value1"); + assertFalse(meta.isMultiValued("key")); + meta.add("key", "value2"); + assertTrue(meta.isMultiValued("key")); + } + + /** Test for <code>names</code> method */ + public void testNames() { + String[] names = null; + Metadata meta = new Metadata(); + names = meta.names(); + assertEquals(0, names.length); + + meta.add("name-one", "value"); + names = meta.names(); + assertEquals(1, names.length); + assertEquals("name-one", names[0]); + meta.add("name-two", "value"); + names = meta.names(); + assertEquals(2, names.length); + } + + /** Test for <code>remove(String)</code> method */ + public void testRemove() { + Metadata meta = new Metadata(); + meta.remove("name-one"); + assertEquals(0, meta.size()); + meta.add("name-one", "value-1.1"); + meta.add("name-one", "value-1.2"); + meta.add("name-two", "value-2.2"); + assertEquals(2, meta.size()); + assertNotNull(meta.get("name-one")); + assertNotNull(meta.get("name-two")); + meta.remove("name-one"); + assertEquals(1, meta.size()); + assertNull(meta.get("name-one")); + assertNotNull(meta.get("name-two")); + 
meta.remove("name-two"); + assertEquals(0, meta.size()); + assertNull(meta.get("name-one")); + assertNull(meta.get("name-two")); + } + + /** Test for <code>equals(Object)</code> method */ + public void testObject() { + Metadata meta1 = new Metadata(); + Metadata meta2 = new Metadata(); + assertFalse(meta1.equals(null)); + assertFalse(meta1.equals("String")); + assertTrue(meta1.equals(meta2)); + meta1.add("name-one", "value-1.1"); + assertFalse(meta1.equals(meta2)); + meta2.add("name-one", "value-1.1"); + assertTrue(meta1.equals(meta2)); + meta1.add("name-one", "value-1.2"); + assertFalse(meta1.equals(meta2)); + meta2.add("name-one", "value-1.2"); + assertTrue(meta1.equals(meta2)); + meta1.add("name-two", "value-2.1"); + assertFalse(meta1.equals(meta2)); + meta2.add("name-two", "value-2.1"); + assertTrue(meta1.equals(meta2)); + meta1.add("name-two", "value-2.2"); + assertFalse(meta1.equals(meta2)); + meta2.add("name-two", "value-2.x"); + assertFalse(meta1.equals(meta2)); + } + + /** Test for <code>Writable</code> implementation */ + public void testWritable() { + Metadata result = null; + Metadata meta = new Metadata(); + result = writeRead(meta); + assertEquals(0, result.size()); + meta.add("name-one", "value-1.1"); + result = writeRead(meta); + assertEquals(1, result.size()); + assertEquals(1, result.getValues("name-one").length); + assertEquals("value-1.1", result.get("name-one")); + meta.add("name-two", "value-2.1"); + meta.add("name-two", "value-2.2"); + result = writeRead(meta); + assertEquals(2, result.size()); + assertEquals(1, result.getValues("name-one").length); + assertEquals("value-1.1", result.getValues("name-one")[0]); + assertEquals(2, result.getValues("name-two").length); + assertEquals("value-2.1", result.getValues("name-two")[0]); + assertEquals("value-2.2", result.getValues("name-two")[1]); + } + + private Metadata writeRead(Metadata meta) { + Metadata readed = new Metadata(); + try { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + 
meta.write(new DataOutputStream(out)); + readed.readFields(new DataInputStream(new ByteArrayInputStream(out.toByteArray()))); + } catch (IOException ioe) { + fail(ioe.toString()); + } + return readed; + } + +} Propchange: lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java Wed Feb 8 13:48:52 2006 @@ -20,7 +20,7 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.WritableTestUtils; -import org.apache.nutch.protocol.ContentProperties; +import org.apache.nutch.metadata.Metadata; import junit.framework.TestCase; @@ -41,9 +41,9 @@ new Outlink("http://bar.com/", "Bar", conf) }; - ContentProperties metaData = new ContentProperties(); - metaData.put("Language", "en/us"); - metaData.put("Charset", "UTF-8"); + Metadata metaData = new Metadata(); + metaData.add("Language", "en/us"); + metaData.add("Charset", "UTF-8"); ParseData r = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData); r.setConf(conf); Modified: lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java (original) +++ 
lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Wed Feb 8 13:48:52 2006 @@ -16,13 +16,14 @@ package org.apache.nutch.protocol; +import org.apache.nutch.metadata.Metadata; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.util.NutchConfiguration; - import org.apache.nutch.util.WritableTestUtils; import junit.framework.TestCase; + /** Unit tests for Content. */ public class TestContent extends TestCase { @@ -37,9 +38,9 @@ String url = "http://www.foo.com/"; - ContentProperties metaData = new ContentProperties(); - metaData.put("Host", "www.foo.com"); - metaData.put("Content-Type", "text/html"); + Metadata metaData = new Metadata(); + metaData.add("Host", "www.foo.com"); + metaData.add("Content-Type", "text/html"); Content r = new Content(url, url, page.getBytes("UTF8"), "text/html", metaData, conf); @@ -47,12 +48,13 @@ WritableTestUtils.testWritable(r); assertEquals("text/html", r.getMetadata().get("Content-Type")); assertEquals("text/html", r.getMetadata().get("content-type")); + assertEquals("text/html", r.getMetadata().get("CONTENTYPE")); } /** Unit tests for getContentType(String, String, byte[]) method. 
*/ public void testGetContentType() throws Exception { Content c = null; - ContentProperties p = new ContentProperties(); + Metadata p = new Metadata(); c = new Content("http://www.foo.com/", "http://www.foo.com/", Modified: lucene/nutch/trunk/src/web/jsp/cached.jsp URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/web/jsp/cached.jsp?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/web/jsp/cached.jsp (original) +++ lucene/nutch/trunk/src/web/jsp/cached.jsp Wed Feb 8 13:48:52 2006 @@ -6,7 +6,7 @@ import="org.apache.nutch.searcher.*" import="org.apache.nutch.parse.ParseData" - import="org.apache.nutch.protocol.ContentProperties" + import="org.apache.nutch.metadata.Metadata" import="org.apache.hadoop.conf.Configuration" import="org.apache.nutch.util.NutchConfiguration" %><% @@ -26,10 +26,10 @@ ResourceBundle.getBundle("org.nutch.jsp.cached", request.getLocale()) .getLocale().getLanguage(); - ContentProperties metaData = bean.getParseData(details).getMetadata(); + Metadata metaData = bean.getParseData(details).getContentMeta(); String content = null; - String contentType = (String) metaData.get("Content-Type"); + String contentType = (String) metaData.get(Metadata.CONTENT_TYPE); if (contentType.startsWith("text/html")) { // FIXME : it's better to emit the original 'byte' sequence // with 'charset' set to the value of 'CharEncoding',