Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java Wed Feb 8 13:48:52 2006 @@ -20,6 +20,7 @@ import org.apache.poi.poifs.eventfilesystem.*; import org.apache.poi.poifs.filesystem.*; import org.apache.poi.util.LittleEndian; +import org.apache.nutch.metadata.Metadata; import java.util.*; import java.io.*; @@ -33,8 +34,7 @@ * code to extract all msword properties. * */ -public class WordExtractor -{ +public class WordExtractor { /** * Constructor @@ -276,39 +276,40 @@ /*Dates are being stored in millis since the epoch to aid localization*/ if(title != null) - properties.setProperty("Title", title); + properties.setProperty(Metadata.TITLE, title); if(applicationName != null) - properties.setProperty("Application-Name", applicationName); + properties.setProperty(Metadata.APPLICATION_NAME, applicationName); if(author != null) - properties.setProperty("Author", author); + properties.setProperty(Metadata.AUTHOR, author); if(charCount != 0) - properties.setProperty("Character Count", charCount + ""); + properties.setProperty(Metadata.CHARACTER_COUNT, charCount + ""); if(comments != null) - properties.setProperty("Comments", comments); + properties.setProperty(Metadata.COMMENTS, comments); if(createDateTime != null) - properties.setProperty("Creation-Date", createDateTime.getTime() + ""); + properties.setProperty(Metadata.DATE, + Metadata.DATE_FORMAT.format(createDateTime)); if(editTime != 0) - properties.setProperty("Edit-Time", editTime + ""); + 
properties.setProperty(Metadata.LAST_MODIFIED, editTime + ""); if(keywords != null) - properties.setProperty("Keywords", keywords); + properties.setProperty(Metadata.KEYWORDS, keywords); if(lastAuthor != null) - properties.setProperty("Last-Author", lastAuthor); + properties.setProperty(Metadata.LAST_AUTHOR, lastAuthor); if(lastPrinted != null) - properties.setProperty("Last-Printed", lastPrinted.getTime() + ""); + properties.setProperty(Metadata.LAST_PRINTED, lastPrinted.getTime() + ""); if(lastSaveDateTime != null) - properties.setProperty("Last-Save-Date", lastSaveDateTime.getTime() + ""); + properties.setProperty(Metadata.LAST_SAVED, lastSaveDateTime.getTime() + ""); if(pageCount != 0) - properties.setProperty("Page-Count", pageCount + ""); + properties.setProperty(Metadata.PAGE_COUNT, pageCount + ""); if(revNumber != null) - properties.setProperty("Revision-Number", revNumber); + properties.setProperty(Metadata.REVISION_NUMBER, revNumber); if(security != 0) - properties.setProperty("Security", security + ""); + properties.setProperty(Metadata.RIGHTS, security + ""); if(subject != null) - properties.setProperty("Subject", subject); + properties.setProperty(Metadata.SUBJECT, subject); if(template != null) - properties.setProperty("Template", template); + properties.setProperty(Metadata.TEMPLATE, template); if(wordCount != 0) - properties.setProperty("Word-Count", wordCount + ""); + properties.setProperty(Metadata.WORD_COUNT, wordCount + ""); propertiesBroker.setProperties(properties); //si.getThumbnail(); // can't think of a sensible way of turning this into a string.
Modified: lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java Wed Feb 8 13:48:52 2006 @@ -26,7 +26,8 @@ import org.pdfbox.exceptions.InvalidPasswordException; import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ContentProperties; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; import org.apache.hadoop.util.LogFormatter; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.parse.ParseStatus; @@ -89,12 +90,13 @@ String text = null; String title = null; + Metadata metadata = new Metadata(); try { byte[] raw = content.getContent(); - String contentLength = content.get("Content-Length"); + String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH); if (contentLength != null && raw.length != Integer.parseInt(contentLength)) { return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, @@ -102,8 +104,7 @@ +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse(getConf()); } - PDFParser parser = new PDFParser( - new ByteArrayInputStream(raw)); + PDFParser parser = new PDFParser(new ByteArrayInputStream(raw)); parser.parse(); pdf = parser.getPDDocument(); @@ -122,15 +123,18 @@ PDDocumentInformation info = pdf.getDocumentInformation(); title = info.getTitle(); // more useful info, currently not used. please keep them for future use. 
- // pdf.getPageCount(); - // info.getAuthor() - // info.getSubject() - // info.getKeywords() - // info.getCreator() - // info.getProducer() - // info.getTrapped() - // formatDate(info.getCreationDate()) - // formatDate(info.getModificationDate()) + metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getPageCount())); + metadata.add(Metadata.AUTHOR, info.getAuthor()); + metadata.add(Metadata.SUBJECT, info.getSubject()); + metadata.add(Metadata.KEYWORDS, info.getKeywords()); + metadata.add(Metadata.CREATOR, info.getCreator()); + metadata.add(Metadata.PUBLISHER, info.getProducer()); + + //TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM + //error here + + //metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime())); + //metadata.put(LAST_MODIFIED, dcDateFormatter.format(info.getModificationDate().getTime())); } catch (CryptographyException e) { return new ParseStatus(ParseStatus.FAILED, @@ -139,6 +143,8 @@ return new ParseStatus(ParseStatus.FAILED, "Can't decrypt document - invalid password. " + e).getEmptyParse(getConf()); } catch (Exception e) { // run time exception + LOG.warning("General exception in PDF parser: "+e.getMessage()); + e.printStackTrace(); return new ParseStatus(ParseStatus.FAILED, "Can't be handled as pdf document. " + e).getEmptyParse(getConf()); } finally { @@ -159,11 +165,9 @@ // collect outlink Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf()); - // collect meta data - ContentProperties metadata = new ContentProperties(); - metadata.putAll(content.getMetadata()); // copy through - - ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metadata); + ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, + outlinks, content.getMetadata(), + metadata); parseData.setConf(this.conf); return new ParseImpl(text, parseData); // any filter? 
Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java (original) +++ lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java Wed Feb 8 13:48:52 2006 @@ -18,7 +18,16 @@ import org.apache.nutch.parse.*; import org.apache.nutch.protocol.Content; +<<<<<<< .mine +<<<<<<< .mine +import org.apache.nutch.util.MetadataNames; + +======= +import org.apache.nutch.util.NutchConf; +======= import org.apache.hadoop.conf.Configuration; +>>>>>>> .r374853 +>>>>>>> .r373941 import java.io.ByteArrayInputStream; import java.io.InputStreamReader; import java.io.Reader; @@ -31,7 +40,7 @@ * * @author Andy Hedges */ -public class RTFParseFactory implements Parser { +public class RTFParseFactory implements Parser, MetadataNames { private Configuration conf; @@ -53,10 +62,13 @@ Properties metadata = new Properties(); metadata.putAll(content.getMetadata()); metadata.putAll(delegate.getMetaData()); - String title = metadata.getProperty("title"); + String title = metadata.getProperty(TITLE); if (title != null) { - metadata.remove(title); + //(CM): Why remove the title metadata property here? Even + //though it's stored in the ParseData, it still might be useful + //to have via this properties object? 
+ //metadata.remove(title); } else { title = ""; } Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java (original) +++ lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java Wed Feb 8 13:48:52 2006 @@ -22,18 +22,20 @@ import java.util.List; import java.util.Properties; +import org.apache.nutch.util.MetadataNames; + /** * A parser delegate for handling rtf events. * @author Andy Hedges */ -public class RTFParserDelegateImpl implements RTFParserDelegate { +public class RTFParserDelegateImpl implements RTFParserDelegate, MetadataNames { String tabs = ""; Properties metadata = new Properties(); - String[] META_NAMES_TEXT = {"title", "subject", "author", "manager", - "company", "operator", "category", "keywords", - "comment", "doccomm", "hlinkbase"}; + String[] META_NAMES_TEXT = {TITLE, SUBJECT, AUTHOR, "manager", + "company", "operator", "category", KEYWORDS, + COMMENTS, "doccomm", "hlinkbase"}; String[] META_NAMES_DATE = {"creatim", "creatim", "printim", "buptim"}; String metaName = ""; Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java (original) +++ 
lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java Wed Feb 8 13:48:52 2006 @@ -25,7 +25,15 @@ import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.ProtocolException; import org.apache.nutch.protocol.ProtocolFactory; +<<<<<<< .mine +<<<<<<< .mine +import org.apache.nutch.util.MetadataNames; +======= +import org.apache.nutch.util.NutchConf; +======= import org.apache.hadoop.conf.Configuration; +>>>>>>> .r374853 +>>>>>>> .r373941 import java.util.Properties; @@ -34,7 +42,7 @@ * * @author Andy Hedges */ -public class TestRTFParser extends TestCase { +public class TestRTFParser extends TestCase implements MetadataNames { private String fileSeparator = System.getProperty("file.separator"); // This system property is defined in ./src/plugin/build-plugin.xml @@ -73,7 +81,7 @@ String title = parse.getData().getTitle(); Properties meta = parse.getData().getMetadata(); assertEquals("test rft document", title); - assertEquals("tests", meta.getProperty("subject")); + assertEquals("tests", meta.getProperty(SUBJECT)); Modified: lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java Wed Feb 8 13:48:52 2006 @@ -21,9 +21,10 @@ import java.util.*; import java.util.logging.Logger; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; import org.apache.nutch.parse.*; import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ContentProperties; import org.apache.hadoop.util.LogFormatter; 
import org.apache.hadoop.conf.Configuration; @@ -60,19 +61,17 @@ public Parse getParse(Content content) { String text = null; - // collect meta data - ContentProperties metadata = new ContentProperties(); - metadata.putAll(content.getMetadata()); // copy through Vector outlinks = new Vector(); try { byte[] raw = content.getContent(); - String contentLength = content.get("Content-Length"); + String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH); if (contentLength != null && raw.length != Integer.parseInt(contentLength)) { - return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length - + " bytes. Parser can't handle incomplete files.").getEmptyParse(conf); + return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, + "Content truncated at " + raw.length + + " bytes. Parser can't handle incomplete files.").getEmptyParse(conf); } ExtractText extractor = new ExtractText(); @@ -106,7 +105,8 @@ if (text == null) text = ""; Outlink[] links = (Outlink[]) outlinks.toArray(new Outlink[outlinks.size()]); - ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", links, metadata); + ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", links, + content.getMetadata()); return new ParseImpl(text, parseData); } @@ -119,8 +119,10 @@ byte[] buf = new byte[in.available()]; in.read(buf); SWFParser parser = new SWFParser(); - Parse p = parser.getParse(new Content("file:" + args[0], "file:" + args[0], buf, "application/x-shockwave-flash", - new ContentProperties(), NutchConfiguration.create())); + Parse p = parser.getParse(new Content("file:" + args[0], "file:" + args[0], + buf, "application/x-shockwave-flash", + new Metadata(), + NutchConfiguration.create())); System.out.println("Parse Text:"); System.out.println(p.getText()); System.out.println("Parse Data:"); Modified: lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Wed Feb 8 13:48:52 2006 @@ -17,7 +17,6 @@ package org.apache.nutch.parse.text; import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.parse.*; import org.apache.nutch.util.*; @@ -27,9 +26,6 @@ private Configuration conf; public Parse getParse(Content content) { - // copy content meta data through - ContentProperties metadata = new ContentProperties(); - metadata.putAll(content.getMetadata()); // ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new // Outlink[0], metadata); @@ -49,7 +45,7 @@ text = new String(content.getContent()); // use default encoding } ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", - OutlinkExtractor.getOutlinks(text, getConf()), metadata); + OutlinkExtractor.getOutlinks(text, getConf()), content.getMetadata()); parseData.setConf(this.conf); return new ParseImpl(text, parseData); Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java Wed Feb 8 13:48:52 2006 @@ -23,6 +23,8 @@ import java.util.ArrayList; import 
java.util.List; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; import org.apache.nutch.parse.Outlink; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseData; @@ -30,7 +32,6 @@ import org.apache.nutch.parse.ParseStatus; import org.apache.nutch.parse.Parser; import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ContentProperties; import org.apache.hadoop.util.LogFormatter; import org.apache.hadoop.conf.Configuration; @@ -59,7 +60,7 @@ Properties properties = null; try { - final String contentLen = content.get("Content-Length"); + final String contentLen = content.getMetadata().get(Response.CONTENT_LENGTH); final int len = Integer.parseInt(contentLen); System.out.println("ziplen: " + len); final byte[] contentInBytes = content.getContent(); @@ -86,10 +87,6 @@ "Can't be handled as Zip document. " + e).getEmptyParse(getConf()); } - // collect meta data - final ContentProperties metadata = new ContentProperties(); - metadata.putAll(content.getMetadata()); // copy through - if (resultText == null) { resultText = ""; } @@ -100,7 +97,8 @@ outlinks = (Outlink[]) outLinksList.toArray(new Outlink[0]); final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, - resultTitle, outlinks, metadata); + resultTitle, outlinks, + content.getMetadata()); parseData.setConf(this.conf); LOG.finest("Zip file parsed sucessfully !!"); Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (original) +++ 
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Wed Feb 8 13:48:52 2006 @@ -26,13 +26,14 @@ import java.net.URL; // Nutch imports +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.parse.ParseException; import org.apache.nutch.parse.Outlink; import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ContentProperties; import org.apache.hadoop.util.LogFormatter; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.util.mime.MimeTypes; @@ -87,9 +88,9 @@ // Trying to resolve the Mime-Type String contentType = MIME.getMimeType(fname).getName(); try { - ContentProperties metadata = new ContentProperties(); - metadata.setProperty("Content-Length", Long.toString(entry.getSize())); - metadata.setProperty("Content-Type", contentType); + Metadata metadata = new Metadata(); + metadata.set(Response.CONTENT_LENGTH, Long.toString(entry.getSize())); + metadata.set(Response.CONTENT_TYPE, contentType); Content content = new Content(newurl, base, b, contentType, metadata, this.conf); Parse parse = new ParseUtil(this.conf).parse(content); ParseData theParseData = parse.getData(); Modified: lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java Wed Feb 8 13:48:52 2006 @@ -19,7 +19,9 @@ import 
org.apache.nutch.crawl.CrawlDatum; import org.apache.hadoop.io.UTF8; +import org.apache.nutch.metadata.Metadata; import org.apache.nutch.net.protocols.HttpDateFormat; +import org.apache.nutch.net.protocols.Response; import org.apache.hadoop.util.LogFormatter; import org.apache.hadoop.conf.Configuration; @@ -144,8 +146,10 @@ Content content = file.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent(); System.err.println("Content-Type: " + content.getContentType()); - System.err.println("Content-Length: " + content.get("Content-Length")); - System.err.println("Last-Modified: " + content.get("Last-Modified")); + System.err.println("Content-Length: " + + content.getMetadata().get(Response.CONTENT_LENGTH)); + System.err.println("Last-Modified: " + + content.getMetadata().get(Response.LAST_MODIFIED)); if (dumpContent) { System.out.print(new String(content.getContent())); } Modified: lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Wed Feb 8 13:48:52 2006 @@ -18,6 +18,7 @@ // JDK imports import java.net.URL; +import java.util.Date; import java.util.TreeMap; import java.util.logging.Level; import java.io.IOException; @@ -25,7 +26,10 @@ // Nutch imports import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ContentProperties; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; + +// Hadoop imports import 
org.apache.hadoop.conf.Configuration; @@ -61,7 +65,7 @@ private String base; private byte[] content; private int code; - private ContentProperties headers = new ContentProperties(); + private Metadata headers = new Metadata(); private final File file; private Configuration conf; @@ -71,17 +75,17 @@ /** Returns the value of a named header. */ public String getHeader(String name) { - return (String)headers.get(name); + return headers.get(name); } public byte[] getContent() { return content; } public Content toContent() { return new Content(orig, base, content, - getHeader("Content-Type"), + getHeader(Response.CONTENT_TYPE), headers, this.conf); } - + public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf) throws FileException, IOException { @@ -124,10 +128,8 @@ // where case is insensitive if (!f.equals(f.getCanonicalFile())) { // set headers - TreeMap hdrs = new TreeMap(String.CASE_INSENSITIVE_ORDER); //hdrs.put("Location", f.getCanonicalFile().toURI()); - hdrs.put("Location", f.getCanonicalFile().toURL().toString()); - this.headers.putAll(hdrs); + headers.set(Response.LOCATION, f.getCanonicalFile().toURL().toString()); this.code = 300; // http redirect return; @@ -181,16 +183,10 @@ is.close(); // set headers - TreeMap hdrs = new TreeMap(String.CASE_INSENSITIVE_ORDER); - - hdrs.put("Content-Length", new Long(size).toString()); - - hdrs.put("Last-Modified", + headers.set(Response.CONTENT_LENGTH, new Long(size).toString()); + headers.set(Response.LAST_MODIFIED, this.file.httpDateFormat.toString(f.lastModified())); - - hdrs.put("Content-Type", ""); // No Content-Type at file protocol level - - this.headers.putAll(hdrs); + headers.set(Response.CONTENT_TYPE, ""); // No Content-Type at file protocol level // response code this.code = 200; // http OK @@ -204,17 +200,11 @@ this.content = list2html(f.listFiles(), path, "/".equals(path) ? 
false : true); // set headers - TreeMap hdrs = new TreeMap(String.CASE_INSENSITIVE_ORDER); - - hdrs.put("Content-Length", + headers.set(Response.CONTENT_LENGTH, new Integer(this.content.length).toString()); - - hdrs.put("Content-Type", "text/html"); - - hdrs.put("Last-Modified", + headers.set(Response.CONTENT_TYPE, "text/html"); + headers.set(Response.LAST_MODIFIED, this.file.httpDateFormat.toString(f.lastModified())); - - this.headers.putAll(hdrs); // response code this.code = 200; // http OK Modified: lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java Wed Feb 8 13:48:52 2006 @@ -22,6 +22,7 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.hadoop.io.UTF8; import org.apache.nutch.net.protocols.HttpDateFormat; +import org.apache.nutch.net.protocols.Response; import org.apache.hadoop.util.LogFormatter; import org.apache.hadoop.conf.Configuration; @@ -212,8 +213,10 @@ Content content = ftp.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent(); System.err.println("Content-Type: " + content.getContentType()); - System.err.println("Content-Length: " + content.get("Content-Length")); - System.err.println("Last-Modified: " + content.get("Last-Modified")); + System.err.println("Content-Length: " + + content.getMetadata().get(Response.CONTENT_LENGTH)); + System.err.println("Last-Modified: " + + content.getMetadata().get(Response.LAST_MODIFIED)); if (dumpContent) { System.out.print(new String(content.getContent())); } Modified: 
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java Wed Feb 8 13:48:52 2006 @@ -26,7 +26,9 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ContentProperties; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; + import org.apache.hadoop.conf.Configuration; import java.net.InetAddress; @@ -34,11 +36,9 @@ import java.util.List; import java.util.LinkedList; - import java.util.logging.Level; import java.io.ByteArrayOutputStream; -//import java.io.InputStream; import java.io.IOException; @@ -61,7 +61,7 @@ private String base; private byte[] content; private int code; - private ContentProperties headers = new ContentProperties(); + private Metadata headers = new Metadata(); private final Ftp ftp; private Configuration conf; @@ -71,14 +71,14 @@ /** Returns the value of a named header. 
*/ public String getHeader(String name) { - return (String)headers.get(name); + return headers.get(name); } public byte[] getContent() { return content; } public Content toContent() { return new Content(orig, base, content, - getHeader("Content-Type"), + getHeader(Response.CONTENT_TYPE), headers, this.conf); } @@ -294,11 +294,11 @@ ftp.client.retrieveFile(path, os, ftp.maxContentLength); FTPFile ftpFile = (FTPFile) list.get(0); - this.headers.put("Content-Length", - new Long(ftpFile.getSize()).toString()); + this.headers.set(Response.CONTENT_LENGTH, + new Long(ftpFile.getSize()).toString()); //this.headers.put("content-type", "text/html"); - this.headers.put("Last-Modified", - ftp.httpDateFormat.toString(ftpFile.getTimestamp())); + this.headers.set(Response.LAST_MODIFIED, + ftp.httpDateFormat.toString(ftpFile.getTimestamp())); this.content = os.toByteArray(); // // approximate bytes sent and read @@ -330,11 +330,11 @@ } FTPFile ftpFile = (FTPFile) list.get(0); - this.headers.put("Content-Length", - new Long(ftpFile.getSize()).toString()); + this.headers.set(Response.CONTENT_LENGTH, + new Long(ftpFile.getSize()).toString()); //this.headers.put("content-type", "text/html"); - this.headers.put("Last-Modified", - ftp.httpDateFormat.toString(ftpFile.getTimestamp())); + this.headers.set(Response.LAST_MODIFIED, + ftp.httpDateFormat.toString(ftpFile.getTimestamp())); this.content = os.toByteArray(); // // approximate bytes sent and read @@ -349,7 +349,7 @@ if (FTPReply.isPositiveCompletion(ftp.client.cwd(path))) { // it is not a file, but dir, so redirect as a dir - this.headers.put("Location", path + "/"); + this.headers.set(Response.LOCATION, path + "/"); this.code = 300; // http redirect // fixme, should we do ftp.client.cwd("/"), back to top dir? } else { @@ -386,9 +386,9 @@ ftp.client.retrieveList(null, list, ftp.maxContentLength, ftp.parser); this.content = list2html(list, path, "/".equals(path) ? 
false : true); - this.headers.put("Content-Length", - new Integer(this.content.length).toString()); - this.headers.put("Content-Type", "text/html"); + this.headers.set(Response.CONTENT_LENGTH, + new Integer(this.content.length).toString()); + this.headers.set(Response.CONTENT_TYPE, "text/html"); // this.headers.put("Last-Modified", null); // // approximate bytes sent and read @@ -408,9 +408,9 @@ ftp.client = null; this.content = list2html(list, path, "/".equals(path) ? false : true); - this.headers.put("Content-Length", - new Integer(this.content.length).toString()); - this.headers.put("Content-Type", "text/html"); + this.headers.set(Response.CONTENT_LENGTH, + new Integer(this.content.length).toString()); + this.headers.set(Response.CONTENT_TYPE, "text/html"); // this.headers.put("Last-Modified", null); // // approximate bytes sent and read Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Wed Feb 8 13:48:52 2006 @@ -28,12 +28,13 @@ import java.net.URL; import java.util.Map; import java.util.TreeMap; +import java.util.Date; import java.util.logging.Level; // Nutch imports import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Metadata; import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.protocol.ProtocolException; import org.apache.nutch.protocol.http.api.HttpBase; import org.apache.nutch.protocol.http.api.HttpException; @@ -49,7 +50,7 @@ 
private String base; private byte[] content; private int code; - private ContentProperties headers = new ContentProperties(); + private Metadata headers = new Metadata(); public HttpResponse(HttpBase http, URL url, CrawlDatum datum) @@ -141,13 +142,13 @@ // parse status code line this.code = parseStatusLine(in, line); // parse headers - headers.putAll(parseHeaders(in, line)); + parseHeaders(in, line); haveSeenNonContinueStatus= code != 100; // 100 is "Continue" } readPlainContent(in); - String contentEncoding= getHeader("Content-Encoding"); + String contentEncoding = getHeader(Response.CONTENT_ENCODING); if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) { Http.LOG.fine("uncompressing...."); byte[] compressed = content; @@ -187,10 +188,10 @@ } public String getHeader(String name) { - return (String) headers.get(name); + return headers.get(name); } - public ContentProperties getHeaders() { + public Metadata getHeaders() { return headers; } @@ -207,7 +208,7 @@ throws HttpException, IOException { int contentLength = Integer.MAX_VALUE; // get content length - String contentLengthString = (String)headers.get("Content-Length"); + String contentLengthString = headers.get(Response.CONTENT_LENGTH); if (contentLengthString != null) { contentLengthString = contentLengthString.trim(); try { @@ -333,8 +334,9 @@ } - private void processHeaderLine(StringBuffer line, TreeMap headers) + private void processHeaderLine(StringBuffer line) throws IOException, HttpException { + int colonIndex = line.indexOf(":"); // key is up to colon if (colonIndex == -1) { int i; @@ -355,20 +357,14 @@ valueStart++; } String value = line.substring(valueStart); - - headers.put(key, value); + headers.set(key, value); } - private Map parseHeaders(PushbackInputStream in, StringBuffer line) - throws IOException, HttpException { - TreeMap headers = new TreeMap(String.CASE_INSENSITIVE_ORDER); - return parseHeaders(in, line, headers); - } - // Adds headers to an existing TreeMap - private 
Map parseHeaders(PushbackInputStream in, StringBuffer line, - TreeMap headers) + // Adds headers to our headers Metadata + private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException { + while (readLine(in, line, true) != 0) { // handle HTTP responses with missing blank line after headers @@ -381,18 +377,21 @@ line.setLength(pos); try { - processHeaderLine(line, headers); + //TODO: (CM) We don't know the header names here + //since we're just handling them generically. It would + //be nice to provide some sort of mapping function here + //for the returned header names to the standard metadata + //names in the ParseData class + processHeaderLine(line); } catch (Exception e) { // fixme: e.printStackTrace(); } - - return headers; + return; } - processHeaderLine(line, headers); + processHeaderLine(line); } - return headers; } private static int readLine(PushbackInputStream in, StringBuffer line, Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java Wed Feb 8 13:48:52 2006 @@ -11,7 +11,8 @@ import java.util.logging.Level; import java.util.logging.Logger; -import org.apache.nutch.protocol.ContentProperties; +import org.apache.nutch.metadata.Metadata; + import org.apache.hadoop.util.LogFormatter; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configurable; @@ -35,7 +36,7 @@ * 
The HTTP Authentication (WWW-Authenticate) header which is returned * by a webserver requiring authentication. */ - public static final String AUTH_HEADER = "WWW-Authenticate"; + public static final String WWW_AUTHENTICATE = "WWW-Authenticate"; public static final Logger LOG = LogFormatter.getLogger(HttpAuthenticationFactory.class.getName()); @@ -72,13 +73,14 @@ * ---------------------------------- */ - public HttpAuthentication findAuthentication(ContentProperties header) { + public HttpAuthentication findAuthentication(Metadata header) { + if (header == null) return null; try { Collection challenge = null; - if (header instanceof ContentProperties) { - Object o = header.get(AUTH_HEADER); + if (header instanceof Metadata) { + Object o = header.get(WWW_AUTHENTICATE); if (o instanceof Collection) { challenge = (Collection) o; } else { @@ -86,7 +88,7 @@ challenge.add(o.toString()); } } else { - String challengeString = header.getProperty(AUTH_HEADER); + String challengeString = header.get(WWW_AUTHENTICATE); if (challengeString != null) { challenge = new ArrayList(); challenge.add(challengeString); Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Wed Feb 8 13:48:52 2006 @@ -20,6 +20,7 @@ import java.io.IOException; import java.io.InputStream; import java.net.URL; +import java.util.Date; // HTTP Client imports import org.apache.commons.httpclient.Header; @@ -30,8 +31,8 @@ // Nutch imports 
import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Metadata; import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.protocol.ContentProperties; import org.apache.nutch.protocol.http.api.HttpBase; @@ -52,7 +53,7 @@ private int code; - private ContentProperties headers = new ContentProperties(); + private Metadata headers = new Metadata(); public HttpResponse(HttpBase http, URL url, CrawlDatum datum) throws IOException { @@ -85,8 +86,9 @@ Header[] heads = get.getResponseHeaders(); for (int i = 0; i < heads.length; i++) { - headers.setProperty(heads[i].getName(), heads[i].getValue()); + headers.set(heads[i].getName(), heads[i].getValue()); } + // always read content. Sometimes content is useful to find a cause // for error. try { @@ -131,10 +133,10 @@ } public String getHeader(String name) { - return (String) headers.get(name); + return headers.get(name); } - public ContentProperties getHeaders() { + public Metadata getHeaders() { return headers; } Added: lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java?rev=376089&view=auto ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java (added) +++ lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java Wed Feb 8 13:48:52 2006 @@ -0,0 +1,268 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.metadata; + +// JDK imports +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.Properties; +import junit.framework.Test; + +// JUnit imports +import junit.framework.TestCase; +import junit.framework.TestSuite; +import junit.textui.TestRunner; + +// Nutch imports +import org.apache.nutch.metadata.Metadata; + + +/** + * JUnit based tests of class {@link org.apache.nutch.metadata.Metadata}. + * + * @author Chris Mattmann + * @author Jérôme Charron + */ +public class TestMetadata extends TestCase { + + + public TestMetadata(String testName) { + super(testName); + } + + public static Test suite() { + return new TestSuite(TestMetadata.class); + } + + public static void main(String[] args) { + TestRunner.run(suite()); + } + + + /** Test for the <code>getNormalizedName(String)</code> method. */ + public void testGetNormalizedName() { + assertEquals("Content-Type", Metadata.getNormalizedName("Content-Type")); + assertEquals("Content-Type", Metadata.getNormalizedName("ContentType")); + assertEquals("Content-Type", Metadata.getNormalizedName("Content-type")); + assertEquals("Content-Type", Metadata.getNormalizedName("contenttype")); + assertEquals("Content-Type", Metadata.getNormalizedName("contentype")); + assertEquals("Content-Type", Metadata.getNormalizedName("contntype")); + } + + /** Test for the <code>add(String, String)</code> method. 
*/ + public void testAdd() { + String[] values = null; + Metadata meta = new Metadata(); + + values = meta.getValues("contentype"); + assertEquals(0, values.length); + + meta.add("contentype", "value1"); + values = meta.getValues("contentype"); + assertEquals(1, values.length); + assertEquals("value1", values[0]); + + meta.add("Content-Type", "value2"); + values = meta.getValues("contentype"); + assertEquals(2, values.length); + assertEquals("value1", values[0]); + assertEquals("value2", values[1]); + + // NOTE : For now, the same value can be added many times. + // Should it be changed? + meta.add("ContentType", "value1"); + values = meta.getValues("Content-Type"); + assertEquals(3, values.length); + assertEquals("value1", values[0]); + assertEquals("value2", values[1]); + assertEquals("value1", values[2]); + } + + /** Test for the <code>set(String, String)</code> method. */ + public void testSet() { + String[] values = null; + Metadata meta = new Metadata(); + + values = meta.getValues("contentype"); + assertEquals(0, values.length); + + meta.set("contentype", "value1"); + values = meta.getValues("contentype"); + assertEquals(1, values.length); + assertEquals("value1", values[0]); + + meta.set("Content-Type", "value2"); + values = meta.getValues("contentype"); + assertEquals(1, values.length); + assertEquals("value2", values[0]); + + meta.set("contenttype", "new value 1"); + meta.add("contenttype", "new value 2"); + values = meta.getValues("contentype"); + assertEquals(2, values.length); + assertEquals("new value 1", values[0]); + assertEquals("new value 2", values[1]); + } + + /** Test for <code>setAll(Properties)</code> method */ + public void testSetProperties() { + String[] values = null; + Metadata meta = new Metadata(); + Properties props = new Properties(); + + meta.setAll(props); + assertEquals(0, meta.size()); + + props.setProperty("name-one", "value1.1"); + meta.setAll(props); + assertEquals(1, meta.size()); + values = meta.getValues("name-one"); + 
assertEquals(1, values.length); + assertEquals("value1.1", values[0]); + + props.setProperty("name-two", "value2.1"); + meta.setAll(props); + assertEquals(2, meta.size()); + values = meta.getValues("name-one"); + assertEquals(1, values.length); + assertEquals("value1.1", values[0]); + values = meta.getValues("name-two"); + assertEquals(1, values.length); + assertEquals("value2.1", values[0]); + } + + /** Test for <code>get(String)</code> method */ + public void testGet() { + String[] values = null; + Metadata meta = new Metadata(); + assertNull(meta.get("a-name")); + + meta.add("a-name", "value-1"); + assertEquals("value-1", meta.get("a-name")); + meta.add("a-name", "value-2"); + assertEquals("value-1", meta.get("a-name")); + } + + /** Test for <code>isMultiValued()</code> method */ + public void testIsMultiValued() { + Metadata meta = new Metadata(); + assertFalse(meta.isMultiValued("key")); + meta.add("key", "value1"); + assertFalse(meta.isMultiValued("key")); + meta.add("key", "value2"); + assertTrue(meta.isMultiValued("key")); + } + + /** Test for <code>names</code> method */ + public void testNames() { + String[] names = null; + Metadata meta = new Metadata(); + names = meta.names(); + assertEquals(0, names.length); + + meta.add("name-one", "value"); + names = meta.names(); + assertEquals(1, names.length); + assertEquals("name-one", names[0]); + meta.add("name-two", "value"); + names = meta.names(); + assertEquals(2, names.length); + } + + /** Test for <code>remove(String)</code> method */ + public void testRemove() { + Metadata meta = new Metadata(); + meta.remove("name-one"); + assertEquals(0, meta.size()); + meta.add("name-one", "value-1.1"); + meta.add("name-one", "value-1.2"); + meta.add("name-two", "value-2.2"); + assertEquals(2, meta.size()); + assertNotNull(meta.get("name-one")); + assertNotNull(meta.get("name-two")); + meta.remove("name-one"); + assertEquals(1, meta.size()); + assertNull(meta.get("name-one")); + assertNotNull(meta.get("name-two")); + 
meta.remove("name-two"); + assertEquals(0, meta.size()); + assertNull(meta.get("name-one")); + assertNull(meta.get("name-two")); + } + + /** Test for <code>equals(Object)</code> method */ + public void testObject() { + Metadata meta1 = new Metadata(); + Metadata meta2 = new Metadata(); + assertFalse(meta1.equals(null)); + assertFalse(meta1.equals("String")); + assertTrue(meta1.equals(meta2)); + meta1.add("name-one", "value-1.1"); + assertFalse(meta1.equals(meta2)); + meta2.add("name-one", "value-1.1"); + assertTrue(meta1.equals(meta2)); + meta1.add("name-one", "value-1.2"); + assertFalse(meta1.equals(meta2)); + meta2.add("name-one", "value-1.2"); + assertTrue(meta1.equals(meta2)); + meta1.add("name-two", "value-2.1"); + assertFalse(meta1.equals(meta2)); + meta2.add("name-two", "value-2.1"); + assertTrue(meta1.equals(meta2)); + meta1.add("name-two", "value-2.2"); + assertFalse(meta1.equals(meta2)); + meta2.add("name-two", "value-2.x"); + assertFalse(meta1.equals(meta2)); + } + + /** Test for <code>Writable</code> implementation */ + public void testWritable() { + Metadata result = null; + Metadata meta = new Metadata(); + result = writeRead(meta); + assertEquals(0, result.size()); + meta.add("name-one", "value-1.1"); + result = writeRead(meta); + assertEquals(1, result.size()); + assertEquals(1, result.getValues("name-one").length); + assertEquals("value-1.1", result.get("name-one")); + meta.add("name-two", "value-2.1"); + meta.add("name-two", "value-2.2"); + result = writeRead(meta); + assertEquals(2, result.size()); + assertEquals(1, result.getValues("name-one").length); + assertEquals("value-1.1", result.getValues("name-one")[0]); + assertEquals(2, result.getValues("name-two").length); + assertEquals("value-2.1", result.getValues("name-two")[0]); + assertEquals("value-2.2", result.getValues("name-two")[1]); + } + + private Metadata writeRead(Metadata meta) { + Metadata readed = new Metadata(); + try { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + 
meta.write(new DataOutputStream(out)); + readed.readFields(new DataInputStream(new ByteArrayInputStream(out.toByteArray()))); + } catch (IOException ioe) { + fail(ioe.toString()); + } + return readed; + } + +} Propchange: lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java Wed Feb 8 13:48:52 2006 @@ -20,7 +20,7 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.WritableTestUtils; -import org.apache.nutch.protocol.ContentProperties; +import org.apache.nutch.metadata.Metadata; import junit.framework.TestCase; @@ -41,9 +41,9 @@ new Outlink("http://bar.com/", "Bar", conf) }; - ContentProperties metaData = new ContentProperties(); - metaData.put("Language", "en/us"); - metaData.put("Charset", "UTF-8"); + Metadata metaData = new Metadata(); + metaData.add("Language", "en/us"); + metaData.add("Charset", "UTF-8"); ParseData r = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData); r.setConf(conf); Modified: lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java (original) +++ 
lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Wed Feb 8 13:48:52 2006 @@ -16,13 +16,14 @@ package org.apache.nutch.protocol; +import org.apache.nutch.metadata.Metadata; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.util.NutchConfiguration; - import org.apache.nutch.util.WritableTestUtils; import junit.framework.TestCase; + /** Unit tests for Content. */ public class TestContent extends TestCase { @@ -37,9 +38,9 @@ String url = "http://www.foo.com/"; - ContentProperties metaData = new ContentProperties(); - metaData.put("Host", "www.foo.com"); - metaData.put("Content-Type", "text/html"); + Metadata metaData = new Metadata(); + metaData.add("Host", "www.foo.com"); + metaData.add("Content-Type", "text/html"); Content r = new Content(url, url, page.getBytes("UTF8"), "text/html", metaData, conf); @@ -47,12 +48,13 @@ WritableTestUtils.testWritable(r); assertEquals("text/html", r.getMetadata().get("Content-Type")); assertEquals("text/html", r.getMetadata().get("content-type")); + assertEquals("text/html", r.getMetadata().get("CONTENTYPE")); } /** Unit tests for getContentType(String, String, byte[]) method. 
*/ public void testGetContentType() throws Exception { Content c = null; - ContentProperties p = new ContentProperties(); + Metadata p = new Metadata(); c = new Content("http://www.foo.com/", "http://www.foo.com/", Modified: lucene/nutch/trunk/src/web/jsp/cached.jsp URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/web/jsp/cached.jsp?rev=376089&r1=376088&r2=376089&view=diff ============================================================================== --- lucene/nutch/trunk/src/web/jsp/cached.jsp (original) +++ lucene/nutch/trunk/src/web/jsp/cached.jsp Wed Feb 8 13:48:52 2006 @@ -6,7 +6,7 @@ import="org.apache.nutch.searcher.*" import="org.apache.nutch.parse.ParseData" - import="org.apache.nutch.protocol.ContentProperties" + import="org.apache.nutch.metadata.Metadata" import="org.apache.hadoop.conf.Configuration" import="org.apache.nutch.util.NutchConfiguration" %><% @@ -26,10 +26,10 @@ ResourceBundle.getBundle("org.nutch.jsp.cached", request.getLocale()) .getLocale().getLanguage(); - ContentProperties metaData = bean.getParseData(details).getMetadata(); + Metadata metaData = bean.getParseData(details).getContentMeta(); String content = null; - String contentType = (String) metaData.get("Content-Type"); + String contentType = (String) metaData.get(Metadata.CONTENT_TYPE); if (contentType.startsWith("text/html")) { // FIXME : it's better to emit the original 'byte' sequence // with 'charset' set to the value of 'CharEncoding', ------------------------------------------------------- This SF.net email is sponsored by: Splunk Inc. Do you grep through log files for problems? Stop! Download the new AJAX search engine that makes searching your log files as easy as surfing the web. DOWNLOAD SPLUNK! http://sel.as-us.falkag.net/sel?cmd=lnk&kid=103432&bid=230486&dat=121642 _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs