Hi Lewis,

A patch is ready (see the diff below); on my machine all tests pass now.
I am currently having problems with Jira, so
feel free to open the issue and resolve it on my behalf.

Cheers,
Sebastian


On 06/19/2014 07:58 PM, Lewis John Mcgibbney wrote:
> Hi Seb,
> 
> On Thu, Jun 19, 2014 at 1:46 PM, <[email protected]
> <mailto:[email protected]>> wrote:
> 
> 
>     it seems to be related to NUTCH-1714:
>     WebPage-owned maps (metadata, headers, etc.) are not
>     initialized any more in the constructor.
> 
> 
> Yeah, I thought we had addressed this in the patch! We should create empty 
> data structures for
> MapWrapper, ArrayWrapper, etc to work with rather than a null field. 
> 
>     This also causes other tests to fail.
> 
>     The solution would be to replace
>       WebPage page = new WebPage();
>     by
>       WebPage page = WebPage.newBuilder().build();
>     in every test where a WebPage object is needed.
> 
>     Right?
> 
> 
> Yes, this is the best way to create new objects. Unlike constructors, builders 
> will automatically set
> any default values specified in the WebPage/Host (or any other JSON) schema. 
> Additionally, builders
> validate the data as it is set, whereas objects constructed directly will not 
> cause an error until the
> object is serialized. However, using constructors directly generally offers 
> better performance, as
> builders create a copy of the data structure before it is written.
>  
> 
> 
>     I'll open a Jira and try to provide a patch.
> 
> That would be excellent. I thought that this was related to something else 
> entirely... e.g. a dodgy
> commit.
> 
> Hopefully this is the root of the problem.

diff --git src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
index caf7487..23dd863 100644
--- src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
+++ src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
@@ -22,18 +22,16 @@ import java.nio.charset.Charset;
 
 import org.apache.avro.util.Utf8;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.Parser;
-import org.apache.nutch.protocol.Content;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.Bytes;
 import org.apache.nutch.util.NutchConfiguration;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.junit.Before;
 import org.junit.Test;
+
 import static org.junit.Assert.*;
 
 public class TestHtmlParser {
@@ -111,7 +109,7 @@ public class TestHtmlParser {
   }
 
   protected WebPage page(byte[] contentBytes) {
-    WebPage page = new WebPage();
+    WebPage page = WebPage.newBuilder().build();
     page.setBaseUrl(new Utf8(dummyUrl));
     page.setContent(ByteBuffer.wrap(contentBytes));
     page.setContentType(new Utf8("text/html"));
diff --git src/plugin/parse-metatags/src/test/org/apache/nutch/parse/TestMetaTagsParser.java src/plugin/parse-metatags/src/test/org/apache/nutch/parse/TestMetaTagsParser.java
index ad8e5bc..01155b3 100644
--- src/plugin/parse-metatags/src/test/org/apache/nutch/parse/TestMetaTagsParser.java
+++ src/plugin/parse-metatags/src/test/org/apache/nutch/parse/TestMetaTagsParser.java
@@ -76,7 +76,7 @@ public class TestMetaTagsParser {
       in.readFully(bytes);
       in.close();
 
-      WebPage page = new WebPage();
+      WebPage page = WebPage.newBuilder().build();
       page.setBaseUrl(new Utf8(urlString));
       page.setContent(ByteBuffer.wrap(bytes));
       page.setContentType(new Utf8("text/html"));
diff --git src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
index 130a90a..6f8737f 100644
--- src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
+++ src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
@@ -60,7 +60,7 @@ public class TestImageMetadata {
       in.readFully(bytes);
       in.close();
       
-      WebPage page = new WebPage();
+      WebPage page = WebPage.newBuilder().build();
       page.setBaseUrl(new Utf8(urlString));
       page.setContent(ByteBuffer.wrap(bytes));
       String mtype = mimeutil.getMimeType(file);
diff --git src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java
index 868b54c..3469a2e 100644
--- src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java
+++ src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java
@@ -119,8 +119,9 @@ public class TestProtocolHttp {
    */
   private void fetchPage(String page, int expectedCode) throws Exception {
     URL url = new URL("http", "127.0.0.1", port, page);
-    Response response = http.getResponse(url, new WebPage(), true);
-    ProtocolOutput out = http.getProtocolOutput(url.toString(), new WebPage());
+    WebPage p = WebPage.newBuilder().build();
+    Response response = http.getResponse(url, p, true);
+    ProtocolOutput out = http.getProtocolOutput(url.toString(), p);
     Content content = out.getContent();
     
     assertEquals("HTTP Status Code for " + url, expectedCode, response.getCode());
diff --git src/plugin/scoring-opic/src/test/org/apache/nutch/scoring/opic/TestOPICScoringFilter.java src/plugin/scoring-opic/src/test/org/apache/nutch/scoring/opic/TestOPICScoringFilter.java
index 21e9b77..587a218 100644
--- src/plugin/scoring-opic/src/test/org/apache/nutch/scoring/opic/TestOPICScoringFilter.java
+++ src/plugin/scoring-opic/src/test/org/apache/nutch/scoring/opic/TestOPICScoringFilter.java
@@ -25,7 +25,6 @@ import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 
-import org.apache.avro.util.Utf8;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.scoring.ScoreDatum;
 import org.apache.nutch.storage.WebPage;
@@ -148,7 +147,7 @@ public class TestOPICScoringFilter {
 
     // injecting seed list, with scored attached to webpages
     for (String url : self.seedList) {
-      WebPage row = new WebPage();
+      WebPage row = WebPage.newBuilder().build();
       row.setScore(scoreInjected);
       scoringFilter.injectedScore(url, row);
 
@@ -191,15 +190,15 @@ public class TestOPICScoringFilter {
         // getting outlinks from testdata
         String[] seedOutlinks = self.linkList.get(url);
         for (String seedOutlink : seedOutlinks) {
-          row.putToOutlinks(new Utf8(seedOutlink), new Utf8());
+          row.getOutlinks().put(seedOutlink, "");
         }
 
         self.outlinkedScoreData.clear();
 
         // Existing outlinks are added to outlinkedScoreData
-        Map<Utf8, Utf8> outlinks = row.getOutlinks();
+        Map<CharSequence, CharSequence> outlinks = row.getOutlinks();
         if (outlinks != null) {
-          for (Entry<Utf8, Utf8> e : outlinks.entrySet()) {
+          for (Entry<CharSequence, CharSequence> e : outlinks.entrySet()) {
             int depth = Integer.MAX_VALUE;
             self.outlinkedScoreData.add(new ScoreDatum(0.0f, e.getKey()
                 .toString(), e.getValue().toString(), depth));
@@ -213,7 +212,7 @@ public class TestOPICScoringFilter {
           if (dbWebPages.get(TableUtil.reverseUrl(sc.getUrl())) == null) {
             // Check each outlink and creates new webpages if it's not
             // exist in database (dbWebPages)
-            WebPage outlinkRow = new WebPage();
+            WebPage outlinkRow = WebPage.newBuilder().build();
             scoringFilter.initialScore(sc.getUrl(), outlinkRow);
             List<ScoreDatum> newScoreList = new LinkedList<ScoreDatum>();
             newScoreList.add(sc);
diff --git src/plugin/tld/src/test/org/apache/nutch/indexer/tld/TestTLDIndexingFilter.java src/plugin/tld/src/test/org/apache/nutch/indexer/tld/TestTLDIndexingFilter.java
index b4730d1..481e7aa 100644
--- src/plugin/tld/src/test/org/apache/nutch/indexer/tld/TestTLDIndexingFilter.java
+++ src/plugin/tld/src/test/org/apache/nutch/indexer/tld/TestTLDIndexingFilter.java
@@ -92,7 +92,7 @@ public class TestTLDIndexingFilter {
     urls.put("ftp://w.info.nf/", "info.nf");
     urls.put("file://x.aa.no", "aa.no");
 
-    WebPage page = new WebPage();
+    WebPage page = WebPage.newBuilder().build();
 
     TLDIndexingFilter filter = new TLDIndexingFilter();
     assertNotNull(filter);
diff --git src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
index 1357832..5bfe47f 100644
--- src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
+++ src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
@@ -83,7 +83,7 @@ public class TestAdaptiveFetchSchedule extends TestCase {
    * @return wp :Webpage
    */
   public WebPage prepareWebpage() {
-    WebPage wp = new WebPage();
+    WebPage wp = WebPage.newBuilder().build();
     wp.setStatus(1);
     wp.setFetchInterval(interval);
     wp.setScore(1.0f);

Reply via email to