Hi Lewis,
A patch is ready; on my machine all tests pass now.
Currently, I am experiencing problems with Jira,
so feel free to open and resolve the issue.
Cheers,
Sebastian
On 06/19/2014 07:58 PM, Lewis John Mcgibbney wrote:
> Hi Seb,
>
> On Thu, Jun 19, 2014 at 1:46 PM, <[email protected]
> <mailto:[email protected]>> wrote:
>
>
> it seems to be related to NUTCH-1714:
> WebPage-owned maps (metadata, headers, etc.) are not
> initialized any more in the constructor.
>
>
> Yeah, I thought we had addressed this in the patch! We should create empty
> data structures for
> MapWrapper, ArrayWrapper, etc. to work with rather than a null field.
>
> This also causes other tests to fail.
>
> The solution would be to replace
> WebPage page = new WebPage();
> by
> WebPage page = WebPage.newBuilder().build();
> in every test where a WebPage object is needed.
>
> Right?
>
>
> Yes, this is the best way to use new Objects. Unlike constructors, builders
> will automatically set
> any default values specified in the WebPage/Host (or any other JSON) schema.
> Additionally, builders
> validate the data as it is set, whereas objects constructed directly will not
> cause an error until the
> object is serialized. However, using constructors directly generally offers
> better performance, as
> builders create a copy of the data structure before it is written.
>
>
>
> I'll open a Jira and try to provide a patch.
>
> That would be excellent. I thought that this was related to something else
> entirely... e.g. a dodgy
> commit.
>
> Hopefully this is the root of the problem.
diff --git src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
index caf7487..23dd863 100644
--- src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
+++ src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
@@ -22,18 +22,16 @@ import java.nio.charset.Charset;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.Parser;
-import org.apache.nutch.protocol.Content;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.Bytes;
import org.apache.nutch.util.NutchConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
import org.junit.Before;
import org.junit.Test;
+
import static org.junit.Assert.*;
public class TestHtmlParser {
@@ -111,7 +109,7 @@ public class TestHtmlParser {
}
protected WebPage page(byte[] contentBytes) {
- WebPage page = new WebPage();
+ WebPage page = WebPage.newBuilder().build();
page.setBaseUrl(new Utf8(dummyUrl));
page.setContent(ByteBuffer.wrap(contentBytes));
page.setContentType(new Utf8("text/html"));
diff --git src/plugin/parse-metatags/src/test/org/apache/nutch/parse/TestMetaTagsParser.java src/plugin/parse-metatags/src/test/org/apache/nutch/parse/TestMetaTagsParser.java
index ad8e5bc..01155b3 100644
--- src/plugin/parse-metatags/src/test/org/apache/nutch/parse/TestMetaTagsParser.java
+++ src/plugin/parse-metatags/src/test/org/apache/nutch/parse/TestMetaTagsParser.java
@@ -76,7 +76,7 @@ public class TestMetaTagsParser {
in.readFully(bytes);
in.close();
- WebPage page = new WebPage();
+ WebPage page = WebPage.newBuilder().build();
page.setBaseUrl(new Utf8(urlString));
page.setContent(ByteBuffer.wrap(bytes));
page.setContentType(new Utf8("text/html"));
diff --git src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
index 130a90a..6f8737f 100644
--- src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
+++ src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
@@ -60,7 +60,7 @@ public class TestImageMetadata {
in.readFully(bytes);
in.close();
- WebPage page = new WebPage();
+ WebPage page = WebPage.newBuilder().build();
page.setBaseUrl(new Utf8(urlString));
page.setContent(ByteBuffer.wrap(bytes));
String mtype = mimeutil.getMimeType(file);
diff --git src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java
index 868b54c..3469a2e 100644
--- src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java
+++ src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java
@@ -119,8 +119,9 @@ public class TestProtocolHttp {
*/
private void fetchPage(String page, int expectedCode) throws Exception {
URL url = new URL("http", "127.0.0.1", port, page);
- Response response = http.getResponse(url, new WebPage(), true);
- ProtocolOutput out = http.getProtocolOutput(url.toString(), new WebPage());
+ WebPage p = WebPage.newBuilder().build();
+ Response response = http.getResponse(url, p, true);
+ ProtocolOutput out = http.getProtocolOutput(url.toString(), p);
Content content = out.getContent();
assertEquals("HTTP Status Code for " + url, expectedCode, response.getCode());
diff --git src/plugin/scoring-opic/src/test/org/apache/nutch/scoring/opic/TestOPICScoringFilter.java src/plugin/scoring-opic/src/test/org/apache/nutch/scoring/opic/TestOPICScoringFilter.java
index 21e9b77..587a218 100644
--- src/plugin/scoring-opic/src/test/org/apache/nutch/scoring/opic/TestOPICScoringFilter.java
+++ src/plugin/scoring-opic/src/test/org/apache/nutch/scoring/opic/TestOPICScoringFilter.java
@@ -25,7 +25,6 @@ import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
-import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.scoring.ScoreDatum;
import org.apache.nutch.storage.WebPage;
@@ -148,7 +147,7 @@ public class TestOPICScoringFilter {
// injecting seed list, with scored attached to webpages
for (String url : self.seedList) {
- WebPage row = new WebPage();
+ WebPage row = WebPage.newBuilder().build();
row.setScore(scoreInjected);
scoringFilter.injectedScore(url, row);
@@ -191,15 +190,15 @@ public class TestOPICScoringFilter {
// getting outlinks from testdata
String[] seedOutlinks = self.linkList.get(url);
for (String seedOutlink : seedOutlinks) {
- row.putToOutlinks(new Utf8(seedOutlink), new Utf8());
+ row.getOutlinks().put(seedOutlink, "");
}
self.outlinkedScoreData.clear();
// Existing outlinks are added to outlinkedScoreData
- Map<Utf8, Utf8> outlinks = row.getOutlinks();
+ Map<CharSequence, CharSequence> outlinks = row.getOutlinks();
if (outlinks != null) {
- for (Entry<Utf8, Utf8> e : outlinks.entrySet()) {
+ for (Entry<CharSequence, CharSequence> e : outlinks.entrySet()) {
int depth = Integer.MAX_VALUE;
self.outlinkedScoreData.add(new ScoreDatum(0.0f, e.getKey()
.toString(), e.getValue().toString(), depth));
@@ -213,7 +212,7 @@ public class TestOPICScoringFilter {
if (dbWebPages.get(TableUtil.reverseUrl(sc.getUrl())) == null) {
// Check each outlink and creates new webpages if it's not
// exist in database (dbWebPages)
- WebPage outlinkRow = new WebPage();
+ WebPage outlinkRow = WebPage.newBuilder().build();
scoringFilter.initialScore(sc.getUrl(), outlinkRow);
List<ScoreDatum> newScoreList = new LinkedList<ScoreDatum>();
newScoreList.add(sc);
diff --git src/plugin/tld/src/test/org/apache/nutch/indexer/tld/TestTLDIndexingFilter.java src/plugin/tld/src/test/org/apache/nutch/indexer/tld/TestTLDIndexingFilter.java
index b4730d1..481e7aa 100644
--- src/plugin/tld/src/test/org/apache/nutch/indexer/tld/TestTLDIndexingFilter.java
+++ src/plugin/tld/src/test/org/apache/nutch/indexer/tld/TestTLDIndexingFilter.java
@@ -92,7 +92,7 @@ public class TestTLDIndexingFilter {
urls.put("ftp://w.info.nf/", "info.nf");
urls.put("file://x.aa.no", "aa.no");
- WebPage page = new WebPage();
+ WebPage page = WebPage.newBuilder().build();
TLDIndexingFilter filter = new TLDIndexingFilter();
assertNotNull(filter);
diff --git src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
index 1357832..5bfe47f 100644
--- src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
+++ src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
@@ -83,7 +83,7 @@ public class TestAdaptiveFetchSchedule extends TestCase {
* @return wp :Webpage
*/
public WebPage prepareWebpage() {
- WebPage wp = new WebPage();
+ WebPage wp = WebPage.newBuilder().build();
wp.setStatus(1);
wp.setFetchInterval(interval);
wp.setScore(1.0f);