This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new 90849124d NUTCH-3020 -- ParseSegment should check for okhttp's truncation flag (#794) 90849124d is described below commit 90849124d757fb0417ea90576e88b1f55da616f1 Author: Tim Allison <talli...@apache.org> AuthorDate: Mon Nov 6 15:07:10 2023 -0500 NUTCH-3020 -- ParseSegment should check for okhttp's truncation flag (#794) * ParseSegment should check for okhttp's truncated flag --- src/java/org/apache/nutch/parse/ParseSegment.java | 11 +++ .../org/apache/nutch/parse/TestParseSegment.java | 81 ++++++++++++++++++++++ 2 files changed, 92 insertions(+) diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java index 1995a880e..e9f041a5f 100644 --- a/src/java/org/apache/nutch/parse/ParseSegment.java +++ b/src/java/org/apache/nutch/parse/ParseSegment.java @@ -180,6 +180,17 @@ public class ParseSegment extends NutchTool implements Tool { if (metadata == null) return false; + //check for okhttp or other protocol's truncated flag + //if the flag is there, no matter the value, trust it. + if (metadata.get(Response.TRUNCATED_CONTENT) != null) { + if ("true".equals(metadata.get(Response.TRUNCATED_CONTENT))) { + LOG.info(content.getUrl() + " skipped. Protocol metadata indicates truncated content, " + + "actualSize= " + content.getContent().length); + return true; + } + return false; + } + String lengthStr = metadata.get(Response.CONTENT_LENGTH); if (lengthStr != null) lengthStr = lengthStr.trim(); diff --git a/src/test/org/apache/nutch/parse/TestParseSegment.java b/src/test/org/apache/nutch/parse/TestParseSegment.java new file mode 100644 index 000000000..dd7f4f920 --- /dev/null +++ b/src/test/org/apache/nutch/parse/TestParseSegment.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+import static junit.framework.TestCase.assertFalse;
+import static junit.framework.TestCase.assertTrue;
+
+import java.nio.charset.StandardCharsets;
+
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Content;
+import org.junit.Test;
+
+/**
+ * Tests for {@link ParseSegment#isTruncated(Content)}: the protocol-level
+ * truncation flag ({@link Response#TRUNCATED_CONTENT}) and the fallback
+ * comparison against {@link Response#CONTENT_LENGTH}.
+ */
+public class TestParseSegment {
+  private static final byte[] BYTES = "the quick brown fox".getBytes(StandardCharsets.UTF_8);
+
+  /**
+   * When the truncation flag is present in the metadata its value decides
+   * the outcome, regardless of what the content-length field says.
+   */
+  @Test
+  public void testMetadataFlag() throws Exception {
+
+    Content content = new Content();
+    Metadata metadata = new Metadata();
+    metadata.set(Response.TRUNCATED_CONTENT, "true");
+    content.setMetadata(metadata);
+    content.setContent(BYTES);
+    assertTrue(ParseSegment.isTruncated(content));
+
+    metadata.set(Response.TRUNCATED_CONTENT, "false");
+    assertFalse(ParseSegment.isTruncated(content));
+
+    //test that truncated_content=false overrides a length field that is
+    //larger than the actual content
+    metadata = new Metadata();
+    metadata.set(Response.TRUNCATED_CONTENT, "false");
+    metadata.set(Response.CONTENT_LENGTH, Integer.toString(BYTES.length + 10));
+    //the fresh Metadata must be attached, otherwise the assertion below
+    //would still be evaluated against the previous metadata object
+    content.setMetadata(metadata);
+    assertFalse(ParseSegment.isTruncated(content));
+
+    //test that truncated_content=true overrides a length field that
+    //matches the actual content
+    metadata = new Metadata();
+    metadata.set(Response.TRUNCATED_CONTENT, "true");
+    metadata.set(Response.CONTENT_LENGTH, Integer.toString(BYTES.length));
+    content.setMetadata(metadata);
+    assertTrue(ParseSegment.isTruncated(content));
+
+  }
+
+  /**
+   * Without the truncation flag, a content-length equal to the actual size
+   * is not truncated, while a larger content-length is.
+   */
+  @Test
+  public void testLength() throws Exception {
+    Content content = new Content();
+    Metadata metadata = new Metadata();
+    metadata.set(Response.CONTENT_LENGTH, Integer.toString(BYTES.length));
+    content.setMetadata(metadata);
+    content.setContent(BYTES);
+    assertFalse(ParseSegment.isTruncated(content));
+
+    metadata.set(Response.CONTENT_LENGTH, Integer.toString(BYTES.length * 2));
+    assertTrue(ParseSegment.isTruncated(content));
+  }
+
+  /**
+   * With neither the truncation flag nor a content-length field the content
+   * is reported as not truncated.
+   */
+  @Test
+  public void testNoLengthField() {
+    //test return false if there is no "Length" header field
+    Content content = new Content();
+    Metadata metadata = new Metadata();
+    content.setMetadata(metadata);
+    content.setContent(BYTES);
+    assertFalse(ParseSegment.isTruncated(content));
+  }
+}