This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 90849124d NUTCH-3020 -- ParseSegment should check for okhttp's
truncation flag (#794)
90849124d is described below
commit 90849124d757fb0417ea90576e88b1f55da616f1
Author: Tim Allison <[email protected]>
AuthorDate: Mon Nov 6 15:07:10 2023 -0500
NUTCH-3020 -- ParseSegment should check for okhttp's truncation flag (#794)
* ParseSegment should check for okhttp's truncated flag
---
src/java/org/apache/nutch/parse/ParseSegment.java | 11 +++
.../org/apache/nutch/parse/TestParseSegment.java | 81 ++++++++++++++++++++++
2 files changed, 92 insertions(+)
diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java
index 1995a880e..e9f041a5f 100644
--- a/src/java/org/apache/nutch/parse/ParseSegment.java
+++ b/src/java/org/apache/nutch/parse/ParseSegment.java
@@ -180,6 +180,17 @@ public class ParseSegment extends NutchTool implements Tool {
if (metadata == null)
return false;
+ //check for okhttp or other protocol's truncated flag
+ //if the flag is there, no matter the value, trust it.
+ if (metadata.get(Response.TRUNCATED_CONTENT) != null) {
+ if ("true".equals(metadata.get(Response.TRUNCATED_CONTENT))) {
+ LOG.info(content.getUrl() + " skipped. Protocol metadata indicates
truncated content, " +
+ "actualSize= " + content.getContent().length);
+ return true;
+ }
+ return false;
+ }
+
String lengthStr = metadata.get(Response.CONTENT_LENGTH);
if (lengthStr != null)
lengthStr = lengthStr.trim();
diff --git a/src/test/org/apache/nutch/parse/TestParseSegment.java b/src/test/org/apache/nutch/parse/TestParseSegment.java
new file mode 100644
index 000000000..dd7f4f920
--- /dev/null
+++ b/src/test/org/apache/nutch/parse/TestParseSegment.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+import static junit.framework.TestCase.assertFalse;
+import static junit.framework.TestCase.assertTrue;
+
+import java.nio.charset.StandardCharsets;
+
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Content;
+import org.junit.Test;
+
+public class TestParseSegment {
+  private static final byte[] BYTES = "the quick brown fox".getBytes(StandardCharsets.UTF_8);
+
+  /** An explicit truncation flag is trusted, whatever the length field says. */
+  @Test
+  public void testMetadataFlag() throws Exception {
+    Content content = new Content();
+    Metadata metadata = new Metadata();
+    metadata.set(Response.TRUNCATED_CONTENT, "true");
+    content.setMetadata(metadata);
+    content.setContent(BYTES);
+    assertTrue(ParseSegment.isTruncated(content));
+    metadata.set(Response.TRUNCATED_CONTENT, "false");
+    assertFalse(ParseSegment.isTruncated(content));
+
+    // flag "false" overrides a length field larger than the actual content
+    metadata = new Metadata();
+    metadata.set(Response.TRUNCATED_CONTENT, "false");
+    metadata.set(Response.CONTENT_LENGTH, Integer.toString(BYTES.length + 10));
+    content.setMetadata(metadata); // the fresh metadata must be attached to be seen
+    assertFalse(ParseSegment.isTruncated(content));
+
+    // flag "true" overrides a length field that matches the actual content
+    metadata = new Metadata();
+    metadata.set(Response.TRUNCATED_CONTENT, "true");
+    metadata.set(Response.CONTENT_LENGTH, Integer.toString(BYTES.length));
+    content.setMetadata(metadata);
+    assertTrue(ParseSegment.isTruncated(content));
+  }
+
+  /** Without a flag, content shorter than the declared length is truncated. */
+  @Test
+  public void testLength() throws Exception {
+    Content content = new Content();
+    Metadata metadata = new Metadata();
+    metadata.set(Response.CONTENT_LENGTH, Integer.toString(BYTES.length));
+    content.setMetadata(metadata);
+    content.setContent(BYTES);
+    assertFalse(ParseSegment.isTruncated(content));
+
+    metadata.set(Response.CONTENT_LENGTH, Integer.toString(BYTES.length * 2));
+    assertTrue(ParseSegment.isTruncated(content));
+  }
+
+  /** With neither a flag nor a length field, content is not truncated. */
+  @Test
+  public void testNoLengthField() {
+    Content content = new Content();
+    content.setMetadata(new Metadata());
+    content.setContent(BYTES);
+    assertFalse(ParseSegment.isTruncated(content));
+  }
+}