This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 90849124d NUTCH-3020 -- ParseSegment should check for okhttp's truncation flag (#794)
90849124d is described below

commit 90849124d757fb0417ea90576e88b1f55da616f1
Author: Tim Allison <talli...@apache.org>
AuthorDate: Mon Nov 6 15:07:10 2023 -0500

    NUTCH-3020 -- ParseSegment should check for okhttp's truncation flag (#794)
    
    * ParseSegment should check for okhttp's truncated flag
---
 src/java/org/apache/nutch/parse/ParseSegment.java  | 11 +++
 .../org/apache/nutch/parse/TestParseSegment.java   | 81 ++++++++++++++++++++++
 2 files changed, 92 insertions(+)

diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java
index 1995a880e..e9f041a5f 100644
--- a/src/java/org/apache/nutch/parse/ParseSegment.java
+++ b/src/java/org/apache/nutch/parse/ParseSegment.java
@@ -180,6 +180,17 @@ public class ParseSegment extends NutchTool implements Tool {
     if (metadata == null)
       return false;
 
+    //check for okhttp or other protocol's truncated flag
+    //if the flag is there, no matter the value, trust it.
+    if (metadata.get(Response.TRUNCATED_CONTENT) != null) {
+      if ("true".equals(metadata.get(Response.TRUNCATED_CONTENT))) {
+        LOG.info(content.getUrl() + " skipped. Protocol metadata indicates truncated content, " +
+                "actualSize= " + content.getContent().length);
+        return true;
+      }
+      return false;
+    }
+
     String lengthStr = metadata.get(Response.CONTENT_LENGTH);
     if (lengthStr != null)
       lengthStr = lengthStr.trim();
diff --git a/src/test/org/apache/nutch/parse/TestParseSegment.java b/src/test/org/apache/nutch/parse/TestParseSegment.java
new file mode 100644
index 000000000..dd7f4f920
--- /dev/null
+++ b/src/test/org/apache/nutch/parse/TestParseSegment.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+import static junit.framework.TestCase.assertFalse;
+import static junit.framework.TestCase.assertTrue;
+
+import java.nio.charset.StandardCharsets;
+
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Content;
+import org.junit.Test;
+
+/** Unit tests for {@link ParseSegment#isTruncated(Content)}. */
+public class TestParseSegment {
+  private static final byte[] BYTES = "the quick brown fox".getBytes(StandardCharsets.UTF_8);
+
+  /** A present truncation flag decides the result, whatever Response.CONTENT_LENGTH says. */
+  @Test
+  public void testMetadataFlag() throws Exception {
+    Content content = new Content();
+    Metadata metadata = new Metadata();
+    metadata.set(Response.TRUNCATED_CONTENT, "true");
+    content.setMetadata(metadata);
+    content.setContent(BYTES);
+    assertTrue(ParseSegment.isTruncated(content));
+
+    metadata.set(Response.TRUNCATED_CONTENT, "false");
+    assertFalse(ParseSegment.isTruncated(content));
+
+    //flag "false" wins over a declared length larger than the content (same Metadata is reused)
+    metadata.set(Response.TRUNCATED_CONTENT, "false");
+    metadata.set(Response.CONTENT_LENGTH, Integer.toString(BYTES.length + 10));
+    assertFalse(ParseSegment.isTruncated(content));
+
+    //flag "true" wins over a declared length that matches the content exactly
+    metadata.set(Response.TRUNCATED_CONTENT, "true");
+    metadata.set(Response.CONTENT_LENGTH, Integer.toString(BYTES.length));
+    assertTrue(ParseSegment.isTruncated(content));
+  }
+
+  /** Without a flag, only a declared length larger than the content counts as truncated. */
+  @Test
+  public void testLength() throws Exception {
+    Content content = new Content();
+    Metadata metadata = new Metadata();
+    metadata.set(Response.CONTENT_LENGTH, Integer.toString(BYTES.length));
+    content.setMetadata(metadata);
+    content.setContent(BYTES);
+    assertFalse(ParseSegment.isTruncated(content));
+
+    metadata.set(Response.CONTENT_LENGTH, Integer.toString(BYTES.length * 2));
+    assertTrue(ParseSegment.isTruncated(content));
+  }
+
+  /** With neither a truncation flag nor a declared length, content is not truncated. */
+  @Test
+  public void testNoLengthField() {
+    //test return false if there is no "Length" header field
+    Content content = new Content();
+    Metadata metadata = new Metadata();
+    content.setMetadata(metadata);
+    content.setContent(BYTES);
+    assertFalse(ParseSegment.isTruncated(content));
+  }
+}

Reply via email to