Author: dogacan
Date: Mon Sep 24 01:27:34 2007
New Revision: 578703

URL: http://svn.apache.org/viewvc?rev=578703&view=rev
Log:
NUTCH-529 - NodeWalker.skipChildren doesn't work for more than 1 child. 
Contributed by Emmanuel Joke.

Added:
    lucene/nutch/trunk/src/test/org/apache/nutch/util/TestNodeWalker.java
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=578703&r1=578702&r2=578703&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Sep 24 01:27:34 2007
@@ -136,6 +136,9 @@
 46. NUTCH-554 - Generator throws IOException on invalid urls.
     (Brian Whitman via ab)
 
+47. NUTCH-529 - NodeWalker.skipChildren doesn't work for more than 1 child.
+    (Emmanuel Joke via dogacan)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java?rev=578703&r1=578702&r2=578703&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java Mon Sep 24 01:27:34 2007
@@ -77,8 +77,7 @@
     
     int childLen = (currentChildren != null) ? currentChildren.getLength() : 0;
     
-    // put the children node on the stack in first to last order
-    for (int i = childLen - 1; i >= 0; i--) {
+    for (int i = 0 ; i < childLen ; i++) {
       Node child = nodes.peek();
       if (child.equals(currentChildren.item(i))) {
         nodes.pop();

Added: lucene/nutch/trunk/src/test/org/apache/nutch/util/TestNodeWalker.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/util/TestNodeWalker.java?rev=578703&view=auto
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/util/TestNodeWalker.java (added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/util/TestNodeWalker.java Mon Sep 24 01:27:34 2007
@@ -0,0 +1,105 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.ByteArrayInputStream;
+import junit.framework.TestCase;
+
+import org.apache.xerces.parsers.DOMParser;
+import org.w3c.dom.Node;
+import org.xml.sax.InputSource;
+
+
+
+
+/** Unit tests for NodeWalker methods. */
+public class TestNodeWalker extends TestCase {
+  public TestNodeWalker(String name) { 
+    super(name); 
+  }
+
+  /* a snapshot of the nutch webpage */
+  private final static String WEBPAGE= 
+  "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"
+  + "<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en\" xml:lang=\"en\"><head><title>Nutch</title></head>"
+  + "<body>"
+  + "<ul>"
+  + "<li>crawl several billion pages per month</li>"
+  + "<li>maintain an index of these pages</li>"
+  + "<li>search that index up to 1000 times per second</li>"
+  + "<li>provide very high quality search results</li>"
+  + "<li>operate at minimal cost</li>"
+  + "</ul>"
+  + "</body>"
+  + "</html>";
+
+  private final static String[] ULCONTENT = new String[4];
+  
+  protected void setUp() throws Exception{
+    ULCONTENT[0]="crawl several billion pages per month" ;
+    ULCONTENT[1]="maintain an index of these pages" ;
+    ULCONTENT[2]="search that index up to 1000 times per second"  ;
+    ULCONTENT[3]="operate at minimal cost" ;
+  }
+
+  public void testSkipChildren() {
+    DOMParser parser= new DOMParser();
+    try {
+      parser.parse(new InputSource(new ByteArrayInputStream(WEBPAGE.getBytes())));
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+     
+    StringBuffer sb = new StringBuffer();
+    NodeWalker walker = new NodeWalker(parser.getDocument());
+    while (walker.hasNext()) {
+      Node currentNode = walker.nextNode();
+      short nodeType = currentNode.getNodeType();
+      if (nodeType == Node.TEXT_NODE) {
+        String text = currentNode.getNodeValue();
+        text = text.replaceAll("\\s+", " ");
+        sb.append(text);
+      }
+    }
+   assertTrue("UL Content can NOT be found in the node", findSomeUlContent(sb.toString()));
+     
+   StringBuffer sbSkip = new StringBuffer();
+   NodeWalker walkerSkip = new NodeWalker(parser.getDocument());
+   while (walkerSkip.hasNext()) {
+     Node currentNode = walkerSkip.nextNode();
+     String nodeName = currentNode.getNodeName();
+     short nodeType = currentNode.getNodeType();
+     if ("ul".equalsIgnoreCase(nodeName)) {
+       walkerSkip.skipChildren();
+     }
+     if (nodeType == Node.TEXT_NODE) {
+       String text = currentNode.getNodeValue();
+       text = text.replaceAll("\\s+", " ");
+       sbSkip.append(text);
+     }
+   }
+   assertFalse("UL Content can be found in the node", findSomeUlContent(sbSkip.toString()));
+  }
+  
+  public boolean findSomeUlContent(String str) {
+    for(int i=0; i<ULCONTENT.length ; i++){
+      if(str.contains(ULCONTENT[i])) return true;
+    }    
+    return false;
+  }
+}


Reply via email to