Author: dogacan Date: Mon Sep 24 01:27:34 2007 New Revision: 578703 URL: http://svn.apache.org/viewvc?rev=578703&view=rev Log: NUTCH-529 - NodeWalker.skipChildren doesn't work for more than 1 child. Contributed by Emmanuel Joke.
Added: lucene/nutch/trunk/src/test/org/apache/nutch/util/TestNodeWalker.java Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=578703&r1=578702&r2=578703&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Sep 24 01:27:34 2007 @@ -136,6 +136,9 @@ 46. NUTCH-554 - Generator throws IOException on invalid urls. (Brian Whitman via ab) +47. NUTCH-529 - NodeWalker.skipChildren doesn't work for more than 1 child. + (Emmanuel Joke via dogacan) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java?rev=578703&r1=578702&r2=578703&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java Mon Sep 24 01:27:34 2007 @@ -77,8 +77,7 @@ int childLen = (currentChildren != null) ? currentChildren.getLength() : 0; - // put the children node on the stack in first to last order - for (int i = childLen - 1; i >= 0; i--) { + for (int i = 0 ; i < childLen ; i++) { Node child = nodes.peek(); if (child.equals(currentChildren.item(i))) { nodes.pop(); Added: lucene/nutch/trunk/src/test/org/apache/nutch/util/TestNodeWalker.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/util/TestNodeWalker.java?rev=578703&view=auto ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/util/TestNodeWalker.java (added) +++ lucene/nutch/trunk/src/test/org/apache/nutch/util/TestNodeWalker.java Mon Sep 24 01:27:34 2007 @@ -0,0 +1,105 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.util; + +import java.io.ByteArrayInputStream; +import junit.framework.TestCase; + +import org.apache.xerces.parsers.DOMParser; +import org.w3c.dom.Node; +import org.xml.sax.InputSource; + + + + +/** Unit tests for NodeWalker methods. */ +public class TestNodeWalker extends TestCase { + public TestNodeWalker(String name) { + super(name); + } + + /* a snapshot of the nutch webpage */ + private final static String WEBPAGE= + "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">" + + "<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en\" xml:lang=\"en\"><head><title>Nutch</title></head>" + + "<body>" + + "<ul>" + + "<li>crawl several billion pages per month</li>" + + "<li>maintain an index of these pages</li>" + + "<li>search that index up to 1000 times per second</li>" + + "<li>provide very high quality search results</li>" + + "<li>operate at minimal cost</li>" + + "</ul>" + + "</body>" + + "</html>"; + + private final static String[] ULCONTENT = new String[4]; + + protected void setUp() throws Exception{ + ULCONTENT[0]="crawl several billion pages per month" ; + ULCONTENT[1]="maintain an index of these pages" ; + ULCONTENT[2]="search that index up to 1000 times per second" ; + ULCONTENT[3]="operate at minimal cost" ; + } + + public void testSkipChildren() { + DOMParser parser= new DOMParser(); + try { + parser.parse(new InputSource(new ByteArrayInputStream(WEBPAGE.getBytes()))); + } catch (Exception e) { + e.printStackTrace(); + } + + StringBuffer sb = new StringBuffer(); + NodeWalker walker = new NodeWalker(parser.getDocument()); + while (walker.hasNext()) { + Node currentNode = walker.nextNode(); + short nodeType = currentNode.getNodeType(); + if (nodeType == Node.TEXT_NODE) { + String text = currentNode.getNodeValue(); + text = text.replaceAll("\\s+", " "); + sb.append(text); + } + } + assertTrue("UL Content can NOT be found in the node", findSomeUlContent(sb.toString())); + + StringBuffer sbSkip = new StringBuffer(); + NodeWalker walkerSkip = new NodeWalker(parser.getDocument()); + while (walkerSkip.hasNext()) { + Node currentNode = walkerSkip.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + if ("ul".equalsIgnoreCase(nodeName)) { + walkerSkip.skipChildren(); + } + if (nodeType == Node.TEXT_NODE) { + String text = currentNode.getNodeValue(); + text = text.replaceAll("\\s+", " "); + sbSkip.append(text); + } + } + assertFalse("UL Content can be found in the node", findSomeUlContent(sbSkip.toString())); + } + + public boolean findSomeUlContent(String str) { + for(int i=0; i<ULCONTENT.length ; i++){ + if(str.contains(ULCONTENT[i])) return true; + } + return false; + } +}