Author: siren
Date: Tue Jun 20 12:11:00 2006
New Revision: 415772

URL: http://svn.apache.org/viewvc?rev=415772&view=rev
Log:
NUTCH-110 fix illegal xml output contributed by [EMAIL PROTECTED]

Added:
    lucene/nutch/trunk/src/test/org/apache/nutch/searcher/TestOpenSearchServlet.java
Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java?rev=415772&r1=415771&r2=415772&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java Tue Jun 20 12:11:00 2006
@@ -262,23 +262,57 @@
   private static void addNode(Document doc, Node parent,
                               String name, String text) {
     Element child = doc.createElement(name);
-    child.appendChild(doc.createTextNode(text));
+    child.appendChild(doc.createTextNode(getLegalXml(text)));
     parent.appendChild(child);
   }
 
   private static void addNode(Document doc, Node parent,
                               String ns, String name, String text) {
     Element child = doc.createElementNS((String)NS_MAP.get(ns), ns+":"+name);
-    child.appendChild(doc.createTextNode(text));
+    child.appendChild(doc.createTextNode(getLegalXml(text)));
     parent.appendChild(child);
   }
 
   private static void addAttribute(Document doc, Element node,
                                    String name, String value) {
     Attr attribute = doc.createAttribute(name);
-    attribute.setValue(value);
+    attribute.setValue(getLegalXml(value));
     node.getAttributes().setNamedItem(attribute);
   }
 
-}
+  /*
+   * Ensure string is legal xml.
+   * @param text String to verify.
+   * @return Passed <code>text</code> or a new string with illegal
+   * characters removed if any found in <code>text</code>.
+   * @see http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char
+   */
+  protected static String getLegalXml(final String text) {
+      if (text == null) {
+          return null;
+      }
+      StringBuffer buffer = null;
+      for (int i = 0; i < text.length(); i++) {
+        char c = text.charAt(i);
+        if (!isLegalXml(c)) {
+         if (buffer == null) {
+              // Start up a buffer.  Copy characters here from now on
+              // now we've found at least one bad character in original.
+             buffer = new StringBuffer(text.length());
+              buffer.append(text.substring(0, i));
+          }
+        } else {
+           if (buffer != null) {
+             buffer.append(c);
+           }
+        }
+      }
+      return (buffer != null)? buffer.toString(): text;
+  }
+ 
+  private static boolean isLegalXml(final char c) {
+    return c == 0x9 || c == 0xa || c == 0xd || (c >= 0x20 && c <= 0xd7ff)
+        || (c >= 0xe000 && c <= 0xfffd) || (c >= 0x10000 && c <= 0x10ffff);
+  }
 
+}

Added: lucene/nutch/trunk/src/test/org/apache/nutch/searcher/TestOpenSearchServlet.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/searcher/TestOpenSearchServlet.java?rev=415772&view=auto
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/searcher/TestOpenSearchServlet.java (added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/searcher/TestOpenSearchServlet.java Tue Jun 20 12:11:00 2006
@@ -0,0 +1,17 @@
+package org.apache.nutch.searcher;
+
+import junit.framework.TestCase;
+
+public class TestOpenSearchServlet extends TestCase {
+
+  /**
+   * Test removing of illegal xml chars from string
+   */
+  public void testGetLegalXml(){
+    assertEquals("hello",OpenSearchServlet.getLegalXml("hello"));
+    assertEquals("hello",OpenSearchServlet.getLegalXml("he\u0000llo"));
+    assertEquals("hello",OpenSearchServlet.getLegalXml("\u0000he\u0000llo"));
+    assertEquals("hello",OpenSearchServlet.getLegalXml("\u0000he\u0000llo\u0000"));
+  }
+  
+}




_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to