Author: yonik
Date: Sun Jan  6 07:43:50 2008
New Revision: 609333

URL: http://svn.apache.org/viewvc?rev=609333&view=rev
Log:
new CSV lib, escape option for loader

Added:
    lucene/solr/trunk/lib/commons-csv-1.0-SNAPSHOT-r609327.jar   (with props)
Removed:
    lucene/solr/trunk/lib/commons-csv-0.1-SNAPSHOT.jar
Modified:
    lucene/solr/trunk/CHANGES.txt
    lucene/solr/trunk/src/java/org/apache/solr/handler/CSVRequestHandler.java
    lucene/solr/trunk/src/test/org/apache/solr/handler/TestCSVLoader.java

Modified: lucene/solr/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=609333&r1=609332&r2=609333&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Sun Jan  6 07:43:50 2008
@@ -272,6 +272,10 @@
     lucene-analyzers-2.2.0.jar -- includes support for German, Chinese,
     Russian, Dutch, Greek, Brazilian, Thai, and French.   (hossman)
 
+ 7. Upgraded to commons-CSV r609327, which fixes escaping bugs and
+    introduces new escaping and whitespace handling options to
+    increase compatibility with different formats.  (yonik)
+
 Build
  1. SOLR-411.  Changed the names of the Solr JARs to use the de facto standard JAR names based on
     project-name-version.jar.  This yields, for example:

Added: lucene/solr/trunk/lib/commons-csv-1.0-SNAPSHOT-r609327.jar
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/lib/commons-csv-1.0-SNAPSHOT-r609327.jar?rev=609333&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/solr/trunk/lib/commons-csv-1.0-SNAPSHOT-r609327.jar
------------------------------------------------------------------------------
    svn:executable = *

Propchange: lucene/solr/trunk/lib/commons-csv-1.0-SNAPSHOT-r609327.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: 
lucene/solr/trunk/src/java/org/apache/solr/handler/CSVRequestHandler.java
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/handler/CSVRequestHandler.java?rev=609333&r1=609332&r2=609333&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/handler/CSVRequestHandler.java 
(original)
+++ lucene/solr/trunk/src/java/org/apache/solr/handler/CSVRequestHandler.java 
Sun Jan  6 07:43:50 2008
@@ -99,6 +99,7 @@
   static String EMPTY="keepEmpty";
   static String SPLIT="split";
   static String ENCAPSULATOR="encapsulator";
+  static String ESCAPE="escape";
   static String OVERWRITE="overwrite";
 
   private static Pattern colonSplit = Pattern.compile(":");
@@ -216,7 +217,7 @@
       templateAdd.overwritePending=false;
     }
 
-    strategy = new CSVStrategy(',', '"', CSVStrategy.COMMENTS_DISABLED, true,  
false, true);
+    strategy = new CSVStrategy(',', '"', CSVStrategy.COMMENTS_DISABLED, 
CSVStrategy.ESCAPE_DISABLED, false, false, false, true);
     String sep = params.get(SEPARATOR);
     if (sep!=null) {
       if (sep.length()!=1) throw new SolrException( 
SolrException.ErrorCode.BAD_REQUEST,"Invalid separator:'"+sep+"'");
@@ -225,8 +226,32 @@
 
     String encapsulator = params.get(ENCAPSULATOR);
     if (encapsulator!=null) {
-      if (encapsulator.length()!=1) throw new SolrException( 
SolrException.ErrorCode.BAD_REQUEST,"Invalid encapsulator:'"+sep+"'");
-      strategy.setEncapsulator(encapsulator.charAt(0));
+      if (encapsulator.length()!=1) throw new SolrException( 
SolrException.ErrorCode.BAD_REQUEST,"Invalid encapsulator:'"+encapsulator+"'");
+    }
+
+    String escape = params.get(ESCAPE);
+    if (escape!=null) {
+      if (escape.length()!=1) throw new SolrException( 
SolrException.ErrorCode.BAD_REQUEST,"Invalid escape:'"+escape+"'");
+    }
+
+    // if only encapsulator or escape is set, disable the other escaping mechanism
+    if (encapsulator == null && escape != null) {
+      strategy.setEncapsulator((char)-2);  // TODO: add 
CSVStrategy.ENCAPSULATOR_DISABLED      
+      strategy.setEscape(escape.charAt(0));
+    } else {
+      if (encapsulator != null) {
+        strategy.setEncapsulator(encapsulator.charAt(0));
+      }
+      if (escape != null) {
+        char ch = escape.charAt(0);
+        strategy.setEscape(ch);
+        if (ch == '\\') {
+          // If the escape is the standard backslash, then also enable
+          // unicode escapes (it's harmless since 'u' would not otherwise
+          // be escaped).
+          strategy.setUnicodeEscapeInterpretation(true);
+        }
+      }
     }
 
     String fn = params.get(FIELDNAMES);

Modified: lucene/solr/trunk/src/test/org/apache/solr/handler/TestCSVLoader.java
URL: 
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/handler/TestCSVLoader.java?rev=609333&r1=609332&r2=609333&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/handler/TestCSVLoader.java 
(original)
+++ lucene/solr/trunk/src/test/org/apache/solr/handler/TestCSVLoader.java Sun 
Jan  6 07:43:50 2008
@@ -234,7 +234,9 @@
             +"100|^quoted^\n"
             +"101|a;'b';c\n"
             +"102|a;;b\n"
-            +"103|\n");
+            +"103|\n"
+            +"104|a\\\\b\n"  // no backslash escaping should be done by default
+    );
 
     loadLocal("stream.file",filename, "commit","true",
               "separator","|",
@@ -244,14 +246,38 @@
               "f.str_s.separator",";",
               "f.str_s.encapsulator","'"
     );
-    assertQ(req("id:[100 TO 110]"),"//[EMAIL PROTECTED]'4']");
+    assertQ(req("id:[100 TO 110]"),"//[EMAIL PROTECTED]'5']");
     assertQ(req("id:100"),"//[EMAIL PROTECTED]'str_s'][.='quoted']");
     assertQ(req("id:101"),"//[EMAIL PROTECTED]'str_s']/str[1][.='a']");
     assertQ(req("id:101"),"//[EMAIL PROTECTED]'str_s']/str[2][.='b']");
     assertQ(req("id:101"),"//[EMAIL PROTECTED]'str_s']/str[3][.='c']");
     assertQ(req("id:102"),"//[EMAIL PROTECTED]'str_s']/str[2][.='EMPTY']");
     assertQ(req("id:103"),"//[EMAIL PROTECTED]'str_s'][.='EMPTY']");
+    assertQ(req("id:104"),"//[EMAIL PROTECTED]'str_s'][.='a\\\\b']");
+
+    // test no escaping + double encapsulator escaping by default
+    makeFile("id,str_s\n"
+            +"100,\"quoted \"\" \\ string\"\n"
+            +"101,unquoted \"\" \\ string\n"     // double encap shouldn't be 
an escape outside encap
+            +"102,end quote \\\n"
+    );
+    loadLocal("stream.file",filename, "commit","true"
+    );
+    assertQ(req("id:100"),"//[EMAIL PROTECTED]'str_s'][.='quoted \" \\ 
string']");
+    assertQ(req("id:101"),"//[EMAIL PROTECTED]'str_s'][.='unquoted \"\" \\ 
string']");
+    assertQ(req("id:102"),"//[EMAIL PROTECTED]'str_s'][.='end quote \\']");
 
+
+    // setting an escape should disable encapsulator
+    makeFile("id,str_s\n"
+            +"100,\"quoted \"\" \\\" \\\\ string\"\n"  // quotes should be 
part of value
+            +"101,unquoted \"\" \\\" \\, \\\\ string\n"
+    );
+    loadLocal("stream.file",filename, "commit","true"
+            ,"escape","\\"
+    );
+    assertQ(req("id:100"),"//[EMAIL PROTECTED]'str_s'][.='\"quoted \"\" \" \\ 
string\"']");
+    assertQ(req("id:101"),"//[EMAIL PROTECTED]'str_s'][.='unquoted \"\" \" , 
\\ string']");
 
   }
 


Reply via email to