Author: yonik
Date: Sun Jan 6 07:43:50 2008
New Revision: 609333
URL: http://svn.apache.org/viewvc?rev=609333&view=rev
Log:
new CSV lib, escape option for loader
Added:
lucene/solr/trunk/lib/commons-csv-1.0-SNAPSHOT-r609327.jar (with props)
Removed:
lucene/solr/trunk/lib/commons-csv-0.1-SNAPSHOT.jar
Modified:
lucene/solr/trunk/CHANGES.txt
lucene/solr/trunk/src/java/org/apache/solr/handler/CSVRequestHandler.java
lucene/solr/trunk/src/test/org/apache/solr/handler/TestCSVLoader.java
Modified: lucene/solr/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=609333&r1=609332&r2=609333&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Sun Jan 6 07:43:50 2008
@@ -272,6 +272,10 @@
lucene-analyzers-2.2.0.jar -- includes support for German, Chinese,
Russian, Dutch, Greek, Brazilian, Thai, and French. (hossman)
+ 7. Upgraded to commons-CSV r609327, which fixes escaping bugs and
+ introduces new escaping and whitespace handling options to
+ increase compatibility with different formats. (yonik)
+
Build
1. SOLR-411. Changed the names of the Solr JARs to use the defacto standard
JAR names based on
project-name-version.jar. This yields, for example:
Added: lucene/solr/trunk/lib/commons-csv-1.0-SNAPSHOT-r609327.jar
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/lib/commons-csv-1.0-SNAPSHOT-r609327.jar?rev=609333&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/solr/trunk/lib/commons-csv-1.0-SNAPSHOT-r609327.jar
------------------------------------------------------------------------------
svn:executable = *
Propchange: lucene/solr/trunk/lib/commons-csv-1.0-SNAPSHOT-r609327.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified:
lucene/solr/trunk/src/java/org/apache/solr/handler/CSVRequestHandler.java
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/handler/CSVRequestHandler.java?rev=609333&r1=609332&r2=609333&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/handler/CSVRequestHandler.java
(original)
+++ lucene/solr/trunk/src/java/org/apache/solr/handler/CSVRequestHandler.java
Sun Jan 6 07:43:50 2008
@@ -99,6 +99,7 @@
static String EMPTY="keepEmpty";
static String SPLIT="split";
static String ENCAPSULATOR="encapsulator";
+ static String ESCAPE="escape";
static String OVERWRITE="overwrite";
private static Pattern colonSplit = Pattern.compile(":");
@@ -216,7 +217,7 @@
templateAdd.overwritePending=false;
}
- strategy = new CSVStrategy(',', '"', CSVStrategy.COMMENTS_DISABLED, true,
false, true);
+ strategy = new CSVStrategy(',', '"', CSVStrategy.COMMENTS_DISABLED,
CSVStrategy.ESCAPE_DISABLED, false, false, false, true);
String sep = params.get(SEPARATOR);
if (sep!=null) {
if (sep.length()!=1) throw new SolrException(
SolrException.ErrorCode.BAD_REQUEST,"Invalid separator:'"+sep+"'");
@@ -225,8 +226,32 @@
String encapsulator = params.get(ENCAPSULATOR);
if (encapsulator!=null) {
- if (encapsulator.length()!=1) throw new SolrException(
SolrException.ErrorCode.BAD_REQUEST,"Invalid encapsulator:'"+sep+"'");
- strategy.setEncapsulator(encapsulator.charAt(0));
+ if (encapsulator.length()!=1) throw new SolrException(
SolrException.ErrorCode.BAD_REQUEST,"Invalid encapsulator:'"+encapsulator+"'");
+ }
+
+ String escape = params.get(ESCAPE);
+ if (escape!=null) {
+ if (escape.length()!=1) throw new SolrException(
SolrException.ErrorCode.BAD_REQUEST,"Invalid escape:'"+escape+"'");
+ }
+
+ // if only encapsulator or escape is set, disable the other escaping mechanism
+ if (encapsulator == null && escape != null) {
+ strategy.setEncapsulator((char)-2); // TODO: add CSVStrategy.ENCAPSULATOR_DISABLED
+ strategy.setEscape(escape.charAt(0));
+ } else {
+ if (encapsulator != null) {
+ strategy.setEncapsulator(encapsulator.charAt(0));
+ }
+ if (escape != null) {
+ char ch = escape.charAt(0);
+ strategy.setEscape(ch);
+ if (ch == '\\') {
+ // If the escape is the standard backslash, then also enable
+ // Unicode escapes (it's harmless since 'u' would not otherwise
+ // be escaped).
+ strategy.setUnicodeEscapeInterpretation(true);
+ }
+ }
}
String fn = params.get(FIELDNAMES);
Modified: lucene/solr/trunk/src/test/org/apache/solr/handler/TestCSVLoader.java
URL:
http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/handler/TestCSVLoader.java?rev=609333&r1=609332&r2=609333&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/handler/TestCSVLoader.java
(original)
+++ lucene/solr/trunk/src/test/org/apache/solr/handler/TestCSVLoader.java Sun
Jan 6 07:43:50 2008
@@ -234,7 +234,9 @@
+"100|^quoted^\n"
+"101|a;'b';c\n"
+"102|a;;b\n"
- +"103|\n");
+ +"103|\n"
+ +"104|a\\\\b\n" // no backslash escaping should be done by default
+ );
loadLocal("stream.file",filename, "commit","true",
"separator","|",
@@ -244,14 +246,38 @@
"f.str_s.separator",";",
"f.str_s.encapsulator","'"
);
- assertQ(req("id:[100 TO 110]"),"//*[@numFound='4']");
+ assertQ(req("id:[100 TO 110]"),"//*[@numFound='5']");
assertQ(req("id:100"),"//str[@name='str_s'][.='quoted']");
assertQ(req("id:101"),"//arr[@name='str_s']/str[1][.='a']");
assertQ(req("id:101"),"//arr[@name='str_s']/str[2][.='b']");
assertQ(req("id:101"),"//arr[@name='str_s']/str[3][.='c']");
assertQ(req("id:102"),"//arr[@name='str_s']/str[2][.='EMPTY']");
assertQ(req("id:103"),"//str[@name='str_s'][.='EMPTY']");
+ assertQ(req("id:104"),"//str[@name='str_s'][.='a\\\\b']");
+
+ // test no escaping + double encapsulator escaping by default
+ makeFile("id,str_s\n"
+ +"100,\"quoted \"\" \\ string\"\n"
+ +"101,unquoted \"\" \\ string\n" // double encap shouldn't be an escape outside encap
+ +"102,end quote \\\n"
+ );
+ loadLocal("stream.file",filename, "commit","true"
+ );
+ assertQ(req("id:100"),"//str[@name='str_s'][.='quoted \" \\ string']");
+ assertQ(req("id:101"),"//str[@name='str_s'][.='unquoted \"\" \\ string']");
+ assertQ(req("id:102"),"//str[@name='str_s'][.='end quote \\']");
+
+ // setting an escape should disable encapsulator
+ makeFile("id,str_s\n"
+ +"100,\"quoted \"\" \\\" \\\\ string\"\n" // quotes should be part of value
+ +"101,unquoted \"\" \\\" \\, \\\\ string\n"
+ );
+ loadLocal("stream.file",filename, "commit","true"
+ ,"escape","\\"
+ );
+ assertQ(req("id:100"),"//str[@name='str_s'][.='\"quoted \"\" \" \\ string\"']");
+ assertQ(req("id:101"),"//str[@name='str_s'][.='unquoted \"\" \" , \\ string']");
}