tokenize into lp:zorba

Matthias Brantner Fri, 23 Dec 2011 12:34:37 -0800

Matthias Brantner has proposed merging lp:~zorba-coders/zorba/tokenize into 
lp:zorba.


Requested reviews:
  Matthias Brantner (matthias-brantner)
  Paul J. Lucas (paul-lucas)
  William Candillon (wcandillon)
Related bugs:
  Bug #898074 in Zorba: "fn:tokenize() doesn't stream"
  https://bugs.launchpad.net/zorba/+bug/898074

For more details, see:
https://code.launchpad.net/~zorba-coders/zorba/tokenize/+merge/86829

implementation of string:split function that doesn't accept regular expressions 
but allows for streamable processing of the input (resolves bug #898074)
-- 
https://code.launchpad.net/~zorba-coders/zorba/tokenize/+merge/86829
Your team Zorba Coders is subscribed to branch lp:zorba.

=== modified file 'ChangeLog'
--- ChangeLog	2011-12-23 19:38:53 +0000
+++ ChangeLog	2011-12-23 20:33:38 +0000
@@ -12,6 +12,8 @@
     set multiple times via the c++ api).
   * Fixed bug #905050 (setting and getting the context item type via the c++ api)
   * Added createDayTimeDuration, createYearMonthDuration, createDocumentNode, createCommentNode, createPiNode to api's ItemFactory.
+  * Added split function to the string module that allows for streamable tokenization but doesn't have regular expression
+    support.
   * zerr is not predeclared anymore to be http://www.zorba-xquery.com/errors
 
 version 2.1

=== modified file 'modules/com/zorba-xquery/www/modules/CMakeLists.txt'
--- modules/com/zorba-xquery/www/modules/CMakeLists.txt	2011-12-21 14:40:33 +0000
+++ modules/com/zorba-xquery/www/modules/CMakeLists.txt	2011-12-23 20:33:38 +0000
@@ -58,7 +58,7 @@
   URI "http://www.zorba-xquery.com/modules/reflection")
 DECLARE_ZORBA_MODULE(FILE schema.xq VERSION 2.0
   URI "http://www.zorba-xquery.com/modules/schema")
-DECLARE_ZORBA_MODULE(FILE string.xq VERSION 2.0
+DECLARE_ZORBA_MODULE(FILE string.xq VERSION 2.1
   URI "http://www.zorba-xquery.com/modules/string")
 DECLARE_ZORBA_MODULE(FILE xml.xq VERSION 2.0
   URI "http://www.zorba-xquery.com/modules/xml")

=== modified file 'modules/com/zorba-xquery/www/modules/string.xq'
--- modules/com/zorba-xquery/www/modules/string.xq	2011-08-03 15:12:40 +0000
+++ modules/com/zorba-xquery/www/modules/string.xq	2011-12-23 20:33:38 +0000
@@ -25,7 +25,7 @@
  :)
 module namespace string = "http://www.zorba-xquery.com/modules/string";
 declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
-declare option ver:module-version "2.0";
+declare option ver:module-version "2.1";
 
 (:~
  : This function materializes a streamable string.
@@ -63,3 +63,23 @@
  :
  :)
 declare function string:is-streamable($s as xs:string) as xs:boolean external;
+
+(:~
+ : Returns a sequence of strings constructed by splitting the input wherever the given
+ : separator is found.
+ :
+ : The function is different from fn:tokenize. It doesn't allow
+ : the separator to be a regular expression. This restriction allows for a more
+ : performant implementation. Specifically, the function processes
+ : streamable strings as input in a streamable way which is particularly useful
+ : to tokenize huge strings (e.g. if returned by the file module's read-text
+ : function).
+ :
+ : @param $s the input string to split
+ : @param $separator the separator used for splitting the input string $s
+ :
+ : @return a sequence of strings constructed by splitting the input
+ :)
+declare function string:split(
+  $s as xs:string,
+  $separator as xs:string) as xs:string* external;

=== modified file 'src/functions/pregenerated/func_strings.cpp'
--- src/functions/pregenerated/func_strings.cpp	2011-12-21 14:40:33 +0000
+++ src/functions/pregenerated/func_strings.cpp	2011-12-23 20:33:38 +0000
@@ -320,6 +320,16 @@
   return new StringIsStreamableIterator(sctx, loc, argv);
 }
 
+PlanIter_t fn_zorba_string_split::codegen(
+  CompilerCB*,
+  static_context* sctx,
+  const QueryLoc& loc,
+  std::vector<PlanIter_t>& argv,
+  AnnotationHolder& ann) const
+{
+  return new StringSplitIterator(sctx, loc, argv);
+}
+
 void populate_context_strings(static_context* sctx)
 {
   {
@@ -890,6 +900,19 @@
 
   }
 
+
+  {
+    
+
+    DECL_WITH_KIND(sctx, fn_zorba_string_split,
+        (createQName("http://www.zorba-xquery.com/modules/string","","split"), 
+        GENV_TYPESYSTEM.STRING_TYPE_ONE, 
+        GENV_TYPESYSTEM.STRING_TYPE_ONE, 
+        GENV_TYPESYSTEM.STRING_TYPE_STAR),
+        FunctionConsts::FN_ZORBA_STRING_SPLIT_2);
+
+  }
+
 }
 
 

=== modified file 'src/functions/pregenerated/func_strings.h'
--- src/functions/pregenerated/func_strings.h	2011-12-22 14:14:53 +0000
+++ src/functions/pregenerated/func_strings.h	2011-12-23 20:33:38 +0000
@@ -481,6 +481,19 @@
 };
 
 
+//fn-zorba-string:split
+class fn_zorba_string_split : public function
+{
+public:
+  fn_zorba_string_split(const signature& sig, FunctionConsts::FunctionKind kind)
+    : function(sig, kind) {
+
+}
+
+  CODEGEN_DECL();
+};
+
+
 } //namespace zorba
 
 

=== modified file 'src/functions/pregenerated/function_enum.h'
--- src/functions/pregenerated/function_enum.h	2011-12-21 14:40:33 +0000
+++ src/functions/pregenerated/function_enum.h	2011-12-23 20:33:38 +0000
@@ -371,6 +371,7 @@
   FN_ANALYZE_STRING_3,
   FN_ZORBA_STRING_MATERIALIZE_1,
   FN_ZORBA_STRING_IS_STREAMABLE_1,
+  FN_ZORBA_STRING_SPLIT_2,
   FN_ZORBA_XQDOC_XQDOC_1,
   FN_ZORBA_XQDOC_XQDOC_CONTENT_1,
 

=== modified file 'src/runtime/spec/strings/strings.xml'
--- src/runtime/spec/strings/strings.xml	2011-12-21 14:40:33 +0000
+++ src/runtime/spec/strings/strings.xml	2011-12-23 20:33:38 +0000
@@ -729,4 +729,35 @@
 
 </zorba:iterator>
 
+<!--
+/*******************************************************************************
+ * string:split
+********************************************************************************/
+-->
+<zorba:iterator name="StringSplitIterator">
+
+  <zorba:description author="Matthias Brantner">
+    string:split
+  </zorba:description>
+
+  <zorba:function>
+    <zorba:signature localname="split" prefix="fn-zorba-string">
+      <zorba:param>xs:string</zorba:param>
+      <zorba:param>xs:string</zorba:param>
+      <zorba:output>xs:string*</zorba:output>
+    </zorba:signature>
+  </zorba:function>
+
+  <zorba:state>
+    <zorba:member type="zstring" name="theSeparator"
+      brief="separator for the tokenization"/>
+    <zorba:member type="std::istream*" name="theIStream"
+      brief="the remaining string (if the input is streamable)"/>
+    <zorba:member type="zstring" name="theInput"
+      brief="the string to tokenize (if the input is not streamable)"/>
+    <zorba:member type="size_t" name="theNextStartPos" defaultValue="0"/>
+  </zorba:state>
+
+</zorba:iterator>
+
 </zorba:iterators>

=== modified file 'src/runtime/strings/pregenerated/strings.cpp'
--- src/runtime/strings/pregenerated/strings.cpp	2011-12-21 14:40:33 +0000
+++ src/runtime/strings/pregenerated/strings.cpp	2011-12-23 20:33:38 +0000
@@ -830,6 +830,48 @@
 // </StringIsStreamableIterator>
 
 
+// <StringSplitIterator>
+const char* StringSplitIterator::class_name_str = "StringSplitIterator";
+StringSplitIterator::class_factory<StringSplitIterator>
+StringSplitIterator::g_class_factory;
+
+const serialization::ClassVersion 
+StringSplitIterator::class_versions[] ={{ 1, 0x000905, false}};
+
+const int StringSplitIterator::class_versions_count =
+sizeof(StringSplitIterator::class_versions)/sizeof(struct serialization::ClassVersion);
+
+void StringSplitIterator::accept(PlanIterVisitor& v) const {
+  v.beginVisit(*this);
+
+  std::vector<PlanIter_t>::const_iterator lIter = theChildren.begin();
+  std::vector<PlanIter_t>::const_iterator lEnd = theChildren.end();
+  for ( ; lIter != lEnd; ++lIter ){
+    (*lIter)->accept(v);
+  }
+
+  v.endVisit(*this);
+}
+
+StringSplitIterator::~StringSplitIterator() {}
+
+StringSplitIteratorState::StringSplitIteratorState() {}
+
+StringSplitIteratorState::~StringSplitIteratorState() {}
+
+
+void StringSplitIteratorState::init(PlanState& planState) {
+  PlanIteratorState::init(planState);
+  theNextStartPos = 0;
+}
+
+void StringSplitIteratorState::reset(PlanState& planState) {
+  PlanIteratorState::reset(planState);
+  theNextStartPos = 0;
+}
+// </StringSplitIterator>
+
+
 
 }
 

=== modified file 'src/runtime/strings/pregenerated/strings.h'
--- src/runtime/strings/pregenerated/strings.h	2011-12-21 14:40:33 +0000
+++ src/runtime/strings/pregenerated/strings.h	2011-12-23 20:33:38 +0000
@@ -1075,6 +1075,58 @@
 };
 
 
+/**
+ * 
+ *    string:split
+ *  
+ * Author: Matthias Brantner
+ */
+class StringSplitIteratorState : public PlanIteratorState
+{
+public:
+  zstring theSeparator; //separator for the tokenization
+  std::istream* theIStream; //the remaining string (if the input is streamable)
+  zstring theInput; //the string to tokenize (if the input is not streamable)
+  size_t theNextStartPos; //
+
+  StringSplitIteratorState();
+
+  ~StringSplitIteratorState();
+
+  void init(PlanState&);
+  void reset(PlanState&);
+};
+
+class StringSplitIterator : public NaryBaseIterator<StringSplitIterator, StringSplitIteratorState>
+{ 
+public:
+  SERIALIZABLE_CLASS(StringSplitIterator);
+
+  SERIALIZABLE_CLASS_CONSTRUCTOR2T(StringSplitIterator,
+    NaryBaseIterator<StringSplitIterator, StringSplitIteratorState>);
+
+  void serialize( ::zorba::serialization::Archiver& ar)
+  {
+    serialize_baseclass(ar,
+    (NaryBaseIterator<StringSplitIterator, StringSplitIteratorState>*)this);
+  }
+
+  StringSplitIterator(
+    static_context* sctx,
+    const QueryLoc& loc,
+    std::vector<PlanIter_t>& children)
+    : 
+    NaryBaseIterator<StringSplitIterator, StringSplitIteratorState>(sctx, loc, children)
+  {}
+
+  virtual ~StringSplitIterator();
+
+  void accept(PlanIterVisitor& v) const;
+
+  bool nextImpl(store::Item_t& result, PlanState& aPlanState) const;
+};
+
+
 }
 #endif
 /*

=== modified file 'src/runtime/strings/strings_impl.cpp'
--- src/runtime/strings/strings_impl.cpp	2011-12-23 06:41:43 +0000
+++ src/runtime/strings/strings_impl.cpp	2011-12-23 20:33:38 +0000
@@ -140,6 +140,7 @@
       p = ec;
 
       if ( utf8::read( *state->theStream, ec ) == utf8::npos )
+      {
         if ( state->theStream->good() ) {
           //
           // If read() failed but the stream state is good, it means that an
@@ -165,6 +166,7 @@
             zerr::ZOSE0003_STREAM_READ_FAILURE, ERROR_LOC( loc )
           );
         }
+      }
       state->theResult.clear();
       state->theResult.push_back( utf8::next_char( p ) );
       
@@ -2284,5 +2286,133 @@
   STACK_END(state);
 }
 
+/**
+ *______________________________________________________________________
+ * Emits one substring of the input per call, cut at each literal separator.
+ * http://www.zorba-xquery.com/modules/string
+ * string:split
+ */
+bool StringSplitIterator::nextImpl(
+    store::Item_t& result,
+    PlanState& planState) const
+{
+  store::Item_t item;
+  size_t lNewPos = 0;  // streaming path: index into theSeparator to compare next; other path: find() result
+  zstring lToken;  // streaming path: characters collected for the token being built
+  zstring lPartialMatch;  // streaming path: characters that matched a separator prefix so far
+
+  StringSplitIteratorState* state;
+  DEFAULT_STACK_INIT(StringSplitIteratorState, state, planState);  // NOTE(review): lNewPos/lToken/lPartialMatch are locals, not state members - confirm their values across STACK_PUSH re-entry
+
+  // init phase, get input string and tokens
+  consumeNext(item, theChildren[0].getp(), planState);
+
+  if (item->isStreamable())
+  {
+    state->theIStream = &item->getStream();  // NOTE(review): address taken from 'item', which is reassigned by the consumeNext below - confirm the stream stays alive
+  }
+  else
+  {
+    state->theIStream = 0;  // null selects the non-streaming branch below
+    item->getStringValue2(state->theInput);
+  }
+
+  consumeNext(item, theChildren[1].getp(), planState);
+
+  item->getStringValue2(state->theSeparator);
+
+  // working phase, do the tokenization
+  if (state->theIStream)
+  {
+    while ( !state->theIStream->eof() )
+    {
+      utf8::encoded_char_type ec;
+      memset( ec, '\0' , sizeof(ec) );  // zero-filled so the error path below can walk the bytes up to the first NUL
+      utf8::storage_type *p;
+      p = ec;
+
+      if ( utf8::read( *state->theIStream, ec ) != utf8::npos )
+      {
+        if (state->theSeparator.compare(lNewPos, 1, ec) == 0)  // NOTE(review): compares one separator unit at lNewPos against all of ec - looks single-byte only; confirm for multi-byte characters
+        {
+          if (++lNewPos == state->theSeparator.length())  // whole separator seen: hand out the token collected so far
+          {
+            GENV_ITEMFACTORY->createString(result, lToken);
+            STACK_PUSH(true, state);
+          }
+          else
+          {
+            lPartialMatch.append(ec);
+          }
+        }
+        else
+        {
+          lToken.append(lPartialMatch);  // NOTE(review): lPartialMatch and lNewPos are not reset after this - confirm behaviour following a failed partial match
+          lToken.append(ec);
+        }
+      }
+      else
+      {
+        if (state->theIStream->good())  // read failed although the stream is still good: report the bytes as invalid UTF-8
+        {
+          char buf[ 6 /* bytes at most */ * 5 /* chars per byte */ ], *b = buf;
+          bool first = true;
+          for ( ; *p; ++p ) {
+            if ( first )
+              first = false;
+            else
+              *b++ = ',';
+            ::strcpy( b, "0x" );          b += 2;
+            ::sprintf( b, "%0hhX", *p );  b += 2;
+          }
+          throw XQUERY_EXCEPTION(
+            zerr::ZXQD0006_INVALID_UTF8_BYTE_SEQUENCE,
+            ERROR_PARAMS( buf ),
+            ERROR_LOC( loc )
+          );
+        }
+        if (!lToken.empty())  // end of input: emit the last token only if non-empty. NOTE(review): a pending lPartialMatch is not appended here - confirm
+        {
+          GENV_ITEMFACTORY->createString(result, lToken);
+          STACK_PUSH(true, state);
+        }
+        break;
+      }
+    }
+  }
+  else
+  {
+    while (true)
+    {
+      if (state->theNextStartPos == zstring::npos)  // npos was stored when the final piece was emitted
+      {
+        break;
+      }
+
+      lNewPos =
+        state->theInput.find(state->theSeparator, state->theNextStartPos);  // NOTE(review): with an empty theSeparator the start position below does not advance - confirm callers never pass ""
+      if (lNewPos != zstring::npos)
+      {
+        zstring lSubStr = state->theInput.substr(
+            state->theNextStartPos,
+            lNewPos - state->theNextStartPos);
+        GENV_ITEMFACTORY->createString(result, lSubStr);
+        state->theNextStartPos =
+          lNewPos==state->theInput.length() - state->theSeparator.length()  // separator ends exactly at the end of the input: finish without a trailing empty string
+          ? zstring::npos
+          : lNewPos + state->theSeparator.length();
+      }
+      else
+      {
+        zstring lSubStr = state->theInput.substr(state->theNextStartPos);  // no further separator: the rest of the input is the last piece
+        GENV_ITEMFACTORY->createString(result, lSubStr);
+        state->theNextStartPos = zstring::npos;
+      }
+      STACK_PUSH(true, state);
+    }
+  }
+
+  STACK_END(state);
+}
 } // namespace zorba
 /* vim:set et sw=2 ts=2: */

=== modified file 'src/runtime/visitors/pregenerated/planiter_visitor.h'
--- src/runtime/visitors/pregenerated/planiter_visitor.h	2011-12-21 14:40:33 +0000
+++ src/runtime/visitors/pregenerated/planiter_visitor.h	2011-12-23 20:33:38 +0000
@@ -582,6 +582,8 @@
 
     class StringIsStreamableIterator;
 
+    class StringSplitIterator;
+
     class XQDocIterator;
 
     class XQDocContentIterator;
@@ -1423,6 +1425,9 @@
     virtual void beginVisit ( const StringIsStreamableIterator& ) = 0;
     virtual void endVisit   ( const StringIsStreamableIterator& ) = 0;
 
+    virtual void beginVisit ( const StringSplitIterator& ) = 0;
+    virtual void endVisit   ( const StringSplitIterator& ) = 0;
+
     virtual void beginVisit ( const XQDocIterator& ) = 0;
     virtual void endVisit   ( const XQDocIterator& ) = 0;
 

=== modified file 'src/runtime/visitors/pregenerated/printer_visitor.cpp'
--- src/runtime/visitors/pregenerated/printer_visitor.cpp	2011-12-21 14:40:33 +0000
+++ src/runtime/visitors/pregenerated/printer_visitor.cpp	2011-12-23 20:33:38 +0000
@@ -3961,6 +3961,20 @@
 // </StringIsStreamableIterator>
 
 
+// <StringSplitIterator>
+void PrinterVisitor::beginVisit ( const StringSplitIterator& a) {
+  thePrinter.startBeginVisit("StringSplitIterator", ++theId);
+  printCommons( &a, theId );
+  thePrinter.endBeginVisit( theId );
+}
+
+void PrinterVisitor::endVisit ( const StringSplitIterator& ) {
+  thePrinter.startEndVisit();
+  thePrinter.endEndVisit();
+}
+// </StringSplitIterator>
+
+
 // <XQDocIterator>
 void PrinterVisitor::beginVisit ( const XQDocIterator& a) {
   thePrinter.startBeginVisit("XQDocIterator", ++theId);

=== modified file 'src/runtime/visitors/pregenerated/printer_visitor.h'
--- src/runtime/visitors/pregenerated/printer_visitor.h	2011-12-21 14:40:33 +0000
+++ src/runtime/visitors/pregenerated/printer_visitor.h	2011-12-23 20:33:38 +0000
@@ -876,6 +876,9 @@
     void beginVisit( const StringIsStreamableIterator& );
     void endVisit  ( const StringIsStreamableIterator& );
 
+    void beginVisit( const StringSplitIterator& );
+    void endVisit  ( const StringSplitIterator& );
+
     void beginVisit( const XQDocIterator& );
     void endVisit  ( const XQDocIterator& );
 

=== added file 'test/rbkt/ExpQueryResults/zorba/string/tokenize01.xml.res'
--- test/rbkt/ExpQueryResults/zorba/string/tokenize01.xml.res	1970-01-01 00:00:00 +0000
+++ test/rbkt/ExpQueryResults/zorba/string/tokenize01.xml.res	2011-12-23 20:33:38 +0000
@@ -0,0 +1,1 @@
+a d a d

=== added file 'test/rbkt/ExpQueryResults/zorba/string/tokenize02.xml.res'
--- test/rbkt/ExpQueryResults/zorba/string/tokenize02.xml.res	1970-01-01 00:00:00 +0000
+++ test/rbkt/ExpQueryResults/zorba/string/tokenize02.xml.res	2011-12-23 20:33:38 +0000
@@ -0,0 +1,1 @@
+a a

=== added file 'test/rbkt/ExpQueryResults/zorba/string/tokenize03.xml.res'
--- test/rbkt/ExpQueryResults/zorba/string/tokenize03.xml.res	1970-01-01 00:00:00 +0000
+++ test/rbkt/ExpQueryResults/zorba/string/tokenize03.xml.res	2011-12-23 20:33:38 +0000
@@ -0,0 +1,1 @@
+ d  d

=== added file 'test/rbkt/ExpQueryResults/zorba/string/tokenize04.xml.res'
--- test/rbkt/ExpQueryResults/zorba/string/tokenize04.xml.res	1970-01-01 00:00:00 +0000
+++ test/rbkt/ExpQueryResults/zorba/string/tokenize04.xml.res	2011-12-23 20:33:38 +0000
@@ -0,0 +1,1 @@
+abcd abcd

=== added file 'test/rbkt/Queries/zorba/string/token01.txt'
--- test/rbkt/Queries/zorba/string/token01.txt	1970-01-01 00:00:00 +0000
+++ test/rbkt/Queries/zorba/string/token01.txt	2011-12-23 20:33:38 +0000
@@ -0,0 +1,1 @@
+abcd
\ No newline at end of file

=== added file 'test/rbkt/Queries/zorba/string/token02.txt'
--- test/rbkt/Queries/zorba/string/token02.txt	1970-01-01 00:00:00 +0000
+++ test/rbkt/Queries/zorba/string/token02.txt	2011-12-23 20:33:38 +0000
@@ -0,0 +1,1 @@
+abc
\ No newline at end of file

=== added file 'test/rbkt/Queries/zorba/string/token03.txt'
--- test/rbkt/Queries/zorba/string/token03.txt	1970-01-01 00:00:00 +0000
+++ test/rbkt/Queries/zorba/string/token03.txt	2011-12-23 20:33:38 +0000
@@ -0,0 +1,1 @@
+bcd
\ No newline at end of file

=== added file 'test/rbkt/Queries/zorba/string/token04.txt'
--- test/rbkt/Queries/zorba/string/token04.txt	1970-01-01 00:00:00 +0000
+++ test/rbkt/Queries/zorba/string/token04.txt	2011-12-23 20:33:38 +0000
@@ -0,0 +1,1 @@
+abcd
\ No newline at end of file

=== added file 'test/rbkt/Queries/zorba/string/tokenize01.xq'
--- test/rbkt/Queries/zorba/string/tokenize01.xq	1970-01-01 00:00:00 +0000
+++ test/rbkt/Queries/zorba/string/tokenize01.xq	2011-12-23 20:33:38 +0000
@@ -0,0 +1,5 @@
+import module namespace f = "http://expath.org/ns/file";
+import module namespace s = "http://www.zorba-xquery.com/modules/string";
+
+s:split(f:read-text(fn:resolve-uri("token01.txt")), "bc"),
+s:split(s:materialize(f:read-text(fn:resolve-uri("token01.txt"))), "bc")

=== added file 'test/rbkt/Queries/zorba/string/tokenize02.xq'
--- test/rbkt/Queries/zorba/string/tokenize02.xq	1970-01-01 00:00:00 +0000
+++ test/rbkt/Queries/zorba/string/tokenize02.xq	2011-12-23 20:33:38 +0000
@@ -0,0 +1,5 @@
+import module namespace f = "http://expath.org/ns/file";
+import module namespace s = "http://www.zorba-xquery.com/modules/string";
+
+s:split(f:read-text(fn:resolve-uri("token02.txt")), "bc"),
+s:split(s:materialize(f:read-text(fn:resolve-uri("token02.txt"))), "bc")

=== added file 'test/rbkt/Queries/zorba/string/tokenize03.xq'
--- test/rbkt/Queries/zorba/string/tokenize03.xq	1970-01-01 00:00:00 +0000
+++ test/rbkt/Queries/zorba/string/tokenize03.xq	2011-12-23 20:33:38 +0000
@@ -0,0 +1,5 @@
+import module namespace f = "http://expath.org/ns/file";
+import module namespace s = "http://www.zorba-xquery.com/modules/string";
+
+s:split(f:read-text(fn:resolve-uri("token03.txt")), "bc"),
+s:split(s:materialize(f:read-text(fn:resolve-uri("token03.txt"))), "bc")

=== added file 'test/rbkt/Queries/zorba/string/tokenize04.xq'
--- test/rbkt/Queries/zorba/string/tokenize04.xq	1970-01-01 00:00:00 +0000
+++ test/rbkt/Queries/zorba/string/tokenize04.xq	2011-12-23 20:33:38 +0000
@@ -0,0 +1,5 @@
+import module namespace f = "http://expath.org/ns/file";
+import module namespace s = "http://www.zorba-xquery.com/modules/string";
+
+s:split(f:read-text(fn:resolve-uri("token04.txt")), "f"),
+s:split(s:materialize(f:read-text(fn:resolve-uri("token04.txt"))), "f")

-- 
Mailing list: https://launchpad.net/~zorba-coders
Post to     : zorba-coders@lists.launchpad.net
Unsubscribe : https://launchpad.net/~zorba-coders
More help   : https://help.launchpad.net/ListHelp

[Zorba-coders] [Merge] lp:~zorba-coders/zorba/tokenize into lp:zorba

Reply via email to