Matthias Brantner has proposed merging lp:~zorba-coders/zorba/tokenize into lp:zorba.

Requested reviews:
  Matthias Brantner (matthias-brantner)
  Paul J. Lucas (paul-lucas)
  William Candillon (wcandillon)
Related bugs:
  Bug #898074 in Zorba: "fn:tokenize() doesn't stream"
  https://bugs.launchpad.net/zorba/+bug/898074

For more details, see:
https://code.launchpad.net/~zorba-coders/zorba/tokenize/+merge/86829

Implementation of a string:split function that does not accept a regular expression as the separator but, in exchange, can process streamable input in a streaming fashion (resolves bug #898074).
-- 
https://code.launchpad.net/~zorba-coders/zorba/tokenize/+merge/86829
Your team Zorba Coders is subscribed to branch lp:zorba.
=== modified file 'ChangeLog' --- ChangeLog 2011-12-23 19:38:53 +0000 +++ ChangeLog 2011-12-23 20:33:38 +0000 @@ -12,6 +12,8 @@ set multiple times via the c++ api). * Fixed bug #905050 (setting and getting the context item type via the c++ api) * Added createDayTimeDuration, createYearMonthDuration, createDocumentNode, createCommentNode, createPiNode to api's ItemFactory. + * Added split function to the string module that allows for streamable tokenization but doesn't have regular expression + support. * zerr is not predeclared anymore to be http://www.zorba-xquery.com/errors version 2.1 === modified file 'modules/com/zorba-xquery/www/modules/CMakeLists.txt' --- modules/com/zorba-xquery/www/modules/CMakeLists.txt 2011-12-21 14:40:33 +0000 +++ modules/com/zorba-xquery/www/modules/CMakeLists.txt 2011-12-23 20:33:38 +0000 @@ -58,7 +58,7 @@ URI "http://www.zorba-xquery.com/modules/reflection") DECLARE_ZORBA_MODULE(FILE schema.xq VERSION 2.0 URI "http://www.zorba-xquery.com/modules/schema") -DECLARE_ZORBA_MODULE(FILE string.xq VERSION 2.0 +DECLARE_ZORBA_MODULE(FILE string.xq VERSION 2.1 URI "http://www.zorba-xquery.com/modules/string") DECLARE_ZORBA_MODULE(FILE xml.xq VERSION 2.0 URI "http://www.zorba-xquery.com/modules/xml") === modified file 'modules/com/zorba-xquery/www/modules/string.xq' --- modules/com/zorba-xquery/www/modules/string.xq 2011-08-03 15:12:40 +0000 +++ modules/com/zorba-xquery/www/modules/string.xq 2011-12-23 20:33:38 +0000 @@ -25,7 +25,7 @@ :) module namespace string = "http://www.zorba-xquery.com/modules/string"; declare namespace ver = "http://www.zorba-xquery.com/options/versioning"; -declare option ver:module-version "2.0"; +declare option ver:module-version "2.1"; (:~ : This function materializes a streamable string. @@ -63,3 +63,23 @@ : :) declare function string:is-streamable($s as xs:string) as xs:boolean external; + +(:~ + : Returns a sequence of strings constructed by splitting the input wherever the given + : separator is found. 
+ : + : The function is different from fn:tokenize. It doesn't allow + : the separator to be a regular expression. This restriction allows for more + : performant implementation. Specifically, the function processes + : streamable strings as input in a streamable way which is particularly useful + : to tokenize huge strings (e.g. if returned by the file module's read-text + : function). + : + : @param $s the input string to split + : @param $separator the separator used for splitting the input string $s + : + : @return a sequence of strings constructed by splitting the input + :) +declare function string:split( + $s as xs:string, + $separator as xs:string) as xs:string* external; === modified file 'src/functions/pregenerated/func_strings.cpp' --- src/functions/pregenerated/func_strings.cpp 2011-12-21 14:40:33 +0000 +++ src/functions/pregenerated/func_strings.cpp 2011-12-23 20:33:38 +0000 @@ -320,6 +320,16 @@ return new StringIsStreamableIterator(sctx, loc, argv); } +PlanIter_t fn_zorba_string_split::codegen( + CompilerCB*, + static_context* sctx, + const QueryLoc& loc, + std::vector<PlanIter_t>& argv, + AnnotationHolder& ann) const +{ + return new StringSplitIterator(sctx, loc, argv); +} + void populate_context_strings(static_context* sctx) { { @@ -890,6 +900,19 @@ } + + { + + + DECL_WITH_KIND(sctx, fn_zorba_string_split, + (createQName("http://www.zorba-xquery.com/modules/string","","split"), + GENV_TYPESYSTEM.STRING_TYPE_ONE, + GENV_TYPESYSTEM.STRING_TYPE_ONE, + GENV_TYPESYSTEM.STRING_TYPE_STAR), + FunctionConsts::FN_ZORBA_STRING_SPLIT_2); + + } + } === modified file 'src/functions/pregenerated/func_strings.h' --- src/functions/pregenerated/func_strings.h 2011-12-22 14:14:53 +0000 +++ src/functions/pregenerated/func_strings.h 2011-12-23 20:33:38 +0000 @@ -481,6 +481,19 @@ }; +//fn-zorba-string:split +class fn_zorba_string_split : public function +{ +public: + fn_zorba_string_split(const signature& sig, FunctionConsts::FunctionKind kind) + : function(sig, kind) { 
+ +} + + CODEGEN_DECL(); +}; + + } //namespace zorba === modified file 'src/functions/pregenerated/function_enum.h' --- src/functions/pregenerated/function_enum.h 2011-12-21 14:40:33 +0000 +++ src/functions/pregenerated/function_enum.h 2011-12-23 20:33:38 +0000 @@ -371,6 +371,7 @@ FN_ANALYZE_STRING_3, FN_ZORBA_STRING_MATERIALIZE_1, FN_ZORBA_STRING_IS_STREAMABLE_1, + FN_ZORBA_STRING_SPLIT_2, FN_ZORBA_XQDOC_XQDOC_1, FN_ZORBA_XQDOC_XQDOC_CONTENT_1, === modified file 'src/runtime/spec/strings/strings.xml' --- src/runtime/spec/strings/strings.xml 2011-12-21 14:40:33 +0000 +++ src/runtime/spec/strings/strings.xml 2011-12-23 20:33:38 +0000 @@ -729,4 +729,35 @@ </zorba:iterator> +<!-- +/******************************************************************************* + * string:tokenize +********************************************************************************/ +--> +<zorba:iterator name="StringSplitIterator"> + + <zorba:description author="Matthias Brantner"> + string:split + </zorba:description> + + <zorba:function> + <zorba:signature localname="split" prefix="fn-zorba-string"> + <zorba:param>xs:string</zorba:param> + <zorba:param>xs:string</zorba:param> + <zorba:output>xs:string*</zorba:output> + </zorba:signature> + </zorba:function> + + <zorba:state> + <zorba:member type="zstring" name="theSeparator" + brief="separator for the tokenization"/> + <zorba:member type="std::istream*" name="theIStream" + brief="the remaining string (if the input is streamable)"/> + <zorba:member type="zstring" name="theInput" + brief="the string to tokenize (if the input is not streamable)"/> + <zorba:member type="size_t" name="theNextStartPos" defaultValue="0"/> + </zorba:state> + +</zorba:iterator> + </zorba:iterators> === modified file 'src/runtime/strings/pregenerated/strings.cpp' --- src/runtime/strings/pregenerated/strings.cpp 2011-12-21 14:40:33 +0000 +++ src/runtime/strings/pregenerated/strings.cpp 2011-12-23 20:33:38 +0000 @@ -830,6 +830,48 @@ // </StringIsStreamableIterator> 
+// <StringSplitIterator> +const char* StringSplitIterator::class_name_str = "StringSplitIterator"; +StringSplitIterator::class_factory<StringSplitIterator> +StringSplitIterator::g_class_factory; + +const serialization::ClassVersion +StringSplitIterator::class_versions[] ={{ 1, 0x000905, false}}; + +const int StringSplitIterator::class_versions_count = +sizeof(StringSplitIterator::class_versions)/sizeof(struct serialization::ClassVersion); + +void StringSplitIterator::accept(PlanIterVisitor& v) const { + v.beginVisit(*this); + + std::vector<PlanIter_t>::const_iterator lIter = theChildren.begin(); + std::vector<PlanIter_t>::const_iterator lEnd = theChildren.end(); + for ( ; lIter != lEnd; ++lIter ){ + (*lIter)->accept(v); + } + + v.endVisit(*this); +} + +StringSplitIterator::~StringSplitIterator() {} + +StringSplitIteratorState::StringSplitIteratorState() {} + +StringSplitIteratorState::~StringSplitIteratorState() {} + + +void StringSplitIteratorState::init(PlanState& planState) { + PlanIteratorState::init(planState); + theNextStartPos = 0; +} + +void StringSplitIteratorState::reset(PlanState& planState) { + PlanIteratorState::reset(planState); + theNextStartPos = 0; +} +// </StringSplitIterator> + + } === modified file 'src/runtime/strings/pregenerated/strings.h' --- src/runtime/strings/pregenerated/strings.h 2011-12-21 14:40:33 +0000 +++ src/runtime/strings/pregenerated/strings.h 2011-12-23 20:33:38 +0000 @@ -1075,6 +1075,58 @@ }; +/** + * + * string:split + * + * Author: Matthias Brantner + */ +class StringSplitIteratorState : public PlanIteratorState +{ +public: + zstring theSeparator; //separator for the tokenization + std::istream* theIStream; //the remaining string (if the input is streamable) + zstring theInput; //the string to tokenize (if the input is not streamable) + size_t theNextStartPos; // + + StringSplitIteratorState(); + + ~StringSplitIteratorState(); + + void init(PlanState&); + void reset(PlanState&); +}; + +class StringSplitIterator : public 
NaryBaseIterator<StringSplitIterator, StringSplitIteratorState> +{ +public: + SERIALIZABLE_CLASS(StringSplitIterator); + + SERIALIZABLE_CLASS_CONSTRUCTOR2T(StringSplitIterator, + NaryBaseIterator<StringSplitIterator, StringSplitIteratorState>); + + void serialize( ::zorba::serialization::Archiver& ar) + { + serialize_baseclass(ar, + (NaryBaseIterator<StringSplitIterator, StringSplitIteratorState>*)this); + } + + StringSplitIterator( + static_context* sctx, + const QueryLoc& loc, + std::vector<PlanIter_t>& children) + : + NaryBaseIterator<StringSplitIterator, StringSplitIteratorState>(sctx, loc, children) + {} + + virtual ~StringSplitIterator(); + + void accept(PlanIterVisitor& v) const; + + bool nextImpl(store::Item_t& result, PlanState& aPlanState) const; +}; + + } #endif /* === modified file 'src/runtime/strings/strings_impl.cpp' --- src/runtime/strings/strings_impl.cpp 2011-12-23 06:41:43 +0000 +++ src/runtime/strings/strings_impl.cpp 2011-12-23 20:33:38 +0000 @@ -140,6 +140,7 @@ p = ec; if ( utf8::read( *state->theStream, ec ) == utf8::npos ) + { if ( state->theStream->good() ) { // // If read() failed but the stream state is good, it means that an @@ -165,6 +166,7 @@ zerr::ZOSE0003_STREAM_READ_FAILURE, ERROR_LOC( loc ) ); } + } state->theResult.clear(); state->theResult.push_back( utf8::next_char( p ) ); @@ -2284,5 +2286,133 @@ STACK_END(state); } +/** + *______________________________________________________________________ + * + * http://www.zorba-xquery.com/modules/string + * string:split + */ +bool StringSplitIterator::nextImpl( + store::Item_t& result, + PlanState& planState) const +{ + store::Item_t item; + size_t lNewPos = 0; + zstring lToken; + zstring lPartialMatch; + + StringSplitIteratorState* state; + DEFAULT_STACK_INIT(StringSplitIteratorState, state, planState); + + // init phase, get input string and tokens + consumeNext(item, theChildren[0].getp(), planState); + + if (item->isStreamable()) + { + state->theIStream = &item->getStream(); + } + 
else + { + state->theIStream = 0; + item->getStringValue2(state->theInput); + } + + consumeNext(item, theChildren[1].getp(), planState); + + item->getStringValue2(state->theSeparator); + + // working phase, do the tokenization + if (state->theIStream) + { + while ( !state->theIStream->eof() ) + { + utf8::encoded_char_type ec; + memset( ec, '\0' , sizeof(ec) ); + utf8::storage_type *p; + p = ec; + + if ( utf8::read( *state->theIStream, ec ) != utf8::npos ) + { + if (state->theSeparator.compare(lNewPos, 1, ec) == 0) + { + if (++lNewPos == state->theSeparator.length()) + { + GENV_ITEMFACTORY->createString(result, lToken); + STACK_PUSH(true, state); + } + else + { + lPartialMatch.append(ec); + } + } + else + { + lToken.append(lPartialMatch); + lToken.append(ec); + } + } + else + { + if (state->theIStream->good()) + { + char buf[ 6 /* bytes at most */ * 5 /* chars per byte */ ], *b = buf; + bool first = true; + for ( ; *p; ++p ) { + if ( first ) + first = false; + else + *b++ = ','; + ::strcpy( b, "0x" ); b += 2; + ::sprintf( b, "%0hhX", *p ); b += 2; + } + throw XQUERY_EXCEPTION( + zerr::ZXQD0006_INVALID_UTF8_BYTE_SEQUENCE, + ERROR_PARAMS( buf ), + ERROR_LOC( loc ) + ); + } + if (!lToken.empty()) + { + GENV_ITEMFACTORY->createString(result, lToken); + STACK_PUSH(true, state); + } + break; + } + } + } + else + { + while (true) + { + if (state->theNextStartPos == zstring::npos) + { + break; + } + + lNewPos = + state->theInput.find(state->theSeparator, state->theNextStartPos); + if (lNewPos != zstring::npos) + { + zstring lSubStr = state->theInput.substr( + state->theNextStartPos, + lNewPos - state->theNextStartPos); + GENV_ITEMFACTORY->createString(result, lSubStr); + state->theNextStartPos = + lNewPos==state->theInput.length() - state->theSeparator.length() + ? 
zstring::npos + : lNewPos + state->theSeparator.length(); + } + else + { + zstring lSubStr = state->theInput.substr(state->theNextStartPos); + GENV_ITEMFACTORY->createString(result, lSubStr); + state->theNextStartPos = zstring::npos; + } + STACK_PUSH(true, state); + } + } + + STACK_END(state); +} } // namespace zorba /* vim:set et sw=2 ts=2: */ === modified file 'src/runtime/visitors/pregenerated/planiter_visitor.h' --- src/runtime/visitors/pregenerated/planiter_visitor.h 2011-12-21 14:40:33 +0000 +++ src/runtime/visitors/pregenerated/planiter_visitor.h 2011-12-23 20:33:38 +0000 @@ -582,6 +582,8 @@ class StringIsStreamableIterator; + class StringSplitIterator; + class XQDocIterator; class XQDocContentIterator; @@ -1423,6 +1425,9 @@ virtual void beginVisit ( const StringIsStreamableIterator& ) = 0; virtual void endVisit ( const StringIsStreamableIterator& ) = 0; + virtual void beginVisit ( const StringSplitIterator& ) = 0; + virtual void endVisit ( const StringSplitIterator& ) = 0; + virtual void beginVisit ( const XQDocIterator& ) = 0; virtual void endVisit ( const XQDocIterator& ) = 0; === modified file 'src/runtime/visitors/pregenerated/printer_visitor.cpp' --- src/runtime/visitors/pregenerated/printer_visitor.cpp 2011-12-21 14:40:33 +0000 +++ src/runtime/visitors/pregenerated/printer_visitor.cpp 2011-12-23 20:33:38 +0000 @@ -3961,6 +3961,20 @@ // </StringIsStreamableIterator> +// <StringSplitIterator> +void PrinterVisitor::beginVisit ( const StringSplitIterator& a) { + thePrinter.startBeginVisit("StringSplitIterator", ++theId); + printCommons( &a, theId ); + thePrinter.endBeginVisit( theId ); +} + +void PrinterVisitor::endVisit ( const StringSplitIterator& ) { + thePrinter.startEndVisit(); + thePrinter.endEndVisit(); +} +// </StringSplitIterator> + + // <XQDocIterator> void PrinterVisitor::beginVisit ( const XQDocIterator& a) { thePrinter.startBeginVisit("XQDocIterator", ++theId); === modified file 'src/runtime/visitors/pregenerated/printer_visitor.h' --- 
src/runtime/visitors/pregenerated/printer_visitor.h 2011-12-21 14:40:33 +0000 +++ src/runtime/visitors/pregenerated/printer_visitor.h 2011-12-23 20:33:38 +0000 @@ -876,6 +876,9 @@ void beginVisit( const StringIsStreamableIterator& ); void endVisit ( const StringIsStreamableIterator& ); + void beginVisit( const StringSplitIterator& ); + void endVisit ( const StringSplitIterator& ); + void beginVisit( const XQDocIterator& ); void endVisit ( const XQDocIterator& ); === added file 'test/rbkt/ExpQueryResults/zorba/string/tokenize01.xml.res' --- test/rbkt/ExpQueryResults/zorba/string/tokenize01.xml.res 1970-01-01 00:00:00 +0000 +++ test/rbkt/ExpQueryResults/zorba/string/tokenize01.xml.res 2011-12-23 20:33:38 +0000 @@ -0,0 +1,1 @@ +a d a d === added file 'test/rbkt/ExpQueryResults/zorba/string/tokenize02.xml.res' --- test/rbkt/ExpQueryResults/zorba/string/tokenize02.xml.res 1970-01-01 00:00:00 +0000 +++ test/rbkt/ExpQueryResults/zorba/string/tokenize02.xml.res 2011-12-23 20:33:38 +0000 @@ -0,0 +1,1 @@ +a a === added file 'test/rbkt/ExpQueryResults/zorba/string/tokenize03.xml.res' --- test/rbkt/ExpQueryResults/zorba/string/tokenize03.xml.res 1970-01-01 00:00:00 +0000 +++ test/rbkt/ExpQueryResults/zorba/string/tokenize03.xml.res 2011-12-23 20:33:38 +0000 @@ -0,0 +1,1 @@ + d d === added file 'test/rbkt/ExpQueryResults/zorba/string/tokenize04.xml.res' --- test/rbkt/ExpQueryResults/zorba/string/tokenize04.xml.res 1970-01-01 00:00:00 +0000 +++ test/rbkt/ExpQueryResults/zorba/string/tokenize04.xml.res 2011-12-23 20:33:38 +0000 @@ -0,0 +1,1 @@ +abcd abcd === added file 'test/rbkt/Queries/zorba/string/token01.txt' --- test/rbkt/Queries/zorba/string/token01.txt 1970-01-01 00:00:00 +0000 +++ test/rbkt/Queries/zorba/string/token01.txt 2011-12-23 20:33:38 +0000 @@ -0,0 +1,1 @@ +abcd \ No newline at end of file === added file 'test/rbkt/Queries/zorba/string/token02.txt' --- test/rbkt/Queries/zorba/string/token02.txt 1970-01-01 00:00:00 +0000 +++ 
test/rbkt/Queries/zorba/string/token02.txt 2011-12-23 20:33:38 +0000 @@ -0,0 +1,1 @@ +abc \ No newline at end of file === added file 'test/rbkt/Queries/zorba/string/token03.txt' --- test/rbkt/Queries/zorba/string/token03.txt 1970-01-01 00:00:00 +0000 +++ test/rbkt/Queries/zorba/string/token03.txt 2011-12-23 20:33:38 +0000 @@ -0,0 +1,1 @@ +bcd \ No newline at end of file === added file 'test/rbkt/Queries/zorba/string/token04.txt' --- test/rbkt/Queries/zorba/string/token04.txt 1970-01-01 00:00:00 +0000 +++ test/rbkt/Queries/zorba/string/token04.txt 2011-12-23 20:33:38 +0000 @@ -0,0 +1,1 @@ +abcd \ No newline at end of file === added file 'test/rbkt/Queries/zorba/string/tokenize01.xq' --- test/rbkt/Queries/zorba/string/tokenize01.xq 1970-01-01 00:00:00 +0000 +++ test/rbkt/Queries/zorba/string/tokenize01.xq 2011-12-23 20:33:38 +0000 @@ -0,0 +1,5 @@ +import module namespace f = "http://expath.org/ns/file"; +import module namespace s = "http://www.zorba-xquery.com/modules/string"; + +s:split(f:read-text(fn:resolve-uri("token01.txt")), "bc"), +s:split(s:materialize(f:read-text(fn:resolve-uri("token01.txt"))), "bc") === added file 'test/rbkt/Queries/zorba/string/tokenize02.xq' --- test/rbkt/Queries/zorba/string/tokenize02.xq 1970-01-01 00:00:00 +0000 +++ test/rbkt/Queries/zorba/string/tokenize02.xq 2011-12-23 20:33:38 +0000 @@ -0,0 +1,5 @@ +import module namespace f = "http://expath.org/ns/file"; +import module namespace s = "http://www.zorba-xquery.com/modules/string"; + +s:split(f:read-text(fn:resolve-uri("token02.txt")), "bc"), +s:split(s:materialize(f:read-text(fn:resolve-uri("token02.txt"))), "bc") === added file 'test/rbkt/Queries/zorba/string/tokenize03.xq' --- test/rbkt/Queries/zorba/string/tokenize03.xq 1970-01-01 00:00:00 +0000 +++ test/rbkt/Queries/zorba/string/tokenize03.xq 2011-12-23 20:33:38 +0000 @@ -0,0 +1,5 @@ +import module namespace f = "http://expath.org/ns/file"; +import module namespace s = "http://www.zorba-xquery.com/modules/string"; + 
+s:split(f:read-text(fn:resolve-uri("token03.txt")), "bc"), +s:split(s:materialize(f:read-text(fn:resolve-uri("token03.txt"))), "bc") === added file 'test/rbkt/Queries/zorba/string/tokenize04.xq' --- test/rbkt/Queries/zorba/string/tokenize04.xq 1970-01-01 00:00:00 +0000 +++ test/rbkt/Queries/zorba/string/tokenize04.xq 2011-12-23 20:33:38 +0000 @@ -0,0 +1,5 @@ +import module namespace f = "http://expath.org/ns/file"; +import module namespace s = "http://www.zorba-xquery.com/modules/string"; + +s:split(f:read-text(fn:resolve-uri("token04.txt")), "f"), +s:split(s:materialize(f:read-text(fn:resolve-uri("token04.txt"))), "f")
-- 
Mailing list: https://launchpad.net/~zorba-coders
Post to     : zorba-coders@lists.launchpad.net
Unsubscribe : https://launchpad.net/~zorba-coders
More help   : https://help.launchpad.net/ListHelp