Paul J. Lucas has proposed merging lp:~zorba-coders/zorba/feature-ft_module into lp:zorba.
Requested reviews:
  Matthias Brantner (matthias-brantner)
  Paul J. Lucas (paul-lucas)

Related bugs:
  Bug #944795 in Zorba: "XQDoc doesn't handle & in URLs"
  https://bugs.launchpad.net/zorba/+bug/944795

For more details, see:
https://code.launchpad.net/~zorba-coders/zorba/feature-ft_module/+merge/105913

Renamed Tokenizer::Numbers to Tokenizer::State now (just prior to the 2.5 release) so that it has a better name ahead of the forthcoming addition of the ability to tokenize using include/exclude Item lists. At that time, State will most likely be expanded to include additional state information beyond just numbers, hence the name change.

--
https://code.launchpad.net/~zorba-coders/zorba/feature-ft_module/+merge/105913
Your team Zorba Coders is subscribed to branch lp:zorba.
=== modified file 'include/zorba/tokenizer.h' --- include/zorba/tokenizer.h 2012-05-03 12:31:51 +0000 +++ include/zorba/tokenizer.h 2012-05-16 00:57:21 +0000 @@ -48,9 +48,10 @@ ///////////////////////////////////////////////////////////////////////////// /** - * A %Numbers contains the current token, sentence, and paragraph numbers. + * A %State contains inter-Tokenizer state, currently the current token, + * sentence, and paragraph numbers. */ - struct Numbers { + struct State { typedef Tokenizer::size_type value_type; value_type token; ///< Token number. @@ -60,7 +61,7 @@ /** * Default constructor. */ - Numbers(); + State(); }; ///////////////////////////////////////////////////////////////////////////// @@ -125,7 +126,7 @@ /** * If \c true, XML processing instructions separate tokens. For example, - * <code>net<?PI pi?>work</code> would be 2 tokens instead of 1. + * <code>net<?PI pi?>work</code> would be 2 tokens instead of 1. */ bool processing_instructions_separate_tokens; @@ -162,18 +163,18 @@ virtual void destroy() const = 0; /** - * Gets this %Tokenizer's associated Numbers. + * Gets this %Tokenizer's associated State. * - * @return Returns said Numbers. + * @return Returns said State. */ - Numbers& numbers(); + State& state(); /** - * Gets this %Tokenizer's associated Numbers. + * Gets this %Tokenizer's associated State. * - * @return Returns said Numbers. + * @return Returns said State. */ - Numbers const& numbers() const; + State const& state() const; /** * Tokenizes the given node. @@ -207,9 +208,9 @@ /** * Constructs a %Tokenizer. * - * @param numbers the Numbers to use. + * @param state the State to use. */ - Tokenizer( Numbers &numbers ); + Tokenizer( State &state ); /** * Destroys a %Tokenizer. 
@@ -255,18 +256,18 @@ Callback &callback, bool tokenize_acp ); private: - Numbers *numbers_; + State *state_; }; -inline Tokenizer::Tokenizer( Numbers &numbers ) : numbers_( &numbers ) { -} - -inline Tokenizer::Numbers& Tokenizer::numbers() { - return *numbers_; -} - -inline Tokenizer::Numbers const& Tokenizer::numbers() const { - return *numbers_; +inline Tokenizer::Tokenizer( State &state ) : state_( &state ) { +} + +inline Tokenizer::State& Tokenizer::state() { + return *state_; +} + +inline Tokenizer::State const& Tokenizer::state() const { + return *state_; } inline void Tokenizer::tokenize_node( Item const &item, @@ -288,13 +289,13 @@ * Creates a new %Tokenizer. * * @param lang The language of the text that the tokenizer will tokenize. - * @param numbers The Numbers to use. If \c null, \a t is not set. + * @param state The State to use. If \c null, \a t is not set. * @param t If not \c null, set to point to a Tokenizer for \a lang. * @return Returns \c true only if this provider can provide a tokenizer for * \a lang. 
*/ virtual bool getTokenizer( locale::iso639_1::type lang, - Tokenizer::Numbers *numbers = 0, + Tokenizer::State *state = 0, Tokenizer::ptr *t = 0 ) const = 0; }; === modified file 'src/runtime/full_text/apply.cpp' --- src/runtime/full_text/apply.cpp 2012-05-03 12:31:51 +0000 +++ src/runtime/full_text/apply.cpp 2012-05-16 00:57:21 +0000 @@ -1251,11 +1251,11 @@ FTTokenSeqIterator::FTTokens synonyms; thesaurus_callback cb( qt0.pos(), qt0.lang(), synonyms ); - Tokenizer::Numbers t_num; + Tokenizer::State t_state; TokenizerProvider const *const provider = GENV_STORE.getTokenizerProvider(); ZORBA_ASSERT( provider ); Tokenizer::ptr tokenizer; - if ( !provider->getTokenizer( qt0.lang(), &t_num, &tokenizer ) ) + if ( !provider->getTokenizer( qt0.lang(), &t_state, &tokenizer ) ) throw XQUERY_EXCEPTION( err::FTST0009, ERROR_PARAMS( === modified file 'src/runtime/full_text/ft_module_impl.cpp' --- src/runtime/full_text/ft_module_impl.cpp 2012-05-15 21:13:21 +0000 +++ src/runtime/full_text/ft_module_impl.cpp 2012-05-16 00:57:21 +0000 @@ -552,7 +552,7 @@ zstring base_uri; store::Item_t item; iso639_1::type lang; - Tokenizer::Numbers no; + Tokenizer::State t_state; store::NsBindings const ns_bindings; TokenizerProvider const *tokenizer_provider; store::Item_t type_name; @@ -574,7 +574,7 @@ tokenizer_provider = GENV_STORE.getTokenizerProvider(); ZORBA_ASSERT( tokenizer_provider ); state->doc_tokens_ = - state->doc_item_->getTokens( *tokenizer_provider, no, lang ); + state->doc_item_->getTokens( *tokenizer_provider, t_state, lang ); while ( state->doc_tokens_->hasNext() ) { FTToken const *token; @@ -667,7 +667,7 @@ store::Item_t element, item, junk, name; zstring base_uri; iso639_1::type lang; - Tokenizer::Numbers no; + Tokenizer::State t_state; store::NsBindings const ns_bindings; Tokenizer::ptr tokenizer; store::Item_t type_name; @@ -689,7 +689,7 @@ tokenizer_provider = GENV_STORE.getTokenizerProvider(); ZORBA_ASSERT( tokenizer_provider ); - if ( 
!tokenizer_provider->getTokenizer( lang, &no, &tokenizer ) ) + if ( !tokenizer_provider->getTokenizer( lang, &t_state, &tokenizer ) ) throw XQUERY_EXCEPTION( err::FTST0009 /* lang not supported */, ERROR_PARAMS( @@ -826,9 +826,9 @@ TokenizerProvider const *const tokenizer_provider = GENV_STORE.getTokenizerProvider(); ZORBA_ASSERT( tokenizer_provider ); - Tokenizer::Numbers no; + Tokenizer::State t_state; Tokenizer::ptr tokenizer; - if ( !tokenizer_provider->getTokenizer( lang, &no, &tokenizer ) ) + if ( !tokenizer_provider->getTokenizer( lang, &t_state, &tokenizer ) ) throw XQUERY_EXCEPTION( err::FTST0009 /* lang not supported */, ERROR_PARAMS( === modified file 'src/runtime/full_text/ftcontains_visitor.cpp' --- src/runtime/full_text/ftcontains_visitor.cpp 2012-05-03 12:31:51 +0000 +++ src/runtime/full_text/ftcontains_visitor.cpp 2012-05-16 00:57:21 +0000 @@ -426,9 +426,9 @@ // actual query. // while ( PlanIterator::consumeNext( item, plan_iter, plan_state_ ) ) { - Tokenizer::Numbers no; + Tokenizer::State t_state; query_item_t const qi( - item->getTokens( tokenizer_provider, no, lang, wildcards ) + item->getTokens( tokenizer_provider, t_state, lang, wildcards ) ); if ( qi->hasNext() ) query_items.push_back( qi ); === modified file 'src/runtime/full_text/full_text_impl.cpp' --- src/runtime/full_text/full_text_impl.cpp 2012-05-03 12:31:51 +0000 +++ src/runtime/full_text/full_text_impl.cpp 2012-05-16 00:57:21 +0000 @@ -84,9 +84,9 @@ tokenizer_provider = GENV_STORE.getTokenizerProvider(); while ( !ftcontains && consumeNext( doc_item, search_ctx, plan_state ) ) { - Tokenizer::Numbers no; + Tokenizer::State t_state; FTTokenIterator_t doc_tokens( - doc_item->getTokens( *tokenizer_provider, no, lang ) + doc_item->getTokens( *tokenizer_provider, t_state, lang ) ); store::Item_t ignore_item; if ( ftignore ) === modified file 'src/runtime/full_text/icu_tokenizer.cpp' --- src/runtime/full_text/icu_tokenizer.cpp 2012-05-03 12:31:51 +0000 +++ 
src/runtime/full_text/icu_tokenizer.cpp 2012-05-16 00:57:21 +0000 @@ -130,8 +130,8 @@ /////////////////////////////////////////////////////////////////////////////// -ICU_Tokenizer::ICU_Tokenizer( iso639_1::type lang, Numbers &no ) : - Tokenizer( no ), +ICU_Tokenizer::ICU_Tokenizer( iso639_1::type lang, State &state ) : + Tokenizer( state ), lang_( lang ) { Locale const &icu_locale = get_icu_locale_for( lang ); @@ -381,9 +381,9 @@ cout << " setting token" << endl; # endif t.set( - utf8_buf, utf8_len, numbers().token, numbers().sent, numbers().para + utf8_buf, utf8_len, state().token, state().sent, state().para ); - ++numbers().token; + ++state().token; } } @@ -408,7 +408,7 @@ // The addition of the "if" fixes: // https://bugs.launchpad.net/bugs/863320 if ( sent_end != BreakIterator::DONE ) - ++numbers().sent; + ++state().sent; } } // while @@ -419,7 +419,7 @@ t.send( item, callback ); // Incrementing "sent" here fixes: // https://bugs.launchpad.net/bugs/897800 - ++numbers().sent; + ++state().sent; #if DEBUG_TOKENIZER cout << "--------------------\n"; #endif /* DEBUG_TOKENIZER */ @@ -428,13 +428,13 @@ /////////////////////////////////////////////////////////////////////////////// bool ICU_TokenizerProvider::getTokenizer( iso639_1::type lang, - Tokenizer::Numbers *num, + Tokenizer::State *state, Tokenizer::ptr *t ) const { for ( int32_t n = ubrk_countAvailable(), i = 0; i < n; ++i ) { if ( char const *const icu_locale = ubrk_getAvailable( i ) ) if ( lang == find_lang( icu_locale ) ) { - if ( num && t ) - t->reset( new ICU_Tokenizer( lang, *num ) ); + if ( state && t ) + t->reset( new ICU_Tokenizer( lang, *state ) ); return true; } } === modified file 'src/runtime/full_text/icu_tokenizer.h' --- src/runtime/full_text/icu_tokenizer.h 2012-05-03 12:31:51 +0000 +++ src/runtime/full_text/icu_tokenizer.h 2012-05-16 00:57:21 +0000 @@ -40,9 +40,9 @@ * Constructs an %ICU_Tokenizer. * * @param lang The language of the text that the tokenizer will tokenize. 
- * @param no The Numbers to use. + * @param state The State to use. */ - ICU_Tokenizer( locale::iso639_1::type lang, Numbers &no ); + ICU_Tokenizer( locale::iso639_1::type lang, State &state ); ~ICU_Tokenizer(); @@ -67,7 +67,7 @@ ICU_TokenizerProvider() { } // needed to work-around compiler bug // inherited - bool getTokenizer( locale::iso639_1::type, Tokenizer::Numbers* = 0, + bool getTokenizer( locale::iso639_1::type, Tokenizer::State* = 0, Tokenizer::ptr* = 0 ) const; }; === modified file 'src/runtime/full_text/latin_tokenizer.cpp' --- src/runtime/full_text/latin_tokenizer.cpp 2012-05-03 12:31:51 +0000 +++ src/runtime/full_text/latin_tokenizer.cpp 2012-05-16 00:57:21 +0000 @@ -242,12 +242,12 @@ /////////////////////////////////////////////////////////////////////////////// bool LatinTokenizerProvider::getTokenizer( iso639_1::type lang, - Tokenizer::Numbers *num, + Tokenizer::State *state, Tokenizer::ptr *t ) const { switch ( lang ) { case iso639_1::en: - if ( num && t ) - t->reset( new LatinTokenizer( *num ) ); + if ( state && t ) + t->reset( new LatinTokenizer( *state ) ); return true; default: return false; === modified file 'src/runtime/full_text/latin_tokenizer.h' --- src/runtime/full_text/latin_tokenizer.h 2012-05-03 12:31:51 +0000 +++ src/runtime/full_text/latin_tokenizer.h 2012-05-16 00:57:21 +0000 @@ -34,7 +34,7 @@ */ class LatinTokenizer : public Tokenizer { public: - LatinTokenizer( Numbers &num ) : Tokenizer( num ) { } + LatinTokenizer( State &state ) : Tokenizer( state ) { } // inherited void destroy() const; @@ -66,7 +66,7 @@ class LatinTokenizerProvider : public TokenizerProvider { public: // inherited - bool getTokenizer( locale::iso639_1::type, Tokenizer::Numbers* = 0, + bool getTokenizer( locale::iso639_1::type, Tokenizer::State* = 0, Tokenizer::ptr* = 0 ) const; }; === modified file 'src/runtime/full_text/tokenizer.cpp' --- src/runtime/full_text/tokenizer.cpp 2012-05-03 12:31:51 +0000 +++ src/runtime/full_text/tokenizer.cpp 2012-05-16 00:57:21 
+0000 @@ -59,7 +59,7 @@ void Tokenizer::item( Item const &item, bool entering ) { if ( entering && item.isNode() && item.getNodeKind() == store::StoreConsts::elementNode ) { - ++numbers().para; + ++state().para; } } @@ -78,7 +78,7 @@ if ( find_lang_attribute( item, &lang ) ) { TokenizerProvider const *const p = GENV_STORE.getTokenizerProvider(); ZORBA_ASSERT( p ); - if ( !p->getTokenizer( lang, numbers_, &t_ptr ) ) + if ( !p->getTokenizer( lang, state_, &t_ptr ) ) break; t_raw = t_ptr.get(); } @@ -109,7 +109,7 @@ } } -Tokenizer::Numbers::Numbers() { +Tokenizer::State::State() { token = para = 0; sent = 1; } === modified file 'src/store/api/item.h' --- src/store/api/item.h 2012-05-03 12:31:51 +0000 +++ src/store/api/item.h 2012-05-16 00:57:21 +0000 @@ -838,13 +838,13 @@ * Gets the tokens for this item. * * @param provider The TokenizerProvider to use. - * @param numbers The Tokenizer::Numbers to use. + * @param state The Tokenizer::State to use. * @param lang The language to use for tokenization. * @param wildcards If \c true, allow XQuery wildcard syntax. * @return Returns an iterator over the tokens. 
*/ virtual FTTokenIterator_t - getTokens(TokenizerProvider const &provider, Tokenizer::Numbers &numbers, + getTokens(TokenizerProvider const &provider, Tokenizer::State &state, locale::iso639_1::type lang, bool wildcards = false) const; #endif /* ZORBA_NO_FULL_TEXT */ === modified file 'src/store/naive/atomic_items.cpp' --- src/store/naive/atomic_items.cpp 2012-05-15 21:12:27 +0000 +++ src/store/naive/atomic_items.cpp 2012-05-16 00:57:21 +0000 @@ -1651,7 +1651,7 @@ #ifndef ZORBA_NO_FULL_TEXT FTTokenIterator_t StringItem::getTokens( TokenizerProvider const &provider, - Tokenizer::Numbers &numbers, + Tokenizer::State &state, iso639_1::type lang, bool wildcards ) const { @@ -1660,7 +1660,7 @@ AtomicItemTokenizerCallback callback( *tokens ); Tokenizer::ptr tokenizer; - if ( provider.getTokenizer( lang, &numbers, &tokenizer ) ) + if ( provider.getTokenizer( lang, &state, &tokenizer ) ) tokenizer->tokenize_string( theValue.data(), theValue.size(), lang, wildcards, callback ); === modified file 'src/store/naive/atomic_items.h' --- src/store/naive/atomic_items.h 2012-05-08 01:09:52 +0000 +++ src/store/naive/atomic_items.h 2012-05-16 00:57:21 +0000 @@ -852,7 +852,7 @@ #ifndef ZORBA_NO_FULL_TEXT FTTokenIterator_t getTokens( TokenizerProvider const&, - Tokenizer::Numbers&, + Tokenizer::State&, locale::iso639_1::type, bool = false ) const; #endif /* ZORBA_NO_FULL_TEXT */ === modified file 'src/store/naive/item.cpp' --- src/store/naive/item.cpp 2012-05-03 12:31:51 +0000 +++ src/store/naive/item.cpp 2012-05-16 00:57:21 +0000 @@ -354,7 +354,7 @@ #ifndef ZORBA_NO_FULL_TEXT FTTokenIterator_t -Item::getTokens( TokenizerProvider const&, Tokenizer::Numbers&, +Item::getTokens( TokenizerProvider const&, Tokenizer::State&, locale::iso639_1::type, bool ) const { throw ZORBA_EXCEPTION( === modified file 'src/store/naive/node_items.cpp' --- src/store/naive/node_items.cpp 2012-05-08 23:31:37 +0000 +++ src/store/naive/node_items.cpp 2012-05-16 00:57:21 +0000 @@ -4822,7 +4822,7 @@ 
FTTokenIterator_t AttributeNode::getTokens( TokenizerProvider const &provider, - Tokenizer::Numbers &numbers, iso639_1::type lang, + Tokenizer::State &state, iso639_1::type lang, bool ) const { FTTokenStore &token_store = getTree()->getTokenStore(); @@ -4838,7 +4838,7 @@ zorba::Item const api_attr( this ); Tokenizer::ptr tokenizer; - if ( provider.getTokenizer( lang, &numbers, &tokenizer ) ) { + if ( provider.getTokenizer( lang, &state, &tokenizer ) ) { tokenizer->tokenize_node( api_attr, lang, callback ); token_store.putAttr( this, att_tokens ); } @@ -4907,7 +4907,7 @@ FTTokenIterator_t XmlNode::getTokens( TokenizerProvider const &provider, - Tokenizer::Numbers &numbers, iso639_1::type lang, + Tokenizer::State &state, iso639_1::type lang, bool ) const { FTTokenStore &token_store = getTree()->getTokenStore(); @@ -4918,7 +4918,7 @@ zorba::Item const api_root( getRoot() ); XmlNodeTokenizerCallback callback( token_store ); Tokenizer::ptr tokenizer; - if ( provider.getTokenizer( lang, &numbers, &tokenizer ) ) + if ( provider.getTokenizer( lang, &state, &tokenizer ) ) tokenizer->tokenize_node( api_root, lang, callback ); } === modified file 'src/store/naive/node_items.h' --- src/store/naive/node_items.h 2012-05-03 12:31:51 +0000 +++ src/store/naive/node_items.h 2012-05-16 00:57:21 +0000 @@ -555,7 +555,7 @@ #ifndef ZORBA_NO_FULL_TEXT FTTokenIterator_t getTokens( TokenizerProvider const&, - Tokenizer::Numbers&, + Tokenizer::State&, locale::iso639_1::type, bool = false ) const; #endif /* ZORBA_NO_FULL_TEXT */ @@ -1233,7 +1233,7 @@ isPrecedingSibling(const store::Item_t&) const { return false; } #ifndef ZORBA_NO_FULL_TEXT - FTTokenIterator_t getTokens( TokenizerProvider const&, Tokenizer::Numbers&, + FTTokenIterator_t getTokens( TokenizerProvider const&, Tokenizer::State&, locale::iso639_1::type, bool wildcards = false ) const; #endif /* ZORBA_NO_FULL_TEXT */ === modified file 'src/unit_tests/tokenizer.cpp' --- src/unit_tests/tokenizer.cpp 2012-05-03 12:31:51 +0000 +++ 
src/unit_tests/tokenizer.cpp 2012-05-16 00:57:21 +0000 @@ -60,7 +60,7 @@ class TestTokenizer : public Tokenizer { public: - TestTokenizer( Numbers &num ) : Tokenizer( num ) { } + TestTokenizer( State &state ) : Tokenizer( state ) { } ~TestTokenizer(); // inherited @@ -125,7 +125,7 @@ item.getNodeName( qname ); if ( ::binary_search( block_elements, end, qname.getLocalName().c_str(), less<char const*>() ) ) { - ++numbers().para; + ++state().para; } } } @@ -291,7 +291,7 @@ // no break; case '!': case '?': - ++numbers().sent; + ++state().sent; } } // for @@ -324,19 +324,19 @@ Callback &callback, Item const *item ) { if ( !token.empty() ) { #if PRINT_TOKENS - cout << "t=" << setw(2) << numbers().token - << ", s=" << setw(2) << numbers().sent - << ", p=" << setw(2) << numbers().para + cout << "t=" << setw(2) << state().token + << ", s=" << setw(2) << state().sent + << ", p=" << setw(2) << state().para << ": \"" << token << "\"\n"; #endif /* PRINT_TOKENS */ - check_token( token.c_str(), numbers().token ); + check_token( token.c_str(), state().token ); callback.token( token.data(), token.size(), lang, - numbers().token, numbers().sent, numbers().para, item + state().token, state().sent, state().para, item ); - ++numbers().token; + ++state().token; return true; } return false; @@ -347,15 +347,15 @@ class TestTokenizerProvider : public TokenizerProvider { public: // inherited - bool getTokenizer( iso639_1::type, Tokenizer::Numbers* = 0, + bool getTokenizer( iso639_1::type, Tokenizer::State* = 0, Tokenizer::ptr* = 0 ) const; }; bool TestTokenizerProvider::getTokenizer( iso639_1::type lang, - Tokenizer::Numbers *num, + Tokenizer::State *state, Tokenizer::ptr *t ) const { - if ( num && t ) - t->reset( new TestTokenizer( *num ) ); + if ( state && t ) + t->reset( new TestTokenizer( *state ) ); return true; }
--
Mailing list: https://launchpad.net/~zorba-coders
Post to     : zorba-coders@lists.launchpad.net
Unsubscribe : https://launchpad.net/~zorba-coders
More help   : https://help.launchpad.net/ListHelp