[Merge] lp:~paul-lucas/zorba/bug-897800 into lp:zorba

Paul J. Lucas Tue, 29 Nov 2011 18:20:08 -0800

Paul J. Lucas has proposed merging lp:~paul-lucas/zorba/bug-897800 into 
lp:zorba.


Requested reviews:
  Paul J. Lucas (paul-lucas)
  Matthias Brantner (matthias-brantner)
Related bugs:
  Bug #897800 in Zorba: "Full-text regressions"
  https://bugs.launchpad.net/zorba/+bug/897800

For more details, see:
https://code.launchpad.net/~paul-lucas/zorba/bug-897800/+merge/83882

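Fixed bug #897800 ("Full-text regressions"): the ICU tokenizer now increments
its sentence counter after it finishes tokenizing a string. This branch also
renames the break-iterator members word_/sent_ to word_it_/sent_it_, adds
more DEBUG_TOKENIZER output, and adds the ft-same-sentence-false-2 test.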
-- 
https://code.launchpad.net/~paul-lucas/zorba/bug-897800/+merge/83882
Your team Zorba Coders is subscribed to branch lp:zorba.

=== modified file 'src/runtime/full_text/icu_tokenizer.cpp'
--- src/runtime/full_text/icu_tokenizer.cpp	2011-11-20 19:02:30 +0000
+++ src/runtime/full_text/icu_tokenizer.cpp	2011-11-30 01:02:26 +0000
@@ -69,7 +69,7 @@
   void send( void *payload, Tokenizer::Callback &callback ) {
     if ( !empty() ) {
 #     if DEBUG_TOKENIZER
-      cout << "TOKEN: \"" << value_ << "\"\n";
+      cout << "TOKEN: \"" << value_ << "\" (" << pos_ << ',' << sent_ << ',' << para_ << ")\n";
 #     endif
       callback( value_.data(), value_.size(), pos_, sent_, para_, payload );
       clear();
@@ -131,7 +131,7 @@
   Locale const &icu_locale = get_icu_locale_for( lang );
   UErrorCode status = U_ZERO_ERROR;
 
-  word_.reset(
+  word_it_.reset(
     dynamic_cast<RuleBasedBreakIterator*>(
       BreakIterator::createWordInstance( icu_locale, status )
     )
@@ -139,7 +139,7 @@
   if ( U_FAILURE( status ) )
     throw ZORBA_EXCEPTION( zerr::ZXQP0036_BREAKITERATOR_CREATION_FAILED );
 
-  sent_.reset(
+  sent_it_.reset(
     dynamic_cast<RuleBasedBreakIterator*>(
       BreakIterator::createSentenceInstance( Locale::getUS(), status )
     )
@@ -199,11 +199,12 @@
   // This unicode::string wraps the existing buffer: no copy is made.
   unicode::string const utf16_s( false, utf16_buf, utf16_len );
 
-  word_->setText( utf16_s );
-  unicode::size_type word_start = word_->first(), word_end = word_->next();
+  word_it_->setText( utf16_s );
+  unicode::size_type word_start = word_it_->first();
+  unicode::size_type word_end   = word_it_->next();
 
-  sent_->setText( utf16_s );
-  unicode::size_type sent_end = sent_->first(); sent_end = sent_->next();
+  sent_it_->setText( utf16_s );
+  unicode::size_type sent_end = sent_it_->first(); sent_end = sent_it_->next();
 
   temp_token t;
 
@@ -227,10 +228,11 @@
     }
     unique_ptr<utf8::storage_type[]> const auto_utf8_buf( utf8_buf );
 
-    zstring_b utf8_word;
+    zstring_b utf8_word; // used only for debugging & error reporting
     utf8_word.wrap_memory( utf8_buf, utf8_len );
-
-    unicode::size_type const rule_status = word_->getRuleStatus();
+#   if DEBUG_TOKENIZER
+    cout << "GOT: \"" << utf8_word << "\" ";
+#   endif
 
     //
     // "Junk" tokens are whitespace and punctuation -- except some punctuation
@@ -238,10 +240,7 @@
     //
     bool is_junk = false;
 
-#   if DEBUG_TOKENIZER
-    cout << "GOT: \"" << utf8_word << "\" ";
-#   endif
-
+    int32_t const rule_status = word_it_->getRuleStatus();
     if ( IS_WORD_BREAK( NONE, rule_status ) ) {
       //
       // "NONE" tokens are what ICU calls whitespace and punctuation.
@@ -289,7 +288,7 @@
           default:
             in_wild = false;
         }
-      }
+      } // if ( wildcards )
       is_junk = true;
     }
 
@@ -350,10 +349,16 @@
       t.send( payload, callback );
 
 set_token:
+#   if DEBUG_TOKENIZER
+    cout << "at set_token" << endl;
+#   endif
     if ( !is_junk ) {
       if ( in_wild || got_backslash )
         t.append( utf8_buf, utf8_len );
       else {
+#       if DEBUG_TOKENIZER
+        cout << "setting token" << endl;
+#       endif
         t.set(
           utf8_buf, utf8_len, numbers().token, numbers().sent, numbers().para
         );
@@ -362,9 +367,14 @@
     }
 
 next:
-    word_start = word_end, word_end = word_->next();
+#   if DEBUG_TOKENIZER
+    cout << "at next" << endl;
+#   endif
+    word_start = word_end, word_end = word_it_->next();
     if ( word_end >= sent_end && sent_end != BreakIterator::DONE ) {
-      sent_end = sent_->next();
+      sent_end = sent_it_->next();
+      // The addition of the "if" fixes:
+      // https://bugs.launchpad.net/bugs/863320
       if ( sent_end != BreakIterator::DONE )
         ++numbers().sent;
     }
@@ -375,6 +385,9 @@
       err::FTDY0020, ERROR_PARAMS( "", ZED( UnbalancedChar_3 ), '}' )
     );
   t.send( payload, callback );
+  // Incrementing "sent" here fixes:
+  // https://bugs.launchpad.net/bugs/897800
+  ++numbers().sent;
 }
 
 ///////////////////////////////////////////////////////////////////////////////

=== modified file 'src/runtime/full_text/icu_tokenizer.h'
--- src/runtime/full_text/icu_tokenizer.h	2011-09-05 02:06:22 +0000
+++ src/runtime/full_text/icu_tokenizer.h	2011-11-30 01:02:26 +0000
@@ -55,8 +55,8 @@
   typedef std::unique_ptr<RuleBasedBreakIterator> rbbi_ptr;
 
   locale::iso639_1::type const lang_;
-  rbbi_ptr word_;
-  rbbi_ptr sent_;
+  rbbi_ptr word_it_;
+  rbbi_ptr sent_it_;
 };
 
 ///////////////////////////////////////////////////////////////////////////////

=== added file 'test/rbkt/ExpQueryResults/zorba/fulltext/ft-same-sentence-false-2.xml.res'
--- test/rbkt/ExpQueryResults/zorba/fulltext/ft-same-sentence-false-2.xml.res	1970-01-01 00:00:00 +0000
+++ test/rbkt/ExpQueryResults/zorba/fulltext/ft-same-sentence-false-2.xml.res	2011-11-30 01:02:26 +0000
@@ -0,0 +1,1 @@
+false

=== added file 'test/rbkt/Queries/zorba/fulltext/ft-same-sentence-false-2.xq'
--- test/rbkt/Queries/zorba/fulltext/ft-same-sentence-false-2.xq	1970-01-01 00:00:00 +0000
+++ test/rbkt/Queries/zorba/fulltext/ft-same-sentence-false-2.xq	2011-11-30 01:02:26 +0000
@@ -0,0 +1,2 @@
+let $x := <msg>hello. world</msg>
+return $x contains text "hello" ftand "world" same sentence

-- 
Mailing list: https://launchpad.net/~zorba-coders
Post to     : [email protected]
Unsubscribe : https://launchpad.net/~zorba-coders
More help   : https://help.launchpad.net/ListHelp

[Zorba-coders] [Merge] lp:~paul-lucas/zorba/bug-897800 into lp:zorba

Reply via email to