(istream, string&) inefficient

Martin Sebor (JIRA) Wed, 06 Feb 2008 22:15:41 -0800

    [ 
https://issues.apache.org/jira/browse/STDCXX-249?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12566455#action_12566455
 ]


Martin Sebor commented on STDCXX-249:
-------------------------------------

Great numbers!

The patch looks good to me. Maybe just a few minor suggestions:

{code}
Index: istream.cc
===================================================================
--- istream.cc  (revision 618349)
+++ istream.cc  (working copy)
@@ -788,24 +788,23 @@
 {
     _RWSTD_ASSERT (0 != __is.rdbuf ());
 
+    const _TYPENAME basic_istream<_CharT, _Traits>::sentry
+        __ipfx (__is /* , noskipws = false */);
+
     ios_base::iostate __err = ios_base::goodbit;
 
-    _RWSTD_SIZE_T __gcount = 0;
+    typedef _RWSTD_SIZE_T _SizeT;
 
+    // count of characters read from stream
+    _SizeT __gcount = 0;
+
     _TRY {
 
-        const _TYPENAME basic_istream<_CharT, _Traits>::sentry
-            __ipfx (__is /* , noskipws = false */);
-
         if (__ipfx) {
 
-            basic_streambuf<_CharT, _Traits>* const __rdbuf = __is.rdbuf ();
+            __str.clear ();
 
-            // FIXME: code commented out to work around an HP aCC 3.14.10
-            // bug #JAGac86264
-
-            // typedef _TYPENAME
-            //     basic_string<_CharT, _Traits, _Allocator>::size_type
-
-            const _RWSTD_SIZE_T __maxlen =
+            // maximum number of characters we can read
+            _RWSTD_SIZE_T __n =
                 __is.width () ? __is.width () : __str.max_size ();
+            // __is.width (0); // ensure reset in case of exception?
{code}

I agree we should be exception safe but unless this is the only place where we 
don't take exceptions into consideration I'd do that in a separate pass 
throughout all of iostreams.

{code}
-            __str.clear ();
+            basic_streambuf<_CharT, _Traits>* const __rdbuf = __is.rdbuf ();
 
             const ctype<_CharT> &__ctp =
                 _USE_FACET (ctype<_CharT>, __is.getloc ());
 
-            // increment gcount only _after_ sbumpc() but _before_
-            // the subsequent call to sgetc() to correctly reflect
-            // the number of extracted characters in the presence
-            // of exceptions thrown from streambuf virtuals
-            for ( ; __maxlen != __gcount; __rdbuf->sbumpc (), ++__gcount) {
+#ifndef _RWSTD_NO_FRIEND_TEMPLATE
 
-                const _TYPENAME _Traits::int_type __c (__rdbuf->sgetc ());
+            for ( ; __n != 0; ) {
{code}

Since there is no expression in the for statement wouldn't a while loop be 
better here?
 
{code}
+                const _CharT* const __gptr  = __rdbuf->gptr ();
+                const _CharT* const __egptr = __rdbuf->egptr ();
+
+                // maximum number of characters would want to extract
+                _SizeT __navail = __egptr - __gptr;
+                if (__n < __navail)
+                    __navail = __n;
+
+                if (__navail) {
+
+                    // find the delimeter in the squence if it exists, or
+                    // get pointer to end of sequence
+                    const _CharT* __pdel = __gptr;
+                    for (/**/; __pdel != __egptr; ++__pdel) {
+
+                        const _TYPENAME _Traits::int_type __c
+                            (_Traits::to_int_type(*__pdel));
+
+                        if (_Traits::eq_int_type (__c, _Traits::eof ())) {
+                            __err = ios_base::eofbit;
+                            break;
+                        }
+
+                        if (__ctp.is (__ctp.space, *__pdel))
+                            break;
+                    }
+
+                    // __pdel is either pointing to a delimiter or one past
+                    // the end of the input stream get area. if it is past
+                    // the end, then set it to null so the above loop acts
+                    // similar to strpbrk()
+                    if (__pdel == __egptr) {
+                        __pdel = 0;
+                    }
+
+                    if (__pdel) { // < __egptr
{code}

What does the comment mean here? That (__pdel < __egptr) holds? Would an 
assertion instead be overkill here?

{code}
+                        __navail = __pdel - __gptr + 1;
+                        __n     -= __navail - 1;
+                    }
+                    else if (__n == __navail) // == __egptr
{code}

And here that __pdel == __egptr? (Same as above.)

{code}
+                        __n -= --__navail;
+                    else
+                        __n -= __navail;
+
+                    // store characters excluding the delimiter
+                    __str.append (__gptr, __navail - !!__pdel);
+
+                    __gcount += __navail;
+
+                    // advance gptr() by the number of extracted
+                    // characters, including the delimiter
+                    __rdbuf->gbump (__navail);
+
+                    // we found a delimiter before the end of the get area, 
break out
{code}

Can you trim the comment to 78 characters or less? :)

{code}
+                    // of outer loop
+                    if (__pdel) {
+                        break;
+                    }
+
+                    if (2 > __n && _SizeT (__egptr - __gptr) != __navail) {
+                        __err = ios_base::failbit;
+                        break;
+                    }
+                }
+                else {
+
+                    // n data in buffer, trigger underflow()
+                    // note that streambuf may be unbuffered
+                    const _TYPENAME _Traits::int_type __c
+                        = __rdbuf->sgetc ();
+
+                    if (_Traits::eq_int_type (__c, _Traits::eof ())) {
+                        __err = ios_base::eofbit;
+                        break;
+                    }
+
+                    const _TYPENAME _Traits::char_type __ch
+                        = _Traits::to_char_type (__c);
{code}

The equals should go on the line above.

{code}
+
+                    if (__ctp.is (__ctp.space, __ch))
+                        break;
+
+                    __str.push_back (__ch);
+                    --__n;
+
+                    __rdbuf->sbumpc ();
+
+                    // increment gcount only _after_ sbumpc() but _before_
+                    // the subsequent call to sgetc() to correctly reflect
+                    // the number of extracted characters in the presence
+                    // of exceptions thrown from streambuf virtuals
+                    ++__gcount;
+
+                    // honestly, it seems that we should not append to __str
+                    // until after sbumpc(). if we leave it like this, then
+                    // __str will have the character in it, but the stream
+                    // may also.
{code}

That wouldn't be good. The 
[21.string.io.cpp|http://svn.apache.org/repos/asf/stdcxx/trunk/tests/strings/21.string.io.cpp]
 test passes so maybe it needs to be tightened up to expose this problem? And 
if/when it does expose it, I agree we should deal with it.

{code}
+                }
+            }
+
+#else   // if defined (_RWSTD_NO_FRIEND_TEMPLATE)
+
+            for ( ; __n != 0; ) {
+
+                const _TYPENAME _Traits::int_type __c
+                    = __rdbuf->sgetc ();
+
                 if (_Traits::eq_int_type (__c, _Traits::eof ())) {
                     __err = ios_base::eofbit;
                     break;
                 }
 
-                // convert to char_type so that isspace works correctly
-                const _TYPENAME _Traits::char_type
-                    __ch = _Traits::to_char_type (__c);
+                const _TYPENAME _Traits::char_type __ch
+                    = _Traits::to_char_type (__c);
{code}

Same as above with the equals.

{code}
                 if (__ctp.is (__ctp.space, __ch))
                     break;
 
                 __str.push_back (__ch);
+                --__n;
+
+                __rdbuf->sbumpc ();
+
+                // increment gcount only _after_ sbumpc() but _before_
+                // the subsequent call to sgetc() to correctly reflect
+                // the number of extracted characters in the presence
+                // of exceptions thrown from streambuf virtuals
+                ++__gcount;
             }
 
+#endif   // if defined (_RWSTD_NO_FRIEND_TEMPLATE)
+
+
             __is.width (0);
         }
     }
     _CATCH (...) {
-        __is.setstate (__is.badbit | _RW::__rw_rethrow);
+        __is.setstate (ios_base::badbit | _RW::__rw_rethrow);
{code}

I realize we're not consistent in our usage of the bits but I think we should 
pick one of the two styles and use it consistently throughout, unless there's a 
reason not to (such as if {{ios_base}} is an incomplete type).

{code}
     }
 
     if (!__gcount)
@@ -852,7 +965,7 @@
         __is.setstate (__err);
 
     return __is;
-}  
+}
 
 
 _EXPORT
Index: streambuf
===================================================================
--- streambuf   (revision 618349)
+++ streambuf   (working copy)
@@ -415,6 +415,12 @@
     getline (basic_istream<_Char2, _Traits2>&, 
              basic_string<_Char2, _Traits2, _Allocator>&, _Char2);
 
+    _EXPORT
+    template <class _Char2, class _Traits2, class _Allocator>
+    friend basic_istream<_Char2, _Traits2>&
+    operator>> (basic_istream<_Char2, _Traits2>&, 
+                basic_string<_Char2, _Traits2, _Allocator>&);
+
 #endif   // _RWSTD_NO_FRIEND_TEMPLATE
 
 };
{code}

> std::operator>>(istream, string&) inefficient
> ---------------------------------------------
>
>                 Key: STDCXX-249
>                 URL: https://issues.apache.org/jira/browse/STDCXX-249
>             Project: C++ Standard Library
>          Issue Type: Improvement
>          Components: 21. Strings
>    Affects Versions: 4.1.2, 4.1.3, 4.1.4, 4.2.0
>         Environment: all
>            Reporter: Martin Sebor
>            Assignee: Travis Vitek
>            Priority: Minor
>             Fix For: 4.2.1
>
>         Attachments: stdcxx-249.patch
>
>   Original Estimate: 8h
>          Time Spent: 9h
>  Remaining Estimate: 0h
>
> The string extractor reads and appends one character at a time. It could be 
> made more efficient by extracting and appending all non-whitespace characters 
> that are available in the buffer in chunks. See the definition of the 
> operator here:
> http://svn.apache.org/viewvc/stdcxx/trunk/include/istream.cc?revision=383265

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.

[jira] Commented: (STDCXX-249) std::operator>>(istream, string&) inefficient

Reply via email to