EBram Moolenaar wrote:
>
> Dominique Pellé wrote:
>
>> Here is another bug found by afl-fuzz. The following commands causes
>> vim-7.8.824 (and older) to access invalid memory, beyond end of string:
>>
>>   $ mkdir /tmp/foo
>>   $ cd /tmp/foo
>>   $ touch '#'
>>   $ touch $(perl -e 'print chr(0xf0),chr(0x80),chr(0x80),chr(0xa3)')
>>   $ vim -u NONE -c 'e*'
>>
>> Asan reports this error:
>>
>> =================================================================
>> ==6572==ERROR: AddressSanitizer: heap-buffer-overflow on address
>> 0x602000017114 at pc 0x0000011db7e1 bp 0x7fff5b6aacb0 sp
>> 0x7fff5b6aaca8f_wc_equal
>> READ of size 1 at 0x602000017114 thread T0
>>     #0 0x11db7e0 in utf_ptr2char /home/pel/sb/vim/src/mbyte.c:1696
>> (discriminator 3)
>>     #1 0x113bbe7 in pathcmp /home/pel/sb/vim/src/misc2.c:5824 (discriminator 
>> 1)
>>     #2 0x10e9c68 in pstrcmp /home/pel/sb/vim/src/misc1.c:10159 
>> (discriminator 9)
>>     #3 0x7f4bd183b4c8 in msort_with_tmp
>> /build/buildd/eglibc-2.19/stdlib/msort.c:83
>>     #4 0x7f4bd183b77b in msort_with_tmp
>> /build/buildd/eglibc-2.19/stdlib/msort.c:45
>>     #5 0x10e8820 in unix_expandpath /home/pel/sb/vim/src/misc1.c:10377
>> (discriminator 12)
>>     #6 0x150934e in mch_expandpath /home/pel/sb/vim/src/os_unix.c:5640
>>     #7 0x10e1dbf in gen_expand_wildcards /home/pel/sb/vim/src/misc1.c:10983
>>     #8 0x10dd295 in expand_wildcards /home/pel/sb/vim/src/misc1.c:9695
>>     #9 0x10dcb25 in expand_wildcards_eval /home/pel/sb/vim/src/misc1.c:9666
>>     #10 0xc31003 in ExpandFromContext /home/pel/sb/vim/src/ex_getln.c:4613
>>     #11 0xbfef41 in ExpandOne /home/pel/sb/vim/src/ex_getln.c:3627
>>     #12 0xae89e1 in expand_filename /home/pel/sb/vim/src/ex_docmd.c:5061
>>     #13 0xab8d5d in do_one_cmd /home/pel/sb/vim/src/ex_docmd.c:2879
>>     #14 0xa92b4d in do_cmdline /home/pel/sb/vim/src/ex_docmd.c:1133
>>     #15 0xa99f50 in do_cmdline_cmd /home/pel/sb/vim/src/ex_docmd.c:738
>>     #16 0x2076afd in exe_commands /home/pel/sb/vim/src/main.c:2926
>>     #17 0x20577a5 in main /home/pel/sb/vim/src/main.c:961
>>     #18 0x7f4bd1821ec4 in __libc_start_main
>> /build/buildd/eglibc-2.19/csu/libc-start.c:287
>>     #19 0x466c96 in _start ??:?
>>
>> 0x602000017114 is located 2 bytes to the right of 2-byte region
>> [0x602000017110,0x602000017112)
>> allocated by thread T0 here:
>>     #0 0x4edc62 in __interceptor_malloc ??:?
>>     #1 0x110e232 in lalloc /home/pel/sb/vim/src/misc2.c:921
>>     #2 0x110de9b in alloc /home/pel/sb/vim/src/misc2.c:820
>>     #3 0x10e9360 in addfile /home/pel/sb/vim/src/misc1.c:11134
>>     #4 0x10e8269 in unix_expandpath /home/pel/sb/vim/src/misc1.c:10363
>>     #5 0x150934e in mch_expandpath /home/pel/sb/vim/src/os_unix.c:5640
>>     #6 0x10e1dbf in gen_expand_wildcards /home/pel/sb/vim/src/misc1.c:10983
>>     #7 0x10dd295 in expand_wildcards /home/pel/sb/vim/src/misc1.c:9695
>>     #8 0x10dcb25 in expand_wildcards_eval /home/pel/sb/vim/src/misc1.c:9666
>>     #9 0xc31003 in ExpandFromContext /home/pel/sb/vim/src/ex_getln.c:4613
>>     #10 0xbfef41 in ExpandOne /home/pel/sb/vim/src/ex_getln.c:3627
>>     #11 0xae89e1 in expand_filename /home/pel/sb/vim/src/ex_docmd.c:5061
>>     #12 0xab8d5d in do_one_cmd /home/pel/sb/vim/src/ex_docmd.c:2879
>>     #13 0xa92b4d in do_cmdline /home/pel/sb/vim/src/ex_docmd.c:1133
>>     #14 0xa99f50 in do_cmdline_cmd /home/pel/sb/vim/src/ex_docmd.c:738
>>     #15 0x2076afd in exe_commands /home/pel/sb/vim/src/main.c:2926
>>     #16 0x20577a5 in main /home/pel/sb/vim/src/main.c:961
>>     #17 0x7f4bd1821ec4 in __libc_start_main
>> /build/buildd/eglibc-2.19/csu/libc-start.c:287
>>
>> Shadow bytes around the buggy address:
>>   0x0c047fffadd0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
>>   0x0c047fffade0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
>>   0x0c047fffadf0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
>>   0x0c047fffae00: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
>>   0x0c047fffae10: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
>> =>0x0c047fffae20: fa fa[02]fa fa fa 05 fa fa fa fd fa fa fa fd fa
>>   0x0c047fffae30: fa fa 03 fa fa fa 04 fa fa fa 00 00 fa fa 00 00
>>   0x0c047fffae40: fa fa 00 00 fa fa 00 00 fa fa 00 00 fa fa 00 00
>>   0x0c047fffae50: fa fa 00 00 fa fa fd fd fa fa fd fd fa fa fd fd
>>   0x0c047fffae60: fa fa fd fd fa fa fd fd fa fa fd fd fa fa fd fd
>>   0x0c047fffae70: fa fa fd fd fa fa fd fd fa fa fd fd fa fa fd fd
>> Shadow byte legend (one shadow byte represents 8 application bytes):
>>   Addressable:           00
>>   Partially addressable: 01 02 03 04 05 06 07
>>   Heap left redzone:       fa
>>   Heap right redzone:      fb
>>   Freed heap region:       fd
>>   Stack left redzone:      f1
>>   Stack mid redzone:       f2
>>   Stack right redzone:     f3
>>   Stack partial redzone:   f4
>>   Stack after return:      f5
>>   Stack use after scope:   f8
>>   Global redzone:          f9
>>   Global init order:       f6
>>   Poisoned by user:        f7
>>   Container overflow:      fc
>>   Array cookie:            ac
>>   Intra object redzone:    bb
>>   ASan internal:           fe
>>   Left alloca redzone:     ca
>>   Right alloca redzone:    cb
>> ==6572==ABORTING
>>
>> Code in misc2.c:
>>
>> 5812     int
>> 5813 pathcmp(p, q, maxlen)
>> 5814     const char *p, *q;
>> 5815     int maxlen;
>> 5816 {
>> 5817     int         i;
>> 5818     int         c1, c2;
>> 5819     const char  *s = NULL;
>> 5820
>> 5821     for (i = 0; maxlen < 0 || i < maxlen; i += MB_PTR2LEN((char_u *)p + 
>> i))
>> 5822     {
>> 5823         c1 = PTR2CHAR((char_u *)p + i);
>> 5824         c2 = PTR2CHAR((char_u *)q + i);
>>
>> This code assumes that if c1 and c2 are identical, then they must also
>> have the same value for MB_PTR2LEN(....). It's an incorrect assumption.
>> For example, the following 2 UTF-8 sequences encode the same character
>> U+0023 (i.e #) and yet they have different MB_PTR2LEN(...):
>>
>>    0x23                     (MB_PTR2LEN(...) is 1)
>>    0xf0 0x80 0x80 0xa3      (MB_PTR2LEN(...) is 4).
>
> Good catch.  The code is indeed making a wrong assumption.
>
>>
>> The first UTF-8 sequence uses:
>>
>>        0x00000000 - 0x0000007F:
>>            0xxxxxxx
>>
>>        0x00010000 - 0x001FFFFF:
>>            11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
>>
>> Strictly speaking, the 2nd UTF-8 sequence should be considered illegal
>> in this case according to "man utf-8":
>>
>> === BEGIN [ man utf-8 ] ===
>> Security
>>    The Unicode and UCS standards require that producers of UTF-8 shall
>>    use  the  shortest form possible, for example, producing a two-byte
>>    sequence with first byte 0xc0 is nonconforming.   Unicode  3.1  has
>>    added the requirement that conforming programs must not accept non-
>>    shortest forms in their input.  This is for  security  reasons:  if
>>    user  input  is checked for possible security violations, a program
>>    might check only for the ASCII version of "/../" or ";" or NUL  and
>>    overlook  that  there  are  many  non-ASCII ways to represent these
>>    things in a non-shortest UTF-8 encoding.
>> === END [ man utf-8 ] ===
>>
>> Yet Vim gladly accept such malformed UTF-8 sequence like 0xf0 0x80 0x80 0xa3
>> as valid and treats it the same as 0x23 (# char). So perhaps a better fix
>> would be to consider those UTF-8 sequences as invalid? (changes to mbytes.c).
>> I did not do that. It might slow down Vim with UTF-8, and being able to view
>> such invalid utf8 sequences in Vim might be considered useful in some cases?
>
> Normalizing utf-8 characters would be useful in some places.  But in
> other places it conflicts with the idea that Vim should not change the
> text.  It might not be Unicode that we're editing.  This can get
> complicated, thus I'm not so eager to make a drastic change.
>
> When editing a text file, when encountering such an overlong utf-8
> sequence, this could be considered a conversion error.  This would then
> result in falling back to assuming the file is latin1, which is an 8-bit
> encoding without illegal characters.
>
> However, even then it's still possible to force Vim to assume the file
> is utf-8.  The problem would be smaller, since the user must have setup
> this encoding enforcement somehow.  But it's better to not make the
> assumption that all utf-8 is valid utf-8.
>
> In the code you found it should be simple to use MB_PTR2LEN() twice.
>
> Perhaps we could do both.


Upon code inspection, I found a similar kind of bug in f_wc_equal(...).

Furthermore, besides overlong UTF-8 sequences, there is another
reason why c1 and c2 may have different PTR2LEN.
It could also happen for valid UTF-8 sequences when doing
case insensitive comparisons.

See this link:

== BEGIN QUOTE https://twitter.com/jifa/status/625776454479970304 ==
Ⱥ (U+023A) and Ⱦ (U+023E) are the *only* code points to increase
in length (2 to 3 bytes) when lowercased.
== END QUOTE ===

I've updated my patch to fix both f_wc_equal(...) and pathcmp(...).
I also changed the fix for pathcmp(...) as it was not correct when
doing case insensitive comparison.

Regards
Dominique

-- 
-- 
You received this message from the "vim_dev" maillist.
Do not top-post! Type your reply below the text you are replying to.
For more information, visit http://www.vim.org/maillist.php

--- 
You received this message because you are subscribed to the Google Groups 
"vim_dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/d/optout.
diff -r 91852336e638 src/misc2.c
--- a/src/misc2.c	Thu Aug 13 23:28:44 2015 +0200
+++ b/src/misc2.c	Sat Aug 15 14:49:59 2015 +0200
@@ -5058,7 +5058,7 @@
     char_u	*s1;
     char_u	*s2;
 {
-    int		i;
+    int		i, j;
     int		prev1 = NUL;
     int		prev2 = NUL;
 
@@ -5071,16 +5071,19 @@
     if (STRLEN(s1) != STRLEN(s2))
 	return FAIL;
 
-    for (i = 0; s1[i] != NUL && s2[i] != NUL; i += MB_PTR2LEN(s1 + i))
+    for (i = 0, j = 0; s1[i] != NUL;)
     {
 	int c1 = PTR2CHAR(s1 + i);
-	int c2 = PTR2CHAR(s2 + i);
+	int c2 = PTR2CHAR(s2 + j);
 
 	if ((p_fic ? MB_TOLOWER(c1) != MB_TOLOWER(c2) : c1 != c2)
 		&& (prev1 != '*' || prev2 != '*'))
 	    return FAIL;
 	prev2 = prev1;
 	prev1 = c1;
+
+        i += MB_PTR2LEN(s1 + i);
+        j += MB_PTR2LEN(s2 + j);
     }
     return TRUE;
 }
@@ -5814,14 +5817,14 @@
     const char *p, *q;
     int maxlen;
 {
-    int		i;
+    int		i, j;
     int		c1, c2;
     const char	*s = NULL;
 
-    for (i = 0; maxlen < 0 || i < maxlen; i += MB_PTR2LEN((char_u *)p + i))
+    for (i = 0, j = 0; maxlen < 0 || (i < maxlen && j < maxlen);)
     {
 	c1 = PTR2CHAR((char_u *)p + i);
-	c2 = PTR2CHAR((char_u *)q + i);
+	c2 = PTR2CHAR((char_u *)q + j);
 
 	/* End of "p": check if "q" also ends or just has a slash. */
 	if (c1 == NUL)
@@ -5829,6 +5832,7 @@
 	    if (c2 == NUL)  /* full match */
 		return 0;
 	    s = q;
+            i = j;
 	    break;
 	}
 
@@ -5854,8 +5858,11 @@
 	    return p_fic ? MB_TOUPPER(c1) - MB_TOUPPER(c2)
 		    : c1 - c2;  /* no match */
 	}
+
+	i += MB_PTR2LEN((char_u *)p + i);
+	j += MB_PTR2LEN((char_u *)q + j);
     }
-    if (s == NULL)	/* "i" ran into "maxlen" */
+    if (s == NULL)	/* "i" or "j" ran into "maxlen" */
 	return 0;
 
     c1 = PTR2CHAR((char_u *)s + i);

Raspunde prin e-mail lui