New topic: 

Regex hard crash at c4000 characters

<http://forums.realsoftware.com/viewtopic.php?t=46514>

         Page 1 of 1
   [ 3 posts ]                 Previous topic | Next topic          Author  
Message        JimPitchford          Post subject: Regex hard crash at c4000 
charactersPosted: Thu Jan 10, 2013 3:40 pm                         
Joined: Mon Apr 11, 2011 2:01 pm
Posts: 127                I have been investigating a hard crash in regex. It 
seems it crashes when the searched text exceeds around 4000 characters for 
particular search patterns.

I have extracted code to reproduce this below. Putting a breakpoint at the 
crash site allows you to step through until the crash occurs. In my case this 
occurs at j = 4 which is somewhere between 3,000 and 4,000 characters.

Changing the search pattern to be slightly simpler stops the crash occurring - 
at least for these text sizes.

Has anyone witnessed anything similar? Any workarounds?

Jim

// Reproduction of the RegEx hard crash: each pass of the outer loop builds a
// longer subject string and searches it on the calling thread. The pattern
// with the "+" repeater outside the capturing group is kept deliberately,
// because it is the one that triggers the crash.
for j as integer = 1 to 10 step 1
  
  // The payload grows with j: (100*j + 1) copies of a 10-character chunk.
  dim bookmark as string
  for i as integer = 0 to 100*j
    bookmark = bookmark + "abcdefghij"
  next
  
  // Hex-encode, join, Base64-encode, then wrap in a <string> element.
  dim bookmarks() as string
  bookmarks.append encodehex(bookmark)
  
  dim a as string = join(bookmarks, endofLine)
  
  dim b as string = EncodeBase64(a)
  
  b = "<string>" + b + "</string>"
  
  dim tTyp, tVal as string
  
  dim tRgx as regex, tFnd as regexMatch
  tRgx = new regex
  tRgx.options.caseSensitive = true
  tRgx.options.greedy = false
  tRgx.options.matchEmpty = false
  
  // First search: the opening tag.
  tRgx.searchPattern = "<\w+>"
  tFnd = tRgx.search(b)
  if tFnd <> nil then
    tTyp = tFnd.subExpressionString(0)
    
    tRgx.searchPattern = ">([a-zA-Z0-9\-\+.:,\\/=\n\r])+<"  //this search pattern crashes
    'tRgx.searchPattern = "([a-zA-Z0-9\-\+.:,\\/=\n\r])+"  //this search pattern works
    
    tFnd = tRgx.search(b) //crashes here without warning
    
    if tFnd <> nil then
      tVal = tFnd.subExpressionString(0)
    end if
  end if
  
  // Show the last iteration that completed.
  textarea1.text = str(j)
  
next
      
_________________
Jim
OSX 10.8.2, rb2012r2  
                             Top                ktekinay          Post subject: 
Re: Regex hard crash at c4000 charactersPosted: Thu Jan 10, 2013 7:07 pm        
                         
Joined: Mon Feb 05, 2007 5:21 pm
Posts: 340
Location: New York, NY                I ran into something similar with 
RegExRX. Your pattern is making the PCRE engine exceed the available stack size 
of the main thread. Unfortunately, there is nothing you can do about that, but 
you can run your search in a separate thread and adjust the stack size there.

I didn't try to narrow down the "sweet spot", but I increased the stack size to 
40 MB and avoided the crash. Here is the code:
Sub Action()
  // Runs RunRegEx on a secondary thread with an enlarged stack, so the
  // stack-hungry pattern does not exhaust the main thread's stack.
  dim thd as new Thread
  thd.StackSize = 40000000 // about 40 MB; not tuned to the minimum that works
  thd.Priority = Thread.HighPriority
  AddHandler thd.Run, AddressOf RunRegEx
  thd.Run
  
  // Wait until the thread has actually finished. Testing only for
  // Thread.Running would leave the loop (and remove the handler) while the
  // thread is still alive but in the Waiting, Sleeping or Suspended state.
  while thd.State <> Thread.NotRunning
    App.YieldToNextThread
  wend
  
  RemoveHandler thd.Run, AddressOf RunRegEx
  thd = nil
  
End Sub


Sub RunRegEx(sender As Thread = nil)
  // Thread handler that runs the original crash reproduction unchanged.
  // The problematic pattern (repeater outside the capturing group) is kept on
  // purpose: this routine demonstrates that the search completes when it is
  // run on a thread with a larger stack.
  //
  // sender: the thread invoking this handler; not used.
  #pragma unused sender
  
  for j as integer = 1 to 10 step 1
    
    'dim file as FolderItem = SpecialFolder.Desktop.Child("test")
    
    // The payload grows with j: (100*j + 1) copies of a 10-character chunk.
    dim bookmark as string
    for i as integer = 0 to 100*j
      bookmark = bookmark + "abcdefghij"
    next
    
    // Hex-encode, join, Base64-encode, then wrap in a <string> element.
    dim bookmarks() as string
    bookmarks.append encodehex(bookmark)
    
    dim a as string = join(bookmarks, endofLine)
    
    dim b as string = EncodeBase64(a)
    
    b = "<string>" + b + "</string>"
    
    dim tTyp, tVal as string
    
    dim tRgx as regex, tFnd as regexMatch
    tRgx = new regex
    tRgx.options.caseSensitive = true
    tRgx.options.greedy = false
    tRgx.options.matchEmpty = false
    
    // First search: the opening tag.
    tRgx.searchPattern = "<\w+>"
    tFnd = tRgx.search(b)
    if tFnd <> nil then
      tTyp = tFnd.subExpressionString(0)
      
      tRgx.searchPattern = ">([a-zA-Z0-9\-\+.:,\\/=\n\r])+<"  //this search pattern crashes on the main thread
      'tRgx.searchPattern = "([a-zA-Z0-9\-\+.:,\\/=\n\r])+"  //this search pattern works
      
      tFnd = tRgx.search(b) //crashed here on the main thread; reported not to crash with the 40 MB thread stack
      
      if tFnd <> nil then
        tVal = tFnd.subExpressionString(0)
      end if
    end if
    
    'textarea1.text = str(j)
    
  next
End Sub

The pattern requires a large stack because you are using the repeater "+" 
outside the capturing group. If you don't need the string to be captured, you 
could remove the parentheses, or put the repeater inside of them. There really 
isn't much use in creating the pattern that way.
// Alternative patterns that avoid repeating a capturing group; use one of them.
tRgx.searchPattern = ">([a-zA-Z0-9\-\+.:,\\/=\n\r]+)<"  // or
tRgx.searchPattern = ">[a-zA-Z0-9\-\+.:,\\/=\n\r]+<"  // or
tRgx.searchPattern = ">(?:[a-zA-Z0-9\-\+.:,\\/=\n\r])+<"  // (odd pattern, better to use one of the ones above)

I tried the first version above and it does not crash.      
_________________
Kem Tekinay
MacTechnologies Consulting
http://www.mactechnologies.com/

Need to develop, test, and refine regular expressions? Try RegExRX.
  
                             Top                JimPitchford          Post 
subject: Re: Regex hard crash at c4000 charactersPosted: Thu Jan 10, 2013 8:05 
pm                         
Joined: Mon Apr 11, 2011 2:01 pm
Posts: 127                Brilliant.

Yes, it seems to work.

Thanks for the insights.

Jim      
_________________
Jim
OSX 10.8.2, rb2012r2  
                             Top             Display posts from previous: All 
posts1 day7 days2 weeks1 month3 months6 months1 year Sort by AuthorPost 
timeSubject AscendingDescending          Page 1 of 1
   [ 3 posts ]      
-- 
Over 1500 classes with 29000 functions in one REALbasic plug-in collection. 
The Monkeybread Software Realbasic Plugin v9.3. 
http://www.monkeybreadsoftware.de/realbasic/plugins.shtml

[email protected]

Reply via email to