New topic: 

Building a faster InStr

<http://forums.realsoftware.com/viewtopic.php?t=47035>

         Page 1 of 1
   [ 1 post ]                 Previous topic | Next topic          Author  
Message        ktekinay          Post subject: Building a faster InStrPosted: 
Fri Feb 22, 2013 12:00 pm                                 
Joined: Mon Feb 05, 2007 5:21 pm
Posts: 469
Location: New York, NY                Based on the conversation here, I tried 
to build a faster InStr using RegEx as the backbone, and I succeeded.

To a point. 

I converted the find string to a pattern like this:
find = find.ConvertEncoding( Encodings.UTF8 )
find = "\Q" + find.ReplaceAllB( "\E", "\E\\E\Q" ) + "\E"

Basically, that will turn a string like "1+2" into "\Q1+2\E", and "1\E2" into 
"\Q1\E\\E\Q2\E". (Anything between "\Q" and "\E" will be treated like a literal 
token, so I have to treat an actual "\E" in the find string in a special way.)

The problem is accented characters. Finding "h" in "Hot" will work with 
either InStr or a RegEx, but finding "ü" in "Ü2" will fail with a 
RegEx. The alternative is to convert both the source and find to uppercase and 
run a case-sensitive regex match, but the overhead of that makes it take longer 
than InStr. Finally, I had to settle on using this only on strings that don't 
have multi-byte characters.

This code will be in my M_String module the next time I post an update to my 
web site, but here's part of it for your consideration:
Function InStrFaster_MTC(Extends src As String, Optional start As Integer, find As String) As Integer
  // Alternative to InStr that hands the search to RegEx (via InStrRegExB).
  //
  // Falls back to the built-in InStr when src is shorter than kSourceLimit
  // bytes, when find is empty, or when src contains multi-byte characters.
  // On the RegEx path the result is the byte position reported by
  // InStrRegExB passed through PosBToPos, or 0 when InStrRegExB reports 0.
  
  // Only bothers with longer source strings, otherwise the method calls and
  // setup will overcome the savings.
  const kSourceLimit = 1000
  if src.LenB < kSourceLimit or find = "" then return src.InStr( start, find )
  
  // Won't try with multi-byte strings because regex won't equate upper and
  // lowercase accented characters. Once src is UTF-8, Len <> LenB means at
  // least one character occupies more than one byte.
  src = src.ConvertEncoding( Encodings.UTF8 )
  if src.Len <> src.LenB then return src.InStr( start, find )
  
  // Wrap find in \Q...\E so the regex engine treats it as a literal token.
  // An actual "\E" inside find would end the quoting early, so each one is
  // replaced by: close the quote, match a literal backslash and "E", then
  // reopen the quote.
  find = find.ConvertEncoding( Encodings.UTF8 )
  find = "\Q" + find.ReplaceAllB( "\E", "\E\\E\Q" ) + "\E"
  
  // Could have called InStrRegEx, but this saves some method calls
  dim startB as integer
  if start <> 0 then startB = PosToPosB( src, start )
  
  // The final argument (false) requests a case-insensitive search.
  dim posB as integer = InStrRegExB( startB, src, find, false )
  
  // pos keeps its default of 0 when nothing was found.
  dim pos as integer
  if posB <> 0 then pos = PosBToPos( src, posB )
  return pos
  
End Function

Protected Function InStrRegExB(startB As Integer, source As String, pattern As String, caseSensitive As Boolean = True) As Integer
  // Runs the regular expression pattern against source, starting at byte
  // position startB, and returns match.SubExpressionStartB( 0 ) + 1 for the
  // first match, or 0 when nothing matches.
  //
  // An empty pattern returns startB unchanged; an empty source returns 0.
  // When the search had to be done on a UTF-8 copy of source, the result is
  // mapped back to a byte position in source's original encoding.
  
  dim r as Integer
  
  // A single RegEx object is created on first use and reused by later calls.
  static rx as RegEx
  dim match as RegExMatch
  
  if pattern = "" then return startB
  if source = "" then return 0
  
  // Search on UTF-8 when source is in another encoding and has multi-byte
  // characters. The start position is carried across the conversion by
  // going through a character position.
  dim enc as TextEncoding = source.Encoding
  dim needsConversion as boolean = ( enc <> Encodings.UTF8 ) and source.HasMultiByteChars_MTC
  if needsConversion then
    dim start as integer = PosBToPos( source, startB )
    source = source.ConvertEncoding( Encodings.UTF8 )
    startB = PosToPosB( source, start )
  end if
  pattern = pattern.ConvertEncoding( source.Encoding )
  
  // Adjust the startB: decrement it for rx.Search and clamp at 0.
  startB = startB - 1
  if startB < 0 then startB = 0
  
  if rx = nil then
    rx = new RegEx
    rx.Options.Greedy = false
  end if
  rx.Options.CaseSensitive = caseSensitive
  
  rx.SearchPattern = pattern
  match = rx.Search( source, startB )
  if match = nil or match.SubExpressionCount = 0 then
    r = 0
  else
    r = match.SubExpressionStartB( 0 ) + 1
  end if
  
  // Translate the UTF-8 byte position back to the original encoding: take
  // the bytes that precede the match, convert them back, and measure them.
  if needsConversion and r <> 0 then
    source = source.LeftB( r - 1 )
    source = source.ConvertEncoding( enc )
    r = source.LenB + 1
  end if
  
  return r
  
End Function
      
_________________
Kem Tekinay
MacTechnologies Consulting
http://www.mactechnologies.com/

Need to develop, test, and refine regular expressions? Try RegExRX.
  
                             Top             Display posts from previous: All 
posts1 day7 days2 weeks1 month3 months6 months1 year Sort by AuthorPost 
timeSubject AscendingDescending          Page 1 of 1
   [ 1 post ]      
-- 
Over 1500 classes with 29000 functions in one REALbasic plug-in collection. 
The Monkeybread Software Realbasic Plugin v9.3. 
http://www.monkeybreadsoftware.de/realbasic/plugins.shtml

[email protected]

Reply via email to