http://nagoya.apache.org/bugzilla/show_bug.cgi?id=1346 *** shadow/1346 Mon Apr 16 12:26:53 2001 --- shadow/1346.tmp.5869 Mon Apr 16 12:26:53 2001 *************** *** 0 **** --- 1,178 ---- + +============================================================================+ + | substitute gives strange result | + +----------------------------------------------------------------------------+ + | Bug #: 1346 Product: ORO | + | Status: NEW Version: 2.0.2 | + | Resolution: Platform: PC | + | Severity: Normal OS/Version: Linux | + | Priority: Component: Main | + +----------------------------------------------------------------------------+ + | Assigned To: [EMAIL PROTECTED] | + | Reported By: [EMAIL PROTECTED] | + +----------------------------------------------------------------------------+ + | URL: | + +============================================================================+ + | DESCRIPTION | + I am trying to find email addresses and substitute them with a link to an email + compose page. However, when I try the substitution I get odd results -- the + substitute method accurately finds the email addresses but I don't have access + to $1.... Oddly enough when I use the same code for turning URLs into links to a + popup window containing the URL content everything works ok. I think the + problem may be in the ORO code. + + _______________________________________________________________________________ + code and regexes + _______________________________________________________________________________ + code: + ******************************************************************************* + + public static String PerlSubstituteTool(String reg_ex, int reg_ex_Flags, + String substitution, int interpolations, String input) + { + //fail safe -- if a regex doesn't work it might be set to + //empty string... 
test for that and if we don't have a regex skip + //alternately a flag could have an alpha character in it (causing + //a number format exception (before this is called) but that is + //kludgy + if(null == reg_ex || reg_ex.equals("")) + { + //do nothing -- we'll just send back the input + } + else + { + try + { + String temp = null; + Perl5Compiler compiler = new Perl5Compiler(); + Perl5Substitution sub = new Perl5Substitution(substitution); + Pattern pat = compiler.compile(reg_ex, reg_ex_Flags); + temp = Util.substitute(new Perl5Matcher(), pat, sub, input, + interpolations); + input = temp; + } + catch (MalformedPatternException mpe) + { + System.out.println("in catch MalformedPatternException"); + System.out.println("Exception is "+mpe.getMessage()); + } + } + return input; + } + ******************************************************************************* + Note: these regexes are pulled in from LDAP so the String that is generated + (by the time it gets pulled in by the Java code) is properly escaped + + a regex that works (for URLS): + ******************************************************************************* + +\b((ftp|http|gopher|mailto|news|nntp|telnet|wais|file|prospero|z39.50s|z39.50r|cid|mid|vemmi|service|imap|nfs|acap|rstp|tip|pop|data|dav|opaquelocktoken|sip|tel|fax|modem|ldap|afs|tn3270|mailserver):[\w/#~:.?+=&@!\-.:?\-;!>] + +? ) (?= [.:?\-;!>]* [^\w/#~:.?+=&@!\-.:?\-;!>] | $ ) + + ******************************************************************************* + a regex that properly finds its target but does not give me access to $1... 
once + it is done (it is from the OReilly book "Mastering Regular Expressions" + ******************************************************************************* + [\040\t]* (?: \([^\x80-\xff\n\015()]* (?: + (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* (?:\[^\x80-\xff][^\x80-\xff\n\015()]* + )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* )*(?: (?: [^ + (\040)<>\@,;:".\[\]\000-\037\x80-\xff]+(?![^ + (\040)<>\@,;:".\[\]\000-\037\x80-\xff])|"[^\x80-\xff\n\015"]* (?: + \[^\x80-\xff][^\x80-\xff\n\015"]* )* ")[\040\t]* (?: \([^\x80-\xff\n\015()]* (?: + (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* (?:\[^\x80-\xff][^\x80-\xff\n\015()]* + )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* )*(?: \.[\040\t]* (?: + \([^\x80-\xff\n\015()]* (?: (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* + (?:\[^\x80-\xff][^\x80-\xff\n\015()]* )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* + )*(?: [^ (\040)<>\@,;:".\[\]\000-\037\x80-\xff]+(?![^ + (\040)<>\@,;:".\[\]\000-\037\x80-\xff])|"[^\x80-\xff\n\015"]* (?: + \[^\x80-\xff][^\x80-\xff\n\015"]* )* ")[\040\t]* (?: \([^\x80-\xff\n\015()]* (?: + (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* (?:\[^\x80-\xff][^\x80-\xff\n\015()]* + )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* )*)* \@[\040\t]* (?: + \([^\x80-\xff\n\015()]* (?: (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* + (?:\[^\x80-\xff][^\x80-\xff\n\015()]* )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* + )*(?:[^ (\040)<>\@,;:".\[\]\000-\037\x80-\xff]+(?![^ + (\040)<>\@,;:".\[\]\000-\037\x80-\xff])|\[(?: + [^\x80-\xff\n\015\[\]]|\[^\x80-\xff])* \])[\040\t]* (?: \([^\x80-\xff\n\015()]* + (?: (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* + (?:\[^\x80-\xff][^\x80-\xff\n\015()]* )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* + )*(?:\.[\040\t]* (?: \([^\x80-\xff\n\015()]* (?: + (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* (?:\[^\x80-\xff][^\x80-\xff\n\015()]* + )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* )*(?:[^ + (\040)<>\@,;:".\[\]\000-\037\x80-\xff]+(?![^ + (\040)<>\@,;:".\[\]\000-\037\x80-\xff])|\[(?: + [^\x80-\xff\n\015\[\]]|\[^\x80-\xff])* \])[\040\t]* (?: 
\([^\x80-\xff\n\015()]* + (?: (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* + (?:\[^\x80-\xff][^\x80-\xff\n\015()]* )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* + )*)*|(?: [^ (\040)<>\@,;:".\[\]\000-\037\x80-\xff]+(?![^ + (\040)<>\@,;:".\[\]\000-\037\x80-\xff])|"[^\x80-\xff\n\015"]* (?: + \[^\x80-\xff][^\x80-\xff\n\015"]* )* + ")[^()<>\@,;:".\[\]\x80-\xff\000-\010\012-\037]* (?: (?:\([^\x80-\xff\n\015()]* + (?: (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* + (?:\[^\x80-\xff][^\x80-\xff\n\015()]* )*\))[^\x80-\xff\n\015()]* + )*\)|"[^\x80-\xff\n\015"]* (?: \[^\x80-\xff][^\x80-\xff\n\015"]* )* + ")[^()<>\@,;:".\[\]\x80-\xff\000-\010\012-\037]* )*< [\040\t]* (?: + \([^\x80-\xff\n\015()]* (?: (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* + (?:\[^\x80-\xff][^\x80-\xff\n\015()]* )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* + )*(?:\@[\040\t]* (?: \([^\x80-\xff\n\015()]* (?: + (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* (?:\[^\x80-\xff][^\x80-\xff\n\015()]* + )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* )*(?:[^ + (\040)<>\@,;:".\[\]\000-\037\x80-\xff]+(?![^ + (\040)<>\@,;:".\[\]\000-\037\x80-\xff])|\[(?: + [^\x80-\xff\n\015\[\]]|\[^\x80-\xff])* \])[\040\t]* (?: \([^\x80-\xff\n\015()]* + (?: (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* + (?:\[^\x80-\xff][^\x80-\xff\n\015()]* )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* + )*(?:\.[\040\t]* (?: \([^\x80-\xff\n\015()]* (?: + (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* (?:\[^\x80-\xff][^\x80-\xff\n\015()]* + )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* )*(?:[^ + (\040)<>\@,;:".\[\]\000-\037\x80-\xff]+(?![^ + (\040)<>\@,;:".\[\]\000-\037\x80-\xff])|\[(?: + [^\x80-\xff\n\015\[\]]|\[^\x80-\xff])* \])[\040\t]* (?: \([^\x80-\xff\n\015()]* + (?: (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* + (?:\[^\x80-\xff][^\x80-\xff\n\015()]* )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* + )*)*(?: , [\040\t]* (?: \([^\x80-\xff\n\015()]* (?: + (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* (?:\[^\x80-\xff][^\x80-\xff\n\015()]* + )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* )*\@[\040\t]* (?: + 
\([^\x80-\xff\n\015()]* (?: (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* + (?:\[^\x80-\xff][^\x80-\xff\n\015()]* )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* + )*(?:[^ (\040)<>\@,;:".\[\]\000-\037\x80-\xff]+(?![^ + (\040)<>\@,;:".\[\]\000-\037\x80-\xff])|\[(?: + [^\x80-\xff\n\015\[\]]|\[^\x80-\xff])* \])[\040\t]* (?: \([^\x80-\xff\n\015()]* + (?: (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* + (?:\[^\x80-\xff][^\x80-\xff\n\015()]* )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* + )*(?:\.[\040\t]* (?: \([^\x80-\xff\n\015()]* (?: + (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* (?:\[^\x80-\xff][^\x80-\xff\n\015()]* + )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* )*(?:[^ + (\040)<>\@,;:".\[\]\000-\037\x80-\xff]+(?![^ + (\040)<>\@,;:".\[\]\000-\037\x80-\xff])|\[(?: + [^\x80-\xff\n\015\[\]]|\[^\x80-\xff])* \])[\040\t]* (?: \([^\x80-\xff\n\015()]* + (?: (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* + (?:\[^\x80-\xff][^\x80-\xff\n\015()]* )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* + )*)*)* :[\040\t]* (?: \([^\x80-\xff\n\015()]* (?: + (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* (?:\[^\x80-\xff][^\x80-\xff\n\015()]* + )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* )*)?(?: [^ + (\040)<>\@,;:".\[\]\000-\037\x80-\xff]+(?![^ + (\040)<>\@,;:".\[\]\000-\037\x80-\xff])|"[^\x80-\xff\n\015"]* (?: + \[^\x80-\xff][^\x80-\xff\n\015"]* )* ")[\040\t]* (?: \([^\x80-\xff\n\015()]* (?: + (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* (?:\[^\x80-\xff][^\x80-\xff\n\015()]* + )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* )*(?: \.[\040\t]* (?: + \([^\x80-\xff\n\015()]* (?: (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* + (?:\[^\x80-\xff][^\x80-\xff\n\015()]* )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* + )*(?: [^ (\040)<>\@,;:".\[\]\000-\037\x80-\xff]+(?![^ + (\040)<>\@,;:".\[\]\000-\037\x80-\xff])|"[^\x80-\xff\n\015"]* (?: + \[^\x80-\xff][^\x80-\xff\n\015"]* )* ")[\040\t]* (?: \([^\x80-\xff\n\015()]* (?: + (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* (?:\[^\x80-\xff][^\x80-\xff\n\015()]* + )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* )*)* \@[\040\t]* (?: + 
\([^\x80-\xff\n\015()]* (?: (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* + (?:\[^\x80-\xff][^\x80-\xff\n\015()]* )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* + )*(?:[^ (\040)<>\@,;:".\[\]\000-\037\x80-\xff]+(?![^ + (\040)<>\@,;:".\[\]\000-\037\x80-\xff])|\[(?: + [^\x80-\xff\n\015\[\]]|\[^\x80-\xff])* \])[\040\t]* (?: \([^\x80-\xff\n\015()]* + (?: (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* + (?:\[^\x80-\xff][^\x80-\xff\n\015()]* )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* + )*(?:\.[\040\t]* (?: \([^\x80-\xff\n\015()]* (?: + (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* (?:\[^\x80-\xff][^\x80-\xff\n\015()]* + )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* )*(?:[^ + (\040)<>\@,;:".\[\]\000-\037\x80-\xff]+(?![^ + (\040)<>\@,;:".\[\]\000-\037\x80-\xff])|\[(?: + [^\x80-\xff\n\015\[\]]|\[^\x80-\xff])* \])[\040\t]* (?: \([^\x80-\xff\n\015()]* + (?: (?:\[^\x80-\xff]|\([^\x80-\xff\n\015()]* + (?:\[^\x80-\xff][^\x80-\xff\n\015()]* )*\))[^\x80-\xff\n\015()]* )*\)[\040\t]* + )*)*>)
