Re: Need REGEX help

Claude Schneegans Wed, 23 Nov 2005 18:06:02 -0800

 >>I'm coding a "filter" that will clean all the crap out of MS
Word-generated HTML.  I am using rereplace() to do this; I want to be
selective in what generated code gets removed.


You will need A LOT of Regex, since the crap MS Word generates is just 
beyond all understanding.
Personally, I do it in JS directly from my editor, but they are still 
Regex.
I started from the code I found in FCKEditor and added my own stuff, 
since I found that the FCKeditor
was far too liberal. And I'm still discovering new issues every day.

Here is the code below (note that some of the filters may be obsolete 
because of even more efficient
filters I've added in the last lines of code, I still have to clean up a 
little).

/**
 * Cleans HTML that was pasted from MS Word by running it through a fixed
 * sequence of regular-expression replacements.
 *
 * The order of the steps matters: later patterns rely on attributes and
 * wrappers having been stripped by earlier ones (e.g. the bullet-list step
 * only matches a bare <P>, so class/style/lang must already be gone).
 *
 * @param {string} html - HTML fragment to clean; must be a string.
 * @returns {string} The cleaned HTML fragment.
 */
function cleanWord (html)
    {
    var pass ;

    // Drop Office paragraph markers: empty ones first, then any that
    // carry content on a single line.
    html = html.replace( /<o:p>\s*<\/o:p>/g, "" ) ;
    html = html.replace( /<o:p>.*?<\/o:p>/g, "" ) ;

    // Remove mso-xxx styles.
    html = html.replace( /\s*mso-[^:]+:[^;"]+;?/gi, "" ) ;

    // Remove margin styles (one form terminated by ';', one by the
    // closing quote of the style attribute).
    html = html.replace( /\s*MARGIN: 0cm 0cm 0pt\s*;/gi, "" ) ;
    html = html.replace( /\s*MARGIN: 0cm 0cm 0pt\s*"/gi, "\"" ) ;

    html = html.replace( /\s*TEXT-INDENT: 0cm\s*;/gi, "" ) ;
    html = html.replace( /\s*TEXT-INDENT: 0cm\s*"/gi, "\"" ) ;

    // These three are only removed when they are the last declaration
    // before the closing quote of the attribute.
    html = html.replace( /\s*TEXT-ALIGN: [^\s;]+;?"/gi, "\"" ) ;
    html = html.replace( /\s*PAGE-BREAK-BEFORE: [^\s;]+;?"/gi, "\"" ) ;
    html = html.replace( /\s*FONT-VARIANT: [^\s;]+;?"/gi, "\"" ) ;

    html = html.replace( /\s*tab-stops:[^;"]*;?/gi, "" ) ;
    html = html.replace( /\s*tab-stops:[^"]*/gi, "" ) ;

    html = html.replace( /\s*FONT-FAMILY:[^;"]*;?/gi, "" ) ;

    // Remove class attributes (unquoted value up to the next space, '|' or '>').
    html = html.replace( /<(\w[^>]*)\s*class=([^ |>]*)([^>]*)/gi, "<$1$3" ) ;

    // Remove double-quoted style attributes entirely.
    html = html.replace( /<(\w[^>]*)style="([^\"]*)"([^>]*)/gi, "<$1$3" ) ;

    // Remove empty styles.
    html = html.replace( /\s*style="\s*"/gi, '' ) ;

    // A SPAN holding only a non-breaking space keeps the space; an empty
    // SPAN disappears altogether.
    html = html.replace( /<SPAN[^>]*>\s*&nbsp;\s*<\/SPAN>/gi, '&nbsp;' ) ;
    html = html.replace( /<SPAN[^>]*>\s*<\/SPAN>/gi, '' ) ;

    // Remove lang attributes.
    html = html.replace( /<(\w[^>]*) lang=([^ |>]*)([^>]*)/gi, "<$1$3" ) ;

    // Unwrap attribute-less SPANs. Three passes, because each pass pairs an
    // opening tag with the nearest closing tag and so peels nested SPANs
    // only partially.
    for ( pass = 0 ; pass < 3 ; pass++ )
        html = html.replace( /<SPAN\s*>([\s\S]*?)<\/SPAN>/gi, '$1' ) ;

    // Remove all FONT tags. The pattern matches one tag at a time, so a
    // single global pass already removes every opening and closing tag.
    html = html.replace( /<\/?FONT[^>]*>/gi, '' ) ;

    // Remove XML declarations such as <?xml ...>.
    html = html.replace( /<\\?\?xml[^>]*>/gi, "" ) ;

    // Remove tags with an XML namespace prefix: <o:p>, </o:p>, <st1:...>
    html = html.replace( /<\/?\w+:[^>]*>/gi, "" ) ;

    // Remove attribute-less headings that contain only whitespace.
    html = html.replace( /<H\d>\s*<\/H\d>/gi, '' ) ;

    // Strip every attribute from H1..H6 opening tags (and upper-case them).
    html = html.replace( /<H([1-6])[^>]*>/gi, '<H$1>' ) ;

    html = html.replace( /<(U|I|STRIKE)>&nbsp;<\/\1>/g, '&nbsp;' ) ;

    // Remove HTML comments.
    html = html.replace( /<!--[\s\S]*?-->/gi, '' ) ;

    // Remove empty tags. Three passes, so that wrappers which become empty
    // once their empty child is removed are caught as well.
    for ( pass = 0 ; pass < 3 ; pass++ )
        html = html.replace( /<([^\s>]+)[^>]*>\s*<\/\1>/g, '' ) ;

    // Transform bullet paragraphs into list items. \u00B7 is the middle dot
    // used as the bullet character; it is escaped so the pattern does not
    // depend on the encoding of this file.
    html = html.replace( /<P>\u00B7<SPAN>(&nbsp;| )*<\/SPAN>([\s\S]*?)<\/P>/gi, "<LI>$2</LI>" ) ;
    html = html.replace( /<P>\u00B7(&nbsp;| )*([\s\S]*?)<\/P>/gi, "<LI>$2</LI>" ) ;

    // Remove spaces at the beginning.
    html = html.replace( /^(&nbsp;| )*\s*/, '' ) ;

    // Replace all <P align=center>...</P> because they are overridden by
    // higher style declarations like justify, etc.
    html = html.replace( /<P\s*align=center>([\s\S]*?)<\/P>/gi, '<BR><CENTER>$1</CENTER>' ) ;

    // Merge adjacent centered runs: remove useless </CENTER><CENTER>.
    html = html.replace( /<\/CENTER>(\s*<BR>\s*)<CENTER>/gi, '$1' ) ;

    // Remove useless <BR> at the start of a <TD>.
    html = html.replace( /(<TD[^>]*>)\s*<BR>\s*/gi, '$1' ) ;

    // A TD whose whole content is one <CENTER> becomes <TD align=center>.
    html = html.replace( /(<TD[^>]*)>\s*<CENTER>([\s\S]*?)<\/CENTER>\s*<\/TD>/gi,
        '$1 align=center>$2</TD>' ) ;

    // Unwrap the leading paragraph inside a TD.
    html = html.replace( /(<TD[^>]*>)\s*<P[^>]*>([\s\S]*?)\s*<\/P>\s*([\s\S]*?<\/TD>)/gi,
        '$1$2$3' ) ;

    return html ;
    }

-- 
_______________________________________
REUSE CODE! Use custom tags;
See http://www.contentbox.com/claude/customtags/tagstore.cfm
(Please send any spam to this address: [EMAIL PROTECTED])
Thanks.



~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
Logware (www.logware.us): a new and convenient web-based time tracking 
application. Start tracking and documenting hours spent on a project or with a 
client with Logware today. Try it for free with a 15 day trial account.
http://www.houseoffusion.com/banners/view.cfm?bannerid=67

Message: http://www.houseoffusion.com/lists.cfm/link=i:4:225157
Archives: http://www.houseoffusion.com/cf_lists/threads.cfm/4
Subscription: http://www.houseoffusion.com/lists.cfm/link=s:4
Unsubscribe: http://www.houseoffusion.com/cf_lists/unsubscribe.cfm?user=89.70.4
Donations & Support: http://www.houseoffusion.com/tiny.cfm/54

Re: Need REGEX help

Reply via email to