>>I'm coding a "filter" that will clean all the crap out of MS
Word-generated HTML. I am using rereplace() to do this; I want to be
selective in what generated code gets removed.
You will need A LOT of regexes, since the crap MS Word generates is just
beyond all understanding.
Personally, I do it in JS directly from my editor, but it is still done
with regexes.
I started from the code I found in FCKeditor and added my own stuff,
since I found that the FCKeditor
was far too liberal. And I'm still discovering new issues every day.
Here is the code below (note that some of the filters may be obsolete
because of even more efficient
filters I've added in the last lines of code, I still have to clean up a
little).
// Cleans HTML pasted from MS Word.
//
// Runs a fixed sequence of regex replacements over the markup. The order
// matters: later steps rely on attributes and wrappers having been stripped by
// earlier ones (for example, <SPAN> tags are only unwrapped once their
// style/lang attributes are gone).
//
// html:    markup to clean. null/undefined yields ""; any other value is
//          converted with String() before processing.
// returns: the cleaned markup as a string.
//
// NOTE(review): several patterns deal with the non-breaking spaces Word emits.
// The code this was taken from showed a bare space in those places (including
// a redundant "( | )" alternation), which looks like "&nbsp;" that had been
// decoded in transit. The patterns below accept the entity "&nbsp;", the
// U+00A0 character and a plain space, and re-emit whichever one was matched --
// confirm against real editor input.
function cleanWord (html)
{
    // A missing value has nothing to clean (it used to throw on .replace()).
    if (html == null) {
        return "";
    }

    // Re-applies one replacement until the text stops changing, so nesting of
    // any depth is handled. Only used with replacements that make the text
    // strictly shorter, which guarantees termination.
    function replaceUntilStable(text, pattern, replacement) {
        var before;
        do {
            before = text;
            text = text.replace(pattern, replacement);
        } while (text !== before);
        return text;
    }

    var out = String(html);

    // Remove <o:p> elements: whitespace-only ones first, then single-line ones.
    out = out.replace(/<o:p>\s*<\/o:p>/g, "");
    out = out.replace(/<o:p>.*?<\/o:p>/g, "");

    // Remove mso-xxx style declarations.
    out = out.replace(/\s*mso-[^:]+:[^;"]+;?/gi, "");

    // Remove specific style declarations. Each one is handled both when it is
    // followed by ";" and when it is the last declaration before the closing
    // quote (the quote is put back).
    out = out.replace(/\s*MARGIN: 0cm 0cm 0pt\s*;/gi, "");
    out = out.replace(/\s*MARGIN: 0cm 0cm 0pt\s*"/gi, "\"");
    out = out.replace(/\s*TEXT-INDENT: 0cm\s*;/gi, "");
    out = out.replace(/\s*TEXT-INDENT: 0cm\s*"/gi, "\"");
    out = out.replace(/\s*TEXT-ALIGN: [^\s;]+;?"/gi, "\"");
    out = out.replace(/\s*PAGE-BREAK-BEFORE: [^\s;]+;?"/gi, "\"");
    out = out.replace(/\s*FONT-VARIANT: [^\s;]+;?"/gi, "\"");
    out = out.replace(/\s*tab-stops:[^;"]*;?/gi, "");
    out = out.replace(/\s*tab-stops:[^"]*/gi, "");
    out = out.replace(/\s*FONT-FAMILY:[^;"]*;?/gi, "");

    // Remove class attributes.
    out = out.replace(/<(\w[^>]*)\s*class=([^ |>]*)([^>]*)/gi, "<$1$3");

    // Remove double-quoted style attributes, then any empty ones left over.
    out = out.replace(/<(\w[^>]*)style="([^\"]*)"([^>]*)/gi, "<$1$3");
    out = out.replace(/\s*style="\s*"/gi, "");

    // A <SPAN> holding a single (non-breaking) space collapses to that space;
    // a <SPAN> holding nothing but whitespace is dropped.
    out = out.replace(/<SPAN[^>]*>\s*(&nbsp;|\u00a0| )\s*<\/SPAN>/gi, "$1");
    out = out.replace(/<SPAN[^>]*>\s*<\/SPAN>/gi, "");

    // Remove lang attributes.
    out = out.replace(/<(\w[^>]*) lang=([^ |>]*)([^>]*)/gi, "<$1$3");

    // Unwrap attribute-less <SPAN>s. One pass peels one nesting level.
    out = replaceUntilStable(out, /<SPAN\s*>([\s\S]*?)<\/SPAN>/gi, "$1");

    // Remove all font tags.
    out = replaceUntilStable(out, /<\/?FONT[^>]*>/gi, "");

    // Remove XML elements and declarations.
    out = out.replace(/<\\?\?xml[^>]*>/gi, "");

    // Remove tags with an XML namespace prefix, e.g. <o:p></o:p>.
    out = out.replace(/<\/?\w+:[^>]*>/gi, "");

    // Drop empty headings, then strip every attribute from <H1>..<H6>.
    out = out.replace(/<H\d>\s*<\/H\d>/gi, "");
    out = out.replace(/<H1([^>]*)>/gi, "<H1>");
    out = out.replace(/<H2([^>]*)>/gi, "<H2>");
    out = out.replace(/<H3([^>]*)>/gi, "<H3>");
    out = out.replace(/<H4([^>]*)>/gi, "<H4>");
    out = out.replace(/<H5([^>]*)>/gi, "<H5>");
    out = out.replace(/<H6([^>]*)>/gi, "<H6>");

    // An underline/italic/strike element around a lone space adds nothing.
    out = out.replace(/<(U|I|STRIKE)>(&nbsp;|\u00a0| )<\/\1>/g, "$2");

    // Remove HTML comments.
    out = out.replace(/<!--[\s\S]*?-->/gi, "");

    // Remove empty elements. Removing one can leave its parent empty, hence
    // the repeat.
    out = replaceUntilStable(out, /<([^\s>]+)[^>]*>\s*<\/\1>/g, "");

    // Transform bullet paragraphs (leading middle dot, U+00B7) into list
    // items, with and without a spacer <SPAN> after the bullet.
    out = out.replace(
        /<P>\u00b7<SPAN>(?:&nbsp;|\u00a0| )*<\/SPAN>([\s\S]*?)<\/P>/gi,
        "<LI>$1</LI>");
    out = out.replace(
        /<P>\u00b7(?:&nbsp;|\u00a0| )*([\s\S]*?)<\/P>/gi,
        "<LI>$1</LI>");

    // Remove spaces at the beginning.
    out = out.replace(/^(?:&nbsp;|\u00a0| )*\s*/, "");

    // Replace all <P align=center>...</P> because they are overridden by
    // higher style declarations like justify, etc.
    out = out.replace(/<P\s*align=center>([\s\S]*?)<\/P>/gi,
        "<BR><CENTER>$1</CENTER>");

    // Merge adjacent centered runs: drop a useless </CENTER><CENTER> pair.
    out = out.replace(/<\/CENTER>(\s*<BR>\s*)<CENTER>/gi, "$1");

    // Remove a useless <BR> at the start of a <TD>.
    out = out.replace(/(<TD[^>]*>)\s*<BR>\s*/gi, "$1");

    // A <TD> whose content is a single <CENTER> becomes <TD align=center>.
    out = out.replace(
        /(<TD[^>]*)>\s*<CENTER>([\s\S]*?)<\/CENTER>\s*<\/TD>/gi,
        "$1 align=center>$2</TD>");

    // Unwrap a paragraph that opens a <TD>.
    out = out.replace(
        /(<TD[^>]*>)\s*<P[^>]*>([\s\S]*?)\s*<\/P>\s*([\s\S]*?<\/TD>)/gi,
        "$1$2$3");

    return out;
}
--
_______________________________________
REUSE CODE! Use custom tags;
See http://www.contentbox.com/claude/customtags/tagstore.cfm
(Please send any spam to this address: [EMAIL PROTECTED])
Thanks.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
Logware (www.logware.us): a new and convenient web-based time tracking
application. Start tracking and documenting hours spent on a project or with a
client with Logware today. Try it for free with a 15 day trial account.
http://www.houseoffusion.com/banners/view.cfm?bannerid=67
Message: http://www.houseoffusion.com/lists.cfm/link=i:4:225157
Archives: http://www.houseoffusion.com/cf_lists/threads.cfm/4
Subscription: http://www.houseoffusion.com/lists.cfm/link=s:4
Unsubscribe: http://www.houseoffusion.com/cf_lists/unsubscribe.cfm?user=89.70.4
Donations & Support: http://www.houseoffusion.com/tiny.cfm/54