Re: RichTextArea, SafeHtml, and general rich text best practices

Patrice De Saint Steban Tue, 05 Apr 2011 05:59:06 -0700

Hello,

I'm doing the same thing, and I wrote a new sanitizer class that accepts more 
tags and attributes.
I run this sanitizer on the content passed to setHtml() and on the content 
returned by getHtml() of the RichTextArea.
I am sharing the file as an attachment. If you find any issues, please let 
me know.


Patrice

-- 
You received this message because you are subscribed to the Google Groups 
"Google Web Toolkit" group.
To post to this group, send email to [email protected].
To unsubscribe from this group, send email to 
[email protected].
For more options, visit this group at 
http://groups.google.com/group/google-web-toolkit?hl=en.

package cef.kephas.client.utils;

/**
 Copyright (c) 2009 Open Lab, http://www.open-lab.com/
 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice shall be
 included in all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 This class was adapted to GWT by Patrice de Saint Stéban
 */

import java.util.ArrayList;
import java.util.List;
import java.util.Stack;

import com.google.gwt.regexp.shared.MatchResult;
import com.google.gwt.regexp.shared.RegExp;
import com.google.gwt.safehtml.shared.HtmlSanitizer;
import com.google.gwt.safehtml.shared.SafeHtml;
import com.google.gwt.safehtml.shared.SafeHtmlBuilder;
import com.google.gwt.safehtml.shared.SafeHtmlUtils;

public class ComplexeHtmlSanitizer implements HtmlSanitizer {

	public static RegExp forbiddenTags = RegExp
			.compile("^(script|object|embed|link|style|form|input|iframe)$");
	public static RegExp allowedTags = RegExp
			.compile("^(b|p|i|s|a|img|table|thead|tbody|tfoot|tr|th|td|dd|dl|dt|em|h1|h2|h3|h4|h5|h6|li|ul|ol|span|div|strike|strong|"
					+ "sub|sup|pre|del|code|blockquote|strike|kbd|br|hr|area|map|object|embed|param|link|form|small|big|font)$");

	private static RegExp commentRegExp = RegExp.compile("<!--.*"); // <!--.........>
	private static RegExp tagStartRegExp = RegExp
			.compile("<(\\w+\\b)\\s*(.*)/?>$"); // <tag ....props.....>
	private static RegExp tagCloseRegExp = RegExp.compile("</(\\w+\\b)\\s*>$"); // </tag
																				// .........>

	private static RegExp standAloneTags = RegExp.compile("^(img|br|hr)$");
	private static RegExp selfClosed = RegExp.compile("<.+/>");

	private static RegExp attributesRegExp = RegExp.compile(
			"(\\w*)\\s*=\\s*\"([^\"]*)\"", "g"); // prop="...."
	private static RegExp styleRegExp = RegExp.compile(
			"([^\\s^:]+)\\s*:\\s*([^;]+);?", "g"); // color:red;

	private static RegExp urlStyleRegExp = RegExp.compile(
			".*\\b\\s*url\\s*\\(['\"]([^)]*)['\"]\\)", "g"); // url('....')"

	public static RegExp forbiddenStyleRegExp = RegExp.compile(
			"(expression|eval|javascript)\\s*\\(", "g"); // expression(....)"
															// thanks to
															// Ben
															// Summer

	private static ComplexeHtmlSanitizer instance = new ComplexeHtmlSanitizer();

	public static ComplexeHtmlSanitizer getInstance() {
		return instance;
	}

	/**
	 * Used to clean every html before to output it in any html page
	 * 
	 * @param html
	 * @return sanitized html
	 */
	public SafeHtml sanitize(String html) {
		_sanitizeHtml(html);
		return this.safeHtmlBuilder.toSafeHtml();
	}
	
	public static SafeHtml sanitizeHtml(String html) {
		return instance.sanitize(html);
	}

	private void _sanitizeHtml(String html) {
		safeHtmlBuilder = new SafeHtmlBuilder();
		if (html == null || html.isEmpty()) {
			safeHtmlBuilder.append(' ');
			return;
		}
		Stack<String> openTags = new Stack<String>();

		List<String> tokens = tokenize(html);

		// ------------------- LOOP for every token --------------------------
		for (String token : tokens) {
			boolean isAcceptedToken = false;

			MatchResult endMatcher = tagCloseRegExp.exec(token);
			MatchResult startMatcher = tagStartRegExp.exec(token);

			// --------------------------------------------------------------------------------
			// COMMENT <!-- ......... -->
			if (commentRegExp.test(token)) {
				invalidTags.add(token + (token.endsWith("-->") ? "" : "-->"));
				continue;

				// --------------------------------------------------------------------------------
				// OPEN TAG <tag .........>
			} else if (startMatcher != null) {

				// tag name extraction
				String tag = startMatcher.getGroup(1).toLowerCase();

				// -----------------------------------------------------
				// FORBIDDEN TAG <script .........>
				if (forbiddenTags.test(tag)) {
					invalidTags.add("<" + tag + ">");
					continue;

					// -------------------------------------------------- WELL
					// KNOWN TAG
				} else if (allowedTags.test(tag)) {

					String cleanToken = "<" + tag;
					String tokenBody = startMatcher.getGroup(2);

					// first test table consistency
					// table tbody tfoot thead th tr td
					if ("thead".equals(tag) || "tbody".equals(tag)
							|| "tfoot".equals(tag) || "tr".equals(tag)) {
						if (openTags.search("table") < 1) {
							invalidTags.add("<" + tag + ">");
							continue;
						}
					} else if ("td".equals(tag) || "th".equals(tag)) {
						if (openTags.search("tr") < 1) {
							invalidTags.add("<" + tag + ">");
							continue;
						}
					}

					// then test properties

					boolean foundURL = false; // URL flag
					for (MatchResult attributes = attributesRegExp
							.exec(tokenBody); attributes != null; attributes = attributesRegExp
							.exec(tokenBody)) {

						String attr = attributes.getGroup(1).toLowerCase();
						String val = attributes.getGroup(2);

						// we will accept href in case of <A>
						if ("a".equals(tag) && "href".equals(attr)) { // <a
																		// href="......">
							String[] customSchemes = { "http", "https" };
							if (new UrlValidator(customSchemes).isValid(val)) {
								foundURL = true;
							} else {
								// may be it is a mailto?
								// case <a
								// href="mailto:[email protected]?subject=...."
								if (val.toLowerCase().startsWith("mailto:")
										&& val.indexOf("@") >= 0) {
									String val1 = "http://www.";
											+ val.substring(val.indexOf("@") + 1);
									if (new UrlValidator(customSchemes)
											.isValid(val1)) {
										foundURL = true;
									} else {
										invalidTags.add(attr + " " + val);
										val = "";
									}
								} else {
									invalidTags.add(attr + " " + val);
									val = "";
								}
							}

						} else if (tag.matches("img|embed")
								&& "src".equals(attr)) { // <img src="......">
							String[] customSchemes = { "http", "https" };
							if (new UrlValidator(customSchemes).isValid(val)) {
								foundURL = true;
							} else {
								invalidTags.add(attr + " " + val);
								val = "";
							}

						} else if ("href".equals(attr) || "src".equals(attr)) { // <tag
																				// src/href="......">
																				// skipped
							invalidTags.add(tag + " " + attr + " " + val);
							continue;

						} else if (attr.matches("width|height")) { // <tag
																	// width/height="......">
							if (!val.toLowerCase().matches("\\d+%|\\d+$")) { // test
																				// numeric
																				// values
								invalidTags.add(tag + " " + attr + " " + val);
								continue;
							}

						} else if ("style".equals(attr)) { // <tag
															// style="......">

							// then test properties
							MatchResult styles = styleRegExp.exec(val);
							String cleanStyle = "";

							while (styles != null) {
								String styleName = styles.getGroup(1)
										.toLowerCase();
								String styleValue = styles.getGroup(2);

								// suppress invalid styles values
								if (forbiddenStyleRegExp.test(styleValue)) {
									invalidTags.add(tag + " " + attr + " "
											+ styleValue);
									continue;
								}

								// check if valid url
								MatchResult urlStyleMatcher = urlStyleRegExp
										.exec(styleValue);
								if (urlStyleMatcher != null) {
									String[] customSchemes = { "http", "https" };
									String url = urlStyleMatcher.getGroup(1);
									if (!new UrlValidator(customSchemes)
											.isValid(url)) {
										invalidTags.add(tag + " " + attr + " "
												+ styleValue);
										continue;
									}
								}

								cleanStyle = cleanStyle + styleName + ":"
										+ encode(styleValue) + ";";
								styles = styleRegExp.exec(val);
							}
							val = cleanStyle;

						} else if (attr.startsWith("on")) { // skip all
															// JavaScript events
							invalidTags.add(tag + " " + attr + " " + val);
							continue;

						} else { // by default encode all properties
							val = encode(val);
						}

						cleanToken = cleanToken + " " + attr + "=\"" + val
								+ "\"";
					}
					cleanToken = cleanToken + ">";

					isAcceptedToken = true;

					// for <img> and <a>
					if (tag.matches("a|img|embed") && !foundURL) {
						isAcceptedToken = false;
						cleanToken = "";
					}

					token = cleanToken;

					// push the tag if require closure and it is accepted
					// (otherwirse is encoded)
					if (isAcceptedToken
							&& !(standAloneTags.test(tag) || selfClosed
									.test(tag)))
						openTags.push(tag);

					// --------------------------------------------------------------------------------
					// UNKNOWN TAG
				} else {
					invalidTags.add(token);
					continue;
				}

				// --------------------------------------------------------------------------------
				// CLOSE TAG </tag>
			} else if (tagCloseRegExp.test(token)) {
				String tag = endMatcher.getGroup(1).toLowerCase();

				// is self closing
				if (selfClosed.test(tag)) {
					invalidTags.add(token);
					continue;
				}
				if (forbiddenTags.test(tag)) {
					invalidTags.add("/" + tag);
					continue;
				}
				if (!allowedTags.test(tag)) {
					invalidTags.add(token);
					continue;
				} else {
					String cleanToken = "";

					// check tag position in the stack
					int pos = openTags.search(tag);
					// if found on top ok
					for (int i = 1; i <= pos; i++) {
						// pop all elements before tag and close it
						String poppedTag = openTags.pop();
						cleanToken = cleanToken + "</" + poppedTag + ">";
						isAcceptedToken = true;
					}
					token = cleanToken;
				}

			}

			if (isAcceptedToken) {
				this.safeHtmlBuilder.appendHtmlConstant(token);
				// ret.text = ret.text + " ";
			} else {
				String sanToken = htmlEncodeApexesAndTags(token);
				this.safeHtmlBuilder.append(SafeHtmlUtils
						.fromSafeConstant(sanToken));
				this.textBuilder
						.append(htmlEncodeApexesAndTags(removeLineFeed(token)));
			}
		}

		// must close remaining tags
		while (openTags.size() > 0) {
			// pop all elements before tag and close it
			String poppedTag = openTags.pop();
			this.safeHtmlBuilder.appendHtmlConstant("</" + poppedTag + ">");
		}

		// set boolean value
		isValid = invalidTags.size() == 0;
	}

	/**
	 * Splits html tag and tag content <......>.
	 * 
	 * @param html
	 * @return a list of token
	 */
	private static List<String> tokenize(String html) {
		ArrayList<String> tokens = new ArrayList<String>();
		int pos = 0;
		String token = "";
		int len = html.length();
		while (pos < len) {
			char c = html.charAt(pos);

			String ahead = html.substring(pos, pos > len - 4 ? len : pos + 4);

			// a comment is starting
			if ("<!--".equals(ahead)) {
				// store the current token
				if (token.length() > 0)
					tokens.add(token);

				// clear the token
				token = "";

				// serch the end of <......>
				int end = moveToMarkerEnd(pos, "-->", html);
				tokens.add(html.substring(pos, end));
				pos = end;

				// a new "<" token is starting
			} else if ('<' == c) {

				// store the current token
				if (token.length() > 0)
					tokens.add(token);

				// clear the token
				token = "";

				// serch the end of <......>
				int end = moveToMarkerEnd(pos, ">", html);
				tokens.add(html.substring(pos, end));
				pos = end;

			} else {
				token = token + c;
				pos++;
			}

		}

		// store the last token
		if (token.length() > 0)
			tokens.add(token);

		return tokens;
	}

	private static int moveToMarkerEnd(int pos, String marker, String s) {
		int i = s.indexOf(marker, pos);
		if (i > -1)
			pos = i + marker.length();
		else
			pos = s.length();
		return pos;
	}

	/**
	 * Contains the sanitizing results. html is the sanitized html encoded ready
	 * to be printed. Unaccepted tag are encode, text inside tag is always
	 * encoded MUST BE USED WHEN PRINTING HTML text is the text inside valid
	 * tags. Contains invalid tags encoded SHOULD BE USED TO PRINT EXCERPTS
	 * isValid is true when every tag is accepted without forcing encoding
	 * invalidTags is the list of encoded-killed tags
	 */
	private SafeHtmlBuilder safeHtmlBuilder = new SafeHtmlBuilder();
	private StringBuilder textBuilder = new StringBuilder();
	private boolean isValid = true;
	private List<String> invalidTags = new ArrayList<String>();

	public SafeHtml getHtml() {
		return safeHtmlBuilder.toSafeHtml();
	}

	public void setHtml(SafeHtml html) {
		this.safeHtmlBuilder = new SafeHtmlBuilder().append(html);
	}

	public void setHtml(String html) {
		_sanitizeHtml(html);
	}

	public String getText() {
		return textBuilder.toString();
	}

	public void setText(StringBuilder text) {
		this.textBuilder = text;
	}

	public boolean isValid() {
		return isValid;
	}

	public void setValid(boolean isValid) {
		this.isValid = isValid;
	}

	public List<String> getInvalidTags() {
		return invalidTags;
	}

	public void setInvalidTags(List<String> invalidTags) {
		this.invalidTags = invalidTags;
	}

	public static String encode(String s) {
		return convertLineFeedToBR(htmlEncodeApexesAndTags(s == null ? "" : s));
	}

	public static final String htmlEncodeApexesAndTags(String source) {
		return htmlEncodeTag(htmlEncodeApexes(source));
	}

	public static final String htmlEncodeApexes(String source) {
		if (source != null) {
			String result = replaceAllNoRegex(source,
					new String[] { "\"", "'" }, new String[] { "&quot;",
							"&#39;" });
			return result;
		} else
			return null;
	}

	public static final String htmlEncodeTag(String source) {
		if (source != null) {
			String result = replaceAllNoRegex(source,
					new String[] { "<", ">" }, new String[] { "&lt;", "&gt;" });
			return result;
		} else
			return null;
	}

	public static String convertLineFeedToBR(String text) {
		if (text != null)
			return replaceAllNoRegex(text, new String[] { "\n", "\f", "\r" },
					new String[] { "<br/>", "<br/>", " " });
		else
			return null;
	}

	public static String removeLineFeed(String text) {

		if (text != null)
			return replaceAllNoRegex(text, new String[] { "\n", "\f", "\r" },
					new String[] { " ", " ", " " });
		else
			return null;
	}

	public static final String replaceAllNoRegex(String source,
			String searches[], String replaces[]) {
		int k;
		String tmp = source;
		for (k = 0; k < searches.length; k++)
			tmp = replaceAllNoRegex(tmp, searches[k], replaces[k]);
		return tmp;
	}

	public static final String replaceAllNoRegex(String source, String search,
			String replace) {
		StringBuffer buffer = new StringBuffer();
		if (source != null) {
			if (search.length() == 0)
				return source;
			int oldPos, pos;
			for (oldPos = 0, pos = source.indexOf(search, oldPos); pos != -1; oldPos = pos
					+ search.length(), pos = source.indexOf(search, oldPos)) {
				buffer.append(source.substring(oldPos, pos));
				buffer.append(replace);
			}
			if (oldPos < source.length())
				buffer.append(source.substring(oldPos));
		}
		return new String(buffer);
	}
}

Re: RichTextArea, SafeHtml, and general rich text best practices

Reply via email to