Repository: yetus Updated Branches: refs/heads/master c3a2359f6 -> a12ceced0
YETUS-457. RDM does not properly escape entities. Signed-off-by: Allen Wittenauer <a...@apache.org> Project: http://git-wip-us.apache.org/repos/asf/yetus/repo Commit: http://git-wip-us.apache.org/repos/asf/yetus/commit/a12ceced Tree: http://git-wip-us.apache.org/repos/asf/yetus/tree/a12ceced Diff: http://git-wip-us.apache.org/repos/asf/yetus/diff/a12ceced Branch: refs/heads/master Commit: a12ceced072c1dc01c285492a8ffda27d20e898c Parents: c3a2359 Author: Andrew Wang <andrew.w...@cloudera.com> Authored: Tue Sep 6 14:30:48 2016 -0700 Committer: Allen Wittenauer <a...@apache.org> Committed: Thu Sep 15 18:16:34 2016 -0700 ---------------------------------------------------------------------- release-doc-maker/releasedocmaker.py | 12 ++--- release-doc-maker/utils.py | 80 ++++++++++++++++++++----------- 2 files changed, 59 insertions(+), 33 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/yetus/blob/a12ceced/release-doc-maker/releasedocmaker.py ---------------------------------------------------------------------- diff --git a/release-doc-maker/releasedocmaker.py b/release-doc-maker/releasedocmaker.py index 9987355..589a8c3 100755 --- a/release-doc-maker/releasedocmaker.py +++ b/release-doc-maker/releasedocmaker.py @@ -29,7 +29,7 @@ import urllib import urllib2 import httplib import json -from utils import to_unicode, text_sanitize, processrelnote, Outputs +from utils import to_unicode, sanitize_text, processrelnote, Outputs try: @@ -531,11 +531,11 @@ class Linter(object): if self._filters["incompatible"] and jira.get_incompatible_change(): self._warning_count += 1 self._lint_message += "\nWARNING: incompatible change %s lacks release notes." % \ - (text_sanitize(jira.get_id())) + (sanitize_text(jira.get_id())) if self._filters["important"] and jira.get_important(): self._warning_count += 1 self._lint_message += "\nWARNING: important issue %s lacks release notes." % \ - (text_sanitize(jira.get_id())) + (sanitize_text(jira.get_id())) if self._check_version_string(jira): self._warning_count += 1 @@ -815,10 +815,10 @@ def main(): else: otherlist.append(jira) - line = '* [%s](' % (text_sanitize(jira.get_id())) + JIRA_BASE_URL + \ + line = '* [%s](' % (sanitize_text(jira.get_id())) + JIRA_BASE_URL + \ '/browse/%s) | *%s* | **%s**\n' \ - % (text_sanitize(jira.get_id()), - text_sanitize(jira.get_priority()), text_sanitize(jira.get_summary())) + % (sanitize_text(jira.get_id()), + sanitize_text(jira.get_priority()), sanitize_text(jira.get_summary())) if len(jira.get_release_note()) > 0 or \ jira.get_incompatible_change() or jira.get_important(): http://git-wip-us.apache.org/repos/asf/yetus/blob/a12ceced/release-doc-maker/utils.py ---------------------------------------------------------------------- diff --git a/release-doc-maker/utils.py b/release-doc-maker/utils.py index 0ef9290..55de3d5 100644 --- a/release-doc-maker/utils.py +++ b/release-doc-maker/utils.py @@ -23,7 +23,7 @@ BASE_URL = "https://issues.apache.org/jira" def clean(input_string): - return markdown_sanitize(re.sub(NAME_PATTERN, "", input_string)) + return sanitize_markdown(re.sub(NAME_PATTERN, "", input_string)) def format_components(input_string): @@ -36,28 +36,54 @@ def format_components(input_string): return clean(ret) -# convert to utf-8 -def markdown_sanitize(input_string): - input_string = input_string.encode('utf-8') +# Return the string encoded as UTF-8. +# +# This is necessary for handling markdown in Python. +def encode_utf8(input_string): + return input_string.encode('utf-8') + + +# Sanitize Markdown input so it can be handled by Python. +# +# The expectation is that the input is already valid Markdown, +# so no additional escaping is required. +def sanitize_markdown(input_string): + input_string = encode_utf8(input_string) input_string = input_string.replace("\r", "") input_string = input_string.rstrip() return input_string -# same thing as markdownsanitize, -# except markdown metachars are also -# escaped as well as more -# things we don't want doxia, etc, to -# screw up -def text_sanitize(input_string): - input_string = markdown_sanitize(input_string) - input_string = input_string.replace("_", r"\_") - input_string = input_string.replace("|", r"\|") - input_string = input_string.replace("<", r"\<") - input_string = input_string.replace(">", r"\>") - input_string = input_string.replace("*", r"\*") - input_string = input_string.rstrip() - return input_string +# Sanitize arbitrary text so it can be embedded in MultiMarkdown output. +# +# Note that MultiMarkdown is not Markdown, and cannot be parsed as such. +# For instance, when using pandoc, invoke it as `pandoc -f markdown_mmd`. +# +# Calls sanitize_markdown at the end as a final pass. +def sanitize_text(input_string): + escapes = dict() + # See: https://daringfireball.net/projects/markdown/syntax#backslash + # We only escape a subset of special characters. We ignore characters + # that only have significance at the start of a line. + slash_escapes = "_<>*|" + slash_escapes += "'" + slash_escapes += "\\" + all_chars = set() + # Construct a set of escapes + for c in slash_escapes: + all_chars.add(c) + for c in all_chars: + escapes[c] = "\\" + c + + # Build the output string character by character to prevent double escaping + output_string = "" + for c in input_string: + o = c + if c in escapes: + o = escapes[c] + output_string += o + + return sanitize_markdown(output_string.rstrip()) # if release notes have a special marker, @@ -66,12 +92,12 @@ def processrelnote(input_string): relnote_pattern = re.compile('^\<\!\-\- ([a-z]+) \-\-\>') fmt = relnote_pattern.match(input_string) if fmt is None: - return text_sanitize(input_string) + return sanitize_text(input_string) else: return { - 'markdown': markdown_sanitize(input_string), + 'markdown': sanitize_markdown(input_string), }.get( - fmt.group(1), text_sanitize(input_string)) + fmt.group(1), sanitize_text(input_string)) def to_unicode(obj): @@ -117,11 +143,11 @@ class Outputs(object): for jira in sorted(mylist): line = '| [%s](' + BASE_URL + '/browse/%s) ' +\ '| %s | %s | %s | %s | %s |\n' - line = line % (text_sanitize(jira.get_id()), - text_sanitize(jira.get_id()), - text_sanitize(jira.get_summary()), - text_sanitize(jira.get_priority()), + line = line % (encode_utf8(jira.get_id()), + encode_utf8(jira.get_id()), + sanitize_text(jira.get_summary()), + sanitize_text(jira.get_priority()), format_components(jira.get_components()), - text_sanitize(jira.get_reporter()), - text_sanitize(jira.get_assignee())) + sanitize_text(jira.get_reporter()), + sanitize_text(jira.get_assignee())) self.write_key_raw(jira.get_project(), line)