Repository: yetus
Updated Branches:
  refs/heads/master c3a2359f6 -> a12ceced0


YETUS-457. RDM does not properly escape entities.

Signed-off-by: Allen Wittenauer <a...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/yetus/repo
Commit: http://git-wip-us.apache.org/repos/asf/yetus/commit/a12ceced
Tree: http://git-wip-us.apache.org/repos/asf/yetus/tree/a12ceced
Diff: http://git-wip-us.apache.org/repos/asf/yetus/diff/a12ceced

Branch: refs/heads/master
Commit: a12ceced072c1dc01c285492a8ffda27d20e898c
Parents: c3a2359
Author: Andrew Wang <andrew.w...@cloudera.com>
Authored: Tue Sep 6 14:30:48 2016 -0700
Committer: Allen Wittenauer <a...@apache.org>
Committed: Thu Sep 15 18:16:34 2016 -0700

----------------------------------------------------------------------
 release-doc-maker/releasedocmaker.py | 12 ++---
 release-doc-maker/utils.py           | 80 ++++++++++++++++++++-----------
 2 files changed, 59 insertions(+), 33 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/yetus/blob/a12ceced/release-doc-maker/releasedocmaker.py
----------------------------------------------------------------------
diff --git a/release-doc-maker/releasedocmaker.py b/release-doc-maker/releasedocmaker.py
index 9987355..589a8c3 100755
--- a/release-doc-maker/releasedocmaker.py
+++ b/release-doc-maker/releasedocmaker.py
@@ -29,7 +29,7 @@ import urllib
 import urllib2
 import httplib
 import json
-from utils import to_unicode, text_sanitize, processrelnote, Outputs
+from utils import to_unicode, sanitize_text, processrelnote, Outputs
 
 
 try:
@@ -531,11 +531,11 @@ class Linter(object):
             if self._filters["incompatible"] and jira.get_incompatible_change():
                 self._warning_count += 1
                 self._lint_message += "\nWARNING: incompatible change %s lacks release notes." % \
-                                (text_sanitize(jira.get_id()))
+                                (sanitize_text(jira.get_id()))
             if self._filters["important"] and jira.get_important():
                 self._warning_count += 1
                 self._lint_message += "\nWARNING: important issue %s lacks release notes." % \
-                                (text_sanitize(jira.get_id()))
+                                (sanitize_text(jira.get_id()))
 
         if self._check_version_string(jira):
             self._warning_count += 1
@@ -815,10 +815,10 @@ def main():
             else:
                 otherlist.append(jira)
 
-            line = '* [%s](' % (text_sanitize(jira.get_id())) + JIRA_BASE_URL + \
+            line = '* [%s](' % (sanitize_text(jira.get_id())) + JIRA_BASE_URL + \
                    '/browse/%s) | *%s* | **%s**\n' \
-                   % (text_sanitize(jira.get_id()),
-                      text_sanitize(jira.get_priority()), text_sanitize(jira.get_summary()))
+                   % (sanitize_text(jira.get_id()),
+                      sanitize_text(jira.get_priority()), sanitize_text(jira.get_summary()))
 
             if len(jira.get_release_note()) > 0 or \
                jira.get_incompatible_change() or jira.get_important():

http://git-wip-us.apache.org/repos/asf/yetus/blob/a12ceced/release-doc-maker/utils.py
----------------------------------------------------------------------
diff --git a/release-doc-maker/utils.py b/release-doc-maker/utils.py
index 0ef9290..55de3d5 100644
--- a/release-doc-maker/utils.py
+++ b/release-doc-maker/utils.py
@@ -23,7 +23,7 @@ BASE_URL = "https://issues.apache.org/jira";
 
 
 def clean(input_string):
-    return markdown_sanitize(re.sub(NAME_PATTERN, "", input_string))
+    return sanitize_markdown(re.sub(NAME_PATTERN, "", input_string))
 
 
 def format_components(input_string):
@@ -36,28 +36,54 @@ def format_components(input_string):
     return clean(ret)
 
 
-# convert to utf-8
-def markdown_sanitize(input_string):
-    input_string = input_string.encode('utf-8')
+# Return the string encoded as UTF-8.
+#
+# This is necessary for handling markdown in Python.
+def encode_utf8(input_string):
+    return input_string.encode('utf-8')
+
+
+# Sanitize Markdown input so it can be handled by Python.
+#
+# The expectation is that the input is already valid Markdown,
+# so no additional escaping is required.
+def sanitize_markdown(input_string):
+    input_string = encode_utf8(input_string)
     input_string = input_string.replace("\r", "")
     input_string = input_string.rstrip()
     return input_string
 
 
-# same thing as markdownsanitize,
-# except markdown metachars are also
-# escaped as well as more
-# things we don't want doxia, etc, to
-# screw up
-def text_sanitize(input_string):
-    input_string = markdown_sanitize(input_string)
-    input_string = input_string.replace("_", r"\_")
-    input_string = input_string.replace("|", r"\|")
-    input_string = input_string.replace("<", r"\<")
-    input_string = input_string.replace(">", r"\>")
-    input_string = input_string.replace("*", r"\*")
-    input_string = input_string.rstrip()
-    return input_string
+# Sanitize arbitrary text so it can be embedded in MultiMarkdown output.
+#
+# Note that MultiMarkdown is not Markdown, and cannot be parsed as such.
+# For instance, when using pandoc, invoke it as `pandoc -f markdown_mmd`.
+#
+# Calls sanitize_markdown at the end as a final pass.
+def sanitize_text(input_string):
+    escapes = dict()
+    # See: https://daringfireball.net/projects/markdown/syntax#backslash
+    # We only escape a subset of special characters. We ignore characters
+    # that only have significance at the start of a line.
+    slash_escapes = "_<>*|"
+    slash_escapes += "'"
+    slash_escapes += "\\"
+    all_chars = set()
+    # Construct a set of escapes
+    for c in slash_escapes:
+        all_chars.add(c)
+    for c in all_chars:
+        escapes[c] = "\\" + c
+
+    # Build the output string character by character to prevent double escaping
+    output_string = ""
+    for c in input_string:
+        o = c
+        if c in escapes:
+            o = escapes[c]
+        output_string += o
+
+    return sanitize_markdown(output_string.rstrip())
 
 
 # if release notes have a special marker,
@@ -66,12 +92,12 @@ def processrelnote(input_string):
     relnote_pattern = re.compile('^\<\!\-\- ([a-z]+) \-\-\>')
     fmt = relnote_pattern.match(input_string)
     if fmt is None:
-        return text_sanitize(input_string)
+        return sanitize_text(input_string)
     else:
         return {
-            'markdown': markdown_sanitize(input_string),
+            'markdown': sanitize_markdown(input_string),
         }.get(
-            fmt.group(1), text_sanitize(input_string))
+            fmt.group(1), sanitize_text(input_string))
 
 
 def to_unicode(obj):
@@ -117,11 +143,11 @@ class Outputs(object):
         for jira in sorted(mylist):
             line = '| [%s](' + BASE_URL + '/browse/%s) ' +\
                    '| %s |  %s | %s | %s | %s |\n'
-            line = line % (text_sanitize(jira.get_id()),
-                           text_sanitize(jira.get_id()),
-                           text_sanitize(jira.get_summary()),
-                           text_sanitize(jira.get_priority()),
+            line = line % (encode_utf8(jira.get_id()),
+                           encode_utf8(jira.get_id()),
+                           sanitize_text(jira.get_summary()),
+                           sanitize_text(jira.get_priority()),
                            format_components(jira.get_components()),
-                           text_sanitize(jira.get_reporter()),
-                           text_sanitize(jira.get_assignee()))
+                           sanitize_text(jira.get_reporter()),
+                           sanitize_text(jira.get_assignee()))
             self.write_key_raw(jira.get_project(), line)

Reply via email to