http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/scanners.h
----------------------------------------------------------------------
diff --git a/compiler/modules/CommonMark/src/scanners.h b/compiler/modules/CommonMark/src/scanners.h
index f360505..a6a71bf 100644
--- a/compiler/modules/CommonMark/src/scanners.h
+++ b/compiler/modules/CommonMark/src/scanners.h
@@ -5,25 +5,40 @@
 extern "C" {
 #endif
 
-int _scan_at(int (*scanner)(const unsigned char *), cmark_chunk *c, int offset);
-int _scan_autolink_uri(const unsigned char *p);
-int _scan_autolink_email(const unsigned char *p);
-int _scan_html_tag(const unsigned char *p);
-int _scan_html_block_tag(const unsigned char *p);
-int _scan_link_url(const unsigned char *p);
-int _scan_link_title(const unsigned char *p);
-int _scan_spacechars(const unsigned char *p);
-int _scan_atx_header_start(const unsigned char *p);
-int _scan_setext_header_line(const unsigned char *p);
-int _scan_hrule(const unsigned char *p);
-int _scan_open_code_fence(const unsigned char *p);
-int _scan_close_code_fence(const unsigned char *p);
-int _scan_entity(const unsigned char *p);
+bufsize_t _scan_at(bufsize_t (*scanner)(const unsigned char *), cmark_chunk *c, bufsize_t offset);
+bufsize_t _scan_scheme(const unsigned char *p);
+bufsize_t _scan_autolink_uri(const unsigned char *p);
+bufsize_t _scan_autolink_email(const unsigned char *p);
+bufsize_t _scan_html_tag(const unsigned char *p);
+bufsize_t _scan_html_block_start(const unsigned char *p);
+bufsize_t _scan_html_block_start_7(const unsigned char *p);
+bufsize_t _scan_html_block_end_1(const unsigned char *p);
+bufsize_t _scan_html_block_end_2(const unsigned char *p);
+bufsize_t _scan_html_block_end_3(const unsigned char *p);
+bufsize_t _scan_html_block_end_4(const unsigned char *p);
+bufsize_t _scan_html_block_end_5(const unsigned char *p);
+bufsize_t _scan_link_url(const unsigned char *p);
+bufsize_t _scan_link_title(const unsigned char *p);
+bufsize_t _scan_spacechars(const unsigned char *p);
+bufsize_t _scan_atx_header_start(const unsigned char *p);
+bufsize_t _scan_setext_header_line(const unsigned char *p);
+bufsize_t _scan_hrule(const unsigned char *p);
+bufsize_t _scan_open_code_fence(const unsigned char *p);
+bufsize_t _scan_close_code_fence(const unsigned char *p);
+bufsize_t _scan_entity(const unsigned char *p);
+bufsize_t _scan_dangerous_url(const unsigned char *p);
 
+#define scan_scheme(c, n) _scan_at(&_scan_scheme, c, n)
 #define scan_autolink_uri(c, n) _scan_at(&_scan_autolink_uri, c, n)
 #define scan_autolink_email(c, n) _scan_at(&_scan_autolink_email, c, n)
 #define scan_html_tag(c, n) _scan_at(&_scan_html_tag, c, n)
-#define scan_html_block_tag(c, n) _scan_at(&_scan_html_block_tag, c, n)
+#define scan_html_block_start(c, n) _scan_at(&_scan_html_block_start, c, n)
+#define scan_html_block_start_7(c, n) _scan_at(&_scan_html_block_start_7, c, n)
+#define scan_html_block_end_1(c, n) _scan_at(&_scan_html_block_end_1, c, n)
+#define scan_html_block_end_2(c, n) _scan_at(&_scan_html_block_end_2, c, n)
+#define scan_html_block_end_3(c, n) _scan_at(&_scan_html_block_end_3, c, n)
+#define scan_html_block_end_4(c, n) _scan_at(&_scan_html_block_end_4, c, n)
+#define scan_html_block_end_5(c, n) _scan_at(&_scan_html_block_end_5, c, n)
 #define scan_link_url(c, n) _scan_at(&_scan_link_url, c, n)
 #define scan_link_title(c, n) _scan_at(&_scan_link_title, c, n)
 #define scan_spacechars(c, n) _scan_at(&_scan_spacechars, c, n)
@@ -33,6 +48,7 @@ int _scan_entity(const unsigned char *p);
 #define scan_open_code_fence(c, n) _scan_at(&_scan_open_code_fence, c, n)
 #define scan_close_code_fence(c, n) _scan_at(&_scan_close_code_fence, c, n)
 #define scan_entity(c, n) _scan_at(&_scan_entity, c, n)
+#define scan_dangerous_url(c, n) _scan_at(&_scan_dangerous_url, c, n)
 
 #ifdef __cplusplus
 }

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/scanners.re
----------------------------------------------------------------------
diff --git a/compiler/modules/CommonMark/src/scanners.re b/compiler/modules/CommonMark/src/scanners.re
index d83efde..fbe3283 100644
--- a/compiler/modules/CommonMark/src/scanners.re
+++ b/compiler/modules/CommonMark/src/scanners.re
@@ -2,9 +2,9 @@
 #include "chunk.h"
 #include "scanners.h"
 
-int _scan_at(int (*scanner)(const unsigned char *), cmark_chunk *c, int offset)
+bufsize_t _scan_at(bufsize_t (*scanner)(const unsigned char *), cmark_chunk *c, bufsize_t offset)
 {
-       int res;
+       bufsize_t res;
        unsigned char *ptr = (unsigned char *)c->data;
        unsigned char lim = ptr[c->len];
 
@@ -24,15 +24,15 @@ int _scan_at(int (*scanner)(const unsigned char *), cmark_chunk *c, int offset)
 
   wordchar = [^\x00-\x20];
 
-  spacechar = [ \t\n];
+  spacechar = [ \t\v\f\r\n];
 
   reg_char     = [^\\()\x00-\x20];
 
   escaped_char = [\\][!"#$%&'()*+,./:;<=>?@[\\\]^_`{|}~-];
 
-  tagname = [A-Za-z][A-Za-z0-9]*;
+  tagname = [A-Za-z][A-Za-z0-9-]*;
 
-  blocktagname = 'article'|'header'|'aside'|'hgroup'|'iframe'|'blockquote'|'hr'|'body'|'li'|'map'|'button'|'object'|'canvas'|'ol'|'caption'|'output'|'col'|'p'|'colgroup'|'pre'|'dd'|'progress'|'div'|'section'|'dl'|'table'|'td'|'dt'|'tbody'|'embed'|'textarea'|'fieldset'|'tfoot'|'figcaption'|'th'|'figure'|'thead'|'footer'|'footer'|'tr'|'form'|'ul'|'h1'|'h2'|'h3'|'h4'|'h5'|'h6'|'video'|'script'|'style';
+  blocktagname = 'address'|'article'|'aside'|'base'|'basefont'|'blockquote'|'body'|'caption'|'center'|'col'|'colgroup'|'dd'|'details'|'dialog'|'dir'|'div'|'dl'|'dt'|'fieldset'|'figcaption'|'figure'|'footer'|'form'|'frame'|'frameset'|'h1'|'head'|'header'|'hr'|'html'|'legend'|'li'|'link'|'main'|'menu'|'menuitem'|'meta'|'nav'|'noframes'|'ol'|'optgroup'|'option'|'p'|'param'|'pre'|'section'|'source'|'title'|'summary'|'table'|'tbody'|'td'|'tfoot'|'th'|'thead'|'title'|'tr'|'track'|'ul';
 
   attributename = [a-zA-Z_:][a-zA-Z0-9:._-]*;
 
@@ -60,7 +60,7 @@ int _scan_at(int (*scanner)(const unsigned char *), cmark_chunk *c, int offset)
   htmltag = opentag | closetag | htmlcomment | processinginstruction |
             declaration | cdata;
 
-  in_parens_nosp   = [(] (reg_char|escaped_char)* [)];
+  in_parens_nosp   = [(] (reg_char|escaped_char|[\\])* [)];
 
   in_double_quotes = ["] (escaped_char|[^"\x00])* ["];
   in_single_quotes = ['] (escaped_char|[^'\x00])* ['];
@@ -69,19 +69,30 @@ int _scan_at(int (*scanner)(const unsigned char *), cmark_chunk *c, int offset)
   scheme = 'coap'|'doi'|'javascript'|'aaa'|'aaas'|'about'|'acap'|'cap'|'cid'|'crid'|'data'|'dav'|'dict'|'dns'|'file'|'ftp'|'geo'|'go'|'gopher'|'h323'|'http'|'https'|'iax'|'icap'|'im'|'imap'|'info'|'ipp'|'iris'|'iris.beep'|'iris.xpc'|'iris.xpcs'|'iris.lwz'|'ldap'|'mailto'|'mid'|'msrp'|'msrps'|'mtqp'|'mupdate'|'news'|'nfs'|'ni'|'nih'|'nntp'|'opaquelocktoken'|'pop'|'pres'|'rtsp'|'service'|'session'|'shttp'|'sieve'|'sip'|'sips'|'sms'|'snmp'|'soap.beep'|'soap.beeps'|'tag'|'tel'|'telnet'|'tftp'|'thismessage'|'tn3270'|'tip'|'tv'|'urn'|'vemmi'|'ws'|'wss'|'xcon'|'xcon-userid'|'xmlrpc.beep'|'xmlrpc.beeps'|'xmpp'|'z39.50r'|'z39.50s'|'adiumxtra'|'afp'|'afs'|'aim'|'apt'|'attachment'|'aw'|'beshare'|'bitcoin'|'bolo'|'callto'|'chrome'|'chrome-extension'|'com-eventbrite-attendee'|'content'|'cvs'|'dlna-playsingle'|'dlna-playcontainer'|'dtn'|'dvb'|'ed2k'|'facetime'|'feed'|'finger'|'fish'|'gg'|'git'|'gizmoproject'|'gtalk'|'hcp'|'icon'|'ipn'|'irc'|'irc6'|'ircs'|'itms'|'jar'|'jms'|'keyparc'|'lastfm'|'ldaps'|'magnet'|'maps'|'market'|'message'|'mms'|'ms-help'|'msnim'|'mumble'|'mvn'|'notes'|'oid'|'palm'|'paparazzi'|'platform'|'proxy'|'psyc'|'query'|'res'|'resource'|'rmi'|'rsync'|'rtmp'|'secondlife'|'sftp'|'sgn'|'skype'|'smb'|'soldat'|'spotify'|'ssh'|'steam'|'svn'|'teamspeak'|'things'|'udp'|'unreal'|'ut2004'|'ventrilo'|'view-source'|'webcal'|'wtai'|'wyciwyg'|'xfire'|'xri'|'ymsgr';
 */
 
+// Try to match a scheme including colon.
+bufsize_t _scan_scheme(const unsigned char *p)
+{
+  const unsigned char *marker = NULL;
+  const unsigned char *start = p;
+/*!re2c
+  scheme [:] { return (bufsize_t)(p - start); }
+  .? { return 0; }
+*/
+}
+
 // Try to match URI autolink after first <, returning number of chars matched.
-int _scan_autolink_uri(const unsigned char *p)
+bufsize_t _scan_autolink_uri(const unsigned char *p)
 {
   const unsigned char *marker = NULL;
   const unsigned char *start = p;
 /*!re2c
-  scheme [:]([^\x00-\x20<>\\]|escaped_char)*[>]  { return (p - start); }
+  scheme [:][^\x00-\x20<>]*[>]  { return (bufsize_t)(p - start); }
   .? { return 0; }
 */
 }
 
 // Try to match email autolink after first <, returning num of chars matched.
-int _scan_autolink_email(const unsigned char *p)
+bufsize_t _scan_autolink_email(const unsigned char *p)
 {
   const unsigned char *marker = NULL;
   const unsigned char *start = p;
@@ -90,32 +101,101 @@ int _scan_autolink_email(const unsigned char *p)
     [@]
     [a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
     ([.][a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*
-    [>] { return (p - start); }
+    [>] { return (bufsize_t)(p - start); }
   .? { return 0; }
 */
 }
 
 // Try to match an HTML tag after first <, returning num of chars matched.
-int _scan_html_tag(const unsigned char *p)
+bufsize_t _scan_html_tag(const unsigned char *p)
+{
+  const unsigned char *marker = NULL;
+  const unsigned char *start = p;
+/*!re2c
+  htmltag { return (bufsize_t)(p - start); }
+  .? { return 0; }
+*/
+}
+
+// Try to match an HTML block tag start line, returning
+// an integer code for the type of block (1-6, matching the spec).
+// #7 is handled by a separate function, below.
+bufsize_t _scan_html_block_start(const unsigned char *p)
+{
+  const unsigned char *marker = NULL;
+/*!re2c
+  [<] ('script'|'pre'|'style') (spacechar | [>]) { return 1; }
+  '<!--' { return 2; }
+  '<?' { return 3; }
+  '<!' [A-Z] { return 4; }
+  '<![CDATA[' { return 5; }
+  [<] [/]? blocktagname (spacechar | [/]? [>])  { return 6; }
+  .? { return 0; }
+*/
+}
+
+// Try to match an HTML block tag start line of type 7, returning
+// 7 if successful, 0 if not.
+bufsize_t _scan_html_block_start_7(const unsigned char *p)
+{
+  const unsigned char *marker = NULL;
+/*!re2c
+  [<] (opentag | closetag) [\t\n\f ]* [\r\n] { return 7; }
+  .? { return 0; }
+*/
+}
+
+// Try to match an HTML block end line of type 1
+bufsize_t _scan_html_block_end_1(const unsigned char *p)
+{
+  const unsigned char *marker = NULL;
+  const unsigned char *start = p;
+/*!re2c
+  .* [<] [/] ('script'|'pre'|'style') [>] { return (bufsize_t)(p - start); }
+  .? { return 0; }
+*/
+}
+
+// Try to match an HTML block end line of type 2
+bufsize_t _scan_html_block_end_2(const unsigned char *p)
+{
+  const unsigned char *marker = NULL;
+  const unsigned char *start = p;
+/*!re2c
+  .* '-->' { return (bufsize_t)(p - start); }
+  .? { return 0; }
+*/
+}
+
+// Try to match an HTML block end line of type 3
+bufsize_t _scan_html_block_end_3(const unsigned char *p)
+{
+  const unsigned char *marker = NULL;
+  const unsigned char *start = p;
+/*!re2c
+  .* '?>' { return (bufsize_t)(p - start); }
+  .? { return 0; }
+*/
+}
+
+// Try to match an HTML block end line of type 4
+bufsize_t _scan_html_block_end_4(const unsigned char *p)
 {
   const unsigned char *marker = NULL;
   const unsigned char *start = p;
 /*!re2c
-  htmltag { return (p - start); }
+  .* '>' { return (bufsize_t)(p - start); }
   .? { return 0; }
 */
 }
 
-// Try to match an HTML block tag including first <,
-// returning num of chars matched.
-int _scan_html_block_tag(const unsigned char *p)
+// Try to match an HTML block end line of type 5
+bufsize_t _scan_html_block_end_5(const unsigned char *p)
 {
   const unsigned char *marker = NULL;
   const unsigned char *start = p;
 /*!re2c
-  [<] [/] blocktagname (spacechar | [>])  { return (p - start); }
-  [<] blocktagname (spacechar | [/>]) { return (p - start); }
-  [<] [!?] { return (p - start); }
+  .* ']]>' { return (bufsize_t)(p - start); }
   .? { return 0; }
 */
 }
@@ -124,13 +204,13 @@ int _scan_html_block_tag(const unsigned char *p)
 // This may optionally be contained in <..>; otherwise
 // whitespace and unbalanced right parentheses aren't allowed.
 // Newlines aren't ever allowed.
-int _scan_link_url(const unsigned char *p)
+bufsize_t _scan_link_url(const unsigned char *p)
 {
   const unsigned char *marker = NULL;
   const unsigned char *start = p;
 /*!re2c
-  [ \n]* [<] ([^<>\n\\\x00] | escaped_char | [\\])* [>] { return (p - start); }
-  [ \n]* (reg_char+ | escaped_char | in_parens_nosp)* { return (p - start); }
+  [ \r\n]* [<] ([^<>\r\n\\\x00] | escaped_char | [\\])* [>] { return (bufsize_t)(p - start); }
+  [ \r\n]* (reg_char+ | escaped_char | in_parens_nosp | [\\][^()])* { return (bufsize_t)(p - start); }
   .? { return 0; }
 */
 }
@@ -138,47 +218,48 @@ int _scan_link_url(const unsigned char *p)
 // Try to match a link title (in single quotes, in double quotes, or
 // in parentheses), returning number of chars matched.  Allow one
 // level of internal nesting (quotes within quotes).
-int _scan_link_title(const unsigned char *p)
+bufsize_t _scan_link_title(const unsigned char *p)
 {
   const unsigned char *marker = NULL;
   const unsigned char *start = p;
 /*!re2c
-  ["] (escaped_char|[^"\x00])* ["]   { return (p - start); }
-  ['] (escaped_char|[^'\x00])* ['] { return (p - start); }
-  [(] (escaped_char|[^)\x00])* [)]  { return (p - start); }
+  ["] (escaped_char|[^"\x00])* ["]   { return (bufsize_t)(p - start); }
+  ['] (escaped_char|[^'\x00])* ['] { return (bufsize_t)(p - start); }
+  [(] (escaped_char|[^)\x00])* [)]  { return (bufsize_t)(p - start); }
   .? { return 0; }
 */
 }
 
 // Match space characters, including newlines.
-int _scan_spacechars(const unsigned char *p)
+bufsize_t _scan_spacechars(const unsigned char *p)
 {
+  const unsigned char *marker = NULL;
   const unsigned char *start = p; \
 /*!re2c
-  [ \t\n]* { return (p - start); }
+  [ \t\v\f\r\n]* { return (bufsize_t)(p - start); }
   . { return 0; }
 */
 }
 
 // Match ATX header start.
-int _scan_atx_header_start(const unsigned char *p)
+bufsize_t _scan_atx_header_start(const unsigned char *p)
 {
   const unsigned char *marker = NULL;
   const unsigned char *start = p;
 /*!re2c
-  [#]{1,6} ([ ]+|[\n])  { return (p - start); }
+  [#]{1,6} ([ ]+|[\r\n])  { return (bufsize_t)(p - start); }
   .? { return 0; }
 */
 }
 
-// Match sexext header line.  Return 1 for level-1 header,
+// Match setext header line.  Return 1 for level-1 header,
 // 2 for level-2, 0 for no match.
-int _scan_setext_header_line(const unsigned char *p)
+bufsize_t _scan_setext_header_line(const unsigned char *p)
 {
   const unsigned char *marker = NULL;
 /*!re2c
-  [=]+ [ ]* [\n] { return 1; }
-  [-]+ [ ]* [\n] { return 2; }
+  [=]+ [ ]* [\r\n] { return 1; }
+  [-]+ [ ]* [\r\n] { return 2; }
   .? { return 0; }
 */
 }
@@ -186,51 +267,65 @@ int _scan_setext_header_line(const unsigned char *p)
 // Scan a horizontal rule line: "...three or more hyphens, asterisks,
 // or underscores on a line by themselves. If you wish, you may use
 // spaces between the hyphens or asterisks."
-int _scan_hrule(const unsigned char *p)
+bufsize_t _scan_hrule(const unsigned char *p)
 {
   const unsigned char *marker = NULL;
   const unsigned char *start = p;
 /*!re2c
-  ([*][ ]*){3,} [ \t]* [\n] { return (p - start); }
-  ([_][ ]*){3,} [ \t]* [\n] { return (p - start); }
-  ([-][ ]*){3,} [ \t]* [\n] { return (p - start); }
+  ([*][ ]*){3,} [ \t]* [\r\n] { return (bufsize_t)(p - start); }
+  ([_][ ]*){3,} [ \t]* [\r\n] { return (bufsize_t)(p - start); }
+  ([-][ ]*){3,} [ \t]* [\r\n] { return (bufsize_t)(p - start); }
   .? { return 0; }
 */
 }
 
 // Scan an opening code fence.
-int _scan_open_code_fence(const unsigned char *p)
+bufsize_t _scan_open_code_fence(const unsigned char *p)
 {
   const unsigned char *marker = NULL;
   const unsigned char *start = p;
 /*!re2c
-  [`]{3,} / [^`\n\x00]*[\n] { return (p - start); }
-  [~]{3,} / [^~\n\x00]*[\n] { return (p - start); }
+  [`]{3,} / [^`\r\n\x00]*[\r\n] { return (bufsize_t)(p - start); }
+  [~]{3,} / [^~\r\n\x00]*[\r\n] { return (bufsize_t)(p - start); }
   .?                        { return 0; }
 */
 }
 
 // Scan a closing code fence with length at least len.
-int _scan_close_code_fence(const unsigned char *p)
+bufsize_t _scan_close_code_fence(const unsigned char *p)
 {
   const unsigned char *marker = NULL;
   const unsigned char *start = p;
 /*!re2c
-  [`]{3,} / [ \t]*[\n] { return (p - start); }
-  [~]{3,} / [ \t]*[\n] { return (p - start); }
+  [`]{3,} / [ \t]*[\r\n] { return (bufsize_t)(p - start); }
+  [~]{3,} / [ \t]*[\r\n] { return (bufsize_t)(p - start); }
   .? { return 0; }
 */
 }
 
 // Scans an entity.
 // Returns number of chars matched.
-int _scan_entity(const unsigned char *p)
+bufsize_t _scan_entity(const unsigned char *p)
 {
   const unsigned char *marker = NULL;
   const unsigned char *start = p;
 /*!re2c
   [&] ([#] ([Xx][A-Fa-f0-9]{1,8}|[0-9]{1,8}) |[A-Za-z][A-Za-z0-9]{1,31} ) [;]
-     { return (p - start); }
+     { return (bufsize_t)(p - start); }
+  .? { return 0; }
+*/
+}
+
+// Returns positive value if a URL begins in a way that is potentially
+// dangerous, with javascript:, vbscript:, file:, or data:, otherwise 0.
+bufsize_t _scan_dangerous_url(const unsigned char *p)
+{
+  const unsigned char *marker = NULL;
+  const unsigned char *start = p;
+/*!re2c
+  'data:image/' ('png'|'gif'|'jpeg'|'webp') { return 0; }
+  'javascript:' | 'vbscript:' | 'file:' | 'data:' { return (bufsize_t)(p - start); }
   .? { return 0; }
 */
 }
+

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/utf8.c
----------------------------------------------------------------------
diff --git a/compiler/modules/CommonMark/src/utf8.c b/compiler/modules/CommonMark/src/utf8.c
index d77c5d1..ffe6652 100644
--- a/compiler/modules/CommonMark/src/utf8.c
+++ b/compiler/modules/CommonMark/src/utf8.c
@@ -30,7 +30,7 @@ static void encode_unknown(cmark_strbuf *buf)
        cmark_strbuf_put(buf, repl, 3);
 }
 
-static int utf8proc_charlen(const uint8_t *str, int str_len)
+static int utf8proc_charlen(const uint8_t *str, bufsize_t str_len)
 {
        int length, i;
 
@@ -42,7 +42,7 @@ static int utf8proc_charlen(const uint8_t *str, int str_len)
        if (!length)
                return -1;
 
-       if (str_len >= 0 && length > str_len)
+       if (str_len >= 0 && (bufsize_t)length > str_len)
                return -str_len;
 
        for (i = 1; i < length; i++) {
@@ -54,23 +54,20 @@ static int utf8proc_charlen(const uint8_t *str, int str_len)
 }
 
 // Validate a single UTF-8 character according to RFC 3629.
-static int utf8proc_valid(const uint8_t *str, int str_len)
+static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
 {
-       int length = utf8proc_charlen(str, str_len);
+       int length = utf8proc_utf8class[str[0]];
 
-       if (length <= 0)
-               return length;
+       if (!length)
+               return -1;
 
-       switch (length) {
-       case 1:
-               if (str[0] == 0x00) {
-                       // ASCII NUL is technically valid but rejected
-                       // for security reasons.
-                       return -length;
-               }
-               break;
+       if ((bufsize_t)length > str_len)
+               return -str_len;
 
+       switch (length) {
        case 2:
+               if ((str[1] & 0xC0) != 0x80)
+                       return -1;
                if (str[0] < 0xC2) {
                        // Overlong
                        return -length;
@@ -78,6 +75,10 @@ static int utf8proc_valid(const uint8_t *str, int str_len)
                break;
 
        case 3:
+               if ((str[1] & 0xC0) != 0x80)
+                       return -1;
+               if ((str[2] & 0xC0) != 0x80)
+                       return -2;
                if (str[0] == 0xE0) {
                        if (str[1] < 0xA0) {
                                // Overlong
@@ -92,6 +93,12 @@ static int utf8proc_valid(const uint8_t *str, int str_len)
                break;
 
        case 4:
+               if ((str[1] & 0xC0) != 0x80)
+                       return -1;
+               if ((str[2] & 0xC0) != 0x80)
+                       return -2;
+               if ((str[3] & 0xC0) != 0x80)
+                       return -3;
                if (str[0] == 0xF0) {
                        if (str[1] < 0x90) {
                                // Overlong
@@ -109,49 +116,47 @@ static int utf8proc_valid(const uint8_t *str, int str_len)
        return length;
 }
 
-void utf8proc_detab(cmark_strbuf *ob, const uint8_t *line, size_t size)
+void utf8proc_check(cmark_strbuf *ob, const uint8_t *line, bufsize_t size)
 {
-       static const uint8_t whitespace[] = "    ";
-
-       size_t i = 0, tab = 0;
+       bufsize_t i = 0;
 
        while (i < size) {
-               size_t org = i;
-
-               while (i < size && line[i] != '\t' && line[i] != '\0'
-                      && line[i] < 0x80) {
-                       i++;
-                       tab++;
+               bufsize_t org = i;
+               int charlen = 0;
+
+               while (i < size) {
+                       if (line[i] < 0x80 && line[i] != 0) {
+                               i++;
+                       } else if (line[i] >= 0x80) {
+                               charlen = utf8proc_valid(line + i, size - i);
+                               if (charlen < 0) {
+                                       charlen = -charlen;
+                                       break;
+                               }
+                               i += charlen;
+                       } else if (line[i] == 0) {
+                               // ASCII NUL is technically valid but rejected
+                               // for security reasons.
+                               charlen = 1;
+                               break;
+                       }
                }
 
-               if (i > org)
+               if (i > org) {
                        cmark_strbuf_put(ob, line + org, i - org);
+               }
 
-               if (i >= size)
+               if (i >= size) {
                        break;
-
-               if (line[i] == '\t') {
-                       int numspaces = 4 - (tab % 4);
-                       cmark_strbuf_put(ob, whitespace, numspaces);
-                       i += 1;
-                       tab += numspaces;
                } else {
-                       int charlen = utf8proc_valid(line + i, size - i);
-
-                       if (charlen >= 0) {
-                               cmark_strbuf_put(ob, line + i, charlen);
-                       } else {
-                               encode_unknown(ob);
-                               charlen = -charlen;
-                       }
-
+                       // Invalid UTF-8
+                       encode_unknown(ob);
                        i += charlen;
-                       tab += 1;
                }
        }
 }
 
-int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst)
+int utf8proc_iterate(const uint8_t *str, bufsize_t str_len, int32_t *dst)
 {
        int length;
        int32_t uc = -1;
@@ -172,8 +177,7 @@ int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst)
        case 3:
                uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) <<  6)
                     + (str[2] & 0x3F);
-               if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||
-                   (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1;
+               if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000)) uc = -1;
                break;
        case 4:
                uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
@@ -182,7 +186,7 @@ int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst)
                break;
        }
 
-       if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE))
+       if (uc < 0)
                return -1;
 
        *dst = uc;
@@ -192,7 +196,7 @@ int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst)
 void utf8proc_encode_char(int32_t uc, cmark_strbuf *buf)
 {
        uint8_t dst[4];
-       int len = 0;
+       bufsize_t len = 0;
 
        assert(uc >= 0);
 
@@ -228,7 +232,7 @@ void utf8proc_encode_char(int32_t uc, cmark_strbuf *buf)
        cmark_strbuf_put(buf, dst, len);
 }
 
-void utf8proc_case_fold(cmark_strbuf *dest, const uint8_t *str, int len)
+void utf8proc_case_fold(cmark_strbuf *dest, const uint8_t *str, bufsize_t len)
 {
        int32_t c;
 
@@ -236,7 +240,7 @@ void utf8proc_case_fold(cmark_strbuf *dest, const uint8_t *str, int len)
        utf8proc_encode_char(x, dest)
 
        while (len > 0) {
-               int char_len = utf8proc_iterate(str, len, &c);
+               bufsize_t char_len = utf8proc_iterate(str, len, &c);
 
                if (char_len >= 0) {
 #include "case_fold_switch.inc"

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/utf8.h
----------------------------------------------------------------------
diff --git a/compiler/modules/CommonMark/src/utf8.h b/compiler/modules/CommonMark/src/utf8.h
index 7df1573..9f1a4ec 100644
--- a/compiler/modules/CommonMark/src/utf8.h
+++ b/compiler/modules/CommonMark/src/utf8.h
@@ -8,10 +8,10 @@
 extern "C" {
 #endif
 
-void utf8proc_case_fold(cmark_strbuf *dest, const uint8_t *str, int len);
+void utf8proc_case_fold(cmark_strbuf *dest, const uint8_t *str, bufsize_t len);
 void utf8proc_encode_char(int32_t uc, cmark_strbuf *buf);
-int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst);
-void utf8proc_detab(cmark_strbuf *dest, const uint8_t *line, size_t size);
+int utf8proc_iterate(const uint8_t *str, bufsize_t str_len, int32_t *dst);
+void utf8proc_check(cmark_strbuf *dest, const uint8_t *line, bufsize_t size);
 int utf8proc_is_space(int32_t uc);
 int utf8proc_is_punctuation(int32_t uc);
 

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/xml.c
----------------------------------------------------------------------
diff --git a/compiler/modules/CommonMark/src/xml.c b/compiler/modules/CommonMark/src/xml.c
index f630aba..7eec5a6 100644
--- a/compiler/modules/CommonMark/src/xml.c
+++ b/compiler/modules/CommonMark/src/xml.c
@@ -11,14 +11,9 @@
 
 // Functions to convert cmark_nodes to XML strings.
 
-static void escape_xml(cmark_strbuf *dest, const unsigned char *source, int length)
+static void escape_xml(cmark_strbuf *dest, const unsigned char *source, bufsize_t length)
 {
-       if (source != NULL) {
-               if (length < 0)
-                       length = strlen((char *)source);
-
-               houdini_escape_html0(dest, source, (size_t)length, 0);
-       }
+       houdini_escape_html0(dest, source, length, 0);
 }
 
 struct render_state {
@@ -36,7 +31,7 @@ static inline void indent(struct render_state *state)
 
 static int
 S_render_node(cmark_node *node, cmark_event_type ev_type,
-              struct render_state *state, long options)
+              struct render_state *state, int options)
 {
        cmark_strbuf *xml = state->xml;
        bool literal = false;
@@ -118,10 +113,12 @@ S_render_node(cmark_node *node, cmark_event_type ev_type,
                case CMARK_NODE_LINK:
                case CMARK_NODE_IMAGE:
                        cmark_strbuf_puts(xml, " destination=\"");
-                       escape_xml(xml, node->as.link.url, -1);
+                       escape_xml(xml, node->as.link.url.data,
+                                  node->as.link.url.len);
                        cmark_strbuf_putc(xml, '"');
                        cmark_strbuf_puts(xml, " title=\"");
-                       escape_xml(xml, node->as.link.title, -1);
+                       escape_xml(xml, node->as.link.title.data,
+                                  node->as.link.title.len);
                        cmark_strbuf_putc(xml, '"');
                        break;
                default:
@@ -145,7 +142,7 @@ S_render_node(cmark_node *node, cmark_event_type ev_type,
        return 1;
 }
 
-char *cmark_render_xml(cmark_node *root, long options)
+char *cmark_render_xml(cmark_node *root, int options)
 {
        char *result;
        cmark_strbuf xml = GH_BUF_INIT;
@@ -153,10 +150,6 @@ char *cmark_render_xml(cmark_node *root, long options)
        cmark_node *cur;
        struct render_state state = { &xml, 0 };
 
-       if (options & CMARK_OPT_NORMALIZE) {
-               cmark_consolidate_text_nodes(root);
-       }
-
        cmark_iter *iter = cmark_iter_new(root);
 
        cmark_strbuf_puts(state.xml,
@@ -170,6 +163,5 @@ char *cmark_render_xml(cmark_node *root, long options)
        result = (char *)cmark_strbuf_detach(&xml);
 
        cmark_iter_free(iter);
-       cmark_strbuf_free(&xml);
        return result;
 }

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/src/CFCCHtml.c
----------------------------------------------------------------------
diff --git a/compiler/src/CFCCHtml.c b/compiler/src/CFCCHtml.c
index 0409a7b..34b4336 100644
--- a/compiler/src/CFCCHtml.c
+++ b/compiler/src/CFCCHtml.c
@@ -756,7 +756,10 @@ S_html_create_inheritance(CFCClass *klass) {
 
 static char*
 S_md_to_html(CFCClass *klass, const char *md) {
-    cmark_node *doc = cmark_parse_document(md, strlen(md));
+    int options = CMARK_OPT_SMART
+                  | CMARK_OPT_VALIDATE_UTF8
+                  | CMARK_OPT_SAFE;
+    cmark_node *doc = cmark_parse_document(md, strlen(md), options);
     S_convert_uris(klass, doc);
     char *html = cmark_render_html(doc, CMARK_OPT_DEFAULT);
     cmark_node_free(doc);

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/src/CFCCMan.c
----------------------------------------------------------------------
diff --git a/compiler/src/CFCCMan.c b/compiler/src/CFCCMan.c
index 354d91f..79248c8 100644
--- a/compiler/src/CFCCMan.c
+++ b/compiler/src/CFCCMan.c
@@ -410,7 +410,11 @@ S_man_create_inheritance(CFCClass *klass) {
 
 static char*
 S_md_to_man(CFCClass *klass, const char *md, int needs_indent) {
-    cmark_node *doc = cmark_parse_document(md, strlen(md));
+    int options = CMARK_OPT_NORMALIZE
+                  | CMARK_OPT_SMART
+                  | CMARK_OPT_VALIDATE_UTF8
+                  | CMARK_OPT_SAFE;
+    cmark_node *doc = cmark_parse_document(md, strlen(md), options);
     char *result = S_nodes_to_man(klass, doc, needs_indent);
     cmark_node_free(doc);
 

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/src/CFCPerlPod.c
----------------------------------------------------------------------
diff --git a/compiler/src/CFCPerlPod.c b/compiler/src/CFCPerlPod.c
index 24d6c4b..8fc2eae 100644
--- a/compiler/src/CFCPerlPod.c
+++ b/compiler/src/CFCPerlPod.c
@@ -322,7 +322,11 @@ CFCPerlPod_gen_subroutine_pod(CFCFunction *func,
 
 char*
 CFCPerlPod_md_to_pod(const char *md, CFCClass *klass, int header_level) {
-    cmark_node *doc = cmark_parse_document(md, strlen(md));
+    int options = CMARK_OPT_NORMALIZE
+                  | CMARK_OPT_SMART
+                  | CMARK_OPT_VALIDATE_UTF8
+                  | CMARK_OPT_SAFE;
+    cmark_node *doc = cmark_parse_document(md, strlen(md), options);
     char *pod = S_nodes_to_pod(doc, klass, header_level);
     cmark_node_free(doc);
 

Reply via email to