http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/scanners.h ---------------------------------------------------------------------- diff --git a/compiler/modules/CommonMark/src/scanners.h b/compiler/modules/CommonMark/src/scanners.h index f360505..a6a71bf 100644 --- a/compiler/modules/CommonMark/src/scanners.h +++ b/compiler/modules/CommonMark/src/scanners.h @@ -5,25 +5,40 @@ extern "C" { #endif -int _scan_at(int (*scanner)(const unsigned char *), cmark_chunk *c, int offset); -int _scan_autolink_uri(const unsigned char *p); -int _scan_autolink_email(const unsigned char *p); -int _scan_html_tag(const unsigned char *p); -int _scan_html_block_tag(const unsigned char *p); -int _scan_link_url(const unsigned char *p); -int _scan_link_title(const unsigned char *p); -int _scan_spacechars(const unsigned char *p); -int _scan_atx_header_start(const unsigned char *p); -int _scan_setext_header_line(const unsigned char *p); -int _scan_hrule(const unsigned char *p); -int _scan_open_code_fence(const unsigned char *p); -int _scan_close_code_fence(const unsigned char *p); -int _scan_entity(const unsigned char *p); +bufsize_t _scan_at(bufsize_t (*scanner)(const unsigned char *), cmark_chunk *c, bufsize_t offset); +bufsize_t _scan_scheme(const unsigned char *p); +bufsize_t _scan_autolink_uri(const unsigned char *p); +bufsize_t _scan_autolink_email(const unsigned char *p); +bufsize_t _scan_html_tag(const unsigned char *p); +bufsize_t _scan_html_block_start(const unsigned char *p); +bufsize_t _scan_html_block_start_7(const unsigned char *p); +bufsize_t _scan_html_block_end_1(const unsigned char *p); +bufsize_t _scan_html_block_end_2(const unsigned char *p); +bufsize_t _scan_html_block_end_3(const unsigned char *p); +bufsize_t _scan_html_block_end_4(const unsigned char *p); +bufsize_t _scan_html_block_end_5(const unsigned char *p); +bufsize_t _scan_link_url(const unsigned char *p); +bufsize_t _scan_link_title(const unsigned char *p); 
+bufsize_t _scan_spacechars(const unsigned char *p); +bufsize_t _scan_atx_header_start(const unsigned char *p); +bufsize_t _scan_setext_header_line(const unsigned char *p); +bufsize_t _scan_hrule(const unsigned char *p); +bufsize_t _scan_open_code_fence(const unsigned char *p); +bufsize_t _scan_close_code_fence(const unsigned char *p); +bufsize_t _scan_entity(const unsigned char *p); +bufsize_t _scan_dangerous_url(const unsigned char *p); +#define scan_scheme(c, n) _scan_at(&_scan_scheme, c, n) #define scan_autolink_uri(c, n) _scan_at(&_scan_autolink_uri, c, n) #define scan_autolink_email(c, n) _scan_at(&_scan_autolink_email, c, n) #define scan_html_tag(c, n) _scan_at(&_scan_html_tag, c, n) -#define scan_html_block_tag(c, n) _scan_at(&_scan_html_block_tag, c, n) +#define scan_html_block_start(c, n) _scan_at(&_scan_html_block_start, c, n) +#define scan_html_block_start_7(c, n) _scan_at(&_scan_html_block_start_7, c, n) +#define scan_html_block_end_1(c, n) _scan_at(&_scan_html_block_end_1, c, n) +#define scan_html_block_end_2(c, n) _scan_at(&_scan_html_block_end_2, c, n) +#define scan_html_block_end_3(c, n) _scan_at(&_scan_html_block_end_3, c, n) +#define scan_html_block_end_4(c, n) _scan_at(&_scan_html_block_end_4, c, n) +#define scan_html_block_end_5(c, n) _scan_at(&_scan_html_block_end_5, c, n) #define scan_link_url(c, n) _scan_at(&_scan_link_url, c, n) #define scan_link_title(c, n) _scan_at(&_scan_link_title, c, n) #define scan_spacechars(c, n) _scan_at(&_scan_spacechars, c, n) @@ -33,6 +48,7 @@ int _scan_entity(const unsigned char *p); #define scan_open_code_fence(c, n) _scan_at(&_scan_open_code_fence, c, n) #define scan_close_code_fence(c, n) _scan_at(&_scan_close_code_fence, c, n) #define scan_entity(c, n) _scan_at(&_scan_entity, c, n) +#define scan_dangerous_url(c, n) _scan_at(&_scan_dangerous_url, c, n) #ifdef __cplusplus }
http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/scanners.re ---------------------------------------------------------------------- diff --git a/compiler/modules/CommonMark/src/scanners.re b/compiler/modules/CommonMark/src/scanners.re index d83efde..fbe3283 100644 --- a/compiler/modules/CommonMark/src/scanners.re +++ b/compiler/modules/CommonMark/src/scanners.re @@ -2,9 +2,9 @@ #include "chunk.h" #include "scanners.h" -int _scan_at(int (*scanner)(const unsigned char *), cmark_chunk *c, int offset) +bufsize_t _scan_at(bufsize_t (*scanner)(const unsigned char *), cmark_chunk *c, bufsize_t offset) { - int res; + bufsize_t res; unsigned char *ptr = (unsigned char *)c->data; unsigned char lim = ptr[c->len]; @@ -24,15 +24,15 @@ int _scan_at(int (*scanner)(const unsigned char *), cmark_chunk *c, int offset) wordchar = [^\x00-\x20]; - spacechar = [ \t\n]; + spacechar = [ \t\v\f\r\n]; reg_char = [^\\()\x00-\x20]; escaped_char = [\\][!"#$%&'()*+,./:;<=>?@[\\\]^_`{|}~-]; - tagname = [A-Za-z][A-Za-z0-9]*; + tagname = [A-Za-z][A-Za-z0-9-]*; - blocktagname = 'article'|'header'|'aside'|'hgroup'|'iframe'|'blockquote'|'hr'|'body'|'li'|'map'|'button'|'object'|'canvas'|'ol'|'caption'|'output'|'col'|'p'|'colgroup'|'pre'|'dd'|'progress'|'div'|'section'|'dl'|'table'|'td'|'dt'|'tbody'|'embed'|'textarea'|'fieldset'|'tfoot'|'figcaption'|'th'|'figure'|'thead'|'footer'|'footer'|'tr'|'form'|'ul'|'h1'|'h2'|'h3'|'h4'|'h5'|'h6'|'video'|'script'|'style'; + blocktagname = 'address'|'article'|'aside'|'base'|'basefont'|'blockquote'|'body'|'caption'|'center'|'col'|'colgroup'|'dd'|'details'|'dialog'|'dir'|'div'|'dl'|'dt'|'fieldset'|'figcaption'|'figure'|'footer'|'form'|'frame'|'frameset'|'h1'|'head'|'header'|'hr'|'html'|'legend'|'li'|'link'|'main'|'menu'|'menuitem'|'meta'|'nav'|'noframes'|'ol'|'optgroup'|'option'|'p'|'param'|'pre'|'section'|'source'|'title'|'summary'|'table'|'tbody'|'td'|'tfoot'|'th'|'thead'|'title'|'tr'|'track'|'ul'; attributename = 
[a-zA-Z_:][a-zA-Z0-9:._-]*; @@ -60,7 +60,7 @@ int _scan_at(int (*scanner)(const unsigned char *), cmark_chunk *c, int offset) htmltag = opentag | closetag | htmlcomment | processinginstruction | declaration | cdata; - in_parens_nosp = [(] (reg_char|escaped_char)* [)]; + in_parens_nosp = [(] (reg_char|escaped_char|[\\])* [)]; in_double_quotes = ["] (escaped_char|[^"\x00])* ["]; in_single_quotes = ['] (escaped_char|[^'\x00])* [']; @@ -69,19 +69,30 @@ int _scan_at(int (*scanner)(const unsigned char *), cmark_chunk *c, int offset) scheme = 'coap'|'doi'|'javascript'|'aaa'|'aaas'|'about'|'acap'|'cap'|'cid'|'crid'|'data'|'dav'|'dict'|'dns'|'file'|'ftp'|'geo'|'go'|'gopher'|'h323'|'http'|'https'|'iax'|'icap'|'im'|'imap'|'info'|'ipp'|'iris'|'iris.beep'|'iris.xpc'|'iris.xpcs'|'iris.lwz'|'ldap'|'mailto'|'mid'|'msrp'|'msrps'|'mtqp'|'mupdate'|'news'|'nfs'|'ni'|'nih'|'nntp'|'opaquelocktoken'|'pop'|'pres'|'rtsp'|'service'|'session'|'shttp'|'sieve'|'sip'|'sips'|'sms'|'snmp'|'soap.beep'|'soap.beeps'|'tag'|'tel'|'telnet'|'tftp'|'thismessage'|'tn3270'|'tip'|'tv'|'urn'|'vemmi'|'ws'|'wss'|'xcon'|'xcon-userid'|'xmlrpc.beep'|'xmlrpc.beeps'|'xmpp'|'z39.50r'|'z39.50s'|'adiumxtra'|'afp'|'afs'|'aim'|'apt'|'attachment'|'aw'|'beshare'|'bitcoin'|'bolo'|'callto'|'chrome'|'chrome-extension'|'com-eventbrite-attendee'|'content'|'cvs'|'dlna-playsingle'|'dlna-playcontainer'|'dtn'|'dvb'|'ed2k'|'facetime'|'feed'|'finger'|'fish'|'gg'|'git'|'gizmoproject'|'gtalk'|'hcp'|'icon'|'ipn'|'irc'|'irc6'|'ircs'|'itms'|'jar'|'jms'|'keyparc'|'lastfm'|'ldaps'|'magnet'|'maps'|'market'|'message'|'mms'|'ms-help'|'msnim'|'mumble'|'mvn'|'notes'|'oid'|'palm'|'paparazzi'|'platform'|'proxy'|'psyc'|'query'|'res'|'resource'|'rmi'|'rsync'|'rtmp'|'secondlife'|'sftp'|'sgn'|'skype'|'smb'|'soldat'|'spotify'|'ssh'|'steam'|'svn'|'teamspeak'|'things'|'udp'|'unreal'|'ut2004'|'ventrilo'|'view-source'|'webcal'|'wtai'|'wyciwyg'|'xfire'|'xri'|'ymsgr'; */ +// Try to match a scheme including colon. 
+bufsize_t _scan_scheme(const unsigned char *p) +{ + const unsigned char *marker = NULL; + const unsigned char *start = p; +/*!re2c + scheme [:] { return (bufsize_t)(p - start); } + .? { return 0; } +*/ +} + // Try to match URI autolink after first <, returning number of chars matched. -int _scan_autolink_uri(const unsigned char *p) +bufsize_t _scan_autolink_uri(const unsigned char *p) { const unsigned char *marker = NULL; const unsigned char *start = p; /*!re2c - scheme [:]([^\x00-\x20<>\\]|escaped_char)*[>] { return (p - start); } + scheme [:][^\x00-\x20<>]*[>] { return (bufsize_t)(p - start); } .? { return 0; } */ } // Try to match email autolink after first <, returning num of chars matched. -int _scan_autolink_email(const unsigned char *p) +bufsize_t _scan_autolink_email(const unsigned char *p) { const unsigned char *marker = NULL; const unsigned char *start = p; @@ -90,32 +101,101 @@ int _scan_autolink_email(const unsigned char *p) [@] [a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])? ([.][a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)* - [>] { return (p - start); } + [>] { return (bufsize_t)(p - start); } .? { return 0; } */ } // Try to match an HTML tag after first <, returning num of chars matched. -int _scan_html_tag(const unsigned char *p) +bufsize_t _scan_html_tag(const unsigned char *p) +{ + const unsigned char *marker = NULL; + const unsigned char *start = p; +/*!re2c + htmltag { return (bufsize_t)(p - start); } + .? { return 0; } +*/ +} + +// Try to match an HTML block tag start line, returning +// an integer code for the type of block (1-6, matching the spec). +// #7 is handled by a separate function, below. +bufsize_t _scan_html_block_start(const unsigned char *p) +{ + const unsigned char *marker = NULL; +/*!re2c + [<] ('script'|'pre'|'style') (spacechar | [>]) { return 1; } + '<!--' { return 2; } + '<?' { return 3; } + '<!' [A-Z] { return 4; } + '<![CDATA[' { return 5; } + [<] [/]? blocktagname (spacechar | [/]? [>]) { return 6; } + .? 
{ return 0; } +*/ +} + +// Try to match an HTML block tag start line of type 7, returning +// 7 if successful, 0 if not. +bufsize_t _scan_html_block_start_7(const unsigned char *p) +{ + const unsigned char *marker = NULL; +/*!re2c + [<] (opentag | closetag) [\t\n\f ]* [\r\n] { return 7; } + .? { return 0; } +*/ +} + +// Try to match an HTML block end line of type 1 +bufsize_t _scan_html_block_end_1(const unsigned char *p) +{ + const unsigned char *marker = NULL; + const unsigned char *start = p; +/*!re2c + .* [<] [/] ('script'|'pre'|'style') [>] { return (bufsize_t)(p - start); } + .? { return 0; } +*/ +} + +// Try to match an HTML block end line of type 2 +bufsize_t _scan_html_block_end_2(const unsigned char *p) +{ + const unsigned char *marker = NULL; + const unsigned char *start = p; +/*!re2c + .* '-->' { return (bufsize_t)(p - start); } + .? { return 0; } +*/ +} + +// Try to match an HTML block end line of type 3 +bufsize_t _scan_html_block_end_3(const unsigned char *p) +{ + const unsigned char *marker = NULL; + const unsigned char *start = p; +/*!re2c + .* '?>' { return (bufsize_t)(p - start); } + .? { return 0; } +*/ +} + +// Try to match an HTML block end line of type 4 +bufsize_t _scan_html_block_end_4(const unsigned char *p) { const unsigned char *marker = NULL; const unsigned char *start = p; /*!re2c - htmltag { return (p - start); } + .* '>' { return (bufsize_t)(p - start); } .? { return 0; } */ } -// Try to match an HTML block tag including first <, -// returning num of chars matched. -int _scan_html_block_tag(const unsigned char *p) +// Try to match an HTML block end line of type 5 +bufsize_t _scan_html_block_end_5(const unsigned char *p) { const unsigned char *marker = NULL; const unsigned char *start = p; /*!re2c - [<] [/] blocktagname (spacechar | [>]) { return (p - start); } - [<] blocktagname (spacechar | [/>]) { return (p - start); } - [<] [!?] { return (p - start); } + .* ']]>' { return (bufsize_t)(p - start); } .? 
{ return 0; } */ } @@ -124,13 +204,13 @@ int _scan_html_block_tag(const unsigned char *p) // This may optionally be contained in <..>; otherwise // whitespace and unbalanced right parentheses aren't allowed. // Newlines aren't ever allowed. -int _scan_link_url(const unsigned char *p) +bufsize_t _scan_link_url(const unsigned char *p) { const unsigned char *marker = NULL; const unsigned char *start = p; /*!re2c - [ \n]* [<] ([^<>\n\\\x00] | escaped_char | [\\])* [>] { return (p - start); } - [ \n]* (reg_char+ | escaped_char | in_parens_nosp)* { return (p - start); } + [ \r\n]* [<] ([^<>\r\n\\\x00] | escaped_char | [\\])* [>] { return (bufsize_t)(p - start); } + [ \r\n]* (reg_char+ | escaped_char | in_parens_nosp | [\\][^()])* { return (bufsize_t)(p - start); } .? { return 0; } */ } @@ -138,47 +218,48 @@ int _scan_link_url(const unsigned char *p) // Try to match a link title (in single quotes, in double quotes, or // in parentheses), returning number of chars matched. Allow one // level of internal nesting (quotes within quotes). -int _scan_link_title(const unsigned char *p) +bufsize_t _scan_link_title(const unsigned char *p) { const unsigned char *marker = NULL; const unsigned char *start = p; /*!re2c - ["] (escaped_char|[^"\x00])* ["] { return (p - start); } - ['] (escaped_char|[^'\x00])* ['] { return (p - start); } - [(] (escaped_char|[^)\x00])* [)] { return (p - start); } + ["] (escaped_char|[^"\x00])* ["] { return (bufsize_t)(p - start); } + ['] (escaped_char|[^'\x00])* ['] { return (bufsize_t)(p - start); } + [(] (escaped_char|[^)\x00])* [)] { return (bufsize_t)(p - start); } .? { return 0; } */ } // Match space characters, including newlines. -int _scan_spacechars(const unsigned char *p) +bufsize_t _scan_spacechars(const unsigned char *p) { + const unsigned char *marker = NULL; const unsigned char *start = p; \ /*!re2c - [ \t\n]* { return (p - start); } + [ \t\v\f\r\n]* { return (bufsize_t)(p - start); } . { return 0; } */ } // Match ATX header start. 
-int _scan_atx_header_start(const unsigned char *p) +bufsize_t _scan_atx_header_start(const unsigned char *p) { const unsigned char *marker = NULL; const unsigned char *start = p; /*!re2c - [#]{1,6} ([ ]+|[\n]) { return (p - start); } + [#]{1,6} ([ ]+|[\r\n]) { return (bufsize_t)(p - start); } .? { return 0; } */ } -// Match sexext header line. Return 1 for level-1 header, +// Match setext header line. Return 1 for level-1 header, // 2 for level-2, 0 for no match. -int _scan_setext_header_line(const unsigned char *p) +bufsize_t _scan_setext_header_line(const unsigned char *p) { const unsigned char *marker = NULL; /*!re2c - [=]+ [ ]* [\n] { return 1; } - [-]+ [ ]* [\n] { return 2; } + [=]+ [ ]* [\r\n] { return 1; } + [-]+ [ ]* [\r\n] { return 2; } .? { return 0; } */ } @@ -186,51 +267,65 @@ int _scan_setext_header_line(const unsigned char *p) // Scan a horizontal rule line: "...three or more hyphens, asterisks, // or underscores on a line by themselves. If you wish, you may use // spaces between the hyphens or asterisks." -int _scan_hrule(const unsigned char *p) +bufsize_t _scan_hrule(const unsigned char *p) { const unsigned char *marker = NULL; const unsigned char *start = p; /*!re2c - ([*][ ]*){3,} [ \t]* [\n] { return (p - start); } - ([_][ ]*){3,} [ \t]* [\n] { return (p - start); } - ([-][ ]*){3,} [ \t]* [\n] { return (p - start); } + ([*][ ]*){3,} [ \t]* [\r\n] { return (bufsize_t)(p - start); } + ([_][ ]*){3,} [ \t]* [\r\n] { return (bufsize_t)(p - start); } + ([-][ ]*){3,} [ \t]* [\r\n] { return (bufsize_t)(p - start); } .? { return 0; } */ } // Scan an opening code fence. 
-int _scan_open_code_fence(const unsigned char *p) +bufsize_t _scan_open_code_fence(const unsigned char *p) { const unsigned char *marker = NULL; const unsigned char *start = p; /*!re2c - [`]{3,} / [^`\n\x00]*[\n] { return (p - start); } - [~]{3,} / [^~\n\x00]*[\n] { return (p - start); } + [`]{3,} / [^`\r\n\x00]*[\r\n] { return (bufsize_t)(p - start); } + [~]{3,} / [^~\r\n\x00]*[\r\n] { return (bufsize_t)(p - start); } .? { return 0; } */ } // Scan a closing code fence with length at least len. -int _scan_close_code_fence(const unsigned char *p) +bufsize_t _scan_close_code_fence(const unsigned char *p) { const unsigned char *marker = NULL; const unsigned char *start = p; /*!re2c - [`]{3,} / [ \t]*[\n] { return (p - start); } - [~]{3,} / [ \t]*[\n] { return (p - start); } + [`]{3,} / [ \t]*[\r\n] { return (bufsize_t)(p - start); } + [~]{3,} / [ \t]*[\r\n] { return (bufsize_t)(p - start); } .? { return 0; } */ } // Scans an entity. // Returns number of chars matched. -int _scan_entity(const unsigned char *p) +bufsize_t _scan_entity(const unsigned char *p) { const unsigned char *marker = NULL; const unsigned char *start = p; /*!re2c [&] ([#] ([Xx][A-Fa-f0-9]{1,8}|[0-9]{1,8}) |[A-Za-z][A-Za-z0-9]{1,31} ) [;] - { return (p - start); } + { return (bufsize_t)(p - start); } + .? { return 0; } +*/ +} + +// Returns positive value if a URL begins in a way that is potentially +// dangerous, with javascript:, vbscript:, file:, or data:, otherwise 0. +bufsize_t _scan_dangerous_url(const unsigned char *p) +{ + const unsigned char *marker = NULL; + const unsigned char *start = p; +/*!re2c + 'data:image/' ('png'|'gif'|'jpeg'|'webp') { return 0; } + 'javascript:' | 'vbscript:' | 'file:' | 'data:' { return (bufsize_t)(p - start); } .? 
{ return 0; } */ } + http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/utf8.c ---------------------------------------------------------------------- diff --git a/compiler/modules/CommonMark/src/utf8.c b/compiler/modules/CommonMark/src/utf8.c index d77c5d1..ffe6652 100644 --- a/compiler/modules/CommonMark/src/utf8.c +++ b/compiler/modules/CommonMark/src/utf8.c @@ -30,7 +30,7 @@ static void encode_unknown(cmark_strbuf *buf) cmark_strbuf_put(buf, repl, 3); } -static int utf8proc_charlen(const uint8_t *str, int str_len) +static int utf8proc_charlen(const uint8_t *str, bufsize_t str_len) { int length, i; @@ -42,7 +42,7 @@ static int utf8proc_charlen(const uint8_t *str, int str_len) if (!length) return -1; - if (str_len >= 0 && length > str_len) + if (str_len >= 0 && (bufsize_t)length > str_len) return -str_len; for (i = 1; i < length; i++) { @@ -54,23 +54,20 @@ static int utf8proc_charlen(const uint8_t *str, int str_len) } // Validate a single UTF-8 character according to RFC 3629. -static int utf8proc_valid(const uint8_t *str, int str_len) +static int utf8proc_valid(const uint8_t *str, bufsize_t str_len) { - int length = utf8proc_charlen(str, str_len); + int length = utf8proc_utf8class[str[0]]; - if (length <= 0) - return length; + if (!length) + return -1; - switch (length) { - case 1: - if (str[0] == 0x00) { - // ASCII NUL is technically valid but rejected - // for security reasons. 
- return -length; - } - break; + if ((bufsize_t)length > str_len) + return -str_len; + switch (length) { case 2: + if ((str[1] & 0xC0) != 0x80) + return -1; if (str[0] < 0xC2) { // Overlong return -length; @@ -78,6 +75,10 @@ static int utf8proc_valid(const uint8_t *str, int str_len) break; case 3: + if ((str[1] & 0xC0) != 0x80) + return -1; + if ((str[2] & 0xC0) != 0x80) + return -2; if (str[0] == 0xE0) { if (str[1] < 0xA0) { // Overlong @@ -92,6 +93,12 @@ static int utf8proc_valid(const uint8_t *str, int str_len) break; case 4: + if ((str[1] & 0xC0) != 0x80) + return -1; + if ((str[2] & 0xC0) != 0x80) + return -2; + if ((str[3] & 0xC0) != 0x80) + return -3; if (str[0] == 0xF0) { if (str[1] < 0x90) { // Overlong @@ -109,49 +116,47 @@ static int utf8proc_valid(const uint8_t *str, int str_len) return length; } -void utf8proc_detab(cmark_strbuf *ob, const uint8_t *line, size_t size) +void utf8proc_check(cmark_strbuf *ob, const uint8_t *line, bufsize_t size) { - static const uint8_t whitespace[] = " "; - - size_t i = 0, tab = 0; + bufsize_t i = 0; while (i < size) { - size_t org = i; - - while (i < size && line[i] != '\t' && line[i] != '\0' - && line[i] < 0x80) { - i++; - tab++; + bufsize_t org = i; + int charlen = 0; + + while (i < size) { + if (line[i] < 0x80 && line[i] != 0) { + i++; + } else if (line[i] >= 0x80) { + charlen = utf8proc_valid(line + i, size - i); + if (charlen < 0) { + charlen = -charlen; + break; + } + i += charlen; + } else if (line[i] == 0) { + // ASCII NUL is technically valid but rejected + // for security reasons. 
+ charlen = 1; + break; + } } - if (i > org) + if (i > org) { cmark_strbuf_put(ob, line + org, i - org); + } - if (i >= size) + if (i >= size) { break; - - if (line[i] == '\t') { - int numspaces = 4 - (tab % 4); - cmark_strbuf_put(ob, whitespace, numspaces); - i += 1; - tab += numspaces; } else { - int charlen = utf8proc_valid(line + i, size - i); - - if (charlen >= 0) { - cmark_strbuf_put(ob, line + i, charlen); - } else { - encode_unknown(ob); - charlen = -charlen; - } - + // Invalid UTF-8 + encode_unknown(ob); i += charlen; - tab += 1; } } } -int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst) +int utf8proc_iterate(const uint8_t *str, bufsize_t str_len, int32_t *dst) { int length; int32_t uc = -1; @@ -172,8 +177,7 @@ int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst) case 3: uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6) + (str[2] & 0x3F); - if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) || - (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1; + if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000)) uc = -1; break; case 4: uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12) @@ -182,7 +186,7 @@ int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst) break; } - if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE)) + if (uc < 0) return -1; *dst = uc; @@ -192,7 +196,7 @@ int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst) void utf8proc_encode_char(int32_t uc, cmark_strbuf *buf) { uint8_t dst[4]; - int len = 0; + bufsize_t len = 0; assert(uc >= 0); @@ -228,7 +232,7 @@ void utf8proc_encode_char(int32_t uc, cmark_strbuf *buf) cmark_strbuf_put(buf, dst, len); } -void utf8proc_case_fold(cmark_strbuf *dest, const uint8_t *str, int len) +void utf8proc_case_fold(cmark_strbuf *dest, const uint8_t *str, bufsize_t len) { int32_t c; @@ -236,7 +240,7 @@ void utf8proc_case_fold(cmark_strbuf *dest, const uint8_t *str, int len) utf8proc_encode_char(x, dest) while (len > 0) { - int char_len = utf8proc_iterate(str, len, &c); + bufsize_t 
char_len = utf8proc_iterate(str, len, &c); if (char_len >= 0) { #include "case_fold_switch.inc" http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/utf8.h ---------------------------------------------------------------------- diff --git a/compiler/modules/CommonMark/src/utf8.h b/compiler/modules/CommonMark/src/utf8.h index 7df1573..9f1a4ec 100644 --- a/compiler/modules/CommonMark/src/utf8.h +++ b/compiler/modules/CommonMark/src/utf8.h @@ -8,10 +8,10 @@ extern "C" { #endif -void utf8proc_case_fold(cmark_strbuf *dest, const uint8_t *str, int len); +void utf8proc_case_fold(cmark_strbuf *dest, const uint8_t *str, bufsize_t len); void utf8proc_encode_char(int32_t uc, cmark_strbuf *buf); -int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst); -void utf8proc_detab(cmark_strbuf *dest, const uint8_t *line, size_t size); +int utf8proc_iterate(const uint8_t *str, bufsize_t str_len, int32_t *dst); +void utf8proc_check(cmark_strbuf *dest, const uint8_t *line, bufsize_t size); int utf8proc_is_space(int32_t uc); int utf8proc_is_punctuation(int32_t uc); http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/xml.c ---------------------------------------------------------------------- diff --git a/compiler/modules/CommonMark/src/xml.c b/compiler/modules/CommonMark/src/xml.c index f630aba..7eec5a6 100644 --- a/compiler/modules/CommonMark/src/xml.c +++ b/compiler/modules/CommonMark/src/xml.c @@ -11,14 +11,9 @@ // Functions to convert cmark_nodes to XML strings. 
-static void escape_xml(cmark_strbuf *dest, const unsigned char *source, int length) +static void escape_xml(cmark_strbuf *dest, const unsigned char *source, bufsize_t length) { - if (source != NULL) { - if (length < 0) - length = strlen((char *)source); - - houdini_escape_html0(dest, source, (size_t)length, 0); - } + houdini_escape_html0(dest, source, length, 0); } struct render_state { @@ -36,7 +31,7 @@ static inline void indent(struct render_state *state) static int S_render_node(cmark_node *node, cmark_event_type ev_type, - struct render_state *state, long options) + struct render_state *state, int options) { cmark_strbuf *xml = state->xml; bool literal = false; @@ -118,10 +113,12 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, case CMARK_NODE_LINK: case CMARK_NODE_IMAGE: cmark_strbuf_puts(xml, " destination=\""); - escape_xml(xml, node->as.link.url, -1); + escape_xml(xml, node->as.link.url.data, + node->as.link.url.len); cmark_strbuf_putc(xml, '"'); cmark_strbuf_puts(xml, " title=\""); - escape_xml(xml, node->as.link.title, -1); + escape_xml(xml, node->as.link.title.data, + node->as.link.title.len); cmark_strbuf_putc(xml, '"'); break; default: @@ -145,7 +142,7 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, return 1; } -char *cmark_render_xml(cmark_node *root, long options) +char *cmark_render_xml(cmark_node *root, int options) { char *result; cmark_strbuf xml = GH_BUF_INIT; @@ -153,10 +150,6 @@ char *cmark_render_xml(cmark_node *root, long options) cmark_node *cur; struct render_state state = { &xml, 0 }; - if (options & CMARK_OPT_NORMALIZE) { - cmark_consolidate_text_nodes(root); - } - cmark_iter *iter = cmark_iter_new(root); cmark_strbuf_puts(state.xml, @@ -170,6 +163,5 @@ char *cmark_render_xml(cmark_node *root, long options) result = (char *)cmark_strbuf_detach(&xml); cmark_iter_free(iter); - cmark_strbuf_free(&xml); return result; } http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/src/CFCCHtml.c 
---------------------------------------------------------------------- diff --git a/compiler/src/CFCCHtml.c b/compiler/src/CFCCHtml.c index 0409a7b..34b4336 100644 --- a/compiler/src/CFCCHtml.c +++ b/compiler/src/CFCCHtml.c @@ -756,7 +756,10 @@ S_html_create_inheritance(CFCClass *klass) { static char* S_md_to_html(CFCClass *klass, const char *md) { - cmark_node *doc = cmark_parse_document(md, strlen(md)); + int options = CMARK_OPT_SMART + | CMARK_OPT_VALIDATE_UTF8 + | CMARK_OPT_SAFE; + cmark_node *doc = cmark_parse_document(md, strlen(md), options); S_convert_uris(klass, doc); char *html = cmark_render_html(doc, CMARK_OPT_DEFAULT); cmark_node_free(doc); http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/src/CFCCMan.c ---------------------------------------------------------------------- diff --git a/compiler/src/CFCCMan.c b/compiler/src/CFCCMan.c index 354d91f..79248c8 100644 --- a/compiler/src/CFCCMan.c +++ b/compiler/src/CFCCMan.c @@ -410,7 +410,11 @@ S_man_create_inheritance(CFCClass *klass) { static char* S_md_to_man(CFCClass *klass, const char *md, int needs_indent) { - cmark_node *doc = cmark_parse_document(md, strlen(md)); + int options = CMARK_OPT_NORMALIZE + | CMARK_OPT_SMART + | CMARK_OPT_VALIDATE_UTF8 + | CMARK_OPT_SAFE; + cmark_node *doc = cmark_parse_document(md, strlen(md), options); char *result = S_nodes_to_man(klass, doc, needs_indent); cmark_node_free(doc); http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/src/CFCPerlPod.c ---------------------------------------------------------------------- diff --git a/compiler/src/CFCPerlPod.c b/compiler/src/CFCPerlPod.c index 24d6c4b..8fc2eae 100644 --- a/compiler/src/CFCPerlPod.c +++ b/compiler/src/CFCPerlPod.c @@ -322,7 +322,11 @@ CFCPerlPod_gen_subroutine_pod(CFCFunction *func, char* CFCPerlPod_md_to_pod(const char *md, CFCClass *klass, int header_level) { - cmark_node *doc = cmark_parse_document(md, strlen(md)); + int options = CMARK_OPT_NORMALIZE + | 
CMARK_OPT_SMART + | CMARK_OPT_VALIDATE_UTF8 + | CMARK_OPT_SAFE; + cmark_node *doc = cmark_parse_document(md, strlen(md), options); char *pod = S_nodes_to_pod(doc, klass, header_level); cmark_node_free(doc);
