http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/inlines.c ---------------------------------------------------------------------- diff --git a/compiler/modules/CommonMark/src/inlines.c b/compiler/modules/CommonMark/src/inlines.c index 2487f63..7ea308d 100644 --- a/compiler/modules/CommonMark/src/inlines.c +++ b/compiler/modules/CommonMark/src/inlines.c @@ -14,6 +14,15 @@ #include "inlines.h" +static const char *EMDASH = "\xE2\x80\x94"; +static const char *ENDASH = "\xE2\x80\x93"; +static const char *ELLIPSES = "\xE2\x80\xA6"; +static const char *LEFTDOUBLEQUOTE = "\xE2\x80\x9C"; +static const char *RIGHTDOUBLEQUOTE = "\xE2\x80\x9D"; +static const char *LEFTSINGLEQUOTE = "\xE2\x80\x98"; +static const char *RIGHTSINGLEQUOTE = "\xE2\x80\x99"; + + // Macros for creating various kinds of simple. #define make_str(s) make_literal(CMARK_NODE_TEXT, s) #define make_code(s) make_literal(CMARK_NODE_CODE, s) @@ -27,8 +36,8 @@ typedef struct delimiter { struct delimiter *previous; struct delimiter *next; cmark_node *inl_text; + bufsize_t position; unsigned char delim_char; - int position; bool can_open; bool can_close; bool active; @@ -36,45 +45,53 @@ typedef struct delimiter { typedef struct { cmark_chunk input; - int pos; + bufsize_t pos; cmark_reference_map *refmap; delimiter *last_delim; } subject; +static inline bool +S_is_line_end_char(char c) +{ + return (c == '\n' || c == '\r'); +} + static delimiter* S_insert_emph(subject *subj, delimiter *opener, delimiter *closer); -static int parse_inline(subject* subj, cmark_node * parent); +static int parse_inline(subject* subj, cmark_node * parent, int options); static void subject_from_buf(subject *e, cmark_strbuf *buffer, cmark_reference_map *refmap); -static int subject_find_special_char(subject *subj); +static bufsize_t subject_find_special_char(subject *subj, int options); -static unsigned char *cmark_clean_autolink(cmark_chunk *url, int is_email) +static cmark_chunk cmark_clean_autolink(cmark_chunk *url, int is_email) { cmark_strbuf buf = GH_BUF_INIT; cmark_chunk_trim(url); - if (url->len == 0) - return NULL; + if (url->len == 0) { + cmark_chunk result = CMARK_CHUNK_EMPTY; + return result; + } if (is_email) cmark_strbuf_puts(&buf, "mailto:"); houdini_unescape_html_f(&buf, url->data, url->len); - return cmark_strbuf_detach(&buf); + return cmark_chunk_buf_detach(&buf); } -static inline cmark_node *make_link(cmark_node *label, unsigned char *url, unsigned char *title) +static inline cmark_node *make_link(cmark_node *label, cmark_chunk *url, cmark_chunk *title) { cmark_node* e = (cmark_node *)calloc(1, sizeof(*e)); if(e != NULL) { e->type = CMARK_NODE_LINK; e->first_child = label; e->last_child = label; - e->as.link.url = url; - e->as.link.title = title; + e->as.link.url = *url; + e->as.link.title = *title; e->next = NULL; label->parent = e; } @@ -83,7 +100,9 @@ static inline cmark_node *make_link(cmark_node *label, unsigned char *url, unsig static inline cmark_node* make_autolink(cmark_node* label, cmark_chunk url, int is_email) { - return make_link(label, cmark_clean_autolink(&url, is_email), NULL); + cmark_chunk clean_url = cmark_clean_autolink(&url, is_email); + cmark_chunk title = CMARK_CHUNK_EMPTY; + return make_link(label, &clean_url, &title); } // Create an inline with a literal string value. @@ -125,19 +144,20 @@ static inline cmark_node* make_simple(cmark_node_type t) return e; } -static unsigned char *bufdup(const unsigned char *buf) +// Duplicate a chunk by creating a copy of the buffer not by reusing the +// buffer like cmark_chunk_dup does. +static cmark_chunk chunk_clone(cmark_chunk *src) { - unsigned char *new_buf = NULL; + cmark_chunk c; + bufsize_t len = src->len; - if (buf) { - int len = strlen((char *)buf); - new_buf = (unsigned char *)calloc(len + 1, sizeof(*new_buf)); - if(new_buf != NULL) { - memcpy(new_buf, buf, len + 1); - } - } + c.len = len; + c.data = (unsigned char *)malloc(len + 1); + c.alloc = 1; + memcpy(c.data, src->data, len); + c.data[len] = '\0'; - return new_buf; + return c; } static void subject_from_buf(subject *e, cmark_strbuf *buffer, @@ -149,8 +169,6 @@ static void subject_from_buf(subject *e, cmark_strbuf *buffer, e->pos = 0; e->refmap = refmap; e->last_delim = NULL; - - cmark_chunk_rtrim(&e->input); } static inline int isbacktick(int c) @@ -160,10 +178,13 @@ static inline int isbacktick(int c) static inline unsigned char peek_char(subject *subj) { + // NULL bytes should have been stripped out by now. If they're + // present, it's a programming error: + assert(!(subj->pos < subj->input.len && subj->input.data[subj->pos] == 0)); return (subj->pos < subj->input.len) ? subj->input.data[subj->pos] : 0; } -static inline unsigned char peek_at(subject *subj, int pos) +static inline unsigned char peek_at(subject *subj, bufsize_t pos) { return subj->input.data[pos]; } @@ -177,12 +198,38 @@ static inline int is_eof(subject* subj) // Advance the subject. Doesn't check for eof. #define advance(subj) (subj)->pos += 1 +static inline bool +skip_spaces(subject *subj) +{ + bool skipped = false; + while (peek_char(subj) == ' ' || peek_char(subj) == '\t') { + advance(subj); + skipped = true; + } + return skipped; +} + +static inline bool +skip_line_end(subject *subj) +{ + bool seen_line_end_char = false; + if (peek_char(subj) == '\r') { + advance(subj); + seen_line_end_char = true; + } + if (peek_char(subj) == '\n') { + advance(subj); + seen_line_end_char = true; + } + return seen_line_end_char || is_eof(subj); +} + // Take characters while a predicate holds, and return a string. static inline cmark_chunk take_while(subject* subj, int (*f)(int)) { unsigned char c; - int startpos = subj->pos; - int len = 0; + bufsize_t startpos = subj->pos; + bufsize_t len = 0; while ((c = peek_char(subj)) && (*f)(c)) { advance(subj); @@ -197,7 +244,7 @@ static inline cmark_chunk take_while(subject* subj, int (*f)(int)) // parsed). Return 0 if you don't find matching closing // backticks, otherwise return the position in the subject // after the closing backticks. -static int scan_to_closing_backticks(subject* subj, int openticklength) +static bufsize_t scan_to_closing_backticks(subject* subj, bufsize_t openticklength) { // read non backticks unsigned char c; @@ -207,7 +254,7 @@ static int scan_to_closing_backticks(subject* subj, int openticklength) if (is_eof(subj)) { return 0; // did not find closing ticks, return 0 } - int numticks = 0; + bufsize_t numticks = 0; while (peek_char(subj) == '`') { advance(subj); numticks++; @@ -223,8 +270,8 @@ static int scan_to_closing_backticks(subject* subj, int openticklength) static cmark_node* handle_backticks(subject *subj) { cmark_chunk openticks = take_while(subj, isbacktick); - int startpos = subj->pos; - int endpos = scan_to_closing_backticks(subj, openticks.len); + bufsize_t startpos = subj->pos; + bufsize_t endpos = scan_to_closing_backticks(subj, openticks.len); if (endpos == 0) { // not found subj->pos = startpos; // rewind @@ -246,10 +293,11 @@ static int scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close) { int numdelims = 0; - int before_char_pos; + bufsize_t before_char_pos; int32_t after_char = 0; int32_t before_char = 0; int len; + bool left_flanking, right_flanking; if (subj->pos == 0) { before_char = 10; @@ -267,9 +315,14 @@ scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close) } } - while (peek_char(subj) == c) { + if (c == '\'' || c == '"') { numdelims++; - advance(subj); + advance(subj); // limit to 1 delim for quotes + } else { + while (peek_char(subj) == c) { + numdelims++; + advance(subj); + } } len = utf8proc_iterate(subj->input.data + subj->pos, @@ -277,19 +330,25 @@ scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close) if (len == -1) { after_char = 10; } - *can_open = numdelims > 0 && !utf8proc_is_space(after_char) && - !(utf8proc_is_punctuation(after_char) && - !utf8proc_is_space(before_char) && - !utf8proc_is_punctuation(before_char)); - *can_close = numdelims > 0 && !utf8proc_is_space(before_char) && - !(utf8proc_is_punctuation(before_char) && - !utf8proc_is_space(after_char) && - !utf8proc_is_punctuation(after_char)); + left_flanking = numdelims > 0 && !utf8proc_is_space(after_char) && + !(utf8proc_is_punctuation(after_char) && + !utf8proc_is_space(before_char) && + !utf8proc_is_punctuation(before_char)); + right_flanking = numdelims > 0 && !utf8proc_is_space(before_char) && + !(utf8proc_is_punctuation(before_char) && + !utf8proc_is_space(after_char) && + !utf8proc_is_punctuation(after_char)); if (c == '_') { - *can_open = *can_open && !(before_char < 128 && - cmark_isalnum((char)before_char)); - *can_close = *can_close && !(before_char < 128 && - cmark_isalnum((char)after_char)); + *can_open = left_flanking && + (!right_flanking || utf8proc_is_punctuation(before_char)); + *can_close = right_flanking && + (!left_flanking || utf8proc_is_punctuation(after_char)); + } else if (c == '\'' || c == '"') { + *can_open = left_flanking && !right_flanking; + *can_close = right_flanking; + } else { + *can_open = left_flanking; + *can_close = right_flanking; } return numdelims; } @@ -300,10 +359,10 @@ static void print_delimiters(subject *subj) delimiter *delim; delim = subj->last_delim; while (delim != NULL) { - printf("Item at %p: %d %d %d next(%p) prev(%p)\n", - delim, delim->delim_char, + printf("Item at stack pos %p, text pos %d: %d %d %d next(%p) prev(%p)\n", + (void*)delim, delim->position, delim->delim_char, delim->can_open, delim->can_close, - delim->next, delim->previous); + (void*)delim->next, (void*)delim->previous); delim = delim->previous; } } @@ -347,59 +406,175 @@ static void push_delimiter(subject *subj, unsigned char c, bool can_open, subj->last_delim = delim; } -// Parse strong/emph or a fallback. -// Assumes the subject has '_' or '*' at the current position. -static cmark_node* handle_strong_emph(subject* subj, unsigned char c) +// Assumes the subject has a c at the current position. +static cmark_node* handle_delim(subject* subj, unsigned char c, bool smart) { - int numdelims; + bufsize_t numdelims; cmark_node * inl_text; bool can_open, can_close; + cmark_chunk contents; numdelims = scan_delims(subj, c, &can_open, &can_close); - inl_text = make_str(cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims)); + if (c == '\'' && smart) { + contents = cmark_chunk_literal(RIGHTSINGLEQUOTE); + } else if (c == '"' && smart) { + contents = cmark_chunk_literal(can_close ? RIGHTDOUBLEQUOTE : LEFTDOUBLEQUOTE); + } else { + contents = cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims); + } - if (can_open || can_close) { + inl_text = make_str(contents); + + if ((can_open || can_close) && + (!(c == '\'' || c == '"') || smart)) { push_delimiter(subj, c, can_open, can_close, inl_text); } return inl_text; } -static void process_emphasis(subject *subj, delimiter *start_delim) +// Assumes we have a hyphen at the current position. +static cmark_node* handle_hyphen(subject* subj, bool smart) +{ + int startpos = subj->pos; + + advance(subj); + + if (!smart || peek_char(subj) != '-') { + return make_str(cmark_chunk_literal("-")); + } + + while (smart && peek_char(subj) == '-') { + advance(subj); + } + + int numhyphens = subj->pos - startpos; + int en_count = 0; + int em_count = 0; + int i; + cmark_strbuf buf = GH_BUF_INIT; + + if (numhyphens % 3 == 0) { // if divisible by 3, use all em dashes + em_count = numhyphens / 3; + } else if (numhyphens % 2 == 0) { // if divisible by 2, use all en dashes + en_count = numhyphens / 2; + } else if (numhyphens % 3 == 2) { // use one en dash at end + en_count = 1; + em_count = (numhyphens - 2) / 3; + } else { // use two en dashes at the end + en_count = 2; + em_count = (numhyphens - 4) / 3; + } + + for (i = em_count; i > 0; i--) { + cmark_strbuf_puts(&buf, EMDASH); + } + + for (i = en_count; i > 0; i--) { + cmark_strbuf_puts(&buf, ENDASH); + } + + return make_str(cmark_chunk_buf_detach(&buf)); +} + +// Assumes we have a period at the current position. +static cmark_node* handle_period(subject* subj, bool smart) +{ + advance(subj); + if (smart && peek_char(subj) == '.') { + advance(subj); + if (peek_char(subj) == '.') { + advance(subj); + return make_str(cmark_chunk_literal(ELLIPSES)); + } else { + return make_str(cmark_chunk_literal("..")); + } + } else { + return make_str(cmark_chunk_literal(".")); + } +} + +static void process_emphasis(subject *subj, delimiter *stack_bottom) { delimiter *closer = subj->last_delim; delimiter *opener; + delimiter *old_closer; + bool opener_found; + delimiter *openers_bottom[128]; + + // initialize openers_bottom: + openers_bottom['*'] = stack_bottom; + openers_bottom['_'] = stack_bottom; + openers_bottom['\''] = stack_bottom; + openers_bottom['"'] = stack_bottom; // move back to first relevant delim. - while (closer != NULL && closer->previous != start_delim) { + while (closer != NULL && closer->previous != stack_bottom) { closer = closer->previous; } // now move forward, looking for closers, and handling each while (closer != NULL) { if (closer->can_close && - (closer->delim_char == '*' || closer->delim_char == '_')) { + (closer->delim_char == '*' || closer->delim_char == '_' || + closer->delim_char == '"' || closer->delim_char == '\'')) { // Now look backwards for first matching opener: opener = closer->previous; - while (opener != NULL && opener != start_delim) { + opener_found = false; + while (opener != NULL && opener != stack_bottom && + opener != openers_bottom[closer->delim_char]) { if (opener->delim_char == closer->delim_char && opener->can_open) { + opener_found = true; break; } opener = opener->previous; } - if (opener != NULL && opener != start_delim) { - closer = S_insert_emph(subj, opener, closer); - } else { + old_closer = closer; + if (closer->delim_char == '*' || closer->delim_char == '_') { + if (opener_found) { + closer = S_insert_emph(subj, opener, closer); + } else { + closer = closer->next; + } + } else if (closer->delim_char == '\'') { + cmark_chunk_free(&closer->inl_text->as.literal); + closer->inl_text->as.literal = + cmark_chunk_literal(RIGHTSINGLEQUOTE); + if (opener_found) { + cmark_chunk_free(&opener->inl_text->as.literal); + opener->inl_text->as.literal = + cmark_chunk_literal(LEFTSINGLEQUOTE); + } closer = closer->next; + } else if (closer->delim_char == '"') { + cmark_chunk_free(&closer->inl_text->as.literal); + closer->inl_text->as.literal = + cmark_chunk_literal(RIGHTDOUBLEQUOTE); + if (opener_found) { + cmark_chunk_free(&opener->inl_text->as.literal); + opener->inl_text->as.literal = + cmark_chunk_literal(LEFTDOUBLEQUOTE); + } + closer = closer->next; + } + if (!opener_found) { + // set lower bound for future searches for openers: + openers_bottom[old_closer->delim_char] = old_closer->previous; + if (!old_closer->can_open) { + // we can remove a closer that can't be an + // opener, once we've seen there's no + // matching opener: + remove_delimiter(subj, old_closer); + } } } else { closer = closer->next; } } - // free all delimiters in list until start_delim: - while (subj->last_delim != start_delim) { + // free all delimiters in list until stack_bottom: + while (subj->last_delim != stack_bottom) { remove_delimiter(subj, subj->last_delim); } } @@ -408,11 +583,11 @@ static delimiter* S_insert_emph(subject *subj, delimiter *opener, delimiter *closer) { delimiter *delim, *tmp_delim; - int use_delims; + bufsize_t use_delims; cmark_node *opener_inl = opener->inl_text; cmark_node *closer_inl = closer->inl_text; - int opener_num_chars = opener_inl->as.literal.len; - int closer_num_chars = closer_inl->as.literal.len; + bufsize_t opener_num_chars = opener_inl->as.literal.len; + bufsize_t closer_num_chars = closer_inl->as.literal.len; cmark_node *tmp, *emph, *first_child, *last_child; // calculate the actual number of characters used from this closer @@ -491,8 +666,7 @@ static cmark_node* handle_backslash(subject *subj) if (cmark_ispunct(nextchar)) { // only ascii symbols and newline can be escaped advance(subj); return make_str(cmark_chunk_dup(&subj->input, subj->pos - 1, 1)); - } else if (nextchar == '\n') { - advance(subj); + } else if (!is_eof(subj) && skip_line_end(subj)) { return make_linebreak(); } else { return make_str(cmark_chunk_literal("\\")); @@ -504,7 +678,7 @@ static cmark_node* handle_backslash(subject *subj) static cmark_node* handle_entity(subject* subj) { cmark_strbuf ent = GH_BUF_INIT; - size_t len; + bufsize_t len; advance(subj); @@ -526,7 +700,7 @@ static cmark_node *make_str_with_entities(cmark_chunk *content) { cmark_strbuf unescaped = GH_BUF_INIT; - if (houdini_unescape_html(&unescaped, content->data, (size_t)content->len)) { + if (houdini_unescape_html(&unescaped, content->data, content->len)) { return make_str(cmark_chunk_buf_detach(&unescaped)); } else { return make_str(*content); @@ -535,14 +709,16 @@ static cmark_node *make_str_with_entities(cmark_chunk *content) // Clean a URL: remove surrounding whitespace and surrounding <>, // and remove \ that escape punctuation. -unsigned char *cmark_clean_url(cmark_chunk *url) +cmark_chunk cmark_clean_url(cmark_chunk *url) { cmark_strbuf buf = GH_BUF_INIT; cmark_chunk_trim(url); - if (url->len == 0) - return NULL; + if (url->len == 0) { + cmark_chunk result = CMARK_CHUNK_EMPTY; + return result; + } if (url->data[0] == '<' && url->data[url->len - 1] == '>') { houdini_unescape_html_f(&buf, url->data + 1, url->len - 2); @@ -551,16 +727,18 @@ unsigned char *cmark_clean_url(cmark_chunk *url) } cmark_strbuf_unescape(&buf); - return cmark_strbuf_detach(&buf); + return cmark_chunk_buf_detach(&buf); } -unsigned char *cmark_clean_title(cmark_chunk *title) +cmark_chunk cmark_clean_title(cmark_chunk *title) { cmark_strbuf buf = GH_BUF_INIT; unsigned char first, last; - if (title->len == 0) - return NULL; + if (title->len == 0) { + cmark_chunk result = CMARK_CHUNK_EMPTY; + return result; + } first = title->data[0]; last = title->data[title->len - 1]; @@ -575,14 +753,14 @@ unsigned char *cmark_clean_title(cmark_chunk *title) } cmark_strbuf_unescape(&buf); - return cmark_strbuf_detach(&buf); + return cmark_chunk_buf_detach(&buf); } // Parse an autolink or HTML tag. // Assumes the subject has a '<' character at the current position. static cmark_node* handle_pointy_brace(subject* subj) { - int matchlen = 0; + bufsize_t matchlen = 0; cmark_chunk contents; advance(subj); // advance past first < @@ -629,7 +807,7 @@ static cmark_node* handle_pointy_brace(subject* subj) // encountered. Backticks in labels do not start code spans. static int link_label(subject* subj, cmark_chunk *raw_label) { - int startpos = subj->pos; + bufsize_t startpos = subj->pos; int length = 0; unsigned char c; @@ -659,6 +837,7 @@ static int link_label(subject* subj, cmark_chunk *raw_label) if (c == ']') { // match found *raw_label = cmark_chunk_dup(&subj->input, startpos + 1, subj->pos - (startpos + 1)); + cmark_chunk_trim(raw_label); advance(subj); // advance past ] return 1; } @@ -672,14 +851,14 @@ noMatch: // Return a link, an image, or a literal close bracket. static cmark_node* handle_close_bracket(subject* subj, cmark_node *parent) { - int initial_pos; - int starturl, endurl, starttitle, endtitle, endall; - int n; - int sps; + bufsize_t initial_pos; + bufsize_t starturl, endurl, starttitle, endtitle, endall; + bufsize_t n; + bufsize_t sps; cmark_reference *ref; bool is_image = false; cmark_chunk url_chunk, title_chunk; - unsigned char *url, *title; + cmark_chunk url, title; delimiter *opener; cmark_node *link_text; cmark_node *inl; @@ -767,8 +946,8 @@ static cmark_node* handle_close_bracket(subject* subj, cmark_node *parent) cmark_chunk_free(&raw_label); if (ref != NULL) { // found - url = bufdup(ref->url); - title = bufdup(ref->title); + url = chunk_clone(&ref->url); + title = chunk_clone(&ref->title); goto match; } else { goto noMatch; @@ -785,7 +964,7 @@ match: inl->type = is_image ? NODE_IMAGE : NODE_LINK; cmark_chunk_free(&inl->as.literal); inl->first_child = link_text; - process_emphasis(subj, opener->previous); + process_emphasis(subj, opener); inl->as.link.url = url; inl->as.link.title = title; inl->next = NULL; @@ -800,10 +979,10 @@ match: } parent->last_child = inl; - // process_emphasis will remove this delimiter and all later ones. // Now, if we have a link, we also want to deactivate earlier link // delimiters. (This code can be removed if we decide to allow links // inside links.) + remove_delimiter(subj, opener); if (!is_image) { opener = subj->last_delim; while (opener != NULL) { @@ -825,13 +1004,11 @@ match: // Assumes the subject has a newline at the current position. static cmark_node* handle_newline(subject *subj) { - int nlpos = subj->pos; + bufsize_t nlpos = subj->pos; // skip over newline advance(subj); // skip spaces at beginning of line - while (peek_char(subj) == ' ') { - advance(subj); - } + skip_spaces(subj); if (nlpos > 1 && peek_at(subj, nlpos - 1) == ' ' && peek_at(subj, nlpos - 2) == ' ') { @@ -841,11 +1018,11 @@ static cmark_node* handle_newline(subject *subj) } } -static int subject_find_special_char(subject *subj) +static bufsize_t subject_find_special_char(subject *subj, int options) { - // "\n\\`&_*[]<!" + // "\r\n\\`&_*[]<!" static const int8_t SPECIAL_CHARS[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, @@ -863,11 +1040,34 @@ static int subject_find_special_char(subject *subj) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - int n = subj->pos + 1; + // " ' . - + static const char SMART_PUNCT_CHARS[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }; + + bufsize_t n = subj->pos + 1; while (n < subj->input.len) { if (SPECIAL_CHARS[subj->input.data[n]]) return n; + if (options & CMARK_OPT_SMART && + SMART_PUNCT_CHARS[subj->input.data[n]]) + return n; n++; } @@ -876,17 +1076,18 @@ static int subject_find_special_char(subject *subj) // Parse an inline, advancing subject, and add it as a child of parent. // Return 0 if no inline can be parsed, 1 otherwise. -static int parse_inline(subject* subj, cmark_node * parent) +static int parse_inline(subject* subj, cmark_node * parent, int options) { cmark_node* new_inl = NULL; cmark_chunk contents; unsigned char c; - int endpos; + bufsize_t endpos; c = peek_char(subj); if (c == 0) { return 0; } switch(c) { + case '\r': case '\n': new_inl = handle_newline(subj); break; @@ -904,7 +1105,15 @@ static int parse_inline(subject* subj, cmark_node * parent) break; case '*': case '_': - new_inl = handle_strong_emph(subj, c); + case '\'': + case '"': + new_inl = handle_delim(subj, c, options & CMARK_OPT_SMART); + break; + case '-': + new_inl = handle_hyphen(subj, options & CMARK_OPT_SMART); + break; + case '.': + new_inl = handle_period(subj, options & CMARK_OPT_SMART); break; case '[': advance(subj); @@ -925,12 +1134,12 @@ static int parse_inline(subject* subj, cmark_node * parent) } break; default: - endpos = subject_find_special_char(subj); + endpos = subject_find_special_char(subj, options); contents = cmark_chunk_dup(&subj->input, subj->pos, endpos - subj->pos); subj->pos = endpos; // if we're at a newline, strip trailing spaces. - if (peek_char(subj) == '\n') { + if (S_is_line_end_char(peek_char(subj))) { cmark_chunk_rtrim(&contents); } @@ -944,12 +1153,13 @@ static int parse_inline(subject* subj, cmark_node * parent) } // Parse inlines from parent's string_content, adding as children of parent. -extern void cmark_parse_inlines(cmark_node* parent, cmark_reference_map *refmap) +extern void cmark_parse_inlines(cmark_node* parent, cmark_reference_map *refmap, int options) { subject subj; subject_from_buf(&subj, &parent->string_content, refmap); + cmark_chunk_rtrim(&subj.input); - while (!is_eof(&subj) && parse_inline(&subj, parent)) ; + while (!is_eof(&subj) && parse_inline(&subj, parent, options)) ; process_emphasis(&subj, NULL); } @@ -957,11 +1167,9 @@ extern void cmark_parse_inlines(cmark_node* parent, cmark_reference_map *refmap) // Parse zero or more space characters, including at most one newline. static void spnl(subject* subj) { - bool seen_newline = false; - while (peek_char(subj) == ' ' || - (!seen_newline && - (seen_newline = peek_char(subj) == '\n'))) { - advance(subj); + skip_spaces(subj); + if (skip_line_end(subj)) { + skip_spaces(subj); } } @@ -969,7 +1177,7 @@ static void spnl(subject* subj) // Modify refmap if a reference is encountered. // Return 0 if no reference found, otherwise position of subject // after reference is parsed. -int cmark_parse_reference_inline(cmark_strbuf *input, cmark_reference_map *refmap) +bufsize_t cmark_parse_reference_inline(cmark_strbuf *input, cmark_reference_map *refmap) { subject subj; @@ -977,13 +1185,13 @@ int cmark_parse_reference_inline(cmark_strbuf *input, cmark_reference_map *refma cmark_chunk url; cmark_chunk title; - int matchlen = 0; - int beforetitle; + bufsize_t matchlen = 0; + bufsize_t beforetitle; subject_from_buf(&subj, input, NULL); // parse label: - if (!link_label(&subj, &lab)) + if (!link_label(&subj, &lab) || lab.len == 0) return 0; // colon: @@ -1014,14 +1222,19 @@ int cmark_parse_reference_inline(cmark_strbuf *input, cmark_reference_map *refma subj.pos = beforetitle; title = cmark_chunk_literal(""); } + // parse final spaces and newline: - while (peek_char(&subj) == ' ') { - advance(&subj); - } - if (peek_char(&subj) == '\n') { - advance(&subj); - } else if (peek_char(&subj) != 0) { - return 0; + skip_spaces(&subj); + if (!skip_line_end(&subj)) { + if (matchlen) { // try rewinding before title + subj.pos = beforetitle; + skip_spaces(&subj); + if (!skip_line_end(&subj)) { + return 0; + } + } else { + return 0; + } } // insert reference into refmap cmark_reference_create(refmap, &lab, &url, &title);
http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/inlines.h ---------------------------------------------------------------------- diff --git a/compiler/modules/CommonMark/src/inlines.h b/compiler/modules/CommonMark/src/inlines.h index d2ccfb4..f8847fc 100644 --- a/compiler/modules/CommonMark/src/inlines.h +++ b/compiler/modules/CommonMark/src/inlines.h @@ -5,12 +5,12 @@ extern "C" { #endif -unsigned char *cmark_clean_url(cmark_chunk *url); -unsigned char *cmark_clean_title(cmark_chunk *title); +cmark_chunk cmark_clean_url(cmark_chunk *url); +cmark_chunk cmark_clean_title(cmark_chunk *title); -void cmark_parse_inlines(cmark_node* parent, cmark_reference_map *refmap); +void cmark_parse_inlines(cmark_node* parent, cmark_reference_map *refmap, int options); -int cmark_parse_reference_inline(cmark_strbuf *input, cmark_reference_map *refmap); +bufsize_t cmark_parse_reference_inline(cmark_strbuf *input, cmark_reference_map *refmap); #ifdef __cplusplus } http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/iterator.c ---------------------------------------------------------------------- diff --git a/compiler/modules/CommonMark/src/iterator.c b/compiler/modules/CommonMark/src/iterator.c index 4daec2d..f18e3bf 100644 --- a/compiler/modules/CommonMark/src/iterator.c +++ b/compiler/modules/CommonMark/src/iterator.c @@ -108,6 +108,12 @@ cmark_iter_get_event_type(cmark_iter *iter) return iter->cur.ev_type; } +cmark_node* +cmark_iter_get_root(cmark_iter *iter) +{ + return iter->root; +} + void cmark_consolidate_text_nodes(cmark_node *root) { @@ -123,18 +129,20 @@ void cmark_consolidate_text_nodes(cmark_node *root) cur->next && cur->next->type == CMARK_NODE_TEXT) { cmark_strbuf_clear(&buf); - cmark_strbuf_puts(&buf, cmark_node_get_literal(cur)); + cmark_strbuf_put(&buf, cur->as.literal.data, cur->as.literal.len); tmp = cur->next; while (tmp && tmp->type == CMARK_NODE_TEXT) { - cmark_iter_get_node(iter); // advance pointer - cmark_strbuf_puts(&buf, cmark_node_get_literal(tmp)); + cmark_iter_next(iter); // advance pointer + cmark_strbuf_put(&buf, tmp->as.literal.data, tmp->as.literal.len); next = tmp->next; cmark_node_free(tmp); tmp = next; } - cmark_node_set_literal(cur, (char *)cmark_strbuf_detach(&buf)); + cmark_chunk_free(&cur->as.literal); + cur->as.literal = cmark_chunk_buf_detach(&buf); } } + cmark_strbuf_free(&buf); cmark_iter_free(iter); } http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/latex.c ---------------------------------------------------------------------- diff --git a/compiler/modules/CommonMark/src/latex.c b/compiler/modules/CommonMark/src/latex.c new file mode 100644 index 0000000..782b0c0 --- /dev/null +++ b/compiler/modules/CommonMark/src/latex.c @@ -0,0 +1,430 @@ +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include <ctype.h> + +#include "config.h" +#include "cmark.h" +#include "node.h" +#include "buffer.h" +#include "utf8.h" +#include "scanners.h" +#include "render.h" + +#define safe_strlen(s) cmark_strbuf_safe_strlen(s) +#define OUT(s, wrap, escaping) renderer->out(renderer, s, wrap, escaping) +#define LIT(s) renderer->out(renderer, s, false, LITERAL) +#define CR() renderer->cr(renderer) +#define BLANKLINE() renderer->blankline(renderer) + +static inline void outc(cmark_renderer *renderer, + cmark_escaping escape, + int32_t c, + unsigned char nextc) +{ + if (escape == LITERAL) { + cmark_render_code_point(renderer, c); + return; + } + + switch(c) { + case 123: // '{' + case 125: // '}' + case 35: // '#' + case 37: // '%' + case 38: // '&' + cmark_render_ascii(renderer, "\\"); + cmark_render_code_point(renderer, c); + break; + case 36: // '$' + case 95: // '_' + if (escape == NORMAL) { + cmark_render_ascii(renderer, "\\"); + } + cmark_render_code_point(renderer, c); + break; + case 45 : // '-' + if (nextc == 45) { // prevent ligature + cmark_render_ascii(renderer, "\\-"); + } else { + cmark_render_ascii(renderer, "-"); + } + break; + case 126: // '~' + if (escape == NORMAL) { + cmark_render_ascii(renderer, "\\textasciitilde{}"); + } else { + cmark_render_code_point(renderer, c); + } + break; + case 94: // '^' + cmark_render_ascii(renderer, "\\^{}"); + break; + case 92: // '\\' + if (escape == URL) { + // / acts as path sep even on windows: + cmark_render_ascii(renderer, "/"); + } else { + cmark_render_ascii(renderer, "\\textbackslash{}"); + } + break; + case 124: // '|' + cmark_render_ascii(renderer, "\\textbar{}"); + break; + case 60: // '<' + cmark_render_ascii(renderer, "\\textless{}"); + break; + case 62: // '>' + cmark_render_ascii(renderer, "\\textgreater{}"); + break; + case 91: // '[' + case 93: // ']' + cmark_render_ascii(renderer, "{"); + cmark_render_code_point(renderer, c); + cmark_render_ascii(renderer, "}"); + break; + case 34: // '"' + cmark_render_ascii(renderer, "\\textquotedbl{}"); + // requires \usepackage[T1]{fontenc} + break; + case 39: // '\'' + cmark_render_ascii(renderer, "\\textquotesingle{}"); + // requires \usepackage{textcomp} + break; + case 160: // nbsp + cmark_render_ascii(renderer, "~"); + break; + case 8230: // hellip + cmark_render_ascii(renderer, "\\ldots{}"); + break; + case 8216: // lsquo + if (escape == NORMAL) { + cmark_render_ascii(renderer, "`"); + } else { + cmark_render_code_point(renderer, c); + } + break; + case 8217: // rsquo + if (escape == NORMAL) { + cmark_render_ascii(renderer, "\'"); + } else { + cmark_render_code_point(renderer, c); + } + break; + case 8220: // ldquo + if (escape == NORMAL) { + cmark_render_ascii(renderer, "``"); + } else { + cmark_render_code_point(renderer, c); + } + break; + case 8221: // rdquo + if (escape == NORMAL) { + cmark_render_ascii(renderer, "''"); + } else { + cmark_render_code_point(renderer, c); + } + break; + case 8212: // emdash + if (escape == NORMAL) { + cmark_render_ascii(renderer, "---"); + } else { + cmark_render_code_point(renderer, c); + } + break; + case 8211: // endash + if (escape == NORMAL) { + cmark_render_ascii(renderer, "--"); + } else { + cmark_render_code_point(renderer, c); + } + break; + default: + cmark_render_code_point(renderer, c); + } +} + +typedef enum { + NO_LINK, + URL_AUTOLINK, + EMAIL_AUTOLINK, + NORMAL_LINK +} link_type; + +static link_type +get_link_type(cmark_node *node) +{ + size_t title_len, url_len; + cmark_node *link_text; + char *realurl; + int realurllen; + bool isemail = false; + + if (node->type != CMARK_NODE_LINK) { + return NO_LINK; + } + + const char* url = cmark_node_get_url(node); + cmark_chunk url_chunk = cmark_chunk_literal(url); + + url_len = safe_strlen(url); + if (url_len == 0 || scan_scheme(&url_chunk, 0) == 0) { + return NO_LINK; + } + + const char* title = cmark_node_get_title(node); + title_len = safe_strlen(title); + // if it has a title, we can't treat it as an autolink: + if (title_len > 0) { + return NORMAL_LINK; + } + + link_text = node->first_child; + cmark_consolidate_text_nodes(link_text); + realurl = (char*)url; + realurllen = url_len; + if (strncmp(realurl, "mailto:", 7) == 0) { + realurl += 7; + realurllen -= 7; + isemail = true; + } + if (realurllen == link_text->as.literal.len && + strncmp(realurl, + (char*)link_text->as.literal.data, + link_text->as.literal.len) == 0) { + if (isemail) { + return EMAIL_AUTOLINK; + } else { + return URL_AUTOLINK; + } + } else { + return NORMAL_LINK; + } +} + +static int +S_get_enumlevel(cmark_node *node) +{ + int enumlevel = 0; + cmark_node *tmp = node; + while (tmp) { + if (tmp->type == CMARK_NODE_LIST && + cmark_node_get_list_type(node) == CMARK_ORDERED_LIST) { + enumlevel++; + } + tmp = tmp->parent; + } + return enumlevel; +} + +static int +S_render_node(cmark_renderer *renderer, + cmark_node *node, + cmark_event_type ev_type, + int options) +{ + int list_number; + char list_number_string[20]; + bool entering = (ev_type == CMARK_EVENT_ENTER); + cmark_list_type list_type; + const char* roman_numerals[] = { "", "i", "ii", "iii", "iv", "v", + "vi", "vii", "viii", "ix", "x" + }; + + // avoid warning about unused parameter: + (void)(options); + + switch (node->type) { + case CMARK_NODE_DOCUMENT: + break; + + case CMARK_NODE_BLOCK_QUOTE: + if (entering) { + LIT("\\begin{quote}"); + CR(); + } else { + LIT("\\end{quote}"); + BLANKLINE(); + } + break; + + case CMARK_NODE_LIST: + list_type = cmark_node_get_list_type(node); + if (entering) { + LIT("\\begin{"); + LIT(list_type == CMARK_ORDERED_LIST ? + "enumerate" : "itemize"); + LIT("}"); + CR(); + list_number = cmark_node_get_list_start(node); + if (list_number > 1) { + sprintf(list_number_string, + "%d", list_number); + LIT("\\setcounter{enum"); + LIT((char *)roman_numerals[S_get_enumlevel(node)]); + LIT("}{"); + OUT(list_number_string, false, NORMAL); + LIT("}"); + CR(); + } + } else { + LIT("\\end{"); + LIT(list_type == CMARK_ORDERED_LIST ? + "enumerate" : "itemize"); + LIT("}"); + BLANKLINE(); + } + break; + + case CMARK_NODE_ITEM: + if (entering) { + LIT("\\item "); + } else { + CR(); + } + break; + + case CMARK_NODE_HEADER: + if (entering) { + switch (cmark_node_get_header_level(node)) { + case 1: + LIT("\\section"); + break; + case 2: + LIT("\\subsection"); + break; + case 3: + LIT("\\subsubsection"); + break; + case 4: + LIT("\\paragraph"); + break; + case 5: + LIT("\\subparagraph"); + break; + } + LIT("{"); + } else { + LIT("}"); + BLANKLINE(); + } + break; + + case CMARK_NODE_CODE_BLOCK: + CR(); + LIT("\\begin{verbatim}"); + CR(); + OUT(cmark_node_get_literal(node), false, LITERAL); + CR(); + LIT("\\end{verbatim}"); + BLANKLINE(); + break; + + case CMARK_NODE_HTML: + break; + + case CMARK_NODE_HRULE: + BLANKLINE(); + LIT("\\begin{center}\\rule{0.5\\linewidth}{\\linethickness}\\end{center}"); + BLANKLINE(); + break; + + case CMARK_NODE_PARAGRAPH: + if (!entering) { + BLANKLINE(); + } + break; + + case CMARK_NODE_TEXT: + OUT(cmark_node_get_literal(node), true, NORMAL); + break; + + case CMARK_NODE_LINEBREAK: + LIT("\\\\"); + CR(); + break; + + case CMARK_NODE_SOFTBREAK: + if (renderer->width == 0) { + CR(); + } else { + OUT(" ", true, NORMAL); + } + break; + + case CMARK_NODE_CODE: + LIT("\\texttt{"); + OUT(cmark_node_get_literal(node), false, NORMAL); + LIT("}"); + break; + + case CMARK_NODE_INLINE_HTML: + break; + + case CMARK_NODE_STRONG: + if (entering) { + LIT("\\textbf{"); + } else { + LIT("}"); + } + break; + + case CMARK_NODE_EMPH: + if (entering) { + LIT("\\emph{"); + } else { + LIT("}"); + } + break; + + case CMARK_NODE_LINK: + if (entering) { + const char* url = cmark_node_get_url(node); + // requires \usepackage{hyperref} + switch(get_link_type(node)) { + case URL_AUTOLINK: + LIT("\\url{"); + OUT(url, false, URL); + break; + case EMAIL_AUTOLINK: + LIT("\\href{"); + OUT(url, false, URL); + LIT("}\\nolinkurl{"); + break; + case NORMAL_LINK: + LIT("\\href{"); + OUT(url, false, URL); + LIT("}{"); + break; + case NO_LINK: + LIT("{"); // error? + } + } else { + LIT("}"); + } + + break; + + case CMARK_NODE_IMAGE: + if (entering) { + LIT("\\protect\\includegraphics{"); + // requires \include{graphicx} + OUT(cmark_node_get_url(node), false, URL); + LIT("}"); + return 0; + } + break; + + default: + assert(false); + break; + } + + return 1; +} + +char *cmark_render_latex(cmark_node *root, int options, int width) +{ + return cmark_render(root, options, width, outc, S_render_node); +} http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/libcmark.pc.in ---------------------------------------------------------------------- diff --git a/compiler/modules/CommonMark/src/libcmark.pc.in b/compiler/modules/CommonMark/src/libcmark.pc.in deleted file mode 100644 index 9c3a9a9..0000000 --- a/compiler/modules/CommonMark/src/libcmark.pc.in +++ /dev/null @@ -1,10 +0,0 @@ -prefix=@CMAKE_INSTALL_PREFIX@ -exec_prefix=@CMAKE_INSTALL_PREFIX@ -libdir=@CMAKE_INSTALL_PREFIX@/lib -includedir=@CMAKE_INSTALL_PREFIX@/include - -Name: libcmark -Description: CommonMark parsing, rendering, and manipulation -Version: @PROJECT_VERSION@ -Libs: -L${libdir} -lcmark -Cflags: -I${includedir} http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/man.c ---------------------------------------------------------------------- diff --git a/compiler/modules/CommonMark/src/man.c b/compiler/modules/CommonMark/src/man.c index 2c8a3a5..6ff33f5 100644 --- a/compiler/modules/CommonMark/src/man.c +++ b/compiler/modules/CommonMark/src/man.c @@ -7,72 +7,84 @@ #include "cmark.h" #include "node.h" #include "buffer.h" +#include "utf8.h" +#include "render.h" -// Functions to convert cmark_nodes to groff man strings. +#define OUT(s, wrap, escaping) renderer->out(renderer, s, wrap, escaping) +#define LIT(s) renderer->out(renderer, s, false, LITERAL) +#define CR() renderer->cr(renderer) +#define BLANKLINE() renderer->blankline(renderer) -static void escape_man(cmark_strbuf *dest, const unsigned char *source, int length) +// Functions to convert cmark_nodes to groff man strings. +static +void S_outc(cmark_renderer *renderer, + cmark_escaping escape, + int32_t c, + unsigned char nextc) { - int i; - unsigned char c; - - for (i = 0; i < length; i++) { - c = source[i]; - if (c == '.' && i == 0) { - cmark_strbuf_puts(dest, "\\&."); - } else if (c == '\'' && i == 0) { - cmark_strbuf_puts(dest, "\\&'"); - } else if (c == '-') { - cmark_strbuf_puts(dest, "\\-"); - } else if (c == '\\') { - cmark_strbuf_puts(dest, "\\e"); + (void)(nextc); + + if (escape == LITERAL) { + cmark_render_code_point(renderer, c); + return; + } + + switch(c) { + case 46: + if (renderer->begin_line) { + cmark_render_ascii(renderer, "\\&."); + } else { + cmark_render_code_point(renderer, c); + } + break; + case 39: + if (renderer->begin_line) { + cmark_render_ascii(renderer, "\\&'"); } else { - cmark_strbuf_putc(dest, source[i]); + cmark_render_code_point(renderer, c); } + break; + case 45: + cmark_render_ascii(renderer, "\\-"); + break; + case 92: + cmark_render_ascii(renderer, "\\e"); + break; + case 8216: // left single quote + cmark_render_ascii(renderer, "\\[oq]"); + break; + case 8217: // right single quote + cmark_render_ascii(renderer, "\\[cq]"); + break; + case 8220: // left double quote + cmark_render_ascii(renderer, "\\[lq]"); + break; + case 8221: // right double quote + cmark_render_ascii(renderer, "\\[rq]"); + break; + case 8212: // em dash + cmark_render_ascii(renderer, "\\[em]"); + break; + case 8211: // en dash + cmark_render_ascii(renderer, "\\[en]"); + break; + default: + cmark_render_code_point(renderer, c); } } -static inline void cr(cmark_strbuf *man) -{ - if (man->size && man->ptr[man->size - 1] != '\n') - cmark_strbuf_putc(man, '\n'); -} - -struct render_state { - cmark_strbuf* man; - cmark_node *plain; -}; - static int -S_render_node(cmark_node *node, cmark_event_type ev_type, - struct render_state *state) +S_render_node(cmark_renderer *renderer, + cmark_node *node, + cmark_event_type ev_type, + int options) { cmark_node *tmp; - cmark_strbuf *man = state->man; int list_number; bool entering = (ev_type == CMARK_EVENT_ENTER); - if (state->plain == node) { // back at original node - state->plain = NULL; - } - - if (state->plain != NULL) { - switch(node->type) { - case CMARK_NODE_TEXT: - case CMARK_NODE_CODE: - escape_man(man, node->as.literal.data, - node->as.literal.len); - break; - - case CMARK_NODE_LINEBREAK: - case CMARK_NODE_SOFTBREAK: - cmark_strbuf_putc(man, ' '); - break; - - default: - break; - } - return 1; - } + // avoid unused parameter error: + (void)(options); switch (node->type) { case CMARK_NODE_DOCUMENT: @@ -80,13 +92,13 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, case CMARK_NODE_BLOCK_QUOTE: if (entering) { - cr(man); - cmark_strbuf_puts(man, ".RS"); - cr(man); + CR(); + LIT(".RS"); + CR(); } else { - cr(man); - cmark_strbuf_puts(man, ".RE"); - cr(man); + CR(); + LIT(".RE"); + CR(); } break; @@ -95,11 +107,11 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, case CMARK_NODE_ITEM: if (entering) { - cr(man); - cmark_strbuf_puts(man, ".IP "); + CR(); + LIT(".IP "); if (cmark_node_get_list_type(node->parent) == CMARK_BULLET_LIST) { - cmark_strbuf_puts(man, "\\[bu] 2"); + LIT("\\[bu] 2"); } else { list_number = cmark_node_get_list_start(node->parent); tmp = node; @@ -107,43 +119,45 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, tmp = tmp->prev; list_number += 1; } - cmark_strbuf_printf(man, "\"%d.\" 4", list_number); + char list_number_s[20]; + sprintf(list_number_s, "\"%d.\" 4", list_number); + LIT(list_number_s); } - cr(man); + CR(); } else { - cr(man); + CR(); } break; case CMARK_NODE_HEADER: if (entering) { - cr(man); - cmark_strbuf_puts(man, - cmark_node_get_header_level(node) == 1 ? - ".SH" : ".SS"); - cr(man); + CR(); + LIT(cmark_node_get_header_level(node) == 1 ? + ".SH" : ".SS"); + CR(); } else { - cr(man); + CR(); } break; case CMARK_NODE_CODE_BLOCK: - cr(man); - cmark_strbuf_puts(man, ".IP\n.nf\n\\f[C]\n"); - escape_man(man, node->as.code.literal.data, - node->as.code.literal.len); - cr(man); - cmark_strbuf_puts(man, "\\f[]\n.fi"); - cr(man); + CR(); + LIT(".IP\n.nf\n\\f[C]\n"); + OUT(cmark_node_get_literal(node), + false, + NORMAL); + CR(); + LIT("\\f[]\n.fi"); + CR(); break; case CMARK_NODE_HTML: break; case CMARK_NODE_HRULE: - cr(man); - cmark_strbuf_puts(man, ".PP\n * * * * *"); - cr(man); + CR(); + LIT(".PP\n * * * * *"); + CR(); break; case CMARK_NODE_PARAGRAPH: @@ -154,32 +168,36 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, node->prev == NULL) { // no blank line or .PP } else { - cr(man); - cmark_strbuf_puts(man, ".PP\n"); + CR(); + LIT(".PP"); + CR(); } } else { - cr(man); + CR(); } break; case CMARK_NODE_TEXT: - escape_man(man, node->as.literal.data, - node->as.literal.len); + OUT(cmark_node_get_literal(node), true, NORMAL); break; case CMARK_NODE_LINEBREAK: - cmark_strbuf_puts(man, ".PD 0\n.P\n.PD"); - cr(man); + LIT(".PD 0\n.P\n.PD"); + CR(); break; case CMARK_NODE_SOFTBREAK: - cmark_strbuf_putc(man, '\n'); + if (renderer->width == 0) { + CR(); + } else { + OUT(" ", true, LITERAL); + } break; case CMARK_NODE_CODE: - cmark_strbuf_puts(man, "\\f[C]"); - escape_man(man, node->as.literal.data, node->as.literal.len); - cmark_strbuf_puts(man, "\\f[]"); + LIT("\\f[C]"); + OUT(cmark_node_get_literal(node), true, NORMAL); + LIT("\\f[]"); break; case CMARK_NODE_INLINE_HTML: @@ -187,33 +205,33 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, case CMARK_NODE_STRONG: if (entering) { - cmark_strbuf_puts(man, "\\f[B]"); + LIT("\\f[B]"); } else { - cmark_strbuf_puts(man, "\\f[]"); + LIT("\\f[]"); } break; case CMARK_NODE_EMPH: if (entering) { - cmark_strbuf_puts(man, "\\f[I]"); + LIT("\\f[I]"); } else { - cmark_strbuf_puts(man, "\\f[]"); + LIT("\\f[]"); } break; case CMARK_NODE_LINK: if (!entering) { - cmark_strbuf_printf(man, " (%s)", - cmark_node_get_url(node)); + LIT(" ("); + OUT(cmark_node_get_url(node), true, URL); + LIT(")"); } break; case CMARK_NODE_IMAGE: if (entering) { - cmark_strbuf_puts(man, "[IMAGE: "); - state->plain = node; + LIT("[IMAGE: "); } else { - cmark_strbuf_puts(man, "]"); + LIT("]"); } break; @@ -222,28 +240,10 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, break; } - // cmark_strbuf_putc(man, 'x'); return 1; } -char *cmark_render_man(cmark_node *root, long options) +char *cmark_render_man(cmark_node *root, int options, int width) { - char *result; - cmark_strbuf man = GH_BUF_INIT; - struct render_state state = { &man, NULL }; - cmark_node *cur; - cmark_event_type ev_type; - cmark_iter *iter = cmark_iter_new(root); - - if (options == 0) options = 0; // avoid warning about unused parameters - - while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) { - cur = cmark_iter_get_node(iter); - S_render_node(cur, ev_type, &state); - } - result = (char *)cmark_strbuf_detach(&man); - - cmark_iter_free(iter); - cmark_strbuf_free(&man); - return result; + return cmark_render(root, options, width, S_outc, S_render_node); } http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/node.c ---------------------------------------------------------------------- diff --git a/compiler/modules/CommonMark/src/node.c b/compiler/modules/CommonMark/src/node.c index 3785a27..7b1bb10 100644 --- a/compiler/modules/CommonMark/src/node.c +++ b/compiler/modules/CommonMark/src/node.c @@ -7,6 +7,73 @@ static void S_node_unlink(cmark_node *node); +static inline bool +S_is_block(cmark_node *node) +{ + if (node == NULL) { + return false; + } + return node->type >= CMARK_NODE_FIRST_BLOCK + && node->type <= CMARK_NODE_LAST_BLOCK; +} + +static inline bool +S_is_inline(cmark_node *node) +{ + if (node == NULL) { + return false; + } + return node->type >= CMARK_NODE_FIRST_INLINE + && node->type <= CMARK_NODE_LAST_INLINE; +} + +static bool +S_can_contain(cmark_node *node, cmark_node *child) +{ + cmark_node *cur; + + if (node == NULL || child == NULL) { + return false; + } + + // Verify that child is not an ancestor of node or equal to node. + cur = node; + do { + if (cur == child) { + return false; + } + cur = cur->parent; + } while (cur != NULL); + + if (child->type == CMARK_NODE_DOCUMENT) { + return false; + } + + switch (node->type) { + case CMARK_NODE_DOCUMENT: + case CMARK_NODE_BLOCK_QUOTE: + case CMARK_NODE_ITEM: + return S_is_block(child) + && child->type != CMARK_NODE_ITEM; + + case CMARK_NODE_LIST: + return child->type == CMARK_NODE_ITEM; + + case CMARK_NODE_PARAGRAPH: + case CMARK_NODE_HEADER: + case CMARK_NODE_EMPH: + case CMARK_NODE_STRONG: + case CMARK_NODE_LINK: + case CMARK_NODE_IMAGE: + return S_is_inline(child); + + default: + break; + } + + return false; +} + cmark_node* cmark_node_new(cmark_node_type type) { @@ -39,7 +106,9 @@ void S_free_nodes(cmark_node *e) { cmark_node *next; while (e != NULL) { - cmark_strbuf_free(&e->string_content); + if (S_is_block(e)) { + cmark_strbuf_free(&e->string_content); + } switch (e->type) { case NODE_CODE_BLOCK: cmark_chunk_free(&e->as.code.info); @@ -53,8 +122,8 @@ void S_free_nodes(cmark_node *e) break; case NODE_LINK: case NODE_IMAGE: - free(e->as.link.url); - free(e->as.link.title); + cmark_chunk_free(&e->as.link.url); + cmark_chunk_free(&e->as.link.title); break; default: break; @@ -189,13 +258,24 @@ cmark_node_last_child(cmark_node *node) } } -static char* -S_strdup(const char *str) +void* +cmark_node_get_user_data(cmark_node *node) { - size_t size = strlen(str) + 1; - char *dup = (char *)malloc(size); - memcpy(dup, str, size); - return dup; + if (node == NULL) { + return NULL; + } else { + return node->user_data; + } +} + +int +cmark_node_set_user_data(cmark_node *node, void *user_data) +{ + if (node == NULL) { + return 0; + } + node->user_data = user_data; + return 1; } const char* @@ -448,7 +528,7 @@ cmark_node_get_url(cmark_node *node) switch (node->type) { case NODE_LINK: case NODE_IMAGE: - return (char *)node->as.link.url; + return cmark_chunk_to_cstr(&node->as.link.url); default: break; } @@ -466,8 +546,7 @@ cmark_node_set_url(cmark_node *node, const char *url) switch (node->type) { case NODE_LINK: case NODE_IMAGE: - free(node->as.link.url); - node->as.link.url = (unsigned char *)S_strdup(url); + cmark_chunk_set_cstr(&node->as.link.url, url); return 1; default: break; @@ -486,7 +565,7 @@ cmark_node_get_title(cmark_node *node) switch (node->type) { case NODE_LINK: case NODE_IMAGE: - return (char *)node->as.link.title; + return cmark_chunk_to_cstr(&node->as.link.title); default: break; } @@ -504,8 +583,7 @@ cmark_node_set_title(cmark_node *node, const char *title) switch (node->type) { case NODE_LINK: case NODE_IMAGE: - free(node->as.link.title); - node->as.link.title = (unsigned char *)S_strdup(title); + cmark_chunk_set_cstr(&node->as.link.title, title); return 1; default: break; @@ -550,73 +628,6 @@ cmark_node_get_end_column(cmark_node *node) return node->end_column; } -static inline bool -S_is_block(cmark_node *node) -{ - if (node == NULL) { - return false; - } - return node->type >= CMARK_NODE_FIRST_BLOCK - && node->type <= CMARK_NODE_LAST_BLOCK; -} - -static inline bool -S_is_inline(cmark_node *node) -{ - if (node == NULL) { - return false; - } - return node->type >= CMARK_NODE_FIRST_INLINE - && node->type <= CMARK_NODE_LAST_INLINE; -} - -static bool -S_can_contain(cmark_node *node, cmark_node *child) -{ - cmark_node *cur; - - if (node == NULL || child == NULL) { - return false; - } - - // Verify that child is not an ancestor of node or equal to node. - cur = node; - do { - if (cur == child) { - return false; - } - cur = cur->parent; - } while (cur != NULL); - - if (child->type == CMARK_NODE_DOCUMENT) { - return false; - } - - switch (node->type) { - case CMARK_NODE_DOCUMENT: - case CMARK_NODE_BLOCK_QUOTE: - case CMARK_NODE_ITEM: - return S_is_block(child) - && child->type != CMARK_NODE_ITEM; - - case CMARK_NODE_LIST: - return child->type == CMARK_NODE_ITEM; - - case CMARK_NODE_PARAGRAPH: - case CMARK_NODE_HEADER: - case CMARK_NODE_EMPH: - case CMARK_NODE_STRONG: - case CMARK_NODE_LINK: - case CMARK_NODE_IMAGE: - return S_is_inline(child); - - default: - break; - } - - return false; -} - // Unlink a node without adjusting its next, prev, and parent pointers. static void S_node_unlink(cmark_node *node) http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/node.h ---------------------------------------------------------------------- diff --git a/compiler/modules/CommonMark/src/node.h b/compiler/modules/CommonMark/src/node.h index c0c43d3..b579408 100644 --- a/compiler/modules/CommonMark/src/node.h +++ b/compiler/modules/CommonMark/src/node.h @@ -6,6 +6,7 @@ extern "C" { #endif #include <stdio.h> +#include <stdint.h> #include "cmark.h" #include "buffer.h" @@ -22,12 +23,13 @@ typedef struct { } cmark_list; typedef struct { - bool fenced; - int fence_length; - int fence_offset; - unsigned char fence_char; cmark_chunk info; cmark_chunk literal; + int fence_length; + /* fence_offset must be 0-3, so we can use int8_t */ + int8_t fence_offset; + unsigned char fence_char; + bool fenced; } cmark_code; typedef struct { @@ -36,23 +38,26 @@ typedef struct { } cmark_header; typedef struct { - unsigned char *url; - unsigned char *title; + cmark_chunk url; + cmark_chunk title; } cmark_link; struct cmark_node { - cmark_node_type type; - struct cmark_node *next; struct cmark_node *prev; struct cmark_node *parent; struct cmark_node *first_child; struct cmark_node *last_child; + void *user_data; + int start_line; int start_column; int end_line; int end_column; + + cmark_node_type type; + bool open; bool last_line_blank; @@ -64,6 +69,7 @@ struct cmark_node { cmark_code code; cmark_header header; cmark_link link; + int html_block_type; } as; }; http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/parser.h ---------------------------------------------------------------------- diff --git a/compiler/modules/CommonMark/src/parser.h b/compiler/modules/CommonMark/src/parser.h index 3c8def9..01a7aeb 100644 --- a/compiler/modules/CommonMark/src/parser.h +++ b/compiler/modules/CommonMark/src/parser.h @@ -16,9 +16,16 @@ struct cmark_parser { struct cmark_node* root; struct cmark_node* current; int line_number; + bufsize_t offset; + bufsize_t column; + bufsize_t first_nonspace; + bufsize_t first_nonspace_column; + int indent; + bool blank; cmark_strbuf *curline; - int last_line_length; + bufsize_t last_line_length; cmark_strbuf *linebuf; + int options; }; #ifdef __cplusplus http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/references.c ---------------------------------------------------------------------- diff --git a/compiler/modules/CommonMark/src/references.c b/compiler/modules/CommonMark/src/references.c index 37bf4cb..1d3d56d 100644 --- a/compiler/modules/CommonMark/src/references.c +++ b/compiler/modules/CommonMark/src/references.c @@ -20,8 +20,8 @@ static void reference_free(cmark_reference *ref) { if(ref != NULL) { free(ref->label); - free(ref->url); - free(ref->title); + cmark_chunk_free(&ref->url); + cmark_chunk_free(&ref->title); free(ref); } } http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/references.h ---------------------------------------------------------------------- diff --git a/compiler/modules/CommonMark/src/references.h b/compiler/modules/CommonMark/src/references.h index 69325bb..a360cd5 100644 --- a/compiler/modules/CommonMark/src/references.h +++ b/compiler/modules/CommonMark/src/references.h @@ -12,8 +12,8 @@ extern "C" { struct cmark_reference { struct cmark_reference *next; unsigned char *label; - unsigned char *url; - unsigned char *title; + cmark_chunk url; + cmark_chunk title; unsigned int hash; }; http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/render.c ---------------------------------------------------------------------- diff --git a/compiler/modules/CommonMark/src/render.c b/compiler/modules/CommonMark/src/render.c new file mode 100644 index 0000000..2f1faac --- /dev/null +++ b/compiler/modules/CommonMark/src/render.c @@ -0,0 +1,186 @@ +#include <stdlib.h> +#include "buffer.h" +#include "chunk.h" +#include "cmark.h" +#include "utf8.h" +#include "render.h" + +static inline +void S_cr(cmark_renderer *renderer) +{ + if (renderer->need_cr < 1) { + renderer->need_cr = 1; + } +} + +static inline +void S_blankline(cmark_renderer *renderer) +{ + if (renderer->need_cr < 2) { + renderer->need_cr = 2; + } +} + +static +void S_out(cmark_renderer *renderer, + const char *source, + bool wrap, + cmark_escaping escape) +{ + int length = cmark_strbuf_safe_strlen(source); + unsigned char nextc; + int32_t c; + int i = 0; + int len; + cmark_chunk remainder = cmark_chunk_literal(""); + int k = renderer->buffer->size - 1; + + wrap = wrap && !renderer->no_wrap; + + if (renderer->in_tight_list_item && renderer->need_cr > 1) { + renderer->need_cr = 1; + } + while (renderer->need_cr) { + if (k < 0 || renderer->buffer->ptr[k] == '\n') { + k -= 1; + } else { + cmark_strbuf_putc(renderer->buffer, '\n'); + if (renderer->need_cr > 1) { + cmark_strbuf_put(renderer->buffer, renderer->prefix->ptr, + renderer->prefix->size); + } + } + renderer->column = 0; + renderer->begin_line = true; + renderer->need_cr -= 1; + } + + while (i < length) { + if (renderer->begin_line) { + cmark_strbuf_put(renderer->buffer, renderer->prefix->ptr, + renderer->prefix->size); + // note: this assumes prefix is ascii: + renderer->column = renderer->prefix->size; + } + + len = utf8proc_iterate((const uint8_t *)source + i, length - i, &c); + if (len == -1) { // error condition + return; // return without rendering rest of string + } + nextc = source[i + len]; + if (c == 32 && wrap) { + if (!renderer->begin_line) { + cmark_strbuf_putc(renderer->buffer, ' '); + renderer->column += 1; + renderer->begin_line = false; + renderer->last_breakable = renderer->buffer->size - + 1; + // skip following spaces + while (source[i + 1] == ' ') { + i++; + } + } + + } else if (c == 10) { + cmark_strbuf_putc(renderer->buffer, '\n'); + renderer->column = 0; + renderer->begin_line = true; + renderer->last_breakable = 0; + } else if (escape == LITERAL) { + cmark_render_code_point(renderer, c); + renderer->begin_line = false; + } else { + (renderer->outc)(renderer, escape, c, nextc); + renderer->begin_line = false; + } + + // If adding the character went beyond width, look for an + // earlier place where the line could be broken: + if (renderer->width > 0 && + renderer->column > renderer->width && + !renderer->begin_line && + renderer->last_breakable > 0) { + + // copy from last_breakable to remainder + cmark_chunk_set_cstr(&remainder, (char *) renderer->buffer->ptr + renderer->last_breakable + 1); + // truncate at last_breakable + cmark_strbuf_truncate(renderer->buffer, renderer->last_breakable); + // add newline, prefix, and remainder + cmark_strbuf_putc(renderer->buffer, '\n'); + cmark_strbuf_put(renderer->buffer, renderer->prefix->ptr, + renderer->prefix->size); + cmark_strbuf_put(renderer->buffer, remainder.data, remainder.len); + renderer->column = renderer->prefix->size + remainder.len; + cmark_chunk_free(&remainder); + renderer->last_breakable = 0; + renderer->begin_line = false; + } + + i += len; + } +} + +// Assumes no newlines, assumes ascii content: +void +cmark_render_ascii(cmark_renderer* renderer, const char* s) +{ + int origsize = renderer->buffer->size; + cmark_strbuf_puts(renderer->buffer, s); + renderer->column += renderer->buffer->size - origsize; +} + +void +cmark_render_code_point(cmark_renderer *renderer, uint32_t c) +{ + utf8proc_encode_char(c, renderer->buffer); + renderer->column += 1; +} + +char* +cmark_render(cmark_node *root, + int options, + int width, + void (*outc)(cmark_renderer*, + cmark_escaping, + int32_t, + unsigned char), + int (*render_node)(cmark_renderer *renderer, + cmark_node *node, + cmark_event_type ev_type, + int options)) +{ + cmark_strbuf pref = GH_BUF_INIT; + cmark_strbuf buf = GH_BUF_INIT; + cmark_node *cur; + cmark_event_type ev_type; + char *result; + cmark_iter *iter = cmark_iter_new(root); + + cmark_renderer renderer = { &buf, &pref, 0, width, + 0, 0, true, false, false, + outc, S_cr, S_blankline, S_out + }; + + while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) { + cur = cmark_iter_get_node(iter); + if (!render_node(&renderer, cur, ev_type, options)) { + // a false value causes us to skip processing + // the node's contents. this is used for + // autolinks. + cmark_iter_reset(iter, cur, CMARK_EVENT_EXIT); + } + } + + // ensure final newline + if (renderer.buffer->ptr[renderer.buffer->size - 1] != '\n') { + cmark_strbuf_putc(renderer.buffer, '\n'); + } + + result = (char *)cmark_strbuf_detach(renderer.buffer); + + cmark_iter_free(iter); + cmark_strbuf_free(renderer.prefix); + cmark_strbuf_free(renderer.buffer); + + return result; +} http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/render.h ---------------------------------------------------------------------- diff --git a/compiler/modules/CommonMark/src/render.h b/compiler/modules/CommonMark/src/render.h new file mode 100644 index 0000000..ca541bc --- /dev/null +++ b/compiler/modules/CommonMark/src/render.h @@ -0,0 +1,66 @@ +#ifndef CMARK_RENDER_H +#define CMARK_RENDER_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdlib.h> +#include "buffer.h" +#include "chunk.h" + +typedef enum { + LITERAL, + NORMAL, + TITLE, + URL +} cmark_escaping; + +struct cmark_renderer { + cmark_strbuf* buffer; + cmark_strbuf* prefix; + int column; + int width; + int need_cr; + bufsize_t last_breakable; + bool begin_line; + bool no_wrap; + bool in_tight_list_item; + void (*outc)(struct cmark_renderer*, + cmark_escaping, + int32_t, + unsigned char); + void (*cr)(struct cmark_renderer*); + void (*blankline)(struct cmark_renderer*); + void (*out)(struct cmark_renderer*, + const char *, + bool, + cmark_escaping); +}; + +typedef struct cmark_renderer cmark_renderer; + +void +cmark_render_ascii(cmark_renderer *renderer, const char* s); + +void +cmark_render_code_point(cmark_renderer *renderer, uint32_t c); + +char* +cmark_render(cmark_node *root, + int options, + int width, + void (*outc)(cmark_renderer*, + cmark_escaping, + int32_t, + unsigned char), + int (*render_node)(cmark_renderer *renderer, + cmark_node *node, + cmark_event_type ev_type, + int options)); + +#ifdef __cplusplus +} +#endif + +#endif
