diff options
Diffstat (limited to 'src/blocks.c')
-rw-r--r-- | src/blocks.c | 747 |
1 files changed, 747 insertions, 0 deletions
diff --git a/src/blocks.c b/src/blocks.c new file mode 100644 index 0000000..2776231 --- /dev/null +++ b/src/blocks.c @@ -0,0 +1,747 @@ +#include <stdlib.h> +#include <stdio.h> +#include <stdbool.h> +#include <ctype.h> +#include "bstrlib.h" +#include "stmd.h" +#include "uthash.h" +#include "debug.h" +#include "scanners.h" + +static block* make_block(int tag, int start_line, int start_column) +{ + block* e; + e = (block*) malloc(sizeof(block)); + e->tag = tag; + e->open = true; + e->last_line_blank = false; + e->start_line = start_line; + e->start_column = start_column; + e->end_line = start_line; + e->children = NULL; + e->last_child = NULL; + e->parent = NULL; + e->top = NULL; + e->attributes.refmap = NULL; + e->string_content = bfromcstr(""); + e->inline_content = NULL; + e->next = NULL; + e->prev = NULL; + return e; +} + +// Create a root document block. +extern block* make_document() +{ + block * e = make_block(document, 1, 1); + reference * map = NULL; + reference ** refmap; + refmap = (reference**) malloc(sizeof(reference*)); + *refmap = map; + e->attributes.refmap = refmap; + e->top = e; + return e; +} + +// Returns true if line has only space characters, else false. +bool is_blank(bstring s, int offset) +{ + char c; + while ((c = bchar(s, offset))) { + if (c == '\n') { + return true; + } else if (c == ' ') { + offset++; + } else { + return false; + } + } + return true; +} + +static inline bool can_contain(int parent_type, int child_type) +{ + return ( parent_type == document || + parent_type == block_quote || + parent_type == list_item || + (parent_type == list && child_type == list_item) ); +} + +static inline bool accepts_lines(int block_type) +{ + return (block_type == paragraph || + block_type == atx_header || + block_type == indented_code || + block_type == fenced_code); +} + +static int add_line(block* block, bstring ln, int offset) +{ + bstring s = bmidstr(ln, offset, blength(ln) - offset); + check(block->open, "attempted to add line (%s) to closed container (%d)", + ln->data, block->tag); + check(bformata(block->string_content, "%s", s->data) == 0, + "could not append line to string_content"); + bdestroy(s); + return 0; + error: + return -1; +} + +static int remove_trailing_blank_lines(bstring ln) +{ + bstring tofind = bfromcstr(" \t\r\n"); + int pos; + // find last nonspace: + pos = bninchrr(ln, blength(ln) - 1, tofind); + if (pos == BSTR_ERR) { // all spaces + bassigncstr(ln, ""); + } else { + // find next newline after it + pos = bstrchrp(ln, '\n', pos); + if (pos != BSTR_ERR) { + check(bdelete(ln, pos, blength(ln) - pos) != BSTR_ERR, + "failed to delete trailing blank lines"); + } + } + bdestroy(tofind); + return 0; + error: + return -1; +} + +// Check to see if a block ends with a blank line, descending +// if needed into lists and sublists. +static bool ends_with_blank_line(block* block) +{ + if (block->last_line_blank) { + return true; + } + if ((block->tag == list || block->tag == list_item) && block->last_child) { + return ends_with_blank_line(block->last_child); + } else { + return false; + } +} + +// Break out of all containing lists +static int break_out_of_lists(block ** bptr, int line_number) +{ + block * container = *bptr; + block * b = container->top; + // find first containing list: + while (b && b->tag != list) { + b = b->last_child; + } + if (b) { + while (container && container != b) { + finalize(container, line_number); + container = container->parent; + } + finalize(b, line_number); + *bptr = b->parent; + } + return 0; +} + + +extern int finalize(block* b, int line_number) +{ + int firstlinelen; + int pos; + block* item; + block* subitem; + + check(b != NULL, "finalize called on null block"); + if (!b->open) { + return 0; // don't do anything if the block is already closed + } + b->open = false; + if (line_number > b->start_line) { + b->end_line = line_number - 1; + } else { + b->end_line = line_number; + } + + switch (b->tag) { + + case paragraph: + pos = 0; + while (bchar(b->string_content, 0) == '[' && + (pos = parse_reference(b->string_content, + b->top->attributes.refmap))) { + bdelete(b->string_content, 0, pos); + } + if (is_blank(b->string_content, 0)) { + b->tag = reference_def; + } + break; + + case indented_code: + remove_trailing_blank_lines(b->string_content); + bformata(b->string_content, "\n"); + break; + + case fenced_code: + // first line of contents becomes info + firstlinelen = bstrchr(b->string_content, '\n'); + b->attributes.fenced_code_data.info = + bmidstr(b->string_content, 0, firstlinelen); + bdelete(b->string_content, 0, firstlinelen + 1); // +1 for \n + btrimws(b->attributes.fenced_code_data.info); + unescape(b->attributes.fenced_code_data.info); + break; + + case list: // determine tight/loose status + b->attributes.list_data.tight = true; // tight by default + item = b->children; + + while (item) { + // check for non-final non-empty list item ending with blank line: + if (item->last_line_blank && item->next) { + b->attributes.list_data.tight = false; + break; + } + // recurse into children of list item, to see if there are + // spaces between them: + subitem = item->children; + while (subitem) { + if (ends_with_blank_line(subitem) && + (item->next || subitem->next)) { + b->attributes.list_data.tight = false; + break; + } + subitem = subitem->next; + } + if (!(b->attributes.list_data.tight)) { + break; + } + item = item->next; + } + + break; + + default: + break; + } + + return 0; + error: + return -1; +} + +// Add a block as child of another. Return pointer to child. +extern block* add_child(block* parent, + int block_type, int start_line, int start_column) +{ + // if 'parent' isn't the kind of block that can accept this child, + // then back up til we hit a block that can. + while (!can_contain(parent->tag, block_type)) { + finalize(parent, start_line); + parent = parent->parent; + } + + check(parent != NULL, "parent container cannot accept children"); + + block* child = make_block(block_type, start_line, start_column); + child->parent = parent; + child->top = parent->top; + + if (parent->last_child) { + parent->last_child->next = child; + child->prev = parent->last_child; + } else { + parent->children = child; + child->prev = NULL; + } + parent->last_child = child; + return child; + error: + return NULL; +} + +// Free a block list and any children. +extern void free_blocks(block* e) +{ + block * next; + while (e != NULL) { + next = e->next; + free_inlines(e->inline_content); + bdestroy(e->string_content); + if (e->tag == fenced_code) { + bdestroy(e->attributes.fenced_code_data.info); + } else if (e->tag == document) { + free_reference_map(e->attributes.refmap); + } + free_blocks(e->children); + free(e); + e = next; + } +} + +// Walk through block and all children, recursively, parsing +// string content into inline content where appropriate. +int process_inlines(block* cur, reference** refmap) +{ + switch (cur->tag) { + + case paragraph: + case atx_header: + case setext_header: + check(cur->string_content != NULL, "string_content is NULL"); + cur->inline_content = parse_inlines(cur->string_content, refmap); + bdestroy(cur->string_content); + cur->string_content = NULL; + break; + + default: + break; + } + + block * child = cur->children; + while (child != NULL) { + process_inlines(child, refmap); + child = child->next; + } + + return 0; + error: + return -1; +} + +// Attempts to parse a list item marker (bullet or enumerated). +// On success, returns length of the marker, and populates +// data with the details. On failure, returns 0. +static int parse_list_marker(bstring ln, int pos, + struct ListData ** dataptr) +{ + char c; + int startpos; + int start = 1; + struct ListData * data; + + startpos = pos; + c = bchar(ln, pos); + + if ((c == '*' || c == '-' || c == '+') && !scan_hrule(ln, pos)) { + pos++; + if (!isspace(bchar(ln, pos))) { + return 0; + } + data = malloc(sizeof(struct ListData)); + data->marker_offset = 0; // will be adjusted later + data->list_type = bullet; + data->bullet_char = c; + data->start = 1; + data->delimiter = period; + data->tight = false; + + } else if (isdigit(c)) { + + pos++; + while (isdigit(bchar(ln, pos))) { + pos++; + } + + if (!sscanf((char *) ln->data + startpos, "%d", &start)) { + log_err("sscanf failed"); + return 0; + } + + c = bchar(ln, pos); + if (c == '.' || c == ')') { + pos++; + if (!isspace(bchar(ln, pos))) { + return 0; + } + data = malloc(sizeof(struct ListData)); + data->marker_offset = 0; // will be adjusted later + data->list_type = ordered; + data->bullet_char = 0; + data->start = start; + data->delimiter = (c == '.' ? period : parens); + data->tight = false; + } else { + return 0; + } + + } else { + return 0; + } + + *dataptr = data; + return (pos - startpos); +} + +// Return 1 if list item belongs in list, else 0. +static int lists_match(struct ListData list_data, + struct ListData item_data) +{ + return (list_data.list_type == item_data.list_type && + list_data.delimiter == item_data.delimiter && + // list_data.marker_offset == item_data.marker_offset && + list_data.bullet_char == item_data.bullet_char); +} + +// Process one line at a time, modifying a block. +// Returns 0 if successful. curptr is changed to point to +// the currently open block. +extern int incorporate_line(bstring ln, int line_number, block** curptr) +{ + block* last_matched_container; + int offset = 0; + int matched = 0; + int lev = 0; + int i; + struct ListData * data = NULL; + bool all_matched = true; + block* container; + block* cur = *curptr; + bool blank = false; + int first_nonspace; + int indent; + + // detab input line + check(bdetab(ln, 1) != BSTR_ERR, + "invalid UTF-8 sequence in line %d\n", line_number); + + // container starts at the document root. + container = cur->top; + + // for each containing block, try to parse the associated line start. + // bail out on failure: container will point to the last matching block. + + while (container->last_child && container->last_child->open) { + container = container->last_child; + + first_nonspace = offset; + while (bchar(ln, first_nonspace) == ' ') { + first_nonspace++; + } + + indent = first_nonspace - offset; + blank = bchar(ln, first_nonspace) == '\n'; + + if (container->tag == block_quote) { + + matched = indent <= 3 && bchar(ln, first_nonspace) == '>'; + if (matched) { + offset = first_nonspace + 1; + if (bchar(ln, offset) == ' ') { + offset++; + } + } else { + all_matched = false; + } + + } else if (container->tag == list_item) { + + if (indent >= container->attributes.list_data.marker_offset + + container->attributes.list_data.padding) { + offset += container->attributes.list_data.marker_offset + + container->attributes.list_data.padding; + } else if (blank) { + offset = first_nonspace; + } else { + all_matched = false; + } + + } else if (container->tag == indented_code) { + + if (indent >= CODE_INDENT) { + offset += CODE_INDENT; + } else if (blank) { + offset = first_nonspace; + } else { + all_matched = false; + } + + } else if (container->tag == atx_header || + container->tag == setext_header) { + + // a header can never contain more than one line + all_matched = false; + + } else if (container->tag == fenced_code) { + + // skip optional spaces of fence offset + i = container->attributes.fenced_code_data.fence_offset; + while (i > 0 && bchar(ln, offset) == ' ') { + offset++; + i--; + } + + } else if (container->tag == html_block) { + + if (blank) { + all_matched = false; + } + + } else if (container->tag == paragraph) { + + if (blank) { + container->last_line_blank =true; + all_matched = false; + } + + } + + if (!all_matched) { + container = container->parent; // back up to last matching block + break; + } + } + + last_matched_container = container; + + // check to see if we've hit 2nd blank line, break out of list: + if (blank && container->last_line_blank) { + break_out_of_lists(&container, line_number); + } + + // unless last matched container is code block, try new container starts: + while (container->tag != fenced_code && container->tag != indented_code && + container->tag != html_block) { + + first_nonspace = offset; + while (bchar(ln, first_nonspace) == ' ') { + first_nonspace++; + } + + indent = first_nonspace - offset; + blank = bchar(ln, first_nonspace) == '\n'; + + if (indent >= CODE_INDENT) { + + if (cur->tag != paragraph && !blank) { + offset += CODE_INDENT; + container = add_child(container, indented_code, line_number, offset + 1); + } else { // indent > 4 in lazy line + break; + } + + } else if (bchar(ln, first_nonspace) == '>') { + + offset = first_nonspace + 1; + // optional following character + if (bchar(ln, offset) == ' ') { + offset++; + } + container = add_child(container, block_quote, line_number, offset + 1); + + } else if ((matched = scan_atx_header_start(ln, first_nonspace))) { + + offset = first_nonspace + matched; + container = add_child(container, atx_header, line_number, offset + 1); + int hashpos = bstrchrp(ln, '#', first_nonspace); + check(hashpos != BSTR_ERR, "no # found in atx header start"); + int level = 0; + while (bchar(ln, hashpos) == '#') { + level++; + hashpos++; + } + container->attributes.header_level = level; + + } else if ((matched = scan_open_code_fence(ln, first_nonspace))) { + + container = add_child(container, fenced_code, line_number, + first_nonspace + 1); + container->attributes.fenced_code_data.fence_char = bchar(ln, + first_nonspace); + container->attributes.fenced_code_data.fence_length = matched; + container->attributes.fenced_code_data.fence_offset = + first_nonspace - offset; + offset = first_nonspace + matched; + + } else if ((matched = scan_html_block_tag(ln, first_nonspace))) { + + container = add_child(container, html_block, line_number, + first_nonspace + 1); + // note, we don't adjust offset because the tag is part of the text + + } else if (container->tag == paragraph && + (lev = scan_setext_header_line(ln, first_nonspace)) && + // check that there is only one line in the paragraph: + bstrrchrp(container->string_content, '\n', + blength(container->string_content) - 2) == BSTR_ERR) { + + container->tag = setext_header; + container->attributes.header_level = lev; + offset = blength(ln) - 1; + + } else if (!(container->tag == paragraph && !all_matched) && + (matched = scan_hrule(ln, first_nonspace))) { + + // it's only now that we know the line is not part of a setext header: + container = add_child(container, hrule, line_number, first_nonspace + 1); + finalize(container, line_number); + container = container->parent; + offset = blength(ln) - 1; + + } else if ((matched = parse_list_marker(ln, first_nonspace, &data))) { + + // compute padding: + offset = first_nonspace + matched; + i = 0; + while (i <= 5 && bchar(ln, offset + i) == ' ') { + i++; + } + // i = number of spaces after marker, up to 5 + if (i >= 5 || i < 1 || bchar(ln, offset) == '\n') { + data->padding = matched + 1; + if (i > 0) { + offset += 1; + } + } else { + data->padding = matched + i; + offset += i; + } + + // check container; if it's a list, see if this list item + // can continue the list; otherwise, create a list container. + + data->marker_offset = indent; + + if (container->tag != list || + !lists_match(container->attributes.list_data, *data)) { + container = add_child(container, list, line_number, + first_nonspace + 1); + container->attributes.list_data = *data; + } + + // add the list item + container = add_child(container, list_item, line_number, + first_nonspace + 1); + container->attributes.list_data = *data; + free(data); + + } else { + break; + } + + if (accepts_lines(container->tag)) { + // if it's a line container, it can't contain other containers + break; + } + } + + // what remains at offset is a text line. add the text to the + // appropriate container. + + first_nonspace = offset; + while (bchar(ln, first_nonspace) == ' ') { + first_nonspace++; + } + + indent = first_nonspace - offset; + blank = bchar(ln, first_nonspace) == '\n'; + + // block quote lines are never blank as they start with > + // and we don't count blanks in fenced code for purposes of tight/loose + // lists or breaking out of lists. we also don't set last_line_blank + // on an empty list item. + container->last_line_blank = (blank && + container->tag != block_quote && + container->tag != fenced_code && + !(container->tag == list_item && + container->children == NULL && + container->start_line == line_number)); + + block *cont = container; + while (cont->parent) { + cont->parent->last_line_blank = false; + cont = cont->parent; + } + + if (cur != last_matched_container && + container == last_matched_container && + !blank && + cur->tag == paragraph && + blength(cur->string_content) > 0) { + + check(add_line(cur, ln, offset) == 0, "could not add line"); + + } else { // not a lazy continuation + + // finalize any blocks that were not matched and set cur to container: + while (cur != last_matched_container) { + + finalize(cur, line_number); + cur = cur->parent; + check(cur != NULL, "cur is NULL, last_matched_container->tag = %d", + last_matched_container->tag); + + } + + if (container->tag == indented_code) { + + check(add_line(container, ln, offset) == 0, "could not add line"); + + } else if (container->tag == fenced_code) { + + matched = (indent <= 3 + && bchar(ln, first_nonspace) == container->attributes.fenced_code_data.fence_char) + && scan_close_code_fence(ln, first_nonspace, + container->attributes.fenced_code_data.fence_length); + if (matched) { + // if closing fence, don't add line to container; instead, close it: + finalize(container, line_number); + container = container->parent; // back up to parent + } else { + check(add_line(container, ln, offset) == 0, "could not add line"); + } + + } else if (container->tag == html_block) { + + check(add_line(container, ln, offset) == 0, "could not add line"); + + } else if (blank) { + + // ??? do nothing + + } else if (container->tag == atx_header) { + + // chop off trailing ###s...use a scanner? + brtrimws(ln); + int p = blength(ln) - 1; + int numhashes = 0; + // if string ends in #s, remove these: + while (bchar(ln, p) == '#') { + p--; + numhashes++; + } + if (bchar(ln, p) == '\\') { + // the last # was escaped, so we include it. + p++; + numhashes--; + } + check(bdelete(ln, p + 1, numhashes) != BSTR_ERR, + "could not delete final hashes"); + check(add_line(container, ln, first_nonspace) == 0, "could not add line"); + finalize(container, line_number); + container = container->parent; + + } else if (accepts_lines(container->tag)) { + + check(add_line(container, ln, first_nonspace) == 0, "could not add line"); + + } else if (container->tag != hrule && container->tag != setext_header) { + + // create paragraph container for line + container = add_child(container, paragraph, line_number, first_nonspace + 1); + check(add_line(container, ln, first_nonspace) == 0, "could not add line"); + + } else { + + log_warn("Line %d with container type %d did not match any condition:\n\"%s\"", + line_number, container->tag, ln->data); + + } + *curptr = container; + } + + return 0; + error: + return -1; +} + |