aboutsummaryrefslogtreecommitdiff
path: root/src/blocks.c
blob: 72b2dc2b8247d0b61651faf4a11c9d2b42d7e304 (plain)
  1. #include <stdlib.h>
  2. #include <assert.h>
  3. #include <stdio.h>
  4. #include <stdbool.h>
  5. #include <ctype.h>
  6. #include "stmd.h"
  7. #include "utf8.h"
  8. #include "html/houdini.h"
  9. #include "scanners.h"
  10. #include "uthash.h"
  11. #define peek_at(i, n) (i)->data[n]
  12. static void incorporate_line(strbuf *ln, int line_number, node_block** curptr);
  13. static void finalize(node_block* b, int line_number);
  14. static node_block* make_block(int tag, int start_line, int start_column)
  15. {
  16. node_block* e;
  17. e = malloc(sizeof(node_block));
  18. memset(e, 0x0, sizeof(*e));
  19. e->tag = tag;
  20. e->open = true;
  21. e->start_line = start_line;
  22. e->start_column = start_column;
  23. e->end_line = start_line;
  24. strbuf_init(&e->string_content, 32);
  25. return e;
  26. }
  27. // Create a root document node_block.
  28. extern node_block* make_document()
  29. {
  30. node_block *e = make_block(BLOCK_DOCUMENT, 1, 1);
  31. reference *map = NULL;
  32. reference ** refmap;
  33. refmap = (reference**) malloc(sizeof(reference*));
  34. *refmap = map;
  35. e->as.document.refmap = refmap;
  36. e->top = e;
  37. return e;
  38. }
  39. // Returns true if line has only space characters, else false.
  40. bool is_blank(strbuf *s, int offset)
  41. {
  42. while (offset < s->size) {
  43. switch (s->ptr[offset]) {
  44. case '\n':
  45. return true;
  46. case ' ':
  47. offset++;
  48. break;
  49. default:
  50. return false;
  51. }
  52. }
  53. return true;
  54. }
  55. static inline bool can_contain(int parent_type, int child_type)
  56. {
  57. return ( parent_type == BLOCK_DOCUMENT ||
  58. parent_type == BLOCK_BQUOTE ||
  59. parent_type == BLOCK_LIST_ITEM ||
  60. (parent_type == BLOCK_LIST && child_type == BLOCK_LIST_ITEM) );
  61. }
  62. static inline bool accepts_lines(int block_type)
  63. {
  64. return (block_type == BLOCK_PARAGRAPH ||
  65. block_type == BLOCK_ATX_HEADER ||
  66. block_type == BLOCK_INDENTED_CODE ||
  67. block_type == BLOCK_FENCED_CODE);
  68. }
  69. static void add_line(node_block* node_block, chunk *ch, int offset)
  70. {
  71. assert(node_block->open);
  72. strbuf_put(&node_block->string_content, ch->data + offset, ch->len - offset);
  73. }
  74. static void remove_trailing_blank_lines(strbuf *ln)
  75. {
  76. int i;
  77. for (i = ln->size - 1; i >= 0; --i) {
  78. char c = ln->ptr[i];
  79. if (c != ' ' && c != '\t' && c != '\r' && c != '\n')
  80. break;
  81. }
  82. if (i < 0) {
  83. strbuf_clear(ln);
  84. return;
  85. }
  86. i = strbuf_strchr(ln, '\n', i);
  87. if (i >= 0)
  88. strbuf_truncate(ln, i);
  89. }
  90. // Check to see if a node_block ends with a blank line, descending
  91. // if needed into lists and sublists.
  92. static bool ends_with_blank_line(node_block* node_block)
  93. {
  94. if (node_block->last_line_blank) {
  95. return true;
  96. }
  97. if ((node_block->tag == BLOCK_LIST || node_block->tag == BLOCK_LIST_ITEM) && node_block->last_child) {
  98. return ends_with_blank_line(node_block->last_child);
  99. } else {
  100. return false;
  101. }
  102. }
  103. // Break out of all containing lists
  104. static int break_out_of_lists(node_block ** bptr, int line_number)
  105. {
  106. node_block *container = *bptr;
  107. node_block *b = container->top;
  108. // find first containing BLOCK_LIST:
  109. while (b && b->tag != BLOCK_LIST) {
  110. b = b->last_child;
  111. }
  112. if (b) {
  113. while (container && container != b) {
  114. finalize(container, line_number);
  115. container = container->parent;
  116. }
  117. finalize(b, line_number);
  118. *bptr = b->parent;
  119. }
  120. return 0;
  121. }
  122. static void finalize(node_block* b, int line_number)
  123. {
  124. int firstlinelen;
  125. int pos;
  126. node_block* item;
  127. node_block* subitem;
  128. if (!b->open)
  129. return; // don't do anything if the node_block is already closed
  130. b->open = false;
  131. if (line_number > b->start_line) {
  132. b->end_line = line_number - 1;
  133. } else {
  134. b->end_line = line_number;
  135. }
  136. switch (b->tag) {
  137. case BLOCK_PARAGRAPH:
  138. pos = 0;
  139. while (strbuf_at(&b->string_content, 0) == '[' &&
  140. (pos = parse_reference(&b->string_content, b->top->as.document.refmap))) {
  141. strbuf_drop(&b->string_content, pos);
  142. }
  143. if (is_blank(&b->string_content, 0)) {
  144. b->tag = BLOCK_REFERENCE_DEF;
  145. }
  146. break;
  147. case BLOCK_INDENTED_CODE:
  148. remove_trailing_blank_lines(&b->string_content);
  149. strbuf_putc(&b->string_content, '\n');
  150. break;
  151. case BLOCK_FENCED_CODE:
  152. // first line of contents becomes info
  153. firstlinelen = strbuf_strchr(&b->string_content, '\n', 0);
  154. strbuf_init(&b->as.code.info, 0);
  155. houdini_unescape_html_f(
  156. &b->as.code.info,
  157. b->string_content.ptr,
  158. firstlinelen
  159. );
  160. strbuf_drop(&b->string_content, firstlinelen + 1);
  161. strbuf_trim(&b->as.code.info);
  162. unescape_buffer(&b->as.code.info);
  163. break;
  164. case BLOCK_LIST: // determine tight/loose status
  165. b->as.list.tight = true; // tight by default
  166. item = b->children;
  167. while (item) {
  168. // check for non-final non-empty list item ending with blank line:
  169. if (item->last_line_blank && item->next) {
  170. b->as.list.tight = false;
  171. break;
  172. }
  173. // recurse into children of list item, to see if there are
  174. // spaces between them:
  175. subitem = item->children;
  176. while (subitem) {
  177. if (ends_with_blank_line(subitem) &&
  178. (item->next || subitem->next)) {
  179. b->as.list.tight = false;
  180. break;
  181. }
  182. subitem = subitem->next;
  183. }
  184. if (!(b->as.list.tight)) {
  185. break;
  186. }
  187. item = item->next;
  188. }
  189. break;
  190. default:
  191. break;
  192. }
  193. }
  194. // Add a node_block as child of another. Return pointer to child.
  195. extern node_block* add_child(node_block* parent,
  196. int block_type, int start_line, int start_column)
  197. {
  198. assert(parent);
  199. // if 'parent' isn't the kind of node_block that can accept this child,
  200. // then back up til we hit a node_block that can.
  201. while (!can_contain(parent->tag, block_type)) {
  202. finalize(parent, start_line);
  203. parent = parent->parent;
  204. }
  205. node_block* child = make_block(block_type, start_line, start_column);
  206. child->parent = parent;
  207. child->top = parent->top;
  208. if (parent->last_child) {
  209. parent->last_child->next = child;
  210. child->prev = parent->last_child;
  211. } else {
  212. parent->children = child;
  213. child->prev = NULL;
  214. }
  215. parent->last_child = child;
  216. return child;
  217. }
  218. // Free a node_block list and any children.
  219. extern void free_blocks(node_block* e)
  220. {
  221. node_block * next;
  222. while (e != NULL) {
  223. next = e->next;
  224. free_inlines(e->inline_content);
  225. strbuf_free(&e->string_content);
  226. if (e->tag == BLOCK_FENCED_CODE) {
  227. strbuf_free(&e->as.code.info);
  228. } else if (e->tag == BLOCK_DOCUMENT) {
  229. free_reference_map(e->as.document.refmap);
  230. }
  231. free_blocks(e->children);
  232. free(e);
  233. e = next;
  234. }
  235. }
  236. // Walk through node_block and all children, recursively, parsing
  237. // string content into inline content where appropriate.
  238. void process_inlines(node_block* cur, reference** refmap)
  239. {
  240. switch (cur->tag) {
  241. case BLOCK_PARAGRAPH:
  242. case BLOCK_ATX_HEADER:
  243. case BLOCK_SETEXT_HEADER:
  244. cur->inline_content = parse_inlines(&cur->string_content, refmap);
  245. // MEM
  246. // strbuf_free(&cur->string_content);
  247. break;
  248. default:
  249. break;
  250. }
  251. node_block *child = cur->children;
  252. while (child != NULL) {
  253. process_inlines(child, refmap);
  254. child = child->next;
  255. }
  256. }
  257. // Attempts to parse a list item marker (bullet or enumerated).
  258. // On success, returns length of the marker, and populates
  259. // data with the details. On failure, returns 0.
  260. static int parse_list_marker(chunk *input, int pos, struct ListData ** dataptr)
  261. {
  262. unsigned char c;
  263. int startpos;
  264. struct ListData * data;
  265. startpos = pos;
  266. c = peek_at(input, pos);
  267. if ((c == '*' || c == '-' || c == '+') && !scan_hrule(input, pos)) {
  268. pos++;
  269. if (!isspace(peek_at(input, pos))) {
  270. return 0;
  271. }
  272. data = malloc(sizeof(struct ListData));
  273. data->marker_offset = 0; // will be adjusted later
  274. data->list_type = bullet;
  275. data->bullet_char = c;
  276. data->start = 1;
  277. data->delimiter = period;
  278. data->tight = false;
  279. } else if (isdigit(c)) {
  280. int start = 0;
  281. do {
  282. start = (10 * start) + (peek_at(input, pos) - '0');
  283. pos++;
  284. } while (isdigit(peek_at(input, pos)));
  285. c = peek_at(input, pos);
  286. if (c == '.' || c == ')') {
  287. pos++;
  288. if (!isspace(peek_at(input, pos))) {
  289. return 0;
  290. }
  291. data = malloc(sizeof(struct ListData));
  292. data->marker_offset = 0; // will be adjusted later
  293. data->list_type = ordered;
  294. data->bullet_char = 0;
  295. data->start = start;
  296. data->delimiter = (c == '.' ? period : parens);
  297. data->tight = false;
  298. } else {
  299. return 0;
  300. }
  301. } else {
  302. return 0;
  303. }
  304. *dataptr = data;
  305. return (pos - startpos);
  306. }
  307. // Return 1 if list item belongs in list, else 0.
  308. static int lists_match(struct ListData *list_data, struct ListData *item_data)
  309. {
  310. return (list_data->list_type == item_data->list_type &&
  311. list_data->delimiter == item_data->delimiter &&
  312. // list_data->marker_offset == item_data.marker_offset &&
  313. list_data->bullet_char == item_data->bullet_char);
  314. }
  315. static node_block *finalize_document(node_block *document, int linenum)
  316. {
  317. while (document != document->top) {
  318. finalize(document, linenum);
  319. document = document->parent;
  320. }
  321. finalize(document, linenum);
  322. process_inlines(document, document->as.document.refmap);
  323. return document;
  324. }
  325. extern node_block *stmd_parse_file(FILE *f)
  326. {
  327. strbuf line = GH_BUF_INIT;
  328. unsigned char buffer[4096];
  329. int linenum = 1;
  330. node_block *document = make_document();
  331. while (fgets((char *)buffer, sizeof(buffer), f)) {
  332. utf8proc_detab(&line, buffer, strlen((char *)buffer));
  333. incorporate_line(&line, linenum, &document);
  334. strbuf_clear(&line);
  335. linenum++;
  336. }
  337. strbuf_free(&line);
  338. return finalize_document(document, linenum);
  339. }
  340. extern node_block *stmd_parse_document(const unsigned char *buffer, size_t len)
  341. {
  342. strbuf line = GH_BUF_INIT;
  343. int linenum = 1;
  344. const unsigned char *end = buffer + len;
  345. node_block *document = make_document();
  346. while (buffer < end) {
  347. const unsigned char *eol = memchr(buffer, '\n', end - buffer);
  348. if (!eol) {
  349. utf8proc_detab(&line, buffer, end - buffer);
  350. buffer = end;
  351. } else {
  352. utf8proc_detab(&line, buffer, (eol - buffer) + 1);
  353. buffer += (eol - buffer) + 1;
  354. }
  355. incorporate_line(&line, linenum, &document);
  356. strbuf_clear(&line);
  357. linenum++;
  358. }
  359. strbuf_free(&line);
  360. return finalize_document(document, linenum);
  361. }
  362. static void chop_trailing_hashtags(chunk *ch)
  363. {
  364. int n;
  365. chunk_rtrim(ch);
  366. n = ch->len - 1;
  367. // if string ends in #s, remove these:
  368. while (n >= 0 && peek_at(ch, n) == '#')
  369. n--;
  370. // the last # was escaped, so we include it.
  371. if (n >= 0 && peek_at(ch, n) == '\\')
  372. n++;
  373. ch->len = n + 1;
  374. }
  375. // Process one line at a time, modifying a node_block.
  376. static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
  377. {
  378. node_block* last_matched_container;
  379. int offset = 0;
  380. int matched = 0;
  381. int lev = 0;
  382. int i;
  383. struct ListData * data = NULL;
  384. bool all_matched = true;
  385. node_block* container;
  386. node_block* cur = *curptr;
  387. bool blank = false;
  388. int first_nonspace;
  389. int indent;
  390. chunk input;
  391. input.data = line->ptr;
  392. input.len = line->size;
  393. // container starts at the document root.
  394. container = cur->top;
  395. // for each containing node_block, try to parse the associated line start.
  396. // bail out on failure: container will point to the last matching node_block.
  397. while (container->last_child && container->last_child->open) {
  398. container = container->last_child;
  399. first_nonspace = offset;
  400. while (peek_at(&input, first_nonspace) == ' ') {
  401. first_nonspace++;
  402. }
  403. indent = first_nonspace - offset;
  404. blank = peek_at(&input, first_nonspace) == '\n';
  405. if (container->tag == BLOCK_BQUOTE) {
  406. matched = indent <= 3 && peek_at(&input, first_nonspace) == '>';
  407. if (matched) {
  408. offset = first_nonspace + 1;
  409. if (peek_at(&input, offset) == ' ')
  410. offset++;
  411. } else {
  412. all_matched = false;
  413. }
  414. } else if (container->tag == BLOCK_LIST_ITEM) {
  415. if (indent >= container->as.list.marker_offset +
  416. container->as.list.padding) {
  417. offset += container->as.list.marker_offset +
  418. container->as.list.padding;
  419. } else if (blank) {
  420. offset = first_nonspace;
  421. } else {
  422. all_matched = false;
  423. }
  424. } else if (container->tag == BLOCK_INDENTED_CODE) {
  425. if (indent >= CODE_INDENT) {
  426. offset += CODE_INDENT;
  427. } else if (blank) {
  428. offset = first_nonspace;
  429. } else {
  430. all_matched = false;
  431. }
  432. } else if (container->tag == BLOCK_ATX_HEADER ||
  433. container->tag == BLOCK_SETEXT_HEADER) {
  434. // a header can never contain more than one line
  435. all_matched = false;
  436. } else if (container->tag == BLOCK_FENCED_CODE) {
  437. // skip optional spaces of fence offset
  438. i = container->as.code.fence_offset;
  439. while (i > 0 && peek_at(&input, offset) == ' ') {
  440. offset++;
  441. i--;
  442. }
  443. } else if (container->tag == BLOCK_HTML) {
  444. if (blank) {
  445. all_matched = false;
  446. }
  447. } else if (container->tag == BLOCK_PARAGRAPH) {
  448. if (blank) {
  449. container->last_line_blank = true;
  450. all_matched = false;
  451. }
  452. }
  453. if (!all_matched) {
  454. container = container->parent; // back up to last matching node_block
  455. break;
  456. }
  457. }
  458. last_matched_container = container;
  459. // check to see if we've hit 2nd blank line, break out of list:
  460. if (blank && container->last_line_blank) {
  461. break_out_of_lists(&container, line_number);
  462. }
  463. // unless last matched container is code node_block, try new container starts:
  464. while (container->tag != BLOCK_FENCED_CODE && container->tag != BLOCK_INDENTED_CODE &&
  465. container->tag != BLOCK_HTML) {
  466. first_nonspace = offset;
  467. while (peek_at(&input, first_nonspace) == ' ')
  468. first_nonspace++;
  469. indent = first_nonspace - offset;
  470. blank = peek_at(&input, first_nonspace) == '\n';
  471. if (indent >= CODE_INDENT) {
  472. if (cur->tag != BLOCK_PARAGRAPH && !blank) {
  473. offset += CODE_INDENT;
  474. container = add_child(container, BLOCK_INDENTED_CODE, line_number, offset + 1);
  475. } else { // indent > 4 in lazy line
  476. break;
  477. }
  478. } else if (peek_at(&input, first_nonspace) == '>') {
  479. offset = first_nonspace + 1;
  480. // optional following character
  481. if (peek_at(&input, offset) == ' ')
  482. offset++;
  483. container = add_child(container, BLOCK_BQUOTE, line_number, offset + 1);
  484. } else if ((matched = scan_atx_header_start(&input, first_nonspace))) {
  485. offset = first_nonspace + matched;
  486. container = add_child(container, BLOCK_ATX_HEADER, line_number, offset + 1);
  487. int hashpos = chunk_strchr(&input, '#', first_nonspace);
  488. int level = 0;
  489. while (peek_at(&input, hashpos) == '#') {
  490. level++;
  491. hashpos++;
  492. }
  493. container->as.header.level = level;
  494. } else if ((matched = scan_open_code_fence(&input, first_nonspace))) {
  495. container = add_child(container, BLOCK_FENCED_CODE, line_number, first_nonspace + 1);
  496. container->as.code.fence_char = peek_at(&input, first_nonspace);
  497. container->as.code.fence_length = matched;
  498. container->as.code.fence_offset = first_nonspace - offset;
  499. offset = first_nonspace + matched;
  500. } else if ((matched = scan_html_block_tag(&input, first_nonspace))) {
  501. container = add_child(container, BLOCK_HTML, line_number, first_nonspace + 1);
  502. // note, we don't adjust offset because the tag is part of the text
  503. } else if (container->tag == BLOCK_PARAGRAPH &&
  504. (lev = scan_setext_header_line(&input, first_nonspace)) &&
  505. // check that there is only one line in the paragraph:
  506. strbuf_strrchr(&container->string_content, '\n',
  507. strbuf_len(&container->string_content) - 2) < 0) {
  508. container->tag = BLOCK_SETEXT_HEADER;
  509. container->as.header.level = lev;
  510. offset = input.len - 1;
  511. } else if (!(container->tag == BLOCK_PARAGRAPH && !all_matched) &&
  512. (matched = scan_hrule(&input, first_nonspace))) {
  513. // it's only now that we know the line is not part of a setext header:
  514. container = add_child(container, BLOCK_HRULE, line_number, first_nonspace + 1);
  515. finalize(container, line_number);
  516. container = container->parent;
  517. offset = input.len - 1;
  518. } else if ((matched = parse_list_marker(&input, first_nonspace, &data))) {
  519. // compute padding:
  520. offset = first_nonspace + matched;
  521. i = 0;
  522. while (i <= 5 && peek_at(&input, offset + i) == ' ') {
  523. i++;
  524. }
  525. // i = number of spaces after marker, up to 5
  526. if (i >= 5 || i < 1 || peek_at(&input, offset) == '\n') {
  527. data->padding = matched + 1;
  528. if (i > 0) {
  529. offset += 1;
  530. }
  531. } else {
  532. data->padding = matched + i;
  533. offset += i;
  534. }
  535. // check container; if it's a list, see if this list item
  536. // can continue the list; otherwise, create a list container.
  537. data->marker_offset = indent;
  538. if (container->tag != BLOCK_LIST ||
  539. !lists_match(&container->as.list, data)) {
  540. container = add_child(container, BLOCK_LIST, line_number,
  541. first_nonspace + 1);
  542. memcpy(&container->as.list, data, sizeof(*data));
  543. }
  544. // add the list item
  545. container = add_child(container, BLOCK_LIST_ITEM, line_number,
  546. first_nonspace + 1);
  547. /* TODO: static */
  548. memcpy(&container->as.list, data, sizeof(*data));
  549. free(data);
  550. } else {
  551. break;
  552. }
  553. if (accepts_lines(container->tag)) {
  554. // if it's a line container, it can't contain other containers
  555. break;
  556. }
  557. }
  558. // what remains at offset is a text line. add the text to the
  559. // appropriate container.
  560. first_nonspace = offset;
  561. while (peek_at(&input, first_nonspace) == ' ')
  562. first_nonspace++;
  563. indent = first_nonspace - offset;
  564. blank = peek_at(&input, first_nonspace) == '\n';
  565. // node_block quote lines are never blank as they start with >
  566. // and we don't count blanks in fenced code for purposes of tight/loose
  567. // lists or breaking out of lists. we also don't set last_line_blank
  568. // on an empty list item.
  569. container->last_line_blank = (blank &&
  570. container->tag != BLOCK_BQUOTE &&
  571. container->tag != BLOCK_FENCED_CODE &&
  572. !(container->tag == BLOCK_LIST_ITEM &&
  573. container->children == NULL &&
  574. container->start_line == line_number));
  575. node_block *cont = container;
  576. while (cont->parent) {
  577. cont->parent->last_line_blank = false;
  578. cont = cont->parent;
  579. }
  580. if (cur != last_matched_container &&
  581. container == last_matched_container &&
  582. !blank &&
  583. cur->tag == BLOCK_PARAGRAPH &&
  584. strbuf_len(&cur->string_content) > 0) {
  585. add_line(cur, &input, offset);
  586. } else { // not a lazy continuation
  587. // finalize any blocks that were not matched and set cur to container:
  588. while (cur != last_matched_container) {
  589. finalize(cur, line_number);
  590. cur = cur->parent;
  591. assert(cur != NULL);
  592. }
  593. if (container->tag == BLOCK_INDENTED_CODE) {
  594. add_line(container, &input, offset);
  595. } else if (container->tag == BLOCK_FENCED_CODE) {
  596. matched = 0;
  597. if (indent <= 3 &&
  598. peek_at(&input, first_nonspace) == container->as.code.fence_char) {
  599. int fence_len = scan_close_code_fence(&input, first_nonspace);
  600. if (fence_len > container->as.code.fence_length)
  601. matched = 1;
  602. }
  603. if (matched) {
  604. // if closing fence, don't add line to container; instead, close it:
  605. finalize(container, line_number);
  606. container = container->parent; // back up to parent
  607. } else {
  608. add_line(container, &input, offset);
  609. }
  610. } else if (container->tag == BLOCK_HTML) {
  611. add_line(container, &input, offset);
  612. } else if (blank) {
  613. // ??? do nothing
  614. } else if (container->tag == BLOCK_ATX_HEADER) {
  615. chop_trailing_hashtags(&input);
  616. add_line(container, &input, first_nonspace);
  617. finalize(container, line_number);
  618. container = container->parent;
  619. } else if (accepts_lines(container->tag)) {
  620. add_line(container, &input, first_nonspace);
  621. } else if (container->tag != BLOCK_HRULE && container->tag != BLOCK_SETEXT_HEADER) {
  622. // create paragraph container for line
  623. container = add_child(container, BLOCK_PARAGRAPH, line_number, first_nonspace + 1);
  624. add_line(container, &input, first_nonspace);
  625. } else {
  626. assert(false);
  627. }
  628. *curptr = container;
  629. }
  630. }