aboutsummaryrefslogtreecommitdiff
path: root/src/blocks.c
blob: 8c7d49c1ed3da01bf722e6b21c3d95ca1d34b345 (plain)
  1. #include <stdlib.h>
  2. #include <assert.h>
  3. #include <stdio.h>
  4. #include <stdbool.h>
  5. #include <ctype.h>
  6. #include "stmd.h"
  7. #include "utf8.h"
  8. #include "html/houdini.h"
  9. #include "scanners.h"
  10. #include "uthash.h"
  11. #define peek_at(i, n) (i)->data[n]
  12. static void incorporate_line(strbuf *ln, int line_number, node_block** curptr);
  13. static void finalize(node_block* b, int line_number);
  14. static node_block* make_block(int tag, int start_line, int start_column)
  15. {
  16. node_block* e;
  17. e = (node_block*) malloc(sizeof(node_block));
  18. e->tag = tag;
  19. e->open = true;
  20. e->last_line_blank = false;
  21. e->start_line = start_line;
  22. e->start_column = start_column;
  23. e->end_line = start_line;
  24. e->children = NULL;
  25. e->last_child = NULL;
  26. e->parent = NULL;
  27. e->top = NULL;
  28. e->attributes.refmap = NULL;
  29. strbuf_init(&e->string_content, 32);
  30. e->inline_content = NULL;
  31. e->next = NULL;
  32. e->prev = NULL;
  33. return e;
  34. }
  35. // Create a root document node_block.
  36. extern node_block* make_document()
  37. {
  38. node_block * e = make_block(BLOCK_DOCUMENT, 1, 1);
  39. reference * map = NULL;
  40. reference ** refmap;
  41. refmap = (reference**) malloc(sizeof(reference*));
  42. *refmap = map;
  43. e->attributes.refmap = refmap;
  44. e->top = e;
  45. return e;
  46. }
  47. // Returns true if line has only space characters, else false.
  48. bool is_blank(strbuf *s, int offset)
  49. {
  50. while (offset < s->size) {
  51. switch (s->ptr[offset]) {
  52. case '\n':
  53. return true;
  54. case ' ':
  55. offset++;
  56. break;
  57. default:
  58. return false;
  59. }
  60. }
  61. return true;
  62. }
  63. static inline bool can_contain(int parent_type, int child_type)
  64. {
  65. return ( parent_type == BLOCK_DOCUMENT ||
  66. parent_type == BLOCK_BQUOTE ||
  67. parent_type == BLOCK_LIST_ITEM ||
  68. (parent_type == BLOCK_LIST && child_type == BLOCK_LIST_ITEM) );
  69. }
  70. static inline bool accepts_lines(int block_type)
  71. {
  72. return (block_type == BLOCK_PARAGRAPH ||
  73. block_type == BLOCK_ATX_HEADER ||
  74. block_type == BLOCK_INDENTED_CODE ||
  75. block_type == BLOCK_FENCED_CODE);
  76. }
  77. static void add_line(node_block* node_block, chunk *ch, int offset)
  78. {
  79. assert(node_block->open);
  80. strbuf_put(&node_block->string_content, ch->data + offset, ch->len - offset);
  81. }
  82. static void remove_trailing_blank_lines(strbuf *ln)
  83. {
  84. int i;
  85. for (i = ln->size - 1; i >= 0; --i) {
  86. char c = ln->ptr[i];
  87. if (c != ' ' && c != '\t' && c != '\r' && c != '\n')
  88. break;
  89. }
  90. if (i < 0) {
  91. strbuf_clear(ln);
  92. return;
  93. }
  94. i = strbuf_strchr(ln, '\n', i);
  95. if (i >= 0)
  96. strbuf_truncate(ln, i);
  97. }
  98. // Check to see if a node_block ends with a blank line, descending
  99. // if needed into lists and sublists.
  100. static bool ends_with_blank_line(node_block* node_block)
  101. {
  102. if (node_block->last_line_blank) {
  103. return true;
  104. }
  105. if ((node_block->tag == BLOCK_LIST || node_block->tag == BLOCK_LIST_ITEM) && node_block->last_child) {
  106. return ends_with_blank_line(node_block->last_child);
  107. } else {
  108. return false;
  109. }
  110. }
  111. // Break out of all containing lists
  112. static int break_out_of_lists(node_block ** bptr, int line_number)
  113. {
  114. node_block * container = *bptr;
  115. node_block * b = container->top;
  116. // find first containing BLOCK_LIST:
  117. while (b && b->tag != BLOCK_LIST) {
  118. b = b->last_child;
  119. }
  120. if (b) {
  121. while (container && container != b) {
  122. finalize(container, line_number);
  123. container = container->parent;
  124. }
  125. finalize(b, line_number);
  126. *bptr = b->parent;
  127. }
  128. return 0;
  129. }
  130. static void finalize(node_block* b, int line_number)
  131. {
  132. int firstlinelen;
  133. int pos;
  134. node_block* item;
  135. node_block* subitem;
  136. if (!b->open)
  137. return; // don't do anything if the node_block is already closed
  138. b->open = false;
  139. if (line_number > b->start_line) {
  140. b->end_line = line_number - 1;
  141. } else {
  142. b->end_line = line_number;
  143. }
  144. switch (b->tag) {
  145. case BLOCK_PARAGRAPH:
  146. pos = 0;
  147. while (strbuf_at(&b->string_content, 0) == '[' &&
  148. (pos = parse_reference(&b->string_content, b->top->attributes.refmap))) {
  149. strbuf_drop(&b->string_content, pos);
  150. }
  151. if (is_blank(&b->string_content, 0)) {
  152. b->tag = BLOCK_REFERENCE_DEF;
  153. }
  154. break;
  155. case BLOCK_INDENTED_CODE:
  156. remove_trailing_blank_lines(&b->string_content);
  157. strbuf_putc(&b->string_content, '\n');
  158. break;
  159. case BLOCK_FENCED_CODE:
  160. // first line of contents becomes info
  161. firstlinelen = strbuf_strchr(&b->string_content, '\n', 0);
  162. strbuf_init(&b->attributes.fenced_code_data.info, 0);
  163. houdini_unescape_html_f(
  164. &b->attributes.fenced_code_data.info,
  165. b->string_content.ptr,
  166. firstlinelen
  167. );
  168. strbuf_drop(&b->string_content, firstlinelen + 1);
  169. strbuf_trim(&b->attributes.fenced_code_data.info);
  170. unescape_buffer(&b->attributes.fenced_code_data.info);
  171. break;
  172. case BLOCK_LIST: // determine tight/loose status
  173. b->attributes.list_data.tight = true; // tight by default
  174. item = b->children;
  175. while (item) {
  176. // check for non-final non-empty list item ending with blank line:
  177. if (item->last_line_blank && item->next) {
  178. b->attributes.list_data.tight = false;
  179. break;
  180. }
  181. // recurse into children of list item, to see if there are
  182. // spaces between them:
  183. subitem = item->children;
  184. while (subitem) {
  185. if (ends_with_blank_line(subitem) &&
  186. (item->next || subitem->next)) {
  187. b->attributes.list_data.tight = false;
  188. break;
  189. }
  190. subitem = subitem->next;
  191. }
  192. if (!(b->attributes.list_data.tight)) {
  193. break;
  194. }
  195. item = item->next;
  196. }
  197. break;
  198. default:
  199. break;
  200. }
  201. }
  202. // Add a node_block as child of another. Return pointer to child.
  203. extern node_block* add_child(node_block* parent,
  204. int block_type, int start_line, int start_column)
  205. {
  206. assert(parent);
  207. // if 'parent' isn't the kind of node_block that can accept this child,
  208. // then back up til we hit a node_block that can.
  209. while (!can_contain(parent->tag, block_type)) {
  210. finalize(parent, start_line);
  211. parent = parent->parent;
  212. }
  213. node_block* child = make_block(block_type, start_line, start_column);
  214. child->parent = parent;
  215. child->top = parent->top;
  216. if (parent->last_child) {
  217. parent->last_child->next = child;
  218. child->prev = parent->last_child;
  219. } else {
  220. parent->children = child;
  221. child->prev = NULL;
  222. }
  223. parent->last_child = child;
  224. return child;
  225. }
  226. // Free a node_block list and any children.
  227. extern void free_blocks(node_block* e)
  228. {
  229. node_block * next;
  230. while (e != NULL) {
  231. next = e->next;
  232. free_inlines(e->inline_content);
  233. strbuf_free(&e->string_content);
  234. if (e->tag == BLOCK_FENCED_CODE) {
  235. strbuf_free(&e->attributes.fenced_code_data.info);
  236. } else if (e->tag == BLOCK_DOCUMENT) {
  237. free_reference_map(e->attributes.refmap);
  238. }
  239. free_blocks(e->children);
  240. free(e);
  241. e = next;
  242. }
  243. }
  244. // Walk through node_block and all children, recursively, parsing
  245. // string content into inline content where appropriate.
  246. void process_inlines(node_block* cur, reference** refmap)
  247. {
  248. switch (cur->tag) {
  249. case BLOCK_PARAGRAPH:
  250. case BLOCK_ATX_HEADER:
  251. case BLOCK_SETEXT_HEADER:
  252. cur->inline_content = parse_inlines(&cur->string_content, refmap);
  253. // MEM
  254. // strbuf_free(&cur->string_content);
  255. break;
  256. default:
  257. break;
  258. }
  259. node_block *child = cur->children;
  260. while (child != NULL) {
  261. process_inlines(child, refmap);
  262. child = child->next;
  263. }
  264. }
  265. // Attempts to parse a list item marker (bullet or enumerated).
  266. // On success, returns length of the marker, and populates
  267. // data with the details. On failure, returns 0.
  268. static int parse_list_marker(chunk *input, int pos, struct ListData ** dataptr)
  269. {
  270. unsigned char c;
  271. int startpos;
  272. struct ListData * data;
  273. startpos = pos;
  274. c = peek_at(input, pos);
  275. if ((c == '*' || c == '-' || c == '+') && !scan_hrule(input, pos)) {
  276. pos++;
  277. if (!isspace(peek_at(input, pos))) {
  278. return 0;
  279. }
  280. data = malloc(sizeof(struct ListData));
  281. data->marker_offset = 0; // will be adjusted later
  282. data->list_type = bullet;
  283. data->bullet_char = c;
  284. data->start = 1;
  285. data->delimiter = period;
  286. data->tight = false;
  287. } else if (isdigit(c)) {
  288. int start = 0;
  289. do {
  290. start = (10 * start) + (peek_at(input, pos) - '0');
  291. pos++;
  292. } while (isdigit(peek_at(input, pos)));
  293. c = peek_at(input, pos);
  294. if (c == '.' || c == ')') {
  295. pos++;
  296. if (!isspace(peek_at(input, pos))) {
  297. return 0;
  298. }
  299. data = malloc(sizeof(struct ListData));
  300. data->marker_offset = 0; // will be adjusted later
  301. data->list_type = ordered;
  302. data->bullet_char = 0;
  303. data->start = start;
  304. data->delimiter = (c == '.' ? period : parens);
  305. data->tight = false;
  306. } else {
  307. return 0;
  308. }
  309. } else {
  310. return 0;
  311. }
  312. *dataptr = data;
  313. return (pos - startpos);
  314. }
  315. // Return 1 if list item belongs in list, else 0.
  316. static int lists_match(struct ListData list_data,
  317. struct ListData item_data)
  318. {
  319. return (list_data.list_type == item_data.list_type &&
  320. list_data.delimiter == item_data.delimiter &&
  321. // list_data.marker_offset == item_data.marker_offset &&
  322. list_data.bullet_char == item_data.bullet_char);
  323. }
  324. static node_block *finalize_document(node_block *document, int linenum)
  325. {
  326. while (document != document->top) {
  327. finalize(document, linenum);
  328. document = document->parent;
  329. }
  330. finalize(document, linenum);
  331. process_inlines(document, document->attributes.refmap);
  332. return document;
  333. }
  334. extern node_block *stmd_parse_file(FILE *f)
  335. {
  336. strbuf line = GH_BUF_INIT;
  337. unsigned char buffer[4096];
  338. int linenum = 1;
  339. node_block *document = make_document();
  340. while (fgets((char *)buffer, sizeof(buffer), f)) {
  341. utf8proc_detab(&line, buffer, strlen((char *)buffer));
  342. incorporate_line(&line, linenum, &document);
  343. strbuf_clear(&line);
  344. linenum++;
  345. }
  346. strbuf_free(&line);
  347. return finalize_document(document, linenum);
  348. }
  349. extern node_block *stmd_parse_document(const unsigned char *buffer, size_t len)
  350. {
  351. strbuf line = GH_BUF_INIT;
  352. int linenum = 1;
  353. const unsigned char *end = buffer + len;
  354. node_block *document = make_document();
  355. while (buffer < end) {
  356. const unsigned char *eol = memchr(buffer, '\n', end - buffer);
  357. if (!eol) {
  358. utf8proc_detab(&line, buffer, end - buffer);
  359. buffer = end;
  360. } else {
  361. utf8proc_detab(&line, buffer, (eol - buffer) + 1);
  362. buffer += (eol - buffer) + 1;
  363. }
  364. incorporate_line(&line, linenum, &document);
  365. strbuf_clear(&line);
  366. linenum++;
  367. }
  368. strbuf_free(&line);
  369. return finalize_document(document, linenum);
  370. }
  371. static void chop_trailing_hashtags(chunk *ch)
  372. {
  373. int n;
  374. chunk_rtrim(ch);
  375. n = ch->len - 1;
  376. // if string ends in #s, remove these:
  377. while (n >= 0 && peek_at(ch, n) == '#')
  378. n--;
  379. // the last # was escaped, so we include it.
  380. if (n >= 0 && peek_at(ch, n) == '\\')
  381. n++;
  382. ch->len = n + 1;
  383. }
  384. // Process one line at a time, modifying a node_block.
  385. static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
  386. {
  387. node_block* last_matched_container;
  388. int offset = 0;
  389. int matched = 0;
  390. int lev = 0;
  391. int i;
  392. struct ListData * data = NULL;
  393. bool all_matched = true;
  394. node_block* container;
  395. node_block* cur = *curptr;
  396. bool blank = false;
  397. int first_nonspace;
  398. int indent;
  399. chunk input;
  400. input.data = line->ptr;
  401. input.len = line->size;
  402. // container starts at the document root.
  403. container = cur->top;
  404. // for each containing node_block, try to parse the associated line start.
  405. // bail out on failure: container will point to the last matching node_block.
  406. while (container->last_child && container->last_child->open) {
  407. container = container->last_child;
  408. first_nonspace = offset;
  409. while (peek_at(&input, first_nonspace) == ' ') {
  410. first_nonspace++;
  411. }
  412. indent = first_nonspace - offset;
  413. blank = peek_at(&input, first_nonspace) == '\n';
  414. if (container->tag == BLOCK_BQUOTE) {
  415. matched = indent <= 3 && peek_at(&input, first_nonspace) == '>';
  416. if (matched) {
  417. offset = first_nonspace + 1;
  418. if (peek_at(&input, offset) == ' ')
  419. offset++;
  420. } else {
  421. all_matched = false;
  422. }
  423. } else if (container->tag == BLOCK_LIST_ITEM) {
  424. if (indent >= container->attributes.list_data.marker_offset +
  425. container->attributes.list_data.padding) {
  426. offset += container->attributes.list_data.marker_offset +
  427. container->attributes.list_data.padding;
  428. } else if (blank) {
  429. offset = first_nonspace;
  430. } else {
  431. all_matched = false;
  432. }
  433. } else if (container->tag == BLOCK_INDENTED_CODE) {
  434. if (indent >= CODE_INDENT) {
  435. offset += CODE_INDENT;
  436. } else if (blank) {
  437. offset = first_nonspace;
  438. } else {
  439. all_matched = false;
  440. }
  441. } else if (container->tag == BLOCK_ATX_HEADER ||
  442. container->tag == BLOCK_SETEXT_HEADER) {
  443. // a header can never contain more than one line
  444. all_matched = false;
  445. } else if (container->tag == BLOCK_FENCED_CODE) {
  446. // skip optional spaces of fence offset
  447. i = container->attributes.fenced_code_data.fence_offset;
  448. while (i > 0 && peek_at(&input, offset) == ' ') {
  449. offset++;
  450. i--;
  451. }
  452. } else if (container->tag == BLOCK_HTML) {
  453. if (blank) {
  454. all_matched = false;
  455. }
  456. } else if (container->tag == BLOCK_PARAGRAPH) {
  457. if (blank) {
  458. container->last_line_blank = true;
  459. all_matched = false;
  460. }
  461. }
  462. if (!all_matched) {
  463. container = container->parent; // back up to last matching node_block
  464. break;
  465. }
  466. }
  467. last_matched_container = container;
  468. // check to see if we've hit 2nd blank line, break out of list:
  469. if (blank && container->last_line_blank) {
  470. break_out_of_lists(&container, line_number);
  471. }
  472. // unless last matched container is code node_block, try new container starts:
  473. while (container->tag != BLOCK_FENCED_CODE && container->tag != BLOCK_INDENTED_CODE &&
  474. container->tag != BLOCK_HTML) {
  475. first_nonspace = offset;
  476. while (peek_at(&input, first_nonspace) == ' ')
  477. first_nonspace++;
  478. indent = first_nonspace - offset;
  479. blank = peek_at(&input, first_nonspace) == '\n';
  480. if (indent >= CODE_INDENT) {
  481. if (cur->tag != BLOCK_PARAGRAPH && !blank) {
  482. offset += CODE_INDENT;
  483. container = add_child(container, BLOCK_INDENTED_CODE, line_number, offset + 1);
  484. } else { // indent > 4 in lazy line
  485. break;
  486. }
  487. } else if (peek_at(&input, first_nonspace) == '>') {
  488. offset = first_nonspace + 1;
  489. // optional following character
  490. if (peek_at(&input, offset) == ' ')
  491. offset++;
  492. container = add_child(container, BLOCK_BQUOTE, line_number, offset + 1);
  493. } else if ((matched = scan_atx_header_start(&input, first_nonspace))) {
  494. offset = first_nonspace + matched;
  495. container = add_child(container, BLOCK_ATX_HEADER, line_number, offset + 1);
  496. int hashpos = chunk_strchr(&input, '#', first_nonspace);
  497. int level = 0;
  498. while (peek_at(&input, hashpos) == '#') {
  499. level++;
  500. hashpos++;
  501. }
  502. container->attributes.header_level = level;
  503. } else if ((matched = scan_open_code_fence(&input, first_nonspace))) {
  504. container = add_child(container, BLOCK_FENCED_CODE, line_number, first_nonspace + 1);
  505. container->attributes.fenced_code_data.fence_char = peek_at(&input, first_nonspace);
  506. container->attributes.fenced_code_data.fence_length = matched;
  507. container->attributes.fenced_code_data.fence_offset = first_nonspace - offset;
  508. offset = first_nonspace + matched;
  509. } else if ((matched = scan_html_block_tag(&input, first_nonspace))) {
  510. container = add_child(container, BLOCK_HTML, line_number, first_nonspace + 1);
  511. // note, we don't adjust offset because the tag is part of the text
  512. } else if (container->tag == BLOCK_PARAGRAPH &&
  513. (lev = scan_setext_header_line(&input, first_nonspace)) &&
  514. // check that there is only one line in the paragraph:
  515. strbuf_strrchr(&container->string_content, '\n',
  516. strbuf_len(&container->string_content) - 2) < 0) {
  517. container->tag = BLOCK_SETEXT_HEADER;
  518. container->attributes.header_level = lev;
  519. offset = input.len - 1;
  520. } else if (!(container->tag == BLOCK_PARAGRAPH && !all_matched) &&
  521. (matched = scan_hrule(&input, first_nonspace))) {
  522. // it's only now that we know the line is not part of a setext header:
  523. container = add_child(container, BLOCK_HRULE, line_number, first_nonspace + 1);
  524. finalize(container, line_number);
  525. container = container->parent;
  526. offset = input.len - 1;
  527. } else if ((matched = parse_list_marker(&input, first_nonspace, &data))) {
  528. // compute padding:
  529. offset = first_nonspace + matched;
  530. i = 0;
  531. while (i <= 5 && peek_at(&input, offset + i) == ' ') {
  532. i++;
  533. }
  534. // i = number of spaces after marker, up to 5
  535. if (i >= 5 || i < 1 || peek_at(&input, offset) == '\n') {
  536. data->padding = matched + 1;
  537. if (i > 0) {
  538. offset += 1;
  539. }
  540. } else {
  541. data->padding = matched + i;
  542. offset += i;
  543. }
  544. // check container; if it's a list, see if this list item
  545. // can continue the list; otherwise, create a list container.
  546. data->marker_offset = indent;
  547. if (container->tag != BLOCK_LIST ||
  548. !lists_match(container->attributes.list_data, *data)) {
  549. container = add_child(container, BLOCK_LIST, line_number,
  550. first_nonspace + 1);
  551. container->attributes.list_data = *data;
  552. }
  553. // add the list item
  554. container = add_child(container, BLOCK_LIST_ITEM, line_number,
  555. first_nonspace + 1);
  556. /* TODO: static */
  557. container->attributes.list_data = *data;
  558. free(data);
  559. } else {
  560. break;
  561. }
  562. if (accepts_lines(container->tag)) {
  563. // if it's a line container, it can't contain other containers
  564. break;
  565. }
  566. }
  567. // what remains at offset is a text line. add the text to the
  568. // appropriate container.
  569. first_nonspace = offset;
  570. while (peek_at(&input, first_nonspace) == ' ')
  571. first_nonspace++;
  572. indent = first_nonspace - offset;
  573. blank = peek_at(&input, first_nonspace) == '\n';
  574. // node_block quote lines are never blank as they start with >
  575. // and we don't count blanks in fenced code for purposes of tight/loose
  576. // lists or breaking out of lists. we also don't set last_line_blank
  577. // on an empty list item.
  578. container->last_line_blank = (blank &&
  579. container->tag != BLOCK_BQUOTE &&
  580. container->tag != BLOCK_FENCED_CODE &&
  581. !(container->tag == BLOCK_LIST_ITEM &&
  582. container->children == NULL &&
  583. container->start_line == line_number));
  584. node_block *cont = container;
  585. while (cont->parent) {
  586. cont->parent->last_line_blank = false;
  587. cont = cont->parent;
  588. }
  589. if (cur != last_matched_container &&
  590. container == last_matched_container &&
  591. !blank &&
  592. cur->tag == BLOCK_PARAGRAPH &&
  593. strbuf_len(&cur->string_content) > 0) {
  594. add_line(cur, &input, offset);
  595. } else { // not a lazy continuation
  596. // finalize any blocks that were not matched and set cur to container:
  597. while (cur != last_matched_container) {
  598. finalize(cur, line_number);
  599. cur = cur->parent;
  600. assert(cur != NULL);
  601. }
  602. if (container->tag == BLOCK_INDENTED_CODE) {
  603. add_line(container, &input, offset);
  604. } else if (container->tag == BLOCK_FENCED_CODE) {
  605. matched = 0;
  606. if (indent <= 3 &&
  607. peek_at(&input, first_nonspace) == container->attributes.fenced_code_data.fence_char) {
  608. int fence_len = scan_close_code_fence(&input, first_nonspace);
  609. if (fence_len > container->attributes.fenced_code_data.fence_length)
  610. matched = 1;
  611. }
  612. if (matched) {
  613. // if closing fence, don't add line to container; instead, close it:
  614. finalize(container, line_number);
  615. container = container->parent; // back up to parent
  616. } else {
  617. add_line(container, &input, offset);
  618. }
  619. } else if (container->tag == BLOCK_HTML) {
  620. add_line(container, &input, offset);
  621. } else if (blank) {
  622. // ??? do nothing
  623. } else if (container->tag == BLOCK_ATX_HEADER) {
  624. chop_trailing_hashtags(&input);
  625. add_line(container, &input, first_nonspace);
  626. finalize(container, line_number);
  627. container = container->parent;
  628. } else if (accepts_lines(container->tag)) {
  629. add_line(container, &input, first_nonspace);
  630. } else if (container->tag != BLOCK_HRULE && container->tag != BLOCK_SETEXT_HEADER) {
  631. // create paragraph container for line
  632. container = add_child(container, BLOCK_PARAGRAPH, line_number, first_nonspace + 1);
  633. add_line(container, &input, first_nonspace);
  634. } else {
  635. assert(false);
  636. }
  637. *curptr = container;
  638. }
  639. }