aboutsummaryrefslogtreecommitdiff
path: root/src/blocks.c
blob: bd25d6c03b3a2c40f216bf73afa6bd4c9bf53cf8 (plain)
  1. #include <stdlib.h>
  2. #include <assert.h>
  3. #include <stdio.h>
  4. #include <stdbool.h>
  5. #include <ctype.h>
  6. #include "stmd.h"
  7. #include "scanners.h"
  8. #include "uthash.h"
  9. #define peek_at(i, n) (i)->data[n]
  10. static void incorporate_line(gh_buf *ln, int line_number, block** curptr);
  11. static void finalize(block* b, int line_number);
  12. static block* make_block(int tag, int start_line, int start_column)
  13. {
  14. block* e;
  15. e = (block*) malloc(sizeof(block));
  16. e->tag = tag;
  17. e->open = true;
  18. e->last_line_blank = false;
  19. e->start_line = start_line;
  20. e->start_column = start_column;
  21. e->end_line = start_line;
  22. e->children = NULL;
  23. e->last_child = NULL;
  24. e->parent = NULL;
  25. e->top = NULL;
  26. e->attributes.refmap = NULL;
  27. gh_buf_init(&e->string_content, 32);
  28. e->inline_content = NULL;
  29. e->next = NULL;
  30. e->prev = NULL;
  31. return e;
  32. }
  33. // Create a root document block.
  34. extern block* make_document()
  35. {
  36. block * e = make_block(document, 1, 1);
  37. reference * map = NULL;
  38. reference ** refmap;
  39. refmap = (reference**) malloc(sizeof(reference*));
  40. *refmap = map;
  41. e->attributes.refmap = refmap;
  42. e->top = e;
  43. return e;
  44. }
  45. // Returns true if line has only space characters, else false.
  46. bool is_blank(gh_buf *s, int offset)
  47. {
  48. while (offset < s->size) {
  49. switch (s->ptr[offset]) {
  50. case '\n':
  51. return true;
  52. case ' ':
  53. offset++;
  54. break;
  55. default:
  56. return false;
  57. }
  58. }
  59. return true;
  60. }
  61. static inline bool can_contain(int parent_type, int child_type)
  62. {
  63. return ( parent_type == document ||
  64. parent_type == block_quote ||
  65. parent_type == list_item ||
  66. (parent_type == list && child_type == list_item) );
  67. }
  68. static inline bool accepts_lines(int block_type)
  69. {
  70. return (block_type == paragraph ||
  71. block_type == atx_header ||
  72. block_type == indented_code ||
  73. block_type == fenced_code);
  74. }
  75. static void add_line(block* block, chunk *ch, int offset)
  76. {
  77. assert(block->open);
  78. gh_buf_put(&block->string_content, ch->data + offset, ch->len - offset);
  79. }
  80. static void remove_trailing_blank_lines(gh_buf *ln)
  81. {
  82. int i;
  83. for (i = ln->size - 1; i >= 0; --i) {
  84. char c = ln->ptr[i];
  85. if (c != ' ' && c != '\t' && c != '\r' && c != '\n')
  86. break;
  87. }
  88. if (i < 0) {
  89. gh_buf_clear(ln);
  90. return;
  91. }
  92. i = gh_buf_strchr(ln, '\n', i);
  93. if (i >= 0)
  94. gh_buf_truncate(ln, i);
  95. }
  96. // Check to see if a block ends with a blank line, descending
  97. // if needed into lists and sublists.
  98. static bool ends_with_blank_line(block* block)
  99. {
  100. if (block->last_line_blank) {
  101. return true;
  102. }
  103. if ((block->tag == list || block->tag == list_item) && block->last_child) {
  104. return ends_with_blank_line(block->last_child);
  105. } else {
  106. return false;
  107. }
  108. }
  109. // Break out of all containing lists
  110. static int break_out_of_lists(block ** bptr, int line_number)
  111. {
  112. block * container = *bptr;
  113. block * b = container->top;
  114. // find first containing list:
  115. while (b && b->tag != list) {
  116. b = b->last_child;
  117. }
  118. if (b) {
  119. while (container && container != b) {
  120. finalize(container, line_number);
  121. container = container->parent;
  122. }
  123. finalize(b, line_number);
  124. *bptr = b->parent;
  125. }
  126. return 0;
  127. }
  128. static void finalize(block* b, int line_number)
  129. {
  130. int firstlinelen;
  131. int pos;
  132. block* item;
  133. block* subitem;
  134. if (!b->open)
  135. return; // don't do anything if the block is already closed
  136. b->open = false;
  137. if (line_number > b->start_line) {
  138. b->end_line = line_number - 1;
  139. } else {
  140. b->end_line = line_number;
  141. }
  142. switch (b->tag) {
  143. case paragraph:
  144. pos = 0;
  145. while (gh_buf_at(&b->string_content, 0) == '[' &&
  146. (pos = parse_reference(&b->string_content, b->top->attributes.refmap))) {
  147. gh_buf_drop(&b->string_content, pos);
  148. }
  149. if (is_blank(&b->string_content, 0)) {
  150. b->tag = reference_def;
  151. }
  152. break;
  153. case indented_code:
  154. remove_trailing_blank_lines(&b->string_content);
  155. gh_buf_putc(&b->string_content, '\n');
  156. break;
  157. case fenced_code:
  158. // first line of contents becomes info
  159. firstlinelen = gh_buf_strchr(&b->string_content, '\n', 0);
  160. gh_buf_init(&b->attributes.fenced_code_data.info, 0);
  161. gh_buf_set(
  162. &b->attributes.fenced_code_data.info,
  163. b->string_content.ptr,
  164. firstlinelen
  165. );
  166. gh_buf_drop(&b->string_content, firstlinelen + 1);
  167. gh_buf_trim(&b->attributes.fenced_code_data.info);
  168. unescape_buffer(&b->attributes.fenced_code_data.info);
  169. break;
  170. case list: // determine tight/loose status
  171. b->attributes.list_data.tight = true; // tight by default
  172. item = b->children;
  173. while (item) {
  174. // check for non-final non-empty list item ending with blank line:
  175. if (item->last_line_blank && item->next) {
  176. b->attributes.list_data.tight = false;
  177. break;
  178. }
  179. // recurse into children of list item, to see if there are
  180. // spaces between them:
  181. subitem = item->children;
  182. while (subitem) {
  183. if (ends_with_blank_line(subitem) &&
  184. (item->next || subitem->next)) {
  185. b->attributes.list_data.tight = false;
  186. break;
  187. }
  188. subitem = subitem->next;
  189. }
  190. if (!(b->attributes.list_data.tight)) {
  191. break;
  192. }
  193. item = item->next;
  194. }
  195. break;
  196. default:
  197. break;
  198. }
  199. }
  200. // Add a block as child of another. Return pointer to child.
  201. extern block* add_child(block* parent,
  202. int block_type, int start_line, int start_column)
  203. {
  204. assert(parent);
  205. // if 'parent' isn't the kind of block that can accept this child,
  206. // then back up til we hit a block that can.
  207. while (!can_contain(parent->tag, block_type)) {
  208. finalize(parent, start_line);
  209. parent = parent->parent;
  210. }
  211. block* child = make_block(block_type, start_line, start_column);
  212. child->parent = parent;
  213. child->top = parent->top;
  214. if (parent->last_child) {
  215. parent->last_child->next = child;
  216. child->prev = parent->last_child;
  217. } else {
  218. parent->children = child;
  219. child->prev = NULL;
  220. }
  221. parent->last_child = child;
  222. return child;
  223. }
  224. // Free a block list and any children.
  225. extern void free_blocks(block* e)
  226. {
  227. block * next;
  228. while (e != NULL) {
  229. next = e->next;
  230. free_inlines(e->inline_content);
  231. gh_buf_free(&e->string_content);
  232. if (e->tag == fenced_code) {
  233. gh_buf_free(&e->attributes.fenced_code_data.info);
  234. } else if (e->tag == document) {
  235. free_reference_map(e->attributes.refmap);
  236. }
  237. free_blocks(e->children);
  238. free(e);
  239. e = next;
  240. }
  241. }
  242. // Walk through block and all children, recursively, parsing
  243. // string content into inline content where appropriate.
  244. void process_inlines(block* cur, reference** refmap)
  245. {
  246. switch (cur->tag) {
  247. case paragraph:
  248. case atx_header:
  249. case setext_header:
  250. cur->inline_content = parse_inlines(&cur->string_content, refmap);
  251. // MEM
  252. // gh_buf_free(&cur->string_content);
  253. break;
  254. default:
  255. break;
  256. }
  257. block *child = cur->children;
  258. while (child != NULL) {
  259. process_inlines(child, refmap);
  260. child = child->next;
  261. }
  262. }
  263. // Attempts to parse a list item marker (bullet or enumerated).
  264. // On success, returns length of the marker, and populates
  265. // data with the details. On failure, returns 0.
  266. static int parse_list_marker(chunk *input, int pos, struct ListData ** dataptr)
  267. {
  268. unsigned char c;
  269. int startpos;
  270. struct ListData * data;
  271. startpos = pos;
  272. c = peek_at(input, pos);
  273. if ((c == '*' || c == '-' || c == '+') && !scan_hrule(input, pos)) {
  274. pos++;
  275. if (!isspace(peek_at(input, pos))) {
  276. return 0;
  277. }
  278. data = malloc(sizeof(struct ListData));
  279. data->marker_offset = 0; // will be adjusted later
  280. data->list_type = bullet;
  281. data->bullet_char = c;
  282. data->start = 1;
  283. data->delimiter = period;
  284. data->tight = false;
  285. } else if (isdigit(c)) {
  286. int start = 0;
  287. do {
  288. start = (10 * start) + (peek_at(input, pos) - '0');
  289. pos++;
  290. } while (isdigit(peek_at(input, pos)));
  291. c = peek_at(input, pos);
  292. if (c == '.' || c == ')') {
  293. pos++;
  294. if (!isspace(peek_at(input, pos))) {
  295. return 0;
  296. }
  297. data = malloc(sizeof(struct ListData));
  298. data->marker_offset = 0; // will be adjusted later
  299. data->list_type = ordered;
  300. data->bullet_char = 0;
  301. data->start = start;
  302. data->delimiter = (c == '.' ? period : parens);
  303. data->tight = false;
  304. } else {
  305. return 0;
  306. }
  307. } else {
  308. return 0;
  309. }
  310. *dataptr = data;
  311. return (pos - startpos);
  312. }
  313. // Return 1 if list item belongs in list, else 0.
  314. static int lists_match(struct ListData list_data,
  315. struct ListData item_data)
  316. {
  317. return (list_data.list_type == item_data.list_type &&
  318. list_data.delimiter == item_data.delimiter &&
  319. // list_data.marker_offset == item_data.marker_offset &&
  320. list_data.bullet_char == item_data.bullet_char);
  321. }
  322. static void expand_tabs(gh_buf *ob, const unsigned char *line, size_t size)
  323. {
  324. size_t i = 0, tab = 0;
  325. while (i < size) {
  326. size_t org = i;
  327. while (i < size && line[i] != '\t') {
  328. i++; tab++;
  329. }
  330. if (i > org)
  331. gh_buf_put(ob, line + org, i - org);
  332. if (i >= size)
  333. break;
  334. do {
  335. gh_buf_putc(ob, ' '); tab++;
  336. } while (tab % 4);
  337. i++;
  338. }
  339. }
  340. static block *finalize_document(block *document, int linenum)
  341. {
  342. while (document != document->top) {
  343. finalize(document, linenum);
  344. document = document->parent;
  345. }
  346. finalize(document, linenum);
  347. process_inlines(document, document->attributes.refmap);
  348. return document;
  349. }
  350. extern block *stmd_parse_file(FILE *f)
  351. {
  352. gh_buf line = GH_BUF_INIT;
  353. unsigned char buffer[4096];
  354. int linenum = 1;
  355. block *document = make_document();
  356. while (fgets((char *)buffer, sizeof(buffer), f)) {
  357. expand_tabs(&line, buffer, strlen((char *)buffer));
  358. incorporate_line(&line, linenum, &document);
  359. gh_buf_clear(&line);
  360. linenum++;
  361. }
  362. gh_buf_free(&line);
  363. return finalize_document(document, linenum);
  364. }
  365. extern block *stmd_parse_document(const unsigned char *buffer, size_t len)
  366. {
  367. gh_buf line = GH_BUF_INIT;
  368. int linenum = 1;
  369. const unsigned char *end = buffer + len;
  370. block *document = make_document();
  371. while (buffer < end) {
  372. const unsigned char *eol = memchr(buffer, '\n', end - buffer);
  373. if (!eol) {
  374. expand_tabs(&line, buffer, end - buffer);
  375. buffer = end;
  376. } else {
  377. expand_tabs(&line, buffer, (eol - buffer) + 1);
  378. buffer += (eol - buffer) + 1;
  379. }
  380. incorporate_line(&line, linenum, &document);
  381. gh_buf_clear(&line);
  382. linenum++;
  383. }
  384. gh_buf_free(&line);
  385. return finalize_document(document, linenum);
  386. }
  387. static void chop_trailing_hashtags(chunk *ch)
  388. {
  389. int n;
  390. chunk_rtrim(ch);
  391. n = ch->len - 1;
  392. // if string ends in #s, remove these:
  393. while (n >= 0 && peek_at(ch, n) == '#')
  394. n--;
  395. // the last # was escaped, so we include it.
  396. if (n >= 0 && peek_at(ch, n) == '\\')
  397. n++;
  398. ch->len = n + 1;
  399. }
  400. // Process one line at a time, modifying a block.
  401. static void incorporate_line(gh_buf *line, int line_number, block** curptr)
  402. {
  403. block* last_matched_container;
  404. int offset = 0;
  405. int matched = 0;
  406. int lev = 0;
  407. int i;
  408. struct ListData * data = NULL;
  409. bool all_matched = true;
  410. block* container;
  411. block* cur = *curptr;
  412. bool blank = false;
  413. int first_nonspace;
  414. int indent;
  415. chunk input;
  416. input.data = line->ptr;
  417. input.len = line->size;
  418. // container starts at the document root.
  419. container = cur->top;
  420. // for each containing block, try to parse the associated line start.
  421. // bail out on failure: container will point to the last matching block.
  422. while (container->last_child && container->last_child->open) {
  423. container = container->last_child;
  424. first_nonspace = offset;
  425. while (peek_at(&input, first_nonspace) == ' ') {
  426. first_nonspace++;
  427. }
  428. indent = first_nonspace - offset;
  429. blank = peek_at(&input, first_nonspace) == '\n';
  430. if (container->tag == block_quote) {
  431. matched = indent <= 3 && peek_at(&input, first_nonspace) == '>';
  432. if (matched) {
  433. offset = first_nonspace + 1;
  434. if (peek_at(&input, offset) == ' ')
  435. offset++;
  436. } else {
  437. all_matched = false;
  438. }
  439. } else if (container->tag == list_item) {
  440. if (indent >= container->attributes.list_data.marker_offset +
  441. container->attributes.list_data.padding) {
  442. offset += container->attributes.list_data.marker_offset +
  443. container->attributes.list_data.padding;
  444. } else if (blank) {
  445. offset = first_nonspace;
  446. } else {
  447. all_matched = false;
  448. }
  449. } else if (container->tag == indented_code) {
  450. if (indent >= CODE_INDENT) {
  451. offset += CODE_INDENT;
  452. } else if (blank) {
  453. offset = first_nonspace;
  454. } else {
  455. all_matched = false;
  456. }
  457. } else if (container->tag == atx_header ||
  458. container->tag == setext_header) {
  459. // a header can never contain more than one line
  460. all_matched = false;
  461. } else if (container->tag == fenced_code) {
  462. // skip optional spaces of fence offset
  463. i = container->attributes.fenced_code_data.fence_offset;
  464. while (i > 0 && peek_at(&input, offset) == ' ') {
  465. offset++;
  466. i--;
  467. }
  468. } else if (container->tag == html_block) {
  469. if (blank) {
  470. all_matched = false;
  471. }
  472. } else if (container->tag == paragraph) {
  473. if (blank) {
  474. container->last_line_blank = true;
  475. all_matched = false;
  476. }
  477. }
  478. if (!all_matched) {
  479. container = container->parent; // back up to last matching block
  480. break;
  481. }
  482. }
  483. last_matched_container = container;
  484. // check to see if we've hit 2nd blank line, break out of list:
  485. if (blank && container->last_line_blank) {
  486. break_out_of_lists(&container, line_number);
  487. }
  488. // unless last matched container is code block, try new container starts:
  489. while (container->tag != fenced_code && container->tag != indented_code &&
  490. container->tag != html_block) {
  491. first_nonspace = offset;
  492. while (peek_at(&input, first_nonspace) == ' ')
  493. first_nonspace++;
  494. indent = first_nonspace - offset;
  495. blank = peek_at(&input, first_nonspace) == '\n';
  496. if (indent >= CODE_INDENT) {
  497. if (cur->tag != paragraph && !blank) {
  498. offset += CODE_INDENT;
  499. container = add_child(container, indented_code, line_number, offset + 1);
  500. } else { // indent > 4 in lazy line
  501. break;
  502. }
  503. } else if (peek_at(&input, first_nonspace) == '>') {
  504. offset = first_nonspace + 1;
  505. // optional following character
  506. if (peek_at(&input, offset) == ' ')
  507. offset++;
  508. container = add_child(container, block_quote, line_number, offset + 1);
  509. } else if ((matched = scan_atx_header_start(&input, first_nonspace))) {
  510. offset = first_nonspace + matched;
  511. container = add_child(container, atx_header, line_number, offset + 1);
  512. int hashpos = chunk_strchr(&input, '#', first_nonspace);
  513. int level = 0;
  514. while (peek_at(&input, hashpos) == '#') {
  515. level++;
  516. hashpos++;
  517. }
  518. container->attributes.header_level = level;
  519. } else if ((matched = scan_open_code_fence(&input, first_nonspace))) {
  520. container = add_child(container, fenced_code, line_number, first_nonspace + 1);
  521. container->attributes.fenced_code_data.fence_char = peek_at(&input, first_nonspace);
  522. container->attributes.fenced_code_data.fence_length = matched;
  523. container->attributes.fenced_code_data.fence_offset = first_nonspace - offset;
  524. offset = first_nonspace + matched;
  525. } else if ((matched = scan_html_block_tag(&input, first_nonspace))) {
  526. container = add_child(container, html_block, line_number, first_nonspace + 1);
  527. // note, we don't adjust offset because the tag is part of the text
  528. } else if (container->tag == paragraph &&
  529. (lev = scan_setext_header_line(&input, first_nonspace)) &&
  530. // check that there is only one line in the paragraph:
  531. gh_buf_strrchr(&container->string_content, '\n',
  532. gh_buf_len(&container->string_content) - 2) < 0) {
  533. container->tag = setext_header;
  534. container->attributes.header_level = lev;
  535. offset = input.len - 1;
  536. } else if (!(container->tag == paragraph && !all_matched) &&
  537. (matched = scan_hrule(&input, first_nonspace))) {
  538. // it's only now that we know the line is not part of a setext header:
  539. container = add_child(container, hrule, line_number, first_nonspace + 1);
  540. finalize(container, line_number);
  541. container = container->parent;
  542. offset = input.len - 1;
  543. } else if ((matched = parse_list_marker(&input, first_nonspace, &data))) {
  544. // compute padding:
  545. offset = first_nonspace + matched;
  546. i = 0;
  547. while (i <= 5 && peek_at(&input, offset + i) == ' ') {
  548. i++;
  549. }
  550. // i = number of spaces after marker, up to 5
  551. if (i >= 5 || i < 1 || peek_at(&input, offset) == '\n') {
  552. data->padding = matched + 1;
  553. if (i > 0) {
  554. offset += 1;
  555. }
  556. } else {
  557. data->padding = matched + i;
  558. offset += i;
  559. }
  560. // check container; if it's a list, see if this list item
  561. // can continue the list; otherwise, create a list container.
  562. data->marker_offset = indent;
  563. if (container->tag != list ||
  564. !lists_match(container->attributes.list_data, *data)) {
  565. container = add_child(container, list, line_number,
  566. first_nonspace + 1);
  567. container->attributes.list_data = *data;
  568. }
  569. // add the list item
  570. container = add_child(container, list_item, line_number,
  571. first_nonspace + 1);
  572. /* TODO: static */
  573. container->attributes.list_data = *data;
  574. free(data);
  575. } else {
  576. break;
  577. }
  578. if (accepts_lines(container->tag)) {
  579. // if it's a line container, it can't contain other containers
  580. break;
  581. }
  582. }
  583. // what remains at offset is a text line. add the text to the
  584. // appropriate container.
  585. first_nonspace = offset;
  586. while (peek_at(&input, first_nonspace) == ' ')
  587. first_nonspace++;
  588. indent = first_nonspace - offset;
  589. blank = peek_at(&input, first_nonspace) == '\n';
  590. // block quote lines are never blank as they start with >
  591. // and we don't count blanks in fenced code for purposes of tight/loose
  592. // lists or breaking out of lists. we also don't set last_line_blank
  593. // on an empty list item.
  594. container->last_line_blank = (blank &&
  595. container->tag != block_quote &&
  596. container->tag != fenced_code &&
  597. !(container->tag == list_item &&
  598. container->children == NULL &&
  599. container->start_line == line_number));
  600. block *cont = container;
  601. while (cont->parent) {
  602. cont->parent->last_line_blank = false;
  603. cont = cont->parent;
  604. }
  605. if (cur != last_matched_container &&
  606. container == last_matched_container &&
  607. !blank &&
  608. cur->tag == paragraph &&
  609. gh_buf_len(&cur->string_content) > 0) {
  610. add_line(cur, &input, offset);
  611. } else { // not a lazy continuation
  612. // finalize any blocks that were not matched and set cur to container:
  613. while (cur != last_matched_container) {
  614. finalize(cur, line_number);
  615. cur = cur->parent;
  616. assert(cur != NULL);
  617. }
  618. if (container->tag == indented_code) {
  619. add_line(container, &input, offset);
  620. } else if (container->tag == fenced_code) {
  621. matched = (indent <= 3
  622. && peek_at(&input, first_nonspace) == container->attributes.fenced_code_data.fence_char)
  623. && scan_close_code_fence(&input, first_nonspace,
  624. container->attributes.fenced_code_data.fence_length);
  625. if (matched) {
  626. // if closing fence, don't add line to container; instead, close it:
  627. finalize(container, line_number);
  628. container = container->parent; // back up to parent
  629. } else {
  630. add_line(container, &input, offset);
  631. }
  632. } else if (container->tag == html_block) {
  633. add_line(container, &input, offset);
  634. } else if (blank) {
  635. // ??? do nothing
  636. } else if (container->tag == atx_header) {
  637. chop_trailing_hashtags(&input);
  638. add_line(container, &input, first_nonspace);
  639. finalize(container, line_number);
  640. container = container->parent;
  641. } else if (accepts_lines(container->tag)) {
  642. add_line(container, &input, first_nonspace);
  643. } else if (container->tag != hrule && container->tag != setext_header) {
  644. // create paragraph container for line
  645. container = add_child(container, paragraph, line_number, first_nonspace + 1);
  646. add_line(container, &input, first_nonspace);
  647. } else {
  648. assert(false);
  649. }
  650. *curptr = container;
  651. }
  652. }