aboutsummaryrefslogtreecommitdiff
path: root/src/blocks.c
blob: 94ff986ef5e584cbed67837b4b21f6639fc8084a (plain)
  1. #include <stdlib.h>
  2. #include <assert.h>
  3. #include <stdio.h>
  4. #include <stdbool.h>
  5. #include <ctype.h>
  6. #include "stmd.h"
  7. #include "scanners.h"
  8. #include "uthash.h"
  9. #define peek_at(i, n) (i)->data[n]
  10. static void incorporate_line(gh_buf *ln, int line_number, block** curptr);
  11. static void finalize(block* b, int line_number);
  12. static block* make_block(int tag, int start_line, int start_column)
  13. {
  14. block* e;
  15. e = (block*) malloc(sizeof(block));
  16. e->tag = tag;
  17. e->open = true;
  18. e->last_line_blank = false;
  19. e->start_line = start_line;
  20. e->start_column = start_column;
  21. e->end_line = start_line;
  22. e->children = NULL;
  23. e->last_child = NULL;
  24. e->parent = NULL;
  25. e->top = NULL;
  26. e->attributes.refmap = NULL;
  27. gh_buf_init(&e->string_content, 32);
  28. e->inline_content = NULL;
  29. e->next = NULL;
  30. e->prev = NULL;
  31. return e;
  32. }
  33. // Create a root document block.
  34. extern block* make_document()
  35. {
  36. block * e = make_block(document, 1, 1);
  37. reference * map = NULL;
  38. reference ** refmap;
  39. refmap = (reference**) malloc(sizeof(reference*));
  40. *refmap = map;
  41. e->attributes.refmap = refmap;
  42. e->top = e;
  43. return e;
  44. }
  45. // Returns true if line has only space characters, else false.
  46. bool is_blank(gh_buf *s, int offset)
  47. {
  48. while (offset < s->size) {
  49. switch (s->ptr[offset]) {
  50. case '\n':
  51. return true;
  52. case ' ':
  53. offset++;
  54. default:
  55. return false;
  56. }
  57. }
  58. return true;
  59. }
  60. static inline bool can_contain(int parent_type, int child_type)
  61. {
  62. return ( parent_type == document ||
  63. parent_type == block_quote ||
  64. parent_type == list_item ||
  65. (parent_type == list && child_type == list_item) );
  66. }
  67. static inline bool accepts_lines(int block_type)
  68. {
  69. return (block_type == paragraph ||
  70. block_type == atx_header ||
  71. block_type == indented_code ||
  72. block_type == fenced_code);
  73. }
  74. static void add_line(block* block, chunk *ch, int offset)
  75. {
  76. assert(block->open);
  77. gh_buf_put(&block->string_content, ch->data + offset, ch->len - offset);
  78. }
  79. static void remove_trailing_blank_lines(gh_buf *ln)
  80. {
  81. int i;
  82. for (i = ln->size - 1; i >= 0; --i) {
  83. char c = ln->ptr[i];
  84. if (c != ' ' && c != '\t' && c != '\r' && c != '\n')
  85. break;
  86. }
  87. if (i < 0) {
  88. gh_buf_clear(ln);
  89. return;
  90. }
  91. i = gh_buf_strchr(ln, '\n', i);
  92. if (i >= 0)
  93. gh_buf_truncate(ln, i);
  94. }
  95. // Check to see if a block ends with a blank line, descending
  96. // if needed into lists and sublists.
  97. static bool ends_with_blank_line(block* block)
  98. {
  99. if (block->last_line_blank) {
  100. return true;
  101. }
  102. if ((block->tag == list || block->tag == list_item) && block->last_child) {
  103. return ends_with_blank_line(block->last_child);
  104. } else {
  105. return false;
  106. }
  107. }
  108. // Break out of all containing lists
  109. static int break_out_of_lists(block ** bptr, int line_number)
  110. {
  111. block * container = *bptr;
  112. block * b = container->top;
  113. // find first containing list:
  114. while (b && b->tag != list) {
  115. b = b->last_child;
  116. }
  117. if (b) {
  118. while (container && container != b) {
  119. finalize(container, line_number);
  120. container = container->parent;
  121. }
  122. finalize(b, line_number);
  123. *bptr = b->parent;
  124. }
  125. return 0;
  126. }
  127. static void finalize(block* b, int line_number)
  128. {
  129. int firstlinelen;
  130. int pos;
  131. block* item;
  132. block* subitem;
  133. if (!b->open)
  134. return; // don't do anything if the block is already closed
  135. b->open = false;
  136. if (line_number > b->start_line) {
  137. b->end_line = line_number - 1;
  138. } else {
  139. b->end_line = line_number;
  140. }
  141. switch (b->tag) {
  142. case paragraph:
  143. pos = 0;
  144. while (gh_buf_at(&b->string_content, 0) == '[' &&
  145. (pos = parse_reference(&b->string_content, b->top->attributes.refmap))) {
  146. gh_buf_drop(&b->string_content, pos);
  147. }
  148. if (is_blank(&b->string_content, 0)) {
  149. b->tag = reference_def;
  150. }
  151. break;
  152. case indented_code:
  153. remove_trailing_blank_lines(&b->string_content);
  154. gh_buf_putc(&b->string_content, '\n');
  155. break;
  156. case fenced_code:
  157. // first line of contents becomes info
  158. firstlinelen = gh_buf_strchr(&b->string_content, '\n', 0);
  159. gh_buf_init(&b->attributes.fenced_code_data.info, 0);
  160. gh_buf_set(
  161. &b->attributes.fenced_code_data.info,
  162. b->string_content.ptr,
  163. firstlinelen
  164. );
  165. gh_buf_drop(&b->string_content, firstlinelen + 1);
  166. gh_buf_trim(&b->attributes.fenced_code_data.info);
  167. unescape_buffer(&b->attributes.fenced_code_data.info);
  168. break;
  169. case list: // determine tight/loose status
  170. b->attributes.list_data.tight = true; // tight by default
  171. item = b->children;
  172. while (item) {
  173. // check for non-final non-empty list item ending with blank line:
  174. if (item->last_line_blank && item->next) {
  175. b->attributes.list_data.tight = false;
  176. break;
  177. }
  178. // recurse into children of list item, to see if there are
  179. // spaces between them:
  180. subitem = item->children;
  181. while (subitem) {
  182. if (ends_with_blank_line(subitem) &&
  183. (item->next || subitem->next)) {
  184. b->attributes.list_data.tight = false;
  185. break;
  186. }
  187. subitem = subitem->next;
  188. }
  189. if (!(b->attributes.list_data.tight)) {
  190. break;
  191. }
  192. item = item->next;
  193. }
  194. break;
  195. default:
  196. break;
  197. }
  198. }
  199. // Add a block as child of another. Return pointer to child.
  200. extern block* add_child(block* parent,
  201. int block_type, int start_line, int start_column)
  202. {
  203. assert(parent);
  204. // if 'parent' isn't the kind of block that can accept this child,
  205. // then back up til we hit a block that can.
  206. while (!can_contain(parent->tag, block_type)) {
  207. finalize(parent, start_line);
  208. parent = parent->parent;
  209. }
  210. block* child = make_block(block_type, start_line, start_column);
  211. child->parent = parent;
  212. child->top = parent->top;
  213. if (parent->last_child) {
  214. parent->last_child->next = child;
  215. child->prev = parent->last_child;
  216. } else {
  217. parent->children = child;
  218. child->prev = NULL;
  219. }
  220. parent->last_child = child;
  221. return child;
  222. }
  223. // Free a block list and any children.
  224. extern void free_blocks(block* e)
  225. {
  226. block * next;
  227. while (e != NULL) {
  228. next = e->next;
  229. free_inlines(e->inline_content);
  230. gh_buf_free(&e->string_content);
  231. if (e->tag == fenced_code) {
  232. gh_buf_free(&e->attributes.fenced_code_data.info);
  233. } else if (e->tag == document) {
  234. free_reference_map(e->attributes.refmap);
  235. }
  236. free_blocks(e->children);
  237. free(e);
  238. e = next;
  239. }
  240. }
  241. // Walk through block and all children, recursively, parsing
  242. // string content into inline content where appropriate.
  243. void process_inlines(block* cur, reference** refmap)
  244. {
  245. switch (cur->tag) {
  246. case paragraph:
  247. case atx_header:
  248. case setext_header:
  249. cur->inline_content = parse_inlines(&cur->string_content, refmap);
  250. // MEM
  251. // gh_buf_free(&cur->string_content);
  252. break;
  253. default:
  254. break;
  255. }
  256. block *child = cur->children;
  257. while (child != NULL) {
  258. process_inlines(child, refmap);
  259. child = child->next;
  260. }
  261. }
  262. // Attempts to parse a list item marker (bullet or enumerated).
  263. // On success, returns length of the marker, and populates
  264. // data with the details. On failure, returns 0.
  265. static int parse_list_marker(chunk *input, int pos, struct ListData ** dataptr)
  266. {
  267. unsigned char c;
  268. int startpos;
  269. struct ListData * data;
  270. startpos = pos;
  271. c = peek_at(input, pos);
  272. if ((c == '*' || c == '-' || c == '+') && !scan_hrule(input, pos)) {
  273. pos++;
  274. if (!isspace(peek_at(input, pos))) {
  275. return 0;
  276. }
  277. data = malloc(sizeof(struct ListData));
  278. data->marker_offset = 0; // will be adjusted later
  279. data->list_type = bullet;
  280. data->bullet_char = c;
  281. data->start = 1;
  282. data->delimiter = period;
  283. data->tight = false;
  284. } else if (isdigit(c)) {
  285. int start = 0;
  286. do {
  287. start = (10 * start) + (peek_at(input, pos) - '0');
  288. pos++;
  289. } while (isdigit(peek_at(input, pos)));
  290. c = peek_at(input, pos);
  291. if (c == '.' || c == ')') {
  292. pos++;
  293. if (!isspace(peek_at(input, pos))) {
  294. return 0;
  295. }
  296. data = malloc(sizeof(struct ListData));
  297. data->marker_offset = 0; // will be adjusted later
  298. data->list_type = ordered;
  299. data->bullet_char = 0;
  300. data->start = start;
  301. data->delimiter = (c == '.' ? period : parens);
  302. data->tight = false;
  303. } else {
  304. return 0;
  305. }
  306. } else {
  307. return 0;
  308. }
  309. *dataptr = data;
  310. return (pos - startpos);
  311. }
  312. // Return 1 if list item belongs in list, else 0.
  313. static int lists_match(struct ListData list_data,
  314. struct ListData item_data)
  315. {
  316. return (list_data.list_type == item_data.list_type &&
  317. list_data.delimiter == item_data.delimiter &&
  318. // list_data.marker_offset == item_data.marker_offset &&
  319. list_data.bullet_char == item_data.bullet_char);
  320. }
  321. static void expand_tabs(gh_buf *ob, const unsigned char *line, size_t size)
  322. {
  323. size_t i = 0, tab = 0;
  324. while (i < size) {
  325. size_t org = i;
  326. while (i < size && line[i] != '\t') {
  327. i++; tab++;
  328. }
  329. if (i > org)
  330. gh_buf_put(ob, line + org, i - org);
  331. if (i >= size)
  332. break;
  333. do {
  334. gh_buf_putc(ob, ' '); tab++;
  335. } while (tab % 4);
  336. i++;
  337. }
  338. }
  339. static block *finalize_document(block *document, int linenum)
  340. {
  341. while (document != document->top) {
  342. finalize(document, linenum);
  343. document = document->parent;
  344. }
  345. finalize(document, linenum);
  346. process_inlines(document, document->attributes.refmap);
  347. return document;
  348. }
  349. extern block *stmd_parse_file(FILE *f)
  350. {
  351. gh_buf line = GH_BUF_INIT;
  352. unsigned char buffer[4096];
  353. int linenum = 1;
  354. block *document = make_document();
  355. while (fgets((char *)buffer, sizeof(buffer), f)) {
  356. expand_tabs(&line, buffer, strlen((char *)buffer));
  357. incorporate_line(&line, linenum, &document);
  358. gh_buf_clear(&line);
  359. linenum++;
  360. }
  361. gh_buf_free(&line);
  362. return finalize_document(document, linenum);
  363. }
  364. extern block *stmd_parse_document(const unsigned char *buffer, size_t len)
  365. {
  366. gh_buf line = GH_BUF_INIT;
  367. int linenum = 1;
  368. const unsigned char *end = buffer + len;
  369. block *document = make_document();
  370. while (buffer < end) {
  371. const unsigned char *eol = memchr(buffer, '\n', end - buffer);
  372. if (!eol) {
  373. expand_tabs(&line, buffer, end - buffer);
  374. buffer = end;
  375. } else {
  376. expand_tabs(&line, buffer, (eol - buffer) + 1);
  377. buffer += (eol - buffer) + 1;
  378. }
  379. incorporate_line(&line, linenum, &document);
  380. gh_buf_clear(&line);
  381. linenum++;
  382. }
  383. gh_buf_free(&line);
  384. return finalize_document(document, linenum);
  385. }
  386. static void chop_trailing_hashtags(chunk *ch)
  387. {
  388. int n;
  389. chunk_rtrim(ch);
  390. n = ch->len - 1;
  391. // if string ends in #s, remove these:
  392. while (n >= 0 && peek_at(ch, n) == '#')
  393. n--;
  394. // the last # was escaped, so we include it.
  395. if (n >= 0 && peek_at(ch, n) == '\\')
  396. n++;
  397. ch->len = n + 1;
  398. }
  399. // Process one line at a time, modifying a block.
  400. static void incorporate_line(gh_buf *line, int line_number, block** curptr)
  401. {
  402. block* last_matched_container;
  403. int offset = 0;
  404. int matched = 0;
  405. int lev = 0;
  406. int i;
  407. struct ListData * data = NULL;
  408. bool all_matched = true;
  409. block* container;
  410. block* cur = *curptr;
  411. bool blank = false;
  412. int first_nonspace;
  413. int indent;
  414. chunk input;
  415. input.data = line->ptr;
  416. input.len = line->size;
  417. // container starts at the document root.
  418. container = cur->top;
  419. // for each containing block, try to parse the associated line start.
  420. // bail out on failure: container will point to the last matching block.
  421. while (container->last_child && container->last_child->open) {
  422. container = container->last_child;
  423. first_nonspace = offset;
  424. while (peek_at(&input, first_nonspace) == ' ') {
  425. first_nonspace++;
  426. }
  427. indent = first_nonspace - offset;
  428. blank = peek_at(&input, first_nonspace) == '\n';
  429. if (container->tag == block_quote) {
  430. matched = indent <= 3 && peek_at(&input, first_nonspace) == '>';
  431. if (matched) {
  432. offset = first_nonspace + 1;
  433. if (peek_at(&input, offset) == ' ')
  434. offset++;
  435. } else {
  436. all_matched = false;
  437. }
  438. } else if (container->tag == list_item) {
  439. if (indent >= container->attributes.list_data.marker_offset +
  440. container->attributes.list_data.padding) {
  441. offset += container->attributes.list_data.marker_offset +
  442. container->attributes.list_data.padding;
  443. } else if (blank) {
  444. offset = first_nonspace;
  445. } else {
  446. all_matched = false;
  447. }
  448. } else if (container->tag == indented_code) {
  449. if (indent >= CODE_INDENT) {
  450. offset += CODE_INDENT;
  451. } else if (blank) {
  452. offset = first_nonspace;
  453. } else {
  454. all_matched = false;
  455. }
  456. } else if (container->tag == atx_header ||
  457. container->tag == setext_header) {
  458. // a header can never contain more than one line
  459. all_matched = false;
  460. } else if (container->tag == fenced_code) {
  461. // skip optional spaces of fence offset
  462. i = container->attributes.fenced_code_data.fence_offset;
  463. while (i > 0 && peek_at(&input, offset) == ' ') {
  464. offset++;
  465. i--;
  466. }
  467. } else if (container->tag == html_block) {
  468. if (blank) {
  469. all_matched = false;
  470. }
  471. } else if (container->tag == paragraph) {
  472. if (blank) {
  473. container->last_line_blank = true;
  474. all_matched = false;
  475. }
  476. }
  477. if (!all_matched) {
  478. container = container->parent; // back up to last matching block
  479. break;
  480. }
  481. }
  482. last_matched_container = container;
  483. // check to see if we've hit 2nd blank line, break out of list:
  484. if (blank && container->last_line_blank) {
  485. break_out_of_lists(&container, line_number);
  486. }
  487. // unless last matched container is code block, try new container starts:
  488. while (container->tag != fenced_code && container->tag != indented_code &&
  489. container->tag != html_block) {
  490. first_nonspace = offset;
  491. while (peek_at(&input, first_nonspace) == ' ')
  492. first_nonspace++;
  493. indent = first_nonspace - offset;
  494. blank = peek_at(&input, first_nonspace) == '\n';
  495. if (indent >= CODE_INDENT) {
  496. if (cur->tag != paragraph && !blank) {
  497. offset += CODE_INDENT;
  498. container = add_child(container, indented_code, line_number, offset + 1);
  499. } else { // indent > 4 in lazy line
  500. break;
  501. }
  502. } else if (peek_at(&input, first_nonspace) == '>') {
  503. offset = first_nonspace + 1;
  504. // optional following character
  505. if (peek_at(&input, offset) == ' ')
  506. offset++;
  507. container = add_child(container, block_quote, line_number, offset + 1);
  508. } else if ((matched = scan_atx_header_start(&input, first_nonspace))) {
  509. offset = first_nonspace + matched;
  510. container = add_child(container, atx_header, line_number, offset + 1);
  511. int hashpos = chunk_strchr(&input, '#', first_nonspace);
  512. int level = 0;
  513. while (peek_at(&input, hashpos) == '#') {
  514. level++;
  515. hashpos++;
  516. }
  517. container->attributes.header_level = level;
  518. } else if ((matched = scan_open_code_fence(&input, first_nonspace))) {
  519. container = add_child(container, fenced_code, line_number, first_nonspace + 1);
  520. container->attributes.fenced_code_data.fence_char = peek_at(&input, first_nonspace);
  521. container->attributes.fenced_code_data.fence_length = matched;
  522. container->attributes.fenced_code_data.fence_offset = first_nonspace - offset;
  523. offset = first_nonspace + matched;
  524. } else if ((matched = scan_html_block_tag(&input, first_nonspace))) {
  525. container = add_child(container, html_block, line_number, first_nonspace + 1);
  526. // note, we don't adjust offset because the tag is part of the text
  527. } else if (container->tag == paragraph &&
  528. (lev = scan_setext_header_line(&input, first_nonspace)) &&
  529. // check that there is only one line in the paragraph:
  530. gh_buf_strrchr(&container->string_content, '\n',
  531. gh_buf_len(&container->string_content) - 2) < 0) {
  532. container->tag = setext_header;
  533. container->attributes.header_level = lev;
  534. offset = input.len - 1;
  535. } else if (!(container->tag == paragraph && !all_matched) &&
  536. (matched = scan_hrule(&input, first_nonspace))) {
  537. // it's only now that we know the line is not part of a setext header:
  538. container = add_child(container, hrule, line_number, first_nonspace + 1);
  539. finalize(container, line_number);
  540. container = container->parent;
  541. offset = input.len - 1;
  542. } else if ((matched = parse_list_marker(&input, first_nonspace, &data))) {
  543. // compute padding:
  544. offset = first_nonspace + matched;
  545. i = 0;
  546. while (i <= 5 && peek_at(&input, offset + i) == ' ') {
  547. i++;
  548. }
  549. // i = number of spaces after marker, up to 5
  550. if (i >= 5 || i < 1 || peek_at(&input, offset) == '\n') {
  551. data->padding = matched + 1;
  552. if (i > 0) {
  553. offset += 1;
  554. }
  555. } else {
  556. data->padding = matched + i;
  557. offset += i;
  558. }
  559. // check container; if it's a list, see if this list item
  560. // can continue the list; otherwise, create a list container.
  561. data->marker_offset = indent;
  562. if (container->tag != list ||
  563. !lists_match(container->attributes.list_data, *data)) {
  564. container = add_child(container, list, line_number,
  565. first_nonspace + 1);
  566. container->attributes.list_data = *data;
  567. }
  568. // add the list item
  569. container = add_child(container, list_item, line_number,
  570. first_nonspace + 1);
  571. /* TODO: static */
  572. container->attributes.list_data = *data;
  573. free(data);
  574. } else {
  575. break;
  576. }
  577. if (accepts_lines(container->tag)) {
  578. // if it's a line container, it can't contain other containers
  579. break;
  580. }
  581. }
  582. // what remains at offset is a text line. add the text to the
  583. // appropriate container.
  584. first_nonspace = offset;
  585. while (peek_at(&input, first_nonspace) == ' ')
  586. first_nonspace++;
  587. indent = first_nonspace - offset;
  588. blank = peek_at(&input, first_nonspace) == '\n';
  589. // block quote lines are never blank as they start with >
  590. // and we don't count blanks in fenced code for purposes of tight/loose
  591. // lists or breaking out of lists. we also don't set last_line_blank
  592. // on an empty list item.
  593. container->last_line_blank = (blank &&
  594. container->tag != block_quote &&
  595. container->tag != fenced_code &&
  596. !(container->tag == list_item &&
  597. container->children == NULL &&
  598. container->start_line == line_number));
  599. block *cont = container;
  600. while (cont->parent) {
  601. cont->parent->last_line_blank = false;
  602. cont = cont->parent;
  603. }
  604. if (cur != last_matched_container &&
  605. container == last_matched_container &&
  606. !blank &&
  607. cur->tag == paragraph &&
  608. gh_buf_len(&cur->string_content) > 0) {
  609. add_line(cur, &input, offset);
  610. } else { // not a lazy continuation
  611. // finalize any blocks that were not matched and set cur to container:
  612. while (cur != last_matched_container) {
  613. finalize(cur, line_number);
  614. cur = cur->parent;
  615. assert(cur != NULL);
  616. }
  617. if (container->tag == indented_code) {
  618. add_line(container, &input, offset);
  619. } else if (container->tag == fenced_code) {
  620. matched = (indent <= 3
  621. && peek_at(&input, first_nonspace) == container->attributes.fenced_code_data.fence_char)
  622. && scan_close_code_fence(&input, first_nonspace,
  623. container->attributes.fenced_code_data.fence_length);
  624. if (matched) {
  625. // if closing fence, don't add line to container; instead, close it:
  626. finalize(container, line_number);
  627. container = container->parent; // back up to parent
  628. } else {
  629. add_line(container, &input, offset);
  630. }
  631. } else if (container->tag == html_block) {
  632. add_line(container, &input, offset);
  633. } else if (blank) {
  634. // ??? do nothing
  635. } else if (container->tag == atx_header) {
  636. chop_trailing_hashtags(&input);
  637. add_line(container, &input, first_nonspace);
  638. finalize(container, line_number);
  639. container = container->parent;
  640. } else if (accepts_lines(container->tag)) {
  641. add_line(container, &input, first_nonspace);
  642. } else if (container->tag != hrule && container->tag != setext_header) {
  643. // create paragraph container for line
  644. container = add_child(container, paragraph, line_number, first_nonspace + 1);
  645. add_line(container, &input, first_nonspace);
  646. } else {
  647. assert(false);
  648. }
  649. *curptr = container;
  650. }
  651. }