aboutsummaryrefslogtreecommitdiff
path: root/src/blocks.c
blob: eabac03b588d2981fd4d0e727f9bcfaad73fd78c (plain)
  1. #include <stdlib.h>
  2. #include <assert.h>
  3. #include <stdio.h>
  4. #include <stdbool.h>
  5. #include <ctype.h>
  6. #include "bstrlib.h"
  7. #include "stmd.h"
  8. #include "uthash.h"
  9. #include "debug.h"
  10. #include "scanners.h"
  11. static block* make_block(int tag, int start_line, int start_column)
  12. {
  13. block* e;
  14. e = (block*) malloc(sizeof(block));
  15. e->tag = tag;
  16. e->open = true;
  17. e->last_line_blank = false;
  18. e->start_line = start_line;
  19. e->start_column = start_column;
  20. e->end_line = start_line;
  21. e->children = NULL;
  22. e->last_child = NULL;
  23. e->parent = NULL;
  24. e->top = NULL;
  25. e->attributes.refmap = NULL;
  26. gh_buf_init(&e->string_content, 32);
  27. e->string_pos = 0;
  28. e->inline_content = NULL;
  29. e->next = NULL;
  30. e->prev = NULL;
  31. return e;
  32. }
  33. // Create a root document block.
  34. extern block* make_document()
  35. {
  36. block * e = make_block(document, 1, 1);
  37. reference * map = NULL;
  38. reference ** refmap;
  39. refmap = (reference**) malloc(sizeof(reference*));
  40. *refmap = map;
  41. e->attributes.refmap = refmap;
  42. e->top = e;
  43. return e;
  44. }
  45. // Returns true if line has only space characters, else false.
  46. bool is_blank(gh_buf *s, int offset)
  47. {
  48. while (offset < s->size) {
  49. switch (s->ptr[offset]) {
  50. case '\n':
  51. return true;
  52. case ' ':
  53. offset++;
  54. default:
  55. return false;
  56. }
  57. }
  58. return true;
  59. }
  60. static inline bool can_contain(int parent_type, int child_type)
  61. {
  62. return ( parent_type == document ||
  63. parent_type == block_quote ||
  64. parent_type == list_item ||
  65. (parent_type == list && child_type == list_item) );
  66. }
  67. static inline bool accepts_lines(int block_type)
  68. {
  69. return (block_type == paragraph ||
  70. block_type == atx_header ||
  71. block_type == indented_code ||
  72. block_type == fenced_code);
  73. }
  74. static void add_line(block* block, gh_buf *ln, int offset)
  75. {
  76. assert(block->open);
  77. gh_buf_put(&block->string_content, ln->ptr + offset, ln->size - offset);
  78. }
  79. static void remove_trailing_blank_lines(gh_buf *ln)
  80. {
  81. int i;
  82. for (i = ln->size - 1; i >= 0; --i) {
  83. char c = ln->ptr[i];
  84. if (c != ' ' && c != '\t' && c != '\r' && c != '\n')
  85. break;
  86. }
  87. if (i < 0) {
  88. gh_buf_clear(ln);
  89. return;
  90. }
  91. i = gh_buf_strchr(ln, '\n', i);
  92. if (i >= 0)
  93. gh_buf_truncate(ln, i + 1);
  94. }
  95. // Check to see if a block ends with a blank line, descending
  96. // if needed into lists and sublists.
  97. static bool ends_with_blank_line(block* block)
  98. {
  99. if (block->last_line_blank) {
  100. return true;
  101. }
  102. if ((block->tag == list || block->tag == list_item) && block->last_child) {
  103. return ends_with_blank_line(block->last_child);
  104. } else {
  105. return false;
  106. }
  107. }
  108. // Break out of all containing lists
  109. static int break_out_of_lists(block ** bptr, int line_number)
  110. {
  111. block * container = *bptr;
  112. block * b = container->top;
  113. // find first containing list:
  114. while (b && b->tag != list) {
  115. b = b->last_child;
  116. }
  117. if (b) {
  118. while (container && container != b) {
  119. finalize(container, line_number);
  120. container = container->parent;
  121. }
  122. finalize(b, line_number);
  123. *bptr = b->parent;
  124. }
  125. return 0;
  126. }
  127. extern void finalize(block* b, int line_number)
  128. {
  129. int firstlinelen;
  130. int pos;
  131. block* item;
  132. block* subitem;
  133. if (!b->open)
  134. return; // don't do anything if the block is already closed
  135. b->open = false;
  136. if (line_number > b->start_line) {
  137. b->end_line = line_number - 1;
  138. } else {
  139. b->end_line = line_number;
  140. }
  141. switch (b->tag) {
  142. case paragraph:
  143. pos = 0;
  144. while (gh_buf_at(&b->string_content, b->string_pos) == '[' &&
  145. (pos = parse_reference(&b->string_content, b->string_pos,
  146. b->top->attributes.refmap))) {
  147. b->string_pos = pos;
  148. }
  149. if (is_blank(&b->string_content, b->string_pos)) {
  150. b->tag = reference_def;
  151. }
  152. break;
  153. case indented_code:
  154. remove_trailing_blank_lines(&b->string_content);
  155. gh_buf_putc(&b->string_content, '\n');
  156. break;
  157. case fenced_code:
  158. // first line of contents becomes info
  159. firstlinelen = gh_buf_strchr(&b->string_content, '\n', b->string_pos);
  160. gh_buf_set(
  161. &b->attributes.fenced_code_data.info,
  162. b->string_content.ptr + b->string_pos,
  163. firstlinelen
  164. );
  165. b->string_pos = firstlinelen + 1;
  166. gh_buf_trim(&b->attributes.fenced_code_data.info);
  167. unescape_buffer(&b->attributes.fenced_code_data.info);
  168. break;
  169. case list: // determine tight/loose status
  170. b->attributes.list_data.tight = true; // tight by default
  171. item = b->children;
  172. while (item) {
  173. // check for non-final non-empty list item ending with blank line:
  174. if (item->last_line_blank && item->next) {
  175. b->attributes.list_data.tight = false;
  176. break;
  177. }
  178. // recurse into children of list item, to see if there are
  179. // spaces between them:
  180. subitem = item->children;
  181. while (subitem) {
  182. if (ends_with_blank_line(subitem) &&
  183. (item->next || subitem->next)) {
  184. b->attributes.list_data.tight = false;
  185. break;
  186. }
  187. subitem = subitem->next;
  188. }
  189. if (!(b->attributes.list_data.tight)) {
  190. break;
  191. }
  192. item = item->next;
  193. }
  194. break;
  195. default:
  196. break;
  197. }
  198. }
  199. // Add a block as child of another. Return pointer to child.
  200. extern block* add_child(block* parent,
  201. int block_type, int start_line, int start_column)
  202. {
  203. assert(parent);
  204. // if 'parent' isn't the kind of block that can accept this child,
  205. // then back up til we hit a block that can.
  206. while (!can_contain(parent->tag, block_type)) {
  207. finalize(parent, start_line);
  208. parent = parent->parent;
  209. }
  210. block* child = make_block(block_type, start_line, start_column);
  211. child->parent = parent;
  212. child->top = parent->top;
  213. if (parent->last_child) {
  214. parent->last_child->next = child;
  215. child->prev = parent->last_child;
  216. } else {
  217. parent->children = child;
  218. child->prev = NULL;
  219. }
  220. parent->last_child = child;
  221. return child;
  222. }
  223. // Free a block list and any children.
  224. extern void free_blocks(block* e)
  225. {
  226. block * next;
  227. while (e != NULL) {
  228. next = e->next;
  229. free_inlines(e->inline_content);
  230. gh_buf_free(&e->string_content);
  231. if (e->tag == fenced_code) {
  232. gh_buf_free(&e->attributes.fenced_code_data.info);
  233. } else if (e->tag == document) {
  234. free_reference_map(e->attributes.refmap);
  235. }
  236. free_blocks(e->children);
  237. free(e);
  238. e = next;
  239. }
  240. }
  241. // Walk through block and all children, recursively, parsing
  242. // string content into inline content where appropriate.
  243. void process_inlines(block* cur, reference** refmap)
  244. {
  245. switch (cur->tag) {
  246. case paragraph:
  247. case atx_header:
  248. case setext_header:
  249. cur->inline_content = parse_inlines(&cur->string_content, cur->string_pos, refmap);
  250. // MEM
  251. // gh_buf_free(&cur->string_content);
  252. break;
  253. default:
  254. break;
  255. }
  256. block *child = cur->children;
  257. while (child != NULL) {
  258. process_inlines(child, refmap);
  259. child = child->next;
  260. }
  261. }
  262. // Attempts to parse a list item marker (bullet or enumerated).
  263. // On success, returns length of the marker, and populates
  264. // data with the details. On failure, returns 0.
  265. static int parse_list_marker(gh_buf *ln, int pos,
  266. struct ListData ** dataptr)
  267. {
  268. char c;
  269. int startpos;
  270. struct ListData * data;
  271. startpos = pos;
  272. c = gh_buf_at(ln, pos);
  273. if ((c == '*' || c == '-' || c == '+') && !scan_hrule(ln, pos)) {
  274. pos++;
  275. if (!isspace(gh_buf_at(ln, pos))) {
  276. return 0;
  277. }
  278. data = malloc(sizeof(struct ListData));
  279. data->marker_offset = 0; // will be adjusted later
  280. data->list_type = bullet;
  281. data->bullet_char = c;
  282. data->start = 1;
  283. data->delimiter = period;
  284. data->tight = false;
  285. } else if (isdigit(c)) {
  286. int start = 0;
  287. do {
  288. start = (10 * start) + (gh_buf_at(ln, pos) - '0');
  289. pos++;
  290. } while (isdigit(gh_buf_at(ln, pos)));
  291. c = gh_buf_at(ln, pos);
  292. if (c == '.' || c == ')') {
  293. pos++;
  294. if (!isspace(gh_buf_at(ln, pos))) {
  295. return 0;
  296. }
  297. data = malloc(sizeof(struct ListData));
  298. data->marker_offset = 0; // will be adjusted later
  299. data->list_type = ordered;
  300. data->bullet_char = 0;
  301. data->start = start;
  302. data->delimiter = (c == '.' ? period : parens);
  303. data->tight = false;
  304. } else {
  305. return 0;
  306. }
  307. } else {
  308. return 0;
  309. }
  310. *dataptr = data;
  311. return (pos - startpos);
  312. }
  313. // Return 1 if list item belongs in list, else 0.
  314. static int lists_match(struct ListData list_data,
  315. struct ListData item_data)
  316. {
  317. return (list_data.list_type == item_data.list_type &&
  318. list_data.delimiter == item_data.delimiter &&
  319. // list_data.marker_offset == item_data.marker_offset &&
  320. list_data.bullet_char == item_data.bullet_char);
  321. }
  322. static void expand_tabs(gh_buf *ob, const char *line, size_t size)
  323. {
  324. size_t i = 0, tab = 0;
  325. while (i < size) {
  326. size_t org = i;
  327. while (i < size && line[i] != '\t') {
  328. i++; tab++;
  329. }
  330. if (i > org)
  331. gh_buf_put(ob, line + org, i - org);
  332. if (i >= size)
  333. break;
  334. do {
  335. gh_buf_putc(ob, ' '); tab++;
  336. } while (tab % 4);
  337. i++;
  338. }
  339. }
  340. extern block *stmd_parse_document(const char *buffer, size_t len)
  341. {
  342. gh_buf line = GH_BUF_INIT;
  343. block *document = make_document();
  344. int linenum = 1;
  345. const char *end = buffer + len;
  346. while (buffer < end) {
  347. const char *eol = memchr(buffer, '\n', end - buffer);
  348. if (!eol) {
  349. expand_tabs(&line, buffer, end - buffer);
  350. buffer = end;
  351. } else {
  352. expand_tabs(&line, buffer, (eol - buffer) + 1);
  353. buffer += (eol - buffer) + 1;
  354. }
  355. incorporate_line(&line, linenum, &document);
  356. gh_buf_clear(&line);
  357. linenum++;
  358. }
  359. gh_buf_free(&line);
  360. while (document != document->top) {
  361. finalize(document, linenum);
  362. document = document->parent;
  363. }
  364. finalize(document, linenum);
  365. process_inlines(document, document->attributes.refmap);
  366. return document;
  367. }
  368. // Process one line at a time, modifying a block.
  369. // Returns 0 if successful. curptr is changed to point to
  370. // the currently open block.
  371. extern void incorporate_line(gh_buf *ln, int line_number, block** curptr)
  372. {
  373. block* last_matched_container;
  374. int offset = 0;
  375. int matched = 0;
  376. int lev = 0;
  377. int i;
  378. struct ListData * data = NULL;
  379. bool all_matched = true;
  380. block* container;
  381. block* cur = *curptr;
  382. bool blank = false;
  383. int first_nonspace;
  384. int indent;
  385. // container starts at the document root.
  386. container = cur->top;
  387. // for each containing block, try to parse the associated line start.
  388. // bail out on failure: container will point to the last matching block.
  389. while (container->last_child && container->last_child->open) {
  390. container = container->last_child;
  391. first_nonspace = offset;
  392. while (gh_buf_at(ln, first_nonspace) == ' ') {
  393. first_nonspace++;
  394. }
  395. indent = first_nonspace - offset;
  396. blank = gh_buf_at(ln, first_nonspace) == '\n';
  397. if (container->tag == block_quote) {
  398. matched = indent <= 3 && gh_buf_at(ln, first_nonspace) == '>';
  399. if (matched) {
  400. offset = first_nonspace + 1;
  401. if (gh_buf_at(ln, offset) == ' ') {
  402. offset++;
  403. }
  404. } else {
  405. all_matched = false;
  406. }
  407. } else if (container->tag == list_item) {
  408. if (indent >= container->attributes.list_data.marker_offset +
  409. container->attributes.list_data.padding) {
  410. offset += container->attributes.list_data.marker_offset +
  411. container->attributes.list_data.padding;
  412. } else if (blank) {
  413. offset = first_nonspace;
  414. } else {
  415. all_matched = false;
  416. }
  417. } else if (container->tag == indented_code) {
  418. if (indent >= CODE_INDENT) {
  419. offset += CODE_INDENT;
  420. } else if (blank) {
  421. offset = first_nonspace;
  422. } else {
  423. all_matched = false;
  424. }
  425. } else if (container->tag == atx_header ||
  426. container->tag == setext_header) {
  427. // a header can never contain more than one line
  428. all_matched = false;
  429. } else if (container->tag == fenced_code) {
  430. // skip optional spaces of fence offset
  431. i = container->attributes.fenced_code_data.fence_offset;
  432. while (i > 0 && gh_buf_at(ln, offset) == ' ') {
  433. offset++;
  434. i--;
  435. }
  436. } else if (container->tag == html_block) {
  437. if (blank) {
  438. all_matched = false;
  439. }
  440. } else if (container->tag == paragraph) {
  441. if (blank) {
  442. container->last_line_blank = true;
  443. all_matched = false;
  444. }
  445. }
  446. if (!all_matched) {
  447. container = container->parent; // back up to last matching block
  448. break;
  449. }
  450. }
  451. last_matched_container = container;
  452. // check to see if we've hit 2nd blank line, break out of list:
  453. if (blank && container->last_line_blank) {
  454. break_out_of_lists(&container, line_number);
  455. }
  456. // unless last matched container is code block, try new container starts:
  457. while (container->tag != fenced_code && container->tag != indented_code &&
  458. container->tag != html_block) {
  459. first_nonspace = offset;
  460. while (gh_buf_at(ln, first_nonspace) == ' ') {
  461. first_nonspace++;
  462. }
  463. indent = first_nonspace - offset;
  464. blank = gh_buf_at(ln, first_nonspace) == '\n';
  465. if (indent >= CODE_INDENT) {
  466. if (cur->tag != paragraph && !blank) {
  467. offset += CODE_INDENT;
  468. container = add_child(container, indented_code, line_number, offset + 1);
  469. } else { // indent > 4 in lazy line
  470. break;
  471. }
  472. } else if (gh_buf_at(ln, first_nonspace) == '>') {
  473. offset = first_nonspace + 1;
  474. // optional following character
  475. if (gh_buf_at(ln, offset) == ' ') {
  476. offset++;
  477. }
  478. container = add_child(container, block_quote, line_number, offset + 1);
  479. } else if ((matched = scan_atx_header_start(ln, first_nonspace))) {
  480. offset = first_nonspace + matched;
  481. container = add_child(container, atx_header, line_number, offset + 1);
  482. int hashpos = gh_buf_strchr(ln, '#', first_nonspace);
  483. assert(hashpos >= 0);
  484. int level = 0;
  485. while (gh_buf_at(ln, hashpos) == '#') {
  486. level++;
  487. hashpos++;
  488. }
  489. container->attributes.header_level = level;
  490. } else if ((matched = scan_open_code_fence(ln, first_nonspace))) {
  491. container = add_child(container, fenced_code, line_number,
  492. first_nonspace + 1);
  493. container->attributes.fenced_code_data.fence_char = gh_buf_at(ln,
  494. first_nonspace);
  495. container->attributes.fenced_code_data.fence_length = matched;
  496. container->attributes.fenced_code_data.fence_offset =
  497. first_nonspace - offset;
  498. offset = first_nonspace + matched;
  499. } else if ((matched = scan_html_block_tag(ln, first_nonspace))) {
  500. container = add_child(container, html_block, line_number,
  501. first_nonspace + 1);
  502. // note, we don't adjust offset because the tag is part of the text
  503. } else if (container->tag == paragraph &&
  504. (lev = scan_setext_header_line(ln, first_nonspace)) &&
  505. // check that there is only one line in the paragraph:
  506. gh_buf_strrchr(&container->string_content, '\n',
  507. gh_buf_len(&container->string_content) - 2) < 0) {
  508. container->tag = setext_header;
  509. container->attributes.header_level = lev;
  510. offset = gh_buf_len(ln) - 1;
  511. } else if (!(container->tag == paragraph && !all_matched) &&
  512. (matched = scan_hrule(ln, first_nonspace))) {
  513. // it's only now that we know the line is not part of a setext header:
  514. container = add_child(container, hrule, line_number, first_nonspace + 1);
  515. finalize(container, line_number);
  516. container = container->parent;
  517. offset = gh_buf_len(ln) - 1;
  518. } else if ((matched = parse_list_marker(ln, first_nonspace, &data))) {
  519. // compute padding:
  520. offset = first_nonspace + matched;
  521. i = 0;
  522. while (i <= 5 && gh_buf_at(ln, offset + i) == ' ') {
  523. i++;
  524. }
  525. // i = number of spaces after marker, up to 5
  526. if (i >= 5 || i < 1 || gh_buf_at(ln, offset) == '\n') {
  527. data->padding = matched + 1;
  528. if (i > 0) {
  529. offset += 1;
  530. }
  531. } else {
  532. data->padding = matched + i;
  533. offset += i;
  534. }
  535. // check container; if it's a list, see if this list item
  536. // can continue the list; otherwise, create a list container.
  537. data->marker_offset = indent;
  538. if (container->tag != list ||
  539. !lists_match(container->attributes.list_data, *data)) {
  540. container = add_child(container, list, line_number,
  541. first_nonspace + 1);
  542. container->attributes.list_data = *data;
  543. }
  544. // add the list item
  545. container = add_child(container, list_item, line_number,
  546. first_nonspace + 1);
  547. container->attributes.list_data = *data;
  548. free(data);
  549. } else {
  550. break;
  551. }
  552. if (accepts_lines(container->tag)) {
  553. // if it's a line container, it can't contain other containers
  554. break;
  555. }
  556. }
  557. // what remains at offset is a text line. add the text to the
  558. // appropriate container.
  559. first_nonspace = offset;
  560. while (gh_buf_at(ln, first_nonspace) == ' ') {
  561. first_nonspace++;
  562. }
  563. indent = first_nonspace - offset;
  564. blank = gh_buf_at(ln, first_nonspace) == '\n';
  565. // block quote lines are never blank as they start with >
  566. // and we don't count blanks in fenced code for purposes of tight/loose
  567. // lists or breaking out of lists. we also don't set last_line_blank
  568. // on an empty list item.
  569. container->last_line_blank = (blank &&
  570. container->tag != block_quote &&
  571. container->tag != fenced_code &&
  572. !(container->tag == list_item &&
  573. container->children == NULL &&
  574. container->start_line == line_number));
  575. block *cont = container;
  576. while (cont->parent) {
  577. cont->parent->last_line_blank = false;
  578. cont = cont->parent;
  579. }
  580. if (cur != last_matched_container &&
  581. container == last_matched_container &&
  582. !blank &&
  583. cur->tag == paragraph &&
  584. gh_buf_len(&cur->string_content) > 0) {
  585. add_line(cur, ln, offset);
  586. } else { // not a lazy continuation
  587. // finalize any blocks that were not matched and set cur to container:
  588. while (cur != last_matched_container) {
  589. finalize(cur, line_number);
  590. cur = cur->parent;
  591. assert(cur != NULL);
  592. }
  593. if (container->tag == indented_code) {
  594. add_line(container, ln, offset);
  595. } else if (container->tag == fenced_code) {
  596. matched = (indent <= 3
  597. && gh_buf_at(ln, first_nonspace) == container->attributes.fenced_code_data.fence_char)
  598. && scan_close_code_fence(ln, first_nonspace,
  599. container->attributes.fenced_code_data.fence_length);
  600. if (matched) {
  601. // if closing fence, don't add line to container; instead, close it:
  602. finalize(container, line_number);
  603. container = container->parent; // back up to parent
  604. } else {
  605. add_line(container, ln, offset);
  606. }
  607. } else if (container->tag == html_block) {
  608. add_line(container, ln, offset);
  609. } else if (blank) {
  610. // ??? do nothing
  611. } else if (container->tag == atx_header) {
  612. // chop off trailing ###s...use a scanner?
  613. gh_buf_trim(ln);
  614. int p = gh_buf_len(ln) - 1;
  615. // if string ends in #s, remove these:
  616. while (gh_buf_at(ln, p) == '#') {
  617. p--;
  618. }
  619. if (gh_buf_at(ln, p) == '\\') {
  620. // the last # was escaped, so we include it.
  621. p++;
  622. }
  623. gh_buf_truncate(ln, p + 1);
  624. add_line(container, ln, first_nonspace);
  625. finalize(container, line_number);
  626. container = container->parent;
  627. } else if (accepts_lines(container->tag)) {
  628. add_line(container, ln, first_nonspace);
  629. } else if (container->tag != hrule && container->tag != setext_header) {
  630. // create paragraph container for line
  631. container = add_child(container, paragraph, line_number, first_nonspace + 1);
  632. add_line(container, ln, first_nonspace);
  633. } else {
  634. assert(false);
  635. }
  636. *curptr = container;
  637. }
  638. }