aboutsummaryrefslogtreecommitdiff
path: root/src/blocks.c
blob: 71dc8300632ab7e5286eab5efa6cd67c55eb5ed2 (plain)
  1. #include <stdlib.h>
  2. #include <assert.h>
  3. #include <stdio.h>
  4. #include <stdbool.h>
  5. #include <ctype.h>
  6. #include "stmd.h"
  7. #include "scanners.h"
  8. #include "uthash.h"
  9. static void finalize(block* b, int line_number);
  10. static block* make_block(int tag, int start_line, int start_column)
  11. {
  12. block* e;
  13. e = (block*) malloc(sizeof(block));
  14. e->tag = tag;
  15. e->open = true;
  16. e->last_line_blank = false;
  17. e->start_line = start_line;
  18. e->start_column = start_column;
  19. e->end_line = start_line;
  20. e->children = NULL;
  21. e->last_child = NULL;
  22. e->parent = NULL;
  23. e->top = NULL;
  24. e->attributes.refmap = NULL;
  25. gh_buf_init(&e->string_content, 32);
  26. e->string_pos = 0;
  27. e->inline_content = NULL;
  28. e->next = NULL;
  29. e->prev = NULL;
  30. return e;
  31. }
  32. // Create a root document block.
  33. extern block* make_document()
  34. {
  35. block * e = make_block(document, 1, 1);
  36. reference * map = NULL;
  37. reference ** refmap;
  38. refmap = (reference**) malloc(sizeof(reference*));
  39. *refmap = map;
  40. e->attributes.refmap = refmap;
  41. e->top = e;
  42. return e;
  43. }
  44. // Returns true if line has only space characters, else false.
  45. bool is_blank(gh_buf *s, int offset)
  46. {
  47. while (offset < s->size) {
  48. switch (s->ptr[offset]) {
  49. case '\n':
  50. return true;
  51. case ' ':
  52. offset++;
  53. default:
  54. return false;
  55. }
  56. }
  57. return true;
  58. }
  59. static inline bool can_contain(int parent_type, int child_type)
  60. {
  61. return ( parent_type == document ||
  62. parent_type == block_quote ||
  63. parent_type == list_item ||
  64. (parent_type == list && child_type == list_item) );
  65. }
  66. static inline bool accepts_lines(int block_type)
  67. {
  68. return (block_type == paragraph ||
  69. block_type == atx_header ||
  70. block_type == indented_code ||
  71. block_type == fenced_code);
  72. }
  73. static void add_line(block* block, gh_buf *ln, int offset)
  74. {
  75. assert(block->open);
  76. gh_buf_put(&block->string_content, ln->ptr + offset, ln->size - offset);
  77. }
  78. static void remove_trailing_blank_lines(gh_buf *ln)
  79. {
  80. int i;
  81. for (i = ln->size - 1; i >= 0; --i) {
  82. char c = ln->ptr[i];
  83. if (c != ' ' && c != '\t' && c != '\r' && c != '\n')
  84. break;
  85. }
  86. if (i < 0) {
  87. gh_buf_clear(ln);
  88. return;
  89. }
  90. i = gh_buf_strchr(ln, '\n', i);
  91. if (i >= 0)
  92. gh_buf_truncate(ln, i + 1);
  93. }
  94. // Check to see if a block ends with a blank line, descending
  95. // if needed into lists and sublists.
  96. static bool ends_with_blank_line(block* block)
  97. {
  98. if (block->last_line_blank) {
  99. return true;
  100. }
  101. if ((block->tag == list || block->tag == list_item) && block->last_child) {
  102. return ends_with_blank_line(block->last_child);
  103. } else {
  104. return false;
  105. }
  106. }
  107. // Break out of all containing lists
  108. static int break_out_of_lists(block ** bptr, int line_number)
  109. {
  110. block * container = *bptr;
  111. block * b = container->top;
  112. // find first containing list:
  113. while (b && b->tag != list) {
  114. b = b->last_child;
  115. }
  116. if (b) {
  117. while (container && container != b) {
  118. finalize(container, line_number);
  119. container = container->parent;
  120. }
  121. finalize(b, line_number);
  122. *bptr = b->parent;
  123. }
  124. return 0;
  125. }
  126. static void finalize(block* b, int line_number)
  127. {
  128. int firstlinelen;
  129. int pos;
  130. block* item;
  131. block* subitem;
  132. if (!b->open)
  133. return; // don't do anything if the block is already closed
  134. b->open = false;
  135. if (line_number > b->start_line) {
  136. b->end_line = line_number - 1;
  137. } else {
  138. b->end_line = line_number;
  139. }
  140. switch (b->tag) {
  141. case paragraph:
  142. pos = 0;
  143. while (gh_buf_at(&b->string_content, b->string_pos) == '[' &&
  144. (pos = parse_reference(&b->string_content, b->string_pos,
  145. b->top->attributes.refmap))) {
  146. b->string_pos = pos;
  147. }
  148. if (is_blank(&b->string_content, b->string_pos)) {
  149. b->tag = reference_def;
  150. }
  151. break;
  152. case indented_code:
  153. remove_trailing_blank_lines(&b->string_content);
  154. gh_buf_putc(&b->string_content, '\n');
  155. break;
  156. case fenced_code:
  157. // first line of contents becomes info
  158. firstlinelen = gh_buf_strchr(&b->string_content, '\n', b->string_pos);
  159. gh_buf_set(
  160. &b->attributes.fenced_code_data.info,
  161. b->string_content.ptr + b->string_pos,
  162. firstlinelen
  163. );
  164. b->string_pos = firstlinelen + 1;
  165. gh_buf_trim(&b->attributes.fenced_code_data.info);
  166. unescape_buffer(&b->attributes.fenced_code_data.info);
  167. break;
  168. case list: // determine tight/loose status
  169. b->attributes.list_data.tight = true; // tight by default
  170. item = b->children;
  171. while (item) {
  172. // check for non-final non-empty list item ending with blank line:
  173. if (item->last_line_blank && item->next) {
  174. b->attributes.list_data.tight = false;
  175. break;
  176. }
  177. // recurse into children of list item, to see if there are
  178. // spaces between them:
  179. subitem = item->children;
  180. while (subitem) {
  181. if (ends_with_blank_line(subitem) &&
  182. (item->next || subitem->next)) {
  183. b->attributes.list_data.tight = false;
  184. break;
  185. }
  186. subitem = subitem->next;
  187. }
  188. if (!(b->attributes.list_data.tight)) {
  189. break;
  190. }
  191. item = item->next;
  192. }
  193. break;
  194. default:
  195. break;
  196. }
  197. }
  198. // Add a block as child of another. Return pointer to child.
  199. extern block* add_child(block* parent,
  200. int block_type, int start_line, int start_column)
  201. {
  202. assert(parent);
  203. // if 'parent' isn't the kind of block that can accept this child,
  204. // then back up til we hit a block that can.
  205. while (!can_contain(parent->tag, block_type)) {
  206. finalize(parent, start_line);
  207. parent = parent->parent;
  208. }
  209. block* child = make_block(block_type, start_line, start_column);
  210. child->parent = parent;
  211. child->top = parent->top;
  212. if (parent->last_child) {
  213. parent->last_child->next = child;
  214. child->prev = parent->last_child;
  215. } else {
  216. parent->children = child;
  217. child->prev = NULL;
  218. }
  219. parent->last_child = child;
  220. return child;
  221. }
  222. // Free a block list and any children.
  223. extern void free_blocks(block* e)
  224. {
  225. block * next;
  226. while (e != NULL) {
  227. next = e->next;
  228. free_inlines(e->inline_content);
  229. gh_buf_free(&e->string_content);
  230. if (e->tag == fenced_code) {
  231. gh_buf_free(&e->attributes.fenced_code_data.info);
  232. } else if (e->tag == document) {
  233. free_reference_map(e->attributes.refmap);
  234. }
  235. free_blocks(e->children);
  236. free(e);
  237. e = next;
  238. }
  239. }
  240. // Walk through block and all children, recursively, parsing
  241. // string content into inline content where appropriate.
  242. void process_inlines(block* cur, reference** refmap)
  243. {
  244. switch (cur->tag) {
  245. case paragraph:
  246. case atx_header:
  247. case setext_header:
  248. cur->inline_content = parse_inlines(&cur->string_content, cur->string_pos, refmap);
  249. // MEM
  250. // gh_buf_free(&cur->string_content);
  251. break;
  252. default:
  253. break;
  254. }
  255. block *child = cur->children;
  256. while (child != NULL) {
  257. process_inlines(child, refmap);
  258. child = child->next;
  259. }
  260. }
  261. // Attempts to parse a list item marker (bullet or enumerated).
  262. // On success, returns length of the marker, and populates
  263. // data with the details. On failure, returns 0.
  264. static int parse_list_marker(gh_buf *ln, int pos,
  265. struct ListData ** dataptr)
  266. {
  267. char c;
  268. int startpos;
  269. struct ListData * data;
  270. startpos = pos;
  271. c = gh_buf_at(ln, pos);
  272. if ((c == '*' || c == '-' || c == '+') && !scan_hrule(ln, pos)) {
  273. pos++;
  274. if (!isspace(gh_buf_at(ln, pos))) {
  275. return 0;
  276. }
  277. data = malloc(sizeof(struct ListData));
  278. data->marker_offset = 0; // will be adjusted later
  279. data->list_type = bullet;
  280. data->bullet_char = c;
  281. data->start = 1;
  282. data->delimiter = period;
  283. data->tight = false;
  284. } else if (isdigit(c)) {
  285. int start = 0;
  286. do {
  287. start = (10 * start) + (gh_buf_at(ln, pos) - '0');
  288. pos++;
  289. } while (isdigit(gh_buf_at(ln, pos)));
  290. c = gh_buf_at(ln, pos);
  291. if (c == '.' || c == ')') {
  292. pos++;
  293. if (!isspace(gh_buf_at(ln, pos))) {
  294. return 0;
  295. }
  296. data = malloc(sizeof(struct ListData));
  297. data->marker_offset = 0; // will be adjusted later
  298. data->list_type = ordered;
  299. data->bullet_char = 0;
  300. data->start = start;
  301. data->delimiter = (c == '.' ? period : parens);
  302. data->tight = false;
  303. } else {
  304. return 0;
  305. }
  306. } else {
  307. return 0;
  308. }
  309. *dataptr = data;
  310. return (pos - startpos);
  311. }
  312. // Return 1 if list item belongs in list, else 0.
  313. static int lists_match(struct ListData list_data,
  314. struct ListData item_data)
  315. {
  316. return (list_data.list_type == item_data.list_type &&
  317. list_data.delimiter == item_data.delimiter &&
  318. // list_data.marker_offset == item_data.marker_offset &&
  319. list_data.bullet_char == item_data.bullet_char);
  320. }
  321. static void expand_tabs(gh_buf *ob, const unsigned char *line, size_t size)
  322. {
  323. size_t i = 0, tab = 0;
  324. while (i < size) {
  325. size_t org = i;
  326. while (i < size && line[i] != '\t') {
  327. i++; tab++;
  328. }
  329. if (i > org)
  330. gh_buf_put(ob, line + org, i - org);
  331. if (i >= size)
  332. break;
  333. do {
  334. gh_buf_putc(ob, ' '); tab++;
  335. } while (tab % 4);
  336. i++;
  337. }
  338. }
  339. static block *finalize_parsing(block *document, int linenum)
  340. {
  341. while (document != document->top) {
  342. finalize(document, linenum);
  343. document = document->parent;
  344. }
  345. finalize(document, linenum);
  346. process_inlines(document, document->attributes.refmap);
  347. return document;
  348. }
  349. extern block *stmd_parse_file(FILE *f)
  350. {
  351. gh_buf line = GH_BUF_INIT;
  352. unsigned char buffer[4096];
  353. int linenum = 1;
  354. block *document = make_document();
  355. while (fgets((char *)buffer, sizeof(buffer), f)) {
  356. expand_tabs(&line, buffer, strlen(buffer));
  357. incorporate_line(&line, linenum, &document);
  358. gh_buf_clear(&line);
  359. linenum++;
  360. }
  361. gh_buf_free(&line);
  362. return finalize_document(document, linenum);
  363. }
  364. extern block *stmd_parse_document(const unsigned char *buffer, size_t len)
  365. {
  366. gh_buf line = GH_BUF_INIT;
  367. int linenum = 1;
  368. const unsigned char *end = buffer + len;
  369. block *document = make_document();
  370. while (buffer < end) {
  371. const char *eol = memchr(buffer, '\n', end - buffer);
  372. if (!eol) {
  373. expand_tabs(&line, buffer, end - buffer);
  374. buffer = end;
  375. } else {
  376. expand_tabs(&line, buffer, (eol - buffer) + 1);
  377. buffer += (eol - buffer) + 1;
  378. }
  379. incorporate_line(&line, linenum, &document);
  380. gh_buf_clear(&line);
  381. linenum++;
  382. }
  383. gh_buf_free(&line);
  384. return finalize_document(document, linenum);
  385. }
  386. // Process one line at a time, modifying a block.
  387. // Returns 0 if successful. curptr is changed to point to
  388. // the currently open block.
  389. extern void incorporate_line(gh_buf *ln, int line_number, block** curptr)
  390. {
  391. block* last_matched_container;
  392. int offset = 0;
  393. int matched = 0;
  394. int lev = 0;
  395. int i;
  396. struct ListData * data = NULL;
  397. bool all_matched = true;
  398. block* container;
  399. block* cur = *curptr;
  400. bool blank = false;
  401. int first_nonspace;
  402. int indent;
  403. // container starts at the document root.
  404. container = cur->top;
  405. // for each containing block, try to parse the associated line start.
  406. // bail out on failure: container will point to the last matching block.
  407. while (container->last_child && container->last_child->open) {
  408. container = container->last_child;
  409. first_nonspace = offset;
  410. while (gh_buf_at(ln, first_nonspace) == ' ') {
  411. first_nonspace++;
  412. }
  413. indent = first_nonspace - offset;
  414. blank = gh_buf_at(ln, first_nonspace) == '\n';
  415. if (container->tag == block_quote) {
  416. matched = indent <= 3 && gh_buf_at(ln, first_nonspace) == '>';
  417. if (matched) {
  418. offset = first_nonspace + 1;
  419. if (gh_buf_at(ln, offset) == ' ') {
  420. offset++;
  421. }
  422. } else {
  423. all_matched = false;
  424. }
  425. } else if (container->tag == list_item) {
  426. if (indent >= container->attributes.list_data.marker_offset +
  427. container->attributes.list_data.padding) {
  428. offset += container->attributes.list_data.marker_offset +
  429. container->attributes.list_data.padding;
  430. } else if (blank) {
  431. offset = first_nonspace;
  432. } else {
  433. all_matched = false;
  434. }
  435. } else if (container->tag == indented_code) {
  436. if (indent >= CODE_INDENT) {
  437. offset += CODE_INDENT;
  438. } else if (blank) {
  439. offset = first_nonspace;
  440. } else {
  441. all_matched = false;
  442. }
  443. } else if (container->tag == atx_header ||
  444. container->tag == setext_header) {
  445. // a header can never contain more than one line
  446. all_matched = false;
  447. } else if (container->tag == fenced_code) {
  448. // skip optional spaces of fence offset
  449. i = container->attributes.fenced_code_data.fence_offset;
  450. while (i > 0 && gh_buf_at(ln, offset) == ' ') {
  451. offset++;
  452. i--;
  453. }
  454. } else if (container->tag == html_block) {
  455. if (blank) {
  456. all_matched = false;
  457. }
  458. } else if (container->tag == paragraph) {
  459. if (blank) {
  460. container->last_line_blank = true;
  461. all_matched = false;
  462. }
  463. }
  464. if (!all_matched) {
  465. container = container->parent; // back up to last matching block
  466. break;
  467. }
  468. }
  469. last_matched_container = container;
  470. // check to see if we've hit 2nd blank line, break out of list:
  471. if (blank && container->last_line_blank) {
  472. break_out_of_lists(&container, line_number);
  473. }
  474. // unless last matched container is code block, try new container starts:
  475. while (container->tag != fenced_code && container->tag != indented_code &&
  476. container->tag != html_block) {
  477. first_nonspace = offset;
  478. while (gh_buf_at(ln, first_nonspace) == ' ') {
  479. first_nonspace++;
  480. }
  481. indent = first_nonspace - offset;
  482. blank = gh_buf_at(ln, first_nonspace) == '\n';
  483. if (indent >= CODE_INDENT) {
  484. if (cur->tag != paragraph && !blank) {
  485. offset += CODE_INDENT;
  486. container = add_child(container, indented_code, line_number, offset + 1);
  487. } else { // indent > 4 in lazy line
  488. break;
  489. }
  490. } else if (gh_buf_at(ln, first_nonspace) == '>') {
  491. offset = first_nonspace + 1;
  492. // optional following character
  493. if (gh_buf_at(ln, offset) == ' ') {
  494. offset++;
  495. }
  496. container = add_child(container, block_quote, line_number, offset + 1);
  497. } else if ((matched = scan_atx_header_start(ln, first_nonspace))) {
  498. offset = first_nonspace + matched;
  499. container = add_child(container, atx_header, line_number, offset + 1);
  500. int hashpos = gh_buf_strchr(ln, '#', first_nonspace);
  501. assert(hashpos >= 0);
  502. int level = 0;
  503. while (gh_buf_at(ln, hashpos) == '#') {
  504. level++;
  505. hashpos++;
  506. }
  507. container->attributes.header_level = level;
  508. } else if ((matched = scan_open_code_fence(ln, first_nonspace))) {
  509. container = add_child(container, fenced_code, line_number,
  510. first_nonspace + 1);
  511. container->attributes.fenced_code_data.fence_char = gh_buf_at(ln,
  512. first_nonspace);
  513. container->attributes.fenced_code_data.fence_length = matched;
  514. container->attributes.fenced_code_data.fence_offset =
  515. first_nonspace - offset;
  516. offset = first_nonspace + matched;
  517. } else if ((matched = scan_html_block_tag(ln, first_nonspace))) {
  518. container = add_child(container, html_block, line_number,
  519. first_nonspace + 1);
  520. // note, we don't adjust offset because the tag is part of the text
  521. } else if (container->tag == paragraph &&
  522. (lev = scan_setext_header_line(ln, first_nonspace)) &&
  523. // check that there is only one line in the paragraph:
  524. gh_buf_strrchr(&container->string_content, '\n',
  525. gh_buf_len(&container->string_content) - 2) < 0) {
  526. container->tag = setext_header;
  527. container->attributes.header_level = lev;
  528. offset = gh_buf_len(ln) - 1;
  529. } else if (!(container->tag == paragraph && !all_matched) &&
  530. (matched = scan_hrule(ln, first_nonspace))) {
  531. // it's only now that we know the line is not part of a setext header:
  532. container = add_child(container, hrule, line_number, first_nonspace + 1);
  533. finalize(container, line_number);
  534. container = container->parent;
  535. offset = gh_buf_len(ln) - 1;
  536. } else if ((matched = parse_list_marker(ln, first_nonspace, &data))) {
  537. // compute padding:
  538. offset = first_nonspace + matched;
  539. i = 0;
  540. while (i <= 5 && gh_buf_at(ln, offset + i) == ' ') {
  541. i++;
  542. }
  543. // i = number of spaces after marker, up to 5
  544. if (i >= 5 || i < 1 || gh_buf_at(ln, offset) == '\n') {
  545. data->padding = matched + 1;
  546. if (i > 0) {
  547. offset += 1;
  548. }
  549. } else {
  550. data->padding = matched + i;
  551. offset += i;
  552. }
  553. // check container; if it's a list, see if this list item
  554. // can continue the list; otherwise, create a list container.
  555. data->marker_offset = indent;
  556. if (container->tag != list ||
  557. !lists_match(container->attributes.list_data, *data)) {
  558. container = add_child(container, list, line_number,
  559. first_nonspace + 1);
  560. container->attributes.list_data = *data;
  561. }
  562. // add the list item
  563. container = add_child(container, list_item, line_number,
  564. first_nonspace + 1);
  565. container->attributes.list_data = *data;
  566. free(data);
  567. } else {
  568. break;
  569. }
  570. if (accepts_lines(container->tag)) {
  571. // if it's a line container, it can't contain other containers
  572. break;
  573. }
  574. }
  575. // what remains at offset is a text line. add the text to the
  576. // appropriate container.
  577. first_nonspace = offset;
  578. while (gh_buf_at(ln, first_nonspace) == ' ') {
  579. first_nonspace++;
  580. }
  581. indent = first_nonspace - offset;
  582. blank = gh_buf_at(ln, first_nonspace) == '\n';
  583. // block quote lines are never blank as they start with >
  584. // and we don't count blanks in fenced code for purposes of tight/loose
  585. // lists or breaking out of lists. we also don't set last_line_blank
  586. // on an empty list item.
  587. container->last_line_blank = (blank &&
  588. container->tag != block_quote &&
  589. container->tag != fenced_code &&
  590. !(container->tag == list_item &&
  591. container->children == NULL &&
  592. container->start_line == line_number));
  593. block *cont = container;
  594. while (cont->parent) {
  595. cont->parent->last_line_blank = false;
  596. cont = cont->parent;
  597. }
  598. if (cur != last_matched_container &&
  599. container == last_matched_container &&
  600. !blank &&
  601. cur->tag == paragraph &&
  602. gh_buf_len(&cur->string_content) > 0) {
  603. add_line(cur, ln, offset);
  604. } else { // not a lazy continuation
  605. // finalize any blocks that were not matched and set cur to container:
  606. while (cur != last_matched_container) {
  607. finalize(cur, line_number);
  608. cur = cur->parent;
  609. assert(cur != NULL);
  610. }
  611. if (container->tag == indented_code) {
  612. add_line(container, ln, offset);
  613. } else if (container->tag == fenced_code) {
  614. matched = (indent <= 3
  615. && gh_buf_at(ln, first_nonspace) == container->attributes.fenced_code_data.fence_char)
  616. && scan_close_code_fence(ln, first_nonspace,
  617. container->attributes.fenced_code_data.fence_length);
  618. if (matched) {
  619. // if closing fence, don't add line to container; instead, close it:
  620. finalize(container, line_number);
  621. container = container->parent; // back up to parent
  622. } else {
  623. add_line(container, ln, offset);
  624. }
  625. } else if (container->tag == html_block) {
  626. add_line(container, ln, offset);
  627. } else if (blank) {
  628. // ??? do nothing
  629. } else if (container->tag == atx_header) {
  630. // chop off trailing ###s...use a scanner?
  631. gh_buf_trim(ln);
  632. int p = gh_buf_len(ln) - 1;
  633. // if string ends in #s, remove these:
  634. while (gh_buf_at(ln, p) == '#') {
  635. p--;
  636. }
  637. if (gh_buf_at(ln, p) == '\\') {
  638. // the last # was escaped, so we include it.
  639. p++;
  640. }
  641. gh_buf_truncate(ln, p + 1);
  642. add_line(container, ln, first_nonspace);
  643. finalize(container, line_number);
  644. container = container->parent;
  645. } else if (accepts_lines(container->tag)) {
  646. add_line(container, ln, first_nonspace);
  647. } else if (container->tag != hrule && container->tag != setext_header) {
  648. // create paragraph container for line
  649. container = add_child(container, paragraph, line_number, first_nonspace + 1);
  650. add_line(container, ln, first_nonspace);
  651. } else {
  652. assert(false);
  653. }
  654. *curptr = container;
  655. }
  656. }