aboutsummaryrefslogtreecommitdiff
path: root/src/blocks.c
blob: 42f20dbebe13a69ea576a85587121545b911311f (plain)
  1. #include <stdlib.h>
  2. #include <assert.h>
  3. #include <stdio.h>
  4. #include <stdbool.h>
  5. #include <ctype.h>
  6. #include "stmd.h"
  7. #include "scanners.h"
  8. #include "uthash.h"
  9. static void incorporate_line(gh_buf *ln, int line_number, block** curptr);
  10. static void finalize(block* b, int line_number);
  11. static block* make_block(int tag, int start_line, int start_column)
  12. {
  13. block* e;
  14. e = (block*) malloc(sizeof(block));
  15. e->tag = tag;
  16. e->open = true;
  17. e->last_line_blank = false;
  18. e->start_line = start_line;
  19. e->start_column = start_column;
  20. e->end_line = start_line;
  21. e->children = NULL;
  22. e->last_child = NULL;
  23. e->parent = NULL;
  24. e->top = NULL;
  25. e->attributes.refmap = NULL;
  26. gh_buf_init(&e->string_content, 32);
  27. e->string_pos = 0;
  28. e->inline_content = NULL;
  29. e->next = NULL;
  30. e->prev = NULL;
  31. return e;
  32. }
  33. // Create a root document block.
  34. extern block* make_document()
  35. {
  36. block * e = make_block(document, 1, 1);
  37. reference * map = NULL;
  38. reference ** refmap;
  39. refmap = (reference**) malloc(sizeof(reference*));
  40. *refmap = map;
  41. e->attributes.refmap = refmap;
  42. e->top = e;
  43. return e;
  44. }
  45. // Returns true if line has only space characters, else false.
  46. bool is_blank(gh_buf *s, int offset)
  47. {
  48. while (offset < s->size) {
  49. switch (s->ptr[offset]) {
  50. case '\n':
  51. return true;
  52. case ' ':
  53. offset++;
  54. default:
  55. return false;
  56. }
  57. }
  58. return true;
  59. }
  60. static inline bool can_contain(int parent_type, int child_type)
  61. {
  62. return ( parent_type == document ||
  63. parent_type == block_quote ||
  64. parent_type == list_item ||
  65. (parent_type == list && child_type == list_item) );
  66. }
  67. static inline bool accepts_lines(int block_type)
  68. {
  69. return (block_type == paragraph ||
  70. block_type == atx_header ||
  71. block_type == indented_code ||
  72. block_type == fenced_code);
  73. }
  74. static void add_line(block* block, gh_buf *ln, int offset)
  75. {
  76. assert(block->open);
  77. gh_buf_put(&block->string_content, ln->ptr + offset, ln->size - offset);
  78. }
  79. static void remove_trailing_blank_lines(gh_buf *ln)
  80. {
  81. int i;
  82. for (i = ln->size - 1; i >= 0; --i) {
  83. char c = ln->ptr[i];
  84. if (c != ' ' && c != '\t' && c != '\r' && c != '\n')
  85. break;
  86. }
  87. if (i < 0) {
  88. gh_buf_clear(ln);
  89. return;
  90. }
  91. i = gh_buf_strchr(ln, '\n', i);
  92. if (i >= 0)
  93. gh_buf_truncate(ln, i + 1);
  94. }
  95. // Check to see if a block ends with a blank line, descending
  96. // if needed into lists and sublists.
  97. static bool ends_with_blank_line(block* block)
  98. {
  99. if (block->last_line_blank) {
  100. return true;
  101. }
  102. if ((block->tag == list || block->tag == list_item) && block->last_child) {
  103. return ends_with_blank_line(block->last_child);
  104. } else {
  105. return false;
  106. }
  107. }
  108. // Break out of all containing lists
  109. static int break_out_of_lists(block ** bptr, int line_number)
  110. {
  111. block * container = *bptr;
  112. block * b = container->top;
  113. // find first containing list:
  114. while (b && b->tag != list) {
  115. b = b->last_child;
  116. }
  117. if (b) {
  118. while (container && container != b) {
  119. finalize(container, line_number);
  120. container = container->parent;
  121. }
  122. finalize(b, line_number);
  123. *bptr = b->parent;
  124. }
  125. return 0;
  126. }
  127. static void finalize(block* b, int line_number)
  128. {
  129. int firstlinelen;
  130. int pos;
  131. block* item;
  132. block* subitem;
  133. if (!b->open)
  134. return; // don't do anything if the block is already closed
  135. b->open = false;
  136. if (line_number > b->start_line) {
  137. b->end_line = line_number - 1;
  138. } else {
  139. b->end_line = line_number;
  140. }
  141. switch (b->tag) {
  142. case paragraph:
  143. pos = 0;
  144. while (gh_buf_at(&b->string_content, b->string_pos) == '[' &&
  145. (pos = parse_reference(&b->string_content, b->string_pos,
  146. b->top->attributes.refmap))) {
  147. b->string_pos = pos;
  148. }
  149. if (is_blank(&b->string_content, b->string_pos)) {
  150. b->tag = reference_def;
  151. }
  152. break;
  153. case indented_code:
  154. remove_trailing_blank_lines(&b->string_content);
  155. gh_buf_putc(&b->string_content, '\n');
  156. break;
  157. case fenced_code:
  158. // first line of contents becomes info
  159. firstlinelen = gh_buf_strchr(&b->string_content, '\n', b->string_pos);
  160. gh_buf_set(
  161. &b->attributes.fenced_code_data.info,
  162. b->string_content.ptr + b->string_pos,
  163. firstlinelen
  164. );
  165. b->string_pos = firstlinelen + 1;
  166. gh_buf_trim(&b->attributes.fenced_code_data.info);
  167. unescape_buffer(&b->attributes.fenced_code_data.info);
  168. break;
  169. case list: // determine tight/loose status
  170. b->attributes.list_data.tight = true; // tight by default
  171. item = b->children;
  172. while (item) {
  173. // check for non-final non-empty list item ending with blank line:
  174. if (item->last_line_blank && item->next) {
  175. b->attributes.list_data.tight = false;
  176. break;
  177. }
  178. // recurse into children of list item, to see if there are
  179. // spaces between them:
  180. subitem = item->children;
  181. while (subitem) {
  182. if (ends_with_blank_line(subitem) &&
  183. (item->next || subitem->next)) {
  184. b->attributes.list_data.tight = false;
  185. break;
  186. }
  187. subitem = subitem->next;
  188. }
  189. if (!(b->attributes.list_data.tight)) {
  190. break;
  191. }
  192. item = item->next;
  193. }
  194. break;
  195. default:
  196. break;
  197. }
  198. }
  199. // Add a block as child of another. Return pointer to child.
  200. extern block* add_child(block* parent,
  201. int block_type, int start_line, int start_column)
  202. {
  203. assert(parent);
  204. // if 'parent' isn't the kind of block that can accept this child,
  205. // then back up til we hit a block that can.
  206. while (!can_contain(parent->tag, block_type)) {
  207. finalize(parent, start_line);
  208. parent = parent->parent;
  209. }
  210. block* child = make_block(block_type, start_line, start_column);
  211. child->parent = parent;
  212. child->top = parent->top;
  213. if (parent->last_child) {
  214. parent->last_child->next = child;
  215. child->prev = parent->last_child;
  216. } else {
  217. parent->children = child;
  218. child->prev = NULL;
  219. }
  220. parent->last_child = child;
  221. return child;
  222. }
  223. // Free a block list and any children.
  224. extern void free_blocks(block* e)
  225. {
  226. block * next;
  227. while (e != NULL) {
  228. next = e->next;
  229. free_inlines(e->inline_content);
  230. gh_buf_free(&e->string_content);
  231. if (e->tag == fenced_code) {
  232. gh_buf_free(&e->attributes.fenced_code_data.info);
  233. } else if (e->tag == document) {
  234. free_reference_map(e->attributes.refmap);
  235. }
  236. free_blocks(e->children);
  237. free(e);
  238. e = next;
  239. }
  240. }
  241. // Walk through block and all children, recursively, parsing
  242. // string content into inline content where appropriate.
  243. void process_inlines(block* cur, reference** refmap)
  244. {
  245. switch (cur->tag) {
  246. case paragraph:
  247. case atx_header:
  248. case setext_header:
  249. cur->inline_content = parse_inlines(&cur->string_content, cur->string_pos, refmap);
  250. // MEM
  251. // gh_buf_free(&cur->string_content);
  252. break;
  253. default:
  254. break;
  255. }
  256. block *child = cur->children;
  257. while (child != NULL) {
  258. process_inlines(child, refmap);
  259. child = child->next;
  260. }
  261. }
  262. // Attempts to parse a list item marker (bullet or enumerated).
  263. // On success, returns length of the marker, and populates
  264. // data with the details. On failure, returns 0.
  265. static int parse_list_marker(gh_buf *ln, int pos,
  266. struct ListData ** dataptr)
  267. {
  268. char c;
  269. int startpos;
  270. struct ListData * data;
  271. startpos = pos;
  272. c = gh_buf_at(ln, pos);
  273. if ((c == '*' || c == '-' || c == '+') && !scan_hrule(ln, pos)) {
  274. pos++;
  275. if (!isspace(gh_buf_at(ln, pos))) {
  276. return 0;
  277. }
  278. data = malloc(sizeof(struct ListData));
  279. data->marker_offset = 0; // will be adjusted later
  280. data->list_type = bullet;
  281. data->bullet_char = c;
  282. data->start = 1;
  283. data->delimiter = period;
  284. data->tight = false;
  285. } else if (isdigit(c)) {
  286. int start = 0;
  287. do {
  288. start = (10 * start) + (gh_buf_at(ln, pos) - '0');
  289. pos++;
  290. } while (isdigit(gh_buf_at(ln, pos)));
  291. c = gh_buf_at(ln, pos);
  292. if (c == '.' || c == ')') {
  293. pos++;
  294. if (!isspace(gh_buf_at(ln, pos))) {
  295. return 0;
  296. }
  297. data = malloc(sizeof(struct ListData));
  298. data->marker_offset = 0; // will be adjusted later
  299. data->list_type = ordered;
  300. data->bullet_char = 0;
  301. data->start = start;
  302. data->delimiter = (c == '.' ? period : parens);
  303. data->tight = false;
  304. } else {
  305. return 0;
  306. }
  307. } else {
  308. return 0;
  309. }
  310. *dataptr = data;
  311. return (pos - startpos);
  312. }
  313. // Return 1 if list item belongs in list, else 0.
  314. static int lists_match(struct ListData list_data,
  315. struct ListData item_data)
  316. {
  317. return (list_data.list_type == item_data.list_type &&
  318. list_data.delimiter == item_data.delimiter &&
  319. // list_data.marker_offset == item_data.marker_offset &&
  320. list_data.bullet_char == item_data.bullet_char);
  321. }
  322. static void expand_tabs(gh_buf *ob, const unsigned char *line, size_t size)
  323. {
  324. size_t i = 0, tab = 0;
  325. while (i < size) {
  326. size_t org = i;
  327. while (i < size && line[i] != '\t') {
  328. i++; tab++;
  329. }
  330. if (i > org)
  331. gh_buf_put(ob, line + org, i - org);
  332. if (i >= size)
  333. break;
  334. do {
  335. gh_buf_putc(ob, ' '); tab++;
  336. } while (tab % 4);
  337. i++;
  338. }
  339. }
  340. static block *finalize_document(block *document, int linenum)
  341. {
  342. while (document != document->top) {
  343. finalize(document, linenum);
  344. document = document->parent;
  345. }
  346. finalize(document, linenum);
  347. process_inlines(document, document->attributes.refmap);
  348. return document;
  349. }
  350. extern block *stmd_parse_file(FILE *f)
  351. {
  352. gh_buf line = GH_BUF_INIT;
  353. unsigned char buffer[4096];
  354. int linenum = 1;
  355. block *document = make_document();
  356. while (fgets((char *)buffer, sizeof(buffer), f)) {
  357. expand_tabs(&line, buffer, strlen((char *)buffer));
  358. incorporate_line(&line, linenum, &document);
  359. gh_buf_clear(&line);
  360. linenum++;
  361. }
  362. gh_buf_free(&line);
  363. return finalize_document(document, linenum);
  364. }
  365. extern block *stmd_parse_document(const unsigned char *buffer, size_t len)
  366. {
  367. gh_buf line = GH_BUF_INIT;
  368. int linenum = 1;
  369. const unsigned char *end = buffer + len;
  370. block *document = make_document();
  371. while (buffer < end) {
  372. const unsigned char *eol = memchr(buffer, '\n', end - buffer);
  373. if (!eol) {
  374. expand_tabs(&line, buffer, end - buffer);
  375. buffer = end;
  376. } else {
  377. expand_tabs(&line, buffer, (eol - buffer) + 1);
  378. buffer += (eol - buffer) + 1;
  379. }
  380. incorporate_line(&line, linenum, &document);
  381. gh_buf_clear(&line);
  382. linenum++;
  383. }
  384. gh_buf_free(&line);
  385. return finalize_document(document, linenum);
  386. }
  387. // Process one line at a time, modifying a block.
  388. static void incorporate_line(gh_buf *ln, int line_number, block** curptr)
  389. {
  390. block* last_matched_container;
  391. int offset = 0;
  392. int matched = 0;
  393. int lev = 0;
  394. int i;
  395. struct ListData * data = NULL;
  396. bool all_matched = true;
  397. block* container;
  398. block* cur = *curptr;
  399. bool blank = false;
  400. int first_nonspace;
  401. int indent;
  402. // container starts at the document root.
  403. container = cur->top;
  404. // for each containing block, try to parse the associated line start.
  405. // bail out on failure: container will point to the last matching block.
  406. while (container->last_child && container->last_child->open) {
  407. container = container->last_child;
  408. first_nonspace = offset;
  409. while (gh_buf_at(ln, first_nonspace) == ' ') {
  410. first_nonspace++;
  411. }
  412. indent = first_nonspace - offset;
  413. blank = gh_buf_at(ln, first_nonspace) == '\n';
  414. if (container->tag == block_quote) {
  415. matched = indent <= 3 && gh_buf_at(ln, first_nonspace) == '>';
  416. if (matched) {
  417. offset = first_nonspace + 1;
  418. if (gh_buf_at(ln, offset) == ' ') {
  419. offset++;
  420. }
  421. } else {
  422. all_matched = false;
  423. }
  424. } else if (container->tag == list_item) {
  425. if (indent >= container->attributes.list_data.marker_offset +
  426. container->attributes.list_data.padding) {
  427. offset += container->attributes.list_data.marker_offset +
  428. container->attributes.list_data.padding;
  429. } else if (blank) {
  430. offset = first_nonspace;
  431. } else {
  432. all_matched = false;
  433. }
  434. } else if (container->tag == indented_code) {
  435. if (indent >= CODE_INDENT) {
  436. offset += CODE_INDENT;
  437. } else if (blank) {
  438. offset = first_nonspace;
  439. } else {
  440. all_matched = false;
  441. }
  442. } else if (container->tag == atx_header ||
  443. container->tag == setext_header) {
  444. // a header can never contain more than one line
  445. all_matched = false;
  446. } else if (container->tag == fenced_code) {
  447. // skip optional spaces of fence offset
  448. i = container->attributes.fenced_code_data.fence_offset;
  449. while (i > 0 && gh_buf_at(ln, offset) == ' ') {
  450. offset++;
  451. i--;
  452. }
  453. } else if (container->tag == html_block) {
  454. if (blank) {
  455. all_matched = false;
  456. }
  457. } else if (container->tag == paragraph) {
  458. if (blank) {
  459. container->last_line_blank = true;
  460. all_matched = false;
  461. }
  462. }
  463. if (!all_matched) {
  464. container = container->parent; // back up to last matching block
  465. break;
  466. }
  467. }
  468. last_matched_container = container;
  469. // check to see if we've hit 2nd blank line, break out of list:
  470. if (blank && container->last_line_blank) {
  471. break_out_of_lists(&container, line_number);
  472. }
  473. // unless last matched container is code block, try new container starts:
  474. while (container->tag != fenced_code && container->tag != indented_code &&
  475. container->tag != html_block) {
  476. first_nonspace = offset;
  477. while (gh_buf_at(ln, first_nonspace) == ' ') {
  478. first_nonspace++;
  479. }
  480. indent = first_nonspace - offset;
  481. blank = gh_buf_at(ln, first_nonspace) == '\n';
  482. if (indent >= CODE_INDENT) {
  483. if (cur->tag != paragraph && !blank) {
  484. offset += CODE_INDENT;
  485. container = add_child(container, indented_code, line_number, offset + 1);
  486. } else { // indent > 4 in lazy line
  487. break;
  488. }
  489. } else if (gh_buf_at(ln, first_nonspace) == '>') {
  490. offset = first_nonspace + 1;
  491. // optional following character
  492. if (gh_buf_at(ln, offset) == ' ') {
  493. offset++;
  494. }
  495. container = add_child(container, block_quote, line_number, offset + 1);
  496. } else if ((matched = scan_atx_header_start(ln, first_nonspace))) {
  497. offset = first_nonspace + matched;
  498. container = add_child(container, atx_header, line_number, offset + 1);
  499. int hashpos = gh_buf_strchr(ln, '#', first_nonspace);
  500. assert(hashpos >= 0);
  501. int level = 0;
  502. while (gh_buf_at(ln, hashpos) == '#') {
  503. level++;
  504. hashpos++;
  505. }
  506. container->attributes.header_level = level;
  507. } else if ((matched = scan_open_code_fence(ln, first_nonspace))) {
  508. container = add_child(container, fenced_code, line_number,
  509. first_nonspace + 1);
  510. container->attributes.fenced_code_data.fence_char = gh_buf_at(ln,
  511. first_nonspace);
  512. container->attributes.fenced_code_data.fence_length = matched;
  513. container->attributes.fenced_code_data.fence_offset =
  514. first_nonspace - offset;
  515. offset = first_nonspace + matched;
  516. } else if ((matched = scan_html_block_tag(ln, first_nonspace))) {
  517. container = add_child(container, html_block, line_number,
  518. first_nonspace + 1);
  519. // note, we don't adjust offset because the tag is part of the text
  520. } else if (container->tag == paragraph &&
  521. (lev = scan_setext_header_line(ln, first_nonspace)) &&
  522. // check that there is only one line in the paragraph:
  523. gh_buf_strrchr(&container->string_content, '\n',
  524. gh_buf_len(&container->string_content) - 2) < 0) {
  525. container->tag = setext_header;
  526. container->attributes.header_level = lev;
  527. offset = gh_buf_len(ln) - 1;
  528. } else if (!(container->tag == paragraph && !all_matched) &&
  529. (matched = scan_hrule(ln, first_nonspace))) {
  530. // it's only now that we know the line is not part of a setext header:
  531. container = add_child(container, hrule, line_number, first_nonspace + 1);
  532. finalize(container, line_number);
  533. container = container->parent;
  534. offset = gh_buf_len(ln) - 1;
  535. } else if ((matched = parse_list_marker(ln, first_nonspace, &data))) {
  536. // compute padding:
  537. offset = first_nonspace + matched;
  538. i = 0;
  539. while (i <= 5 && gh_buf_at(ln, offset + i) == ' ') {
  540. i++;
  541. }
  542. // i = number of spaces after marker, up to 5
  543. if (i >= 5 || i < 1 || gh_buf_at(ln, offset) == '\n') {
  544. data->padding = matched + 1;
  545. if (i > 0) {
  546. offset += 1;
  547. }
  548. } else {
  549. data->padding = matched + i;
  550. offset += i;
  551. }
  552. // check container; if it's a list, see if this list item
  553. // can continue the list; otherwise, create a list container.
  554. data->marker_offset = indent;
  555. if (container->tag != list ||
  556. !lists_match(container->attributes.list_data, *data)) {
  557. container = add_child(container, list, line_number,
  558. first_nonspace + 1);
  559. container->attributes.list_data = *data;
  560. }
  561. // add the list item
  562. container = add_child(container, list_item, line_number,
  563. first_nonspace + 1);
  564. container->attributes.list_data = *data;
  565. free(data);
  566. } else {
  567. break;
  568. }
  569. if (accepts_lines(container->tag)) {
  570. // if it's a line container, it can't contain other containers
  571. break;
  572. }
  573. }
  574. // what remains at offset is a text line. add the text to the
  575. // appropriate container.
  576. first_nonspace = offset;
  577. while (gh_buf_at(ln, first_nonspace) == ' ') {
  578. first_nonspace++;
  579. }
  580. indent = first_nonspace - offset;
  581. blank = gh_buf_at(ln, first_nonspace) == '\n';
  582. // block quote lines are never blank as they start with >
  583. // and we don't count blanks in fenced code for purposes of tight/loose
  584. // lists or breaking out of lists. we also don't set last_line_blank
  585. // on an empty list item.
  586. container->last_line_blank = (blank &&
  587. container->tag != block_quote &&
  588. container->tag != fenced_code &&
  589. !(container->tag == list_item &&
  590. container->children == NULL &&
  591. container->start_line == line_number));
  592. block *cont = container;
  593. while (cont->parent) {
  594. cont->parent->last_line_blank = false;
  595. cont = cont->parent;
  596. }
  597. if (cur != last_matched_container &&
  598. container == last_matched_container &&
  599. !blank &&
  600. cur->tag == paragraph &&
  601. gh_buf_len(&cur->string_content) > 0) {
  602. add_line(cur, ln, offset);
  603. } else { // not a lazy continuation
  604. // finalize any blocks that were not matched and set cur to container:
  605. while (cur != last_matched_container) {
  606. finalize(cur, line_number);
  607. cur = cur->parent;
  608. assert(cur != NULL);
  609. }
  610. if (container->tag == indented_code) {
  611. add_line(container, ln, offset);
  612. } else if (container->tag == fenced_code) {
  613. matched = (indent <= 3
  614. && gh_buf_at(ln, first_nonspace) == container->attributes.fenced_code_data.fence_char)
  615. && scan_close_code_fence(ln, first_nonspace,
  616. container->attributes.fenced_code_data.fence_length);
  617. if (matched) {
  618. // if closing fence, don't add line to container; instead, close it:
  619. finalize(container, line_number);
  620. container = container->parent; // back up to parent
  621. } else {
  622. add_line(container, ln, offset);
  623. }
  624. } else if (container->tag == html_block) {
  625. add_line(container, ln, offset);
  626. } else if (blank) {
  627. // ??? do nothing
  628. } else if (container->tag == atx_header) {
  629. // chop off trailing ###s...use a scanner?
  630. gh_buf_trim(ln);
  631. int p = gh_buf_len(ln) - 1;
  632. // if string ends in #s, remove these:
  633. while (gh_buf_at(ln, p) == '#') {
  634. p--;
  635. }
  636. if (gh_buf_at(ln, p) == '\\') {
  637. // the last # was escaped, so we include it.
  638. p++;
  639. }
  640. gh_buf_truncate(ln, p + 1);
  641. add_line(container, ln, first_nonspace);
  642. finalize(container, line_number);
  643. container = container->parent;
  644. } else if (accepts_lines(container->tag)) {
  645. add_line(container, ln, first_nonspace);
  646. } else if (container->tag != hrule && container->tag != setext_header) {
  647. // create paragraph container for line
  648. container = add_child(container, paragraph, line_number, first_nonspace + 1);
  649. add_line(container, ln, first_nonspace);
  650. } else {
  651. assert(false);
  652. }
  653. *curptr = container;
  654. }
  655. }