aboutsummaryrefslogtreecommitdiff
path: root/src/blocks.c
blob: 604b9333ce8409b4de389f35f131519e0500d07b (plain)
  1. #include <stdlib.h>
  2. #include <assert.h>
  3. #include <stdio.h>
  4. #include <stdbool.h>
  5. #include <ctype.h>
  6. #include "cmark.h"
  7. #include "utf8.h"
  8. #include "scanners.h"
  9. #include "inlines.h"
  10. #include "html/houdini.h"
  11. #include "buffer.h"
  12. #include "bench.h"
  13. #define peek_at(i, n) (i)->data[n]
  14. static void incorporate_line(strbuf *ln, int line_number, node_block** curptr);
  15. static void finalize(node_block* b, int line_number);
  16. static node_block* make_block(int tag, int start_line, int start_column)
  17. {
  18. node_block* e;
  19. e = calloc(1, sizeof(*e));
  20. if(e != NULL) {
  21. e->tag = tag;
  22. e->open = true;
  23. e->start_line = start_line;
  24. e->start_column = start_column;
  25. e->end_line = start_line;
  26. strbuf_init(&e->string_content, 32);
  27. }
  28. return e;
  29. }
  30. // Create a root document node_block.
  31. static node_block* make_document()
  32. {
  33. node_block *e = make_block(BLOCK_DOCUMENT, 1, 1);
  34. e->as.document.refmap = reference_map_new();
  35. e->top = e;
  36. return e;
  37. }
  38. // Returns true if line has only space characters, else false.
  39. static bool is_blank(strbuf *s, int offset)
  40. {
  41. while (offset < s->size) {
  42. switch (s->ptr[offset]) {
  43. case '\n':
  44. return true;
  45. case ' ':
  46. offset++;
  47. break;
  48. default:
  49. return false;
  50. }
  51. }
  52. return true;
  53. }
  54. static inline bool can_contain(int parent_type, int child_type)
  55. {
  56. return ( parent_type == BLOCK_DOCUMENT ||
  57. parent_type == BLOCK_BQUOTE ||
  58. parent_type == BLOCK_LIST_ITEM ||
  59. (parent_type == BLOCK_LIST && child_type == BLOCK_LIST_ITEM) );
  60. }
  61. static inline bool accepts_lines(int block_type)
  62. {
  63. return (block_type == BLOCK_PARAGRAPH ||
  64. block_type == BLOCK_ATX_HEADER ||
  65. block_type == BLOCK_INDENTED_CODE ||
  66. block_type == BLOCK_FENCED_CODE);
  67. }
  68. static void add_line(node_block* node_block, chunk *ch, int offset)
  69. {
  70. assert(node_block->open);
  71. strbuf_put(&node_block->string_content, ch->data + offset, ch->len - offset);
  72. }
  73. static void remove_trailing_blank_lines(strbuf *ln)
  74. {
  75. int i;
  76. for (i = ln->size - 1; i >= 0; --i) {
  77. unsigned char c = ln->ptr[i];
  78. if (c != ' ' && c != '\t' && c != '\r' && c != '\n')
  79. break;
  80. }
  81. if (i < 0) {
  82. strbuf_clear(ln);
  83. return;
  84. }
  85. i = strbuf_strchr(ln, '\n', i);
  86. if (i >= 0)
  87. strbuf_truncate(ln, i);
  88. }
  89. // Check to see if a node_block ends with a blank line, descending
  90. // if needed into lists and sublists.
  91. static bool ends_with_blank_line(node_block* node_block)
  92. {
  93. if (node_block->last_line_blank) {
  94. return true;
  95. }
  96. if ((node_block->tag == BLOCK_LIST || node_block->tag == BLOCK_LIST_ITEM) && node_block->last_child) {
  97. return ends_with_blank_line(node_block->last_child);
  98. } else {
  99. return false;
  100. }
  101. }
  102. // Break out of all containing lists
  103. static int break_out_of_lists(node_block ** bptr, int line_number)
  104. {
  105. node_block *container = *bptr;
  106. node_block *b = container->top;
  107. // find first containing BLOCK_LIST:
  108. while (b && b->tag != BLOCK_LIST) {
  109. b = b->last_child;
  110. }
  111. if (b) {
  112. while (container && container != b) {
  113. finalize(container, line_number);
  114. container = container->parent;
  115. }
  116. finalize(b, line_number);
  117. *bptr = b->parent;
  118. }
  119. return 0;
  120. }
  121. static void finalize(node_block* b, int line_number)
  122. {
  123. int firstlinelen;
  124. int pos;
  125. node_block* item;
  126. node_block* subitem;
  127. if (!b->open)
  128. return; // don't do anything if the node_block is already closed
  129. b->open = false;
  130. if (line_number > b->start_line) {
  131. b->end_line = line_number - 1;
  132. } else {
  133. b->end_line = line_number;
  134. }
  135. switch (b->tag) {
  136. case BLOCK_PARAGRAPH:
  137. pos = 0;
  138. while (strbuf_at(&b->string_content, 0) == '[' &&
  139. (pos = parse_reference_inline(&b->string_content, b->top->as.document.refmap))) {
  140. strbuf_drop(&b->string_content, pos);
  141. }
  142. if (is_blank(&b->string_content, 0)) {
  143. b->tag = BLOCK_REFERENCE_DEF;
  144. }
  145. break;
  146. case BLOCK_INDENTED_CODE:
  147. remove_trailing_blank_lines(&b->string_content);
  148. strbuf_putc(&b->string_content, '\n');
  149. break;
  150. case BLOCK_FENCED_CODE:
  151. // first line of contents becomes info
  152. firstlinelen = strbuf_strchr(&b->string_content, '\n', 0);
  153. strbuf_init(&b->as.code.info, 0);
  154. houdini_unescape_html_f(
  155. &b->as.code.info,
  156. b->string_content.ptr,
  157. firstlinelen
  158. );
  159. strbuf_drop(&b->string_content, firstlinelen + 1);
  160. strbuf_trim(&b->as.code.info);
  161. strbuf_unescape(&b->as.code.info);
  162. break;
  163. case BLOCK_LIST: // determine tight/loose status
  164. b->as.list.tight = true; // tight by default
  165. item = b->children;
  166. while (item) {
  167. // check for non-final non-empty list item ending with blank line:
  168. if (item->last_line_blank && item->next) {
  169. b->as.list.tight = false;
  170. break;
  171. }
  172. // recurse into children of list item, to see if there are
  173. // spaces between them:
  174. subitem = item->children;
  175. while (subitem) {
  176. if (ends_with_blank_line(subitem) &&
  177. (item->next || subitem->next)) {
  178. b->as.list.tight = false;
  179. break;
  180. }
  181. subitem = subitem->next;
  182. }
  183. if (!(b->as.list.tight)) {
  184. break;
  185. }
  186. item = item->next;
  187. }
  188. break;
  189. default:
  190. break;
  191. }
  192. }
  193. // Add a node_block as child of another. Return pointer to child.
  194. static node_block* add_child(node_block* parent,
  195. int block_type, int start_line, int start_column)
  196. {
  197. assert(parent);
  198. // if 'parent' isn't the kind of node_block that can accept this child,
  199. // then back up til we hit a node_block that can.
  200. while (!can_contain(parent->tag, block_type)) {
  201. finalize(parent, start_line);
  202. parent = parent->parent;
  203. }
  204. node_block* child = make_block(block_type, start_line, start_column);
  205. child->parent = parent;
  206. child->top = parent->top;
  207. if (parent->last_child) {
  208. parent->last_child->next = child;
  209. child->prev = parent->last_child;
  210. } else {
  211. parent->children = child;
  212. child->prev = NULL;
  213. }
  214. parent->last_child = child;
  215. return child;
  216. }
  217. typedef struct BlockStack {
  218. struct BlockStack *previous;
  219. node_block *next_sibling;
  220. } block_stack;
  221. // Walk through node_block and all children, recursively, parsing
  222. // string content into inline content where appropriate.
  223. static void process_inlines(node_block* cur, reference_map *refmap)
  224. {
  225. block_stack* stack = NULL;
  226. block_stack* newstack = NULL;
  227. while (cur != NULL) {
  228. switch (cur->tag) {
  229. case BLOCK_PARAGRAPH:
  230. case BLOCK_ATX_HEADER:
  231. case BLOCK_SETEXT_HEADER:
  232. cur->inline_content = parse_inlines(&cur->string_content, refmap);
  233. break;
  234. default:
  235. break;
  236. }
  237. if (cur->children) {
  238. newstack = (block_stack*)malloc(sizeof(block_stack));
  239. if (newstack == NULL) return;
  240. newstack->previous = stack;
  241. stack = newstack;
  242. stack->next_sibling = cur->next;
  243. cur = cur->children;
  244. } else {
  245. cur = cur->next;
  246. }
  247. while (cur == NULL && stack != NULL) {
  248. cur = stack->next_sibling;
  249. newstack = stack->previous;
  250. free(stack);
  251. stack = newstack;
  252. }
  253. }
  254. while (stack != NULL) {
  255. newstack = stack->previous;
  256. free(stack);
  257. stack = newstack;
  258. }
  259. }
  260. // Attempts to parse a list item marker (bullet or enumerated).
  261. // On success, returns length of the marker, and populates
  262. // data with the details. On failure, returns 0.
  263. static int parse_list_marker(chunk *input, int pos, struct ListData ** dataptr)
  264. {
  265. unsigned char c;
  266. int startpos;
  267. struct ListData * data;
  268. startpos = pos;
  269. c = peek_at(input, pos);
  270. if ((c == '*' || c == '-' || c == '+') && !scan_hrule(input, pos)) {
  271. pos++;
  272. if (!isspace(peek_at(input, pos))) {
  273. return 0;
  274. }
  275. data = calloc(1, sizeof(*data));
  276. if(data == NULL) {
  277. return 0;
  278. } else {
  279. data->marker_offset = 0; // will be adjusted later
  280. data->list_type = bullet;
  281. data->bullet_char = c;
  282. data->start = 1;
  283. data->delimiter = period;
  284. data->tight = false;
  285. }
  286. } else if (isdigit(c)) {
  287. int start = 0;
  288. do {
  289. start = (10 * start) + (peek_at(input, pos) - '0');
  290. pos++;
  291. } while (isdigit(peek_at(input, pos)));
  292. c = peek_at(input, pos);
  293. if (c == '.' || c == ')') {
  294. pos++;
  295. if (!isspace(peek_at(input, pos))) {
  296. return 0;
  297. }
  298. data = calloc(1, sizeof(*data));
  299. if(data == NULL) {
  300. return 0;
  301. } else {
  302. data->marker_offset = 0; // will be adjusted later
  303. data->list_type = ordered;
  304. data->bullet_char = 0;
  305. data->start = start;
  306. data->delimiter = (c == '.' ? period : parens);
  307. data->tight = false;
  308. }
  309. } else {
  310. return 0;
  311. }
  312. } else {
  313. return 0;
  314. }
  315. *dataptr = data;
  316. return (pos - startpos);
  317. }
  318. // Return 1 if list item belongs in list, else 0.
  319. static int lists_match(struct ListData *list_data, struct ListData *item_data)
  320. {
  321. return (list_data->list_type == item_data->list_type &&
  322. list_data->delimiter == item_data->delimiter &&
  323. // list_data->marker_offset == item_data.marker_offset &&
  324. list_data->bullet_char == item_data->bullet_char);
  325. }
  326. static node_block *finalize_document(node_block *document, int linenum)
  327. {
  328. start_timer();
  329. while (document != document->top) {
  330. finalize(document, linenum);
  331. document = document->parent;
  332. }
  333. finalize(document, linenum);
  334. process_inlines(document, document->as.document.refmap);
  335. end_timer("finalize_document");
  336. return document;
  337. }
  338. extern node_block *cmark_parse_file(FILE *f)
  339. {
  340. strbuf line = GH_BUF_INIT;
  341. unsigned char buffer[4096];
  342. int linenum = 1;
  343. node_block *document = make_document();
  344. start_timer();
  345. while (fgets((char *)buffer, sizeof(buffer), f)) {
  346. utf8proc_detab(&line, buffer, strlen((char *)buffer));
  347. incorporate_line(&line, linenum, &document);
  348. strbuf_clear(&line);
  349. linenum++;
  350. }
  351. end_timer("incorporate_line(s)");
  352. strbuf_free(&line);
  353. return finalize_document(document, linenum);
  354. }
  355. extern node_block *cmark_parse_document(const unsigned char *buffer, size_t len)
  356. {
  357. strbuf line = GH_BUF_INIT;
  358. int linenum = 1;
  359. const unsigned char *end = buffer + len;
  360. node_block *document = make_document();
  361. while (buffer < end) {
  362. const unsigned char *eol = memchr(buffer, '\n', end - buffer);
  363. if (!eol) {
  364. utf8proc_detab(&line, buffer, end - buffer);
  365. buffer = end;
  366. } else {
  367. utf8proc_detab(&line, buffer, (eol - buffer) + 1);
  368. buffer += (eol - buffer) + 1;
  369. }
  370. incorporate_line(&line, linenum, &document);
  371. strbuf_clear(&line);
  372. linenum++;
  373. }
  374. strbuf_free(&line);
  375. return finalize_document(document, linenum);
  376. }
  377. static void chop_trailing_hashtags(chunk *ch)
  378. {
  379. int n, orig_n;
  380. chunk_rtrim(ch);
  381. orig_n = n = ch->len - 1;
  382. // if string ends in space followed by #s, remove these:
  383. while (n >= 0 && peek_at(ch, n) == '#')
  384. n--;
  385. // Check for a be a space before the final #s:
  386. if (n != orig_n && n >= 0 && peek_at(ch, n) == ' ') {
  387. ch->len = n;
  388. chunk_rtrim(ch);
  389. }
  390. }
  391. // Process one line at a time, modifying a node_block.
  392. static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
  393. {
  394. node_block* last_matched_container;
  395. int offset = 0;
  396. int matched = 0;
  397. int lev = 0;
  398. int i;
  399. struct ListData * data = NULL;
  400. bool all_matched = true;
  401. node_block* container;
  402. node_block* cur = *curptr;
  403. bool blank = false;
  404. int first_nonspace;
  405. int indent;
  406. chunk input;
  407. // Add a newline to the end if not present:
  408. if (line->ptr[line->size - 1] != '\n') {
  409. strbuf_putc(line, '\n');
  410. }
  411. input.data = line->ptr;
  412. input.len = line->size;
  413. // container starts at the document root.
  414. container = cur->top;
  415. // for each containing node_block, try to parse the associated line start.
  416. // bail out on failure: container will point to the last matching node_block.
  417. while (container->last_child && container->last_child->open) {
  418. container = container->last_child;
  419. first_nonspace = offset;
  420. while (peek_at(&input, first_nonspace) == ' ') {
  421. first_nonspace++;
  422. }
  423. indent = first_nonspace - offset;
  424. blank = peek_at(&input, first_nonspace) == '\n';
  425. if (container->tag == BLOCK_BQUOTE) {
  426. matched = indent <= 3 && peek_at(&input, first_nonspace) == '>';
  427. if (matched) {
  428. offset = first_nonspace + 1;
  429. if (peek_at(&input, offset) == ' ')
  430. offset++;
  431. } else {
  432. all_matched = false;
  433. }
  434. } else if (container->tag == BLOCK_LIST_ITEM) {
  435. if (indent >= container->as.list.marker_offset +
  436. container->as.list.padding) {
  437. offset += container->as.list.marker_offset +
  438. container->as.list.padding;
  439. } else if (blank) {
  440. offset = first_nonspace;
  441. } else {
  442. all_matched = false;
  443. }
  444. } else if (container->tag == BLOCK_INDENTED_CODE) {
  445. if (indent >= CODE_INDENT) {
  446. offset += CODE_INDENT;
  447. } else if (blank) {
  448. offset = first_nonspace;
  449. } else {
  450. all_matched = false;
  451. }
  452. } else if (container->tag == BLOCK_ATX_HEADER ||
  453. container->tag == BLOCK_SETEXT_HEADER) {
  454. // a header can never contain more than one line
  455. all_matched = false;
  456. } else if (container->tag == BLOCK_FENCED_CODE) {
  457. // skip optional spaces of fence offset
  458. i = container->as.code.fence_offset;
  459. while (i > 0 && peek_at(&input, offset) == ' ') {
  460. offset++;
  461. i--;
  462. }
  463. } else if (container->tag == BLOCK_HTML) {
  464. if (blank) {
  465. all_matched = false;
  466. }
  467. } else if (container->tag == BLOCK_PARAGRAPH) {
  468. if (blank) {
  469. container->last_line_blank = true;
  470. all_matched = false;
  471. }
  472. }
  473. if (!all_matched) {
  474. container = container->parent; // back up to last matching node_block
  475. break;
  476. }
  477. }
  478. last_matched_container = container;
  479. // check to see if we've hit 2nd blank line, break out of list:
  480. if (blank && container->last_line_blank) {
  481. break_out_of_lists(&container, line_number);
  482. }
  483. // unless last matched container is code node_block, try new container starts:
  484. while (container->tag != BLOCK_FENCED_CODE && container->tag != BLOCK_INDENTED_CODE &&
  485. container->tag != BLOCK_HTML) {
  486. first_nonspace = offset;
  487. while (peek_at(&input, first_nonspace) == ' ')
  488. first_nonspace++;
  489. indent = first_nonspace - offset;
  490. blank = peek_at(&input, first_nonspace) == '\n';
  491. if (indent >= CODE_INDENT) {
  492. if (cur->tag != BLOCK_PARAGRAPH && !blank) {
  493. offset += CODE_INDENT;
  494. container = add_child(container, BLOCK_INDENTED_CODE, line_number, offset + 1);
  495. } else { // indent > 4 in lazy line
  496. break;
  497. }
  498. } else if (peek_at(&input, first_nonspace) == '>') {
  499. offset = first_nonspace + 1;
  500. // optional following character
  501. if (peek_at(&input, offset) == ' ')
  502. offset++;
  503. container = add_child(container, BLOCK_BQUOTE, line_number, offset + 1);
  504. } else if ((matched = scan_atx_header_start(&input, first_nonspace))) {
  505. offset = first_nonspace + matched;
  506. container = add_child(container, BLOCK_ATX_HEADER, line_number, offset + 1);
  507. int hashpos = chunk_strchr(&input, '#', first_nonspace);
  508. int level = 0;
  509. while (peek_at(&input, hashpos) == '#') {
  510. level++;
  511. hashpos++;
  512. }
  513. container->as.header.level = level;
  514. } else if ((matched = scan_open_code_fence(&input, first_nonspace))) {
  515. container = add_child(container, BLOCK_FENCED_CODE, line_number, first_nonspace + 1);
  516. container->as.code.fence_char = peek_at(&input, first_nonspace);
  517. container->as.code.fence_length = matched;
  518. container->as.code.fence_offset = first_nonspace - offset;
  519. offset = first_nonspace + matched;
  520. } else if ((matched = scan_html_block_tag(&input, first_nonspace))) {
  521. container = add_child(container, BLOCK_HTML, line_number, first_nonspace + 1);
  522. // note, we don't adjust offset because the tag is part of the text
  523. } else if (container->tag == BLOCK_PARAGRAPH &&
  524. (lev = scan_setext_header_line(&input, first_nonspace)) &&
  525. // check that there is only one line in the paragraph:
  526. strbuf_strrchr(&container->string_content, '\n',
  527. strbuf_len(&container->string_content) - 2) < 0) {
  528. container->tag = BLOCK_SETEXT_HEADER;
  529. container->as.header.level = lev;
  530. offset = input.len - 1;
  531. } else if (!(container->tag == BLOCK_PARAGRAPH && !all_matched) &&
  532. (matched = scan_hrule(&input, first_nonspace))) {
  533. // it's only now that we know the line is not part of a setext header:
  534. container = add_child(container, BLOCK_HRULE, line_number, first_nonspace + 1);
  535. finalize(container, line_number);
  536. container = container->parent;
  537. offset = input.len - 1;
  538. } else if ((matched = parse_list_marker(&input, first_nonspace, &data))) {
  539. // compute padding:
  540. offset = first_nonspace + matched;
  541. i = 0;
  542. while (i <= 5 && peek_at(&input, offset + i) == ' ') {
  543. i++;
  544. }
  545. // i = number of spaces after marker, up to 5
  546. if (i >= 5 || i < 1 || peek_at(&input, offset) == '\n') {
  547. data->padding = matched + 1;
  548. if (i > 0) {
  549. offset += 1;
  550. }
  551. } else {
  552. data->padding = matched + i;
  553. offset += i;
  554. }
  555. // check container; if it's a list, see if this list item
  556. // can continue the list; otherwise, create a list container.
  557. data->marker_offset = indent;
  558. if (container->tag != BLOCK_LIST ||
  559. !lists_match(&container->as.list, data)) {
  560. container = add_child(container, BLOCK_LIST, line_number,
  561. first_nonspace + 1);
  562. memcpy(&container->as.list, data, sizeof(*data));
  563. }
  564. // add the list item
  565. container = add_child(container, BLOCK_LIST_ITEM, line_number,
  566. first_nonspace + 1);
  567. /* TODO: static */
  568. memcpy(&container->as.list, data, sizeof(*data));
  569. free(data);
  570. } else {
  571. break;
  572. }
  573. if (accepts_lines(container->tag)) {
  574. // if it's a line container, it can't contain other containers
  575. break;
  576. }
  577. }
  578. // what remains at offset is a text line. add the text to the
  579. // appropriate container.
  580. first_nonspace = offset;
  581. while (peek_at(&input, first_nonspace) == ' ')
  582. first_nonspace++;
  583. indent = first_nonspace - offset;
  584. blank = peek_at(&input, first_nonspace) == '\n';
  585. // node_block quote lines are never blank as they start with >
  586. // and we don't count blanks in fenced code for purposes of tight/loose
  587. // lists or breaking out of lists. we also don't set last_line_blank
  588. // on an empty list item.
  589. container->last_line_blank = (blank &&
  590. container->tag != BLOCK_BQUOTE &&
  591. container->tag != BLOCK_FENCED_CODE &&
  592. !(container->tag == BLOCK_LIST_ITEM &&
  593. container->children == NULL &&
  594. container->start_line == line_number));
  595. node_block *cont = container;
  596. while (cont->parent) {
  597. cont->parent->last_line_blank = false;
  598. cont = cont->parent;
  599. }
  600. if (cur != last_matched_container &&
  601. container == last_matched_container &&
  602. !blank &&
  603. cur->tag == BLOCK_PARAGRAPH &&
  604. strbuf_len(&cur->string_content) > 0) {
  605. add_line(cur, &input, offset);
  606. } else { // not a lazy continuation
  607. // finalize any blocks that were not matched and set cur to container:
  608. while (cur != last_matched_container) {
  609. finalize(cur, line_number);
  610. cur = cur->parent;
  611. assert(cur != NULL);
  612. }
  613. if (container->tag == BLOCK_INDENTED_CODE) {
  614. add_line(container, &input, offset);
  615. } else if (container->tag == BLOCK_FENCED_CODE) {
  616. matched = 0;
  617. if (indent <= 3 &&
  618. peek_at(&input, first_nonspace) == container->as.code.fence_char) {
  619. int fence_len = scan_close_code_fence(&input, first_nonspace);
  620. if (fence_len > container->as.code.fence_length)
  621. matched = 1;
  622. }
  623. if (matched) {
  624. // if closing fence, don't add line to container; instead, close it:
  625. finalize(container, line_number);
  626. container = container->parent; // back up to parent
  627. } else {
  628. add_line(container, &input, offset);
  629. }
  630. } else if (container->tag == BLOCK_HTML) {
  631. add_line(container, &input, offset);
  632. } else if (blank) {
  633. // ??? do nothing
  634. } else if (container->tag == BLOCK_ATX_HEADER) {
  635. chop_trailing_hashtags(&input);
  636. add_line(container, &input, first_nonspace);
  637. finalize(container, line_number);
  638. container = container->parent;
  639. } else if (accepts_lines(container->tag)) {
  640. add_line(container, &input, first_nonspace);
  641. } else if (container->tag != BLOCK_HRULE && container->tag != BLOCK_SETEXT_HEADER) {
  642. // create paragraph container for line
  643. container = add_child(container, BLOCK_PARAGRAPH, line_number, first_nonspace + 1);
  644. add_line(container, &input, first_nonspace);
  645. } else {
  646. assert(false);
  647. }
  648. *curptr = container;
  649. }
  650. }