aboutsummaryrefslogtreecommitdiff
path: root/src/inlines.c
blob: f681080d9842056f3da898f6461cb8dab22f653c (plain)
  1. #include <stdlib.h>
  2. #include <string.h>
  3. #include <stdio.h>
  4. #include <ctype.h>
  5. #include "config.h"
  6. #include "node.h"
  7. #include "parser.h"
  8. #include "references.h"
  9. #include "cmark.h"
  10. #include "html/houdini.h"
  11. #include "utf8.h"
  12. #include "scanners.h"
  13. #include "inlines.h"
  // Shorthand constructors for the simple kinds of inline nodes.  Each macro
  // forwards to make_literal, make_simple or make_inlines with the matching
  // node type.
  #define make_str(s)           make_literal(CMARK_NODE_STRING, (s))
  #define make_code(s)          make_literal(CMARK_NODE_INLINE_CODE, (s))
  #define make_raw_html(s)      make_literal(CMARK_NODE_INLINE_HTML, (s))
  #define make_linebreak()      make_simple(CMARK_NODE_LINEBREAK)
  #define make_softbreak()      make_simple(CMARK_NODE_SOFTBREAK)
  #define make_emph(contents)   make_inlines(CMARK_NODE_EMPH, (contents))
  #define make_strong(contents) make_inlines(CMARK_NODE_STRONG, (contents))
  22. typedef struct DelimiterStack {
  23. struct DelimiterStack *previous;
  24. struct DelimiterStack *next;
  25. cmark_node *first_inline;
  26. int delim_count;
  27. unsigned char delim_char;
  28. int position;
  29. bool can_open;
  30. bool can_close;
  31. } delimiter_stack;
  32. typedef struct Subject {
  33. chunk input;
  34. int pos;
  35. reference_map *refmap;
  36. delimiter_stack *delimiters;
  37. } subject;
  38. static int parse_inline(subject* subj, cmark_node * parent);
  39. static void subject_from_buf(subject *e, strbuf *buffer, reference_map *refmap);
  40. static int subject_find_special_char(subject *subj);
  41. static unsigned char *cmark_clean_autolink(chunk *url, int is_email)
  42. {
  43. strbuf buf = GH_BUF_INIT;
  44. chunk_trim(url);
  45. if (url->len == 0)
  46. return NULL;
  47. if (is_email)
  48. strbuf_puts(&buf, "mailto:");
  49. houdini_unescape_html_f(&buf, url->data, url->len);
  50. return strbuf_detach(&buf);
  51. }
  52. static inline cmark_node *make_link(cmark_node *label, unsigned char *url, unsigned char *title)
  53. {
  54. cmark_node* e = (cmark_node *)calloc(1, sizeof(*e));
  55. if(e != NULL) {
  56. e->type = CMARK_NODE_LINK;
  57. e->first_child = label;
  58. e->last_child = label;
  59. e->as.link.url = url;
  60. e->as.link.title = title;
  61. e->next = NULL;
  62. label->parent = e;
  63. }
  64. return e;
  65. }
  66. static inline cmark_node* make_autolink(cmark_node* label, cmark_chunk url, int is_email)
  67. {
  68. return make_link(label, cmark_clean_autolink(&url, is_email), NULL);
  69. }
  70. // Setting 'last_child' and the parent of 'contents' is up to the caller.
  71. static inline cmark_node* make_inlines(cmark_node_type t, cmark_node* contents)
  72. {
  73. cmark_node * e = (cmark_node *)calloc(1, sizeof(*e));
  74. if(e != NULL) {
  75. e->type = t;
  76. e->first_child = contents;
  77. e->next = NULL;
  78. }
  79. return e;
  80. }
  81. // Create an inline with a literal string value.
  82. static inline cmark_node* make_literal(cmark_node_type t, cmark_chunk s)
  83. {
  84. cmark_node * e = (cmark_node *)calloc(1, sizeof(*e));
  85. if(e != NULL) {
  86. e->type = t;
  87. e->as.literal = s;
  88. e->next = NULL;
  89. }
  90. return e;
  91. }
  92. // Create an inline with no value.
  93. static inline cmark_node* make_simple(cmark_node_type t)
  94. {
  95. cmark_node* e = (cmark_node *)calloc(1, sizeof(*e));
  96. if(e != NULL) {
  97. e->type = t;
  98. e->next = NULL;
  99. }
  100. return e;
  101. }
  // Duplicate a NUL-terminated byte string.
  // Returns a newly allocated copy that the caller must free(), or NULL when
  // 'buf' is NULL or the allocation fails.
  static unsigned char *bufdup(const unsigned char *buf)
  {
      unsigned char *new_buf;
      size_t size;

      if (buf == NULL) {
          return NULL;
      }

      // Keep the length in size_t: narrowing it to int would misbehave for
      // strings longer than INT_MAX.
      size = strlen((const char *)buf) + 1;
      new_buf = (unsigned char *)malloc(size);
      if (new_buf != NULL) {
          memcpy(new_buf, buf, size); // copies the terminating NUL as well
      }
      return new_buf;
  }
  114. static void subject_from_buf(subject *e, strbuf *buffer, reference_map *refmap)
  115. {
  116. e->input.data = buffer->ptr;
  117. e->input.len = buffer->size;
  118. e->input.alloc = 0;
  119. e->pos = 0;
  120. e->refmap = refmap;
  121. e->delimiters = NULL;
  122. chunk_rtrim(&e->input);
  123. }
  // Predicate for take_while: returns 1 if c is a backtick, 0 otherwise.
  static inline int isbacktick(int c)
  {
      if (c == '`') {
          return 1;
      }
      return 0;
  }
  128. static inline unsigned char peek_char(subject *subj)
  129. {
  130. return (subj->pos < subj->input.len) ? subj->input.data[subj->pos] : 0;
  131. }
  132. static inline unsigned char peek_at(subject *subj, int pos)
  133. {
  134. return subj->input.data[pos];
  135. }
  136. // Return true if there are more characters in the subject.
  137. static inline int is_eof(subject* subj)
  138. {
  139. return (subj->pos >= subj->input.len);
  140. }
  // Advance the subject by one character. Doesn't check for eof.
  // The expansion is fully parenthesized so the macro behaves as a single
  // expression wherever it is used (e.g. `advance(s) * 2` no longer parses
  // as `pos += 1 * 2`).
  #define advance(subj) ((subj)->pos += 1)
  143. // Take characters while a predicate holds, and return a string.
  144. static inline chunk take_while(subject* subj, int (*f)(int))
  145. {
  146. unsigned char c;
  147. int startpos = subj->pos;
  148. int len = 0;
  149. while ((c = peek_char(subj)) && (*f)(c)) {
  150. advance(subj);
  151. len++;
  152. }
  153. return chunk_dup(&subj->input, startpos, len);
  154. }
  155. // Try to process a backtick code span that began with a
  156. // span of ticks of length openticklength length (already
  157. // parsed). Return 0 if you don't find matching closing
  158. // backticks, otherwise return the position in the subject
  159. // after the closing backticks.
  160. static int scan_to_closing_backticks(subject* subj, int openticklength)
  161. {
  162. // read non backticks
  163. unsigned char c;
  164. while ((c = peek_char(subj)) && c != '`') {
  165. advance(subj);
  166. }
  167. if (is_eof(subj)) {
  168. return 0; // did not find closing ticks, return 0
  169. }
  170. int numticks = 0;
  171. while (peek_char(subj) == '`') {
  172. advance(subj);
  173. numticks++;
  174. }
  175. if (numticks != openticklength){
  176. return(scan_to_closing_backticks(subj, openticklength));
  177. }
  178. return (subj->pos);
  179. }
  180. // Parse backtick code section or raw backticks, return an inline.
  181. // Assumes that the subject has a backtick at the current position.
  182. static cmark_node* handle_backticks(subject *subj)
  183. {
  184. chunk openticks = take_while(subj, isbacktick);
  185. int startpos = subj->pos;
  186. int endpos = scan_to_closing_backticks(subj, openticks.len);
  187. if (endpos == 0) { // not found
  188. subj->pos = startpos; // rewind
  189. return make_str(openticks);
  190. } else {
  191. strbuf buf = GH_BUF_INIT;
  192. strbuf_set(&buf, subj->input.data + startpos, endpos - startpos - openticks.len);
  193. strbuf_trim(&buf);
  194. strbuf_normalize_whitespace(&buf);
  195. return make_code(chunk_buf_detach(&buf));
  196. }
  197. }
  198. // Scan ***, **, or * and return number scanned, or 0.
  199. // Advances position.
  200. static int scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close)
  201. {
  202. int numdelims = 0;
  203. unsigned char char_before, char_after;
  204. char_before = subj->pos == 0 ? '\n' : peek_at(subj, subj->pos - 1);
  205. while (peek_char(subj) == c) {
  206. numdelims++;
  207. advance(subj);
  208. }
  209. char_after = peek_char(subj);
  210. *can_open = numdelims > 0 && !isspace(char_after);
  211. *can_close = numdelims > 0 && !isspace(char_before);
  212. if (c == '_') {
  213. *can_open = *can_open && !isalnum(char_before);
  214. *can_close = *can_close && !isalnum(char_after);
  215. }
  216. return numdelims;
  217. }
  218. /*
  219. static void print_delimiters(subject *subj)
  220. {
  221. delimiter_stack *tempstack;
  222. tempstack = subj->delimiters;
  223. while (tempstack != NULL) {
  224. printf("Item at %p: %d %d %d %d next(%p) prev(%p)\n",
  225. tempstack, tempstack->delim_count, tempstack->delim_char,
  226. tempstack->can_open, tempstack->can_close,
  227. tempstack->next, tempstack->previous);
  228. tempstack = tempstack->previous;
  229. }
  230. }
  231. */
  232. static void remove_delimiter(subject *subj, delimiter_stack *stack)
  233. {
  234. if (stack->previous != NULL) {
  235. stack->previous->next = stack->next;
  236. }
  237. if (stack->next == NULL) {
  238. // top of stack
  239. subj->delimiters = stack->previous;
  240. } else {
  241. stack->next->previous = stack->previous;
  242. }
  243. free(stack);
  244. }
  245. static delimiter_stack * push_delimiter(subject *subj,
  246. int numdelims,
  247. unsigned char c,
  248. bool can_open,
  249. bool can_close,
  250. cmark_node *inl_text)
  251. {
  252. delimiter_stack *istack =
  253. (delimiter_stack*)malloc(sizeof(delimiter_stack));
  254. if (istack == NULL) {
  255. return NULL;
  256. }
  257. istack->delim_count = numdelims;
  258. istack->delim_char = c;
  259. istack->can_open = can_open;
  260. istack->can_close = can_close;
  261. istack->first_inline = inl_text;
  262. istack->previous = subj->delimiters;
  263. istack->next = NULL;
  264. if (istack->previous != NULL) {
  265. istack->previous->next = istack;
  266. }
  267. istack->position = subj->pos;
  268. return istack;
  269. }
  270. // Parse strong/emph or a fallback.
  271. // Assumes the subject has '_' or '*' at the current position.
  272. static cmark_node* handle_strong_emph(subject* subj, unsigned char c)
  273. {
  274. int numdelims;
  275. cmark_node * inl_text;
  276. bool can_open, can_close;
  277. numdelims = scan_delims(subj, c, &can_open, &can_close);
  278. inl_text = make_str(chunk_dup(&subj->input, subj->pos - numdelims, numdelims));
  279. if (can_open || can_close) {
  280. subj->delimiters = push_delimiter(subj, numdelims, c, can_open, can_close,
  281. inl_text);
  282. }
  283. return inl_text;
  284. }
  285. static void process_emphasis(subject *subj, delimiter_stack *stack_bottom)
  286. {
  287. delimiter_stack *closer = subj->delimiters;
  288. delimiter_stack *opener, *tempstack, *nextstack;
  289. int use_delims;
  290. cmark_node *inl, *tmp, *emph;
  291. // move back to first relevant delim.
  292. while (closer != NULL && closer->previous != stack_bottom) {
  293. closer = closer->previous;
  294. }
  295. // now move forward, looking for closers, and handling each
  296. while (closer != NULL) {
  297. if (closer->can_close &&
  298. (closer->delim_char == '*' || closer->delim_char == '_')) {
  299. // Now look backwards for first matching opener:
  300. opener = closer->previous;
  301. while (opener != NULL && opener != stack_bottom) {
  302. if (opener->delim_char == closer->delim_char &&
  303. opener->can_open) {
  304. break;
  305. }
  306. opener = opener->previous;
  307. }
  308. if (opener != NULL && opener != stack_bottom) {
  309. // calculate the actual number of delimeters used from this closer
  310. if (closer->delim_count < 3 || opener->delim_count < 3) {
  311. use_delims = closer->delim_count <= opener->delim_count ?
  312. closer->delim_count : opener->delim_count;
  313. } else { // closer and opener both have >= 3 delims
  314. use_delims = closer->delim_count % 2 == 0 ? 2 : 1;
  315. }
  316. inl = opener->first_inline;
  317. // remove used delimiters from stack elements and associated inlines.
  318. opener->delim_count -= use_delims;
  319. closer->delim_count -= use_delims;
  320. inl->as.literal.len = opener->delim_count;
  321. closer->first_inline->as.literal.len = closer->delim_count;
  322. // free delimiters between opener and closer
  323. tempstack = closer->previous;
  324. while (tempstack != NULL && tempstack != opener) {
  325. nextstack = tempstack->previous;
  326. remove_delimiter(subj, tempstack);
  327. tempstack = nextstack;
  328. }
  329. // create new emph or strong, and splice it in to our inlines
  330. // between the opener and closer
  331. emph = use_delims == 1 ? make_emph(inl->next) : make_strong(inl->next);
  332. emph->next = closer->first_inline;
  333. emph->prev = inl;
  334. inl->next = emph;
  335. // if opener has 0 delims, remove it and its associated inline
  336. if (opener->delim_count == 0) {
  337. // replace empty opener inline with emph
  338. chunk_free(&(inl->as.literal));
  339. inl->type = emph->type;
  340. inl->next = emph->next;
  341. inl->first_child = emph->first_child;
  342. free(emph);
  343. emph = inl;
  344. // remove opener from stack
  345. remove_delimiter(subj, opener);
  346. }
  347. // fix tree structure
  348. tmp = emph->first_child;
  349. while (tmp->next != NULL && tmp->next != closer->first_inline) {
  350. tmp->parent = emph;
  351. tmp = tmp->next;
  352. }
  353. tmp->parent = emph;
  354. if (tmp->next) {
  355. tmp->next->prev = emph;
  356. }
  357. tmp->next = NULL;
  358. emph->last_child = tmp;
  359. // if closer has 0 delims, remove it and its associated inline
  360. if (closer->delim_count == 0) {
  361. // remove empty closer inline
  362. tmp = closer->first_inline;
  363. emph->next = tmp->next;
  364. if (tmp->next) {
  365. tmp->next->prev = emph;
  366. }
  367. cmark_node_unlink(tmp);
  368. cmark_free_nodes(tmp);
  369. // remove closer from stack
  370. tempstack = closer->next;
  371. remove_delimiter(subj, closer);
  372. closer = tempstack;
  373. }
  374. } else {
  375. closer = closer->next;
  376. }
  377. } else {
  378. closer = closer->next;
  379. }
  380. }
  381. // free all delimiters in stack down to stack_bottom:
  382. while (subj->delimiters != stack_bottom) {
  383. remove_delimiter(subj, subj->delimiters);
  384. }
  385. }
  386. // Parse backslash-escape or just a backslash, returning an inline.
  387. static cmark_node* handle_backslash(subject *subj)
  388. {
  389. advance(subj);
  390. unsigned char nextchar = peek_char(subj);
  391. if (ispunct(nextchar)) { // only ascii symbols and newline can be escaped
  392. advance(subj);
  393. return make_str(chunk_dup(&subj->input, subj->pos - 1, 1));
  394. } else if (nextchar == '\n') {
  395. advance(subj);
  396. return make_linebreak();
  397. } else {
  398. return make_str(chunk_literal("\\"));
  399. }
  400. }
  401. // Parse an entity or a regular "&" string.
  402. // Assumes the subject has an '&' character at the current position.
  403. static cmark_node* handle_entity(subject* subj)
  404. {
  405. strbuf ent = GH_BUF_INIT;
  406. size_t len;
  407. advance(subj);
  408. len = houdini_unescape_ent(&ent,
  409. subj->input.data + subj->pos,
  410. subj->input.len - subj->pos
  411. );
  412. if (len == 0)
  413. return make_str(chunk_literal("&"));
  414. subj->pos += len;
  415. return make_str(chunk_buf_detach(&ent));
  416. }
  417. // Like make_str, but parses entities.
  418. // Returns an inline sequence consisting of str and entity elements.
  419. static cmark_node *make_str_with_entities(chunk *content)
  420. {
  421. strbuf unescaped = GH_BUF_INIT;
  422. if (houdini_unescape_html(&unescaped, content->data, (size_t)content->len)) {
  423. return make_str(chunk_buf_detach(&unescaped));
  424. } else {
  425. return make_str(*content);
  426. }
  427. }
  428. // Clean a URL: remove surrounding whitespace and surrounding <>,
  429. // and remove \ that escape punctuation.
  430. unsigned char *clean_url(chunk *url)
  431. {
  432. strbuf buf = GH_BUF_INIT;
  433. chunk_trim(url);
  434. if (url->len == 0)
  435. return NULL;
  436. if (url->data[0] == '<' && url->data[url->len - 1] == '>') {
  437. houdini_unescape_html_f(&buf, url->data + 1, url->len - 2);
  438. } else {
  439. houdini_unescape_html_f(&buf, url->data, url->len);
  440. }
  441. strbuf_unescape(&buf);
  442. return strbuf_detach(&buf);
  443. }
  444. unsigned char *clean_title(chunk *title)
  445. {
  446. strbuf buf = GH_BUF_INIT;
  447. unsigned char first, last;
  448. if (title->len == 0)
  449. return NULL;
  450. first = title->data[0];
  451. last = title->data[title->len - 1];
  452. // remove surrounding quotes if any:
  453. if ((first == '\'' && last == '\'') ||
  454. (first == '(' && last == ')') ||
  455. (first == '"' && last == '"')) {
  456. houdini_unescape_html_f(&buf, title->data + 1, title->len - 2);
  457. } else {
  458. houdini_unescape_html_f(&buf, title->data, title->len);
  459. }
  460. strbuf_unescape(&buf);
  461. return strbuf_detach(&buf);
  462. }
  463. // Parse an autolink or HTML tag.
  464. // Assumes the subject has a '<' character at the current position.
  465. static cmark_node* handle_pointy_brace(subject* subj)
  466. {
  467. int matchlen = 0;
  468. chunk contents;
  469. advance(subj); // advance past first <
  470. // first try to match a URL autolink
  471. matchlen = scan_autolink_uri(&subj->input, subj->pos);
  472. if (matchlen > 0) {
  473. contents = chunk_dup(&subj->input, subj->pos, matchlen - 1);
  474. subj->pos += matchlen;
  475. return make_autolink(
  476. make_str_with_entities(&contents),
  477. contents, 0
  478. );
  479. }
  480. // next try to match an email autolink
  481. matchlen = scan_autolink_email(&subj->input, subj->pos);
  482. if (matchlen > 0) {
  483. contents = chunk_dup(&subj->input, subj->pos, matchlen - 1);
  484. subj->pos += matchlen;
  485. return make_autolink(
  486. make_str_with_entities(&contents),
  487. contents, 1
  488. );
  489. }
  490. // finally, try to match an html tag
  491. matchlen = scan_html_tag(&subj->input, subj->pos);
  492. if (matchlen > 0) {
  493. contents = chunk_dup(&subj->input, subj->pos - 1, matchlen + 1);
  494. subj->pos += matchlen;
  495. return make_raw_html(contents);
  496. }
  497. // if nothing matches, just return the opening <:
  498. return make_str(chunk_literal("<"));
  499. }
  500. // Parse a link label. Returns 1 if successful.
  501. // Note: unescaped brackets are not allowed in labels.
  502. // The label begins with `[` and ends with the first `]` character
  503. // encountered. Backticks in labels do not start code spans.
  504. static int link_label(subject* subj, chunk *raw_label)
  505. {
  506. int startpos = subj->pos;
  507. int length = 0;
  508. advance(subj); // advance past [
  509. unsigned char c;
  510. while ((c = peek_char(subj)) && c != '[' && c != ']') {
  511. if (c == '\\') {
  512. advance(subj);
  513. length++;
  514. if (ispunct(peek_char(subj))) {
  515. advance(subj);
  516. length++;
  517. }
  518. } else {
  519. advance(subj);
  520. length++;
  521. }
  522. if (length > MAX_LINK_LABEL_LENGTH) {
  523. goto noMatch;
  524. }
  525. }
  526. if (c == ']') { // match found
  527. *raw_label = chunk_dup(&subj->input, startpos + 1, subj->pos - (startpos + 1));
  528. advance(subj); // advance past ]
  529. return 1;
  530. }
  531. noMatch:
  532. subj->pos = startpos; // rewind
  533. return 0;
  534. }
  535. // Return a link, an image, or a literal close bracket.
  536. static cmark_node* handle_close_bracket(subject* subj, cmark_node *parent)
  537. {
  538. int initial_pos;
  539. int starturl, endurl, starttitle, endtitle, endall;
  540. int n;
  541. int sps;
  542. reference *ref;
  543. bool is_image = false;
  544. chunk urlchunk, titlechunk;
  545. unsigned char *url, *title;
  546. delimiter_stack *opener;
  547. delimiter_stack *tempstack;
  548. cmark_node *link_text;
  549. cmark_node *inl;
  550. chunk raw_label;
  551. advance(subj); // advance past ]
  552. initial_pos = subj->pos;
  553. // look through stack of delimiters for a [ or !
  554. opener = subj->delimiters;
  555. while (opener) {
  556. if (opener->delim_char == '[' || opener->delim_char == '!') {
  557. break;
  558. }
  559. opener = opener->previous;
  560. }
  561. if (opener == NULL) {
  562. return make_str(chunk_literal("]"));
  563. }
  564. // If we got here, we matched a potential link/image text.
  565. is_image = opener->delim_char == '!';
  566. link_text = opener->first_inline->next;
  567. // Now we check to see if it's a link/image.
  568. // First, look for an inline link.
  569. if (peek_char(subj) == '(' &&
  570. ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) &&
  571. ((n = scan_link_url(&subj->input, subj->pos + 1 + sps)) > -1)) {
  572. // try to parse an explicit link:
  573. starturl = subj->pos + 1 + sps; // after (
  574. endurl = starturl + n;
  575. starttitle = endurl + scan_spacechars(&subj->input, endurl);
  576. // ensure there are spaces btw url and title
  577. endtitle = (starttitle == endurl) ? starttitle :
  578. starttitle + scan_link_title(&subj->input, starttitle);
  579. endall = endtitle + scan_spacechars(&subj->input, endtitle);
  580. if (peek_at(subj, endall) == ')') {
  581. subj->pos = endall + 1;
  582. urlchunk = chunk_dup(&subj->input, starturl, endurl - starturl);
  583. titlechunk = chunk_dup(&subj->input, starttitle, endtitle - starttitle);
  584. url = clean_url(&urlchunk);
  585. title = clean_title(&titlechunk);
  586. chunk_free(&urlchunk);
  587. chunk_free(&titlechunk);
  588. goto match;
  589. } else {
  590. goto noMatch;
  591. }
  592. }
  593. // Next, look for a following [link label] that matches in refmap.
  594. // skip spaces
  595. subj->pos = subj->pos + scan_spacechars(&subj->input, subj->pos);
  596. raw_label = chunk_literal("");
  597. if (!link_label(subj, &raw_label) || raw_label.len == 0) {
  598. chunk_free(&raw_label);
  599. raw_label = chunk_dup(&subj->input, opener->position,
  600. initial_pos - opener->position - 1);
  601. }
  602. ref = reference_lookup(subj->refmap, &raw_label);
  603. chunk_free(&raw_label);
  604. if (ref != NULL) { // found
  605. url = bufdup(ref->url);
  606. title = bufdup(ref->title);
  607. goto match;
  608. } else {
  609. goto noMatch;
  610. }
  611. noMatch:
  612. // If we fall through to here, it means we didn't match a link:
  613. remove_delimiter(subj, opener); // remove this opener from delimiter stack
  614. subj->pos = initial_pos;
  615. return make_str(chunk_literal("]"));
  616. match:
  617. inl = opener->first_inline;
  618. inl->type = is_image ? NODE_IMAGE : NODE_LINK;
  619. chunk_free(&inl->as.literal);
  620. inl->first_child = link_text;
  621. process_emphasis(subj, opener->previous);
  622. inl->as.link.url = url;
  623. inl->as.link.title = title;
  624. inl->next = NULL;
  625. if (link_text) {
  626. cmark_node *tmp;
  627. link_text->prev = NULL;
  628. for (tmp = link_text; tmp->next != NULL; tmp = tmp->next) {
  629. tmp->parent = inl;
  630. }
  631. tmp->parent = inl;
  632. inl->last_child = tmp;
  633. }
  634. parent->last_child = inl;
  635. // process_emphasis will remove this delimiter and all later ones.
  636. // Now, if we have a link, we also want to remove earlier link
  637. // delimiters. (This code can be removed if we decide to allow links
  638. // inside links.)
  639. if (!is_image) {
  640. opener = subj->delimiters;
  641. while (opener != NULL) {
  642. tempstack = opener->previous;
  643. if (opener->delim_char == '[') {
  644. remove_delimiter(subj, opener);
  645. }
  646. opener = tempstack;
  647. }
  648. }
  649. return NULL;
  650. }
  651. // Parse a hard or soft linebreak, returning an inline.
  652. // Assumes the subject has a newline at the current position.
  653. static cmark_node* handle_newline(subject *subj)
  654. {
  655. int nlpos = subj->pos;
  656. // skip over newline
  657. advance(subj);
  658. // skip spaces at beginning of line
  659. while (peek_char(subj) == ' ') {
  660. advance(subj);
  661. }
  662. if (nlpos > 1 &&
  663. peek_at(subj, nlpos - 1) == ' ' &&
  664. peek_at(subj, nlpos - 2) == ' ') {
  665. return make_linebreak();
  666. } else {
  667. return make_softbreak();
  668. }
  669. }
  670. static int subject_find_special_char(subject *subj)
  671. {
  672. // "\n\\`&_*[]<!"
  673. static const int8_t SPECIAL_CHARS[256] = {
  674. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
  675. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  676. 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
  677. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
  678. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  679. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
  680. 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  681. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  682. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  683. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  684. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  685. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  686. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  687. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  688. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  689. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  690. int n = subj->pos + 1;
  691. while (n < subj->input.len) {
  692. if (SPECIAL_CHARS[subj->input.data[n]])
  693. return n;
  694. n++;
  695. }
  696. return subj->input.len;
  697. }
  698. // Parse an inline, advancing subject, and add it as a child of parent.
  699. // Return 0 if no inline can be parsed, 1 otherwise.
  700. static int parse_inline(subject* subj, cmark_node * parent)
  701. {
  702. cmark_node* new_inl = NULL;
  703. chunk contents;
  704. unsigned char c;
  705. int endpos;
  706. c = peek_char(subj);
  707. if (c == 0) {
  708. return 0;
  709. }
  710. switch(c){
  711. case '\n':
  712. new_inl = handle_newline(subj);
  713. break;
  714. case '`':
  715. new_inl = handle_backticks(subj);
  716. break;
  717. case '\\':
  718. new_inl = handle_backslash(subj);
  719. break;
  720. case '&':
  721. new_inl = handle_entity(subj);
  722. break;
  723. case '<':
  724. new_inl = handle_pointy_brace(subj);
  725. break;
  726. case '*':
  727. case '_':
  728. new_inl = handle_strong_emph(subj, c);
  729. break;
  730. case '[':
  731. advance(subj);
  732. new_inl = make_str(chunk_literal("["));
  733. subj->delimiters = push_delimiter(subj, 1, '[', true, false, new_inl);
  734. break;
  735. case ']':
  736. new_inl = handle_close_bracket(subj, parent);
  737. break;
  738. case '!':
  739. advance(subj);
  740. if (peek_char(subj) == '[') {
  741. advance(subj);
  742. new_inl = make_str(chunk_literal("!["));
  743. subj->delimiters = push_delimiter(subj, 1, '!', false, true, new_inl);
  744. } else {
  745. new_inl = make_str(chunk_literal("!"));
  746. }
  747. break;
  748. default:
  749. endpos = subject_find_special_char(subj);
  750. contents = chunk_dup(&subj->input, subj->pos, endpos - subj->pos);
  751. subj->pos = endpos;
  752. // if we're at a newline, strip trailing spaces.
  753. if (peek_char(subj) == '\n') {
  754. chunk_rtrim(&contents);
  755. }
  756. new_inl = make_str(contents);
  757. }
  758. if (new_inl != NULL) {
  759. cmark_node_append_child(parent, new_inl);
  760. }
  761. return 1;
  762. }
  763. // Parse inlines from parent's string_content, adding as children of parent.
  764. extern void cmark_parse_inlines(cmark_node* parent, cmark_reference_map *refmap)
  765. {
  766. subject subj;
  767. subject_from_buf(&subj, &parent->string_content, refmap);
  768. while (!is_eof(&subj) && parse_inline(&subj, parent)) ;
  769. process_emphasis(&subj, NULL);
  770. }
  771. // Parse zero or more space characters, including at most one newline.
  772. static void spnl(subject* subj)
  773. {
  774. bool seen_newline = false;
  775. while (peek_char(subj) == ' ' ||
  776. (!seen_newline &&
  777. (seen_newline = peek_char(subj) == '\n'))) {
  778. advance(subj);
  779. }
  780. }
  781. // Parse reference. Assumes string begins with '[' character.
  782. // Modify refmap if a reference is encountered.
  783. // Return 0 if no reference found, otherwise position of subject
  784. // after reference is parsed.
  785. int parse_reference_inline(strbuf *input, reference_map *refmap)
  786. {
  787. subject subj;
  788. chunk lab;
  789. chunk url;
  790. chunk title;
  791. int matchlen = 0;
  792. int beforetitle;
  793. subject_from_buf(&subj, input, NULL);
  794. // parse label:
  795. if (!link_label(&subj, &lab))
  796. return 0;
  797. // colon:
  798. if (peek_char(&subj) == ':') {
  799. advance(&subj);
  800. } else {
  801. return 0;
  802. }
  803. // parse link url:
  804. spnl(&subj);
  805. matchlen = scan_link_url(&subj.input, subj.pos);
  806. if (matchlen) {
  807. url = chunk_dup(&subj.input, subj.pos, matchlen);
  808. subj.pos += matchlen;
  809. } else {
  810. return 0;
  811. }
  812. // parse optional link_title
  813. beforetitle = subj.pos;
  814. spnl(&subj);
  815. matchlen = scan_link_title(&subj.input, subj.pos);
  816. if (matchlen) {
  817. title = chunk_dup(&subj.input, subj.pos, matchlen);
  818. subj.pos += matchlen;
  819. } else {
  820. subj.pos = beforetitle;
  821. title = chunk_literal("");
  822. }
  823. // parse final spaces and newline:
  824. while (peek_char(&subj) == ' ') {
  825. advance(&subj);
  826. }
  827. if (peek_char(&subj) == '\n') {
  828. advance(&subj);
  829. } else if (peek_char(&subj) != 0) {
  830. return 0;
  831. }
  832. // insert reference into refmap
  833. reference_create(refmap, &lab, &url, &title);
  834. return subj.pos;
  835. }