aboutsummaryrefslogtreecommitdiff
path: root/src/inlines.c
blob: ef27a240a1003abb4f6937dfb260277ae21b92f4 (plain)
  1. #include <stdlib.h>
  2. #include <stdio.h>
  3. #include <stdbool.h>
  4. #include <ctype.h>
  5. #include <string.h>
  6. #include "stmd.h"
  7. #include "uthash.h"
  8. #include "scanners.h"
  9. typedef struct Subject {
  10. chunk input;
  11. int pos;
  12. int label_nestlevel;
  13. reference** reference_map;
  14. } subject;
  15. reference* lookup_reference(reference** refmap, chunk *label);
  16. reference* make_reference(chunk *label, chunk *url, chunk *title);
  17. static unsigned char *clean_url(chunk *url);
  18. static unsigned char *clean_title(chunk *title);
  19. inline static unsigned char *chunk_to_cstr(chunk *c);
  20. inline static void chunk_free(chunk *c);
  21. inline static void chunk_trim(chunk *c);
  22. inline static chunk chunk_literal(const char *data);
  23. inline static chunk chunk_buf_detach(gh_buf *buf);
  24. inline static chunk chunk_dup(const chunk *ch, int pos, int len);
  25. static inl *parse_chunk_inlines(chunk *chunk, reference** refmap);
  26. static inl *parse_inlines_while(subject* subj, int (*f)(subject*));
  27. static int parse_inline(subject* subj, inl ** last);
  28. static void subject_from_chunk(subject *e, chunk *chunk, reference** refmap);
  29. static void subject_from_buf(subject *e, gh_buf *buffer, reference** refmap);
  30. static int subject_find_special_char(subject *subj);
  31. extern void free_reference(reference *ref) {
  32. free(ref->label);
  33. free(ref->url);
  34. free(ref->title);
  35. free(ref);
  36. }
  37. extern void free_reference_map(reference **refmap) {
  38. /* free the hash table contents */
  39. reference *s;
  40. reference *tmp;
  41. if (refmap != NULL) {
  42. HASH_ITER(hh, *refmap, s, tmp) {
  43. HASH_DEL(*refmap, s);
  44. free_reference(s);
  45. }
  46. free(refmap);
  47. }
  48. }
  49. // normalize reference: collapse internal whitespace to single space,
  50. // remove leading/trailing whitespace, case fold
  51. static unsigned char *normalize_reference(chunk *ref)
  52. {
  53. gh_buf normalized = GH_BUF_INIT;
  54. int r, w;
  55. utf8proc_case_fold(&normalized, ref->data, ref->len);
  56. gh_buf_trim(&normalized);
  57. for (r = 0, w = 0; r < normalized.size; ++r) {
  58. if (r && gh_buf_at(&normalized, r - 1) == ' ') {
  59. while (gh_buf_at(&normalized, r) == ' ')
  60. r++;
  61. }
  62. normalized.ptr[w++] = normalized.ptr[r];
  63. }
  64. return gh_buf_detach(&normalized);
  65. }
  66. // Returns reference if refmap contains a reference with matching
  67. // label, otherwise NULL.
  68. extern reference* lookup_reference(reference** refmap, chunk *label)
  69. {
  70. reference *ref = NULL;
  71. unsigned char *norm = normalize_reference(label);
  72. if (refmap != NULL) {
  73. HASH_FIND_STR(*refmap, (char*)norm, ref);
  74. }
  75. free(norm);
  76. return ref;
  77. }
  78. extern reference* make_reference(chunk *label, chunk *url, chunk *title)
  79. {
  80. reference *ref;
  81. ref = malloc(sizeof(reference));
  82. ref->label = normalize_reference(label);
  83. ref->url = clean_url(url);
  84. ref->title = clean_title(title);
  85. return ref;
  86. }
  87. extern void add_reference(reference** refmap, reference* ref)
  88. {
  89. reference * t = NULL;
  90. const char *label = (const char *)ref->label;
  91. HASH_FIND(hh, *refmap, label, strlen(label), t);
  92. if (t == NULL) {
  93. HASH_ADD_KEYPTR(hh, *refmap, label, strlen(label), ref);
  94. } else {
  95. free_reference(ref); // we free this now since it won't be in the refmap
  96. }
  97. }
  98. // Create an inline with a linkable string value.
  99. inline static inl* make_linkable(int t, inl* label, chunk url, chunk title)
  100. {
  101. inl* e = (inl*) malloc(sizeof(inl));
  102. e->tag = t;
  103. e->content.linkable.label = label;
  104. e->content.linkable.url = chunk_to_cstr(&url);
  105. e->content.linkable.title = url.len ? chunk_to_cstr(&title) : NULL;
  106. e->next = NULL;
  107. return e;
  108. }
  109. inline static inl* make_inlines(int t, inl* contents)
  110. {
  111. inl* e = (inl*) malloc(sizeof(inl));
  112. e->tag = t;
  113. e->content.inlines = contents;
  114. e->next = NULL;
  115. return e;
  116. }
  117. // Create an inline with a literal string value.
  118. inline static inl* make_literal(int t, chunk s)
  119. {
  120. inl* e = (inl*) malloc(sizeof(inl));
  121. e->tag = t;
  122. e->content.literal = s;
  123. e->next = NULL;
  124. return e;
  125. }
  126. // Create an inline with no value.
  127. inline static inl* make_simple(int t)
  128. {
  129. inl* e = (inl*) malloc(sizeof(inl));
  130. e->tag = t;
  131. e->next = NULL;
  132. return e;
  133. }
  134. // Macros for creating various kinds of inlines.
  135. #define make_str(s) make_literal(INL_STRING, s)
  136. #define make_code(s) make_literal(INL_CODE, s)
  137. #define make_raw_html(s) make_literal(INL_RAW_HTML, s)
  138. #define make_entity(s) make_literal(INL_ENTITY, s)
  139. #define make_linebreak() make_simple(INL_LINEBREAK)
  140. #define make_softbreak() make_simple(INL_SOFTBREAK)
  141. #define make_link(label, url, title) make_linkable(INL_LINK, label, url, title)
  142. #define make_emph(contents) make_inlines(INL_EMPH, contents)
  143. #define make_strong(contents) make_inlines(INL_STRONG, contents)
  144. // Free an inline list.
  145. extern void free_inlines(inl* e)
  146. {
  147. inl * next;
  148. while (e != NULL) {
  149. switch (e->tag){
  150. case INL_STRING:
  151. case INL_RAW_HTML:
  152. case INL_CODE:
  153. case INL_ENTITY:
  154. chunk_free(&e->content.literal);
  155. break;
  156. case INL_LINEBREAK:
  157. case INL_SOFTBREAK:
  158. break;
  159. case INL_LINK:
  160. case INL_IMAGE:
  161. free(e->content.linkable.url);
  162. free(e->content.linkable.title);
  163. free_inlines(e->content.linkable.label);
  164. break;
  165. case INL_EMPH:
  166. case INL_STRONG:
  167. free_inlines(e->content.inlines);
  168. break;
  169. default:
  170. break;
  171. }
  172. next = e->next;
  173. free(e);
  174. e = next;
  175. }
  176. }
  177. // Append inline list b to the end of inline list a.
  178. // Return pointer to head of new list.
  179. inline static inl* append_inlines(inl* a, inl* b)
  180. {
  181. if (a == NULL) { // NULL acts like an empty list
  182. return b;
  183. }
  184. inl* cur = a;
  185. while (cur->next) {
  186. cur = cur->next;
  187. }
  188. cur->next = b;
  189. return a;
  190. }
  191. static void subject_from_buf(subject *e, gh_buf *buffer, reference** refmap)
  192. {
  193. e->input.data = buffer->ptr;
  194. e->input.len = buffer->size;
  195. e->input.alloc = 0;
  196. e->pos = 0;
  197. e->label_nestlevel = 0;
  198. e->reference_map = refmap;
  199. chunk_rtrim(&e->input);
  200. }
  201. static void subject_from_chunk(subject *e, chunk *chunk, reference** refmap)
  202. {
  203. e->input.data = chunk->data;
  204. e->input.len = chunk->len;
  205. e->input.alloc = 0;
  206. e->pos = 0;
  207. e->label_nestlevel = 0;
  208. e->reference_map = refmap;
  209. chunk_rtrim(&e->input);
  210. }
  211. inline static int isbacktick(int c)
  212. {
  213. return (c == '`');
  214. }
  215. static inline unsigned char peek_char(subject *subj)
  216. {
  217. return (subj->pos < subj->input.len) ? subj->input.data[subj->pos] : 0;
  218. }
  219. static inline unsigned char peek_at(subject *subj, int pos)
  220. {
  221. return subj->input.data[pos];
  222. }
  223. // Return true if there are more characters in the subject.
  224. inline static int is_eof(subject* subj)
  225. {
  226. return (subj->pos >= subj->input.len);
  227. }
  228. // Advance the subject. Doesn't check for eof.
  229. #define advance(subj) (subj)->pos += 1
  230. // Take characters while a predicate holds, and return a string.
  231. inline static chunk take_while(subject* subj, int (*f)(int))
  232. {
  233. unsigned char c;
  234. int startpos = subj->pos;
  235. int len = 0;
  236. while ((c = peek_char(subj)) && (*f)(c)) {
  237. advance(subj);
  238. len++;
  239. }
  240. return chunk_dup(&subj->input, startpos, len);
  241. }
  242. // Try to process a backtick code span that began with a
  243. // span of ticks of length openticklength length (already
  244. // parsed). Return 0 if you don't find matching closing
  245. // backticks, otherwise return the position in the subject
  246. // after the closing backticks.
  247. static int scan_to_closing_backticks(subject* subj, int openticklength)
  248. {
  249. // read non backticks
  250. char c;
  251. while ((c = peek_char(subj)) && c != '`') {
  252. advance(subj);
  253. }
  254. if (is_eof(subj)) {
  255. return 0; // did not find closing ticks, return 0
  256. }
  257. int numticks = 0;
  258. while (peek_char(subj) == '`') {
  259. advance(subj);
  260. numticks++;
  261. }
  262. if (numticks != openticklength){
  263. return(scan_to_closing_backticks(subj, openticklength));
  264. }
  265. return (subj->pos);
  266. }
  267. // Destructively modify string, collapsing consecutive
  268. // space and newline characters into a single space.
  269. static void normalize_whitespace(gh_buf *s)
  270. {
  271. /* TODO */
  272. #if 0
  273. bool last_char_was_space = false;
  274. int pos = 0;
  275. char c;
  276. while ((c = gh_buf_at(s, pos))) {
  277. switch (c) {
  278. case ' ':
  279. if (last_char_was_space) {
  280. bdelete(s, pos, 1);
  281. } else {
  282. pos++;
  283. }
  284. last_char_was_space = true;
  285. break;
  286. case '\n':
  287. if (last_char_was_space) {
  288. bdelete(s, pos, 1);
  289. } else {
  290. bdelete(s, pos, 1);
  291. binsertch(s, pos, 1, ' ');
  292. pos++;
  293. }
  294. last_char_was_space = true;
  295. break;
  296. default:
  297. pos++;
  298. last_char_was_space = false;
  299. }
  300. }
  301. #endif
  302. }
  303. // Parse backtick code section or raw backticks, return an inline.
  304. // Assumes that the subject has a backtick at the current position.
  305. static inl* handle_backticks(subject *subj)
  306. {
  307. chunk openticks = take_while(subj, isbacktick);
  308. int startpos = subj->pos;
  309. int endpos = scan_to_closing_backticks(subj, openticks.len);
  310. if (endpos == 0) { // not found
  311. subj->pos = startpos; // rewind
  312. return make_str(openticks);
  313. } else {
  314. gh_buf buf = GH_BUF_INIT;
  315. gh_buf_set(&buf, subj->input.data + startpos, endpos - startpos - openticks.len);
  316. gh_buf_trim(&buf);
  317. normalize_whitespace(&buf);
  318. return make_code(chunk_buf_detach(&buf));
  319. }
  320. }
  321. // Scan ***, **, or * and return number scanned, or 0.
  322. // Don't advance position.
  323. static int scan_delims(subject* subj, char c, bool * can_open, bool * can_close)
  324. {
  325. int numdelims = 0;
  326. char char_before, char_after;
  327. int startpos = subj->pos;
  328. char_before = subj->pos == 0 ? '\n' : peek_at(subj, subj->pos - 1);
  329. while (peek_char(subj) == c) {
  330. numdelims++;
  331. advance(subj);
  332. }
  333. char_after = peek_char(subj);
  334. *can_open = numdelims > 0 && numdelims <= 3 && !isspace(char_after);
  335. *can_close = numdelims > 0 && numdelims <= 3 && !isspace(char_before);
  336. if (c == '_') {
  337. *can_open = *can_open && !isalnum(char_before);
  338. *can_close = *can_close && !isalnum(char_after);
  339. }
  340. subj->pos = startpos;
  341. return numdelims;
  342. }
  343. // Parse strong/emph or a fallback.
  344. // Assumes the subject has '_' or '*' at the current position.
  345. static inl* handle_strong_emph(subject* subj, char c)
  346. {
  347. bool can_open, can_close;
  348. inl * result = NULL;
  349. inl ** last = malloc(sizeof(inl *));
  350. inl * new;
  351. inl * il;
  352. inl * first_head = NULL;
  353. inl * first_close = NULL;
  354. int first_close_delims = 0;
  355. int numdelims;
  356. *last = NULL;
  357. numdelims = scan_delims(subj, c, &can_open, &can_close);
  358. subj->pos += numdelims;
  359. new = make_str(chunk_dup(&subj->input, subj->pos - numdelims, numdelims));
  360. *last = new;
  361. first_head = new;
  362. result = new;
  363. if (!can_open || numdelims == 0) {
  364. goto done;
  365. }
  366. switch (numdelims) {
  367. case 1:
  368. while (true) {
  369. numdelims = scan_delims(subj, c, &can_open, &can_close);
  370. if (numdelims >= 1 && can_close) {
  371. subj->pos += 1;
  372. first_head->tag = INL_EMPH;
  373. chunk_free(&first_head->content.literal);
  374. first_head->content.inlines = first_head->next;
  375. first_head->next = NULL;
  376. goto done;
  377. } else {
  378. if (!parse_inline(subj, last)) {
  379. goto done;
  380. }
  381. }
  382. }
  383. break;
  384. case 2:
  385. while (true) {
  386. numdelims = scan_delims(subj, c, &can_open, &can_close);
  387. if (numdelims >= 2 && can_close) {
  388. subj->pos += 2;
  389. first_head->tag = INL_STRONG;
  390. chunk_free(&first_head->content.literal);
  391. first_head->content.inlines = first_head->next;
  392. first_head->next = NULL;
  393. goto done;
  394. } else {
  395. if (!parse_inline(subj, last)) {
  396. goto done;
  397. }
  398. }
  399. }
  400. break;
  401. case 3:
  402. while (true) {
  403. numdelims = scan_delims(subj, c, &can_open, &can_close);
  404. if (can_close && numdelims >= 1 && numdelims <= 3 &&
  405. numdelims != first_close_delims) {
  406. new = make_str(chunk_dup(&subj->input, subj->pos, numdelims));
  407. append_inlines(*last, new);
  408. *last = new;
  409. if (first_close_delims == 1 && numdelims > 2) {
  410. numdelims = 2;
  411. } else if (first_close_delims == 2) {
  412. numdelims = 1;
  413. } else if (numdelims == 3) {
  414. // If we opened with ***, we interpret it as ** followed by *
  415. // giving us <strong><em>
  416. numdelims = 1;
  417. }
  418. subj->pos += numdelims;
  419. if (first_close) {
  420. first_head->tag = first_close_delims == 1 ? INL_STRONG : INL_EMPH;
  421. chunk_free(&first_head->content.literal);
  422. first_head->content.inlines =
  423. make_inlines(first_close_delims == 1 ? INL_EMPH : INL_STRONG,
  424. first_head->next);
  425. il = first_head->next;
  426. while (il->next && il->next != first_close) {
  427. il = il->next;
  428. }
  429. il->next = NULL;
  430. first_head->content.inlines->next = first_close->next;
  431. il = first_head->content.inlines;
  432. while (il->next && il->next != *last) {
  433. il = il->next;
  434. }
  435. il->next = NULL;
  436. free_inlines(*last);
  437. first_close->next = NULL;
  438. free_inlines(first_close);
  439. first_head->next = NULL;
  440. goto done;
  441. } else {
  442. first_close = *last;
  443. first_close_delims = numdelims;
  444. }
  445. } else {
  446. if (!parse_inline(subj, last)) {
  447. goto done;
  448. }
  449. }
  450. }
  451. break;
  452. default:
  453. goto done;
  454. }
  455. done:
  456. free(last);
  457. return result;
  458. }
  459. // Parse backslash-escape or just a backslash, returning an inline.
  460. static inl* handle_backslash(subject *subj)
  461. {
  462. advance(subj);
  463. unsigned char nextchar = peek_char(subj);
  464. if (ispunct(nextchar)) { // only ascii symbols and newline can be escaped
  465. advance(subj);
  466. return make_str(chunk_dup(&subj->input, subj->pos - 1, 1));
  467. } else if (nextchar == '\n') {
  468. advance(subj);
  469. return make_linebreak();
  470. } else {
  471. return make_str(chunk_literal("\\"));
  472. }
  473. }
  474. // Parse an entity or a regular "&" string.
  475. // Assumes the subject has an '&' character at the current position.
  476. static inl* handle_entity(subject* subj)
  477. {
  478. int match;
  479. inl *result;
  480. match = scan_entity(&subj->input, subj->pos);
  481. if (match) {
  482. result = make_entity(chunk_dup(&subj->input, subj->pos, match));
  483. subj->pos += match;
  484. } else {
  485. advance(subj);
  486. result = make_str(chunk_literal("&"));
  487. }
  488. return result;
  489. }
  490. // Like make_str, but parses entities.
  491. // Returns an inline sequence consisting of str and entity elements.
  492. static inl *make_str_with_entities(chunk *content)
  493. {
  494. inl *result = NULL;
  495. inl *new;
  496. int searchpos;
  497. char c;
  498. subject subj;
  499. subject_from_chunk(&subj, content, NULL);
  500. while ((c = peek_char(&subj))) {
  501. switch (c) {
  502. case '&':
  503. new = handle_entity(&subj);
  504. break;
  505. default:
  506. searchpos = chunk_strchr(&subj.input, '&', subj.pos);
  507. new = make_str(chunk_dup(&subj.input, subj.pos, searchpos - subj.pos));
  508. subj.pos = searchpos;
  509. }
  510. result = append_inlines(result, new);
  511. }
  512. return result;
  513. }
  514. // Destructively unescape a string: remove backslashes before punctuation chars.
  515. extern void unescape_buffer(gh_buf *buf)
  516. {
  517. int r, w;
  518. for (r = 0, w = 0; r < buf->size; ++r) {
  519. if (buf->ptr[r] == '\\' && ispunct(buf->ptr[r + 1]))
  520. continue;
  521. buf->ptr[w++] = buf->ptr[r];
  522. }
  523. gh_buf_truncate(buf, w);
  524. }
  525. // Clean a URL: remove surrounding whitespace and surrounding <>,
  526. // and remove \ that escape punctuation.
  527. static unsigned char *clean_url(chunk *url)
  528. {
  529. gh_buf buf = GH_BUF_INIT;
  530. chunk_trim(url);
  531. if (url->data[0] == '<' && url->data[url->len - 1] == '>') {
  532. gh_buf_set(&buf, url->data + 1, url->len - 2);
  533. } else {
  534. gh_buf_set(&buf, url->data, url->len);
  535. }
  536. unescape_buffer(&buf);
  537. return gh_buf_detach(&buf);
  538. }
  539. // Clean a title: remove surrounding quotes and remove \ that escape punctuation.
  540. static unsigned char *clean_title(chunk *title)
  541. {
  542. gh_buf buf = GH_BUF_INIT;
  543. unsigned char first = title->data[0];
  544. unsigned char last = title->data[title->len - 1];
  545. // remove surrounding quotes if any:
  546. if ((first == '\'' && last == '\'') ||
  547. (first == '(' && last == ')') ||
  548. (first == '"' && last == '"')) {
  549. gh_buf_set(&buf, title->data + 1, title->len - 2);
  550. } else {
  551. gh_buf_set(&buf, title->data, title->len);
  552. }
  553. unescape_buffer(&buf);
  554. return gh_buf_detach(&buf);
  555. }
  556. // Parse an autolink or HTML tag.
  557. // Assumes the subject has a '<' character at the current position.
  558. static inl* handle_pointy_brace(subject* subj)
  559. {
  560. int matchlen = 0;
  561. chunk contents;
  562. advance(subj); // advance past first <
  563. // first try to match a URL autolink
  564. matchlen = scan_autolink_uri(&subj->input, subj->pos);
  565. if (matchlen > 0) {
  566. contents = chunk_dup(&subj->input, subj->pos, matchlen - 1);
  567. subj->pos += matchlen;
  568. return make_link(
  569. make_str_with_entities(&contents),
  570. contents,
  571. chunk_literal("")
  572. );
  573. }
  574. // next try to match an email autolink
  575. matchlen = scan_autolink_email(&subj->input, subj->pos);
  576. if (matchlen > 0) {
  577. gh_buf mail_url = GH_BUF_INIT;
  578. contents = chunk_dup(&subj->input, subj->pos, matchlen - 1);
  579. subj->pos += matchlen;
  580. gh_buf_puts(&mail_url, "mailto:");
  581. gh_buf_put(&mail_url, contents.data, contents.len);
  582. return make_link(
  583. make_str_with_entities(&contents),
  584. chunk_buf_detach(&mail_url),
  585. chunk_literal("")
  586. );
  587. }
  588. // finally, try to match an html tag
  589. matchlen = scan_html_tag(&subj->input, subj->pos);
  590. if (matchlen > 0) {
  591. contents = chunk_dup(&subj->input, subj->pos - 1, matchlen + 1);
  592. subj->pos += matchlen;
  593. return make_raw_html(contents);
  594. }
  595. // if nothing matches, just return the opening <:
  596. return make_str(chunk_literal("<"));
  597. }
  598. // Parse a link label. Returns 1 if successful.
  599. // Unless raw_label is null, it is set to point to the raw contents of the [].
  600. // Assumes the subject has a '[' character at the current position.
  601. // Returns 0 and does not advance if no matching ] is found.
  602. // Note the precedence: code backticks have precedence over label bracket
  603. // markers, which have precedence over *, _, and other inline formatting
  604. // markers. So, 2 below contains a link while 1 does not:
  605. // 1. [a link `with a ](/url)` character
  606. // 2. [a link *with emphasized ](/url) text*
  607. static int link_label(subject* subj, chunk *raw_label)
  608. {
  609. int nestlevel = 0;
  610. inl* tmp = NULL;
  611. int startpos = subj->pos;
  612. if (subj->label_nestlevel) {
  613. // if we've already checked to the end of the subject
  614. // for a label, even with a different starting [, we
  615. // know we won't find one here and we can just return.
  616. // Note: nestlevel 1 would be: [foo [bar]
  617. // nestlevel 2 would be: [foo [bar [baz]
  618. subj->label_nestlevel--;
  619. return 0;
  620. }
  621. advance(subj); // advance past [
  622. char c;
  623. while ((c = peek_char(subj)) && (c != ']' || nestlevel > 0)) {
  624. switch (c) {
  625. case '`':
  626. tmp = handle_backticks(subj);
  627. free_inlines(tmp);
  628. break;
  629. case '<':
  630. tmp = handle_pointy_brace(subj);
  631. free_inlines(tmp);
  632. break;
  633. case '[': // nested []
  634. nestlevel++;
  635. advance(subj);
  636. break;
  637. case ']': // nested []
  638. nestlevel--;
  639. advance(subj);
  640. break;
  641. case '\\':
  642. advance(subj);
  643. if (ispunct(peek_char(subj))) {
  644. advance(subj);
  645. }
  646. break;
  647. default:
  648. advance(subj);
  649. }
  650. }
  651. if (c == ']') {
  652. *raw_label = chunk_dup(&subj->input, startpos + 1, subj->pos - (startpos + 1));
  653. subj->label_nestlevel = 0;
  654. advance(subj); // advance past ]
  655. return 1;
  656. } else {
  657. if (c == 0) {
  658. subj->label_nestlevel = nestlevel;
  659. }
  660. subj->pos = startpos; // rewind
  661. return 0;
  662. }
  663. }
  664. // Parse a link or the link portion of an image, or return a fallback.
  665. static inl* handle_left_bracket(subject* subj)
  666. {
  667. inl *lab = NULL;
  668. inl *result = NULL;
  669. reference *ref;
  670. int n;
  671. int sps;
  672. int found_label;
  673. int endlabel, starturl, endurl, starttitle, endtitle, endall;
  674. chunk rawlabel;
  675. chunk url, title;
  676. found_label = link_label(subj, &rawlabel);
  677. endlabel = subj->pos;
  678. if (found_label) {
  679. if (peek_char(subj) == '(' &&
  680. ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) &&
  681. ((n = scan_link_url(&subj->input, subj->pos + 1 + sps)) > -1)) {
  682. // try to parse an explicit link:
  683. starturl = subj->pos + 1 + sps; // after (
  684. endurl = starturl + n;
  685. starttitle = endurl + scan_spacechars(&subj->input, endurl);
  686. // ensure there are spaces btw url and title
  687. endtitle = (starttitle == endurl) ? starttitle :
  688. starttitle + scan_link_title(&subj->input, starttitle);
  689. endall = endtitle + scan_spacechars(&subj->input, endtitle);
  690. if (peek_at(subj, endall) == ')') {
  691. subj->pos = endall + 1;
  692. url = chunk_dup(&subj->input, starturl, endurl - starturl);
  693. title = chunk_dup(&subj->input, starttitle, endtitle - starttitle);
  694. lab = parse_chunk_inlines(&rawlabel, NULL);
  695. return make_link(lab, url, title);
  696. } else {
  697. // if we get here, we matched a label but didn't get further:
  698. subj->pos = endlabel;
  699. lab = parse_chunk_inlines(&rawlabel, subj->reference_map);
  700. result = append_inlines(make_str(chunk_literal("[")),
  701. append_inlines(lab,
  702. make_str(chunk_literal("]"))));
  703. return result;
  704. }
  705. } else {
  706. chunk rawlabel_tmp;
  707. chunk reflabel;
  708. // Check for reference link.
  709. // First, see if there's another label:
  710. subj->pos = subj->pos + scan_spacechars(&subj->input, endlabel);
  711. reflabel = rawlabel;
  712. // if followed by a nonempty link label, we change reflabel to it:
  713. if (peek_char(subj) == '[' && link_label(subj, &rawlabel_tmp)) {
  714. if (rawlabel_tmp.len > 0)
  715. reflabel = rawlabel_tmp;
  716. } else {
  717. subj->pos = endlabel;
  718. }
  719. // lookup rawlabel in subject->reference_map:
  720. ref = lookup_reference(subj->reference_map, &reflabel);
  721. if (ref != NULL) { // found
  722. lab = parse_chunk_inlines(&rawlabel, NULL);
  723. result = make_link(lab, chunk_literal(ref->url), chunk_literal(ref->title));
  724. } else {
  725. subj->pos = endlabel;
  726. lab = parse_chunk_inlines(&rawlabel, subj->reference_map);
  727. result = append_inlines(make_str(chunk_literal("[")),
  728. append_inlines(lab, make_str(chunk_literal("]"))));
  729. }
  730. return result;
  731. }
  732. }
  733. // If we fall through to here, it means we didn't match a link:
  734. advance(subj); // advance past [
  735. return make_str(chunk_literal("["));
  736. }
  737. // Parse a hard or soft linebreak, returning an inline.
  738. // Assumes the subject has a newline at the current position.
  739. static inl* handle_newline(subject *subj)
  740. {
  741. int nlpos = subj->pos;
  742. // skip over newline
  743. advance(subj);
  744. // skip spaces at beginning of line
  745. while (peek_char(subj) == ' ') {
  746. advance(subj);
  747. }
  748. if (nlpos > 1 &&
  749. peek_at(subj, nlpos - 1) == ' ' &&
  750. peek_at(subj, nlpos - 2) == ' ') {
  751. return make_linebreak();
  752. } else {
  753. return make_softbreak();
  754. }
  755. }
  756. inline static int not_eof(subject* subj)
  757. {
  758. return !is_eof(subj);
  759. }
  760. // Parse inlines while a predicate is satisfied. Return inlines.
  761. extern inl* parse_inlines_while(subject* subj, int (*f)(subject*))
  762. {
  763. inl* result = NULL;
  764. inl** last = &result;
  765. while ((*f)(subj) && parse_inline(subj, last)) {
  766. }
  767. return result;
  768. }
  769. inl *parse_chunk_inlines(chunk *chunk, reference** refmap)
  770. {
  771. subject subj;
  772. subject_from_chunk(&subj, chunk, refmap);
  773. return parse_inlines_while(&subj, not_eof);
  774. }
  775. static int subject_find_special_char(subject *subj)
  776. {
  777. int n = subj->pos + 1;
  778. while (n < subj->input.len) {
  779. if (strchr("\n\\`&_*[]<!", subj->input.data[n]))
  780. return n;
  781. n++;
  782. }
  783. return subj->input.len;
  784. }
  785. // Parse an inline, advancing subject, and add it to last element.
  786. // Adjust tail to point to new last element of list.
  787. // Return 0 if no inline can be parsed, 1 otherwise.
  788. static int parse_inline(subject* subj, inl ** last)
  789. {
  790. inl* new = NULL;
  791. chunk contents;
  792. unsigned char c;
  793. int endpos;
  794. c = peek_char(subj);
  795. if (c == 0) {
  796. return 0;
  797. }
  798. switch(c){
  799. case '\n':
  800. new = handle_newline(subj);
  801. break;
  802. case '`':
  803. new = handle_backticks(subj);
  804. break;
  805. case '\\':
  806. new = handle_backslash(subj);
  807. break;
  808. case '&':
  809. new = handle_entity(subj);
  810. break;
  811. case '<':
  812. new = handle_pointy_brace(subj);
  813. break;
  814. case '_':
  815. if (subj->pos > 0) {
  816. unsigned char prev = peek_at(subj, subj->pos - 1);
  817. if (isalnum(prev) || prev == '_') {
  818. new = make_str(chunk_literal("_"));
  819. advance(subj);
  820. break;
  821. }
  822. }
  823. new = handle_strong_emph(subj, '_');
  824. break;
  825. case '*':
  826. new = handle_strong_emph(subj, '*');
  827. break;
  828. case '[':
  829. new = handle_left_bracket(subj);
  830. break;
  831. case '!':
  832. advance(subj);
  833. if (peek_char(subj) == '[') {
  834. new = handle_left_bracket(subj);
  835. if (new != NULL && new->tag == INL_LINK) {
  836. new->tag = INL_IMAGE;
  837. } else {
  838. new = append_inlines(make_str(chunk_literal("!")), new);
  839. }
  840. } else {
  841. new = make_str(chunk_literal("!"));
  842. }
  843. break;
  844. default:
  845. endpos = subject_find_special_char(subj);
  846. contents = chunk_dup(&subj->input, subj->pos, endpos - subj->pos);
  847. subj->pos = endpos;
  848. // if we're at a newline, strip trailing spaces.
  849. if (peek_char(subj) == '\n') {
  850. chunk_rtrim(&contents);
  851. }
  852. new = make_str(contents);
  853. }
  854. if (*last == NULL) {
  855. *last = new;
  856. } else {
  857. append_inlines(*last, new);
  858. }
  859. return 1;
  860. }
  861. extern inl* parse_inlines(gh_buf *input, reference** refmap)
  862. {
  863. subject subj;
  864. subject_from_buf(&subj, input, refmap);
  865. return parse_inlines_while(&subj, not_eof);
  866. }
  867. // Parse zero or more space characters, including at most one newline.
  868. void spnl(subject* subj)
  869. {
  870. bool seen_newline = false;
  871. while (peek_char(subj) == ' ' ||
  872. (!seen_newline &&
  873. (seen_newline = peek_char(subj) == '\n'))) {
  874. advance(subj);
  875. }
  876. }
  877. // Parse reference. Assumes string begins with '[' character.
  878. // Modify refmap if a reference is encountered.
  879. // Return 0 if no reference found, otherwise position of subject
  880. // after reference is parsed.
  881. extern int parse_reference(gh_buf *input, reference** refmap)
  882. {
  883. subject subj;
  884. chunk lab;
  885. chunk url;
  886. chunk title;
  887. int matchlen = 0;
  888. int beforetitle;
  889. reference *new = NULL;
  890. subject_from_buf(&subj, input, NULL);
  891. // parse label:
  892. if (!link_label(&subj, &lab))
  893. return 0;
  894. // colon:
  895. if (peek_char(&subj) == ':') {
  896. advance(&subj);
  897. } else {
  898. return 0;
  899. }
  900. // parse link url:
  901. spnl(&subj);
  902. matchlen = scan_link_url(&subj.input, subj.pos);
  903. if (matchlen) {
  904. url = chunk_dup(&subj.input, subj.pos, matchlen);
  905. subj.pos += matchlen;
  906. } else {
  907. return 0;
  908. }
  909. // parse optional link_title
  910. beforetitle = subj.pos;
  911. spnl(&subj);
  912. matchlen = scan_link_title(&subj.input, subj.pos);
  913. if (matchlen) {
  914. title = chunk_dup(&subj.input, subj.pos, matchlen);
  915. subj.pos += matchlen;
  916. } else {
  917. subj.pos = beforetitle;
  918. title = chunk_literal("");
  919. }
  920. // parse final spaces and newline:
  921. while (peek_char(&subj) == ' ') {
  922. advance(&subj);
  923. }
  924. if (peek_char(&subj) == '\n') {
  925. advance(&subj);
  926. } else if (peek_char(&subj) != 0) {
  927. return 0;
  928. }
  929. // insert reference into refmap
  930. new = make_reference(&lab, &url, &title);
  931. add_reference(refmap, new);
  932. return subj.pos;
  933. }