aboutsummaryrefslogtreecommitdiff
path: root/src/scanners.re
blob: ca25c162d0bbf4b58b535ed250ff8862d36cf8c1 (plain)
  1. #include <stdlib.h>
  2. #include "chunk.h"
  3. #include "scanners.h"
  4. int _scan_at(int (*scanner)(const unsigned char *), cmark_chunk *c, int offset)
  5. {
  6. int res;
  7. unsigned char *ptr = (unsigned char *)c->data;
  8. unsigned char lim = ptr[c->len];
  9. ptr[c->len] = '\0';
  10. res = scanner(ptr + offset);
  11. ptr[c->len] = lim;
  12. return res;
  13. }
  14. /*!re2c
  15. re2c:define:YYCTYPE = "unsigned char";
  16. re2c:define:YYCURSOR = p;
  17. re2c:define:YYMARKER = marker;
  18. re2c:define:YYCTXMARKER = marker;
  19. re2c:yyfill:enable = 0;
  20. wordchar = [^\x00-\x20];
  21. spacechar = [ \t\n];
  22. reg_char = [^\\()\x00-\x20];
  23. escaped_char = [\\][!"#$%&'()*+,./:;<=>?@[\\\]^_`{|}~-];
  24. tagname = [A-Za-z][A-Za-z0-9]*;
  25. blocktagname = 'article'|'header'|'aside'|'hgroup'|'iframe'|'blockquote'|'hr'|'body'|'li'|'map'|'button'|'object'|'canvas'|'ol'|'caption'|'output'|'col'|'p'|'colgroup'|'pre'|'dd'|'progress'|'div'|'section'|'dl'|'table'|'td'|'dt'|'tbody'|'embed'|'textarea'|'fieldset'|'tfoot'|'figcaption'|'th'|'figure'|'thead'|'footer'|'footer'|'tr'|'form'|'ul'|'h1'|'h2'|'h3'|'h4'|'h5'|'h6'|'video'|'script'|'style';
  26. attributename = [a-zA-Z_:][a-zA-Z0-9:._-]*;
  27. unquotedvalue = [^\"'=<>`\x00]+;
  28. singlequotedvalue = ['][^'\x00]*['];
  29. doublequotedvalue = [\"][^\"\x00]*[\"];
  30. attributevalue = unquotedvalue | singlequotedvalue | doublequotedvalue;
  31. attributevaluespec = spacechar* [=] spacechar* attributevalue;
  32. attribute = spacechar+ attributename attributevaluespec?;
  33. opentag = tagname attribute* spacechar* [/]? [>];
  34. closetag = [/] tagname spacechar* [>];
  35. htmlcomment = "!---->" | ("!--" ([-]? [^\x00>-]) ([-]? [^\x00-])* "-->");
  36. processinginstruction = "?" ([^?>\x00]+ | [?][^>\x00] | [>])* "?>";
  37. declaration = "!" [A-Z]+ spacechar+ [^>\x00]* ">";
  38. cdata = "![CDATA[" ([^\]\x00]+ | "]" [^\]\x00] | "]]" [^>\x00])* "]]>";
  39. htmltag = opentag | closetag | htmlcomment | processinginstruction |
  40. declaration | cdata;
  41. in_parens_nosp = [(] (reg_char|escaped_char)* [)];
  42. in_double_quotes = ["] (escaped_char|[^"\x00])* ["];
  43. in_single_quotes = ['] (escaped_char|[^'\x00])* ['];
  44. in_parens = [(] (escaped_char|[^)\x00])* [)];
  45. scheme = 'coap'|'doi'|'javascript'|'aaa'|'aaas'|'about'|'acap'|'cap'|'cid'|'crid'|'data'|'dav'|'dict'|'dns'|'file'|'ftp'|'geo'|'go'|'gopher'|'h323'|'http'|'https'|'iax'|'icap'|'im'|'imap'|'info'|'ipp'|'iris'|'iris.beep'|'iris.xpc'|'iris.xpcs'|'iris.lwz'|'ldap'|'mailto'|'mid'|'msrp'|'msrps'|'mtqp'|'mupdate'|'news'|'nfs'|'ni'|'nih'|'nntp'|'opaquelocktoken'|'pop'|'pres'|'rtsp'|'service'|'session'|'shttp'|'sieve'|'sip'|'sips'|'sms'|'snmp'|'soap.beep'|'soap.beeps'|'tag'|'tel'|'telnet'|'tftp'|'thismessage'|'tn3270'|'tip'|'tv'|'urn'|'vemmi'|'ws'|'wss'|'xcon'|'xcon-userid'|'xmlrpc.beep'|'xmlrpc.beeps'|'xmpp'|'z39.50r'|'z39.50s'|'adiumxtra'|'afp'|'afs'|'aim'|'apt'|'attachment'|'aw'|'beshare'|'bitcoin'|'bolo'|'callto'|'chrome'|'chrome-extension'|'com-eventbrite-attendee'|'content'|'cvs'|'dlna-playsingle'|'dlna-playcontainer'|'dtn'|'dvb'|'ed2k'|'facetime'|'feed'|'finger'|'fish'|'gg'|'git'|'gizmoproject'|'gtalk'|'hcp'|'icon'|'ipn'|'irc'|'irc6'|'ircs'|'itms'|'jar'|'jms'|'keyparc'|'lastfm'|'ldaps'|'magnet'|'maps'|'market'|'message'|'mms'|'ms-help'|'msnim'|'mumble'|'mvn'|'notes'|'oid'|'palm'|'paparazzi'|'platform'|'proxy'|'psyc'|'query'|'res'|'resource'|'rmi'|'rsync'|'rtmp'|'secondlife'|'sftp'|'sgn'|'skype'|'smb'|'soldat'|'spotify'|'ssh'|'steam'|'svn'|'teamspeak'|'things'|'udp'|'unreal'|'ut2004'|'ventrilo'|'view-source'|'webcal'|'wtai'|'wyciwyg'|'xfire'|'xri'|'ymsgr';
  46. */
  47. // Try to match URI autolink after first <, returning number of chars matched.
  48. int _scan_autolink_uri(const unsigned char *p)
  49. {
  50. const unsigned char *marker = NULL;
  51. const unsigned char *start = p;
  52. /*!re2c
  53. scheme [:]([^\x00-\x20<>\\]|escaped_char)*[>] { return (p - start); }
  54. .? { return 0; }
  55. */
  56. }
  57. // Try to match email autolink after first <, returning num of chars matched.
  58. int _scan_autolink_email(const unsigned char *p)
  59. {
  60. const unsigned char *marker = NULL;
  61. const unsigned char *start = p;
  62. /*!re2c
  63. [a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+
  64. [@]
  65. [a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
  66. ([.][a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*
  67. [>] { return (p - start); }
  68. .? { return 0; }
  69. */
  70. }
  71. // Try to match an HTML tag after first <, returning num of chars matched.
  72. int _scan_html_tag(const unsigned char *p)
  73. {
  74. const unsigned char *marker = NULL;
  75. const unsigned char *start = p;
  76. /*!re2c
  77. htmltag { return (p - start); }
  78. .? { return 0; }
  79. */
  80. }
  81. // Try to match an HTML block tag including first <,
  82. // returning num of chars matched.
  83. int _scan_html_block_tag(const unsigned char *p)
  84. {
  85. const unsigned char *marker = NULL;
  86. const unsigned char *start = p;
  87. /*!re2c
  88. [<] [/] blocktagname (spacechar | [>]) { return (p - start); }
  89. [<] blocktagname (spacechar | [/>]) { return (p - start); }
  90. [<] [!?] { return (p - start); }
  91. .? { return 0; }
  92. */
  93. }
  94. // Try to match a URL in a link or reference, return number of chars matched.
  95. // This may optionally be contained in <..>; otherwise
  96. // whitespace and unbalanced right parentheses aren't allowed.
  97. // Newlines aren't ever allowed.
  98. int _scan_link_url(const unsigned char *p)
  99. {
  100. const unsigned char *marker = NULL;
  101. const unsigned char *start = p;
  102. /*!re2c
  103. [ \n]* [<] ([^<>\n\\\x00] | escaped_char | [\\])* [>] { return (p - start); }
  104. [ \n]* (reg_char+ | escaped_char | in_parens_nosp)* { return (p - start); }
  105. .? { return 0; }
  106. */
  107. }
  108. // Try to match a link title (in single quotes, in double quotes, or
  109. // in parentheses), returning number of chars matched. Allow one
  110. // level of internal nesting (quotes within quotes).
  111. int _scan_link_title(const unsigned char *p)
  112. {
  113. const unsigned char *marker = NULL;
  114. const unsigned char *start = p;
  115. /*!re2c
  116. ["] (escaped_char|[^"\x00])* ["] { return (p - start); }
  117. ['] (escaped_char|[^'\x00])* ['] { return (p - start); }
  118. [(] (escaped_char|[^)\x00])* [)] { return (p - start); }
  119. .? { return 0; }
  120. */
  121. }
  122. // Match space characters, including newlines.
  123. int _scan_spacechars(const unsigned char *p)
  124. {
  125. const unsigned char *start = p; \
  126. /*!re2c
  127. [ \t\n]* { return (p - start); }
  128. . { return 0; }
  129. */
  130. }
  131. // Match ATX header start.
  132. int _scan_atx_header_start(const unsigned char *p)
  133. {
  134. const unsigned char *marker = NULL;
  135. const unsigned char *start = p;
  136. /*!re2c
  137. [#]{1,6} ([ ]+|[\n]) { return (p - start); }
  138. .? { return 0; }
  139. */
  140. }
  141. // Match sexext header line. Return 1 for level-1 header,
  142. // 2 for level-2, 0 for no match.
  143. int _scan_setext_header_line(const unsigned char *p)
  144. {
  145. const unsigned char *marker = NULL;
  146. /*!re2c
  147. [=]+ [ ]* [\n] { return 1; }
  148. [-]+ [ ]* [\n] { return 2; }
  149. .? { return 0; }
  150. */
  151. }
  152. // Scan a horizontal rule line: "...three or more hyphens, asterisks,
  153. // or underscores on a line by themselves. If you wish, you may use
  154. // spaces between the hyphens or asterisks."
  155. int _scan_hrule(const unsigned char *p)
  156. {
  157. const unsigned char *marker = NULL;
  158. const unsigned char *start = p;
  159. /*!re2c
  160. ([*][ ]*){3,} [ \t]* [\n] { return (p - start); }
  161. ([_][ ]*){3,} [ \t]* [\n] { return (p - start); }
  162. ([-][ ]*){3,} [ \t]* [\n] { return (p - start); }
  163. .? { return 0; }
  164. */
  165. }
  166. // Scan an opening code fence.
  167. int _scan_open_code_fence(const unsigned char *p)
  168. {
  169. const unsigned char *marker = NULL;
  170. const unsigned char *start = p;
  171. /*!re2c
  172. [`]{3,} / [^`\n\x00]*[\n] { return (p - start); }
  173. [~]{3,} / [^~\n\x00]*[\n] { return (p - start); }
  174. .? { return 0; }
  175. */
  176. }
  177. // Scan a closing code fence with length at least len.
  178. int _scan_close_code_fence(const unsigned char *p)
  179. {
  180. const unsigned char *marker = NULL;
  181. const unsigned char *start = p;
  182. /*!re2c
  183. ([`]{3,} | [~]{3,}) / spacechar* [\n] { return (p - start); }
  184. .? { return 0; }
  185. */
  186. }
  187. // Scans an entity.
  188. // Returns number of chars matched.
  189. int _scan_entity(const unsigned char *p)
  190. {
  191. const unsigned char *marker = NULL;
  192. const unsigned char *start = p;
  193. /*!re2c
  194. [&] ([#] ([Xx][A-Fa-f0-9]{1,8}|[0-9]{1,8}) |[A-Za-z][A-Za-z0-9]{1,31} ) [;]
  195. { return (p - start); }
  196. .? { return 0; }
  197. */
  198. }