aboutsummaryrefslogtreecommitdiff
path: root/src/scanners.re
blob: 305d1ea0294b19a2932fbfc6ad9d3eca537f2105 (plain)
  1. #include "bstrlib.h"
  // Shared re2c configuration and named sub-patterns used by every scanner
  // in this file.
  //
  // Configuration:
  //   - input units are `unsigned char`;
  //   - the cursor (YYCURSOR) is the local variable `p`;
  //   - both YYMARKER and YYCTXMARKER are bound to the local variable `marker`;
  //   - YYFILL is disabled, so no refill hook is configured for the scanners.
  //
  // NOTE(review): YYMARKER and YYCTXMARKER share a single variable. The code
  // fence scanners use trailing context (`/`), which relies on YYCTXMARKER;
  // confirm against the generated code that the two uses cannot clobber each
  // other before changing any rule that has trailing context.
  //
  // Single-quoted literals such as 'div' or 'http' are re2c's case-insensitive
  // string form (see the re2c manual); double-quoted literals are exact.
  //
  // The HTML fragments (opentag, closetag, htmlcomment, ...) describe the text
  // that follows an already-consumed '<'.
  /*!re2c
    re2c:define:YYCTYPE = "unsigned char";
    re2c:define:YYCURSOR = p;
    re2c:define:YYMARKER = marker;
    re2c:define:YYCTXMARKER = marker;
    re2c:yyfill:enable = 0;

    wordchar = [^\x00-\x20];
    spacechar = [ \t\n];
    reg_char = [^\\()\x00-\x20];
    escaped_char = [\\][!"#$%&'()*+,./:;<=>?@[\\\]^_`{|}~-];

    tagname = [A-Za-z][A-Za-z0-9]*;
    blocktagname = 'article'|'header'|'aside'|'hgroup'|'iframe'|'blockquote'|'hr'|'body'|'li'|'map'|'button'|'object'|'canvas'|'ol'|'caption'|'output'|'col'|'p'|'colgroup'|'pre'|'dd'|'progress'|'div'|'section'|'dl'|'table'|'td'|'dt'|'tbody'|'embed'|'textarea'|'fieldset'|'tfoot'|'figcaption'|'th'|'figure'|'thead'|'footer'|'tr'|'form'|'ul'|'h1'|'h2'|'h3'|'h4'|'h5'|'h6'|'video'|'script'|'style';

    attributename = [a-zA-Z_:][a-zA-Z0-9:._-]*;
    unquotedvalue = [^\"'=<>`\x00]+;
    singlequotedvalue = ['][^'\x00]*['];
    doublequotedvalue = [\"][^\"\x00]*[\"];
    attributevalue = unquotedvalue | singlequotedvalue | doublequotedvalue;
    attributevaluespec = spacechar* [=] spacechar* attributevalue;
    attribute = spacechar+ attributename attributevaluespec?;

    opentag = tagname attribute* spacechar* [/]? [>];
    closetag = [/] tagname spacechar* [>];
    htmlcomment = "!--" ([^-\x00]+ | [-][^-\x00]+)* "-->";
    processinginstruction = "?" ([^?>\x00]+ | [?][^>\x00])* "?>";
    declaration = "!" [A-Z]+ spacechar+ [^>\x00]* ">";
    cdata = "![CDATA[" ([^\]\x00]+ | "]" [^\]\x00] | "]]" [^>\x00])* "]]>";
    htmltag = opentag | closetag | htmlcomment | processinginstruction |
              declaration | cdata;

    in_parens_nosp = [(] (reg_char|escaped_char)* [)];
    in_double_quotes = ["] (escaped_char|[^"\x00])* ["];
    in_single_quotes = ['] (escaped_char|[^'\x00])* ['];
    in_parens = [(] (escaped_char|[^)\x00])* [)];

    scheme = 'coap'|'doi'|'javascript'|'aaa'|'aaas'|'about'|'acap'|'cap'|'cid'|'crid'|'data'|'dav'|'dict'|'dns'|'file'|'ftp'|'geo'|'go'|'gopher'|'h323'|'http'|'https'|'iax'|'icap'|'im'|'imap'|'info'|'ipp'|'iris'|'iris.beep'|'iris.xpc'|'iris.xpcs'|'iris.lwz'|'ldap'|'mailto'|'mid'|'msrp'|'msrps'|'mtqp'|'mupdate'|'news'|'nfs'|'ni'|'nih'|'nntp'|'opaquelocktoken'|'pop'|'pres'|'rtsp'|'service'|'session'|'shttp'|'sieve'|'sip'|'sips'|'sms'|'snmp'|'soap.beep'|'soap.beeps'|'tag'|'tel'|'telnet'|'tftp'|'thismessage'|'tn3270'|'tip'|'tv'|'urn'|'vemmi'|'ws'|'wss'|'xcon'|'xcon-userid'|'xmlrpc.beep'|'xmlrpc.beeps'|'xmpp'|'z39.50r'|'z39.50s'|'adiumxtra'|'afp'|'afs'|'aim'|'apt'|'attachment'|'aw'|'beshare'|'bitcoin'|'bolo'|'callto'|'chrome'|'chrome-extension'|'com-eventbrite-attendee'|'content'|'cvs'|'dlna-playsingle'|'dlna-playcontainer'|'dtn'|'dvb'|'ed2k'|'facetime'|'feed'|'finger'|'fish'|'gg'|'git'|'gizmoproject'|'gtalk'|'hcp'|'icon'|'ipn'|'irc'|'irc6'|'ircs'|'itms'|'jar'|'jms'|'keyparc'|'lastfm'|'ldaps'|'magnet'|'maps'|'market'|'message'|'mms'|'ms-help'|'msnim'|'mumble'|'mvn'|'notes'|'oid'|'palm'|'paparazzi'|'platform'|'proxy'|'psyc'|'query'|'res'|'resource'|'rmi'|'rsync'|'rtmp'|'secondlife'|'sftp'|'sgn'|'skype'|'smb'|'soldat'|'spotify'|'ssh'|'steam'|'svn'|'teamspeak'|'things'|'udp'|'unreal'|'ut2004'|'ventrilo'|'view-source'|'webcal'|'wtai'|'wyciwyg'|'xfire'|'xri'|'ymsgr';
  */
  35. // Try to match URI autolink after first <, returning number of chars matched.
  36. extern int scan_autolink_uri(bstring s, int pos)
  37. {
  38. unsigned char * marker = NULL;
  39. unsigned char * p = &(s->data[pos]);
  40. unsigned char * start = p;
  41. /*!re2c
  42. scheme [:]([^\x00-\x20<>\\]|escaped_char)*[>] { return (p - start); }
  43. .? { return 0; }
  44. */
  45. }
  46. // Try to match email autolink after first <, returning num of chars matched.
  47. extern int scan_autolink_email(bstring s, int pos)
  48. {
  49. unsigned char * marker = NULL;
  50. unsigned char * p = &(s->data[pos]);
  51. unsigned char * start = p;
  52. /*!re2c
  53. [a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+
  54. [@]
  55. [a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
  56. ([.][a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*
  57. [>] { return (p - start); }
  58. .? { return 0; }
  59. */
  60. }
  61. // Try to match an HTML tag after first <, returning num of chars matched.
  62. extern int scan_html_tag(bstring s, int pos)
  63. {
  64. unsigned char * marker = NULL;
  65. unsigned char * p = &(s->data[pos]);
  66. unsigned char * start = p;
  67. /*!re2c
  68. htmltag { return (p - start); }
  69. .? { return 0; }
  70. */
  71. }
  72. // Try to match an HTML block tag including first <,
  73. // returning num of chars matched.
  74. extern int scan_html_block_tag(bstring s, int pos)
  75. {
  76. unsigned char * marker = NULL;
  77. unsigned char * p = &(s->data[pos]);
  78. unsigned char * start = p;
  79. /*!re2c
  80. [<] [/] blocktagname (spacechar | [>]) { return (p - start); }
  81. [<] blocktagname (spacechar | [/>]) { return (p - start); }
  82. [<] [!?] { return (p - start); }
  83. .? { return 0; }
  84. */
  85. }
  86. // Try to match a URL in a link or reference, return number of chars matched.
  87. // This may optionally be contained in <..>; otherwise
  88. // whitespace and unbalanced right parentheses aren't allowed.
  89. // Newlines aren't ever allowed.
  90. extern int scan_link_url(bstring s, int pos)
  91. {
  92. unsigned char * marker = NULL;
  93. unsigned char * p = &(s->data[pos]);
  94. unsigned char * start = p;
  95. /*!re2c
  96. [ \n]* [<] ([^<>\n\\\x00] | escaped_char | [\\])* [>] { return (p - start); }
  97. [ \n]* (reg_char+ | escaped_char | in_parens_nosp)* { return (p - start); }
  98. .? { return 0; }
  99. */
  100. }
  101. // Try to match a link title (in single quotes, in double quotes, or
  102. // in parentheses), returning number of chars matched. Allow one
  103. // level of internal nesting (quotes within quotes).
  104. extern int scan_link_title(bstring s, int pos)
  105. {
  106. unsigned char * marker = NULL;
  107. unsigned char * p = &(s->data[pos]);
  108. unsigned char * start = p;
  109. /*!re2c
  110. ["] (escaped_char|[^"\x00])* ["] { return (p - start); }
  111. ['] (escaped_char|[^'\x00])* ['] { return (p - start); }
  112. [(] (escaped_char|[^)\x00])* [)] { return (p - start); }
  113. .? { return 0; }
  114. */
  115. }
  116. // Match space characters, including newlines.
  117. extern int scan_spacechars(bstring s, int pos)
  118. {
  119. unsigned char * p = &(s->data[pos]);
  120. unsigned char * start = p;
  121. /*!re2c
  122. [ \t\n]* { return (p - start); }
  123. . { return 0; }
  124. */
  125. }
  126. // Match ATX header start.
  127. extern int scan_atx_header_start(bstring s, int pos)
  128. {
  129. unsigned char * marker = NULL;
  130. unsigned char * p = &(s->data[pos]);
  131. unsigned char * start = p;
  132. /*!re2c
  133. [#]{1,6} ([ ]+|[\n]) { return (p - start); }
  134. .? { return 0; }
  135. */
  136. }
  137. // Match sexext header line. Return 1 for level-1 header,
  138. // 2 for level-2, 0 for no match.
  139. extern int scan_setext_header_line(bstring s, int pos)
  140. {
  141. unsigned char * marker = NULL;
  142. unsigned char * p = &(s->data[pos]);
  143. /*!re2c
  144. [=]+ [ ]* [\n] { return 1; }
  145. [-]+ [ ]* [\n] { return 2; }
  146. .? { return 0; }
  147. */
  148. }
  149. // Scan a horizontal rule line: "...three or more hyphens, asterisks,
  150. // or underscores on a line by themselves. If you wish, you may use
  151. // spaces between the hyphens or asterisks."
  152. extern int scan_hrule(bstring s, int pos)
  153. {
  154. unsigned char * marker = NULL;
  155. unsigned char * p = &(s->data[pos]);
  156. unsigned char * start = p;
  157. /*!re2c
  158. ([*][ ]*){3,} [ \t]* [\n] { return (p - start); }
  159. ([_][ ]*){3,} [ \t]* [\n] { return (p - start); }
  160. ([-][ ]*){3,} [ \t]* [\n] { return (p - start); }
  161. .? { return 0; }
  162. */
  163. }
  164. // Scan an opening code fence.
  165. extern int scan_open_code_fence(bstring s, int pos)
  166. {
  167. unsigned char * marker = NULL;
  168. unsigned char * p = &(s->data[pos]);
  169. unsigned char * start = p;
  170. /*!re2c
  171. [`]{3,} / [^`\n\x00]*[\n] { return (p - start); }
  172. [~]{3,} / [^~\n\x00]*[\n] { return (p - start); }
  173. .? { return 0; }
  174. */
  175. }
  176. // Scan a closing code fence with length at least len.
  177. extern int scan_close_code_fence(bstring s, int pos, int len)
  178. {
  179. unsigned char * marker = NULL;
  180. unsigned char * p = &(s->data[pos]);
  181. unsigned char * start = p;
  182. /*!re2c
  183. ([`]{3,} | [~]{3,}) / spacechar* [\n]
  184. { if (p - start > len) {
  185. return (p - start);
  186. } else {
  187. return 0;
  188. } }
  189. .? { return 0; }
  190. */
  191. }
  192. // Scans an entity.
  193. // Returns number of chars matched.
  194. extern int scan_entity(bstring s, int pos)
  195. {
  196. unsigned char * marker = NULL;
  197. unsigned char * p = &(s->data[pos]);
  198. unsigned char * start = p;
  199. /*!re2c
  200. [&] ([#] ([Xx][A-Fa-f0-9]{1,8}|[0-9]{1,8}) |[A-Za-z][A-Za-z0-9]{1,31} ) [;]
  201. { return (p - start); }
  202. .? { return 0; }
  203. */
  204. }