aboutsummaryrefslogtreecommitdiff
path: root/test/spec_tests.py
blob: 22af15fc80043b2d24a38a57a9992e65c4052a2c (plain)
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. import sys
  4. from difflib import unified_diff
  5. import argparse
  6. from HTMLParser import HTMLParser, HTMLParseError
  7. from htmlentitydefs import name2codepoint
  8. import re
  9. import cgi
  10. import json
  11. from cmark import CMark
if __name__ == "__main__":
    # Command-line interface.  Arguments are parsed up front so that the
    # CMark wrapper can be constructed before any tests run.
    parser = argparse.ArgumentParser(description='Run cmark tests.')
    parser.add_argument('--program', dest='program', nargs='?', default=None,
            help='program to test')
    parser.add_argument('--spec', dest='spec', nargs='?', default='spec.txt',
            help='path to spec')
    parser.add_argument('--pattern', dest='pattern', nargs='?',
            default=None, help='limit to sections matching regex pattern')
    parser.add_argument('--library-dir', dest='library_dir', nargs='?',
            default=None, help='directory containing dynamic library')
    parser.add_argument('--no-normalize', dest='normalize',
            action='store_const', const=False, default=True,
            help='do not normalize HTML')
    parser.add_argument('--dump-tests', dest='dump_tests',
            action='store_const', const=True, default=False,
            help='dump tests in JSON format')
    parser.add_argument('--debug-normalization', dest='debug_normalization',
            action='store_const', const=True,
            default=False, help='filter stdin through normalizer for testing')
    args = parser.parse_args(sys.argv[1:])
    # --dump-tests only extracts examples from the spec, so the cmark
    # binary/library is not needed (and may not exist) in that mode.
    if not args.dump_tests:
        cmark = CMark(prog=args.program, library_dir=args.library_dir)
  34. # Normalization code, adapted from
  35. # https://github.com/karlcow/markdown-testsuite/
  36. significant_attrs = ["alt", "href", "src", "title"]
  37. whitespace_re = re.compile('/s+/')
  38. class MyHTMLParser(HTMLParser):
  39. def __init__(self):
  40. HTMLParser.__init__(self)
  41. self.last = "starttag"
  42. self.in_pre = False
  43. self.output = u""
  44. self.last_tag = ""
  45. def handle_data(self, data):
  46. after_tag = self.last == "endtag" or self.last == "starttag"
  47. after_block_tag = after_tag and self.is_block_tag(self.last_tag)
  48. if after_tag and self.last_tag == "br":
  49. data = data.lstrip('\n')
  50. data = whitespace_re.sub(' ', data)
  51. if after_block_tag and not self.in_pre:
  52. if self.last == "starttag":
  53. data = data.lstrip()
  54. elif self.last == "endtag":
  55. data = data.strip()
  56. self.output += data
  57. self.last = "data"
  58. def handle_endtag(self, tag):
  59. if tag == "pre":
  60. self.in_pre = False
  61. if self.is_block_tag(tag):
  62. self.output = self.output.rstrip()
  63. self.output += "</" + tag + ">"
  64. self.last_tag = tag
  65. self.last = "endtag"
  66. def handle_starttag(self, tag, attrs):
  67. if tag == "pre":
  68. self.in_pre = True
  69. self.output += "<" + tag
  70. # For now we don't strip out 'extra' attributes, because of
  71. # raw HTML test cases.
  72. # attrs = filter(lambda attr: attr[0] in significant_attrs, attrs)
  73. if attrs:
  74. attrs.sort()
  75. for (k,v) in attrs:
  76. self.output += " " + k
  77. if v != None:
  78. self.output += ("=" + '"' + cgi.escape(v,quote=True) + '"')
  79. self.output += ">"
  80. self.last_tag = tag
  81. self.last = "starttag"
  82. def handle_startendtag(self, tag, attrs):
  83. """Ignore closing tag for self-closing """
  84. self.handle_starttag(tag, attrs)
  85. self.last_tag = tag
  86. self.last = "endtag"
  87. def handle_comment(self, data):
  88. self.output += '<!--' + data + '-->'
  89. self.last = "comment"
  90. def handle_decl(self, data):
  91. self.output += '<!' + data + '>'
  92. self.last = "decl"
  93. def unknown_decl(self, data):
  94. self.output += '<!' + data + '>'
  95. self.last = "decl"
  96. def handle_pi(self,data):
  97. self.output += '<?' + data + '>'
  98. self.last = "pi"
  99. def handle_entityref(self, name):
  100. try:
  101. c = unichr(name2codepoint[name])
  102. except KeyError:
  103. c = None
  104. self.output_char(c, '&' + name + ';')
  105. self.last = "ref"
  106. def handle_charref(self, name):
  107. try:
  108. if name.startswith("x"):
  109. c = unichr(int(name[1:], 16))
  110. else:
  111. c = unichr(int(name))
  112. except ValueError:
  113. c = None
  114. self.output_char(c, '&' + name + ';')
  115. self.last = "ref"
  116. # Helpers.
  117. def output_char(self, c, fallback):
  118. if c == u'<':
  119. self.output += "&lt;"
  120. elif c == u'>':
  121. self.output += "&gt;"
  122. elif c == u'&':
  123. self.output += "&amp;"
  124. elif c == u'"':
  125. self.output += "&quot;"
  126. elif c == None:
  127. self.output += fallback
  128. else:
  129. self.output += c
  130. def is_block_tag(self,tag):
  131. return (tag in ['article', 'header', 'aside', 'hgroup', 'blockquote',
  132. 'hr', 'iframe', 'body', 'li', 'map', 'button', 'object', 'canvas',
  133. 'ol', 'caption', 'output', 'col', 'p', 'colgroup', 'pre', 'dd',
  134. 'progress', 'div', 'section', 'dl', 'table', 'td', 'dt',
  135. 'tbody', 'embed', 'textarea', 'fieldset', 'tfoot', 'figcaption',
  136. 'th', 'figure', 'thead', 'footer', 'tr', 'form', 'ul',
  137. 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'video', 'script', 'style'])
  138. def normalize_html(html):
  139. r"""
  140. Return normalized form of HTML which ignores insignificant output
  141. differences:
  142. * Multiple inner whitespaces are collapsed to a single space (except
  143. in pre tags).
  144. * Outer whitespace (outside block-level tags) is removed.
  145. * Self-closing tags are converted to open tags.
  146. * Attributes are sorted and lowercased.
  147. * References are converted to unicode, except that '<', '>', '&', and
  148. '&' are rendered using entities.
  149. """
  150. html_chunk_re = re.compile("(\<!\[CDATA\[.*?\]\]\>|\<[^>]*\>|[^<]+)")
  151. try:
  152. parser = MyHTMLParser()
  153. # We work around HTMLParser's limitations parsing CDATA
  154. # by breaking the input into chunks and passing CDATA chunks
  155. # through verbatim.
  156. for chunk in re.finditer(html_chunk_re, html):
  157. if chunk.group(0)[:8] == "<![CDATA":
  158. parser.output += chunk.group(0)
  159. else:
  160. parser.feed(chunk.group(0).decode(encoding='UTF-8'))
  161. parser.close()
  162. return parser.output
  163. except HTMLParseError as e:
  164. sys.stderr.write("Normalization error: " + e.msg + "\n")
  165. return html # on error, return unnormalized HTML
  166. def print_test_header(headertext, example_number, start_line, end_line):
  167. print "Example %d (lines %d-%d) %s" % (example_number,start_line,end_line,headertext)
def do_test(markdown_lines, expected_html_lines, headertext,
            example_number, start_line, end_line, normalize):
    # Run a single spec example through cmark and compare its HTML output
    # against the expected HTML.  Returns 'pass', 'fail', or 'error';
    # failures print the input and a unified diff to stdout.
    # The spec marks tabs with the visible arrow character; restore real
    # tabs before feeding the text to cmark.
    real_markdown_text = ''.join(markdown_lines).replace('→','\t')
    [retcode, actual_html, err] = cmark.to_html(real_markdown_text)
    if retcode == 0:
        actual_html_lines = actual_html.splitlines(True)
        expected_html = ''.join(expected_html_lines)
        if normalize:
            # Compare normalized forms so insignificant differences
            # (whitespace, attribute order, entities) don't count.
            passed = normalize_html(actual_html) == normalize_html(expected_html)
        else:
            passed = actual_html == expected_html
        if passed:
            return 'pass'
        else:
            print_test_header(headertext, example_number,start_line,end_line)
            sys.stdout.write(real_markdown_text)
            for diffline in unified_diff(expected_html_lines, actual_html_lines,
                            "expected HTML", "actual HTML"):
                sys.stdout.write(diffline)
            sys.stdout.write('\n')
            return 'fail'
    else:
        # cmark itself failed: report its exit code and captured stderr.
        print_test_header(headertext, example_number, start_line, end_line)
        print "program returned error code %d" % retcode
        print(err)
        return 'error'
def do_tests(specfile, pattern, normalize, dump_tests):
    # Scan the spec file for examples and either run each one through
    # cmark (returning True when all pass) or, when dump_tests is set,
    # collect them and print them as JSON.
    # Examples are delimited by lines consisting of a single '.':
    # one before the markdown, one between markdown and HTML, one after.
    line_number = 0
    start_line = 0
    end_line = 0
    example_number = 0
    passed = 0
    failed = 0
    errored = 0
    markdown_lines = []
    html_lines = []
    active = True  # False while inside a section excluded by pattern
    state = 0  # 0 regular text, 1 markdown example, 2 html output
    headertext = ''
    tests_json = []
    header_re = re.compile('#+ ')
    if pattern:
        pattern_re = re.compile(pattern, re.IGNORECASE)
    with open(specfile, 'r') as specf:
        for line in specf:
            line_number = line_number + 1
            if state == 0 and re.match(header_re, line):
                # ATX heading: remember the section title and decide
                # whether the following examples are in scope.
                headertext = header_re.sub('', line).strip()
                if pattern:
                    if re.search(pattern_re, line):
                        active = True
                    else:
                        active = False
            if line.strip() == ".":
                # Advance text -> markdown -> html -> text.
                state = (state + 1) % 3
                if state == 0:
                    # An example just closed: run or record it.
                    example_number = example_number + 1
                    end_line = line_number
                    if active:
                        if dump_tests:
                            tests_json.append({
                                 "markdown":''.join(markdown_lines).replace('→',"\t"),
                                 "html":''.join(html_lines),
                                 "example": example_number,
                                 "start_line": start_line,
                                 "end_line": end_line,
                                 "section": headertext})
                        else:
                            result = do_test(markdown_lines, html_lines,
                                             headertext, example_number,
                                             start_line, end_line, normalize)
                            if result == 'pass':
                                passed = passed + 1
                            elif result == 'fail':
                                failed = failed + 1
                            else:
                                errored = errored + 1
                    start_line = 0
                    markdown_lines = []
                    html_lines = []
            elif state == 1:
                if start_line == 0:
                    # The opening '.' is part of the example's extent.
                    start_line = line_number - 1
                markdown_lines.append(line)
            elif state == 2:
                html_lines.append(line)
    if dump_tests:
        print json.dumps(tests_json, ensure_ascii=False, indent=2)
        return True
    else:
        print "%d passed, %d failed, %d errored" % (passed, failed, errored)
        return (failed == 0 and errored == 0)
if __name__ == "__main__":
    if args.debug_normalization:
        # Debug mode: just filter stdin through the normalizer.
        print normalize_html(sys.stdin.read())
    elif do_tests(args.spec, args.pattern, args.normalize, args.dump_tests):
        exit(0)
    else:
        # Non-zero exit when any example failed or errored.
        exit(1)