# Origin: test/spec_tests.py (retrieved from a cgit web view;
# page-navigation residue removed).
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. import sys
  4. from difflib import unified_diff
  5. import argparse
  6. from HTMLParser import HTMLParser, HTMLParseError
  7. from htmlentitydefs import name2codepoint
  8. import re
  9. import cgi
  10. import json
  11. from cmark import CMark
if __name__ == "__main__":
    # Parse command-line options up front (before the test machinery below
    # is used) so that `--help` and argument errors exit immediately.
    parser = argparse.ArgumentParser(description='Run cmark tests.')
    parser.add_argument('--program', dest='program', nargs='?', default=None,
                        help='program to test')
    parser.add_argument('--spec', dest='spec', nargs='?', default='spec.txt',
                        help='path to spec')
    parser.add_argument('--pattern', dest='pattern', nargs='?',
                        default=None, help='limit to sections matching regex pattern')
    parser.add_argument('--library-dir', dest='library_dir', nargs='?',
                        default=None, help='directory containing dynamic library')
    parser.add_argument('--no-normalize', dest='normalize',
                        action='store_const', const=False, default=True,
                        help='do not normalize HTML')
    parser.add_argument('--dump-tests', dest='dump_tests',
                        action='store_const', const=True, default=False,
                        help='dump tests in JSON format')
    parser.add_argument('--debug-normalization', dest='debug_normalization',
                        action='store_const', const=True,
                        default=False, help='filter stdin through normalizer for testing')
    args = parser.parse_args(sys.argv[1:])
    # Wrapper around the cmark program/library under test (project-local
    # cmark module); used by do_test() below.
    cmark = CMark(prog=args.program, library_dir=args.library_dir)
  33. # Normalization code, adapted from
  34. # https://github.com/karlcow/markdown-testsuite/
  35. significant_attrs = ["alt", "href", "src", "title"]
  36. whitespace_re = re.compile('/s+/')
class MyHTMLParser(HTMLParser):
    """Event-driven HTML normalizer.

    Rebuilds parsed HTML into self.output in a canonical form so that two
    HTML fragments can be compared for semantic equality: whitespace is
    processed via whitespace_re, attributes are sorted, self-closing tags
    become plain start tags, and character/entity references are replaced
    by literal characters where possible (see output_char).
    """
    def __init__(self):
        HTMLParser.__init__(self)
        # Kind of the most recently handled event:
        # "starttag" / "endtag" / "data" / "comment" / "decl" / "pi" / "ref".
        self.last = "starttag"
        # True while inside a <pre> element, where whitespace is significant.
        self.in_pre = False
        # Accumulated normalized HTML (unicode in Python 2).
        self.output = u""
        # Name of the most recently seen start or end tag.
        self.last_tag = ""
    def handle_data(self, data):
        # Text content: strip insignificant whitespace around block tags.
        after_tag = self.last == "endtag" or self.last == "starttag"
        after_block_tag = after_tag and self.is_block_tag(self.last_tag)
        if after_tag and self.last_tag == "br":
            # A newline immediately after <br> is presentational only.
            data = data.lstrip('\n')
        # Replace whatever whitespace_re matches with a single space.
        data = whitespace_re.sub(' ', data)
        if after_block_tag and not self.in_pre:
            if self.last == "starttag":
                data = data.lstrip()
            elif self.last == "endtag":
                data = data.strip()
        self.output += data
        self.last = "data"
    def handle_endtag(self, tag):
        if tag == "pre":
            self.in_pre = False
        if self.is_block_tag(tag):
            # Whitespace just before a block-level close tag is insignificant.
            self.output = self.output.rstrip()
        self.output += "</" + tag + ">"
        self.last_tag = tag
        self.last = "endtag"
    def handle_starttag(self, tag, attrs):
        if tag == "pre":
            self.in_pre = True
        self.output += "<" + tag
        # For now we don't strip out 'extra' attributes, because of
        # raw HTML test cases.
        # attrs = filter(lambda attr: attr[0] in significant_attrs, attrs)
        if attrs:
            # Sort attributes so attribute order never affects comparison.
            attrs.sort()
            for (k,v) in attrs:
                self.output += " " + k
                if v != None:
                    # Re-escape the value so the output stays valid HTML.
                    self.output += ("=" + '"' + cgi.escape(v,quote=True) + '"')
        self.output += ">"
        self.last_tag = tag
        self.last = "starttag"
    def handle_startendtag(self, tag, attrs):
        """Emit a self-closing tag as a plain start tag (drop the implied close)."""
        self.handle_starttag(tag, attrs)
        self.last_tag = tag
        self.last = "endtag"
    def handle_comment(self, data):
        # Comments pass through verbatim.
        self.output += '<!--' + data + '-->'
        self.last = "comment"
    def handle_decl(self, data):
        self.output += '<!' + data + '>'
        self.last = "decl"
    def unknown_decl(self, data):
        self.output += '<!' + data + '>'
        self.last = "decl"
    def handle_pi(self,data):
        # Processing instructions pass through verbatim.
        self.output += '<?' + data + '>'
        self.last = "pi"
    def handle_entityref(self, name):
        # Named entity (e.g. &eacute;): emit the literal character when the
        # name is known, otherwise fall back to the original entity text.
        try:
            c = unichr(name2codepoint[name])
        except KeyError:
            c = None
        self.output_char(c, '&' + name + ';')
        self.last = "ref"
    def handle_charref(self, name):
        # Numeric character reference (&#NNN; or &#xHH;): emit the literal
        # character; malformed numbers fall back to the original text.
        try:
            if name.startswith("x"):
                c = unichr(int(name[1:], 16))
            else:
                c = unichr(int(name))
        except ValueError:
            c = None
        self.output_char(c, '&' + name + ';')
        self.last = "ref"
    # Helpers.
    def output_char(self, c, fallback):
        # HTML-significant characters are kept as entities so the normalized
        # output remains well-formed; c == None means "use fallback verbatim".
        if c == u'<':
            self.output += "&lt;"
        elif c == u'>':
            self.output += "&gt;"
        elif c == u'&':
            self.output += "&amp;"
        elif c == u'"':
            self.output += "&quot;"
        elif c == None:
            self.output += fallback
        else:
            self.output += c
    def is_block_tag(self,tag):
        # Tags treated as block-level for whitespace-stripping purposes.
        return (tag in ['article', 'header', 'aside', 'hgroup', 'blockquote',
            'hr', 'iframe', 'body', 'li', 'map', 'button', 'object', 'canvas',
            'ol', 'caption', 'output', 'col', 'p', 'colgroup', 'pre', 'dd',
            'progress', 'div', 'section', 'dl', 'table', 'td', 'dt',
            'tbody', 'embed', 'textarea', 'fieldset', 'tfoot', 'figcaption',
            'th', 'figure', 'thead', 'footer', 'tr', 'form', 'ul',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'video', 'script', 'style'])
def normalize_html(html):
    r"""
    Return normalized form of HTML which ignores insignificant output
    differences:
    * Multiple inner whitespaces are collapsed to a single space (except
    in pre tags).
    * Outer whitespace (outside block-level tags) is removed.
    * Self-closing tags are converted to open tags.
    * Attributes are sorted and lowercased.
    * References are converted to unicode, except that '<', '>', '&', and
    '"' are rendered using entities.

    On a parse error the input is returned unchanged (best-effort).
    """
    # One chunk is either a whole CDATA section, a single tag, or a run of
    # text between tags.
    html_chunk_re = re.compile("(\<!\[CDATA\[.*?\]\]\>|\<[^>]*\>|[^<]+)")
    try:
        parser = MyHTMLParser()
        # We work around HTMLParser's limitations parsing CDATA
        # by breaking the input into chunks and passing CDATA chunks
        # through verbatim.
        for chunk in re.finditer(html_chunk_re, html):
            if chunk.group(0)[:8] == "<![CDATA":
                # Append directly to the parser's buffer, bypassing parsing.
                parser.output += chunk.group(0)
            else:
                # Python 2: decode the byte chunk to unicode before feeding.
                parser.feed(chunk.group(0).decode(encoding='UTF-8'))
        parser.close()
        return parser.output
    except HTMLParseError as e:
        sys.stderr.write("Normalization error: " + e.msg + "\n")
        return html # on error, return unnormalized HTML
  165. def print_test_header(headertext, example_number, start_line, end_line):
  166. print "Example %d (lines %d-%d) %s" % (example_number,start_line,end_line,headertext)
  167. def do_test(markdown_lines, expected_html_lines, headertext,
  168. example_number, start_line, end_line, normalize):
  169. real_markdown_text = ''.join(markdown_lines).replace('→','\t')
  170. [retcode, actual_html, err] = cmark.to_html(real_markdown_text)
  171. if retcode == 0:
  172. actual_html_lines = actual_html.splitlines(True)
  173. expected_html = ''.join(expected_html_lines)
  174. if normalize:
  175. passed = normalize_html(actual_html) == normalize_html(expected_html)
  176. else:
  177. passed = actual_html == expected_html
  178. if passed:
  179. return 'pass'
  180. else:
  181. print_test_header(headertext, example_number,start_line,end_line)
  182. sys.stdout.write(real_markdown_text)
  183. for diffline in unified_diff(expected_html_lines, actual_html_lines,
  184. "expected HTML", "actual HTML"):
  185. sys.stdout.write(diffline)
  186. sys.stdout.write('\n')
  187. return 'fail'
  188. else:
  189. print_test_header(headertext, example_number, start_line, end_line)
  190. print "program returned error code %d" % retcode
  191. print(err)
  192. return 'error'
def do_tests(specfile, pattern, normalize, dump_tests):
    # Walk the spec file extracting examples.  Each example is delimited by
    # three lines consisting of a single "." : the first starts the markdown
    # input, the second starts the expected HTML, and the third closes the
    # example.  Returns True when every selected example passed (or, in
    # dump mode, unconditionally after printing the JSON).
    line_number = 0
    start_line = 0          # first line of the current example (0 = not started)
    end_line = 0
    example_number = 0
    passed = 0
    failed = 0
    errored = 0
    markdown_lines = []     # buffered markdown for the current example
    html_lines = []         # buffered expected HTML for the current example
    active = True           # False while the current section fails --pattern
    state = 0 # 0 regular text, 1 markdown example, 2 html output
    headertext = ''
    tests_json = []
    header_re = re.compile('#+ ')
    if pattern:
        pattern_re = re.compile(pattern, re.IGNORECASE)
    with open(specfile, 'r') as specf:
        for line in specf:
            line_number = line_number + 1
            if state == 0 and re.match(header_re, line):
                # ATX heading outside an example: remember the section title
                # and re-evaluate the --pattern filter for what follows.
                headertext = header_re.sub('', line).strip()
                if pattern:
                    if re.search(pattern_re, line):
                        active = True
                    else:
                        active = False
            if line.strip() == ".":
                # Delimiter line: advance the 3-phase state machine.
                state = (state + 1) % 3
                if state == 0:
                    # Wrapped back to 0: the example is complete.
                    example_number = example_number + 1
                    end_line = line_number
                    if active:
                        if dump_tests:
                            tests_json.append({
                                "markdown":''.join(markdown_lines).replace('→',"\t"),
                                "html":''.join(html_lines),
                                "example": example_number,
                                "start_line": start_line,
                                "end_line": end_line,
                                "section": headertext})
                        else:
                            result = do_test(markdown_lines, html_lines,
                                             headertext, example_number,
                                             start_line, end_line, normalize)
                            if result == 'pass':
                                passed = passed + 1
                            elif result == 'fail':
                                failed = failed + 1
                            else:
                                errored = errored + 1
                    # Reset buffers for the next example.
                    start_line = 0
                    markdown_lines = []
                    html_lines = []
            elif state == 1:
                if start_line == 0:
                    # The example starts at the "." line preceding this one.
                    start_line = line_number - 1
                markdown_lines.append(line)
            elif state == 2:
                html_lines.append(line)
    if dump_tests:
        print json.dumps(tests_json, ensure_ascii=False, indent=2)
        return True
    else:
        print "%d passed, %d failed, %d errored" % (passed, failed, errored)
        return (failed == 0 and errored == 0)
  259. if __name__ == "__main__":
  260. if args.debug_normalization:
  261. print normalize_html(sys.stdin.read())
  262. elif do_tests(args.spec, args.pattern, args.normalize, args.dump_tests):
  263. exit(0)
  264. else:
  265. exit(1)