diff options
Diffstat (limited to 'test')
-rw-r--r-- | test/cmark.py | 40 | ||||
-rw-r--r-- | test/normalize.py | 179 | ||||
-rwxr-xr-x | test/spec_tests.py | 139 |
3 files changed, 358 insertions, 0 deletions
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Spec-test harness for cmark, reflowed from a three-file diff.

The source text was a diff that added test/cmark.py, test/normalize.py and
test/spec_tests.py, collapsed onto a handful of lines.  The three files are
reproduced here in that order as sections of one properly formatted module;
each section is introduced by a banner naming the file it came from.
"""

import argparse
import json
import platform
import re
import sys
from ctypes import CDLL, c_char_p, c_long
from difflib import unified_diff
from html import escape
from html.entities import name2codepoint
from html.parser import HTMLParser
from subprocess import PIPE, Popen

try:
    from html.parser import HTMLParseError
except ImportError:
    # HTMLParseError was removed from html.parser in Python 3.5.  A
    # placeholder keeps the ``except`` clause in normalize_html valid.
    class HTMLParseError(Exception):
        """Stand-in for the exception removed from html.parser."""


# ---------------------------------------------------------------------------
# test/cmark.py
# ---------------------------------------------------------------------------

def pipe_through_prog(prog, text):
    """Run *prog* with *text* on stdin and collect its output.

    *prog* is a command line; it is split on whitespace to build argv.

    Returns a list ``[returncode, stdout, stderr]`` where stdout is decoded
    as UTF-8 and stderr is the raw bytes captured from the process.
    """
    proc = Popen(prog.split(), stdout=PIPE, stdin=PIPE, stderr=PIPE)
    result, err = proc.communicate(input=text.encode('utf-8'))
    return [proc.returncode, result.decode('utf-8'), err]


def use_library(lib, text):
    """Convert *text* by calling *lib* in-process.

    *lib* is called as ``lib(utf8_bytes, byte_length)`` and must return
    UTF-8 bytes.  The result has the same shape as pipe_through_prog's,
    with a return code of 0 and an empty error string.
    """
    textbytes = text.encode('utf-8')
    return [0, lib(textbytes, len(textbytes)).decode('utf-8'), '']


class CMark:
    """Markdown-to-HTML converter backed by a program or a shared library.

    After construction ``self.to_html(text)`` returns
    ``[returncode, html, err]``.
    """

    def __init__(self, prog=None, library_dir=None):
        """Select the conversion backend.

        prog: command line of an executable to pipe the input through.
            When empty/None the cmark shared library is loaded instead.
        library_dir: directory holding the shared library; defaults to
            ``build/src`` when not given.
        """
        self.prog = prog
        if prog:
            self.to_html = lambda x: pipe_through_prog(prog, x)
            return

        # The shared library's file name depends on the platform.
        sysname = platform.system()
        if sysname == 'Darwin':
            libname = "libcmark.dylib"
        elif sysname == 'Windows':
            libname = "cmark.dll"
        else:
            libname = "libcmark.so"
        if library_dir:
            libpath = library_dir + "/" + libname
        else:
            libpath = "build/src/" + libname

        cmark = CDLL(libpath)
        markdown = cmark.cmark_markdown_to_html
        markdown.restype = c_char_p
        # NOTE(review): argtypes must mirror the C prototype of
        # cmark_markdown_to_html -- confirm against the cmark header in use.
        markdown.argtypes = [c_char_p, c_long]
        self.to_html = lambda x: use_library(markdown, x)


# ---------------------------------------------------------------------------
# test/normalize.py
# ---------------------------------------------------------------------------

# Normalization code, adapted from
# https://github.com/karlcow/markdown-testsuite/
significant_attrs = ["alt", "href", "src", "title"]
whitespace_re = re.compile(r'\s+')

# Splits input into CDATA sections, tags, and runs of text.
_html_chunk_re = re.compile(r"(<!\[CDATA\[.*?\]\]>|<[^>]*>|[^<]+)")

# Tags around which surrounding whitespace is insignificant.
_BLOCK_TAGS = frozenset([
    'article', 'header', 'aside', 'hgroup', 'blockquote',
    'hr', 'iframe', 'body', 'li', 'map', 'button', 'object', 'canvas',
    'ol', 'caption', 'output', 'col', 'p', 'colgroup', 'pre', 'dd',
    'progress', 'div', 'section', 'dl', 'table', 'td', 'dt',
    'tbody', 'embed', 'textarea', 'fieldset', 'tfoot', 'figcaption',
    'th', 'figure', 'thead', 'footer', 'tr', 'form', 'ul',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'video', 'script', 'style',
])


class MyHTMLParser(HTMLParser):
    """HTMLParser that re-serializes its input in normalized form.

    The normalized markup accumulates in ``self.output``.  ``self.last``
    records the kind of the previous event, ``self.last_tag`` the most
    recent tag name, and ``self.in_pre`` whether a <pre> is open.
    """

    def __init__(self):
        # convert_charrefs defaults to True on Python 3.5+, which decodes
        # references inside handle_data and bypasses handle_entityref /
        # handle_charref; the escaping done in output_char relies on them.
        HTMLParser.__init__(self, convert_charrefs=False)
        self.last = "starttag"
        self.in_pre = False
        self.output = ""
        self.last_tag = ""

    def handle_data(self, data):
        """Append text, collapsing and trimming insignificant whitespace."""
        after_tag = self.last in ("endtag", "starttag")
        after_block_tag = after_tag and self.is_block_tag(self.last_tag)
        if after_tag and self.last_tag == "br":
            data = data.lstrip('\n')
        if not self.in_pre:
            # Whitespace inside <pre> is significant and left untouched.
            data = whitespace_re.sub(' ', data)
            if after_block_tag:
                if self.last == "starttag":
                    data = data.lstrip()
                elif self.last == "endtag":
                    data = data.strip()
        self.output += data
        self.last = "data"

    def handle_endtag(self, tag):
        """Append a closing tag, trimming whitespace before block tags."""
        if tag == "pre":
            self.in_pre = False
        elif self.is_block_tag(tag):
            self.output = self.output.rstrip()
        self.output += "</" + tag + ">"
        self.last_tag = tag
        self.last = "endtag"

    def handle_starttag(self, tag, attrs):
        """Append an opening tag with its attributes sorted by name."""
        if tag == "pre":
            self.in_pre = True
        if self.is_block_tag(tag):
            self.output = self.output.rstrip()
        self.output += "<" + tag
        # For now we don't strip out 'extra' attributes, because of
        # raw HTML test cases.
        # attrs = filter(lambda attr: attr[0] in significant_attrs, attrs)
        for (k, v) in sorted(attrs):
            self.output += " " + k
            if v is not None:
                # html.escape replaces cgi.escape (removed in Python 3.8);
                # unlike cgi.escape it also escapes single quotes.
                self.output += '="' + escape(v, quote=True) + '"'
        self.output += ">"
        self.last_tag = tag
        self.last = "starttag"

    def handle_startendtag(self, tag, attrs):
        """Ignore closing tag for self-closing """
        self.handle_starttag(tag, attrs)
        self.last_tag = tag
        self.last = "endtag"

    def handle_comment(self, data):
        """Pass a comment through unchanged."""
        self.output += '<!--' + data + '-->'
        self.last = "comment"

    def handle_decl(self, data):
        """Pass a declaration through unchanged."""
        self.output += '<!' + data + '>'
        self.last = "decl"

    def unknown_decl(self, data):
        """Pass an unrecognized declaration through unchanged."""
        self.output += '<!' + data + '>'
        self.last = "decl"

    def handle_pi(self, data):
        """Pass a processing instruction through."""
        self.output += '<?' + data + '>'
        self.last = "pi"

    def handle_entityref(self, name):
        """Resolve a named reference; unknown names are kept verbatim."""
        try:
            c = chr(name2codepoint[name])
        except KeyError:
            c = None
        self.output_char(c, '&' + name + ';')
        self.last = "ref"

    def handle_charref(self, name):
        """Resolve a numeric reference; *name* is the text after '&#'."""
        try:
            if name[:1] in ("x", "X"):
                c = chr(int(name[1:], 16))
            else:
                c = chr(int(name))
        except (ValueError, OverflowError):
            # Malformed number or code point outside the Unicode range.
            c = None
        self.output_char(c, '&#' + name + ';')
        self.last = "ref"

    # Helpers.
    def output_char(self, c, fallback):
        """Append *c*, or *fallback* when *c* is None.

        The characters '<', '>', '&' and '"' are written as entities so the
        normalized output stays unambiguous markup.
        """
        if c == '<':
            self.output += "&lt;"
        elif c == '>':
            self.output += "&gt;"
        elif c == '&':
            self.output += "&amp;"
        elif c == '"':
            self.output += "&quot;"
        elif c is None:
            self.output += fallback
        else:
            self.output += c

    def is_block_tag(self, tag):
        """Return True if *tag* is treated as a block-level element."""
        return tag in _BLOCK_TAGS


def normalize_html(html):
    r"""
    Return normalized form of HTML which ignores insignificant output
    differences:

    Multiple inner whitespaces are collapsed to a single space (except
    in pre tags):

        >>> normalize_html("<p>a  \t b</p>")
        '<p>a b</p>'

        >>> normalize_html("<p>a  \t\nb</p>")
        '<p>a b</p>'

        >>> normalize_html("<pre>a  \n b</pre>")
        '<pre>a  \n b</pre>'

    * Whitespace surrounding block-level tags is removed.

        >>> normalize_html("<p>a  b</p>")
        '<p>a b</p>'

        >>> normalize_html(" <p>a  b</p>")
        '<p>a b</p>'

        >>> normalize_html("<p>a  b</p> ")
        '<p>a b</p>'

        >>> normalize_html("\n\t<p>\n\t\ta  b\t\t</p>\n\t")
        '<p>a b</p>'

        >>> normalize_html("<i>a  b</i> ")
        '<i>a b</i> '

    * Self-closing tags are converted to open tags.

        >>> normalize_html("<br />")
        '<br>'

    * Attributes are sorted and lowercased.

        >>> normalize_html('<a title="bar" HREF="foo">x</a>')
        '<a href="foo" title="bar">x</a>'

    * References are converted to unicode, except that '<', '>', '&', and
      '"' are rendered using entities.

        >>> normalize_html("&forall;&amp;&gt;&lt;&quot;")
        '∀&amp;&gt;&lt;&quot;'

    """
    try:
        parser = MyHTMLParser()
        # We work around HTMLParser's limitations parsing CDATA
        # by breaking the input into chunks and passing CDATA chunks
        # through verbatim.
        for chunk in _html_chunk_re.finditer(html):
            piece = chunk.group(0)
            if piece[:8] == "<![CDATA":
                parser.output += piece
            else:
                parser.feed(piece)
        parser.close()
        return parser.output
    except HTMLParseError as e:
        sys.stderr.write("Normalization error: " + str(e) + "\n")
        return html  # on error, return unnormalized HTML


# ---------------------------------------------------------------------------
# test/spec_tests.py
#
# As a separate file this section additionally needs:
#     from normalize import normalize_html
#     from cmark import CMark
# Both names are defined above in this combined module.
# ---------------------------------------------------------------------------

def _build_arg_parser():
    """Create the command-line parser for the spec test runner."""
    parser = argparse.ArgumentParser(description='Run cmark tests.')
    parser.add_argument('-p', '--program', dest='program', nargs='?',
                        default=None, help='program to test')
    parser.add_argument('-s', '--spec', dest='spec', nargs='?',
                        default='spec.txt', help='path to spec')
    parser.add_argument('-P', '--pattern', dest='pattern', nargs='?',
                        default=None,
                        help='limit to sections matching regex pattern')
    parser.add_argument('--library-dir', dest='library_dir', nargs='?',
                        default=None,
                        help='directory containing dynamic library')
    parser.add_argument('--no-normalize', dest='normalize',
                        action='store_const', const=False, default=True,
                        help='do not normalize HTML')
    parser.add_argument('-d', '--dump-tests', dest='dump_tests',
                        action='store_const', const=True, default=False,
                        help='dump tests in JSON format')
    parser.add_argument('--debug-normalization', dest='debug_normalization',
                        action='store_const', const=True, default=False,
                        help='filter stdin through normalizer for testing')
    parser.add_argument('-n', '--number', type=int, default=None,
                        help='only consider the test with the given number')
    return parser


def print_test_header(headertext, example_number, start_line, end_line):
    """Print the one-line heading that identifies a spec example."""
    print("Example %d (lines %d-%d) %s"
          % (example_number, start_line, end_line, headertext))


def do_test(test, normalize, result_counts):
    """Run one spec example and record the outcome.

    test: dict with the keys produced by get_tests.
    normalize: when true, compare normalized HTML instead of raw output.
    result_counts: dict whose 'pass', 'fail' or 'error' entry is
        incremented in place.

    The converter is read from the module-level name ``cmark``, which the
    script entry point assigns before any test is run.
    """
    retcode, actual_html, err = cmark.to_html(test['markdown'])
    if retcode != 0:
        print_test_header(test['section'], test['example'],
                          test['start_line'], test['end_line'])
        print("program returned error code %d" % retcode)
        # The program backend reports stderr as bytes; show it as text.
        if isinstance(err, bytes):
            err = err.decode('utf-8', 'replace')
        print(err)
        result_counts['error'] += 1
        return

    expected_html = test['html']
    unicode_error = None
    if normalize:
        try:
            passed = (normalize_html(actual_html)
                      == normalize_html(expected_html))
        except UnicodeDecodeError as e:
            unicode_error = e
            passed = False
    else:
        passed = actual_html == expected_html

    if passed:
        result_counts['pass'] += 1
        return

    print_test_header(test['section'], test['example'],
                      test['start_line'], test['end_line'])
    sys.stdout.write(test['markdown'])
    if unicode_error:
        print("Unicode error: " + str(unicode_error))
        print("Expected: " + repr(expected_html))
        print("Got:      " + repr(actual_html))
    else:
        expected_html_lines = expected_html.splitlines(True)
        actual_html_lines = actual_html.splitlines(True)
        for diffline in unified_diff(expected_html_lines, actual_html_lines,
                                     "expected HTML", "actual HTML"):
            sys.stdout.write(diffline)
    sys.stdout.write('\n')
    result_counts['fail'] += 1


def get_tests(specfile):
    """Extract the examples from the spec file at path *specfile*.

    An example is delimited by three lines consisting of a single ".":
    markdown follows the first, expected HTML follows the second, and the
    third ends the example.  '→' in the markdown stands for a tab.

    Returns a list of dicts with keys 'markdown', 'html', 'example'
    (1-based counter), 'start_line', 'end_line' (line numbers of the
    opening and closing "." lines) and 'section' (text of the most recent
    header line seen outside an example).
    """
    line_number = 0
    start_line = 0
    example_number = 0
    markdown_lines = []
    html_lines = []
    state = 0  # 0 regular text, 1 markdown example, 2 html output
    headertext = ''
    tests = []

    header_re = re.compile('#+ ')

    with open(specfile, 'r', encoding='utf-8') as specf:
        for line in specf:
            line_number += 1
            if state == 0 and header_re.match(line):
                headertext = header_re.sub('', line).strip()
            if line.strip() == ".":
                state = (state + 1) % 3
                if state == 1:
                    # Record the opening delimiter here so that an example
                    # with empty markdown still gets a start line.
                    start_line = line_number
                elif state == 0:
                    example_number += 1
                    tests.append({
                        "markdown": ''.join(markdown_lines).replace('→', "\t"),
                        "html": ''.join(html_lines),
                        "example": example_number,
                        "start_line": start_line,
                        "end_line": line_number,
                        "section": headertext})
                    start_line = 0
                    markdown_lines = []
                    html_lines = []
            elif state == 1:
                markdown_lines.append(line)
            elif state == 2:
                html_lines.append(line)
    return tests


def _select_tests(all_tests, pattern=None, number=None):
    """Return the tests whose section matches *pattern* and number matches.

    A missing/empty *pattern* or a *number* of None disables that filter,
    so tests with an empty section name are kept when no pattern is given.
    """
    pattern_re = re.compile(pattern, re.IGNORECASE) if pattern else None
    return [test for test in all_tests
            if (pattern_re is None or pattern_re.search(test['section']))
            and (number is None or test['example'] == number)]


if __name__ == "__main__":
    args = _build_arg_parser().parse_args(sys.argv[1:])

    if args.debug_normalization:
        print(normalize_html(sys.stdin.read()))
        sys.exit(0)

    all_tests = get_tests(args.spec)
    tests = _select_tests(all_tests, args.pattern, args.number)
    if args.dump_tests:
        print(json.dumps(tests, ensure_ascii=False, indent=2))
        sys.exit(0)

    skipped = len(all_tests) - len(tests)
    # do_test looks this up as a module-level name.
    cmark = CMark(prog=args.program, library_dir=args.library_dir)
    result_counts = {'pass': 0, 'fail': 0, 'error': 0, 'skip': skipped}
    for test in tests:
        do_test(test, args.normalize, result_counts)
    print("{pass} passed, {fail} failed, {error} errored, {skip} skipped"
          .format(**result_counts))
    if result_counts['fail'] == 0 and result_counts['error'] == 0:
        sys.exit(0)
    else:
        sys.exit(1)