aboutsummaryrefslogtreecommitdiff
path: root/test/normalize.py
blob: 29e404bfd09b1c6f1d18f0e7ea1091fd9f75046d (plain)
  1. from HTMLParser import HTMLParser, HTMLParseError
  2. from htmlentitydefs import name2codepoint
  3. import sys
  4. import re
  5. import cgi
  6. # Normalization code, adapted from
  7. # https://github.com/karlcow/markdown-testsuite/
  8. significant_attrs = ["alt", "href", "src", "title"]
  9. whitespace_re = re.compile('/s+/')
  10. class MyHTMLParser(HTMLParser):
  11. def __init__(self):
  12. HTMLParser.__init__(self)
  13. self.last = "starttag"
  14. self.in_pre = False
  15. self.output = u""
  16. self.last_tag = ""
  17. def handle_data(self, data):
  18. after_tag = self.last == "endtag" or self.last == "starttag"
  19. after_block_tag = after_tag and self.is_block_tag(self.last_tag)
  20. if after_tag and self.last_tag == "br":
  21. data = data.lstrip('\n')
  22. data = whitespace_re.sub(' ', data)
  23. if after_block_tag and not self.in_pre:
  24. if self.last == "starttag":
  25. data = data.lstrip()
  26. elif self.last == "endtag":
  27. data = data.strip()
  28. self.output += data
  29. self.last = "data"
  30. def handle_endtag(self, tag):
  31. if tag == "pre":
  32. self.in_pre = False
  33. if self.is_block_tag(tag):
  34. self.output = self.output.rstrip()
  35. self.output += "</" + tag + ">"
  36. self.last_tag = tag
  37. self.last = "endtag"
  38. def handle_starttag(self, tag, attrs):
  39. if tag == "pre":
  40. self.in_pre = True
  41. self.output += "<" + tag
  42. # For now we don't strip out 'extra' attributes, because of
  43. # raw HTML test cases.
  44. # attrs = filter(lambda attr: attr[0] in significant_attrs, attrs)
  45. if attrs:
  46. attrs.sort()
  47. for (k,v) in attrs:
  48. self.output += " " + k
  49. if v != None:
  50. self.output += ("=" + '"' + cgi.escape(v,quote=True) + '"')
  51. self.output += ">"
  52. self.last_tag = tag
  53. self.last = "starttag"
  54. def handle_startendtag(self, tag, attrs):
  55. """Ignore closing tag for self-closing """
  56. self.handle_starttag(tag, attrs)
  57. self.last_tag = tag
  58. self.last = "endtag"
  59. def handle_comment(self, data):
  60. self.output += '<!--' + data + '-->'
  61. self.last = "comment"
  62. def handle_decl(self, data):
  63. self.output += '<!' + data + '>'
  64. self.last = "decl"
  65. def unknown_decl(self, data):
  66. self.output += '<!' + data + '>'
  67. self.last = "decl"
  68. def handle_pi(self,data):
  69. self.output += '<?' + data + '>'
  70. self.last = "pi"
  71. def handle_entityref(self, name):
  72. try:
  73. c = unichr(name2codepoint[name])
  74. except KeyError:
  75. c = None
  76. self.output_char(c, '&' + name + ';')
  77. self.last = "ref"
  78. def handle_charref(self, name):
  79. try:
  80. if name.startswith("x"):
  81. c = unichr(int(name[1:], 16))
  82. else:
  83. c = unichr(int(name))
  84. except ValueError:
  85. c = None
  86. self.output_char(c, '&' + name + ';')
  87. self.last = "ref"
  88. # Helpers.
  89. def output_char(self, c, fallback):
  90. if c == u'<':
  91. self.output += "&lt;"
  92. elif c == u'>':
  93. self.output += "&gt;"
  94. elif c == u'&':
  95. self.output += "&amp;"
  96. elif c == u'"':
  97. self.output += "&quot;"
  98. elif c == None:
  99. self.output += fallback
  100. else:
  101. self.output += c
  102. def is_block_tag(self,tag):
  103. return (tag in ['article', 'header', 'aside', 'hgroup', 'blockquote',
  104. 'hr', 'iframe', 'body', 'li', 'map', 'button', 'object', 'canvas',
  105. 'ol', 'caption', 'output', 'col', 'p', 'colgroup', 'pre', 'dd',
  106. 'progress', 'div', 'section', 'dl', 'table', 'td', 'dt',
  107. 'tbody', 'embed', 'textarea', 'fieldset', 'tfoot', 'figcaption',
  108. 'th', 'figure', 'thead', 'footer', 'tr', 'form', 'ul',
  109. 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'video', 'script', 'style'])
  110. def normalize_html(html):
  111. r"""
  112. Return normalized form of HTML which ignores insignificant output
  113. differences:
  114. Multiple inner whitespaces are collapsed to a single space (except
  115. in pre tags):
  116. >>> normalize_html("<p>a \t\nb</p>")
  117. u'<p>a b</p>'
  118. * Outer whitespace (outside block-level tags) is removed.
  119. * Self-closing tags are converted to open tags.
  120. * Attributes are sorted and lowercased.
  121. * References are converted to unicode, except that '<', '>', '&', and
  122. '&' are rendered using entities.
  123. """
  124. html_chunk_re = re.compile("(\<!\[CDATA\[.*?\]\]\>|\<[^>]*\>|[^<]+)")
  125. try:
  126. parser = MyHTMLParser()
  127. # We work around HTMLParser's limitations parsing CDATA
  128. # by breaking the input into chunks and passing CDATA chunks
  129. # through verbatim.
  130. for chunk in re.finditer(html_chunk_re, html):
  131. if chunk.group(0)[:8] == "<![CDATA":
  132. parser.output += chunk.group(0)
  133. else:
  134. parser.feed(chunk.group(0).decode(encoding='UTF-8'))
  135. parser.close()
  136. return parser.output
  137. except HTMLParseError as e:
  138. sys.stderr.write("Normalization error: " + e.msg + "\n")
  139. return html # on error, return unnormalized HTML