aboutsummaryrefslogtreecommitdiff
path: root/html2ast.py
blob: 9d23e8771e635e3d042161f14d42b32296fedb9c (plain)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import sys

try:
    import cgi
except ImportError:  # the cgi module was removed in Python 3.13
    cgi = None

try:  # Python 2 module names
    from HTMLParser import HTMLParser, HTMLParseError
    from htmlentitydefs import name2codepoint
except ImportError:  # Python 3 module names
    from html.entities import name2codepoint
    from html.parser import HTMLParser

    class HTMLParseError(Exception):
        """Stand-in: html.parser dropped HTMLParseError in Python 3.5."""
  8. class MyHTMLParser(HTMLParser):
  9. def __init__(self):
  10. HTMLParser.__init__(self)
  11. self.in_pre = False
  12. self.output = u""
  13. def handle_data(self, data):
  14. self.output += data
  15. def handle_endtag(self, tag):
  16. if tag == "pre":
  17. self.in_pre = False
  18. def handle_starttag(self, tag, attrs):
  19. if tag == "pre":
  20. self.in_pre = True
  21. def handle_startendtag(self, tag, attrs):
  22. self.output += ""
  23. def handle_comment(self, data):
  24. self.output += '<!--' + data + '-->'
  25. def handle_decl(self, data):
  26. self.output += '<!' + data + '>'
  27. def unknown_decl(self, data):
  28. self.output += '<!' + data + '>'
  29. def handle_pi(self,data):
  30. self.output += '<?' + data + '>'
  31. def handle_entityref(self, name):
  32. try:
  33. c = unichr(name2codepoint[name])
  34. except KeyError:
  35. c = None
  36. self.output_char(c)
  37. def handle_charref(self, name):
  38. try:
  39. if name.startswith("x"):
  40. c = unichr(int(name[1:], 16))
  41. else:
  42. c = unichr(int(name))
  43. except ValueError:
  44. c = None
  45. self.output_char(c)
  46. # Helpers.
  47. def output_char(self, c):
  48. if c == u'\n':
  49. self.output += "\\n"
  50. elif c == u'"':
  51. self.output += "\\\""
  52. elif c == u'\\':
  53. self.output += "\\\\"
  54. else:
  55. self.output += c
  56. def is_block_tag(self,tag):
  57. return (tag in ['article', 'header', 'aside', 'hgroup', 'blockquote',
  58. 'hr', 'iframe', 'body', 'li', 'map', 'button', 'object', 'canvas',
  59. 'ol', 'caption', 'output', 'col', 'p', 'colgroup', 'pre', 'dd',
  60. 'progress', 'div', 'section', 'dl', 'table', 'td', 'dt',
  61. 'tbody', 'embed', 'textarea', 'fieldset', 'tfoot', 'figcaption',
  62. 'th', 'figure', 'thead', 'footer', 'tr', 'form', 'ul',
  63. 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'video', 'script', 'style'])
  64. def html2ast(html):
  65. html_chunk_re = re.compile("(\<!\[CDATA\[.*?\]\]\>|\<[^>]*\>|[^<]+)")
  66. parser = MyHTMLParser()
  67. # We work around HTMLParser's limitations parsing CDATA
  68. # by breaking the input into chunks and passing CDATA chunks
  69. # through verbatim.
  70. for chunk in re.finditer(html_chunk_re, html):
  71. if chunk.group(0)[:8] == "<![CDATA":
  72. parser.output += chunk.group(0)
  73. else:
  74. parser.feed(chunk.group(0).decode(encoding='UTF-8'))
  75. parser.close()
  76. return parser.output
  77. if __name__ == "__main__":
  78. print(html2ast(sys.stdin.read()))