diff options
author | John MacFarlane <jgm@berkeley.edu> | 2014-11-20 08:57:20 -0800 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2014-11-20 08:57:49 -0800 |
commit | 7600cd859014bac31200d52b1c4f6e88136b3c97 (patch) | |
tree | 3bc2b56967eb25127252349624a674ac496c3c23 | |
parent | c9875cbbbe293e6727a7a25b79e7ea4949ef5670 (diff) |
runtests.py: Fixed normalization of declarations and CDATA.
If the input contains CDATA, we break it out and pass it through
verbatim, without sending it through HTMLParser, which breaks on
CDATA.
Improves on #161.
-rwxr-xr-x | runtests.py | 17 |
1 files changed, 10 insertions, 7 deletions
diff --git a/runtests.py b/runtests.py index 83c331d..8a37f6d 100755 --- a/runtests.py +++ b/runtests.py @@ -115,7 +115,7 @@ class MyHTMLParser(HTMLParser): def handle_decl(self, data): self.output += '<!' + data + '>' self.last = "decl" - def handle_unknown_decl(self, data): + def unknown_decl(self, data): self.output += '<!' + data + '>' self.last = "decl" def handle_pi(self,data): @@ -174,15 +174,18 @@ def normalize_html(html): * Attributes are sorted and lowercased. * References are converted to unicode, except that '<', '>', '&', and '"' are rendered using entities. - - Known limitations: - - * HTMLParser just swallows CDATA. - * HTMLParser seems to treat unknown declarations as comments. """ + html_chunk_re = re.compile("(\<!\[CDATA\[.*?\]\]\>|\<[^>]*\>|[^<]+)") try: parser = MyHTMLParser() - parser.feed(html.decode(encoding='UTF-8')) + # We work around HTMLParser's limitations parsing CDATA + # by breaking the input into chunks and passing CDATA chunks + # through verbatim. + for chunk in re.finditer(html_chunk_re, html): + if chunk.group(0)[:8] == "<![CDATA": + parser.output += chunk.group(0) + else: + parser.feed(chunk.group(0).decode(encoding='UTF-8')) parser.close() return parser.output except HTMLParseError as e: |