diff options
author | John MacFarlane <jgm@berkeley.edu> | 2014-12-19 08:14:13 -0800 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2014-12-19 08:14:36 -0800 |
commit | b28c97c9b8af266d4f12deb5febcf28807d9f5c6 (patch) | |
tree | 264557669682ec42dc1d8c48fe9de65e57ace733 /test | |
parent | b5f809582e073a3b4cb31a167e03f18145a04249 (diff) |
Added a few more doctests for HTML normalization (#245).
Diffstat (limited to 'test')
-rw-r--r-- | test/normalize.py | 22 |
1 file changed, 21 insertions, 1 deletion
diff --git a/test/normalize.py b/test/normalize.py index 4b922e6..5b4803b 100644 --- a/test/normalize.py +++ b/test/normalize.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from HTMLParser import HTMLParser, HTMLParseError from htmlentitydefs import name2codepoint import sys @@ -118,14 +119,33 @@ def normalize_html(html): Multiple inner whitespaces are collapsed to a single space (except in pre tags): + >>> normalize_html("<p>a \t b</p>") + u'<p>a b</p>' + >>> normalize_html("<p>a \t\nb</p>") u'<p>a b</p>' * Outer whitespace (outside block-level tags) is removed. + + >>> normalize_html("<p>a b</p> ") + u'<p>a b</p>' + * Self-closing tags are converted to open tags. + + >>> normalize_html("<br />") + u'<br>' + * Attributes are sorted and lowercased. + + >>> normalize_html('<a title="bar" HREF="foo">x</a>') + u'<a href="foo" title="bar">x</a>' + * References are converted to unicode, except that '<', '>', '&', and - '&' are rendered using entities. + '"' are rendered using entities. + + >>> normalize_html("&forall;&amp;&gt;&lt;&quot;") + u'\u2200&amp;&gt;&lt;&quot;' + """ html_chunk_re = re.compile("(\<!\[CDATA\[.*?\]\]\>|\<[^>]*\>|[^<]+)") try: |