about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: John MacFarlane <jgm@berkeley.edu> 2014-12-19 08:14:13 -0800
committer: John MacFarlane <jgm@berkeley.edu> 2014-12-19 08:14:36 -0800
commit: b28c97c9b8af266d4f12deb5febcf28807d9f5c6 (patch)
tree: 264557669682ec42dc1d8c48fe9de65e57ace733
parent: b5f809582e073a3b4cb31a167e03f18145a04249 (diff)
Added a few more doctests for HTML normalization (#245).
-rw-r--r-- test/normalize.py 22
1 files changed, 21 insertions, 1 deletions
diff --git a/test/normalize.py b/test/normalize.py
index 4b922e6..5b4803b 100644
--- a/test/normalize.py
+++ b/test/normalize.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser, HTMLParseError
from htmlentitydefs import name2codepoint
import sys
@@ -118,14 +119,33 @@ def normalize_html(html):
Multiple inner whitespaces are collapsed to a single space (except
in pre tags):
+ >>> normalize_html("<p>a \t b</p>")
+ u'<p>a b</p>'
+
>>> normalize_html("<p>a \t\nb</p>")
u'<p>a b</p>'
* Outer whitespace (outside block-level tags) is removed.
+
+ >>> normalize_html("<p>a b</p> ")
+ u'<p>a b</p>'
+
* Self-closing tags are converted to open tags.
+
+ >>> normalize_html("<br />")
+ u'<br>'
+
* Attributes are sorted and lowercased.
+
+ >>> normalize_html('<a title="bar" HREF="foo">x</a>')
+ u'<a href="foo" title="bar">x</a>'
+
* References are converted to unicode, except that '<', '>', '&', and
- '&' are rendered using entities.
+ '"' are rendered using entities.
+
+ >>> normalize_html("&forall;&amp;&gt;&lt;&quot;")
+ u'\u2200&amp;&gt;&lt;&quot;'
+
"""
html_chunk_re = re.compile("(\<!\[CDATA\[.*?\]\]\>|\<[^>]*\>|[^<]+)")
try: