Added a few more doctests for HTML normalization (#245).

author: John MacFarlane <jgm@berkeley.edu> 2014-12-19 08:14:13 -0800
committer: John MacFarlane <jgm@berkeley.edu> 2014-12-19 08:14:36 -0800
commit: b28c97c9b8af266d4f12deb5febcf28807d9f5c6 (patch)
tree: 264557669682ec42dc1d8c48fe9de65e57ace733
parent: b5f809582e073a3b4cb31a167e03f18145a04249 (diff)
1 file changed, 21 insertions, 1 deletion
diff --git a/test/normalize.py b/test/normalize.py
index 4b922e6..5b4803b 100644
--- a/test/normalize.py
+++ b/test/normalize.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 from HTMLParser import HTMLParser, HTMLParseError
 from htmlentitydefs import name2codepoint
 import sys
@@ -118,14 +119,33 @@ def normalize_html(html):
     Multiple inner whitespaces are collapsed to a single space (except
     in pre tags):
 
+        >>> normalize_html("<p>a  \t b</p>")
+        u'<p>a b</p>'
+
         >>> normalize_html("<p>a  \t\nb</p>")
         u'<p>a b</p>'
 
     * Outer whitespace (outside block-level tags) is removed.
+
+        >>> normalize_html("<p>a  b</p>  ")
+        u'<p>a b</p>'
+
     * Self-closing tags are converted to open tags.
+
+        >>> normalize_html("<br />")
+        u'<br>'
+
     * Attributes are sorted and lowercased.
+
+        >>> normalize_html('<a title="bar" HREF="foo">x</a>')
+        u'<a href="foo" title="bar">x</a>'
+
     * References are converted to unicode, except that '<', '>', '&', and
-      '&' are rendered using entities.
+      '"' are rendered using entities.
+
+        >>> normalize_html("&forall;&amp;&gt;&lt;&quot;")
+        u'\u2200&amp;&gt;&lt;&quot;'
+
     """
     html_chunk_re = re.compile("(\<!\[CDATA\[.*?\]\]\>|\<[^>]*\>|[^<]+)")
     try:
author	John MacFarlane <jgm@berkeley.edu>	2014-12-19 08:14:13 -0800
committer	John MacFarlane <jgm@berkeley.edu>	2014-12-19 08:14:36 -0800
commit	b28c97c9b8af266d4f12deb5febcf28807d9f5c6 (patch)
tree	264557669682ec42dc1d8c48fe9de65e57ace733
parent	b5f809582e073a3b4cb31a167e03f18145a04249 (diff)