From b28c97c9b8af266d4f12deb5febcf28807d9f5c6 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Fri, 19 Dec 2014 08:14:13 -0800 Subject: Added a few more doctests for HTML normalization (#245). --- test/normalize.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'test') diff --git a/test/normalize.py b/test/normalize.py index 4b922e6..5b4803b 100644 --- a/test/normalize.py +++ b/test/normalize.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from HTMLParser import HTMLParser, HTMLParseError from htmlentitydefs import name2codepoint import sys @@ -118,14 +119,33 @@ def normalize_html(html): Multiple inner whitespaces are collapsed to a single space (except in pre tags): + >>> normalize_html("

<p>a \t b</p>

") + u'

<p>a b</p>

' + >>> normalize_html("

<p>a \t\nb</p>

") u'

<p>a b</p>

' * Outer whitespace (outside block-level tags) is removed. + + >>> normalize_html("

<p>a b</p>

") + u'

<p>a b</p>

' + * Self-closing tags are converted to open tags. + + >>> normalize_html("<br />") + u'<br>' + * Attributes are sorted and lowercased. + + >>> normalize_html('<a title="bar" HREF="foo">x</a>') + u'<a href="foo" title="bar">x</a>' + * References are converted to unicode, except that '<', '>', '&', and - '&' are rendered using entities. + '"' are rendered using entities. + + >>> normalize_html("&forall;&amp;&gt;&lt;&quot;") + u'\u2200&amp;&gt;&lt;&quot;' + """ html_chunk_re = re.compile("(\<!\[CDATA\[.*?\]\]\>|\<[^>]*\>|[^<]+)") try: -- cgit v1.2.3