inline: Improve RSS URL munging to use a proper HTML parser

and support all elements that HTML::Tagset knows about (which doesn't include HTML5 just yet, but then the old version didn't either). Bonus: 4 times faster than the old regexp method.
author: Joey Hess <joey@kitenet.net> 2010-11-16 16:57:50 -0400
committer: Joey Hess <joey@kitenet.net> 2010-11-16 16:57:50 -0400
commit: b00c6c9640453bf1407c4e880ef0c171388197c7 (patch)
tree: 24fe57766a1522af6a86dcc76df71a015c04e2e3 /IkiWiki/Plugin/inline.pm
parent: fcf0ee574a974ca482509da595cdf9196a3afbb3 (diff)
1 files changed, 48 insertions, 15 deletions
diff --git a/IkiWiki/Plugin/inline.pm b/IkiWiki/Plugin/inline.pm
index 3b98bf8dd..1fe40a5ea 100644
--- a/IkiWiki/Plugin/inline.pm
+++ b/IkiWiki/Plugin/inline.pm
@@ -506,26 +506,59 @@ sub date_822 ($) {
 }
 
 sub absolute_urls ($$) {
-	# sucky sub because rss sucks
-	my $content=shift;
+	# needed because rss sucks
+	my $html=shift;
 	my $baseurl=shift;
 
 	my $url=$baseurl;
 	$url=~s/[^\/]+$//;
+	my $urltop; # calculated if needed
 
-	# what is the non path part of the url?
-	my $top_uri = URI->new($url);
-	$top_uri->path_query(""); # reset the path
-	my $urltop = $top_uri->as_string;
-
-	$content=~s/(<a(?:\s+(?:class|id)\s*="?\w+"?)?)\s+href=\s*"(#[^"]+)"/$1 href="$baseurl$2"/mig;
-	# relative to another wiki page
-	$content=~s/(<a(?:\s+(?:class|id)\s*="?\w+"?)?)\s+href=\s*"(?!\w+:)([^\/][^"]*)"/$1 href="$url$2"/mig;
-	$content=~s/(<img(?:\s+(?:class|id|width|height)\s*="?\w+"?)*)\s+src=\s*"(?!\w+:)([^\/][^"]*)"/$1 src="$url$2"/mig;
-	# relative to the top of the site
-	$content=~s/(<a(?:\s+(?:class|id)\s*="?\w+"?)?)\s+href=\s*"(?!\w+:)(\/[^"]*)"/$1 href="$urltop$2"/mig;
-	$content=~s/(<img(?:\s+(?:class|id|width|height)\s*="?\w+"?)*)\s+src=\s*"(?!\w+:)(\/[^"]*)"/$1 src="$urltop$2"/mig;
-	return $content;
+	my $ret="";
+
+	eval q{use HTML::Parser; use HTML::Tagset};
+	die $@ if $@;
+	my $p = HTML::Parser->new(api_version => 3);
+	$p->handler(default => sub { $ret.=join("", @_) }, "text");
+	$p->handler(start => sub {
+		my ($tagname, $pos, $text) = @_;
+		if (ref $HTML::Tagset::linkElements{$tagname}) {
+			while (4 <= @$pos) {
+				# use attribute sets from right to left
+				# to avoid invalidating the offsets
+				# when replacing the values
+				my ($k_offset, $k_len, $v_offset, $v_len) =
+					splice(@$pos, -4);
+				my $attrname = lc(substr($text, $k_offset, $k_len));
+				next unless grep { $_ eq $attrname } @{$HTML::Tagset::linkElements{$tagname}};
+				next unless $v_offset; # 0 v_offset means no value
+				my $v = substr($text, $v_offset, $v_len);
+				$v =~ s/^([\'\"])(.*)\1$/$2/;
+				if ($v=~/^#/) {
+					$v=$baseurl.$v; # anchor
+				}
+				elsif ($v=~/^(?!\w+:)[^\/]/) {
+					$v=$url.$v; # relative url
+				}
+				elsif ($v=~/^\//) {
+					if (! defined $urltop) {
+						# what is the non path part of the url?
+						my $top_uri = URI->new($url);
+						$top_uri->path_query(""); # reset the path
+						$urltop = $top_uri->as_string;
+					}
+					$v=$urltop.$v; # url relative to top of site
+				}
+				$v =~ s/\"/&quot;/g; # since we quote with ""
+				substr($text, $v_offset, $v_len) = qq("$v");
+			}
+		}
+		$ret.=$text;
+	}, "tagname, tokenpos, text");
+	$p->parse($html);
+	$p->eof;
+
+	return $ret;
 }
 
 sub genfeed ($$$$$@) {
author	Joey Hess <joey@kitenet.net>	2010-11-16 16:57:50 -0400
committer	Joey Hess <joey@kitenet.net>	2010-11-16 16:57:50 -0400
commit	b00c6c9640453bf1407c4e880ef0c171388197c7 (patch)
tree	24fe57766a1522af6a86dcc76df71a015c04e2e3 /IkiWiki/Plugin/inline.pm
parent	fcf0ee574a974ca482509da595cdf9196a3afbb3 (diff)