path: root/IkiWiki/Plugin
diff options
author: Joey Hess <> 2010-11-16 16:57:50 -0400
committer: Joey Hess <> 2010-11-16 16:57:50 -0400
commit: b00c6c9640453bf1407c4e880ef0c171388197c7 (patch)
tree: 24fe57766a1522af6a86dcc76df71a015c04e2e3 /IkiWiki/Plugin
parent: fcf0ee574a974ca482509da595cdf9196a3afbb3 (diff)
inline: Improve RSS url munging to use a proper html parser
and support all elements that HTML::Tagset knows about. (Which doesn't include html5 just yet, but then the old version didn't either.) Bonus: 4 times faster than old regexp method.
Diffstat (limited to 'IkiWiki/Plugin')
1 files changed, 48 insertions, 15 deletions
diff --git a/IkiWiki/Plugin/inline.pm b/IkiWiki/Plugin/inline.pm
index 3b98bf8dd..1fe40a5ea 100644
--- a/IkiWiki/Plugin/inline.pm
+++ b/IkiWiki/Plugin/inline.pm
@@ -506,26 +506,59 @@ sub date_822 ($) {
sub absolute_urls ($$) {
- # sucky sub because rss sucks
- my $content=shift;
+ # needed because rss sucks
+ my $html=shift;
my $baseurl=shift;
my $url=$baseurl;
+ my $urltop; # calculated if needed
- # what is the non path part of the url?
- my $top_uri = URI->new($url);
- $top_uri->path_query(""); # reset the path
- my $urltop = $top_uri->as_string;
- $content=~s/(<a(?:\s+(?:class|id)\s*="?\w+"?)?)\s+href=\s*"(#[^"]+)"/$1 href="$baseurl$2"/mig;
- # relative to another wiki page
- $content=~s/(<a(?:\s+(?:class|id)\s*="?\w+"?)?)\s+href=\s*"(?!\w+:)([^\/][^"]*)"/$1 href="$url$2"/mig;
- $content=~s/(<img(?:\s+(?:class|id|width|height)\s*="?\w+"?)*)\s+src=\s*"(?!\w+:)([^\/][^"]*)"/$1 src="$url$2"/mig;
- # relative to the top of the site
- $content=~s/(<a(?:\s+(?:class|id)\s*="?\w+"?)?)\s+href=\s*"(?!\w+:)(\/[^"]*)"/$1 href="$urltop$2"/mig;
- $content=~s/(<img(?:\s+(?:class|id|width|height)\s*="?\w+"?)*)\s+src=\s*"(?!\w+:)(\/[^"]*)"/$1 src="$urltop$2"/mig;
- return $content;
+ my $ret="";
+ eval q{use HTML::Parser; use HTML::Tagset};
+ die $@ if $@;
+ my $p = HTML::Parser->new(api_version => 3);
+ $p->handler(default => sub { $ret.=join("", @_) }, "text");
+ $p->handler(start => sub {
+ my ($tagname, $pos, $text) = @_;
+ if (ref $HTML::Tagset::linkElements{$tagname}) {
+ while (4 <= @$pos) {
+ # use attribute sets from right to left
+ # to avoid invalidating the offsets
+ # when replacing the values
+ my ($k_offset, $k_len, $v_offset, $v_len) =
+ splice(@$pos, -4);
+ my $attrname = lc(substr($text, $k_offset, $k_len));
+ next unless grep { $_ eq $attrname } @{$HTML::Tagset::linkElements{$tagname}};
+ next unless $v_offset; # 0 v_offset means no value
+ my $v = substr($text, $v_offset, $v_len);
+ $v =~ s/^([\'\"])(.*)\1$/$2/;
+ if ($v=~/^#/) {
+ $v=$baseurl.$v; # anchor
+ }
+ elsif ($v=~/^(?!\w+:)[^\/]/) {
+ $v=$url.$v; # relative url
+ }
+ elsif ($v=~/^\//) {
+ if (! defined $urltop) {
+ # what is the non path part of the url?
+ my $top_uri = URI->new($url);
+ $top_uri->path_query(""); # reset the path
+ $urltop = $top_uri->as_string;
+ }
+ $v=$urltop.$v; # url relative to top of site
+ }
+ $v =~ s/\"/&quot;/g; # since we quote with ""
+ substr($text, $v_offset, $v_len) = qq("$v");
+ }
+ }
+ $ret.=$text;
+ }, "tagname, tokenpos, text");
+ $p->parse($html);
+ $p->eof;
+ return $ret;
sub genfeed ($$$$$@) {