summaryrefslogtreecommitdiff
path: root/IkiWiki/Plugin/htmlscrubber.pm
blob: 634674b9c91b858f1e72c0fd6b949f18046ef183 (plain)
  1. #!/usr/bin/perl
  2. package IkiWiki::Plugin::htmlscrubber;
  3. use warnings;
  4. use strict;
  5. use IkiWiki 2.00;
  # Called when the plugin module is loaded: registers sanitize() as this
  # plugin's "sanitize" hook under the id "htmlscrubber".
  sub import { #{{{
      hook(
          type => "sanitize",
          id   => "htmlscrubber",
          call => \&sanitize,
      );
  } # }}}
  # Sanitize hook callback. Receives named parameters and returns the
  # value of the "content" parameter after it has been passed through the
  # shared scrubber object.
  sub sanitize (@) { #{{{
      my %args = @_;
      my $html = $args{content};
      return scrubber()->scrub($html);
  } # }}}
  # Cached HTML::Scrubber instance; built on the first call to scrubber().
  my $_scrubber;

  # Return the shared HTML::Scrubber object, constructing it on first use.
  #
  # The scrubber allows a fixed list of tags and attributes. The URL-bearing
  # attributes (href, src, action, poster) are only kept when their value
  # matches the link whitelist built below.
  #
  # HTML::Scrubber is loaded lazily; if loading fails, error() is called
  # with the load failure message.
  sub scrubber { #{{{
      return $_scrubber if defined $_scrubber;

      # Only known uri schemes are allowed to avoid all the ways of
      # embedding javascript.
      # List at http://en.wikipedia.org/wiki/URI_scheme
      #
      # Each scheme is passed through quotemeta so that the "." in
      # z39.50r / z39.50s is matched literally rather than as a wildcard.
      my $uri_schemes = join("|", map { quotemeta } (
          # IANA registered schemes
          "http", "https", "ftp", "mailto", "file", "telnet", "gopher",
          "aaa", "aaas", "acap", "cap", "cid", "crid",
          "dav", "dict", "dns", "fax", "go", "h323", "im", "imap",
          "ldap", "mid", "news", "nfs", "nntp", "pop", "pres",
          "sip", "sips", "snmp", "tel", "urn", "wais", "xmpp",
          "z39.50r", "z39.50s",
          # Selected unofficial schemes
          "about", "aim", "callto", "cvs", "ed2k", "feed", "fish", "gg",
          "irc", "ircs", "lastfm", "ldaps", "magnet", "mms",
          "msnim", "notes", "rsync", "secondlife", "skype", "ssh",
          "sftp", "sms", "steam", "webcal", "ymsgr",
      ));

      # data is a special case. Allow data:image/*, but
      # disallow data:text/javascript and everything else.
      #
      # The scheme alternation is wrapped in its own group so that the
      # trailing ":" is required after every scheme. Without the group the
      # colon would bind only to the last alternative, and any value that
      # merely starts with a scheme name would be accepted.
      #
      # The final alternative accepts values that contain no colon at all
      # (i.e. links without a scheme).
      #
      # NOTE(review): data:image/ admits every image subtype (including
      # svg+xml); consider narrowing to specific raster types -- confirm
      # against the project's security requirements.
      my $link = qr/^(?:(?:$uri_schemes):|data:image\/|[^:]+$)/i;

      # Load HTML::Scrubber only when a scrubber is actually needed.
      eval q{use HTML::Scrubber};
      error($@) if $@;

      # Lists based on http://feedparser.org/docs/html-sanitization.html
      # With html 5 video and audio tags added.
      $_scrubber = HTML::Scrubber->new(
          allow => [qw{
              a abbr acronym address area b big blockquote br br/
              button caption center cite code col colgroup dd del
              dfn dir div dl dt em fieldset font form h1 h2 h3 h4
              h5 h6 hr hr/ i img input ins kbd label legend li map
              menu ol optgroup option p p/ pre q s samp select small
              span strike strong sub sup table tbody td textarea
              tfoot th thead tr tt u ul var
              video audio
          }],
          default => [undef, { (
              map { $_ => 1 } qw{
                  abbr accept accept-charset accesskey
                  align alt axis border cellpadding cellspacing
                  char charoff charset checked cite class
                  clear cols colspan color compact coords
                  datetime dir disabled enctype for frame
                  headers height hreflang hspace id ismap
                  label lang longdesc maxlength media method
                  multiple name nohref noshade nowrap prompt
                  readonly rel rev rows rowspan rules scope
                  selected shape size span start summary
                  tabindex target title type usemap valign
                  value vspace width
                  autoplay loopstart loopend end
                  playcount controls
              } ),
              "/" => 1, # emit proper <hr /> XHTML
              # URL-valued attributes must match the link whitelist.
              href => $link,
              src => $link,
              action => $link,
              poster => $link,
          }],
      );
      return $_scrubber;
  } # }}}
  77. 1