summaryrefslogtreecommitdiff
path: root/IkiWiki/Plugin/htmlscrubber.pm
blob: 25caa8a506cdf5ab8ea0c9b0d3e9e935f62bd6f8 (plain)
  1. #!/usr/bin/perl
  2. package IkiWiki::Plugin::htmlscrubber;
  3. use warnings;
  4. use strict;
  5. use IkiWiki 2.00;
  6. sub import { #{{{
  7. hook(type => "sanitize", id => "htmlscrubber", call => \&sanitize);
  8. } # }}}
  9. sub sanitize (@) { #{{{
  10. my %params=@_;
  11. return scrubber()->scrub($params{content});
  12. } # }}}
  13. my $_scrubber;
  14. sub scrubber { #{{{
  15. return $_scrubber if defined $_scrubber;
  16. # Only known uri schemes are allowed to avoid all the ways of
  17. # embedding javascrpt.
  18. # List at http://en.wikipedia.org/wiki/URI_scheme
  19. my $uri_schemes=join("|",
  20. # IANA registered schemes
  21. "http", "https", "ftp", "mailto", "file", "telnet", "gopher",
  22. "aaa", "aaas", "acap", "cap", "cid", "crid",
  23. "dav", "dict", "dns", "fax", "go", "h323", "im", "imap",
  24. "ldap", "mid", "news", "nfs", "nntp", "pop", "pres",
  25. "sip", "sips", "snmp", "tel", "urn", "wais", "xmpp",
  26. "z39.50r", "z39.50s",
  27. # data is a special case. Allow data:text/<image>, but
  28. # disallow data:text/javascript and everything else.
  29. qr/data:text\/(?:png|gif|jpeg)/,
  30. # Selected unofficial schemes
  31. "about", "aim", "callto", "cvs", "ed2k", "feed", "fish", "gg",
  32. "irc", "ircs", "lastfm", "ldaps", "magnet", "mms",
  33. "msnim", "notes", "rsync", "secondlife", "skype", "ssh",
  34. "sftp", "sms", "steam", "webcal", "ymsgr",
  35. );
  36. my $link=qr/^(?:$uri_schemes:|[^:]+$)/i;
  37. eval q{use HTML::Scrubber};
  38. error($@) if $@;
  39. # Lists based on http://feedparser.org/docs/html-sanitization.html
  40. # With html 5 video and audio tags added.
  41. $_scrubber = HTML::Scrubber->new(
  42. allow => [qw{
  43. a abbr acronym address area b big blockquote br br/
  44. button caption center cite code col colgroup dd del
  45. dfn dir div dl dt em fieldset font form h1 h2 h3 h4
  46. h5 h6 hr hr/ i img input ins kbd label legend li map
  47. menu ol optgroup option p p/ pre q s samp select small
  48. span strike strong sub sup table tbody td textarea
  49. tfoot th thead tr tt u ul var
  50. video audio
  51. }],
  52. default => [undef, { (
  53. map { $_ => 1 } qw{
  54. abbr accept accept-charset accesskey
  55. align alt axis border cellpadding cellspacing
  56. char charoff charset checked cite class
  57. clear cols colspan color compact coords
  58. datetime dir disabled enctype for frame
  59. headers height hreflang hspace id ismap
  60. label lang longdesc maxlength media method
  61. multiple name nohref noshade nowrap prompt
  62. readonly rel rev rows rowspan rules scope
  63. selected shape size span start summary
  64. tabindex target title type usemap valign
  65. value vspace width
  66. autoplay loopstart loopend end
  67. playcount controls
  68. } ),
  69. "/" => 1, # emit proper <hr /> XHTML
  70. href => $link,
  71. src => $link,
  72. action => $link,
  73. poster => $link,
  74. }],
  75. );
  76. return $_scrubber;
  77. } # }}}
  78. 1