summaryrefslogtreecommitdiff
path: root/doc/tips/convert_mediawiki_to_ikiwiki
diff options
context:
space:
mode:
authorJoey Hess <joey@gnu.kitenet.net>2009-04-04 17:27:48 -0400
committerJoey Hess <joey@gnu.kitenet.net>2009-04-04 17:27:48 -0400
commit8e92468eae9ac0ab8161a0c71ff6c6a0a8aef07a (patch)
tree9e26465e0ca98a5f3cbc6c72a0cace4bf83b93db /doc/tips/convert_mediawiki_to_ikiwiki
parent78a69e5bd632eb86ef8135e9c1d05d2c48b43362 (diff)
parent08fda4c9d374de1d3de3172a192d4d915d3dc0c1 (diff)
Merge branch 'master'
Conflicts: doc/ikiwiki-makerepo.mdwn
Diffstat (limited to 'doc/tips/convert_mediawiki_to_ikiwiki')
-rw-r--r--doc/tips/convert_mediawiki_to_ikiwiki/discussion.mdwn612
1 files changed, 612 insertions, 0 deletions
diff --git a/doc/tips/convert_mediawiki_to_ikiwiki/discussion.mdwn b/doc/tips/convert_mediawiki_to_ikiwiki/discussion.mdwn
new file mode 100644
index 000000000..15ddccb92
--- /dev/null
+++ b/doc/tips/convert_mediawiki_to_ikiwiki/discussion.mdwn
@@ -0,0 +1,612 @@
+The u32 page is excellent, but I wonder if documenting the procedure here
+would be worthwhile. Who knows, the remote site might disappear. But also
+there are some variations on the approach that might be useful:
+
+ * using a python script and the dom library to extract the page names from
+ Special:Allpages (such as
+ <http://www.staff.ncl.ac.uk/jon.dowland/unix/docs/get_pagenames.py>)
+ * Or, querying the mysql back-end to get the names
+ * using WWW::MediaWiki for importing/exporting pages from the wiki, instead
+ of Special:Export
+
+Also, some detail on converting mediawiki transclusion to ikiwiki inlines...
+
+-- [[users/Jon]]
+
+> "Who knows, the remote site might disappear.". Right now, it appears to
+> have done just that. -- [[users/Jon]]
+
+
+The iki-fast-load ruby script from the u32 page is given below:
+
+ #!/usr/bin/env ruby
+
+ # This script is called on the final sorted, de-spammed revision
+ # XML file.
+ #
+ # It doesn't currently check for no-op revisions... I believe
+ # that git-fast-import will dutifully load them even though nothing
+ # happened. I don't care to solve this by adding a file cache
+ # to this script. You can run iki-diff-next.rb to highlight any
+ # empty revisions that need to be removed.
+ #
+ # This turns each node into an equivalent file.
+ # It does not convert spaces to underscores in file names.
+ # This would break wikilinks.
+ # I suppose you could fix this with mod_speling or mod_rewrite.
+ #
+ # It replaces nodes in the Image: namespace with the files themselves.
+
+
+ require 'rubygems'
+ require 'node-callback'
+ require 'time'
+ require 'ostruct'
+
+
+ # pipe is the stream to receive the git-fast-import commands
+ # putfrom is true if this branch has existing commits on it, false if not.
+ def format_git_commit(pipe, f)
+ # Need to escape backslashes and double-quotes for git?
+ # No, git breaks when I do this.
+ # For the filename "path with \\", git sez: bad default revision 'HEAD'
+ # filename = '"' + filename.gsub('\\', '\\\\\\\\').gsub('"', '\\"') + '"'
+
+ # In the calls below, length must be the size in bytes!!
+ # TODO: I haven't figured out how this works in the land of UTF8 and Ruby 1.9.
+ pipe.puts "commit #{f.branch}"
+ pipe.puts "committer #{f.username} <#{f.email}> #{f.timestamp.rfc2822}"
+ pipe.puts "data #{f.message.length}\n#{f.message}\n"
+ pipe.puts "from #{f.branch}^0" if f.putfrom
+ pipe.puts "M 644 inline #{f.filename}"
+ pipe.puts "data #{f.content.length}\n#{f.content}\n"
+ pipe.puts
+ end
+
+
+Mediawiki.pm - A plugin which supports mediawiki format.
+
+ #!/usr/bin/perl
+ # By Scott Bronson. Licensed under the GPLv2+ License.
+ # Extends Ikiwiki to be able to handle Mediawiki markup.
+ #
+ # To use the Mediawiki Plugin:
+ # - Install Text::MediawikiFormat
+ # - Turn off prefix_directives in your setup file.
+ # (TODO: we probably don't need to do this anymore?)
+ # prefix_directives => 1,
+ # - Add this plugin on Ikiwiki's path (perl -V, look for @INC)
+ # cp mediawiki.pm something/IkiWiki/Plugin
+ # - And enable it in your setup file
+ # add_plugins => [qw{mediawiki}],
+ # - Finally, turn off the link plugin in setup (this is important)
+ # disable_plugins => [qw{link}],
+ # - Rebuild everything (actually, this should be automatic right?)
+ # - Now all files with a .mediawiki extension should be rendered properly.
+
+
+ package IkiWiki::Plugin::mediawiki;
+
+ use warnings;
+ use strict;
+ use IkiWiki 2.00;
+ use URI;
+
+
+ # This is a gross hack... We disable the link plugin so that our
+ # linkify routine is always called. Then we call the link plugin
+ # directly for all non-mediawiki pages. Ouch... Hopefully Ikiwiki
+ # will be updated soon to support multiple link plugins.
+ require IkiWiki::Plugin::link;
+
+ # Even if T:MwF is not installed, we can still handle all the linking.
+ # The user will just see Mediawiki markup rather than formatted markup.
+ eval q{use Text::MediawikiFormat ()};
+ my $markup_disabled = $@;
+
+ # Work around a UTF8 bug in Text::MediawikiFormat
+ # http://rt.cpan.org/Public/Bug/Display.html?id=26880
+ unless($markup_disabled) {
+ no strict 'refs';
+ no warnings;
+ *{'Text::MediawikiFormat::uri_escape'} = \&URI::Escape::uri_escape_utf8;
+ }
+
+ my %metaheaders; # keeps track of redirects for pagetemplate.
+ my %tags; # keeps track of tags for pagetemplate.
+
+
+ sub import { #{{{
+     # Register every ikiwiki hook this plugin implements, all under the
+     # id "mediawiki". Kept as an ordered list so registration order is fixed.
+     my @handlers = (
+         checkconfig  => \&checkconfig,
+         scan         => \&scan,
+         linkify      => \&linkify,
+         htmlize      => \&htmlize,
+         pagetemplate => \&pagetemplate,
+     );
+     while (my ($type, $call) = splice(@handlers, 0, 2)) {
+         hook(type => $type, id => "mediawiki", call => $call);
+     }
+ } # }}}
+
+
+ # checkconfig hook. The link plugin itself is disabled in the setup file
+ # (see the notes at the top of this file), so hand the call straight
+ # through to the link plugin's own checkconfig with the same arguments.
+ sub checkconfig
+ {
+ return IkiWiki::Plugin::link::checkconfig(@_);
+ }
+
+
+ # Matches one Mediawiki-style internal link: [[page#anchor|text]]trailing
+ # Captures: 1 = page, 2 = anchor (optional), 3 = link text (optional),
+ # 4 = any letters directly after the closing brackets (may be empty).
+ # The (?=[^!]) lookahead rejects anything that starts with "[[!".
+ my $link_regexp = qr{
+ \[\[(?=[^!]) # beginning of link
+ ([^\n\r\]#|<>]+) # 1: page to link to
+ (?:
+ \# # '#', beginning of anchor
+ ([^|\]]+) # 2: anchor text
+ )? # optional
+
+ (?:
+ \| # followed by '|'
+ ([^\]\|]*) # 3: link text
+ )? # optional
+ \]\] # end of link
+ ([a-zA-Z]*) # optional trailing alphas
+ }x;
+
+
+ # Replace every space in the given string with an underscore.
+ # A false argument (undef, "" or "0") is handed back untouched, so
+ # passing undef is safe and produces no warning.
+ sub underscorize
+ {
+     my ($text) = @_;
+     return $text unless $text;
+     $text =~ tr{ }{_};
+     return $text;
+ }
+
+
+ # Normalize whitespace: strip leading and trailing whitespace and
+ # collapse every internal run of whitespace into a single space.
+ # This does NOT produce underscores; call underscorize() on the
+ # result when underscores are wanted.
+ # A false argument (undef, "" or "0") is returned unchanged.
+ sub scrunch
+ {
+     my ($text) = @_;
+     return $text unless $text;
+
+     $text =~ s/^\s+|\s+$//g; # strip leading and trailing space
+     $text =~ s/\s+/ /g; # squash multiple spaces to one
+     return $text;
+ }
+
+
+ # Translates a Mediawiki link target into an Ikiwiki path.
+ # This has to be careful because Mediawiki and Ikiwiki handle
+ # relative vs. absolute links exactly opposite from each other.
+ #
+ # $page is the page containing the link, $path the link target.
+ # Returns the resolved path segments joined with "/", with no leading
+ # slash (see the temporary hack noted at the end).
+ sub translate_path
+ {
+     my ($page, $path) = @_;
+     $path = scrunch($path);
+
+     # Resolve from the root unless the target starts with "/" or "..",
+     # in which case it is resolved against the containing page.
+     if ($path !~ /^(?:\/|\.\.)/) {
+         $page = "/";
+     }
+
+     my @parts;
+     for my $segment (split(/\//, "$page/$path")) {
+         next if $segment eq "";
+         if ($segment eq '..') {
+             pop @parts;
+             next;
+         }
+         push @parts, $segment;
+     }
+
+     # temporary hack working around http://ikiwiki.info/bugs/Can__39__t_create_root_page/index.html?updated
+     # put this back the way it was once this bug is fixed upstream.
+     # This is actually a major problem because now Mediawiki pages can't link from /Git/git-svn to /git-svn. And upstream appears to be uninterested in fixing this bug. :(
+     # return "/" . join("/", @parts);
+     return join("/", @parts);
+ }
+
+
+ # Figures out the human-readable text for a wikilink
+ #
+ # Arguments: the page containing the link, the raw link target, the
+ # anchor, the title (text after '|'), and any trailing letters that
+ # followed the closing brackets. Returns the text to display.
+ sub linktext
+ {
+ my($page, $inlink, $anchor, $title, $trailing) = @_;
+ my $link = translate_path($page,$inlink);
+
+ # translate_path always produces an absolute link.
+ # get rid of the leading slash before we display this link.
+ # NOTE(review): translate_path currently returns its result without a
+ # leading slash (see its "temporary hack"), so this substitution looks
+ # like a no-op for now -- confirm before relying on the comment above.
+ $link =~ s#^/##;
+
+ my $out = "";
+ if($title) {
+ # An explicit, non-empty title always wins.
+ $out = IkiWiki::pagetitle($title);
+ } else {
+ # A target written with a leading slash is displayed as typed.
+ $link = $inlink if $inlink =~ /^\s*\//;
+ $out = $anchor ? "$link#$anchor" : $link;
+ if(defined $title && $title eq "") {
+ # a bare pipe appeared in the link...
+ # user wants to strip namespace and trailing parens.
+ $out =~ s/^[A-Za-z0-9_-]*://;
+ $out =~ s/\s*\(.*\)\s*$//;
+ }
+ # A trailing slash suppresses the leading slash
+ $out =~ s#^/(.*)/$#$1#;
+ }
+ # Letters that directly followed "]]" are appended to the visible text.
+ $out .= $trailing if defined $trailing;
+ return $out;
+ }
+
+
+ # Map a tag name to the page that represents it: the tag is prefixed
+ # with "$config{tagbase}/" when tagbase is configured, and returned
+ # unchanged otherwise.
+ sub tagpage ($)
+ {
+     my ($tag) = @_;
+
+     return $tag
+         unless exists $config{tagbase} && defined $config{tagbase};
+
+     return $config{tagbase} . "/" . $tag;
+ }
+
+
+ # Pass a URL and optional text associated with it. This call turns
+ # it into fully-formatted HTML the same way Mediawiki would.
+ # Counter is used to number untitled links sequentially on the page.
+ # It should be set to 1 when you start parsing a new page. This call
+ # increments it automatically.
+ #
+ # Arguments: $url, $text (undef means use the URL itself as the text;
+ # "" means an untitled link, shown as "[N]"), and $counter, a REFERENCE
+ # to the scalar holding the next link number.
+ # Returns the <a> element followed by any characters trimmed off the URL.
+ # NOTE(review): $url and $text are interpolated into the HTML without
+ # any escaping here -- confirm that later sanitizing covers this.
+ sub generate_external_link
+ {
+ my $url = shift;
+ my $text = shift;
+ my $counter = shift;
+
+ # Mediawiki trims off trailing commas.
+ # And apparently it does entity substitution first.
+ # Since we can't, we'll fake it.
+
+ # trim any leading and trailing whitespace
+ $url =~ s/^\s+|\s+$//g;
+
+ # url properly terminates on > but must special-case &gt;
+ # The replacement expression stores the matched tail in $trailer and
+ # evaluates to '', so the tail is cut out of $url.
+ my $trailer = "";
+ $url =~ s{(\&(?:gt|lt)\;.*)$}{ $trailer = $1, ''; }eg;
+
+ # Trim some potential trailing chars, put them outside the link.
+ my $tmptrail = "";
+ $url =~ s{([,)]+)$}{ $tmptrail .= $1, ''; }eg;
+ $trailer = $tmptrail . $trailer;
+
+ # The title attribute is the URL after trimming.
+ my $title = $url;
+ if(defined $text) {
+ if($text eq "") {
+ # Untitled link: show the running number and advance the counter.
+ $text = "[$$counter]";
+ $$counter += 1;
+ }
+ # Strip surrounding whitespace, then one leading '|' if present.
+ $text =~ s/^\s+|\s+$//g;
+ $text =~ s/^\|//;
+ } else {
+ $text = $url;
+ }
+
+ return "<a href='$url' title='$title'>$text</a>$trailer";
+ }
+
+
+ # Called to handle fragment-only links like [[#heading]] or [[#a| text ]]
+ #
+ # $url is the fragment, including its leading '#'. $text is the
+ # optional link text; when it is undef or empty the (whitespace-
+ # normalized) fragment itself is displayed.
+ # Returns the HTML <a> element for the fragment.
+ sub generate_fragment_link
+ {
+     my ($url, $text) = @_;
+
+     $url = scrunch($url);
+
+     if (defined($text) && $text ne "") {
+         $text = scrunch($text);
+     } else {
+         $text = $url;
+     }
+
+     # The href uses underscores; the visible text keeps its spaces.
+     $url = underscorize($url);
+
+     # For some reason Mediawiki puts blank titles on all its fragment links.
+     # I don't see why we would duplicate that behavior here.
+     return "<a href='$url'>$text</a>";
+ }
+
+
+ # Handles one internal wikilink that was matched by $link_regexp.
+ #
+ # Arguments: the containing page, the raw link target, the anchor, the
+ # title (text after '|'), the trailing letters after "]]", and $proc, a
+ # callback invoked as $proc->($linkpage, $linktext, $anchor).
+ # Returns whatever $proc returns, except that:
+ #  - a target of exactly ".." is returned re-assembled and untouched;
+ #  - [[Category:X]] records X in %tags for the page, calls $proc for its
+ #    side effects only, and returns "" (no visible link);
+ #  - [[:Category:X]] produces a visible link and does not tag the page.
+ sub generate_internal_link
+ {
+     my($page, $inlink, $anchor, $title, $trailing, $proc) = @_;
+
+     # Ikiwiki's link link plugin wrecks this line when displaying on the site.
+     # Until the code highlighter plugin can turn off link finding,
+     # always escape double brackets in double quotes: [[
+     if($inlink eq '..') {
+         # Mediawiki doesn't touch links like [[..#hi|ho]].
+         return "[[" . $inlink . ($anchor?"#$anchor":"") .
+             ($title?"|$title":"") . "]]" . $trailing;
+     }
+
+     my($linkpage, $linktext);
+     if($inlink =~ /^ (:?) \s* Category (\s* \: \s*) ([^\]]*) $/x) {
+         # Handle category links.
+         # Save the captures right away, before any helper runs a regex.
+         my $link_only = $1;
+         my $sep = $2;
+         $inlink = $3;
+         $linkpage = IkiWiki::linkpage(translate_path($page, $inlink));
+         if($link_only) {
+             # Produce a link but don't add this page to the given category.
+             # (So %tags is deliberately left alone in this branch.)
+             $linkpage = tagpage($linkpage);
+             $linktext = ($title ? '' : "Category$sep") .
+                 linktext($page, $inlink, $anchor, $title, $trailing);
+         } else {
+             # Add this page to the given category but don't produce a link.
+             $tags{$page}{$linkpage} = 1;
+             $proc->(tagpage($linkpage), $linktext, $anchor);
+             return "";
+         }
+     } else {
+         # It's just a regular link
+         $linkpage = IkiWiki::linkpage(translate_path($page, $inlink));
+         $linktext = linktext($page, $inlink, $anchor, $title, $trailing);
+     }
+
+     return $proc->($linkpage, $linktext, $anchor);
+ }
+
+
+ # Detects and handles Mediawiki "#REDIRECT [[target]]" pages.
+ #
+ # Takes the linkify hook parameters (page, destpage, content).
+ # Returns undef when the content is not a redirect, meaning the caller
+ # should render the page normally. Otherwise returns the replacement
+ # content: "" when the redirect page is being rendered into a different
+ # destination page, an error message, or a "Redirecting to ..." notice.
+ # For a redirect rendered on its own page, a meta refresh header is
+ # queued in %metaheaders.
+ sub check_redirect
+ {
+     my %params=@_;
+
+     my $page=$params{page};
+     my $destpage=$params{destpage};
+     my $content=$params{content};
+
+     # Test for a redirect FIRST. Ordinary pages must come back as undef
+     # even when they are rendered into another page; bailing out on
+     # $page ne $destpage before this test blanked every such page.
+     if($content !~ /^ \s* \#REDIRECT \s* \[\[ ( [^\]]+ ) \]\]/x) {
+         # this page isn't a redirect, render it normally.
+         return undef;
+     }
+     my $value = $1;
+
+     # A redirect is only acted on for the page's own rendering.
+     return "" if $page ne $destpage;
+
+     # The rest of this function is copied from the redir clause
+     # in meta::preprocess and actually handles the redirect.
+
+     $value =~ s/^\s+|\s+$//g;
+
+     my $safe=0;
+     if ($value !~ /^\w+:\/\//) {
+         # it's a local link
+         my ($redir_page, $redir_anchor) = split /\#/, $value;
+
+         add_depends($page, $redir_page);
+         my $link=bestlink($page, underscorize(translate_path($page,$redir_page)));
+         if (! length $link) {
+             return "<b>Redirect Error:</b> <nowiki>[[$redir_page]] not found.</nowiki>";
+         }
+
+         $value=urlto($link, $page);
+         $value.='#'.$redir_anchor if defined $redir_anchor;
+         $safe=1;
+
+         # redir cycle detection: follow the recorded redirects from this
+         # page and stop if any page is visited twice.
+         $pagestate{$page}{mediawiki}{redir}=$link;
+         my $at=$page;
+         my %seen;
+         while (exists $pagestate{$at}{mediawiki}{redir}) {
+             if ($seen{$at}) {
+                 return "<b>Redirect Error:</b> cycle found on <nowiki>[[$at]]</nowiki>";
+             }
+             $seen{$at}=1;
+             $at=$pagestate{$at}{mediawiki}{redir};
+         }
+     } else {
+         # it's an external link
+         # NOTE(review): encode_entities (and scrub, below) are neither
+         # defined nor imported in the visible part of this file --
+         # confirm they are in scope, or this branch dies at run time.
+         $value = encode_entities($value);
+     }
+
+     my $redir="<meta http-equiv=\"refresh\" content=\"0; URL=$value\" />";
+     $redir=scrub($redir) if !$safe;
+     push @{$metaheaders{$page}}, $redir;
+
+     return "Redirecting to $value ...";
+ }
+
+
+ # Runs a callback over every part of a string that lies outside
+ # <nowiki>...</nowiki> sections and returns the reassembled string.
+ # The nowiki sections themselves (tags included) are copied through
+ # verbatim; each other part is replaced by the callback's return value.
+ sub skip_nowiki
+ {
+     my ($content, $proc) = @_;
+
+     # The capturing group makes split return the nowiki sections as
+     # well, so the pieces alternate: outside, nowiki, outside, ...
+     my $rewritten = "";
+     my $in_nowiki = 0;
+
+     # The loop intentionally uses the implicit $_: callbacks in this
+     # file assign to $_, and that must only touch this loop's alias.
+     for (split(/(<nowiki[^>]*>.*?<\/nowiki\s*>)/s, $content)) {
+         if ($in_nowiki) {
+             $rewritten .= $_;
+         } else {
+             $rewritten .= $proc->($_);
+         }
+         $in_nowiki = !$in_nowiki;
+     }
+
+     return $rewritten;
+ }
+
+
+ # Converts all links in the page, wiki and otherwise.
+ #
+ # linkify hook; reads the page, destpage and content parameters.
+ # Pages whose type is not 'mediawiki' are passed to the link plugin's
+ # linkify unchanged. Redirect pages are answered by check_redirect.
+ # For everything else, text outside <nowiki> sections is rewritten in
+ # three passes: external links, fragment-only links, internal links.
+ # Returns the rewritten content.
+ sub linkify (@)
+ {
+ my %params=@_;
+
+ my $page=$params{page};
+ my $destpage=$params{destpage};
+ my $content=$params{content};
+
+ my $file=$pagesources{$page};
+ my $type=pagetype($file);
+ # Next number for untitled external links; passed on by reference.
+ my $counter = 1;
+
+ if($type ne 'mediawiki') {
+ return IkiWiki::Plugin::link::linkify(@_);
+ }
+
+ # A defined result means check_redirect produced the content itself.
+ my $redir = check_redirect(%params);
+ return $redir if defined $redir;
+
+ # this code was copied from MediawikiFormat.pm.
+ # Heavily changed because MF.pm screws up escaping when it does
+ # this awful hack: $uricCheat =~ tr/://d;
+ my $schemas = [qw(http https ftp mailto gopher)];
+ my $re = join "|", map {qr/\Q$_\E/} @$schemas;
+ my $schemes = qr/(?:$re)/;
+ # And this is copied from URI:
+ my $reserved = q(;/?@&=+$,); # NOTE: no colon or [] !
+ my $uric = quotemeta($reserved) . $URI::unreserved . "%#";
+
+ my $result = skip_nowiki($content, sub {
+ # Work on $_ so the substitutions below need no explicit target.
+ $_ = shift;
+
+ # Escape any anchors
+ #s/<(a[\s>\/])/&lt;$1/ig;
+ # Disabled because this appears to screw up the aggregate plugin.
+ # I guess we'll rely on Iki to post-sanitize this sort of stuff.
+
+ # Replace external links, http://blah or [http://blah]
+ # $1 is a bare URL; $2 and $3 are the URL and the text of a
+ # bracketed link.
+ s{\b($schemes:[$uric][:$uric]+)|\[($schemes:[$uric][:$uric]+)([^\]]*?)\]}{
+ generate_external_link($1||$2, $3, \$counter)
+ }eg;
+
+ # Handle links that only contain fragments.
+ s{ \[\[ \s* (\#[^|\]'"<>&;]+) (?:\| ([^\]'"<>&;]*))? \]\] }{
+ generate_fragment_link($1, $2)
+ }xeg;
+
+ # Match all internal links
+ # The callback turns each resolved link into HTML via htmllink.
+ s{$link_regexp}{
+ generate_internal_link($page, $1, $2, $3, $4, sub {
+ my($linkpage, $linktext, $anchor) = @_;
+ return htmllink($page, $destpage, $linkpage,
+ linktext => $linktext,
+ anchor => underscorize(scrunch($anchor)));
+ });
+ }eg;
+
+ return $_;
+ });
+
+ return $result;
+ }
+
+
+ # Find all WikiLinks in the page.
+ #
+ # scan hook; reads the page and content parameters. Pages whose type is
+ # not 'mediawiki' are passed to the link plugin's scan. Otherwise every
+ # internal link outside <nowiki> sections is resolved through
+ # generate_internal_link and its target is appended to @{$links{$page}}.
+ sub scan (@)
+ {
+ my %params = @_;
+ my $page=$params{page};
+ my $content=$params{content};
+
+ my $file=$pagesources{$page};
+ my $type=pagetype($file);
+
+ if($type ne 'mediawiki') {
+ return IkiWiki::Plugin::link::scan(@_);
+ }
+
+ # skip_nowiki is used only for its traversal here; the string it
+ # builds is discarded.
+ skip_nowiki($content, sub {
+ $_ = shift;
+ while(/$link_regexp/g) {
+ # Only the link target matters here, so anchor, title and
+ # trailing text are passed as empty strings.
+ generate_internal_link($page, $1, '', '', '', sub {
+ my($linkpage, $linktext, $anchor) = @_;
+ push @{$links{$page}}, $linkpage;
+ return undef;
+ });
+ }
+ return '';
+ });
+ }
+
+
+ # Convert the page to HTML.
+ #
+ # htmlize hook; reads the content parameter and returns it formatted by
+ # Text::MediawikiFormat. If that module failed to load
+ # ($markup_disabled is set), the content is returned unformatted.
+ sub htmlize (@)
+ {
+ my %params=@_;
+ my $page = $params{page};
+ my $content = $params{content};
+
+
+ return $content if $markup_disabled;
+
+ # Do a little preprocessing to babysit Text::MediawikiFormat
+ # If a line begins with tabs, T:MwF won't convert it into preformatted blocks.
+ $content =~ s/^\t/ /mg;
+
+ # Second argument: the tags and attributes allowed through.
+ # Third argument: formatter options.
+ my $ret = Text::MediawikiFormat::format($content, {
+
+ allowed_tags => [#HTML
+ # MediawikiFormat default
+ qw(b big blockquote br caption center cite code dd
+ div dl dt em font h1 h2 h3 h4 h5 h6 hr i li ol p
+ pre rb rp rt ruby s samp small strike strong sub
+ sup table td th tr tt u ul var),
+ # Mediawiki Specific
+ qw(nowiki),
+ # Our additions
+ qw(del ins), # These should have been added all along.
+ qw(span), # Mediawiki allows span but that's rather scary...?
+ qw(a), # this is unfortunate; should handle links after rendering the page.
+ ],
+
+ allowed_attrs => [
+ qw(title align lang dir width height bgcolor),
+ qw(clear), # BR
+ qw(noshade), # HR
+ qw(cite), # BLOCKQUOTE, Q
+ qw(size face color), # FONT
+ # For various lists, mostly deprecated but safe
+ qw(type start value compact),
+ # Tables
+ qw(summary width border frame rules cellspacing
+ cellpadding valign char charoff colgroup col
+ span abbr axis headers scope rowspan colspan),
+ qw(id class name style), # For CSS
+ # Our additions
+ qw(href),
+ ],
+
+ }, {
+ extended => 0,
+ absolute_links => 0,
+ implicit_links => 0
+ });
+
+ return $ret;
+ }
+
+
+ # This is only needed to support the check_redirect call.
+ #
+ # pagetemplate hook; reads the page, destpage and template parameters.
+ # Fills three template variables, each only if the template has it:
+ # "meta" (queued redirect headers), "tags" (links to the page's tag
+ # pages) and "categories" (the page's tag names).
+ sub pagetemplate (@)
+ {
+ my %params = @_;
+ my $page = $params{page};
+ my $destpage = $params{destpage};
+ my $template = $params{template};
+
+ # handle metaheaders for redirects
+ if (exists $metaheaders{$page} && $template->query(name => "meta")) {
+ # avoid duplicate meta lines
+ # (the grep keeps only the first occurrence of each header)
+ my %seen;
+ $template->param(meta => join("\n", grep { (! $seen{$_}) && ($seen{$_}=1) } @{$metaheaders{$page}}));
+ }
+
+ # One { link => ... } entry per tag recorded for this page, sorted by name.
+ $template->param(tags => [
+ map {
+ link => htmllink($page, $destpage, tagpage($_), rel => "tag")
+ }, sort keys %{$tags{$page}}
+ ]) if exists $tags{$page} && %{$tags{$page}} && $template->query(name => "tags");
+
+ # It's an rss/atom template. Add any categories.
+ if ($template->query(name => "categories")) {
+ if (exists $tags{$page} && %{$tags{$page}}) {
+ $template->param(categories => [map { category => $_ },
+ sort keys %{$tags{$page}}]);
+ }
+ }
+ }
+
+ 1