-rw-r--r--	Makefile             | 18
-rwxr-xr-x	mediawiki-blockquote | 19
-rwxr-xr-x	mediawiki-extract    | 22
-rwxr-xr-x	mediawiki-trim       | 20
-rwxr-xr-x	mediawiki-uri-escape | 24
5 files changed, 89 insertions, 14 deletions
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -2,12 +2,6 @@ stem = eut
 source_baseurl = http://euwiki.org/
 source_basename = EUT/2nd-edition
 
-# work around in mediawiki bug parsing blockquote tags on same line
-re_blockquote = s|\n*(</?blockquote>)\n*|\n$$1\n|g
-
-# work around modest URL parsing <https://github.com/jgm/pandoc/issues/1838>
-re_urlencode = s|\bhttps?://$$RE{net}{domain}{-nospace}{-rfc1101}(?::\d+)?/\K(\S*?)(?=[.,;]?[\s<\"\|}])|uri_escape(uri_unescape($$1), "^A-Za-z0-9/\.")|eg
-
 templates = template.tex header.tex before.tex after.tex
 filters = ./pandoc-memoir ./pandoc-cs1
 
@@ -49,14 +43,10 @@ download:
 	wget -O $(stem).raw '$(source_baseurl)w/index.php?title=$(source_basename)&action=edit'
 
 $(stem).mediawiki: $(stem).raw
-	perl -C -0777 -MHTML::Entities -MRegexp::Common=net -MURI::Escape -p \
-		-e 's|.*<textarea[^>]*>||s; s|</textarea.*||s;' \
-		-e 'decode_entities($$_);' \
-		-e 's|.*?\n= |= |s;' \
-		-e 's|<!--.*-->||s;' \
-		-e '$(re_blockquote);' \
-		-e '$(re_urlencode);' \
-		< $< > $@
+	./mediawiki-extract $< $@
+	./mediawiki-trim $@
+	./mediawiki-blockquote $@
+	./mediawiki-uri-escape $@
 
 $(stem).native: $(stem).mediawiki $(localfilters)
 	pandoc -f mediawiki $(args_meta) $(args_filter) -o $@ $<
diff --git a/mediawiki-blockquote b/mediawiki-blockquote
new file mode 100755
index 0000000..46de0cd
--- /dev/null
+++ b/mediawiki-blockquote
@@ -0,0 +1,19 @@
+#!/usr/bin/perl
+
+# work around a mediawiki bug parsing blockquote tags on the same line
+
+use warnings;
+use strict;
+
+use Path::Tiny;
+
+my $infile = shift;
+my $outfile = shift || $infile;
+
+$_ = path($infile)->slurp_utf8;
+
+s!\n*(</?blockquote>)\n*!\n$1\n!g;
+
+path($outfile)->spew_utf8($_);
+
+1;
diff --git a/mediawiki-extract b/mediawiki-extract
new file mode 100755
index 0000000..c2dae68
--- /dev/null
+++ b/mediawiki-extract
@@ -0,0 +1,22 @@
+#!/usr/bin/perl
+
+# extract and decode mediawiki content from HTML source view
+
+use warnings;
+use strict;
+
+use Path::Tiny;
+use HTML::Entities;
+
+my $infile = shift;
+my $outfile = shift || $infile;
+
+$_ = path($infile)->slurp_utf8;
+
+s!.*<textarea[^>]*>!!s;
+s!</textarea.*!!s;
+decode_entities($_);
+
+path($outfile)->spew_utf8($_);
+
+1;
diff --git a/mediawiki-trim b/mediawiki-trim
new file mode 100755
index 0000000..a67d205
--- /dev/null
+++ b/mediawiki-trim
@@ -0,0 +1,20 @@
+#!/usr/bin/perl
+
+# drop content above the initial headline - i.e. treat it as editorial noise
+
+use warnings;
+use strict;
+
+use Path::Tiny;
+use HTML::Entities;
+
+my $infile = shift;
+my $outfile = shift || $infile;
+
+$_ = path($infile)->slurp_utf8;
+
+s!.*?\n= != !s;
+
+path($outfile)->spew_utf8($_);
+
+1;
diff --git a/mediawiki-uri-escape b/mediawiki-uri-escape
new file mode 100755
index 0000000..40dd10c
--- /dev/null
+++ b/mediawiki-uri-escape
@@ -0,0 +1,24 @@
+#!/usr/bin/perl
+
+# work around pandoc's modest URL parsing <https://github.com/jgm/pandoc/issues/1838>
+
+use warnings;
+use strict;
+
+use Path::Tiny;
+use Regexp::Common qw(net);
+use URI::Escape;
+
+my $infile = shift;
+my $outfile = shift || $infile;
+
+$_ = path($infile)->slurp_utf8;
+
+my $uri_path = qr!https?://$RE{net}{domain}{-nospace}(?::\d+)?\K(/\S*?)!;
+my $uri_end = qr!(?=[.,;]?[\s<\"\|}])!;
+
+s|\b$uri_path$uri_end|uri_escape(uri_unescape($1), "^A-Za-z0-9/\.")|eg;
+
+path($outfile)->spew_utf8($_);
+
+1;
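
Each of the four new helper scripts takes an input file and an optional output file (my $outfile = shift || $infile), so a single argument means rewrite the file in place; that is how the Makefile rule chains them. A minimal sketch of running the same pipeline by hand, using the stem = eut file names from the Makefile:

    ./mediawiki-extract eut.raw eut.mediawiki   # pull the wikitext out of the <textarea> and decode entities
    ./mediawiki-trim eut.mediawiki              # in place: drop everything above the first "= " headline
    ./mediawiki-blockquote eut.mediawiki        # in place: put <blockquote> tags on their own lines
    ./mediawiki-uri-escape eut.mediawiki        # in place: percent-escape URL paths for pandoc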