From 6f98b4dfb2b80e3eb3072dd3f140351581746db2 Mon Sep 17 00:00:00 2001 From: Jonas Smedegaard Date: Tue, 30 Dec 2014 01:50:08 +0100 Subject: Move mediawiki tweaks to separate scripts. --- Makefile | 18 ++++-------------- mediawiki-blockquote | 19 +++++++++++++++++++ mediawiki-extract | 22 ++++++++++++++++++++++ mediawiki-trim | 20 ++++++++++++++++++++ mediawiki-uri-escape | 24 ++++++++++++++++++++++++ 5 files changed, 89 insertions(+), 14 deletions(-) create mode 100755 mediawiki-blockquote create mode 100755 mediawiki-extract create mode 100755 mediawiki-trim create mode 100755 mediawiki-uri-escape diff --git a/Makefile b/Makefile index c0da69d..be72240 100644 --- a/Makefile +++ b/Makefile @@ -2,12 +2,6 @@ stem = eut source_baseurl = http://euwiki.org/ source_basename = EUT/2nd-edition -# work around in mediawiki bug parsing blockquote tags on same line -re_blockquote = s|\n*()\n*|\n$$1\n|g - -# work around modest URL parsing -re_urlencode = s|\bhttps?://$$RE{net}{domain}{-nospace}{-rfc1101}(?::\d+)?/\K(\S*?)(?=[.,;]?[\s<\"\|}])|uri_escape(uri_unescape($$1), "^A-Za-z0-9/\.")|eg - templates = template.tex header.tex before.tex after.tex filters = ./pandoc-memoir ./pandoc-cs1 @@ -49,14 +43,10 @@ download: wget -O $(stem).raw '$(source_baseurl)w/index.php?title=$(source_basename)&action=edit' $(stem).mediawiki: $(stem).raw - perl -C -0777 -MHTML::Entities -MRegexp::Common=net -MURI::Escape -p \ - -e 's|.*]*>||s; s|||s;' \ - -e '$(re_blockquote);' \ - -e '$(re_urlencode);' \ - < $< > $@ + ./mediawiki-extract $< $@ + ./mediawiki-trim $@ + ./mediawiki-blockquote $@ + ./mediawiki-uri-escape $@ $(stem).native: $(stem).mediawiki $(localfilters) pandoc -f mediawiki $(args_meta) $(args_filter) -o $@ $< diff --git a/mediawiki-blockquote b/mediawiki-blockquote new file mode 100755 index 0000000..46de0cd --- /dev/null +++ b/mediawiki-blockquote @@ -0,0 +1,19 @@ +#!/usr/bin/perl + +# work around in mediawiki bug parsing blockquote tags on same line + +use warnings; +use strict; + +use Path::Tiny; + +my $infile = shift; +my $outfile = shift || $infile; + +$_ = path($infile)->slurp_utf8; + +s!\n*()\n*!\n$1\n!g; + +path($outfile)->spew_utf8($_); + +1; diff --git a/mediawiki-extract b/mediawiki-extract new file mode 100755 index 0000000..c2dae68 --- /dev/null +++ b/mediawiki-extract @@ -0,0 +1,22 @@ +#!/usr/bin/perl + +# extract and decode mediawiki content from HTML source view + +use warnings; +use strict; + +use Path::Tiny; +use HTML::Entities; + +my $infile = shift; +my $outfile = shift || $infile; + +$_ = path($infile)->slurp_utf8; + +s!.*]*>!!s; +s!spew_utf8($_); + +1; diff --git a/mediawiki-trim b/mediawiki-trim new file mode 100755 index 0000000..a67d205 --- /dev/null +++ b/mediawiki-trim @@ -0,0 +1,20 @@ +#!/usr/bin/perl + +# drop content above initial headline - i.e. treat it as editorial noise + +use warnings; +use strict; + +use Path::Tiny; +use HTML::Entities; + +my $infile = shift; +my $outfile = shift || $infile; + +$_ = path($infile)->slurp_utf8; + +s!.*?\n= != !s; + +path($outfile)->spew_utf8($_); + +1; diff --git a/mediawiki-uri-escape b/mediawiki-uri-escape new file mode 100755 index 0000000..40dd10c --- /dev/null +++ b/mediawiki-uri-escape @@ -0,0 +1,24 @@ +#!/usr/bin/perl + +# work around modest URL parsing + +use warnings; +use strict; + +use Path::Tiny; +use Regexp::Common qw(net); +use URI::Escape; + +my $infile = shift; +my $outfile = shift || $infile; + +$_ = path($infile)->slurp_utf8; + +my $uri_path = qr!https?://$RE{net}{domain}{-nospace}(?::\d+)?\K(/\S*?)!; +my $uri_end = qr!(?=[.,;]?[\s<\"\|}])!; + +s|\b$uri_path$uri_end|uri_escape(uri_unescape($1), "^A-Za-z0-9/\.")|eg; + +path($outfile)->spew_utf8($_); + +1; -- cgit v1.2.3