#!/usr/bin/perl

# mktxt2text-1 - normalize text
#
# Reads STEM.txt, strips page headers, marks up TITLE headlines and
# numbered paragraphs, joins wrapped lines, splits the result into
# sentences (via Lingua::Sentence) and breaks lines after commas, then
# writes the result to STEM.mdwn.
#
# Usage: mktxt2text-1 [--force|-f] STEM
#
# Provenance: introduced as "mktxt2text-1" (mode 100755) in commit
# 27d338dec0428bc22e2838eb8641c6e0d1681e22, "Include mk* scripts",
# by Jonas Smedegaard, Fri, 10 May 2013 20:46:04 +0200.

use strict;
use warnings;

use Getopt::Long;
use File::Slurp;
use Lingua::Sentence;

my $usage = "Usage: $0 [--force|-f] STEM\n";

# setup sentence splitter
my $splitter = Lingua::Sentence->new("en");

# --force is accepted for command-line compatibility; nothing in this
# script reads it yet.  Binding it to a lexical (instead of passing no
# destination) keeps Getopt::Long from falling back to a package global.
my $force;
GetOptions( 'force|f' => \$force )
    or die $usage;

# STEM is the input/output path without its extension.
my $stem = shift @ARGV;
die $usage
    unless defined $stem && length $stem;

my $text = read_file( $stem . '.txt' );

# page header: drop lines consisting solely of the issue date, the journal
# name or the language code, together with any blank lines that follow.
$text =~ s/^(?:24\.3\.2009|Official Journal of the European Union|EN)\n+//mg;

# headline: wrap "TITLE <roman numeral>" in '#' markers.
# The numerals are listed so that a longer numeral is always tried before
# any shorter numeral that is a prefix of it (e.g. XIV before X); keep that
# order when extending the list.
# NOTE(review): the trailing empty alternative lets "TITLE" followed only by
# horizontal whitespace match as well; kept as in the original -- confirm
# whether that is intended.
my $roman_numerals = join '|', qw(
    XX XIX XVIII XVII XVI XV XIV XIII XII XI
    X  IX  VIII  VII  VI  V  IV  III  II  I
);
$text =~ s/^(TITLE\h+(?:$roman_numerals|))/\n#$1#\n\n/mg;
#s/^(Article\h+\d+[a-z]?)$/\n\n##$1##\n\n/mg;

# numbered paragraph: put a blank line before "(N)" and pull the text that
# follows (possibly on a later line) directly behind the marker.
# The text part is optional, so $2 may be undefined; default it to the empty
# string to avoid an "uninitialized value" warning.
$text =~ s{^(\(\d+\))(?:\s+(\S.*))?}{ "\n\n$1" . ( $2 // '' ) }mge;
#s/^(SECTION\h+\d+[a-z]?)/\n==$1==\n\n/mg;
#s/^(TITLE\h+\d+[a-z]?)/\n#$1#\n\n/mg;
#s/^(ANNEX\h+[IVX]+)$/\n===$1===\n\n/mg;
#s/^(?!\.)(Rule\h+\d+[a-z]?)\s+(\S.*)/\n====$1====\n\n$2\n\n/mg;

# join non-headline multi-lines: a newline is turned into a space unless it
# is preceded by another newline or by '='.
$text =~ s/([^\n=])\n/$1 /g;

# split into sentences
$text = $splitter->split($text);

# split after comma, as long as a vertical-whitespace character follows
# later with no '=' in between.
$text =~ s/,\s(?=[^\v=]*\v)/,\n/mg;

write_file( $stem . '.mdwn', $text );

print "DONE: $0 stem $stem\n";