diff options
author | Jonas Smedegaard <dr@jones.dk> | 2013-05-10 20:46:04 +0200 |
---|---|---|
committer | Jonas Smedegaard <dr@jones.dk> | 2013-05-10 20:46:04 +0200 |
commit | 27d338dec0428bc22e2838eb8641c6e0d1681e22 (patch) | |
tree | 18eeb76e189ce03838dd9b23194f1c707b1eabbb /mktxt2text-default |
Include mk* scripts
Diffstat (limited to 'mktxt2text-default')
-rwxr-xr-x | mktxt2text-default | 45 |
1 files changed, 45 insertions, 0 deletions
#!/usr/bin/perl

# Normalize a plain-text document into "="-delimited headline markup.
#
# Usage: mktxt2text-default [--force|-f] STEM
#
# Reads STEM.txt, strips page-header residue, turns structural lines
# (SECTION, TITLE, ANNEX, Article, Rule) into headlines, re-flows the
# remaining text into one sentence per line (with additional line breaks
# after commas, colons and semicolons), and writes the result to STEM.mdwn.

use strict;
use warnings;

use Getopt::Long;
use File::Slurp;
use Lingua::Sentence;

my $usage = "Usage: $0 [--force|-f] STEM\n";

# --force is parsed into $force but nothing below consults it yet; it is
# bound explicitly so Getopt::Long does not fall back to a package global.
my $force;
GetOptions( 'force|f' => \$force )
    or die $usage;

# The stem is the input/output path without its extension.
my $stem = shift;
die $usage unless defined $stem;

# setup sentence splitter (English rules)
my $splitter = Lingua::Sentence->new('en');

my $text = read_file( $stem . '.txt' );

# page header: drop lines consisting only of a number or "EN", plus
# inline "EN EN" / "EN <number> EN" residue.
# (Non-capturing group; the original "(:?" matched an optional colon.)
$text =~ s/^(?:\d+|EN)\n+//mg;
$text =~ s/EN\sEN//g;
$text =~ s/EN\s\d+\sEN//g;

# headline: wrap structural markers in "=" runs, surrounded by blank lines.
# More "=" means a deeper level: SECTION (2), TITLE/ANNEX (3),
# Article/Rule (4).
$text =~ s/^(SECTION\h+\d+[a-z]?)/\n==$1==\n\n/mg;
$text =~ s/^(TITLE\h+\d+[a-z]?)/\n===$1===\n\n/mg;
$text =~ s/^(ANNEX\h+[IVX]+)$/\n===$1===\n\n/mg;
$text =~ s/^\'?(Article\h+\d+[a-z]?)$/\n\n====$1====\n\n/mg;

# A Rule headline carries its first text on the same line; move that text
# into its own paragraph below the headline.
$text =~ s/^(?!\.)(Rule\h+\d+[a-z]?)\s+(\S.*)/\n====$1====\n\n$2\n\n/mg;

# list item: strip the leading quote from numbered items and put the
# number on its own line.
$text =~ s/^\'(\d+\.)\s+/$1\n/mg;

# join non-headline multi-lines: a newline is kept only after another
# newline or after "=" (i.e. blank lines and headline ends survive).
$text =~ s/([^\n=])\n/$1 /g;

# split into sentences (the splitter is called in scalar context and its
# result is used as the new text)
$text = $splitter->split($text);

# split after comma, colon and semi-colon, in that order, but only where
# the rest of the line contains no "=" (so headlines are never broken).
for my $mark ( ',', ':', ';' ) {
    $text =~ s/\Q$mark\E\s(?=[^\v=]*\v)/$mark\n/g;
}

write_file( $stem . '.mdwn', $text );

print "DONE: $0 stem $stem\n";