summary refs log tree commit diff
path: root/mktxt2text-default
diff options
context:
space:
mode:
author Jonas Smedegaard <dr@jones.dk> 2013-05-10 20:46:04 +0200
committer Jonas Smedegaard <dr@jones.dk> 2013-05-10 20:46:04 +0200
commit 27d338dec0428bc22e2838eb8641c6e0d1681e22 (patch)
tree 18eeb76e189ce03838dd9b23194f1c707b1eabbb /mktxt2text-default
Include mk* scripts
Diffstat (limited to 'mktxt2text-default')
-rwxr-xr-x mktxt2text-default 45
1 files changed, 45 insertions, 0 deletions
diff --git a/mktxt2text-default b/mktxt2text-default
new file mode 100755
index 0000000..aaeb542
--- /dev/null
+++ b/mktxt2text-default
@@ -0,0 +1,45 @@
+#!/usr/bin/perl
+
+# normalize text
+
+use Getopt::Long;
+use File::Slurp;
+use Lingua::Sentence;
+
+use strict;
+use warnings;
+
+# setup sentence splitter
+my $splitter = Lingua::Sentence->new("en");
+
+# --force/-f is parsed into $force; nothing below acts on it yet
+my $force;
+GetOptions( 'force|f' => \$force ) or die "Usage: $0 [--force|-f] STEM\n";
+my $stem = shift;
+die "Usage: $0 [--force|-f] STEM\n" unless defined $stem and length $stem;
+$_ = read_file( $stem . '.txt' );    # input is STEM.txt, output is STEM.mdwn
+
+# page header: drop lines holding only digits or "EN", and inline EN markers
+s/^(?:\d+|EN)\n+//mg;
+s/EN\sEN//mg;
+s/EN\s\d+\sEN//mg;
+
+# headline
+s/^(SECTION\h+\d+[a-z]?)/\n==$1==\n\n/mg;
+s/^(TITLE\h+\d+[a-z]?)/\n===$1===\n\n/mg;
+s/^(ANNEX\h+[IVX]+)$/\n===$1===\n\n/mg;
+s/^\'?(Article\h+\d+[a-z]?)$/\n\n====$1====\n\n/mg;
+s/^(?!\.)(Rule\h+\d+[a-z]?)\s+(\S.*)/\n====$1====\n\n$2\n\n/mg;
+
+# list item
+s/^\'(\d+\.)\s+/$1\n/mg;
+
+# join non-headline multi-lines, split into sentences, and split after comma, colon and semi-colon
+s/([^\n=])\n/$1 /g;
+$_ = $splitter->split($_);
+# break after , : ; unless "=" (headline markup) follows on the same line
+s/([,:;])\s(?=[^\v=]*\v)/$1\n/g;
+
+write_file( $stem . '.mdwn', $_ );
+
+print "DONE: $0 stem $stem\n";