blob: aaeb542b649a0751e3fe4b5854de478a928529af (
plain)
#!/usr/bin/perl
# Normalize a plain-text document into line-oriented markup.
#
# Usage: <script> [--force|-f] STEM
#
# Reads STEM.txt, strips page headers, wraps SECTION / TITLE / ANNEX /
# Article / Rule headlines in '=' markers, joins wrapped lines, splits the
# text into sentences and breaks lines after commas, colons and semicolons.
# The result is written to STEM.mdwn (an existing file is overwritten).
use strict;
use warnings;

use Getopt::Long;
use File::Slurp;
use Lingua::Sentence;

my $USAGE = "Usage: $0 [--force|-f] STEM\n";

# Sentence splitter configured for English.
my $splitter = Lingua::Sentence->new("en");

# --force/-f is accepted for command-line compatibility; nothing below
# consults it yet, so it currently has no effect.
my $force;
GetOptions( 'force|f' => \$force )
    or die $USAGE;

# The only positional argument is the file name without its extension.
my $stem = shift;
die $USAGE
    unless defined $stem && length $stem;

my $text = read_file( $stem . '.txt' );

# --- page headers -----------------------------------------------------
# Drop lines that consist solely of a page number or the marker "EN",
# together with the blank lines that follow them.
$text =~ s/^(?:\d+|EN)\n+//mg;

# Drop "EN EN" and "EN <page> EN" header remnants.  The word boundaries
# keep the patterns from eating into ordinary upper-case words such as
# "WRITTEN ENTRY".
$text =~ s/\bEN\sEN\b//g;
$text =~ s/\bEN\s\d+\sEN\b//g;

# --- headlines --------------------------------------------------------
# More '=' signs mark a lower heading level.  Every headline ends with '='
# so the line-joining step below leaves it on a line of its own.
$text =~ s/^(SECTION\h+\d+[a-z]?)/\n==$1==\n\n/mg;
$text =~ s/^(TITLE\h+\d+[a-z]?)/\n===$1===\n\n/mg;
$text =~ s/^(ANNEX\h+[IVX]+)$/\n===$1===\n\n/mg;

# "Article N" must fill the whole line; a leading apostrophe is dropped.
$text =~ s/^\'?(Article\h+\d+[a-z]?)$/\n\n====$1====\n\n/mg;

# "Rule N <title>": the number becomes the headline and the rest of the
# line becomes a paragraph of its own.
$text =~ s/^(Rule\h+\d+[a-z]?)\s+(\S.*)/\n====$1====\n\n$2\n\n/mg;

# --- list items -------------------------------------------------------
# A numbered item introduced by an apostrophe ('1. text) loses the
# apostrophe and gets its number on a separate line.
$text =~ s/^\'(\d+\.)\s+/$1\n/mg;

# --- re-flow ----------------------------------------------------------
# Join wrapped lines: a newline is replaced by a space unless it follows
# another newline (paragraph break) or '=' (end of a headline).
$text =~ s/([^\n=])\n/$1 /g;

# Split into sentences; called in scalar context so the splitter hands
# back a single string (presumably one sentence per line -- see the
# Lingua::Sentence documentation).
$text = $splitter->split($text);

# Break the line after every comma, colon and semicolon, but only when
# no '=' follows on the same line, so headlines are never broken up.
$text =~ s/([,:;])\s(?=[^\v=]*\v)/$1\n/g;

write_file( $stem . '.mdwn', $text );
print "DONE: $0 stem $stem\n";
|