#!/usr/bin/perl
# normalize text
#
# Reads STEM.txt, strips page headers, wraps headlines in =...= markers,
# re-flows the body into one sentence per line (with additional breaks after
# commas, colons and semicolons), and writes the result to STEM.mdwn.
#
# Usage: normalize [--force|-f] STEM
#
#   --force, -f   Parsed into $force; nothing in this script reads it yet.

use strict;
use warnings;

use Getopt::Long;
use File::Slurp;
use Lingua::Sentence;

# setup sentence splitter
my $splitter = Lingua::Sentence->new("en");

# Bind the option to $force explicitly; without a destination Getopt::Long
# would not populate the lexical declared here.
my $force;
GetOptions( "force|f" => \$force )
    or die "Usage: $0 [--force|-f] STEM\n";

# The stem is the input/output file name without its extension.
my $stem = shift;
die "Usage: $0 [--force|-f] STEM\n"
    unless defined $stem && length $stem;

my $text = read_file( $stem . '.txt' );

# page header
# Drop lines that consist only of a number or "EN". The group is
# non-capturing "(?:"; the earlier spelling "(:?" was a capturing group with
# an optional leading colon.
$text =~ s/^(?:\d+|EN)\n+//mg;
$text =~ s/EN\sEN//mg;
$text =~ s/EN\s\d+\sEN//mg;

# headline
# Each headline is put on a line of its own, surrounded by "=" markers
# (2 for SECTION, 3 for TITLE and ANNEX, 4 for Article and Rule). The later
# steps rely on the "=" characters to recognise headline lines.
$text =~ s/^(SECTION\h+\d+[a-z]?)/\n==$1==\n\n/mg;
$text =~ s/^(TITLE\h+\d+[a-z]?)/\n===$1===\n\n/mg;
$text =~ s/^(ANNEX\h+[IVX]+)$/\n===$1===\n\n/mg;
# An optional leading apostrophe before "Article" is dropped.
$text =~ s/^\'?(Article\h+\d+[a-z]?)$/\n\n====$1====\n\n/mg;
# "Rule N <title>": the rule number becomes the headline and the rest of the
# line follows as its own paragraph.
$text =~ s/^(?!\.)(Rule\h+\d+[a-z]?)\s+(\S.*)/\n====$1====\n\n$2\n\n/mg;

# list item
# Strip the leading apostrophe from "'1. ..." and put the number on its own
# line.
$text =~ s/^\'(\d+\.)\s+/$1\n/mg;

# join non-headline multi-lines, split into sentences, and split after comma,
# colon and semi-colon
# A newline is replaced by a space unless the character before it is another
# newline (blank line) or "=" (end of a headline).
$text =~ s/([^\n=])\n/$1 /g;

$text = $splitter->split($text);

# Break after the punctuation mark only when the remainder of the line, up to
# the next vertical whitespace, contains no "=" -- i.e. headlines stay intact.
$text =~ s/,\s(?=[^\v=]*\v)/,\n/mg;
$text =~ s/:\s(?=[^\v=]*\v)/:\n/mg;
$text =~ s/;\s(?=[^\v=]*\v)/;\n/mg;

write_file( $stem . '.mdwn', $text );

print "DONE: $0 stem $stem\n";