summary refs log tree commit diff
path: root/mktxt2text-default
blob: aaeb542b649a0751e3fe4b5854de478a928529af (plain)
  1. #!/usr/bin/perl
  2. # normalize text
  3. use Getopt::Long;
  4. use File::Slurp;
  5. use Lingua::Sentence;
  6. use strict;
  7. use warnings;
  8. # setup sentence splitter
  9. my $splitter = Lingua::Sentence->new("en");
  10. my $force;
  11. GetOptions ("force|f");
  12. my $stem = shift;
  13. $_ = read_file( $stem . '.txt' );
  14. # page header
  15. s/^(:?\d+|EN)\n+//mg;
  16. s/EN\sEN//mg;
  17. s/EN\s\d+\sEN//mg;
  18. # headline
  19. s/^(SECTION\h+\d+[a-z]?)/\n==$1==\n\n/mg;
  20. s/^(TITLE\h+\d+[a-z]?)/\n===$1===\n\n/mg;
  21. s/^(ANNEX\h+[IVX]+)$/\n===$1===\n\n/mg;
  22. s/^\'?(Article\h+\d+[a-z]?)$/\n\n====$1====\n\n/mg;
  23. s/^(?!\.)(Rule\h+\d+[a-z]?)\s+(\S.*)/\n====$1====\n\n$2\n\n/mg;
  24. # list item
  25. s/^\'(\d+\.)\s+/$1\n/mg;
  26. # join non-headline multi-lines, split into sentences, and split after comma, colon and semi-colon
  27. s/([^\n=])\n/$1 /g;
  28. $_ = $splitter->split($_);
  29. s/,\s(?=[^\v=]*\v)/,\n/mg;
  30. s/:\s(?=[^\v=]*\v)/:\n/mg;
  31. s/;\s(?=[^\v=]*\v)/;\n/mg;
  32. write_file( $stem . '.mdwn', $_ );
  33. print "DONE: $0 stem $stem\n";