summaryrefslogtreecommitdiff
path: root/mktxt2text-1
blob: 89770b412a971a686fb92f425928c98542e1fa4c (plain)
  1. #!/usr/bin/perl
  2. # normalize text
  3. use Getopt::Long;
  4. use File::Slurp;
  5. use Lingua::Sentence;
  6. use strict;
  7. use warnings;
  8. # setup sentence splitter
  9. my $splitter = Lingua::Sentence->new("en");
  10. my $force;
  11. GetOptions ("force|f");
  12. my $stem = shift;
  13. $_ = read_file( $stem . '.txt' );
  14. # page header
  15. s/^(:?24.3.2009|Official Journal of the European Union|EN)\n+//mg;
  16. # headline
  17. s/^(TITLE\h+(XX|XIX|XVIII|XVII|XVI|XV|XIV|XIII|XII|XI|X|IX|VIII|VII|VI|V|IV|III|II|I|))/\n#$1#\n\n/mg;
  18. #s/^(Article\h+\d+[a-z]?)$/\n\n##$1##\n\n/mg;
  19. s/^(\((\d+)\))(?:\s+(\S.*)(?:\n(?=\.)(\n\S.*))?)?/\n\n$1$3/mg;
  20. #s/^(SECTION\h+\d+[a-z]?)/\n==$1==\n\n/mg;
  21. #s/^(TITLE\h+\d+[a-z]?)/\n#$1#\n\n/mg;
  22. #s/^(ANNEX\h+[IVX]+)$/\n===$1===\n\n/mg;
  23. #s/^(?!\.)(Rule\h+\d+[a-z]?)\s+(\S.*)/\n====$1====\n\n$2\n\n/mg;
  24. # join non-headline multi-lines, split into sentences, and split after comma
  25. s/([^\n=])\n/$1 /g;
  26. $_ = $splitter->split($_);
  27. s/,\s(?=[^\v=]*\v)/,\n/mg;
  28. write_file( $stem . '.mdwn', $_ );
  29. print "DONE: $0 stem $stem\n";