summaryrefslogtreecommitdiff
path: root/mkhtm2html-1
blob: 6f37a46f264e336355f3bde22a52f09c7e9994aa (plain)
  1. #!/usr/bin/perl
  2. # normalize HTML
  3. use Getopt::Long;
  4. use File::Slurp;
  5. use strict;
  6. use warnings;
  7. my $force;
  8. GetOptions ("force|f");
  9. my $stem = shift;
  10. my $_ = read_file( $stem . '.htm' );
  11. # whitespace
  12. s/ / /mg;
  13. # page header
  14. s{<P\b[^>]*;top:6[23]px;[^>]*>[^<]*</P>\s*}{}mg;
  15. # footnote
  16. foreach my $class ( map /(?<=\.)(ft\d+)(?={font-size:7px;)/mg, $_ ) {
  17. s{\(</P>\s*<P\b[^>]*\sclass="$class">\d+</P>\s*<P\b[^>]*>\)}{}mg;
  18. };
  19. foreach my $class ( map /(?<=\.)(ft\d+)(?={font-size:6px;)/mg, $_ ) {
  20. s{<P\b[^>]*>\(</P>\s*<P\b[^>]*\sclass="$class">\d+</P>\s*<P\b[^>]*>\) [^<]*</P>\s*}{}mg;
  21. };
  22. # unwrap similarly styled bolded paragraphs
  23. s{<P\b[^>]*class="([^"]+)"[^>]*><b>[^<]+\K</b></P>\s*<P\b[^>]*class="\1"[^>]*><b>}{ }mg;
  24. # headline
  25. s{<P\b[^>]*>(TITLE \S+)</P>\s*<P\b[^>]*><b>((?:(?!</P\b).)+.)</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
  26. s{<P\b[^>]*><i>(SECTION \S+)</i></P>\s*<P\b[^>]*><i><b>((?:(?!</P\b).)+.)</b></i></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
  27. s{<P\b[^>]*><i>((?:Article|ANNEX) \S+)</i></P>\s*<P\b[^>]*><b>((?:(?!</P\b).)+.)</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
  28. s{(<P\b[^>]*>)(\d+)\. }{<H3>$2</H3>\n$1}mg;
  29. s{<P\b[^>]*>\(([a-z])\)</P>}{<H4>$1</H4>}mg;
  30. s{(<P\b[^>]*>)\(([ivx]+)\) }{<H5>$2</H5>\n$1}mg;
  31. s{<P\b[^>]*>\(([ivx]+)\)</P>}{<H5>$1</H5>}mg;
  32. # unwrap
  33. s{(?<=\S)-(<br/>|</P>\s*<P\b[^>]*>)(?=[[:lower:]])}{}mg;
  34. s{\s*<br/>\s*}{ }mg;
  35. write_file( $stem . '.html', $_ );
  36. print "DONE: $0 stem $stem\n";