summaryrefslogtreecommitdiff
path: root/mkhtm2html-default
blob: a4b7bc8c526b47f4128f5025a0ab7c53659ae243 (plain)
  1. #!/usr/bin/perl
  2. # normalize HTML
  3. #
  4. # TODO: settle on a specific order of header levels
  5. # TODO: handle more subheaders
  6. use Getopt::Long;
  7. use File::Slurp;
  8. use strict;
  9. use warnings;
  10. my $force;
  11. GetOptions ("force|f");
  12. my $stem = shift;
  13. my $_ = read_file( $stem . '.htm' );
  14. # whitespace
  15. s/ / /mg;
  16. s{<P\b[^>]*>\s*</P>\s*}{}mg;
  17. # page header
  18. s{<DIV\b[^>]*>\s*\K(?:<P\b[^>]*;top:1\d{3}px;[^>]*>(?:<[bi]>)?[^<]+(?:</[bi]>)?</P>\s*)+}{}mg;
  19. # footnote
  20. s{<P\b[^>]*>\h+</P>\s*(?:<P\b[^>]*><b>\S+</b></P>\s*<P\b[^>]*>((?:(?!</P\b).)+.)</P>\s*)+(?=</DIV>)}{}mg;
  21. foreach my $class ( map /(?<=\.)(ft\d+)(?={font-size:7px;)/mg, $_ ) {
  22. s{\(</P>\s*<P\s[^>]*\sclass="$class">\d+</P>\s*<P\s[^>]*>\)}{}mg;
  23. };
  24. # drop document headers
  25. s{<HEAD>.*?</HEAD>\s*}{}msg;
  26. s{</DIV>\n</BODY>\n</HTML>.*?<DIV\b[^>]*>\s*}{}msg;
  27. # unwrap similarly styled bolded paragraphs
  28. s{<P\b[^>]*class="([^"]+)"[^>]*><b>[^<]+\K</b></P>\s*<P\b[^>]*class="\1"[^>]*><b>}{ }mg;
  29. # headline
  30. s{<P\b[^>]*>(TITLE \S+)</P>\s*<P\b[^>]*><b>((?:(?!</P\b).)+.)</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
  31. s{<P\b[^>]*><i>(SECTION \S+)</i></P>\s*<P\b[^>]*><i><b>((?:(?!</P\b).)+.)</b></i></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
  32. s{<P\b[^>]*><i>((?:Article|ANNEX) \S+)</i></P>\s*<P\b[^>]*><b>((?:(?!</P\b).)+.)\s*</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
  33. s{<P\b[^>]*><i>((?:Article|ANNEX) \S+)\s*</i></P>\s*<P\b[^>]*><i><b>((?:(?!</P\b).)+?.)\s*</b>\s*</i></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
  34. s{<P\b[^>]*>(\d+)\.</P>\s*}{<H3>$1</H3>\n}mg;
  35. s{(<P\b[^>]*>)(\d+)\. }{<H3>$2</H3>\n$1}mg;
  36. s{<P\b[^>]*>\(([a-z])\)</P>}{<H4>$1</H4>}mg;
  37. s{(<P\b[^>]*>)\(([ivx]+)\) }{<H5>$2</H5>\n$1}mg;
  38. s{<P\b[^>]*>\(([ivx]+)\)</P>}{<H5>$1</H5>}mg;
  39. # unwrap
  40. s{(?<=[[:lower:]])-(?:<br/>|</P>\s*<P\b[^>]*>)(?=[[:lower:]])}{}mg;
  41. #s{<P\b[^>]*class="([^"]+)"[^>]*>[^<]+\K\s*</P>\s*<P\b[^>]*class="\1"[^>]*>}{ }mg;
  42. s{(?<=class="(ft\d{1})">)[^<]+?\K\s*</P>\s*<P\b[^>]*class="\1"[^>]*>}{ }mg;
  43. s{(?<=class="(ft\d{2})">)[^<]+?\K\s*</P>\s*<P\b[^>]*class="\1"[^>]*>}{ }mg;
  44. s{(?<=class="(ft\d{3})">)[^<]+?\K\s*</P>\s*<P\b[^>]*class="\1"[^>]*>}{ }mg;
  45. #s{<P\b[^>]*class="([^"]+)"[^>]*>([^<]+?)\s*</P>\s*(?=<P\b[^>]*class="\1"[^>]*>)}{$2 }mg;
  46. s{\s*<br/>\s*}{ }mg;
  47. # drop styling
  48. s{<P\b[^>]*>}{<P>}mg;
  49. write_file( $stem . '.html', $_ );
  50. print "DONE: $0 stem $stem\n";