From 27d338dec0428bc22e2838eb8641c6e0d1681e22 Mon Sep 17 00:00:00 2001
From: Jonas Smedegaard
Date: Fri, 10 May 2013 20:46:04 +0200
Subject: Include mk* scripts

---
 mkall              | 33 ++++++++++++++++++++++++++++
 mkgit              | 43 +++++++++++++++++++++++++++++++++++++
 mkhtm2html-1       | 49 ++++++++++++++++++++++++++++++++++++++++++
 mkhtm2html-2       | 39 +++++++++++++++++++++++++++++++++
 mkhtm2html-default | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 mkhtml2md-default  | 21 ++++++++++++++++++
 mkmd2mdwn-default  | 35 ++++++++++++++++++++++++++++++
 mkpdf              | 47 ++++++++++++++++++++++++++++++++++++++++
 mkpdf2htm-default  | 23 ++++++++++++++++++++
 mkslice-1          | 24 +++++++++++++++++++++
 mkslice-2          | 30 ++++++++++++++++++++++++++
 mkslice-default    | 25 ++++++++++++++++++++++
 mktxt-default      | 21 ++++++++++++++++++
 mktxt2text-1       | 40 ++++++++++++++++++++++++++++++++++
 mktxt2text-default | 45 ++++++++++++++++++++++++++++++++++++++
 15 files changed, 538 insertions(+)
 create mode 100755 mkall
 create mode 100755 mkgit
 create mode 100755 mkhtm2html-1
 create mode 100755 mkhtm2html-2
 create mode 100755 mkhtm2html-default
 create mode 100755 mkhtml2md-default
 create mode 100755 mkmd2mdwn-default
 create mode 100755 mkpdf
 create mode 100755 mkpdf2htm-default
 create mode 100755 mkslice-1
 create mode 100755 mkslice-2
 create mode 100755 mkslice-default
 create mode 100755 mktxt-default
 create mode 100755 mktxt2text-1
 create mode 100755 mktxt2text-default

diff --git a/mkall b/mkall
new file mode 100755
index 0000000..59b5af6
--- /dev/null
+++ b/mkall
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+# process text(s): run mkpdf, then each conversion task, for every stem
+
+set -e
+
+# resolve options (-f is stored verbatim so it can be handed on to the task scripts)
+eval set -- "$(getopt -s sh -o f -- "$@")"
+while true; do case "$1" in -f) force=-f; shift;; --) shift; break;; esac; done
+
+. /lib/lsb/init-functions
+
+# fetch PDF files ($force is deliberately unquoted: it expands to nothing when unset)
+./mkpdf $force "$@"
+
+# resolve stems from available PDF files if none provided (no eval: names are data, not code)
+[ "$#" -gt 0 ] || set -- $(ls -1 *.pdf | sed 's/\.pdf$//')
+
+# run tasks - stem-specific script "mk$task-$stem" if executable, else "mk$task-default"
+log_action_begin_msg "Apply tasks"
+for stem in "$@"; do
+#	log_action_cont_msg $stem
+#	for task in pdf2txt txt2text slice; do
+#	for task in pdf2htm htm2html html2md md2mdwn slice; do
+	for task in pdf2htm htm2html; do
+#		log_action_cont_msg $task
+#		test ! -x $task-all || ./$task-all $force $stem
+		taskscript=mk$task-$stem
+		[ -x "$taskscript" ] || taskscript=mk$task-default
+		"./$taskscript" $force "$stem"
+	done
+done
+log_action_end_msg $?
diff --git a/mkgit b/mkgit
new file mode 100755
index 0000000..99449a6
--- /dev/null
+++ b/mkgit
@@ -0,0 +1,43 @@
+#!/bin/sh
+
+# create local git, and create and push to remote git
+
+set -e
+
+login=debian@source.jones.dk
+path="/srv/git/source.jones.dk/epfsug/diff"
+
+# resolve options (-f is stored verbatim so it can be handed on to "git push")
+eval set -- "$(getopt -s sh -o f -- "$@")"
+while true; do case "$1" in -f) force=-f; shift;; --) shift; break;; esac; done
+
+. /lib/lsb/init-functions
+
+# add suffix unless already included (basename strips a trailing ".git" before it is re-added)
+gitpath="$(dirname "$path")/$(basename "$path" .git).git"
+
+log_action_begin_msg "Publish git"
+
+# initialize local git (an existing .git is only purged when -f was given)
+log_action_cont_msg "init"
+if [ -d .git ]; then
+	if [ -n "$force" ]; then
+		log_warning_msg "purging"
+		rm -rf .git
+	else
+		log_failure_msg "local git already exist (force with -f)"
+		exit 1
+	fi
+fi
+git init
+
+# add scripts
+log_action_cont_msg "add"
+git add mk*; git commit -m "Include mk* scripts"
+
+# create and populate remote git (explicit mv operands: no reliance on brace expansion in the remote shell)
+log_action_cont_msg "push"
+ssh "$login" git init --bare "$gitpath" \&\& cd "$gitpath" \&\& mv hooks/post-update.sample hooks/post-update \&\& touch git-daemon-export-ok
+git push $force --all -u "$login:$gitpath" || { log_failure_msg "remote git already exist (force with -f)"; exit 1; }
+
+log_action_end_msg $?
diff --git a/mkhtm2html-1 b/mkhtm2html-1 new file mode 100755 index 0000000..6f37a46 --- /dev/null +++ b/mkhtm2html-1 @@ -0,0 +1,49 @@ +#!/usr/bin/perl + +# normalize HTML + +use Getopt::Long; +use File::Slurp; + +use strict; +use warnings; + +my $force; +GetOptions ("force|f"); + +my $stem = shift; +my $_ = read_file( $stem . '.htm' ); + +# whitespace +s/ / /mg; + +# page header +s{]*;top:6[23]px;[^>]*>[^<]*

\s*}{}mg; + +# footnote +foreach my $class ( map /(?<=\.)(ft\d+)(?={font-size:7px;)/mg, $_ ) { + s{\(

\s*]*\sclass="$class">\d+

\s*]*>\)}{}mg; +}; +foreach my $class ( map /(?<=\.)(ft\d+)(?={font-size:6px;)/mg, $_ ) { + s{]*>\(

\s*]*\sclass="$class">\d+

\s*]*>\) [^<]*

\s*}{}mg; +}; + +# unwrap similarly styled bolded paragraphs +s{]*class="([^"]+)"[^>]*>[^<]+\K

\s*]*class="\1"[^>]*>}{ }mg; + +# headline +s{]*>(TITLE \S+)

\s*]*>((?:(?!

}{

$1

\n

$2

}mg; +s{]*>(SECTION \S+)

\s*]*>((?:(?!

}{

$1

\n

$2

}mg; +s{]*>((?:Article|ANNEX) \S+)

\s*]*>((?:(?!

}{

$1

\n

$2

}mg; +s{(]*>)(\d+)\. }{

$2

\n$1}mg; +s{]*>\(([a-z])\)

}{

$1

}mg; +s{(]*>)\(([ivx]+)\) }{
$2
\n$1}mg; +s{]*>\(([ivx]+)\)

}{
$1
}mg; + +# unwrap +s{(?<=\S)-(
|

\s*]*>)(?=[[:lower:]])}{}mg; +s{\s*
\s*}{ }mg; + +write_file( $stem . '.html', $_ ); + +print "DONE: $0 stem $stem\n"; diff --git a/mkhtm2html-2 b/mkhtm2html-2 new file mode 100755 index 0000000..ae18391 --- /dev/null +++ b/mkhtm2html-2 @@ -0,0 +1,39 @@ +#!/usr/bin/perl + +# normalize HTML + +use Getopt::Long; +use File::Slurp; + +use strict; +use warnings; + +my $force; +GetOptions ("force|f"); + +my $stem = shift; +my $_ = read_file( $stem . '.htm' ); + +# whitespace +s/(?: |\h)+/ /mg; + +# preamble +s{.*>HAVE ADOPTED THIS REGULATION:

\s*}{}s; + +# page header +s{]*;top:(?:1172|1187)px;[^>]*>(?:(?!\s*}{}mg; + +# headline +s{]*>(?:In Title \S+, the following Section \S+ is inserted:

\s*]*>)?\'?(SECTION \S+)
((?:(?!

}{

$1

\n

$2

}mg; +s{]*>\'?(Article \S+)

}{

$1

}mg; +s{]*>(?:Article \S+ is replaced by the following:

\s*]*>)?\'?(Article \S+)
((?:(?!

}{

$1

\n

$2

}mg; +s{]*>(Article \S+) is amended as follows:

}{

$1

}mg; +s{]*>(?:paragraph \S+ is replaced by the following:

\s*)(]*>)\'?(\d+)\. }{

$2

\n$1}mg; +s{]*>In (Article \S+), paragraph (\S+) is replaced by the following:

\s*(]*>)\'?(\2)\. }{

$1

\n

$2

\n$3}mg; + +# unwrap +s{\s*
\s*}{ }mg; + +write_file( $stem . '.html', $_ ); + +print "DONE: $0 stem $stem\n"; diff --git a/mkhtm2html-default b/mkhtm2html-default new file mode 100755 index 0000000..c2589a2 --- /dev/null +++ b/mkhtm2html-default @@ -0,0 +1,63 @@ +#!/usr/bin/perl + +# normalize HTML + +use Getopt::Long; +use File::Slurp; + +use strict; +use warnings; + +my $force; +GetOptions ("force|f"); + +my $stem = shift; +my $_ = read_file( $stem . '.htm' ); + +# whitespace +s/ / /mg; +s{]*>\s*

\s*}{}mg; + +# page header +s{]*>\s*\K(?:]*;top:1\d{3}px;[^>]*>(?:<[bi]>)?[^<]+(?:)?

\s*)+}{}mg; + +# footnote +s{]*>\h+

\s*(?:]*>\S+

\s*]*>((?:(?!\s*)+(?=)}{}mg; + +foreach my $class ( map /(?<=\.)(ft\d+)(?={font-size:7px;)/mg, $_ ) { + s{\(

\s*]*\sclass="$class">\d+

\s*]*>\)}{}mg; +}; + +# document headers +s{.*?\s*}{}msg; +s{\n\n.*?]*>\s*}{}msg; + +# unwrap similarly styled bolded paragraphs +s{]*class="([^"]+)"[^>]*>[^<]+\K

\s*]*class="\1"[^>]*>}{ }mg; + +# headline +s{]*>(TITLE \S+)

\s*]*>((?:(?!

}{

$1

\n

$2

}mg; +s{]*>(SECTION \S+)

\s*]*>((?:(?!

}{

$1

\n

$2

}mg; +s{]*>((?:Article|ANNEX) \S+)

\s*]*>((?:(?!

}{

$1

\n

$2

}mg; +s{]*>((?:Article|ANNEX) \S+)\s*

\s*]*>((?:(?!\s*

}{

$1

\n

$2

}mg; +s{]*>(\d+)\.

\s*}{

$1

\n}mg; +s{(]*>)(\d+)\. }{

$2

\n$1}mg; +s{]*>\(([a-z])\)

}{

$1

}mg; +s{(]*>)\(([ivx]+)\) }{
$2
\n$1}mg; +s{]*>\(([ivx]+)\)

}{
$1
}mg; + +# unwrap +s{(?<=[[:lower:]])-(?:
|

\s*]*>)(?=[[:lower:]])}{}mg; +#s{]*class="([^"]+)"[^>]*>[^<]+\K\s*

\s*]*class="\1"[^>]*>}{ }mg; +s{(?<=class="(ft\d{1})">)[^<]+?\K\s*

\s*]*class="\1"[^>]*>}{ }mg; +s{(?<=class="(ft\d{2})">)[^<]+?\K\s*

\s*]*class="\1"[^>]*>}{ }mg; +s{(?<=class="(ft\d{3})">)[^<]+?\K\s*

\s*]*class="\1"[^>]*>}{ }mg; +#s{]*class="([^"]+)"[^>]*>([^<]+?)\s*

\s*(?=]*class="\1"[^>]*>)}{$2 }mg; +s{\s*
\s*}{ }mg; + +# styling +s{]*>}{

}mg;
+
+write_file( $stem . '.html', $_ );
+
+print "DONE: $0 stem $stem\n";
diff --git a/mkhtml2md-default b/mkhtml2md-default
new file mode 100755
index 0000000..90f4d33
--- /dev/null
+++ b/mkhtml2md-default
@@ -0,0 +1,21 @@
+#!/bin/sh
+
+# convert text from HTML to markdown: $stem.html -> $stem.md for each stem argument
+
+set -e
+
+# resolve options
+eval set -- "$(getopt -s sh -o f -- "$@")"
+while true; do case "$1" in -f) force=1; shift;; --) shift; break;; esac; done
+
+. /lib/lsb/init-functions
+
+log_action_begin_msg "Convert HTML → Markdown"
+for stem in "$@"; do
+	infile=$stem.html
+	outfile=$stem.md  # rebuilt below when forced, missing, or older than input or this script
+	log_action_cont_msg "$stem"
+	[ -n "$force" ] || [ ! -f "$outfile" ] || [ "$outfile" -ot "$infile" ] || [ "$outfile" -ot "$0" ] || { log_warning_msg "skipped"; continue; }
+	pandoc --normalize --atx-headers -f html -t markdown -o "$outfile" "$infile"
+done
+log_action_end_msg $?
diff --git a/mkmd2mdwn-default b/mkmd2mdwn-default
new file mode 100755
index 0000000..588b61e
--- /dev/null
+++ b/mkmd2mdwn-default
@@ -0,0 +1,35 @@
+#!/usr/bin/perl
+
+# normalize Markdown: read STEM.md, re-wrap one sentence per line, write STEM.mdwn
+
+use Getopt::Long;
+use File::Slurp;
+use Lingua::Sentence;
+
+use strict;
+use warnings;
+
+# setup sentence splitter
+my $splitter = Lingua::Sentence->new("en");
+
+my $force;    # accepted for interface parity with the sh scripts; output is always rewritten
+GetOptions( 'force|f' => \$force ) or die "usage: $0 [-f] STEM\n";
+
+my $stem = shift(@ARGV) // die "usage: $0 [-f] STEM\n";
+$_ = read_file( $stem . '.md' );
+
+# join non-headline multi-lines (a newline is kept after "=" and after another newline)
+s/([^\n=])\n/$1 /g;
+#s/(?<=[\S^=])\h*\n(?=[\S^=])/ /g;
+
+# split into sentences
+$_ = $splitter->split($_);
+
+# split after comma, and before and after ellipsis
+#s/,\s(?=[^\v=]*\v)/,\n/mg;
+s/\h+(\(…\))/\n$1/mg;
+s/(\(…\))\h+/$1\n/mg;
+
+write_file( $stem . '.mdwn', $_ );
+
+print "DONE: $0 stem $stem\n";
diff --git a/mkpdf b/mkpdf
new file mode 100755
index 0000000..ec6ca0a
--- /dev/null
+++ b/mkpdf
@@ -0,0 +1,47 @@
+#!/bin/sh
+
+# fetch PDF text: download the source document for each stem to $stem.pdf
+
+set -e
+
+# resolve options
+eval set -- "$(getopt -s sh -o f -- "$@")"
+while true; do case "$1" in -f) force=1; shift;; --) shift; break;; esac; done
+
+. /lib/lsb/init-functions
+
+# operate on all known documents (1-19) if none provided
+test "$#" -gt 0 || set -- 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
+
+log_action_begin_msg "Fetch PDF"
+for stem in "$@"; do
+	outfile=$stem.pdf
+	log_action_cont_msg "$stem"
+	[ -n "$force" ] || [ ! -f "$outfile" ] || [ "$outfile" -ot "$0" ] || { log_warning_msg "skipped"; continue; }
+	case $stem in  # stems not listed below fall through without fetching anything
+		1) wget -O"$outfile" 'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:L:2009:078:0001:0042:en:PDF';;
+		2) wget -O"$outfile" 'http://www.europarl.europa.eu/registre/docs_autres_institutions/commission_europeenne/com/2013/0161/COM_COM%282013%290161_EN.pdf';;
+		3) wget -O"$outfile" 'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=CELEX:52012PC0011:EN:PDF';;
+# disappeared 20130510
+#		4) wget -O$outfile 'http://www.statewatch.org/news/2013/may/eu-coe-data-protection-8825-13.pdf';;
+		4) wget -O"$outfile" 'http://erikjosefsson.eu/sites/default/files/Council-20130424-GDPR-statewatch-leak.pdf';;
+# the following 5 documents are ACTA leaks and the 6th is the final ACTA text
+		5) wget -O"$outfile" 'http://www.laquadrature.net/files/201001_acta.pdf';;
+		6) wget -O"$outfile" 'http://trade.ec.europa.eu/doclib/docs/2010/april/tradoc_146029.pdf';;
+		7) wget -O"$outfile" 'http://www.erikjosefsson.eu/sites/default/files/ACTA_leak_20100701.pdf';;
+		8) wget -O"$outfile" 'http://keionline.org/sites/default/files/acta_aug25_dc.pdf';;
+		9) wget -O"$outfile" 'http://keionline.org/sites/default/files/actaoct2010.pdf';;
+		10) wget -O"$outfile" 'http://trade.ec.europa.eu/doclib/docs/2011/may/tradoc_147937.pdf';;
+# the following 9 documents are EUR-LEX files, the first each year in the Official Journal of the EU (OJ) starting 1996
+		11) wget -O"$outfile" 'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:L:1996:001:0001:001:en:PDF';;
+		12) wget -O"$outfile" 'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:L:1997:001:0001:001:en:PDF';;
+		13) wget -O"$outfile" 'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:L:1998:001:0001:001:en:PDF';;
+		14) wget -O"$outfile" 'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:L:1999:001:0001:001:en:PDF';;
+		15) wget -O"$outfile" 'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:L:2000:001:0001:001:en:PDF';;
+		16) wget -O"$outfile" 'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:L:2001:001:0001:001:en:PDF';;
+		17) wget -O"$outfile" 'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:L:2002:001:0001:001:en:PDF';;
+		18) wget -O"$outfile" 'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:L:2003:001:0001:001:en:PDF';;
+		19) wget -O"$outfile" 'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:L:2004:001:0001:001:en:PDF';;
+	esac
+done
+log_action_end_msg $?
diff --git a/mkpdf2htm-default b/mkpdf2htm-default
new file mode 100755
index 0000000..dbf408a
--- /dev/null
+++ b/mkpdf2htm-default
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+# convert text from PDF to HTML: $stem.pdf -> $stem.htm for each stem argument
+
+set -e
+
+# resolve options
+eval set -- "$(getopt -s sh -o f -- "$@")"
+while true; do case "$1" in -f) force=1; shift;; --) shift; break;; esac; done
+
+. /lib/lsb/init-functions
+
+log_action_begin_msg "Convert PDF → HTML"
+for stem in "$@"; do
+	infile=$stem.pdf
+	outfile=$stem.htm  # rebuilt below when forced, missing, or older than input or this script
+	log_action_cont_msg "$stem"
+	[ -n "$force" ] || [ ! -f "$outfile" ] || [ "$outfile" -ot "$infile" ] || [ "$outfile" -ot "$0" ] || { log_warning_msg "skipped"; continue; }
+	pdftohtml -q -c -s -i "$infile"
+	mv -f "$stem-html.html" "$outfile"  # keep the converter's main page under our own name
+	rm -f "$stem-outline.html"
+done
+log_action_end_msg $?
diff --git a/mkslice-1 b/mkslice-1
new file mode 100755
index 0000000..b4326cd
--- /dev/null
+++ b/mkslice-1
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+# create slices of text: split $stem.mdwn into numbered $stem-<variant>-NNN.mdwn files
+
+set -e
+
+# resolve options
+eval set -- "$(getopt -s sh -o f -- "$@")"
+while true; do case "$1" in -f) force=1; shift;; --) shift; break;; esac; done
+
+. /lib/lsb/init-functions
+
+# actual slicing: first argument is variant, others are csplit patterns ($stem comes from the caller's loop)
+doit() {
+	variant=$1; shift
+	csplit -s -f "$stem" -b "-$variant-%03d.mdwn" "$stem.mdwn" "$@"
+}
+
+log_action_begin_msg "Create slices"
+for stem in "$@"; do
+	log_action_cont_msg "$stem"
+	doit rec '/^([[:digit:]]\+)\|HAS ADOPTED THIS REGULATION/' '{19}' '%%' '{*}'
+done
+log_action_end_msg $?
diff --git a/mkslice-2 b/mkslice-2
new file mode 100755
index 0000000..507f0c5
--- /dev/null
+++ b/mkslice-2
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+# create slices of text: split $stem.mdwn into numbered $stem-<variant>-NNN.mdwn files
+
+set -e
+
+# resolve options
+eval set -- "$(getopt -s sh -o f -- "$@")"
+while true; do case "$1" in -f) force=1; shift;; --) shift; break;; esac; done
+
+. /lib/lsb/init-functions
+
+# actual slicing: first argument is variant, others are csplit patterns ($stem comes from the caller's loop)
+doit() {
+	variant=$1; shift
+	csplit -s -f "$stem" -b "-$variant-%03d.mdwn" "$stem.mdwn" "$@"
+}
+
+log_action_begin_msg "Create slices"
+for stem in "$@"; do
+	log_action_cont_msg "$stem"
+#	doit preamble \
+#		'%Proposal for a REGULATION OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL%' \
+#		'/HAVE ADOPTED THIS REGULATION:/' \
+#		'%%' '{*}'
+	doit rec \
+		'/^# \(Title\|Article\)/' '{25}' \
+		'%%' '{*}'
+done
+log_action_end_msg $?
diff --git a/mkslice-default b/mkslice-default
new file mode 100755
index 0000000..ee4d980
--- /dev/null
+++ b/mkslice-default
@@ -0,0 +1,25 @@
+#!/bin/sh
+
+# create slices of text: split $stem.mdwn into numbered $stem-<variant>-NNN.mdwn files
+
+set -e
+
+# resolve options
+eval set -- "$(getopt -s sh -o f -- "$@")"
+while true; do case "$1" in -f) force=1; shift;; --) shift; break;; esac; done
+
+. /lib/lsb/init-functions
+
+# actual slicing: first argument is variant, others are csplit patterns ($stem comes from the caller's loop)
+doit() {
+	variant=$1; shift
+	csplit -s -f "$stem" -b "-$variant-%03d.mdwn" "$stem.mdwn" "$@"
+}
+
+log_action_begin_msg "Create slices"
+for stem in "$@"; do
+	log_action_cont_msg "$stem"
+	doit rec \
+		'/^# \(Title\|Article\)/' '{*}'
+done
+log_action_end_msg $?
diff --git a/mktxt-default b/mktxt-default
new file mode 100755
index 0000000..9c2e422
--- /dev/null
+++ b/mktxt-default
@@ -0,0 +1,21 @@
+#!/bin/sh
+
+# convert text from PDF to text: $stem.pdf -> $stem.txt for each stem argument
+
+set -e
+
+# resolve options
+eval set -- "$(getopt -s sh -o f -- "$@")"
+while true; do case "$1" in -f) force=1; shift;; --) shift; break;; esac; done
+
+. /lib/lsb/init-functions
+
+log_action_begin_msg "Convert PDF → text"
+for stem in "$@"; do
+	infile=$stem.pdf
+	outfile=$stem.txt  # rebuilt below when forced, missing, or older than input or this script
+	log_action_cont_msg "$stem"
+	[ -n "$force" ] || [ ! -f "$outfile" ] || [ "$outfile" -ot "$infile" ] || [ "$outfile" -ot "$0" ] || { log_warning_msg "skipped"; continue; }
+	pdftotext -raw -nopgbrk "$infile" "$outfile"
+done
+log_action_end_msg $?
diff --git a/mktxt2text-1 b/mktxt2text-1
new file mode 100755
index 0000000..89770b4
--- /dev/null
+++ b/mktxt2text-1
@@ -0,0 +1,40 @@
+#!/usr/bin/perl
+
+# normalize text: read STEM.txt, mark headlines, re-wrap by sentence, write STEM.mdwn
+
+use Getopt::Long;
+use File::Slurp;
+use Lingua::Sentence;
+
+use strict;
+use warnings;
+
+# setup sentence splitter
+my $splitter = Lingua::Sentence->new("en");
+
+my $force;    # accepted for interface parity with the sh scripts; output is always rewritten
+GetOptions( 'force|f' => \$force ) or die "usage: $0 [-f] STEM\n";
+
+my $stem = shift(@ARGV) // die "usage: $0 [-f] STEM\n";
+$_ = read_file( $stem . '.txt' );
+
+# page header: drop lines starting with the issue date, the journal name or the language code
+s/^(?:24\.3\.2009|Official Journal of the European Union|EN)\n+//mg;
+
+# headline (numbered "(N)" paragraphs get a blank line in front; text may be absent, hence the defined test)
+s/^(TITLE\h+(XX|XIX|XVIII|XVII|XVI|XV|XIV|XIII|XII|XI|X|IX|VIII|VII|VI|V|IV|III|II|I|))/\n#$1#\n\n/mg;
+#s/^(Article\h+\d+[a-z]?)$/\n\n##$1##\n\n/mg;
+s/^(\((\d+)\))(?:\s+(\S.*)(?:\n(?=\.)(\n\S.*))?)?/"\n\n$1" . ( defined $3 ? $3 : '' )/mge;
+#s/^(SECTION\h+\d+[a-z]?)/\n==$1==\n\n/mg;
+#s/^(TITLE\h+\d+[a-z]?)/\n#$1#\n\n/mg;
+#s/^(ANNEX\h+[IVX]+)$/\n===$1===\n\n/mg;
+#s/^(?!\.)(Rule\h+\d+[a-z]?)\s+(\S.*)/\n====$1====\n\n$2\n\n/mg;
+
+# join non-headline multi-lines, split into sentences, and split after comma
+s/([^\n=])\n/$1 /g;
+$_ = $splitter->split($_);
+s/,\s(?=[^\v=]*\v)/,\n/mg;
+
+write_file( $stem . '.mdwn', $_ );
+
+print "DONE: $0 stem $stem\n";
diff --git a/mktxt2text-default b/mktxt2text-default
new file mode 100755
index 0000000..aaeb542
--- /dev/null
+++ b/mktxt2text-default
@@ -0,0 +1,45 @@
+#!/usr/bin/perl
+
+# normalize text: read STEM.txt, mark headlines, re-wrap by sentence, write STEM.mdwn
+
+use Getopt::Long;
+use File::Slurp;
+use Lingua::Sentence;
+
+use strict;
+use warnings;
+
+# setup sentence splitter
+my $splitter = Lingua::Sentence->new("en");
+
+my $force;    # accepted for interface parity with the sh scripts; output is always rewritten
+GetOptions( 'force|f' => \$force ) or die "usage: $0 [-f] STEM\n";
+
+my $stem = shift(@ARGV) // die "usage: $0 [-f] STEM\n";
+$_ = read_file( $stem . '.txt' );
+
+# page header: drop lines starting with digits or the language code, and "EN EN" / "EN <digits> EN" runs
+s/^(?:\d+|EN)\n+//mg;
+s/EN\sEN//mg;
+s/EN\s\d+\sEN//mg;
+
+# headline
+s/^(SECTION\h+\d+[a-z]?)/\n==$1==\n\n/mg;
+s/^(TITLE\h+\d+[a-z]?)/\n===$1===\n\n/mg;
+s/^(ANNEX\h+[IVX]+)$/\n===$1===\n\n/mg;
+s/^\'?(Article\h+\d+[a-z]?)$/\n\n====$1====\n\n/mg;
+s/^(?!\.)(Rule\h+\d+[a-z]?)\s+(\S.*)/\n====$1====\n\n$2\n\n/mg;
+
+# list item (strip the leading quote and put the number on its own line)
+s/^\'(\d+\.)\s+/$1\n/mg;
+
+# join non-headline multi-lines, split into sentences, and split after comma, colon and semi-colon
+s/([^\n=])\n/$1 /g;
+$_ = $splitter->split($_);
+s/,\s(?=[^\v=]*\v)/,\n/mg;
+s/:\s(?=[^\v=]*\v)/:\n/mg;
+s/;\s(?=[^\v=]*\v)/;\n/mg;
+
+write_file( $stem . '.mdwn', $_ );
+
+print "DONE: $0 stem $stem\n";
-- 
cgit v1.2.3