From 27d338dec0428bc22e2838eb8641c6e0d1681e22 Mon Sep 17 00:00:00 2001
From: Jonas Smedegaard <dr@jones.dk>
Date: Fri, 10 May 2013 20:46:04 +0200
Subject: Include mk* scripts

---
 mkall              | 33 ++++++++++++++++++++++++++++
 mkgit              | 43 +++++++++++++++++++++++++++++++++++++
 mkhtm2html-1       | 49 ++++++++++++++++++++++++++++++++++++++++++
 mkhtm2html-2       | 39 +++++++++++++++++++++++++++++++++
 mkhtm2html-default | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 mkhtml2md-default  | 21 ++++++++++++++++++
 mkmd2mdwn-default  | 35 ++++++++++++++++++++++++++++++
 mkpdf              | 47 ++++++++++++++++++++++++++++++++++++++++
 mkpdf2htm-default  | 23 ++++++++++++++++++++
 mkslice-1          | 24 +++++++++++++++++++++
 mkslice-2          | 30 ++++++++++++++++++++++++++
 mkslice-default    | 25 ++++++++++++++++++++++
 mktxt-default      | 21 ++++++++++++++++++
 mktxt2text-1       | 40 ++++++++++++++++++++++++++++++++++
 mktxt2text-default | 45 ++++++++++++++++++++++++++++++++++++++
 15 files changed, 538 insertions(+)
 create mode 100755 mkall
 create mode 100755 mkgit
 create mode 100755 mkhtm2html-1
 create mode 100755 mkhtm2html-2
 create mode 100755 mkhtm2html-default
 create mode 100755 mkhtml2md-default
 create mode 100755 mkmd2mdwn-default
 create mode 100755 mkpdf
 create mode 100755 mkpdf2htm-default
 create mode 100755 mkslice-1
 create mode 100755 mkslice-2
 create mode 100755 mkslice-default
 create mode 100755 mktxt-default
 create mode 100755 mktxt2text-1
 create mode 100755 mktxt2text-default

diff --git a/mkall b/mkall
new file mode 100755
index 0000000..59b5af6
--- /dev/null
+++ b/mkall
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+# process text(s)
+
+set -e
+
+# resolve options
+eval set -- "$(getopt -s sh -o f -- "$@")"
+while true; do case "$1" in -f) force=-f; shift;; --) shift; break;; esac; done
+
+. /lib/lsb/init-functions
+
+# fetch PDF files
+./mkpdf $force "$@"
+
+# resolve stems from available PDF files if none provided
+[ "$#" -gt 0 ] || eval set -- $(ls -1 *.pdf | sed 's/.pdf$//')
+
+# run tasks - either generic or with "_$stem" suffix
+log_action_begin_msg "Apply tasks"
+for stem in "$@"; do
+#	log_action_cont_msg $stem
+#	for task in pdf2txt txt2text slice; do
+#	for task in pdf2htm htm2html html2md md2mdwn slice; do
+	for task in pdf2htm htm2html; do
+#		log_action_cont_msg $task
+#		test ! -x $task-all || ./$task-all $force $stem
+		taskscript=mk$task-$stem
+		[ -x $taskscript ] || taskscript=mk$task-default
+		./$taskscript $force $stem
+	done
+done
+log_action_end_msg $?
diff --git a/mkgit b/mkgit
new file mode 100755
index 0000000..99449a6
--- /dev/null
+++ b/mkgit
@@ -0,0 +1,43 @@
+#!/bin/sh
+
+# create local git, and create and push to remote git
+
+set -e
+
+login=debian@source.jones.dk
+path="/srv/git/source.jones.dk/epfsug/diff"
+
+# resolve options
+eval set -- "$(getopt -s sh -o f -- "$@")"
+while true; do case "$1" in -f) force=-f; shift;; --) shift; break;; esac; done
+
+. /lib/lsb/init-functions
+
+# add suffix unless already included
+gitpath="$(dirname "$path")/$(basename "$path" .git).git"
+
+log_action_begin_msg "Publish git"
+
+# initialize local git
+log_action_cont_msg "init"
+if [ -d .git ]; then
+	if [ -n "$force" ]; then
+		log_warning_msg "purging"
+		rm -rf .git
+	else
+		log_failure_msg "local git already exist (force with -f)"
+		exit 1
+	fi
+fi
+git init
+
+# add scripts
+log_action_cont_msg "add"
+git add mk*; git commit -m "Include mk* scripts"
+
+# create and populate remote git
+log_action_cont_msg "push"
+ssh "$login" git init --bare "$gitpath" \&\& cd "$gitpath" \&\& mv hooks/post-update{.sample,} \&\& touch git-daemon-export-ok
+git push $force --all -u "$login:$gitpath" || { log_failure_msg "remote git already exist (force with -f)"; exit 1; }
+
+log_action_end_msg $?
diff --git a/mkhtm2html-1 b/mkhtm2html-1
new file mode 100755
index 0000000..6f37a46
--- /dev/null
+++ b/mkhtm2html-1
@@ -0,0 +1,49 @@
+#!/usr/bin/perl
+
+# normalize HTML
+
+use Getopt::Long;
+use File::Slurp;
+
+use strict;
+use warnings;
+
+my $force;
+GetOptions ("force|f" => \$force);  # store -f/--force in $force (was parsed but never bound)
+
+my $stem = shift;
+$_ = read_file( $stem . '.htm' );  # global $_ as in sibling scripts; lexical "my $_" was removed in Perl 5.24
+
+# whitespace
+s/&#160;/ /mg;
+
+# page header
+s{<P\b[^>]*;top:6[23]px;[^>]*>[^<]*</P>\s*}{}mg;
+
+# footnote
+foreach my $class ( map /(?<=\.)(ft\d+)(?={font-size:7px;)/mg, $_ ) {
+	s{\(</P>\s*<P\b[^>]*\sclass="$class">\d+</P>\s*<P\b[^>]*>\)}{}mg;
+};
+foreach my $class ( map /(?<=\.)(ft\d+)(?={font-size:6px;)/mg, $_ ) {
+	s{<P\b[^>]*>\(</P>\s*<P\b[^>]*\sclass="$class">\d+</P>\s*<P\b[^>]*>\) [^<]*</P>\s*}{}mg;
+};
+
+# unwrap similarly styled bolded paragraphs
+s{<P\b[^>]*class="([^"]+)"[^>]*><b>[^<]+\K</b></P>\s*<P\b[^>]*class="\1"[^>]*><b>}{ }mg;
+
+# headline
+s{<P\b[^>]*>(TITLE \S+)</P>\s*<P\b[^>]*><b>((?:(?!</P\b).)+.)</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
+s{<P\b[^>]*><i>(SECTION \S+)</i></P>\s*<P\b[^>]*><i><b>((?:(?!</P\b).)+.)</b></i></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
+s{<P\b[^>]*><i>((?:Article|ANNEX) \S+)</i></P>\s*<P\b[^>]*><b>((?:(?!</P\b).)+.)</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
+s{(<P\b[^>]*>)(\d+)\. }{<H3>$2</H3>\n$1}mg;
+s{<P\b[^>]*>\(([a-z])\)</P>}{<H4>$1</H4>}mg;
+s{(<P\b[^>]*>)\(([ivx]+)\) }{<H5>$2</H5>\n$1}mg;
+s{<P\b[^>]*>\(([ivx]+)\)</P>}{<H5>$1</H5>}mg;
+
+# unwrap
+s{(?<=\S)-(<br/>|</P>\s*<P\b[^>]*>)(?=[[:lower:]])}{}mg;
+s{\s*<br/>\s*}{ }mg;
+
+write_file( $stem . '.html', $_ );
+
+print "DONE: $0 stem $stem\n";
diff --git a/mkhtm2html-2 b/mkhtm2html-2
new file mode 100755
index 0000000..ae18391
--- /dev/null
+++ b/mkhtm2html-2
@@ -0,0 +1,39 @@
+#!/usr/bin/perl
+
+# normalize HTML
+
+use Getopt::Long;
+use File::Slurp;
+
+use strict;
+use warnings;
+
+my $force;
+GetOptions ("force|f" => \$force);  # store -f/--force in $force (was parsed but never bound)
+
+my $stem = shift;
+$_ = read_file( $stem . '.htm' );  # global $_ as in sibling scripts; lexical "my $_" was removed in Perl 5.24
+
+# whitespace
+s/(?:&#160;|\h)+/ /mg;
+
+# preamble
+s{.*>HAVE ADOPTED THIS REGULATION:</P>\s*}{}s;
+
+# page header
+s{<P\b[^>]*;top:(?:1172|1187)px;[^>]*>(?:(?!</P\b).)+.</P>\s*}{}mg;
+
+# headline
+s{<P\b[^>]*>(?:In Title \S+, the following Section \S+ is inserted:</P>\s*<P\b[^>]*>)?\'?(SECTION \S+)<br/><b>((?:(?!</P\b).)+.)</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
+s{<P\b[^>]*><i>\'?(Article \S+)</i></P>}{<H1>$1</H1>}mg;
+s{<P\b[^>]*>(?:Article \S+ is replaced by the following:</P>\s*<P\b[^>]*>)?\'?(Article \S+)<br/><b>((?:(?!</P\b).)+.)</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
+s{<P\b[^>]*>(Article \S+) is amended as follows:</P>}{<H1>$1</H1>}mg;
+s{<P\b[^>]*>(?:paragraph \S+ is replaced by the following:</P>\s*)(<P\b[^>]*>)\'?(\d+)\. }{<H3>$2</H3>\n$1}mg;
+s{<P\b[^>]*>In (Article \S+), paragraph (\S+) is replaced by the following:</P>\s*(<P\b[^>]*>)\'?(\2)\. }{<H1>$1</H1>\n<H3>$2</H3>\n$3}mg;
+
+# unwrap
+s{\s*<br/>\s*}{ }mg;
+
+write_file( $stem . '.html', $_ );
+
+print "DONE: $0 stem $stem\n";
diff --git a/mkhtm2html-default b/mkhtm2html-default
new file mode 100755
index 0000000..c2589a2
--- /dev/null
+++ b/mkhtm2html-default
@@ -0,0 +1,63 @@
+#!/usr/bin/perl
+
+# normalize HTML
+
+use Getopt::Long;
+use File::Slurp;
+
+use strict;
+use warnings;
+
+my $force;
+GetOptions ("force|f" => \$force);  # store -f/--force in $force (was parsed but never bound)
+
+my $stem = shift;
+$_ = read_file( $stem . '.htm' );  # global $_ as in sibling scripts; lexical "my $_" was removed in Perl 5.24
+
+# whitespace
+s/&#160;/ /mg;
+s{<P\b[^>]*>\s*</P>\s*}{}mg;
+
+# page header
+s{<DIV\b[^>]*>\s*\K(?:<P\b[^>]*;top:1\d{3}px;[^>]*>(?:<[bi]>)?[^<]+(?:</[bi]>)?</P>\s*)+}{}mg;
+
+# footnote
+s{<P\b[^>]*>\h+</P>\s*(?:<P\b[^>]*><b>\S+</b></P>\s*<P\b[^>]*>((?:(?!</P\b).)+.)</P>\s*)+(?=</DIV>)}{}mg;
+
+foreach my $class ( map /(?<=\.)(ft\d+)(?={font-size:7px;)/mg, $_ ) {
+	s{\(</P>\s*<P\s[^>]*\sclass="$class">\d+</P>\s*<P\s[^>]*>\)}{}mg;
+};
+
+# document headers
+s{<HEAD>.*?</HEAD>\s*}{}msg;
+s{</DIV>\n</BODY>\n</HTML>.*?<DIV\b[^>]*>\s*}{}msg;
+
+# unwrap similarly styled bolded paragraphs
+s{<P\b[^>]*class="([^"]+)"[^>]*><b>[^<]+\K</b></P>\s*<P\b[^>]*class="\1"[^>]*><b>}{ }mg;
+
+# headline
+s{<P\b[^>]*>(TITLE \S+)</P>\s*<P\b[^>]*><b>((?:(?!</P\b).)+.)</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
+s{<P\b[^>]*><i>(SECTION \S+)</i></P>\s*<P\b[^>]*><i><b>((?:(?!</P\b).)+.)</b></i></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
+s{<P\b[^>]*><i>((?:Article|ANNEX) \S+)</i></P>\s*<P\b[^>]*><b>((?:(?!</P\b).)+.)\s*</b></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
+s{<P\b[^>]*><i>((?:Article|ANNEX) \S+)\s*</i></P>\s*<P\b[^>]*><i><b>((?:(?!</P\b).)+?.)\s*</b>\s*</i></P>}{<H1>$1</H1>\n<H2>$2</H2>}mg;
+s{<P\b[^>]*>(\d+)\.</P>\s*}{<H3>$1</H3>\n}mg;
+s{(<P\b[^>]*>)(\d+)\. }{<H3>$2</H3>\n$1}mg;
+s{<P\b[^>]*>\(([a-z])\)</P>}{<H4>$1</H4>}mg;
+s{(<P\b[^>]*>)\(([ivx]+)\) }{<H5>$2</H5>\n$1}mg;
+s{<P\b[^>]*>\(([ivx]+)\)</P>}{<H5>$1</H5>}mg;
+
+# unwrap
+s{(?<=[[:lower:]])-(?:<br/>|</P>\s*<P\b[^>]*>)(?=[[:lower:]])}{}mg;
+#s{<P\b[^>]*class="([^"]+)"[^>]*>[^<]+\K\s*</P>\s*<P\b[^>]*class="\1"[^>]*>}{ }mg;
+s{(?<=class="(ft\d{1})">)[^<]+?\K\s*</P>\s*<P\b[^>]*class="\1"[^>]*>}{ }mg;
+s{(?<=class="(ft\d{2})">)[^<]+?\K\s*</P>\s*<P\b[^>]*class="\1"[^>]*>}{ }mg;
+s{(?<=class="(ft\d{3})">)[^<]+?\K\s*</P>\s*<P\b[^>]*class="\1"[^>]*>}{ }mg;
+#s{<P\b[^>]*class="([^"]+)"[^>]*>([^<]+?)\s*</P>\s*(?=<P\b[^>]*class="\1"[^>]*>)}{$2 }mg;
+s{\s*<br/>\s*}{ }mg;
+
+# styling
+s{<P\b[^>]*>}{<P>}mg;
+
+write_file( $stem . '.html', $_ );
+
+print "DONE: $0 stem $stem\n";
diff --git a/mkhtml2md-default b/mkhtml2md-default
new file mode 100755
index 0000000..90f4d33
--- /dev/null
+++ b/mkhtml2md-default
@@ -0,0 +1,21 @@
+#!/bin/sh
+
+# convert text from HTML to markdown
+
+set -e
+
+# resolve options
+eval set -- "$(getopt -s sh -o f -- "$@")"
+while true; do case "$1" in -f) force=1; shift;; --) shift; break;; esac; done
+
+. /lib/lsb/init-functions
+
+log_action_begin_msg "Convert HTML → Markdown"
+for stem in "$@"; do
+	infile=$stem.html
+	outfile=$stem.md
+	log_action_cont_msg $stem
+	[ -n "$force" ] || [ ! -f $outfile ] || [ $outfile -ot $infile ] || [ $outfile -ot "$0" ] || { log_warning_msg "skipped"; continue; }
+	pandoc --normalize --atx-headers -f html -t markdown -o $outfile $infile
+done
+log_action_end_msg $?
diff --git a/mkmd2mdwn-default b/mkmd2mdwn-default
new file mode 100755
index 0000000..588b61e
--- /dev/null
+++ b/mkmd2mdwn-default
@@ -0,0 +1,35 @@
+#!/usr/bin/perl
+
+# normalize Markdown
+
+use Getopt::Long;
+use File::Slurp;
+use Lingua::Sentence;
+
+use strict;
+use warnings;
+
+# setup sentence splitter
+my $splitter = Lingua::Sentence->new("en");
+
+my $force;
+GetOptions ("force|f" => \$force);  # store -f/--force in $force (was parsed but never bound)
+
+my $stem = shift;
+$_ = read_file( $stem . '.md' );
+
+# join non-headline multi-lines
+s/([^\n=])\n/$1 /g;
+#s/(?<=[\S^=])\h*\n(?=[\S^=])/ /g;
+
+# split into sentences
+$_ = $splitter->split($_);
+
+# split after comma, and before and after ellipsis
+#s/,\s(?=[^\v=]*\v)/,\n/mg;
+s/\h+(\(…\))/\n$1/mg;
+s/(\(…\))\h+/$1\n/mg;
+
+write_file( $stem . '.mdwn', $_ );
+
+print "DONE: $0 stem $stem\n";
diff --git a/mkpdf b/mkpdf
new file mode 100755
index 0000000..ec6ca0a
--- /dev/null
+++ b/mkpdf
@@ -0,0 +1,47 @@
+#!/bin/sh
+
+# fetch PDF text
+
+set -e
+
+# resolve options
+eval set -- "$(getopt -s sh -o f -- "$@")"
+while true; do case "$1" in -f) force=1; shift;; --) shift; break;; esac; done
+
+. /lib/lsb/init-functions
+
+# operate on all known documents (1-19) if none provided
+test "$#" -gt 0 || eval set -- 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
+
+log_action_begin_msg "Fetch PDF"
+for stem in "$@"; do
+	outfile=$stem.pdf
+	log_action_cont_msg $stem
+	[ -n "$force" ] || [ ! -f $outfile ] || [ $outfile -ot "$0" ] || { log_warning_msg "skipped"; continue; }
+	case $stem in
+	  1) wget -O$outfile 'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:L:2009:078:0001:0042:en:PDF';;
+	  2) wget -O$outfile 'http://www.europarl.europa.eu/registre/docs_autres_institutions/commission_europeenne/com/2013/0161/COM_COM%282013%290161_EN.pdf';;
+	  3) wget -O$outfile 'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=CELEX:52012PC0011:EN:PDF';;
+# disappeared 20130510
+#	  4) wget -O$outfile 'http://www.statewatch.org/news/2013/may/eu-coe-data-protection-8825-13.pdf';;
+	  4) wget -O$outfile 'http://erikjosefsson.eu/sites/default/files/Council-20130424-GDPR-statewatch-leak.pdf';;
+# the following 5 documents are ACTA leaks and the 6th is the final ACTA text
+	  5) wget -O$outfile 'http://www.laquadrature.net/files/201001_acta.pdf';;
+	  6) wget -O$outfile 'http://trade.ec.europa.eu/doclib/docs/2010/april/tradoc_146029.pdf';;
+	  7) wget -O$outfile 'http://www.erikjosefsson.eu/sites/default/files/ACTA_leak_20100701.pdf';;
+	  8) wget -O$outfile 'http://keionline.org/sites/default/files/acta_aug25_dc.pdf';;
+	  9) wget -O$outfile 'http://keionline.org/sites/default/files/actaoct2010.pdf';;
+	 10) wget -O$outfile 'http://trade.ec.europa.eu/doclib/docs/2011/may/tradoc_147937.pdf';;
+# the following 9 documents are EUR-LEX files, the first each year in the Official Journal of the EU (OJ) starting 1996
+	 11) wget -O$outfile 'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:L:1996:001:0001:001:en:PDF';;
+	 12) wget -O$outfile 'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:L:1997:001:0001:001:en:PDF';;
+	 13) wget -O$outfile 'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:L:1998:001:0001:001:en:PDF';;
+	 14) wget -O$outfile 'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:L:1999:001:0001:001:en:PDF';;
+	 15) wget -O$outfile 'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:L:2000:001:0001:001:en:PDF';;
+	 16) wget -O$outfile 'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:L:2001:001:0001:001:en:PDF';;
+	 17) wget -O$outfile 'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:L:2002:001:0001:001:en:PDF';;
+	 18) wget -O$outfile 'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:L:2003:001:0001:001:en:PDF';;
+	 19) wget -O$outfile 'http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:L:2004:001:0001:001:en:PDF';;
+	esac
+done
+log_action_end_msg $?
diff --git a/mkpdf2htm-default b/mkpdf2htm-default
new file mode 100755
index 0000000..dbf408a
--- /dev/null
+++ b/mkpdf2htm-default
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+# convert text from PDF to HTML
+
+set -e
+
+# resolve options
+eval set -- "$(getopt -s sh -o f -- "$@")"
+while true; do case "$1" in -f) force=1; shift;; --) shift; break;; esac; done
+
+. /lib/lsb/init-functions
+
+log_action_begin_msg "Convert PDF → HTML"
+for stem in "$@"; do
+	infile=$stem.pdf
+	outfile=$stem.htm
+	log_action_cont_msg $stem
+	[ -n "$force" ] || [ ! -f $outfile ] || [ $outfile -ot $infile ] || [ $outfile -ot "$0" ] || { log_warning_msg "skipped"; continue; }
+	pdftohtml -q -c -s -i $infile
+	mv -f $stem-html.html $outfile
+	rm -f $stem-outline.html
+done
+log_action_end_msg $?
diff --git a/mkslice-1 b/mkslice-1
new file mode 100755
index 0000000..b4326cd
--- /dev/null
+++ b/mkslice-1
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+# create slices of text
+
+set -e
+
+# resolve options
+eval set -- "$(getopt -s sh -o f -- "$@")"
+while true; do case "$1" in -f) force=1; shift;; --) shift; break;; esac; done
+
+. /lib/lsb/init-functions
+
+# actual slicing: first argument is variant, others are csplit patterns
+doit() {
+	variant=$1; shift
+	csplit -s -f $stem -b "-$variant-%03d.mdwn" $stem.mdwn "$@"
+}
+
+log_action_begin_msg "Create slices"
+for stem in "$@"; do
+	log_action_cont_msg $stem
+	doit rec '/^([[:digit:]]\+)\|HAS ADOPTED THIS REGULATION/' '{19}' '%%' '{*}'
+done
+log_action_end_msg $?
diff --git a/mkslice-2 b/mkslice-2
new file mode 100755
index 0000000..507f0c5
--- /dev/null
+++ b/mkslice-2
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+# create slices of text
+
+set -e
+
+# resolve options
+eval set -- "$(getopt -s sh -o f -- "$@")"
+while true; do case "$1" in -f) force=1; shift;; --) shift; break;; esac; done
+
+. /lib/lsb/init-functions
+
+# actual slicing: first argument is variant, others are csplit patterns
+doit() {
+	variant=$1; shift
+	csplit -s -f $stem -b "-$variant-%03d.mdwn" $stem.mdwn "$@"
+}
+
+log_action_begin_msg "Create slices"
+for stem in "$@"; do
+	log_action_cont_msg $stem
+#	doit preamble \
+#		'%Proposal for a REGULATION OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL%' \
+#		'/HAVE ADOPTED THIS REGULATION:/' \
+#		'%%' '{*}'
+	doit rec \
+		'/^# \(Title\|Article\)/' '{25}' \
+		'%%' '{*}'
+done
+log_action_end_msg $?
diff --git a/mkslice-default b/mkslice-default
new file mode 100755
index 0000000..ee4d980
--- /dev/null
+++ b/mkslice-default
@@ -0,0 +1,25 @@
+#!/bin/sh
+
+# create slices of text
+
+set -e
+
+# resolve options
+eval set -- "$(getopt -s sh -o f -- "$@")"
+while true; do case "$1" in -f) force=1; shift;; --) shift; break;; esac; done
+
+. /lib/lsb/init-functions
+
+# actual slicing: first argument is variant, others are csplit patterns
+doit() {
+	variant=$1; shift
+	csplit -s -f $stem -b "-$variant-%03d.mdwn" $stem.mdwn "$@"
+}
+
+log_action_begin_msg "Create slices"
+for stem in "$@"; do
+	log_action_cont_msg $stem
+	doit rec \
+		'/^# \(Title\|Article\)/' '{*}'
+done
+log_action_end_msg $?
diff --git a/mktxt-default b/mktxt-default
new file mode 100755
index 0000000..9c2e422
--- /dev/null
+++ b/mktxt-default
@@ -0,0 +1,21 @@
+#!/bin/sh
+
+# convert text from PDF to text
+
+set -e
+
+# resolve options
+eval set -- "$(getopt -s sh -o f -- "$@")"
+while true; do case "$1" in -f) force=1; shift;; --) shift; break;; esac; done
+
+. /lib/lsb/init-functions
+
+log_action_begin_msg "Convert PDF → text"
+for stem in "$@"; do
+	infile=$stem.pdf
+	outfile=$stem.txt
+	log_action_cont_msg $stem
+	[ -n "$force" ] || [ ! -f $outfile ] || [ $outfile -ot $infile ] || [ $outfile -ot "$0" ] || { log_warning_msg "skipped"; continue; }
+	pdftotext -raw -nopgbrk $stem.pdf $outfile
+done
+log_action_end_msg $?
diff --git a/mktxt2text-1 b/mktxt2text-1
new file mode 100755
index 0000000..89770b4
--- /dev/null
+++ b/mktxt2text-1
@@ -0,0 +1,40 @@
+#!/usr/bin/perl
+
+# normalize text
+
+use Getopt::Long;
+use File::Slurp;
+use Lingua::Sentence;
+
+use strict;
+use warnings;
+
+# setup sentence splitter
+my $splitter = Lingua::Sentence->new("en");
+
+my $force;
+GetOptions ("force|f" => \$force);  # store -f/--force in $force (was parsed but never bound)
+
+my $stem = shift;
+$_ = read_file( $stem . '.txt' );
+
+# page header
+s/^(?:24\.3\.2009|Official Journal of the European Union|EN)\n+//mg;  # non-capturing group (was "(:?" typo); dots in the date are literal
+
+# headline
+s/^(TITLE\h+(XX|XIX|XVIII|XVII|XVI|XV|XIV|XIII|XII|XI|X|IX|VIII|VII|VI|V|IV|III|II|I|))/\n#$1#\n\n/mg;
+#s/^(Article\h+\d+[a-z]?)$/\n\n##$1##\n\n/mg;
+s/^(\((\d+)\))(?:\s+(\S.*)(?:\n(?=\.)(\n\S.*))?)?/\n\n$1$3/mg;
+#s/^(SECTION\h+\d+[a-z]?)/\n==$1==\n\n/mg;
+#s/^(TITLE\h+\d+[a-z]?)/\n#$1#\n\n/mg;
+#s/^(ANNEX\h+[IVX]+)$/\n===$1===\n\n/mg;
+#s/^(?!\.)(Rule\h+\d+[a-z]?)\s+(\S.*)/\n====$1====\n\n$2\n\n/mg;
+
+# join non-headline multi-lines, split into sentences, and split after comma
+s/([^\n=])\n/$1 /g;
+$_ = $splitter->split($_);
+s/,\s(?=[^\v=]*\v)/,\n/mg;
+
+write_file( $stem . '.mdwn', $_ );
+
+print "DONE: $0 stem $stem\n";
diff --git a/mktxt2text-default b/mktxt2text-default
new file mode 100755
index 0000000..aaeb542
--- /dev/null
+++ b/mktxt2text-default
@@ -0,0 +1,45 @@
+#!/usr/bin/perl
+
+# normalize text
+
+use Getopt::Long;
+use File::Slurp;
+use Lingua::Sentence;
+
+use strict;
+use warnings;
+
+# setup sentence splitter
+my $splitter = Lingua::Sentence->new("en");
+
+my $force;
+GetOptions ("force|f" => \$force);  # store -f/--force in $force (was parsed but never bound)
+
+my $stem = shift;
+$_ = read_file( $stem . '.txt' );
+
+# page header
+s/^(?:\d+|EN)\n+//mg;  # non-capturing group (was "(:?" typo, which also accepted a leading colon)
+s/EN\sEN//mg;
+s/EN\s\d+\sEN//mg;
+
+# headline
+s/^(SECTION\h+\d+[a-z]?)/\n==$1==\n\n/mg;
+s/^(TITLE\h+\d+[a-z]?)/\n===$1===\n\n/mg;
+s/^(ANNEX\h+[IVX]+)$/\n===$1===\n\n/mg;
+s/^\'?(Article\h+\d+[a-z]?)$/\n\n====$1====\n\n/mg;
+s/^(?!\.)(Rule\h+\d+[a-z]?)\s+(\S.*)/\n====$1====\n\n$2\n\n/mg;
+
+# list item
+s/^\'(\d+\.)\s+/$1\n/mg;
+
+# join non-headline multi-lines, split into sentences, and split after comma, colon and semi-colon
+s/([^\n=])\n/$1 /g;
+$_ = $splitter->split($_);
+s/,\s(?=[^\v=]*\v)/,\n/mg;
+s/:\s(?=[^\v=]*\v)/:\n/mg;
+s/;\s(?=[^\v=]*\v)/;\n/mg;
+
+write_file( $stem . '.mdwn', $_ );
+
+print "DONE: $0 stem $stem\n";
-- 
cgit v1.2.3