summaryrefslogtreecommitdiff
path: root/perl/Locale/Po4a/Text.pm
blob: ee56046e87d7daac5f8941372ba6945d4c011326 (plain)
  1. #!/usr/bin/perl -w
  2. # Po4a::Text.pm
  3. #
  4. # extract and translate translatable strings from text documents
  5. #
  6. # This program is free software; you can redistribute it and/or modify
  7. # it under the terms of the GNU General Public License as published by
  8. # the Free Software Foundation; either version 2 of the License, or
  9. # (at your option) any later version.
  10. #
  11. # This program is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. # GNU General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU General Public License
  17. # along with this program; if not, write to the Free Software
  18. # Foundation, Inc.,
  19. # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. #
  21. ########################################################################
  22. =head1 NAME
  23. Locale::Po4a::Text - Convert text documents from/to PO files
  24. =head1 DESCRIPTION
  25. The po4a (po for anything) project goal is to ease translations (and more
  26. interestingly, the maintenance of translations) using gettext tools on
  27. areas where they were not expected like documentation.
  28. Locale::Po4a::Text is a module to help the translation of text documents into
  29. other [human] languages.
  30. Paragraphs are split on empty lines (or lines containing only spaces or
  31. tabs).
  32. If a paragraph contains a line starting with a space (or tab), this
  33. paragraph won't be rewrapped.
  34. =cut
  35. package Locale::Po4a::Text;
  36. use 5.006;
  37. use strict;
  38. use warnings;
  39. require Exporter;
  40. use vars qw(@ISA @EXPORT);
  41. @ISA = qw(Locale::Po4a::TransTractor);
  42. @EXPORT = qw();
  43. use Locale::Po4a::TransTractor;
  44. use Locale::Po4a::Common;
  45. =head1 OPTIONS ACCEPTED BY THIS MODULE
  46. These are this module's particular options:
  47. =over
  48. =item B<nobullets>
  49. Deactivate detection of bullets.
  50. By default, when a bullet is detected, the bullet paragraph is not considered
  51. as a verbatim paragraph (with the no-wrap flag in the PO file), but the module
  52. rewraps this paragraph in the generated PO file and in the translation.
  53. =cut
  54. my $bullets = 1;
  55. =item B<debianchangelog>
  56. Handle the header and footer of
  57. released versions, which only contain non-translatable information.
  58. =cut
  59. my $debianchangelog = 0;
  60. =item B<fortunes>
  61. Handle the fortunes format, which separates fortunes with a line which
  62. consists of '%' or '%%', and uses '%%' as the beginning of a comment.
  63. =cut
  64. my $fortunes = 0;
  65. =item B<markdown>
  66. Handle some special markup in Markdown-formatted texts.
  67. =cut
  68. my $markdown = 0;
  69. sub initialize {
  70. my $self = shift;
  71. my %options = @_;
  72. $self->{options}{'nobullets'}='';
  73. if (defined $options{'nobullets'}) {
  74. $bullets = 0;
  75. }
  76. if (defined $options{'debianchangelog'}) {
  77. $debianchangelog=1;
  78. }
  79. if (defined $options{'fortunes'}) {
  80. $fortunes=1;
  81. }
  82. if (defined $options{'markdown'}) {
  83. $markdown=1;
  84. }
  85. }
  86. sub parse {
  87. my $self = shift;
  88. my ($line,$ref);
  89. my $paragraph="";
  90. my $wrapped_mode = 1;
  91. my $expect_header = 1;
  92. my $end_of_paragraph = 0;
  93. ($line,$ref)=$self->shiftline();
  94. my $file = $ref;
  95. $file =~ s/:[0-9]+$//;
  96. while (defined($line)) {
  97. $ref =~ m/^(.*):[0-9]+$/;
  98. if ($1 ne $file) {
  99. $file = $1;
  100. do_paragraph($self,$paragraph,$wrapped_mode);
  101. $paragraph="";
  102. $wrapped_mode = 1;
  103. $expect_header = 1;
  104. }
  105. # TODO: preserve original line ends throughout the code instead
  106. chomp($line);
  107. $self->{ref}="$ref";
  108. if ($debianchangelog and
  109. $expect_header and
  110. $line =~ /^(\w[-+0-9a-z.]*)\ \(([^\(\) \t]+)\) # src, version
  111. \s+([-+0-9a-z.]+); # distribution
  112. \s*urgency\s*\=\s*(.*\S)\s*$/ix) { #
  113. do_paragraph($self,$paragraph,$wrapped_mode);
  114. $paragraph="";
  115. $self->pushline("$line\n");
  116. $expect_header=0;
  117. } elsif ($debianchangelog and
  118. $line =~ m/^ \-\- (.*) <(.*)> ((\w+\,\s*)?\d{1,2}\s+\w+\s+\d{4}\s+\d{1,2}:\d\d:\d\d\s+[-+]\d{4}(\s+\([^\\\(\)]\))?)$/) {
  119. # Found trailer
  120. do_paragraph($self,$paragraph,$wrapped_mode);
  121. $paragraph="";
  122. $self->pushline("$line\n");
  123. $expect_header=1;
  124. } elsif ($fortunes and
  125. $line =~ m/^%%?\s*$/) {
  126. # Found end of fortune
  127. do_paragraph($self,$paragraph,$wrapped_mode);
  128. # FIXME: test if this is still needed when always adding
  129. # newline in do_paragraph()
  130. $self->pushline("\n") unless ( $wrapped_mode == 0
  131. or $paragraph eq "");
  132. $paragraph="";
  133. $wrapped_mode = 1;
  134. $self->pushline("$line\n");
  135. } elsif ($line =~ /^\s*$/) {
  136. # Break paragraphs on lines containing only spaces
  137. do_paragraph($self,$paragraph,$wrapped_mode);
  138. $paragraph="";
  139. $wrapped_mode = 1;
  140. $self->pushline($line."\n");
  141. } elsif ($line =~ /^-- $/) {
  142. # Break paragraphs on email signature hint
  143. do_paragraph($self,$paragraph,$wrapped_mode);
  144. $paragraph="";
  145. $wrapped_mode = 1;
  146. $self->pushline($line."\n");
  147. } elsif ( $line =~ /^=+$/
  148. or $line =~ /^_+$/
  149. or $line =~ /^-+$/) {
  150. $wrapped_mode = 0;
  151. $paragraph .= $line."\n";
  152. do_paragraph($self,$paragraph,$wrapped_mode);
  153. $paragraph="";
  154. $wrapped_mode = 1;
  155. } elsif ($markdown and
  156. ( $line =~ /^\s*\[\[\!\S+\s*$/ # macro begin
  157. or $line =~ /^\s*"""\s*\]\]\s*$/)) { # """ textblock inside macro end
  158. # Avoid translating Markdown lines containing only markup
  159. do_paragraph($self,$paragraph,$wrapped_mode);
  160. $paragraph="";
  161. $wrapped_mode = 1;
  162. $self->pushline("$line\n");
  163. } elsif ($markdown and
  164. ( $line =~ /^#/ # headline
  165. or $line =~ /^\s*\[\[\!\S[^\]]*\]\]\s*$/)) { # sole macro
  166. # Preserve some Markdown markup as a single line
  167. do_paragraph($self,$paragraph,$wrapped_mode);
  168. $paragraph="$line\n";
  169. $wrapped_mode = 0;
  170. $end_of_paragraph = 1;
  171. } elsif ($markdown and
  172. ( $line =~ /^"""/)) { # """ textblock inside macro end
  173. # Markdown markup needing separation _before_ this line
  174. do_paragraph($self,$paragraph,$wrapped_mode);
  175. $paragraph="$line\n";
  176. $wrapped_mode = 1;
  177. } else {
  178. if ($line =~ /^\s/) {
  179. # A line starting by a space indicates a non-wrap
  180. # paragraph
  181. $wrapped_mode = 0;
  182. }
  183. if ($markdown and
  184. ( $line =~ /\S $/ # explicit newline
  185. or $line =~ /"""$/)) { # """ textblock inside macro begin
  186. # Markdown markup needing separation _after_ this line
  187. $end_of_paragraph = 1;
  188. }
  189. if ($fortunes) {
  190. $line =~ s/%%(.*)$//;
  191. }
  192. # TODO: comments
  193. $paragraph .= $line."\n";
  194. }
  195. # paragraphs starting by a bullet, or numbered
  196. # or paragraphs with a line containing many consecutive spaces
  197. # (more than 3)
  198. # are considered as verbatim paragraphs
  199. $wrapped_mode = 0 if ( $paragraph =~ m/^(\*|[0-9]+[.)] )/s
  200. or $paragraph =~ m/[ \t][ \t][ \t]/s);
  201. if ($markdown) {
  202. # Some Markdown markup can (or might) not survive wrapping
  203. $wrapped_mode = 0 if (
  204. $paragraph =~ /^>/ms # blockquote
  205. or $paragraph =~ /^( {8}|\t)/ms # monospaced
  206. or $paragraph =~ /^\$(\S+[{}]\S*\s*)+/ms # Xapian macro
  207. or $paragraph =~ /<(?![a-z]+[:@])/ms # maybe html (tags but not wiki <URI>)
  208. or $paragraph =~ /^[^<]+>/ms # maybe html (tag with vertical space)
  209. or $paragraph =~ /^\s*\[\[\!\S[^\]]+$/ms # macro begin
  210. );
  211. }
  212. if ($end_of_paragraph) {
  213. do_paragraph($self,$paragraph,$wrapped_mode);
  214. $paragraph="";
  215. $wrapped_mode = 1;
  216. $end_of_paragraph = 0;
  217. }
  218. ($line,$ref)=$self->shiftline();
  219. }
  220. if (length $paragraph) {
  221. do_paragraph($self,$paragraph,$wrapped_mode);
  222. }
  223. }
  224. sub do_paragraph {
  225. my ($self, $paragraph, $wrap) = (shift, shift, shift);
  226. return if ($paragraph eq "");
  227. if ($bullets) {
  228. # Detect bullets
  229. # | * blah blah
  230. # |<spaces> blah
  231. # | ^-- aligned
  232. # <empty line>
  233. #
  234. # Other bullets supported:
  235. # - blah o blah + blah
  236. # 1. blah 1) blah (1) blah
  237. TEST_BULLET:
  238. if ($paragraph =~ m/^(\s*)((?:[-*o+]|([0-9]+[.\)])|\([0-9]+\))\s+)([^\n]*\n)(.*)$/s) {
  239. my $para = $5;
  240. my $bullet = $2;
  241. my $indent1 = $1;
  242. my $indent2 = "$1".(' ' x length $bullet);
  243. my $text = $4;
  244. while ($para !~ m/$indent2(?:[-*o+]|([0-9]+[.\)])|\([0-9]+\))\s+/
  245. and $para =~ s/^$indent2(\S[^\n]*\n)//s) {
  246. $text .= $1;
  247. }
  248. # TODO: detect if a line starts with the same bullet
  249. if ($text !~ m/\S[ \t][ \t][ \t]+\S/s) {
  250. my $bullet_regex = quotemeta($indent1.$bullet);
  251. $bullet_regex =~ s/[0-9]+/\\d\+/;
  252. if ($para eq '' or $para =~ m/^$bullet_regex\S/s) {
  253. my $trans = $self->translate($text,
  254. $self->{ref},
  255. "Bullet: '$indent1$bullet'",
  256. "wrap" => 1,
  257. "wrapcol" => - (length $indent2));
  258. $trans =~ s/^/$indent1$bullet/s;
  259. $trans =~ s/\n(.)/\n$indent2$1/sg;
  260. $self->pushline( $trans."\n" );
  261. if ($para eq '') {
  262. return;
  263. } else {
  264. # Another bullet
  265. $paragraph = $para;
  266. goto TEST_BULLET;
  267. }
  268. }
  269. }
  270. }
  271. }
  272. # TODO: detect indented paragraphs
  273. my $transfinal = $self->translate($paragraph,
  274. $self->{ref},
  275. "Plain text",
  276. "wrap" => $wrap);
  277. # TODO: preserve original line ends throughout the code instead
  278. chomp $transfinal;
  279. $transfinal .= "\n";
  280. $self->pushline( $transfinal );
  281. }
  282. 1;
  283. =head1 STATUS OF THIS MODULE
  284. Tested successfully on simple text files and NEWS.Debian files.
  285. =head1 AUTHORS
  286. Nicolas François <nicolas.francois@centraliens.net>
  287. =head1 COPYRIGHT AND LICENSE
  288. Copyright 2005-2008 by Nicolas FRANÇOIS <nicolas.francois@centraliens.net>.
  289. This program is free software; you may redistribute it and/or modify it
  290. under the terms of GPL (see the COPYING file).