summaryrefslogtreecommitdiff
path: root/perl/Locale/Po4a/Text.pm
blob: e64a4a1de0b2f811ab4d3d33b22eb628c59333f6 (plain)
  1. #!/usr/bin/perl -w
  2. # Po4a::Text.pm
  3. #
# extract and translate translatable strings from text documents
  5. #
  6. # This program is free software; you can redistribute it and/or modify
  7. # it under the terms of the GNU General Public License as published by
  8. # the Free Software Foundation; either version 2 of the License, or
  9. # (at your option) any later version.
  10. #
  11. # This program is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. # GNU General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU General Public License
  17. # along with this program; if not, write to the Free Software
  18. # Foundation, Inc.,
  19. # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. #
  21. ########################################################################
  22. =head1 NAME
  23. Locale::Po4a::Text - Convert text documents from/to PO files
  24. =head1 DESCRIPTION
  25. The po4a (po for anything) project goal is to ease translations (and more
  26. interestingly, the maintenance of translations) using gettext tools on
  27. areas where they were not expected like documentation.
  28. Locale::Po4a::Text is a module to help the translation of text documents into
  29. other [human] languages.
Paragraphs are split on empty lines (or lines containing only spaces or
tabulations).
If a paragraph contains a line starting with a space (or tabulation), this
paragraph won't be rewrapped.
  34. =cut
  35. package Locale::Po4a::Text;
  36. use 5.006;
  37. use strict;
  38. use warnings;
  39. require Exporter;
  40. use vars qw(@ISA @EXPORT);
  41. @ISA = qw(Locale::Po4a::TransTractor);
  42. @EXPORT = qw();
  43. use Locale::Po4a::TransTractor;
  44. use Locale::Po4a::Common;
  45. =head1 OPTIONS ACCEPTED BY THIS MODULE
  46. These are this module's particular options:
  47. =over
=item B<nobullets>
  49. Deactivate detection of bullets.
By default, when a bullet is detected, the bullet paragraph is not considered
as a verbatim paragraph (with the no-wrap flag in the PO file), but the module
rewraps this paragraph in the generated PO file and in the translation.
  53. =cut
# Bullet detection switch: enabled by default, set to 0 by initialize() when
# the 'nobullets' option is given, and tested by do_paragraph().
my $bullets = 1;
  55. =item B<debianchangelog>
Handle the header and footer of
released versions, which only contain non-translatable information.
  58. =cut
# Debian changelog mode: off by default, set to 1 by initialize() when the
# 'debianchangelog' option is given, and tested by parse().
my $debianchangelog = 0;
  60. =item B<fortunes>
Handle the fortunes format, which separates fortunes with a line which
consists of '%' or '%%', and uses '%%' as the beginning of a comment.
  63. =cut
# Fortunes mode: off by default, set to 1 by initialize() when the
# 'fortunes' option is given, and tested by parse().
my $fortunes = 0;
  65. =item B<markdown>
  66. Handle some special markup in Markdown-formatted texts.
  67. =cut
# Markdown mode: off by default, set to 1 by initialize() when the
# 'markdown' option is given, and tested by parse().
my $markdown = 0;
  69. =item B<asciidoc>
  70. Handle documents in the asciidoc format.
  71. =cut
# AsciiDoc mode: off by default, set to 1 by initialize() when the
# 'asciidoc' option is given, and tested by parse().
my $asciidoc = 0;
  73. =back
  74. =cut
  75. sub initialize {
  76. my $self = shift;
  77. my %options = @_;
  78. $self->{options}{'nobullets'}='';
  79. if (defined $options{'nobullets'}) {
  80. $bullets = 0;
  81. }
  82. if (defined $options{'debianchangelog'}) {
  83. $debianchangelog=1;
  84. }
  85. if (defined $options{'fortunes'}) {
  86. $fortunes=1;
  87. }
  88. if (defined $options{'markdown'}) {
  89. $markdown=1;
  90. }
  91. $asciidoc=1 if (defined $options{'asciidoc'});
  92. }
  93. sub parse {
  94. my $self = shift;
  95. my ($line,$ref);
  96. my $paragraph="";
  97. my $wrapped_mode = 1;
  98. my $expect_header = 1;
  99. ($line,$ref)=$self->shiftline();
  100. my $file = $ref;
  101. $file =~ s/:[0-9]+$//;
  102. while (defined($line)) {
  103. $ref =~ m/^(.*):[0-9]+$/;
  104. if ($1 ne $file) {
  105. $file = $1;
  106. do_paragraph($self,$paragraph,$wrapped_mode);
  107. $paragraph="";
  108. }
  109. chomp($line);
  110. $self->{ref}="$ref";
  111. if ($debianchangelog and
  112. $expect_header and
  113. $line =~ /^(\w[-+0-9a-z.]*)\ \(([^\(\) \t]+)\) # src, version
  114. \s+([-+0-9a-z.]+); # distribution
  115. \s*urgency\s*\=\s*(.*\S)\s*$/ix) { #
  116. do_paragraph($self,$paragraph,$wrapped_mode);
  117. $paragraph="";
  118. $self->pushline("$line\n");
  119. $expect_header=0;
  120. } elsif ($debianchangelog and
  121. $line =~ m/^ \-\- (.*) <(.*)> ((\w+\,\s*)?\d{1,2}\s+\w+\s+\d{4}\s+\d{1,2}:\d\d:\d\d\s+[-+]\d{4}(\s+\([^\\\(\)]\))?)$/) {
  122. # Found trailer
  123. do_paragraph($self,$paragraph,$wrapped_mode);
  124. $paragraph="";
  125. $self->pushline("$line\n");
  126. $expect_header=1;
  127. } elsif ($fortunes and
  128. $line =~ m/^%%?\s*$/) {
  129. # Found end of fortune
  130. do_paragraph($self,$paragraph,$wrapped_mode);
  131. $self->pushline("\n") unless ( $wrapped_mode == 0
  132. or $paragraph eq "");
  133. $paragraph="";
  134. $wrapped_mode = 1;
  135. $self->pushline("$line\n");
  136. } elsif ( (defined $self->{verbatim})
  137. and ($self->{verbatim} == 2)) {
  138. # Untranslated blocks
  139. $self->pushline($line."\n");
  140. if ($asciidoc and
  141. ($line =~ m/^(\/{4,}|~{4,})$/)) {
  142. undef $self->{verbatim};
  143. undef $self->{type};
  144. $wrapped_mode = 1;
  145. }
  146. } elsif ($line =~ /^\s*$/) {
  147. # Break paragraphs on lines containing only spaces
  148. do_paragraph($self,$paragraph,$wrapped_mode);
  149. $paragraph="";
  150. $wrapped_mode = 1 unless defined($self->{verbatim});
  151. $self->pushline($line."\n");
  152. } elsif ($asciidoc and (not defined($self->{verbatim})) and
  153. ($line =~ m/^(\+|--)$/)) {
  154. # List Item Continuation or List Block
  155. do_paragraph($self,$paragraph,$wrapped_mode);
  156. $paragraph="";
  157. $self->pushline($line."\n");
  158. } elsif ($asciidoc and (not defined($self->{verbatim})) and
  159. ($line =~ m/^(={4,}|-{4,}|~{4,}|\^{4,}|\+{4,})$/) and
  160. (defined($paragraph) )and
  161. ($paragraph =~ m/^[^\n]*\n$/s) and
  162. (length($paragraph) == (length($line)+1))) {
  163. # Found title
  164. $wrapped_mode = 0;
  165. my $level = $line;
  166. $level =~ s/^(.).*$/$1/;
  167. my $t = $self->translate($paragraph,
  168. $self->{ref},
  169. "Title $level",
  170. "wrap" => 0);
  171. $self->pushline($t);
  172. $paragraph="";
  173. $wrapped_mode = 1;
  174. $self->pushline(($level x (length($t)-1))."\n");
  175. } elsif ($asciidoc and
  176. ($line =~ m/^(={1,5})( +)(.*?)( +\1)?$/)) {
  177. my $titlelevel1 = $1;
  178. my $titlespaces = $2;
  179. my $title = $3;
  180. my $titlelevel2 = $4||"";
  181. # Found one line title
  182. do_paragraph($self,$paragraph,$wrapped_mode);
  183. $wrapped_mode = 0;
  184. $paragraph="";
  185. my $t = $self->translate($title,
  186. $self->{ref},
  187. "Title $titlelevel1",
  188. "wrap" => 0);
  189. $self->pushline($titlelevel1.$titlespaces.$t.$titlelevel2."\n");
  190. $wrapped_mode = 1;
  191. } elsif ($asciidoc and
  192. ($line =~ m/^(\/{4,}|\+{4,}|-{4,}|\.{4,}|\*{4,}|_{4,}|={4,}|~{4,})$/)) {
  193. # Found one delimited block
  194. my $t = $line;
  195. $t =~ s/^(.).*$/$1/;
  196. my $type = "delimited block $t";
  197. if (defined $self->{verbatim} and ($self->{type} ne $type)) {
  198. $paragraph .= "$line\n";
  199. } else {
  200. do_paragraph($self,$paragraph,$wrapped_mode);
  201. if ( (defined $self->{type})
  202. and ($self->{type} eq $type)) {
  203. undef $self->{type};
  204. undef $self->{verbatim};
  205. $wrapped_mode = 1;
  206. } else {
  207. if ($t eq "\/") {
  208. # CommentBlock, should not be treated
  209. $self->{verbatim} = 2;
  210. } elsif ($t eq "+") {
  211. # PassthroughBlock
  212. $wrapped_mode = 0;
  213. $self->{verbatim} = 1;
  214. } elsif ($t eq "-") {
  215. # ListingBlock
  216. $wrapped_mode = 0;
  217. $self->{verbatim} = 1;
  218. } elsif ($t eq ".") {
  219. # LiteralBlock
  220. $wrapped_mode = 0;
  221. $self->{verbatim} = 1;
  222. } elsif ($t eq "*") {
  223. # SidebarBlock
  224. $wrapped_mode = 1;
  225. } elsif ($t eq "_") {
  226. # QuoteBlock
  227. if ( (defined $self->{type})
  228. and ($self->{type} eq "verse")) {
  229. $wrapped_mode = 0;
  230. $self->{verbatim} = 1;
  231. } else {
  232. $wrapped_mode = 1;
  233. }
  234. } elsif ($t eq "=") {
  235. # ExampleBlock
  236. $wrapped_mode = 1;
  237. } elsif ($t eq "~") {
  238. # Filter blocks, TBC: not translated
  239. $wrapped_mode = 0;
  240. $self->{verbatim} = 2;
  241. }
  242. $self->{type} = $type;
  243. }
  244. $paragraph="";
  245. $self->pushline($line."\n");
  246. }
  247. } elsif ($asciidoc and not defined $self->{verbatim} and
  248. ($line =~ m/^\[\[([^\]]*)\]\]$/)) {
  249. # Found BlockId
  250. do_paragraph($self,$paragraph,$wrapped_mode);
  251. $paragraph="";
  252. $wrapped_mode = 1;
  253. $self->pushline($line."\n");
  254. undef $self->{bullet};
  255. undef $self->{indent};
  256. } elsif ($asciidoc and not defined $self->{verbatim} and
  257. ($paragraph eq "") and
  258. ($line =~ m/^((?:NOTE|TIP|IMPORTANT|WARNING|CAUTION):\s+)(.*)$/)) {
  259. my $type = $1;
  260. my $text = $2;
  261. do_paragraph($self,$paragraph,$wrapped_mode);
  262. $paragraph=$text."\n";
  263. $wrapped_mode = 1;
  264. $self->pushline($type);
  265. undef $self->{bullet};
  266. undef $self->{indent};
  267. } elsif ($asciidoc and not defined $self->{verbatim} and
  268. ($line =~ m/^\[(NOTE|TIP|IMPORTANT|WARNING|CAUTION|verse|quote)\]$/)) {
  269. my $type = $1;
  270. do_paragraph($self,$paragraph,$wrapped_mode);
  271. $paragraph="";
  272. $wrapped_mode = 1;
  273. $self->pushline($line."\n");
  274. if ($type eq "verse") {
  275. $wrapped_mode = 0;
  276. }
  277. undef $self->{bullet};
  278. undef $self->{indent};
  279. } elsif ($asciidoc and not defined $self->{verbatim} and
  280. ($line =~ m/^\[(verse|quote), +(.*)\]$/)) {
  281. my $type = $1;
  282. my $arg = $2;
  283. do_paragraph($self,$paragraph,$wrapped_mode);
  284. $paragraph="";
  285. my $t = $self->translate($arg,
  286. $self->{ref},
  287. "$type",
  288. "wrap" => 0);
  289. $self->pushline("[$type, $t]\n");
  290. $wrapped_mode = 1;
  291. if ($type eq "verse") {
  292. $wrapped_mode = 0;
  293. }
  294. $self->{type} = $type;
  295. undef $self->{bullet};
  296. undef $self->{indent};
  297. } elsif ($asciidoc and not defined $self->{verbatim} and
  298. ($line =~ m/^\[icon="(.*)"\]$/)) {
  299. my $arg = $1;
  300. do_paragraph($self,$paragraph,$wrapped_mode);
  301. $paragraph="";
  302. my $t = $self->translate($arg,
  303. $self->{ref},
  304. "icon",
  305. "wrap" => 0);
  306. $self->pushline("[icon=\"$t\"]\n");
  307. $wrapped_mode = 1;
  308. undef $self->{bullet};
  309. undef $self->{indent};
  310. } elsif ($asciidoc and not defined $self->{verbatim} and
  311. ($line =~ m/^\[icons=None, +caption="(.*)"\]$/)) {
  312. my $arg = $1;
  313. do_paragraph($self,$paragraph,$wrapped_mode);
  314. $paragraph="";
  315. my $t = $self->translate($arg,
  316. $self->{ref},
  317. "caption",
  318. "wrap" => 0);
  319. $self->pushline("[icons=None, caption=\"$t\"]\n");
  320. $wrapped_mode = 1;
  321. undef $self->{bullet};
  322. undef $self->{indent};
  323. } elsif ($asciidoc and not defined $self->{verbatim} and
  324. ($line =~ m/^(\s*)([*_+`'#[:alnum:]].*)((?:::|;;|\?\?|:-)(?: *\\)?)$/)) {
  325. my $indent = $1;
  326. my $label = $2;
  327. my $labelend = $3;
  328. # Found labeled list
  329. do_paragraph($self,$paragraph,$wrapped_mode);
  330. $paragraph="";
  331. $wrapped_mode = 1;
  332. $self->{bullet} = "";
  333. $self->{indent} = $indent;
  334. my $t = $self->translate($label,
  335. $self->{ref},
  336. "Labeled list",
  337. "wrap" => 0);
  338. $self->pushline("$indent$t$labelend\n");
  339. } elsif ($asciidoc and not defined $self->{verbatim} and
  340. ($line =~ m/^(\s*)(\S.*)((?:::|;;)\s+)(.*)$/)) {
  341. my $indent = $1;
  342. my $label = $2;
  343. my $labelend = $3;
  344. my $labeltext = $4;
  345. # Found Horizontal Labeled Lists
  346. do_paragraph($self,$paragraph,$wrapped_mode);
  347. $paragraph=$labeltext."\n";
  348. $wrapped_mode = 1;
  349. $self->{bullet} = "";
  350. $self->{indent} = $indent;
  351. my $t = $self->translate($label,
  352. $self->{ref},
  353. "Labeled list",
  354. "wrap" => 0);
  355. $self->pushline("$indent$t$labelend");
  356. } elsif ($asciidoc and not defined $self->{verbatim} and
  357. ($line =~ m/^\:(\S.*?)(:\s*)(.*)$/)) {
  358. my $attrname = $1;
  359. my $attrsep = $2;
  360. my $attrvalue = $3;
  361. # Found a Attribute entry
  362. do_paragraph($self,$paragraph,$wrapped_mode);
  363. $paragraph="";
  364. $wrapped_mode = 1;
  365. undef $self->{bullet};
  366. undef $self->{indent};
  367. my $t = $self->translate($attrvalue,
  368. $self->{ref},
  369. "Attribute :$attrname:",
  370. "wrap" => 0);
  371. $self->pushline(":$attrname$attrsep$t\n");
  372. } elsif ($asciidoc and not defined $self->{verbatim} and
  373. ($line !~ m/^\.\./) and ($line =~ m/^\.(\S.*)$/)) {
  374. my $title = $1;
  375. # Found block title
  376. do_paragraph($self,$paragraph,$wrapped_mode);
  377. $paragraph="";
  378. $wrapped_mode = 1;
  379. undef $self->{bullet};
  380. undef $self->{indent};
  381. my $t = $self->translate($title,
  382. $self->{ref},
  383. "Block title",
  384. "wrap" => 0);
  385. $self->pushline(".$t\n");
  386. } elsif ($asciidoc and not defined $self->{verbatim} and
  387. ($line =~ m/^(\s*)((?:[-*o+]|(?:[0-9]+[.\)])|(?:[a-z][.\)])|\([0-9]+\)|\.|\.\.)\s+)(.*)$/)) {
  388. my $indent = $1||"";
  389. my $bullet = $2;
  390. my $text = $3;
  391. do_paragraph($self,$paragraph,$wrapped_mode);
  392. $paragraph = $text."\n";
  393. $self->{indent} = $indent;
  394. $self->{bullet} = $bullet;
  395. } elsif ($asciidoc and not defined $self->{verbatim} and
  396. ($line =~ m/^((?:<?[0-9]+)?> +)(.*)$/)) {
  397. my $bullet = $1;
  398. my $text = $2;
  399. do_paragraph($self,$paragraph,$wrapped_mode);
  400. $paragraph = $text."\n";
  401. $self->{indent} = "";
  402. $self->{bullet} = $bullet;
  403. } elsif ($asciidoc and not defined $self->{verbatim} and
  404. (defined $self->{bullet} and $line =~ m/^(\s+)(.*)$/)) {
  405. my $indent = $1;
  406. my $text = $2;
  407. if (not defined $self->{indent}) {
  408. $paragraph .= $text."\n";
  409. $self->{indent} = $indent;
  410. } elsif (length($paragraph) and (length($self->{bullet}) + length($self->{indent}) == length($indent))) {
  411. $paragraph .= $text."\n";
  412. } else {
  413. do_paragraph($self,$paragraph,$wrapped_mode);
  414. $paragraph = $text."\n";
  415. $self->{indent} = $indent;
  416. $self->{bullet} = "";
  417. }
  418. } elsif ( $line =~ /^=*$/
  419. or $line =~ /^_*$/
  420. or $line =~ /^-*$/) {
  421. $wrapped_mode = 0;
  422. $paragraph .= $line."\n";
  423. do_paragraph($self,$paragraph,$wrapped_mode);
  424. $paragraph="";
  425. $wrapped_mode = 1;
  426. } elsif ($markdown and
  427. ( $line =~ /^#/ # headline
  428. or $line =~ /^\s*\[\[\!\S[^\]]*\]\]\s*$/)) { # sole macro
  429. # Found Markdown markup that should be preserved as a single line
  430. do_paragraph($self,$paragraph,$wrapped_mode);
  431. $paragraph="$line\n";
  432. $wrapped_mode = 0;
  433. do_paragraph($self,$paragraph,$wrapped_mode);
  434. $wrapped_mode = 1;
  435. $paragraph="";
  436. } elsif ($markdown and
  437. ( $paragraph =~ m/^>/ # blockquote
  438. or $paragraph =~ m/[<>]/ # maybe html
  439. or $paragraph =~ m/^"""/ # textblock inside macro end
  440. or $paragraph =~ m/"""$/)) { # textblock inside macro begin
  441. # Found Markdown markup that might not survive wrapping
  442. $wrapped_mode = 0;
  443. $paragraph .= $line."\n";
  444. } else {
  445. if ($line =~ /^\s/) {
  446. # A line starting by a space indicates a non-wrap
  447. # paragraph
  448. $wrapped_mode = 0;
  449. } else {
  450. undef $self->{bullet};
  451. undef $self->{indent};
  452. }
  453. if ($fortunes) {
  454. $line =~ s/%%(.*)$//;
  455. }
  456. # TODO: comments
  457. $paragraph .= $line."\n";
  458. }
  459. # paragraphs starting by a bullet, or numbered
  460. # or paragraphs with a line containing many consecutive spaces
  461. # (more than 3)
  462. # are considered as verbatim paragraphs
  463. $wrapped_mode = 0 if ( $paragraph =~ m/^(\*|[0-9]+[.)] )/s
  464. or $paragraph =~ m/[ \t][ \t][ \t]/s);
  465. ($line,$ref)=$self->shiftline();
  466. }
  467. if (length $paragraph) {
  468. do_paragraph($self,$paragraph,$wrapped_mode);
  469. }
  470. }
  471. sub do_paragraph {
  472. my ($self, $paragraph, $wrap) = (shift, shift, shift);
  473. my $type = shift || $self->{type} || "Plain text";
  474. return if ($paragraph eq "");
  475. # DEBUG
  476. # my $b;
  477. # if (defined $self->{bullet}) {
  478. # $b = $self->{bullet};
  479. # } else {
  480. # $b = "UNDEF";
  481. # }
  482. # $type .= " verbatim: '".($self->{verbatim}||"NONE")."' bullet: '$b' indent: '".($self->{indent}||"NONE")."' type: '".($self->{type}||"NONE")."'";
  483. if ($bullets and not $wrap and not defined $self->{verbatim}) {
  484. # Detect bullets
  485. # | * blah blah
  486. # |<spaces> blah
  487. # | ^-- aligned
  488. # <empty line>
  489. #
  490. # Other bullets supported:
  491. # - blah o blah + blah
  492. # 1. blah 1) blah (1) blah
  493. TEST_BULLET:
  494. if ($paragraph =~ m/^(\s*)((?:[-*o+]|([0-9]+[.\)])|\([0-9]+\))\s+)([^\n]*\n)(.*)$/s) {
  495. my $para = $5;
  496. my $bullet = $2;
  497. my $indent1 = $1;
  498. my $indent2 = "$1".(' ' x length $bullet);
  499. my $text = $4;
  500. while ($para !~ m/$indent2(?:[-*o+]|([0-9]+[.\)])|\([0-9]+\))\s+/
  501. and $para =~ s/^$indent2(\S[^\n]*\n)//s) {
  502. $text .= $1;
  503. }
  504. # TODO: detect if a line starts with the same bullet
  505. if ($text !~ m/\S[ \t][ \t][ \t]+\S/s) {
  506. my $bullet_regex = quotemeta($indent1.$bullet);
  507. $bullet_regex =~ s/[0-9]+/\\d\+/;
  508. if ($para eq '' or $para =~ m/^$bullet_regex\S/s) {
  509. my $trans = $self->translate($text,
  510. $self->{ref},
  511. "Bullet: '$indent1$bullet'",
  512. "wrap" => 1,
  513. "wrapcol" => - (length $indent2));
  514. $trans =~ s/^/$indent1$bullet/s;
  515. $trans =~ s/\n(.)/\n$indent2$1/sg;
  516. $self->pushline( $trans."\n" );
  517. if ($para eq '') {
  518. return;
  519. } else {
  520. # Another bullet
  521. $paragraph = $para;
  522. goto TEST_BULLET;
  523. }
  524. }
  525. }
  526. }
  527. }
  528. my $end = "";
  529. if ($wrap) {
  530. $paragraph =~ s/^(.*?)(\n*)$/$1/s;
  531. $end = $2 || "";
  532. }
  533. my $t = $self->translate($paragraph,
  534. $self->{ref},
  535. $type,
  536. "wrap" => $wrap);
  537. if (defined $self->{bullet}) {
  538. my $bullet = $self->{bullet};
  539. my $indent1 = $self->{indent};
  540. my $indent2 = $indent1.(' ' x length($bullet));
  541. $t =~ s/^/$indent1$bullet/s;
  542. $t =~ s/\n(.)/\n$indent2$1/sg;
  543. }
  544. $self->pushline( $t.$end );
  545. }
  546. 1;
  547. =head1 STATUS OF THIS MODULE
  548. Tested successfully on simple text files and NEWS.Debian files.
  549. =head1 AUTHORS
  550. Nicolas François <nicolas.francois@centraliens.net>
  551. =head1 COPYRIGHT AND LICENSE
  552. Copyright 2005-2008 by Nicolas FRANÇOIS <nicolas.francois@centraliens.net>.
  553. This program is free software; you may redistribute it and/or modify it
  554. under the terms of GPL (see the COPYING file).