diff options
author | John MacFarlane <jgm@berkeley.edu> | 2014-07-21 22:29:16 -0700 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2014-08-13 22:56:32 -0700 |
commit | 870e63be7360b5a0097a27656048e853bc720464 (patch) | |
tree | e8f19ee2d62e529115cb71dcda5f3298cca7d389 | |
parent | 650ad87f35f4405a2ca8270d2b2835daa442e5f1 (diff) |
Initial commit
184 files changed, 27304 insertions, 3 deletions
@@ -21,3 +21,13 @@ *.i*86 *.x86_64 *.hex + +*~ +*.bak +*.diff +*# +scanners.c +*.zip +bstrlib.txt +stmd.dSYM/* +stmd @@ -0,0 +1,30 @@ +Copyright (c) 2014, John MacFarlane + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of John MacFarlane nor the names of other + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..98d0c45 --- /dev/null +++ b/Makefile @@ -0,0 +1,62 @@ +CFLAGS=-g -O3 -Wall -Wextra -std=c99 -Isrc $(OPTFLAGS) +LDFLAGS=-g -O3 -Wall -Werror +SRCDIR=src +DATADIR=data + +PROG=./stmd + +.PHONY: all oldtests test spec benchjs testjs +all: $(SRCDIR)/case_fold_switch.c $(PROG) + +spec: test spec.html + +spec.md: spec.txt + perl spec2md.pl < $< > $@ + +spec.html: spec.md template.html + pandoc --no-highlight --number-sections --template template.html -s --toc -S $< > $@ # | perl -pe 's/␣/<span class="space"> <\/span>/g' > $@ + +spec.pdf: spec.md template.tex specfilter.hs + pandoc -s $< --template template.tex \ + --filter ./specfilter.hs -o $@ --latex-engine=xelatex --toc \ + --number-sections -V documentclass=report -V tocdepth=2 \ + -V classoption=twosides + +oldtests: + make -C oldtests --quiet clean all + +test: spec.txt + perl runtests.pl $(PROG) $< + +testjs: spec.txt + node js/test.js +# perl runtests.pl js/markdown $< + +benchjs: + node js/bench.js + +$(PROG): $(SRCDIR)/main.c $(SRCDIR)/inlines.o $(SRCDIR)/blocks.o $(SRCDIR)/detab.o $(SRCDIR)/bstrlib.o $(SRCDIR)/scanners.o $(SRCDIR)/print.o $(SRCDIR)/html.o $(SRCDIR)/utf8.o + $(CC) $(LDFLAGS) -o $@ $^ + +$(SRCDIR)/scanners.c: $(SRCDIR)/scanners.re + re2c --case-insensitive -bis $< > $@ + +$(SRCDIR)/case_fold_switch.c: $(DATADIR)/CaseFolding-3.2.0.txt + perl mkcasefold.pl < $< > $@ + +.PHONY: leakcheck clean fuzztest dingus + +dingus: + cd js && echo "Starting dingus server at http://localhost:9000" && python -m SimpleHTTPServer 9000 + +leakcheck: $(PROG) + cat oldtests/*/*.markdown | valgrind --leak-check=full --dsymutil=yes $(PROG) + +fuzztest: + for i in `seq 1 10`; do \ + time cat /dev/urandom | head -c 100000 | iconv -f latin1 -t utf-8 | $(PROG) >/dev/null; done + +clean: + -rm test $(SRCDIR)/*.o $(SRCDIR)/scanners.c + -rm -r *.dSYM + -rm spec.md fuzz.txt spec.html @@ -1,4 +1,39 @@ -stmd -==== +Standard markdown +================= + 
+Standard markdown is a specification of markdown syntax, together +with implementations (`stmd`) in C and javascript. + +The C implementation provides both a library and a standalone program +that converts markdown to HTML. It is written in standard C99 and has +no library dependencies. (However, if you check it out from the +repository, you'll need `re2c` to generate `scanners.c` from +`scanners.re`. This is only a build dependency for developers, since +`scanners.c` can be provided in a released source tarball.) + +The javascript implementation is a single javascript file +that can be linked to an HTML page. A standalone version (using +`node.js`) is also provided (`js/markdown`), and there is a +"dingus" for playing with it interactively. (`make dingus` will start +this.) + +The spec contains over 400 embedded examples which serve as +conformance tests. To run the tests for `stmd`, do `make test`. +To run them for another markdown program, say `myprog`, +do `make test PROG=myprog`. To run the tests for `stmd.js`, +do `make testjs`. + +The source of the spec is `spec.txt`. This is basically a markdown +file, with code examples written in a shorthand form: + + . + markdown source + . + expected HTML output + . + +To build an HTML version of the spec, do `make spec.html`. +To build a PDF version, do `make spec.pdf`. Both these commands +require that pandoc is installed, and creating a PDF requires +a latex installation. 
-a spec for "standard markdown," with matching C and javascript implementations @@ -0,0 +1,5 @@ +- add library function to convert a string +- add README/library documentation +- add man page for prog and library +- document/clean up code + diff --git a/alternative-html-blocks.txt b/alternative-html-blocks.txt new file mode 100644 index 0000000..3ba0d15 --- /dev/null +++ b/alternative-html-blocks.txt @@ -0,0 +1,247 @@ +# Appendix B: An alternate spec for HTML blocks {-} + +(The following spec departs less from original markdown than the +one described above, but is also less flexible.) + +An [HTML block](#html-block) <a id="html-block-tag"/> begins +with an [open tag](#open-tag), [HTML comment](#html-comment), +[processing instruction](#processing-instruction), +[declaration](#declaration), or [CDATA section](#cdata-section). +This opening element may optionally be preceded by 1-3 spaces, +and must not be followed on a line by anything other than white space. + +If the opening tag is self-closing, or if it is an [HTML +comment](#html-comment), [processing +instruction](#processing-instruction), [declaration](#declaration), or +[CDATA section](#cdata-section), then the [HTML block](#html-block) +contains just that tag. + +If it is an [open tag](#open-tag), then the [HTML block](#html-block) +continues until a matching closing tag is found, or until the end +of the document. Note that the matching closing tag is not necessarily +the first closing tag of the same type that is encountered, since +that tag may close a later open tag of the same type. Open and closing +tags must be balanced. + +The contents of the HTML block are interpreted as raw HTML, and will not +be escaped in HTML output. + +Some simple examples: + +. +<table> + <tr> + <td> + hi + </td> + </tr> +</table> + +okay. +. +<table> + <tr> + <td> + hi + </td> + </tr> +</table> +<p>okay.</p> +. + + +. +<div class="outer"> + + <div class="inner"> + + <p>fooö</p> + + </div> + +</div> +. 
+<div class="outer"> + + <div class="inner"> + + <p>fooö</p> + + </div> + +</div> +. + +A self-closing tag: + +. +<div /> +. +<div /> +. + +Here we have an unclosed tag, and the block continues to the end of +the document: + +. +<div> +<div> +foo +</div> + +*bar* +. +<div> +<div> +foo +</div> + +*bar* +. + +A comment: + +. +<!-- Foo +bar + baz --> +. +<!-- Foo +bar + baz --> +. + +A processing instruction: + +. +<?php + echo 'foo' +?> +. +<?php + echo 'foo' +?> +. + +CDATA: + +. +<![CDATA[ +function matchwo(a,b) +{ +if (a < b && a < 0) then + { + return 1; + } +else + { + return 0; + } +} +]]> +. +<![CDATA[ +function matchwo(a,b) +{ +if (a < b && a < 0) then + { + return 1; + } +else + { + return 0; + } +} +]]> +. + +The opening tag can be indented 1-3 spaces, but not 4: + +. + <!-- foo --> + <!-- foo --> +. + <!-- foo --> +<pre><code><!-- foo --> +</code></pre> +. + +The opening tag must be on a line (or lines) by itself: + +. +<table><tr><td> +foo +</td></tr></table> +. +<p><table><tr><td> foo </td></tr></table></p> +. + +. +<!-- foo -->bar +. +<p><!-- foo -->bar</p> +. + +The opening tag need not be an HTML block tag or even an HTML tag: + +. +<a> +foo +</a> +. +<a> +foo +</a> +. + +. +<foo> +bar +</foo> +. +<foo> +bar +</foo> +. + +So, note the difference: + +. +<del> +bar +</del> + +<del>bar</del> +. +<del> +bar +</del> +<p><del>bar</del></p> +. + +This rule differs from John Gruber's original markdown syntax +specification, which says: + +> The only restrictions are that block-level HTML elements — +> e.g. `<div>`, `<table>`, `<pre>`, `<p>`, etc. — must be separated from +> surrounding content by blank lines, and the start and end tags of the +> block should not be indented with tabs or spaces. + +In some ways Gruber's rule is more restrictive than the one given +here: + +- It requires that an HTML block be preceded and followed by a blank line. +- It does not allow the start tag to be indented. +- It does not allow the end tag to be indented. 
+- It requires that the open tag be an HTML block-level tag. + +Indeed, most markdown implementations, including some of Gruber's +own perl implementations, do not impose these restrictions. + +However, unlike Gruber's rule, this one requires that the open +tag be on a line by itself. It also differs from most markdown +implementations in how it handles the case where there is no matching +closing tag (a case not mentioned in Gruber's rule). In such a case, +the rule stated above includes the whole rest of the document in the +HTML block. + diff --git a/data/CaseFolding-3.2.0.txt b/data/CaseFolding-3.2.0.txt new file mode 100644 index 0000000..104a823 --- /dev/null +++ b/data/CaseFolding-3.2.0.txt @@ -0,0 +1,912 @@ +# CaseFolding-3.2.0.txt +# Date: 2002-03-22,20:54:33 GMT [MD] +# +# Case Folding Properties +# +# This file is a supplement to the UnicodeData file. +# It provides a case folding mapping generated from the Unicode Character Database. +# If all characters are mapped according to the full mapping below, then +# case differences (according to UnicodeData.txt and SpecialCasing.txt) +# are eliminated. +# +# The data supports both implementations that require simple case foldings +# (where string lengths don't change), and implementations that allow full case folding +# (where string lengths may grow). Note that where they can be supported, the +# full case foldings are superior: for example, they allow "MASSE" and "Maße" to match. +# +# NOTE: case folding does not preserve normalization formats! 
+# +# For information on case folding, see +# UTR #21 Case Mappings, at http://www.unicode.org/unicode/reports/tr21/ +# +# ================================================================================ +# Format +# ================================================================================ +# The entries in this file are in the following machine-readable format: +# +# <code>; <status>; <mapping>; # <name> +# +# The status field is: +# C: common case folding, common mappings shared by both simple and full mappings. +# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces. +# S: simple case folding, mappings to single characters where different from F. +# T: special case for uppercase I and dotted uppercase I +# - For non-Turkic languages, this mapping is normally not used. +# - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters. +# +# Usage: +# A. To do a simple case folding, use the mappings with status C + S. +# B. To do a full case folding, use the mappings with status C + F. +# +# The mappings with status T can be used or omitted depending on the desired case-folding +# behavior. (The default option is to exclude them.) 
+# +# ================================================================= + +0041; C; 0061; # LATIN CAPITAL LETTER A +0042; C; 0062; # LATIN CAPITAL LETTER B +0043; C; 0063; # LATIN CAPITAL LETTER C +0044; C; 0064; # LATIN CAPITAL LETTER D +0045; C; 0065; # LATIN CAPITAL LETTER E +0046; C; 0066; # LATIN CAPITAL LETTER F +0047; C; 0067; # LATIN CAPITAL LETTER G +0048; C; 0068; # LATIN CAPITAL LETTER H +0049; C; 0069; # LATIN CAPITAL LETTER I +0049; T; 0131; # LATIN CAPITAL LETTER I +004A; C; 006A; # LATIN CAPITAL LETTER J +004B; C; 006B; # LATIN CAPITAL LETTER K +004C; C; 006C; # LATIN CAPITAL LETTER L +004D; C; 006D; # LATIN CAPITAL LETTER M +004E; C; 006E; # LATIN CAPITAL LETTER N +004F; C; 006F; # LATIN CAPITAL LETTER O +0050; C; 0070; # LATIN CAPITAL LETTER P +0051; C; 0071; # LATIN CAPITAL LETTER Q +0052; C; 0072; # LATIN CAPITAL LETTER R +0053; C; 0073; # LATIN CAPITAL LETTER S +0054; C; 0074; # LATIN CAPITAL LETTER T +0055; C; 0075; # LATIN CAPITAL LETTER U +0056; C; 0076; # LATIN CAPITAL LETTER V +0057; C; 0077; # LATIN CAPITAL LETTER W +0058; C; 0078; # LATIN CAPITAL LETTER X +0059; C; 0079; # LATIN CAPITAL LETTER Y +005A; C; 007A; # LATIN CAPITAL LETTER Z +00B5; C; 03BC; # MICRO SIGN +00C0; C; 00E0; # LATIN CAPITAL LETTER A WITH GRAVE +00C1; C; 00E1; # LATIN CAPITAL LETTER A WITH ACUTE +00C2; C; 00E2; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX +00C3; C; 00E3; # LATIN CAPITAL LETTER A WITH TILDE +00C4; C; 00E4; # LATIN CAPITAL LETTER A WITH DIAERESIS +00C5; C; 00E5; # LATIN CAPITAL LETTER A WITH RING ABOVE +00C6; C; 00E6; # LATIN CAPITAL LETTER AE +00C7; C; 00E7; # LATIN CAPITAL LETTER C WITH CEDILLA +00C8; C; 00E8; # LATIN CAPITAL LETTER E WITH GRAVE +00C9; C; 00E9; # LATIN CAPITAL LETTER E WITH ACUTE +00CA; C; 00EA; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX +00CB; C; 00EB; # LATIN CAPITAL LETTER E WITH DIAERESIS +00CC; C; 00EC; # LATIN CAPITAL LETTER I WITH GRAVE +00CD; C; 00ED; # LATIN CAPITAL LETTER I WITH ACUTE +00CE; C; 00EE; # LATIN CAPITAL LETTER I WITH 
CIRCUMFLEX +00CF; C; 00EF; # LATIN CAPITAL LETTER I WITH DIAERESIS +00D0; C; 00F0; # LATIN CAPITAL LETTER ETH +00D1; C; 00F1; # LATIN CAPITAL LETTER N WITH TILDE +00D2; C; 00F2; # LATIN CAPITAL LETTER O WITH GRAVE +00D3; C; 00F3; # LATIN CAPITAL LETTER O WITH ACUTE +00D4; C; 00F4; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX +00D5; C; 00F5; # LATIN CAPITAL LETTER O WITH TILDE +00D6; C; 00F6; # LATIN CAPITAL LETTER O WITH DIAERESIS +00D8; C; 00F8; # LATIN CAPITAL LETTER O WITH STROKE +00D9; C; 00F9; # LATIN CAPITAL LETTER U WITH GRAVE +00DA; C; 00FA; # LATIN CAPITAL LETTER U WITH ACUTE +00DB; C; 00FB; # LATIN CAPITAL LETTER U WITH CIRCUMFLEX +00DC; C; 00FC; # LATIN CAPITAL LETTER U WITH DIAERESIS +00DD; C; 00FD; # LATIN CAPITAL LETTER Y WITH ACUTE +00DE; C; 00FE; # LATIN CAPITAL LETTER THORN +00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S +0100; C; 0101; # LATIN CAPITAL LETTER A WITH MACRON +0102; C; 0103; # LATIN CAPITAL LETTER A WITH BREVE +0104; C; 0105; # LATIN CAPITAL LETTER A WITH OGONEK +0106; C; 0107; # LATIN CAPITAL LETTER C WITH ACUTE +0108; C; 0109; # LATIN CAPITAL LETTER C WITH CIRCUMFLEX +010A; C; 010B; # LATIN CAPITAL LETTER C WITH DOT ABOVE +010C; C; 010D; # LATIN CAPITAL LETTER C WITH CARON +010E; C; 010F; # LATIN CAPITAL LETTER D WITH CARON +0110; C; 0111; # LATIN CAPITAL LETTER D WITH STROKE +0112; C; 0113; # LATIN CAPITAL LETTER E WITH MACRON +0114; C; 0115; # LATIN CAPITAL LETTER E WITH BREVE +0116; C; 0117; # LATIN CAPITAL LETTER E WITH DOT ABOVE +0118; C; 0119; # LATIN CAPITAL LETTER E WITH OGONEK +011A; C; 011B; # LATIN CAPITAL LETTER E WITH CARON +011C; C; 011D; # LATIN CAPITAL LETTER G WITH CIRCUMFLEX +011E; C; 011F; # LATIN CAPITAL LETTER G WITH BREVE +0120; C; 0121; # LATIN CAPITAL LETTER G WITH DOT ABOVE +0122; C; 0123; # LATIN CAPITAL LETTER G WITH CEDILLA +0124; C; 0125; # LATIN CAPITAL LETTER H WITH CIRCUMFLEX +0126; C; 0127; # LATIN CAPITAL LETTER H WITH STROKE +0128; C; 0129; # LATIN CAPITAL LETTER I WITH TILDE +012A; C; 012B; # LATIN 
CAPITAL LETTER I WITH MACRON +012C; C; 012D; # LATIN CAPITAL LETTER I WITH BREVE +012E; C; 012F; # LATIN CAPITAL LETTER I WITH OGONEK +0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE +0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE +0132; C; 0133; # LATIN CAPITAL LIGATURE IJ +0134; C; 0135; # LATIN CAPITAL LETTER J WITH CIRCUMFLEX +0136; C; 0137; # LATIN CAPITAL LETTER K WITH CEDILLA +0139; C; 013A; # LATIN CAPITAL LETTER L WITH ACUTE +013B; C; 013C; # LATIN CAPITAL LETTER L WITH CEDILLA +013D; C; 013E; # LATIN CAPITAL LETTER L WITH CARON +013F; C; 0140; # LATIN CAPITAL LETTER L WITH MIDDLE DOT +0141; C; 0142; # LATIN CAPITAL LETTER L WITH STROKE +0143; C; 0144; # LATIN CAPITAL LETTER N WITH ACUTE +0145; C; 0146; # LATIN CAPITAL LETTER N WITH CEDILLA +0147; C; 0148; # LATIN CAPITAL LETTER N WITH CARON +0149; F; 02BC 006E; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE +014A; C; 014B; # LATIN CAPITAL LETTER ENG +014C; C; 014D; # LATIN CAPITAL LETTER O WITH MACRON +014E; C; 014F; # LATIN CAPITAL LETTER O WITH BREVE +0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE +0152; C; 0153; # LATIN CAPITAL LIGATURE OE +0154; C; 0155; # LATIN CAPITAL LETTER R WITH ACUTE +0156; C; 0157; # LATIN CAPITAL LETTER R WITH CEDILLA +0158; C; 0159; # LATIN CAPITAL LETTER R WITH CARON +015A; C; 015B; # LATIN CAPITAL LETTER S WITH ACUTE +015C; C; 015D; # LATIN CAPITAL LETTER S WITH CIRCUMFLEX +015E; C; 015F; # LATIN CAPITAL LETTER S WITH CEDILLA +0160; C; 0161; # LATIN CAPITAL LETTER S WITH CARON +0162; C; 0163; # LATIN CAPITAL LETTER T WITH CEDILLA +0164; C; 0165; # LATIN CAPITAL LETTER T WITH CARON +0166; C; 0167; # LATIN CAPITAL LETTER T WITH STROKE +0168; C; 0169; # LATIN CAPITAL LETTER U WITH TILDE +016A; C; 016B; # LATIN CAPITAL LETTER U WITH MACRON +016C; C; 016D; # LATIN CAPITAL LETTER U WITH BREVE +016E; C; 016F; # LATIN CAPITAL LETTER U WITH RING ABOVE +0170; C; 0171; # LATIN CAPITAL LETTER U WITH DOUBLE ACUTE +0172; C; 0173; # LATIN CAPITAL LETTER U 
WITH OGONEK +0174; C; 0175; # LATIN CAPITAL LETTER W WITH CIRCUMFLEX +0176; C; 0177; # LATIN CAPITAL LETTER Y WITH CIRCUMFLEX +0178; C; 00FF; # LATIN CAPITAL LETTER Y WITH DIAERESIS +0179; C; 017A; # LATIN CAPITAL LETTER Z WITH ACUTE +017B; C; 017C; # LATIN CAPITAL LETTER Z WITH DOT ABOVE +017D; C; 017E; # LATIN CAPITAL LETTER Z WITH CARON +017F; C; 0073; # LATIN SMALL LETTER LONG S +0181; C; 0253; # LATIN CAPITAL LETTER B WITH HOOK +0182; C; 0183; # LATIN CAPITAL LETTER B WITH TOPBAR +0184; C; 0185; # LATIN CAPITAL LETTER TONE SIX +0186; C; 0254; # LATIN CAPITAL LETTER OPEN O +0187; C; 0188; # LATIN CAPITAL LETTER C WITH HOOK +0189; C; 0256; # LATIN CAPITAL LETTER AFRICAN D +018A; C; 0257; # LATIN CAPITAL LETTER D WITH HOOK +018B; C; 018C; # LATIN CAPITAL LETTER D WITH TOPBAR +018E; C; 01DD; # LATIN CAPITAL LETTER REVERSED E +018F; C; 0259; # LATIN CAPITAL LETTER SCHWA +0190; C; 025B; # LATIN CAPITAL LETTER OPEN E +0191; C; 0192; # LATIN CAPITAL LETTER F WITH HOOK +0193; C; 0260; # LATIN CAPITAL LETTER G WITH HOOK +0194; C; 0263; # LATIN CAPITAL LETTER GAMMA +0196; C; 0269; # LATIN CAPITAL LETTER IOTA +0197; C; 0268; # LATIN CAPITAL LETTER I WITH STROKE +0198; C; 0199; # LATIN CAPITAL LETTER K WITH HOOK +019C; C; 026F; # LATIN CAPITAL LETTER TURNED M +019D; C; 0272; # LATIN CAPITAL LETTER N WITH LEFT HOOK +019F; C; 0275; # LATIN CAPITAL LETTER O WITH MIDDLE TILDE +01A0; C; 01A1; # LATIN CAPITAL LETTER O WITH HORN +01A2; C; 01A3; # LATIN CAPITAL LETTER OI +01A4; C; 01A5; # LATIN CAPITAL LETTER P WITH HOOK +01A6; C; 0280; # LATIN LETTER YR +01A7; C; 01A8; # LATIN CAPITAL LETTER TONE TWO +01A9; C; 0283; # LATIN CAPITAL LETTER ESH +01AC; C; 01AD; # LATIN CAPITAL LETTER T WITH HOOK +01AE; C; 0288; # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK +01AF; C; 01B0; # LATIN CAPITAL LETTER U WITH HORN +01B1; C; 028A; # LATIN CAPITAL LETTER UPSILON +01B2; C; 028B; # LATIN CAPITAL LETTER V WITH HOOK +01B3; C; 01B4; # LATIN CAPITAL LETTER Y WITH HOOK +01B5; C; 01B6; # LATIN CAPITAL 
LETTER Z WITH STROKE +01B7; C; 0292; # LATIN CAPITAL LETTER EZH +01B8; C; 01B9; # LATIN CAPITAL LETTER EZH REVERSED +01BC; C; 01BD; # LATIN CAPITAL LETTER TONE FIVE +01C4; C; 01C6; # LATIN CAPITAL LETTER DZ WITH CARON +01C5; C; 01C6; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON +01C7; C; 01C9; # LATIN CAPITAL LETTER LJ +01C8; C; 01C9; # LATIN CAPITAL LETTER L WITH SMALL LETTER J +01CA; C; 01CC; # LATIN CAPITAL LETTER NJ +01CB; C; 01CC; # LATIN CAPITAL LETTER N WITH SMALL LETTER J +01CD; C; 01CE; # LATIN CAPITAL LETTER A WITH CARON +01CF; C; 01D0; # LATIN CAPITAL LETTER I WITH CARON +01D1; C; 01D2; # LATIN CAPITAL LETTER O WITH CARON +01D3; C; 01D4; # LATIN CAPITAL LETTER U WITH CARON +01D5; C; 01D6; # LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON +01D7; C; 01D8; # LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE +01D9; C; 01DA; # LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON +01DB; C; 01DC; # LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE +01DE; C; 01DF; # LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON +01E0; C; 01E1; # LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON +01E2; C; 01E3; # LATIN CAPITAL LETTER AE WITH MACRON +01E4; C; 01E5; # LATIN CAPITAL LETTER G WITH STROKE +01E6; C; 01E7; # LATIN CAPITAL LETTER G WITH CARON +01E8; C; 01E9; # LATIN CAPITAL LETTER K WITH CARON +01EA; C; 01EB; # LATIN CAPITAL LETTER O WITH OGONEK +01EC; C; 01ED; # LATIN CAPITAL LETTER O WITH OGONEK AND MACRON +01EE; C; 01EF; # LATIN CAPITAL LETTER EZH WITH CARON +01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON +01F1; C; 01F3; # LATIN CAPITAL LETTER DZ +01F2; C; 01F3; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z +01F4; C; 01F5; # LATIN CAPITAL LETTER G WITH ACUTE +01F6; C; 0195; # LATIN CAPITAL LETTER HWAIR +01F7; C; 01BF; # LATIN CAPITAL LETTER WYNN +01F8; C; 01F9; # LATIN CAPITAL LETTER N WITH GRAVE +01FA; C; 01FB; # LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE +01FC; C; 01FD; # LATIN CAPITAL LETTER AE WITH ACUTE +01FE; C; 01FF; # LATIN CAPITAL LETTER O 
WITH STROKE AND ACUTE +0200; C; 0201; # LATIN CAPITAL LETTER A WITH DOUBLE GRAVE +0202; C; 0203; # LATIN CAPITAL LETTER A WITH INVERTED BREVE +0204; C; 0205; # LATIN CAPITAL LETTER E WITH DOUBLE GRAVE +0206; C; 0207; # LATIN CAPITAL LETTER E WITH INVERTED BREVE +0208; C; 0209; # LATIN CAPITAL LETTER I WITH DOUBLE GRAVE +020A; C; 020B; # LATIN CAPITAL LETTER I WITH INVERTED BREVE +020C; C; 020D; # LATIN CAPITAL LETTER O WITH DOUBLE GRAVE +020E; C; 020F; # LATIN CAPITAL LETTER O WITH INVERTED BREVE +0210; C; 0211; # LATIN CAPITAL LETTER R WITH DOUBLE GRAVE +0212; C; 0213; # LATIN CAPITAL LETTER R WITH INVERTED BREVE +0214; C; 0215; # LATIN CAPITAL LETTER U WITH DOUBLE GRAVE +0216; C; 0217; # LATIN CAPITAL LETTER U WITH INVERTED BREVE +0218; C; 0219; # LATIN CAPITAL LETTER S WITH COMMA BELOW +021A; C; 021B; # LATIN CAPITAL LETTER T WITH COMMA BELOW +021C; C; 021D; # LATIN CAPITAL LETTER YOGH +021E; C; 021F; # LATIN CAPITAL LETTER H WITH CARON +0220; C; 019E; # LATIN CAPITAL LETTER N WITH LONG RIGHT LEG +0222; C; 0223; # LATIN CAPITAL LETTER OU +0224; C; 0225; # LATIN CAPITAL LETTER Z WITH HOOK +0226; C; 0227; # LATIN CAPITAL LETTER A WITH DOT ABOVE +0228; C; 0229; # LATIN CAPITAL LETTER E WITH CEDILLA +022A; C; 022B; # LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON +022C; C; 022D; # LATIN CAPITAL LETTER O WITH TILDE AND MACRON +022E; C; 022F; # LATIN CAPITAL LETTER O WITH DOT ABOVE +0230; C; 0231; # LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON +0232; C; 0233; # LATIN CAPITAL LETTER Y WITH MACRON +0345; C; 03B9; # COMBINING GREEK YPOGEGRAMMENI +0386; C; 03AC; # GREEK CAPITAL LETTER ALPHA WITH TONOS +0388; C; 03AD; # GREEK CAPITAL LETTER EPSILON WITH TONOS +0389; C; 03AE; # GREEK CAPITAL LETTER ETA WITH TONOS +038A; C; 03AF; # GREEK CAPITAL LETTER IOTA WITH TONOS +038C; C; 03CC; # GREEK CAPITAL LETTER OMICRON WITH TONOS +038E; C; 03CD; # GREEK CAPITAL LETTER UPSILON WITH TONOS +038F; C; 03CE; # GREEK CAPITAL LETTER OMEGA WITH TONOS +0390; F; 03B9 0308 0301; # 
GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS +0391; C; 03B1; # GREEK CAPITAL LETTER ALPHA +0392; C; 03B2; # GREEK CAPITAL LETTER BETA +0393; C; 03B3; # GREEK CAPITAL LETTER GAMMA +0394; C; 03B4; # GREEK CAPITAL LETTER DELTA +0395; C; 03B5; # GREEK CAPITAL LETTER EPSILON +0396; C; 03B6; # GREEK CAPITAL LETTER ZETA +0397; C; 03B7; # GREEK CAPITAL LETTER ETA +0398; C; 03B8; # GREEK CAPITAL LETTER THETA +0399; C; 03B9; # GREEK CAPITAL LETTER IOTA +039A; C; 03BA; # GREEK CAPITAL LETTER KAPPA +039B; C; 03BB; # GREEK CAPITAL LETTER LAMDA +039C; C; 03BC; # GREEK CAPITAL LETTER MU +039D; C; 03BD; # GREEK CAPITAL LETTER NU +039E; C; 03BE; # GREEK CAPITAL LETTER XI +039F; C; 03BF; # GREEK CAPITAL LETTER OMICRON +03A0; C; 03C0; # GREEK CAPITAL LETTER PI +03A1; C; 03C1; # GREEK CAPITAL LETTER RHO +03A3; C; 03C3; # GREEK CAPITAL LETTER SIGMA +03A4; C; 03C4; # GREEK CAPITAL LETTER TAU +03A5; C; 03C5; # GREEK CAPITAL LETTER UPSILON +03A6; C; 03C6; # GREEK CAPITAL LETTER PHI +03A7; C; 03C7; # GREEK CAPITAL LETTER CHI +03A8; C; 03C8; # GREEK CAPITAL LETTER PSI +03A9; C; 03C9; # GREEK CAPITAL LETTER OMEGA +03AA; C; 03CA; # GREEK CAPITAL LETTER IOTA WITH DIALYTIKA +03AB; C; 03CB; # GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA +03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS +03C2; C; 03C3; # GREEK SMALL LETTER FINAL SIGMA +03D0; C; 03B2; # GREEK BETA SYMBOL +03D1; C; 03B8; # GREEK THETA SYMBOL +03D5; C; 03C6; # GREEK PHI SYMBOL +03D6; C; 03C0; # GREEK PI SYMBOL +03D8; C; 03D9; # GREEK LETTER ARCHAIC KOPPA +03DA; C; 03DB; # GREEK LETTER STIGMA +03DC; C; 03DD; # GREEK LETTER DIGAMMA +03DE; C; 03DF; # GREEK LETTER KOPPA +03E0; C; 03E1; # GREEK LETTER SAMPI +03E2; C; 03E3; # COPTIC CAPITAL LETTER SHEI +03E4; C; 03E5; # COPTIC CAPITAL LETTER FEI +03E6; C; 03E7; # COPTIC CAPITAL LETTER KHEI +03E8; C; 03E9; # COPTIC CAPITAL LETTER HORI +03EA; C; 03EB; # COPTIC CAPITAL LETTER GANGIA +03EC; C; 03ED; # COPTIC CAPITAL LETTER SHIMA +03EE; C; 03EF; # COPTIC 
CAPITAL LETTER DEI +03F0; C; 03BA; # GREEK KAPPA SYMBOL +03F1; C; 03C1; # GREEK RHO SYMBOL +03F2; C; 03C3; # GREEK LUNATE SIGMA SYMBOL +03F4; C; 03B8; # GREEK CAPITAL THETA SYMBOL +03F5; C; 03B5; # GREEK LUNATE EPSILON SYMBOL +0400; C; 0450; # CYRILLIC CAPITAL LETTER IE WITH GRAVE +0401; C; 0451; # CYRILLIC CAPITAL LETTER IO +0402; C; 0452; # CYRILLIC CAPITAL LETTER DJE +0403; C; 0453; # CYRILLIC CAPITAL LETTER GJE +0404; C; 0454; # CYRILLIC CAPITAL LETTER UKRAINIAN IE +0405; C; 0455; # CYRILLIC CAPITAL LETTER DZE +0406; C; 0456; # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I +0407; C; 0457; # CYRILLIC CAPITAL LETTER YI +0408; C; 0458; # CYRILLIC CAPITAL LETTER JE +0409; C; 0459; # CYRILLIC CAPITAL LETTER LJE +040A; C; 045A; # CYRILLIC CAPITAL LETTER NJE +040B; C; 045B; # CYRILLIC CAPITAL LETTER TSHE +040C; C; 045C; # CYRILLIC CAPITAL LETTER KJE +040D; C; 045D; # CYRILLIC CAPITAL LETTER I WITH GRAVE +040E; C; 045E; # CYRILLIC CAPITAL LETTER SHORT U +040F; C; 045F; # CYRILLIC CAPITAL LETTER DZHE +0410; C; 0430; # CYRILLIC CAPITAL LETTER A +0411; C; 0431; # CYRILLIC CAPITAL LETTER BE +0412; C; 0432; # CYRILLIC CAPITAL LETTER VE +0413; C; 0433; # CYRILLIC CAPITAL LETTER GHE +0414; C; 0434; # CYRILLIC CAPITAL LETTER DE +0415; C; 0435; # CYRILLIC CAPITAL LETTER IE +0416; C; 0436; # CYRILLIC CAPITAL LETTER ZHE +0417; C; 0437; # CYRILLIC CAPITAL LETTER ZE +0418; C; 0438; # CYRILLIC CAPITAL LETTER I +0419; C; 0439; # CYRILLIC CAPITAL LETTER SHORT I +041A; C; 043A; # CYRILLIC CAPITAL LETTER KA +041B; C; 043B; # CYRILLIC CAPITAL LETTER EL +041C; C; 043C; # CYRILLIC CAPITAL LETTER EM +041D; C; 043D; # CYRILLIC CAPITAL LETTER EN +041E; C; 043E; # CYRILLIC CAPITAL LETTER O +041F; C; 043F; # CYRILLIC CAPITAL LETTER PE +0420; C; 0440; # CYRILLIC CAPITAL LETTER ER +0421; C; 0441; # CYRILLIC CAPITAL LETTER ES +0422; C; 0442; # CYRILLIC CAPITAL LETTER TE +0423; C; 0443; # CYRILLIC CAPITAL LETTER U +0424; C; 0444; # CYRILLIC CAPITAL LETTER EF +0425; C; 0445; # CYRILLIC CAPITAL 
LETTER HA +0426; C; 0446; # CYRILLIC CAPITAL LETTER TSE +0427; C; 0447; # CYRILLIC CAPITAL LETTER CHE +0428; C; 0448; # CYRILLIC CAPITAL LETTER SHA +0429; C; 0449; # CYRILLIC CAPITAL LETTER SHCHA +042A; C; 044A; # CYRILLIC CAPITAL LETTER HARD SIGN +042B; C; 044B; # CYRILLIC CAPITAL LETTER YERU +042C; C; 044C; # CYRILLIC CAPITAL LETTER SOFT SIGN +042D; C; 044D; # CYRILLIC CAPITAL LETTER E +042E; C; 044E; # CYRILLIC CAPITAL LETTER YU +042F; C; 044F; # CYRILLIC CAPITAL LETTER YA +0460; C; 0461; # CYRILLIC CAPITAL LETTER OMEGA +0462; C; 0463; # CYRILLIC CAPITAL LETTER YAT +0464; C; 0465; # CYRILLIC CAPITAL LETTER IOTIFIED E +0466; C; 0467; # CYRILLIC CAPITAL LETTER LITTLE YUS +0468; C; 0469; # CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS +046A; C; 046B; # CYRILLIC CAPITAL LETTER BIG YUS +046C; C; 046D; # CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS +046E; C; 046F; # CYRILLIC CAPITAL LETTER KSI +0470; C; 0471; # CYRILLIC CAPITAL LETTER PSI +0472; C; 0473; # CYRILLIC CAPITAL LETTER FITA +0474; C; 0475; # CYRILLIC CAPITAL LETTER IZHITSA +0476; C; 0477; # CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT +0478; C; 0479; # CYRILLIC CAPITAL LETTER UK +047A; C; 047B; # CYRILLIC CAPITAL LETTER ROUND OMEGA +047C; C; 047D; # CYRILLIC CAPITAL LETTER OMEGA WITH TITLO +047E; C; 047F; # CYRILLIC CAPITAL LETTER OT +0480; C; 0481; # CYRILLIC CAPITAL LETTER KOPPA +048A; C; 048B; # CYRILLIC CAPITAL LETTER SHORT I WITH TAIL +048C; C; 048D; # CYRILLIC CAPITAL LETTER SEMISOFT SIGN +048E; C; 048F; # CYRILLIC CAPITAL LETTER ER WITH TICK +0490; C; 0491; # CYRILLIC CAPITAL LETTER GHE WITH UPTURN +0492; C; 0493; # CYRILLIC CAPITAL LETTER GHE WITH STROKE +0494; C; 0495; # CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK +0496; C; 0497; # CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER +0498; C; 0499; # CYRILLIC CAPITAL LETTER ZE WITH DESCENDER +049A; C; 049B; # CYRILLIC CAPITAL LETTER KA WITH DESCENDER +049C; C; 049D; # CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE +049E; C; 049F; # CYRILLIC CAPITAL 
LETTER KA WITH STROKE +04A0; C; 04A1; # CYRILLIC CAPITAL LETTER BASHKIR KA +04A2; C; 04A3; # CYRILLIC CAPITAL LETTER EN WITH DESCENDER +04A4; C; 04A5; # CYRILLIC CAPITAL LIGATURE EN GHE +04A6; C; 04A7; # CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK +04A8; C; 04A9; # CYRILLIC CAPITAL LETTER ABKHASIAN HA +04AA; C; 04AB; # CYRILLIC CAPITAL LETTER ES WITH DESCENDER +04AC; C; 04AD; # CYRILLIC CAPITAL LETTER TE WITH DESCENDER +04AE; C; 04AF; # CYRILLIC CAPITAL LETTER STRAIGHT U +04B0; C; 04B1; # CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE +04B2; C; 04B3; # CYRILLIC CAPITAL LETTER HA WITH DESCENDER +04B4; C; 04B5; # CYRILLIC CAPITAL LIGATURE TE TSE +04B6; C; 04B7; # CYRILLIC CAPITAL LETTER CHE WITH DESCENDER +04B8; C; 04B9; # CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE +04BA; C; 04BB; # CYRILLIC CAPITAL LETTER SHHA +04BC; C; 04BD; # CYRILLIC CAPITAL LETTER ABKHASIAN CHE +04BE; C; 04BF; # CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH DESCENDER +04C1; C; 04C2; # CYRILLIC CAPITAL LETTER ZHE WITH BREVE +04C3; C; 04C4; # CYRILLIC CAPITAL LETTER KA WITH HOOK +04C5; C; 04C6; # CYRILLIC CAPITAL LETTER EL WITH TAIL +04C7; C; 04C8; # CYRILLIC CAPITAL LETTER EN WITH HOOK +04C9; C; 04CA; # CYRILLIC CAPITAL LETTER EN WITH TAIL +04CB; C; 04CC; # CYRILLIC CAPITAL LETTER KHAKASSIAN CHE +04CD; C; 04CE; # CYRILLIC CAPITAL LETTER EM WITH TAIL +04D0; C; 04D1; # CYRILLIC CAPITAL LETTER A WITH BREVE +04D2; C; 04D3; # CYRILLIC CAPITAL LETTER A WITH DIAERESIS +04D4; C; 04D5; # CYRILLIC CAPITAL LIGATURE A IE +04D6; C; 04D7; # CYRILLIC CAPITAL LETTER IE WITH BREVE +04D8; C; 04D9; # CYRILLIC CAPITAL LETTER SCHWA +04DA; C; 04DB; # CYRILLIC CAPITAL LETTER SCHWA WITH DIAERESIS +04DC; C; 04DD; # CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS +04DE; C; 04DF; # CYRILLIC CAPITAL LETTER ZE WITH DIAERESIS +04E0; C; 04E1; # CYRILLIC CAPITAL LETTER ABKHASIAN DZE +04E2; C; 04E3; # CYRILLIC CAPITAL LETTER I WITH MACRON +04E4; C; 04E5; # CYRILLIC CAPITAL LETTER I WITH DIAERESIS +04E6; C; 04E7; # CYRILLIC CAPITAL 
LETTER O WITH DIAERESIS +04E8; C; 04E9; # CYRILLIC CAPITAL LETTER BARRED O +04EA; C; 04EB; # CYRILLIC CAPITAL LETTER BARRED O WITH DIAERESIS +04EC; C; 04ED; # CYRILLIC CAPITAL LETTER E WITH DIAERESIS +04EE; C; 04EF; # CYRILLIC CAPITAL LETTER U WITH MACRON +04F0; C; 04F1; # CYRILLIC CAPITAL LETTER U WITH DIAERESIS +04F2; C; 04F3; # CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE +04F4; C; 04F5; # CYRILLIC CAPITAL LETTER CHE WITH DIAERESIS +04F8; C; 04F9; # CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS +0500; C; 0501; # CYRILLIC CAPITAL LETTER KOMI DE +0502; C; 0503; # CYRILLIC CAPITAL LETTER KOMI DJE +0504; C; 0505; # CYRILLIC CAPITAL LETTER KOMI ZJE +0506; C; 0507; # CYRILLIC CAPITAL LETTER KOMI DZJE +0508; C; 0509; # CYRILLIC CAPITAL LETTER KOMI LJE +050A; C; 050B; # CYRILLIC CAPITAL LETTER KOMI NJE +050C; C; 050D; # CYRILLIC CAPITAL LETTER KOMI SJE +050E; C; 050F; # CYRILLIC CAPITAL LETTER KOMI TJE +0531; C; 0561; # ARMENIAN CAPITAL LETTER AYB +0532; C; 0562; # ARMENIAN CAPITAL LETTER BEN +0533; C; 0563; # ARMENIAN CAPITAL LETTER GIM +0534; C; 0564; # ARMENIAN CAPITAL LETTER DA +0535; C; 0565; # ARMENIAN CAPITAL LETTER ECH +0536; C; 0566; # ARMENIAN CAPITAL LETTER ZA +0537; C; 0567; # ARMENIAN CAPITAL LETTER EH +0538; C; 0568; # ARMENIAN CAPITAL LETTER ET +0539; C; 0569; # ARMENIAN CAPITAL LETTER TO +053A; C; 056A; # ARMENIAN CAPITAL LETTER ZHE +053B; C; 056B; # ARMENIAN CAPITAL LETTER INI +053C; C; 056C; # ARMENIAN CAPITAL LETTER LIWN +053D; C; 056D; # ARMENIAN CAPITAL LETTER XEH +053E; C; 056E; # ARMENIAN CAPITAL LETTER CA +053F; C; 056F; # ARMENIAN CAPITAL LETTER KEN +0540; C; 0570; # ARMENIAN CAPITAL LETTER HO +0541; C; 0571; # ARMENIAN CAPITAL LETTER JA +0542; C; 0572; # ARMENIAN CAPITAL LETTER GHAD +0543; C; 0573; # ARMENIAN CAPITAL LETTER CHEH +0544; C; 0574; # ARMENIAN CAPITAL LETTER MEN +0545; C; 0575; # ARMENIAN CAPITAL LETTER YI +0546; C; 0576; # ARMENIAN CAPITAL LETTER NOW +0547; C; 0577; # ARMENIAN CAPITAL LETTER SHA +0548; C; 0578; # ARMENIAN CAPITAL 
LETTER VO +0549; C; 0579; # ARMENIAN CAPITAL LETTER CHA +054A; C; 057A; # ARMENIAN CAPITAL LETTER PEH +054B; C; 057B; # ARMENIAN CAPITAL LETTER JHEH +054C; C; 057C; # ARMENIAN CAPITAL LETTER RA +054D; C; 057D; # ARMENIAN CAPITAL LETTER SEH +054E; C; 057E; # ARMENIAN CAPITAL LETTER VEW +054F; C; 057F; # ARMENIAN CAPITAL LETTER TIWN +0550; C; 0580; # ARMENIAN CAPITAL LETTER REH +0551; C; 0581; # ARMENIAN CAPITAL LETTER CO +0552; C; 0582; # ARMENIAN CAPITAL LETTER YIWN +0553; C; 0583; # ARMENIAN CAPITAL LETTER PIWR +0554; C; 0584; # ARMENIAN CAPITAL LETTER KEH +0555; C; 0585; # ARMENIAN CAPITAL LETTER OH +0556; C; 0586; # ARMENIAN CAPITAL LETTER FEH +0587; F; 0565 0582; # ARMENIAN SMALL LIGATURE ECH YIWN +1E00; C; 1E01; # LATIN CAPITAL LETTER A WITH RING BELOW +1E02; C; 1E03; # LATIN CAPITAL LETTER B WITH DOT ABOVE +1E04; C; 1E05; # LATIN CAPITAL LETTER B WITH DOT BELOW +1E06; C; 1E07; # LATIN CAPITAL LETTER B WITH LINE BELOW +1E08; C; 1E09; # LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE +1E0A; C; 1E0B; # LATIN CAPITAL LETTER D WITH DOT ABOVE +1E0C; C; 1E0D; # LATIN CAPITAL LETTER D WITH DOT BELOW +1E0E; C; 1E0F; # LATIN CAPITAL LETTER D WITH LINE BELOW +1E10; C; 1E11; # LATIN CAPITAL LETTER D WITH CEDILLA +1E12; C; 1E13; # LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW +1E14; C; 1E15; # LATIN CAPITAL LETTER E WITH MACRON AND GRAVE +1E16; C; 1E17; # LATIN CAPITAL LETTER E WITH MACRON AND ACUTE +1E18; C; 1E19; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW +1E1A; C; 1E1B; # LATIN CAPITAL LETTER E WITH TILDE BELOW +1E1C; C; 1E1D; # LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE +1E1E; C; 1E1F; # LATIN CAPITAL LETTER F WITH DOT ABOVE +1E20; C; 1E21; # LATIN CAPITAL LETTER G WITH MACRON +1E22; C; 1E23; # LATIN CAPITAL LETTER H WITH DOT ABOVE +1E24; C; 1E25; # LATIN CAPITAL LETTER H WITH DOT BELOW +1E26; C; 1E27; # LATIN CAPITAL LETTER H WITH DIAERESIS +1E28; C; 1E29; # LATIN CAPITAL LETTER H WITH CEDILLA +1E2A; C; 1E2B; # LATIN CAPITAL LETTER H WITH BREVE BELOW +1E2C; C; 
1E2D; # LATIN CAPITAL LETTER I WITH TILDE BELOW +1E2E; C; 1E2F; # LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE +1E30; C; 1E31; # LATIN CAPITAL LETTER K WITH ACUTE +1E32; C; 1E33; # LATIN CAPITAL LETTER K WITH DOT BELOW +1E34; C; 1E35; # LATIN CAPITAL LETTER K WITH LINE BELOW +1E36; C; 1E37; # LATIN CAPITAL LETTER L WITH DOT BELOW +1E38; C; 1E39; # LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON +1E3A; C; 1E3B; # LATIN CAPITAL LETTER L WITH LINE BELOW +1E3C; C; 1E3D; # LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW +1E3E; C; 1E3F; # LATIN CAPITAL LETTER M WITH ACUTE +1E40; C; 1E41; # LATIN CAPITAL LETTER M WITH DOT ABOVE +1E42; C; 1E43; # LATIN CAPITAL LETTER M WITH DOT BELOW +1E44; C; 1E45; # LATIN CAPITAL LETTER N WITH DOT ABOVE +1E46; C; 1E47; # LATIN CAPITAL LETTER N WITH DOT BELOW +1E48; C; 1E49; # LATIN CAPITAL LETTER N WITH LINE BELOW +1E4A; C; 1E4B; # LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW +1E4C; C; 1E4D; # LATIN CAPITAL LETTER O WITH TILDE AND ACUTE +1E4E; C; 1E4F; # LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS +1E50; C; 1E51; # LATIN CAPITAL LETTER O WITH MACRON AND GRAVE +1E52; C; 1E53; # LATIN CAPITAL LETTER O WITH MACRON AND ACUTE +1E54; C; 1E55; # LATIN CAPITAL LETTER P WITH ACUTE +1E56; C; 1E57; # LATIN CAPITAL LETTER P WITH DOT ABOVE +1E58; C; 1E59; # LATIN CAPITAL LETTER R WITH DOT ABOVE +1E5A; C; 1E5B; # LATIN CAPITAL LETTER R WITH DOT BELOW +1E5C; C; 1E5D; # LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON +1E5E; C; 1E5F; # LATIN CAPITAL LETTER R WITH LINE BELOW +1E60; C; 1E61; # LATIN CAPITAL LETTER S WITH DOT ABOVE +1E62; C; 1E63; # LATIN CAPITAL LETTER S WITH DOT BELOW +1E64; C; 1E65; # LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE +1E66; C; 1E67; # LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE +1E68; C; 1E69; # LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE +1E6A; C; 1E6B; # LATIN CAPITAL LETTER T WITH DOT ABOVE +1E6C; C; 1E6D; # LATIN CAPITAL LETTER T WITH DOT BELOW +1E6E; C; 1E6F; # LATIN CAPITAL LETTER T WITH LINE BELOW 
+1E70; C; 1E71; # LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW +1E72; C; 1E73; # LATIN CAPITAL LETTER U WITH DIAERESIS BELOW +1E74; C; 1E75; # LATIN CAPITAL LETTER U WITH TILDE BELOW +1E76; C; 1E77; # LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW +1E78; C; 1E79; # LATIN CAPITAL LETTER U WITH TILDE AND ACUTE +1E7A; C; 1E7B; # LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS +1E7C; C; 1E7D; # LATIN CAPITAL LETTER V WITH TILDE +1E7E; C; 1E7F; # LATIN CAPITAL LETTER V WITH DOT BELOW +1E80; C; 1E81; # LATIN CAPITAL LETTER W WITH GRAVE +1E82; C; 1E83; # LATIN CAPITAL LETTER W WITH ACUTE +1E84; C; 1E85; # LATIN CAPITAL LETTER W WITH DIAERESIS +1E86; C; 1E87; # LATIN CAPITAL LETTER W WITH DOT ABOVE +1E88; C; 1E89; # LATIN CAPITAL LETTER W WITH DOT BELOW +1E8A; C; 1E8B; # LATIN CAPITAL LETTER X WITH DOT ABOVE +1E8C; C; 1E8D; # LATIN CAPITAL LETTER X WITH DIAERESIS +1E8E; C; 1E8F; # LATIN CAPITAL LETTER Y WITH DOT ABOVE +1E90; C; 1E91; # LATIN CAPITAL LETTER Z WITH CIRCUMFLEX +1E92; C; 1E93; # LATIN CAPITAL LETTER Z WITH DOT BELOW +1E94; C; 1E95; # LATIN CAPITAL LETTER Z WITH LINE BELOW +1E96; F; 0068 0331; # LATIN SMALL LETTER H WITH LINE BELOW +1E97; F; 0074 0308; # LATIN SMALL LETTER T WITH DIAERESIS +1E98; F; 0077 030A; # LATIN SMALL LETTER W WITH RING ABOVE +1E99; F; 0079 030A; # LATIN SMALL LETTER Y WITH RING ABOVE +1E9A; F; 0061 02BE; # LATIN SMALL LETTER A WITH RIGHT HALF RING +1E9B; C; 1E61; # LATIN SMALL LETTER LONG S WITH DOT ABOVE +1EA0; C; 1EA1; # LATIN CAPITAL LETTER A WITH DOT BELOW +1EA2; C; 1EA3; # LATIN CAPITAL LETTER A WITH HOOK ABOVE +1EA4; C; 1EA5; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE +1EA6; C; 1EA7; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE +1EA8; C; 1EA9; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE +1EAA; C; 1EAB; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE +1EAC; C; 1EAD; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW +1EAE; C; 1EAF; # LATIN CAPITAL LETTER A WITH BREVE AND ACUTE +1EB0; C; 1EB1; # LATIN 
CAPITAL LETTER A WITH BREVE AND GRAVE +1EB2; C; 1EB3; # LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE +1EB4; C; 1EB5; # LATIN CAPITAL LETTER A WITH BREVE AND TILDE +1EB6; C; 1EB7; # LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW +1EB8; C; 1EB9; # LATIN CAPITAL LETTER E WITH DOT BELOW +1EBA; C; 1EBB; # LATIN CAPITAL LETTER E WITH HOOK ABOVE +1EBC; C; 1EBD; # LATIN CAPITAL LETTER E WITH TILDE +1EBE; C; 1EBF; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE +1EC0; C; 1EC1; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE +1EC2; C; 1EC3; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE +1EC4; C; 1EC5; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE +1EC6; C; 1EC7; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW +1EC8; C; 1EC9; # LATIN CAPITAL LETTER I WITH HOOK ABOVE +1ECA; C; 1ECB; # LATIN CAPITAL LETTER I WITH DOT BELOW +1ECC; C; 1ECD; # LATIN CAPITAL LETTER O WITH DOT BELOW +1ECE; C; 1ECF; # LATIN CAPITAL LETTER O WITH HOOK ABOVE +1ED0; C; 1ED1; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE +1ED2; C; 1ED3; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE +1ED4; C; 1ED5; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE +1ED6; C; 1ED7; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE +1ED8; C; 1ED9; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW +1EDA; C; 1EDB; # LATIN CAPITAL LETTER O WITH HORN AND ACUTE +1EDC; C; 1EDD; # LATIN CAPITAL LETTER O WITH HORN AND GRAVE +1EDE; C; 1EDF; # LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE +1EE0; C; 1EE1; # LATIN CAPITAL LETTER O WITH HORN AND TILDE +1EE2; C; 1EE3; # LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW +1EE4; C; 1EE5; # LATIN CAPITAL LETTER U WITH DOT BELOW +1EE6; C; 1EE7; # LATIN CAPITAL LETTER U WITH HOOK ABOVE +1EE8; C; 1EE9; # LATIN CAPITAL LETTER U WITH HORN AND ACUTE +1EEA; C; 1EEB; # LATIN CAPITAL LETTER U WITH HORN AND GRAVE +1EEC; C; 1EED; # LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE +1EEE; C; 1EEF; # LATIN CAPITAL LETTER U WITH HORN AND TILDE +1EF0; C; 
1EF1; # LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW +1EF2; C; 1EF3; # LATIN CAPITAL LETTER Y WITH GRAVE +1EF4; C; 1EF5; # LATIN CAPITAL LETTER Y WITH DOT BELOW +1EF6; C; 1EF7; # LATIN CAPITAL LETTER Y WITH HOOK ABOVE +1EF8; C; 1EF9; # LATIN CAPITAL LETTER Y WITH TILDE +1F08; C; 1F00; # GREEK CAPITAL LETTER ALPHA WITH PSILI +1F09; C; 1F01; # GREEK CAPITAL LETTER ALPHA WITH DASIA +1F0A; C; 1F02; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA +1F0B; C; 1F03; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA +1F0C; C; 1F04; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA +1F0D; C; 1F05; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA +1F0E; C; 1F06; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI +1F0F; C; 1F07; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI +1F18; C; 1F10; # GREEK CAPITAL LETTER EPSILON WITH PSILI +1F19; C; 1F11; # GREEK CAPITAL LETTER EPSILON WITH DASIA +1F1A; C; 1F12; # GREEK CAPITAL LETTER EPSILON WITH PSILI AND VARIA +1F1B; C; 1F13; # GREEK CAPITAL LETTER EPSILON WITH DASIA AND VARIA +1F1C; C; 1F14; # GREEK CAPITAL LETTER EPSILON WITH PSILI AND OXIA +1F1D; C; 1F15; # GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA +1F28; C; 1F20; # GREEK CAPITAL LETTER ETA WITH PSILI +1F29; C; 1F21; # GREEK CAPITAL LETTER ETA WITH DASIA +1F2A; C; 1F22; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA +1F2B; C; 1F23; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA +1F2C; C; 1F24; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA +1F2D; C; 1F25; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA +1F2E; C; 1F26; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI +1F2F; C; 1F27; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI +1F38; C; 1F30; # GREEK CAPITAL LETTER IOTA WITH PSILI +1F39; C; 1F31; # GREEK CAPITAL LETTER IOTA WITH DASIA +1F3A; C; 1F32; # GREEK CAPITAL LETTER IOTA WITH PSILI AND VARIA +1F3B; C; 1F33; # GREEK CAPITAL LETTER IOTA WITH DASIA AND VARIA +1F3C; C; 1F34; # GREEK CAPITAL LETTER IOTA WITH PSILI AND OXIA +1F3D; C; 1F35; # 
GREEK CAPITAL LETTER IOTA WITH DASIA AND OXIA +1F3E; C; 1F36; # GREEK CAPITAL LETTER IOTA WITH PSILI AND PERISPOMENI +1F3F; C; 1F37; # GREEK CAPITAL LETTER IOTA WITH DASIA AND PERISPOMENI +1F48; C; 1F40; # GREEK CAPITAL LETTER OMICRON WITH PSILI +1F49; C; 1F41; # GREEK CAPITAL LETTER OMICRON WITH DASIA +1F4A; C; 1F42; # GREEK CAPITAL LETTER OMICRON WITH PSILI AND VARIA +1F4B; C; 1F43; # GREEK CAPITAL LETTER OMICRON WITH DASIA AND VARIA +1F4C; C; 1F44; # GREEK CAPITAL LETTER OMICRON WITH PSILI AND OXIA +1F4D; C; 1F45; # GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA +1F50; F; 03C5 0313; # GREEK SMALL LETTER UPSILON WITH PSILI +1F52; F; 03C5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA +1F54; F; 03C5 0313 0301; # GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA +1F56; F; 03C5 0313 0342; # GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI +1F59; C; 1F51; # GREEK CAPITAL LETTER UPSILON WITH DASIA +1F5B; C; 1F53; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA +1F5D; C; 1F55; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA +1F5F; C; 1F57; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI +1F68; C; 1F60; # GREEK CAPITAL LETTER OMEGA WITH PSILI +1F69; C; 1F61; # GREEK CAPITAL LETTER OMEGA WITH DASIA +1F6A; C; 1F62; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA +1F6B; C; 1F63; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA +1F6C; C; 1F64; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA +1F6D; C; 1F65; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA +1F6E; C; 1F66; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI +1F6F; C; 1F67; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI +1F80; F; 1F00 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI +1F81; F; 1F01 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI +1F82; F; 1F02 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI +1F83; F; 1F03 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI +1F84; F; 1F04 
03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI +1F85; F; 1F05 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI +1F86; F; 1F06 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI +1F87; F; 1F07 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1F88; F; 1F00 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI +1F88; S; 1F80; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI +1F89; F; 1F01 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI +1F89; S; 1F81; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI +1F8A; F; 1F02 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F8A; S; 1F82; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F8B; F; 1F03 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F8B; S; 1F83; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F8C; F; 1F04 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F8C; S; 1F84; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F8D; F; 1F05 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F8D; S; 1F85; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F8E; F; 1F06 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F8E; S; 1F86; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F8F; F; 1F07 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1F90; F; 1F20 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI +1F91; F; 1F21 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI +1F92; F; 1F22 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI +1F93; F; 1F23 03B9; # GREEK SMALL LETTER 
ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI +1F94; F; 1F24 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI +1F95; F; 1F25 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI +1F96; F; 1F26 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI +1F97; F; 1F27 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1F98; F; 1F20 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI +1F98; S; 1F90; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI +1F99; F; 1F21 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI +1F99; S; 1F91; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI +1F9A; F; 1F22 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F9A; S; 1F92; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F9B; F; 1F23 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F9B; S; 1F93; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F9C; F; 1F24 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F9C; S; 1F94; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F9D; F; 1F25 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F9D; S; 1F95; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F9E; F; 1F26 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F9E; S; 1F96; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F9F; F; 1F27 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1F9F; S; 1F97; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1FA0; F; 1F60 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI +1FA1; F; 1F61 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI +1FA2; F; 1F62 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI +1FA3; F; 1F63 
03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI +1FA4; F; 1F64 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI +1FA5; F; 1F65 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI +1FA6; F; 1F66 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI +1FA7; F; 1F67 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1FA8; F; 1F60 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI +1FA8; S; 1FA0; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI +1FA9; F; 1F61 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI +1FA9; S; 1FA1; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI +1FAA; F; 1F62 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1FAA; S; 1FA2; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1FAB; F; 1F63 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1FAB; S; 1FA3; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1FAC; F; 1F64 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1FAC; S; 1FA4; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1FAD; F; 1F65 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1FAD; S; 1FA5; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1FAE; F; 1F66 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1FAE; S; 1FA6; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1FAF; F; 1F67 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1FAF; S; 1FA7; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1FB2; F; 1F70 03B9; # GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI +1FB3; F; 03B1 03B9; # GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI +1FB4; F; 03AC 03B9; # GREEK SMALL LETTER 
ALPHA WITH OXIA AND YPOGEGRAMMENI +1FB6; F; 03B1 0342; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI +1FB7; F; 03B1 0342 03B9; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI +1FB8; C; 1FB0; # GREEK CAPITAL LETTER ALPHA WITH VRACHY +1FB9; C; 1FB1; # GREEK CAPITAL LETTER ALPHA WITH MACRON +1FBA; C; 1F70; # GREEK CAPITAL LETTER ALPHA WITH VARIA +1FBB; C; 1F71; # GREEK CAPITAL LETTER ALPHA WITH OXIA +1FBC; F; 03B1 03B9; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI +1FBC; S; 1FB3; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI +1FBE; C; 03B9; # GREEK PROSGEGRAMMENI +1FC2; F; 1F74 03B9; # GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI +1FC3; F; 03B7 03B9; # GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI +1FC4; F; 03AE 03B9; # GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI +1FC6; F; 03B7 0342; # GREEK SMALL LETTER ETA WITH PERISPOMENI +1FC7; F; 03B7 0342 03B9; # GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI +1FC8; C; 1F72; # GREEK CAPITAL LETTER EPSILON WITH VARIA +1FC9; C; 1F73; # GREEK CAPITAL LETTER EPSILON WITH OXIA +1FCA; C; 1F74; # GREEK CAPITAL LETTER ETA WITH VARIA +1FCB; C; 1F75; # GREEK CAPITAL LETTER ETA WITH OXIA +1FCC; F; 03B7 03B9; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI +1FCC; S; 1FC3; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI +1FD2; F; 03B9 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA +1FD3; F; 03B9 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA +1FD6; F; 03B9 0342; # GREEK SMALL LETTER IOTA WITH PERISPOMENI +1FD7; F; 03B9 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI +1FD8; C; 1FD0; # GREEK CAPITAL LETTER IOTA WITH VRACHY +1FD9; C; 1FD1; # GREEK CAPITAL LETTER IOTA WITH MACRON +1FDA; C; 1F76; # GREEK CAPITAL LETTER IOTA WITH VARIA +1FDB; C; 1F77; # GREEK CAPITAL LETTER IOTA WITH OXIA +1FE2; F; 03C5 0308 0300; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA +1FE3; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA +1FE4; F; 03C1 
0313; # GREEK SMALL LETTER RHO WITH PSILI +1FE6; F; 03C5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI +1FE7; F; 03C5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI +1FE8; C; 1FE0; # GREEK CAPITAL LETTER UPSILON WITH VRACHY +1FE9; C; 1FE1; # GREEK CAPITAL LETTER UPSILON WITH MACRON +1FEA; C; 1F7A; # GREEK CAPITAL LETTER UPSILON WITH VARIA +1FEB; C; 1F7B; # GREEK CAPITAL LETTER UPSILON WITH OXIA +1FEC; C; 1FE5; # GREEK CAPITAL LETTER RHO WITH DASIA +1FF2; F; 1F7C 03B9; # GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI +1FF3; F; 03C9 03B9; # GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI +1FF4; F; 03CE 03B9; # GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI +1FF6; F; 03C9 0342; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI +1FF7; F; 03C9 0342 03B9; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI +1FF8; C; 1F78; # GREEK CAPITAL LETTER OMICRON WITH VARIA +1FF9; C; 1F79; # GREEK CAPITAL LETTER OMICRON WITH OXIA +1FFA; C; 1F7C; # GREEK CAPITAL LETTER OMEGA WITH VARIA +1FFB; C; 1F7D; # GREEK CAPITAL LETTER OMEGA WITH OXIA +1FFC; F; 03C9 03B9; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI +1FFC; S; 1FF3; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI +2126; C; 03C9; # OHM SIGN +212A; C; 006B; # KELVIN SIGN +212B; C; 00E5; # ANGSTROM SIGN +2160; C; 2170; # ROMAN NUMERAL ONE +2161; C; 2171; # ROMAN NUMERAL TWO +2162; C; 2172; # ROMAN NUMERAL THREE +2163; C; 2173; # ROMAN NUMERAL FOUR +2164; C; 2174; # ROMAN NUMERAL FIVE +2165; C; 2175; # ROMAN NUMERAL SIX +2166; C; 2176; # ROMAN NUMERAL SEVEN +2167; C; 2177; # ROMAN NUMERAL EIGHT +2168; C; 2178; # ROMAN NUMERAL NINE +2169; C; 2179; # ROMAN NUMERAL TEN +216A; C; 217A; # ROMAN NUMERAL ELEVEN +216B; C; 217B; # ROMAN NUMERAL TWELVE +216C; C; 217C; # ROMAN NUMERAL FIFTY +216D; C; 217D; # ROMAN NUMERAL ONE HUNDRED +216E; C; 217E; # ROMAN NUMERAL FIVE HUNDRED +216F; C; 217F; # ROMAN NUMERAL ONE THOUSAND +24B6; C; 24D0; # CIRCLED LATIN CAPITAL LETTER A +24B7; C; 24D1; # CIRCLED 
LATIN CAPITAL LETTER B +24B8; C; 24D2; # CIRCLED LATIN CAPITAL LETTER C +24B9; C; 24D3; # CIRCLED LATIN CAPITAL LETTER D +24BA; C; 24D4; # CIRCLED LATIN CAPITAL LETTER E +24BB; C; 24D5; # CIRCLED LATIN CAPITAL LETTER F +24BC; C; 24D6; # CIRCLED LATIN CAPITAL LETTER G +24BD; C; 24D7; # CIRCLED LATIN CAPITAL LETTER H +24BE; C; 24D8; # CIRCLED LATIN CAPITAL LETTER I +24BF; C; 24D9; # CIRCLED LATIN CAPITAL LETTER J +24C0; C; 24DA; # CIRCLED LATIN CAPITAL LETTER K +24C1; C; 24DB; # CIRCLED LATIN CAPITAL LETTER L +24C2; C; 24DC; # CIRCLED LATIN CAPITAL LETTER M +24C3; C; 24DD; # CIRCLED LATIN CAPITAL LETTER N +24C4; C; 24DE; # CIRCLED LATIN CAPITAL LETTER O +24C5; C; 24DF; # CIRCLED LATIN CAPITAL LETTER P +24C6; C; 24E0; # CIRCLED LATIN CAPITAL LETTER Q +24C7; C; 24E1; # CIRCLED LATIN CAPITAL LETTER R +24C8; C; 24E2; # CIRCLED LATIN CAPITAL LETTER S +24C9; C; 24E3; # CIRCLED LATIN CAPITAL LETTER T +24CA; C; 24E4; # CIRCLED LATIN CAPITAL LETTER U +24CB; C; 24E5; # CIRCLED LATIN CAPITAL LETTER V +24CC; C; 24E6; # CIRCLED LATIN CAPITAL LETTER W +24CD; C; 24E7; # CIRCLED LATIN CAPITAL LETTER X +24CE; C; 24E8; # CIRCLED LATIN CAPITAL LETTER Y +24CF; C; 24E9; # CIRCLED LATIN CAPITAL LETTER Z +FB00; F; 0066 0066; # LATIN SMALL LIGATURE FF +FB01; F; 0066 0069; # LATIN SMALL LIGATURE FI +FB02; F; 0066 006C; # LATIN SMALL LIGATURE FL +FB03; F; 0066 0066 0069; # LATIN SMALL LIGATURE FFI +FB04; F; 0066 0066 006C; # LATIN SMALL LIGATURE FFL +FB05; F; 0073 0074; # LATIN SMALL LIGATURE LONG S T +FB06; F; 0073 0074; # LATIN SMALL LIGATURE ST +FB13; F; 0574 0576; # ARMENIAN SMALL LIGATURE MEN NOW +FB14; F; 0574 0565; # ARMENIAN SMALL LIGATURE MEN ECH +FB15; F; 0574 056B; # ARMENIAN SMALL LIGATURE MEN INI +FB16; F; 057E 0576; # ARMENIAN SMALL LIGATURE VEW NOW +FB17; F; 0574 056D; # ARMENIAN SMALL LIGATURE MEN XEH +FF21; C; FF41; # FULLWIDTH LATIN CAPITAL LETTER A +FF22; C; FF42; # FULLWIDTH LATIN CAPITAL LETTER B +FF23; C; FF43; # FULLWIDTH LATIN CAPITAL LETTER C +FF24; C; FF44; # 
FULLWIDTH LATIN CAPITAL LETTER D +FF25; C; FF45; # FULLWIDTH LATIN CAPITAL LETTER E +FF26; C; FF46; # FULLWIDTH LATIN CAPITAL LETTER F +FF27; C; FF47; # FULLWIDTH LATIN CAPITAL LETTER G +FF28; C; FF48; # FULLWIDTH LATIN CAPITAL LETTER H +FF29; C; FF49; # FULLWIDTH LATIN CAPITAL LETTER I +FF2A; C; FF4A; # FULLWIDTH LATIN CAPITAL LETTER J +FF2B; C; FF4B; # FULLWIDTH LATIN CAPITAL LETTER K +FF2C; C; FF4C; # FULLWIDTH LATIN CAPITAL LETTER L +FF2D; C; FF4D; # FULLWIDTH LATIN CAPITAL LETTER M +FF2E; C; FF4E; # FULLWIDTH LATIN CAPITAL LETTER N +FF2F; C; FF4F; # FULLWIDTH LATIN CAPITAL LETTER O +FF30; C; FF50; # FULLWIDTH LATIN CAPITAL LETTER P +FF31; C; FF51; # FULLWIDTH LATIN CAPITAL LETTER Q +FF32; C; FF52; # FULLWIDTH LATIN CAPITAL LETTER R +FF33; C; FF53; # FULLWIDTH LATIN CAPITAL LETTER S +FF34; C; FF54; # FULLWIDTH LATIN CAPITAL LETTER T +FF35; C; FF55; # FULLWIDTH LATIN CAPITAL LETTER U +FF36; C; FF56; # FULLWIDTH LATIN CAPITAL LETTER V +FF37; C; FF57; # FULLWIDTH LATIN CAPITAL LETTER W +FF38; C; FF58; # FULLWIDTH LATIN CAPITAL LETTER X +FF39; C; FF59; # FULLWIDTH LATIN CAPITAL LETTER Y +FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z +10400; C; 10428; # DESERET CAPITAL LETTER LONG I +10401; C; 10429; # DESERET CAPITAL LETTER LONG E +10402; C; 1042A; # DESERET CAPITAL LETTER LONG A +10403; C; 1042B; # DESERET CAPITAL LETTER LONG AH +10404; C; 1042C; # DESERET CAPITAL LETTER LONG O +10405; C; 1042D; # DESERET CAPITAL LETTER LONG OO +10406; C; 1042E; # DESERET CAPITAL LETTER SHORT I +10407; C; 1042F; # DESERET CAPITAL LETTER SHORT E +10408; C; 10430; # DESERET CAPITAL LETTER SHORT A +10409; C; 10431; # DESERET CAPITAL LETTER SHORT AH +1040A; C; 10432; # DESERET CAPITAL LETTER SHORT O +1040B; C; 10433; # DESERET CAPITAL LETTER SHORT OO +1040C; C; 10434; # DESERET CAPITAL LETTER AY +1040D; C; 10435; # DESERET CAPITAL LETTER OW +1040E; C; 10436; # DESERET CAPITAL LETTER WU +1040F; C; 10437; # DESERET CAPITAL LETTER YEE +10410; C; 10438; # DESERET CAPITAL LETTER H 
+10411; C; 10439; # DESERET CAPITAL LETTER PEE +10412; C; 1043A; # DESERET CAPITAL LETTER BEE +10413; C; 1043B; # DESERET CAPITAL LETTER TEE +10414; C; 1043C; # DESERET CAPITAL LETTER DEE +10415; C; 1043D; # DESERET CAPITAL LETTER CHEE +10416; C; 1043E; # DESERET CAPITAL LETTER JEE +10417; C; 1043F; # DESERET CAPITAL LETTER KAY +10418; C; 10440; # DESERET CAPITAL LETTER GAY +10419; C; 10441; # DESERET CAPITAL LETTER EF +1041A; C; 10442; # DESERET CAPITAL LETTER VEE +1041B; C; 10443; # DESERET CAPITAL LETTER ETH +1041C; C; 10444; # DESERET CAPITAL LETTER THEE +1041D; C; 10445; # DESERET CAPITAL LETTER ES +1041E; C; 10446; # DESERET CAPITAL LETTER ZEE +1041F; C; 10447; # DESERET CAPITAL LETTER ESH +10420; C; 10448; # DESERET CAPITAL LETTER ZHEE +10421; C; 10449; # DESERET CAPITAL LETTER ER +10422; C; 1044A; # DESERET CAPITAL LETTER EL +10423; C; 1044B; # DESERET CAPITAL LETTER EM +10424; C; 1044C; # DESERET CAPITAL LETTER EN +10425; C; 1044D; # DESERET CAPITAL LETTER ENG diff --git a/js/LICENSE b/js/LICENSE new file mode 100644 index 0000000..bb8c36f --- /dev/null +++ b/js/LICENSE @@ -0,0 +1,30 @@ +Copyright (c) 2014, John MacFarlane + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of John MacFarlane nor the names of other + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/js/bench.js b/js/bench.js new file mode 100644 index 0000000..3c486b1 --- /dev/null +++ b/js/bench.js @@ -0,0 +1,35 @@ +var Benchmark = require('benchmark').Benchmark; +var suite = new Benchmark.Suite; +var fs = require('fs'); +var sm = require('./stmd'); +// https://github.com/coreyti/showdown +var showdown = require('../../showdown/src/showdown'); +// https://github.com/chjj/marked +var marked = require('../../marked/marked.min.js'); + +var benchfile = process.argv[2] || 'oldtests/Original/Markdown_Documentation_Syntax.markdown'; + +var contents = fs.readFileSync(benchfile, 'utf8'); + +// var converter = new showdown.converter(); + +suite.add('stmd markdown->html', function() { + var doc = new sm.DocParser().parse(contents); + var renderer = new sm.HtmlRenderer(); + renderer.renderBlock(doc); +}) + +.add('showdown.js markdown->html', function() { + var converter = new showdown.converter(); + converter.makeHtml(contents); +}) + +.add('marked.js markdown->html', function() { + marked(contents); +}) + +.on('cycle', function(event) { + console.log(String(event.target)); +}) +.run(); + diff --git a/js/markdown b/js/markdown new file mode 100755 index 0000000..05a372a --- /dev/null +++ b/js/markdown @@ 
-0,0 +1,15 @@ +#!/usr/bin/env node +var fs = require('fs'); +var util = require('util'); +var stmd = require('./stmd'); + +file = process.argv[2] || '/dev/stdin'; + +fs.readFile(file, 'utf8', function(err, data) { + if (err) { + return console.log(err); + } + var parser = new stmd.DocParser(); + var renderer = new stmd.HtmlRenderer(); + console.log(renderer.render(parser.parse(data))); +}); diff --git a/js/stmd.js b/js/stmd.js new file mode 100755 index 0000000..399d58d --- /dev/null +++ b/js/stmd.js @@ -0,0 +1,1540 @@ +// stmd.js - "standard markdown" in javascript +// Copyright (C) 2014 John MacFarlane +// License: BSD3. + +// Basic usage: +// +// var stmd = require('stmd'); +// var parser = new stmd.DocParser(); +// var renderer = new stmd.HtmlRenderer(); +// console.log(renderer.render(parser.parse('Hello *world*'))); + +(function(exports) { + +// Some regexps used in inline parser: + +var ESCAPABLE = '[!"#$%&\'()*+,./:;<=>?@[\\\\\\]^_`{|}~-]'; +var ESCAPED_CHAR = '\\\\' + ESCAPABLE; +var IN_DOUBLE_QUOTES = '"(' + ESCAPED_CHAR + '|[^"\\x00])*"'; +var IN_SINGLE_QUOTES = '\'(' + ESCAPED_CHAR + '|[^\'\\x00])*\''; +var IN_PARENS = '\\((' + ESCAPED_CHAR + '|[^)\\x00])*\\)'; +var REG_CHAR = '[^\\\\()\\x00-\\x20]'; +var IN_PARENS_NOSP = '\\((' + REG_CHAR + '|' + ESCAPED_CHAR + ')*\\)'; +var TAGNAME = '[A-Za-z][A-Za-z0-9]*'; +var BLOCKTAGNAME = '(?:article|header|aside|hgroup|blockquote|hr|body|li|br|map|button|object|canvas|ol|caption|output|col|p|colgroup|pre|dd|progress|div|section|dl|table|td|dt|tbody|embed|textarea|fieldset|tfoot|figcaption|th|figure|thead|footer|footer|tr|form|ul|h1|h2|h3|h4|h5|h6|video|script|style)'; +var ATTRIBUTENAME = '[a-zA-Z_:][a-zA-Z0-9:._-]*'; +var UNQUOTEDVALUE = "[^\"'=<>`\\x00-\\x20]+"; +var SINGLEQUOTEDVALUE = "'[^']*'"; +var DOUBLEQUOTEDVALUE = '"[^"]*"'; +var ATTRIBUTEVALUE = "(?:" + UNQUOTEDVALUE + "|" + SINGLEQUOTEDVALUE + "|" + DOUBLEQUOTEDVALUE + ")"; +var ATTRIBUTEVALUESPEC = "(?:" + "\\s*=" + "\\s*" + ATTRIBUTEVALUE + ")"; 
// One attribute inside a tag: mandatory leading whitespace, an attribute
// name, and an optional "= value" part.
var ATTRIBUTE = "(?:" + "\\s+" + ATTRIBUTENAME + ATTRIBUTEVALUESPEC + "?)";
// "<name attr...>" with an optional "/" before the closing ">".
var OPENTAG = "<" + TAGNAME + ATTRIBUTE + "*" + "\\s*/?>";
// "</name>" with optional whitespace before ">".
var CLOSETAG = "</" + TAGNAME + "\\s*[>]";
// Same shapes as OPENTAG / CLOSETAG, restricted to the names listed in
// BLOCKTAGNAME.
var OPENBLOCKTAG = "<" + BLOCKTAGNAME + ATTRIBUTE + "*" + "\\s*/?>";
var CLOSEBLOCKTAG = "</" + BLOCKTAGNAME + "\\s*[>]";
// "<!-- ... -->" where every "-" in the body is followed by a non-"-".
// NOTE(review): the nested quantifiers here (and in CDATA below) look like
// they could backtrack heavily on long unterminated input -- confirm.
var HTMLCOMMENT = "<!--([^-]+|[-][^-]+)*-->";
// "<? ... ?>"; "." does not match newlines, so this stays on one line.
var PROCESSINGINSTRUCTION = "[<][?].*?[?][>]";
// "<!NAME ...>" with an upper-case name followed by whitespace.
var DECLARATION = "<![A-Z]+" + "\\s+[^>]*>";
// "<![CDATA[ ... ]]>".
var CDATA = "<!\\[CDATA\\[([^\\]]+|\\][^\\]]|\\]\\][^>])*\\]\\]>";
// Any one of the tag-like constructs above.
var HTMLTAG = "(?:" + OPENTAG + "|" + CLOSETAG + "|" + HTMLCOMMENT + "|" +
        PROCESSINGINSTRUCTION + "|" + DECLARATION + "|" + CDATA + ")";
// "<" followed by a block tag name (opening or closing form), or by
// "?" or "!".
var HTMLBLOCKOPEN = "<(?:" + BLOCKTAGNAME + "[\\s/>]" + "|" +
        "/" + BLOCKTAGNAME + "[\\s>]" + "|" + "[?!])";

// Anchored, case-insensitive compiled forms of the two patterns above.
var reHtmlTag = new RegExp('^' + HTMLTAG, 'i');

var reHtmlBlockOpen = new RegExp('^' + HTMLBLOCKOPEN, 'i');

// A link title: double-quoted, single-quoted, or parenthesized text.
// Built from the shared IN_* fragments so the three alternatives cannot
// drift apart from their definitions; the resulting pattern is identical
// to spelling the fragments out inline.
var reLinkTitle = new RegExp(
    '^(?:' + IN_DOUBLE_QUOTES + '|' + IN_SINGLE_QUOTES + '|' + IN_PARENS + ')');

// "<...>" containing no "<", ">", newline, or NUL; backslashes may appear
// alone or as part of an escaped character.  Not anchored.
var reLinkDestinationBraces = new RegExp(
    '[<](?:[^<>\\n\\\\\\x00]' + '|' + ESCAPED_CHAR + '|' + '\\\\)*[>]');

// Any sequence of regular characters, escaped characters, and
// parenthesized groups without spaces.  Not anchored, and may match the
// empty string.
var reLinkDestination = new RegExp(
    '(?:' + REG_CHAR + '+|' + ESCAPED_CHAR + '|' + IN_PARENS_NOSP + ')*');

// A single character from the ESCAPABLE class.
var reEscapable = new RegExp(ESCAPABLE);

// Every backslash + escapable character pair; group 1 is the character.
var reAllEscapedChar = new RegExp('\\\\(' + ESCAPABLE + ')', 'g');

// A backslash + escapable character pair at the start of the input.
var reEscapedChar = new RegExp('^\\\\(' + ESCAPABLE + ')');

// Every tab character.
var reAllTab = /\t/g;

// A whole line made of three or more "*", "_" or "-" characters (all the
// same kind), each optionally followed by spaces.
var reHrule = /^(?:(?:\* *){3,}|(?:_ *){3,}|(?:- *){3,}) *$/;

// Matches a character with a special meaning in markdown,
// or a string of non-special characters.
var reMain = /[\n`\[\]\\!<&*_]|[^\n`\[\]\\!<&*_]+/m;

// UTILITY FUNCTIONS

// Replace backslash escapes with literal characters.
// Each backslash + escapable character pair is replaced by the bare
// character; backslashes before other characters are left untouched.
var unescape = function(s) {
    return s.replace(reAllEscapedChar, '$1');
};

// Returns true if string contains only space characters.
var isBlank = function(s) {
    // Empty strings count as blank: the pattern allows zero characters.
    return /^\s*$/.test(s);
};

// Normalize reference label: collapse internal whitespace
// to single space, remove leading/trailing whitespace, case fold.
var normalizeReference = function(s) {
    // The 'g' flag is required: without it only the first run of
    // whitespace is collapsed and labels such as "a  b  c" and "a b c"
    // normalize to different keys.
    return s.trim()
        .replace(/\s+/g, ' ')
        .toUpperCase();
};

// Attempt to match a regex in string s at offset offset.
// Return index of match (relative to the start of s) or null.
var matchAt = function(re, s, offset) {
    var res = s.slice(offset).match(re);
    if (res) {
        return offset + res.index;
    } else {
        return null;
    }
};

// Convert tabs to spaces on each line using a 4-space tab stop.
var detabLine = function(text) {
    if (text.indexOf('\t') == -1) {
        return text;
    } else {
        var lastStop = 0;
        return text.replace(reAllTab, function(match, offset) {
            // Start from a full tab stop (four spaces) and drop one space
            // for every column already used since the previous tab.
            var result = '    '.slice((offset - lastStop) % 4);
            lastStop = offset + 1;
            return result;
        });
    }
};

// INLINE PARSER

// These are methods of an InlineParser object, defined below.
// An InlineParser keeps track of a subject (a string to be
// parsed) and a position in that subject.

// If re matches at current position in the subject, advance
// position in subject and return the match; otherwise return null.
// Note that an unanchored regexp may match further along the subject,
// in which case the position is advanced past that later match.
var match = function(re) {
    var m = re.exec(this.subject.slice(this.pos));
    if (m) {
        this.pos += m.index + m[0].length;
        return m[0];
    } else {
        return null;
    }
};

// Returns the character at the current subject position, or null if
// there are no more characters.
var peek = function() {
    return this.subject[this.pos] || null;
};

// Parse zero or more space characters, including at most one newline.
// Always returns 1 so that it can be chained with && in conditions.
var spnl = function() {
    this.match(/^ *(?:\n *)?/);
    return 1;
};

// All of the parsers below try to match something at the current position
// in the subject. If they succeed in matching anything, they
// push an inline element onto the 'inlines' list. They return the
// number of characters parsed (possibly 0).

// Try to read a backtick code span at the current position. On success a
// 'Code' inline is appended; if no closing run of the same length exists,
// the opening run is appended as a literal 'Str'. Returns the number of
// characters consumed (0 if the subject does not start with a backtick).
var parseBackticks = function(inlines) {
    var start = this.pos;
    var opener = this.match(/^`+/);
    if (!opener) {
        return 0;
    }
    var contentStart = this.pos;
    // Walk forward over every later run of backticks until one has exactly
    // the same length as the opener.
    var run = this.match(/`+/m);
    while (run) {
        if (run == opener) {
            var raw = this.subject.slice(contentStart,
                                         this.pos - opener.length);
            inlines.push({ t: 'Code',
                           c: raw.replace(/[ \n]+/g,' ').trim() });
            return this.pos - start;
        }
        run = this.match(/`+/m);
    }
    // No closer found: emit the opener literally and resume right after it.
    inlines.push({ t: 'Str', c: opener });
    this.pos = contentStart;
    return this.pos - start;
};

// Handle a backslash at the current position. Appends a hard line break
// when the backslash precedes a newline, the escaped character when it is
// escapable, and otherwise a literal backslash. Returns the number of
// characters consumed (0 if the current character is not a backslash).
var parseEscaped = function(inlines) {
    var subject = this.subject;
    var here = this.pos;
    if (subject[here] !== '\\') {
        return 0;
    }
    var next = subject[here + 1];
    if (next === '\n') {
        inlines.push({ t: 'Hardbreak' });
        this.pos = here + 2;
        return 2;
    }
    if (reEscapable.test(next)) {
        inlines.push({ t: 'Str', c: next });
        this.pos = here + 2;
        return 2;
    }
    this.pos = here + 1;
    inlines.push({t: 'Str', c: '\\'});
    return 1;
};

// Attempt to parse an autolink (URL or email in pointy brackets).
var parseAutolink = function(inlines) {
    // <local@domain> form.
    var reEmailAutolink = /^<([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)>/;
    // <scheme:rest> form, restricted to a fixed list of schemes.
    var reUriAutolink = /^<(?:coap|doi|javascript|aaa|aaas|about|acap|cap|cid|crid|data|dav|dict|dns|file|ftp|geo|go|gopher|h323|http|https|iax|icap|im|imap|info|ipp|iris|iris.beep|iris.xpc|iris.xpcs|iris.lwz|ldap|mailto|mid|msrp|msrps|mtqp|mupdate|news|nfs|ni|nih|nntp|opaquelocktoken|pop|pres|rtsp|service|session|shttp|sieve|sip|sips|sms|snmp|soap.beep|soap.beeps|tag|tel|telnet|tftp|thismessage|tn3270|tip|tv|urn|vemmi|ws|wss|xcon|xcon-userid|xmlrpc.beep|xmlrpc.beeps|xmpp|z39.50r|z39.50s|adiumxtra|afp|afs|aim|apt|attachment|aw|beshare|bitcoin|bolo|callto|chrome|chrome-extension|com-eventbrite-attendee|content|cvs|dlna-playsingle|dlna-playcontainer|dtn|dvb|ed2k|facetime|feed|finger|fish|gg|git|gizmoproject|gtalk|hcp|icon|ipn|irc|irc6|ircs|itms|jar|jms|keyparc|lastfm|ldaps|magnet|maps|market|message|mms|ms-help|msnim|mumble|mvn|notes|oid|palm|paparazzi|platform|proxy|psyc|query|res|resource|rmi|rsync|rtmp|secondlife|sftp|sgn|skype|smb|soldat|spotify|ssh|steam|svn|teamspeak|things|udp|unreal|ut2004|ventrilo|view-source|webcal|wtai|wyciwyg|xfire|xri|ymsgr):[^<>\x00-\x20]*>/i;

    // Email autolinks are tried first and get a 'mailto:' destination;
    // URI autolinks use the bracketed text unchanged.
    var prefix = 'mailto:';
    var bracketed = this.match(reEmailAutolink);
    if (!bracketed) {
        prefix = '';
        bracketed = this.match(reUriAutolink);
    }
    if (!bracketed) {
        return 0;
    }

    // Drop the surrounding < and >.
    var target = bracketed.slice(1, -1);
    inlines.push({ t: 'Link',
                   label: [{ t: 'Str', c: target }],
                   destination: prefix + target });
    return bracketed.length;
};

// Attempt to parse a raw HTML tag.
var parseHtmlTag = function(inlines) {
    var m = this.match(reHtmlTag);
    if (m) {
        inlines.push({ t: 'Html', c: m });
        return m.length;
    } else {
        return 0;
    }
};

// Scan a sequence of characters == c, and return information about
// the number of delimiters and whether they are positioned such that
// they can open and/or close emphasis or strong emphasis. A utility
// function for strong/emph parsing. The subject position is left
// unchanged.
var scanDelims = function(c) {
    var numdelims = 0;
    var char_before, char_after;
    var startpos = this.pos;

    // Start and end of subject are treated like a newline (whitespace).
    char_before = this.pos === 0 ? '\n' :
        this.subject[this.pos - 1];

    while (this.peek() === c) {
        numdelims++;
        this.pos++;
    }

    char_after = this.peek() || '\n';

    var can_open = numdelims > 0 && numdelims <= 3 && !(/\s/.test(char_after));
    var can_close = numdelims > 0 && numdelims <= 3 && !(/\s/.test(char_before));
    if (c === '_') {
        // Underscores adjacent to an alphanumeric on the outer side
        // do not open/close emphasis.
        can_open = can_open && !((/[a-z0-9]/i).test(char_before));
        can_close = can_close && !((/[a-z0-9]/i).test(char_after));
    }
    this.pos = startpos;
    return { numdelims: numdelims,
             can_open: can_open,
             can_close: can_close };
};

// Attempt to parse emphasis or strong emphasis in an efficient way,
// with no backtracking.
//
// The run of delimiters is always pushed onto 'inlines' as a literal
// 'Str' and consumed. If the run cannot open emphasis, 0 is returned
// even though the position has advanced. Otherwise the number of
// characters consumed is returned, and the literal is converted in place
// to Emph/Strong if a matching closer was found.
var parseEmphasis = function(inlines) {
    var startpos = this.pos;
    var c;
    var first_close = 0;
    var nxt = this.peek();
    if (nxt == '*' || nxt == '_') {
        c = nxt;
    } else {
        return 0;
    }

    var numdelims;
    var delimpos;
    // Must be local: this function is re-entered through parseInline, and
    // a shared global would be overwritten by nested calls.
    var res;

    // Get opening delimiters.
    res = this.scanDelims(c);
    numdelims = res.numdelims;
    this.pos += numdelims;
    // We provisionally add a literal string. If we match appropriate
    // closing delimiters, we'll change this to Strong or Emph.
    inlines.push({t: 'Str',
                  c: this.subject.substr(this.pos - numdelims, numdelims)});
    // Record the position of this opening delimiter:
    delimpos = inlines.length - 1;

    if (!res.can_open || numdelims === 0) {
        return 0;
    }

    var first_close_delims = 0;

    switch (numdelims) {
    case 1: // we started with * or _
        while (true) {
            res = this.scanDelims(c);
            if (res.numdelims >= 1 && res.can_close) {
                this.pos += 1;
                // Convert the inline at delimpos, currently a string with the delim,
                // into an Emph whose contents are the succeeding inlines
                inlines[delimpos].t = 'Emph';
                inlines[delimpos].c = inlines.slice(delimpos + 1);
                inlines.splice(delimpos + 1);
                break;
            } else {
                if (this.parseInline(inlines) === 0) {
                    break;
                }
            }
        }
        return (this.pos - startpos);

    case 2: // We started with ** or __
        while (true) {
            res = this.scanDelims(c);
            if (res.numdelims >= 2 && res.can_close) {
                this.pos += 2;
                inlines[delimpos].t = 'Strong';
                inlines[delimpos].c = inlines.slice(delimpos + 1);
                inlines.splice(delimpos + 1);
                break;
            } else {
                if (this.parseInline(inlines) === 0) {
                    break;
                }
            }
        }
        return (this.pos - startpos);

    case 3: // We started with *** or ___
        while (true) {
            res = this.scanDelims(c);
            if (res.numdelims >= 1 && res.numdelims <= 3 && res.can_close &&
                res.numdelims != first_close_delims) {
                if (res.numdelims === 3) {
                    // If we opened with ***, then we interpret *** as * followed by **
                    // giving us <strong><em>
                    res.numdelims = 1;
                }
                this.pos += res.numdelims;

                if (first_close > 0) { // if we've already passed the first closer:
                    inlines[delimpos].t = first_close_delims === 1 ? 'Strong' : 'Emph';
                    inlines[delimpos].c = [
                        { t: first_close_delims === 1 ? 'Emph' : 'Strong',
                          c: inlines.slice(delimpos + 1, first_close)}
                    ].concat(inlines.slice(first_close + 1));
                    inlines.splice(delimpos + 1);
                    break;
                } else { // this is the first closer; for now, add literal string;
                    // we'll change this when we hit the second closer
                    inlines.push({t: 'Str',
                                  c: this.subject.slice(this.pos - res.numdelims,
                                                        this.pos) });
                    first_close = inlines.length - 1;
                    first_close_delims = res.numdelims;
                }
            } else { // parse another inline element, til we hit the end
                if (this.parseInline(inlines) === 0) {
                    break;
                }
            }
        }
        return (this.pos - startpos);

    default:
        // Not reachable: scanDelims reports can_open === false for runs
        // longer than three, and that case returned above.
        return 0;
    }
};

// Attempt to parse link title (sans quotes), returning the string
// or null if no match.
var parseLinkTitle = function() {
    var title = this.match(reLinkTitle);
    if (title) {
        // chop off quotes from title and unescape:
        return unescape(title.substr(1, title.length - 2));
    } else {
        return null;
    }
};

// Attempt to parse link destination, returning the string or
// null if no match.
var parseLinkDestination = function() {
    var res = this.match(reLinkDestinationBraces);
    if (res) { // chop off surrounding <..>:
        return unescape(res.substr(1, res.length - 2));
    } else {
        res = this.match(reLinkDestination);
        if (res !== null) {
            return unescape(res);
        } else {
            return null;
        }
    }
};

// Attempt to parse a link label, returning number of characters parsed.
// On failure the position is restored and 0 is returned.
var parseLinkLabel = function() {
    if (this.peek() != '[') {
        return 0;
    }
    var startpos = this.pos;
    var nest_level = 0;
    if (this.label_nest_level > 0) {
        // If we've already checked to the end of this subject
        // for a label, even with a different starting [, we
        // know we won't find one here and we can just return.
        // This avoids lots of backtracking.
        // Note: nest level 1 would be: [foo [bar]
        // nest level 2 would be: [foo [bar [baz]
        this.label_nest_level--;
        return 0;
    }
    this.pos++; // advance past [
    var c;
    while ((c = this.peek()) && (c != ']' || nest_level > 0)) {
        // The sub-parsers are called only to advance the position past
        // constructs that may contain brackets; their output is discarded.
        switch (c) {
        case '`':
            this.parseBackticks([]);
            break;
        case '<':
            this.parseAutolink([]) || this.parseHtmlTag([]) || this.parseString([]);
            break;
        case '[': // nested []
            nest_level++;
            this.pos++;
            break;
        case ']': // nested []
            nest_level--;
            this.pos++;
            break;
        case '\\':
            this.parseEscaped([]);
            break;
        default:
            this.parseString([]);
        }
    }
    if (c === ']') {
        this.label_nest_level = 0;
        this.pos++; // advance past ]
        return this.pos - startpos;
    } else {
        if (!c) {
            // Ran off the end: remember how deep we were so that later
            // attempts on the same subject can bail out early.
            this.label_nest_level = nest_level;
        }
        this.pos = startpos;
        return 0;
    }
};

// Parse raw link label, including surrounding [], and return
// inline contents. (Note: this is not a method of InlineParser.)
var parseRawLabel = function(s) {
    // note: parse without a refmap; we don't want links to resolve
    // in nested brackets!
    return new InlineParser().parse(s.substr(1, s.length - 2), {});
};

// Attempt to parse a link. If successful, add the link to
// inlines and return the number of characters consumed; otherwise
// restore the position and return 0.
var parseLink = function(inlines) {
    var startpos = this.pos;
    var reflabel;
    var n;
    var dest;
    var title;

    n = this.parseLinkLabel();
    if (n === 0) {
        return 0;
    }
    var rawlabel = this.subject.substr(startpos, n);

    // if we got this far, we've parsed a label.
    // Try to parse an explicit link: [label](url "title")
    if (this.peek() == '(') {
        this.pos++;
        if (this.spnl() &&
            ((dest = this.parseLinkDestination()) !== null) &&
            this.spnl() &&
            // make sure there's a space before the title:
            (/^\s/.test(this.subject[this.pos - 1]) &&
             (title = this.parseLinkTitle() || '') || true) &&
            this.spnl() &&
            this.match(/^\)/)) {
            inlines.push({ t: 'Link',
                           destination: dest,
                           title: title,
                           label: parseRawLabel(rawlabel) });
            return this.pos - startpos;
        } else {
            this.pos = startpos;
            return 0;
        }
    }
    // If we're here, it wasn't an explicit link. Try to parse a reference link.
    // first, see if there's another label
    var savepos = this.pos;
    this.spnl();
    var beforelabel = this.pos;
    n = this.parseLinkLabel();
    if (n == 2) {
        // empty second label
        reflabel = rawlabel;
    } else if (n > 0) {
        reflabel = this.subject.slice(beforelabel, beforelabel + n);
    } else {
        this.pos = savepos;
        reflabel = rawlabel;
    }
    // lookup reflabel in refmap
    var link = this.refmap[normalizeReference(reflabel)];
    if (link) {
        inlines.push({t: 'Link',
                      destination: link.destination,
                      title: link.title,
                      label: parseRawLabel(rawlabel) });
        return this.pos - startpos;
    } else {
        // Nothing worked, rewind:
        this.pos = startpos;
        return 0;
    }
};

// Attempt to parse an entity, adding to inlines if successful.
var parseEntity = function(inlines) {
    var m;
    if ((m = this.match(/^&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});/i))) {
        inlines.push({ t: 'Entity', c: m });
        return m.length;
    } else {
        return 0;
    }
};

// Parse a run of ordinary characters, or a single character with
// a special meaning in markdown, as a plain string, adding to inlines.
var parseString = function(inlines) {
    var m;
    if ((m = this.match(reMain))) {
        inlines.push({ t: 'Str', c: m });
        return m.length;
    } else {
        return 0;
    }
};

// Parse a newline.
// If the newline was preceded by two spaces, emit a hard line break;
// otherwise emit a soft line break. Trailing spaces are removed from the
// preceding 'Str' inline in either case. Returns 1 if a newline was
// consumed, 0 otherwise.
var parseNewline = function(inlines) {
    if (this.peek() == '\n') {
        this.pos++;
        var last = inlines[inlines.length - 1];
        // Compare the last TWO characters against two spaces; comparing a
        // two-character slice against a single space can never succeed.
        if (last && last.t == 'Str' && last.c.slice(-2) == '  ') {
            last.c = last.c.replace(/ *$/,'');
            inlines.push({ t: 'Hardbreak' });
        } else {
            if (last && last.t == 'Str' && last.c.slice(-1) == ' ') {
                last.c = last.c.slice(0, -1);
            }
            inlines.push({ t: 'Softbreak' });
        }
        return 1;
    } else {
        return 0;
    }
};

// Attempt to parse an image. If the opening '!' is not followed
// by a link, add a literal '!' to inlines.
var parseImage = function(inlines) {
    if (this.match(/^!/)) {
        var n = this.parseLink(inlines);
        if (n === 0) {
            inlines.push({ t: 'Str', c: '!' });
            return 1;
        } else if (inlines[inlines.length - 1] &&
                   inlines[inlines.length - 1].t == 'Link') {
            // Reuse the link that parseLink just appended.
            inlines[inlines.length - 1].t = 'Image';
            return n+1;
        } else {
            throw "Shouldn't happen";
        }
    } else {
        return 0;
    }
};

// Attempt to parse a link reference, modifying refmap.
var parseReference = function(s, refmap) {
    this.subject = s;
    this.pos = 0;
    var origin = this.pos;

    // label:
    var labelLength = this.parseLinkLabel();
    if (labelLength === 0) {
        return 0;
    }
    var rawlabel = this.subject.substr(0, labelLength);

    // colon:
    if (this.peek() !== ':') {
        this.pos = origin;
        return 0;
    }
    this.pos++;

    // link url
    this.spnl();
    var dest = this.parseLinkDestination();
    if (dest === null || dest.length === 0) {
        this.pos = origin;
        return 0;
    }

    // optional title; if there is none, rewind before the spaces
    var beforetitle = this.pos;
    this.spnl();
    var title = this.parseLinkTitle();
    if (title === null) {
        title = '';
        this.pos = beforetitle;
    }

    // make sure we're at line end:
    if (this.match(/^ *(?:\n|$)/) === null) {
        this.pos = origin;
        return 0;
    }

    // The first definition of a label wins; later ones are ignored.
    var key = normalizeReference(rawlabel);
    if (!refmap[key]) {
        refmap[key] = { destination: dest, title: title };
    }
    return this.pos - origin;
};

// Parse the next inline element in subject, advancing subject position
// and adding the result to 'inlines'. Dispatches on the next character;
// when the specialised parser consumes nothing (or there is none for this
// character), falls back to parseString.
var parseInline = function(inlines) {
    var next = this.peek();
    var consumed;
    if (next === '\n') {
        consumed = this.parseNewline(inlines);
    } else if (next === '\\') {
        consumed = this.parseEscaped(inlines);
    } else if (next === '`') {
        consumed = this.parseBackticks(inlines);
    } else if (next === '*' || next === '_') {
        consumed = this.parseEmphasis(inlines);
    } else if (next === '[') {
        consumed = this.parseLink(inlines);
    } else if (next === '!') {
        consumed = this.parseImage(inlines);
    } else if (next === '<') {
        consumed = this.parseAutolink(inlines) ||
            this.parseHtmlTag(inlines);
    } else if (next === '&') {
        consumed = this.parseEntity(inlines);
    }
    return consumed || this.parseString(inlines);
};

// Parse s as a list of inlines, using refmap to resolve references.
var parseInlines = function(s, refmap) {
    this.subject = s;
    this.pos = 0;
    this.refmap = refmap || {};
    var inlines = [];
    // Keep going until parseInline reports that nothing was consumed.
    var consumed;
    do {
        consumed = this.parseInline(inlines);
    } while (consumed);
    return inlines;
};

// The InlineParser object: parser state plus the methods defined above.
function InlineParser(){
    var parser = {
        subject: '',
        label_nest_level: 0, // used by parseLinkLabel method
        pos: 0,
        refmap: {},
        match: match,
        peek: peek,
        spnl: spnl,
        parseBackticks: parseBackticks,
        parseEscaped: parseEscaped,
        parseAutolink: parseAutolink,
        parseHtmlTag: parseHtmlTag,
        scanDelims: scanDelims,
        parseEmphasis: parseEmphasis,
        parseLinkTitle: parseLinkTitle,
        parseLinkDestination: parseLinkDestination,
        parseLinkLabel: parseLinkLabel,
        parseLink: parseLink,
        parseEntity: parseEntity,
        parseString: parseString,
        parseNewline: parseNewline,
        parseImage: parseImage,
        parseReference: parseReference,
        parseInline: parseInline,
        parse: parseInlines
    };
    return parser;
}

// DOC PARSER

// These are methods of a DocParser object, defined below.

// Create a fresh, open block of the given type that starts (and, for now,
// ends) on start_line, with no parent and no content.
var makeBlock = function(tag, start_line, start_column) {
    var block = {
        t: tag,
        open: true,
        last_line_blank: false,
        start_line: start_line,
        start_column: start_column,
        end_line: start_line,
        children: [],
        parent: null,
        // string_content is formed by concatenating strings, in finalize:
        string_content: "",
        strings: [],
        inline_content: []
    };
    return block;
};

// Returns true if parent block can contain child block.
var canContain = function(parent_type, child_type) {
    if (parent_type == 'Document' ||
        parent_type == 'BlockQuote' ||
        parent_type == 'ListItem') {
        return true;
    }
    // A List may only hold ListItems.
    return parent_type == 'List' && child_type == 'ListItem';
};

// Returns true if block type can accept lines of text.
var acceptsLines = function(block_type) {
    if (block_type == 'Paragraph') {
        return true;
    }
    if (block_type == 'IndentedCode') {
        return true;
    }
    return block_type == 'FencedCode';
};

// Returns true if block ends with a blank line, descending if needed
// into lists and sublists.
var endsWithBlankLine = function(block) {
    // Follow the chain of last children for as long as we stay inside
    // List / ListItem blocks.
    var current = block;
    while (true) {
        if (current.last_line_blank) {
            return true;
        }
        var isListLike = current.t == 'List' || current.t == 'ListItem';
        if (!isListLike || current.children.length === 0) {
            return false;
        }
        current = current.children[current.children.length - 1];
    }
};

// Break out of all containing lists, resetting the tip of the
// document to the parent of the highest list, and finalizing
// all the lists. (This is used to implement the "two blank lines
// break out of all lists" feature.)
var breakOutOfLists = function(block, line_number) {
    // Walk up to the root, remembering the outermost List seen.
    var outermostList = null;
    var ancestor = block;
    do {
        if (ancestor.t === 'List') {
            outermostList = ancestor;
        }
        ancestor = ancestor.parent;
    } while (ancestor);

    if (!outermostList) {
        return;
    }

    // Close everything from the starting block up to and including
    // that outermost list.
    var closing = block;
    while (closing != outermostList) {
        this.finalize(closing, line_number);
        closing = closing.parent;
    }
    this.finalize(outermostList, line_number);
    this.tip = outermostList.parent;
};

// Add a line to the block at the tip. We assume the tip
// can accept lines -- that check should be done before calling this.
var addLine = function(ln, offset) {
    var text = ln.slice(offset);
    if (!(this.tip.open)) {
        throw({ msg: "Attempted to add line (" + ln + ") to closed container." });
    }
    this.tip.strings.push(text);
};

// Add block of type tag as a child of the tip. If the tip can't
// accept children, close and finalize it and try its parent,
// and so on til we find a block that can accept children.
var addChild = function(tag, line_number, offset) {
    // finalize() moves this.tip to the parent of the block it closes.
    while (!canContain(this.tip.t, tag)) {
        this.finalize(this.tip, line_number);
    }

    // offset 0 = column 1
    var child = makeBlock(tag, line_number, offset + 1);
    this.tip.children.push(child);
    child.parent = this.tip;
    this.tip = child;
    return child;
};

// Parse a list marker and return data on the marker (type,
// start, delimiter, bullet character, padding) or null.
var parseListMarker = function(ln, offset) {
    var rest = ln.slice(offset);
    var match;
    var spaces_after_marker;
    var data = {};
    // A horizontal rule such as "* * *" is not a list item.
    if (rest.match(reHrule)) {
        return null;
    }
    if ((match = rest.match(/^[*+-]( +|$)/))) {
        spaces_after_marker = match[1].length;
        data.type = 'Bullet';
        data.bullet_char = match[0][0];

    } else if ((match = rest.match(/^(\d+)([.)])( +|$)/))) {
        spaces_after_marker = match[3].length;
        data.type = 'Ordered';
        // Explicit radix so a leading zero is never read as octal.
        data.start = parseInt(match[1], 10);
        data.delimiter = match[2];
    } else {
        return null;
    }
    var blank_item = match[0].length === rest.length;
    if (spaces_after_marker >= 5 ||
        spaces_after_marker < 1 ||
        blank_item) {
        // Content starts one column after the marker.
        data.padding = match[0].length - spaces_after_marker + 1;
    } else {
        data.padding = match[0].length;
    }
    return data;
};

// Returns true if the two list items are of the same type,
// with the same delimiter and bullet character. This is used
// in agglomerating list items into lists.
var listsMatch = function(list_data, item_data) {
    return (list_data.type === item_data.type &&
            list_data.delimiter === item_data.delimiter &&
            list_data.bullet_char === item_data.bullet_char);
};

// Analyze a line of text and update the document appropriately.
// We parse markdown text by calling this on each line of input,
// then finalizing the document.
var incorporateLine = function(ln, line_number) {

    var all_matched = true;
    var last_child;
    var first_nonspace;
    var offset = 0;
    var match;
    var matched;
    var data;
    var blank;
    var indent;
    var last_matched_container;
    var i;
    var CODE_INDENT = 4;

    var container = this.doc;
    var oldtip = this.tip;

    // Convert tabs to spaces:
    ln = detabLine(ln);

    // For each containing block, try to parse the associated line start.
    // Bail out on failure: container will point to the last matching block.
    // Set all_matched to false if not all containers match.
    while (container.children.length > 0) {
        last_child = container.children[container.children.length - 1];
        if (!last_child.open) {
            break;
        }
        container = last_child;

        match = matchAt(/[^ ]/, ln, offset);
        if (match === null) {
            first_nonspace = ln.length;
            blank = true;
        } else {
            first_nonspace = match;
            blank = false;
        }
        indent = first_nonspace - offset;

        switch (container.t) {
        case 'BlockQuote':
            matched = indent <= 3 && ln[first_nonspace] === '>';
            if (matched) {
                offset = first_nonspace + 1;
                if (ln[offset] === ' ') {
                    offset++;
                }
            } else {
                all_matched = false;
            }
            break;

        case 'ListItem':
            if (indent >= container.list_data.marker_offset +
                container.list_data.padding) {
                offset += container.list_data.marker_offset +
                    container.list_data.padding;
            } else if (blank) {
                offset = first_nonspace;
            } else {
                all_matched = false;
            }
            break;

        case 'IndentedCode':
            if (indent >= CODE_INDENT) {
                offset += CODE_INDENT;
            } else if (blank) {
                offset = first_nonspace;
            } else {
                all_matched = false;
            }
            break;

        case 'ATXHeader':
        case 'SetextHeader':
        case 'HorizontalRule':
            // a header can never contain > 1 line, so fail to match:
            all_matched = false;
            break;

        case 'FencedCode':
            // skip optional spaces of fence offset
            i = container.fence_offset;
            while (i > 0 && ln[offset] === ' ') {
                offset++;
                i--;
            }
            break;

        case 'HtmlBlock':
            if (blank) {
                all_matched = false;
            }
            break;

        case 'Paragraph':
            if (blank) {
                container.last_line_blank = true;
                all_matched = false;
            }
            break;

        default:
        }

        if (!all_matched) {
            container = container.parent; // back up to last matching block
            break;
        }
    }

    last_matched_container = container;

    // This function is used to finalize and close any unmatched
    // blocks. We aren't ready to do this now, because we might
    // have a lazy paragraph continuation, in which case we don't
    // want to close unmatched blocks. So we store this closure for
    // use later, when we have more information.
    // The flag lives outside the closure so that it survives between
    // calls; declared inside, it would be reset on every call.
    var already_done = false;
    var closeUnmatchedBlocks = function(mythis) {
        // finalize any blocks not matched
        while (!already_done && oldtip != last_matched_container) {
            mythis.finalize(oldtip, line_number);
            oldtip = oldtip.parent;
        }
        already_done = true;
    };

    // Check to see if we've hit 2nd blank line; if so break out of list:
    if (blank && container.last_line_blank) {
        this.breakOutOfLists(container, line_number);
    }

    // Unless last matched container is a code block, try new container starts,
    // adding children to the last matched container:
    while (container.t != 'FencedCode' &&
           container.t != 'IndentedCode' &&
           container.t != 'HtmlBlock' &&
           // this is a little performance optimization:
           matchAt(/^[ #`~*+_=<>0-9-]/,ln,offset) !== null) {

        match = matchAt(/[^ ]/, ln, offset);
        if (match === null) {
            first_nonspace = ln.length;
            blank = true;
        } else {
            first_nonspace = match;
            blank = false;
        }
        indent = first_nonspace - offset;

        if (indent >= CODE_INDENT) {
            // indented code
            if (this.tip.t != 'Paragraph' && !blank) {
                offset += CODE_INDENT;
                closeUnmatchedBlocks(this);
                container = this.addChild('IndentedCode', line_number, offset);
            } else { // indent > 4 in a lazy paragraph continuation
                break;
            }

        } else if (ln[first_nonspace] === '>') {
            // blockquote
            offset = first_nonspace + 1;
            // optional following space
            if (ln[offset] === ' ') {
                offset++;
            }
            closeUnmatchedBlocks(this);
            container = this.addChild('BlockQuote', line_number, offset);

        } else if ((match = ln.slice(first_nonspace).match(/^#{1,6}(?: +|$)/))) {
            // ATX header
            offset = first_nonspace + match[0].length;
            closeUnmatchedBlocks(this);
            container = this.addChild('ATXHeader', line_number, first_nonspace);
            container.level = match[0].trim().length; // number of #s
            // remove trailing ###s:
            container.strings =
                [ln.slice(offset).replace(/(?:(\\#) *#+| *#+) *$/,'$1')];
            break;

        } else if ((match = ln.slice(first_nonspace).match(/^`{3,}(?!.*`)|^~{3,}(?!.*~)/))) {
            // fenced code block
            var fence_length = match[0].length;
            closeUnmatchedBlocks(this);
            container = this.addChild('FencedCode', line_number, first_nonspace);
            container.fence_length = fence_length;
            container.fence_char = match[0][0];
            container.fence_offset = first_nonspace - offset;
            offset = first_nonspace + fence_length;
            break;

        } else if (matchAt(reHtmlBlockOpen, ln, first_nonspace) !== null) {
            // html block
            closeUnmatchedBlocks(this);
            container = this.addChild('HtmlBlock', line_number, first_nonspace);
            // note, we don't adjust offset because the tag is part of the text
            break;

        } else if (container.t == 'Paragraph' &&
                   container.strings.length === 1 &&
                   ((match = ln.slice(first_nonspace).match(/^(?:=+|-+) *$/)))) {
            // setext header line
            closeUnmatchedBlocks(this);
            container.t = 'SetextHeader'; // convert Paragraph to SetextHeader
            container.level = match[0][0] === '=' ? 1 : 2;
            offset = ln.length;

        } else if (matchAt(reHrule, ln, first_nonspace) !== null) {
            // hrule
            closeUnmatchedBlocks(this);
            container = this.addChild('HorizontalRule', line_number, first_nonspace);
            offset = ln.length - 1;
            break;

        } else if ((data = parseListMarker(ln, first_nonspace))) {
            // list item
            closeUnmatchedBlocks(this);
            data.marker_offset = indent;
            offset = first_nonspace + data.padding;

            // add the list if needed
            if (container.t !== 'List' ||
                !(listsMatch(container.list_data, data))) {
                container = this.addChild('List', line_number, first_nonspace);
                container.list_data = data;
            }

            // add the list item
            container = this.addChild('ListItem', line_number, first_nonspace);
            container.list_data = data;

        } else {
            break;

        }

        if (acceptsLines(container.t)) {
            // if it's a line container, it can't contain other containers
            break;
        }
    }

    // What remains at the offset is a text line. Add the text to the
    // appropriate container.

    match = matchAt(/[^ ]/, ln, offset);
    if (match === null) {
        first_nonspace = ln.length;
        blank = true;
    } else {
        first_nonspace = match;
        blank = false;
    }
    indent = first_nonspace - offset;

    // First check for a lazy paragraph continuation:
    if (this.tip !== last_matched_container &&
        !blank &&
        this.tip.t == 'Paragraph' &&
        this.tip.strings.length > 0) {
        // lazy paragraph continuation

        // NOTE(review): this sets a property on the parser object, not on
        // a block; kept as-is because the intent is not visible from here.
        this.last_line_blank = false;
        this.addLine(ln, offset);

    } else { // not a lazy continuation

        // finalize any blocks not matched
        closeUnmatchedBlocks(this);

        // Block quote lines are never blank as they start with >
        // and we don't count blanks in fenced code for purposes of tight/loose
        // lists or breaking out of lists. We also don't set last_line_blank
        // on an empty list item.
        container.last_line_blank = blank &&
            !(container.t == 'BlockQuote' ||
              container.t == 'FencedCode' ||
              (container.t == 'ListItem' &&
               container.children.length === 0 &&
               container.start_line == line_number));

        var cont = container;
        while (cont.parent) {
            cont.parent.last_line_blank = false;
            cont = cont.parent;
        }

        switch (container.t) {
        case 'IndentedCode':
        case 'HtmlBlock':
            this.addLine(ln, offset);
            break;

        case 'FencedCode':
            // check for closing code fence:
            match = (indent <= 3 &&
                     ln[first_nonspace] == container.fence_char &&
                     ln.slice(first_nonspace).match(/^(?:`{3,}|~{3,})(?= *$)/));
            if (match && match[0].length >= container.fence_length) {
                // don't add closing fence to container; instead, close it:
                this.finalize(container, line_number);
            } else {
                this.addLine(ln, offset);
            }
            break;

        case 'ATXHeader':
        case 'SetextHeader':
        case 'HorizontalRule':
            // nothing to do; we already added the contents.
            break;

        default:
            if (acceptsLines(container.t)) {
                this.addLine(ln, first_nonspace);
            } else if (blank) {
                // do nothing
            } else if (container.t != 'HorizontalRule' &&
                       container.t != 'SetextHeader') {
                // create paragraph container for line
                container = this.addChild('Paragraph', line_number, first_nonspace);
                this.addLine(ln, first_nonspace);
            } else {
                console.log("Line " + line_number.toString() +
                            " with container type " + container.t +
                            " did not match any condition.");

            }
        }
    }
};

// Finalize a block. Close it and do any necessary postprocessing,
// e.g. creating string_content from strings, setting the 'tight'
// or 'loose' status of a list, and parsing the beginnings
// of paragraphs for reference definitions. Reset the tip to the
// parent of the closed block.
// Returns 0 if the block was already closed, otherwise undefined.
var finalize = function(block, line_number) {
    var pos;
    // don't do anything if the block is already closed
    if (!block.open) {
        return 0;
    }
    block.open = false;
    // line_number is the line that caused the block to close, so the block
    // ended on the previous line -- but never before the line it started on.
    block.end_line = Math.max(block.start_line, line_number - 1);

    switch (block.t) {
    case 'Paragraph':
        block.string_content = block.strings.join('\n').replace(/^ */m,'');

        // try parsing the beginning as link reference definitions:
        while (block.string_content[0] === '[' &&
               (pos = this.inlineParser.parseReference(block.string_content,
                                                       this.refmap))) {
            block.string_content = block.string_content.slice(pos);
            if (isBlank(block.string_content)) {
                // Nothing but reference definitions in this paragraph.
                block.t = 'ReferenceDef';
                break;
            }
        }
        break;

    case 'ATXHeader':
    case 'SetextHeader':
    case 'HtmlBlock':
        block.string_content = block.strings.join('\n');
        break;

    case 'IndentedCode':
        // Trailing blank lines collapse to a single final newline.
        block.string_content = block.strings.join('\n').replace(/(\n *)*$/,'\n');
        break;

    case 'FencedCode':
        // first line becomes info string
        block.info = unescape(block.strings[0].trim());
        if (block.strings.length == 1) {
            block.string_content = '';
        } else {
            block.string_content = block.strings.slice(1).join('\n') + '\n';
        }
        break;

    case 'List':
        block.tight = true; // tight by default

        var numitems = block.children.length;
        var i = 0;
        while (i < numitems) {
            var item = block.children[i];
            // check for non-final list item ending with blank line:
            var last_item = i == numitems - 1;
            if (endsWithBlankLine(item) && !last_item) {
                block.tight = false;
                break;
            }
            // recurse into children of list item, to see if there are
            // spaces between any of them:
            var numsubitems = item.children.length;
            var j = 0;
            while (j < numsubitems) {
                var subitem = item.children[j];
                var last_subitem = j == numsubitems - 1;
                if (endsWithBlankLine(subitem) && !(last_item && last_subitem)) {
                    block.tight = false;
                    break;
                }
                j++;
            }
            i++;
        }
        break;

    default:
        break;
    }

    // NOTE(review): nothing visible here defines this.top, so for the root
    // block the tip presumably becomes undefined, which is what ends the
    // finalizing loop in parse().
    this.tip = block.parent || this.top;
};

// Walk through a block & children recursively, parsing string content
// into inline content where appropriate.
var processInlines = function(block) {
    switch(block.t) {
    case 'Paragraph':
    case 'SetextHeader':
    case 'ATXHeader':
        block.inline_content =
            this.inlineParser.parse(block.string_content.trim(), this.refmap);
        block.string_content = "";
        break;
    default:
        break;
    }

    if (block.children) {
        for (var i = 0; i < block.children.length; i++) {
            this.processInlines(block.children[i]);
        }
    }

};

// The main parsing function. Returns a parsed document AST.
var parse = function(input) {
    this.doc = makeBlock('Document', 1, 1);
    this.tip = this.doc;
    this.refmap = {};
    // A single trailing newline does not produce an extra empty line.
    var lines = input.replace(/\n$/,'').split(/\r\n|\n|\r/);
    var len = lines.length;
    for (var i = 0; i < len; i++) {
        this.incorporateLine(lines[i], i+1);
    }
    // Close whatever is still open, innermost block first.
    while (this.tip) {
        this.finalize(this.tip, len - 1);
    }
    this.processInlines(this.doc);
    return this.doc;
};


// The DocParser object.
function DocParser(){
    // Create the root block first: inside the object literal below,
    // `this` is not the object being built, so `tip: this.doc` would
    // always evaluate to undefined.
    var doc = makeBlock('Document', 1, 1);
    return {
        doc: doc,
        tip: doc,
        refmap: {},
        inlineParser: new InlineParser(),
        breakOutOfLists: breakOutOfLists,
        addLine: addLine,
        addChild: addChild,
        incorporateLine: incorporateLine,
        finalize: finalize,
        processInlines: processInlines,
        parse: parse
    };
}

// HTML RENDERER

// Helper function to produce content in a pair of HTML tags.
//
// tag:         element name, e.g. 'p'.
// attribs:     array of [name, value] pairs; values are emitted as given,
//              so callers must escape them beforehand.
// contents:    inner HTML; when empty/falsy the element is emitted either
//              self-closed or as an empty open/close pair.
// selfclosing: when true and there are no contents, emit '<tag />'.
var inTags = function(tag, attribs, contents, selfclosing) {
    var result = '<' + tag;
    if (attribs) {
        var i = 0;
        var attrib;
        while ((attrib = attribs[i]) !== undefined) {
            result = result.concat(' ', attrib[0], '="', attrib[1], '"');
            i++;
        }
    }
    if (contents) {
        result = result.concat('>', contents, '</', tag, '>');
    } else if (selfclosing) {
        result = result + ' />';
    } else {
        result = result.concat('></', tag, '>');
    }
    return result;
};

// Render an inline element as HTML.
// Must be called with `this` bound to a renderer providing escape,
// softbreak and renderInlines.  Unknown inline types are logged and
// rendered as the empty string.
var renderInline = function(inline) {
    var attrs;
    switch (inline.t) {
    case 'Str':
        return this.escape(inline.c);
    case 'Softbreak':
        return this.softbreak;
    case 'Hardbreak':
        return inTags('br',[],"",true) + '\n';
    case 'Emph':
        return inTags('em', [], this.renderInlines(inline.c));
    case 'Strong':
        return inTags('strong', [], this.renderInlines(inline.c));
    case 'Html':
        // raw HTML is passed through untouched
        return inline.c;
    case 'Entity':
        return inline.c;
    case 'Link':
        // second argument to escape: keep existing entities intact
        attrs = [['href', this.escape(inline.destination, true)]];
        if (inline.title) {
            attrs.push(['title', this.escape(inline.title, true)]);
        }
        return inTags('a', attrs, this.renderInlines(inline.label));
    case 'Image':
        attrs = [['src', this.escape(inline.destination, true)],
                 ['alt', this.escape(this.renderInlines(inline.label))]];
        if (inline.title) {
            attrs.push(['title', this.escape(inline.title, true)]);
        }
        return inTags('img', attrs, "", true);
    case 'Code':
        return inTags('code', [], this.escape(inline.c));
    default:
        console.log("Unknown inline type " + inline.t);
        return "";
    }
};
+ +// Render a list of inlines. +var renderInlines = function(inlines) { + var result = ''; + for (var i=0; i < inlines.length; i++) { + result = result + this.renderInline(inlines[i]); + } + return result; +}; + +// Render a single block element. +var renderBlock = function(block, in_tight_list) { + var tag; + var attr; + var info_words; + switch (block.t) { + case 'Document': + var whole_doc = this.renderBlocks(block.children); + return (whole_doc === '' ? '' : whole_doc + '\n'); + case 'Paragraph': + if (in_tight_list) { + return this.renderInlines(block.inline_content); + } else { + return inTags('p', [], this.renderInlines(block.inline_content)); + } + break; + case 'BlockQuote': + var filling = this.renderBlocks(block.children); + return inTags('blockquote', [], filling === '' ? this.innersep : + this.innersep + this.renderBlocks(block.children) + this.innersep); + case 'ListItem': + return inTags('li', [], this.renderBlocks(block.children, in_tight_list).trim()); + case 'List': + tag = block.list_data.type == 'Bullet' ? 'ul' : 'ol'; + attr = (!block.list_data.start || block.list_data.start == 1) ? + [] : [['start', block.list_data.start.toString()]]; + return inTags(tag, attr, this.innersep + + this.renderBlocks(block.children, block.tight) + + this.innersep); + case 'ATXHeader': + case 'SetextHeader': + tag = 'h' + block.level; + return inTags(tag, [], this.renderInlines(block.inline_content)); + case 'IndentedCode': + return inTags('pre', [], + inTags('code', [], this.escape(block.string_content))); + case 'FencedCode': + info_words = block.info.split(/ +/); + attr = info_words.length === 0 || info_words[0].length === 0 ? 
+ [] : [['class',this.escape(info_words[0],true)]]; + return inTags('pre', attr, + inTags('code', [], this.escape(block.string_content))); + case 'HtmlBlock': + return block.string_content; + case 'ReferenceDef': + return ""; + case 'HorizontalRule': + return inTags('hr',[],"",true); + default: + console.log("Uknown block type " + block.t); + return ""; + } +}; + +// Render a list of block elements, separated by this.blocksep. +var renderBlocks = function(blocks, in_tight_list) { + var result = []; + for (var i=0; i < blocks.length; i++) { + if (blocks[i].t !== 'ReferenceDef') { + result.push(this.renderBlock(blocks[i], in_tight_list)); + } + } + return result.join(this.blocksep); +}; + +// The HtmlRenderer object. +function HtmlRenderer(){ + return { + // default options: + blocksep: '\n', // space between blocks + innersep: '\n', // space between block container tag and contents + softbreak: '\n', // by default, soft breaks are rendered as newlines in HTML + // set to "<br />" to make them hard breaks + // set to " " if you want to ignore line wrapping in source + escape: function(s, preserve_entities) { + if (preserve_entities) { + return s.replace(/[&](?![#](x[a-f0-9]{1,8}|[0-9]{1,8});|[a-z][a-z0-9]{1,31};)/gi,'&') + .replace(/[<]/g,'<') + .replace(/[>]/g,'>') + .replace(/["]/g,'"'); + } else { + return s.replace(/[&]/g,'&') + .replace(/[<]/g,'<') + .replace(/[>]/g,'>') + .replace(/["]/g,'"'); + } + }, + renderInline: renderInline, + renderInlines: renderInlines, + renderBlock: renderBlock, + renderBlocks: renderBlocks, + render: renderBlock + }; +} + +exports.DocParser = DocParser; +exports.HtmlRenderer = HtmlRenderer; + +})(typeof exports === 'undefined' ? 
this.stmd = {} : exports); diff --git a/js/test.js b/js/test.js new file mode 100755 index 0000000..c1ea5b6 --- /dev/null +++ b/js/test.js @@ -0,0 +1,79 @@ +#!/usr/bin/env node + +var fs = require('fs'); +var util = require('util'); +var stmd = require('./stmd'); +var ansi = require('ansi') +var cursor = ansi(process.stdout); + +var writer = new stmd.HtmlRenderer(); +var reader = new stmd.DocParser(); + +var passed = 0; +var failed = 0; + +var showSpaces = function(s) { + var t = s; + return t.replace(/\t/g,'→') + .replace(/ /g,'␣'); +} + +fs.readFile('spec.txt', 'utf8', function(err, data) { + if (err) { + return console.log(err); + } + var examples = []; + var current_section = ""; + var example_number = 0; + tests = data.replace(/^<!-- END TESTS -->(.|[\n])*/m,''); + tests.replace(/^\.\n([\s\S]*?)^\.\n([\s\S]*?)^\.$|^#{1,6} *(.*)$/gm, + function(_,x,y,z,w){ + if (z) { + current_section = z; + } else { + example_number++; + examples.push({markdown: x, + html: y, + section: current_section, + number: example_number}); + } + }); + + current_section = ""; + + console.time("Elapsed time"); + + for (i = 0; i < examples.length; i++) { + var example = examples[i]; + if (example.section != current_section) { + if (current_section !== '') { + cursor.write('\n'); + } + current_section = example.section; + cursor.reset().write(current_section).reset().write(' '); + } + var actual = writer.renderBlock(reader.parse(example.markdown.replace(/→/g, '\t'))); + if (actual == example.html) { + passed++; + cursor.green().write('✓').reset(); + } else { + failed++; + cursor.write('\n'); + + cursor.red().write('✘ Example ' + example.number + '\n'); + cursor.cyan(); + cursor.write('=== markdown ===============\n'); + cursor.write(showSpaces(example.markdown)); + cursor.write('=== expected ===============\n'); + cursor.write(showSpaces(example.html)); + cursor.write('=== got ====================\n'); + cursor.write(showSpaces(actual)); + cursor.reset(); + } + } + cursor.write('\n' + 
passed.toString() + ' tests passed, ' + + failed.toString() + ' failed.\n'); + + console.timeEnd("Elapsed time"); +}); + diff --git a/license.bstrlib.txt b/license.bstrlib.txt new file mode 100644 index 0000000..cf78a98 --- /dev/null +++ b/license.bstrlib.txt @@ -0,0 +1,29 @@ +Copyright (c) 2002-2008 Paul Hsieh +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + Neither the name of bstrlib nor the names of its contributors may be used + to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + diff --git a/license.uthash.txt b/license.uthash.txt new file mode 100644 index 0000000..ad8e16a --- /dev/null +++ b/license.uthash.txt @@ -0,0 +1,21 @@ +Copyright (c) 2005-2013, Troy D. 
Hanson http://troydhanson.github.com/uthash/ +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ diff --git a/mkcasefold.pl b/mkcasefold.pl new file mode 100644 index 0000000..4c08634 --- /dev/null +++ b/mkcasefold.pl @@ -0,0 +1,21 @@
# Reads case-folding data on STDIN (lines of the form
# "<code>; <status>; <mapping>; ...") and prints a C switch statement
# that pushes the folded code point(s) for each character.
print(" switch (c) {\n");
my $lastchar = "";
while (<STDIN>) {
  # only data lines with status C (common) or F (full) are used
  if (/^[A-F0-9]/ and / [CF]; /) {
    my ($char, $type, $subst) = m/([A-F0-9]+); ([CF]); ([^;]+)/;
    if ($char eq $lastchar) {
      # already emitted a case for this code point; a second
      # "case" label would not compile, so skip the line.
      next;
    }
    my @subst = $subst =~ m/(\w+)/g;
    printf(" case 0x%s:\n", $char);
    foreach (@subst) {
      printf(" bufpush(0x%s);\n", $_);
    }
    printf(" break;\n");
    $lastchar = $char;
  }
}
# characters without a folding are pushed unchanged
printf(" default:\n");
printf(" bufpush(c);\n");
print(" }\n");

diff --git a/oldtests/Blockquotes/Indents.html b/oldtests/Blockquotes/Indents.html new file mode 100644 index 0000000..fd98ee8 --- /dev/null +++ b/oldtests/Blockquotes/Indents.html @@ -0,0 +1,12 @@ +<blockquote> +<p>one +blockquote</p> +</blockquote> +<blockquote> +<blockquote> +<blockquote> +<p>triply nested +triply nested</p> +</blockquote> +</blockquote> +</blockquote> diff --git a/oldtests/Blockquotes/Indents.markdown b/oldtests/Blockquotes/Indents.markdown new file mode 100644 index 0000000..f9342ff --- /dev/null +++ b/oldtests/Blockquotes/Indents.markdown @@ -0,0 +1,5 @@ +> one + > blockquote + +>>> triply nested + > > > triply nested diff --git a/oldtests/Blockquotes/Nesting.html b/oldtests/Blockquotes/Nesting.html new file mode 100644 index 0000000..f40e999 --- /dev/null +++ b/oldtests/Blockquotes/Nesting.html @@ -0,0 +1,32 @@ +<p>These are all equivalent:</p> +<blockquote> +<blockquote> +<p>nested +blockquote</p> +</blockquote> +</blockquote> +<blockquote> +<blockquote> +<p>nested +blockquote</p> +</blockquote> +</blockquote> +<blockquote> +<blockquote> +<p>nested +blockquote</p> +</blockquote> +</blockquote> +<blockquote> +<blockquote> +<p>nested +blockquote</p> +</blockquote> +</blockquote> +<p>This is not:</p> +<blockquote> +<p>nested</p> +<blockquote> +<p>blockquote</p> +</blockquote> +</blockquote> diff --git a/oldtests/Blockquotes/Nesting.markdown
b/oldtests/Blockquotes/Nesting.markdown new file mode 100644 index 0000000..3d67843 --- /dev/null +++ b/oldtests/Blockquotes/Nesting.markdown @@ -0,0 +1,22 @@ +These are all equivalent: + +> > nested +> > blockquote + + +>> nested +>> blockquote + + +> > nested +blockquote + + +> > nested +> blockquote + + +This is not: + +> nested +> > blockquote diff --git a/oldtests/Blockquotes/Separation.html b/oldtests/Blockquotes/Separation.html new file mode 100644 index 0000000..910d545 --- /dev/null +++ b/oldtests/Blockquotes/Separation.html @@ -0,0 +1,39 @@ +<p>One blockquote, two paragraphs:</p> +<blockquote> +<p>one</p> +<p>two</p> +</blockquote> +<p>Two blockquotes:</p> +<blockquote> +<p>one</p> +</blockquote> +<blockquote> +<p>two</p> +</blockquote> +<p>Nested blockquote, two paragraphs:</p> +<blockquote> +<blockquote> +<p>one</p> +<p>two</p> +</blockquote> +</blockquote> +<p>Nested blockquote, two blockquotes:</p> +<blockquote> +<blockquote> +<p>one</p> +</blockquote> +<blockquote> +<p>two</p> +</blockquote> +</blockquote> +<p>Two nested blockquotes:</p> +<blockquote> +<blockquote> +<p>one</p> +</blockquote> +</blockquote> +<blockquote> +<blockquote> +<p>two</p> +</blockquote> +</blockquote> diff --git a/oldtests/Blockquotes/Separation.markdown b/oldtests/Blockquotes/Separation.markdown new file mode 100644 index 0000000..823d865 --- /dev/null +++ b/oldtests/Blockquotes/Separation.markdown @@ -0,0 +1,29 @@ +One blockquote, two paragraphs: + +> one +> +> two + +Two blockquotes: + +> one + +> two + +Nested blockquote, two paragraphs: + +> > one +> > +> > two + +Nested blockquote, two blockquotes: + +> > one +> +> > two + +Two nested blockquotes: + +> > one + +> > two diff --git a/oldtests/Code/BlankLines.html b/oldtests/Code/BlankLines.html new file mode 100644 index 0000000..ae0abf7 --- /dev/null +++ b/oldtests/Code/BlankLines.html @@ -0,0 +1,33 @@ +<pre><code>foo + + + +bar +</code></pre> +<blockquote> +<pre><code>foo + + + +bar +</code></pre> +</blockquote> 
+<pre><code>foo + + + +bar +</code></pre> +<ol> +<li><p>One</p> +<pre><code>CodeA + +CodeB +</code></pre></li> +<li><p>Two</p> +<pre><code>CodeA +</code></pre></li> +</ol> +<ol> +<li>One</li> +</ol> diff --git a/oldtests/Code/BlankLines.markdown b/oldtests/Code/BlankLines.markdown new file mode 100644 index 0000000..b0d5a0c --- /dev/null +++ b/oldtests/Code/BlankLines.markdown @@ -0,0 +1,28 @@ + foo + + + + bar +> foo +> +> +> +> bar + foo + + + + bar + +1. One + + CodeA + + CodeB + +2. Two + + CodeA + + +1. One diff --git a/oldtests/Code/BlankLinesAtEnd.html b/oldtests/Code/BlankLinesAtEnd.html new file mode 100644 index 0000000..ac803d9 --- /dev/null +++ b/oldtests/Code/BlankLinesAtEnd.html @@ -0,0 +1,14 @@ +<ul> +<li><p>List</p> +<pre><code>code +</code></pre></li> +</ul> +<ul> +<li>one</li> +<li>two</li> +</ul> +<ul> +<li><p>one +not code</p></li> +<li><p>two</p></li> +</ul> diff --git a/oldtests/Code/BlankLinesAtEnd.markdown b/oldtests/Code/BlankLinesAtEnd.markdown new file mode 100644 index 0000000..55879ae --- /dev/null +++ b/oldtests/Code/BlankLinesAtEnd.markdown @@ -0,0 +1,14 @@ +* List + + code + + + * one + * two + + + +* one + not code + +* two diff --git a/oldtests/Code/FenceMatching.html b/oldtests/Code/FenceMatching.html new file mode 100644 index 0000000..4c7468e --- /dev/null +++ b/oldtests/Code/FenceMatching.html @@ -0,0 +1,8 @@ +<pre class="abc"><code>``` +</code></pre> +<pre class="blah"><code> +````` + +```` + +</code></pre> diff --git a/oldtests/Code/FenceMatching.markdown b/oldtests/Code/FenceMatching.markdown new file mode 100644 index 0000000..d86169a --- /dev/null +++ b/oldtests/Code/FenceMatching.markdown @@ -0,0 +1,10 @@ +````abc +``` +```` +``````blah + +````` + +```` + +``````````` diff --git a/oldtests/Code/FencedCodeBlocks.html b/oldtests/Code/FencedCodeBlocks.html new file mode 100644 index 0000000..4813d72 --- /dev/null +++ b/oldtests/Code/FencedCodeBlocks.html @@ -0,0 +1,24 @@ +<p>This is a fenced code block:</p> +<pre 
class="haskell"><code>pairs :: [(Int,Char)] +pairs = [(x,y) | x <- [0..10], y <- ['a'..'z']] +</code></pre> +<p>Here is one with tildes:</p> +<pre class="haskell"><code>pairs :: [(Int,Char)] +pairs = [(x,y) | x <- [0..10], y <- ['a'..'z']] +</code></pre> +<p>More metadata:</p> +<pre class="haskell"><code>pairs :: [(Int,Char)] +pairs = [(x,y) | x <- [0..10], y <- ['a'..'z']] +</code></pre> +<p>More backticks:</p> +<pre class="haskell"><code>pairs :: [(Int,Char)] +pairs = [(x,y) | x <- [0..10], y <- ['a'..'z']] + +backticks :: String +backticks = "`````" +</code></pre> +<p>Without an end:</p> +<pre><code>code with +no end + +</code></pre> diff --git a/oldtests/Code/FencedCodeBlocks.markdown b/oldtests/Code/FencedCodeBlocks.markdown new file mode 100644 index 0000000..6ccc6be --- /dev/null +++ b/oldtests/Code/FencedCodeBlocks.markdown @@ -0,0 +1,35 @@ +This is a fenced code block: +```haskell +pairs :: [(Int,Char)] +pairs = [(x,y) | x <- [0..10], y <- ['a'..'z']] +``` +Here is one with tildes: + +~~~ haskell +pairs :: [(Int,Char)] +pairs = [(x,y) | x <- [0..10], y <- ['a'..'z']] +~~~ + +More metadata: + +```haskell numberLines start=50 +pairs :: [(Int,Char)] +pairs = [(x,y) | x <- [0..10], y <- ['a'..'z']] +``` + +More backticks: + +```````` haskell +pairs :: [(Int,Char)] +pairs = [(x,y) | x <- [0..10], y <- ['a'..'z']] + +backticks :: String +backticks = "`````" +````````````` + +Without an end: + +``` +code with +no end + diff --git a/oldtests/Code/IndentedCodeBlocks.html b/oldtests/Code/IndentedCodeBlocks.html new file mode 100644 index 0000000..0b9b7e7 --- /dev/null +++ b/oldtests/Code/IndentedCodeBlocks.html @@ -0,0 +1,22 @@ +<p>Indented code with two space indent in first and last line:</p> +<pre><code> two spaces *hello* +{ more } + + and +</code></pre> +<p>Indented code requires a leading/trailing blank line: +quick-command --option "$*"</p> +<p>Indented code does not require a trailing blank line:</p> +<pre><code>code +</code></pre> +<p>and not code.</p> 
+<p>Code in blockquote:</p> +<blockquote> +<pre><code>code +</code></pre> +</blockquote> +<p>Code in list:</p> +<ol> +<li><pre><code>code +</code></pre></li> +</ol> diff --git a/oldtests/Code/IndentedCodeBlocks.markdown b/oldtests/Code/IndentedCodeBlocks.markdown new file mode 100644 index 0000000..2a99db0 --- /dev/null +++ b/oldtests/Code/IndentedCodeBlocks.markdown @@ -0,0 +1,22 @@ +Indented code with two space indent in first and last line: + + two spaces *hello* + { more } + + and + +Indented code requires a leading/trailing blank line: + quick-command --option "$*" + +Indented code does not require a trailing blank line: + + code +and not code. + +Code in blockquote: + +> code + +Code in list: + +1. code diff --git a/oldtests/Code/IndentedFences.html b/oldtests/Code/IndentedFences.html new file mode 100644 index 0000000..66e76da --- /dev/null +++ b/oldtests/Code/IndentedFences.html @@ -0,0 +1,20 @@ +<pre><code>a +</code></pre> +<pre><code>z +</code></pre> +<pre><code>a +a +a + a +</code></pre> +<ul> +<li><p>foo</p> +<pre><code> Hello + +World +</code></pre></li> +</ul> +<blockquote> +<pre><code>a +</code></pre> +</blockquote> diff --git a/oldtests/Code/IndentedFences.markdown b/oldtests/Code/IndentedFences.markdown new file mode 100644 index 0000000..098545f --- /dev/null +++ b/oldtests/Code/IndentedFences.markdown @@ -0,0 +1,26 @@ + ``` + a + ``` + + ``` +z +``` + + ``` +a + a + a + a + ``` + +* foo + + ``` + Hello + + World + ``` + +> ``` +>a +>``` diff --git a/oldtests/Code/IndentedInLists.html b/oldtests/Code/IndentedInLists.html new file mode 100644 index 0000000..76ed424 --- /dev/null +++ b/oldtests/Code/IndentedInLists.html @@ -0,0 +1,22 @@ +<ul> +<li><pre><code>code starts here +</code></pre></li> +</ul> +<ol> +<li><p>foo</p> +<pre><code>code starts here +</code></pre></li> +<li><p>foo</p> +<pre><code>code starts here +</code></pre></li> +</ol> +<ul> +<li><p>foo</p> +<pre><code>code starts here +</code></pre> +<ul> +<li><p>foo</p> +<pre><code>code 
starts here +</code></pre></li> +</ul></li> +</ul> diff --git a/oldtests/Code/IndentedInLists.markdown b/oldtests/Code/IndentedInLists.markdown new file mode 100644 index 0000000..54e1af1 --- /dev/null +++ b/oldtests/Code/IndentedInLists.markdown @@ -0,0 +1,17 @@ +- code starts here + +1. foo + + code starts here + +2. foo + + code starts here + +- foo + + code starts here + + - foo + + code starts here diff --git a/oldtests/Code/Inline.html b/oldtests/Code/Inline.html new file mode 100644 index 0000000..9c52790 --- /dev/null +++ b/oldtests/Code/Inline.html @@ -0,0 +1,13 @@ +<p>All of these are equivalent:</p> +<ul> +<li><code>*hi*</code></li> +<li><code>*hi*</code></li> +<li><code>*hi*</code></li> +<li><code>*hi*</code></li> +<li><code>*hi*</code></li> +</ul> +<p>Backticks in code spans:</p> +<ul> +<li><code>``code``</code></li> +<li><code>``code``</code></li> +</ul> diff --git a/oldtests/Code/Inline.markdown b/oldtests/Code/Inline.markdown new file mode 100644 index 0000000..38e5b0c --- /dev/null +++ b/oldtests/Code/Inline.markdown @@ -0,0 +1,13 @@ +All of these are equivalent: + +- `*hi*` +- ` *hi* ` +- ``*hi* `` +- ````*hi*```` +- `*hi* + ` + +Backticks in code spans: + +- ``` ``code`` ``` +- ` ``code`` ` diff --git a/oldtests/Code/ListBreakAfter.html b/oldtests/Code/ListBreakAfter.html new file mode 100644 index 0000000..29d6d5e --- /dev/null +++ b/oldtests/Code/ListBreakAfter.html @@ -0,0 +1,30 @@ +<ul> +<li><p>foo</p> +<ul> +<li><p>bar</p> +<pre><code>code1 +code2 +</code></pre> +<p>code?</p></li> +</ul></li> +<li><p>foo</p> +<ul> +<li><p>bar</p> +<pre><code>code1 +code2 +</code></pre></li> +</ul></li> +</ul> +<pre><code>code? +</code></pre> +<ul> +<li>foo +<ul> +<li><p>bar</p> +<pre><code>code1 +code2 +</code></pre></li> +</ul></li> +</ul> +<pre><code>code? 
+</code></pre> diff --git a/oldtests/Code/ListBreakAfter.markdown b/oldtests/Code/ListBreakAfter.markdown new file mode 100644 index 0000000..4fa79f1 --- /dev/null +++ b/oldtests/Code/ListBreakAfter.markdown @@ -0,0 +1,26 @@ +* foo + * bar + + code1 + code2 + + code? + +* foo + * bar + + code1 + code2 + + + code? + +* foo + * bar + + code1 + code2 + + + + code? diff --git a/oldtests/Code/WhiteLines.html b/oldtests/Code/WhiteLines.html new file mode 100644 index 0000000..7fa137f --- /dev/null +++ b/oldtests/Code/WhiteLines.html @@ -0,0 +1,7 @@ +<pre><code>ABC + + + +DEF +</code></pre> +<p>GHI</p> diff --git a/oldtests/Code/WhiteLines.markdown b/oldtests/Code/WhiteLines.markdown new file mode 100644 index 0000000..ea17af7 --- /dev/null +++ b/oldtests/Code/WhiteLines.markdown @@ -0,0 +1,9 @@ + ABC + + + + DEF + + + +GHI diff --git a/oldtests/Emphasis/Escapes.html b/oldtests/Emphasis/Escapes.html new file mode 100644 index 0000000..17c9e2d --- /dev/null +++ b/oldtests/Emphasis/Escapes.html @@ -0,0 +1 @@ +<p><em>hi* there</em></p> diff --git a/oldtests/Emphasis/Escapes.markdown b/oldtests/Emphasis/Escapes.markdown new file mode 100644 index 0000000..4f14698 --- /dev/null +++ b/oldtests/Emphasis/Escapes.markdown @@ -0,0 +1 @@ +*hi\* there*
\ No newline at end of file diff --git a/oldtests/Emphasis/NestedEmphAndStrong.html b/oldtests/Emphasis/NestedEmphAndStrong.html new file mode 100644 index 0000000..b41b527 --- /dev/null +++ b/oldtests/Emphasis/NestedEmphAndStrong.html @@ -0,0 +1,66 @@ +<ol> +<li><strong><em>test test</em></strong></li> +<li><strong><em>test test</em></strong></li> +<li><em>test <strong>test</strong></em></li> +<li><strong>test <em>test</em></strong></li> +<li><strong><em>test</em> test</strong></li> +<li><em><strong>test</strong> test</em></li> +<li><strong><em>test</em> test</strong></li> +<li><strong>test <em>test</em></strong></li> +<li><em>test <strong>test</strong></em></li> +<li><em>test <strong>test</strong></em></li> +<li><strong>test <em>test</em></strong></li> +<li><strong><em>test</em> test</strong></li> +<li><em><strong>test</strong> test</em></li> +<li><strong><em>test</em> test</strong></li> +<li><strong>test <em>test</em></strong></li> +<li><em>test <strong>test</strong></em></li> +</ol> +<p>Incorrect nesting:</p> +<ol> +<li>*test <strong>test* test</strong></li> +<li>_test <strong>test_ test</strong></li> +<li>**test <em>test</em>* test*</li> +<li>__test␣<em>test</em>_␣test_</li> +<li><em>test <em>test</em> test</em></li> +<li><em>test <em>test</em> test</em></li> +<li><strong>test <strong>test</strong> test</strong></li> +<li><strong>test <strong>test</strong> test</strong></li> +</ol> +<p>No emphasis:</p> +<ol> +<li>test* test *test</li> +<li>test** test **test</li> +<li>test_ test _test</li> +<li>test__ test __test</li> +</ol> +<p>Middle-word emphasis (asterisks):</p> +<ol> +<li><em>a</em>b</li> +<li>a<em>b</em></li> +<li>a<em>b</em>c</li> +<li><strong>a</strong>b</li> +<li>a<strong>b</strong></li> +<li>a<strong>b</strong>c</li> +</ol> +<p>Middle-word emphasis (underscore):</p> +<ol> +<li>_a_b</li> +<li>a_b_</li> +<li>a_b_c</li> +<li>__a__b</li> +<li>a__b__</li> +<li>a__b__c</li> +<li>my_precious_file.txt</li> +</ol> +<p>Tricky Cases:</p> +<ol> +<li>E**. 
<strong>Test</strong> TestTestTest</li> +<li>E**. <strong>Test</strong> Test Test Test</li> +</ol> +<p>Overlong emphasis:</p> +<p>Name: ____________<br /> +Organization: ____<br /> +Region/Country: __</p> +<p>_____Cut here_____</p> +<p>____Cut here____</p> diff --git a/oldtests/Emphasis/NestedEmphAndStrong.markdown b/oldtests/Emphasis/NestedEmphAndStrong.markdown new file mode 100644 index 0000000..ec7da25 --- /dev/null +++ b/oldtests/Emphasis/NestedEmphAndStrong.markdown @@ -0,0 +1,69 @@ +1. ***test test*** +2. ___test test___ +3. *test **test*** +4. **test *test*** +5. ***test* test** +6. ***test** test* +7. ***test* test** +8. **test *test*** +9. *test **test*** +10. _test __test___ +11. __test _test___ +12. ___test_ test__ +13. ___test__ test_ +14. ___test_ test__ +15. __test _test___ +16. _test __test___ + +Incorrect nesting: + +1. *test **test* test** +2. _test __test_ test__ +3. **test *test** test* +4. __test _test__ test_ +5. *test *test* test* +6. _test _test_ test_ +7. **test **test** test** +8. __test __test__ test__ + +No emphasis: + +1. test* test *test +2. test** test **test +3. test_ test _test +4. test__ test __test + +Middle-word emphasis (asterisks): + +1. *a*b +2. a*b* +3. a*b*c +4. **a**b +5. a**b** +6. a**b**c + +Middle-word emphasis (underscore): + +1. _a_b +2. a_b_ +3. a_b_c +4. __a__b +5. a__b__ +6. a__b__c +7. my_precious_file.txt + +Tricky Cases: + +1. E**. **Test** TestTestTest +2. E**. 
**Test** Test Test Test + +Overlong emphasis: + +Name: ____________ +Organization: ____ +Region/Country: __ + +_____Cut here_____ + +____Cut here____ + diff --git a/oldtests/Emphasis/Pathological.html b/oldtests/Emphasis/Pathological.html new file mode 100644 index 0000000..37eb9fa --- /dev/null +++ b/oldtests/Emphasis/Pathological.html @@ -0,0 +1,24 @@ +<p>This input can take a long time to parse in some implementations.</p> +<p>*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +aaaaa</p> +<p><em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>
a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a<strong>a<em>a</em><em>a</em>a</strong>a<em>a</em><em>a</em>a**</p> diff --git a/oldtests/Emphasis/Pathological.markdown b/oldtests/Emphasis/Pathological.markdown new file mode 100644 index 0000000..5deb95e --- /dev/null +++ b/oldtests/Emphasis/Pathological.markdown @@ -0,0 +1,26 @@ +This input can take a long time to parse in some implementations. + +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +*a +aaaaa + +*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a**a*a** diff --git a/oldtests/Emphasis/Punctuation.html b/oldtests/Emphasis/Punctuation.html new file mode 100644 index 0000000..6061b81 --- /dev/null +++ b/oldtests/Emphasis/Punctuation.html @@ -0,0 +1,10 @@ +<p>Here is a _ that is <em>cool</em>.</p> +<p><em>Foo.</em></p> +<p><strong>Foo.</strong></p> +<p><strong><em>Foo.</em></strong></p> +<p><em>Foo</em>.</p> +<p><strong>Foo</strong>.</p> +<p><strong><em>Foo</em></strong>.</p> +<p><em>Foo</em>. <em>Foo</em>? <em>Foo</em>! <em>Foo</em>: <em>Foo</em>; (<em>Foo</em>)</p> +<p><strong>Foo</strong>. <strong>Foo</strong>? <strong>Foo</strong>! <strong>Foo</strong>: <strong>Foo</strong>; (<strong>Foo</strong>)</p> +<p><strong><em>Foo</em></strong>. <strong><em>Foo</em></strong>? <strong><em>Foo</em></strong>! 
<strong><em>Foo</em></strong>: <strong><em>Foo</em></strong>; (<strong><em>Foo</em></strong>)</p> diff --git a/oldtests/Emphasis/Punctuation.markdown b/oldtests/Emphasis/Punctuation.markdown new file mode 100644 index 0000000..e3f23b8 --- /dev/null +++ b/oldtests/Emphasis/Punctuation.markdown @@ -0,0 +1,19 @@ +Here is a _ that is _cool_. + +_Foo._ + +__Foo.__ + +___Foo.___ + +_Foo_. + +__Foo__. + +___Foo___. + +_Foo_. _Foo_? _Foo_! _Foo_: _Foo_; (_Foo_) + +__Foo__. __Foo__? __Foo__! __Foo__: __Foo__; (__Foo__) + +___Foo___. ___Foo___? ___Foo___! ___Foo___: ___Foo___; (___Foo___) diff --git a/oldtests/HTML/Blocks.html b/oldtests/HTML/Blocks.html new file mode 100644 index 0000000..dc80335 --- /dev/null +++ b/oldtests/HTML/Blocks.html @@ -0,0 +1,18 @@ +<div> + <div> + *raw html* + </div> +</div> +<div> +<div> +<p><em>this is markdown</em></p> +</div> +</div> +<!-- comment +here --> +<!-- +<p><em>commented out markdown</em></p> +--> +<div> +* raw html with trailing space +</div> diff --git a/oldtests/HTML/Blocks.markdown b/oldtests/HTML/Blocks.markdown new file mode 100644 index 0000000..a83fa66 --- /dev/null +++ b/oldtests/HTML/Blocks.markdown @@ -0,0 +1,26 @@ +<div> + <div> + *raw html* + </div> +</div> + +<div> +<div> + +*this is markdown* + +</div> +</div> + +<!-- comment +here --> + +<!-- + +*commented out markdown* + +--> + +<div> +* raw html with trailing space +</div> diff --git a/oldtests/HTML/Inline.html b/oldtests/HTML/Inline.html new file mode 100644 index 0000000..94d40ac --- /dev/null +++ b/oldtests/HTML/Inline.html @@ -0,0 +1,8 @@ +<p><span>hi</span> +<span><code>hi</code></span> +<span class="foo bar" +title='whatever' blue=yes/> +Hello <!-- this +is a comment --> there. +A line<br />break. 
+<not a tag></p> diff --git a/oldtests/HTML/Inline.markdown b/oldtests/HTML/Inline.markdown new file mode 100644 index 0000000..2259421 --- /dev/null +++ b/oldtests/HTML/Inline.markdown @@ -0,0 +1,8 @@ +<span>hi</span> +<span>`hi`</span> +<span class="foo bar" +title='whatever' blue=yes/> +Hello <!-- this +is a comment --> there. +A line<br />break. +<not a tag> diff --git a/oldtests/HTML/UppercaseTags.html b/oldtests/HTML/UppercaseTags.html new file mode 100644 index 0000000..8d2d828 --- /dev/null +++ b/oldtests/HTML/UppercaseTags.html @@ -0,0 +1,4 @@ +<DIV> +this is a block +</DIV> +<p>Here is some <I>inline</I> html.</p> diff --git a/oldtests/HTML/UppercaseTags.markdown b/oldtests/HTML/UppercaseTags.markdown new file mode 100644 index 0000000..b476ffb --- /dev/null +++ b/oldtests/HTML/UppercaseTags.markdown @@ -0,0 +1,5 @@ +<DIV> +this is a block +</DIV> + +Here is some <I>inline</I> html.
\ No newline at end of file diff --git a/oldtests/Headers/ATX.html b/oldtests/Headers/ATX.html new file mode 100644 index 0000000..f375b98 --- /dev/null +++ b/oldtests/Headers/ATX.html @@ -0,0 +1,14 @@ +<h1>One</h1> +<h2>Two</h2> +<h3>Three</h3> +<h4>Four</h4> +<h5>Five</h5> +<h6>Six</h6> +<p>####### Seven</p> +<h3>Three with</h3> +<h2>Spacing doesn't matter</h2> +<h2>Escaped final #</h2> +<p>## Not a header</p> +<p>#5 not a header</p> +<h2></h2> +<p>(empty header)</p> diff --git a/oldtests/Headers/ATX.markdown b/oldtests/Headers/ATX.markdown new file mode 100644 index 0000000..f687aa5 --- /dev/null +++ b/oldtests/Headers/ATX.markdown @@ -0,0 +1,20 @@ +# One +## Two +### Three +#### Four +##### Five + +###### Six + +####### Seven + +### Three with ### +## Spacing doesn't matter ## +## Escaped final \## + +\## Not a header + +#5 not a header + +## +(empty header) diff --git a/oldtests/Headers/Setext.html b/oldtests/Headers/Setext.html new file mode 100644 index 0000000..787fb02 --- /dev/null +++ b/oldtests/Headers/Setext.html @@ -0,0 +1,9 @@ +<h1>Level one</h1> +<h2>Two</h2> +<p>In a paragraph</p> +<h2>Level two</h2> +<p>more text</p> +<p>====== +no empty headers</p> +<p>not a header</p> +<hr /> diff --git a/oldtests/Headers/Setext.markdown b/oldtests/Headers/Setext.markdown new file mode 100644 index 0000000..da0c7e2 --- /dev/null +++ b/oldtests/Headers/Setext.markdown @@ -0,0 +1,17 @@ +Level one +========= + +Two +--- + +In a paragraph + +Level two +--------- +more text + +====== +no empty headers + +not a header +------------ ----- diff --git a/oldtests/Links/AngleBrackets.html b/oldtests/Links/AngleBrackets.html new file mode 100644 index 0000000..21ac00d --- /dev/null +++ b/oldtests/Links/AngleBrackets.html @@ -0,0 +1,3 @@ +<p><a href="?}]*+|&)">silly URL with angle brackets</a>.</p> +<p><a href="url://with spaces" title="title">link</a>.</p> +<p><a href="url with spaces" title="title">link</a>.</p> diff --git a/oldtests/Links/AngleBrackets.markdown 
b/oldtests/Links/AngleBrackets.markdown new file mode 100644 index 0000000..c2e06ff --- /dev/null +++ b/oldtests/Links/AngleBrackets.markdown @@ -0,0 +1,7 @@ +[silly URL with angle brackets](<?}]*+|&)>). + +[link](<url://with spaces> "title"). + +[link][]. + +[link]: <url with spaces> "title" diff --git a/oldtests/Links/AutoLinks.html b/oldtests/Links/AutoLinks.html new file mode 100644 index 0000000..092353f --- /dev/null +++ b/oldtests/Links/AutoLinks.html @@ -0,0 +1,7 @@ +<p><a href="http://google.com?query=blah&time=15">http://google.com?query=blah&time=15</a> +<a href="mailto:someone.else@somedomain.com">someone.else@somedomain.com</a> +<a href="ftp://old.ftp.server.edu">ftp://old.ftp.server.edu</a> +<a href="git://some.git.repo/project.git">git://some.git.repo/project.git</a> +<not autolink> +<http://not.an autolink> +<relative/not/autolink></p> diff --git a/oldtests/Links/AutoLinks.markdown b/oldtests/Links/AutoLinks.markdown new file mode 100644 index 0000000..bf95b8d --- /dev/null +++ b/oldtests/Links/AutoLinks.markdown @@ -0,0 +1,7 @@ +<http://google.com?query=blah&time=15> +<someone.else@somedomain.com> +<ftp://old.ftp.server.edu> +<git://some.git.repo/project.git> +<not autolink> +<http://not.an autolink> +<relative/not/autolink> diff --git a/oldtests/Links/BackticksInLinks.html b/oldtests/Links/BackticksInLinks.html new file mode 100644 index 0000000..ff70383 --- /dev/null +++ b/oldtests/Links/BackticksInLinks.html @@ -0,0 +1 @@ +<p><a href="/url">the right bracket character (<code>]</code>)</a></p> diff --git a/oldtests/Links/BackticksInLinks.markdown b/oldtests/Links/BackticksInLinks.markdown new file mode 100644 index 0000000..539fd52 --- /dev/null +++ b/oldtests/Links/BackticksInLinks.markdown @@ -0,0 +1 @@ +[the right bracket character (`]`)](/url) diff --git a/oldtests/Links/CaseInsensitiveReferences.html b/oldtests/Links/CaseInsensitiveReferences.html new file mode 100644 index 0000000..afe4557 --- /dev/null +++ 
b/oldtests/Links/CaseInsensitiveReferences.html @@ -0,0 +1 @@ +<p><a href="/url">Толпой</a> is a Russian word.</p> diff --git a/oldtests/Links/CaseInsensitiveReferences.markdown b/oldtests/Links/CaseInsensitiveReferences.markdown new file mode 100644 index 0000000..f9653b9 --- /dev/null +++ b/oldtests/Links/CaseInsensitiveReferences.markdown @@ -0,0 +1,3 @@ +[Толпой] is a Russian word. + +[ТОЛПОЙ]: /url diff --git a/oldtests/Links/Entities.html b/oldtests/Links/Entities.html new file mode 100644 index 0000000..252dadb --- /dev/null +++ b/oldtests/Links/Entities.html @@ -0,0 +1,2 @@ +<p><a href="http://göögle.com">http://göögle.com</a></p> +<p><a href="/url" title="göögle & yahoo">hi</a></p> diff --git a/oldtests/Links/Entities.markdown b/oldtests/Links/Entities.markdown new file mode 100644 index 0000000..d81ee36 --- /dev/null +++ b/oldtests/Links/Entities.markdown @@ -0,0 +1,3 @@ +<http://göögle.com> + +[hi](/url "göögle & yahoo") diff --git a/oldtests/Links/InlineLinks.html b/oldtests/Links/InlineLinks.html new file mode 100644 index 0000000..ae33f33 --- /dev/null +++ b/oldtests/Links/InlineLinks.html @@ -0,0 +1,10 @@ +<ol> +<li><a href="/url">link</a></li> +<li><a href="/url" title="title">link</a></li> +<li><a href="/url" title="title">link</a></li> +<li><a href="/url with spaces" title="title +with linebreak">link <em>with +linebreak</em></a>.</li> +<li><a href="/url(withparens)" title="and single quoted title">link</a></li> +<li>[not a link] (/url)</li> +</ol> diff --git a/oldtests/Links/InlineLinks.markdown b/oldtests/Links/InlineLinks.markdown new file mode 100644 index 0000000..a822c4d --- /dev/null +++ b/oldtests/Links/InlineLinks.markdown @@ -0,0 +1,9 @@ +1. [link](/url) +2. [link](/url "title") +3. [link](/url + "title") +4. [link *with +linebreak*](</url with spaces> "title +with linebreak"). +5. [link](/url(withparens) 'and single quoted title') +6. 
[not a link] (/url) diff --git a/oldtests/Links/ParensInURLs.html b/oldtests/Links/ParensInURLs.html new file mode 100644 index 0000000..9cd6de7 --- /dev/null +++ b/oldtests/Links/ParensInURLs.html @@ -0,0 +1,6 @@ +<p><a href="/url(test)" title="title">Inline link 1 with parens</a>.</p> +<p><a href="/url(test)" title="title">Inline link 2 with parens</a>.</p> +<p><a href="/url(test)" title="title">Inline link 3 with non-escaped parens</a>.</p> +<p><a href="/url(test)" title="title">Inline link 4 with non-escaped parens</a>.</p> +<p><a href="/url(test)" title="title">Reference link 1 with parens</a>.</p> +<p><a href="/url(test)" title="title">Reference link 2 with parens</a>.</p> diff --git a/oldtests/Links/ParensInURLs.markdown b/oldtests/Links/ParensInURLs.markdown new file mode 100644 index 0000000..bb7be4f --- /dev/null +++ b/oldtests/Links/ParensInURLs.markdown @@ -0,0 +1,14 @@ +[Inline link 1 with parens](/url\(test\) "title"). + +[Inline link 2 with parens](</url\(test\)> "title"). + +[Inline link 3 with non-escaped parens](/url(test) "title"). + +[Inline link 4 with non-escaped parens](</url(test)> "title"). + +[Reference link 1 with parens][1]. + +[Reference link 2 with parens][2]. + + [1]: /url(test) "title" + [2]: </url(test)> "title" diff --git a/oldtests/Links/ReferenceLinks.html b/oldtests/Links/ReferenceLinks.html new file mode 100644 index 0000000..397cdb2 --- /dev/null +++ b/oldtests/Links/ReferenceLinks.html @@ -0,0 +1,7 @@ +<ol> +<li><p><a href="/url" title="even in a list item">Link references</a> can be defined anywhere.</p></li> +</ol> +<blockquote> +<p><a href="/foo" title="can break +lines">another</a> one</p> +</blockquote> diff --git a/oldtests/Links/ReferenceLinks.markdown b/oldtests/Links/ReferenceLinks.markdown new file mode 100644 index 0000000..ebcf5a9 --- /dev/null +++ b/oldtests/Links/ReferenceLinks.markdown @@ -0,0 +1,10 @@ +1. [Link references] can be defined anywhere. 
+ + [Link references]: /url + (even in a list item) + +> [another] one +> +> [another]: +> /foo "can break +> lines" diff --git a/oldtests/Lists/CodeBlocksInLists.html b/oldtests/Lists/CodeBlocksInLists.html new file mode 100644 index 0000000..fcd3e2a --- /dev/null +++ b/oldtests/Lists/CodeBlocksInLists.html @@ -0,0 +1,14 @@ +<ol> +<li><p>list item +code</p></li> +<li><p>list item</p> +<pre><code>code +</code></pre></li> +<li><pre><code>code +</code></pre></li> +<li><pre><code>code +</code></pre></li> +<li><pre><code>code +code +</code></pre></li> +</ol> diff --git a/oldtests/Lists/CodeBlocksInLists.markdown b/oldtests/Lists/CodeBlocksInLists.markdown new file mode 100644 index 0000000..7730808 --- /dev/null +++ b/oldtests/Lists/CodeBlocksInLists.markdown @@ -0,0 +1,18 @@ +1. list item + code + +2. list item + ~~~ + code + ~~~ + +3. ~~~ + code + ~~~ + +4. ~~~ + code + ~~~ + +5. code + code diff --git a/oldtests/Lists/ConsecutiveLists.html b/oldtests/Lists/ConsecutiveLists.html new file mode 100644 index 0000000..f8f9098 --- /dev/null +++ b/oldtests/Lists/ConsecutiveLists.html @@ -0,0 +1,20 @@ +<ul> +<li>one</li> +<li>one</li> +</ul> +<ul> +<li>two</li> +<li>two</li> +</ul> +<ul> +<li>three</li> +<li>three</li> +</ul> +<ol> +<li>four</li> +<li>four</li> +</ol> +<ol> +<li>five</li> +<li>five</li> +</ol> diff --git a/oldtests/Lists/ConsecutiveLists.markdown b/oldtests/Lists/ConsecutiveLists.markdown new file mode 100644 index 0000000..c4faa54 --- /dev/null +++ b/oldtests/Lists/ConsecutiveLists.markdown @@ -0,0 +1,10 @@ +* one +* one ++ two ++ two +- three +- three +1. four +1. 
four +1) five +1) five diff --git a/oldtests/Lists/EmptyListItem.html b/oldtests/Lists/EmptyListItem.html new file mode 100644 index 0000000..2c23fe1 --- /dev/null +++ b/oldtests/Lists/EmptyListItem.html @@ -0,0 +1,10 @@ +<ul> +<li>one</li> +<li></li> +<li>three</li> +</ul> +<ol> +<li>one</li> +<li></li> +<li>three</li> +</ol> diff --git a/oldtests/Lists/EmptyListItem.markdown b/oldtests/Lists/EmptyListItem.markdown new file mode 100644 index 0000000..d30cbc3 --- /dev/null +++ b/oldtests/Lists/EmptyListItem.markdown @@ -0,0 +1,7 @@ +- one +- +- three + +1. one +2. +3. three diff --git a/oldtests/Lists/InBlockquote.html b/oldtests/Lists/InBlockquote.html new file mode 100644 index 0000000..da233e8 --- /dev/null +++ b/oldtests/Lists/InBlockquote.html @@ -0,0 +1,22 @@ +<blockquote> +<ul> +<li>tight</li> +<li>tight</li> +</ul> +</blockquote> +<blockquote> +<ul> +<li><p>loose</p></li> +<li><p>loose</p></li> +</ul> +</blockquote> +<blockquote> +<ul> +<li>one-item list</li> +</ul> +</blockquote> +<blockquote> +<ul> +<li>one-item list</li> +</ul> +</blockquote> diff --git a/oldtests/Lists/InBlockquote.markdown b/oldtests/Lists/InBlockquote.markdown new file mode 100644 index 0000000..511563b --- /dev/null +++ b/oldtests/Lists/InBlockquote.markdown @@ -0,0 +1,12 @@ +> - tight +> - tight + + +> - loose +> +> - loose + + +> - one-item list + +> - one-item list diff --git a/oldtests/Lists/Indents.html b/oldtests/Lists/Indents.html new file mode 100644 index 0000000..a11a5a6 --- /dev/null +++ b/oldtests/Lists/Indents.html @@ -0,0 +1,22 @@ +<blockquote> +<ul> +<li><p>foo</p> +<p>bar</p></li> +</ul> +</blockquote> +<ul> +<li>one</li> +<li>two</li> +</ul> +<ul> +<li>one</li> +<li>two</li> +<li>three</li> +</ul> +<ul> +<li>one +<ul> +<li>two</li> +</ul></li> +<li>three</li> +</ul> diff --git a/oldtests/Lists/Indents.markdown b/oldtests/Lists/Indents.markdown new file mode 100644 index 0000000..293d112 --- /dev/null +++ b/oldtests/Lists/Indents.markdown @@ -0,0 +1,17 @@ + > * foo +> 
+> bar + + + - one + - two + + +- one + - two +- three + + +- one + - two +- three diff --git a/oldtests/Lists/ListsAndHRs.html b/oldtests/Lists/ListsAndHRs.html new file mode 100644 index 0000000..40826f7 --- /dev/null +++ b/oldtests/Lists/ListsAndHRs.html @@ -0,0 +1,7 @@ +<ul> +<li>item 1 +<ul> +<li>item 2</li> +</ul></li> +</ul> +<hr /> diff --git a/oldtests/Lists/ListsAndHRs.markdown b/oldtests/Lists/ListsAndHRs.markdown new file mode 100644 index 0000000..19c07e7 --- /dev/null +++ b/oldtests/Lists/ListsAndHRs.markdown @@ -0,0 +1,3 @@ +* item 1 + * item 2 +* * * * * diff --git a/oldtests/Lists/ListsAndSetextHeaders.html b/oldtests/Lists/ListsAndSetextHeaders.html new file mode 100644 index 0000000..c6af9eb --- /dev/null +++ b/oldtests/Lists/ListsAndSetextHeaders.html @@ -0,0 +1,6 @@ +<ol> +<li>item</li> +<li>item +Not header</li> +</ol> +<hr /> diff --git a/oldtests/Lists/ListsAndSetextHeaders.markdown b/oldtests/Lists/ListsAndSetextHeaders.markdown new file mode 100644 index 0000000..acfa655 --- /dev/null +++ b/oldtests/Lists/ListsAndSetextHeaders.markdown @@ -0,0 +1,4 @@ +1. item +2. 
item +Not header +---------- diff --git a/oldtests/Lists/MultipleBlankLines.html b/oldtests/Lists/MultipleBlankLines.html new file mode 100644 index 0000000..d894db1 --- /dev/null +++ b/oldtests/Lists/MultipleBlankLines.html @@ -0,0 +1,56 @@ +<ol> +<li><p>First Item</p> +<ul> +<li><p>one</p> +<ul> +<li>two</li> +</ul></li> +<li><p>one</p> +<ul> +<li>two</li> +</ul></li> +</ul></li> +<li><p>Second Item</p> +<ul> +<li>one +<ul> +<li>two</li> +</ul></li> +</ul></li> +</ol> +<ul> +<li>one +<ul> +<li>two</li> +</ul></li> +</ul> +<ol> +<li><p>Third Item</p> +<ul> +<li>one +<ul> +<li>two</li> +</ul></li> +</ul></li> +</ol> +<ul> +<li>one +<ul> +<li>two</li> +</ul></li> +</ul> +<ol> +<li><p>Fourth Item</p> +<ul> +<li>one +<ul> +<li>two</li> +</ul></li> +</ul></li> +</ol> +<ul> +<li>one +<ul> +<li>two</li> +</ul></li> +</ul> diff --git a/oldtests/Lists/MultipleBlankLines.markdown b/oldtests/Lists/MultipleBlankLines.markdown new file mode 100644 index 0000000..e24a4f2 --- /dev/null +++ b/oldtests/Lists/MultipleBlankLines.markdown @@ -0,0 +1,37 @@ +1. First Item + + * one + * two + + * one + * two + +1. Second Item + + * one + * two + + + * one + * two + +1. Third Item + + * one + * two + + + + * one + * two + +1. Fourth Item + + * one + * two + + + + + * one + * two diff --git a/oldtests/Lists/Start.html b/oldtests/Lists/Start.html new file mode 100644 index 0000000..46e0550 --- /dev/null +++ b/oldtests/Lists/Start.html @@ -0,0 +1,11 @@ +<ol start="4"> +<li>this list starts with 4.</li> +<li>and continues</li> +<li>the continuation number is irrelevant.</li> +</ol> +<ol start="2001"> +<li>a space odyssey</li> +</ol> +<ol> +<li>standard lists get no start attribute</li> +</ol> diff --git a/oldtests/Lists/Start.markdown b/oldtests/Lists/Start.markdown new file mode 100644 index 0000000..175226f --- /dev/null +++ b/oldtests/Lists/Start.markdown @@ -0,0 +1,7 @@ +4. this list starts with 4. +5. and continues +1. the continuation number is irrelevant. 
+ +2001) a space odyssey + +1. standard lists get no start attribute diff --git a/oldtests/Lists/Sublists.html b/oldtests/Lists/Sublists.html new file mode 100644 index 0000000..af62915 --- /dev/null +++ b/oldtests/Lists/Sublists.html @@ -0,0 +1,49 @@ +<p>Four levels:</p> +<ul> +<li>one +<ul> +<li>two +<ul> +<li>three +<ul> +<li>four</li> +</ul></li> +</ul></li> +</ul></li> +</ul> +<ol> +<li>one +<ol> +<li>two +<ol> +<li>three +<ol> +<li>four</li> +</ol></li> +</ol></li> +</ol></li> +</ol> +<ol> +<li>one +<ul> +<li>two +<ol> +<li>three +<ul> +<li>four</li> +</ul></li> +</ol></li> +</ul></li> +</ol> +<ul> +<li>one +<ul> +<li>two +<ol> +<li>three +<ol> +<li>four</li> +</ol></li> +</ol></li> +</ul></li> +</ul> diff --git a/oldtests/Lists/Sublists.markdown b/oldtests/Lists/Sublists.markdown new file mode 100644 index 0000000..9eced9e --- /dev/null +++ b/oldtests/Lists/Sublists.markdown @@ -0,0 +1,24 @@ +Four levels: + +- one + - two + - three + - four + + +1. one + 1. two + 1. three + 1. four + + +1) one + - two + 1) three + - four + + +- one + - two + 1. 
three + 1) four diff --git a/oldtests/Lists/TightAndLoose.html b/oldtests/Lists/TightAndLoose.html new file mode 100644 index 0000000..7792ebb --- /dev/null +++ b/oldtests/Lists/TightAndLoose.html @@ -0,0 +1,49 @@ +<ol> +<li>tight</li> +</ol> +<ul> +<li>tight</li> +<li>list</li> +</ul> +<ul> +<li><p>loose</p></li> +<li><p>list</p></li> +</ul> +<ol> +<li>tight</li> +<li>list</li> +</ol> +<ol> +<li><p>loose</p></li> +<li><p>list</p></li> +</ol> +<ol> +<li><p>loose</p> +<ul> +<li>sublist</li> +</ul></li> +</ol> +<ol> +<li>tight +<ul> +<li>sublist</li> +</ul></li> +</ol> +<ul> +<li>tight +<blockquote> +<p>blockquote +and</p> +</blockquote> +<pre><code>code +</code></pre></li> +<li>tight</li> +</ul> +<ul> +<li>tight +<ul> +<li><p>with loose</p></li> +<li><p>sublist</p></li> +</ul></li> +<li>tight</li> +</ul> diff --git a/oldtests/Lists/TightAndLoose.markdown b/oldtests/Lists/TightAndLoose.markdown new file mode 100644 index 0000000..263a34c --- /dev/null +++ b/oldtests/Lists/TightAndLoose.markdown @@ -0,0 +1,45 @@ +1. tight + + +- tight +- list + + +- loose + +- list + + +1. tight +2. list + + +1. loose + +2. list + + +1. loose + + - sublist + + + +1. 
tight + - sublist + + +- tight + > blockquote + and + ``` + code + ``` +- tight + + +- tight + - with loose + + - sublist +- tight diff --git a/oldtests/Lists/TightLooseBlockquote.html b/oldtests/Lists/TightLooseBlockquote.html new file mode 100644 index 0000000..7e78214 --- /dev/null +++ b/oldtests/Lists/TightLooseBlockquote.html @@ -0,0 +1,32 @@ +<ul> +<li>tight I +<blockquote> +<p>bq</p> +</blockquote></li> +<li>tight I</li> +</ul> +<ul> +<li>tight II +<blockquote> +<p>bq</p> +</blockquote> +foo</li> +<li>tight II</li> +</ul> +<ol> +<li>Blank lines in bq don't break list +<blockquote> +<p>bq</p> +</blockquote></li> +<li>Should say (2) in output</li> +</ol> +<ul> +<li>Blank lines in bq don't break LI +<ul> +<li>item A +<blockquote> +<p>bq</p> +</blockquote></li> +<li>item B</li> +</ul></li> +</ul> diff --git a/oldtests/Lists/TightLooseBlockquote.markdown b/oldtests/Lists/TightLooseBlockquote.markdown new file mode 100644 index 0000000..08200cc --- /dev/null +++ b/oldtests/Lists/TightLooseBlockquote.markdown @@ -0,0 +1,25 @@ +* tight I + > bq + > +* tight I + + +* tight II + > bq + > + foo +* tight II + +1. Blank lines in bq don't break list + > bq + > + > +1. Should say (2) in output + +* Blank lines in bq don't break LI + * item A + > bq + > + > + * item B +
\ No newline at end of file diff --git a/oldtests/Lists/TightLooseMore.html b/oldtests/Lists/TightLooseMore.html new file mode 100644 index 0000000..f26f457 --- /dev/null +++ b/oldtests/Lists/TightLooseMore.html @@ -0,0 +1,7 @@ +<ul> +<li><p>foo</p> +<ul> +<li>bar</li> +</ul> +<p>blah</p></li> +</ul> diff --git a/oldtests/Lists/TightLooseMore.markdown b/oldtests/Lists/TightLooseMore.markdown new file mode 100644 index 0000000..7ace63f --- /dev/null +++ b/oldtests/Lists/TightLooseMore.markdown @@ -0,0 +1,4 @@ +* foo + * bar + + blah
\ No newline at end of file diff --git a/oldtests/Lists/TwoBlankLinesEndList.html b/oldtests/Lists/TwoBlankLinesEndList.html new file mode 100644 index 0000000..629add1 --- /dev/null +++ b/oldtests/Lists/TwoBlankLinesEndList.html @@ -0,0 +1,21 @@ +<ol> +<li><p>one</p></li> +<li><p>two</p></li> +</ol> +<ol> +<li>new list</li> +</ol> +<blockquote> +<ul> +<li><p>one</p></li> +<li><p>two</p></li> +</ul> +<ul> +<li>new list</li> +</ul> +</blockquote> +<ol> +<li>one</li> +</ol> +<pre><code>code +</code></pre> diff --git a/oldtests/Lists/TwoBlankLinesEndList.markdown b/oldtests/Lists/TwoBlankLinesEndList.markdown new file mode 100644 index 0000000..2984a19 --- /dev/null +++ b/oldtests/Lists/TwoBlankLinesEndList.markdown @@ -0,0 +1,20 @@ +1. one + +2. two + + +1. new list + + +> - one +> +> - two +> +> +> - new list + + +1. one + + + code diff --git a/oldtests/Makefile b/oldtests/Makefile new file mode 100644 index 0000000..c8a30bd --- /dev/null +++ b/oldtests/Makefile @@ -0,0 +1,55 @@ +SHELL=/bin/bash +TESTDIR ?= * +PATT ?= . 
+TESTS=$(shell ls $(TESTDIR)/*.markdown | grep $(PATT)) +DIFFS=$(patsubst %.markdown,%.diff,$(TESTS)) +PROG ?= ../stmd +FILTER ?= perl -pe 's/ /␣/g' +TIDYCMD ?= tidy -asxhtml -utf8 --show-body-only yes --show-warnings no -quiet +DETAILS ?= 1 + +# Check to see if echo supports -e option to allow backslash escapes +ifeq ($(shell echo -e),-e) +ECHO=echo +else +ECHO=echo -e +endif + +all: $(DIFFS) + PASS=0;TESTS=0; \ + for f in $(DIFFS); do \ + let TESTS=TESTS+1; \ + [ -s $$f ] || let PASS=PASS+1; \ + done; \ + $(ECHO) "\033[1m$$PASS of $$TESTS tests passed.\033[0m"; \ + if [ $$TESTS -eq $$PASS ]; then exit 0; else exit 1; fi + +%.actual.html: %.markdown +ifeq ($(TIDY),1) + -cat $< | $(PROG) | $(TIDYCMD) > $@ +else + -cat $< | $(PROG) > $@ +endif + +%.expected.html: %.html +ifeq ($(TIDY),1) + -$(TIDYCMD) $< > $@ +else + cp $< $@ +endif + +%.diff: %.expected.html %.actual.html + diff --unified=1 <(cat $(word 1,$^) | $(FILTER)) <(cat $(word 2,$^) | $(FILTER)) > $@ ; \ + if [ -s $@ ]; then \ + $(ECHO) "\033[1;31m✘ $(patsubst %.diff,%,$@)\033[0m"; \ + if [ $(DETAILS) == "1" ]; then \ + $(ECHO) "\033[0;36m" ; cat $@; $(ECHO) "\033[0m"; \ + fi \ + else \ + $(ECHO) "\033[1;32m✓ $(patsubst %.diff,%,$@)\033[0m"; \ + fi + +.PHONY: all clean + +clean: + -@rm */*.{diff,actual.html,expected.html} diff --git a/oldtests/Misc/BackslashEscapes.html b/oldtests/Misc/BackslashEscapes.html new file mode 100644 index 0000000..3eb2aed --- /dev/null +++ b/oldtests/Misc/BackslashEscapes.html @@ -0,0 +1,14 @@ +<p>*not emphasized* +\<em>emphasis</em> +**not bold** +<br/> not a tag +[link](/foo) not a link +<a href="/foo)" title="title"">link</a> +`not code`</p> +<p>1. not a list item</p> +<p>* not a list.</p> +<p># Not a header</p> +<p>[foo]: /url "not a reference"</p> +<p>$ ^ ; can be escaped. +\a \b \T cannot. 
+unicode letters and symbols cannot: \π \‥.</p> diff --git a/oldtests/Misc/BackslashEscapes.markdown b/oldtests/Misc/BackslashEscapes.markdown new file mode 100644 index 0000000..23496dc --- /dev/null +++ b/oldtests/Misc/BackslashEscapes.markdown @@ -0,0 +1,19 @@ +\*not emphasized* +\\*emphasis* +\*\*not bold** +\<br/> not a tag +\[link](/foo) not a link +[link](/foo\) "title\"") +\`not code` + +1\. not a list item + +\* not a list. + +\# Not a header + +\[foo]: /url "not a reference" + +\$ \^ \; can be escaped. +\a \b \T cannot. +unicode letters and symbols cannot: \π \‥. diff --git a/oldtests/Misc/Laziness.html b/oldtests/Misc/Laziness.html new file mode 100644 index 0000000..e130eb5 --- /dev/null +++ b/oldtests/Misc/Laziness.html @@ -0,0 +1,22 @@ +<blockquote> +<ol> +<li>one +two</li> +</ol> +</blockquote> +<p>Laziness only affects paragraph continuations:</p> +<blockquote> +<pre><code>code +</code></pre> +</blockquote> +<pre><code>not same code block +</code></pre> +<ol> +<li>hello</li> +</ol> +<hr /> +<blockquote> +<pre><code></code></pre> +</blockquote> +<p>code</p> +<pre><code></code></pre> diff --git a/oldtests/Misc/Laziness.markdown b/oldtests/Misc/Laziness.markdown new file mode 100644 index 0000000..2c32870 --- /dev/null +++ b/oldtests/Misc/Laziness.markdown @@ -0,0 +1,14 @@ +> 1. one +> two + +Laziness only affects paragraph continuations: + +> code + not same code block + +1. hello +----- + +> ``` +code +``` diff --git a/oldtests/Misc/LineBreaks.html b/oldtests/Misc/LineBreaks.html new file mode 100644 index 0000000..2d85e85 --- /dev/null +++ b/oldtests/Misc/LineBreaks.html @@ -0,0 +1,11 @@ +<p>Two spaces<br /> +break a line. 
Or more than two<br /> +and spaces in the following line are absorbed.</p> +<p>You can also break lines with<br /> +a backslash.</p> +<p>Two spaces at the end of a paragraph are +not a line break.</p> +<p>A backslash at the end of a paragraph is +not a line break.\</p> +<h2>Similarly with setext headers</h2> +<h2>And with backslashes\</h2> diff --git a/oldtests/Misc/LineBreaks.markdown b/oldtests/Misc/LineBreaks.markdown new file mode 100644 index 0000000..3632dcb --- /dev/null +++ b/oldtests/Misc/LineBreaks.markdown @@ -0,0 +1,18 @@ +Two spaces +break a line. Or more than two + and spaces in the following line are absorbed. + +You can also break lines with\ +a backslash. + +Two spaces at the end of a paragraph are +not a line break. + +A backslash at the end of a paragraph is +not a line break.\ + +Similarly with setext headers +------------------------------- + +And with backslashes\ +--------------------- diff --git a/oldtests/Misc/Transitions.html b/oldtests/Misc/Transitions.html new file mode 100644 index 0000000..fceff9f --- /dev/null +++ b/oldtests/Misc/Transitions.html @@ -0,0 +1,26 @@ +<blockquote> +<p>blockquote</p> +<blockquote> +<p>blockquote</p> +</blockquote> +</blockquote> +<ol> +<li>list</li> +<li>list +<ul> +<li>sublist</li> +</ul></li> +</ol> +<hr /> +<p>paragraph</p> +<h2>header</h2> +<h3>header</h3> +<pre><code>code +</code></pre> +<pre><code>code +</code></pre> +<div> + <div> +# not a header + </div> +</div> diff --git a/oldtests/Misc/Transitions.markdown b/oldtests/Misc/Transitions.markdown new file mode 100644 index 0000000..5f3a9d3 --- /dev/null +++ b/oldtests/Misc/Transitions.markdown @@ -0,0 +1,20 @@ +> blockquote +> > blockquote +1. list +2. 
list + - sublist +* * * * * +paragraph + +header +------ +### header + code +``` +code +``` +<div> + <div> +# not a header + </div> +</div> diff --git a/oldtests/Original/Amps_and_angle_encoding.html b/oldtests/Original/Amps_and_angle_encoding.html new file mode 100644 index 0000000..fc1b2c3 --- /dev/null +++ b/oldtests/Original/Amps_and_angle_encoding.html @@ -0,0 +1,9 @@ +<p>AT&T has an ampersand in their name.</p> +<p>AT&T is another way to write it.</p> +<p>This & that.</p> +<p>4 < 5.</p> +<p>6 > 5.</p> +<p>Here's a <a href="http://example.com/?foo=1&bar=2">link</a> with an ampersand in the URL.</p> +<p>Here's a link with an amersand in the link text: <a href="http://att.com/" title="AT&T">AT&T</a>.</p> +<p>Here's an inline <a href="/script?foo=1&bar=2">link</a>.</p> +<p>Here's an inline <a href="/script?foo=1&bar=2">link</a>.</p> diff --git a/oldtests/Original/Amps_and_angle_encoding.markdown b/oldtests/Original/Amps_and_angle_encoding.markdown new file mode 100644 index 0000000..0e9527f --- /dev/null +++ b/oldtests/Original/Amps_and_angle_encoding.markdown @@ -0,0 +1,21 @@ +AT&T has an ampersand in their name. + +AT&T is another way to write it. + +This & that. + +4 < 5. + +6 > 5. + +Here's a [link] [1] with an ampersand in the URL. + +Here's a link with an amersand in the link text: [AT&T] [2]. + +Here's an inline [link](/script?foo=1&bar=2). + +Here's an inline [link](</script?foo=1&bar=2>). + + +[1]: http://example.com/?foo=1&bar=2 +[2]: http://att.com/ "AT&T"
\ No newline at end of file diff --git a/oldtests/Original/Auto_links.html b/oldtests/Original/Auto_links.html new file mode 100644 index 0000000..f517fe6 --- /dev/null +++ b/oldtests/Original/Auto_links.html @@ -0,0 +1,13 @@ +<p>Link: <a href="http://example.com/">http://example.com/</a>.</p> +<p>With an ampersand: <a href="http://example.com/?foo=1&bar=2">http://example.com/?foo=1&bar=2</a></p> +<ul> +<li>In a list?</li> +<li><a href="http://example.com/">http://example.com/</a></li> +<li>It should.</li> +</ul> +<blockquote> +<p>Blockquoted: <a href="http://example.com/">http://example.com/</a></p> +</blockquote> +<p>Auto-links should not occur here: <code><http://example.com/></code></p> +<pre><code>or here: <http://example.com/> +</code></pre> diff --git a/oldtests/Original/Auto_links.markdown b/oldtests/Original/Auto_links.markdown new file mode 100644 index 0000000..abbc488 --- /dev/null +++ b/oldtests/Original/Auto_links.markdown @@ -0,0 +1,13 @@ +Link: <http://example.com/>. + +With an ampersand: <http://example.com/?foo=1&bar=2> + +* In a list? +* <http://example.com/> +* It should. + +> Blockquoted: <http://example.com/> + +Auto-links should not occur here: `<http://example.com/>` + + or here: <http://example.com/>
\ No newline at end of file diff --git a/oldtests/Original/Backslash_escapes.html b/oldtests/Original/Backslash_escapes.html new file mode 100644 index 0000000..9a83379 --- /dev/null +++ b/oldtests/Original/Backslash_escapes.html @@ -0,0 +1,75 @@ +<p>These should all get escaped:</p> +<p>Backslash: \</p> +<p>Backtick: `</p> +<p>Asterisk: *</p> +<p>Underscore: _</p> +<p>Left brace: {</p> +<p>Right brace: }</p> +<p>Left bracket: [</p> +<p>Right bracket: ]</p> +<p>Left paren: (</p> +<p>Right paren: )</p> +<p>Greater-than: ></p> +<p>Hash: #</p> +<p>Period: .</p> +<p>Bang: !</p> +<p>Plus: +</p> +<p>Minus: -</p> +<p>These should not, because they occur within a code block:</p> +<pre><code>Backslash: \\ + +Backtick: \` + +Asterisk: \* + +Underscore: \_ + +Left brace: \{ + +Right brace: \} + +Left bracket: \[ + +Right bracket: \] + +Left paren: \( + +Right paren: \) + +Greater-than: \> + +Hash: \# + +Period: \. + +Bang: \! + +Plus: \+ + +Minus: \- +</code></pre> +<p>Nor should these, which occur in code spans:</p> +<p>Backslash: <code>\\</code></p> +<p>Backtick: <code>\`</code></p> +<p>Asterisk: <code>\*</code></p> +<p>Underscore: <code>\_</code></p> +<p>Left brace: <code>\{</code></p> +<p>Right brace: <code>\}</code></p> +<p>Left bracket: <code>\[</code></p> +<p>Right bracket: <code>\]</code></p> +<p>Left paren: <code>\(</code></p> +<p>Right paren: <code>\)</code></p> +<p>Greater-than: <code>\></code></p> +<p>Hash: <code>\#</code></p> +<p>Period: <code>\.</code></p> +<p>Bang: <code>\!</code></p> +<p>Plus: <code>\+</code></p> +<p>Minus: <code>\-</code></p> +<p>These should get escaped, even though they're matching pairs for +other Markdown constructs:</p> +<p>*asterisks*</p> +<p>_underscores_</p> +<p>`backticks`</p> +<p>This is a code span with a literal backslash-backtick sequence: <code>\`</code></p> +<p>This is a tag with unescaped backticks <span attr='`ticks`'>bar</span>.</p> +<p>This is a tag with backslashes <span attr='\\backslashes\\'>bar</span>.</p> diff --git 
a/oldtests/Original/Backslash_escapes.markdown b/oldtests/Original/Backslash_escapes.markdown new file mode 100644 index 0000000..5b014cb --- /dev/null +++ b/oldtests/Original/Backslash_escapes.markdown @@ -0,0 +1,120 @@ +These should all get escaped: + +Backslash: \\ + +Backtick: \` + +Asterisk: \* + +Underscore: \_ + +Left brace: \{ + +Right brace: \} + +Left bracket: \[ + +Right bracket: \] + +Left paren: \( + +Right paren: \) + +Greater-than: \> + +Hash: \# + +Period: \. + +Bang: \! + +Plus: \+ + +Minus: \- + + + +These should not, because they occur within a code block: + + Backslash: \\ + + Backtick: \` + + Asterisk: \* + + Underscore: \_ + + Left brace: \{ + + Right brace: \} + + Left bracket: \[ + + Right bracket: \] + + Left paren: \( + + Right paren: \) + + Greater-than: \> + + Hash: \# + + Period: \. + + Bang: \! + + Plus: \+ + + Minus: \- + + +Nor should these, which occur in code spans: + +Backslash: `\\` + +Backtick: `` \` `` + +Asterisk: `\*` + +Underscore: `\_` + +Left brace: `\{` + +Right brace: `\}` + +Left bracket: `\[` + +Right bracket: `\]` + +Left paren: `\(` + +Right paren: `\)` + +Greater-than: `\>` + +Hash: `\#` + +Period: `\.` + +Bang: `\!` + +Plus: `\+` + +Minus: `\-` + + +These should get escaped, even though they're matching pairs for +other Markdown constructs: + +\*asterisks\* + +\_underscores\_ + +\`backticks\` + +This is a code span with a literal backslash-backtick sequence: `` \` `` + +This is a tag with unescaped backticks <span attr='`ticks`'>bar</span>. + +This is a tag with backslashes <span attr='\\backslashes\\'>bar</span>. 
diff --git a/oldtests/Original/Blockquotes_with_code_blocks.html b/oldtests/Original/Blockquotes_with_code_blocks.html new file mode 100644 index 0000000..fd1cb1b --- /dev/null +++ b/oldtests/Original/Blockquotes_with_code_blocks.html @@ -0,0 +1,12 @@ +<blockquote> +<p>Example:</p> +<pre><code>sub status { + print "working"; +} +</code></pre> +<p>Or:</p> +<pre><code>sub status { + return "working"; +} +</code></pre> +</blockquote> diff --git a/oldtests/Original/Blockquotes_with_code_blocks.markdown b/oldtests/Original/Blockquotes_with_code_blocks.markdown new file mode 100644 index 0000000..c31d171 --- /dev/null +++ b/oldtests/Original/Blockquotes_with_code_blocks.markdown @@ -0,0 +1,11 @@ +> Example: +> +> sub status { +> print "working"; +> } +> +> Or: +> +> sub status { +> return "working"; +> } diff --git a/oldtests/Original/Code_Blocks.html b/oldtests/Original/Code_Blocks.html new file mode 100644 index 0000000..7d89615 --- /dev/null +++ b/oldtests/Original/Code_Blocks.html @@ -0,0 +1,12 @@ +<pre><code>code block on the first line +</code></pre> +<p>Regular text.</p> +<pre><code>code block indented by spaces +</code></pre> +<p>Regular text.</p> +<pre><code>the lines in this block +all contain trailing spaces +</code></pre> +<p>Regular Text.</p> +<pre><code>code block on the last line +</code></pre> diff --git a/oldtests/Original/Code_Blocks.markdown b/oldtests/Original/Code_Blocks.markdown new file mode 100644 index 0000000..b54b092 --- /dev/null +++ b/oldtests/Original/Code_Blocks.markdown @@ -0,0 +1,14 @@ + code block on the first line + +Regular text. + + code block indented by spaces + +Regular text. + + the lines in this block + all contain trailing spaces + +Regular Text. + + code block on the last line
\ No newline at end of file diff --git a/oldtests/Original/Code_Spans.html b/oldtests/Original/Code_Spans.html new file mode 100644 index 0000000..27acea1 --- /dev/null +++ b/oldtests/Original/Code_Spans.html @@ -0,0 +1,3 @@ +<p><code><test a="</code> content of attribute <code>"></code></p> +<p>Fix for backticks within HTML tag: <span attr='`ticks`'>like this</span></p> +<p>Here's how you put <code>`backticks`</code> in a code span.</p> diff --git a/oldtests/Original/Code_Spans.markdown b/oldtests/Original/Code_Spans.markdown new file mode 100644 index 0000000..5c229c7 --- /dev/null +++ b/oldtests/Original/Code_Spans.markdown @@ -0,0 +1,5 @@ +`<test a="` content of attribute `">` + +Fix for backticks within HTML tag: <span attr='`ticks`'>like this</span> + +Here's how you put `` `backticks` `` in a code span.
\ No newline at end of file diff --git a/oldtests/Original/Horizontal_rules.html b/oldtests/Original/Horizontal_rules.html new file mode 100644 index 0000000..a89efdb --- /dev/null +++ b/oldtests/Original/Horizontal_rules.html @@ -0,0 +1,39 @@ +<p>Dashes:</p> +<hr /> +<hr /> +<hr /> +<hr /> +<pre><code>--- +</code></pre> +<hr /> +<hr /> +<hr /> +<hr /> +<pre><code>- - - +</code></pre> +<p>Asterisks:</p> +<hr /> +<hr /> +<hr /> +<hr /> +<pre><code>*** +</code></pre> +<hr /> +<hr /> +<hr /> +<hr /> +<pre><code>* * * +</code></pre> +<p>Underscores:</p> +<hr /> +<hr /> +<hr /> +<hr /> +<pre><code>___ +</code></pre> +<hr /> +<hr /> +<hr /> +<hr /> +<pre><code>_ _ _ +</code></pre> diff --git a/oldtests/Original/Horizontal_rules.markdown b/oldtests/Original/Horizontal_rules.markdown new file mode 100644 index 0000000..1594bda --- /dev/null +++ b/oldtests/Original/Horizontal_rules.markdown @@ -0,0 +1,67 @@ +Dashes: + +--- + + --- + + --- + + --- + + --- + +- - - + + - - - + + - - - + + - - - + + - - - + + +Asterisks: + +*** + + *** + + *** + + *** + + *** + +* * * + + * * * + + * * * + + * * * + + * * * + + +Underscores: + +___ + + ___ + + ___ + + ___ + + ___ + +_ _ _ + + _ _ _ + + _ _ _ + + _ _ _ + + _ _ _ diff --git a/oldtests/Original/Images.html b/oldtests/Original/Images.html new file mode 100644 index 0000000..bd5a7e0 --- /dev/null +++ b/oldtests/Original/Images.html @@ -0,0 +1,11 @@ +<p><img src="/path/to/img.jpg" alt="Alt text" /></p> +<p><img src="/path/to/img.jpg" alt="Alt text" title="Optional title" /></p> +<p>Inline within a paragraph: <a href="/url/">alt text</a>.</p> +<p><img src="/url/" alt="alt text" title="title preceded by two spaces" /></p> +<p><img src="/url/" alt="alt text" title="title has spaces afterward" /></p> +<p><img src="/url/" alt="alt text" /></p> +<p><img src="/url/" alt="alt text" title="with a title" />.</p> +<p><img src="" alt="Empty" /></p> +<p><img src="http://example.com/(parens).jpg" alt="this is a stupid URL" /></p> +<p><img 
src="/url/" alt="alt text" /></p> +<p><img src="/url/" alt="alt text" title="Title here" /></p> diff --git a/oldtests/Original/Images.markdown b/oldtests/Original/Images.markdown new file mode 100644 index 0000000..5707590 --- /dev/null +++ b/oldtests/Original/Images.markdown @@ -0,0 +1,26 @@ +![Alt text](/path/to/img.jpg) + +![Alt text](/path/to/img.jpg "Optional title") + +Inline within a paragraph: [alt text](/url/). + +![alt text](/url/ "title preceded by two spaces") + +![alt text](/url/ "title has spaces afterward" ) + +![alt text](</url/>) + +![alt text](</url/> "with a title"). + +![Empty]() + +![this is a stupid URL](http://example.com/(parens).jpg) + + +![alt text][foo] + + [foo]: /url/ + +![alt text][bar] + + [bar]: /url/ "Title here"
\ No newline at end of file diff --git a/oldtests/Original/Inline_HTML_Advanced.html b/oldtests/Original/Inline_HTML_Advanced.html new file mode 100644 index 0000000..631c135 --- /dev/null +++ b/oldtests/Original/Inline_HTML_Advanced.html @@ -0,0 +1,23 @@ +<p>Simple block on one line:</p> +<div>foo</div> +<p>And nested without indentation:</p> +<div> +<div> +<div> +foo +</div> +<div style=">"/> +</div> +<div>bar</div> +</div> +<p>And with attributes:</p> +<div> + <div id="foo"> + </div> +</div> +<p>This was broken in 1.0.2b7:</p> +<div class="inlinepage"> +<div class="toggleableend"> +foo +</div> +</div> diff --git a/oldtests/Original/Inline_HTML_Advanced.markdown b/oldtests/Original/Inline_HTML_Advanced.markdown new file mode 100644 index 0000000..3633f81 --- /dev/null +++ b/oldtests/Original/Inline_HTML_Advanced.markdown @@ -0,0 +1,30 @@ +Simple block on one line: + +<div>foo</div> + +And nested without indentation: + +<div> +<div> +<div> +foo +</div> +<div style=">"/> +</div> +<div>bar</div> +</div> + +And with attributes: + +<div> + <div id="foo"> + </div> +</div> + +This was broken in 1.0.2b7: + +<div class="inlinepage"> +<div class="toggleableend"> +foo +</div> +</div> diff --git a/oldtests/Original/Inline_HTML_Simple.html b/oldtests/Original/Inline_HTML_Simple.html new file mode 100644 index 0000000..923a18c --- /dev/null +++ b/oldtests/Original/Inline_HTML_Simple.html @@ -0,0 +1,45 @@ +<p>Here's a simple block:</p> +<div> + foo +</div> +<p>This should be a code block, though:</p> +<pre><code><div> + foo +</div> +</code></pre> +<p>As should this:</p> +<pre><code><div>foo</div> +</code></pre> +<p>Now, nested:</p> +<div> + <div> + <div> + foo + </div> + </div> +</div> +<p>This should just be an HTML comment:</p> +<!-- Comment --> +<p>Multiline:</p> +<!-- +Blah +Blah +--> +<p>Code block:</p> +<pre><code><!-- Comment --> +</code></pre> +<p>Just plain comment, with trailing spaces on the line:</p> +<!-- foo --> +<p>Code:</p> +<pre><code><hr /> +</code></pre> 
+<p>Hr's:</p> +<hr> +<hr/> +<hr /> +<hr> +<hr/> +<hr /> +<hr class="foo" id="bar" /> +<hr class="foo" id="bar"/> +<hr class="foo" id="bar" > diff --git a/oldtests/Original/Inline_HTML_Simple.markdown b/oldtests/Original/Inline_HTML_Simple.markdown new file mode 100644 index 0000000..14aa2dc --- /dev/null +++ b/oldtests/Original/Inline_HTML_Simple.markdown @@ -0,0 +1,69 @@ +Here's a simple block: + +<div> + foo +</div> + +This should be a code block, though: + + <div> + foo + </div> + +As should this: + + <div>foo</div> + +Now, nested: + +<div> + <div> + <div> + foo + </div> + </div> +</div> + +This should just be an HTML comment: + +<!-- Comment --> + +Multiline: + +<!-- +Blah +Blah +--> + +Code block: + + <!-- Comment --> + +Just plain comment, with trailing spaces on the line: + +<!-- foo --> + +Code: + + <hr /> + +Hr's: + +<hr> + +<hr/> + +<hr /> + +<hr> + +<hr/> + +<hr /> + +<hr class="foo" id="bar" /> + +<hr class="foo" id="bar"/> + +<hr class="foo" id="bar" > + diff --git a/oldtests/Original/Inline_HTML_comments.html b/oldtests/Original/Inline_HTML_comments.html new file mode 100644 index 0000000..ebc4818 --- /dev/null +++ b/oldtests/Original/Inline_HTML_comments.html @@ -0,0 +1,8 @@ +<p>Paragraph one.</p> +<!-- This is a simple comment --> +<!-- + This is another comment. +--> +<p>Paragraph two.</p> +<!-- one comment block -- -- with two comments --> +<p>The end.</p> diff --git a/oldtests/Original/Inline_HTML_comments.markdown b/oldtests/Original/Inline_HTML_comments.markdown new file mode 100644 index 0000000..41d830d --- /dev/null +++ b/oldtests/Original/Inline_HTML_comments.markdown @@ -0,0 +1,13 @@ +Paragraph one. + +<!-- This is a simple comment --> + +<!-- + This is another comment. +--> + +Paragraph two. + +<!-- one comment block -- -- with two comments --> + +The end. 
diff --git a/oldtests/Original/Links_inline_style.html b/oldtests/Original/Links_inline_style.html new file mode 100644 index 0000000..feb4637 --- /dev/null +++ b/oldtests/Original/Links_inline_style.html @@ -0,0 +1,12 @@ +<p>Just a <a href="/url/">URL</a>.</p> +<p><a href="/url/" title="title">URL and title</a>.</p> +<p><a href="/url/" title="title preceded by two spaces">URL and title</a>.</p> +<p><a href="/url/" title="title preceded by a tab">URL and title</a>.</p> +<p><a href="/url/" title="title has spaces afterward">URL and title</a>.</p> +<p><a href="/url/">URL wrapped in angle brackets</a>.</p> +<p><a href="/url/" title="Here's the title">URL w/ angle brackets + title</a>.</p> +<p><a href="">Empty</a>.</p> +<p><a href="http://en.wikipedia.org/wiki/WIMP_(computing)">With parens in the URL</a></p> +<p>(With outer parens and <a href="/foo(bar)">parens in url</a>)</p> +<p><a href="/foo(bar)" title="and a title">With parens in the URL</a></p> +<p>(With outer parens and <a href="/foo(bar)" title="and a title">parens in url</a>)</p> diff --git a/oldtests/Original/Links_inline_style.markdown b/oldtests/Original/Links_inline_style.markdown new file mode 100644 index 0000000..aba9658 --- /dev/null +++ b/oldtests/Original/Links_inline_style.markdown @@ -0,0 +1,24 @@ +Just a [URL](/url/). + +[URL and title](/url/ "title"). + +[URL and title](/url/ "title preceded by two spaces"). + +[URL and title](/url/ "title preceded by a tab"). + +[URL and title](/url/ "title has spaces afterward" ). + +[URL wrapped in angle brackets](</url/>). + +[URL w/ angle brackets + title](</url/> "Here's the title"). + +[Empty](). 
+ +[With parens in the URL](http://en.wikipedia.org/wiki/WIMP_(computing)) + +(With outer parens and [parens in url](/foo(bar))) + + +[With parens in the URL](/foo(bar) "and a title") + +(With outer parens and [parens in url](/foo(bar) "and a title")) diff --git a/oldtests/Original/Links_reference_style.html b/oldtests/Original/Links_reference_style.html new file mode 100644 index 0000000..6d78b96 --- /dev/null +++ b/oldtests/Original/Links_reference_style.html @@ -0,0 +1,28 @@ +<p>Foo <a href="/url/" title="Title">bar</a>.</p> +<p>Foo <a href="/url/" title="Title">bar</a>.</p> +<p>Foo <a href="/url/" title="Title">bar</a>.</p> +<p>With <a href="/url/">embedded [brackets]</a>.</p> +<p>Indented <a href="/url">once</a>.</p> +<p>Indented <a href="/url">twice</a>.</p> +<p>Indented <a href="/url">thrice</a>.</p> +<p>Indented [four][] times.</p> +<pre><code>[four]: /url +</code></pre> +<hr /> +<p><a href="foo">this</a> should work</p> +<p>So should <a href="foo">this</a>.</p> +<p>And <a href="foo">this</a>.</p> +<p>And <a href="foo">this</a>.</p> +<p>And <a href="foo">this</a>.</p> +<p>But not [that] [].</p> +<p>Nor [that][].</p> +<p>Nor [that].</p> +<p>[Something in brackets like <a href="foo">this</a> should work]</p> +<p>[Same with <a href="foo">this</a>.]</p> +<p>In this case, <a href="/somethingelse/">this</a> points to something else.</p> +<p>Backslashing should suppress [this] and [this].</p> +<hr /> +<p>Here's one where the <a href="/url/">link +breaks</a> across lines.</p> +<p>Here's another where the <a href="/url/">link +breaks</a> across lines, but with a line-ending space.</p> diff --git a/oldtests/Original/Links_reference_style.markdown b/oldtests/Original/Links_reference_style.markdown new file mode 100644 index 0000000..341ec88 --- /dev/null +++ b/oldtests/Original/Links_reference_style.markdown @@ -0,0 +1,71 @@ +Foo [bar] [1]. + +Foo [bar][1]. + +Foo [bar] +[1]. + +[1]: /url/ "Title" + + +With [embedded [brackets]] [b]. + + +Indented [once][]. 
+ +Indented [twice][]. + +Indented [thrice][]. + +Indented [four][] times. + + [once]: /url + + [twice]: /url + + [thrice]: /url + + [four]: /url + + +[b]: /url/ + +* * * + +[this] [this] should work + +So should [this][this]. + +And [this] []. + +And [this][]. + +And [this]. + +But not [that] []. + +Nor [that][]. + +Nor [that]. + +[Something in brackets like [this][] should work] + +[Same with [this].] + +In this case, [this](/somethingelse/) points to something else. + +Backslashing should suppress \[this] and [this\]. + +[this]: foo + + +* * * + +Here's one where the [link +breaks] across lines. + +Here's another where the [link +breaks] across lines, but with a line-ending space. + + +[link breaks]: /url/ diff --git a/oldtests/Original/Links_shortcut_references.html b/oldtests/Original/Links_shortcut_references.html new file mode 100644 index 0000000..8163ade --- /dev/null +++ b/oldtests/Original/Links_shortcut_references.html @@ -0,0 +1,6 @@ +<p>This is the <a href="/simple">simple case</a>.</p> +<p>This one has a <a href="/foo">line +break</a>.</p> +<p>This one has a <a href="/foo">line +break</a> with a line-ending space.</p> +<p><a href="/that">this</a> and the <a href="/other">other</a></p> diff --git a/oldtests/Original/Links_shortcut_references.markdown b/oldtests/Original/Links_shortcut_references.markdown new file mode 100644 index 0000000..8c44c98 --- /dev/null +++ b/oldtests/Original/Links_shortcut_references.markdown @@ -0,0 +1,20 @@ +This is the [simple case]. + +[simple case]: /simple + + + +This one has a [line +break]. + +This one has a [line +break] with a line-ending space. 
+ +[line break]: /foo + + +[this] [that] and the [other] + +[this]: /this +[that]: /that +[other]: /other diff --git a/oldtests/Original/Literal_quotes_in_titles.html b/oldtests/Original/Literal_quotes_in_titles.html new file mode 100644 index 0000000..62e8641 --- /dev/null +++ b/oldtests/Original/Literal_quotes_in_titles.html @@ -0,0 +1,2 @@ +<p>Foo <a href="/url/" title="Title with "quotes" inside">bar</a>.</p> +<p>Foo <a href="/url/" title="Title with "quotes" inside">bar</a>.</p> diff --git a/oldtests/Original/Literal_quotes_in_titles.markdown b/oldtests/Original/Literal_quotes_in_titles.markdown new file mode 100644 index 0000000..29d0e42 --- /dev/null +++ b/oldtests/Original/Literal_quotes_in_titles.markdown @@ -0,0 +1,7 @@ +Foo [bar][]. + +Foo [bar](/url/ "Title with "quotes" inside"). + + + [bar]: /url/ "Title with "quotes" inside" + diff --git a/oldtests/Original/Markdown_Documentation_Basics.html b/oldtests/Original/Markdown_Documentation_Basics.html new file mode 100644 index 0000000..0dee67f --- /dev/null +++ b/oldtests/Original/Markdown_Documentation_Basics.html @@ -0,0 +1,242 @@ +<h1>Markdown: Basics</h1> +<ul id="ProjectSubmenu"> + <li><a href="/projects/markdown/" title="Markdown Project Page">Main</a></li> + <li><a class="selected" title="Markdown Basics">Basics</a></li> + <li><a href="/projects/markdown/syntax" title="Markdown Syntax Documentation">Syntax</a></li> + <li><a href="/projects/markdown/license" title="Pricing and License Information">License</a></li> + <li><a href="/projects/markdown/dingus" title="Online Markdown Web Form">Dingus</a></li> +</ul> +<h2>Getting the Gist of Markdown's Formatting Syntax</h2> +<p>This page offers a brief overview of what it's like to use Markdown. +The <a href="/projects/markdown/syntax" title="Markdown Syntax">syntax page</a> provides complete, detailed documentation for +every feature, but Markdown should be very easy to pick up simply by +looking at a few examples of it in action. 
The examples on this page +are written in a before/after style, showing example syntax and the +HTML output produced by Markdown.</p> +<p>It's also helpful to simply try Markdown out; the <a href="/projects/markdown/dingus" title="Markdown Dingus">Dingus</a> is a +web application that allows you type your own Markdown-formatted text +and translate it to XHTML.</p> +<p><strong>Note:</strong> This document is itself written using Markdown; you +can <a href="/projects/markdown/basics.text">see the source for it by adding '.text' to the URL</a>.</p> +<h2>Paragraphs, Headers, Blockquotes</h2> +<p>A paragraph is simply one or more consecutive lines of text, separated +by one or more blank lines. (A blank line is any line that looks like a +blank line -- a line containing nothing spaces or tabs is considered +blank.) Normal paragraphs should not be intended with spaces or tabs.</p> +<p>Markdown offers two styles of headers: <em>Setext</em> and <em>atx</em>. +Setext-style headers for <code><h1></code> and <code><h2></code> are created by +"underlining" with equal signs (<code>=</code>) and hyphens (<code>-</code>), respectively. +To create an atx-style header, you put 1-6 hash marks (<code>#</code>) at the +beginning of the line -- the number of hashes equals the resulting +HTML header level.</p> +<p>Blockquotes are indicated using email-style '<code>></code>' angle brackets.</p> +<p>Markdown:</p> +<pre><code>A First Level Header +==================== + +A Second Level Header +--------------------- + +Now is the time for all good men to come to +the aid of their country. This is just a +regular paragraph. + +The quick brown fox jumped over the lazy +dog's back. + +### Header 3 + +> This is a blockquote. +> +> This is the second paragraph in the blockquote. 
+> +> ## This is an H2 in a blockquote +</code></pre> +<p>Output:</p> +<pre><code><h1>A First Level Header</h1> + +<h2>A Second Level Header</h2> + +<p>Now is the time for all good men to come to +the aid of their country. This is just a +regular paragraph.</p> + +<p>The quick brown fox jumped over the lazy +dog's back.</p> + +<h3>Header 3</h3> + +<blockquote> + <p>This is a blockquote.</p> + + <p>This is the second paragraph in the blockquote.</p> + + <h2>This is an H2 in a blockquote</h2> +</blockquote> +</code></pre> +<h3>Phrase Emphasis</h3> +<p>Markdown uses asterisks and underscores to indicate spans of emphasis.</p> +<p>Markdown:</p> +<pre><code>Some of these words *are emphasized*. +Some of these words _are emphasized also_. + +Use two asterisks for **strong emphasis**. +Or, if you prefer, __use two underscores instead__. +</code></pre> +<p>Output:</p> +<pre><code><p>Some of these words <em>are emphasized</em>. +Some of these words <em>are emphasized also</em>.</p> + +<p>Use two asterisks for <strong>strong emphasis</strong>. +Or, if you prefer, <strong>use two underscores instead</strong>.</p> +</code></pre> +<h2>Lists</h2> +<p>Unordered (bulleted) lists use asterisks, pluses, and hyphens (<code>*</code>, +<code>+</code>, and <code>-</code>) as list markers. These three markers are +interchangable; this:</p> +<pre><code>* Candy. +* Gum. +* Booze. +</code></pre> +<p>this:</p> +<pre><code>+ Candy. ++ Gum. ++ Booze. +</code></pre> +<p>and this:</p> +<pre><code>- Candy. +- Gum. +- Booze. +</code></pre> +<p>all produce the same output:</p> +<pre><code><ul> +<li>Candy.</li> +<li>Gum.</li> +<li>Booze.</li> +</ul> +</code></pre> +<p>Ordered (numbered) lists use regular numbers, followed by periods, as +list markers:</p> +<pre><code>1. Red +2. Green +3. 
Blue +</code></pre> +<p>Output:</p> +<pre><code><ol> +<li>Red</li> +<li>Green</li> +<li>Blue</li> +</ol> +</code></pre> +<p>If you put blank lines between items, you'll get <code><p></code> tags for the +list item text. You can create multi-paragraph list items by indenting +the paragraphs by 4 spaces or 1 tab:</p> +<pre><code>* A list item. + + With multiple paragraphs. + +* Another item in the list. +</code></pre> +<p>Output:</p> +<pre><code><ul> +<li><p>A list item.</p> +<p>With multiple paragraphs.</p></li> +<li><p>Another item in the list.</p></li> +</ul> +</code></pre> +<h3>Links</h3> +<p>Markdown supports two styles for creating links: <em>inline</em> and +<em>reference</em>. With both styles, you use square brackets to delimit the +text you want to turn into a link.</p> +<p>Inline-style links use parentheses immediately after the link text. +For example:</p> +<pre><code>This is an [example link](http://example.com/). +</code></pre> +<p>Output:</p> +<pre><code><p>This is an <a href="http://example.com/"> +example link</a>.</p> +</code></pre> +<p>Optionally, you may include a title attribute in the parentheses:</p> +<pre><code>This is an [example link](http://example.com/ "With a Title"). +</code></pre> +<p>Output:</p> +<pre><code><p>This is an <a href="http://example.com/" title="With a Title"> +example link</a>.</p> +</code></pre> +<p>Reference-style links allow you to refer to your links by names, which +you define elsewhere in your document:</p> +<pre><code>I get 10 times more traffic from [Google][1] than from +[Yahoo][2] or [MSN][3]. 
+ +[1]: http://google.com/ "Google" +[2]: http://search.yahoo.com/ "Yahoo Search" +[3]: http://search.msn.com/ "MSN Search" +</code></pre> +<p>Output:</p> +<pre><code><p>I get 10 times more traffic from <a href="http://google.com/" +title="Google">Google</a> than from <a href="http://search.yahoo.com/" +title="Yahoo Search">Yahoo</a> or <a href="http://search.msn.com/" +title="MSN Search">MSN</a>.</p> +</code></pre> +<p>The title attribute is optional. Link names may contain letters, +numbers and spaces, but are <em>not</em> case sensitive:</p> +<pre><code>I start my morning with a cup of coffee and +[The New York Times][NY Times]. + +[ny times]: http://www.nytimes.com/ +</code></pre> +<p>Output:</p> +<pre><code><p>I start my morning with a cup of coffee and +<a href="http://www.nytimes.com/">The New York Times</a>.</p> +</code></pre> +<h3>Images</h3> +<p>Image syntax is very much like link syntax.</p> +<p>Inline (titles are optional):</p> +<pre><code>![alt text](/path/to/img.jpg "Title") +</code></pre> +<p>Reference-style:</p> +<pre><code>![alt text][id] + +[id]: /path/to/img.jpg "Title" +</code></pre> +<p>Both of the above examples produce the same output:</p> +<pre><code><img src="/path/to/img.jpg" alt="alt text" title="Title" /> +</code></pre> +<h3>Code</h3> +<p>In a regular paragraph, you can create code span by wrapping text in +backtick quotes. Any ampersands (<code>&</code>) and angle brackets (<code><</code> or +<code>></code>) will automatically be translated into HTML entities. This makes +it easy to use Markdown to write about HTML example code:</p> +<pre><code>I strongly recommend against using any `<blink>` tags. + +I wish SmartyPants used named entities like `&mdash;` +instead of decimal-encoded entites like `&#8212;`. 
+</code></pre> +<p>Output:</p> +<pre><code><p>I strongly recommend against using any +<code>&lt;blink&gt;</code> tags.</p> + +<p>I wish SmartyPants used named entities like +<code>&amp;mdash;</code> instead of decimal-encoded +entites like <code>&amp;#8212;</code>.</p> +</code></pre> +<p>To specify an entire block of pre-formatted code, indent every line of +the block by 4 spaces or 1 tab. Just like with code spans, <code>&</code>, <code><</code>, +and <code>></code> characters will be escaped automatically.</p> +<p>Markdown:</p> +<pre><code>If you want your page to validate under XHTML 1.0 Strict, +you've got to put paragraph tags in your blockquotes: + + <blockquote> + <p>For example.</p> + </blockquote> +</code></pre> +<p>Output:</p> +<pre><code><p>If you want your page to validate under XHTML 1.0 Strict, +you've got to put paragraph tags in your blockquotes:</p> + +<pre><code>&lt;blockquote&gt; + &lt;p&gt;For example.&lt;/p&gt; +&lt;/blockquote&gt; +</code></pre> +</code></pre> diff --git a/oldtests/Original/Markdown_Documentation_Basics.markdown b/oldtests/Original/Markdown_Documentation_Basics.markdown new file mode 100644 index 0000000..24eba65 --- /dev/null +++ b/oldtests/Original/Markdown_Documentation_Basics.markdown @@ -0,0 +1,306 @@ +Markdown: Basics +================ + +<ul id="ProjectSubmenu"> + <li><a href="/projects/markdown/" title="Markdown Project Page">Main</a></li> + <li><a class="selected" title="Markdown Basics">Basics</a></li> + <li><a href="/projects/markdown/syntax" title="Markdown Syntax Documentation">Syntax</a></li> + <li><a href="/projects/markdown/license" title="Pricing and License Information">License</a></li> + <li><a href="/projects/markdown/dingus" title="Online Markdown Web Form">Dingus</a></li> +</ul> + + +Getting the Gist of Markdown's Formatting Syntax +------------------------------------------------ + +This page offers a brief overview of what it's like to use Markdown. 
+The [syntax page] [s] provides complete, detailed documentation for +every feature, but Markdown should be very easy to pick up simply by +looking at a few examples of it in action. The examples on this page +are written in a before/after style, showing example syntax and the +HTML output produced by Markdown. + +It's also helpful to simply try Markdown out; the [Dingus] [d] is a +web application that allows you type your own Markdown-formatted text +and translate it to XHTML. + +**Note:** This document is itself written using Markdown; you +can [see the source for it by adding '.text' to the URL] [src]. + + [s]: /projects/markdown/syntax "Markdown Syntax" + [d]: /projects/markdown/dingus "Markdown Dingus" + [src]: /projects/markdown/basics.text + + +## Paragraphs, Headers, Blockquotes ## + +A paragraph is simply one or more consecutive lines of text, separated +by one or more blank lines. (A blank line is any line that looks like a +blank line -- a line containing nothing spaces or tabs is considered +blank.) Normal paragraphs should not be intended with spaces or tabs. + +Markdown offers two styles of headers: *Setext* and *atx*. +Setext-style headers for `<h1>` and `<h2>` are created by +"underlining" with equal signs (`=`) and hyphens (`-`), respectively. +To create an atx-style header, you put 1-6 hash marks (`#`) at the +beginning of the line -- the number of hashes equals the resulting +HTML header level. + +Blockquotes are indicated using email-style '`>`' angle brackets. + +Markdown: + + A First Level Header + ==================== + + A Second Level Header + --------------------- + + Now is the time for all good men to come to + the aid of their country. This is just a + regular paragraph. + + The quick brown fox jumped over the lazy + dog's back. + + ### Header 3 + + > This is a blockquote. + > + > This is the second paragraph in the blockquote. 
+ > + > ## This is an H2 in a blockquote + + +Output: + + <h1>A First Level Header</h1> + + <h2>A Second Level Header</h2> + + <p>Now is the time for all good men to come to + the aid of their country. This is just a + regular paragraph.</p> + + <p>The quick brown fox jumped over the lazy + dog's back.</p> + + <h3>Header 3</h3> + + <blockquote> + <p>This is a blockquote.</p> + + <p>This is the second paragraph in the blockquote.</p> + + <h2>This is an H2 in a blockquote</h2> + </blockquote> + + + +### Phrase Emphasis ### + +Markdown uses asterisks and underscores to indicate spans of emphasis. + +Markdown: + + Some of these words *are emphasized*. + Some of these words _are emphasized also_. + + Use two asterisks for **strong emphasis**. + Or, if you prefer, __use two underscores instead__. + +Output: + + <p>Some of these words <em>are emphasized</em>. + Some of these words <em>are emphasized also</em>.</p> + + <p>Use two asterisks for <strong>strong emphasis</strong>. + Or, if you prefer, <strong>use two underscores instead</strong>.</p> + + + +## Lists ## + +Unordered (bulleted) lists use asterisks, pluses, and hyphens (`*`, +`+`, and `-`) as list markers. These three markers are +interchangable; this: + + * Candy. + * Gum. + * Booze. + +this: + + + Candy. + + Gum. + + Booze. + +and this: + + - Candy. + - Gum. + - Booze. + +all produce the same output: + + <ul> + <li>Candy.</li> + <li>Gum.</li> + <li>Booze.</li> + </ul> + +Ordered (numbered) lists use regular numbers, followed by periods, as +list markers: + + 1. Red + 2. Green + 3. Blue + +Output: + + <ol> + <li>Red</li> + <li>Green</li> + <li>Blue</li> + </ol> + +If you put blank lines between items, you'll get `<p>` tags for the +list item text. You can create multi-paragraph list items by indenting +the paragraphs by 4 spaces or 1 tab: + + * A list item. + + With multiple paragraphs. + + * Another item in the list. 
+ +Output: + + <ul> + <li><p>A list item.</p> + <p>With multiple paragraphs.</p></li> + <li><p>Another item in the list.</p></li> + </ul> + + + +### Links ### + +Markdown supports two styles for creating links: *inline* and +*reference*. With both styles, you use square brackets to delimit the +text you want to turn into a link. + +Inline-style links use parentheses immediately after the link text. +For example: + + This is an [example link](http://example.com/). + +Output: + + <p>This is an <a href="http://example.com/"> + example link</a>.</p> + +Optionally, you may include a title attribute in the parentheses: + + This is an [example link](http://example.com/ "With a Title"). + +Output: + + <p>This is an <a href="http://example.com/" title="With a Title"> + example link</a>.</p> + +Reference-style links allow you to refer to your links by names, which +you define elsewhere in your document: + + I get 10 times more traffic from [Google][1] than from + [Yahoo][2] or [MSN][3]. + + [1]: http://google.com/ "Google" + [2]: http://search.yahoo.com/ "Yahoo Search" + [3]: http://search.msn.com/ "MSN Search" + +Output: + + <p>I get 10 times more traffic from <a href="http://google.com/" + title="Google">Google</a> than from <a href="http://search.yahoo.com/" + title="Yahoo Search">Yahoo</a> or <a href="http://search.msn.com/" + title="MSN Search">MSN</a>.</p> + +The title attribute is optional. Link names may contain letters, +numbers and spaces, but are *not* case sensitive: + + I start my morning with a cup of coffee and + [The New York Times][NY Times]. + + [ny times]: http://www.nytimes.com/ + +Output: + + <p>I start my morning with a cup of coffee and + <a href="http://www.nytimes.com/">The New York Times</a>.</p> + + +### Images ### + +Image syntax is very much like link syntax. 
+ +Inline (titles are optional): + + ![alt text](/path/to/img.jpg "Title") + +Reference-style: + + ![alt text][id] + + [id]: /path/to/img.jpg "Title" + +Both of the above examples produce the same output: + + <img src="/path/to/img.jpg" alt="alt text" title="Title" /> + + + +### Code ### + +In a regular paragraph, you can create code span by wrapping text in +backtick quotes. Any ampersands (`&`) and angle brackets (`<` or +`>`) will automatically be translated into HTML entities. This makes +it easy to use Markdown to write about HTML example code: + + I strongly recommend against using any `<blink>` tags. + + I wish SmartyPants used named entities like `&mdash;` + instead of decimal-encoded entites like `&#8212;`. + +Output: + + <p>I strongly recommend against using any + <code>&lt;blink&gt;</code> tags.</p> + + <p>I wish SmartyPants used named entities like + <code>&amp;mdash;</code> instead of decimal-encoded + entites like <code>&amp;#8212;</code>.</p> + + +To specify an entire block of pre-formatted code, indent every line of +the block by 4 spaces or 1 tab. Just like with code spans, `&`, `<`, +and `>` characters will be escaped automatically. 
+ +Markdown: + + If you want your page to validate under XHTML 1.0 Strict, + you've got to put paragraph tags in your blockquotes: + + <blockquote> + <p>For example.</p> + </blockquote> + +Output: + + <p>If you want your page to validate under XHTML 1.0 Strict, + you've got to put paragraph tags in your blockquotes:</p> + + <pre><code><blockquote> + <p>For example.</p> + </blockquote> + </code></pre> diff --git a/oldtests/Original/Markdown_Documentation_Syntax.html b/oldtests/Original/Markdown_Documentation_Syntax.html new file mode 100644 index 0000000..f379dcf --- /dev/null +++ b/oldtests/Original/Markdown_Documentation_Syntax.html @@ -0,0 +1,708 @@ +<h1>Markdown: Syntax</h1> +<ul id="ProjectSubmenu"> + <li><a href="/projects/markdown/" title="Markdown Project Page">Main</a></li> + <li><a href="/projects/markdown/basics" title="Markdown Basics">Basics</a></li> + <li><a class="selected" title="Markdown Syntax Documentation">Syntax</a></li> + <li><a href="/projects/markdown/license" title="Pricing and License Information">License</a></li> + <li><a href="/projects/markdown/dingus" title="Online Markdown Web Form">Dingus</a></li> +</ul> +<ul> +<li><a href="#overview">Overview</a> +<ul> +<li><a href="#philosophy">Philosophy</a></li> +<li><a href="#html">Inline HTML</a></li> +<li><a href="#autoescape">Automatic Escaping for Special Characters</a></li> +</ul></li> +<li><a href="#block">Block Elements</a> +<ul> +<li><a href="#p">Paragraphs and Line Breaks</a></li> +<li><a href="#header">Headers</a></li> +<li><a href="#blockquote">Blockquotes</a></li> +<li><a href="#list">Lists</a></li> +<li><a href="#precode">Code Blocks</a></li> +<li><a href="#hr">Horizontal Rules</a></li> +</ul></li> +<li><a href="#span">Span Elements</a> +<ul> +<li><a href="#link">Links</a></li> +<li><a href="#em">Emphasis</a></li> +<li><a href="#code">Code</a></li> +<li><a href="#img">Images</a></li> +</ul></li> +<li><a href="#misc">Miscellaneous</a> +<ul> +<li><a href="#backslash">Backslash 
Escapes</a></li> +<li><a href="#autolink">Automatic Links</a></li> +</ul></li> +</ul> +<p><strong>Note:</strong> This document is itself written using Markdown; you +can <a href="/projects/markdown/syntax.text">see the source for it by adding '.text' to the URL</a>.</p> +<hr /> +<h2 id="overview">Overview</h2> +<h3 id="philosophy">Philosophy</h3> +<p>Markdown is intended to be as easy-to-read and easy-to-write as is feasible.</p> +<p>Readability, however, is emphasized above all else. A Markdown-formatted +document should be publishable as-is, as plain text, without looking +like it's been marked up with tags or formatting instructions. While +Markdown's syntax has been influenced by several existing text-to-HTML +filters -- including <a href="http://docutils.sourceforge.net/mirror/setext.html">Setext</a>, <a href="http://www.aaronsw.com/2002/atx/">atx</a>, <a href="http://textism.com/tools/textile/">Textile</a>, <a href="http://docutils.sourceforge.net/rst.html">reStructuredText</a>, +<a href="http://www.triptico.com/software/grutatxt.html">Grutatext</a>, and <a href="http://ettext.taint.org/doc/">EtText</a> -- the single biggest source of +inspiration for Markdown's syntax is the format of plain text email.</p> +<p>To this end, Markdown's syntax is comprised entirely of punctuation +characters, which punctuation characters have been carefully chosen so +as to look like what they mean. E.g., asterisks around a word actually +look like *emphasis*. Markdown lists look like, well, lists. Even +blockquotes look like quoted passages of text, assuming you've ever +used email.</p> +<h3 id="html">Inline HTML</h3> +<p>Markdown's syntax is intended for one purpose: to be used as a +format for <em>writing</em> for the web.</p> +<p>Markdown is not a replacement for HTML, or even close to it. Its +syntax is very small, corresponding only to a very small subset of +HTML tags. The idea is <em>not</em> to create a syntax that makes it easier +to insert HTML tags. 
In my opinion, HTML tags are already easy to +insert. The idea for Markdown is to make it easy to read, write, and +edit prose. HTML is a <em>publishing</em> format; Markdown is a <em>writing</em> +format. Thus, Markdown's formatting syntax only addresses issues that +can be conveyed in plain text.</p> +<p>For any markup that is not covered by Markdown's syntax, you simply +use HTML itself. There's no need to preface it or delimit it to +indicate that you're switching from Markdown to HTML; you just use +the tags.</p> +<p>The only restrictions are that block-level HTML elements -- e.g. <code><div></code>, +<code><table></code>, <code><pre></code>, <code><p></code>, etc. -- must be separated from surrounding +content by blank lines, and the start and end tags of the block should +not be indented with tabs or spaces. Markdown is smart enough not +to add extra (unwanted) <code><p></code> tags around HTML block-level tags.</p> +<p>For example, to add an HTML table to a Markdown article:</p> +<pre><code>This is a regular paragraph. + +<table> + <tr> + <td>Foo</td> + </tr> +</table> + +This is another regular paragraph. +</code></pre> +<p>Note that Markdown formatting syntax is not processed within block-level +HTML tags. E.g., you can't use Markdown-style <code>*emphasis*</code> inside an +HTML block.</p> +<p>Span-level HTML tags -- e.g. <code><span></code>, <code><cite></code>, or <code><del></code> -- can be +used anywhere in a Markdown paragraph, list item, or header. If you +want, you can even use HTML tags instead of Markdown formatting; e.g. if +you'd prefer to use HTML <code><a></code> or <code><img></code> tags instead of Markdown's +link or image syntax, go right ahead.</p> +<p>Unlike block-level HTML tags, Markdown syntax <em>is</em> processed within +span-level tags.</p> +<h3 id="autoescape">Automatic Escaping for Special Characters</h3> +<p>In HTML, there are two characters that demand special treatment: <code><</code> +and <code>&</code>. 
Left angle brackets are used to start tags; ampersands are +used to denote HTML entities. If you want to use them as literal +characters, you must escape them as entities, e.g. <code>&lt;</code>, and +<code>&amp;</code>.</p> +<p>Ampersands in particular are bedeviling for web writers. If you want to +write about 'AT&T', you need to write '<code>AT&amp;T</code>'. You even need to +escape ampersands within URLs. Thus, if you want to link to:</p> +<pre><code>http://images.google.com/images?num=30&q=larry+bird +</code></pre> +<p>you need to encode the URL as:</p> +<pre><code>http://images.google.com/images?num=30&amp;q=larry+bird +</code></pre> +<p>in your anchor tag <code>href</code> attribute. Needless to say, this is easy to +forget, and is probably the single most common source of HTML validation +errors in otherwise well-marked-up web sites.</p> +<p>Markdown allows you to use these characters naturally, taking care of +all the necessary escaping for you. If you use an ampersand as part of +an HTML entity, it remains unchanged; otherwise it will be translated +into <code>&amp;</code>.</p> +<p>So, if you want to include a copyright symbol in your article, you can write:</p> +<pre><code>&copy; +</code></pre> +<p>and Markdown will leave it alone. But if you write:</p> +<pre><code>AT&T +</code></pre> +<p>Markdown will translate it to:</p> +<pre><code>AT&amp;T +</code></pre> +<p>Similarly, because Markdown supports <a href="#html">inline HTML</a>, if you use +angle brackets as delimiters for HTML tags, Markdown will treat them as +such. But if you write:</p> +<pre><code>4 < 5 +</code></pre> +<p>Markdown will translate it to:</p> +<pre><code>4 &lt; 5 +</code></pre> +<p>However, inside Markdown code spans and blocks, angle brackets and +ampersands are <em>always</em> encoded automatically. This makes it easy to use +Markdown to write about HTML code. 
(As opposed to raw HTML, which is a +terrible format for writing about HTML syntax, because every single <code><</code> +and <code>&</code> in your example code needs to be escaped.)</p> +<hr /> +<h2 id="block">Block Elements</h2> +<h3 id="p">Paragraphs and Line Breaks</h3> +<p>A paragraph is simply one or more consecutive lines of text, separated +by one or more blank lines. (A blank line is any line that looks like a +blank line -- a line containing nothing but spaces or tabs is considered +blank.) Normal paragraphs should not be intended with spaces or tabs.</p> +<p>The implication of the "one or more consecutive lines of text" rule is +that Markdown supports "hard-wrapped" text paragraphs. This differs +significantly from most other text-to-HTML formatters (including Movable +Type's "Convert Line Breaks" option) which translate every line break +character in a paragraph into a <code><br /></code> tag.</p> +<p>When you <em>do</em> want to insert a <code><br /></code> break tag using Markdown, you +end a line with two or more spaces, then type return.</p> +<p>Yes, this takes a tad more effort to create a <code><br /></code>, but a simplistic +"every line break is a <code><br /></code>" rule wouldn't work for Markdown. +Markdown's email-style <a href="#blockquote">blockquoting</a> and multi-paragraph <a href="#list">list items</a> +work best -- and look better -- when you format them with hard breaks.</p> +<h3 id="header">Headers</h3> +<p>Markdown supports two styles of headers, <a href="http://docutils.sourceforge.net/mirror/setext.html">Setext</a> and <a href="http://www.aaronsw.com/2002/atx/">atx</a>.</p> +<p>Setext-style headers are "underlined" using equal signs (for first-level +headers) and dashes (for second-level headers). 
For example:</p> +<pre><code>This is an H1 +============= + +This is an H2 +------------- +</code></pre> +<p>Any number of underlining <code>=</code>'s or <code>-</code>'s will work.</p> +<p>Atx-style headers use 1-6 hash characters at the start of the line, +corresponding to header levels 1-6. For example:</p> +<pre><code># This is an H1 + +## This is an H2 + +###### This is an H6 +</code></pre> +<p>Optionally, you may "close" atx-style headers. This is purely +cosmetic -- you can use this if you think it looks better. The +closing hashes don't even need to match the number of hashes +used to open the header. (The number of opening hashes +determines the header level.) :</p> +<pre><code># This is an H1 # + +## This is an H2 ## + +### This is an H3 ###### +</code></pre> +<h3 id="blockquote">Blockquotes</h3> +<p>Markdown uses email-style <code>></code> characters for blockquoting. If you're +familiar with quoting passages of text in an email message, then you +know how to create a blockquote in Markdown. It looks best if you hard +wrap the text and put a <code>></code> before every line:</p> +<pre><code>> This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet, +> consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus. +> Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus. +> +> Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse +> id sem consectetuer libero luctus adipiscing. +</code></pre> +<p>Markdown allows you to be lazy and only put the <code>></code> before the first +line of a hard-wrapped paragraph:</p> +<pre><code>> This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet, +consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus. +Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus. + +> Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse +id sem consectetuer libero luctus adipiscing. +</code></pre> +<p>Blockquotes can be nested (i.e. 
a blockquote-in-a-blockquote) by +adding additional levels of <code>></code>:</p> +<pre><code>> This is the first level of quoting. +> +> > This is nested blockquote. +> +> Back to the first level. +</code></pre> +<p>Blockquotes can contain other Markdown elements, including headers, lists, +and code blocks:</p> +<pre><code>> ## This is a header. +> +> 1. This is the first list item. +> 2. This is the second list item. +> +> Here's some example code: +> +> return shell_exec("echo $input | $markdown_script"); +</code></pre> +<p>Any decent text editor should make email-style quoting easy. For +example, with BBEdit, you can make a selection and choose Increase +Quote Level from the Text menu.</p> +<h3 id="list">Lists</h3> +<p>Markdown supports ordered (numbered) and unordered (bulleted) lists.</p> +<p>Unordered lists use asterisks, pluses, and hyphens -- interchangably +-- as list markers:</p> +<pre><code>* Red +* Green +* Blue +</code></pre> +<p>is equivalent to:</p> +<pre><code>+ Red ++ Green ++ Blue +</code></pre> +<p>and:</p> +<pre><code>- Red +- Green +- Blue +</code></pre> +<p>Ordered lists use numbers followed by periods:</p> +<pre><code>1. Bird +2. McHale +3. Parish +</code></pre> +<p>It's important to note that the actual numbers you use to mark the +list have no effect on the HTML output Markdown produces. The HTML +Markdown produces from the above list is:</p> +<pre><code><ol> +<li>Bird</li> +<li>McHale</li> +<li>Parish</li> +</ol> +</code></pre> +<p>If you instead wrote the list in Markdown like this:</p> +<pre><code>1. Bird +1. McHale +1. Parish +</code></pre> +<p>or even:</p> +<pre><code>3. Bird +1. McHale +8. Parish +</code></pre> +<p>you'd get the exact same HTML output. The point is, if you want to, +you can use ordinal numbers in your ordered Markdown lists, so that +the numbers in your source match the numbers in your published HTML. 
+But if you want to be lazy, you don't have to.</p> +<p>If you do use lazy list numbering, however, you should still start the +list with the number 1. At some point in the future, Markdown may support +starting ordered lists at an arbitrary number.</p> +<p>List markers typically start at the left margin, but may be indented by +up to three spaces. List markers must be followed by one or more spaces +or a tab.</p> +<p>To make lists look nice, you can wrap items with hanging indents:</p> +<pre><code>* Lorem ipsum dolor sit amet, consectetuer adipiscing elit. + Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi, + viverra nec, fringilla in, laoreet vitae, risus. +* Donec sit amet nisl. Aliquam semper ipsum sit amet velit. + Suspendisse id sem consectetuer libero luctus adipiscing. +</code></pre> +<p>But if you want to be lazy, you don't have to:</p> +<pre><code>* Lorem ipsum dolor sit amet, consectetuer adipiscing elit. +Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi, +viverra nec, fringilla in, laoreet vitae, risus. +* Donec sit amet nisl. Aliquam semper ipsum sit amet velit. +Suspendisse id sem consectetuer libero luctus adipiscing. +</code></pre> +<p>If list items are separated by blank lines, Markdown will wrap the +items in <code><p></code> tags in the HTML output. For example, this input:</p> +<pre><code>* Bird +* Magic +</code></pre> +<p>will turn into:</p> +<pre><code><ul> +<li>Bird</li> +<li>Magic</li> +</ul> +</code></pre> +<p>But this:</p> +<pre><code>* Bird + +* Magic +</code></pre> +<p>will turn into:</p> +<pre><code><ul> +<li><p>Bird</p></li> +<li><p>Magic</p></li> +</ul> +</code></pre> +<p>List items may consist of multiple paragraphs. Each subsequent +paragraph in a list item must be intended by either 4 spaces +or one tab:</p> +<pre><code>1. This is a list item with two paragraphs. Lorem ipsum dolor + sit amet, consectetuer adipiscing elit. Aliquam hendrerit + mi posuere lectus. 
+ + Vestibulum enim wisi, viverra nec, fringilla in, laoreet + vitae, risus. Donec sit amet nisl. Aliquam semper ipsum + sit amet velit. + +2. Suspendisse id sem consectetuer libero luctus adipiscing. +</code></pre> +<p>It looks nice if you indent every line of the subsequent +paragraphs, but here again, Markdown will allow you to be +lazy:</p> +<pre><code>* This is a list item with two paragraphs. + + This is the second paragraph in the list item. You're +only required to indent the first line. Lorem ipsum dolor +sit amet, consectetuer adipiscing elit. + +* Another item in the same list. +</code></pre> +<p>To put a blockquote within a list item, the blockquote's <code>></code> +delimiters need to be indented:</p> +<pre><code>* A list item with a blockquote: + + > This is a blockquote + > inside a list item. +</code></pre> +<p>To put a code block within a list item, the code block needs +to be indented <em>twice</em> -- 8 spaces or two tabs:</p> +<pre><code>* A list item with a code block: + + <code goes here> +</code></pre> +<p>It's worth noting that it's possible to trigger an ordered list by +accident, by writing something like this:</p> +<pre><code>1986. What a great season. +</code></pre> +<p>In other words, a <em>number-period-space</em> sequence at the beginning of a +line. To avoid this, you can backslash-escape the period:</p> +<pre><code>1986\. What a great season. +</code></pre> +<h3 id="precode">Code Blocks</h3> +<p>Pre-formatted code blocks are used for writing about programming or +markup source code. Rather than forming normal paragraphs, the lines +of a code block are interpreted literally. Markdown wraps a code block +in both <code><pre></code> and <code><code></code> tags.</p> +<p>To produce a code block in Markdown, simply indent every line of the +block by at least 4 spaces or 1 tab. For example, given this input:</p> +<pre><code>This is a normal paragraph: + + This is a code block. 
+</code></pre> +<p>Markdown will generate:</p> +<pre><code><p>This is a normal paragraph:</p> + +<pre><code>This is a code block. +</code></pre> +</code></pre> +<p>One level of indentation -- 4 spaces or 1 tab -- is removed from each +line of the code block. For example, this:</p> +<pre><code>Here is an example of AppleScript: + + tell application "Foo" + beep + end tell +</code></pre> +<p>will turn into:</p> +<pre><code><p>Here is an example of AppleScript:</p> + +<pre><code>tell application "Foo" + beep +end tell +</code></pre> +</code></pre> +<p>A code block continues until it reaches a line that is not indented +(or the end of the article).</p> +<p>Within a code block, ampersands (<code>&</code>) and angle brackets (<code><</code> and <code>></code>) +are automatically converted into HTML entities. This makes it very +easy to include example HTML source code using Markdown -- just paste +it and indent it, and Markdown will handle the hassle of encoding the +ampersands and angle brackets. For example, this:</p> +<pre><code> <div class="footer"> + &copy; 2004 Foo Corporation + </div> +</code></pre> +<p>will turn into:</p> +<pre><code><pre><code>&lt;div class="footer"&gt; + &amp;copy; 2004 Foo Corporation +&lt;/div&gt; +</code></pre> +</code></pre> +<p>Regular Markdown syntax is not processed within code blocks. E.g., +asterisks are just literal asterisks within a code block. This means +it's also easy to use Markdown to write about Markdown's own syntax.</p> +<h3 id="hr">Horizontal Rules</h3> +<p>You can produce a horizontal rule tag (<code><hr /></code>) by placing three or +more hyphens, asterisks, or underscores on a line by themselves. If you +wish, you may use spaces between the hyphens or asterisks. 
Each of the +following lines will produce a horizontal rule:</p> +<pre><code>* * * + +*** + +***** + +- - - + +--------------------------------------- + +_ _ _ +</code></pre> +<hr /> +<h2 id="span">Span Elements</h2> +<h3 id="link">Links</h3> +<p>Markdown supports two style of links: <em>inline</em> and <em>reference</em>.</p> +<p>In both styles, the link text is delimited by [square brackets].</p> +<p>To create an inline link, use a set of regular parentheses immediately +after the link text's closing square bracket. Inside the parentheses, +put the URL where you want the link to point, along with an <em>optional</em> +title for the link, surrounded in quotes. For example:</p> +<pre><code>This is [an example](http://example.com/ "Title") inline link. + +[This link](http://example.net/) has no title attribute. +</code></pre> +<p>Will produce:</p> +<pre><code><p>This is <a href="http://example.com/" title="Title"> +an example</a> inline link.</p> + +<p><a href="http://example.net/">This link</a> has no +title attribute.</p> +</code></pre> +<p>If you're referring to a local resource on the same server, you can +use relative paths:</p> +<pre><code>See my [About](/about/) page for details. +</code></pre> +<p>Reference-style links use a second set of square brackets, inside +which you place a label of your choosing to identify the link:</p> +<pre><code>This is [an example][id] reference-style link. +</code></pre> +<p>You can optionally use a space to separate the sets of brackets:</p> +<pre><code>This is [an example] [id] reference-style link. 
+</code></pre> +<p>Then, anywhere in the document, you define your link label like this, +on a line by itself:</p> +<pre><code>[id]: http://example.com/ "Optional Title Here" +</code></pre> +<p>That is:</p> +<ul> +<li>Square brackets containing the link identifier (optionally +indented from the left margin using up to three spaces);</li> +<li>followed by a colon;</li> +<li>followed by one or more spaces (or tabs);</li> +<li>followed by the URL for the link;</li> +<li>optionally followed by a title attribute for the link, enclosed +in double or single quotes.</li> +</ul> +<p>The link URL may, optionally, be surrounded by angle brackets:</p> +<pre><code>[id]: <http://example.com/> "Optional Title Here" +</code></pre> +<p>You can put the title attribute on the next line and use extra spaces +or tabs for padding, which tends to look better with longer URLs:</p> +<pre><code>[id]: http://example.com/longish/path/to/resource/here + "Optional Title Here" +</code></pre> +<p>Link definitions are only used for creating links during Markdown +processing, and are stripped from your document in the HTML output.</p> +<p>Link definition names may constist of letters, numbers, spaces, and punctuation -- but they are <em>not</em> case sensitive. E.g. these two links:</p> +<pre><code>[link text][a] +[link text][A] +</code></pre> +<p>are equivalent.</p> +<p>The <em>implicit link name</em> shortcut allows you to omit the name of the +link, in which case the link text itself is used as the name. +Just use an empty set of square brackets -- e.g., to link the word +"Google" to the google.com web site, you could simply write:</p> +<pre><code>[Google][] +</code></pre> +<p>And then define the link:</p> +<pre><code>[Google]: http://google.com/ +</code></pre> +<p>Because link names may contain spaces, this shortcut even works for +multiple words in the link text:</p> +<pre><code>Visit [Daring Fireball][] for more information. 
+</code></pre> +<p>And then define the link:</p> +<pre><code>[Daring Fireball]: http://daringfireball.net/ +</code></pre> +<p>Link definitions can be placed anywhere in your Markdown document. I +tend to put them immediately after each paragraph in which they're +used, but if you want, you can put them all at the end of your +document, sort of like footnotes.</p> +<p>Here's an example of reference links in action:</p> +<pre><code>I get 10 times more traffic from [Google] [1] than from +[Yahoo] [2] or [MSN] [3]. + + [1]: http://google.com/ "Google" + [2]: http://search.yahoo.com/ "Yahoo Search" + [3]: http://search.msn.com/ "MSN Search" +</code></pre> +<p>Using the implicit link name shortcut, you could instead write:</p> +<pre><code>I get 10 times more traffic from [Google][] than from +[Yahoo][] or [MSN][]. + + [google]: http://google.com/ "Google" + [yahoo]: http://search.yahoo.com/ "Yahoo Search" + [msn]: http://search.msn.com/ "MSN Search" +</code></pre> +<p>Both of the above examples will produce the following HTML output:</p> +<pre><code><p>I get 10 times more traffic from <a href="http://google.com/" +title="Google">Google</a> than from +<a href="http://search.yahoo.com/" title="Yahoo Search">Yahoo</a> +or <a href="http://search.msn.com/" title="MSN Search">MSN</a>.</p> +</code></pre> +<p>For comparison, here is the same paragraph written using +Markdown's inline link style:</p> +<pre><code>I get 10 times more traffic from [Google](http://google.com/ "Google") +than from [Yahoo](http://search.yahoo.com/ "Yahoo Search") or +[MSN](http://search.msn.com/ "MSN Search"). +</code></pre> +<p>The point of reference-style links is not that they're easier to +write. The point is that with reference-style links, your document +source is vastly more readable. Compare the above examples: using +reference-style links, the paragraph itself is only 81 characters +long; with inline-style links, it's 176 characters; and as raw HTML, +it's 234 characters. 
In the raw HTML, there's more markup than there +is text.</p> +<p>With Markdown's reference-style links, a source document much more +closely resembles the final output, as rendered in a browser. By +allowing you to move the markup-related metadata out of the paragraph, +you can add links without interrupting the narrative flow of your +prose.</p> +<h3 id="em">Emphasis</h3> +<p>Markdown treats asterisks (<code>*</code>) and underscores (<code>_</code>) as indicators of +emphasis. Text wrapped with one <code>*</code> or <code>_</code> will be wrapped with an +HTML <code><em></code> tag; double <code>*</code>'s or <code>_</code>'s will be wrapped with an HTML +<code><strong></code> tag. E.g., this input:</p> +<pre><code>*single asterisks* + +_single underscores_ + +**double asterisks** + +__double underscores__ +</code></pre> +<p>will produce:</p> +<pre><code><em>single asterisks</em> + +<em>single underscores</em> + +<strong>double asterisks</strong> + +<strong>double underscores</strong> +</code></pre> +<p>You can use whichever style you prefer; the lone restriction is that +the same character must be used to open and close an emphasis span.</p> +<p>Emphasis can be used in the middle of a word:</p> +<pre><code>un*fucking*believable +</code></pre> +<p>But if you surround an <code>*</code> or <code>_</code> with spaces, it'll be treated as a +literal asterisk or underscore.</p> +<p>To produce a literal asterisk or underscore at a position where it +would otherwise be used as an emphasis delimiter, you can backslash +escape it:</p> +<pre><code>\*this text is surrounded by literal asterisks\* +</code></pre> +<h3 id="code">Code</h3> +<p>To indicate a span of code, wrap it with backtick quotes (<code>`</code>). +Unlike a pre-formatted code block, a code span indicates code within a +normal paragraph. For example:</p> +<pre><code>Use the `printf()` function. 
+</code></pre> +<p>will produce:</p> +<pre><code><p>Use the <code>printf()</code> function.</p> +</code></pre> +<p>To include a literal backtick character within a code span, you can use +multiple backticks as the opening and closing delimiters:</p> +<pre><code>``There is a literal backtick (`) here.`` +</code></pre> +<p>which will produce this:</p> +<pre><code><p><code>There is a literal backtick (`) here.</code></p> +</code></pre> +<p>The backtick delimiters surrounding a code span may include spaces -- +one after the opening, one before the closing. This allows you to place +literal backtick characters at the beginning or end of a code span:</p> +<pre><code>A single backtick in a code span: `` ` `` + +A backtick-delimited string in a code span: `` `foo` `` +</code></pre> +<p>will produce:</p> +<pre><code><p>A single backtick in a code span: <code>`</code></p> + +<p>A backtick-delimited string in a code span: <code>`foo`</code></p> +</code></pre> +<p>With a code span, ampersands and angle brackets are encoded as HTML +entities automatically, which makes it easy to include example HTML +tags. Markdown will turn this:</p> +<pre><code>Please don't use any `<blink>` tags. +</code></pre> +<p>into:</p> +<pre><code><p>Please don't use any <code>&lt;blink&gt;</code> tags.</p> +</code></pre> +<p>You can write this:</p> +<pre><code>`&#8212;` is the decimal-encoded equivalent of `&mdash;`. 
+</code></pre> +<p>to produce:</p> +<pre><code><p><code>&amp;#8212;</code> is the decimal-encoded +equivalent of <code>&amp;mdash;</code>.</p> +</code></pre> +<h3 id="img">Images</h3> +<p>Admittedly, it's fairly difficult to devise a "natural" syntax for +placing images into a plain text document format.</p> +<p>Markdown uses an image syntax that is intended to resemble the syntax +for links, allowing for two styles: <em>inline</em> and <em>reference</em>.</p> +<p>Inline image syntax looks like this:</p> +<pre><code>![Alt text](/path/to/img.jpg) + +![Alt text](/path/to/img.jpg "Optional title") +</code></pre> +<p>That is:</p> +<ul> +<li>An exclamation mark: <code>!</code>;</li> +<li>followed by a set of square brackets, containing the <code>alt</code> +attribute text for the image;</li> +<li>followed by a set of parentheses, containing the URL or path to +the image, and an optional <code>title</code> attribute enclosed in double +or single quotes.</li> +</ul> +<p>Reference-style image syntax looks like this:</p> +<pre><code>![Alt text][id] +</code></pre> +<p>Where "id" is the name of a defined image reference. Image references +are defined using syntax identical to link references:</p> +<pre><code>[id]: url/to/image "Optional title attribute" +</code></pre> +<p>As of this writing, Markdown has no syntax for specifying the +dimensions of an image; if this is important to you, you can simply +use regular HTML <code><img></code> tags.</p> +<hr /> +<h2 id="misc">Miscellaneous</h2> +<h3 id="autolink">Automatic Links</h3> +<p>Markdown supports a shortcut style for creating "automatic" links for URLs and email addresses: simply surround the URL or email address with angle brackets. 
What this means is that if you want to show the actual text of a URL or email address, and also have it be a clickable link, you can do this:</p> +<pre><code><http://example.com/> +</code></pre> +<p>Markdown will turn this into:</p> +<pre><code><a href="http://example.com/">http://example.com/</a> +</code></pre> +<p>Automatic links for email addresses work similarly, except that +Markdown will also perform a bit of randomized decimal and hex +entity-encoding to help obscure your address from address-harvesting +spambots. For example, Markdown will turn this:</p> +<pre><code><address@example.com> +</code></pre> +<p>into something like this:</p> +<pre><code><a href="&#x6D;&#x61;i&#x6C;&#x74;&#x6F;:&#x61;&#x64;&#x64;&#x72;&#x65; +&#115;&#115;&#64;&#101;&#120;&#x61;&#109;&#x70;&#x6C;e&#x2E;&#99;&#111; +&#109;">&#x61;&#x64;&#x64;&#x72;&#x65;&#115;&#115;&#64;&#101;&#120;&#x61; +&#109;&#x70;&#x6C;e&#x2E;&#99;&#111;&#109;</a> +</code></pre> +<p>which will render in a browser as a clickable link to "address@example.com".</p> +<p>(This sort of entity-encoding trick will indeed fool many, if not +most, address-harvesting bots, but it definitely won't fool all of +them. It's better than nothing, but an address published in this way +will probably eventually start receiving spam.)</p> +<h3 id="backslash">Backslash Escapes</h3> +<p>Markdown allows you to use backslash escapes to generate literal +characters which would otherwise have special meaning in Markdown's +formatting syntax. For example, if you wanted to surround a word with +literal asterisks (instead of an HTML <code><em></code> tag), you can backslashes +before the asterisks, like this:</p> +<pre><code>\*literal asterisks\* +</code></pre> +<p>Markdown provides backslash escapes for the following characters:</p> +<pre><code>\ backslash +` backtick +* asterisk +_ underscore +{} curly braces +[] square brackets +() parentheses +# hash mark ++ plus sign +- minus sign (hyphen) +. dot +! 
exclamation mark +</code></pre> diff --git a/oldtests/Original/Markdown_Documentation_Syntax.markdown b/oldtests/Original/Markdown_Documentation_Syntax.markdown new file mode 100644 index 0000000..57360a1 --- /dev/null +++ b/oldtests/Original/Markdown_Documentation_Syntax.markdown @@ -0,0 +1,888 @@ +Markdown: Syntax +================ + +<ul id="ProjectSubmenu"> + <li><a href="/projects/markdown/" title="Markdown Project Page">Main</a></li> + <li><a href="/projects/markdown/basics" title="Markdown Basics">Basics</a></li> + <li><a class="selected" title="Markdown Syntax Documentation">Syntax</a></li> + <li><a href="/projects/markdown/license" title="Pricing and License Information">License</a></li> + <li><a href="/projects/markdown/dingus" title="Online Markdown Web Form">Dingus</a></li> +</ul> + + +* [Overview](#overview) + * [Philosophy](#philosophy) + * [Inline HTML](#html) + * [Automatic Escaping for Special Characters](#autoescape) +* [Block Elements](#block) + * [Paragraphs and Line Breaks](#p) + * [Headers](#header) + * [Blockquotes](#blockquote) + * [Lists](#list) + * [Code Blocks](#precode) + * [Horizontal Rules](#hr) +* [Span Elements](#span) + * [Links](#link) + * [Emphasis](#em) + * [Code](#code) + * [Images](#img) +* [Miscellaneous](#misc) + * [Backslash Escapes](#backslash) + * [Automatic Links](#autolink) + + +**Note:** This document is itself written using Markdown; you +can [see the source for it by adding '.text' to the URL][src]. + + [src]: /projects/markdown/syntax.text + +* * * + +<h2 id="overview">Overview</h2> + +<h3 id="philosophy">Philosophy</h3> + +Markdown is intended to be as easy-to-read and easy-to-write as is feasible. + +Readability, however, is emphasized above all else. A Markdown-formatted +document should be publishable as-is, as plain text, without looking +like it's been marked up with tags or formatting instructions. 
While +Markdown's syntax has been influenced by several existing text-to-HTML +filters -- including [Setext] [1], [atx] [2], [Textile] [3], [reStructuredText] [4], +[Grutatext] [5], and [EtText] [6] -- the single biggest source of +inspiration for Markdown's syntax is the format of plain text email. + + [1]: http://docutils.sourceforge.net/mirror/setext.html + [2]: http://www.aaronsw.com/2002/atx/ + [3]: http://textism.com/tools/textile/ + [4]: http://docutils.sourceforge.net/rst.html + [5]: http://www.triptico.com/software/grutatxt.html + [6]: http://ettext.taint.org/doc/ + +To this end, Markdown's syntax is comprised entirely of punctuation +characters, which punctuation characters have been carefully chosen so +as to look like what they mean. E.g., asterisks around a word actually +look like \*emphasis\*. Markdown lists look like, well, lists. Even +blockquotes look like quoted passages of text, assuming you've ever +used email. + + + +<h3 id="html">Inline HTML</h3> + +Markdown's syntax is intended for one purpose: to be used as a +format for *writing* for the web. + +Markdown is not a replacement for HTML, or even close to it. Its +syntax is very small, corresponding only to a very small subset of +HTML tags. The idea is *not* to create a syntax that makes it easier +to insert HTML tags. In my opinion, HTML tags are already easy to +insert. The idea for Markdown is to make it easy to read, write, and +edit prose. HTML is a *publishing* format; Markdown is a *writing* +format. Thus, Markdown's formatting syntax only addresses issues that +can be conveyed in plain text. + +For any markup that is not covered by Markdown's syntax, you simply +use HTML itself. There's no need to preface it or delimit it to +indicate that you're switching from Markdown to HTML; you just use +the tags. + +The only restrictions are that block-level HTML elements -- e.g. `<div>`, +`<table>`, `<pre>`, `<p>`, etc. 
-- must be separated from surrounding +content by blank lines, and the start and end tags of the block should +not be indented with tabs or spaces. Markdown is smart enough not +to add extra (unwanted) `<p>` tags around HTML block-level tags. + +For example, to add an HTML table to a Markdown article: + + This is a regular paragraph. + + <table> + <tr> + <td>Foo</td> + </tr> + </table> + + This is another regular paragraph. + +Note that Markdown formatting syntax is not processed within block-level +HTML tags. E.g., you can't use Markdown-style `*emphasis*` inside an +HTML block. + +Span-level HTML tags -- e.g. `<span>`, `<cite>`, or `<del>` -- can be +used anywhere in a Markdown paragraph, list item, or header. If you +want, you can even use HTML tags instead of Markdown formatting; e.g. if +you'd prefer to use HTML `<a>` or `<img>` tags instead of Markdown's +link or image syntax, go right ahead. + +Unlike block-level HTML tags, Markdown syntax *is* processed within +span-level tags. + + +<h3 id="autoescape">Automatic Escaping for Special Characters</h3> + +In HTML, there are two characters that demand special treatment: `<` +and `&`. Left angle brackets are used to start tags; ampersands are +used to denote HTML entities. If you want to use them as literal +characters, you must escape them as entities, e.g. `&lt;`, and +`&amp;`. + +Ampersands in particular are bedeviling for web writers. If you want to +write about 'AT&T', you need to write '`AT&amp;T`'. You even need to +escape ampersands within URLs. Thus, if you want to link to: + + http://images.google.com/images?num=30&q=larry+bird + +you need to encode the URL as: + + http://images.google.com/images?num=30&amp;q=larry+bird + +in your anchor tag `href` attribute. Needless to say, this is easy to +forget, and is probably the single most common source of HTML validation +errors in otherwise well-marked-up web sites. + +Markdown allows you to use these characters naturally, taking care of +all the necessary escaping for you.
If you use an ampersand as part of +an HTML entity, it remains unchanged; otherwise it will be translated +into `&amp;`. + +So, if you want to include a copyright symbol in your article, you can write: + + &copy; + +and Markdown will leave it alone. But if you write: + + AT&T + +Markdown will translate it to: + + AT&amp;T + +Similarly, because Markdown supports [inline HTML](#html), if you use +angle brackets as delimiters for HTML tags, Markdown will treat them as +such. But if you write: + + 4 < 5 + +Markdown will translate it to: + + 4 &lt; 5 + +However, inside Markdown code spans and blocks, angle brackets and +ampersands are *always* encoded automatically. This makes it easy to use +Markdown to write about HTML code. (As opposed to raw HTML, which is a +terrible format for writing about HTML syntax, because every single `<` +and `&` in your example code needs to be escaped.) + + +* * * + + +<h2 id="block">Block Elements</h2> + + +<h3 id="p">Paragraphs and Line Breaks</h3> + +A paragraph is simply one or more consecutive lines of text, separated +by one or more blank lines. (A blank line is any line that looks like a +blank line -- a line containing nothing but spaces or tabs is considered +blank.) Normal paragraphs should not be intended with spaces or tabs. + +The implication of the "one or more consecutive lines of text" rule is +that Markdown supports "hard-wrapped" text paragraphs. This differs +significantly from most other text-to-HTML formatters (including Movable +Type's "Convert Line Breaks" option) which translate every line break +character in a paragraph into a `<br />` tag. + +When you *do* want to insert a `<br />` break tag using Markdown, you +end a line with two or more spaces, then type return. + +Yes, this takes a tad more effort to create a `<br />`, but a simplistic +"every line break is a `<br />`" rule wouldn't work for Markdown.
+Markdown's email-style [blockquoting][bq] and multi-paragraph [list items][l] +work best -- and look better -- when you format them with hard breaks. + + [bq]: #blockquote + [l]: #list + + + +<h3 id="header">Headers</h3> + +Markdown supports two styles of headers, [Setext] [1] and [atx] [2]. + +Setext-style headers are "underlined" using equal signs (for first-level +headers) and dashes (for second-level headers). For example: + + This is an H1 + ============= + + This is an H2 + ------------- + +Any number of underlining `=`'s or `-`'s will work. + +Atx-style headers use 1-6 hash characters at the start of the line, +corresponding to header levels 1-6. For example: + + # This is an H1 + + ## This is an H2 + + ###### This is an H6 + +Optionally, you may "close" atx-style headers. This is purely +cosmetic -- you can use this if you think it looks better. The +closing hashes don't even need to match the number of hashes +used to open the header. (The number of opening hashes +determines the header level.) : + + # This is an H1 # + + ## This is an H2 ## + + ### This is an H3 ###### + + +<h3 id="blockquote">Blockquotes</h3> + +Markdown uses email-style `>` characters for blockquoting. If you're +familiar with quoting passages of text in an email message, then you +know how to create a blockquote in Markdown. It looks best if you hard +wrap the text and put a `>` before every line: + + > This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet, + > consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus. + > Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus. + > + > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse + > id sem consectetuer libero luctus adipiscing. + +Markdown allows you to be lazy and only put the `>` before the first +line of a hard-wrapped paragraph: + + > This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet, + consectetuer adipiscing elit. 
Aliquam hendrerit mi posuere lectus. + Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus. + + > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse + id sem consectetuer libero luctus adipiscing. + +Blockquotes can be nested (i.e. a blockquote-in-a-blockquote) by +adding additional levels of `>`: + + > This is the first level of quoting. + > + > > This is nested blockquote. + > + > Back to the first level. + +Blockquotes can contain other Markdown elements, including headers, lists, +and code blocks: + + > ## This is a header. + > + > 1. This is the first list item. + > 2. This is the second list item. + > + > Here's some example code: + > + > return shell_exec("echo $input | $markdown_script"); + +Any decent text editor should make email-style quoting easy. For +example, with BBEdit, you can make a selection and choose Increase +Quote Level from the Text menu. + + +<h3 id="list">Lists</h3> + +Markdown supports ordered (numbered) and unordered (bulleted) lists. + +Unordered lists use asterisks, pluses, and hyphens -- interchangably +-- as list markers: + + * Red + * Green + * Blue + +is equivalent to: + + + Red + + Green + + Blue + +and: + + - Red + - Green + - Blue + +Ordered lists use numbers followed by periods: + + 1. Bird + 2. McHale + 3. Parish + +It's important to note that the actual numbers you use to mark the +list have no effect on the HTML output Markdown produces. The HTML +Markdown produces from the above list is: + + <ol> + <li>Bird</li> + <li>McHale</li> + <li>Parish</li> + </ol> + +If you instead wrote the list in Markdown like this: + + 1. Bird + 1. McHale + 1. Parish + +or even: + + 3. Bird + 1. McHale + 8. Parish + +you'd get the exact same HTML output. The point is, if you want to, +you can use ordinal numbers in your ordered Markdown lists, so that +the numbers in your source match the numbers in your published HTML. +But if you want to be lazy, you don't have to. 
+ +If you do use lazy list numbering, however, you should still start the +list with the number 1. At some point in the future, Markdown may support +starting ordered lists at an arbitrary number. + +List markers typically start at the left margin, but may be indented by +up to three spaces. List markers must be followed by one or more spaces +or a tab. + +To make lists look nice, you can wrap items with hanging indents: + + * Lorem ipsum dolor sit amet, consectetuer adipiscing elit. + Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi, + viverra nec, fringilla in, laoreet vitae, risus. + * Donec sit amet nisl. Aliquam semper ipsum sit amet velit. + Suspendisse id sem consectetuer libero luctus adipiscing. + +But if you want to be lazy, you don't have to: + + * Lorem ipsum dolor sit amet, consectetuer adipiscing elit. + Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi, + viverra nec, fringilla in, laoreet vitae, risus. + * Donec sit amet nisl. Aliquam semper ipsum sit amet velit. + Suspendisse id sem consectetuer libero luctus adipiscing. + +If list items are separated by blank lines, Markdown will wrap the +items in `<p>` tags in the HTML output. For example, this input: + + * Bird + * Magic + +will turn into: + + <ul> + <li>Bird</li> + <li>Magic</li> + </ul> + +But this: + + * Bird + + * Magic + +will turn into: + + <ul> + <li><p>Bird</p></li> + <li><p>Magic</p></li> + </ul> + +List items may consist of multiple paragraphs. Each subsequent +paragraph in a list item must be intended by either 4 spaces +or one tab: + + 1. This is a list item with two paragraphs. Lorem ipsum dolor + sit amet, consectetuer adipiscing elit. Aliquam hendrerit + mi posuere lectus. + + Vestibulum enim wisi, viverra nec, fringilla in, laoreet + vitae, risus. Donec sit amet nisl. Aliquam semper ipsum + sit amet velit. + + 2. Suspendisse id sem consectetuer libero luctus adipiscing. 
+ +It looks nice if you indent every line of the subsequent +paragraphs, but here again, Markdown will allow you to be +lazy: + + * This is a list item with two paragraphs. + + This is the second paragraph in the list item. You're + only required to indent the first line. Lorem ipsum dolor + sit amet, consectetuer adipiscing elit. + + * Another item in the same list. + +To put a blockquote within a list item, the blockquote's `>` +delimiters need to be indented: + + * A list item with a blockquote: + + > This is a blockquote + > inside a list item. + +To put a code block within a list item, the code block needs +to be indented *twice* -- 8 spaces or two tabs: + + * A list item with a code block: + + <code goes here> + + +It's worth noting that it's possible to trigger an ordered list by +accident, by writing something like this: + + 1986. What a great season. + +In other words, a *number-period-space* sequence at the beginning of a +line. To avoid this, you can backslash-escape the period: + + 1986\. What a great season. + + + +<h3 id="precode">Code Blocks</h3> + +Pre-formatted code blocks are used for writing about programming or +markup source code. Rather than forming normal paragraphs, the lines +of a code block are interpreted literally. Markdown wraps a code block +in both `<pre>` and `<code>` tags. + +To produce a code block in Markdown, simply indent every line of the +block by at least 4 spaces or 1 tab. For example, given this input: + + This is a normal paragraph: + + This is a code block. + +Markdown will generate: + + <p>This is a normal paragraph:</p> + + <pre><code>This is a code block. + </code></pre> + +One level of indentation -- 4 spaces or 1 tab -- is removed from each +line of the code block. 
For example, this: + + Here is an example of AppleScript: + + tell application "Foo" + beep + end tell + +will turn into: + + <p>Here is an example of AppleScript:</p> + + <pre><code>tell application "Foo" + beep + end tell + </code></pre> + +A code block continues until it reaches a line that is not indented +(or the end of the article). + +Within a code block, ampersands (`&`) and angle brackets (`<` and `>`) +are automatically converted into HTML entities. This makes it very +easy to include example HTML source code using Markdown -- just paste +it and indent it, and Markdown will handle the hassle of encoding the +ampersands and angle brackets. For example, this: + + <div class="footer"> + &copy; 2004 Foo Corporation + </div> + +will turn into: + + <pre><code>&lt;div class="footer"&gt; + &amp;copy; 2004 Foo Corporation + &lt;/div&gt; + </code></pre> + +Regular Markdown syntax is not processed within code blocks. E.g., +asterisks are just literal asterisks within a code block. This means +it's also easy to use Markdown to write about Markdown's own syntax. + + + +<h3 id="hr">Horizontal Rules</h3> + +You can produce a horizontal rule tag (`<hr />`) by placing three or +more hyphens, asterisks, or underscores on a line by themselves. If you +wish, you may use spaces between the hyphens or asterisks. Each of the +following lines will produce a horizontal rule: + + * * * + + *** + + ***** + + - - - + + --------------------------------------- + + _ _ _ + + +* * * + +<h2 id="span">Span Elements</h2> + +<h3 id="link">Links</h3> + +Markdown supports two style of links: *inline* and *reference*. + +In both styles, the link text is delimited by [square brackets]. + +To create an inline link, use a set of regular parentheses immediately +after the link text's closing square bracket. Inside the parentheses, +put the URL where you want the link to point, along with an *optional* +title for the link, surrounded in quotes.
For example: + + This is [an example](http://example.com/ "Title") inline link. + + [This link](http://example.net/) has no title attribute. + +Will produce: + + <p>This is <a href="http://example.com/" title="Title"> + an example</a> inline link.</p> + + <p><a href="http://example.net/">This link</a> has no + title attribute.</p> + +If you're referring to a local resource on the same server, you can +use relative paths: + + See my [About](/about/) page for details. + +Reference-style links use a second set of square brackets, inside +which you place a label of your choosing to identify the link: + + This is [an example][id] reference-style link. + +You can optionally use a space to separate the sets of brackets: + + This is [an example] [id] reference-style link. + +Then, anywhere in the document, you define your link label like this, +on a line by itself: + + [id]: http://example.com/ "Optional Title Here" + +That is: + +* Square brackets containing the link identifier (optionally + indented from the left margin using up to three spaces); +* followed by a colon; +* followed by one or more spaces (or tabs); +* followed by the URL for the link; +* optionally followed by a title attribute for the link, enclosed + in double or single quotes. + +The link URL may, optionally, be surrounded by angle brackets: + + [id]: <http://example.com/> "Optional Title Here" + +You can put the title attribute on the next line and use extra spaces +or tabs for padding, which tends to look better with longer URLs: + + [id]: http://example.com/longish/path/to/resource/here + "Optional Title Here" + +Link definitions are only used for creating links during Markdown +processing, and are stripped from your document in the HTML output. + +Link definition names may constist of letters, numbers, spaces, and punctuation -- but they are *not* case sensitive. E.g. these two links: + + [link text][a] + [link text][A] + +are equivalent. 
+ +The *implicit link name* shortcut allows you to omit the name of the +link, in which case the link text itself is used as the name. +Just use an empty set of square brackets -- e.g., to link the word +"Google" to the google.com web site, you could simply write: + + [Google][] + +And then define the link: + + [Google]: http://google.com/ + +Because link names may contain spaces, this shortcut even works for +multiple words in the link text: + + Visit [Daring Fireball][] for more information. + +And then define the link: + + [Daring Fireball]: http://daringfireball.net/ + +Link definitions can be placed anywhere in your Markdown document. I +tend to put them immediately after each paragraph in which they're +used, but if you want, you can put them all at the end of your +document, sort of like footnotes. + +Here's an example of reference links in action: + + I get 10 times more traffic from [Google] [1] than from + [Yahoo] [2] or [MSN] [3]. + + [1]: http://google.com/ "Google" + [2]: http://search.yahoo.com/ "Yahoo Search" + [3]: http://search.msn.com/ "MSN Search" + +Using the implicit link name shortcut, you could instead write: + + I get 10 times more traffic from [Google][] than from + [Yahoo][] or [MSN][]. + + [google]: http://google.com/ "Google" + [yahoo]: http://search.yahoo.com/ "Yahoo Search" + [msn]: http://search.msn.com/ "MSN Search" + +Both of the above examples will produce the following HTML output: + + <p>I get 10 times more traffic from <a href="http://google.com/" + title="Google">Google</a> than from + <a href="http://search.yahoo.com/" title="Yahoo Search">Yahoo</a> + or <a href="http://search.msn.com/" title="MSN Search">MSN</a>.</p> + +For comparison, here is the same paragraph written using +Markdown's inline link style: + + I get 10 times more traffic from [Google](http://google.com/ "Google") + than from [Yahoo](http://search.yahoo.com/ "Yahoo Search") or + [MSN](http://search.msn.com/ "MSN Search"). 
+ +The point of reference-style links is not that they're easier to +write. The point is that with reference-style links, your document +source is vastly more readable. Compare the above examples: using +reference-style links, the paragraph itself is only 81 characters +long; with inline-style links, it's 176 characters; and as raw HTML, +it's 234 characters. In the raw HTML, there's more markup than there +is text. + +With Markdown's reference-style links, a source document much more +closely resembles the final output, as rendered in a browser. By +allowing you to move the markup-related metadata out of the paragraph, +you can add links without interrupting the narrative flow of your +prose. + + +<h3 id="em">Emphasis</h3> + +Markdown treats asterisks (`*`) and underscores (`_`) as indicators of +emphasis. Text wrapped with one `*` or `_` will be wrapped with an +HTML `<em>` tag; double `*`'s or `_`'s will be wrapped with an HTML +`<strong>` tag. E.g., this input: + + *single asterisks* + + _single underscores_ + + **double asterisks** + + __double underscores__ + +will produce: + + <em>single asterisks</em> + + <em>single underscores</em> + + <strong>double asterisks</strong> + + <strong>double underscores</strong> + +You can use whichever style you prefer; the lone restriction is that +the same character must be used to open and close an emphasis span. + +Emphasis can be used in the middle of a word: + + un*fucking*believable + +But if you surround an `*` or `_` with spaces, it'll be treated as a +literal asterisk or underscore. + +To produce a literal asterisk or underscore at a position where it +would otherwise be used as an emphasis delimiter, you can backslash +escape it: + + \*this text is surrounded by literal asterisks\* + + + +<h3 id="code">Code</h3> + +To indicate a span of code, wrap it with backtick quotes (`` ` ``). +Unlike a pre-formatted code block, a code span indicates code within a +normal paragraph. 
For example: + + Use the `printf()` function. + +will produce: + + <p>Use the <code>printf()</code> function.</p> + +To include a literal backtick character within a code span, you can use +multiple backticks as the opening and closing delimiters: + + ``There is a literal backtick (`) here.`` + +which will produce this: + + <p><code>There is a literal backtick (`) here.</code></p> + +The backtick delimiters surrounding a code span may include spaces -- +one after the opening, one before the closing. This allows you to place +literal backtick characters at the beginning or end of a code span: + + A single backtick in a code span: `` ` `` + + A backtick-delimited string in a code span: `` `foo` `` + +will produce: + + <p>A single backtick in a code span: <code>`</code></p> + + <p>A backtick-delimited string in a code span: <code>`foo`</code></p> + +With a code span, ampersands and angle brackets are encoded as HTML +entities automatically, which makes it easy to include example HTML +tags. Markdown will turn this: + + Please don't use any `<blink>` tags. + +into: + + <p>Please don't use any <code>&lt;blink&gt;</code> tags.</p> + +You can write this: + + `&#8212;` is the decimal-encoded equivalent of `&mdash;`. + +to produce: + + <p><code>&amp;#8212;</code> is the decimal-encoded + equivalent of <code>&amp;mdash;</code>.</p> + + + +<h3 id="img">Images</h3> + +Admittedly, it's fairly difficult to devise a "natural" syntax for +placing images into a plain text document format. + +Markdown uses an image syntax that is intended to resemble the syntax +for links, allowing for two styles: *inline* and *reference*.
+ +Inline image syntax looks like this: + + ![Alt text](/path/to/img.jpg) + + ![Alt text](/path/to/img.jpg "Optional title") + +That is: + +* An exclamation mark: `!`; +* followed by a set of square brackets, containing the `alt` + attribute text for the image; +* followed by a set of parentheses, containing the URL or path to + the image, and an optional `title` attribute enclosed in double + or single quotes. + +Reference-style image syntax looks like this: + + ![Alt text][id] + +Where "id" is the name of a defined image reference. Image references +are defined using syntax identical to link references: + + [id]: url/to/image "Optional title attribute" + +As of this writing, Markdown has no syntax for specifying the +dimensions of an image; if this is important to you, you can simply +use regular HTML `<img>` tags. + + +* * * + + +<h2 id="misc">Miscellaneous</h2> + +<h3 id="autolink">Automatic Links</h3> + +Markdown supports a shortcut style for creating "automatic" links for URLs and email addresses: simply surround the URL or email address with angle brackets. What this means is that if you want to show the actual text of a URL or email address, and also have it be a clickable link, you can do this: + + <http://example.com/> + +Markdown will turn this into: + + <a href="http://example.com/">http://example.com/</a> + +Automatic links for email addresses work similarly, except that +Markdown will also perform a bit of randomized decimal and hex +entity-encoding to help obscure your address from address-harvesting +spambots. For example, Markdown will turn this: + + <address@example.com> + +into something like this: + + <a href="&#x6D;&#x61;i&#x6C;&#x74;&#x6F;:&#x61;&#x64;&#x64;&#x72;&#x65; + &#115;&#115;&#64;&#101;&#120;&#x61;&#109;&#x70;&#x6C;e&#x2E;&#99;&#111; + &#109;">&#x61;&#x64;&#x64;&#x72;&#x65;&#115;&#115;&#64;&#101;&#120;&#x61; + &#109;&#x70;&#x6C;e&#x2E;&#99;&#111;&#109;</a> + +which will render in a browser as a clickable link to "address@example.com". + +(This sort of entity-encoding trick will indeed fool many, if not +most, address-harvesting bots, but it definitely won't fool all of +them.
It's better than nothing, but an address published in this way +will probably eventually start receiving spam.) + + + +<h3 id="backslash">Backslash Escapes</h3> + +Markdown allows you to use backslash escapes to generate literal +characters which would otherwise have special meaning in Markdown's +formatting syntax. For example, if you wanted to surround a word with +literal asterisks (instead of an HTML `<em>` tag), you can backslashes +before the asterisks, like this: + + \*literal asterisks\* + +Markdown provides backslash escapes for the following characters: + + \ backslash + ` backtick + * asterisk + _ underscore + {} curly braces + [] square brackets + () parentheses + # hash mark + + plus sign + - minus sign (hyphen) + . dot + ! exclamation mark + diff --git a/oldtests/Original/Nested_blockquotes.html b/oldtests/Original/Nested_blockquotes.html new file mode 100644 index 0000000..02efc59 --- /dev/null +++ b/oldtests/Original/Nested_blockquotes.html @@ -0,0 +1,7 @@ +<blockquote> +<p>foo</p> +<blockquote> +<p>bar</p> +</blockquote> +<p>foo</p> +</blockquote> diff --git a/oldtests/Original/Nested_blockquotes.markdown b/oldtests/Original/Nested_blockquotes.markdown new file mode 100644 index 0000000..ed3c624 --- /dev/null +++ b/oldtests/Original/Nested_blockquotes.markdown @@ -0,0 +1,5 @@ +> foo +> +> > bar +> +> foo diff --git a/oldtests/Original/Ordered_and_unordered_lists.html b/oldtests/Original/Ordered_and_unordered_lists.html new file mode 100644 index 0000000..78d752e --- /dev/null +++ b/oldtests/Original/Ordered_and_unordered_lists.html @@ -0,0 +1,112 @@ +<h2>Unordered</h2> +<p>Asterisks tight:</p> +<ul> +<li>asterisk 1</li> +<li>asterisk 2</li> +<li>asterisk 3</li> +</ul> +<p>Asterisks loose:</p> +<ul> +<li><p>asterisk 1</p></li> +<li><p>asterisk 2</p></li> +<li><p>asterisk 3</p></li> +</ul> +<hr /> +<p>Pluses tight:</p> +<ul> +<li>Plus 1</li> +<li>Plus 2</li> +<li>Plus 3</li> +</ul> +<p>Pluses loose:</p> +<ul> +<li><p>Plus 1</p></li> +<li><p>Plus 
2</p></li> +<li><p>Plus 3</p></li> +</ul> +<hr /> +<p>Minuses tight:</p> +<ul> +<li>Minus 1</li> +<li>Minus 2</li> +<li>Minus 3</li> +</ul> +<p>Minuses loose:</p> +<ul> +<li><p>Minus 1</p></li> +<li><p>Minus 2</p></li> +<li><p>Minus 3</p></li> +</ul> +<h2>Ordered</h2> +<p>Tight:</p> +<ol> +<li>First</li> +<li>Second</li> +<li>Third</li> +</ol> +<p>and:</p> +<ol> +<li>One</li> +<li>Two</li> +<li>Three</li> +</ol> +<p>Loose using tabs:</p> +<ol> +<li><p>First</p></li> +<li><p>Second</p></li> +<li><p>Third</p></li> +</ol> +<p>and using spaces:</p> +<ol> +<li><p>One</p></li> +<li><p>Two</p></li> +<li><p>Three</p></li> +</ol> +<p>Multiple paragraphs:</p> +<ol> +<li><p>Item 1, graf one.</p> +<p>Item 2. graf two. The quick brown fox jumped over the lazy dog's +back.</p></li> +<li><p>Item 2.</p></li> +<li><p>Item 3.</p></li> +</ol> +<h2>Nested</h2> +<ul> +<li>Tab +<ul> +<li>Tab +<ul> +<li>Tab</li> +</ul></li> +</ul></li> +</ul> +<p>Here's another:</p> +<ol> +<li>First</li> +<li>Second: +<ul> +<li>Fee</li> +<li>Fie</li> +<li>Foe</li> +</ul></li> +<li>Third</li> +</ol> +<p>Same thing but with paragraphs:</p> +<ol> +<li><p>First</p></li> +<li><p>Second:</p> +<ul> +<li>Fee</li> +<li>Fie</li> +<li>Foe</li> +</ul></li> +<li><p>Third</p></li> +</ol> +<p>This was an error in Markdown 1.0.1:</p> +<ul> +<li><p>this</p> +<ul> +<li>sub</li> +</ul> +<p>that</p></li> +</ul> diff --git a/oldtests/Original/Ordered_and_unordered_lists.markdown b/oldtests/Original/Ordered_and_unordered_lists.markdown new file mode 100644 index 0000000..7f3b497 --- /dev/null +++ b/oldtests/Original/Ordered_and_unordered_lists.markdown @@ -0,0 +1,131 @@ +## Unordered + +Asterisks tight: + +* asterisk 1 +* asterisk 2 +* asterisk 3 + + +Asterisks loose: + +* asterisk 1 + +* asterisk 2 + +* asterisk 3 + +* * * + +Pluses tight: + ++ Plus 1 ++ Plus 2 ++ Plus 3 + + +Pluses loose: + ++ Plus 1 + ++ Plus 2 + ++ Plus 3 + +* * * + + +Minuses tight: + +- Minus 1 +- Minus 2 +- Minus 3 + + +Minuses loose: + +- Minus 1 + +- 
Minus 2 + +- Minus 3 + + +## Ordered + +Tight: + +1. First +2. Second +3. Third + +and: + +1. One +2. Two +3. Three + + +Loose using tabs: + +1. First + +2. Second + +3. Third + +and using spaces: + +1. One + +2. Two + +3. Three + +Multiple paragraphs: + +1. Item 1, graf one. + + Item 2. graf two. The quick brown fox jumped over the lazy dog's + back. + +2. Item 2. + +3. Item 3. + + + +## Nested + +* Tab + * Tab + * Tab + +Here's another: + +1. First +2. Second: + * Fee + * Fie + * Foe +3. Third + +Same thing but with paragraphs: + +1. First + +2. Second: + * Fee + * Fie + * Foe + +3. Third + + +This was an error in Markdown 1.0.1: + +* this + + * sub + + that diff --git a/oldtests/Original/README b/oldtests/Original/README new file mode 100644 index 0000000..5143258 --- /dev/null +++ b/oldtests/Original/README @@ -0,0 +1,15 @@ +These are from John Gruber's original markdown test suite, via +Michel Fortin's mdtest. + +The html files have been modified slightly in ways that do not affect the +semantics. For example, entities are used for quotes in text, and +blank lines are omitted between block-level tags. + +Trailing blank spaces are removed from lines in raw HTML blocks. + +The one (insignificant) semantic change is switching the order +of emph and strong tags in the output for ***strong and emph***. + +We have removed Hard-wrapped_paragraphs_with_list-like_lines tests, +because the new implementation no longer requires a blank line +before a list. 
diff --git a/oldtests/Original/Strong_and_em_together.html b/oldtests/Original/Strong_and_em_together.html new file mode 100644 index 0000000..2629594 --- /dev/null +++ b/oldtests/Original/Strong_and_em_together.html @@ -0,0 +1,4 @@ +<p><strong><em>This is strong and em.</em></strong></p> +<p>So is <strong><em>this</em></strong> word.</p> +<p><strong><em>This is strong and em.</em></strong></p> +<p>So is <strong><em>this</em></strong> word.</p> diff --git a/oldtests/Original/Strong_and_em_together.markdown b/oldtests/Original/Strong_and_em_together.markdown new file mode 100644 index 0000000..95ee690 --- /dev/null +++ b/oldtests/Original/Strong_and_em_together.markdown @@ -0,0 +1,7 @@ +***This is strong and em.*** + +So is ***this*** word. + +___This is strong and em.___ + +So is ___this___ word. diff --git a/oldtests/Original/Tabs.html b/oldtests/Original/Tabs.html new file mode 100644 index 0000000..5389bdf --- /dev/null +++ b/oldtests/Original/Tabs.html @@ -0,0 +1,19 @@ +<ul> +<li><p>this is a list item +indented with tabs</p></li> +<li><p>this is a list item +indented with spaces</p></li> +</ul> +<p>Code:</p> +<pre><code>this code block is indented by one tab +</code></pre> +<p>And:</p> +<pre><code> this code block is indented by two tabs +</code></pre> +<p>And:</p> +<pre><code>+ this is an example list item + indented with tabs + ++ this is an example list item + indented with spaces +</code></pre> diff --git a/oldtests/Original/Tabs.markdown b/oldtests/Original/Tabs.markdown new file mode 100644 index 0000000..589d113 --- /dev/null +++ b/oldtests/Original/Tabs.markdown @@ -0,0 +1,21 @@ ++ this is a list item + indented with tabs + ++ this is a list item + indented with spaces + +Code: + + this code block is indented by one tab + +And: + + this code block is indented by two tabs + +And: + + + this is an example list item + indented with tabs + + + this is an example list item + indented with spaces diff --git a/oldtests/Original/Tidyness.html 
b/oldtests/Original/Tidyness.html new file mode 100644 index 0000000..f2a8ce7 --- /dev/null +++ b/oldtests/Original/Tidyness.html @@ -0,0 +1,8 @@ +<blockquote> +<p>A list within a blockquote:</p> +<ul> +<li>asterisk 1</li> +<li>asterisk 2</li> +<li>asterisk 3</li> +</ul> +</blockquote> diff --git a/oldtests/Original/Tidyness.markdown b/oldtests/Original/Tidyness.markdown new file mode 100644 index 0000000..5f18b8d --- /dev/null +++ b/oldtests/Original/Tidyness.markdown @@ -0,0 +1,5 @@ +> A list within a blockquote: +> +> * asterisk 1 +> * asterisk 2 +> * asterisk 3 diff --git a/oldtests/Tabs/TabConversionUnicode.html b/oldtests/Tabs/TabConversionUnicode.html new file mode 100644 index 0000000..f596f6a --- /dev/null +++ b/oldtests/Tabs/TabConversionUnicode.html @@ -0,0 +1 @@ +<p><code>То лпой</code> is a Russian word with a tab inside.</p> diff --git a/oldtests/Tabs/TabConversionUnicode.markdown b/oldtests/Tabs/TabConversionUnicode.markdown new file mode 100644 index 0000000..0bd7b52 --- /dev/null +++ b/oldtests/Tabs/TabConversionUnicode.markdown @@ -0,0 +1 @@ +`То лпой` is a Russian word with a tab inside. diff --git a/runtests.pl b/runtests.pl new file mode 100644 index 0000000..5facbe6 --- /dev/null +++ b/runtests.pl @@ -0,0 +1,159 @@ +#!/usr/bin/env perl +use warnings; +use strict; +use Term::ANSIColor; +use IO::Handle; +use IPC::Open2; + +my $usage="runtests.pl PROGRAM SPEC\nSet ANSI_COLORS_DISABLED=1 if you redirect to a file.\nSet PATT='...' to restrict tests to sections matching a regex.\n"; + +my $PROG=$ARGV[0]; +my $SPEC=$ARGV[1]; +my $PATT=$ENV{'PATT'}; + +if (!(defined $PROG && defined $SPEC)) { + print STDERR $usage; + exit 1; +} + +my $passed = 0; +my $failed = 0; +my $skipped = 0; + +# Markdown implementations vary on insignificant whitespace. +# Some leave blanks between block elements, others don't. +# This function tries to normalize the output so it can be +# compared with our test. 
tidy takes two arguments: the +# string containing the actual output, and a pathname of the +# file to which the tidied output is to be saved. +sub tidy +{ + my $inpre = 0; + my $out = ""; + my $outfh; + open($outfh, '>', \$out); + for (split /^/, $_[0]) { + if (/<pre/) { + $inpre = 1; + } elsif (/<\/pre/) { + $inpre = 0; + } + if ($inpre) { + print $outfh $_; + } else { + # remove leading spaces + s/^ *//; + # remove trailing spaces + s/ *$//; + # collapse consecutive spaces + s/ */ /; + # collapse space before /> in tag + s/ *\/>/\/>/; + # skip blank line + if (/^$/) { + next; + } + print $outfh $_; + } + } + close $outfh; + return $out; +} + +sub dotest +{ + my $markdown = $_[0]; + my $html = $_[1]; + my $testname = $_[2]; + my $actual = ""; + # We use → to indicate tab and ␣ space in the spec + $markdown =~ s/→/\t/g;s/␣/ /g; + $html =~ s/→/\t/g;s/␣/ /g; + open2(my $out, my $in, $PROG); + print $in $markdown; + close $in; + flush $out; + $actual = do { local $/; <$out>; }; + close $out; + $html = &tidy($html); + $actual = &tidy($actual); + $actual =~ s/\'/'/; + if ($actual eq $html) { + print colored("✓", "green"); + return 1; + } else { + print colored("\n✘ $testname", "red"); + print "\n"; + print color "cyan"; + print "=== markdown ===============\n"; + print $markdown; + print "=== expected ===============\n"; + print $html; + print "=== got ====================\n"; + print $actual; + print color "black"; + return 0; + } +} + +my $stage = 0; +my $markdown = ""; +my $html = ""; +my $example = 0; +my $linenum = 0; +my $exampleline = 0; +my @secnums = (); +my $secheading; + +open(SPEC, "< $SPEC"); +while (<SPEC>) { + $linenum++; + if (/^\.$/) { + $stage = ($stage + 1) % 3; + if ($stage == 1) { + $exampleline = $linenum; + } + if ($stage == 0) { + $example++; + if (!$PATT || $secheading =~ /$PATT/) { + if (&dotest($markdown, $html, + "Example $example (line $exampleline)")) { + $passed++; + } else { + $failed++; + } + } else { + $skipped++; + } + $markdown = ""; 
+ $html = ""; + } + } elsif ($stage == 0 && $_ =~ /^<!-- END TESTS -->/) { + last; + } elsif ($stage == 0 && $_ =~ /^(#+) +(.*)/) { + my $seclevel = length($1); + $secheading = $2; + if ($#secnums == $seclevel - 1) { + $secnums[$#secnums]++; + } elsif ($#secnums > $seclevel - 1) { + @secnums = @secnums[0..($seclevel - 1)]; + $secnums[$#secnums]++; + } else { + while ($#secnums < $seclevel - 1) { + push(@secnums, 1); + } + } + if (!$PATT || $secheading =~ /$PATT/) { + print ("\n", join(".", @secnums) . " " . $secheading, " "); + } + } elsif ($stage == 1) { + $markdown .= $_; + } elsif ($stage == 2) { + $html .= $_; + } +} + +print "\n"; +print STDERR colored("$passed tests passed, $failed failed, $skipped skipped.", "bold"); +print STDERR "\n"; +exit $failed; diff --git a/spec.txt b/spec.txt new file mode 100644 index 0000000..96721e6 --- /dev/null +++ b/spec.txt @@ -0,0 +1,6044 @@ +--- +title: Standard Markdown Spec +author: +- John MacFarlane +version: 1 +date: 2014-07-21 +... + +# Introduction + +## What is markdown? + +Markdown is a plain text format for writing structured documents, +based on conventions used for indicating formatting in email and +usenet posts. It was developed in 2004 by John Gruber, who wrote +the first markdown-to-HTML converter in perl, and it soon became +widely used in websites. By 2014 there were dozens of +implementations in many languages. Some of them extended basic +markdown syntax with conventions for footnotes, definition lists, +tables, and other constructs, and some allowed output not just in +HTML but in LaTeX and many other formats. + +## Why is a spec needed? + +John Gruber's [canonical description of markdown's +syntax](http://daringfireball.net/projects/markdown/syntax) +does not specify the syntax unambiguously. Here are some examples of +questions it does not answer: + +1. How much indentation is needed for a sublist? 
The spec says that + continuation paragraphs need to be indented four spaces, but is + not fully explicit about sublists. It is natural to think that + they, too, must be indented four spaces, but `Markdown.pl` does + not require that. This is hardly a "corner case," and divergences + between implementations on this issue often lead to surprises for + users in real documents. (See [this comment by John + Gruber](http://article.gmane.org/gmane.text.markdown.general/1997).) + +2. Is a blank line needed before a block quote or header? + Most implementations do not require the blank line. However, + this can lead to unexpected results in hard-wrapped text, and + also to ambiguities in parsing (note that some implementations + put the header inside the blockquote, while others do not). + (John Gruber has also spoken [in favor of requiring the blank + lines](http://article.gmane.org/gmane.text.markdown.general/2146).) + +3. What is the exact rule for determining when list items get + wrapped in `<p>` tags? Can a list be partially "loose" and partially + "tight"? What should we do with a list like this? + + ``` markdown + 1. one + + 2. two + 3. three + ``` + + Or this? + + ``` markdown + 1. one + + - a + + - b + 2. two + ``` + + (There are some relevant comments by John Gruber + [here](http://article.gmane.org/gmane.text.markdown.general/2554).) + +4. When list markers change from bullets to numbers, should we have + two lists or one? + + ``` markdown + 1. fee + 2. fie + - foe + - fum + ``` + +5. What are the precedence rules for the markers of inline structure? + For example, is the following a valid link, or does the code span + take precedence ? + + ``` markdown + [foo `](bar)` + ``` + +6. What are the precedence rules for markers of emphasis and strong + emphasis? For example, how should the following be parsed? + + ``` markdown + *foo *bar** baz* + ``` + +7. Can list items include headers? + + ``` markdown + - # Heading + ``` + +8. 
Can link references be defined inside block quotes or list items? + + ``` markdown + > Blockquote [foo]. + > + > [foo]: /url + ``` + +In the absence of a spec, early implementers consulted `Markdown.pl` +to resolve these ambiguities. But `Markdown.pl` was quite buggy, and +gave manifestly bad results in many cases, so it was not a +satisfactory replacement for a spec. + +Because there is no unambiguous spec, implementations have diverged +considerably. As a result, users are often surprised to find that +a document that renders one way on one system (say, a github wiki) +renders differently on another (say, converting to docbook using +pandoc). To make matters worse, because nothing in markdown counts +as a "syntax error," the divergence often isn't discovered right away. + +## About this document + +This document attempts to specify markdown syntax unambiguously. +It contains many examples with side-by-side markdown and +HTML. These are intended to double as conformance tests. An +accompanying script `runtests.pl` can be used to run the tests +against any markdown program: + + perl runtests.pl PROGRAM spec.html + +Since this document describes how markdown is to be parsed into +an abstract syntax tree, it would have made sense to use an abstract +representation of the syntax tree instead of HTML. But HTML is capable +of representing the structural distinctions we need to make, and the +choice of HTML for the tests makes it possible to run the tests against +an implementation without writing an abstract syntax tree renderer. + +This document is generated from a text file, `spec.txt`, written +in markdown with a small extension for the side-by-side tests. +The script `spec2md.pl` can be used to turn `spec.txt` into pandoc +markdown, which can then be converted into other formats. + +In the examples, the `→` character is used to represent tabs. 
+ +# Preprocessing + +A [line](#line) <a id="line"/> +is a sequence of one or more characters followed by a line +ending (CR, LF, or CRLF, depending on the platform) or by the end of +file. + +This spec does not specify an encoding; it thinks of lines as composed +of characters rather than bytes. A conforming parser may be limited +to a certain encoding. + +Tabs in lines are expanded to spaces, with a tab stop of 4 characters: + +. +foo→baz→→bim +. +<p>foo baz bim</p> +. + +. +οὐ→χρῆν +. +<p>οὐ χρῆν</p> +. + +Line endings are replaced by newline characters (LF). + +A line containing only spaces (after tab expansion) followed by +a line ending is called a [blank line](#blank-line). <a +id="blank-line"/> + +# Blocks and inlines + +We can think of a document as a sequence of [blocks](#block)<a +id="block"/>---structural elements like paragraphs, block quotations, +lists, headers, rules, and code blocks. Blocks can contain other +blocks, or they can contain [inline](#inline)<a id="inline"/> content: +words, spaces, links, emphasized text, images, and inline code. + +## Precedence + +Indicators of block structure always take precedence over indicators +of inline structure. So, for example, the following is a list with +two items, not a list with one item containing a code span: + +. +- `one +- two` +. +<ul> +<li>`one</li> +<li>two`</li> +</ul> +. + +This means that parsing can proceed in two steps: first, the block +structure of the document can be discerned; second, text lines inside +paragraphs, headers, and other block constructs can be parsed for inline +structure. The second step requires information about link reference +definitions that will be available only at the end of the first +step. Note that the first step requires processing lines in sequence, +but the second can be parallelized, since the inline parsing of +one block element does not affect the inline parsing of any other. 
+ +## Container blocks and leaf blocks + +We can divide blocks into two types: +[container blocks](#container-block), <a id="container-block"/> +which can contain other blocks, and [leaf blocks](#leaf-block), +<a id="leaf-block"/> which cannot. + +# Leaf blocks + +This section describes the different kinds of leaf block that make up a +markdown document. + +## Horizontal rules + +A line consisting of 0-3 spaces of indentation, followed by a sequence +of three or more matching `-`, `_`, or `*` characters, each followed +optionally by any number of spaces, forms a [horizontal +rule](#horizontal-rule). <a id="horizontal-rule"/> + +. +*** +--- +___ +. +<hr /> +<hr /> +<hr /> +. + +Wrong characters: + +. ++++ +. +<p>+++</p> +. + +. +=== +. +<p>===</p> +. + +Not enough characters: + +. +-- +** +__ +. +<p>-- +** +__</p> +. + +One to three spaces indent are allowed: + +. + *** + *** + *** +. +<hr /> +<hr /> +<hr /> +. + +Four spaces is too many: + +. + *** +. +<pre><code>*** +</code></pre> +. + +. +Foo + *** +. +<p>Foo +***</p> +. + +More than three characters may be used: + +. +_____________________________________ +. +<hr /> +. + +Spaces are allowed between the characters: + +. + - - - +. +<hr /> +. + +. + ** * ** * ** * ** +. +<hr /> +. + +. +- - - - +. +<hr /> +. + +Spaces are allowed at the end: + +. +- - - - +. +<hr /> +. + +However, no other characters may occur at the end or the +beginning: + +. +_ _ _ _ a + +a------ +. +<p>_ _ _ _ a</p> +<p>a------</p> +. + +It is required that all of the non-space characters be the same. +So, this is not a horizontal rule: + +. + *-* +. +<p><em>-</em></p> +. + +Horizontal rules do not need blank lines before or after: + +. +- foo +*** +- bar +. +<ul> +<li>foo</li> +</ul> +<hr /> +<ul> +<li>bar</li> +</ul> +. + +Horizontal rules can interrupt a paragraph: + +. +Foo +*** +bar +. +<p>Foo</p> +<hr /> +<p>bar</p> +. + +Note, however, that this is a setext header, not a paragraph followed +by a horizontal rule: + +. +Foo +--- +bar +.
+<h2>Foo</h2> +<p>bar</p> +. + +When both a horizontal rule and a list item are possible +interpretations of a line, the horizontal rule is preferred: + +. +* Foo +* * * +* Bar +. +<ul> +<li>Foo</li> +</ul> +<hr /> +<ul> +<li>Bar</li> +</ul> +. + +If you want a horizontal rule in a list item, use a different bullet: + +. +- Foo +- * * * +. +<ul> +<li>Foo</li> +<li><hr /></li> +</ul> +. + +## ATX headers + +An [ATX header](#atx-header) <a id="atx-header"/> +consists of a string of characters, parsed as inline content, between an +opening sequence of 1--6 unescaped `#` characters and an optional +closing sequence of any number of `#` characters. The opening sequence +of `#` characters cannot be followed directly by a nonspace character. +The closing `#` characters may be followed by spaces only. The opening +`#` character may be indented 0-3 spaces. The raw contents of the +header are stripped of leading and trailing spaces before being parsed +as inline content. The header level is equal to the number of `#` +characters in the opening sequence. + +Simple headers: + +. +# foo +## foo +### foo +#### foo +##### foo +###### foo +. +<h1>foo</h1> +<h2>foo</h2> +<h3>foo</h3> +<h4>foo</h4> +<h5>foo</h5> +<h6>foo</h6> +. + +More than six `#` characters is not a header: + +. +####### foo +. +<p>####### foo</p> +. + +A space is required between the `#` characters and the header's +contents. Note that many implementations currently do not require +the space. However, the space was required by the [original ATX +implementation](http://www.aaronsw.com/2002/atx/atx.py), and it helps +prevent things like the following from being parsed as headers: + +. +#5 bolt +. +<p>#5 bolt</p> +. + +This is not a header, because the first `#` is escaped: + +. +\## foo +. +<p>## foo</p> +. + +Contents are parsed as inlines: + +. +# foo *bar* \*baz\* +. +<h1>foo <em>bar</em> *baz*</h1> +. + +Leading and trailing blanks are ignored in parsing inline content: + +. +# foo +. +<h1>foo</h1> +. 
+ +One to three spaces indentation are allowed: + +. + ### foo + ## foo + # foo +. +<h3>foo</h3> +<h2>foo</h2> +<h1>foo</h1> +. + +Four spaces are too much: + +. + # foo +. +<pre><code># foo +</code></pre> +. + +. +foo + # bar +. +<p>foo +# bar</p> +. + +A closing sequence of `#` characters is optional: + +. +## foo ## + ### bar ### +. +<h2>foo</h2> +<h3>bar</h3> +. + +It need not be the same length as the opening sequence: + +. +# foo ################################## +##### foo ## +. +<h1>foo</h1> +<h5>foo</h5> +. + +Spaces are allowed after the closing sequence: + +. +### foo ### +. +<h3>foo</h3> +. + +A sequence of `#` characters with a nonspace character following it +is not a closing sequence, but counts as part of the contents of the +header: + +. +### foo ### b +. +<h3>foo ### b</h3> +. + +Backslash-escaped `#` characters do not count as part +of the closing sequence: + +. +### foo \### +## foo \#\## +. +<h3>foo #</h3> +<h2>foo ##</h2> +. + +ATX headers need not be separated from surrounding content by blank +lines, and they can interrupt paragraphs: + +. +**** +## foo +**** +. +<hr /> +<h2>foo</h2> +<hr /> +. + +. +Foo bar +# baz +Bar foo +. +<p>Foo bar</p> +<h1>baz</h1> +<p>Bar foo</p> +. + +ATX headers can be empty: + +. +## +# +### ### +. +<h2></h2> +<h1></h1> +<h3></h3> +. + +## Setext headers + +A [setext header](#setext-header) <a id="setext-header"/> +consists of a line of text, containing at least one nonspace character, +with no more than 3 spaces indentation, followed by a [setext header +underline](#setext-header-underline). A [setext header +underline](#setext-header-underline) <a id="setext-header-underline"/> +is a sequence of `=` characters or a sequence of `-` characters, with no +more than 3 spaces indentation and any number of leading or trailing +spaces. The header is a level 1 header if `=` characters are used, and +a level 2 header if `-` characters are used. 
The contents of the header +are the result of parsing the first line as markdown inline content. + +In general, a setext header need not be preceded or followed by a +blank line. However, it cannot interrupt a paragraph, so when a +setext header comes after a paragraph, a blank line is needed between +them. + +Simple examples: + +. +Foo *bar* +========= + +Foo *bar* +--------- +. +<h1>Foo <em>bar</em></h1> +<h2>Foo <em>bar</em></h2> +. + +The underlining can be any length: + +. +Foo +------------------------- + +Foo += +. +<h2>Foo</h2> +<h1>Foo</h1> +. + +The header content can be indented up to three spaces, and need +not line up with the underlining: + +. + Foo +--- + + Foo +----- + + Foo + === +. +<h2>Foo</h2> +<h2>Foo</h2> +<h1>Foo</h1> +. + +Four spaces indent is too much: + +. + Foo + --- + + Foo +--- +. +<pre><code>Foo +--- + +Foo +</code></pre> +<hr /> +. + +The setext header underline can be indented up to three spaces, and +may have trailing spaces: + +. +Foo + ---- +. +<h2>Foo</h2> +. + +Four spaces is too much: + +. +Foo + --- +. +<p>Foo +---</p> +. + +The setext header underline cannot contain internal spaces: + +. +Foo += = + +Foo +--- - +. +<p>Foo += =</p> +<p>Foo</p> +<hr /> +. + +Trailing spaces in the content line do not cause a line break: + +. +Foo +----- +. +<h2>Foo</h2> +. + +Nor does a backslash at the end: + +. +Foo\ +---- +. +<h2>Foo\</h2> +. + +Since indicators of block structure take precedence over +indicators of inline structure, the following are setext headers: + +. +`Foo +---- +` + +<a title="a lot +--- +of dashes"/> +. +<h2>`Foo</h2> +<p>`</p> +<h2><a title="a lot</h2> +<p>of dashes"/></p> +. + +The setext header underline cannot be a lazy line: + +. +> Foo +--- +. +<blockquote> +<p>Foo</p> +</blockquote> +<hr /> +. + +A setext header cannot interrupt a paragraph: + +. +Foo +Bar +--- + +Foo +Bar +=== +. +<p>Foo +Bar</p> +<hr /> +<p>Foo +Bar +===</p> +. + +But in general a blank line is not required before or after: + +. 
+--- +Foo +--- +Bar +--- +Baz +. +<hr /> +<h2>Foo</h2> +<h2>Bar</h2> +<p>Baz</p> +. + +Setext headers cannot be empty: + +. + +==== +. +<p>====</p> +. + + +## Indented code blocks + +An [indented code block](#indented-code-block) +<a id="indented-code-block"/> is composed of one or more +[indented chunks](#indented-chunk) separated by blank lines. +An [indented chunk](#indented-chunk) <a id="indented-chunk"/> +is a sequence of non-blank lines, each indented four or more +spaces. An indented code block cannot interrupt a paragraph, so +if it occurs before or after a paragraph, there must be an +intervening blank line. The contents of the code block are +the literal contents of the lines, including trailing newlines, +minus four spaces of indentation. An indented code block has no +attributes. + +. + a simple + indented code block +. +<pre><code>a simple + indented code block +</code></pre> +. + +The contents are literal text, and do not get parsed as markdown: + +. + <a/> + *hi* + + - one +. +<pre><code><a/> +*hi* + +- one +</code></pre> +. + +Here we have three chunks separated by blank lines: + +. + chunk1 + + chunk2 + + + + chunk3 +. +<pre><code>chunk1 + +chunk2 + + + +chunk3 +</code></pre> +. + +Any initial spaces beyond four will be included in the content, even +in interior blank lines: + +. + chunk1 + + chunk2 +. +<pre><code>chunk1 + + chunk2 +</code></pre> +. + +An indented code block cannot interrupt a paragraph. (This +allows hanging indents and the like.) + +. +Foo + bar + +. +<p>Foo +bar</p> +. + +However, any non-blank line with fewer than four leading spaces ends +the code block immediately. So a paragraph may occur immediately +after indented code: + +. + foo +bar +. +<pre><code>foo +</code></pre> +<p>bar</p> +. + +And indented code can occur immediately before and after other kinds of +blocks: + +. +# Header + foo +Header +------ + foo +---- +.
+<h1>Header</h1> +<pre><code>foo +</code></pre> +<h2>Header</h2> +<pre><code>foo +</code></pre> +<hr /> +. + +The first line can be indented more than four spaces: + +. + foo + bar +. +<pre><code> foo +bar +</code></pre> +. + +Blank lines preceding or following an indented code block +are not included in it: + +. + + + foo + + +. +<pre><code>foo +</code></pre> +. + +Trailing spaces are included in the code block's content: + +. + foo +. +<pre><code>foo +</code></pre> +. + + +## Fenced code blocks + +A [code fence](#code-fence) <a id="code-fence"/> is a sequence +of at least three consecutive backtick characters (`` ` ``) or +tildes (`~`). (Tildes and backticks cannot be mixed.). +A [fenced code block](#fenced-code-block) <a id="fenced-code-block"/> +begins with a code fence, indented no more than three spaces. + +The line with the opening code fence may optionally contain some text +following the code fence; this is trimmed of leading and trailing +spaces and called the [info string](#info-string). <a +id="info-string"/> The [info string] may not contain any backtick +characters. (The reason for this restriction is that otherwise +some inline code would be incorrectly interpreted as the +beginning of a fenced code block.) + +The content of the code block consists of all subsequent lines, until +a closing [code fence](#code-fence) of the same type as the code block +began with (backticks or tildes), and with at least as many backticks +or tildes as the opening code fence. If the leading code fence is +indented N spaces, then up to N spaces of indentation are removed from +each line of the content (if present). (If a content line is not +indented, it is preserved unchanged. If it is indented less than N +spaces, all of the indentation is removed.) + +The closing code fence may be indented up to three spaces, and may be +followed only by spaces, which are ignored. 
If the end of the +document is reached and no closing code fence has been found, the code +block contains all of the lines after the opening code fence. +(An alternative spec would require backtracking in the event +that a closing code fence is not found. But this makes parsing much +less efficient, and there seems to be no real down side to the +behavior described here.) + +A fenced code block may interrupt a paragraph, and does not require +a blank line either before or after. + +The content of a code fence is treated as literal text, not parsed +as inlines. The first word of the info string is typically used to +specify the language of the code sample, and rendered in the `class` +attribute of the `pre` tag. However, this spec does not mandate any +particular treatment of the info string. + +Here is a simple example with backticks: + +. +``` +< + > +``` +. +<pre><code>< + > +</code></pre> +. + +With tildes: + +. +~~~ +< + > +~~~ +. +<pre><code>< + > +</code></pre> +. + +The closing code fence must use the same character as the opening +fence: + +. +``` +aaa +~~~ +``` +. +<pre><code>aaa +~~~ +</code></pre> +. + +. +~~~ +aaa +``` +~~~ +. +<pre><code>aaa +``` +</code></pre> +. + +The closing code fence must be at least as long as the opening fence: + +. +```` +aaa +``` +`````` +. +<pre><code>aaa +``` +</code></pre> +. + +. +~~~~ +aaa +~~~ +~~~~ +. +<pre><code>aaa +~~~ +</code></pre> +. + +Unclosed code blocks are closed by the end of the document: + +. +``` +. +<pre><code></code></pre> +. + +. +````` + +``` +aaa +. +<pre><code> +``` +aaa +</code></pre> +. + +A code block can have all empty lines as its content: + +. +``` + + +``` +. +<pre><code> + +</code></pre> +. + +A code block can be empty: + +. +``` +``` +. +<pre><code></code></pre> +. + +Fences can be indented. If the opening fence is indented, +content lines will have equivalent opening indentation removed, +if present: + +. + ``` + aaa +aaa +``` +. +<pre><code>aaa +aaa +</code></pre> +. + +. 
+ ``` +aaa + aaa +aaa + ``` +. +<pre><code>aaa +aaa +aaa +</code></pre> +. + +. + ``` + aaa + aaa + aaa + ``` +. +<pre><code>aaa + aaa +aaa +</code></pre> +. + +Four spaces indentation produces an indented code block: + +. + ``` + aaa + ``` +. +<pre><code>``` +aaa +``` +</code></pre> +. + +Code fences (opening and closing) cannot contain internal spaces: + +. +``` ``` +aaa +. +<p><code></code> +aaa</p> +. + +. +~~~~~~ +aaa +~~~ ~~ +. +<pre><code>aaa +~~~ ~~ +</code></pre> +. + +Fenced code blocks can interrupt paragraphs, and can be followed +directly by paragraphs, without a blank line between: + +. +foo +``` +bar +``` +baz +. +<p>foo</p> +<pre><code>bar +</code></pre> +<p>baz</p> +. + +Other blocks can also occur before and after fenced code blocks +without an intervening blank line: + +. +foo +--- +~~~ +bar +~~~ +# baz +. +<h2>foo</h2> +<pre><code>bar +</code></pre> +<h1>baz</h1> +. + +An [info string](#info-string) can be provided after the opening code fence. +Opening and closing spaces will be stripped, and the first word +is used here to populate the `class` attribute of the enclosing +`pre` tag. + +. +```ruby +def foo(x) + return 3 +end +``` +. +<pre class="ruby"><code>def foo(x) + return 3 +end +</code></pre> +. + +. +~~~~ ruby startline=3 $%@#$ +def foo(x) + return 3 +end +~~~~~~~ +. +<pre class="ruby"><code>def foo(x) + return 3 +end +</code></pre> +. + +. +````; +```` +. +<pre class=";"><code></code></pre> +. + +Info strings for backtick code blocks cannot contain backticks: + +. +``` aa ``` +foo +. +<p><code>aa</code> +foo</p> +. + +Closing code fences cannot have info strings: + +. +``` +``` aaa +``` +. +<pre><code>``` aaa +</code></pre> +. 
+ + +## HTML blocks + +An [HTML block tag](#html-block-tag) <a id="html-block-tag"/> is +an [open tag](#open-tag) or [closing tag](#closing-tag) whose tag +name is one of the following (case-insensitive): +`article`, `header`, `aside`, `hgroup`, `blockquote`, `hr`, `body`, +`li`, `br`, `map`, `button`, `object`, `canvas`, `ol`, `caption`, +`output`, `col`, `p`, `colgroup`, `pre`, `dd`, `progress`, `div`, +`section`, `dl`, `table`, `td`, `dt`, `tbody`, `embed`, `textarea`, +`fieldset`, `tfoot`, `figcaption`, `th`, `figure`, `thead`, `footer`, +`tr`, `form`, `ul`, `h1`, `h2`, `h3`, `h4`, `h5`, `h6`, +`video`, `script`, `style`. + +An [HTML block](#html-block) <a id="html-block"/> begins with an +[HTML block tag](#html-block-tag), [HTML comment](#html-comment), +[processing instruction](#processing-instruction), +[declaration](#declaration), or [CDATA section](#cdata-section). +It ends when a [blank line](#blank-line) or the end of the +input is encountered. The initial line may be indented up to three +spaces, and subsequent lines may have any indentation. The contents +of the HTML block are interpreted as raw HTML, and will not be escaped +in HTML output. + +Some simple examples: + +. +<table> + <tr> + <td> + hi + </td> + </tr> +</table> + +okay. +. +<table> + <tr> + <td> + hi + </td> + </tr> +</table> +<p>okay.</p> +. + +. + <div> + *hello* + <foo><a> +. + <div> + *hello* + <foo><a> +. + +Here we have two HTML blocks with a markdown paragraph between them: + +. +<DIV CLASS="foo"> + +*Markdown* + +</DIV> +. +<DIV CLASS="foo"> +<p><em>Markdown</em></p> +</DIV> +. + +In the following example, what looks like a markdown code block +is actually part of the HTML block, which continues until a blank +line or the end of the document is reached: + +. +<div></div> +``` c +int x = 33; +``` +. +<div></div> +``` c +int x = 33; +``` +. + +A comment: + +. +<!-- Foo +bar + baz --> +. +<!-- Foo +bar + baz --> +. + +A processing instruction: + +. +<?php + echo 'foo' +?> +.
+<?php + echo 'foo' +?> +. + +CDATA: + +. +<![CDATA[ +function matchwo(a,b) +{ +if (a < b && a < 0) then + { + return 1; + } +else + { + return 0; + } +} +]]> +. +<![CDATA[ +function matchwo(a,b) +{ +if (a < b && a < 0) then + { + return 1; + } +else + { + return 0; + } +} +]]> +. + +The opening tag can be indented 1-3 spaces, but not 4: + +. + <!-- foo --> + + <!-- foo --> +. + <!-- foo --> +<pre><code><!-- foo --> +</code></pre> +. + +An HTML block can interrupt a paragraph, and need not be preceded +by a blank line. + +. +Foo +<div> +bar +</div> +. +<p>Foo</p> +<div> +bar +</div> +. + +However, a following blank line is always needed, except at the end of +a document: + +. +<div> +bar +</div> +*foo* +. +<div> +bar +</div> +*foo* +. + +An incomplete HTML block tag may also start an HTML block: + +. +<div class +foo +. +<div class +foo +. + +This rule differs from John Gruber's original markdown syntax +specification, which says: + +> The only restrictions are that block-level HTML elements — +> e.g. `<div>`, `<table>`, `<pre>`, `<p>`, etc. — must be separated from +> surrounding content by blank lines, and the start and end tags of the +> block should not be indented with tabs or spaces. + +In some ways Gruber's rule is more restrictive than the one given +here: + +- It requires that an HTML block be preceded by a blank line. +- It does not allow the start tag to be indented. +- It requires a matching end tag, which it also does not allow to + be indented. + +Indeed, most markdown implementations, including some of Gruber's +own perl implementations, do not impose these restrictions. + +There is one respect, however, in which Gruber's rule is more liberal +than the one given here, since it allows blank lines to occur inside +an HTML block. There are two reasons for disallowing them here. +First, it removes the need to parse balanced tags, which is +expensive and can require backtracking from the end of the document +if no matching end tag is found. 
Second, it provides a very simple +and flexible way of including markdown content inside HTML tags: +simply separate the markdown from the HTML using blank lines: + +. +<div> + +*Emphasized* text. + +</div> +. +<div> +<p><em>Emphasized</em> text.</p> +</div> +. + +Compare: + +. +<div> +*Emphasized* text. +</div> +. +<div> +*Emphasized* text. +</div> +. + +Some markdown implementations have adopted a convention of +interpreting content inside tags as text if the open tag has +the attribute `markdown=1`. The rule given above seems a simpler and +more elegant way of achieving the same expressive power, which is also +much simpler to parse. + +The main potential drawback is that one can no longer paste HTML +blocks into markdown documents with 100% reliability. However, +*in most cases* this will work fine, because the blank lines in +HTML are usually followed by HTML block tags. For example: + +. +<table> + +<tr> + +<td> +Hi +</td> + +</tr> + +</table> +. +<table> +<tr> +<td> +Hi +</td> +</tr> +</table> +. + +Moreover, blank lines are usually not necessary and can be +deleted. The exception is inside `<pre>` tags; here, one can +replace the blank lines with ` ` entities. + +So there is no important loss of expressive power with the new rule. + +## Link reference definitions + +A [link reference definition](#link-reference-definition) +<a id="link-reference-definition"/> consists of a [link +label](#link-label), indented up to three spaces, followed +by a colon (`:`), optional blank space (including up to one +newline), a [link destination](#link-destination), optional +blank space (including up to one newline), and an optional [link +title](#link-title), which if it is present must be separated +from the [link destination](#link-destination) by whitespace. +No further non-space characters may occur on the line. + +A [link reference-definition](#link-reference-definition) +does not correspond to a structural element of a document. 
Instead, it +defines a label which can be used in [reference links](#reference-link) +and reference-style [images](#image) elsewhere in the document. [Link +references] can be defined either before or after the links that use +them. + +. +[foo]: /url "title" + +[foo] +. +<p><a href="/url" title="title">foo</a></p> +. + +. + [foo]: + /url + 'the title' + +[foo] +. +<p><a href="/url" title="the title">foo</a></p> +. + +. +[Foo*bar\]]:my_(url) 'title (with parens)' + +[Foo*bar\]] +. +<p><a href="my_(url)" title="title (with parens)">Foo*bar]</a></p> +. + +. +[Foo bar]: +<my url> +'title' + +[Foo bar] +. +<p><a href="my url" title="title">Foo bar</a></p> +. + +The title may be omitted: + +. +[foo]: +/url + +[foo] +. +<p><a href="/url">foo</a></p> +. + +The link destination may not be omitted: + +. +[foo]: + +[foo] +. +<p>[foo]:</p> +<p>[foo]</p> +. + +A link can come before its corresponding definition: + +. +[foo] + +[foo]: url +. +<p><a href="url">foo</a></p> +. + +If there are several matching definitions, the first one takes +precedence: + +. +[foo] + +[foo]: first +[foo]: second +. +<p><a href="first">foo</a></p> +. + +As noted in the section on [Links], matching of labels is +case-insensitive (see [matches](#matches)). + +. +[FOO]: /url + +[Foo] +. +<p><a href="/url">Foo</a></p> +. + +. +[ΑΓΩ]: /φου + +[αγω] +. +<p><a href="/φου">αγω</a></p> +. + +Here is a link reference definition with no corresponding link. +It contributes nothing to the document. + +. +[foo]: /url +. +. + +This is not a link reference definition, because there are +non-space characters after the title: + +. +[foo]: /url "title" ok +. +<p>[foo]: /url "title" ok</p> +. + +This is not a link reference definition, because it is indented +four spaces: + +. + [foo]: /url "title" + +[foo] +. +<pre><code>[foo]: /url "title" +</code></pre> +<p>[foo]</p> +. + +This is not a link reference definition, because it occurs inside +a code block: + +. +``` +[foo]: /url +``` + +[foo] +. 
+<pre><code>[foo]: /url +</code></pre> +<p>[foo]</p> +. + +A [link reference definition](#link-reference-definition) cannot +interrupt a paragraph. + +. +Foo +[bar]: /baz + +[bar] +. +<p>Foo +[bar]: /baz</p> +<p>[bar]</p> +. + +However, it can directly follow other block elements, such as headers +and horizontal rules, and it need not be followed by a blank line. + +. +# [Foo] +[foo]: /url +> bar +. +<h1><a href="/url">Foo</a></h1> +<blockquote> +<p>bar</p> +</blockquote> +. + +Several [link references](#link-reference) can occur one after another, +without intervening blank lines. + +. +[foo]: /foo-url "foo" +[bar]: /bar-url + "bar" +[baz]: /baz-url + +[foo], +[bar], +[baz] +. +<p><a href="/foo-url" title="foo">foo</a>, +<a href="/bar-url" title="bar">bar</a>, +<a href="/baz-url">baz</a></p> +. + +[Link reference definitions](#link-reference-definition) can occur +inside block containers, like lists and block quotations. They +affect the entire document, not just the container in which they +are defined: + +. +[foo] + +> [foo]: /url +. +<p><a href="/url">foo</a></p> +<blockquote> +</blockquote> +. + + +## Paragraphs + +A sequence of non-blank lines that cannot be interpreted as other +kinds of blocks forms a [paragraph](#paragraph) <a id="paragraph"/>. +The contents of the paragraph are the result of parsing the +paragraph's raw content as inlines. The paragraph's raw content +is formed by concatenating the lines and removing initial and final +spaces. + +A simple example with two paragraphs: + +. +aaa + +bbb +. +<p>aaa</p> +<p>bbb</p> +. + +Paragraphs can contain multiple lines, but no blank lines: + +. +aaa +bbb + +ccc +ddd +. +<p>aaa +bbb</p> +<p>ccc +ddd</p> +. + +Multiple blank lines between paragraphs have no effect: + +. +aaa + + +bbb +. +<p>aaa</p> +<p>bbb</p> +. + +Leading spaces are skipped: + +. + aaa + bbb +. +<p>aaa +bbb</p> +. + +Lines after the first may be indented any amount, since indented +code blocks cannot interrupt paragraphs. + +. 
+aaa + bbb + ccc +. +<p>aaa +bbb +ccc</p> +. + +However, the first line may be indented at most three spaces, +or an indented code block will be triggered: + +. + aaa +bbb +. +<p>aaa +bbb</p> +. + +. + aaa +bbb +. +<pre><code>aaa +</code></pre> +<p>bbb</p> +. + +Final spaces are stripped before inline parsing, so a paragraph +that ends with two or more spaces will not end with a hard line +break: + +. +aaa +bbb +. +<p>aaa<br /> +bbb</p> +. + +## Blank lines + +[Blank lines](#blank-line) between block-level elements are ignored, +except for the role they play in determining whether a [list](#list) +is [tight](#tight) or [loose](#loose). + +Blank lines at the beginning and end of the document are also ignored. + +. + + +aaa + + +# aaa + + +. +<p>aaa</p> +<h1>aaa</h1> +. + + +# Container blocks + +A [container block](#container-block) is a block that has other +blocks as its contents. There are two basic kinds of container blocks: +[block quotes](#block-quote) and [list items](#list-item). +[Lists](#list) are meta-containers for [list items](#list-item). + +We define the syntax for container blocks recursively. The general +form of the definition is: + +> If X is a sequence of blocks, then the result of +> transforming X in such-and-such a way is a container of type Y +> with these blocks as its content. + +So, we explain what counts as a block quote or list item by +explaining how these can be *generated* from their contents. +This should suffice to define the syntax, although it does not +give a recipe for *parsing* these constructions. (A recipe is +provided below in the section entitled [A parsing strategy].) + +## Block quotes + +A [block quote marker](#block-quote-marker) <a id="block-quote-marker"/> +consists of 0-3 spaces of initial indent, plus (a) the character `>` together +with a following space, or (b) a single character `>` not followed by a space. + +The following rules define [block quotes](#block-quote): +<a id="block-quote"/> + +1. 
**Basic case.** If a string of lines *Ls* constitute a sequence + of blocks *Bs*, then the result of appending a [block quote marker] + to the beginning of each line in *Ls* is a [block quote](#block-quote) + containing *Bs*. + +2. **Laziness.** If a string of lines *Ls* constitute a [block + quote](#block-quote) with contents *Bs*, then the result of deleting + the initial [block quote marker](#block-quote-marker) from one or + more lines in which the next non-space character after the [block + quote marker](#block-quote-marker) is [paragraph continuation + text](#paragraph-continuation-text) is a block quote with *Bs* as + its content. [Paragraph continuation + text](#paragraph-continuation-text) is text that will be parsed as + part of the content of a paragraph, but does not occur at the + beginning of the paragraph. + +3. **Consecutiveness.** A document cannot contain two [block + quotes](#block-quote) in a row unless there is a [blank + line](#blank-line) between them. + +Nothing else counts as a [block quote](#block-quote). + +Here is a simple example: + +. +> # Foo +> bar +> baz +. +<blockquote> +<h1>Foo</h1> +<p>bar +baz</p> +</blockquote> +. + +The spaces after the `>` characters can be omitted: + +. +># Foo +>bar +> baz +. +<blockquote> +<h1>Foo</h1> +<p>bar +baz</p> +</blockquote> +. + +The `>` characters can be indented 1-3 spaces: + +. + > # Foo + > bar + > baz +. +<blockquote> +<h1>Foo</h1> +<p>bar +baz</p> +</blockquote> +. + +Four spaces gives us a code block: + +. + > # Foo + > bar + > baz +. +<pre><code>> # Foo +> bar +> baz +</code></pre> +. + +The Laziness clause allows us to omit the `>` before a +paragraph continuation line: + +. +> # Foo +> bar +baz +. +<blockquote> +<h1>Foo</h1> +<p>bar +baz</p> +</blockquote> +. + +A block quote can contain some lazy and some non-lazy +continuation lines: + +. +> bar +baz +> foo +. +<blockquote> +<p>bar +baz +foo</p> +</blockquote> +. + +Laziness only applies to lines that are continuations of +paragraphs. 
Lines containing characters or indentation that indicate +block structure cannot be lazy. + +. +> foo +--- +. +<blockquote> +<p>foo</p> +</blockquote> +<hr /> +. + +. +> - foo +- bar +. +<blockquote> +<ul> +<li>foo</li> +</ul> +</blockquote> +<ul> +<li>bar</li> +</ul> +. + +. +> foo + bar +. +<blockquote> +<pre><code>foo +</code></pre> +</blockquote> +<pre><code>bar +</code></pre> +. + +. +> ``` +foo +``` +. +<blockquote> +<pre><code></code></pre> +</blockquote> +<p>foo</p> +<pre><code></code></pre> +. + +A block quote can be empty: + +. +> +. +<blockquote> +</blockquote> +. + +. +> +> +> +. +<blockquote> +</blockquote> +. + +A block quote can have initial or final blank lines: + +. +> +> foo +> +. +<blockquote> +<p>foo</p> +</blockquote> +. + +A blank line always separates block quotes: + +. +> foo + +> bar +. +<blockquote> +<p>foo</p> +</blockquote> +<blockquote> +<p>bar</p> +</blockquote> +. + +(Most current markdown implementations, including John Gruber's +original `Markdown.pl`, will parse this example as a single block quote +with two paragraphs. But it seems better to allow the author to decide +whether two block quotes or one are wanted.) + +Consecutiveness means that if we put these block quotes together, +we get a single block quote: + +. +> foo +> bar +. +<blockquote> +<p>foo +bar</p> +</blockquote> +. + +To get a block quote with two paragraphs, use: + +. +> foo +> +> bar +. +<blockquote> +<p>foo</p> +<p>bar</p> +</blockquote> +. + +Block quotes can interrupt paragraphs: + +. +foo +> bar +. +<p>foo</p> +<blockquote> +<p>bar</p> +</blockquote> +. + +In general, blank lines are not needed before or after block +quotes: + +. +> aaa +*** +> bbb +. +<blockquote> +<p>aaa</p> +</blockquote> +<hr /> +<blockquote> +<p>bbb</p> +</blockquote> +. + +However, because of laziness, a blank line is needed between +a block quote and a following paragraph: + +. +> bar +baz +. +<blockquote> +<p>bar +baz</p> +</blockquote> +. + +. +> bar + +baz +. 
+<blockquote> +<p>bar</p> +</blockquote> +<p>baz</p> +. + +. +> bar +> +baz +. +<blockquote> +<p>bar</p> +</blockquote> +<p>baz</p> +. + +It is a consequence of the Laziness rule that any number +of initial `>`s may be omitted on a continuation line of a +nested block quote: + +. +> > > foo +bar +. +<blockquote> +<blockquote> +<blockquote> +<p>foo +bar</p> +</blockquote> +</blockquote> +</blockquote> +. + +. +>>> foo +> bar +>>baz +. +<blockquote> +<blockquote> +<blockquote> +<p>foo +bar +baz</p> +</blockquote> +</blockquote> +</blockquote> +. + +When including an indented code block in a block quote, +remember that the [block quote marker](#block-quote-marker) includes +both the `>` and a following space. So *five spaces* are needed after +the `>`: + +. +> code + +> not code +. +<blockquote> +<pre><code>code +</code></pre> +</blockquote> +<blockquote> +<p>not code</p> +</blockquote> +. + + +## List items + +A [list marker](#list-marker) <a id="list-marker"/> is a +[bullet list marker](#bullet-list-marker) or an [ordered list +marker](#ordered-list-marker). + +A [bullet list marker](#bullet-list-marker) <a id="bullet-list-marker"/> +is a `-`, `+`, or `*` character. + +An [ordered list marker](#ordered-list-marker) <a id="ordered-list-marker"/> +is a sequence of one or more digits (`0-9`), followed by either a +`.` character or a `)` character. + +The following rules define [list items](#list-item): + +1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of + blocks *Bs* starting with a non-space character and not separated + from each other by more than one blank line, and *M* is a list + marker of width *W* followed by 0 < *N* < 5 spaces, then the result + of prepending *M* and the following spaces to the first line of + *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a + list item with *Bs* as its contents. The type of the list item + (bullet or ordered) is determined by the type of its list marker. 
+ If the list item is ordered, then it is also assigned a start + number, based on the ordered list marker. + +For example, let *Ls* be the lines + +. +A paragraph +with two lines. + + indented code + +> A block quote. +. +<p>A paragraph +with two lines.</p> +<pre><code>indented code +</code></pre> +<blockquote> +<p>A block quote.</p> +</blockquote> +. + +And let *M* be the marker `1.`, and *N* = 2. Then rule #1 says +that the following is an ordered list item with start number 1, +and the same contents as *Ls*: + +. +1. A paragraph + with two lines. + + indented code + + > A block quote. +. +<ol> +<li><p>A paragraph +with two lines.</p> +<pre><code>indented code +</code></pre> +<blockquote> +<p>A block quote.</p> +</blockquote></li> +</ol> +. + +The most important thing to notice is that the position of +the text after the list marker determines how much indentation +is needed in subsequent blocks in the list item. If the list +marker takes up two spaces, and there are three spaces between +the list marker and the next nonspace character, then blocks +must be indented five spaces in order to fall under the list +item. + +Here are some examples showing how far content must be indented to be +put under the list item: + +. +- one + + two +. +<ul> +<li>one</li> +</ul> +<p>two</p> +. + +. +- one + + two +. +<ul> +<li><p>one</p> +<p>two</p></li> +</ul> +. + +. + - one + + two +. +<ul> +<li>one</li> +</ul> +<pre><code> two +</code></pre> +. + +. + - one + + two +. +<ul> +<li><p>one</p> +<p>two</p></li> +</ul> +. + +It is tempting to think of this in terms of columns: the continuation +blocks must be indented at least to the column of the first nonspace +character after the list marker. However, that is not quite right. +The spaces after the list marker determine how much relative indentation +is needed. Which column this indentation reaches will depend on +how the list item is embedded in other constructions, as shown by +this example: + +. + > > 1. one +>> +>> two +. 
+<blockquote> +<blockquote> +<ol> +<li><p>one</p> +<p>two</p></li> +</ol> +</blockquote> +</blockquote> +. + +Here `two` occurs in the same column as the list marker `1.`, +but is actually contained in the list item, because there is +sufficient indentation after the last containing blockquote marker. + +The converse is also possible. In the following example, the word `two` +occurs far to the right of the initial text of the list item, `one`, but +it is not considered part of the list item, because it is not indented +far enough past the blockquote marker: + +. +>>- one +>> + > > two +. +<blockquote> +<blockquote> +<ul> +<li>one</li> +</ul> +<p>two</p> +</blockquote> +</blockquote> +. + +A list item may not contain blocks that are separated by more than +one blank line. Thus, two blank lines will end a list: + +. +- foo + + bar + +- foo + + + bar +. +<ul> +<li><p>foo</p> +<p>bar</p></li> +<li><p>foo</p></li> +</ul> +<p>bar</p> +. + +A list item may contain any kind of block: + +. +1. foo + + ``` + bar + ``` + + baz + + > bam +. +<ol> +<li><p>foo</p> +<pre><code>bar +</code></pre> +<p>baz</p> +<blockquote> +<p>bam</p> +</blockquote></li> +</ol> +. + +2. **Item starting with indented code.** If a sequence of lines *Ls* + constitute a sequence of blocks *Bs* starting with an indented code + block and not separated from each other by more than one blank line, + and *M* is a list marker of width *W* followed by + one space, then the result of prepending *M* and the following + space to the first line of *Ls*, and indenting subsequent lines of + *Ls* by *W + 1* spaces, is a list item with *Bs* as its contents. + If a line is empty, then it need not be indented. The type of the + list item (bullet or ordered) is determined by the type of its list + marker. If the list item is ordered, then it is also assigned a + start number, based on the ordered list marker. 
+ +An indented code block will have to be indented four spaces beyond +the edge of the region where text will be included in the list item. +In the following case that is 6 spaces: + +. +- foo + + bar +. +<ul> +<li><p>foo</p> +<pre><code>bar +</code></pre></li> +</ul> +. + +And in this case it is 11 spaces: + +. + 10. foo + + bar +. +<ol start="10"> +<li><p>foo</p> +<pre><code>bar +</code></pre></li> +</ol> +. + +If the *first* block in the list item is an indented code block, +then by rule #2, the contents must be indented *one* space after the +list marker: + +. + indented code + +paragraph + + more code +. +<pre><code>indented code +</code></pre> +<p>paragraph</p> +<pre><code>more code +</code></pre> +. + +. +1. indented code + + paragraph + + more code +. +<ol> +<li><pre><code>indented code +</code></pre> +<p>paragraph</p> +<pre><code>more code +</code></pre></li> +</ol> +. + +Note that an additional space indent is interpreted as space +inside the code block: + +. +1. indented code + + paragraph + + more code +. +<ol> +<li><pre><code> indented code +</code></pre> +<p>paragraph</p> +<pre><code>more code +</code></pre></li> +</ol> +. + +Note that rules #1 and #2 only apply to two cases: (a) cases +in which the lines to be included in a list item begin with a nonspace +character, and (b) cases in which they begin with an indented code +block. In a case like the following, where the first block begins with +a three-space indent, the rules do not allow us to form a list item by +indenting the whole thing and prepending a list marker: + +. + foo + +bar +. +<p>foo</p> +<p>bar</p> +. + +. +- foo + + bar +. +<ul> +<li>foo</li> +</ul> +<p>bar</p> +. + +This is not a significant restriction, because when a block begins +with 1-3 spaces indent, the indentation can always be removed without +a change in interpretation, allowing rule #1 to be applied. So, in +the above case: + +. +- foo + + bar +. +<ul> +<li><p>foo</p> +<p>bar</p></li> +</ul> +. + + +3. 
**Indentation.** If a sequence of lines *Ls* constitutes a list item + according to rule #1 or #2, then the result of indenting each line + of *Ls* by 1-3 spaces (the same for each line) also constitutes a + list item with the same contents and attributes. If a line is + empty, then it need not be indented. + +Indented one space: + +. + 1. A paragraph + with two lines. + + indented code + + > A block quote. +. +<ol> +<li><p>A paragraph +with two lines.</p> +<pre><code>indented code +</code></pre> +<blockquote> +<p>A block quote.</p> +</blockquote></li> +</ol> +. + +Indented two spaces: + +. + 1. A paragraph + with two lines. + + indented code + + > A block quote. +. +<ol> +<li><p>A paragraph +with two lines.</p> +<pre><code>indented code +</code></pre> +<blockquote> +<p>A block quote.</p> +</blockquote></li> +</ol> +. + +Indented three spaces: + +. + 1. A paragraph + with two lines. + + indented code + + > A block quote. +. +<ol> +<li><p>A paragraph +with two lines.</p> +<pre><code>indented code +</code></pre> +<blockquote> +<p>A block quote.</p> +</blockquote></li> +</ol> +. + +Four spaces indent gives a code block: + +. + 1. A paragraph + with two lines. + + indented code + + > A block quote. +. +<pre><code>1. A paragraph + with two lines. + + indented code + + > A block quote. +</code></pre> +. + + +4. **Laziness.** If a string of lines *Ls* constitute a [list + item](#list-item) with contents *Bs*, then the result of deleting + some or all of the indentation from one or more lines in which the + next non-space character after the [list marker](#list-marker) is + [paragraph continuation text](#paragraph-continuation-text) is a + list item with the same contents and attributes. + +Here is an example with lazy continuation lines: + +. + 1. A paragraph +with two lines. + + indented code + + > A block quote. +. +<ol> +<li><p>A paragraph +with two lines.</p> +<pre><code>indented code +</code></pre> +<blockquote> +<p>A block quote.</p> +</blockquote></li> +</ol> +. 
+ +Indentation can be partially deleted: + +. + 1. A paragraph + with two lines. +. +<ol> +<li>A paragraph +with two lines.</li> +</ol> +. + +These examples show how laziness can work in nested structures: + +. +> 1. > Blockquote +continued here. +. +<blockquote> +<ol> +<li><blockquote> +<p>Blockquote +continued here.</p> +</blockquote></li> +</ol> +</blockquote> +. + +. +> 1. > Blockquote +> continued here. +. +<blockquote> +<ol> +<li><blockquote> +<p>Blockquote +continued here.</p> +</blockquote></li> +</ol> +</blockquote> +. + + +5. **That's all.** Nothing that is not counted as a list item by rules + #1--4 counts as a [list item](#list-item). + +The rules for sublists follow from the general rules above. A sublist +must be indented the same number of spaces a paragraph would need to be +in order to be included in the list item. + +So, in this case we need two spaces indent: + +. +- foo + - bar + - baz +. +<ul> +<li>foo +<ul> +<li>bar +<ul> +<li>baz</li> +</ul></li> +</ul></li> +</ul> +. + +One is not enough: + +. +- foo + - bar + - baz +. +<ul> +<li>foo</li> +<li>bar</li> +<li>baz</li> +</ul> +. + +Here we need four, because the list marker is wider: + +. +10) foo + - bar +. +<ol start="10"> +<li>foo +<ul> +<li>bar</li> +</ul></li> +</ol> +. + +Three is not enough: + +. +10) foo + - bar +. +<ol start="10"> +<li>foo</li> +</ol> +<ul> +<li>bar</li> +</ul> +. + +A list may be the first block in a list item: + +. +- - foo +. +<ul> +<li><ul> +<li>foo</li> +</ul></li> +</ul> +. + +. +1. - 2. foo +. +<ol> +<li><ul> +<li><ol start="2"> +<li>foo</li> +</ol></li> +</ul></li> +</ol> +. + +A list item may be empty: + +. +- foo +- +- bar +. +<ul> +<li>foo</li> +<li></li> +<li>bar</li> +</ul> +. + +. +- +. +<ul> +<li></li> +</ul> +. + +### Motivation + +John Gruber's markdown spec says the following about list items: + +1. "List markers typically start at the left margin, but may be indented + by up to three spaces. 
List markers must be followed by one or more + spaces or a tab." + +2. "To make lists look nice, you can wrap items with hanging indents.... + But if you don't want to, you don't have to." + +3. "List items may consist of multiple paragraphs. Each subsequent + paragraph in a list item must be indented by either 4 spaces or one + tab." + +4. "It looks nice if you indent every line of the subsequent paragraphs, + but here again, Markdown will allow you to be lazy." + +5. "To put a blockquote within a list item, the blockquote's `>` + delimiters need to be indented." + +6. "To put a code block within a list item, the code block needs to be + indented twice — 8 spaces or two tabs." + +These rules specify that a paragraph under a list item must be indented +four spaces (presumably, from the left margin, rather than the start of +the list marker, but this is not said), and that code under a list item +must be indented eight spaces instead of the usual four. They also say +that a block quote must be indented, but not by how much; however, the +example given has four spaces indentation. Although nothing is said +about other kinds of block-level content, it is certainly reasonable to +infer that *all* block elements under a list item, including other +lists, must be indented four spaces. This principle has been called the +*four-space rule*. + +The four-space rule is clear and principled, and if the reference +implementation `Markdown.pl` had followed it, it probably would have +become the standard. However, `Markdown.pl` allowed paragraphs and +sublists to start with only two spaces indentation, at least on the +outer level. Worse, its behavior was inconsistent: a sublist of an +outer-level list needed two spaces indentation, but a sublist of this +sublist needed three spaces. It is not surprising, then, that different +implementations of markdown have developed very different rules for +determining what comes under a list item. 
(Pandoc and python-markdown, +for example, stuck with Gruber's syntax description and the four-space +rule, while discount, redcarpet, marked, PHP markdown, and others +followed `Markdown.pl`'s behavior more closely.) + +Unfortunately, given the divergences between implementations, there +is no way to give a spec for list items that will be guaranteed not +to break any existing documents. However, the spec given here should +correctly handle lists formatted with either the four-space rule or +the more forgiving `Markdown.pl` behavior, provided they are laid out +in a way that is natural for a human to read. + +The strategy here is to let the width and indentation of the list marker +determine the indentation necessary for blocks to fall under the list +item, rather than having a fixed and arbitrary number. The writer can +think of the body of the list item as a unit which gets indented to the +right enough to fit the list marker (and any indentation on the list +marker). (The laziness rule, #4, then allows continuation lines to be +unindented if needed.) + +This rule is superior, we claim, to any rule requiring a fixed level of +indentation from the margin. The four-space rule is clear but +unnatural. It is quite unintuitive that + +``` markdown +- foo + + bar + + - baz +``` + +should be parsed as two lists with an intervening paragraph, + +``` html +<ul> +<li>foo</li> +</ul> +<p>bar</p> +<ul> +<li>baz</li> +</ul> +``` + +as the four-space rule demands, rather than a single list, + +``` html +<ul> +<li><p>foo</p> +<p>bar</p></li> +<li><p>baz</p></li> +</ul> +``` + +The choice of four spaces is arbitrary. It can be learned, but it is +not likely to be guessed, and it trips up beginners regularly. + +Would it help to adopt a two-space rule? The problem is that such +a rule, together with the rule allowing 1--3 spaces indentation of the +initial list marker, allows text that is indented *less than* the +original list marker to be included in the list item. 
For example, +`Markdown.pl` parses + +``` markdown + - one + + two +``` + +as a single list item, with `two` a continuation paragraph: + +``` html +<ul> +<li><p>one</p> +<p>two</p></li> +</ul> +``` + +and similarly + +``` markdown +> - one +> +> two +``` + +as + +``` html +<blockquote> +<ul> +<li><p>one</p> +<p>two</p></li> +</ul> +</blockquote> +``` + +This is extremely unintuitive. + +Rather than requiring a fixed indent from the margin, we could require +a fixed indent (say, two spaces, or even one space) from the list marker (which +may itself be indented). This proposal would remove the last anomaly +discussed. Unlike the spec presented above, it would count the following +as a list item with a subparagraph, even though the paragraph `bar` +is not indented as far as the first paragraph `foo`: + +``` markdown + 10. foo + + bar +``` + +Arguably this text does read like a list item with `bar` as a subparagraph, +which may count in favor of the proposal. However, on this proposal indented +code would have to be indented six spaces after the list marker. And this +would break a lot of existing markdown, which has the pattern: + +``` markdown +1. foo + + indented code +``` + +where the code is indented eight spaces. The spec above, by contrast, will +parse this text as expected, since the code block's indentation is measured +from the beginning of `foo`. + +The one case that needs special treatment is a list item that *starts* +with indented code. How much indentation is required in that case, since +we don't have a "first paragraph" to measure from? Rule #2 simply stipulates +that in such cases, we require one space indentation from the list marker +(and then the normal four spaces for the indented code). This will match the +four-space rule in cases where the list marker plus its initial indentation +takes four spaces (a common case), but diverge in other cases. 
+ +## Lists + +A [list](#list) <a id="list"/> is a sequence of one or more +list items [of the same type](#of-the-same-type). The list items +may be separated by single [blank lines](#blank-line), but two +blank lines end all containing lists. + +Two list items are [of the same type](#of-the-same-type) +<a id="of-the-same-type"/> if they begin with a [list +marker](#list-marker) of the same type. Two list markers are of the +same type if (a) they are bullet list markers using the same character +(`-`, `+`, or `*`) or (b) they are ordered list numbers with the same +delimiter (either `.` or `)`). + +A list is an [ordered list](#ordered-list) <a id="ordered-list"/> +if its constituent list items begin with +[ordered list markers](#ordered-list-marker), and a [bullet +list](#bullet-list) <a id="bullet-list"/> if its constituent list +items begin with [bullet list markers](#bullet-list-marker). + +The [start number](#start-number) <a id="start-number"/> +of an [ordered list](#ordered-list) is determined by the list number of +its initial list item. The numbers of subsequent list items are +disregarded. + +A list is [loose](#loose) if any of its constituent list items are +separated by blank lines, or if any of its constituent list items +directly contain two block-level elements with a blank line between +them. Otherwise a list is [tight](#tight). (The difference in HTML output +is that paragraphs in a loose list are wrapped in `<p>` tags, while +paragraphs in a tight list are not.) + +Changing the bullet or ordered list delimiter starts a new list: + +. +- foo +- bar ++ baz +. +<ul> +<li>foo</li> +<li>bar</li> +</ul> +<ul> +<li>baz</li> +</ul> +. + +. +1. foo +2. bar +3) baz +. +<ol> +<li>foo</li> +<li>bar</li> +</ol> +<ol start="3"> +<li>baz</li> +</ol> +. + +There can be blank lines between items, but two blank lines end +a list: + +. +- foo + +- bar + + +- baz +. +<ul> +<li><p>foo</p></li> +<li><p>bar</p></li> +</ul> +<ul> +<li>baz</li> +</ul> +. 
+ +As illustrated above in the section on [list items](#list-item), +two blank lines between blocks *within* a list item will also end a +list: + +. +- foo + + + bar +- baz +. +<ul> +<li>foo</li> +</ul> +<p>bar</p> +<ul> +<li>baz</li> +</ul> +. + +Indeed, two blank lines will end *all* containing lists: + +. +- foo + - bar + - baz + + + bim +. +<ul> +<li>foo +<ul> +<li>bar +<ul> +<li>baz</li> +</ul></li> +</ul></li> +</ul> +<pre><code> bim +</code></pre> +. + +Thus, two blank lines can be used to separate consecutive lists of +the same type, or to separate a list from an indented code block +that would otherwise be parsed as a subparagraph of the final list +item: + +. +- foo +- bar + + +- baz +- bim +. +<ul> +<li>foo</li> +<li>bar</li> +</ul> +<ul> +<li>baz</li> +<li>bim</li> +</ul> +. + +. +- foo + + notcode + +- foo + + + code +. +<ul> +<li><p>foo</p> +<p>notcode</p></li> +<li><p>foo</p></li> +</ul> +<pre><code>code +</code></pre> +. + +List items need not be indented to the same level. The following +list items will be treated as items at the same list level, +since none is indented enough to belong to the previous list +item: + +. +- a + - b + - c + - d + - e + - f +- g +. +<ul> +<li>a</li> +<li>b</li> +<li>c</li> +<li>d</li> +<li>e</li> +<li>f</li> +<li>g</li> +</ul> +. + +This is a loose list, because there is a blank line between +two of the list items: + +. +- a +- b + +- c +. +<ul> +<li><p>a</p></li> +<li><p>b</p></li> +<li><p>c</p></li> +</ul> +. + +So is this, with an empty second item: + +. +* a +* + +* c +. +<ul> +<li><p>a</p></li> +<li></li> +<li><p>c</p></li> +</ul> +. + +These are loose lists, even though there is no space between the items, +because one of the items directly contains two block-level elements +with a blank line between them: + +. +- a +- b + + c +- d +. +<ul> +<li><p>a</p></li> +<li><p>b</p> +<p>c</p></li> +<li><p>d</p></li> +</ul> +. + +. +- a +- b + + [ref]: /url +- d +. 
+<ul> +<li><p>a</p></li> +<li><p>b</p></li> +<li><p>d</p></li> +</ul> +. + +This is a tight list, because the blank lines are in a code block: + +. +- a +- ``` + b + + + ``` +- c +. +<ul> +<li>a</li> +<li><pre><code>b + + +</code></pre></li> +<li>c</li> +</ul> +. + +This is a tight list, because the blank line is between two +paragraphs of a sublist. So the inner list is loose while +the outer list is tight: + +. +- a + - b + + c +- d +. +<ul> +<li>a +<ul> +<li><p>b</p> +<p>c</p></li> +</ul></li> +<li>d</li> +</ul> +. + +This is a tight list, because the blank line is inside the +block quote: + +. +* a + > b + > +* c +. +<ul> +<li>a +<blockquote> +<p>b</p> +</blockquote></li> +<li>c</li> +</ul> +. + +This list is tight, because the consecutive block elements +are not separated by blank lines: + +. +- a + > b + ``` + c + ``` +- d +. +<ul> +<li>a +<blockquote> +<p>b</p> +</blockquote> +<pre><code>c +</code></pre></li> +<li>d</li> +</ul> +. + +A single-paragraph list is tight: + +. +- a +. +<ul> +<li>a</li> +</ul> +. + +. +- a + - b +. +<ul> +<li>a +<ul> +<li>b</li> +</ul></li> +</ul> +. + +Here the outer list is loose, the inner list tight: + +. +* foo + * bar + + baz +. +<ul> +<li><p>foo</p> +<ul> +<li>bar</li> +</ul> +<p>baz</p></li> +</ul> +. + +. +- a + - b + - c + +- d + - e + - f +. +<ul> +<li><p>a</p> +<ul> +<li>b</li> +<li>c</li> +</ul></li> +<li><p>d</p> +<ul> +<li>e</li> +<li>f</li> +</ul></li> +</ul> +. + +# Inlines + +Inlines are parsed sequentially from the beginning of the character +stream to the end (left to right, in left-to-right languages). +Thus, for example, in + +. +`hi`lo` +. +<p><code>hi</code>lo`</p> +. + +`hi` is parsed as code, leaving the backtick at the end as a literal +backtick. + +## Backslash escapes + +Any ASCII punctuation character may be backslash-escaped: + +. +\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~ +. +<p>!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~</p> +. 
+ +Backslashes before other characters are treated as literal +backslashes: + +. +\→\A\a\ \3\φ\« +. +<p>\ \A\a\ \3\φ\«</p> +. + +Escaped characters are treated as regular characters and do +not have their usual markdown meanings: + +. +\*not emphasized* +\<br/> not a tag +\[not a link](/foo) +\`not code` +1\. not a list +\* not a list +\# not a header +\[foo]: /url "not a reference" +. +<p>*not emphasized* +<br/> not a tag +[not a link](/foo) +`not code` +1. not a list +* not a list +# not a header +[foo]: /url "not a reference"</p> +. + +If a backslash is itself escaped, the following character is not: + +. +\\*emphasis* +. +<p>\<em>emphasis</em></p> +. + +A backslash at the end of the line is a hard line break: + +. +foo\ +bar +. +<p>foo<br /> +bar</p> +. + +Backslash escapes do not work in code blocks, code spans, autolinks, or +raw HTML: + +. +`` \[\` `` +. +<p><code>\[\`</code></p> +. + +. + \[\] +. +<pre><code>\[\] +</code></pre> +. + +. +~~~ +\[\] +~~~ +. +<pre><code>\[\] +</code></pre> +. + +. +<http://google.com?find=\*> +. +<p><a href="http://google.com?find=\*">http://google.com?find=\*</a></p> +. + +. +<a href="/bar\/)"> +. +<p><a href="/bar\/)"></p> +. + +But they work in all other contexts, including URLs and link titles, +link references, and info strings in [fenced code +blocks](#fenced-code-block): + +. +[foo](/bar\* "ti\*tle") +. +<p><a href="/bar*" title="ti*tle">foo</a></p> +. + +. +[foo] + +[foo]: /bar\* "ti\*tle" +. +<p><a href="/bar*" title="ti*tle">foo</a></p> +. + +. +``` foo\+bar +foo +``` +. +<pre class="foo+bar"><code>foo +</code></pre> +. + + +## Entities + +Entities are parsed as entities, not as literal text, in all contexts +except code spans and code blocks. Three kinds of entities are recognized. + +[Named entities](#name-entities) <a id="named-entities"/> consist of `&` ++ a string of 2-32 alphanumerics beginning with a letter + `;`. + +. + & © Æ Ď ¾ ℋ ⅆ ∲ +. +<p> & © Æ Ď ¾ ℋ ⅆ ∲</p> +. 
+ +[Decimal entities](#decimal-entities) <a id="decimal-entities"/> +consist of `&` + a string of 1--8 arabic digits + `;`. + +. + # Ӓ Ϡ � +. +<p> # Ӓ Ϡ �</p> +. + +[Hexadecimal entities](#hexadecimal-entities) <a id="hexadecimal-entities"/> +consist of `&` + either `X` or `x` + a string of 1-8 hexadecimal digits ++ `;`. + +. + " ആ ಫ +. +<p> " ആ ಫ</p> +. + +Here are some nonentities: + +. +  &x; &#; &#x; � &ThisIsWayTooLongToBeAnEntityIsntIt; &hi?; +. +<p>&nbsp &x; &#; &#x; &#123456789; &ThisIsWayTooLongToBeAnEntityIsntIt; &hi?;</p> +. + +Although HTML5 does accept some entities without a trailing semicolon +(such as `©`), these are not recognized as entities here: + +. +© +. +<p>&copy</p> +. + +On the other hand, many strings that are not on the list of HTML5 +named entities are recognized as entities here: + +. +&MadeUpEntity; +. +<p>&MadeUpEntity;</p> +. + +Entities are recognized in any any context besides code spans or +code blocks, including raw HTML, URLs, [link titles](#link-title), and +[fenced code block](#fenced-code-block) info strings: + +. +<a href="öö.html"> +. +<p><a href="öö.html"></p> +. + +. +[foo](/föö "föö") +. +<p><a href="/föö" title="föö">foo</a></p> +. + +. +[foo] + +[foo]: /föö "föö" +. +<p><a href="/föö" title="föö">foo</a></p> +. + +. +``` föö +foo +``` +. +<pre class="föö"><code>foo +</code></pre> +. + +Entities are treated as literal text in code spans and code blocks: + +. +`föö` +. +<p><code>f&ouml;&ouml;</code></p> +. + +. + föfö +. +<pre><code>f&ouml;f&ouml; +</code></pre> +. + +## Code span + +A [backtick string](#backtick-string) <a id="backtick-string"/> +is a string of one or more backtick characters (`` ` ``) that is neither +preceded nor followed by a backtick. + +A code span begins with a backtick string and ends with a backtick +string of equal length. 
The contents of the code span are the +characters between the two backtick strings, with leading and trailing +spaces and newlines removed, and consecutive spaces and newlines +collapsed to single spaces. + +This is a simple code span: + +. +`foo` +. +<p><code>foo</code></p> +. + +Here two backticks are used, because the code contains a backtick. +This example also illustrates stripping of leading and trailing spaces: + +. +`` foo ` bar `` +. +<p><code>foo ` bar</code></p> +. + +This example shows the motivation for stripping leading and trailing +spaces: + +. +` `` ` +. +<p><code>``</code></p> +. + +Newlines are treated like spaces: + +. +`` +foo +`` +. +<p><code>foo</code></p> +. + +Interior spaces and newlines are collapsed into single spaces, just +as they would be by a browser: + +. +`foo bar + baz` +. +<p><code>foo bar baz</code></p> +. + +Q: Why not just leave the spaces, since browsers will collapse them +anyway? A: Because we might be targeting a non-HTML format, and we +shouldn't rely on HTML-specific rendering assumptions. + +(Existing implementations differ in their treatment of internal +spaces and newlines. Some, including `Markdown.pl` and +`showdown`, convert an internal newline into a `<br />` tag. +But this makes things difficult for those who like to hard-wrap +their paragraphs, since a line break in the midst of a code +span will cause an unintended line break in the output. Others +just leave internal spaces as they are, which is fine if only +HTML is being targeted.) + +. +`foo `` bar` +. +<p><code>foo `` bar</code></p> +. + +Note that backslash escapes do not work in code spans. All backslashes +are treated literally: + +. +`foo\`bar` +. +<p><code>foo\</code>bar`</p> +. + +Backslash escapes are never needed, because one can always choose a +string of *n* backtick characters as delimiters, where the code does +not contain any strings of exactly *n* backtick characters. 
+ +Code span backticks have higher precedence than any other inline +constructs except HTML tags and autolinks. Thus, for example, this is +not parsed as emphasized text, since the second `*` is part of a code +span: + +. +*foo`*` +. +<p>*foo<code>*</code></p> +. + +And this is not parsed as a link: + +. +[not a `link](/foo`) +. +<p>[not a <code>link](/foo</code>)</p> +. + +But this is a link: + +. +<http://foo.bar.`baz>` +. +<p><a href="http://foo.bar.`baz">http://foo.bar.`baz</a>`</p> +. + +And this is an HTML tag: + +. +<a href="`">` +. +<p><a href="`">`</p> +. + +When a backtick string is not closed by a matching backtick string, +we just have literal backticks: + +. +```foo`` +. +<p>```foo``</p> +. + +. +`foo +. +<p>`foo</p> +. + +## Emphasis and strong emphasis + +John Gruber's original [markdown syntax +description](http://daringfireball.net/projects/markdown/syntax#em) says: + +> Markdown treats asterisks (`*`) and underscores (`_`) as indicators of +> emphasis. Text wrapped with one `*` or `_` will be wrapped with an HTML +> `<em>` tag; double `*`'s or `_`'s will be wrapped with an HTML `<strong>` +> tag. + +This is enough for most users, but these rules leave much undecided, +especially when it comes to nested emphasis. The original +`Markdown.pl` test suite makes it clear that triple `***` and +`___` delimiters can be used for strong emphasis, and most +implementations have also allowed the following patterns: + +``` markdown +***strong emph*** +***strong** in emph* +***emph* in strong** +**in strong *emph*** +*in emph **strong*** +``` + +The following patterns are less widely supported, but the intent +is clear and they are useful (especially in contexts like bibliography +entries): + +``` markdown +*emph *with emph* in it* +**strong **with strong** in it** +``` + +Many implementations have also restricted intraword emphasis to +the `*` forms, to avoid unwanted emphasis in words containing +internal underscores. 
(It is best practice to put these in code +spans, but users often do not.) + +``` markdown +internal emphasis: foo*bar*baz +no emphasis: foo_bar_baz +``` + +The following rules capture all of these patterns, while allowing +for efficient parsing strategies that do not backtrack: + +1. A single `*` character [can open emphasis](#can-open-emphasis) + <a id="can-open-emphasis"/> iff + + (a) it is not part of a sequence of four or more unescaped `*`s, + (b) it is not followed by whitespace, and + (c) either it is not followed by a `*` character or it is + followed immediately by strong emphasis. + +2. A single `_` character [can open emphasis](#can-open-emphasis) iff + + (a) it is not part of a sequence of four or more unescaped `_`s, + (b) it is not followed by whitespace, + (c) it is not preceded by an ASCII alphanumeric character, and + (d) either it is not followed by a `_` character or it is + followed immediately by strong emphasis. + +3. A single `*` character [can close emphasis](#can-close-emphasis) + <a id="can-close-emphasis"/> iff + + (a) it is not part of a sequence of four or more unescaped `*`s, and + (b) it is not preceded by whitespace. + +4. A single `_` character [can close emphasis](#can-close-emphasis) iff + + (a) it is not part of a sequence of four or more unescaped `_`s, + (b) it is not preceded by whitespace, and + (c) it is not followed by an ASCII alphanumeric character. + +5. A double `**` [can open strong emphasis](#can-open-strong-emphasis) + <a id="can-open-strong-emphasis" /> iff + + (a) it is not part of a sequence of four or more unescaped `*`s, + (b) it is not followed by whitespace, and + (c) either it is not followed by a `*` character or it is + followed immediately by emphasis. + +6. 
A double `__` [can open strong emphasis](#can-open-strong-emphasis) + iff + + (a) it is not part of a sequence of four or more unescaped `_`s, + (b) it is not followed by whitespace, and + (c) it is not preceded by an ASCII alphanumeric character, and + (d) either it is not followed by a `_` character or it is + followed immediately by emphasis. + +7. A double `**` [can close strong emphasis](#can-close-strong-emphasis) + <a id="can-close-strong-emphasis" /> iff + + (a) it is not part of a sequence of four or more unescaped `*`s, and + (b) it is not preceded by whitespace. + +8. A double `__` [can close strong emphasis](#can-close-strong-emphasis) + iff + + (a) it is not part of a sequence of four or more unescaped `_`s, + (b) it is not preceded by whitespace, and + (c) it is not followed by an ASCII alphanumeric character. + +9. Emphasis begins with a delimiter that [can open + emphasis](#can-open-emphasis) and includes inlines parsed + sequentially until a delimiter that [can close + emphasis](#can-close-emphasis), and that uses the same + character (`_` or `*`) as the opening delimiter, is reached. + +10. Strong emphasis begins with a delimiter that [can open strong + emphasis](#can-open-strong-emphasis) and includes inlines parsed + sequentially until a delimiter that [can close strong + emphasis](#can-close-strong-emphasis), and that uses the + same character (`_` or `*`) as the opening delimiter, is reached. + +These rules can be illustrated through a series of examples. + +Simple emphasis: + +. +*foo bar* +. +<p><em>foo bar</em></p> +. + +. +_foo bar_ +. +<p><em>foo bar</em></p> +. + +Simple strong emphasis: + +. +**foo bar** +. +<p><strong>foo bar</strong></p> +. + +. +__foo bar__ +. +<p><strong>foo bar</strong></p> +. + +Emphasis can continue over line breaks: + +. +*foo +bar* +. +<p><em>foo +bar</em></p> +. + +. +_foo +bar_ +. +<p><em>foo +bar</em></p> +. + +. +**foo +bar** +. +<p><strong>foo +bar</strong></p> +. + +. +__foo +bar__ +. 
+<p><strong>foo +bar</strong></p> +. + +Emphasis can contain other inline constructs: + +. +*foo [bar](/url)* +. +<p><em>foo <a href="/url">bar</a></em></p> +. + +. +_foo [bar](/url)_ +. +<p><em>foo <a href="/url">bar</a></em></p> +. + +. +**foo [bar](/url)** +. +<p><strong>foo <a href="/url">bar</a></strong></p> +. + +. +__foo [bar](/url)__ +. +<p><strong>foo <a href="/url">bar</a></strong></p> +. + +Symbols contained in other inline constructs will not +close emphasis: + +. +*foo [bar*](/url) +. +<p>*foo <a href="/url">bar*</a></p> +. + +. +_foo [bar_](/url) +. +<p>_foo <a href="/url">bar_</a></p> +. + +. +**<a href="**"> +. +<p>**<a href="**"></p> +. + +. +__<a href="__"> +. +<p>__<a href="__"></p> +. + +. +*a `*`* +. +<p><em>a <code>*</code></em></p> +. + +. +_a `_`_ +. +<p><em>a <code>_</code></em></p> +. + +. +**a<http://foo.bar?q=**> +. +<p>**a<a href="http://foo.bar?q=**">http://foo.bar?q=**</a></p> +. + +. +__a<http://foo.bar?q=__> +. +<p>__a<a href="http://foo.bar?q=__">http://foo.bar?q=__</a></p> +. + +This is not emphasis, because the opening delimiter is +followed by white space: + +. +and * foo bar* +. +<p>and * foo bar*</p> +. + +. +_ foo bar_ +. +<p>_ foo bar_</p> +. + +. +and ** foo bar** +. +<p>and ** foo bar**</p> +. + +. +__ foo bar__ +. +<p>__ foo bar__</p> +. + +This is not emphasis, because the closing delimiter is +preceded by white space: + +. +and *foo bar * +. +<p>and *foo bar *</p> +. + +. +and _foo bar _ +. +<p>and _foo bar _</p> +. + +. +and **foo bar ** +. +<p>and **foo bar **</p> +. + +. +and __foo bar __ +. +<p>and __foo bar __</p> +. + +The rules imply that a sequence of four or more unescaped `*` or +`_` characters will always be parsed as a literal string: + +. +****hi**** +. +<p>****hi****</p> +. + +. +_____hi_____ +. +<p>_____hi_____</p> +. + +. +Sign here: _________ +. +<p>Sign here: _________</p> +. + +The rules also imply that there can be no empty emphasis or strong +emphasis: + +. +** is not an empty emphasis +. 
+<p>** is not an empty emphasis</p> +. + +. +**** is not an empty strong emphasis +. +<p>**** is not an empty strong emphasis</p> +. + +To include `*` or `_` in emphasized sections, use backslash escapes +or code spans: + +. +*here is a \** +. +<p><em>here is a *</em></p> +. + +. +__this is a double underscore (`__`)__ +. +<p><strong>this is a double underscore (<code>__</code>)</strong></p> +. + +`*` delimiters allow intra-word emphasis; `_` delimiters do not: + +. +foo*bar*baz +. +<p>foo<em>bar</em>baz</p> +. + +. +foo_bar_baz +. +<p>foo_bar_baz</p> +. + +. +foo__bar__baz +. +<p>foo__bar__baz</p> +. + +. +_foo_bar_baz_ +. +<p><em>foo_bar_baz</em></p> +. + +. +11*15*32 +. +<p>11<em>15</em>32</p> +. + +. +11_15_32 +. +<p>11_15_32</p> +. + +Internal underscores will be ignored in underscore-delimited +emphasis: + +. +_foo_bar_baz_ +. +<p><em>foo_bar_baz</em></p> +. + +. +__foo__bar__baz__ +. +<p><strong>foo__bar__baz</strong></p> +. + +The rules are sufficient for the following nesting patterns: + +. +***foo bar*** +. +<p><strong><em>foo bar</em></strong></p> +. + +. +___foo bar___ +. +<p><strong><em>foo bar</em></strong></p> +. + +. +***foo** bar* +. +<p><em><strong>foo</strong> bar</em></p> +. + +. +___foo__ bar_ +. +<p><em><strong>foo</strong> bar</em></p> +. + +. +***foo* bar** +. +<p><strong><em>foo</em> bar</strong></p> +. + +. +___foo_ bar__ +. +<p><strong><em>foo</em> bar</strong></p> +. + +. +*foo **bar*** +. +<p><em>foo <strong>bar</strong></em></p> +. + +. +_foo __bar___ +. +<p><em>foo <strong>bar</strong></em></p> +. + +. +**foo *bar*** +. +<p><strong>foo <em>bar</em></strong></p> +. + +. +__foo _bar___ +. +<p><strong>foo <em>bar</em></strong></p> +. + +. +*foo **bar*** +. +<p><em>foo <strong>bar</strong></em></p> +. + +. +_foo __bar___ +. +<p><em>foo <strong>bar</strong></em></p> +. + +. +*foo *bar* baz* +. +<p><em>foo <em>bar</em> baz</em></p> +. + +. +_foo _bar_ baz_ +. +<p><em>foo <em>bar</em> baz</em></p> +. + +. +**foo **bar** baz** +. 
+<p><strong>foo <strong>bar</strong> baz</strong></p> +. + +. +__foo __bar__ baz__ +. +<p><strong>foo <strong>bar</strong> baz</strong></p> +. + +. +*foo **bar** baz* +. +<p><em>foo <strong>bar</strong> baz</em></p> +. + +. +_foo __bar__ baz_ +. +<p><em>foo <strong>bar</strong> baz</em></p> +. + +. +**foo *bar* baz** +. +<p><strong>foo <em>bar</em> baz</strong></p> +. + +. +__foo _bar_ baz__ +. +<p><strong>foo <em>bar</em> baz</strong></p> +. + +Note that you cannot nest emphasis directly inside emphasis +using the same delimiter, or strong emphasis directly inside +strong emphasis: + +. +**foo** +. +<p><strong>foo</strong></p> +. + +. +****foo**** +. +<p>****foo****</p> +. + +For these nestings, you need to switch delimiters: + +. +*_foo_* +. +<p><em><em>foo</em></em></p> +. + +. +**__foo__** +. +<p><strong><strong>foo</strong></strong></p> +. + +Note that a `*` followed by a `*` can close emphasis, and +a `**` followed by a `*` can close strong emphasis (and +similarly for `_` and `__`): + +. +*foo** +. +<p><em>foo</em>*</p> +. + +. +*foo *bar** +. +<p><em>foo <em>bar</em></em></p> +. + +. +**foo*** +. +<p><strong>foo</strong>*</p> +. + +The following contains no strong emphasis, because the opening +delimiter is closed by the first `*` before `bar`: + +. +*foo**bar*** +. +<p><em>foo</em><em>bar</em>**</p> +. + +However, a string of four or more `****` can never close emphasis: + +. +*foo**** +. +<p>*foo****</p> +. + +Note that there are some asymmetries here: + +. +*foo** + +**foo* +. +<p><em>foo</em>*</p> +<p>**foo*</p> +. + +. +*foo *bar** + +**foo* bar* +. +<p><em>foo <em>bar</em></em></p> +<p>**foo* bar*</p> +. + +More cases with mismatched delimiters: + +. +**foo* bar* +. +<p>**foo* bar*</p> +. + +. +*bar*** +. +<p><em>bar</em>**</p> +. + +. +***foo* +. +<p>***foo*</p> +. + +. +**bar*** +. +<p><strong>bar</strong>*</p> +. + +. +***foo** +. +<p>***foo**</p> +. + +. +***foo *bar* +. +<p>***foo <em>bar</em></p> +. 
+ +## Links + +A link contains a [link label](#link-label) (the visible text), +a [destination](#link-destination) (the URI that is the link destination), +and optionally a [link title](#link-title). There are two basic kinds +of links in markdown. In [inline links](#inline-link) the destination +and title are given immediately after the label. In [reference +links](#reference-link) the destination and title are defined elsewhere +in the document. + +A [link label](#link-label) <a id="link-label"/> consists of + +- an opening `[`, followed by +- zero or more backtick code spans, autolinks, HTML tags, link labels, + backslash-escaped ASCII punctuation characters, or non-`]` characters, + followed by +- a closing `]`. + +These rules are motivated by the following intuitive ideas: + +- A link label is a container for inline elements. +- The square brackets bind more tightly than emphasis markers, + but less tightly than `<>` or `` ` ``. +- Link labels may contain material in matching square brackets. + +A [link destination](#link-destination) <a id="link-destination"/> +consists of either + +- a sequence of zero or more characters between an opening `<` and a + closing `>` that contains no line breaks or unescaped `<` or `>` + characters, or + +- a nonempty sequence of characters that does not include + ASCII space or control characters, and includes parentheses + only if (a) they are backslash-escaped or (b) they are part of + a balanced pair of unescaped parentheses that is not itself + inside a balanced pair of unescaped parentheses. 
+ +A [link title](#link-title) <a id="link-title"/> consists of either + +- a sequence of zero or more characters between straight double-quote + characters (`"`), including a `"` character only if it is + backslash-escaped, or + +- a sequence of zero or more characters between straight single-quote + characters (`'`), including a `'` character only if it is + backslash-escaped, or + +- a sequence of zero or more characters between matching parentheses + (`(...)`), including a `)` character only if it is backslash-escaped. + +An [inline link](#inline-link) <a id="inline-link"/> +consists of a [link label](#link-label) followed immediately +by a left parenthesis `(`, optional whitespace, +an optional [link destination](#link-destination), +an optional [link title](#link-title) separated from the link +destination by whitespace, optional whitespace, and a right +parenthesis `)`. The link's text consists of the label (excluding +the enclosing square brackets) parsed as inlines. The link's +URI consists of the link destination, excluding enclosing `<...>` if +present, with backslash-escapes in effect as described above. The +link's title consists of the link title, excluding its enclosing +delimiters, with backslash-escapes in effect as described above. + +Here is a simple inline link: + +. +[link](/uri "title") +. +<p><a href="/uri" title="title">link</a></p> +. + +The title may be omitted: + +. +[link](/uri) +. +<p><a href="/uri">link</a></p> +. + +Both the title and the destination may be omitted: + +. +[link]() +. +<p><a href="">link</a></p> +. + +. +[link](<>) +. +<p><a href="">link</a></p> +. + + +If the destination contains spaces, it must be enclosed in pointy +braces: + +. +[link](/my uri) +. +<p>[link](/my uri)</p> +. + +. +[link](</my uri>) +. +<p><a href="/my uri">link</a></p> +. + +The destination cannot contain line breaks, even with pointy braces: + +. +[link](foo +bar) +. +<p>[link](foo +bar)</p> +. 
+ +One level of balanced parentheses is allowed without escaping: + +. +[link]((foo)and(bar)) +. +<p><a href="(foo)and(bar)">link</a></p> +. + +However, if you have parentheses within parentheses, you need to escape +or use the `<...>` form: + +. +[link](foo(and(bar))) +. +<p>[link](foo(and(bar)))</p> +. + +. +[link](foo(and\(bar\))) +. +<p><a href="foo(and(bar))">link</a></p> +. + +. +[link](<foo(and(bar))>) +. +<p><a href="foo(and(bar))">link</a></p> +. + +Parentheses and other symbols can also be escaped, as usual +in markdown: + +. +[link](foo\)\:) +. +<p><a href="foo):">link</a></p> +. + +URL-escaping and entities should be left alone inside the destination: + +. +[link](foo%20bä) +. +<p><a href="foo%20bä">link</a></p> +. + +Note that, because titles can often be parsed as destinations, +if you try to omit the destination and keep the title, you'll +get unexpected results: + +. +[link]("title") +. +<p><a href=""title"">link</a></p> +. + +Titles may be in single quotes, double quotes, or parentheses: + +. +[link](/url "title") +[link](/url 'title') +[link](/url (title)) +. +<p><a href="/url" title="title">link</a> +<a href="/url" title="title">link</a> +<a href="/url" title="title">link</a></p> +. + +Backslash escapes and entities may be used in titles: + +. +[link](/url "title \""") +. +<p><a href="/url" title="title """>link</a></p> +. + +Nested balanced quotes are not allowed without escaping: + +. +[link](/url "title "and" title") +. +<p>[link](/url "title "and" title")</p> +. + +But it is easy to work around this by using a different quote type: + +. +[link](/url 'title "and" title') +. +<p><a href="/url" title="title "and" title">link</a></p> +. + +(Note: `Markdown.pl` did allow double quotes inside a double-quoted +title, and its test suite included a test demonstrating this. 
+But it is hard to see a good rationale for the extra complexity this +brings, since there are already many ways---backslash escaping, +entities, or using a different quote type for the enclosing title---to +write titles containing double quotes. `Markdown.pl`'s handling of +titles has a number of other strange features. For example, it allows +single-quoted titles in inline links, but not reference links. And, in +reference links but not inline links, it allows a title to begin with +`"` and end with `)`. `Markdown.pl` 1.0.1 even allows titles with no closing +quotation mark, though 1.0.2b8 does not. It seems preferable to adopt +a simple, rational rule that works the same way in inline links and +link reference definitions.) + +Whitespace is allowed around the destination and title: + +. +[link]( /uri + "title" ) +. +<p><a href="/uri" title="title">link</a></p> +. + +But it is not allowed between the link label and the +following parenthesis: + +. +[link] (/uri) +. +<p>[link] (/uri)</p> +. + +Note that this is not a link, because the closing `]` occurs in +an HTML tag: + +. +[foo <bar attr="](baz)"> +. +<p>[foo <bar attr="](baz)"></p> +. + + +There are three kinds of [reference links](#reference-link): +<a id="reference-link"/> + +A [full reference link](#full-reference-link) <a id="full-reference-link"/> +consists of a [link label](#link-label), optional whitespace, and +another [link label](#link-label) that [matches](#matches) a +[reference link definition](#reference-link-definition) elsewhere in the +document. + +One label [matches](#matches) <a id="matches"/> +another just in case their normalized forms are equal. To normalize a +label, perform the *unicode case fold* and collapse consecutive internal +whitespace to a single space. If there are multiple matching reference +link definitions, the one that comes first in the document is used. (It +is desirable in such cases to emit a warning.) 
+ +The contents of the first link label are parsed as inlines, which are +used as the link's text. The link's URI and title are provided by the +matching reference link definition. + +Here is a simple example: + +. +[foo][bar] + +[bar]: /url "title" +. +<p><a href="/url" title="title">foo</a></p> +. + +The first label can contain inline content: + +. +[*foo\!*][bar] + +[bar]: /url "title" +. +<p><a href="/url" title="title"><em>foo!</em></a></p> +. + +Matching is case-insensitive: + +. +[foo][BaR] + +[bar]: /url "title" +. +<p><a href="/url" title="title">foo</a></p> +. + +Unicode case fold is used: + +. +[Толпой][Толпой] is a Russian word. + +[ТОЛПОЙ]: /url +. +<p><a href="/url">Толпой</a> is a Russian word.</p> +. + +Consecutive internal whitespace is treated as one space for +purposes of determining matching: + +. +[Foo + bar]: /url + +[Baz][Foo bar] +. +<p><a href="/url">Baz</a></p> +. + +There can be whitespace between the two labels: + +. +[foo] [bar] + +[bar]: /url "title" +. +<p><a href="/url" title="title">foo</a></p> +. + +. +[foo] +[bar] + +[bar]: /url "title" +. +<p><a href="/url" title="title">foo</a></p> +. + +When there are multiple matching reference link definitions, +the first is used: + +. +[foo]: /url1 + +[foo]: /url2 + +[bar][foo] +. +<p><a href="/url1">bar</a></p> +. + +Note that matching is performed on normalized strings, not parsed +inline content. So the following does not match, even though the +labels define equivalent inline content: + +. +[bar][foo\!] + +[foo!]: /url +. +<p>[bar][foo!]</p> +. + +A [collapsed reference link](#collapsed-reference-link) +<a id="collapsed-reference-link"/> consists of a [link +label](#link-label) that [matches](#matches) a [reference link +definition](#reference-link-definition) elsewhere in the +document, optional whitespace, and the string `[]`. The contents of the +first link label are parsed as inlines, which are used as the link's +text. 
The link's URI and title are provided by the matching reference +link definition. Thus, `[foo][]` is equivalent to `[foo][foo]`. + +. +[foo][] + +[foo]: /url "title" +. +<p><a href="/url" title="title">foo</a></p> +. + +. +[*foo* bar][] + +[*foo* bar]: /url "title" +. +<p><a href="/url" title="title"><em>foo</em> bar</a></p> +. + +The link labels are case-insensitive: + +. +[Foo][] + +[foo]: /url "title" +. +<p><a href="/url" title="title">Foo</a></p> +. + + +As with full reference links, whitespace is allowed +between the two sets of brackets: + +. +[foo] +[] + +[foo]: /url "title" +. +<p><a href="/url" title="title">foo</a></p> +. + +A [shortcut reference link](#shortcut-reference-link) +<a id="shortcut-reference-link"/> consists of a [link +label](#link-label) that [matches](#matches) a [reference link +definition](#reference-link-definition) elsewhere in the +document and is not followed by `[]` or a link label. +The contents of the first link label are parsed as inlines, +which are used as the link's text. The link's URI and title +are provided by the matching reference link definition. +Thus, `[foo]` is equivalent to `[foo][]`. + +. +[foo] + +[foo]: /url "title" +. +<p><a href="/url" title="title">foo</a></p> +. + +. +[*foo* bar] + +[*foo* bar]: /url "title" +. +<p><a href="/url" title="title"><em>foo</em> bar</a></p> +. + +. +[[*foo* bar]] + +[*foo* bar]: /url "title" +. +<p>[<a href="/url" title="title"><em>foo</em> bar</a>]</p> +. + +The link labels are case-insensitive: + +. +[Foo] + +[foo]: /url "title" +. +<p><a href="/url" title="title">Foo</a></p> +. + +If you just want bracketed text, you can backslash-escape the +opening bracket to avoid links: + +. +\[foo] + +[foo]: /url "title" +. +<p>[foo]</p> +. + +Note that this is a link, because link labels bind more tightly +than emphasis: + +. +[foo*]: /url + +*[foo*] +. +<p>*<a href="/url">foo*</a></p> +. + +However, this is not, because link labels bind less +tightly than code backticks: + +. 
+[foo`]: /url + +[foo`]` +. +<p>[foo<code>]</code></p> +. + +Link labels can contain matched square brackets: + +. +[[[foo]]] + +[[[foo]]]: /url +. +<p><a href="/url">[[foo]]</a></p> +. + +. +[[[foo]]] + +[[[foo]]]: /url1 +[foo]: /url2 +. +<p><a href="/url1">[[foo]]</a></p> +. + +For non-matching brackets, use backslash escapes: + +. +[\[foo] + +[\[foo]: /url +. +<p><a href="/url">[foo</a></p> +. + +Full references take precedence over shortcut references: + +. +[foo][bar] + +[foo]: /url1 +[bar]: /url2 +. +<p><a href="/url2">foo</a></p> +. + +In the following case `[bar][baz]` is parsed as a reference, +`[foo]` as normal text: + +. +[foo][bar][baz] + +[baz]: /url +. +<p>[foo]<a href="/url">bar</a></p> +. + +Here, though, `[foo][bar]` is parsed as a reference, since +`[bar]` is defined: + +. +[foo][bar][baz] + +[baz]: /url1 +[bar]: /url2 +. +<p><a href="/url2">foo</a><a href="/url1">baz</a></p> +. + +Here `[foo]` is not parsed as a shortcut reference, because it +is followed by a link label (even though `[bar]` is not defined): + +. +[foo][bar][baz] + +[baz]: /url1 +[foo]: /url2 +. +<p>[foo]<a href="/url1">bar</a></p> +. + + +## Images + +An (unescaped) exclamation mark (`!`) followed by a reference or +inline link will be parsed as an image. The link label will be +used as the image's alt text, and the link title, if any, will +be used as the image's title. + +. +![foo](/url "title") +. +<p><img src="/url" alt="foo" title="title" /></p> +. + +. +![foo *bar*] + +[foo *bar*]: train.jpg "train & tracks" +. +<p><img src="train.jpg" alt="foo <em>bar</em>" title="train & tracks" /></p> +. + +. +![foo *bar*][] + +[foo *bar*]: train.jpg "train & tracks" +. +<p><img src="train.jpg" alt="foo <em>bar</em>" title="train & tracks" /></p> +. + +. +![foo *bar*][foobar] + +[FOOBAR]: train.jpg "train & tracks" +. +<p><img src="train.jpg" alt="foo <em>bar</em>" title="train & tracks" /></p> +. + +. +![foo](train.jpg) +. +<p><img src="train.jpg" alt="foo" /></p> +. + +. 
+My ![foo bar](/path/to/train.jpg "title" ) +. +<p>My <img src="/path/to/train.jpg" alt="foo bar" title="title" /></p> +. + +. +![foo](<url>) +. +<p><img src="url" alt="foo" /></p> +. + +. +![](/url) +. +<p><img src="/url" alt="" /></p> +. + +Reference-style: + +. +![foo] [bar] + +[bar]: /url +. +<p><img src="/url" alt="foo" /></p> +. + +. +![foo] [bar] + +[BAR]: /url +. +<p><img src="/url" alt="foo" /></p> +. + +Collapsed: + +. +![foo][] + +[foo]: /url "title" +. +<p><img src="/url" alt="foo" title="title" /></p> +. + +. +![*foo* bar][] + +[*foo* bar]: /url "title" +. +<p><img src="/url" alt="<em>foo</em> bar" title="title" /></p> +. + +The labels are case-insensitive: + +. +![Foo][] + +[foo]: /url "title" +. +<p><img src="/url" alt="Foo" title="title" /></p> +. + +As with full reference links, whitespace is allowed +between the two sets of brackets: + +. +![foo] +[] + +[foo]: /url "title" +. +<p><img src="/url" alt="foo" title="title" /></p> +. + +Shortcut: + +. +![foo] + +[foo]: /url "title" +. +<p><img src="/url" alt="foo" title="title" /></p> +. + +. +![*foo* bar] + +[*foo* bar]: /url "title" +. +<p><img src="/url" alt="<em>foo</em> bar" title="title" /></p> +. + +. +![[foo]] + +[[foo]]: /url "title" +. +<p><img src="/url" alt="[foo]" title="title" /></p> +. + +The link labels are case-insensitive: + +. +![Foo] + +[foo]: /url "title" +. +<p><img src="/url" alt="Foo" title="title" /></p> +. + +If you just want bracketed text, you can backslash-escape the +opening `!` and `[`: + +. +\!\[foo] + +[foo]: /url "title" +. +<p>![foo]</p> +. + +If you want a link after a literal `!`, backslash-escape the +`!`: + +. +\![foo] + +[foo]: /url "title" +. +<p>!<a href="/url" title="title">foo</a></p> +. + +## Autolinks + +Autolinks are absolute URIs and email addresses inside `<` and `>`. +They are parsed as links, with the URL or email address as the link +label. 
+ +A [URI autolink](#uri-autolink) <a id="uri-autolink"/> +consists of `<`, followed by an [absolute +URI](#absolute-uri) not containing `<`, followed by `>`. It is parsed +as a link to the URI, with the URI as the link's label. + +An [absolute URI](#absolute-uri), <a id="absolute-uri"/> +for these purposes, consists of a [scheme](#scheme) followed by a colon (`:`) +followed by zero or more characters other than ASCII whitespace and +control characters, `<`, and `>`. If the URI includes these characters, +you must use percent-encoding (e.g. `%20` for a space). + +The following [schemes](#scheme) <a id="scheme"/> +are recognized (case-insensitive): +`coap`, `doi`, `javascript`, `aaa`, `aaas`, `about`, `acap`, `cap`, +`cid`, `crid`, `data`, `dav`, `dict`, `dns`, `file`, `ftp`, `geo`, `go`, +`gopher`, `h323`, `http`, `https`, `iax`, `icap`, `im`, `imap`, `info`, +`ipp`, `iris`, `iris.beep`, `iris.xpc`, `iris.xpcs`, `iris.lwz`, `ldap`, +`mailto`, `mid`, `msrp`, `msrps`, `mtqp`, `mupdate`, `news`, `nfs`, +`ni`, `nih`, `nntp`, `opaquelocktoken`, `pop`, `pres`, `rtsp`, +`service`, `session`, `shttp`, `sieve`, `sip`, `sips`, `sms`, `snmp`, +`soap.beep`, `soap.beeps`, `tag`, `tel`, `telnet`, `tftp`, `thismessage`, +`tn3270`, `tip`, `tv`, `urn`, `vemmi`, `ws`, `wss`, `xcon`, +`xcon-userid`, `xmlrpc.beep`, `xmlrpc.beeps`, `xmpp`, `z39.50r`, +`z39.50s`, `adiumxtra`, `afp`, `afs`, `aim`, `apt`, `attachment`, `aw`, +`beshare`, `bitcoin`, `bolo`, `callto`, `chrome`, `chrome-extension`, +`com-eventbrite-attendee`, `content`, `cvs`, `dlna-playsingle`, +`dlna-playcontainer`, `dtn`, `dvb`, `ed2k`, `facetime`, `feed`, +`finger`, `fish`, `gg`, `git`, `gizmoproject`, `gtalk`, `hcp`, `icon`, +`ipn`, `irc`, `irc6`, `ircs`, `itms`, `jar`, `jms`, `keyparc`, `lastfm`, +`ldaps`, `magnet`, `maps`, `market`, `message`, `mms`, `ms-help`, +`msnim`, `mumble`, `mvn`, `notes`, `oid`, `palm`, `paparazzi`, +`platform`, `proxy`, `psyc`, `query`, `res`, `resource`, `rmi`, `rsync`, +`rtmp`, `secondlife`, 
`sftp`, `sgn`, `skype`, `smb`, `soldat`, +`spotify`, `ssh`, `steam`, `svn`, `teamspeak`, `things`, `udp`, +`unreal`, `ut2004`, `ventrilo`, `view-source`, `webcal`, `wtai`, +`wyciwyg`, `xfire`, `xri`, `ymsgr`. + +Here are some valid autolinks: + +. +<http://foo.bar.baz> +. +<p><a href="http://foo.bar.baz">http://foo.bar.baz</a></p> +. + +. +<http://foo.bar.baz?q=hello&id=22&boolean> +. +<p><a href="http://foo.bar.baz?q=hello&id=22&boolean">http://foo.bar.baz?q=hello&id=22&boolean</a></p> +. + +. +<irc://foo.bar:2233/baz> +. +<p><a href="irc://foo.bar:2233/baz">irc://foo.bar:2233/baz</a></p> +. + +Uppercase is also fine: + +. +<MAILTO:FOO@BAR.BAZ> +. +<p><a href="MAILTO:FOO@BAR.BAZ">MAILTO:FOO@BAR.BAZ</a></p> +. + +Spaces are not allowed in autolinks: + +. +<http://foo.bar/baz bim> +. +<p><http://foo.bar/baz bim></p> +. + +An [email autolink](#email-autolink) <a id="email-autolink"/> +consists of `<`, followed by an [email address](#email-address), +followed by `>`. The link's label is the email address, +and the URL is `mailto:` followed by the email address. + +An [email address](#email-address), <a id="email-address"/> +for these purposes, is anything that matches +the [non-normative regex from the HTML5 +spec](http://www.whatwg.org/specs/web-apps/current-work/multipage/states-of-the-type-attribute.html#e-mail-state-%28type=email%29): + + /^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])? + (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/ + +Examples of email autolinks: + +. +<foo@bar.baz.com> +. +<p><a href="mailto:foo@bar.baz.com">foo@bar.baz.com</a></p> +. + +. +<foo+special@Bar.baz-bar0.com> +. +<p><a href="mailto:foo+special@Bar.baz-bar0.com">foo+special@Bar.baz-bar0.com</a></p> +. + +These are not autolinks: + +. +<> +. +<p><></p> +. + +. +<heck://bing.bong> +. +<p><heck://bing.bong></p> +. + +. +< http://foo.bar > +. +<p>< http://foo.bar ></p> +. + +. +<foo.bar.baz> +. +<p><foo.bar.baz></p> +. + +. 
+<localhost:5001/foo> +. +<p><localhost:5001/foo></p> +. + +## Raw HTML + +Text between `<` and `>` that looks like an HTML tag is parsed as a +raw HTML tag and will be rendered in HTML without escaping. +Tag and attribute names are not limited to current HTML tags, +so custom tags (and even, say, DocBook tags) may be used. + +Here is the grammar for tags: + +A [tag name](#tag-name) <a id="tag-name"/> consists of an ASCII letter +followed by zero or more ASCII letters or digits. + +An [attribute](#attribute) <a id="attribute"/> consists of whitespace, +an **attribute name**, and an optional **attribute value +specification**. + +An [attribute name](#attribute-name) <a id="attribute-name"/> +consists of an ASCII letter, `_`, or `:`, followed by zero or more ASCII +letters, digits, `_`, `.`, `:`, or `-`. (Note: This is the XML +specification restricted to ASCII. HTML5 is laxer.) + +An [attribute value specification](#attribute-value-specification) +<a id="attribute-value-specification"/> consists of optional whitespace, +a `=` character, optional whitespace, and an [attribute +value](#attribute-value). + +An [attribute value](#attribute-value) <a id="attribute-value"/> +consists of an [unquoted attribute value](#unquoted-attribute-value), +a [single-quoted attribute value](#single-quoted-attribute-value), +or a [double-quoted attribute value](#double-quoted-attribute-value). + +An [unquoted attribute value](#unquoted-attribute-value) +<a id="unquoted-attribute-value"/> is a nonempty string of characters not +including spaces, `"`, `'`, `=`, `<`, `>`, or `` ` ``. + +A [single-quoted attribute value](#single-quoted-attribute-value) +<a id="single-quoted-attribute-value"/> consists of `'`, zero or more +characters not including `'`, and a final `'`. + +A [double-quoted attribute value](#double-quoted-attribute-value) +<a id="double-quoted-attribute-value"/> consists of `"`, zero or more +characters not including `"`, and a final `"`. 
+ +An [open tag](#open-tag) <a id="open-tag"/> consists of a `<` character, +a [tag name](#tag-name), zero or more [attributes](#attribute), +optional whitespace, an optional `/` character, and a `>` character. + +A [closing tag](#closing-tag) <a id="closing-tag"/> consists of the +string `</`, a [tag name](#tag-name), optional whitespace, and the +character `>`. + +An [HTML comment](#html-comment) <a id="html-comment"/> consists of the +string `<!--`, a string of characters not including the string `--`, and +the string `-->`. + +A [processing instruction](#processing-instruction) +<a id="processing-instruction"/> consists of the string `<?`, a string +of characters not including the string `?>`, and the string +`?>`. + +A [declaration](#declaration) <a id="declaration"/> consists of the +string `<!`, a name consisting of one or more uppercase ASCII letters, +whitespace, a string of characters not including the character `>`, and +the character `>`. + +A [CDATA section](#cdata-section) <a id="cdata-section"/> consists of +the string `<![CDATA[`, a string of characters not including the string +`]]>`, and the string `]]>`. + +An [HTML tag](#html-tag) <a id="html-tag"/> consists of an [open +tag](#open-tag), a [closing tag](#closing-tag), an [HTML +comment](#html-comment), a [processing +instruction](#processing-instruction), a +[declaration](#declaration), or a [CDATA +section](#cdata-section). + +Here are some simple open tags: + +. +<a><bab><c2c> +. +<p><a><bab><c2c></p> +. + +Empty elements: + +. +<a/><b2/> +. +<p><a/><b2/></p> +. + +Whitespace is allowed: + +. +<a /><b2 +data="foo" > +. +<p><a /><b2 +data="foo" ></p> +. + +With attributes: + +. +<a foo="bar" bam = 'baz <em>"</em>' +_boolean zoop:33=zoop:33 /> +. +<p><a foo="bar" bam = 'baz <em>"</em>' +_boolean zoop:33=zoop:33 /></p> +. + +Illegal tag names, not parsed as HTML: + +. +<33> <__> +. +<p><33> <__></p> +. + +Illegal attribute names: + +. +<a h*#ref="hi"> +. 
+<p><a h*#ref="hi"></p> +. + +Illegal attribute values: + +. +<a href="hi'> <a href=hi'> +. +<p><a href="hi'> <a href=hi'></p> +. + +Illegal whitespace: + +. +< a>< +foo><bar/ > +. +<p>< a>< +foo><bar/ ></p> +. + +Missing whitespace: + +. +<a href='bar'title=title> +. +<p><a href='bar'title=title></p> +. + +Closing tags: + +. +</a> +</foo > +. +<p></a> +</foo ></p> +. + +Illegal attributes in closing tag: + +. +</a href="foo"> +. +<p></a href="foo"></p> +. + +Comments: + +. +foo <!-- this is a +comment - with hyphen --> +. +<p>foo <!-- this is a +comment - with hyphen --></p> +. + +. +foo <!-- not a comment -- two hyphens --> +. +<p>foo <!-- not a comment -- two hyphens --></p> +. + +Processing instructions: + +. +foo <?php echo $a; ?> +. +<p>foo <?php echo $a; ?></p> +. + +Declarations: + +. +foo <!ELEMENT br EMPTY> +. +<p>foo <!ELEMENT br EMPTY></p> +. + +CDATA sections: + +. +foo <![CDATA[>&<]]> +. +<p>foo <![CDATA[>&<]]></p> +. + +Entities are preserved in HTML attributes: + +. +<a href="ö"> +. +<p><a href="ö"></p> +. + +Backslash escapes do not work in HTML attributes: + +. +<a href="\*"> +. +<p><a href="\*"></p> +. + +. +<a href="\""> +. +<p><a href="""></p> +. + +## Hard line breaks + +A line break (not in a code span or HTML tag) that is preceded +by two or more spaces is parsed as a linebreak (rendered +in HTML as a `<br />` tag): + +. +foo +baz +. +<p>foo<br /> +baz</p> +. + +For a more visible alternative, a backslash before the newline may be +used instead of two spaces: + +. +foo\ +baz +. +<p>foo<br /> +baz</p> +. + +More than two spaces can be used: + +. +foo +baz +. +<p>foo<br /> +baz</p> +. + +Leading spaces at the beginning of the next line are ignored: + +. +foo + bar +. +<p>foo<br /> +bar</p> +. + +. +foo\ + bar +. +<p>foo<br /> +bar</p> +. + +Line breaks can occur inside emphasis, links, and other constructs +that allow inline content: + +. +*foo +bar* +. +<p><em>foo<br /> +bar</em></p> +. + +. +*foo\ +bar* +. +<p><em>foo<br /> +bar</em></p> +. 
+ +Line breaks do not occur inside code spans + +. +`code +span` +. +<p><code>code span</code></p> +. + +. +`code\ +span` +. +<p><code>code\ span</code></p> +. + +or HTML tags: + +. +<a href="foo +bar"> +. +<p><a href="foo +bar"></p> +. + +. +<a href="foo\ +bar"> +. +<p><a href="foo\ +bar"></p> +. + +## Soft line breaks + +A regular line break (not in a code span or HTML tag) that is not +preceded by two or more spaces is parsed as a softbreak. (A +softbreak may be rendered in HTML either as a newline or as a space. +The result will be the same in browsers. In the examples here, a +newline will be used.) + +. +foo +baz +. +<p>foo +baz</p> +. + +Spaces at the end of the line and beginning of the next line are +removed: + +. +foo + baz +. +<p>foo +baz</p> +. + +A conforming parser may render a soft line break in HTML either as a +line break or as a space. + +A renderer may also provide an option to render soft line breaks +as hard line breaks. + +## Strings + +Any characters not given an interpretation by the above rules will +be parsed as string content. + +. +hello $.;'there +. +<p>hello $.;'there</p> +. + +. +Foo χρῆν +. +<p>Foo χρῆν</p> +. + +Internal spaces are preserved verbatim: + +. +Multiple spaces +. +<p>Multiple spaces</p> +. + +<!-- END TESTS --> + +# Appendix A: A parsing strategy {-} + +## Overview {-} + +Parsing has two phases: + +1. In the first phase, lines of input are consumed and the block +structure of the document---its division into paragraphs, block quotes, +list items, and so on---is constructed. Text is assigned to these +blocks but not parsed. Reference link definitions are parsed and a +map of links is constructed. + +2. In the second phase, the raw text contents of paragraphs and headers +are parsed into sequences of markdown inline elements (strings, +code spans, links, emphasis, and so on), using the map of link +references constructed in phase 1. 
+ +## The document tree {-} + +At each point in processing, the document is represented as a tree of +**blocks**. The root of the tree is a `document` block. The `document` +may have any number of other blocks as **children**. These children +may, in turn, have other blocks as children. The last child of a block +is normally considered **open**, meaning that subsequent lines of input +can alter its contents. (Blocks that are not open are **closed**.) +Here, for example, is a possible document tree, with the open blocks +marked by arrows: + +``` tree +-> document + -> block_quote + paragraph + "Lorem ipsum dolor\nsit amet." + -> list (type=bullet tight=true bullet_char=-) + list_item + paragraph + "Qui *quodsi iracundia*" + -> list_item + -> paragraph + "aliquando id" +``` + +## How source lines alter the document tree {-} + +Each line that is processed has an effect on this tree. The line is +analyzed and, depending on its contents, the document may be altered +in one or more of the following ways: + +1. One or more open blocks may be closed. +2. One or more new blocks may be created as children of the + last open block. +3. Text may be added to the last (deepest) open block remaining + on the tree. + +Once a line has been incorporated into the tree in this way, +it can be discarded, so input can be read in a stream. + +We can see how this works by considering how the tree above is +generated by four lines of markdown: + +``` markdown +> Lorem ipsum dolor +sit amet. +> - Qui *quodsi iracundia* +> - aliquando id +``` + +At the outset, our document model is just + +``` tree +-> document +``` + +The first line of our text, + +``` markdown +> Lorem ipsum dolor +``` + +causes a `block_quote` block to be created as a child of our +open `document` block, and a `paragraph` block as a child of +the `block_quote`. 
Then the text is added to the last open +block, the `paragraph`: + +``` tree +-> document + -> block_quote + -> paragraph + "Lorem ipsum dolor" +``` + +The next line, + +``` markdown +sit amet. +``` + +is a "lazy continuation" of the open `paragraph`, so it gets added +to the paragraph's text: + +``` tree +-> document + -> block_quote + -> paragraph + "Lorem ipsum dolor\nsit amet." +``` + +The third line, + +``` markdown +> - Qui *quodsi iracundia* +``` + +causes the `paragraph` block to be closed, and a new `list` block +opened as a child of the `block_quote`. A `list_item` is also +added as a child of the `list`, and a `paragraph` as a child of +the `list_item`. The text is then added to the `paragraph`: + +``` tree +-> document + -> block_quote + paragraph + "Lorem ipsum dolor\nsit amet." + -> list (type=bullet tight=true bullet_char=-) + -> list_item + -> paragraph + "Qui *quodsi iracundia*" +``` + +The fourth line, + +``` markdown +> - aliquando id +``` + +causes the `list_item` (and its child the `paragraph`) to be closed, +and a new `list_item` opened up as a child of the `list`. A `paragraph` +is added as a child of the new `list_item`, to contain the text. +We thus obtain the final tree: + +``` tree +-> document + -> block_quote + paragraph + "Lorem ipsum dolor\nsit amet." + -> list (type=bullet tight=true bullet_char=-) + list_item + paragraph + "Qui *quodsi iracundia*" + -> list_item + -> paragraph + "aliquando id" +``` + +## From block structure to the final document {-} + +Once all of the input has been parsed, all open blocks are closed. + +We then "walk the tree," visiting every node, and parse raw +string contents of paragraphs and headers as inlines. At this +point we have seen all the link reference definitions, so we can +resolve reference links as we go. + +``` tree +document + block_quote + paragraph + str "Lorem ipsum dolor" + softbreak + str "sit amet." 
+ list (type=bullet tight=true bullet_char=-) + list_item + paragraph + str "Qui " + emph + str "quodsi iracundia" + list_item + paragraph + str "aliquando id" +``` + +Notice how the newline in the first paragraph has been parsed as +a `softbreak`, and the asterisks in the first list item have become +an `emph`. + +The document can be rendered as HTML, or in any other format, given +an appropriate renderer. + + diff --git a/spec2js.js b/spec2js.js new file mode 100755 index 0000000..6bf366f --- /dev/null +++ b/spec2js.js @@ -0,0 +1,17 @@ +#!/usr/bin/env node + +var fs = require('fs'); +var util = require('util'); + +fs.readFile('spec.txt', 'utf8', function(err, data) { + if (err) { + return console.log(err); + } + var examples = []; + data.replace(/^\.\n([\s\S]*?)^\.\n([\s\S]*?)^\.$/gm, + function(_,x,y){ + examples.push({markdown: x, html: y}); + }); + console.log(util.inspect(examples, { depth: null })); + console.warn(examples.length + ' examples'); +}); diff --git a/spec2md.pl b/spec2md.pl new file mode 100644 index 0000000..1b4f26e --- /dev/null +++ b/spec2md.pl @@ -0,0 +1,36 @@ +#!/usr/bin/env perl +use strict; +use warnings; + +my $stage = 0; +my $example = 0; +my @match; +my $section = ""; + +while (<STDIN>) { + if (/^\.$/) { + if ($stage == 0) { + $example++; + print "\n<div class=\"example\" id=\"example-$example\" data-section=\"$section\">\n"; + print "<div class=\"examplenum\">Example $example</div>\n\n"; + print "````````````````````````````````````````````````````````` markdown\n"; + } elsif ($stage == 1) { + print "`````````````````````````````````````````````````````````\n\n"; + print "````````````````````````````````````````````````````````` html\n"; + } elsif ($stage == 2) { + print "`````````````````````````````````````````````````````````\n\n"; + print "</div>\n\n"; + } else { + die "Encountered unknown stage $stage"; + } + $stage = ($stage + 1) % 3; + } else { + if ($stage == 0 && (@match = ($_ =~ /^#{1,6} *(.*)/))) { + $section = $match[0]; + 
} + if ($stage != 0) { + # $_ =~ s/ /␣/g; + } + print $_; + } +} diff --git a/specfilter.hs b/specfilter.hs new file mode 100755 index 0000000..67c8fa5 --- /dev/null +++ b/specfilter.hs @@ -0,0 +1,37 @@ +#!/usr/bin/env runhaskell + +import Text.Pandoc.JSON +import Text.Pandoc.Walk + +main = toJSONFilter go + where go :: Pandoc -> Pandoc + go = walk exampleDivs . walk anchors + +exampleDivs :: Block -> Block +exampleDivs (Div (ident, ["example"], kvs) + [ d@(Div (_,["examplenum"],_) _), + c1@(CodeBlock (_,["markdown"],_) _), + c2@(CodeBlock (_,["html"],_) _) + ]) = Div (ident, ["example"], kvs) + [ rawtex "\\begin{minipage}[t]{\\textwidth}\n{\\scriptsize " + , d + , rawtex "\\vspace{-1em}}" + , rawtex "\\begin{minipage}[t]{0.49\\textwidth}\n\\definecolor{shadecolor}{gray}{0.85}\n" + , addBreaks c1 + , rawtex "\\end{minipage}\n\\hfill\n\\begin{minipage}[t]{0.49\\textwidth}\n\\definecolor{shadecolor}{gray}{0.95}\n" + , addBreaks c2 + , rawtex "\\end{minipage}\n\\end{minipage}" + ] + where rawtex = RawBlock (Format "latex") + addBreaks (CodeBlock attrs code) = CodeBlock attrs $ addBreaks' code + addBreaks' code = + if length code > 49 + then take 49 code ++ ('\n':addBreaks' (drop 49 code)) + else code +exampleDivs x = x + +anchors :: Inline -> Inline +anchors (RawInline (Format "html") ('<':'a':' ':'i':'d':'=':'"':xs)) = + RawInline (Format "latex") ("\\hyperdef{}{" ++ lab ++ "}{\\label{" ++ lab ++ "}}") + where lab = takeWhile (/='"') xs +anchors x = x diff --git a/src/blocks.c b/src/blocks.c new file mode 100644 index 0000000..2776231 --- /dev/null +++ b/src/blocks.c @@ -0,0 +1,747 @@ +#include <stdlib.h> +#include <stdio.h> +#include <stdbool.h> +#include <ctype.h> +#include "bstrlib.h" +#include "stmd.h" +#include "uthash.h" +#include "debug.h" +#include "scanners.h" + +static block* make_block(int tag, int start_line, int start_column) +{ + block* e; + e = (block*) malloc(sizeof(block)); + e->tag = tag; + e->open = true; + e->last_line_blank = false; + 
e->start_line = start_line; + e->start_column = start_column; + e->end_line = start_line; + e->children = NULL; + e->last_child = NULL; + e->parent = NULL; + e->top = NULL; + e->attributes.refmap = NULL; + e->string_content = bfromcstr(""); + e->inline_content = NULL; + e->next = NULL; + e->prev = NULL; + return e; +} + +// Create a root document block. +extern block* make_document() +{ + block * e = make_block(document, 1, 1); + reference * map = NULL; + reference ** refmap; + refmap = (reference**) malloc(sizeof(reference*)); + *refmap = map; + e->attributes.refmap = refmap; + e->top = e; + return e; +} + +// Returns true if line has only space characters, else false. +bool is_blank(bstring s, int offset) +{ + char c; + while ((c = bchar(s, offset))) { + if (c == '\n') { + return true; + } else if (c == ' ') { + offset++; + } else { + return false; + } + } + return true; +} + +static inline bool can_contain(int parent_type, int child_type) +{ + return ( parent_type == document || + parent_type == block_quote || + parent_type == list_item || + (parent_type == list && child_type == list_item) ); +} + +static inline bool accepts_lines(int block_type) +{ + return (block_type == paragraph || + block_type == atx_header || + block_type == indented_code || + block_type == fenced_code); +} + +static int add_line(block* block, bstring ln, int offset) +{ + bstring s = bmidstr(ln, offset, blength(ln) - offset); + check(block->open, "attempted to add line (%s) to closed container (%d)", + ln->data, block->tag); + check(bformata(block->string_content, "%s", s->data) == 0, + "could not append line to string_content"); + bdestroy(s); + return 0; + error: + return -1; +} + +static int remove_trailing_blank_lines(bstring ln) +{ + bstring tofind = bfromcstr(" \t\r\n"); + int pos; + // find last nonspace: + pos = bninchrr(ln, blength(ln) - 1, tofind); + if (pos == BSTR_ERR) { // all spaces + bassigncstr(ln, ""); + } else { + // find next newline after it + pos = bstrchrp(ln, '\n', 
pos); + if (pos != BSTR_ERR) { + check(bdelete(ln, pos, blength(ln) - pos) != BSTR_ERR, + "failed to delete trailing blank lines"); + } + } + bdestroy(tofind); + return 0; + error: + return -1; +} + +// Check to see if a block ends with a blank line, descending +// if needed into lists and sublists. +static bool ends_with_blank_line(block* block) +{ + if (block->last_line_blank) { + return true; + } + if ((block->tag == list || block->tag == list_item) && block->last_child) { + return ends_with_blank_line(block->last_child); + } else { + return false; + } +} + +// Break out of all containing lists +static int break_out_of_lists(block ** bptr, int line_number) +{ + block * container = *bptr; + block * b = container->top; + // find first containing list: + while (b && b->tag != list) { + b = b->last_child; + } + if (b) { + while (container && container != b) { + finalize(container, line_number); + container = container->parent; + } + finalize(b, line_number); + *bptr = b->parent; + } + return 0; +} + + +extern int finalize(block* b, int line_number) +{ + int firstlinelen; + int pos; + block* item; + block* subitem; + + check(b != NULL, "finalize called on null block"); + if (!b->open) { + return 0; // don't do anything if the block is already closed + } + b->open = false; + if (line_number > b->start_line) { + b->end_line = line_number - 1; + } else { + b->end_line = line_number; + } + + switch (b->tag) { + + case paragraph: + pos = 0; + while (bchar(b->string_content, 0) == '[' && + (pos = parse_reference(b->string_content, + b->top->attributes.refmap))) { + bdelete(b->string_content, 0, pos); + } + if (is_blank(b->string_content, 0)) { + b->tag = reference_def; + } + break; + + case indented_code: + remove_trailing_blank_lines(b->string_content); + bformata(b->string_content, "\n"); + break; + + case fenced_code: + // first line of contents becomes info + firstlinelen = bstrchr(b->string_content, '\n'); + b->attributes.fenced_code_data.info = + 
bmidstr(b->string_content, 0, firstlinelen); + bdelete(b->string_content, 0, firstlinelen + 1); // +1 for \n + btrimws(b->attributes.fenced_code_data.info); + unescape(b->attributes.fenced_code_data.info); + break; + + case list: // determine tight/loose status + b->attributes.list_data.tight = true; // tight by default + item = b->children; + + while (item) { + // check for non-final non-empty list item ending with blank line: + if (item->last_line_blank && item->next) { + b->attributes.list_data.tight = false; + break; + } + // recurse into children of list item, to see if there are + // spaces between them: + subitem = item->children; + while (subitem) { + if (ends_with_blank_line(subitem) && + (item->next || subitem->next)) { + b->attributes.list_data.tight = false; + break; + } + subitem = subitem->next; + } + if (!(b->attributes.list_data.tight)) { + break; + } + item = item->next; + } + + break; + + default: + break; + } + + return 0; + error: + return -1; +} + +// Add a block as child of another. Return pointer to child. +extern block* add_child(block* parent, + int block_type, int start_line, int start_column) +{ + // if 'parent' isn't the kind of block that can accept this child, + // then back up til we hit a block that can. + while (!can_contain(parent->tag, block_type)) { + finalize(parent, start_line); + parent = parent->parent; + } + + check(parent != NULL, "parent container cannot accept children"); + + block* child = make_block(block_type, start_line, start_column); + child->parent = parent; + child->top = parent->top; + + if (parent->last_child) { + parent->last_child->next = child; + child->prev = parent->last_child; + } else { + parent->children = child; + child->prev = NULL; + } + parent->last_child = child; + return child; + error: + return NULL; +} + +// Free a block list and any children. 
+extern void free_blocks(block* e) +{ + block * next; + while (e != NULL) { + next = e->next; + free_inlines(e->inline_content); + bdestroy(e->string_content); + if (e->tag == fenced_code) { + bdestroy(e->attributes.fenced_code_data.info); + } else if (e->tag == document) { + free_reference_map(e->attributes.refmap); + } + free_blocks(e->children); + free(e); + e = next; + } +} + +// Walk through block and all children, recursively, parsing +// string content into inline content where appropriate. +int process_inlines(block* cur, reference** refmap) +{ + switch (cur->tag) { + + case paragraph: + case atx_header: + case setext_header: + check(cur->string_content != NULL, "string_content is NULL"); + cur->inline_content = parse_inlines(cur->string_content, refmap); + bdestroy(cur->string_content); + cur->string_content = NULL; + break; + + default: + break; + } + + block * child = cur->children; + while (child != NULL) { + process_inlines(child, refmap); + child = child->next; + } + + return 0; + error: + return -1; +} + +// Attempts to parse a list item marker (bullet or enumerated). +// On success, returns length of the marker, and populates +// data with the details. On failure, returns 0. +static int parse_list_marker(bstring ln, int pos, + struct ListData ** dataptr) +{ + char c; + int startpos; + int start = 1; + struct ListData * data; + + startpos = pos; + c = bchar(ln, pos); + + if ((c == '*' || c == '-' || c == '+') && !scan_hrule(ln, pos)) { + pos++; + if (!isspace(bchar(ln, pos))) { + return 0; + } + data = malloc(sizeof(struct ListData)); + data->marker_offset = 0; // will be adjusted later + data->list_type = bullet; + data->bullet_char = c; + data->start = 1; + data->delimiter = period; + data->tight = false; + + } else if (isdigit(c)) { + + pos++; + while (isdigit(bchar(ln, pos))) { + pos++; + } + + if (!sscanf((char *) ln->data + startpos, "%d", &start)) { + log_err("sscanf failed"); + return 0; + } + + c = bchar(ln, pos); + if (c == '.' 
|| c == ')') { + pos++; + if (!isspace(bchar(ln, pos))) { + return 0; + } + data = malloc(sizeof(struct ListData)); + data->marker_offset = 0; // will be adjusted later + data->list_type = ordered; + data->bullet_char = 0; + data->start = start; + data->delimiter = (c == '.' ? period : parens); + data->tight = false; + } else { + return 0; + } + + } else { + return 0; + } + + *dataptr = data; + return (pos - startpos); +} + +// Return 1 if list item belongs in list, else 0. +static int lists_match(struct ListData list_data, + struct ListData item_data) +{ + return (list_data.list_type == item_data.list_type && + list_data.delimiter == item_data.delimiter && + // list_data.marker_offset == item_data.marker_offset && + list_data.bullet_char == item_data.bullet_char); +} + +// Process one line at a time, modifying a block. +// Returns 0 if successful. curptr is changed to point to +// the currently open block. +extern int incorporate_line(bstring ln, int line_number, block** curptr) +{ + block* last_matched_container; + int offset = 0; + int matched = 0; + int lev = 0; + int i; + struct ListData * data = NULL; + bool all_matched = true; + block* container; + block* cur = *curptr; + bool blank = false; + int first_nonspace; + int indent; + + // detab input line + check(bdetab(ln, 1) != BSTR_ERR, + "invalid UTF-8 sequence in line %d\n", line_number); + + // container starts at the document root. + container = cur->top; + + // for each containing block, try to parse the associated line start. + // bail out on failure: container will point to the last matching block. 
+ + while (container->last_child && container->last_child->open) { + container = container->last_child; + + first_nonspace = offset; + while (bchar(ln, first_nonspace) == ' ') { + first_nonspace++; + } + + indent = first_nonspace - offset; + blank = bchar(ln, first_nonspace) == '\n'; + + if (container->tag == block_quote) { + + matched = indent <= 3 && bchar(ln, first_nonspace) == '>'; + if (matched) { + offset = first_nonspace + 1; + if (bchar(ln, offset) == ' ') { + offset++; + } + } else { + all_matched = false; + } + + } else if (container->tag == list_item) { + + if (indent >= container->attributes.list_data.marker_offset + + container->attributes.list_data.padding) { + offset += container->attributes.list_data.marker_offset + + container->attributes.list_data.padding; + } else if (blank) { + offset = first_nonspace; + } else { + all_matched = false; + } + + } else if (container->tag == indented_code) { + + if (indent >= CODE_INDENT) { + offset += CODE_INDENT; + } else if (blank) { + offset = first_nonspace; + } else { + all_matched = false; + } + + } else if (container->tag == atx_header || + container->tag == setext_header) { + + // a header can never contain more than one line + all_matched = false; + + } else if (container->tag == fenced_code) { + + // skip optional spaces of fence offset + i = container->attributes.fenced_code_data.fence_offset; + while (i > 0 && bchar(ln, offset) == ' ') { + offset++; + i--; + } + + } else if (container->tag == html_block) { + + if (blank) { + all_matched = false; + } + + } else if (container->tag == paragraph) { + + if (blank) { + container->last_line_blank =true; + all_matched = false; + } + + } + + if (!all_matched) { + container = container->parent; // back up to last matching block + break; + } + } + + last_matched_container = container; + + // check to see if we've hit 2nd blank line, break out of list: + if (blank && container->last_line_blank) { + break_out_of_lists(&container, line_number); + } + + // unless 
last matched container is code block, try new container starts: + while (container->tag != fenced_code && container->tag != indented_code && + container->tag != html_block) { + + first_nonspace = offset; + while (bchar(ln, first_nonspace) == ' ') { + first_nonspace++; + } + + indent = first_nonspace - offset; + blank = bchar(ln, first_nonspace) == '\n'; + + if (indent >= CODE_INDENT) { + + if (cur->tag != paragraph && !blank) { + offset += CODE_INDENT; + container = add_child(container, indented_code, line_number, offset + 1); + } else { // indent > 4 in lazy line + break; + } + + } else if (bchar(ln, first_nonspace) == '>') { + + offset = first_nonspace + 1; + // optional following character + if (bchar(ln, offset) == ' ') { + offset++; + } + container = add_child(container, block_quote, line_number, offset + 1); + + } else if ((matched = scan_atx_header_start(ln, first_nonspace))) { + + offset = first_nonspace + matched; + container = add_child(container, atx_header, line_number, offset + 1); + int hashpos = bstrchrp(ln, '#', first_nonspace); + check(hashpos != BSTR_ERR, "no # found in atx header start"); + int level = 0; + while (bchar(ln, hashpos) == '#') { + level++; + hashpos++; + } + container->attributes.header_level = level; + + } else if ((matched = scan_open_code_fence(ln, first_nonspace))) { + + container = add_child(container, fenced_code, line_number, + first_nonspace + 1); + container->attributes.fenced_code_data.fence_char = bchar(ln, + first_nonspace); + container->attributes.fenced_code_data.fence_length = matched; + container->attributes.fenced_code_data.fence_offset = + first_nonspace - offset; + offset = first_nonspace + matched; + + } else if ((matched = scan_html_block_tag(ln, first_nonspace))) { + + container = add_child(container, html_block, line_number, + first_nonspace + 1); + // note, we don't adjust offset because the tag is part of the text + + } else if (container->tag == paragraph && + (lev = scan_setext_header_line(ln, 
first_nonspace)) && + // check that there is only one line in the paragraph: + bstrrchrp(container->string_content, '\n', + blength(container->string_content) - 2) == BSTR_ERR) { + + container->tag = setext_header; + container->attributes.header_level = lev; + offset = blength(ln) - 1; + + } else if (!(container->tag == paragraph && !all_matched) && + (matched = scan_hrule(ln, first_nonspace))) { + + // it's only now that we know the line is not part of a setext header: + container = add_child(container, hrule, line_number, first_nonspace + 1); + finalize(container, line_number); + container = container->parent; + offset = blength(ln) - 1; + + } else if ((matched = parse_list_marker(ln, first_nonspace, &data))) { + + // compute padding: + offset = first_nonspace + matched; + i = 0; + while (i <= 5 && bchar(ln, offset + i) == ' ') { + i++; + } + // i = number of spaces after marker, up to 5 + if (i >= 5 || i < 1 || bchar(ln, offset) == '\n') { + data->padding = matched + 1; + if (i > 0) { + offset += 1; + } + } else { + data->padding = matched + i; + offset += i; + } + + // check container; if it's a list, see if this list item + // can continue the list; otherwise, create a list container. + + data->marker_offset = indent; + + if (container->tag != list || + !lists_match(container->attributes.list_data, *data)) { + container = add_child(container, list, line_number, + first_nonspace + 1); + container->attributes.list_data = *data; + } + + // add the list item + container = add_child(container, list_item, line_number, + first_nonspace + 1); + container->attributes.list_data = *data; + free(data); + + } else { + break; + } + + if (accepts_lines(container->tag)) { + // if it's a line container, it can't contain other containers + break; + } + } + + // what remains at offset is a text line. add the text to the + // appropriate container. 
+ + first_nonspace = offset; + while (bchar(ln, first_nonspace) == ' ') { + first_nonspace++; + } + + indent = first_nonspace - offset; + blank = bchar(ln, first_nonspace) == '\n'; + + // block quote lines are never blank as they start with > + // and we don't count blanks in fenced code for purposes of tight/loose + // lists or breaking out of lists. we also don't set last_line_blank + // on an empty list item. + container->last_line_blank = (blank && + container->tag != block_quote && + container->tag != fenced_code && + !(container->tag == list_item && + container->children == NULL && + container->start_line == line_number)); + + block *cont = container; + while (cont->parent) { + cont->parent->last_line_blank = false; + cont = cont->parent; + } + + if (cur != last_matched_container && + container == last_matched_container && + !blank && + cur->tag == paragraph && + blength(cur->string_content) > 0) { + + check(add_line(cur, ln, offset) == 0, "could not add line"); + + } else { // not a lazy continuation + + // finalize any blocks that were not matched and set cur to container: + while (cur != last_matched_container) { + + finalize(cur, line_number); + cur = cur->parent; + check(cur != NULL, "cur is NULL, last_matched_container->tag = %d", + last_matched_container->tag); + + } + + if (container->tag == indented_code) { + + check(add_line(container, ln, offset) == 0, "could not add line"); + + } else if (container->tag == fenced_code) { + + matched = (indent <= 3 + && bchar(ln, first_nonspace) == container->attributes.fenced_code_data.fence_char) + && scan_close_code_fence(ln, first_nonspace, + container->attributes.fenced_code_data.fence_length); + if (matched) { + // if closing fence, don't add line to container; instead, close it: + finalize(container, line_number); + container = container->parent; // back up to parent + } else { + check(add_line(container, ln, offset) == 0, "could not add line"); + } + + } else if (container->tag == html_block) { + + 
check(add_line(container, ln, offset) == 0, "could not add line"); + + } else if (blank) { + + // ??? do nothing + + } else if (container->tag == atx_header) { + + // chop off trailing ###s...use a scanner? + brtrimws(ln); + int p = blength(ln) - 1; + int numhashes = 0; + // if string ends in #s, remove these: + while (bchar(ln, p) == '#') { + p--; + numhashes++; + } + if (bchar(ln, p) == '\\') { + // the last # was escaped, so we include it. + p++; + numhashes--; + } + check(bdelete(ln, p + 1, numhashes) != BSTR_ERR, + "could not delete final hashes"); + check(add_line(container, ln, first_nonspace) == 0, "could not add line"); + finalize(container, line_number); + container = container->parent; + + } else if (accepts_lines(container->tag)) { + + check(add_line(container, ln, first_nonspace) == 0, "could not add line"); + + } else if (container->tag != hrule && container->tag != setext_header) { + + // create paragraph container for line + container = add_child(container, paragraph, line_number, first_nonspace + 1); + check(add_line(container, ln, first_nonspace) == 0, "could not add line"); + + } else { + + log_warn("Line %d with container type %d did not match any condition:\n\"%s\"", + line_number, container->tag, ln->data); + + } + *curptr = container; + } + + return 0; + error: + return -1; +} + diff --git a/src/bstrlib.c b/src/bstrlib.c new file mode 100644 index 0000000..1b19dbe --- /dev/null +++ b/src/bstrlib.c @@ -0,0 +1,2979 @@ +/*
+ * This source file is part of the bstring string library. This code was
+ * written by Paul Hsieh in 2002-2010, and is covered by either the 3-clause
+ * BSD open source license or GPL v2.0. Refer to the accompanying documentation
+ * for details on usage and license.
+ */
+
+/*
+ * bstrlib.c
+ *
+ * This file is the core module for implementing the bstring functions.
+ */
+
+#if defined (_MSC_VER)
+/* These warnings from MSVC++ are totally pointless. */
+# define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include "bstrlib.h"
+
+/* Optionally include a mechanism for debugging memory. memdbg.h is pulled in
+ * only when MEMORY_DEBUG or BSTRLIB_MEMORY_DEBUG is defined. Each bstr__*
+ * hook below is defined only if it is not already defined at this point, in
+ * which case it maps directly onto the matching C library routine. */
+
+#if defined(MEMORY_DEBUG) || defined(BSTRLIB_MEMORY_DEBUG)
+#include "memdbg.h"
+#endif
+
+#ifndef bstr__alloc
+#define bstr__alloc(x) malloc (x)
+#endif
+
+#ifndef bstr__free
+#define bstr__free(p) free (p)
+#endif
+
+#ifndef bstr__realloc
+#define bstr__realloc(p,x) realloc ((p), (x))
+#endif
+
+#ifndef bstr__memcpy
+#define bstr__memcpy(d,s,l) memcpy ((d), (s), (l))
+#endif
+
+#ifndef bstr__memmove
+#define bstr__memmove(d,s,l) memmove ((d), (s), (l))
+#endif
+
+#ifndef bstr__memset
+#define bstr__memset(d,c,l) memset ((d), (c), (l))
+#endif
+
+#ifndef bstr__memcmp
+#define bstr__memcmp(d,c,l) memcmp ((d), (c), (l))
+#endif
+
+#ifndef bstr__memchr
+#define bstr__memchr(s,c,l) memchr ((s), (c), (l))
+#endif
+
+/* Just a length safe wrapper for memmove: the copy is skipped when L is not
+ * greater than 0. The expansion is a braced block rather than an expression,
+ * and L is evaluated twice when the copy is performed. */
+
+#define bBlockCopy(D,S,L) { if ((L) > 0) bstr__memmove ((D),(S),(L)); }
+
+/* Map a requested size onto the size that is actually allocated. Requests
+ * below 8 yield 8; any other request yields the least power of two strictly
+ * greater than it, so that repeated small growths do not each trigger a
+ * reallocation. If that power of two, converted back to int, is smaller than
+ * the request (overflow), the request itself is returned. */
+static int snapUpSize (int i) {
+    unsigned int bits;
+
+    if (i < 8) return 8;
+
+    /* Propagate the highest set bit into every lower bit position. */
+    bits = (unsigned int) i;
+    bits |= (bits >> 1);
+    bits |= (bits >> 2);
+    bits |= (bits >> 4);
+    bits |= (bits >> 8); /* Always valid: int has at least 16 bits */
+#if (UINT_MAX != 0xffff)
+    bits |= (bits >> 16); /* Needed once int is 32 bits wide */
+#if (UINT_MAX > 0xffffffffUL)
+    bits |= (bits >> 32); /* Needed once int is 64 bits wide */
+#endif
+#endif
+
+    /* All ones below the top bit, plus one, is the next power of two. */
+    bits += 1;
+    return ((int) bits >= i) ? (int) bits : i;
+}
+
+/* int balloc (bstring b, int olen)
+ *
+ * Increase the size of the memory backing the bstring b to at least olen.
+ *
+ * Returns BSTR_ERR when b or b->data is NULL, when slen is negative, when
+ * mlen is not positive or is smaller than slen, when olen is not positive,
+ * or when no allocation attempt succeeds. Returns BSTR_OK otherwise.
+ * Nothing is changed when olen is smaller than the current mlen. When the
+ * buffer is replaced, b->data and b->mlen are updated and a '\0' is stored
+ * at b->data[b->slen].
+ */
+int balloc (bstring b, int olen) {
+ int len;
+ if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen <= 0 ||
+ b->mlen < b->slen || olen <= 0) {
+ return BSTR_ERR;
+ }
+
+ if (olen >= b->mlen) {
+ unsigned char * x;
+
+ if ((len = snapUpSize (olen)) <= b->mlen) return BSTR_OK;
+
+ /* Assume probability of a non-moving realloc is 0.125 */
+ if (7 * b->mlen < 8 * b->slen) {
+
+ /* If slen is close to mlen in size then use realloc to reduce
+ memory fragmentation */
+
+ reallocStrategy:;
+
+ x = (unsigned char *) bstr__realloc (b->data, (size_t) len);
+ if (x == NULL) {
+
+ /* Since we failed, try the tightest possible allocation,
+ which is exactly olen bytes */
+
+ if (NULL == (x = (unsigned char *) bstr__realloc (b->data, (size_t) (len = olen)))) {
+ return BSTR_ERR;
+ }
+ }
+ } else {
+
+ /* If slen is not close to mlen then avoid the penalty of copying
+ the extra bytes that are allocated, but not considered part of
+ the string: allocate a fresh buffer and copy only slen bytes */
+
+ if (NULL == (x = (unsigned char *) bstr__alloc ((size_t) len))) {
+
+ /* Perhaps there is no available memory for the two
+ allocations to be in memory at once, so fall back to the
+ realloc path above */
+
+ goto reallocStrategy;
+
+ } else {
+ if (b->slen) bstr__memcpy ((char *) x, (char *) b->data, (size_t) b->slen);
+ bstr__free (b->data);
+ }
+ }
+ b->data = x;
+ b->mlen = len;
+ b->data[b->slen] = (unsigned char) '\0';
+ }
+
+ return BSTR_OK;
+}
+
+/* int ballocmin (bstring b, int len)
+ *
+ * Resize the memory backing the bstring b to exactly len bytes, or to
+ * b->slen+1 bytes when len is smaller than that. Unlike balloc the size is
+ * not rounded up, so calling this repeatedly can degrade performance.
+ *
+ * Returns BSTR_ERR when b is not a valid writable bstring, when len is not
+ * positive, or when the reallocation fails; BSTR_OK otherwise.
+ */
+int ballocmin (bstring b, int len) {
+    unsigned char * resized;
+    int needed;
+
+    if (b == NULL || b->data == NULL) return BSTR_ERR;
+    if ((b->slen+1) < 0 || b->mlen <= 0 || b->mlen < b->slen) return BSTR_ERR;
+    if (len <= 0) return BSTR_ERR;
+
+    /* Never shrink below the content plus its terminator. */
+    needed = b->slen + 1;
+    if (len < needed) len = needed;
+
+    if (len == b->mlen) return BSTR_OK;
+
+    resized = (unsigned char *) bstr__realloc (b->data, (size_t) len);
+    if (resized == NULL) return BSTR_ERR;
+
+    resized[b->slen] = (unsigned char) '\0';
+    b->data = resized;
+    b->mlen = len;
+    return BSTR_OK;
+}
+
+/* bstring bfromcstr (const char * str)
+ *
+ * Build a new bstring holding a copy of the '\0' terminated buffer str.
+ * The buffer size is chosen by snapUpSize from the string length. Returns
+ * NULL when str is NULL, when the computed size does not exceed the length,
+ * or when either allocation fails.
+ */
+bstring bfromcstr (const char * str) {
+    bstring result;
+    size_t srclen;
+    int capacity;
+
+    if (str == NULL) return NULL;
+
+    srclen = (strlen) (str);
+    /* Ask for length + 1, or for 2 when the string is empty. */
+    capacity = snapUpSize ((int) (srclen + (2 - (srclen != 0))));
+    if (capacity <= (int) srclen) return NULL;
+
+    result = (bstring) bstr__alloc (sizeof (struct tagbstring));
+    if (result == NULL) return NULL;
+
+    result->slen = (int) srclen;
+    result->mlen = capacity;
+    result->data = (unsigned char *) bstr__alloc (result->mlen);
+    if (result->data == NULL) {
+        bstr__free (result);
+        return NULL;
+    }
+
+    /* The + 1 copies the terminating '\0' as well. */
+    bstr__memcpy (result->data, str, srclen + 1);
+    return result;
+}
+
+/* bstring bfromcstralloc (int mlen, const char * str)
+ *
+ * Build a new bstring holding a copy of the '\0' terminated buffer str,
+ * with a backing buffer of at least mlen bytes. The buffer is the larger of
+ * mlen and the size snapUpSize picks for the string length. Returns NULL
+ * when str is NULL, when the computed size does not exceed the length, or
+ * when either allocation fails.
+ */
+bstring bfromcstralloc (int mlen, const char * str) {
+    bstring result;
+    size_t srclen;
+    int capacity;
+
+    if (str == NULL) return NULL;
+
+    srclen = (strlen) (str);
+    /* Ask for length + 1, or for 2 when the string is empty. */
+    capacity = snapUpSize ((int) (srclen + (2 - (srclen != 0))));
+    if (capacity <= (int) srclen) return NULL;
+
+    result = (bstring) bstr__alloc (sizeof (struct tagbstring));
+    if (result == NULL) return NULL;
+
+    result->slen = (int) srclen;
+    if (capacity < mlen) capacity = mlen;
+    result->mlen = capacity;
+
+    result->data = (unsigned char *) bstr__alloc (result->mlen);
+    if (result->data == NULL) {
+        bstr__free (result);
+        return NULL;
+    }
+
+    /* The + 1 copies the terminating '\0' as well. */
+    bstr__memcpy (result->data, str, srclen + 1);
+    return result;
+}
+
+/* bstring blk2bstr (const void * blk, int len)
+ *
+ * Build a new bstring from the first len bytes at blk and append a '\0'
+ * terminator. Returns NULL when blk is NULL, when len is negative, or when
+ * either allocation fails.
+ */
+bstring blk2bstr (const void * blk, int len) {
+    bstring result;
+
+    if (blk == NULL || len < 0) return NULL;
+
+    result = (bstring) bstr__alloc (sizeof (struct tagbstring));
+    if (result == NULL) return NULL;
+
+    result->slen = len;
+    /* Ask for len + 1, or for 2 when the block is empty. */
+    result->mlen = snapUpSize (len + (2 - (len != 0)));
+
+    result->data = (unsigned char *) bstr__alloc ((size_t) result->mlen);
+    if (result->data == NULL) {
+        bstr__free (result);
+        return NULL;
+    }
+
+    if (len > 0) bstr__memcpy (result->data, blk, (size_t) len);
+    result->data[len] = (unsigned char) '\0';
+    return result;
+}
+
+/* char * bstr2cstr (const_bstring b, char z)
+ *
+ * Allocate a '\0' terminated char buffer holding the contents of the
+ * bstring b, with every embedded '\0' replaced by the character z. The
+ * caller releases the result with bcstrfree (). Returns NULL when b is
+ * NULL, has a negative length or no data, or when the allocation fails.
+ */
+char * bstr2cstr (const_bstring b, char z) {
+    char * out;
+    int pos, total;
+
+    if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
+
+    total = b->slen;
+    out = (char *) bstr__alloc ((size_t) (total + 1));
+    if (out == NULL) return out;
+
+    for (pos = 0; pos < total; pos++) {
+        if (b->data[pos] == '\0') {
+            out[pos] = z;
+        } else {
+            out[pos] = (char) (b->data[pos]);
+        }
+    }
+    out[total] = (unsigned char) '\0';
+
+    return out;
+}
+
+/* int bcstrfree (char * s)
+ *
+ * Release a C string produced by bstr2cstr (). It is a thin wrapper around
+ * bstr__free (); it exists so that callers keep working unchanged when
+ * bstr__alloc () and bstr__free () are redefined as macros (for example in
+ * memdbg.h under BSTRLIB_MEMORY_DEBUG) with behaviour that differs from
+ * the standard library. Returns BSTR_ERR for a NULL pointer, else BSTR_OK.
+ */
+int bcstrfree (char * s) {
+    if (s == NULL) return BSTR_ERR;
+
+    bstr__free (s);
+    return BSTR_OK;
+}
+
+/* int bconcat (bstring b0, const_bstring b1)
+ *
+ * Concatenate the bstring b1 to the bstring b0.
+ *
+ * Returns BSTR_ERR when either argument or its data pointer is NULL, when
+ * any of the lengths involved (or their sum) is negative, or when growing
+ * b0 fails; BSTR_OK otherwise. When b0 has to grow and b1's data starts
+ * inside b0's buffer, b1 is first duplicated with bstrcpy, because balloc
+ * may replace that buffer; the temporary copy is destroyed before
+ * returning.
+ */
+int bconcat (bstring b0, const_bstring b1) {
+int len, d;
+bstring aux = (bstring) b1;
+
+ if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL) return BSTR_ERR;
+
+ d = b0->slen;
+ len = b1->slen;
+ if ((d | (b0->mlen - d) | len | (d + len)) < 0) return BSTR_ERR;
+
+ if (b0->mlen <= d + len + 1) {
+ ptrdiff_t pd = b1->data - b0->data;
+ if (0 <= pd && pd < b0->mlen) {
+ if (NULL == (aux = bstrcpy (b1))) return BSTR_ERR;
+ }
+ if (balloc (b0, d + len + 1) != BSTR_OK) {
+ if (aux != b1) bdestroy (aux);
+ return BSTR_ERR;
+ }
+ }
+
+ bBlockCopy (&b0->data[d], &aux->data[0], (size_t) len);
+ b0->data[d + len] = (unsigned char) '\0';
+ b0->slen = d + len;
+ if (aux != b1) bdestroy (aux);
+ return BSTR_OK;
+}
+
+/* int bconchar (bstring b, char c)
+ *
+ * Append the single character c to the bstring b. balloc is asked for room
+ * for the new character plus the terminator. Returns BSTR_ERR when b is
+ * NULL, when slen is negative or larger than mlen, or when balloc fails;
+ * BSTR_OK otherwise.
+ */
+int bconchar (bstring b, char c) {
+    int end;
+
+    if (b == NULL) return BSTR_ERR;
+
+    end = b->slen;
+    if (end < 0 || (b->mlen - end) < 0) return BSTR_ERR;
+    if (balloc (b, end + 2) != BSTR_OK) return BSTR_ERR;
+
+    b->data[end] = (unsigned char) c;
+    b->data[end + 1] = (unsigned char) '\0';
+    b->slen = end + 1;
+    return BSTR_OK;
+}
+
+/* int bcatcstr (bstring b, const char * s)
+ *
+ * Concatenate a char * string to a bstring.
+ *
+ * Returns BSTR_ERR when b is not a valid writable bstring or s is NULL.
+ * The characters of s are first copied straight into the unused part of
+ * b's buffer; if the terminating '\0' of s is reached there, the function
+ * returns BSTR_OK without reallocating. Otherwise the buffer is full and
+ * the remainder of s is appended through bcatblk, whose result is
+ * returned.
+ */
+int bcatcstr (bstring b, const char * s) {
+char * d;
+int i, l;
+
+ if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen < b->slen
+ || b->mlen <= 0 || s == NULL) return BSTR_ERR;
+
+ /* Optimistically concatenate directly */
+ l = b->mlen - b->slen;
+ d = (char *) &b->data[b->slen];
+ for (i=0; i < l; i++) {
+ if ((*d++ = *s++) == '\0') {
+ b->slen += i;
+ return BSTR_OK;
+ }
+ }
+ b->slen += i;
+
+ /* Need to explicitly resize and concatenate tail */
+ return bcatblk (b, (const void *) s, (int) strlen (s));
+}
+
+/* int bcatblk (bstring b, const void * s, int len)
+ *
+ * Append the len bytes at s to the bstring b and re-terminate it. Returns
+ * BSTR_ERR when b is not a valid writable bstring, when s is NULL, when
+ * len is negative, when the new length overflows to a negative value, or
+ * when balloc fails; BSTR_OK otherwise.
+ */
+int bcatblk (bstring b, const void * s, int len) {
+    int total;
+
+    if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen < b->slen
+        || b->mlen <= 0 || s == NULL || len < 0) return BSTR_ERR;
+
+    total = b->slen + len;
+    if (total < 0) return BSTR_ERR; /* Overflow? */
+
+    /* Grow only when the terminator would not fit any more. */
+    if (b->mlen <= total) {
+        if (balloc (b, total + 1) < 0) return BSTR_ERR;
+    }
+
+    bBlockCopy (b->data + b->slen, s, (size_t) len);
+    b->data[total] = (unsigned char) '\0';
+    b->slen = total;
+    return BSTR_OK;
+}
+
+/* bstring bstrcpy (const_bstring b)
+ *
+ * Return a newly allocated copy of the bstring b. Returns NULL when b is
+ * NULL, has a negative length or no data, or when memory cannot be
+ * allocated for the header or the data.
+ */
+bstring bstrcpy (const_bstring b) {
+    bstring copy;
+    int used, capacity;
+
+    /* Refuse to copy an invalid string. */
+    if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
+
+    copy = (bstring) bstr__alloc (sizeof (struct tagbstring));
+    if (copy == NULL) return NULL;
+
+    used = b->slen;
+    capacity = snapUpSize (used + 1);
+    copy->data = (unsigned char *) bstr__alloc (capacity);
+    if (copy->data == NULL) {
+        /* Retry with the smallest buffer that still holds the terminator. */
+        capacity = used + 1;
+        copy->data = (unsigned char *) bstr__alloc (capacity);
+        if (copy->data == NULL) {
+            bstr__free (copy);
+            return NULL;
+        }
+    }
+
+    copy->mlen = capacity;
+    copy->slen = used;
+
+    if (used != 0) bstr__memcpy ((char *) copy->data, (char *) b->data, used);
+    copy->data[used] = (unsigned char) '\0';
+
+    return copy;
+}
+
+/* int bassign (bstring a, const_bstring b)
+ *
+ * Replace the contents of the string a with the contents of string b.
+ * Returns BSTR_ERR when b is invalid, when a is invalid, or when balloc
+ * fails; BSTR_OK otherwise.
+ */
+int bassign (bstring a, const_bstring b) {
+    if (b == NULL || b->data == NULL || b->slen < 0)
+        return BSTR_ERR;
+
+    if (b->slen == 0) {
+        /* Nothing to copy, so balloc is not called; validate a here. */
+        if (a == NULL || a->data == NULL || a->mlen < a->slen ||
+            a->slen < 0 || a->mlen == 0)
+            return BSTR_ERR;
+    } else {
+        /* balloc performs the validity checks on a. */
+        if (balloc (a, b->slen) != BSTR_OK) return BSTR_ERR;
+        bstr__memmove (a->data, b->data, b->slen);
+    }
+
+    a->slen = b->slen;
+    a->data[a->slen] = (unsigned char) '\0';
+    return BSTR_OK;
+}
+
+/* int bassignmidstr (bstring a, const_bstring b, int left, int len)
+ *
+ * Replace the contents of the string a with the part of string b that
+ * starts at position left and runs for len characters. A negative left is
+ * moved to 0 with len shortened by the same amount, and len is cut so the
+ * range ends within b. An empty range leaves a as the empty string.
+ * Returns BSTR_ERR when a or b is invalid or when balloc fails.
+ */
+int bassignmidstr (bstring a, const_bstring b, int left, int len) {
+    if (b == NULL || b->data == NULL || b->slen < 0)
+        return BSTR_ERR;
+
+    /* Clamp the requested range to the bounds of b. */
+    if (left < 0) {
+        len += left;
+        left = 0;
+    }
+    if (len > b->slen - left) len = b->slen - left;
+
+    if (a == NULL || a->data == NULL || a->mlen < a->slen ||
+        a->slen < 0 || a->mlen == 0)
+        return BSTR_ERR;
+
+    if (len <= 0) {
+        a->slen = 0;
+        a->data[0] = (unsigned char) '\0';
+        return BSTR_OK;
+    }
+
+    if (balloc (a, len) != BSTR_OK) return BSTR_ERR;
+    bstr__memmove (a->data, b->data + left, len);
+    a->slen = len;
+    a->data[len] = (unsigned char) '\0';
+    return BSTR_OK;
+}
+
+/* int bassigncstr (bstring a, const char * str)
+ *
+ * Overwrite the string a with the contents of char * string str. Note that
+ * the bstring a must be a well defined and writable bstring. If an error
+ * occurs BSTR_ERR is returned however a may be partially overwritten.
+ *
+ * The copy runs in two phases. First, characters are copied one by one
+ * into the existing buffer; if the terminating '\0' of str is copied
+ * within the first mlen bytes the function is done. Otherwise slen is set
+ * to the number of bytes copied so far, the rest of str is measured with
+ * strlen, the buffer is grown with balloc and the remainder (including the
+ * terminator) is copied. BSTR_ERR is also returned when the total length
+ * would exceed INT_MAX.
+ */
+int bassigncstr (bstring a, const char * str) {
+int i;
+size_t len;
+ if (a == NULL || a->data == NULL || a->mlen < a->slen ||
+ a->slen < 0 || a->mlen == 0 || NULL == str)
+ return BSTR_ERR;
+
+ for (i=0; i < a->mlen; i++) {
+ if ('\0' == (a->data[i] = str[i])) {
+ a->slen = i;
+ return BSTR_OK;
+ }
+ }
+
+ a->slen = i;
+ len = strlen (str + i);
+ if (len > INT_MAX || i + len + 1 > INT_MAX ||
+ 0 > balloc (a, (int) (i + len + 1))) return BSTR_ERR;
+ bBlockCopy (a->data + i, str + i, (size_t) len + 1);
+ a->slen += (int) len;
+ return BSTR_OK;
+}
+
+/* int bassignblk (bstring a, const void * s, int len)
+ *
+ * Replace the contents of the string a with the len bytes at s. The
+ * bstring a must be a well defined and writable bstring. All checks and
+ * the call to balloc happen before any byte is copied into a. Returns
+ * BSTR_ERR when a is invalid, s is NULL, len + 1 is less than 1, or balloc
+ * fails; BSTR_OK otherwise.
+ */
+int bassignblk (bstring a, const void * s, int len) {
+    if (a == NULL || a->data == NULL || a->mlen < a->slen ||
+        a->slen < 0 || a->mlen == 0 || NULL == s || len + 1 < 1)
+        return BSTR_ERR;
+
+    /* Grow only when the block plus its terminator does not fit. */
+    if (len + 1 > a->mlen) {
+        if (balloc (a, len + 1) < 0) return BSTR_ERR;
+    }
+
+    bBlockCopy (a->data, s, (size_t) len);
+    a->slen = len;
+    a->data[len] = (unsigned char) '\0';
+    return BSTR_OK;
+}
+
+/* int btrunc (bstring b, int n)
+ *
+ * Shorten the bstring to at most n characters; a string that is already
+ * that short is left untouched. Returns BSTR_ERR when n is negative or b
+ * is not a valid writable bstring, BSTR_OK otherwise.
+ */
+int btrunc (bstring b, int n) {
+    if (n < 0) return BSTR_ERR;
+    if (b == NULL || b->data == NULL) return BSTR_ERR;
+    if (b->mlen < b->slen || b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+
+    if (n >= b->slen) return BSTR_OK;
+
+    b->slen = n;
+    b->data[n] = (unsigned char) '\0';
+    return BSTR_OK;
+}
+
+/* Case-mapping and whitespace helpers. The argument is converted to
+ * unsigned char before it reaches the <ctype.h> routine, and it is
+ * parenthesised so that the cast applies to the whole argument expression
+ * rather than only to its first operand. */
+#define upcase(c) (toupper ((unsigned char) (c)))
+#define downcase(c) (tolower ((unsigned char) (c)))
+#define wspace(c) (isspace ((unsigned char) (c)))
+
+/* int btoupper (bstring b)
+ *
+ * Convert every character of the bstring to upper case in place. Returns
+ * BSTR_ERR when b is not a valid writable bstring, BSTR_OK otherwise.
+ */
+int btoupper (bstring b) {
+    unsigned char * cur;
+    unsigned char * stop;
+
+    if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+
+    stop = b->data + b->slen;
+    for (cur = b->data; cur < stop; cur++) {
+        *cur = (unsigned char) upcase (*cur);
+    }
+    return BSTR_OK;
+}
+
+/* int btolower (bstring b)
+ *
+ * Convert every character of the bstring to lower case in place. Returns
+ * BSTR_ERR when b is not a valid writable bstring, BSTR_OK otherwise.
+ */
+int btolower (bstring b) {
+    unsigned char * cur;
+    unsigned char * stop;
+
+    if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+
+    stop = b->data + b->slen;
+    for (cur = b->data; cur < stop; cur++) {
+        *cur = (unsigned char) downcase (*cur);
+    }
+    return BSTR_OK;
+}
+
+/* int bstricmp (const_bstring b0, const_bstring b1)
+ *
+ * Case-insensitive comparison of two strings. The result is the difference
+ * of the lower-cased characters at the first position where the strings
+ * differ, or 0 when they are equal. When one string is a prefix of the
+ * other, the result is derived from the first extra character (positive if
+ * b0 is longer, negative if b1 is longer); an extra character that lower
+ * cases to '\0' counts as UCHAR_MAX+1. SHRT_MIN is returned on invalid
+ * input.
+ */
+int bstricmp (const_bstring b0, const_bstring b1) {
+    int pos, diff, common;
+
+    if (bdata (b0) == NULL || b0->slen < 0 ||
+        bdata (b1) == NULL || b1->slen < 0) return SHRT_MIN;
+
+    /* common = length of the shorter string. */
+    if (b0->slen > b1->slen) {
+        common = b1->slen;
+    } else {
+        common = b0->slen;
+        /* Same length and same buffer: trivially equal. */
+        if (b0->slen == b1->slen && b0->data == b1->data) return BSTR_OK;
+    }
+
+    for (pos = 0; pos < common; pos++) {
+        diff = (char) downcase (b0->data[pos])
+             - (char) downcase (b1->data[pos]);
+        if (diff != 0) return diff;
+    }
+
+    if (b0->slen > common) {
+        diff = (char) downcase (b0->data[common]);
+        return diff ? diff : UCHAR_MAX + 1;
+    }
+    if (b1->slen > common) {
+        diff = - (char) downcase (b1->data[common]);
+        return diff ? diff : - (int) (UCHAR_MAX + 1);
+    }
+    return BSTR_OK;
+}
+
+/* int bstrnicmp (const_bstring b0, const_bstring b1, int n)
+ *
+ * Compare two strings without differentiating between case for at most n
+ * characters. If the two strings first differ before the nth position, the
+ * return value is the difference of the lower-cased characters at that
+ * position (the same convention as bstricmp), otherwise 0 is returned. If
+ * the lengths are different and less than n characters, then a difference
+ * from 0 is given, but if the first extra character is '\0', then it is
+ * taken to be the value UCHAR_MAX+1. SHRT_MIN is returned when either
+ * string is invalid or n is negative.
+ */
+int bstrnicmp (const_bstring b0, const_bstring b1, int n) {
+    int i, v, m;
+
+    if (bdata (b0) == NULL || b0->slen < 0 ||
+        bdata (b1) == NULL || b1->slen < 0 || n < 0) return SHRT_MIN;
+
+    /* m = number of positions present in both strings, capped at n. */
+    m = n;
+    if (m > b0->slen) m = b0->slen;
+    if (m > b1->slen) m = b1->slen;
+
+    /* Identical buffers need no character-by-character scan. */
+    if (b0->data != b1->data) {
+        for (i = 0; i < m; i ++) {
+            v  = (char) downcase (b0->data[i]);
+            v -= (char) downcase (b1->data[i]);
+            /* Return the case-folded difference. The raw byte difference
+               can have the opposite sign, e.g. 'a' - 'B' is positive even
+               though "a" sorts before "b" when case is ignored. */
+            if (v != 0) return v;
+        }
+    }
+
+    if (n == m || b0->slen == b1->slen) return BSTR_OK;
+
+    if (b0->slen > m) {
+        v = (char) downcase (b0->data[m]);
+        if (v) return v;
+        return UCHAR_MAX + 1;
+    }
+
+    v = - (char) downcase (b1->data[m]);
+    if (v) return v;
+    return - (int) (UCHAR_MAX + 1);
+}
+
+/* int biseqcaseless (const_bstring b0, const_bstring b1)
+ *
+ * Test two strings for equality while ignoring case. Returns 1 when they
+ * match, 0 when they differ in more than case, and BSTR_ERR when either
+ * string is invalid. Strings of different length are rejected without
+ * looking at their characters. '\0' characters get no special treatment.
+ */
+int biseqcaseless (const_bstring b0, const_bstring b1) {
+    int pos, total;
+
+    if (bdata (b0) == NULL || b0->slen < 0 ||
+        bdata (b1) == NULL || b1->slen < 0) return BSTR_ERR;
+    if (b0->slen != b1->slen) return BSTR_OK;
+    if (b0->data == b1->data || b0->slen == 0) return 1;
+
+    total = b0->slen;
+    for (pos = 0; pos < total; pos++) {
+        unsigned char folded;
+
+        /* Identical bytes need no case folding. */
+        if (b0->data[pos] == b1->data[pos]) continue;
+
+        folded = (unsigned char) downcase (b0->data[pos]);
+        if (folded != (unsigned char) downcase (b1->data[pos])) return 0;
+    }
+    return 1;
+}
+
+/* int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len)
+ *
+ * Test whether the first len characters of b0 match the memory block blk
+ * while ignoring case. Returns 1 on a match, 0 when they differ in more
+ * than case or when b0 is shorter than len, and BSTR_ERR when b0 is
+ * invalid, blk is NULL or len is negative. '\0' characters get no special
+ * treatment.
+ */
+int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len) {
+    const unsigned char * other = (const unsigned char *) blk;
+    int pos;
+
+    if (bdata (b0) == NULL || b0->slen < 0 || NULL == blk || len < 0)
+        return BSTR_ERR;
+    if (b0->slen < len) return BSTR_OK;
+    if (b0->data == other || len == 0) return 1;
+
+    for (pos = 0; pos < len; pos++) {
+        /* Identical bytes need no case folding. */
+        if (b0->data[pos] == other[pos]) continue;
+        if (downcase (b0->data[pos]) != downcase (other[pos])) return 0;
+    }
+    return 1;
+}
+
+/*
+ * int bltrimws (bstring b)
+ *
+ * Remove the run of whitespace at the left end of the string. When a
+ * non-whitespace character exists, the result of bdelete on the leading
+ * run is returned; otherwise the string becomes empty and BSTR_OK is
+ * returned. BSTR_ERR is returned when b is not a valid writable bstring.
+ */
+int bltrimws (bstring b) {
+    int lead, total;
+
+    if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+
+    /* Count the leading whitespace characters. */
+    total = b->slen;
+    lead = 0;
+    while (lead < total && wspace (b->data[lead])) lead++;
+
+    if (lead < total) return bdelete (b, 0, lead);
+
+    /* Empty or all-whitespace input: the result is the empty string. */
+    b->data[0] = (unsigned char) '\0';
+    b->slen = 0;
+    return BSTR_OK;
+}
+
+/*
+ * int brtrimws (bstring b)
+ *
+ * Delete whitespace contiguous from the right end of the string. Returns
+ * BSTR_ERR when b is not a valid writable bstring, BSTR_OK otherwise. A
+ * string made only of whitespace becomes the empty string.
+ */
+int brtrimws (bstring b) {
+    int i;
+
+    if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+
+    for (i = b->slen - 1; i >= 0; i--) {
+        if (!wspace (b->data[i])) {
+            /* Write the terminator only if index i+1 lies inside the
+               buffer. The validity check above admits mlen == slen, and
+               in that case data[i+1] can be one past the end. */
+            if (b->mlen > i + 1) b->data[i+1] = (unsigned char) '\0';
+            b->slen = i + 1;
+            return BSTR_OK;
+        }
+    }
+
+    /* Empty or all-whitespace input. */
+    b->data[0] = (unsigned char) '\0';
+    b->slen = 0;
+    return BSTR_OK;
+}
+
+/*
+ * int btrimws (bstring b)
+ *
+ * Delete whitespace contiguous from both ends of the string. The right end
+ * is trimmed first by moving the terminator; the left end is then removed
+ * with bdelete, whose result is returned. A string made only of whitespace
+ * becomes the empty string. Returns BSTR_ERR when b is not a valid
+ * writable bstring.
+ */
+int btrimws (bstring b) {
+    int i, j;
+
+    if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+
+    for (i = b->slen - 1; i >= 0; i--) {
+        if (!wspace (b->data[i])) {
+            /* Write the terminator only if index i+1 lies inside the
+               buffer. The validity check above admits mlen == slen, and
+               in that case data[i+1] can be one past the end. */
+            if (b->mlen > i + 1) b->data[i+1] = (unsigned char) '\0';
+            b->slen = i + 1;
+            /* data[i] is not whitespace, so this scan stops at i at the
+               latest. */
+            for (j = 0; wspace (b->data[j]); j++) {}
+            return bdelete (b, 0, j);
+        }
+    }
+
+    /* Empty or all-whitespace input. */
+    b->data[0] = (unsigned char) '\0';
+    b->slen = 0;
+    return BSTR_OK;
+}
+
+/* int biseq (const_bstring b0, const_bstring b1)
+ *
+ * Test the strings b0 and b1 for equality. Returns 1 when they are the
+ * same, 0 when they differ, and BSTR_ERR when either one is invalid.
+ * Strings of different length are rejected without looking at their
+ * characters. '\0' characters get no special treatment.
+ */
+int biseq (const_bstring b0, const_bstring b1) {
+    if (b0 == NULL || b1 == NULL) return BSTR_ERR;
+    if (b0->data == NULL || b1->data == NULL) return BSTR_ERR;
+    if (b0->slen < 0 || b1->slen < 0) return BSTR_ERR;
+
+    if (b0->slen != b1->slen) return BSTR_OK;
+    if (b0->data == b1->data || b0->slen == 0) return 1;
+
+    return (bstr__memcmp (b0->data, b1->data, b0->slen) == 0) ? 1 : 0;
+}
+
+/* int bisstemeqblk (const_bstring b0, const void * blk, int len)
+ *
+ * Test whether the first len characters of b0 equal the memory block blk.
+ * Returns 1 on a match, 0 when they differ or when b0 is shorter than len,
+ * and BSTR_ERR when b0 is invalid, blk is NULL or len is negative. '\0'
+ * characters get no special treatment.
+ */
+int bisstemeqblk (const_bstring b0, const void * blk, int len) {
+    const unsigned char * other = (const unsigned char *) blk;
+    int pos;
+
+    if (bdata (b0) == NULL || b0->slen < 0 || NULL == blk || len < 0)
+        return BSTR_ERR;
+    if (b0->slen < len) return BSTR_OK;
+    if (b0->data == other || len == 0) return 1;
+
+    pos = 0;
+    while (pos < len) {
+        if (b0->data[pos] != other[pos]) return BSTR_OK;
+        pos++;
+    }
+    return 1;
+}
+
+/* int biseqcstr (const_bstring b, const char * s)
+ *
+ * Test the bstring b against the C string s. They are equal only when s
+ * is terminated at exactly the length of b and every character before
+ * that matches; a '\0' inside s before that point therefore makes them
+ * unequal. Returns 1 when equal, 0 when unequal, and BSTR_ERR when b or s
+ * is NULL or b is invalid.
+ */
+int biseqcstr (const_bstring b, const char * s) {
+    int pos;
+
+    if (b == NULL || s == NULL || b->data == NULL || b->slen < 0) return BSTR_ERR;
+
+    for (pos = 0; pos < b->slen; pos++) {
+        /* s ended early. */
+        if (s[pos] == '\0') return BSTR_OK;
+        if (b->data[pos] != (unsigned char) s[pos]) return BSTR_OK;
+    }
+
+    /* Equal only if s ends exactly where b does. */
+    return s[pos] == '\0';
+}
+
+/* int biseqcstrcaseless (const_bstring b, const char * s)
+ *
+ * Test the bstring b against the C string s while ignoring case. They are
+ * equal only when s is terminated at exactly the length of b and every
+ * character before that matches apart from case; a '\0' inside s before
+ * that point therefore makes them unequal. Returns 1 when equal, 0 when
+ * unequal, and BSTR_ERR when b or s is NULL or b is invalid.
+ */
+int biseqcstrcaseless (const_bstring b, const char * s) {
+    int pos;
+
+    if (b == NULL || s == NULL || b->data == NULL || b->slen < 0) return BSTR_ERR;
+
+    for (pos = 0; pos < b->slen; pos++) {
+        /* s ended early. */
+        if (s[pos] == '\0') return BSTR_OK;
+
+        /* Identical bytes need no case folding. */
+        if (b->data[pos] == (unsigned char) s[pos]) continue;
+
+        if (downcase (b->data[pos]) != (unsigned char) downcase (s[pos]))
+            return BSTR_OK;
+    }
+
+    /* Equal only if s ends exactly where b does. */
+    return s[pos] == '\0';
+}
+
+/* int bstrcmp (const_bstring b0, const_bstring b1)
+ *
+ * Compare the string b0 and b1. If there is an error, SHRT_MIN is returned,
+ * otherwise a value less than or greater than zero, indicating that the
+ * string pointed to by b0 is lexicographically less than or greater than
+ * the string pointed to by b1 is returned. If the string lengths are
+ * unequal but the characters up until the length of the shorter are equal
+ * then a value less than, or greater than zero, indicating that the string
+ * pointed to by b0 is shorter or longer than the string pointed to by b1 is
+ * returned. 0 is returned if and only if the two strings are the same. If
+ * the length of the strings are different, this function is O(n). Like its
+ * standard C library counter part strcmp, the comparison does not proceed
+ * past any '\0' termination characters encountered.
+ */
+int bstrcmp (const_bstring b0, const_bstring b1) {
+    int k, diff, common;
+
+    if (b0 == NULL || b1 == NULL) return SHRT_MIN;
+    if (b0->data == NULL || b1->data == NULL) return SHRT_MIN;
+    if (b0->slen < 0 || b1->slen < 0) return SHRT_MIN;
+
+    /* Number of leading positions present in both strings. */
+    common = (b0->slen < b1->slen) ? b0->slen : b1->slen;
+
+    /* Same length and either the same buffer or both empty: equal. */
+    if (b0->slen == b1->slen) {
+        if (b0->data == b1->data || b0->slen == 0) return BSTR_OK;
+    }
+
+    k = 0;
+    while (k < common) {
+        diff = ((char) b0->data[k]) - ((char) b1->data[k]);
+        if (diff != 0) return diff;
+        /* Both bytes are equal here; an embedded '\0' stops the compare. */
+        if (b0->data[k] == (unsigned char) '\0') return BSTR_OK;
+        k++;
+    }
+
+    /* Common prefix matched: the longer string orders after the shorter. */
+    if (b0->slen > common) return 1;
+    return (b1->slen > common) ? -1 : BSTR_OK;
+}
+
+/* int bstrncmp (const_bstring b0, const_bstring b1, int n)
+ *
+ * Compare the string b0 and b1 for at most n characters. If there is an
+ * error, SHRT_MIN is returned, otherwise a value is returned as if b0 and
+ * b1 were first truncated to at most n characters then bstrcmp was called
+ * with these new strings as parameters. If the length of the strings are
+ * different, this function is O(n). Like its standard C library counter
+ * part strcmp, the comparison does not proceed past any '\0' termination
+ * characters encountered.
+ */
+int bstrncmp (const_bstring b0, const_bstring b1, int n) {
+    int k, diff, limit;
+
+    if (b0 == NULL || b1 == NULL) return SHRT_MIN;
+    if (b0->data == NULL || b1->data == NULL) return SHRT_MIN;
+    if (b0->slen < 0 || b1->slen < 0) return SHRT_MIN;
+
+    /* limit = smallest of n and the two string lengths. */
+    limit = n;
+    if (b0->slen < limit) limit = b0->slen;
+    if (b1->slen < limit) limit = b1->slen;
+
+    /* Identical buffers need no byte-by-byte scan. */
+    if (b0->data != b1->data) {
+        for (k = 0; k < limit; k++) {
+            diff = ((char) b0->data[k]) - ((char) b1->data[k]);
+            if (diff != 0) return diff;
+            /* Both bytes are equal here; an embedded '\0' stops the compare. */
+            if (b0->data[k] == (unsigned char) '\0') return BSTR_OK;
+        }
+    }
+
+    /* All n requested characters matched, or the lengths agree. */
+    if (limit == n) return BSTR_OK;
+    if (b0->slen == b1->slen) return BSTR_OK;
+
+    /* One string ended before n characters: the longer one is greater. */
+    return (b0->slen > limit) ? 1 : -1;
+}
+
+/* bstring bmidstr (const_bstring b, int left, int len)
+ *
+ * Create a bstring which is the substring of b starting from position left
+ * and running for a length len (clamped by the end of the bstring b.) If
+ * b is detectably invalid, then NULL is returned. The section described
+ * by (left, len) is clamped to the boundaries of b.
+ */
+bstring bmidstr (const_bstring b, int left, int len) {
+    int avail;
+
+    if (b == NULL) return NULL;
+    if (b->slen < 0 || b->data == NULL) return NULL;
+
+    /* A negative start shortens the request and snaps to position 0. */
+    if (left < 0) {
+        len += left;
+        left = 0;
+    }
+
+    /* Never read past the end of b. */
+    avail = b->slen - left;
+    if (len > avail) len = avail;
+
+    /* Nothing left after clamping yields a fresh empty bstring. */
+    return (len > 0) ? blk2bstr (b->data + left, len) : bfromcstr ("");
+}
+
+/* int bdelete (bstring b, int pos, int len)
+ *
+ * Removes characters from pos to pos+len-1 inclusive and shifts the tail of
+ * the bstring starting from pos+len to pos. len must be positive for this
+ * call to have any effect. The section of the string described by (pos,
+ * len) is clamped to boundaries of the bstring b.
+ */
+int bdelete (bstring b, int pos, int len) {
+    /* Clamp to left side of bstring: a negative pos shortens the request
+       by the part that lies before the start of the string. */
+    if (pos < 0) {
+        len += pos;
+        pos = 0;
+    }
+
+    /* Returns BSTR_ERR for a negative (clamped) length or for a bstring
+       that is NULL, has no data, or has inconsistent slen/mlen. */
+    if (len < 0 || b == NULL || b->data == NULL || b->slen < 0 ||
+        b->mlen < b->slen || b->mlen <= 0)
+        return BSTR_ERR;
+
+    if (len > 0 && pos < b->slen) {
+        /* pos < b->slen holds here, so b->slen - pos is positive and the
+           comparison cannot overflow, unlike the equivalent test
+           "pos + len >= b->slen" when len is close to INT_MAX. */
+        if (len >= b->slen - pos) {
+            /* The section reaches the end: just truncate at pos. */
+            b->slen = pos;
+        } else {
+            /* Slide the tail down over the deleted section. */
+            bBlockCopy ((char *) (b->data + pos),
+                        (char *) (b->data + pos + len),
+                        b->slen - (pos+len));
+            b->slen -= len;
+        }
+        b->data[b->slen] = (unsigned char) '\0';
+    }
+    return BSTR_OK;
+}
+
+/* int bdestroy (bstring b)
+ *
+ * Free up the bstring. Note that if b is detectably invalid or not writable
+ * then no action is performed and BSTR_ERR is returned. Like a freed memory
+ * allocation, dereferences, writes or any other action on b after it has
+ * been bdestroyed is undefined.
+ */
+int bdestroy (bstring b) {
+    /* Refuse anything that does not look like a live, writable bstring. */
+    if (b == NULL) return BSTR_ERR;
+    if (b->slen < 0 || b->mlen <= 0 || b->mlen < b->slen) return BSTR_ERR;
+    if (b->data == NULL) return BSTR_ERR;
+
+    bstr__free (b->data);
+
+    /* Poison the header before releasing it, so that stale use of b has
+       one more chance of being caught by the validity checks. */
+    b->data = NULL;
+    b->mlen = -__LINE__;
+    b->slen = -1;
+
+    bstr__free (b);
+    return BSTR_OK;
+}
+
+/* int binstr (const_bstring b1, int pos, const_bstring b2)
+ *
+ * Search for the bstring b2 in b1 starting from position pos, and searching
+ * forward. If it is found then return with the first position where it is
+ * found, otherwise return BSTR_ERR. Note that this is just a brute force
+ * string searcher that does not attempt clever things like the Boyer-Moore
+ * search algorithm. Because of this there are many degenerate cases where
+ * this can take much longer than it needs to.
+ */
+int binstr (const_bstring b1, int pos, const_bstring b2) {
+int j, ii, ll, lf; /* j: characters of b2 matched so far; ii: start of the
+                      current candidate match; ll: length of b2; lf: upper
+                      limit for the scan position i */
+unsigned char * d0; /* characters of b2 (the pattern) */
+unsigned char c0; /* first character of b2 */
+register unsigned char * d1; /* characters of b1 (the text searched) */
+register unsigned char c1; /* pattern character currently being looked for */
+register int i; /* current scan position in b1 */
+
+ if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
+ b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
+ if (b1->slen == pos) return (b2->slen == 0)?pos:BSTR_ERR; /* at the very end only an empty b2 can match */
+ if (b1->slen < pos || pos < 0) return BSTR_ERR;
+ if (b2->slen == 0) return pos; /* an empty b2 is found immediately at pos */
+
+ /* No space to find such a string? */
+ if ((lf = b1->slen - b2->slen + 1) <= pos) return BSTR_ERR; /* lf is one past the last start position where b2 still fits */
+
+ /* An obvious alias case: both strings start at the same address and b2
+    is no longer than b1 (checked just above), so b2 matches at 0 */
+ if (b1->data == b2->data && pos == 0) return 0;
+
+ i = pos;
+
+ d0 = b2->data;
+ d1 = b1->data;
+ ll = b2->slen;
+
+ /* Peel off the b2->slen == 1 case: a plain scan for one character */
+ c0 = d0[0];
+ if (1 == ll) {
+ for (;i < lf; i++) if (c0 == d1[i]) return i;
+ return BSTR_ERR;
+ }
+
+ c1 = c0;
+ j = 0;
+ lf = b1->slen - 1; /* from here on lf is the last index of b1, so d1[1+i] below stays in bounds while i < lf */
+
+ ii = -1;
+ if (i < lf) do {
+ /* Unrolled current character test: positions i and i+1 are both
+    examined against c1 before looping again */
+ if (c1 != d1[i]) {
+ if (c1 != d1[1+i]) {
+ i += 2;
+ continue;
+ }
+ i++;
+ }
+
+ /* Take note if this is the start of a potential match */
+ if (0 == j) ii = i;
+
+ /* Shift the test character down by one */
+ j++;
+ i++;
+
+ /* If this isn't past the last character continue */
+ if (j < ll) {
+ c1 = d0[j];
+ continue;
+ }
+
+ N0:; /* also entered by the goto after the loop */
+
+ /* If no characters mismatched, then we matched.  i equals ii+j only
+    when every matched character advanced i and j together, i.e. no
+    position was skipped since the candidate started at ii */
+ if (i == ii+j) return ii;
+
+ /* Shift back to the beginning */
+ i -= j;
+ j = 0;
+ c1 = c0;
+ } while (i < lf);
+
+ /* Deal with last case if unrolling caused a misalignment: only the
+    final character of b2 is still unmatched and i is on the last
+    character of b1 */
+ if (i == lf && ll == j+1 && c1 == d1[i]) goto N0;
+
+ return BSTR_ERR;
+}
+
+/* int binstrr (const_bstring b1, int pos, const_bstring b2)
+ *
+ * Search for the bstring b2 in b1 starting from position pos, and searching
+ * backward. If it is found then return with the first position where it is
+ * found, otherwise return BSTR_ERR. Note that this is just a brute force
+ * string searcher that does not attempt clever things like the Boyer-Moore
+ * search algorithm. Because of this there are many degenerate cases where
+ * this can take much longer than it needs to.
+ */
+int binstrr (const_bstring b1, int pos, const_bstring b2) {
+    unsigned char * needle, * hay;
+    int start, k, last, needleLen;
+
+    if (b1 == NULL || b1->data == NULL || b1->slen < 0) return BSTR_ERR;
+    if (b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
+
+    /* pos must lie within [0, b1->slen]; an empty b2 matches right there. */
+    if (pos < 0 || pos > b1->slen) return BSTR_ERR;
+    if (b2->slen == 0) return pos;
+
+    /* Obvious alias case */
+    if (b1->data == b2->data && pos == 0 && b2->slen <= b1->slen) return 0;
+
+    /* last = highest start index at which b2 still fits inside b1. */
+    last = b1->slen - b2->slen;
+    if (last < 0) return BSTR_ERR;
+
+    /* If pos leaves no room for b2, snap back to the last viable start. */
+    start = (pos > last) ? last : pos;
+
+    needle = b2->data;
+    hay = b1->data;
+    needleLen = b2->slen;
+
+    /* Try each candidate start, moving towards the front of b1. */
+    for (; start >= 0; start--) {
+        for (k = 0; k < needleLen; k++) {
+            if (needle[k] != hay[start + k]) break;
+        }
+        if (k >= needleLen) return start;
+    }
+
+    return BSTR_ERR;
+}
+
+/* int binstrcaseless (const_bstring b1, int pos, const_bstring b2)
+ *
+ * Search for the bstring b2 in b1 starting from position pos, and searching
+ * forward but without regard to case. If it is found then return with the
+ * first position where it is found, otherwise return BSTR_ERR. Note that
+ * this is just a brute force string searcher that does not attempt clever
+ * things like the Boyer-Moore search algorithm. Because of this there are
+ * many degenerate cases where this can take much longer than it needs to.
+ */
+int binstrcaseless (const_bstring b1, int pos, const_bstring b2) {
+    unsigned char * needle, * hay;
+    int start, k, stop, needleLen;
+
+    if (b1 == NULL || b1->data == NULL || b1->slen < 0) return BSTR_ERR;
+    if (b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
+
+    /* Searching from the very end can only succeed for an empty b2. */
+    if (b1->slen == pos) {
+        if (b2->slen == 0) return pos;
+        return BSTR_ERR;
+    }
+    if (b1->slen < pos || pos < 0) return BSTR_ERR;
+    if (b2->slen == 0) return pos;
+
+    /* stop = one past the last start index at which b2 still fits. */
+    stop = b1->slen - b2->slen + 1;
+
+    /* No space to find such a string? */
+    if (stop <= pos) return BSTR_ERR;
+
+    /* An obvious alias case */
+    if (b1->data == b2->data && pos == 0) return BSTR_OK;
+
+    needle = b2->data;
+    hay = b1->data;
+    needleLen = b2->slen;
+
+    /* Try each candidate start in turn, moving forward through b1. */
+    for (start = pos; start < stop; start++) {
+        for (k = 0; k < needleLen; k++) {
+            /* A position mismatches only if the bytes differ both as-is
+               and after lower-casing. */
+            if (needle[k] != hay[start + k] &&
+                downcase (needle[k]) != downcase (hay[start + k])) break;
+        }
+        if (k >= needleLen) return start;
+    }
+
+    return BSTR_ERR;
+}
+
+/* int binstrrcaseless (const_bstring b1, int pos, const_bstring b2)
+ *
+ * Search for the bstring b2 in b1 starting from position pos, and searching
+ * backward but without regard to case. If it is found then return with the
+ * first position where it is found, otherwise return BSTR_ERR. Note that
+ * this is just a brute force string searcher that does not attempt clever
+ * things like the Boyer-Moore search algorithm. Because of this there are
+ * many degenerate cases where this can take much longer than it needs to.
+ */
+int binstrrcaseless (const_bstring b1, int pos, const_bstring b2) {
+    unsigned char * needle, * hay;
+    int start, k, last, needleLen;
+
+    if (b1 == NULL || b1->data == NULL || b1->slen < 0) return BSTR_ERR;
+    if (b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
+
+    /* pos must lie within [0, b1->slen]; an empty b2 matches right there. */
+    if (pos < 0 || pos > b1->slen) return BSTR_ERR;
+    if (b2->slen == 0) return pos;
+
+    /* Obvious alias case */
+    if (b1->data == b2->data && pos == 0 && b2->slen <= b1->slen) return BSTR_OK;
+
+    /* last = highest start index at which b2 still fits inside b1. */
+    last = b1->slen - b2->slen;
+    if (last < 0) return BSTR_ERR;
+
+    /* If pos leaves no room for b2, snap back to the last viable start. */
+    start = (pos > last) ? last : pos;
+
+    needle = b2->data;
+    hay = b1->data;
+    needleLen = b2->slen;
+
+    /* Try each candidate start, moving towards the front of b1. */
+    for (; start >= 0; start--) {
+        for (k = 0; k < needleLen; k++) {
+            /* A position mismatches only if the bytes differ both as-is
+               and after lower-casing. */
+            if (needle[k] != hay[start + k] &&
+                downcase (needle[k]) != downcase (hay[start + k])) break;
+        }
+        if (k >= needleLen) return start;
+    }
+
+    return BSTR_ERR;
+}
+
+
+/* int bstrchrp (const_bstring b, int c, int pos)
+ *
+ * Search for the character c in b forwards from the position pos
+ * (inclusive).
+ */
+int bstrchrp (const_bstring b, int c, int pos) {
+    unsigned char * hit;
+
+    if (b == NULL || b->data == NULL) return BSTR_ERR;
+    if (b->slen <= pos || pos < 0) return BSTR_ERR;
+
+    /* Scan only the tail [pos, slen) and convert the hit to an index. */
+    hit = (unsigned char *) bstr__memchr ((b->data + pos), (unsigned char) c, (b->slen - pos));
+    return (hit != NULL) ? (int) (hit - b->data) : BSTR_ERR;
+}
+
+/* int bstrrchrp (const_bstring b, int c, int pos)
+ *
+ * Search for the character c in b backwards from the position pos in string
+ * (inclusive).
+ */
+int bstrrchrp (const_bstring b, int c, int pos) {
+    int k;
+
+    if (b == NULL || b->data == NULL) return BSTR_ERR;
+    if (b->slen <= pos || pos < 0) return BSTR_ERR;
+
+    /* Walk from pos down to the first character. */
+    k = pos;
+    while (k >= 0) {
+        if (b->data[k] == (unsigned char) c) return k;
+        k--;
+    }
+    return BSTR_ERR;
+}
+
+#if !defined (BSTRLIB_AGGRESSIVE_MEMORY_FOR_SPEED_TRADEOFF) /* default: character set stored as a packed bit field */
+#define LONG_LOG_BITS_QTY (3) /* log2 of the number of bits used per content[] element */
+#define LONG_BITS_QTY (1 << LONG_LOG_BITS_QTY) /* bits used per content[] element (1 << 3) */
+#define LONG_TYPE unsigned char /* element type of content[] */
+
+#define CFCLEN ((1 << CHAR_BIT) / LONG_BITS_QTY) /* elements needed for one bit per possible char value */
+struct charField { LONG_TYPE content[CFCLEN]; }; /* set of character values, one bit each */
+#define testInCharField(cf,c) ((cf)->content[(c) >> LONG_LOG_BITS_QTY] & (((long)1) << ((c) & (LONG_BITS_QTY-1)))) /* non-zero when the bit for c is set: element c >> 3, bit c & 7 */
+#define setInCharField(cf,idx) { \
+ unsigned int c = (unsigned int) (idx); \
+ (cf)->content[c >> LONG_LOG_BITS_QTY] |= (LONG_TYPE) (1ul << (c & (LONG_BITS_QTY-1))); \
+}
+
+#else /* BSTRLIB_AGGRESSIVE_MEMORY_FOR_SPEED_TRADEOFF defined: one whole element per character value */
+
+#define CFCLEN (1 << CHAR_BIT) /* one content[] element per possible char value */
+struct charField { unsigned char content[CFCLEN]; }; /* set of character values, one byte each */
+#define testInCharField(cf,c) ((cf)->content[(unsigned char) (c)]) /* non-zero when c is in the set */
+#define setInCharField(cf,idx) (cf)->content[(unsigned int) (idx)] = ~0 /* mark idx as a member of the set */
+
+#endif
+
+/* Convert a bstring to charField */
+static int buildCharField (struct charField * cf, const_bstring b) {
+    int k;
+
+    /* An absent, data-less or empty bstring cannot describe a set. */
+    if (b == NULL) return BSTR_ERR;
+    if (b->data == NULL || b->slen <= 0) return BSTR_ERR;
+
+    /* Start from the empty set, then add every character of b. */
+    memset ((void *) cf->content, 0, sizeof (struct charField));
+    k = 0;
+    while (k < b->slen) {
+        setInCharField (cf, b->data[k]);
+        k++;
+    }
+    return BSTR_OK;
+}
+
+static void invertCharField (struct charField * cf) {
+    int k;
+
+    /* Complement every element so the field describes the opposite set. */
+    for (k = CFCLEN - 1; k >= 0; k--) {
+        cf->content[k] = ~cf->content[k];
+    }
+}
+
+/* Inner engine for binchr */
+static int binchrCF (const unsigned char * data, int len, int pos, const struct charField * cf) {
+    int k = pos;
+
+    /* Forward scan: report the first index whose character is in cf. */
+    while (k < len) {
+        unsigned char ch = (unsigned char) data[k];
+        if (testInCharField (cf, ch)) return k;
+        k++;
+    }
+    return BSTR_ERR;
+}
+
+/* int binchr (const_bstring b0, int pos, const_bstring b1);
+ *
+ * Search for the first position in b0 starting from pos or after, in which
+ * one of the characters in b1 is found and return it. If such a position
+ * does not exist in b0, then BSTR_ERR is returned.
+ */
+int binchr (const_bstring b0, int pos, const_bstring b1) {
+    struct charField chrs;
+
+    /* b1 is dereferenced below, so it is validated here together with b0
+       and pos (binchrr performs the same b1 == NULL check). */
+    if (pos < 0 || b0 == NULL || b0->data == NULL || b1 == NULL ||
+        b0->slen <= pos) return BSTR_ERR;
+
+    /* A single-character set is just a character search. */
+    if (1 == b1->slen) {
+        if (b1->data == NULL) return BSTR_ERR;
+        return bstrchrp (b0, b1->data[0], pos);
+    }
+
+    /* buildCharField rejects a b1 without data or with slen <= 0. */
+    if (0 > buildCharField (&chrs, b1)) return BSTR_ERR;
+    return binchrCF (b0->data, b0->slen, pos, &chrs);
+}
+
+/* Inner engine for binchrr */
+static int binchrrCF (const unsigned char * data, int pos, const struct charField * cf) {
+    int k = pos;
+
+    /* Backward scan: report the last index <= pos whose character is in cf. */
+    while (k >= 0) {
+        unsigned int ch = (unsigned int) data[k];
+        if (testInCharField (cf, ch)) return k;
+        k--;
+    }
+    return BSTR_ERR;
+}
+
+/* int binchrr (const_bstring b0, int pos, const_bstring b1);
+ *
+ * Search for the last position in b0 no greater than pos, in which one of
+ * the characters in b1 is found and return it. If such a position does not
+ * exist in b0, then BSTR_ERR is returned.
+ */
+int binchrr (const_bstring b0, int pos, const_bstring b1) {
+    struct charField chrs;
+
+    if (pos < 0 || b0 == NULL || b0->data == NULL || b1 == NULL ||
+        b0->slen < pos) return BSTR_ERR;
+
+    /* pos may equal the length; the search then starts on the last
+       character (or at -1 for an empty b0, which finds nothing). */
+    if (pos == b0->slen) pos--;
+
+    /* A single-character set is just a character search.  b1->data is
+       checked first because it is read directly on this path. */
+    if (1 == b1->slen) {
+        if (b1->data == NULL) return BSTR_ERR;
+        return bstrrchrp (b0, b1->data[0], pos);
+    }
+
+    /* buildCharField rejects a b1 without data or with slen <= 0. */
+    if (0 > buildCharField (&chrs, b1)) return BSTR_ERR;
+    return binchrrCF (b0->data, pos, &chrs);
+}
+
+/* int bninchr (const_bstring b0, int pos, const_bstring b1);
+ *
+ * Search for the first position in b0 starting from pos or after, in which
+ * none of the characters in b1 is found and return it. If such a position
+ * does not exist in b0, then BSTR_ERR is returned.
+ */
+int bninchr (const_bstring b0, int pos, const_bstring b1) {
+    struct charField cf;
+
+    if (pos < 0 || b0 == NULL) return BSTR_ERR;
+    if (b0->data == NULL || b0->slen <= pos) return BSTR_ERR;
+
+    /* Build the set of b1's characters, then flip it so that membership
+       means "not in b1". */
+    if (buildCharField (&cf, b1) < 0) return BSTR_ERR;
+    invertCharField (&cf);
+
+    return binchrCF (b0->data, b0->slen, pos, &cf);
+}
+
+/* int bninchrr (const_bstring b0, int pos, const_bstring b1);
+ *
+ * Search for the last position in b0 no greater than pos, in which none of
+ * the characters in b1 is found and return it. If such a position does not
+ * exist in b0, then BSTR_ERR is returned.
+ */
+int bninchrr (const_bstring b0, int pos, const_bstring b1) {
+    struct charField cf;
+
+    if (pos < 0 || b0 == NULL) return BSTR_ERR;
+    if (b0->data == NULL || b0->slen < pos) return BSTR_ERR;
+
+    /* pos may equal the length; start on the last character then. */
+    if (pos == b0->slen) pos--;
+
+    /* Build the set of b1's characters, then flip it so that membership
+       means "not in b1". */
+    if (buildCharField (&cf, b1) < 0) return BSTR_ERR;
+    invertCharField (&cf);
+
+    return binchrrCF (b0->data, pos, &cf);
+}
+
+/* int bsetstr (bstring b0, int pos, bstring b1, unsigned char fill)
+ *
+ * Overwrite the string b0 starting at position pos with the string b1. If
+ * the position pos is past the end of b0, then the character "fill" is
+ * appended as necessary to make up the gap between the end of b0 and pos.
+ * If b1 is NULL, it behaves as if it were a 0-length string.
+ */
+int bsetstr (bstring b0, int pos, const_bstring b1, unsigned char fill) {
+    bstring src = (bstring) b1;
+    ptrdiff_t off;
+    int end, resultLen;
+
+    /* b0 must be a consistent, writable bstring and pos non-negative. */
+    if (pos < 0 || b0 == NULL) return BSTR_ERR;
+    if (b0->slen < 0 || NULL == b0->data) return BSTR_ERR;
+    if (b0->mlen < b0->slen || b0->mlen <= 0) return BSTR_ERR;
+
+    /* b1 is optional, but when supplied it must be usable. */
+    if (b1 != NULL) {
+        if (b1->slen < 0 || b1->data == NULL) return BSTR_ERR;
+    }
+
+    /* end = index one past the last byte the overwrite will touch. */
+    end = pos;
+
+    if (NULL != src) {
+        /* Aliasing case: b1's data starts inside b0's buffer, so the
+           source is duplicated before b0 is resized or written. */
+        off = (ptrdiff_t) (b1->data - b0->data);
+        if (off >= 0 && off < (ptrdiff_t) b0->mlen) {
+            src = bstrcpy (b1);
+            if (NULL == src) return BSTR_ERR;
+        }
+        end += src->slen;
+    }
+
+    /* Make room for the result plus its '\0' terminator. */
+    if (balloc (b0, end + 1) != BSTR_OK) {
+        if (src != b1) bdestroy (src);
+        return BSTR_ERR;
+    }
+
+    resultLen = b0->slen;
+
+    /* Pad the gap between the old end of b0 and pos with "fill". */
+    if (pos > resultLen) {
+        bstr__memset (b0->data + b0->slen, (int) fill, (size_t) (pos - b0->slen));
+        resultLen = pos;
+    }
+
+    /* Lay the source over b0 at pos, then drop the private copy if any. */
+    if (src != NULL) {
+        bBlockCopy ((char *) (b0->data + pos), (char *) src->data, src->slen);
+        if (src != b1) bdestroy (src);
+    }
+
+    /* The overwrite may have extended b0 beyond its previous length. */
+    if (end > resultLen) resultLen = end;
+
+    b0->slen = resultLen;
+    b0->data[resultLen] = (unsigned char) '\0';
+
+    return BSTR_OK;
+}
+
+/* int binsert (bstring b1, int pos, bstring b2, unsigned char fill)
+ *
+ * Inserts the string b2 into b1 at position pos. If the position pos is
+ * past the end of b1, then the character "fill" is appended as necessary to
+ * make up the gap between the end of b1 and pos. Unlike bsetstr, binsert
+ * does not allow b2 to be NULL.
+ */
+int binsert (bstring b1, int pos, const_bstring b2, unsigned char fill) {
+    int d, l;
+    ptrdiff_t pd;
+    bstring aux = (bstring) b2;
+
+    /* Returns BSTR_ERR for a negative pos, a NULL argument, or a bstring
+       whose slen/mlen fields are inconsistent. */
+    if (pos < 0 || b1 == NULL || b2 == NULL || b1->slen < 0 ||
+        b2->slen < 0 || b1->mlen < b1->slen || b1->mlen <= 0) return BSTR_ERR;
+
+    /* Aliasing case: b2's data starts inside b1's buffer, so work from a
+       private copy (released on every exit path below). */
+    if ((pd = (ptrdiff_t) (b2->data - b1->data)) >= 0 && pd < (ptrdiff_t) b1->mlen) {
+        if (NULL == (aux = bstrcpy (b2))) return BSTR_ERR;
+    }
+
+    /* Compute the two possible end pointers:
+       d = length after an insertion inside the string,
+       l = end of the inserted text when pos lies past the end. */
+    d = b1->slen + aux->slen;
+    l = pos + aux->slen;
+    if ((d|l) < 0) {
+        /* Length overflow.  The aliasing copy must not be leaked here. */
+        if (aux != b2) bdestroy (aux);
+        return BSTR_ERR;
+    }
+
+    if (l > d) {
+        /* Inserting past the end of the string */
+        if (balloc (b1, l + 1) != BSTR_OK) {
+            if (aux != b2) bdestroy (aux);
+            return BSTR_ERR;
+        }
+        /* Pad the gap between the old end of b1 and pos with "fill". */
+        bstr__memset (b1->data + b1->slen, (int) fill, (size_t) (pos - b1->slen));
+        b1->slen = l;
+    } else {
+        /* Inserting in the middle of the string */
+        if (balloc (b1, d + 1) != BSTR_OK) {
+            if (aux != b2) bdestroy (aux);
+            return BSTR_ERR;
+        }
+        /* Open a hole of aux->slen bytes at pos by moving the tail up. */
+        bBlockCopy (b1->data + l, b1->data + pos, d - l);
+        b1->slen = d;
+    }
+
+    bBlockCopy (b1->data + pos, aux->data, aux->slen);
+    b1->data[b1->slen] = (unsigned char) '\0';
+    if (aux != b2) bdestroy (aux);
+    return BSTR_OK;
+}
+
+/* int breplace (bstring b1, int pos, int len, bstring b2,
+ * unsigned char fill)
+ *
+ * Replace a section of a string from pos for a length len with the string b2.
+ * fill is used if pos > b1->slen.
+ */
+int breplace (bstring b1, int pos, int len, const_bstring b2,
+              unsigned char fill) {
+    int pl, ret;
+    ptrdiff_t pd;
+    bstring aux = (bstring) b2;
+
+    /* pl = index one past the replaced section; a negative value means
+       pos + len overflowed.  Returns BSTR_ERR on any invalid argument. */
+    if (pos < 0 || len < 0 || (pl = pos + len) < 0 || b1 == NULL ||
+        b2 == NULL || b1->data == NULL || b2->data == NULL ||
+        b1->slen < 0 || b2->slen < 0 || b1->mlen < b1->slen ||
+        b1->mlen <= 0) return BSTR_ERR;
+
+    /* Straddles the end?  Then this is an overwrite at pos, followed by
+       a truncation if b2 is shorter than what it replaced. */
+    if (pl >= b1->slen) {
+        if ((ret = bsetstr (b1, pos, b2, fill)) < 0) return ret;
+        if (pos + b2->slen < b1->slen) {
+            b1->slen = pos + b2->slen;
+            b1->data[b1->slen] = (unsigned char) '\0';
+        }
+        return ret;
+    }
+
+    /* Aliasing case: b2's data starts inside b1's contents, so work from
+       a private copy. */
+    if ((pd = (ptrdiff_t) (b2->data - b1->data)) >= 0 && pd < (ptrdiff_t) b1->slen) {
+        if (NULL == (aux = bstrcpy (b2))) return BSTR_ERR;
+    }
+
+    /* Growing: reserve the new length plus one byte for the '\0' that is
+       written below, as the other mutators in this file do. */
+    if (aux->slen > len) {
+        if (balloc (b1, b1->slen + aux->slen - len + 1) != BSTR_OK) {
+            if (aux != b2) bdestroy (aux);
+            return BSTR_ERR;
+        }
+    }
+
+    /* Move the tail to its final place, then drop the replacement in. */
+    if (aux->slen != len) bstr__memmove (b1->data + pos + aux->slen, b1->data + pos + len, b1->slen - (pos + len));
+    bstr__memcpy (b1->data + pos, aux->data, aux->slen);
+    b1->slen += aux->slen - len;
+    b1->data[b1->slen] = (unsigned char) '\0';
+    if (aux != b2) bdestroy (aux);
+    return BSTR_OK;
+}
+
+/*
+ * findreplaceengine is used to implement bfindreplace and
+ * bfindreplacecaseless. It works by breaking the three cases of
+ * expansion, reduction and replacement, and solving each of these
+ * in the most efficient way possible.
+ */
+
+/* Signature shared by the search routines the engine can be driven with
+   (bfindreplace passes binstr, bfindreplacecaseless passes binstrcaseless). */
+typedef int (*instr_fnptr) (const_bstring s1, int pos, const_bstring s2);
+
+/* Number of match positions that fit in the on-stack index array. */
+#define INITIAL_STATIC_FIND_INDEX_COUNT 32
+
+/* Replace every occurrence of find in b, searching from pos with instr,
+   by repl.  Returns BSTR_OK on success (including "nothing to do") and
+   BSTR_ERR on invalid arguments, allocation failure or length overflow. */
+static int findreplaceengine (bstring b, const_bstring find, const_bstring repl, int pos, instr_fnptr instr) {
+    int i, ret, slen, mlen, delta, acc;
+    int * d;
+    int static_d[INITIAL_STATIC_FIND_INDEX_COUNT+1]; /* This +1 is unnecessary, but it shuts up LINT. */
+    ptrdiff_t pd;
+    bstring auxf = (bstring) find;
+    bstring auxr = (bstring) repl;
+
+    if (b == NULL || b->data == NULL || find == NULL ||
+        find->data == NULL || repl == NULL || repl->data == NULL ||
+        pos < 0 || find->slen <= 0 || b->mlen < 0 || b->slen > b->mlen ||
+        b->mlen <= 0 || b->slen < 0 || repl->slen < 0) return BSTR_ERR;
+
+    /* find cannot fit at or after pos: nothing to replace. */
+    if (pos > b->slen - find->slen) return BSTR_OK;
+
+    /* Alias with find string */
+    pd = (ptrdiff_t) (find->data - b->data);
+    if ((ptrdiff_t) (pos - find->slen) < pd && pd < (ptrdiff_t) b->slen) {
+        if (NULL == (auxf = bstrcpy (find))) return BSTR_ERR;
+    }
+
+    /* Alias with repl string */
+    pd = (ptrdiff_t) (repl->data - b->data);
+    if ((ptrdiff_t) (pos - repl->slen) < pd && pd < (ptrdiff_t) b->slen) {
+        if (NULL == (auxr = bstrcpy (repl))) {
+            if (auxf != find) bdestroy (auxf);
+            return BSTR_ERR;
+        }
+    }
+
+    /* delta > 0: each replacement shrinks b; delta < 0: it grows b. */
+    delta = auxf->slen - auxr->slen;
+
+    /* in-place replacement since find and replace strings are of equal
+       length */
+    if (delta == 0) {
+        while ((pos = instr (b, pos, auxf)) >= 0) {
+            bstr__memcpy (b->data + pos, auxr->data, auxr->slen);
+            pos += auxf->slen;
+        }
+        if (auxf != find) bdestroy (auxf);
+        if (auxr != repl) bdestroy (auxr);
+        return BSTR_OK;
+    }
+
+    /* shrinking replacement since auxf->slen > auxr->slen */
+    if (delta > 0) {
+        /* acc = number of bytes removed so far; text after each match is
+           moved left by that amount. */
+        acc = 0;
+
+        while ((i = instr (b, pos, auxf)) >= 0) {
+            if (acc && i > pos)
+                bstr__memmove (b->data + pos - acc, b->data + pos, i - pos);
+            if (auxr->slen)
+                bstr__memcpy (b->data + i - acc, auxr->data, auxr->slen);
+            acc += delta;
+            pos = i + auxf->slen;
+        }
+
+        if (acc) {
+            /* Move the text after the last match and fix up the length. */
+            i = b->slen;
+            if (i > pos)
+                bstr__memmove (b->data + pos - acc, b->data + pos, i - pos);
+            b->slen -= acc;
+            b->data[b->slen] = (unsigned char) '\0';
+        }
+
+        if (auxf != find) bdestroy (auxf);
+        if (auxr != repl) bdestroy (auxr);
+        return BSTR_OK;
+    }
+
+    /* expanding replacement since find->slen < repl->slen.  It's a lot
+       more complicated.  This works by first finding all the matches and
+       storing them to a growable array, then doing at most one resize of
+       the destination bstring and then performing the direct memory transfers
+       of the string segment pieces to form the final result.  The growable
+       array of matches uses a deferred doubling reallocing strategy.  What
+       this means is that it starts as a reasonably fixed sized auto array in
+       the hopes that many if not most cases will never need to grow this
+       array.  But it switches as soon as the bounds of the array will be
+       exceeded.  An extra find result is always appended to this array that
+       corresponds to the end of the destination string, so slen is checked
+       against mlen - 1 rather than mlen before resizing.
+    */
+
+    mlen = INITIAL_STATIC_FIND_INDEX_COUNT;
+    d = (int *) static_d; /* Avoid malloc for trivial/initial cases */
+    acc = slen = 0;
+
+    while ((pos = instr (b, pos, auxf)) >= 0) {
+        if (slen >= mlen - 1) {
+            int sl, *t;
+
+            mlen += mlen;
+            /* The array holds int match positions, so size it by
+               sizeof (int), not by the size of a pointer. */
+            sl = sizeof (int) * mlen;
+            if (static_d == d) d = NULL; /* static_d cannot be realloced */
+            if (mlen <= 0 || sl < mlen || NULL == (t = (int *) bstr__realloc (d, sl))) {
+                ret = BSTR_ERR;
+                goto done;
+            }
+            /* First switch to the heap: carry over the stack contents. */
+            if (NULL == d) bstr__memcpy (t, static_d, sizeof (static_d));
+            d = t;
+        }
+        d[slen] = pos;
+        slen++;
+        acc -= delta;
+        pos += auxf->slen;
+        /* A negative value here means pos or the total growth overflowed. */
+        if (pos < 0 || acc < 0) {
+            ret = BSTR_ERR;
+            goto done;
+        }
+    }
+
+    /* slen <= INITIAL_STATIC_FIND_INDEX_COUNT-1 or mlen-1 here. */
+    d[slen] = b->slen;
+
+    if (BSTR_OK == (ret = balloc (b, b->slen + acc + 1))) {
+        b->slen += acc;
+        /* Work from the last match backwards so that each segment is
+           moved to its final place before anything overwrites it. */
+        for (i = slen-1; i >= 0; i--) {
+            int s, l;
+            s = d[i] + auxf->slen;
+            l = d[i+1] - s; /* d[slen] may be accessed here. */
+            if (l) {
+                bstr__memmove (b->data + s + acc, b->data + s, l);
+            }
+            if (auxr->slen) {
+                bstr__memmove (b->data + s + acc - auxr->slen,
+                               auxr->data, auxr->slen);
+            }
+            acc += delta;
+        }
+        b->data[b->slen] = (unsigned char) '\0';
+    }
+
+    done:;
+    /* Only a heap-allocated index array may be freed. */
+    if (static_d == d) d = NULL;
+    bstr__free (d);
+    if (auxf != find) bdestroy (auxf);
+    if (auxr != repl) bdestroy (auxr);
+    return ret;
+}
+
+/* int bfindreplace (bstring b, const_bstring find, const_bstring repl,
+ * int pos)
+ *
+ * Replace all occurrences of a find string with a replace string after a
+ * given point in a bstring.
+ */
+int bfindreplace (bstring b, const_bstring find, const_bstring repl, int pos) {
+    /* Case-sensitive variant: drive the shared engine with binstr. */
+    instr_fnptr search = binstr;
+
+    return findreplaceengine (b, find, repl, pos, search);
+}
+
+/* int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl,
+ * int pos)
+ *
+ * Replace all occurrences of a find string, ignoring case, with a replace
+ * string after a given point in a bstring.
+ */
+int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl, int pos) {
+    /* Case-insensitive variant: drive the shared engine with
+       binstrcaseless. */
+    instr_fnptr search = binstrcaseless;
+
+    return findreplaceengine (b, find, repl, pos, search);
+}
+
+/* int binsertch (bstring b, int pos, int len, unsigned char fill)
+ *
+ * Inserts the character fill repeatedly into b at position pos for a
+ * length len. If the position pos is past the end of b, then the
+ * character "fill" is appended as necessary to make up the gap between the
+ * end of b and the position pos + len.
+ */
+int binsertch (bstring b, int pos, int len, unsigned char fill) {
+    int grown, stop, k;
+
+    if (pos < 0 || b == NULL) return BSTR_ERR;
+    if (b->slen < 0 || b->mlen < b->slen) return BSTR_ERR;
+    if (b->mlen <= 0 || len < 0) return BSTR_ERR;
+
+    /* grown = length after inserting inside the string,
+       stop  = end of the inserted run when pos lies past the end.
+       A negative value means the addition overflowed. */
+    grown = b->slen + len;
+    stop = pos + len;
+    if (grown < 0 || stop < 0) return BSTR_ERR;
+
+    if (stop > grown) {
+        /* Inserting past the end: the fill loop below also covers the
+           gap, so it starts from the old end of the string. */
+        if (balloc (b, stop + 1) != BSTR_OK) return BSTR_ERR;
+        pos = b->slen;
+        b->slen = stop;
+    } else {
+        /* Inserting in the middle: shift the tail up by len, copying
+           from the back so nothing is overwritten before it is read. */
+        if (balloc (b, grown + 1) != BSTR_OK) return BSTR_ERR;
+        k = grown - 1;
+        while (k >= stop) {
+            b->data[k] = b->data[k - len];
+            k--;
+        }
+        b->slen = grown;
+    }
+
+    k = pos;
+    while (k < stop) {
+        b->data[k] = fill;
+        k++;
+    }
+    b->data[b->slen] = (unsigned char) '\0';
+    return BSTR_OK;
+}
+
+/* int bpattern (bstring b, int len)
+ *
+ * Replicate the bstring, b in place, end to end repeatedly until it
+ * surpasses len characters, then chop the result to exactly len characters.
+ * This function operates in-place. The function will return with BSTR_ERR
+ * if b is NULL or of length 0, otherwise BSTR_OK is returned.
+ */
+int bpattern (bstring b, int len) {
+    int k, unit;
+
+    /* unit = length of the pattern to be repeated. */
+    unit = blength (b);
+    if (unit <= 0 || len < 0) return BSTR_ERR;
+    if (balloc (b, len + 1) != BSTR_OK) return BSTR_ERR;
+
+    if (len > 0) {
+        /* A one-character pattern is a plain fill: bsetstr with no
+           source string pads b out to len using that character. */
+        if (unit == 1) return bsetstr (b, len, NULL, b->data[0]);
+
+        /* Each new byte repeats the byte one pattern length earlier. */
+        k = unit;
+        while (k < len) {
+            b->data[k] = b->data[k - unit];
+            k++;
+        }
+    }
+
+    b->data[len] = (unsigned char) '\0';
+    b->slen = len;
+    return BSTR_OK;
+}
+
+#define BS_BUFF_SZ (1024)
+
+/*  int breada (bstring b, bNread readPtr, void * parm)
+ *
+ *  Append to the bstring b everything that the fread-like function readPtr
+ *  delivers for the stream handle parm.  Reading stops at the first request
+ *  that comes back short.  Returns BSTR_OK on success and BSTR_ERR if an
+ *  argument is invalid or b cannot be enlarged.
+ */
+int breada (bstring b, bNread readPtr, void * parm) {
+    int filled, got, target;
+
+    if (b == NULL || b->mlen <= 0 || b->slen < 0) return BSTR_ERR;
+    if (b->mlen < b->slen || readPtr == NULL) return BSTR_ERR;
+
+    filled = b->slen;
+    target = filled + 16;
+
+    for (;;) {
+        if (BSTR_OK != balloc (b, target + 1)) return BSTR_ERR;
+
+        /* Ask for exactly as many bytes as are still free below target. */
+        got = (int) readPtr ((void *) (b->data + filled), 1, target - filled, parm);
+        filled += got;
+        b->slen = filled;
+
+        /* A short read ends the loop. */
+        if (filled < target) break;
+
+        /* Otherwise raise the target: double it while it is small, then
+           grow in steps of BS_BUFF_SZ. */
+        target += ((target < BS_BUFF_SZ) ? target : BS_BUFF_SZ);
+    }
+
+    b->data[filled] = (unsigned char) '\0';
+    return BSTR_OK;
+}
+
+/* bstring bread (bNread readPtr, void * parm)
+ *
+ * Use a finite buffer fread-like function readPtr to create a bstring
+ * filled with the entire contents of file-like source data in a roughly
+ * efficient way.
+ */
+bstring bread (bNread readPtr, void * parm) {
+    /* Start from an empty bstring and let breada append the stream. */
+    bstring result = bfromcstr ("");
+
+    if (breada (result, readPtr, parm) < 0) {
+        bdestroy (result);
+        return NULL;
+    }
+    return result;
+}
+
+/* int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator)
+ *
+ * Use an fgetc-like single character stream reading function (getcPtr) to
+ * obtain a sequence of characters which are assigned to the bstring b,
+ * replacing its previous contents. The stream read is terminated by the
+ * passed in terminator parameter.
+ *
+ * If getcPtr returns with a negative number, or the terminator character
+ * (which is appended) is read, then the stream reading is halted and the
+ * function returns with a partial result in b. If there is an empty partial
+ * result, 1 is returned. If no characters are read, or there is some other
+ * detectable error, BSTR_ERR is returned.
+ */
+int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator) {
+    int ch, count, lastSlot;
+
+    if (b == NULL || b->mlen <= 0 || b->slen < 0) return BSTR_ERR;
+    if (b->mlen < b->slen || getcPtr == NULL) return BSTR_ERR;
+
+    /* Writing starts at index 0, so the old contents are overwritten.
+       lastSlot is the highest index that still leaves room for '\0'. */
+    count = 0;
+    lastSlot = b->mlen - 2;
+
+    for (;;) {
+        ch = getcPtr (parm);
+        if (ch < 0) break;          /* end of stream or read error */
+
+        if (count > lastSlot) {
+            /* Out of room: publish the length so far, then enlarge b. */
+            b->slen = count;
+            if (balloc (b, count + 2) != BSTR_OK) return BSTR_ERR;
+            lastSlot = b->mlen - 2;
+        }
+
+        b->data[count] = (unsigned char) ch;
+        count++;
+
+        /* The terminator is stored and then ends the read. */
+        if (ch == terminator) break;
+    }
+
+    b->data[count] = (unsigned char) '\0';
+    b->slen = count;
+
+    /* 1 when nothing was stored because the stream was exhausted. */
+    return count == 0 && ch < 0;
+}
+
+/* int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator)
+ *
+ * Use an fgetc-like single character stream reading function (getcPtr) to
+ * obtain a sequence of characters which are concatenated to the end of the
+ * bstring b. The stream read is terminated by the passed in terminator
+ * parameter.
+ *
+ * If getcPtr returns with a negative number, or the terminator character
+ * (which is appended) is read, then the stream reading is halted and the
+ * function returns with a partial result concatenated to b. If there is
+ * an empty partial result, 1 is returned. If no characters are read, or
+ * there is some other detectable error, BSTR_ERR is returned.
+ */
+int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator) {
+    int ch, count, lastSlot;
+
+    if (b == NULL || b->mlen <= 0 || b->slen < 0) return BSTR_ERR;
+    if (b->mlen < b->slen || getcPtr == NULL) return BSTR_ERR;
+
+    /* Writing starts at the current end of b, so characters are appended.
+       lastSlot is the highest index that still leaves room for '\0'. */
+    count = b->slen;
+    lastSlot = b->mlen - 2;
+
+    for (;;) {
+        ch = getcPtr (parm);
+        if (ch < 0) break;          /* end of stream or read error */
+
+        if (count > lastSlot) {
+            /* Out of room: publish the length so far, then enlarge b. */
+            b->slen = count;
+            if (balloc (b, count + 2) != BSTR_OK) return BSTR_ERR;
+            lastSlot = b->mlen - 2;
+        }
+
+        b->data[count] = (unsigned char) ch;
+        count++;
+
+        /* The terminator is stored and then ends the read. */
+        if (ch == terminator) break;
+    }
+
+    b->data[count] = (unsigned char) '\0';
+    b->slen = count;
+
+    /* 1 when b ends up empty because the stream was exhausted. */
+    return count == 0 && ch < 0;
+}
+
+/* bstring bgets (bNgetc getcPtr, void * parm, char terminator)
+ *
+ * Use an fgetc-like single character stream reading function (getcPtr) to
+ * obtain a sequence of characters which are concatenated into a bstring.
+ * The stream read is terminated by the passed in terminator parameter.
+ *
+ * If getcPtr returns with a negative number, or the terminator character
+ * (which is appended) is read, then the stream reading is halted and the
+ * result obtained thus far is returned. If no characters are read, or
+ * there is some other detectable error, NULL is returned.
+ */
+bstring bgets (bNgetc getcPtr, void * parm, char terminator) {
+    /* Collect the characters into a fresh, initially empty bstring. */
+    bstring line = bfromcstr ("");
+
+    /* An error, or a read that produced no characters, yields NULL. */
+    if (bgetsa (line, getcPtr, parm, terminator) < 0 || line->slen <= 0) {
+        bdestroy (line);
+        line = NULL;
+    }
+    return line;
+}
+
+/* State of a buffered stream wrapped around a caller supplied, fread
+ * compatible reading function. Instances are created by bsopen and released
+ * by bsclose. */
+struct bStream {
+ bstring buff; /* Buffer for over-reads */
+ void * parm; /* The stream handle for core stream */
+ bNread readFnPtr; /* fread compatible fnptr for core stream */
+ int isEOF; /* track file's EOF state */
+ int maxBuffSz; /* Buffer length setting; BS_BUFF_SZ after bsopen, changed by bsbufflength */
+};
+
+/* struct bStream * bsopen (bNread readPtr, void * parm)
+ *
+ * Wrap a given open stream (described by a fread compatible function
+ * pointer and stream handle) into an open bStream suitable for the bstring
+ * library streaming functions.
+ *
+ * Returns NULL if readPtr is NULL or if either the bStream or its internal
+ * buffer cannot be allocated; a returned stream always owns a valid buffer.
+ */
+struct bStream * bsopen (bNread readPtr, void * parm) {
+    struct bStream * s;
+
+    if (readPtr == NULL) return NULL;
+    s = (struct bStream *) bstr__alloc (sizeof (struct bStream));
+    if (s == NULL) return NULL;
+
+    /* Without a buffer the stream is unusable, so fail the open here
+       rather than hand back a half-built stream. */
+    s->buff = bfromcstr ("");
+    if (s->buff == NULL) {
+        bstr__free (s);
+        return NULL;
+    }
+
+    s->parm = parm;
+    s->readFnPtr = readPtr;
+    s->maxBuffSz = BS_BUFF_SZ;
+    s->isEOF = 0;
+    return s;
+}
+
+/* int bsbufflength (struct bStream * s, int sz)
+ *
+ * Change the buffer length setting of the bStream to sz and report the
+ * setting that was in effect before the call. Passing 0 for sz leaves the
+ * setting untouched, which makes the call a pure query. BSTR_ERR is
+ * returned for a NULL stream or a negative sz.
+ */
+int bsbufflength (struct bStream * s, int sz) {
+    int previous;
+
+    if (s == NULL) return BSTR_ERR;
+    if (sz < 0) return BSTR_ERR;
+
+    previous = s->maxBuffSz;
+    if (sz != 0) s->maxBuffSz = sz;
+    return previous;
+}
+
+/* int bseof (const struct bStream * s)
+ *
+ * Return non-zero when the end of the core stream has been recorded and no
+ * buffered characters remain to be read, 0 when more data may follow, and
+ * BSTR_ERR when s is NULL or lacks a read function or buffer.
+ */
+int bseof (const struct bStream * s) {
+    if (s == NULL || s->readFnPtr == NULL || s->buff == NULL) return BSTR_ERR;
+    return s->isEOF && (s->buff->slen == 0);
+}
+
+/* void * bsclose (struct bStream * s)
+ *
+ * Release the bStream together with its over-read buffer and hand back the
+ * stream handle that was supplied when the bStream was opened. The core
+ * stream itself is left alone. NULL is returned for a NULL stream.
+ */
+void * bsclose (struct bStream * s) {
+    void * handle;
+
+    if (s == NULL) return NULL;
+
+    handle = s->parm;
+    if (s->buff != NULL) bdestroy (s->buff);
+
+    /* Scrub the fields before the structure itself is released */
+    s->buff = NULL;
+    s->parm = NULL;
+    s->readFnPtr = NULL;
+    s->isEOF = 1;
+
+    bstr__free (s);
+    return handle;
+}
+
+/* int bsreadlna (bstring r, struct bStream * s, char terminator)
+ *
+ * Read characters from the bStream (s) up to and including the terminator
+ * character, or up to the end of the stream, and append them to r. This
+ * function may read additional characters from the core stream that are not
+ * returned, but will be retained for subsequent read operations.
+ *
+ * Returns BSTR_OK when something was appended. BSTR_ERR is returned for
+ * invalid parameters, when memory cannot be obtained (including a failure
+ * to append already buffered data to r), or when the stream ended without
+ * anything being appended.
+ */
+int bsreadlna (bstring r, struct bStream * s, char terminator) {
+    int i, l, ret, rlo;
+    char * b;
+    struct tagbstring x;
+
+    if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0 ||
+        r->slen < 0 || r->mlen < r->slen) return BSTR_ERR;
+    l = s->buff->slen;
+    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+    b = (char *) s->buff->data;
+    x.data = (unsigned char *) b;
+
+    /* First check if the current buffer holds the terminator */
+    b[l] = terminator; /* Set sentinel */
+    for (i=0; b[i] != terminator; i++) ;
+    if (i < l) {
+        x.slen = i + 1;
+        ret = bconcat (r, &x);
+        s->buff->slen = l;
+        if (BSTR_OK == ret) bdelete (s->buff, 0, i + 1);
+        /* Report a failed append instead of claiming success; the data
+           then stays in the stream buffer for a later attempt. */
+        return ret;
+    }
+
+    rlo = r->slen;
+
+    /* If not then just concatenate the entire buffer to the output */
+    x.slen = l;
+    if (BSTR_OK != bconcat (r, &x)) return BSTR_ERR;
+
+    /* Perform direct in-place reads into the destination to allow for
+       the minimum of data-copies */
+    for (;;) {
+        if (BSTR_OK != balloc (r, r->slen + s->maxBuffSz + 1)) return BSTR_ERR;
+        b = (char *) (r->data + r->slen);
+        l = (int) s->readFnPtr (b, 1, s->maxBuffSz, s->parm);
+        if (l <= 0) {
+            r->data[r->slen] = (unsigned char) '\0';
+            s->buff->slen = 0;
+            s->isEOF = 1;
+            /* If nothing was read return with an error message */
+            return BSTR_ERR & -(r->slen == rlo);
+        }
+        b[l] = terminator; /* Set sentinel */
+        for (i=0; b[i] != terminator; i++) ;
+        if (i < l) break;
+        r->slen += l;
+    }
+
+    /* Terminator found, push over-read back to buffer */
+    i++;
+    r->slen += i;
+    s->buff->slen = l - i;
+    bstr__memcpy (s->buff->data, b + i, l - i);
+    r->data[r->slen] = (unsigned char) '\0';
+    return BSTR_OK;
+}
+
+/* int bsreadlnsa (bstring r, struct bStream * s, const_bstring term)
+ *
+ * Read characters from the bStream (s) up to and including the first
+ * character that occurs in the term string, or up to the end of the
+ * stream, and append them to r. This function may read additional
+ * characters from the core stream that are not returned, but will be
+ * retained for subsequent read operations. A term of length 1 is handed
+ * over to bsreadlna.
+ *
+ * Returns BSTR_OK when something was appended. BSTR_ERR is returned for
+ * invalid parameters (including an empty term), when memory cannot be
+ * obtained (including a failure to append already buffered data to r), or
+ * when the stream ended without anything being appended.
+ */
+int bsreadlnsa (bstring r, struct bStream * s, const_bstring term) {
+    int i, l, ret, rlo;
+    unsigned char * b;
+    struct tagbstring x;
+    struct charField cf;
+
+    if (s == NULL || s->buff == NULL || r == NULL || term == NULL ||
+        term->data == NULL || r->mlen <= 0 || r->slen < 0 ||
+        r->mlen < r->slen) return BSTR_ERR;
+    if (term->slen == 1) return bsreadlna (r, s, term->data[0]);
+    if (term->slen < 1 || buildCharField (&cf, term)) return BSTR_ERR;
+
+    l = s->buff->slen;
+    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+    b = (unsigned char *) s->buff->data;
+    x.data = b;
+
+    /* First check if the current buffer holds the terminator */
+    b[l] = term->data[0]; /* Set sentinel */
+    for (i=0; !testInCharField (&cf, b[i]); i++) ;
+    if (i < l) {
+        x.slen = i + 1;
+        ret = bconcat (r, &x);
+        s->buff->slen = l;
+        if (BSTR_OK == ret) bdelete (s->buff, 0, i + 1);
+        /* Report a failed append instead of claiming success; the data
+           then stays in the stream buffer for a later attempt. */
+        return ret;
+    }
+
+    rlo = r->slen;
+
+    /* If not then just concatenate the entire buffer to the output */
+    x.slen = l;
+    if (BSTR_OK != bconcat (r, &x)) return BSTR_ERR;
+
+    /* Perform direct in-place reads into the destination to allow for
+       the minimum of data-copies */
+    for (;;) {
+        if (BSTR_OK != balloc (r, r->slen + s->maxBuffSz + 1)) return BSTR_ERR;
+        b = (unsigned char *) (r->data + r->slen);
+        l = (int) s->readFnPtr (b, 1, s->maxBuffSz, s->parm);
+        if (l <= 0) {
+            r->data[r->slen] = (unsigned char) '\0';
+            s->buff->slen = 0;
+            s->isEOF = 1;
+            /* If nothing was read return with an error message */
+            return BSTR_ERR & -(r->slen == rlo);
+        }
+
+        b[l] = term->data[0]; /* Set sentinel */
+        for (i=0; !testInCharField (&cf, b[i]); i++) ;
+        if (i < l) break;
+        r->slen += l;
+    }
+
+    /* Terminator found, push over-read back to buffer */
+    i++;
+    r->slen += i;
+    s->buff->slen = l - i;
+    bstr__memcpy (s->buff->data, b + i, l - i);
+    r->data[r->slen] = (unsigned char) '\0';
+    return BSTR_OK;
+}
+
+/* int bsreada (bstring r, struct bStream * s, int n)
+ *
+ * Append up to n bytes (or, if it is fewer, as many bytes as is
+ * remaining) from the bStream to r. Bytes already held in the stream's
+ * buffer are consumed first; whatever is read from the core stream but not
+ * needed stays in that buffer for subsequent read operations.
+ *
+ * Returns 0 when r grew by at least one byte. BSTR_ERR is returned for
+ * invalid parameters, when the stream is already at its end, on allocation
+ * failure, or when nothing could be appended.
+ */
+int bsreada (bstring r, struct bStream * s, int n) {
+int l, ret, orslen;
+char * b;
+struct tagbstring x;
+
+ if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0
+ || r->slen < 0 || r->mlen < r->slen || n <= 0) return BSTR_ERR;
+
+ /* From here on n is the length r should have once the request is met */
+ n += r->slen;
+ /* Reject a sum that is no longer positive (numeric wrap around) */
+ if (n <= 0) return BSTR_ERR;
+
+ l = s->buff->slen;
+
+ /* Remember the starting length so that "nothing appended" can be detected */
+ orslen = r->slen;
+
+ if (0 == l) {
+ if (s->isEOF) return BSTR_ERR;
+ /* Nothing is buffered and r already has room for the target length:
+    read straight into r and skip the intermediate copy. */
+ if (r->mlen > n) {
+ l = (int) s->readFnPtr (r->data + r->slen, 1, n - r->slen, s->parm);
+ /* A non-positive or over-long count is treated as the end of the stream */
+ if (0 >= l || l > n - r->slen) {
+ s->isEOF = 1;
+ return BSTR_ERR;
+ }
+ r->slen += l;
+ r->data[r->slen] = (unsigned char) '\0';
+ return 0;
+ }
+ }
+
+ if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+ b = (char *) s->buff->data;
+ /* x is a temporary header over the stream buffer; only data and slen are set */
+ x.data = (unsigned char *) b;
+
+ do {
+ /* l bytes are staged in the stream buffer. If they satisfy the
+    request, move just the needed amount and leave the rest buffered. */
+ if (l + r->slen >= n) {
+ x.slen = n - r->slen;
+ ret = bconcat (r, &x);
+ s->buff->slen = l;
+ if (BSTR_OK == ret) bdelete (s->buff, 0, x.slen);
+ /* BSTR_ERR if r did not grow, 0 otherwise */
+ return BSTR_ERR & -(r->slen == orslen);
+ }
+
+ /* Otherwise take everything that is staged ... */
+ x.slen = l;
+ if (BSTR_OK != bconcat (r, &x)) break;
+
+ /* ... and refill with what is still missing, capped at maxBuffSz */
+ l = n - r->slen;
+ if (l > s->maxBuffSz) l = s->maxBuffSz;
+
+ l = (int) s->readFnPtr (b, 1, l, s->parm);
+
+ } while (l > 0);
+ /* Reached when the read function delivered nothing more or an append failed */
+ if (l < 0) l = 0;
+ if (l == 0) s->isEOF = 1;
+ s->buff->slen = l;
+ /* BSTR_ERR if r did not grow, 0 otherwise */
+ return BSTR_ERR & -(r->slen == orslen);
+}
+
+/* int bsreadln (bstring r, struct bStream * s, char terminator)
+ *
+ * Replace the contents of r with the characters read from the bStream (s)
+ * up to and including the terminator character, or up to the end of the
+ * stream. The actual reading is done by bsreadlna after r has been
+ * emptied, so the over-read behaviour and return values are those of
+ * bsreadlna. BSTR_ERR is returned for invalid parameters or if the stream
+ * buffer cannot be sized.
+ */
+int bsreadln (bstring r, struct bStream * s, char terminator) {
+    int usable = (s != NULL && s->buff != NULL && r != NULL && r->mlen > 0);
+
+    if (!usable) return BSTR_ERR;
+    if (balloc (s->buff, s->maxBuffSz + 1) != BSTR_OK) return BSTR_ERR;
+
+    /* Drop the old contents of r, then reuse the appending variant */
+    r->slen = 0;
+    return bsreadlna (r, s, terminator);
+}
+
+/* int bsreadlns (bstring r, struct bStream * s, const_bstring term)
+ *
+ * Replace the contents of r with the characters read from the bStream (s)
+ * up to and including the first character that occurs in term, or up to
+ * the end of the stream. A term of length 1 is forwarded to bsreadln;
+ * longer terms are handled by bsreadlnsa after r has been emptied.
+ * BSTR_ERR is returned for invalid parameters, an empty term, or if the
+ * stream buffer cannot be sized.
+ */
+int bsreadlns (bstring r, struct bStream * s, const_bstring term) {
+    if (s == NULL || s->buff == NULL) return BSTR_ERR;
+    if (r == NULL || term == NULL) return BSTR_ERR;
+    if (term->data == NULL || r->mlen <= 0) return BSTR_ERR;
+
+    if (term->slen < 1) return BSTR_ERR;
+    if (term->slen == 1) return bsreadln (r, s, term->data[0]);
+
+    if (balloc (s->buff, s->maxBuffSz + 1) != BSTR_OK) return BSTR_ERR;
+
+    /* Drop the old contents of r, then reuse the appending variant */
+    r->slen = 0;
+    return bsreadlnsa (r, s, term);
+}
+
+/* int bsread (bstring r, struct bStream * s, int n)
+ *
+ * Replace the contents of r with up to n bytes (or, if it is fewer, as
+ * many bytes as is remaining) read from the bStream. The reading is done
+ * by bsreada after r has been emptied, so buffering behaviour and return
+ * values are those of bsreada. BSTR_ERR is returned for invalid
+ * parameters, a non-positive n, or if the stream buffer cannot be sized.
+ */
+int bsread (bstring r, struct bStream * s, int n) {
+    int usable = (s != NULL && s->buff != NULL && r != NULL && r->mlen > 0);
+
+    if (!usable || n <= 0) return BSTR_ERR;
+    if (balloc (s->buff, s->maxBuffSz + 1) != BSTR_OK) return BSTR_ERR;
+
+    /* Drop the old contents of r, then reuse the appending variant */
+    r->slen = 0;
+    return bsreada (r, s, n);
+}
+
+/* int bsunread (struct bStream * s, const_bstring b)
+ *
+ * Push the bstring b back onto the bStream by inserting it at the front of
+ * the stream's buffer, so that these characters are delivered before
+ * anything else the stream holds or reads later. Returns the result of the
+ * insertion, or BSTR_ERR if the stream or its buffer is NULL.
+ */
+int bsunread (struct bStream * s, const_bstring b) {
+    if (s != NULL && s->buff != NULL) {
+        return binsert (s->buff, 0, b, (unsigned char) '?');
+    }
+    return BSTR_ERR;
+}
+
+/* int bspeek (bstring r, const struct bStream * s)
+ *
+ * Copy the characters currently held in the bStream's buffer into r
+ * without consuming them; these are the characters that will be delivered
+ * before any further read from the core stream. Returns the result of the
+ * assignment, or BSTR_ERR if the stream or its buffer is NULL.
+ */
+int bspeek (bstring r, const struct bStream * s) {
+    if (s != NULL && s->buff != NULL) {
+        return bassign (r, s->buff);
+    }
+    return BSTR_ERR;
+}
+
+/* bstring bjoin (const struct bstrList * bl, const_bstring sep);
+ *
+ * Join the entries of a bstrList into one bstring by sequentially
+ * concatenating them with the sep string in between. A NULL sep joins the
+ * entries without anything between them, and an empty list yields an empty
+ * bstring.
+ *
+ * NULL is returned if bl or sep is invalid, if an entry is NULL or has
+ * invalid contents, if the total length does not fit in an int, or if
+ * memory cannot be obtained.
+ */
+bstring bjoin (const struct bstrList * bl, const_bstring sep) {
+    bstring b;
+    int i, c, v;
+
+    if (bl == NULL || bl->qty < 0) return NULL;
+    if (sep != NULL && (sep->slen < 0 || sep->data == NULL)) return NULL;
+
+    /* Nothing to join. Handled up front because the separator total below
+       is (qty - 1) * sep->slen, which would otherwise shrink the
+       allocation size to zero or below. */
+    if (bl->qty == 0) return bfromcstr ("");
+    if (bl->entry == NULL) return NULL;
+
+    /* c accumulates the required allocation, starting with the '\0' */
+    for (i = 0, c = 1; i < bl->qty; i++) {
+        if (bl->entry[i] == NULL || bl->entry[i]->data == NULL) return NULL;
+        v = bl->entry[i]->slen;
+        if (v < 0) return NULL; /* Invalid input */
+        if (v > INT_MAX - c) return NULL; /* Would wrap around */
+        c += v;
+    }
+
+    if (sep != NULL && sep->slen > 0) {
+        if (bl->qty - 1 > (INT_MAX - c) / sep->slen) return NULL; /* Would wrap around */
+        c += (bl->qty - 1) * sep->slen;
+    }
+
+    b = (bstring) bstr__alloc (sizeof (struct tagbstring));
+    if (NULL == b) return NULL; /* Out of memory */
+    b->data = (unsigned char *) bstr__alloc (c);
+    if (b->data == NULL) {
+        bstr__free (b);
+        return NULL;
+    }
+
+    b->mlen = c;
+    b->slen = c-1;
+
+    /* c is reused as the write position within b->data */
+    for (i = 0, c = 0; i < bl->qty; i++) {
+        if (i > 0 && sep != NULL) {
+            bstr__memcpy (b->data + c, sep->data, sep->slen);
+            c += sep->slen;
+        }
+        v = bl->entry[i]->slen;
+        bstr__memcpy (b->data + c, bl->entry[i]->data, v);
+        c += v;
+    }
+    b->data[c] = (unsigned char) '\0';
+    return b;
+}
+
+/* Number of bytes requested from the stream per bsreada call by the stream
+   splitting functions below */
+#define BSSSC_BUFF_LEN (256)
+
+/* int bssplitscb (struct bStream * s, const_bstring splitStr,
+ * int (* cb) (void * parm, int ofs, const_bstring entry), void * parm)
+ *
+ * Iterate the set of disjoint sequential substrings read from a stream
+ * divided by any of the characters in splitStr. An empty splitStr causes
+ * the whole stream to be iterated once.
+ *
+ * The cb function receives parm, the offset of the entry relative to the
+ * position of the stream at the time of the call, and the entry itself.
+ * Iteration stops as soon as cb returns a negative value, which then
+ * becomes the return value of bssplitscb.
+ *
+ * Note: At the point of calling the cb function, the bStream pointer is
+ * pointed exactly at the position right after having read the split
+ * character. The cb function can act on the stream by causing the bStream
+ * pointer to move, and bssplitscb will continue by starting the next split
+ * at the position of the pointer after the return from cb.
+ *
+ * However, if the cb causes the bStream s to be destroyed then the cb must
+ * return with a negative value, otherwise bssplitscb will continue in an
+ * undefined manner.
+ */
+int bssplitscb (struct bStream * s, const_bstring splitStr,
+ int (* cb) (void * parm, int ofs, const_bstring entry), void * parm) {
+struct charField chrs;
+bstring buff;
+int i, p, ret;
+
+ if (cb == NULL || s == NULL || s->readFnPtr == NULL
+ || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+
+ if (NULL == (buff = bfromcstr (""))) return BSTR_ERR;
+
+ if (splitStr->slen == 0) {
+ /* No split characters: drain the stream into buff and report it as
+    one entry; a positive cb result is normalised to 0. */
+ while (bsreada (buff, s, BSSSC_BUFF_LEN) >= 0) ;
+ if ((ret = cb (parm, 0, buff)) > 0)
+ ret = 0;
+ } else {
+ /* NOTE(review): the result of buildCharField is not checked here */
+ buildCharField (&chrs, splitStr);
+ /* i is the scan position within buff, p the offset handed to cb for
+    the entry currently being collected */
+ ret = p = i = 0;
+ for (;;) {
+ if (i >= buff->slen) {
+ /* Everything in buff has been scanned; fetch more */
+ bsreada (buff, s, BSSSC_BUFF_LEN);
+ if (i >= buff->slen) {
+ /* Nothing new arrived: report what is left as the last entry */
+ if (0 < (ret = cb (parm, p, buff))) ret = 0;
+ break;
+ }
+ }
+ if (testInCharField (&chrs, buff->data[i])) {
+ struct tagbstring t;
+ unsigned char c;
+
+ /* Return the bytes after the split character to the stream so
+    that the stream is positioned right behind it while cb runs */
+ blk2tbstr (t, buff->data + i + 1, buff->slen - (i + 1));
+ if ((ret = bsunread (s, &t)) < 0) break;
+ /* Cut buff down to the entry and terminate it for the call;
+    the overwritten byte is put back afterwards */
+ buff->slen = i;
+ c = buff->data[i];
+ buff->data[i] = (unsigned char) '\0';
+ if ((ret = cb (parm, p, buff)) < 0) break;
+ buff->data[i] = c;
+ /* Start collecting the next entry from an empty buff */
+ buff->slen = 0;
+ p += i + 1;
+ i = -1;
+ }
+ i++;
+ }
+ }
+
+ bdestroy (buff);
+ return ret;
+}
+
+/* int bssplitstrcb (struct bStream * s, const_bstring splitStr,
+ * int (* cb) (void * parm, int ofs, const_bstring entry), void * parm)
+ *
+ * Iterate the set of disjoint sequential substrings read from a stream
+ * divided by the entire substring splitStr. A splitStr of length 1 is
+ * handed over to bssplitscb. An empty splitStr causes the stream to be
+ * passed to cb piecewise, in chunks of up to BSSSC_BUFF_LEN bytes, each
+ * with an offset of 0.
+ *
+ * The cb function receives parm, the offset of the entry counted from the
+ * position of the stream at the time bssplitstrcb was called, and the
+ * entry itself. Iteration stops as soon as cb returns a negative value,
+ * which then becomes the return value. BSTR_ERR is returned for invalid
+ * parameters or when reading from the stream fails before its end has been
+ * reached; otherwise BSTR_OK is returned.
+ *
+ * Note: data is read from the stream in chunks, so the stream may have
+ * been read past the entry that is being reported when cb is called.
+ *
+ * If the cb causes the bStream s to be destroyed then the cb must return
+ * with a negative value, otherwise bssplitstrcb will continue in an
+ * undefined manner.
+ */
+int bssplitstrcb (struct bStream * s, const_bstring splitStr,
+    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm) {
+    bstring buff;
+    int p, pos, ret;
+
+    if (cb == NULL || s == NULL || s->readFnPtr == NULL
+     || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+
+    if (splitStr->slen == 1) return bssplitscb (s, splitStr, cb, parm);
+
+    if (NULL == (buff = bfromcstr (""))) return BSTR_ERR;
+
+    ret = BSTR_OK;
+    if (splitStr->slen == 0) {
+        /* Both exits of this loop fall through to the common clean-up
+           below so that buff is released on every path. */
+        while (bsreada (buff, s, BSSSC_BUFF_LEN) >= 0) {
+            int rc = cb (parm, 0, buff);
+            if (rc < 0) {
+                ret = rc;
+                break;
+            }
+            buff->slen = 0;
+        }
+    } else {
+        /* p is the offset of the entry that starts at the front of buff */
+        for (p = 0;;) {
+            pos = binstr (buff, 0, splitStr);
+            if (pos >= 0) {
+                struct tagbstring t;
+                int used = pos + splitStr->slen;
+
+                /* Report the text in front of the separator, then drop
+                   both from buff */
+                blk2tbstr (t, buff->data, pos);
+                if ((ret = cb (parm, p, &t)) < 0) break;
+                p += used;
+                bdelete (buff, 0, used);
+            } else if (bseof (s)) {
+                /* The end of the stream is only acted upon once buff holds
+                   no further separator, so separators that arrive with the
+                   final read are still split on. */
+                if ((ret = cb (parm, p, buff)) > 0) ret = 0;
+                break;
+            } else if (bsreada (buff, s, BSSSC_BUFF_LEN) < 0 && !bseof (s)) {
+                /* The read failed for a reason other than the end of the
+                   stream; give up rather than retry forever. */
+                ret = BSTR_ERR;
+                break;
+            }
+        }
+    }
+
+    bdestroy (buff);
+    return ret;
+}
+
+/* struct bstrList * bstrListCreate (void)
+ *
+ * Create an empty bstrList with room for a single entry. Returns NULL if
+ * either the list header or its entry array cannot be allocated.
+ */
+struct bstrList * bstrListCreate (void) {
+    struct bstrList * list;
+
+    list = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
+    if (list == NULL) return NULL;
+
+    list->entry = (bstring *) bstr__alloc (1*sizeof (bstring));
+    if (list->entry == NULL) {
+        bstr__free (list);
+        return NULL;
+    }
+
+    list->mlen = 1;
+    list->qty = 0;
+    return list;
+}
+
+/* int bstrListDestroy (struct bstrList * sl)
+ *
+ * Destroy a bstrList that has been created by bsplit, bsplits or
+ * bstrListCreate: every non-NULL entry is destroyed, then the entry array
+ * and the list header are released. Returns BSTR_ERR if sl is NULL or its
+ * quantity is negative, BSTR_OK otherwise.
+ */
+int bstrListDestroy (struct bstrList * sl) {
+    int idx;
+
+    if (sl == NULL || sl->qty < 0) return BSTR_ERR;
+
+    idx = 0;
+    while (idx < sl->qty) {
+        if (sl->entry[idx] != NULL) {
+            bdestroy (sl->entry[idx]);
+            sl->entry[idx] = NULL;
+        }
+        idx++;
+    }
+
+    /* Mark the header as invalid before it is released */
+    sl->mlen = -1;
+    sl->qty = -1;
+
+    bstr__free (sl->entry);
+    sl->entry = NULL;
+    bstr__free (sl);
+    return BSTR_OK;
+}
+
+/* int bstrListAlloc (struct bstrList * sl, int msz)
+ *
+ * Ensure that there is memory for at least msz number of entries for the
+ * list. The capacity is first rounded up with snapUpSize; if that
+ * allocation fails, exactly msz entries are requested instead. Returns
+ * BSTR_OK on success (including when the list is already large enough)
+ * and BSTR_ERR for an invalid list, a non-positive msz, a size that wraps
+ * around, or allocation failure.
+ */
+int bstrListAlloc (struct bstrList * sl, int msz) {
+    bstring * grown;
+    int target;
+    size_t bytes;
+
+    if (sl == NULL || msz <= 0) return BSTR_ERR;
+    if (sl->entry == NULL || sl->qty < 0 || sl->mlen <= 0
+     || sl->qty > sl->mlen) return BSTR_ERR;
+
+    if (msz <= sl->mlen) return BSTR_OK;
+
+    target = snapUpSize (msz);
+    bytes = sizeof (bstring) * (size_t) target;
+    if (bytes < (size_t) target) return BSTR_ERR;
+
+    grown = (bstring *) bstr__realloc (sl->entry, bytes);
+    if (grown == NULL) {
+        /* Retry with the bare minimum */
+        target = msz;
+        bytes = sizeof (bstring) * (size_t) target;
+        grown = (bstring *) bstr__realloc (sl->entry, bytes);
+        if (grown == NULL) return BSTR_ERR;
+    }
+
+    sl->entry = grown;
+    sl->mlen = target;
+    return BSTR_OK;
+}
+
+/* int bstrListAllocMin (struct bstrList * sl, int msz)
+ *
+ * Resize the entry array of the list to hold exactly msz entries, or
+ * sl->qty entries if that is greater. Returns BSTR_OK on success
+ * (including when the capacity already matches) and BSTR_ERR for an
+ * invalid list, a non-positive msz, a size that wraps around, or
+ * allocation failure.
+ */
+int bstrListAllocMin (struct bstrList * sl, int msz) {
+    bstring * resized;
+    size_t bytes;
+    int wanted;
+
+    if (sl == NULL || msz <= 0) return BSTR_ERR;
+    if (sl->entry == NULL || sl->qty < 0 || sl->mlen <= 0
+     || sl->qty > sl->mlen) return BSTR_ERR;
+
+    /* Never shrink below the number of entries in use */
+    wanted = (msz < sl->qty) ? sl->qty : msz;
+    if (wanted == sl->mlen) return BSTR_OK;
+
+    bytes = sizeof (bstring) * (size_t) wanted;
+    if (bytes < (size_t) wanted) return BSTR_ERR;
+
+    resized = (bstring *) bstr__realloc (sl->entry, bytes);
+    if (resized == NULL) return BSTR_ERR;
+
+    sl->entry = resized;
+    sl->mlen = wanted;
+    return BSTR_OK;
+}
+
+/* int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
+ * int (* cb) (void * parm, int ofs, int len), void * parm)
+ *
+ * Starting at position pos, walk over the disjoint sequential substrings
+ * of str that are separated by splitChar and call cb with the offset and
+ * length of each one. A negative return from cb stops the walk and is
+ * passed back to the caller; otherwise BSTR_OK is returned. BSTR_ERR is
+ * returned for a NULL cb or str, or a pos outside of str.
+ *
+ * Note: the length of str is re-read after every call to cb, and the walk
+ * resumes 1 character after the last detected split character, halting
+ * once that position lies beyond the end of str. The cb function may
+ * therefore modify str non-destructively. If cb destroys str, it *must*
+ * return a negative value, otherwise bsplitcb will continue in an
+ * undefined manner.
+ */
+int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
+    int (* cb) (void * parm, int ofs, int len), void * parm) {
+    int start, end, rc;
+
+    if (cb == NULL || str == NULL) return BSTR_ERR;
+    if (pos < 0 || pos > str->slen) return BSTR_ERR;
+
+    start = pos;
+    for (;;) {
+        /* Advance end to the next split character or the end of str */
+        end = start;
+        while (end < str->slen && str->data[end] != splitChar) end++;
+
+        rc = cb (parm, start, end - start);
+        if (rc < 0) return rc;
+
+        start = end + 1;
+        if (start > str->slen) break;
+    }
+    return BSTR_OK;
+}
+
+/* int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
+ * int (* cb) (void * parm, int ofs, int len), void * parm)
+ *
+ * Starting at position pos, walk over the disjoint sequential substrings
+ * of str that are separated by any of the characters in splitStr and call
+ * cb with the offset and length of each one. An empty splitStr causes cb
+ * to be called once for the whole of str (offset 0, full length); a
+ * splitStr of length 1 is handed over to bsplitcb. A negative return from
+ * cb stops the walk and is passed back to the caller. BSTR_ERR is returned
+ * for invalid parameters.
+ *
+ * Note: the length of str is re-read after every call to cb, and the walk
+ * resumes 1 character after the last detected split character, halting
+ * once that position lies beyond the end of str. The cb function may
+ * therefore modify str non-destructively. If cb destroys str, it *must*
+ * return a negative value, otherwise bsplitscb will continue in an
+ * undefined manner.
+ */
+int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
+    int (* cb) (void * parm, int ofs, int len), void * parm) {
+    struct charField separators;
+    int start, end, rc;
+
+    if (cb == NULL || str == NULL || pos < 0 || pos > str->slen) return BSTR_ERR;
+    if (splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+
+    switch (splitStr->slen) {
+    case 0:
+        rc = cb (parm, 0, str->slen);
+        return (rc > 0) ? 0 : rc;
+    case 1:
+        return bsplitcb (str, splitStr->data[0], pos, cb, parm);
+    default:
+        break;
+    }
+
+    buildCharField (&separators, splitStr);
+
+    start = pos;
+    for (;;) {
+        /* Advance end to the next split character or the end of str */
+        end = start;
+        while (end < str->slen && !testInCharField (&separators, str->data[end])) end++;
+
+        rc = cb (parm, start, end - start);
+        if (rc < 0) return rc;
+
+        start = end + 1;
+        if (start > str->slen) break;
+    }
+    return BSTR_OK;
+}
+
+/* int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
+ * int (* cb) (void * parm, int ofs, int len), void * parm)
+ *
+ * Iterate the set of disjoint sequential substrings over str, starting at
+ * pos, divided by the substring splitStr, calling cb with the offset and
+ * length of each one. Adjacent occurrences of splitStr produce an empty
+ * substring between them. An empty splitStr causes cb to be called once
+ * for each character of str from pos onward; a splitStr of length 1 is
+ * handed over to bsplitcb.
+ *
+ * A negative return from cb stops the iteration and is passed back to the
+ * caller; otherwise BSTR_OK is returned. BSTR_ERR is returned for invalid
+ * parameters.
+ *
+ * Note: the length of str is re-read after every call to cb, so the cb
+ * function may modify str non-destructively. However, if the cb function
+ * destroys str, then it *must* return with a negative value, otherwise
+ * bsplitstrcb will continue in an undefined manner.
+ */
+int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
+    int (* cb) (void * parm, int ofs, int len), void * parm) {
+    int i, p, ret;
+
+    if (cb == NULL || str == NULL || pos < 0 || pos > str->slen
+     || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+
+    if (0 == splitStr->slen) {
+        for (i=pos; i < str->slen; i++) {
+            if ((ret = cb (parm, i, 1)) < 0) return ret;
+        }
+        return BSTR_OK;
+    }
+
+    if (splitStr->slen == 1)
+        return bsplitcb (str, splitStr->data[0], pos, cb, parm);
+
+    /* i is the scan position, p the start of the substring being collected */
+    i = p = pos;
+    while (i <= str->slen - splitStr->slen) {
+        if (0 == bstr__memcmp (splitStr->data, str->data + i, splitStr->slen)) {
+            if ((ret = cb (parm, p, i - p)) < 0) return ret;
+            /* Resume directly behind the separator. Stepping one further
+               here would skip a byte and miss a separator that follows
+               immediately. */
+            i += splitStr->slen;
+            p = i;
+        } else {
+            i++;
+        }
+    }
+    if ((ret = cb (parm, p, str->slen - p)) < 0) return ret;
+    return BSTR_OK;
+}
+
+/* Context handed to bscb by the list producing split functions: the string
+ * being split and the list that collects the pieces. */
+struct genBstrList {
+    bstring b;
+    struct bstrList * bl;
+};
+
+/* static int bscb (void * parm, int ofs, int len)
+ *
+ * Split callback that appends a copy of the substring (ofs, len) of the
+ * string being split to the list under construction, doubling the list's
+ * capacity when it is full. parm points to a struct genBstrList. Returns
+ * BSTR_OK on success and BSTR_ERR if the list cannot be grown or the
+ * substring cannot be created; in the latter case the list is left
+ * unchanged.
+ */
+static int bscb (void * parm, int ofs, int len) {
+    struct genBstrList * g = (struct genBstrList *) parm;
+    bstring piece;
+
+    if (g->bl->qty >= g->bl->mlen) {
+        int mlen = g->bl->mlen * 2;
+        bstring * tbl;
+
+        while (g->bl->qty >= mlen) {
+            if (mlen < g->bl->mlen) return BSTR_ERR;
+            mlen += mlen;
+        }
+
+        tbl = (bstring *) bstr__realloc (g->bl->entry, sizeof (bstring) * mlen);
+        if (tbl == NULL) return BSTR_ERR;
+
+        g->bl->entry = tbl;
+        g->bl->mlen = mlen;
+    }
+
+    /* Do not record a NULL entry as a success: failing here makes the
+       split abort, so callers never receive a list with holes in it. */
+    piece = bmidstr (g->b, ofs, len);
+    if (piece == NULL) return BSTR_ERR;
+
+    g->bl->entry[g->bl->qty] = piece;
+    g->bl->qty++;
+    return BSTR_OK;
+}
+
+/* struct bstrList * bsplit (const_bstring str, unsigned char splitChar)
+ *
+ * Build a bstrList holding copies of the sequential substrings of str that
+ * are separated by the character splitChar. Returns NULL if str is
+ * invalid, if memory cannot be obtained, or if splitting fails.
+ */
+struct bstrList * bsplit (const_bstring str, unsigned char splitChar) {
+    struct genBstrList ctx;
+    struct bstrList * list;
+
+    if (str == NULL || str->data == NULL || str->slen < 0) return NULL;
+
+    list = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
+    if (list == NULL) return NULL;
+
+    /* Start with room for four entries; bscb grows the list on demand */
+    list->qty = 0;
+    list->mlen = 4;
+    list->entry = (bstring *) bstr__alloc (list->mlen * sizeof (bstring));
+    if (list->entry == NULL) {
+        bstr__free (list);
+        return NULL;
+    }
+
+    ctx.b = (bstring) str;
+    ctx.bl = list;
+    if (bsplitcb (str, splitChar, 0, bscb, &ctx) >= 0) return ctx.bl;
+
+    bstrListDestroy (ctx.bl);
+    return NULL;
+}
+
+/* struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr)
+ *
+ * Build a bstrList holding copies of the sequential substrings of str that
+ * are separated by the entire substring splitStr. Returns NULL if str is
+ * invalid, if memory cannot be obtained, or if splitting fails (which
+ * includes an invalid splitStr).
+ */
+struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr) {
+    struct genBstrList ctx;
+    struct bstrList * list;
+
+    if (str == NULL || str->data == NULL || str->slen < 0) return NULL;
+
+    list = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
+    if (list == NULL) return NULL;
+
+    /* Start with room for four entries; bscb grows the list on demand */
+    list->qty = 0;
+    list->mlen = 4;
+    list->entry = (bstring *) bstr__alloc (list->mlen * sizeof (bstring));
+    if (list->entry == NULL) {
+        bstr__free (list);
+        return NULL;
+    }
+
+    ctx.b = (bstring) str;
+    ctx.bl = list;
+    if (bsplitstrcb (str, splitStr, 0, bscb, &ctx) >= 0) return ctx.bl;
+
+    bstrListDestroy (ctx.bl);
+    return NULL;
+}
+
+/* struct bstrList * bsplits (const_bstring str, const_bstring splitStr)
+ *
+ * Build a bstrList holding copies of the sequential substrings of str that
+ * are separated by any of the characters in splitStr. An empty splitStr
+ * causes a single entry bstrList containing a copy of str to be returned.
+ * Returns NULL if str or splitStr is invalid, if memory cannot be
+ * obtained, or if splitting fails.
+ */
+struct bstrList * bsplits (const_bstring str, const_bstring splitStr) {
+    struct genBstrList ctx;
+    struct bstrList * list;
+
+    if (str == NULL || str->slen < 0 || str->data == NULL) return NULL;
+    if (splitStr == NULL || splitStr->slen < 0 || splitStr->data == NULL)
+        return NULL;
+
+    list = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
+    if (list == NULL) return NULL;
+
+    /* Start with room for four entries; bscb grows the list on demand */
+    list->qty = 0;
+    list->mlen = 4;
+    list->entry = (bstring *) bstr__alloc (list->mlen * sizeof (bstring));
+    if (list->entry == NULL) {
+        bstr__free (list);
+        return NULL;
+    }
+
+    ctx.b = (bstring) str;
+    ctx.bl = list;
+    if (bsplitscb (str, splitStr, 0, bscb, &ctx) >= 0) return ctx.bl;
+
+    bstrListDestroy (ctx.bl);
+    return NULL;
+}
+
+/* Turbo C (but not Borland C) is built with BSTRLIB_NOVSNP defined, which
+   compiles out the formatting functions further below. */
+#if defined (__TURBOC__) && !defined (__BORLANDC__)
+# ifndef BSTRLIB_NOVSNP
+# define BSTRLIB_NOVSNP
+# endif
+#endif
+
+/* exvsnprintf (r,b,n,f,a) formats into buffer b of size n with format f and
+   argument list a, storing the call's result in r. */
+
+/* Give WATCOM C/C++, MSVC some latitude for their non-support of vsnprintf */
+#if defined(__WATCOMC__) || defined(_MSC_VER)
+#define exvsnprintf(r,b,n,f,a) {r = _vsnprintf (b,n,f,a);}
+#else
+#ifdef BSTRLIB_NOVSNP
+/* This is just a hack. If you are using a system without a vsnprintf, it is
+ not recommended that bformat be used at all. Note that this variant
+ ignores the size n and always sets r to -1. */
+#define exvsnprintf(r,b,n,f,a) {vsprintf (b,f,a); r = -1;}
+#define START_VSNBUFF (256)
+#else
+
+#if defined(__GNUC__) && !defined(__APPLE__)
+/* Something is making gcc complain about this prototype not being here, so
+ I've just gone ahead and put it in. */
+extern int vsnprintf (char *buf, size_t count, const char *format, va_list arg);
+#endif
+
+#define exvsnprintf(r,b,n,f,a) {r = vsnprintf (b,n,f,a);}
+#endif
+#endif
+
+/* Everything from here to the matching #endif is only built when a
+   vsnprintf style function is available. */
+#if !defined (BSTRLIB_NOVSNP)
+
+/* Lower bound for the first buffer size tried by the formatting functions */
+#ifndef START_VSNBUFF
+#define START_VSNBUFF (16)
+#endif
+
+/* On IRIX vsnprintf returns n-1 when the operation would overflow the target
+ buffer, WATCOM and MSVC both return -1, while C99 requires that the
+ returned value be exactly what the length would be if the buffer would be
+ large enough. This leads to the idea that if the return value is larger
+ than n, then changing n to the return value will reduce the number of
+ iterations required. */
+
+/* int bformata (bstring b, const char * fmt, ...)
+ *
+ * After the first parameter, it takes the same parameters as printf (), but
+ * rather than outputting results to stdio, it appends the results to
+ * a bstring which contains what would have been output. Note that if there
+ * is an early generation of a '\0' character, the bstring will be truncated
+ * to this end point.
+ *
+ * Returns BSTR_ERR for invalid parameters or when the temporary buffer
+ * cannot be allocated; otherwise the result of appending the formatted
+ * text to b is returned.
+ */
+int bformata (bstring b, const char * fmt, ...) {
+va_list arglist;
+bstring buff;
+int n, r;
+
+ if (b == NULL || fmt == NULL || b->data == NULL || b->mlen <= 0
+ || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
+
+ /* Since the length is not determinable beforehand, a search is
+ performed using the truncating "vsnprintf" call (to avoid buffer
+ overflows) on increasing potential sizes for the output result. */
+
+ /* First guess: twice the format length, but at least START_VSNBUFF.
+    If that cannot be allocated, fall back to the smallest buffer. */
+ if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
+ if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
+ n = 1;
+ if (NULL == (buff = bfromcstralloc (n + 2, ""))) return BSTR_ERR;
+ }
+
+ for (;;) {
+ /* The argument list is restarted for every attempt */
+ va_start (arglist, fmt);
+ exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
+ va_end (arglist);
+
+ /* Force termination, then measure what was actually produced */
+ buff->data[n] = (unsigned char) '\0';
+ buff->slen = (int) (strlen) ((char *) buff->data);
+
+ /* Fewer than n characters means the output was not cut short */
+ if (buff->slen < n) break;
+
+ /* Use the reported length when there is one, otherwise double */
+ if (r > n) n = r; else n += n;
+
+ if (BSTR_OK != balloc (buff, n + 2)) {
+ bdestroy (buff);
+ return BSTR_ERR;
+ }
+ }
+
+ r = bconcat (b, buff);
+ bdestroy (buff);
+ return r;
+}
+
+/* int bassignformat (bstring b, const char * fmt, ...)
+ *
+ * After the first parameter, it takes the same parameters as printf (), but
+ * rather than outputting results to stdio, it outputs the results to
+ * the bstring parameter b. Note that if there is an early generation of a
+ * '\0' character, the bstring will be truncated to this end point.
+ *
+ * Returns BSTR_ERR for invalid parameters or when the temporary buffer
+ * cannot be allocated; otherwise the result of assigning the formatted
+ * text to b is returned.
+ */
+int bassignformat (bstring b, const char * fmt, ...) {
+va_list arglist;
+bstring buff;
+int n, r;
+
+ if (b == NULL || fmt == NULL || b->data == NULL || b->mlen <= 0
+ || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
+
+ /* Since the length is not determinable beforehand, a search is
+ performed using the truncating "vsnprintf" call (to avoid buffer
+ overflows) on increasing potential sizes for the output result. */
+
+ /* First guess: twice the format length, but at least START_VSNBUFF.
+    If that cannot be allocated, fall back to the smallest buffer. */
+ if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
+ if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
+ n = 1;
+ if (NULL == (buff = bfromcstralloc (n + 2, ""))) return BSTR_ERR;
+ }
+
+ for (;;) {
+ /* The argument list is restarted for every attempt */
+ va_start (arglist, fmt);
+ exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
+ va_end (arglist);
+
+ /* Force termination, then measure what was actually produced */
+ buff->data[n] = (unsigned char) '\0';
+ buff->slen = (int) (strlen) ((char *) buff->data);
+
+ /* Fewer than n characters means the output was not cut short */
+ if (buff->slen < n) break;
+
+ /* Use the reported length when there is one, otherwise double */
+ if (r > n) n = r; else n += n;
+
+ if (BSTR_OK != balloc (buff, n + 2)) {
+ bdestroy (buff);
+ return BSTR_ERR;
+ }
+ }
+
+ r = bassign (b, buff);
+ bdestroy (buff);
+ return r;
+}
+
+/* bstring bformat (const char * fmt, ...)
+ *
+ * Takes the same parameters as printf (), but rather than outputting results
+ * to stdio, it forms a bstring which contains what would have been output.
+ * Note that if there is an early generation of a '\0' character, the
+ * bstring will be truncated to this end point.
+ *
+ * Returns the newly created bstring, or NULL if fmt is NULL or memory
+ * cannot be obtained.
+ */
+bstring bformat (const char * fmt, ...) {
+va_list arglist;
+bstring buff;
+int n, r;
+
+ if (fmt == NULL) return NULL;
+
+ /* Since the length is not determinable beforehand, a search is
+ performed using the truncating "vsnprintf" call (to avoid buffer
+ overflows) on increasing potential sizes for the output result. */
+
+ /* First guess: twice the format length, but at least START_VSNBUFF.
+    If that cannot be allocated, fall back to the smallest buffer. */
+ if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
+ if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
+ n = 1;
+ if (NULL == (buff = bfromcstralloc (n + 2, ""))) return NULL;
+ }
+
+ for (;;) {
+ /* The argument list is restarted for every attempt */
+ va_start (arglist, fmt);
+ exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
+ va_end (arglist);
+
+ /* Force termination, then measure what was actually produced */
+ buff->data[n] = (unsigned char) '\0';
+ buff->slen = (int) (strlen) ((char *) buff->data);
+
+ /* Fewer than n characters means the output was not cut short */
+ if (buff->slen < n) break;
+
+ /* Use the reported length when there is one, otherwise double */
+ if (r > n) n = r; else n += n;
+
+ if (BSTR_OK != balloc (buff, n + 2)) {
+ bdestroy (buff);
+ return NULL;
+ }
+ }
+
+ /* The work buffer itself becomes the result */
+ return buff;
+}
+
+/* int bvcformata (bstring b, int count, const char * fmt, va_list arglist)
+ *
+ * The bvcformata function formats data under control of the format control
+ * string fmt and attempts to append the result to b. The fmt parameter is
+ * the same as that of the printf function. The variable argument list is
+ * replaced with arglist, which has been initialized by the va_start macro.
+ * The size of the appended output is upper bounded by count. If the
+ * required output exceeds count, the string b is not augmented with any
+ * contents and a value below BSTR_ERR is returned. If a value below -count
+ * is returned then it is recommended that the negative of this value be
+ * used as an update to the count in a subsequent pass. On other errors,
+ * such as running out of memory, parameter errors or numeric wrap around
+ * BSTR_ERR is returned. BSTR_OK is returned when the output is successfully
+ * generated and appended to b.
+ *
+ * Note: There is no sanity checking of arglist, and this function is
+ * destructive of the contents of b from the b->slen point onward. If there
+ * is an early generation of a '\0' character, the bstring will be truncated
+ * to this end point.
+ */
+int bvcformata (bstring b, int count, const char * fmt, va_list arg) {
+int n, r, l;
+
+ if (b == NULL || fmt == NULL || count <= 0 || b->data == NULL
+ || b->mlen <= 0 || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
+
+ /* n is the largest length b may have afterwards; the comparison rejects
+    a sum that has wrapped around */
+ if (count > (n = b->slen + count) + 2) return BSTR_ERR;
+ if (BSTR_OK != balloc (b, n + 2)) return BSTR_ERR;
+
+ /* Format directly behind the current contents of b */
+ exvsnprintf (r, (char *) b->data + b->slen, count + 2, fmt, arg);
+
+ /* Did the operation complete successfully within bounds? */
+ /* A '\0' at or before index n means at most count characters were added */
+ for (l = b->slen; l <= n; l++) {
+ if ('\0' == b->data[l]) {
+ b->slen = l;
+ return BSTR_OK;
+ }
+ }
+
+ /* Abort, since the buffer was not large enough. The return value
+ tries to help set what the retry length should be. */
+
+ /* Cut b back to its original contents */
+ b->data[b->slen] = '\0';
+ if (r > count + 1) { /* Does r specify a particular target length? */
+ n = r;
+ } else {
+ n = count + count; /* If not, just double the size of count */
+ if (count > n) n = INT_MAX;
+ }
+ n = -n;
+
+ /* Keep the result strictly below BSTR_ERR so it cannot be mistaken for it */
+ if (n > BSTR_ERR-1) n = BSTR_ERR-1;
+ return n;
+}
+
+#endif
diff --git a/src/bstrlib.h b/src/bstrlib.h new file mode 100644 index 0000000..c8fa694 --- /dev/null +++ b/src/bstrlib.h @@ -0,0 +1,304 @@ +/*
+ * This source file is part of the bstring string library. This code was
+ * written by Paul Hsieh in 2002-2010, and is covered by either the 3-clause
+ * BSD open source license or GPL v2.0. Refer to the accompanying documentation
+ * for details on usage and license.
+ */
+
+/*
+ * bstrlib.h
+ *
+ * This file is the header file for the core module for implementing the
+ * bstring functions.
+ */
+
+#ifndef BSTRLIB_INCLUDE
+#define BSTRLIB_INCLUDE
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdarg.h>
+#include <string.h>
+#include <limits.h>
+#include <ctype.h>
+
+#if !defined (BSTRLIB_VSNP_OK) && !defined (BSTRLIB_NOVSNP)
+# if defined (__TURBOC__) && !defined (__BORLANDC__)
+# define BSTRLIB_NOVSNP
+# endif
+#endif
+
+#define BSTR_ERR (-1) /* Failure code returned by int-valued functions (e.g. bvcformata) */
+#define BSTR_OK (0) /* Success code returned by int-valued functions (e.g. bvcformata) */
+#define BSTR_BS_BUFF_LENGTH_GET (0) /* NOTE(review): presumably the sz value given to bsbufflength () to query the length instead of setting it -- confirm */
+
+typedef struct tagbstring * bstring; /* Pointer to a string object that may be modified */
+typedef const struct tagbstring * const_bstring; /* Pointer to a string object that is only read */
+
+/* Copy functions */
+#define cstr2bstr bfromcstr
+extern bstring bfromcstr (const char * str);
+extern bstring bfromcstralloc (int mlen, const char * str);
+extern bstring blk2bstr (const void * blk, int len);
+extern char * bstr2cstr (const_bstring s, char z);
+extern int bcstrfree (char * s);
+extern bstring bstrcpy (const_bstring b1);
+extern int bassign (bstring a, const_bstring b);
+extern int bassignmidstr (bstring a, const_bstring b, int left, int len);
+extern int bassigncstr (bstring a, const char * str);
+extern int bassignblk (bstring a, const void * s, int len);
+
+/* Destroy function */
+extern int bdestroy (bstring b);
+
+/* Space allocation hinting functions */
+extern int balloc (bstring s, int len);
+extern int ballocmin (bstring b, int len);
+
+/* Substring extraction */
+extern bstring bmidstr (const_bstring b, int left, int len);
+
+/* Various standard manipulations */
+extern int bconcat (bstring b0, const_bstring b1);
+extern int bconchar (bstring b0, char c);
+extern int bcatcstr (bstring b, const char * s);
+extern int bcatblk (bstring b, const void * s, int len);
+extern int binsert (bstring s1, int pos, const_bstring s2, unsigned char fill);
+extern int binsertch (bstring s1, int pos, int len, unsigned char fill);
+extern int breplace (bstring b1, int pos, int len, const_bstring b2, unsigned char fill);
+extern int bdelete (bstring s1, int pos, int len);
+extern int bsetstr (bstring b0, int pos, const_bstring b1, unsigned char fill);
+extern int btrunc (bstring b, int n);
+
+/* Scan/search functions */
+extern int bstricmp (const_bstring b0, const_bstring b1);
+extern int bstrnicmp (const_bstring b0, const_bstring b1, int n);
+extern int biseqcaseless (const_bstring b0, const_bstring b1);
+extern int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len);
+extern int biseq (const_bstring b0, const_bstring b1);
+extern int bisstemeqblk (const_bstring b0, const void * blk, int len);
+extern int biseqcstr (const_bstring b, const char * s);
+extern int biseqcstrcaseless (const_bstring b, const char * s);
+extern int bstrcmp (const_bstring b0, const_bstring b1);
+extern int bstrncmp (const_bstring b0, const_bstring b1, int n);
+extern int binstr (const_bstring s1, int pos, const_bstring s2);
+extern int binstrr (const_bstring s1, int pos, const_bstring s2);
+extern int binstrcaseless (const_bstring s1, int pos, const_bstring s2);
+extern int binstrrcaseless (const_bstring s1, int pos, const_bstring s2);
+extern int bstrchrp (const_bstring b, int c, int pos);
+extern int bstrrchrp (const_bstring b, int c, int pos);
+#define bstrchr(b,c) bstrchrp ((b), (c), 0)
+#define bstrrchr(b,c) bstrrchrp ((b), (c), blength(b)-1)
+extern int binchr (const_bstring b0, int pos, const_bstring b1);
+extern int binchrr (const_bstring b0, int pos, const_bstring b1);
+extern int bninchr (const_bstring b0, int pos, const_bstring b1);
+extern int bninchrr (const_bstring b0, int pos, const_bstring b1);
+extern int bfindreplace (bstring b, const_bstring find, const_bstring repl, int pos);
+extern int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl, int pos);
+
+/* List of string container functions
+ *
+ * struct bstrList holds an array of bstring handles:
+ *   qty   - NOTE(review): presumably the number of entries in use -- confirm
+ *   mlen  - NOTE(review): presumably the number of slots allocated in
+ *           entry -- confirm
+ *   entry - pointer to the array of bstring handles
+ */
+struct bstrList {
+ int qty, mlen;
+ bstring * entry;
+};
+extern struct bstrList * bstrListCreate (void);
+extern int bstrListDestroy (struct bstrList * sl);
+extern int bstrListAlloc (struct bstrList * sl, int msz);
+extern int bstrListAllocMin (struct bstrList * sl, int msz);
+
+/* String split and join functions */
+extern struct bstrList * bsplit (const_bstring str, unsigned char splitChar);
+extern struct bstrList * bsplits (const_bstring str, const_bstring splitStr);
+extern struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr);
+extern bstring bjoin (const struct bstrList * bl, const_bstring sep);
+extern int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
+ int (* cb) (void * parm, int ofs, int len), void * parm);
+extern int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
+ int (* cb) (void * parm, int ofs, int len), void * parm);
+extern int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
+ int (* cb) (void * parm, int ofs, int len), void * parm);
+
+/* Miscellaneous functions */
+extern int bpattern (bstring b, int len);
+extern int btoupper (bstring b);
+extern int btolower (bstring b);
+extern int bltrimws (bstring b);
+extern int brtrimws (bstring b);
+extern int btrimws (bstring b);
+
+/* <*>printf format functions
+ *
+ * These are only declared when BSTRLIB_NOVSNP is not defined.
+ *
+ * bvformata (ret, b, fmt, lastarg) is a statement macro meant to be used
+ * inside a variadic function: lastarg is handed to va_start, so it must be
+ * the last named parameter of the enclosing function. It calls bvcformata
+ * on b with a size limit that starts at 16. When bvcformata returns a value
+ * whose negation is larger than the current limit, that negation becomes
+ * the new limit and the call is retried with a freshly started va_list.
+ * On completion ret is assigned BSTR_OK (output appended) or BSTR_ERR.
+ *
+ * The macro expands to a brace block that declares locals prefixed with
+ * bstrtmp_, and ret is used unparenthesized as the target of an assignment.
+ */
+#if !defined (BSTRLIB_NOVSNP)
+extern bstring bformat (const char * fmt, ...);
+extern int bformata (bstring b, const char * fmt, ...);
+extern int bassignformat (bstring b, const char * fmt, ...);
+extern int bvcformata (bstring b, int count, const char * fmt, va_list arglist);
+
+#define bvformata(ret, b, fmt, lastarg) { \
+bstring bstrtmp_b = (b); \
+const char * bstrtmp_fmt = (fmt); \
+int bstrtmp_r = BSTR_ERR, bstrtmp_sz = 16; \
+ for (;;) { \
+ va_list bstrtmp_arglist; \
+ va_start (bstrtmp_arglist, lastarg); \
+ bstrtmp_r = bvcformata (bstrtmp_b, bstrtmp_sz, bstrtmp_fmt, bstrtmp_arglist); \
+ va_end (bstrtmp_arglist); \
+ if (bstrtmp_r >= 0) { /* Everything went ok */ \
+ bstrtmp_r = BSTR_OK; \
+ break; \
+ } else if (-bstrtmp_r <= bstrtmp_sz) { /* A real error? */ \
+ bstrtmp_r = BSTR_ERR; \
+ break; \
+ } \
+ bstrtmp_sz = -bstrtmp_r; /* Doubled or target size */ \
+ } \
+ ret = bstrtmp_r; \
+}
+
+#endif
+
+typedef int (*bNgetc) (void *parm); /* Character input callback type taken by bgets, bgetsa and bassigngets; parm is the caller supplied pointer those functions also receive */
+typedef size_t (* bNread) (void *buff, size_t elsize, size_t nelem, void *parm); /* Block input callback type taken by bread, breada and bsopen; NOTE(review): the parameter list resembles fread () -- confirm the expected semantics */
+
+/* Input functions */
+extern bstring bgets (bNgetc getcPtr, void * parm, char terminator);
+extern bstring bread (bNread readPtr, void * parm);
+extern int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator);
+extern int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator);
+extern int breada (bstring b, bNread readPtr, void * parm);
+
+/* Stream functions */
+extern struct bStream * bsopen (bNread readPtr, void * parm);
+extern void * bsclose (struct bStream * s);
+extern int bsbufflength (struct bStream * s, int sz);
+extern int bsreadln (bstring b, struct bStream * s, char terminator);
+extern int bsreadlns (bstring r, struct bStream * s, const_bstring term);
+extern int bsread (bstring b, struct bStream * s, int n);
+extern int bsreadlna (bstring b, struct bStream * s, char terminator);
+extern int bsreadlnsa (bstring r, struct bStream * s, const_bstring term);
+extern int bsreada (bstring b, struct bStream * s, int n);
+extern int bsunread (struct bStream * s, const_bstring b);
+extern int bspeek (bstring r, const struct bStream * s);
+extern int bssplitscb (struct bStream * s, const_bstring splitStr,
+ int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
+extern int bssplitstrcb (struct bStream * s, const_bstring splitStr,
+ int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
+extern int bseof (const struct bStream * s);
+
+struct tagbstring { /* The string object that bstring and const_bstring point to */
+ int mlen; /* Size associated with data; values <= 0 are reported as write protected by biswriteprotected */
+ int slen; /* Current string length; blengthe treats a negative value as invalid */
+ unsigned char * data; /* Character buffer; bdataofse treats NULL as invalid */
+};
+
+/* Accessor macros
+ *
+ * blengthe (b, e)     - b->slen, or (int) e when b is NULL or b->slen < 0.
+ * blength (b)         - blengthe with e = 0.
+ * bdataofse (b, o, e) - (char *) b->data + o, or (char *) e when b or
+ *                       b->data is NULL. The offset o is not range checked.
+ * bdataofs (b, o)     - bdataofse with e = NULL.
+ * bdatae (b, e)       - bdataofse with o = 0.
+ * bdata (b)           - bdataofs with o = 0.
+ * bchare (b, p, e)    - b->data[p] when 0 <= p < blength (b), otherwise e.
+ *                       The unsigned comparison rejects negative p as well.
+ * bchar (b, p)        - bchare with e = '\0'.
+ *
+ * These are macros: b (and p) may be evaluated more than once.
+ */
+#define blengthe(b, e) (((b) == (void *)0 || (b)->slen < 0) ? (int)(e) : ((b)->slen))
+#define blength(b) (blengthe ((b), 0))
+#define bdataofse(b, o, e) (((b) == (void *)0 || (b)->data == (void*)0) ? (char *)(e) : ((char *)(b)->data) + (o))
+#define bdataofs(b, o) (bdataofse ((b), (o), (void *)0))
+#define bdatae(b, e) (bdataofse (b, 0, e))
+#define bdata(b) (bdataofs (b, 0))
+#define bchare(b, p, e) ((((unsigned)(p)) < (unsigned)blength(b)) ? ((b)->data[(p)]) : (e))
+#define bchar(b, p) bchare ((b), (p), '\0')
+
+/* Static constant string initialization macro
+ *
+ * bsStaticMlen (q, m) expands to a brace initializer for a struct tagbstring,
+ * in field order {mlen, slen, data}: mlen = m, slen = sizeof (q) - 1 and
+ * data pointing at q. Because of the "" q "" concatenation, q has to be a
+ * string literal.
+ *
+ * bsStatic (q) is bsStaticMlen with a negative mlen (-__LINE__, or the fixed
+ * value -32 under MSVC), so the resulting object is reported as write
+ * protected by biswriteprotected.
+ */
+#define bsStaticMlen(q,m) {(m), (int) sizeof(q)-1, (unsigned char *) ("" q "")}
+#if defined(_MSC_VER)
+/* There are many versions of MSVC which emit __LINE__ as a non-constant. */
+# define bsStatic(q) bsStaticMlen(q,-32)
+#endif
+#ifndef bsStatic
+# define bsStatic(q) bsStaticMlen(q,-__LINE__)
+#endif
+
+/* Static constant block parameter pair
+ *
+ * bsStaticBlkParms (q) expands to TWO comma separated arguments, a void *
+ * pointing at the string literal q and its length sizeof (q) - 1, for use
+ * where a function takes a (blk, len) pair such as bcatblk or blk2bstr.
+ */
+#define bsStaticBlkParms(q) ((void *)("" q "")), ((int) sizeof(q)-1)
+
+/* Reference building macros
+ *
+ * btfromcstr (t, s) fills in the struct tagbstring t (an object, accessed
+ * with '.', not a pointer) so that it refers to the C string s without
+ * copying it: data = s, slen = strlen (s) or 0 when s is NULL, and
+ * mlen = -1. cstr2tbstr is an alias for btfromcstr.
+ *
+ * The macro expands to a brace block and evaluates t several times.
+ */
+#define cstr2tbstr btfromcstr
+#define btfromcstr(t,s) { \
+ (t).data = (unsigned char *) (s); \
+ (t).slen = ((t).data) ? ((int) (strlen) ((char *)(t).data)) : 0; \
+ (t).mlen = -1; \
+}
+#define blk2tbstr(t,s,l) { /* Make the struct tagbstring t refer, without copying, to the l bytes at s; l is stored as given, with no validation */ \
+ (t).data = (unsigned char *) (s); \
+ (t).slen = l; /* NOTE(review): l is not parenthesized here, unlike (l) in the sibling macros */ \
+ (t).mlen = -1; /* same marker that bwriteprotect sets */ \
+}
+#define btfromblk(t,s,l) blk2tbstr(t,s,l) /* Alias for blk2tbstr */
+#define bmid2tbstr(t,b,p,l) { /* Make t refer, without copying, to the l characters of b starting at p, clamped to the bounds of b */ \
+ const_bstring bstrtmp_s = (b); \
+ if (bstrtmp_s && bstrtmp_s->data && bstrtmp_s->slen >= 0) { \
+ int bstrtmp_left = (p); \
+ int bstrtmp_len = (l); \
+ if (bstrtmp_left < 0) { /* negative start: drop the part that lies before index 0 */ \
+ bstrtmp_len += bstrtmp_left; \
+ bstrtmp_left = 0; \
+ } \
+ if (bstrtmp_len > bstrtmp_s->slen - bstrtmp_left) /* do not run past the end of b */ \
+ bstrtmp_len = bstrtmp_s->slen - bstrtmp_left; \
+ if (bstrtmp_len <= 0) { /* nothing left: refer to an empty literal instead */ \
+ (t).data = (unsigned char *)""; \
+ (t).slen = 0; \
+ } else { \
+ (t).data = bstrtmp_s->data + bstrtmp_left; \
+ (t).slen = bstrtmp_len; \
+ } \
+ } else { /* b is NULL or malformed: result is the empty string */ \
+ (t).data = (unsigned char *)""; \
+ (t).slen = 0; \
+ } \
+ (t).mlen = -__LINE__; /* negative, so biswriteprotected (t) is true */ \
+}
+#define btfromblkltrimws(t,s,l) { /* Make t refer, without copying, to the block (s, l) with leading isspace () characters skipped */ \
+ int bstrtmp_idx = 0, bstrtmp_len = (l); \
+ unsigned char * bstrtmp_s = (s); /* s is assigned without a cast, so it must convert to unsigned char * */ \
+ if (bstrtmp_s && bstrtmp_len >= 0) { \
+ for (; bstrtmp_idx < bstrtmp_len; bstrtmp_idx++) { /* stop at the first non-space character */ \
+ if (!isspace (bstrtmp_s[bstrtmp_idx])) break; \
+ } \
+ } \
+ (t).data = bstrtmp_s + bstrtmp_idx; /* when s is NULL or l < 0 nothing was skipped: data = s, slen = l */ \
+ (t).slen = bstrtmp_len - bstrtmp_idx; \
+ (t).mlen = -__LINE__; /* negative, so biswriteprotected (t) is true */ \
+}
+#define btfromblkrtrimws(t,s,l) { /* Make t refer, without copying, to the block (s, l) with trailing isspace () characters dropped */ \
+ int bstrtmp_len = (l) - 1; /* index of the last character, not a length */ \
+ unsigned char * bstrtmp_s = (s); /* s is assigned without a cast, so it must convert to unsigned char * */ \
+ if (bstrtmp_s && bstrtmp_len >= 0) { \
+ for (; bstrtmp_len >= 0; bstrtmp_len--) { /* walk back to the last non-space character */ \
+ if (!isspace (bstrtmp_s[bstrtmp_len])) break; \
+ } \
+ } \
+ (t).data = bstrtmp_s; \
+ (t).slen = bstrtmp_len + 1; /* convert the last index back into a length */ \
+ (t).mlen = -__LINE__; /* negative, so biswriteprotected (t) is true */ \
+}
+#define btfromblktrimws(t,s,l) { /* Make t refer, without copying, to the block (s, l) with leading and trailing isspace () characters removed */ \
+ int bstrtmp_idx = 0, bstrtmp_len = (l) - 1; /* first index and last index of the remaining range */ \
+ unsigned char * bstrtmp_s = (s); /* s is assigned without a cast, so it must convert to unsigned char * */ \
+ if (bstrtmp_s && bstrtmp_len >= 0) { \
+ for (; bstrtmp_idx <= bstrtmp_len; bstrtmp_idx++) { /* skip leading whitespace */ \
+ if (!isspace (bstrtmp_s[bstrtmp_idx])) break; \
+ } \
+ for (; bstrtmp_len >= bstrtmp_idx; bstrtmp_len--) { /* skip trailing whitespace, never crossing the start index */ \
+ if (!isspace (bstrtmp_s[bstrtmp_len])) break; \
+ } \
+ } \
+ (t).data = bstrtmp_s + bstrtmp_idx; \
+ (t).slen = bstrtmp_len + 1 - bstrtmp_idx; /* 0 when the block is entirely whitespace */ \
+ (t).mlen = -__LINE__; /* negative, so biswriteprotected (t) is true */ \
+}
+
+/* Write protection macros
+ *
+ * t is a struct tagbstring object (accessed with '.'), not a bstring
+ * pointer, and is evaluated more than once.
+ *
+ * bwriteprotect (t)     - if t.mlen >= 0, set it to -1.
+ * bwriteallow (t)       - if t.mlen is exactly -1, set it to t.slen (or to 1
+ *                         when t.slen is 0); any other mlen is left alone.
+ *                         The mlen that bwriteprotect overwrote is not
+ *                         restored.
+ * biswriteprotected (t) - non-zero when t.mlen <= 0.
+ */
+#define bwriteprotect(t) { if ((t).mlen >= 0) (t).mlen = -1; }
+#define bwriteallow(t) { if ((t).mlen == -1) (t).mlen = (t).slen + ((t).slen == 0); }
+#define biswriteprotected(t) ((t).mlen <= 0)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/case_fold_switch.c b/src/case_fold_switch.c new file mode 100644 index 0000000..70fdd75 --- /dev/null +++ b/src/case_fold_switch.c @@ -0,0 +1,2637 @@ + switch (c) { + case 0x0041: + bufpush(0x0061); + break; + case 0x0042: + bufpush(0x0062); + break; + case 0x0043: + bufpush(0x0063); + break; + case 0x0044: + bufpush(0x0064); + break; + case 0x0045: + bufpush(0x0065); + break; + case 0x0046: + bufpush(0x0066); + break; + case 0x0047: + bufpush(0x0067); + break; + case 0x0048: + bufpush(0x0068); + break; + case 0x0049: + bufpush(0x0069); + break; + case 0x004A: + bufpush(0x006A); + break; + case 0x004B: + bufpush(0x006B); + break; + case 0x004C: + bufpush(0x006C); + break; + case 0x004D: + bufpush(0x006D); + break; + case 0x004E: + bufpush(0x006E); + break; + case 0x004F: + bufpush(0x006F); + break; + case 0x0050: + bufpush(0x0070); + break; + case 0x0051: + bufpush(0x0071); + break; + case 0x0052: + bufpush(0x0072); + break; + case 0x0053: + bufpush(0x0073); + break; + case 0x0054: + bufpush(0x0074); + break; + case 0x0055: + bufpush(0x0075); + break; + case 0x0056: + bufpush(0x0076); + break; + case 0x0057: + bufpush(0x0077); + break; + case 0x0058: + bufpush(0x0078); + break; + case 0x0059: + bufpush(0x0079); + break; + case 0x005A: + bufpush(0x007A); + break; + case 0x00B5: + bufpush(0x03BC); + break; + case 0x00C0: + bufpush(0x00E0); + break; + case 0x00C1: + bufpush(0x00E1); + break; + case 0x00C2: + bufpush(0x00E2); + break; + case 0x00C3: + bufpush(0x00E3); + break; + case 0x00C4: + bufpush(0x00E4); + break; + case 0x00C5: + bufpush(0x00E5); + break; + case 0x00C6: + bufpush(0x00E6); + break; + case 0x00C7: + bufpush(0x00E7); + break; + case 0x00C8: + bufpush(0x00E8); + break; + case 0x00C9: + bufpush(0x00E9); + break; + case 0x00CA: + bufpush(0x00EA); + break; + case 0x00CB: + bufpush(0x00EB); + break; + case 0x00CC: + bufpush(0x00EC); + break; + case 0x00CD: + bufpush(0x00ED); + break; + case 0x00CE: + bufpush(0x00EE); + break; + case 
0x00CF: + bufpush(0x00EF); + break; + case 0x00D0: + bufpush(0x00F0); + break; + case 0x00D1: + bufpush(0x00F1); + break; + case 0x00D2: + bufpush(0x00F2); + break; + case 0x00D3: + bufpush(0x00F3); + break; + case 0x00D4: + bufpush(0x00F4); + break; + case 0x00D5: + bufpush(0x00F5); + break; + case 0x00D6: + bufpush(0x00F6); + break; + case 0x00D8: + bufpush(0x00F8); + break; + case 0x00D9: + bufpush(0x00F9); + break; + case 0x00DA: + bufpush(0x00FA); + break; + case 0x00DB: + bufpush(0x00FB); + break; + case 0x00DC: + bufpush(0x00FC); + break; + case 0x00DD: + bufpush(0x00FD); + break; + case 0x00DE: + bufpush(0x00FE); + break; + case 0x00DF: + bufpush(0x0073); + bufpush(0x0073); + break; + case 0x0100: + bufpush(0x0101); + break; + case 0x0102: + bufpush(0x0103); + break; + case 0x0104: + bufpush(0x0105); + break; + case 0x0106: + bufpush(0x0107); + break; + case 0x0108: + bufpush(0x0109); + break; + case 0x010A: + bufpush(0x010B); + break; + case 0x010C: + bufpush(0x010D); + break; + case 0x010E: + bufpush(0x010F); + break; + case 0x0110: + bufpush(0x0111); + break; + case 0x0112: + bufpush(0x0113); + break; + case 0x0114: + bufpush(0x0115); + break; + case 0x0116: + bufpush(0x0117); + break; + case 0x0118: + bufpush(0x0119); + break; + case 0x011A: + bufpush(0x011B); + break; + case 0x011C: + bufpush(0x011D); + break; + case 0x011E: + bufpush(0x011F); + break; + case 0x0120: + bufpush(0x0121); + break; + case 0x0122: + bufpush(0x0123); + break; + case 0x0124: + bufpush(0x0125); + break; + case 0x0126: + bufpush(0x0127); + break; + case 0x0128: + bufpush(0x0129); + break; + case 0x012A: + bufpush(0x012B); + break; + case 0x012C: + bufpush(0x012D); + break; + case 0x012E: + bufpush(0x012F); + break; + case 0x0130: + bufpush(0x0069); + bufpush(0x0307); + break; + case 0x0132: + bufpush(0x0133); + break; + case 0x0134: + bufpush(0x0135); + break; + case 0x0136: + bufpush(0x0137); + break; + case 0x0139: + bufpush(0x013A); + break; + case 0x013B: + bufpush(0x013C); 
+ break; + case 0x013D: + bufpush(0x013E); + break; + case 0x013F: + bufpush(0x0140); + break; + case 0x0141: + bufpush(0x0142); + break; + case 0x0143: + bufpush(0x0144); + break; + case 0x0145: + bufpush(0x0146); + break; + case 0x0147: + bufpush(0x0148); + break; + case 0x0149: + bufpush(0x02BC); + bufpush(0x006E); + break; + case 0x014A: + bufpush(0x014B); + break; + case 0x014C: + bufpush(0x014D); + break; + case 0x014E: + bufpush(0x014F); + break; + case 0x0150: + bufpush(0x0151); + break; + case 0x0152: + bufpush(0x0153); + break; + case 0x0154: + bufpush(0x0155); + break; + case 0x0156: + bufpush(0x0157); + break; + case 0x0158: + bufpush(0x0159); + break; + case 0x015A: + bufpush(0x015B); + break; + case 0x015C: + bufpush(0x015D); + break; + case 0x015E: + bufpush(0x015F); + break; + case 0x0160: + bufpush(0x0161); + break; + case 0x0162: + bufpush(0x0163); + break; + case 0x0164: + bufpush(0x0165); + break; + case 0x0166: + bufpush(0x0167); + break; + case 0x0168: + bufpush(0x0169); + break; + case 0x016A: + bufpush(0x016B); + break; + case 0x016C: + bufpush(0x016D); + break; + case 0x016E: + bufpush(0x016F); + break; + case 0x0170: + bufpush(0x0171); + break; + case 0x0172: + bufpush(0x0173); + break; + case 0x0174: + bufpush(0x0175); + break; + case 0x0176: + bufpush(0x0177); + break; + case 0x0178: + bufpush(0x00FF); + break; + case 0x0179: + bufpush(0x017A); + break; + case 0x017B: + bufpush(0x017C); + break; + case 0x017D: + bufpush(0x017E); + break; + case 0x017F: + bufpush(0x0073); + break; + case 0x0181: + bufpush(0x0253); + break; + case 0x0182: + bufpush(0x0183); + break; + case 0x0184: + bufpush(0x0185); + break; + case 0x0186: + bufpush(0x0254); + break; + case 0x0187: + bufpush(0x0188); + break; + case 0x0189: + bufpush(0x0256); + break; + case 0x018A: + bufpush(0x0257); + break; + case 0x018B: + bufpush(0x018C); + break; + case 0x018E: + bufpush(0x01DD); + break; + case 0x018F: + bufpush(0x0259); + break; + case 0x0190: + bufpush(0x025B); + 
break; + case 0x0191: + bufpush(0x0192); + break; + case 0x0193: + bufpush(0x0260); + break; + case 0x0194: + bufpush(0x0263); + break; + case 0x0196: + bufpush(0x0269); + break; + case 0x0197: + bufpush(0x0268); + break; + case 0x0198: + bufpush(0x0199); + break; + case 0x019C: + bufpush(0x026F); + break; + case 0x019D: + bufpush(0x0272); + break; + case 0x019F: + bufpush(0x0275); + break; + case 0x01A0: + bufpush(0x01A1); + break; + case 0x01A2: + bufpush(0x01A3); + break; + case 0x01A4: + bufpush(0x01A5); + break; + case 0x01A6: + bufpush(0x0280); + break; + case 0x01A7: + bufpush(0x01A8); + break; + case 0x01A9: + bufpush(0x0283); + break; + case 0x01AC: + bufpush(0x01AD); + break; + case 0x01AE: + bufpush(0x0288); + break; + case 0x01AF: + bufpush(0x01B0); + break; + case 0x01B1: + bufpush(0x028A); + break; + case 0x01B2: + bufpush(0x028B); + break; + case 0x01B3: + bufpush(0x01B4); + break; + case 0x01B5: + bufpush(0x01B6); + break; + case 0x01B7: + bufpush(0x0292); + break; + case 0x01B8: + bufpush(0x01B9); + break; + case 0x01BC: + bufpush(0x01BD); + break; + case 0x01C4: + bufpush(0x01C6); + break; + case 0x01C5: + bufpush(0x01C6); + break; + case 0x01C7: + bufpush(0x01C9); + break; + case 0x01C8: + bufpush(0x01C9); + break; + case 0x01CA: + bufpush(0x01CC); + break; + case 0x01CB: + bufpush(0x01CC); + break; + case 0x01CD: + bufpush(0x01CE); + break; + case 0x01CF: + bufpush(0x01D0); + break; + case 0x01D1: + bufpush(0x01D2); + break; + case 0x01D3: + bufpush(0x01D4); + break; + case 0x01D5: + bufpush(0x01D6); + break; + case 0x01D7: + bufpush(0x01D8); + break; + case 0x01D9: + bufpush(0x01DA); + break; + case 0x01DB: + bufpush(0x01DC); + break; + case 0x01DE: + bufpush(0x01DF); + break; + case 0x01E0: + bufpush(0x01E1); + break; + case 0x01E2: + bufpush(0x01E3); + break; + case 0x01E4: + bufpush(0x01E5); + break; + case 0x01E6: + bufpush(0x01E7); + break; + case 0x01E8: + bufpush(0x01E9); + break; + case 0x01EA: + bufpush(0x01EB); + break; + case 0x01EC: 
+ bufpush(0x01ED); + break; + case 0x01EE: + bufpush(0x01EF); + break; + case 0x01F0: + bufpush(0x006A); + bufpush(0x030C); + break; + case 0x01F1: + bufpush(0x01F3); + break; + case 0x01F2: + bufpush(0x01F3); + break; + case 0x01F4: + bufpush(0x01F5); + break; + case 0x01F6: + bufpush(0x0195); + break; + case 0x01F7: + bufpush(0x01BF); + break; + case 0x01F8: + bufpush(0x01F9); + break; + case 0x01FA: + bufpush(0x01FB); + break; + case 0x01FC: + bufpush(0x01FD); + break; + case 0x01FE: + bufpush(0x01FF); + break; + case 0x0200: + bufpush(0x0201); + break; + case 0x0202: + bufpush(0x0203); + break; + case 0x0204: + bufpush(0x0205); + break; + case 0x0206: + bufpush(0x0207); + break; + case 0x0208: + bufpush(0x0209); + break; + case 0x020A: + bufpush(0x020B); + break; + case 0x020C: + bufpush(0x020D); + break; + case 0x020E: + bufpush(0x020F); + break; + case 0x0210: + bufpush(0x0211); + break; + case 0x0212: + bufpush(0x0213); + break; + case 0x0214: + bufpush(0x0215); + break; + case 0x0216: + bufpush(0x0217); + break; + case 0x0218: + bufpush(0x0219); + break; + case 0x021A: + bufpush(0x021B); + break; + case 0x021C: + bufpush(0x021D); + break; + case 0x021E: + bufpush(0x021F); + break; + case 0x0220: + bufpush(0x019E); + break; + case 0x0222: + bufpush(0x0223); + break; + case 0x0224: + bufpush(0x0225); + break; + case 0x0226: + bufpush(0x0227); + break; + case 0x0228: + bufpush(0x0229); + break; + case 0x022A: + bufpush(0x022B); + break; + case 0x022C: + bufpush(0x022D); + break; + case 0x022E: + bufpush(0x022F); + break; + case 0x0230: + bufpush(0x0231); + break; + case 0x0232: + bufpush(0x0233); + break; + case 0x0345: + bufpush(0x03B9); + break; + case 0x0386: + bufpush(0x03AC); + break; + case 0x0388: + bufpush(0x03AD); + break; + case 0x0389: + bufpush(0x03AE); + break; + case 0x038A: + bufpush(0x03AF); + break; + case 0x038C: + bufpush(0x03CC); + break; + case 0x038E: + bufpush(0x03CD); + break; + case 0x038F: + bufpush(0x03CE); + break; + case 0x0390: + 
bufpush(0x03B9); + bufpush(0x0308); + bufpush(0x0301); + break; + case 0x0391: + bufpush(0x03B1); + break; + case 0x0392: + bufpush(0x03B2); + break; + case 0x0393: + bufpush(0x03B3); + break; + case 0x0394: + bufpush(0x03B4); + break; + case 0x0395: + bufpush(0x03B5); + break; + case 0x0396: + bufpush(0x03B6); + break; + case 0x0397: + bufpush(0x03B7); + break; + case 0x0398: + bufpush(0x03B8); + break; + case 0x0399: + bufpush(0x03B9); + break; + case 0x039A: + bufpush(0x03BA); + break; + case 0x039B: + bufpush(0x03BB); + break; + case 0x039C: + bufpush(0x03BC); + break; + case 0x039D: + bufpush(0x03BD); + break; + case 0x039E: + bufpush(0x03BE); + break; + case 0x039F: + bufpush(0x03BF); + break; + case 0x03A0: + bufpush(0x03C0); + break; + case 0x03A1: + bufpush(0x03C1); + break; + case 0x03A3: + bufpush(0x03C3); + break; + case 0x03A4: + bufpush(0x03C4); + break; + case 0x03A5: + bufpush(0x03C5); + break; + case 0x03A6: + bufpush(0x03C6); + break; + case 0x03A7: + bufpush(0x03C7); + break; + case 0x03A8: + bufpush(0x03C8); + break; + case 0x03A9: + bufpush(0x03C9); + break; + case 0x03AA: + bufpush(0x03CA); + break; + case 0x03AB: + bufpush(0x03CB); + break; + case 0x03B0: + bufpush(0x03C5); + bufpush(0x0308); + bufpush(0x0301); + break; + case 0x03C2: + bufpush(0x03C3); + break; + case 0x03D0: + bufpush(0x03B2); + break; + case 0x03D1: + bufpush(0x03B8); + break; + case 0x03D5: + bufpush(0x03C6); + break; + case 0x03D6: + bufpush(0x03C0); + break; + case 0x03D8: + bufpush(0x03D9); + break; + case 0x03DA: + bufpush(0x03DB); + break; + case 0x03DC: + bufpush(0x03DD); + break; + case 0x03DE: + bufpush(0x03DF); + break; + case 0x03E0: + bufpush(0x03E1); + break; + case 0x03E2: + bufpush(0x03E3); + break; + case 0x03E4: + bufpush(0x03E5); + break; + case 0x03E6: + bufpush(0x03E7); + break; + case 0x03E8: + bufpush(0x03E9); + break; + case 0x03EA: + bufpush(0x03EB); + break; + case 0x03EC: + bufpush(0x03ED); + break; + case 0x03EE: + bufpush(0x03EF); + break; + 
case 0x03F0: + bufpush(0x03BA); + break; + case 0x03F1: + bufpush(0x03C1); + break; + case 0x03F2: + bufpush(0x03C3); + break; + case 0x03F4: + bufpush(0x03B8); + break; + case 0x03F5: + bufpush(0x03B5); + break; + case 0x0400: + bufpush(0x0450); + break; + case 0x0401: + bufpush(0x0451); + break; + case 0x0402: + bufpush(0x0452); + break; + case 0x0403: + bufpush(0x0453); + break; + case 0x0404: + bufpush(0x0454); + break; + case 0x0405: + bufpush(0x0455); + break; + case 0x0406: + bufpush(0x0456); + break; + case 0x0407: + bufpush(0x0457); + break; + case 0x0408: + bufpush(0x0458); + break; + case 0x0409: + bufpush(0x0459); + break; + case 0x040A: + bufpush(0x045A); + break; + case 0x040B: + bufpush(0x045B); + break; + case 0x040C: + bufpush(0x045C); + break; + case 0x040D: + bufpush(0x045D); + break; + case 0x040E: + bufpush(0x045E); + break; + case 0x040F: + bufpush(0x045F); + break; + case 0x0410: + bufpush(0x0430); + break; + case 0x0411: + bufpush(0x0431); + break; + case 0x0412: + bufpush(0x0432); + break; + case 0x0413: + bufpush(0x0433); + break; + case 0x0414: + bufpush(0x0434); + break; + case 0x0415: + bufpush(0x0435); + break; + case 0x0416: + bufpush(0x0436); + break; + case 0x0417: + bufpush(0x0437); + break; + case 0x0418: + bufpush(0x0438); + break; + case 0x0419: + bufpush(0x0439); + break; + case 0x041A: + bufpush(0x043A); + break; + case 0x041B: + bufpush(0x043B); + break; + case 0x041C: + bufpush(0x043C); + break; + case 0x041D: + bufpush(0x043D); + break; + case 0x041E: + bufpush(0x043E); + break; + case 0x041F: + bufpush(0x043F); + break; + case 0x0420: + bufpush(0x0440); + break; + case 0x0421: + bufpush(0x0441); + break; + case 0x0422: + bufpush(0x0442); + break; + case 0x0423: + bufpush(0x0443); + break; + case 0x0424: + bufpush(0x0444); + break; + case 0x0425: + bufpush(0x0445); + break; + case 0x0426: + bufpush(0x0446); + break; + case 0x0427: + bufpush(0x0447); + break; + case 0x0428: + bufpush(0x0448); + break; + case 0x0429: + 
bufpush(0x0449); + break; + case 0x042A: + bufpush(0x044A); + break; + case 0x042B: + bufpush(0x044B); + break; + case 0x042C: + bufpush(0x044C); + break; + case 0x042D: + bufpush(0x044D); + break; + case 0x042E: + bufpush(0x044E); + break; + case 0x042F: + bufpush(0x044F); + break; + case 0x0460: + bufpush(0x0461); + break; + case 0x0462: + bufpush(0x0463); + break; + case 0x0464: + bufpush(0x0465); + break; + case 0x0466: + bufpush(0x0467); + break; + case 0x0468: + bufpush(0x0469); + break; + case 0x046A: + bufpush(0x046B); + break; + case 0x046C: + bufpush(0x046D); + break; + case 0x046E: + bufpush(0x046F); + break; + case 0x0470: + bufpush(0x0471); + break; + case 0x0472: + bufpush(0x0473); + break; + case 0x0474: + bufpush(0x0475); + break; + case 0x0476: + bufpush(0x0477); + break; + case 0x0478: + bufpush(0x0479); + break; + case 0x047A: + bufpush(0x047B); + break; + case 0x047C: + bufpush(0x047D); + break; + case 0x047E: + bufpush(0x047F); + break; + case 0x0480: + bufpush(0x0481); + break; + case 0x048A: + bufpush(0x048B); + break; + case 0x048C: + bufpush(0x048D); + break; + case 0x048E: + bufpush(0x048F); + break; + case 0x0490: + bufpush(0x0491); + break; + case 0x0492: + bufpush(0x0493); + break; + case 0x0494: + bufpush(0x0495); + break; + case 0x0496: + bufpush(0x0497); + break; + case 0x0498: + bufpush(0x0499); + break; + case 0x049A: + bufpush(0x049B); + break; + case 0x049C: + bufpush(0x049D); + break; + case 0x049E: + bufpush(0x049F); + break; + case 0x04A0: + bufpush(0x04A1); + break; + case 0x04A2: + bufpush(0x04A3); + break; + case 0x04A4: + bufpush(0x04A5); + break; + case 0x04A6: + bufpush(0x04A7); + break; + case 0x04A8: + bufpush(0x04A9); + break; + case 0x04AA: + bufpush(0x04AB); + break; + case 0x04AC: + bufpush(0x04AD); + break; + case 0x04AE: + bufpush(0x04AF); + break; + case 0x04B0: + bufpush(0x04B1); + break; + case 0x04B2: + bufpush(0x04B3); + break; + case 0x04B4: + bufpush(0x04B5); + break; + case 0x04B6: + bufpush(0x04B7); + 
break; + case 0x04B8: + bufpush(0x04B9); + break; + case 0x04BA: + bufpush(0x04BB); + break; + case 0x04BC: + bufpush(0x04BD); + break; + case 0x04BE: + bufpush(0x04BF); + break; + case 0x04C1: + bufpush(0x04C2); + break; + case 0x04C3: + bufpush(0x04C4); + break; + case 0x04C5: + bufpush(0x04C6); + break; + case 0x04C7: + bufpush(0x04C8); + break; + case 0x04C9: + bufpush(0x04CA); + break; + case 0x04CB: + bufpush(0x04CC); + break; + case 0x04CD: + bufpush(0x04CE); + break; + case 0x04D0: + bufpush(0x04D1); + break; + case 0x04D2: + bufpush(0x04D3); + break; + case 0x04D4: + bufpush(0x04D5); + break; + case 0x04D6: + bufpush(0x04D7); + break; + case 0x04D8: + bufpush(0x04D9); + break; + case 0x04DA: + bufpush(0x04DB); + break; + case 0x04DC: + bufpush(0x04DD); + break; + case 0x04DE: + bufpush(0x04DF); + break; + case 0x04E0: + bufpush(0x04E1); + break; + case 0x04E2: + bufpush(0x04E3); + break; + case 0x04E4: + bufpush(0x04E5); + break; + case 0x04E6: + bufpush(0x04E7); + break; + case 0x04E8: + bufpush(0x04E9); + break; + case 0x04EA: + bufpush(0x04EB); + break; + case 0x04EC: + bufpush(0x04ED); + break; + case 0x04EE: + bufpush(0x04EF); + break; + case 0x04F0: + bufpush(0x04F1); + break; + case 0x04F2: + bufpush(0x04F3); + break; + case 0x04F4: + bufpush(0x04F5); + break; + case 0x04F8: + bufpush(0x04F9); + break; + case 0x0500: + bufpush(0x0501); + break; + case 0x0502: + bufpush(0x0503); + break; + case 0x0504: + bufpush(0x0505); + break; + case 0x0506: + bufpush(0x0507); + break; + case 0x0508: + bufpush(0x0509); + break; + case 0x050A: + bufpush(0x050B); + break; + case 0x050C: + bufpush(0x050D); + break; + case 0x050E: + bufpush(0x050F); + break; + case 0x0531: + bufpush(0x0561); + break; + case 0x0532: + bufpush(0x0562); + break; + case 0x0533: + bufpush(0x0563); + break; + case 0x0534: + bufpush(0x0564); + break; + case 0x0535: + bufpush(0x0565); + break; + case 0x0536: + bufpush(0x0566); + break; + case 0x0537: + bufpush(0x0567); + break; + case 0x0538: 
+ bufpush(0x0568); + break; + case 0x0539: + bufpush(0x0569); + break; + case 0x053A: + bufpush(0x056A); + break; + case 0x053B: + bufpush(0x056B); + break; + case 0x053C: + bufpush(0x056C); + break; + case 0x053D: + bufpush(0x056D); + break; + case 0x053E: + bufpush(0x056E); + break; + case 0x053F: + bufpush(0x056F); + break; + case 0x0540: + bufpush(0x0570); + break; + case 0x0541: + bufpush(0x0571); + break; + case 0x0542: + bufpush(0x0572); + break; + case 0x0543: + bufpush(0x0573); + break; + case 0x0544: + bufpush(0x0574); + break; + case 0x0545: + bufpush(0x0575); + break; + case 0x0546: + bufpush(0x0576); + break; + case 0x0547: + bufpush(0x0577); + break; + case 0x0548: + bufpush(0x0578); + break; + case 0x0549: + bufpush(0x0579); + break; + case 0x054A: + bufpush(0x057A); + break; + case 0x054B: + bufpush(0x057B); + break; + case 0x054C: + bufpush(0x057C); + break; + case 0x054D: + bufpush(0x057D); + break; + case 0x054E: + bufpush(0x057E); + break; + case 0x054F: + bufpush(0x057F); + break; + case 0x0550: + bufpush(0x0580); + break; + case 0x0551: + bufpush(0x0581); + break; + case 0x0552: + bufpush(0x0582); + break; + case 0x0553: + bufpush(0x0583); + break; + case 0x0554: + bufpush(0x0584); + break; + case 0x0555: + bufpush(0x0585); + break; + case 0x0556: + bufpush(0x0586); + break; + case 0x0587: + bufpush(0x0565); + bufpush(0x0582); + break; + case 0x1E00: + bufpush(0x1E01); + break; + case 0x1E02: + bufpush(0x1E03); + break; + case 0x1E04: + bufpush(0x1E05); + break; + case 0x1E06: + bufpush(0x1E07); + break; + case 0x1E08: + bufpush(0x1E09); + break; + case 0x1E0A: + bufpush(0x1E0B); + break; + case 0x1E0C: + bufpush(0x1E0D); + break; + case 0x1E0E: + bufpush(0x1E0F); + break; + case 0x1E10: + bufpush(0x1E11); + break; + case 0x1E12: + bufpush(0x1E13); + break; + case 0x1E14: + bufpush(0x1E15); + break; + case 0x1E16: + bufpush(0x1E17); + break; + case 0x1E18: + bufpush(0x1E19); + break; + case 0x1E1A: + bufpush(0x1E1B); + break; + case 0x1E1C: + 
bufpush(0x1E1D); + break; + case 0x1E1E: + bufpush(0x1E1F); + break; + case 0x1E20: + bufpush(0x1E21); + break; + case 0x1E22: + bufpush(0x1E23); + break; + case 0x1E24: + bufpush(0x1E25); + break; + case 0x1E26: + bufpush(0x1E27); + break; + case 0x1E28: + bufpush(0x1E29); + break; + case 0x1E2A: + bufpush(0x1E2B); + break; + case 0x1E2C: + bufpush(0x1E2D); + break; + case 0x1E2E: + bufpush(0x1E2F); + break; + case 0x1E30: + bufpush(0x1E31); + break; + case 0x1E32: + bufpush(0x1E33); + break; + case 0x1E34: + bufpush(0x1E35); + break; + case 0x1E36: + bufpush(0x1E37); + break; + case 0x1E38: + bufpush(0x1E39); + break; + case 0x1E3A: + bufpush(0x1E3B); + break; + case 0x1E3C: + bufpush(0x1E3D); + break; + case 0x1E3E: + bufpush(0x1E3F); + break; + case 0x1E40: + bufpush(0x1E41); + break; + case 0x1E42: + bufpush(0x1E43); + break; + case 0x1E44: + bufpush(0x1E45); + break; + case 0x1E46: + bufpush(0x1E47); + break; + case 0x1E48: + bufpush(0x1E49); + break; + case 0x1E4A: + bufpush(0x1E4B); + break; + case 0x1E4C: + bufpush(0x1E4D); + break; + case 0x1E4E: + bufpush(0x1E4F); + break; + case 0x1E50: + bufpush(0x1E51); + break; + case 0x1E52: + bufpush(0x1E53); + break; + case 0x1E54: + bufpush(0x1E55); + break; + case 0x1E56: + bufpush(0x1E57); + break; + case 0x1E58: + bufpush(0x1E59); + break; + case 0x1E5A: + bufpush(0x1E5B); + break; + case 0x1E5C: + bufpush(0x1E5D); + break; + case 0x1E5E: + bufpush(0x1E5F); + break; + case 0x1E60: + bufpush(0x1E61); + break; + case 0x1E62: + bufpush(0x1E63); + break; + case 0x1E64: + bufpush(0x1E65); + break; + case 0x1E66: + bufpush(0x1E67); + break; + case 0x1E68: + bufpush(0x1E69); + break; + case 0x1E6A: + bufpush(0x1E6B); + break; + case 0x1E6C: + bufpush(0x1E6D); + break; + case 0x1E6E: + bufpush(0x1E6F); + break; + case 0x1E70: + bufpush(0x1E71); + break; + case 0x1E72: + bufpush(0x1E73); + break; + case 0x1E74: + bufpush(0x1E75); + break; + case 0x1E76: + bufpush(0x1E77); + break; + case 0x1E78: + bufpush(0x1E79); + 
break; + case 0x1E7A: + bufpush(0x1E7B); + break; + case 0x1E7C: + bufpush(0x1E7D); + break; + case 0x1E7E: + bufpush(0x1E7F); + break; + case 0x1E80: + bufpush(0x1E81); + break; + case 0x1E82: + bufpush(0x1E83); + break; + case 0x1E84: + bufpush(0x1E85); + break; + case 0x1E86: + bufpush(0x1E87); + break; + case 0x1E88: + bufpush(0x1E89); + break; + case 0x1E8A: + bufpush(0x1E8B); + break; + case 0x1E8C: + bufpush(0x1E8D); + break; + case 0x1E8E: + bufpush(0x1E8F); + break; + case 0x1E90: + bufpush(0x1E91); + break; + case 0x1E92: + bufpush(0x1E93); + break; + case 0x1E94: + bufpush(0x1E95); + break; + case 0x1E96: + bufpush(0x0068); + bufpush(0x0331); + break; + case 0x1E97: + bufpush(0x0074); + bufpush(0x0308); + break; + case 0x1E98: + bufpush(0x0077); + bufpush(0x030A); + break; + case 0x1E99: + bufpush(0x0079); + bufpush(0x030A); + break; + case 0x1E9A: + bufpush(0x0061); + bufpush(0x02BE); + break; + case 0x1E9B: + bufpush(0x1E61); + break; + case 0x1EA0: + bufpush(0x1EA1); + break; + case 0x1EA2: + bufpush(0x1EA3); + break; + case 0x1EA4: + bufpush(0x1EA5); + break; + case 0x1EA6: + bufpush(0x1EA7); + break; + case 0x1EA8: + bufpush(0x1EA9); + break; + case 0x1EAA: + bufpush(0x1EAB); + break; + case 0x1EAC: + bufpush(0x1EAD); + break; + case 0x1EAE: + bufpush(0x1EAF); + break; + case 0x1EB0: + bufpush(0x1EB1); + break; + case 0x1EB2: + bufpush(0x1EB3); + break; + case 0x1EB4: + bufpush(0x1EB5); + break; + case 0x1EB6: + bufpush(0x1EB7); + break; + case 0x1EB8: + bufpush(0x1EB9); + break; + case 0x1EBA: + bufpush(0x1EBB); + break; + case 0x1EBC: + bufpush(0x1EBD); + break; + case 0x1EBE: + bufpush(0x1EBF); + break; + case 0x1EC0: + bufpush(0x1EC1); + break; + case 0x1EC2: + bufpush(0x1EC3); + break; + case 0x1EC4: + bufpush(0x1EC5); + break; + case 0x1EC6: + bufpush(0x1EC7); + break; + case 0x1EC8: + bufpush(0x1EC9); + break; + case 0x1ECA: + bufpush(0x1ECB); + break; + case 0x1ECC: + bufpush(0x1ECD); + break; + case 0x1ECE: + bufpush(0x1ECF); + break; + 
case 0x1ED0: + bufpush(0x1ED1); + break; + case 0x1ED2: + bufpush(0x1ED3); + break; + case 0x1ED4: + bufpush(0x1ED5); + break; + case 0x1ED6: + bufpush(0x1ED7); + break; + case 0x1ED8: + bufpush(0x1ED9); + break; + case 0x1EDA: + bufpush(0x1EDB); + break; + case 0x1EDC: + bufpush(0x1EDD); + break; + case 0x1EDE: + bufpush(0x1EDF); + break; + case 0x1EE0: + bufpush(0x1EE1); + break; + case 0x1EE2: + bufpush(0x1EE3); + break; + case 0x1EE4: + bufpush(0x1EE5); + break; + case 0x1EE6: + bufpush(0x1EE7); + break; + case 0x1EE8: + bufpush(0x1EE9); + break; + case 0x1EEA: + bufpush(0x1EEB); + break; + case 0x1EEC: + bufpush(0x1EED); + break; + case 0x1EEE: + bufpush(0x1EEF); + break; + case 0x1EF0: + bufpush(0x1EF1); + break; + case 0x1EF2: + bufpush(0x1EF3); + break; + case 0x1EF4: + bufpush(0x1EF5); + break; + case 0x1EF6: + bufpush(0x1EF7); + break; + case 0x1EF8: + bufpush(0x1EF9); + break; + case 0x1F08: + bufpush(0x1F00); + break; + case 0x1F09: + bufpush(0x1F01); + break; + case 0x1F0A: + bufpush(0x1F02); + break; + case 0x1F0B: + bufpush(0x1F03); + break; + case 0x1F0C: + bufpush(0x1F04); + break; + case 0x1F0D: + bufpush(0x1F05); + break; + case 0x1F0E: + bufpush(0x1F06); + break; + case 0x1F0F: + bufpush(0x1F07); + break; + case 0x1F18: + bufpush(0x1F10); + break; + case 0x1F19: + bufpush(0x1F11); + break; + case 0x1F1A: + bufpush(0x1F12); + break; + case 0x1F1B: + bufpush(0x1F13); + break; + case 0x1F1C: + bufpush(0x1F14); + break; + case 0x1F1D: + bufpush(0x1F15); + break; + case 0x1F28: + bufpush(0x1F20); + break; + case 0x1F29: + bufpush(0x1F21); + break; + case 0x1F2A: + bufpush(0x1F22); + break; + case 0x1F2B: + bufpush(0x1F23); + break; + case 0x1F2C: + bufpush(0x1F24); + break; + case 0x1F2D: + bufpush(0x1F25); + break; + case 0x1F2E: + bufpush(0x1F26); + break; + case 0x1F2F: + bufpush(0x1F27); + break; + case 0x1F38: + bufpush(0x1F30); + break; + case 0x1F39: + bufpush(0x1F31); + break; + case 0x1F3A: + bufpush(0x1F32); + break; + case 0x1F3B: + 
bufpush(0x1F33); + break; + case 0x1F3C: + bufpush(0x1F34); + break; + case 0x1F3D: + bufpush(0x1F35); + break; + case 0x1F3E: + bufpush(0x1F36); + break; + case 0x1F3F: + bufpush(0x1F37); + break; + case 0x1F48: + bufpush(0x1F40); + break; + case 0x1F49: + bufpush(0x1F41); + break; + case 0x1F4A: + bufpush(0x1F42); + break; + case 0x1F4B: + bufpush(0x1F43); + break; + case 0x1F4C: + bufpush(0x1F44); + break; + case 0x1F4D: + bufpush(0x1F45); + break; + case 0x1F50: + bufpush(0x03C5); + bufpush(0x0313); + break; + case 0x1F52: + bufpush(0x03C5); + bufpush(0x0313); + bufpush(0x0300); + break; + case 0x1F54: + bufpush(0x03C5); + bufpush(0x0313); + bufpush(0x0301); + break; + case 0x1F56: + bufpush(0x03C5); + bufpush(0x0313); + bufpush(0x0342); + break; + case 0x1F59: + bufpush(0x1F51); + break; + case 0x1F5B: + bufpush(0x1F53); + break; + case 0x1F5D: + bufpush(0x1F55); + break; + case 0x1F5F: + bufpush(0x1F57); + break; + case 0x1F68: + bufpush(0x1F60); + break; + case 0x1F69: + bufpush(0x1F61); + break; + case 0x1F6A: + bufpush(0x1F62); + break; + case 0x1F6B: + bufpush(0x1F63); + break; + case 0x1F6C: + bufpush(0x1F64); + break; + case 0x1F6D: + bufpush(0x1F65); + break; + case 0x1F6E: + bufpush(0x1F66); + break; + case 0x1F6F: + bufpush(0x1F67); + break; + case 0x1F80: + bufpush(0x1F00); + bufpush(0x03B9); + break; + case 0x1F81: + bufpush(0x1F01); + bufpush(0x03B9); + break; + case 0x1F82: + bufpush(0x1F02); + bufpush(0x03B9); + break; + case 0x1F83: + bufpush(0x1F03); + bufpush(0x03B9); + break; + case 0x1F84: + bufpush(0x1F04); + bufpush(0x03B9); + break; + case 0x1F85: + bufpush(0x1F05); + bufpush(0x03B9); + break; + case 0x1F86: + bufpush(0x1F06); + bufpush(0x03B9); + break; + case 0x1F87: + bufpush(0x1F07); + bufpush(0x03B9); + break; + case 0x1F88: + bufpush(0x1F00); + bufpush(0x03B9); + break; + case 0x1F89: + bufpush(0x1F01); + bufpush(0x03B9); + break; + case 0x1F8A: + bufpush(0x1F02); + bufpush(0x03B9); + break; + case 0x1F8B: + bufpush(0x1F03); + 
bufpush(0x03B9); + break; + case 0x1F8C: + bufpush(0x1F04); + bufpush(0x03B9); + break; + case 0x1F8D: + bufpush(0x1F05); + bufpush(0x03B9); + break; + case 0x1F8E: + bufpush(0x1F06); + bufpush(0x03B9); + break; + case 0x1F8F: + bufpush(0x1F07); + bufpush(0x03B9); + break; + case 0x1F90: + bufpush(0x1F20); + bufpush(0x03B9); + break; + case 0x1F91: + bufpush(0x1F21); + bufpush(0x03B9); + break; + case 0x1F92: + bufpush(0x1F22); + bufpush(0x03B9); + break; + case 0x1F93: + bufpush(0x1F23); + bufpush(0x03B9); + break; + case 0x1F94: + bufpush(0x1F24); + bufpush(0x03B9); + break; + case 0x1F95: + bufpush(0x1F25); + bufpush(0x03B9); + break; + case 0x1F96: + bufpush(0x1F26); + bufpush(0x03B9); + break; + case 0x1F97: + bufpush(0x1F27); + bufpush(0x03B9); + break; + case 0x1F98: + bufpush(0x1F20); + bufpush(0x03B9); + break; + case 0x1F99: + bufpush(0x1F21); + bufpush(0x03B9); + break; + case 0x1F9A: + bufpush(0x1F22); + bufpush(0x03B9); + break; + case 0x1F9B: + bufpush(0x1F23); + bufpush(0x03B9); + break; + case 0x1F9C: + bufpush(0x1F24); + bufpush(0x03B9); + break; + case 0x1F9D: + bufpush(0x1F25); + bufpush(0x03B9); + break; + case 0x1F9E: + bufpush(0x1F26); + bufpush(0x03B9); + break; + case 0x1F9F: + bufpush(0x1F27); + bufpush(0x03B9); + break; + case 0x1FA0: + bufpush(0x1F60); + bufpush(0x03B9); + break; + case 0x1FA1: + bufpush(0x1F61); + bufpush(0x03B9); + break; + case 0x1FA2: + bufpush(0x1F62); + bufpush(0x03B9); + break; + case 0x1FA3: + bufpush(0x1F63); + bufpush(0x03B9); + break; + case 0x1FA4: + bufpush(0x1F64); + bufpush(0x03B9); + break; + case 0x1FA5: + bufpush(0x1F65); + bufpush(0x03B9); + break; + case 0x1FA6: + bufpush(0x1F66); + bufpush(0x03B9); + break; + case 0x1FA7: + bufpush(0x1F67); + bufpush(0x03B9); + break; + case 0x1FA8: + bufpush(0x1F60); + bufpush(0x03B9); + break; + case 0x1FA9: + bufpush(0x1F61); + bufpush(0x03B9); + break; + case 0x1FAA: + bufpush(0x1F62); + bufpush(0x03B9); + break; + case 0x1FAB: + bufpush(0x1F63); + 
bufpush(0x03B9); + break; + case 0x1FAC: + bufpush(0x1F64); + bufpush(0x03B9); + break; + case 0x1FAD: + bufpush(0x1F65); + bufpush(0x03B9); + break; + case 0x1FAE: + bufpush(0x1F66); + bufpush(0x03B9); + break; + case 0x1FAF: + bufpush(0x1F67); + bufpush(0x03B9); + break; + case 0x1FB2: + bufpush(0x1F70); + bufpush(0x03B9); + break; + case 0x1FB3: + bufpush(0x03B1); + bufpush(0x03B9); + break; + case 0x1FB4: + bufpush(0x03AC); + bufpush(0x03B9); + break; + case 0x1FB6: + bufpush(0x03B1); + bufpush(0x0342); + break; + case 0x1FB7: + bufpush(0x03B1); + bufpush(0x0342); + bufpush(0x03B9); + break; + case 0x1FB8: + bufpush(0x1FB0); + break; + case 0x1FB9: + bufpush(0x1FB1); + break; + case 0x1FBA: + bufpush(0x1F70); + break; + case 0x1FBB: + bufpush(0x1F71); + break; + case 0x1FBC: + bufpush(0x03B1); + bufpush(0x03B9); + break; + case 0x1FBE: + bufpush(0x03B9); + break; + case 0x1FC2: + bufpush(0x1F74); + bufpush(0x03B9); + break; + case 0x1FC3: + bufpush(0x03B7); + bufpush(0x03B9); + break; + case 0x1FC4: + bufpush(0x03AE); + bufpush(0x03B9); + break; + case 0x1FC6: + bufpush(0x03B7); + bufpush(0x0342); + break; + case 0x1FC7: + bufpush(0x03B7); + bufpush(0x0342); + bufpush(0x03B9); + break; + case 0x1FC8: + bufpush(0x1F72); + break; + case 0x1FC9: + bufpush(0x1F73); + break; + case 0x1FCA: + bufpush(0x1F74); + break; + case 0x1FCB: + bufpush(0x1F75); + break; + case 0x1FCC: + bufpush(0x03B7); + bufpush(0x03B9); + break; + case 0x1FD2: + bufpush(0x03B9); + bufpush(0x0308); + bufpush(0x0300); + break; + case 0x1FD3: + bufpush(0x03B9); + bufpush(0x0308); + bufpush(0x0301); + break; + case 0x1FD6: + bufpush(0x03B9); + bufpush(0x0342); + break; + case 0x1FD7: + bufpush(0x03B9); + bufpush(0x0308); + bufpush(0x0342); + break; + case 0x1FD8: + bufpush(0x1FD0); + break; + case 0x1FD9: + bufpush(0x1FD1); + break; + case 0x1FDA: + bufpush(0x1F76); + break; + case 0x1FDB: + bufpush(0x1F77); + break; + case 0x1FE2: + bufpush(0x03C5); + bufpush(0x0308); + bufpush(0x0300); + 
break; + case 0x1FE3: + bufpush(0x03C5); + bufpush(0x0308); + bufpush(0x0301); + break; + case 0x1FE4: + bufpush(0x03C1); + bufpush(0x0313); + break; + case 0x1FE6: + bufpush(0x03C5); + bufpush(0x0342); + break; + case 0x1FE7: + bufpush(0x03C5); + bufpush(0x0308); + bufpush(0x0342); + break; + case 0x1FE8: + bufpush(0x1FE0); + break; + case 0x1FE9: + bufpush(0x1FE1); + break; + case 0x1FEA: + bufpush(0x1F7A); + break; + case 0x1FEB: + bufpush(0x1F7B); + break; + case 0x1FEC: + bufpush(0x1FE5); + break; + case 0x1FF2: + bufpush(0x1F7C); + bufpush(0x03B9); + break; + case 0x1FF3: + bufpush(0x03C9); + bufpush(0x03B9); + break; + case 0x1FF4: + bufpush(0x03CE); + bufpush(0x03B9); + break; + case 0x1FF6: + bufpush(0x03C9); + bufpush(0x0342); + break; + case 0x1FF7: + bufpush(0x03C9); + bufpush(0x0342); + bufpush(0x03B9); + break; + case 0x1FF8: + bufpush(0x1F78); + break; + case 0x1FF9: + bufpush(0x1F79); + break; + case 0x1FFA: + bufpush(0x1F7C); + break; + case 0x1FFB: + bufpush(0x1F7D); + break; + case 0x1FFC: + bufpush(0x03C9); + bufpush(0x03B9); + break; + case 0x2126: + bufpush(0x03C9); + break; + case 0x212A: + bufpush(0x006B); + break; + case 0x212B: + bufpush(0x00E5); + break; + case 0x2160: + bufpush(0x2170); + break; + case 0x2161: + bufpush(0x2171); + break; + case 0x2162: + bufpush(0x2172); + break; + case 0x2163: + bufpush(0x2173); + break; + case 0x2164: + bufpush(0x2174); + break; + case 0x2165: + bufpush(0x2175); + break; + case 0x2166: + bufpush(0x2176); + break; + case 0x2167: + bufpush(0x2177); + break; + case 0x2168: + bufpush(0x2178); + break; + case 0x2169: + bufpush(0x2179); + break; + case 0x216A: + bufpush(0x217A); + break; + case 0x216B: + bufpush(0x217B); + break; + case 0x216C: + bufpush(0x217C); + break; + case 0x216D: + bufpush(0x217D); + break; + case 0x216E: + bufpush(0x217E); + break; + case 0x216F: + bufpush(0x217F); + break; + case 0x24B6: + bufpush(0x24D0); + break; + case 0x24B7: + bufpush(0x24D1); + break; + case 0x24B8: + 
bufpush(0x24D2); + break; + case 0x24B9: + bufpush(0x24D3); + break; + case 0x24BA: + bufpush(0x24D4); + break; + case 0x24BB: + bufpush(0x24D5); + break; + case 0x24BC: + bufpush(0x24D6); + break; + case 0x24BD: + bufpush(0x24D7); + break; + case 0x24BE: + bufpush(0x24D8); + break; + case 0x24BF: + bufpush(0x24D9); + break; + case 0x24C0: + bufpush(0x24DA); + break; + case 0x24C1: + bufpush(0x24DB); + break; + case 0x24C2: + bufpush(0x24DC); + break; + case 0x24C3: + bufpush(0x24DD); + break; + case 0x24C4: + bufpush(0x24DE); + break; + case 0x24C5: + bufpush(0x24DF); + break; + case 0x24C6: + bufpush(0x24E0); + break; + case 0x24C7: + bufpush(0x24E1); + break; + case 0x24C8: + bufpush(0x24E2); + break; + case 0x24C9: + bufpush(0x24E3); + break; + case 0x24CA: + bufpush(0x24E4); + break; + case 0x24CB: + bufpush(0x24E5); + break; + case 0x24CC: + bufpush(0x24E6); + break; + case 0x24CD: + bufpush(0x24E7); + break; + case 0x24CE: + bufpush(0x24E8); + break; + case 0x24CF: + bufpush(0x24E9); + break; + case 0xFB00: + bufpush(0x0066); + bufpush(0x0066); + break; + case 0xFB01: + bufpush(0x0066); + bufpush(0x0069); + break; + case 0xFB02: + bufpush(0x0066); + bufpush(0x006C); + break; + case 0xFB03: + bufpush(0x0066); + bufpush(0x0066); + bufpush(0x0069); + break; + case 0xFB04: + bufpush(0x0066); + bufpush(0x0066); + bufpush(0x006C); + break; + case 0xFB05: + bufpush(0x0073); + bufpush(0x0074); + break; + case 0xFB06: + bufpush(0x0073); + bufpush(0x0074); + break; + case 0xFB13: + bufpush(0x0574); + bufpush(0x0576); + break; + case 0xFB14: + bufpush(0x0574); + bufpush(0x0565); + break; + case 0xFB15: + bufpush(0x0574); + bufpush(0x056B); + break; + case 0xFB16: + bufpush(0x057E); + bufpush(0x0576); + break; + case 0xFB17: + bufpush(0x0574); + bufpush(0x056D); + break; + case 0xFF21: + bufpush(0xFF41); + break; + case 0xFF22: + bufpush(0xFF42); + break; + case 0xFF23: + bufpush(0xFF43); + break; + case 0xFF24: + bufpush(0xFF44); + break; + case 0xFF25: + 
bufpush(0xFF45); + break; + case 0xFF26: + bufpush(0xFF46); + break; + case 0xFF27: + bufpush(0xFF47); + break; + case 0xFF28: + bufpush(0xFF48); + break; + case 0xFF29: + bufpush(0xFF49); + break; + case 0xFF2A: + bufpush(0xFF4A); + break; + case 0xFF2B: + bufpush(0xFF4B); + break; + case 0xFF2C: + bufpush(0xFF4C); + break; + case 0xFF2D: + bufpush(0xFF4D); + break; + case 0xFF2E: + bufpush(0xFF4E); + break; + case 0xFF2F: + bufpush(0xFF4F); + break; + case 0xFF30: + bufpush(0xFF50); + break; + case 0xFF31: + bufpush(0xFF51); + break; + case 0xFF32: + bufpush(0xFF52); + break; + case 0xFF33: + bufpush(0xFF53); + break; + case 0xFF34: + bufpush(0xFF54); + break; + case 0xFF35: + bufpush(0xFF55); + break; + case 0xFF36: + bufpush(0xFF56); + break; + case 0xFF37: + bufpush(0xFF57); + break; + case 0xFF38: + bufpush(0xFF58); + break; + case 0xFF39: + bufpush(0xFF59); + break; + case 0xFF3A: + bufpush(0xFF5A); + break; + case 0x10400: + bufpush(0x10428); + break; + case 0x10401: + bufpush(0x10429); + break; + case 0x10402: + bufpush(0x1042A); + break; + case 0x10403: + bufpush(0x1042B); + break; + case 0x10404: + bufpush(0x1042C); + break; + case 0x10405: + bufpush(0x1042D); + break; + case 0x10406: + bufpush(0x1042E); + break; + case 0x10407: + bufpush(0x1042F); + break; + case 0x10408: + bufpush(0x10430); + break; + case 0x10409: + bufpush(0x10431); + break; + case 0x1040A: + bufpush(0x10432); + break; + case 0x1040B: + bufpush(0x10433); + break; + case 0x1040C: + bufpush(0x10434); + break; + case 0x1040D: + bufpush(0x10435); + break; + case 0x1040E: + bufpush(0x10436); + break; + case 0x1040F: + bufpush(0x10437); + break; + case 0x10410: + bufpush(0x10438); + break; + case 0x10411: + bufpush(0x10439); + break; + case 0x10412: + bufpush(0x1043A); + break; + case 0x10413: + bufpush(0x1043B); + break; + case 0x10414: + bufpush(0x1043C); + break; + case 0x10415: + bufpush(0x1043D); + break; + case 0x10416: + bufpush(0x1043E); + break; + case 0x10417: + bufpush(0x1043F); 
+ break; + case 0x10418: + bufpush(0x10440); + break; + case 0x10419: + bufpush(0x10441); + break; + case 0x1041A: + bufpush(0x10442); + break; + case 0x1041B: + bufpush(0x10443); + break; + case 0x1041C: + bufpush(0x10444); + break; + case 0x1041D: + bufpush(0x10445); + break; + case 0x1041E: + bufpush(0x10446); + break; + case 0x1041F: + bufpush(0x10447); + break; + case 0x10420: + bufpush(0x10448); + break; + case 0x10421: + bufpush(0x10449); + break; + case 0x10422: + bufpush(0x1044A); + break; + case 0x10423: + bufpush(0x1044B); + break; + case 0x10424: + bufpush(0x1044C); + break; + case 0x10425: + bufpush(0x1044D); + break; + default: + bufpush(c); + } diff --git a/src/casefold.c b/src/casefold.c new file mode 100644 index 0000000..33f18aa --- /dev/null +++ b/src/casefold.c @@ -0,0 +1,2699 @@ +#include <stdlib.h> +#include <stdio.h> + + + switch c { + case 0x0041: + bufpush(0x0061); + break; + case 0x0042: + bufpush(0x0062); + break; + case 0x0043: + bufpush(0x0063); + break; + case 0x0044: + bufpush(0x0064); + break; + case 0x0045: + bufpush(0x0065); + break; + case 0x0046: + bufpush(0x0066); + break; + case 0x0047: + bufpush(0x0067); + break; + case 0x0048: + bufpush(0x0068); + break; + case 0x0049: + bufpush(0x0069); + break; + case 0x0049: + bufpush(0x0131); + break; + case 0x004A: + bufpush(0x006A); + break; + case 0x004B: + bufpush(0x006B); + break; + case 0x004C: + bufpush(0x006C); + break; + case 0x004D: + bufpush(0x006D); + break; + case 0x004E: + bufpush(0x006E); + break; + case 0x004F: + bufpush(0x006F); + break; + case 0x0050: + bufpush(0x0070); + break; + case 0x0051: + bufpush(0x0071); + break; + case 0x0052: + bufpush(0x0072); + break; + case 0x0053: + bufpush(0x0073); + break; + case 0x0054: + bufpush(0x0074); + break; + case 0x0055: + bufpush(0x0075); + break; + case 0x0056: + bufpush(0x0076); + break; + case 0x0057: + bufpush(0x0077); + break; + case 0x0058: + bufpush(0x0078); + break; + case 0x0059: + bufpush(0x0079); + break; + case 
0x005A: + bufpush(0x007A); + break; + case 0x00B5: + bufpush(0x03BC); + break; + case 0x00C0: + bufpush(0x00E0); + break; + case 0x00C1: + bufpush(0x00E1); + break; + case 0x00C2: + bufpush(0x00E2); + break; + case 0x00C3: + bufpush(0x00E3); + break; + case 0x00C4: + bufpush(0x00E4); + break; + case 0x00C5: + bufpush(0x00E5); + break; + case 0x00C6: + bufpush(0x00E6); + break; + case 0x00C7: + bufpush(0x00E7); + break; + case 0x00C8: + bufpush(0x00E8); + break; + case 0x00C9: + bufpush(0x00E9); + break; + case 0x00CA: + bufpush(0x00EA); + break; + case 0x00CB: + bufpush(0x00EB); + break; + case 0x00CC: + bufpush(0x00EC); + break; + case 0x00CD: + bufpush(0x00ED); + break; + case 0x00CE: + bufpush(0x00EE); + break; + case 0x00CF: + bufpush(0x00EF); + break; + case 0x00D0: + bufpush(0x00F0); + break; + case 0x00D1: + bufpush(0x00F1); + break; + case 0x00D2: + bufpush(0x00F2); + break; + case 0x00D3: + bufpush(0x00F3); + break; + case 0x00D4: + bufpush(0x00F4); + break; + case 0x00D5: + bufpush(0x00F5); + break; + case 0x00D6: + bufpush(0x00F6); + break; + case 0x00D8: + bufpush(0x00F8); + break; + case 0x00D9: + bufpush(0x00F9); + break; + case 0x00DA: + bufpush(0x00FA); + break; + case 0x00DB: + bufpush(0x00FB); + break; + case 0x00DC: + bufpush(0x00FC); + break; + case 0x00DD: + bufpush(0x00FD); + break; + case 0x00DE: + bufpush(0x00FE); + break; + case 0x00DF: + bufpush(0x0073); + bufpush(0x0073); + break; + case 0x0100: + bufpush(0x0101); + break; + case 0x0102: + bufpush(0x0103); + break; + case 0x0104: + bufpush(0x0105); + break; + case 0x0106: + bufpush(0x0107); + break; + case 0x0108: + bufpush(0x0109); + break; + case 0x010A: + bufpush(0x010B); + break; + case 0x010C: + bufpush(0x010D); + break; + case 0x010E: + bufpush(0x010F); + break; + case 0x0110: + bufpush(0x0111); + break; + case 0x0112: + bufpush(0x0113); + break; + case 0x0114: + bufpush(0x0115); + break; + case 0x0116: + bufpush(0x0117); + break; + case 0x0118: + bufpush(0x0119); + break; + case 
0x011A: + bufpush(0x011B); + break; + case 0x011C: + bufpush(0x011D); + break; + case 0x011E: + bufpush(0x011F); + break; + case 0x0120: + bufpush(0x0121); + break; + case 0x0122: + bufpush(0x0123); + break; + case 0x0124: + bufpush(0x0125); + break; + case 0x0126: + bufpush(0x0127); + break; + case 0x0128: + bufpush(0x0129); + break; + case 0x012A: + bufpush(0x012B); + break; + case 0x012C: + bufpush(0x012D); + break; + case 0x012E: + bufpush(0x012F); + break; + case 0x0130: + bufpush(0x0069); + bufpush(0x0307); + break; + case 0x0130: + bufpush(0x0069); + break; + case 0x0132: + bufpush(0x0133); + break; + case 0x0134: + bufpush(0x0135); + break; + case 0x0136: + bufpush(0x0137); + break; + case 0x0139: + bufpush(0x013A); + break; + case 0x013B: + bufpush(0x013C); + break; + case 0x013D: + bufpush(0x013E); + break; + case 0x013F: + bufpush(0x0140); + break; + case 0x0141: + bufpush(0x0142); + break; + case 0x0143: + bufpush(0x0144); + break; + case 0x0145: + bufpush(0x0146); + break; + case 0x0147: + bufpush(0x0148); + break; + case 0x0149: + bufpush(0x02BC); + bufpush(0x006E); + break; + case 0x014A: + bufpush(0x014B); + break; + case 0x014C: + bufpush(0x014D); + break; + case 0x014E: + bufpush(0x014F); + break; + case 0x0150: + bufpush(0x0151); + break; + case 0x0152: + bufpush(0x0153); + break; + case 0x0154: + bufpush(0x0155); + break; + case 0x0156: + bufpush(0x0157); + break; + case 0x0158: + bufpush(0x0159); + break; + case 0x015A: + bufpush(0x015B); + break; + case 0x015C: + bufpush(0x015D); + break; + case 0x015E: + bufpush(0x015F); + break; + case 0x0160: + bufpush(0x0161); + break; + case 0x0162: + bufpush(0x0163); + break; + case 0x0164: + bufpush(0x0165); + break; + case 0x0166: + bufpush(0x0167); + break; + case 0x0168: + bufpush(0x0169); + break; + case 0x016A: + bufpush(0x016B); + break; + case 0x016C: + bufpush(0x016D); + break; + case 0x016E: + bufpush(0x016F); + break; + case 0x0170: + bufpush(0x0171); + break; + case 0x0172: + bufpush(0x0173); 
+ break; + case 0x0174: + bufpush(0x0175); + break; + case 0x0176: + bufpush(0x0177); + break; + case 0x0178: + bufpush(0x00FF); + break; + case 0x0179: + bufpush(0x017A); + break; + case 0x017B: + bufpush(0x017C); + break; + case 0x017D: + bufpush(0x017E); + break; + case 0x017F: + bufpush(0x0073); + break; + case 0x0181: + bufpush(0x0253); + break; + case 0x0182: + bufpush(0x0183); + break; + case 0x0184: + bufpush(0x0185); + break; + case 0x0186: + bufpush(0x0254); + break; + case 0x0187: + bufpush(0x0188); + break; + case 0x0189: + bufpush(0x0256); + break; + case 0x018A: + bufpush(0x0257); + break; + case 0x018B: + bufpush(0x018C); + break; + case 0x018E: + bufpush(0x01DD); + break; + case 0x018F: + bufpush(0x0259); + break; + case 0x0190: + bufpush(0x025B); + break; + case 0x0191: + bufpush(0x0192); + break; + case 0x0193: + bufpush(0x0260); + break; + case 0x0194: + bufpush(0x0263); + break; + case 0x0196: + bufpush(0x0269); + break; + case 0x0197: + bufpush(0x0268); + break; + case 0x0198: + bufpush(0x0199); + break; + case 0x019C: + bufpush(0x026F); + break; + case 0x019D: + bufpush(0x0272); + break; + case 0x019F: + bufpush(0x0275); + break; + case 0x01A0: + bufpush(0x01A1); + break; + case 0x01A2: + bufpush(0x01A3); + break; + case 0x01A4: + bufpush(0x01A5); + break; + case 0x01A6: + bufpush(0x0280); + break; + case 0x01A7: + bufpush(0x01A8); + break; + case 0x01A9: + bufpush(0x0283); + break; + case 0x01AC: + bufpush(0x01AD); + break; + case 0x01AE: + bufpush(0x0288); + break; + case 0x01AF: + bufpush(0x01B0); + break; + case 0x01B1: + bufpush(0x028A); + break; + case 0x01B2: + bufpush(0x028B); + break; + case 0x01B3: + bufpush(0x01B4); + break; + case 0x01B5: + bufpush(0x01B6); + break; + case 0x01B7: + bufpush(0x0292); + break; + case 0x01B8: + bufpush(0x01B9); + break; + case 0x01BC: + bufpush(0x01BD); + break; + case 0x01C4: + bufpush(0x01C6); + break; + case 0x01C5: + bufpush(0x01C6); + break; + case 0x01C7: + bufpush(0x01C9); + break; + case 
0x01C8: + bufpush(0x01C9); + break; + case 0x01CA: + bufpush(0x01CC); + break; + case 0x01CB: + bufpush(0x01CC); + break; + case 0x01CD: + bufpush(0x01CE); + break; + case 0x01CF: + bufpush(0x01D0); + break; + case 0x01D1: + bufpush(0x01D2); + break; + case 0x01D3: + bufpush(0x01D4); + break; + case 0x01D5: + bufpush(0x01D6); + break; + case 0x01D7: + bufpush(0x01D8); + break; + case 0x01D9: + bufpush(0x01DA); + break; + case 0x01DB: + bufpush(0x01DC); + break; + case 0x01DE: + bufpush(0x01DF); + break; + case 0x01E0: + bufpush(0x01E1); + break; + case 0x01E2: + bufpush(0x01E3); + break; + case 0x01E4: + bufpush(0x01E5); + break; + case 0x01E6: + bufpush(0x01E7); + break; + case 0x01E8: + bufpush(0x01E9); + break; + case 0x01EA: + bufpush(0x01EB); + break; + case 0x01EC: + bufpush(0x01ED); + break; + case 0x01EE: + bufpush(0x01EF); + break; + case 0x01F0: + bufpush(0x006A); + bufpush(0x030C); + break; + case 0x01F1: + bufpush(0x01F3); + break; + case 0x01F2: + bufpush(0x01F3); + break; + case 0x01F4: + bufpush(0x01F5); + break; + case 0x01F6: + bufpush(0x0195); + break; + case 0x01F7: + bufpush(0x01BF); + break; + case 0x01F8: + bufpush(0x01F9); + break; + case 0x01FA: + bufpush(0x01FB); + break; + case 0x01FC: + bufpush(0x01FD); + break; + case 0x01FE: + bufpush(0x01FF); + break; + case 0x0200: + bufpush(0x0201); + break; + case 0x0202: + bufpush(0x0203); + break; + case 0x0204: + bufpush(0x0205); + break; + case 0x0206: + bufpush(0x0207); + break; + case 0x0208: + bufpush(0x0209); + break; + case 0x020A: + bufpush(0x020B); + break; + case 0x020C: + bufpush(0x020D); + break; + case 0x020E: + bufpush(0x020F); + break; + case 0x0210: + bufpush(0x0211); + break; + case 0x0212: + bufpush(0x0213); + break; + case 0x0214: + bufpush(0x0215); + break; + case 0x0216: + bufpush(0x0217); + break; + case 0x0218: + bufpush(0x0219); + break; + case 0x021A: + bufpush(0x021B); + break; + case 0x021C: + bufpush(0x021D); + break; + case 0x021E: + bufpush(0x021F); + break; + case 
0x0220: + bufpush(0x019E); + break; + case 0x0222: + bufpush(0x0223); + break; + case 0x0224: + bufpush(0x0225); + break; + case 0x0226: + bufpush(0x0227); + break; + case 0x0228: + bufpush(0x0229); + break; + case 0x022A: + bufpush(0x022B); + break; + case 0x022C: + bufpush(0x022D); + break; + case 0x022E: + bufpush(0x022F); + break; + case 0x0230: + bufpush(0x0231); + break; + case 0x0232: + bufpush(0x0233); + break; + case 0x0345: + bufpush(0x03B9); + break; + case 0x0386: + bufpush(0x03AC); + break; + case 0x0388: + bufpush(0x03AD); + break; + case 0x0389: + bufpush(0x03AE); + break; + case 0x038A: + bufpush(0x03AF); + break; + case 0x038C: + bufpush(0x03CC); + break; + case 0x038E: + bufpush(0x03CD); + break; + case 0x038F: + bufpush(0x03CE); + break; + case 0x0390: + bufpush(0x03B9); + bufpush(0x0308); + bufpush(0x0301); + break; + case 0x0391: + bufpush(0x03B1); + break; + case 0x0392: + bufpush(0x03B2); + break; + case 0x0393: + bufpush(0x03B3); + break; + case 0x0394: + bufpush(0x03B4); + break; + case 0x0395: + bufpush(0x03B5); + break; + case 0x0396: + bufpush(0x03B6); + break; + case 0x0397: + bufpush(0x03B7); + break; + case 0x0398: + bufpush(0x03B8); + break; + case 0x0399: + bufpush(0x03B9); + break; + case 0x039A: + bufpush(0x03BA); + break; + case 0x039B: + bufpush(0x03BB); + break; + case 0x039C: + bufpush(0x03BC); + break; + case 0x039D: + bufpush(0x03BD); + break; + case 0x039E: + bufpush(0x03BE); + break; + case 0x039F: + bufpush(0x03BF); + break; + case 0x03A0: + bufpush(0x03C0); + break; + case 0x03A1: + bufpush(0x03C1); + break; + case 0x03A3: + bufpush(0x03C3); + break; + case 0x03A4: + bufpush(0x03C4); + break; + case 0x03A5: + bufpush(0x03C5); + break; + case 0x03A6: + bufpush(0x03C6); + break; + case 0x03A7: + bufpush(0x03C7); + break; + case 0x03A8: + bufpush(0x03C8); + break; + case 0x03A9: + bufpush(0x03C9); + break; + case 0x03AA: + bufpush(0x03CA); + break; + case 0x03AB: + bufpush(0x03CB); + break; + case 0x03B0: + bufpush(0x03C5); 
+ bufpush(0x0308); + bufpush(0x0301); + break; + case 0x03C2: + bufpush(0x03C3); + break; + case 0x03D0: + bufpush(0x03B2); + break; + case 0x03D1: + bufpush(0x03B8); + break; + case 0x03D5: + bufpush(0x03C6); + break; + case 0x03D6: + bufpush(0x03C0); + break; + case 0x03D8: + bufpush(0x03D9); + break; + case 0x03DA: + bufpush(0x03DB); + break; + case 0x03DC: + bufpush(0x03DD); + break; + case 0x03DE: + bufpush(0x03DF); + break; + case 0x03E0: + bufpush(0x03E1); + break; + case 0x03E2: + bufpush(0x03E3); + break; + case 0x03E4: + bufpush(0x03E5); + break; + case 0x03E6: + bufpush(0x03E7); + break; + case 0x03E8: + bufpush(0x03E9); + break; + case 0x03EA: + bufpush(0x03EB); + break; + case 0x03EC: + bufpush(0x03ED); + break; + case 0x03EE: + bufpush(0x03EF); + break; + case 0x03F0: + bufpush(0x03BA); + break; + case 0x03F1: + bufpush(0x03C1); + break; + case 0x03F2: + bufpush(0x03C3); + break; + case 0x03F4: + bufpush(0x03B8); + break; + case 0x03F5: + bufpush(0x03B5); + break; + case 0x0400: + bufpush(0x0450); + break; + case 0x0401: + bufpush(0x0451); + break; + case 0x0402: + bufpush(0x0452); + break; + case 0x0403: + bufpush(0x0453); + break; + case 0x0404: + bufpush(0x0454); + break; + case 0x0405: + bufpush(0x0455); + break; + case 0x0406: + bufpush(0x0456); + break; + case 0x0407: + bufpush(0x0457); + break; + case 0x0408: + bufpush(0x0458); + break; + case 0x0409: + bufpush(0x0459); + break; + case 0x040A: + bufpush(0x045A); + break; + case 0x040B: + bufpush(0x045B); + break; + case 0x040C: + bufpush(0x045C); + break; + case 0x040D: + bufpush(0x045D); + break; + case 0x040E: + bufpush(0x045E); + break; + case 0x040F: + bufpush(0x045F); + break; + case 0x0410: + bufpush(0x0430); + break; + case 0x0411: + bufpush(0x0431); + break; + case 0x0412: + bufpush(0x0432); + break; + case 0x0413: + bufpush(0x0433); + break; + case 0x0414: + bufpush(0x0434); + break; + case 0x0415: + bufpush(0x0435); + break; + case 0x0416: + bufpush(0x0436); + break; + case 0x0417: + 
bufpush(0x0437); + break; + case 0x0418: + bufpush(0x0438); + break; + case 0x0419: + bufpush(0x0439); + break; + case 0x041A: + bufpush(0x043A); + break; + case 0x041B: + bufpush(0x043B); + break; + case 0x041C: + bufpush(0x043C); + break; + case 0x041D: + bufpush(0x043D); + break; + case 0x041E: + bufpush(0x043E); + break; + case 0x041F: + bufpush(0x043F); + break; + case 0x0420: + bufpush(0x0440); + break; + case 0x0421: + bufpush(0x0441); + break; + case 0x0422: + bufpush(0x0442); + break; + case 0x0423: + bufpush(0x0443); + break; + case 0x0424: + bufpush(0x0444); + break; + case 0x0425: + bufpush(0x0445); + break; + case 0x0426: + bufpush(0x0446); + break; + case 0x0427: + bufpush(0x0447); + break; + case 0x0428: + bufpush(0x0448); + break; + case 0x0429: + bufpush(0x0449); + break; + case 0x042A: + bufpush(0x044A); + break; + case 0x042B: + bufpush(0x044B); + break; + case 0x042C: + bufpush(0x044C); + break; + case 0x042D: + bufpush(0x044D); + break; + case 0x042E: + bufpush(0x044E); + break; + case 0x042F: + bufpush(0x044F); + break; + case 0x0460: + bufpush(0x0461); + break; + case 0x0462: + bufpush(0x0463); + break; + case 0x0464: + bufpush(0x0465); + break; + case 0x0466: + bufpush(0x0467); + break; + case 0x0468: + bufpush(0x0469); + break; + case 0x046A: + bufpush(0x046B); + break; + case 0x046C: + bufpush(0x046D); + break; + case 0x046E: + bufpush(0x046F); + break; + case 0x0470: + bufpush(0x0471); + break; + case 0x0472: + bufpush(0x0473); + break; + case 0x0474: + bufpush(0x0475); + break; + case 0x0476: + bufpush(0x0477); + break; + case 0x0478: + bufpush(0x0479); + break; + case 0x047A: + bufpush(0x047B); + break; + case 0x047C: + bufpush(0x047D); + break; + case 0x047E: + bufpush(0x047F); + break; + case 0x0480: + bufpush(0x0481); + break; + case 0x048A: + bufpush(0x048B); + break; + case 0x048C: + bufpush(0x048D); + break; + case 0x048E: + bufpush(0x048F); + break; + case 0x0490: + bufpush(0x0491); + break; + case 0x0492: + bufpush(0x0493); + 
break; + case 0x0494: + bufpush(0x0495); + break; + case 0x0496: + bufpush(0x0497); + break; + case 0x0498: + bufpush(0x0499); + break; + case 0x049A: + bufpush(0x049B); + break; + case 0x049C: + bufpush(0x049D); + break; + case 0x049E: + bufpush(0x049F); + break; + case 0x04A0: + bufpush(0x04A1); + break; + case 0x04A2: + bufpush(0x04A3); + break; + case 0x04A4: + bufpush(0x04A5); + break; + case 0x04A6: + bufpush(0x04A7); + break; + case 0x04A8: + bufpush(0x04A9); + break; + case 0x04AA: + bufpush(0x04AB); + break; + case 0x04AC: + bufpush(0x04AD); + break; + case 0x04AE: + bufpush(0x04AF); + break; + case 0x04B0: + bufpush(0x04B1); + break; + case 0x04B2: + bufpush(0x04B3); + break; + case 0x04B4: + bufpush(0x04B5); + break; + case 0x04B6: + bufpush(0x04B7); + break; + case 0x04B8: + bufpush(0x04B9); + break; + case 0x04BA: + bufpush(0x04BB); + break; + case 0x04BC: + bufpush(0x04BD); + break; + case 0x04BE: + bufpush(0x04BF); + break; + case 0x04C1: + bufpush(0x04C2); + break; + case 0x04C3: + bufpush(0x04C4); + break; + case 0x04C5: + bufpush(0x04C6); + break; + case 0x04C7: + bufpush(0x04C8); + break; + case 0x04C9: + bufpush(0x04CA); + break; + case 0x04CB: + bufpush(0x04CC); + break; + case 0x04CD: + bufpush(0x04CE); + break; + case 0x04D0: + bufpush(0x04D1); + break; + case 0x04D2: + bufpush(0x04D3); + break; + case 0x04D4: + bufpush(0x04D5); + break; + case 0x04D6: + bufpush(0x04D7); + break; + case 0x04D8: + bufpush(0x04D9); + break; + case 0x04DA: + bufpush(0x04DB); + break; + case 0x04DC: + bufpush(0x04DD); + break; + case 0x04DE: + bufpush(0x04DF); + break; + case 0x04E0: + bufpush(0x04E1); + break; + case 0x04E2: + bufpush(0x04E3); + break; + case 0x04E4: + bufpush(0x04E5); + break; + case 0x04E6: + bufpush(0x04E7); + break; + case 0x04E8: + bufpush(0x04E9); + break; + case 0x04EA: + bufpush(0x04EB); + break; + case 0x04EC: + bufpush(0x04ED); + break; + case 0x04EE: + bufpush(0x04EF); + break; + case 0x04F0: + bufpush(0x04F1); + break; + case 0x04F2: 
+ bufpush(0x04F3); + break; + case 0x04F4: + bufpush(0x04F5); + break; + case 0x04F8: + bufpush(0x04F9); + break; + case 0x0500: + bufpush(0x0501); + break; + case 0x0502: + bufpush(0x0503); + break; + case 0x0504: + bufpush(0x0505); + break; + case 0x0506: + bufpush(0x0507); + break; + case 0x0508: + bufpush(0x0509); + break; + case 0x050A: + bufpush(0x050B); + break; + case 0x050C: + bufpush(0x050D); + break; + case 0x050E: + bufpush(0x050F); + break; + case 0x0531: + bufpush(0x0561); + break; + case 0x0532: + bufpush(0x0562); + break; + case 0x0533: + bufpush(0x0563); + break; + case 0x0534: + bufpush(0x0564); + break; + case 0x0535: + bufpush(0x0565); + break; + case 0x0536: + bufpush(0x0566); + break; + case 0x0537: + bufpush(0x0567); + break; + case 0x0538: + bufpush(0x0568); + break; + case 0x0539: + bufpush(0x0569); + break; + case 0x053A: + bufpush(0x056A); + break; + case 0x053B: + bufpush(0x056B); + break; + case 0x053C: + bufpush(0x056C); + break; + case 0x053D: + bufpush(0x056D); + break; + case 0x053E: + bufpush(0x056E); + break; + case 0x053F: + bufpush(0x056F); + break; + case 0x0540: + bufpush(0x0570); + break; + case 0x0541: + bufpush(0x0571); + break; + case 0x0542: + bufpush(0x0572); + break; + case 0x0543: + bufpush(0x0573); + break; + case 0x0544: + bufpush(0x0574); + break; + case 0x0545: + bufpush(0x0575); + break; + case 0x0546: + bufpush(0x0576); + break; + case 0x0547: + bufpush(0x0577); + break; + case 0x0548: + bufpush(0x0578); + break; + case 0x0549: + bufpush(0x0579); + break; + case 0x054A: + bufpush(0x057A); + break; + case 0x054B: + bufpush(0x057B); + break; + case 0x054C: + bufpush(0x057C); + break; + case 0x054D: + bufpush(0x057D); + break; + case 0x054E: + bufpush(0x057E); + break; + case 0x054F: + bufpush(0x057F); + break; + case 0x0550: + bufpush(0x0580); + break; + case 0x0551: + bufpush(0x0581); + break; + case 0x0552: + bufpush(0x0582); + break; + case 0x0553: + bufpush(0x0583); + break; + case 0x0554: + bufpush(0x0584); + 
break; + case 0x0555: + bufpush(0x0585); + break; + case 0x0556: + bufpush(0x0586); + break; + case 0x0587: + bufpush(0x0565); + bufpush(0x0582); + break; + case 0x1E00: + bufpush(0x1E01); + break; + case 0x1E02: + bufpush(0x1E03); + break; + case 0x1E04: + bufpush(0x1E05); + break; + case 0x1E06: + bufpush(0x1E07); + break; + case 0x1E08: + bufpush(0x1E09); + break; + case 0x1E0A: + bufpush(0x1E0B); + break; + case 0x1E0C: + bufpush(0x1E0D); + break; + case 0x1E0E: + bufpush(0x1E0F); + break; + case 0x1E10: + bufpush(0x1E11); + break; + case 0x1E12: + bufpush(0x1E13); + break; + case 0x1E14: + bufpush(0x1E15); + break; + case 0x1E16: + bufpush(0x1E17); + break; + case 0x1E18: + bufpush(0x1E19); + break; + case 0x1E1A: + bufpush(0x1E1B); + break; + case 0x1E1C: + bufpush(0x1E1D); + break; + case 0x1E1E: + bufpush(0x1E1F); + break; + case 0x1E20: + bufpush(0x1E21); + break; + case 0x1E22: + bufpush(0x1E23); + break; + case 0x1E24: + bufpush(0x1E25); + break; + case 0x1E26: + bufpush(0x1E27); + break; + case 0x1E28: + bufpush(0x1E29); + break; + case 0x1E2A: + bufpush(0x1E2B); + break; + case 0x1E2C: + bufpush(0x1E2D); + break; + case 0x1E2E: + bufpush(0x1E2F); + break; + case 0x1E30: + bufpush(0x1E31); + break; + case 0x1E32: + bufpush(0x1E33); + break; + case 0x1E34: + bufpush(0x1E35); + break; + case 0x1E36: + bufpush(0x1E37); + break; + case 0x1E38: + bufpush(0x1E39); + break; + case 0x1E3A: + bufpush(0x1E3B); + break; + case 0x1E3C: + bufpush(0x1E3D); + break; + case 0x1E3E: + bufpush(0x1E3F); + break; + case 0x1E40: + bufpush(0x1E41); + break; + case 0x1E42: + bufpush(0x1E43); + break; + case 0x1E44: + bufpush(0x1E45); + break; + case 0x1E46: + bufpush(0x1E47); + break; + case 0x1E48: + bufpush(0x1E49); + break; + case 0x1E4A: + bufpush(0x1E4B); + break; + case 0x1E4C: + bufpush(0x1E4D); + break; + case 0x1E4E: + bufpush(0x1E4F); + break; + case 0x1E50: + bufpush(0x1E51); + break; + case 0x1E52: + bufpush(0x1E53); + break; + case 0x1E54: + bufpush(0x1E55); + 
break; + case 0x1E56: + bufpush(0x1E57); + break; + case 0x1E58: + bufpush(0x1E59); + break; + case 0x1E5A: + bufpush(0x1E5B); + break; + case 0x1E5C: + bufpush(0x1E5D); + break; + case 0x1E5E: + bufpush(0x1E5F); + break; + case 0x1E60: + bufpush(0x1E61); + break; + case 0x1E62: + bufpush(0x1E63); + break; + case 0x1E64: + bufpush(0x1E65); + break; + case 0x1E66: + bufpush(0x1E67); + break; + case 0x1E68: + bufpush(0x1E69); + break; + case 0x1E6A: + bufpush(0x1E6B); + break; + case 0x1E6C: + bufpush(0x1E6D); + break; + case 0x1E6E: + bufpush(0x1E6F); + break; + case 0x1E70: + bufpush(0x1E71); + break; + case 0x1E72: + bufpush(0x1E73); + break; + case 0x1E74: + bufpush(0x1E75); + break; + case 0x1E76: + bufpush(0x1E77); + break; + case 0x1E78: + bufpush(0x1E79); + break; + case 0x1E7A: + bufpush(0x1E7B); + break; + case 0x1E7C: + bufpush(0x1E7D); + break; + case 0x1E7E: + bufpush(0x1E7F); + break; + case 0x1E80: + bufpush(0x1E81); + break; + case 0x1E82: + bufpush(0x1E83); + break; + case 0x1E84: + bufpush(0x1E85); + break; + case 0x1E86: + bufpush(0x1E87); + break; + case 0x1E88: + bufpush(0x1E89); + break; + case 0x1E8A: + bufpush(0x1E8B); + break; + case 0x1E8C: + bufpush(0x1E8D); + break; + case 0x1E8E: + bufpush(0x1E8F); + break; + case 0x1E90: + bufpush(0x1E91); + break; + case 0x1E92: + bufpush(0x1E93); + break; + case 0x1E94: + bufpush(0x1E95); + break; + case 0x1E96: + bufpush(0x0068); + bufpush(0x0331); + break; + case 0x1E97: + bufpush(0x0074); + bufpush(0x0308); + break; + case 0x1E98: + bufpush(0x0077); + bufpush(0x030A); + break; + case 0x1E99: + bufpush(0x0079); + bufpush(0x030A); + break; + case 0x1E9A: + bufpush(0x0061); + bufpush(0x02BE); + break; + case 0x1E9B: + bufpush(0x1E61); + break; + case 0x1EA0: + bufpush(0x1EA1); + break; + case 0x1EA2: + bufpush(0x1EA3); + break; + case 0x1EA4: + bufpush(0x1EA5); + break; + case 0x1EA6: + bufpush(0x1EA7); + break; + case 0x1EA8: + bufpush(0x1EA9); + break; + case 0x1EAA: + bufpush(0x1EAB); + break; + 
case 0x1EAC: + bufpush(0x1EAD); + break; + case 0x1EAE: + bufpush(0x1EAF); + break; + case 0x1EB0: + bufpush(0x1EB1); + break; + case 0x1EB2: + bufpush(0x1EB3); + break; + case 0x1EB4: + bufpush(0x1EB5); + break; + case 0x1EB6: + bufpush(0x1EB7); + break; + case 0x1EB8: + bufpush(0x1EB9); + break; + case 0x1EBA: + bufpush(0x1EBB); + break; + case 0x1EBC: + bufpush(0x1EBD); + break; + case 0x1EBE: + bufpush(0x1EBF); + break; + case 0x1EC0: + bufpush(0x1EC1); + break; + case 0x1EC2: + bufpush(0x1EC3); + break; + case 0x1EC4: + bufpush(0x1EC5); + break; + case 0x1EC6: + bufpush(0x1EC7); + break; + case 0x1EC8: + bufpush(0x1EC9); + break; + case 0x1ECA: + bufpush(0x1ECB); + break; + case 0x1ECC: + bufpush(0x1ECD); + break; + case 0x1ECE: + bufpush(0x1ECF); + break; + case 0x1ED0: + bufpush(0x1ED1); + break; + case 0x1ED2: + bufpush(0x1ED3); + break; + case 0x1ED4: + bufpush(0x1ED5); + break; + case 0x1ED6: + bufpush(0x1ED7); + break; + case 0x1ED8: + bufpush(0x1ED9); + break; + case 0x1EDA: + bufpush(0x1EDB); + break; + case 0x1EDC: + bufpush(0x1EDD); + break; + case 0x1EDE: + bufpush(0x1EDF); + break; + case 0x1EE0: + bufpush(0x1EE1); + break; + case 0x1EE2: + bufpush(0x1EE3); + break; + case 0x1EE4: + bufpush(0x1EE5); + break; + case 0x1EE6: + bufpush(0x1EE7); + break; + case 0x1EE8: + bufpush(0x1EE9); + break; + case 0x1EEA: + bufpush(0x1EEB); + break; + case 0x1EEC: + bufpush(0x1EED); + break; + case 0x1EEE: + bufpush(0x1EEF); + break; + case 0x1EF0: + bufpush(0x1EF1); + break; + case 0x1EF2: + bufpush(0x1EF3); + break; + case 0x1EF4: + bufpush(0x1EF5); + break; + case 0x1EF6: + bufpush(0x1EF7); + break; + case 0x1EF8: + bufpush(0x1EF9); + break; + case 0x1F08: + bufpush(0x1F00); + break; + case 0x1F09: + bufpush(0x1F01); + break; + case 0x1F0A: + bufpush(0x1F02); + break; + case 0x1F0B: + bufpush(0x1F03); + break; + case 0x1F0C: + bufpush(0x1F04); + break; + case 0x1F0D: + bufpush(0x1F05); + break; + case 0x1F0E: + bufpush(0x1F06); + break; + case 0x1F0F: + 
bufpush(0x1F07); + break; + case 0x1F18: + bufpush(0x1F10); + break; + case 0x1F19: + bufpush(0x1F11); + break; + case 0x1F1A: + bufpush(0x1F12); + break; + case 0x1F1B: + bufpush(0x1F13); + break; + case 0x1F1C: + bufpush(0x1F14); + break; + case 0x1F1D: + bufpush(0x1F15); + break; + case 0x1F28: + bufpush(0x1F20); + break; + case 0x1F29: + bufpush(0x1F21); + break; + case 0x1F2A: + bufpush(0x1F22); + break; + case 0x1F2B: + bufpush(0x1F23); + break; + case 0x1F2C: + bufpush(0x1F24); + break; + case 0x1F2D: + bufpush(0x1F25); + break; + case 0x1F2E: + bufpush(0x1F26); + break; + case 0x1F2F: + bufpush(0x1F27); + break; + case 0x1F38: + bufpush(0x1F30); + break; + case 0x1F39: + bufpush(0x1F31); + break; + case 0x1F3A: + bufpush(0x1F32); + break; + case 0x1F3B: + bufpush(0x1F33); + break; + case 0x1F3C: + bufpush(0x1F34); + break; + case 0x1F3D: + bufpush(0x1F35); + break; + case 0x1F3E: + bufpush(0x1F36); + break; + case 0x1F3F: + bufpush(0x1F37); + break; + case 0x1F48: + bufpush(0x1F40); + break; + case 0x1F49: + bufpush(0x1F41); + break; + case 0x1F4A: + bufpush(0x1F42); + break; + case 0x1F4B: + bufpush(0x1F43); + break; + case 0x1F4C: + bufpush(0x1F44); + break; + case 0x1F4D: + bufpush(0x1F45); + break; + case 0x1F50: + bufpush(0x03C5); + bufpush(0x0313); + break; + case 0x1F52: + bufpush(0x03C5); + bufpush(0x0313); + bufpush(0x0300); + break; + case 0x1F54: + bufpush(0x03C5); + bufpush(0x0313); + bufpush(0x0301); + break; + case 0x1F56: + bufpush(0x03C5); + bufpush(0x0313); + bufpush(0x0342); + break; + case 0x1F59: + bufpush(0x1F51); + break; + case 0x1F5B: + bufpush(0x1F53); + break; + case 0x1F5D: + bufpush(0x1F55); + break; + case 0x1F5F: + bufpush(0x1F57); + break; + case 0x1F68: + bufpush(0x1F60); + break; + case 0x1F69: + bufpush(0x1F61); + break; + case 0x1F6A: + bufpush(0x1F62); + break; + case 0x1F6B: + bufpush(0x1F63); + break; + case 0x1F6C: + bufpush(0x1F64); + break; + case 0x1F6D: + bufpush(0x1F65); + break; + case 0x1F6E: + bufpush(0x1F66); 
+ break; + case 0x1F6F: + bufpush(0x1F67); + break; + case 0x1F80: + bufpush(0x1F00); + bufpush(0x03B9); + break; + case 0x1F81: + bufpush(0x1F01); + bufpush(0x03B9); + break; + case 0x1F82: + bufpush(0x1F02); + bufpush(0x03B9); + break; + case 0x1F83: + bufpush(0x1F03); + bufpush(0x03B9); + break; + case 0x1F84: + bufpush(0x1F04); + bufpush(0x03B9); + break; + case 0x1F85: + bufpush(0x1F05); + bufpush(0x03B9); + break; + case 0x1F86: + bufpush(0x1F06); + bufpush(0x03B9); + break; + case 0x1F87: + bufpush(0x1F07); + bufpush(0x03B9); + break; + case 0x1F88: + bufpush(0x1F00); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1F89: + bufpush(0x1F01); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1F8A: + bufpush(0x1F02); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1F8B: + bufpush(0x1F03); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1F8C: + bufpush(0x1F04); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1F8D: + bufpush(0x1F05); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1F8E: + bufpush(0x1F06); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1F8F: + bufpush(0x1F07); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1F90: + bufpush(0x1F20); + bufpush(0x03B9); + break; + case 0x1F91: + bufpush(0x1F21); + bufpush(0x03B9); + break; + case 0x1F92: + bufpush(0x1F22); + bufpush(0x03B9); + break; + case 0x1F93: + bufpush(0x1F23); + bufpush(0x03B9); + break; + case 0x1F94: + bufpush(0x1F24); + bufpush(0x03B9); + break; + case 0x1F95: + bufpush(0x1F25); + bufpush(0x03B9); + break; + case 0x1F96: + bufpush(0x1F26); + bufpush(0x03B9); + break; + case 0x1F97: + bufpush(0x1F27); + bufpush(0x03B9); + break; + case 0x1F98: + bufpush(0x1F20); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1F99: + bufpush(0x1F21); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1F9A: + bufpush(0x1F22); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1F9B: + bufpush(0x1F23); + bufpush(0x03B9); 
+ break; + case 0x: + break; + case 0x1F9C: + bufpush(0x1F24); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1F9D: + bufpush(0x1F25); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1F9E: + bufpush(0x1F26); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1F9F: + bufpush(0x1F27); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1FA0: + bufpush(0x1F60); + bufpush(0x03B9); + break; + case 0x1FA1: + bufpush(0x1F61); + bufpush(0x03B9); + break; + case 0x1FA2: + bufpush(0x1F62); + bufpush(0x03B9); + break; + case 0x1FA3: + bufpush(0x1F63); + bufpush(0x03B9); + break; + case 0x1FA4: + bufpush(0x1F64); + bufpush(0x03B9); + break; + case 0x1FA5: + bufpush(0x1F65); + bufpush(0x03B9); + break; + case 0x1FA6: + bufpush(0x1F66); + bufpush(0x03B9); + break; + case 0x1FA7: + bufpush(0x1F67); + bufpush(0x03B9); + break; + case 0x1FA8: + bufpush(0x1F60); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1FA9: + bufpush(0x1F61); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1FAA: + bufpush(0x1F62); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1FAB: + bufpush(0x1F63); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1FAC: + bufpush(0x1F64); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1FAD: + bufpush(0x1F65); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1FAE: + bufpush(0x1F66); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1FAF: + bufpush(0x1F67); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1FB2: + bufpush(0x1F70); + bufpush(0x03B9); + break; + case 0x1FB3: + bufpush(0x03B1); + bufpush(0x03B9); + break; + case 0x1FB4: + bufpush(0x03AC); + bufpush(0x03B9); + break; + case 0x1FB6: + bufpush(0x03B1); + bufpush(0x0342); + break; + case 0x1FB7: + bufpush(0x03B1); + bufpush(0x0342); + bufpush(0x03B9); + break; + case 0x1FB8: + bufpush(0x1FB0); + break; + case 0x1FB9: + bufpush(0x1FB1); + break; + case 0x1FBA: + bufpush(0x1F70); + break; + case 0x1FBB: + 
bufpush(0x1F71); + break; + case 0x1FBC: + bufpush(0x03B1); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1FBE: + bufpush(0x03B9); + break; + case 0x1FC2: + bufpush(0x1F74); + bufpush(0x03B9); + break; + case 0x1FC3: + bufpush(0x03B7); + bufpush(0x03B9); + break; + case 0x1FC4: + bufpush(0x03AE); + bufpush(0x03B9); + break; + case 0x1FC6: + bufpush(0x03B7); + bufpush(0x0342); + break; + case 0x1FC7: + bufpush(0x03B7); + bufpush(0x0342); + bufpush(0x03B9); + break; + case 0x1FC8: + bufpush(0x1F72); + break; + case 0x1FC9: + bufpush(0x1F73); + break; + case 0x1FCA: + bufpush(0x1F74); + break; + case 0x1FCB: + bufpush(0x1F75); + break; + case 0x1FCC: + bufpush(0x03B7); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x1FD2: + bufpush(0x03B9); + bufpush(0x0308); + bufpush(0x0300); + break; + case 0x1FD3: + bufpush(0x03B9); + bufpush(0x0308); + bufpush(0x0301); + break; + case 0x1FD6: + bufpush(0x03B9); + bufpush(0x0342); + break; + case 0x1FD7: + bufpush(0x03B9); + bufpush(0x0308); + bufpush(0x0342); + break; + case 0x1FD8: + bufpush(0x1FD0); + break; + case 0x1FD9: + bufpush(0x1FD1); + break; + case 0x1FDA: + bufpush(0x1F76); + break; + case 0x1FDB: + bufpush(0x1F77); + break; + case 0x1FE2: + bufpush(0x03C5); + bufpush(0x0308); + bufpush(0x0300); + break; + case 0x1FE3: + bufpush(0x03C5); + bufpush(0x0308); + bufpush(0x0301); + break; + case 0x1FE4: + bufpush(0x03C1); + bufpush(0x0313); + break; + case 0x1FE6: + bufpush(0x03C5); + bufpush(0x0342); + break; + case 0x1FE7: + bufpush(0x03C5); + bufpush(0x0308); + bufpush(0x0342); + break; + case 0x1FE8: + bufpush(0x1FE0); + break; + case 0x1FE9: + bufpush(0x1FE1); + break; + case 0x1FEA: + bufpush(0x1F7A); + break; + case 0x1FEB: + bufpush(0x1F7B); + break; + case 0x1FEC: + bufpush(0x1FE5); + break; + case 0x1FF2: + bufpush(0x1F7C); + bufpush(0x03B9); + break; + case 0x1FF3: + bufpush(0x03C9); + bufpush(0x03B9); + break; + case 0x1FF4: + bufpush(0x03CE); + bufpush(0x03B9); + break; + case 0x1FF6: + 
bufpush(0x03C9); + bufpush(0x0342); + break; + case 0x1FF7: + bufpush(0x03C9); + bufpush(0x0342); + bufpush(0x03B9); + break; + case 0x1FF8: + bufpush(0x1F78); + break; + case 0x1FF9: + bufpush(0x1F79); + break; + case 0x1FFA: + bufpush(0x1F7C); + break; + case 0x1FFB: + bufpush(0x1F7D); + break; + case 0x1FFC: + bufpush(0x03C9); + bufpush(0x03B9); + break; + case 0x: + break; + case 0x2126: + bufpush(0x03C9); + break; + case 0x212A: + bufpush(0x006B); + break; + case 0x212B: + bufpush(0x00E5); + break; + case 0x2160: + bufpush(0x2170); + break; + case 0x2161: + bufpush(0x2171); + break; + case 0x2162: + bufpush(0x2172); + break; + case 0x2163: + bufpush(0x2173); + break; + case 0x2164: + bufpush(0x2174); + break; + case 0x2165: + bufpush(0x2175); + break; + case 0x2166: + bufpush(0x2176); + break; + case 0x2167: + bufpush(0x2177); + break; + case 0x2168: + bufpush(0x2178); + break; + case 0x2169: + bufpush(0x2179); + break; + case 0x216A: + bufpush(0x217A); + break; + case 0x216B: + bufpush(0x217B); + break; + case 0x216C: + bufpush(0x217C); + break; + case 0x216D: + bufpush(0x217D); + break; + case 0x216E: + bufpush(0x217E); + break; + case 0x216F: + bufpush(0x217F); + break; + case 0x24B6: + bufpush(0x24D0); + break; + case 0x24B7: + bufpush(0x24D1); + break; + case 0x24B8: + bufpush(0x24D2); + break; + case 0x24B9: + bufpush(0x24D3); + break; + case 0x24BA: + bufpush(0x24D4); + break; + case 0x24BB: + bufpush(0x24D5); + break; + case 0x24BC: + bufpush(0x24D6); + break; + case 0x24BD: + bufpush(0x24D7); + break; + case 0x24BE: + bufpush(0x24D8); + break; + case 0x24BF: + bufpush(0x24D9); + break; + case 0x24C0: + bufpush(0x24DA); + break; + case 0x24C1: + bufpush(0x24DB); + break; + case 0x24C2: + bufpush(0x24DC); + break; + case 0x24C3: + bufpush(0x24DD); + break; + case 0x24C4: + bufpush(0x24DE); + break; + case 0x24C5: + bufpush(0x24DF); + break; + case 0x24C6: + bufpush(0x24E0); + break; + case 0x24C7: + bufpush(0x24E1); + break; + case 0x24C8: + 
bufpush(0x24E2); + break; + case 0x24C9: + bufpush(0x24E3); + break; + case 0x24CA: + bufpush(0x24E4); + break; + case 0x24CB: + bufpush(0x24E5); + break; + case 0x24CC: + bufpush(0x24E6); + break; + case 0x24CD: + bufpush(0x24E7); + break; + case 0x24CE: + bufpush(0x24E8); + break; + case 0x24CF: + bufpush(0x24E9); + break; + case 0xFB00: + bufpush(0x0066); + bufpush(0x0066); + break; + case 0xFB01: + bufpush(0x0066); + bufpush(0x0069); + break; + case 0xFB02: + bufpush(0x0066); + bufpush(0x006C); + break; + case 0xFB03: + bufpush(0x0066); + bufpush(0x0066); + bufpush(0x0069); + break; + case 0xFB04: + bufpush(0x0066); + bufpush(0x0066); + bufpush(0x006C); + break; + case 0xFB05: + bufpush(0x0073); + bufpush(0x0074); + break; + case 0xFB06: + bufpush(0x0073); + bufpush(0x0074); + break; + case 0xFB13: + bufpush(0x0574); + bufpush(0x0576); + break; + case 0xFB14: + bufpush(0x0574); + bufpush(0x0565); + break; + case 0xFB15: + bufpush(0x0574); + bufpush(0x056B); + break; + case 0xFB16: + bufpush(0x057E); + bufpush(0x0576); + break; + case 0xFB17: + bufpush(0x0574); + bufpush(0x056D); + break; + case 0xFF21: + bufpush(0xFF41); + break; + case 0xFF22: + bufpush(0xFF42); + break; + case 0xFF23: + bufpush(0xFF43); + break; + case 0xFF24: + bufpush(0xFF44); + break; + case 0xFF25: + bufpush(0xFF45); + break; + case 0xFF26: + bufpush(0xFF46); + break; + case 0xFF27: + bufpush(0xFF47); + break; + case 0xFF28: + bufpush(0xFF48); + break; + case 0xFF29: + bufpush(0xFF49); + break; + case 0xFF2A: + bufpush(0xFF4A); + break; + case 0xFF2B: + bufpush(0xFF4B); + break; + case 0xFF2C: + bufpush(0xFF4C); + break; + case 0xFF2D: + bufpush(0xFF4D); + break; + case 0xFF2E: + bufpush(0xFF4E); + break; + case 0xFF2F: + bufpush(0xFF4F); + break; + case 0xFF30: + bufpush(0xFF50); + break; + case 0xFF31: + bufpush(0xFF51); + break; + case 0xFF32: + bufpush(0xFF52); + break; + case 0xFF33: + bufpush(0xFF53); + break; + case 0xFF34: + bufpush(0xFF54); + break; + case 0xFF35: + 
bufpush(0xFF55); + break; + case 0xFF36: + bufpush(0xFF56); + break; + case 0xFF37: + bufpush(0xFF57); + break; + case 0xFF38: + bufpush(0xFF58); + break; + case 0xFF39: + bufpush(0xFF59); + break; + case 0xFF3A: + bufpush(0xFF5A); + break; + case 0x10400: + bufpush(0x10428); + break; + case 0x10401: + bufpush(0x10429); + break; + case 0x10402: + bufpush(0x1042A); + break; + case 0x10403: + bufpush(0x1042B); + break; + case 0x10404: + bufpush(0x1042C); + break; + case 0x10405: + bufpush(0x1042D); + break; + case 0x10406: + bufpush(0x1042E); + break; + case 0x10407: + bufpush(0x1042F); + break; + case 0x10408: + bufpush(0x10430); + break; + case 0x10409: + bufpush(0x10431); + break; + case 0x1040A: + bufpush(0x10432); + break; + case 0x1040B: + bufpush(0x10433); + break; + case 0x1040C: + bufpush(0x10434); + break; + case 0x1040D: + bufpush(0x10435); + break; + case 0x1040E: + bufpush(0x10436); + break; + case 0x1040F: + bufpush(0x10437); + break; + case 0x10410: + bufpush(0x10438); + break; + case 0x10411: + bufpush(0x10439); + break; + case 0x10412: + bufpush(0x1043A); + break; + case 0x10413: + bufpush(0x1043B); + break; + case 0x10414: + bufpush(0x1043C); + break; + case 0x10415: + bufpush(0x1043D); + break; + case 0x10416: + bufpush(0x1043E); + break; + case 0x10417: + bufpush(0x1043F); + break; + case 0x10418: + bufpush(0x10440); + break; + case 0x10419: + bufpush(0x10441); + break; + case 0x1041A: + bufpush(0x10442); + break; + case 0x1041B: + bufpush(0x10443); + break; + case 0x1041C: + bufpush(0x10444); + break; + case 0x1041D: + bufpush(0x10445); + break; + case 0x1041E: + bufpush(0x10446); + break; + case 0x1041F: + bufpush(0x10447); + break; + case 0x10420: + bufpush(0x10448); + break; + case 0x10421: + bufpush(0x10449); + break; + case 0x10422: + bufpush(0x1044A); + break; + case 0x10423: + bufpush(0x1044B); + break; + case 0x10424: + bufpush(0x1044C); + break; + case 0x10425: + bufpush(0x1044D); + break; + } diff --git a/src/debug.h b/src/debug.h new 
file mode 100644 index 0000000..af1d017 --- /dev/null +++ b/src/debug.h @@ -0,0 +1,36 @@ +#ifndef __debug_h__ +#define __debug_h__ +#include <stdio.h> +#include <errno.h> +#include <string.h> + +#ifdef NDEBUG +#define debug(M, ...) +#else +#define debug(M, ...) \ + fprintf(stderr, "DEBUG %s:%d: " M "\n", __FILE__, __LINE__, ##__VA_ARGS__) +#endif + +#define clean_errno() (errno == 0 ? "None" : strerror(errno)) + +#define log_err(M, ...) \ + fprintf(stderr, "[ERROR] (%s:%d: errno: %s) " M "\n", __FILE__, __LINE__, \ + clean_errno(), ##__VA_ARGS__) + +#define log_warn(M, ...) \ + fprintf(stderr, "[WARN] (%s:%d: errno: %s) " M "\n", __FILE__, __LINE__, \ + clean_errno(), ##__VA_ARGS__) + +#define log_info(M, ...) fprintf(stderr, "[INFO] (%s:%d) " M "\n", __FILE__, \ + __LINE__, ##__VA_ARGS__) + +#define check(A, M, ...) \ + if(!(A)) { log_err(M, ##__VA_ARGS__); errno=0; goto error; } + +#define sentinel(M, ...) \ + { log_err(M, ##__VA_ARGS__); errno=0; goto error; } + +#define check_debug(A, M, ...) \ + if(!(A)) { debug(M, ##__VA_ARGS__); errno=0; goto error; } + +#endif diff --git a/src/detab.c b/src/detab.c new file mode 100644 index 0000000..e03fcf7 --- /dev/null +++ b/src/detab.c @@ -0,0 +1,48 @@ +#include "bstrlib.h" + +// UTF-8 aware detab: assumes s has no newlines, or only a final newline. +// Return 0 on success, BSTR_ERR if invalid UTF-8. 
+extern int bdetab(bstring s, int utf8) +{ + unsigned char c; + int pos = 0; // a count of characters + int byte = 0; // a count of bytes + int high_chars_to_skip = 0; + int numspaces = 0; + while ((c = bchar(s, byte))) { + if (utf8 && high_chars_to_skip > 0) { + if (c >= 0x80) { + high_chars_to_skip--; + byte++; + } else { + return BSTR_ERR; // invalid utf-8 + } + } else if (c == '\t') { + bdelete(s, byte, 1); // delete tab character + numspaces = 4 - (pos % 4); + binsertch(s, byte, numspaces, ' '); + byte += numspaces; + pos += numspaces; + } else if (c <= 0x80 || !utf8) { + byte++; + pos++; + } else { // multibyte utf8 sequences + if (c >> 1 == 0176) { + high_chars_to_skip = 5; + } else if (c >> 2 == 076) { + high_chars_to_skip = 4; + } else if (c >> 3 == 036) { + high_chars_to_skip = 3; + } else if (c >> 4 == 016) { + high_chars_to_skip = 2; + } else if (c >> 5 == 06) { + high_chars_to_skip = 1; + } else { + return BSTR_ERR; // invalid utf-8 + } + pos++; + byte++; + } + } + return 0; +} diff --git a/src/getopt.c b/src/getopt.c new file mode 100644 index 0000000..321dd9f --- /dev/null +++ b/src/getopt.c @@ -0,0 +1,199 @@ +/* $Id: getopt.c 4022 2008-03-31 06:11:07Z rra $ + * + * Replacement implementation of getopt. + * + * This is a replacement implementation for getopt based on the my_getopt + * distribution by Benjamin Sittler. Only the getopt interface is included, + * since remctl doesn't use GNU long options, and the code has been rearranged + * and reworked somewhat to fit with the remctl coding style. 
+ * + * Copyright 1997, 2000, 2001, 2002 Benjamin Sittler + * Copyright 2008 Russ Allbery <rra@stanford.edu> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include <config.h> +#include <portable/system.h> +#include <portable/getopt.h> + +/* + * If we're running the test suite, rename getopt and the global variables to + * avoid conflicts with the system version. + */ +#if TESTING +# define getopt test_getopt +int test_getopt(int, char **, const char *); +# define optind test_optind +# define opterr test_opterr +# define optopt test_optopt +# define optarg test_optarg +#endif + +/* Initialize global interface variables. */ +int optind = 1; +int opterr = 1; +int optopt = 0; +char *optarg = NULL; + +/* + * This is the plain old UNIX getopt, with GNU-style extensions. If you're + * porting some piece of UNIX software, this is all you need. 
It supports + * GNU-style permutation and optional arguments, but does not support the GNU + * -W extension. + * + * This function is not re-entrant or thread-safe, has static variables, and + * generally isn't a great interface, but normally you only call it once. + */ +int +getopt(int argc, char *argv[], const char *optstring) +{ + const char *p; + size_t offset = 0; + char mode = '\0'; + int colon_mode = 0; + int option = -1; + + /* Holds the current position in the parameter being parsed. */ + static int charind = 0; + + /* + * By default, getopt permutes argv as it scans and leaves all non-options + * at the end. This can be changed with the first character of optstring + * or the environment variable POSIXLY_CORRECT. With a first character of + * '+' or when POSIXLY_CORRECT is set, option processing stops at the + * first non-option. If the first character is '-', each non-option argv + * element is handled as if it were the argument of an option with + * character code 1. mode holds this character. + * + * After the optional leading '+' and '-', optstring may contain ':'. If + * present, missing arguments return ':' instead of '?'. colon_mode holds + * this setting. + */ + if (getenv("POSIXLY_CORRECT") != NULL) { + mode = '+'; + colon_mode = '+'; + } else { + if (optstring[offset] == '+' || optstring[offset] == '-') { + mode = optstring[offset]; + offset++; + } + if (optstring[offset] == ':') { + colon_mode = 1; + offset++; + } + } + + /* + * charind holds where we left off. If it's set, we were in the middle + * of an argv element; if not, we pick up with the next element of + * optind. 
+ */ + optarg = NULL; + if (charind == 0) { + if (optind >= argc) + option = -1; + else if (strcmp(argv[optind], "--") == 0) { + optind++; + option = -1; + } else if (argv[optind][0] != '-' || argv[optind][1] == '\0') { + char *tmp; + int i, j, k, end; + + if (mode == '+') + option = -1; + else if (mode == '-') { + optarg = argv[optind]; + optind++; + option = 1; + } else { + for (i = optind + 1, j = optind; i < argc; i++) + if ((argv[i][0] == '-') && (argv[i][1] != '\0')) { + optind = i; + option = getopt(argc, argv, optstring); + while (i > j) { + --i; + tmp = argv[i]; + end = (charind == 0) ? optind - 1 : optind; + for (k = i; k + 1 <= end; k++) { + argv[k] = argv[k + 1]; + } + argv[end] = tmp; + --optind; + } + break; + } + if (i == argc) + option = -1; + } + return option; + } else { + charind = 1; + } + } + if (charind != 0) { + optopt = argv[optind][charind]; + for (p = optstring + offset; *p != '\0'; p++) + if (optopt == *p) { + p++; + if (*p == ':') { + if (argv[optind][charind + 1] != '\0') { + optarg = &argv[optind][charind + 1]; + optind++; + charind = 0; + } else { + p++; + if (*p != ':') { + charind = 0; + optind++; + if (optind >= argc) { + if (opterr) + fprintf(stderr, "%s: option requires" + " an argument -- %c\n", argv[0], + optopt); + option = colon_mode ? 
':' : '?'; + goto done; + } else { + optarg = argv[optind]; + optind++; + } + } + } + } + option = optopt; + } + if (option == -1) { + if (opterr) + fprintf(stderr, "%s: illegal option -- %c\n", argv[0], optopt); + option = '?'; + } + } + +done: + if (charind != 0) { + charind++; + if (argv[optind][charind] == '\0') { + optind++; + charind = 0; + } + } + if (optind > argc) + optind = argc; + return option; +} diff --git a/src/html.c b/src/html.c new file mode 100644 index 0000000..56d5dbb --- /dev/null +++ b/src/html.c @@ -0,0 +1,276 @@ +#include <stdlib.h> +#include <stdio.h> +#include <stdbool.h> +#include "bstrlib.h" +#include "stmd.h" +#include "debug.h" +#include "scanners.h" + +// Functions to convert block and inline lists to HTML strings. + +// Escape special characters in HTML. More efficient than +// three calls to bfindreplace. If preserve_entities is set, +// existing entities are left alone. +static bstring escape_html(bstring inp, bool preserve_entities) +{ + int pos = 0; + int match; + char c; + bstring escapable = blk2bstr("&<>\"", 4); + bstring ent; + bstring s = bstrcpy(inp); + while ((pos = binchr(s, pos, escapable)) != BSTR_ERR) { + c = bchar(s,pos); + switch (c) { + case '<': + bdelete(s, pos, 1); + ent = blk2bstr("<", 4); + binsert(s, pos, ent, ' '); + bdestroy(ent); + pos += 4; + break; + case '>': + bdelete(s, pos, 1); + ent = blk2bstr(">", 4); + binsert(s, pos, ent, ' '); + bdestroy(ent); + pos += 4; + break; + case '&': + if (preserve_entities && (match = scan_entity(s, pos))) { + pos += match; + } else { + bdelete(s, pos, 1); + ent = blk2bstr("&", 5); + binsert(s, pos, ent, ' '); + bdestroy(ent); + pos += 5; + } + break; + case '"': + bdelete(s, pos, 1); + ent = blk2bstr(""", 6); + binsert(s, pos, ent, ' '); + bdestroy(ent); + pos += 6; + break; + default: + bdelete(s, pos, 1); + log_err("unexpected character %02x", c); + } + } + bdestroy(escapable); + return s; +} + +static inline void cr(bstring buffer) +{ + int c = bchar(buffer, 
blength(buffer) - 1); + if (c != '\n' && c) { + bconchar(buffer, '\n'); + } +} + +// Convert a block list to HTML. Returns 0 on success, and sets result. +extern int blocks_to_html(block* b, bstring* result, bool tight) +{ + bstring contents = NULL; + bstring escaped, escaped2; + struct bstrList * info_words; + struct ListData * data; + bstring mbstart; + bstring html = blk2bstr("", 0); + + while(b != NULL) { + switch(b->tag) { + case document: + check(blocks_to_html(b->children, &contents, false) == 0, + "error converting blocks to html"); + bformata(html, "%s", contents->data); + bdestroy(contents); + break; + case paragraph: + check(inlines_to_html(b->inline_content, &contents) == 0, + "error converting inlines to html"); + if (tight) { + bformata(html, "%s", contents->data); + } else { + cr(html); + bformata(html, "<p>%s</p>", contents->data); + cr(html); + } + bdestroy(contents); + break; + case block_quote: + check(blocks_to_html(b->children, &contents, false) == 0, + "error converting blocks to html"); + cr(html); + bformata(html, "<blockquote>\n%s</blockquote>", contents->data); + cr(html); + bdestroy(contents); + break; + case list_item: + check(blocks_to_html(b->children, &contents, tight) == 0, + "error converting blocks to html"); + brtrimws(contents); + cr(html); + bformata(html, "<li>%s</li>", contents->data); + cr(html); + bdestroy(contents); + break; + case list: + // make sure a list starts at the beginning of the line: + cr(html); + data = &(b->attributes.list_data); + check(blocks_to_html(b->children, &contents, data->tight) == 0, + "error converting blocks to html"); + mbstart = bformat(" start=\"%d\"", data->start); + bformata(html, "<%s%s>\n%s</%s>", + data->list_type == bullet ? "ul" : "ol", + data->start == 1 ? "" : (char*) mbstart->data, + contents->data, + data->list_type == bullet ? 
"ul" : "ol"); + cr(html); + bdestroy(contents); + bdestroy(mbstart); + break; + case atx_header: + case setext_header: + check(inlines_to_html(b->inline_content, &contents) == 0, + "error converting inlines to html"); + cr(html); + bformata(html, "<h%d>%s</h%d>", + b->attributes.header_level, + contents->data, + b->attributes.header_level); + cr(html); + bdestroy(contents); + break; + case indented_code: + escaped = escape_html(b->string_content, false); + cr(html); + bformata(html, "<pre><code>%s</code></pre>", escaped->data); + cr(html); + bdestroy(escaped); + break; + case fenced_code: + escaped = escape_html(b->string_content, false); + cr(html); + bformata(html, "<pre"); + if (blength(b->attributes.fenced_code_data.info) > 0) { + escaped2 = escape_html(b->attributes.fenced_code_data.info, true); + info_words = bsplit(escaped2, ' '); + bformata(html, " class=\"%s\"", info_words->entry[0]->data); + bdestroy(escaped2); + bstrListDestroy(info_words); + } + bformata(html, "><code>%s</code></pre>", escaped->data); + cr(html); + bdestroy(escaped); + break; + case html_block: + bformata(html, "%s", b->string_content->data); + break; + case hrule: + bformata(html, "<hr />"); + cr(html); + break; + case reference_def: + break; + default: + log_warn("block type %d not implemented\n", b->tag); + break; + } + b = b->next; + } + *result = html; + return 0; + error: + return -1; +} + +// Convert an inline list to HTML. Returns 0 on success, and sets result. 
+extern int inlines_to_html(inl* ils, bstring* result) +{ + bstring contents = NULL; + bstring html = blk2bstr("", 0); + bstring mbtitle, escaped, escaped2; + + while(ils != NULL) { + switch(ils->tag) { + case str: + escaped = escape_html(ils->content.literal, false); + bformata(html, "%s", escaped->data); + bdestroy(escaped); + break; + case linebreak: + bformata(html, "<br />\n"); + break; + case softbreak: + bformata(html, "\n"); + break; + case code: + escaped = escape_html(ils->content.literal, false); + bformata(html, "<code>%s</code>", escaped->data); + bdestroy(escaped); + break; + case raw_html: + case entity: + bformata(html, "%s", ils->content.literal->data); + break; + case link: + check(inlines_to_html(ils->content.inlines, &contents) == 0, + "error converting inlines to html"); + if (blength(ils->content.linkable.title) > 0) { + escaped = escape_html(ils->content.linkable.title, true); + mbtitle = bformat(" title=\"%s\"", escaped->data); + bdestroy(escaped); + } else { + mbtitle = blk2bstr("",0); + } + escaped = escape_html(ils->content.linkable.url, true); + bformata(html, "<a href=\"%s\"%s>%s</a>", + escaped->data, + mbtitle->data, + contents->data); + bdestroy(escaped); + bdestroy(mbtitle); + bdestroy(contents); + break; + case image: + check(inlines_to_html(ils->content.inlines, &contents) == 0, + "error converting inlines to html"); + escaped = escape_html(ils->content.linkable.url, true); + escaped2 = escape_html(contents, false); + bdestroy(contents); + bformata(html, "<img src=\"%s\" alt=\"%s\"", + escaped->data, escaped2->data); + bdestroy(escaped); + bdestroy(escaped2); + if (blength(ils->content.linkable.title) > 0) { + escaped = escape_html(ils->content.linkable.title, true); + bformata(html, " title=\"%s\"", escaped->data); + bdestroy(escaped); + } + bformata(html, " />"); + break; + case strong: + check(inlines_to_html(ils->content.inlines, &contents) == 0, + "error converting inlines to html"); + bformata(html, "<strong>%s</strong>", 
contents->data); + bdestroy(contents); + break; + case emph: + check(inlines_to_html(ils->content.inlines, &contents) == 0, + "error converting inlines to html"); + bformata(html, "<em>%s</em>", contents->data); + bdestroy(contents); + break; + } + ils = ils->next; + } + *result = html; + return 0; + error: + return -1; +} diff --git a/src/inlines.c b/src/inlines.c new file mode 100644 index 0000000..9e35178 --- /dev/null +++ b/src/inlines.c @@ -0,0 +1,998 @@ +#include <stdlib.h> +#include <stdio.h> +#include <stdbool.h> +#include <ctype.h> +#include "bstrlib.h" +#include "stmd.h" +#include "uthash.h" +#include "debug.h" +#include "scanners.h" +#include "utf8.h" + +extern void free_reference(reference *ref) { + bdestroy(ref->label); + bdestroy(ref->url); + bdestroy(ref->title); + free(ref); +} + +extern void free_reference_map(reference **refmap) { + /* free the hash table contents */ + reference *s; + reference *tmp; + if (refmap != NULL) { + HASH_ITER(hh, *refmap, s, tmp) { + HASH_DEL(*refmap, s); + free_reference(s); + } + free(refmap); + } +} + +// normalize reference: collapse internal whitespace to single space, +// remove leading/trailing whitespace, case fold +static bstring normalize_reference(bstring s) +{ + bstring normalized = case_fold(s); + int pos = 0; + int startpos; + char c; + while ((c = bchar(normalized, pos))) { + if (isspace(c)) { + startpos = pos; + // skip til next non-space + pos++; + while (isspace(bchar(s, pos))) { + pos++; + } + bdelete(normalized, startpos, pos - startpos); + binsertch(normalized, startpos, 1, ' '); + pos = startpos + 1; + } + pos++; + } + btrimws(normalized); + return normalized; +} + +// Returns reference if refmap contains a reference with matching +// label, otherwise NULL. 
+extern reference* lookup_reference(reference** refmap, bstring lab) +{ + reference * ref = NULL; + bstring label = normalize_reference(lab); + if (refmap != NULL) { + HASH_FIND_STR(*refmap, (char*) label->data, ref); + } + bdestroy(label); + return ref; +} + +extern reference* make_reference(bstring label, bstring url, bstring title) +{ + reference * ref; + ref = malloc(sizeof(reference)); + ref->label = normalize_reference(label); + ref->url = bstrcpy(url); + ref->title = bstrcpy(title); + return ref; +} + +extern void add_reference(reference** refmap, reference* ref) +{ + reference * t = NULL; + HASH_FIND(hh, *refmap, (char*) ref->label->data, + (unsigned) blength(ref->label), t); + if (t == NULL) { + HASH_ADD_KEYPTR(hh, *refmap, (char*) ref->label->data, + (unsigned) blength(ref->label), ref); + } else { + free_reference(ref); // we free this now since it won't be in the refmap + } +} + +// Create an inline with a linkable string value. +inline static inl* make_linkable(int t, inl* label, bstring url, bstring title) +{ + inl* e = (inl*) malloc(sizeof(inl)); + e->tag = t; + e->content.linkable.label = label; + e->content.linkable.url = url; + e->content.linkable.title = title; + e->next = NULL; + return e; +} + +inline static inl* make_inlines(int t, inl* contents) +{ + inl* e = (inl*) malloc(sizeof(inl)); + e->tag = t; + e->content.inlines = contents; + e->next = NULL; + return e; +} + +// Create an inline with a literal string value. +inline static inl* make_literal(int t, bstring s) +{ + inl* e = (inl*) malloc(sizeof(inl)); + e->tag = t; + e->content.literal = s; + e->next = NULL; + return e; +} + +// Create an inline with no value. +inline static inl* make_simple(int t) +{ + inl* e = (inl*) malloc(sizeof(inl)); + e->tag = t; + e->next = NULL; + return e; +} + +// Macros for creating various kinds of inlines. 
+#define make_str(s) make_literal(str, s) +#define make_code(s) make_literal(code, s) +#define make_raw_html(s) make_literal(raw_html, s) +#define make_entity(s) make_literal(entity, s) +#define make_linebreak() make_simple(linebreak) +#define make_softbreak() make_simple(softbreak) +#define make_link(label, url, title) make_linkable(link, label, url, title) +#define make_image(alt, url, title) make_linkable(image, alt, url, title) +#define make_emph(contents) make_inlines(emph, contents) +#define make_strong(contents) make_inlines(strong, contents) + +// Free an inline list. +extern void free_inlines(inl* e) +{ + inl * next; + while (e != NULL) { + switch (e->tag){ + case str: + case raw_html: + case code: + case entity: + bdestroy(e->content.literal); + break; + case linebreak: + case softbreak: + break; + case link: + case image: + bdestroy(e->content.linkable.url); + bdestroy(e->content.linkable.title); + free_inlines(e->content.linkable.label); + break; + case emph: + case strong: + free_inlines(e->content.inlines); + break; + default: + break; + } + next = e->next; + free(e); + e = next; + } +} + +// Append inline list b to the end of inline list a. +// Return pointer to head of new list. +inline static inl* append_inlines(inl* a, inl* b) +{ + if (a == NULL) { // NULL acts like an empty list + return b; + } + inl* cur = a; + while (cur->next) { + cur = cur->next; + } + cur->next = b; + return a; +} + +// Make a 'subject' from an input string. +static subject* make_subject(bstring s, reference** refmap) +{ + subject* e = (subject*) malloc(sizeof(subject)); + // remove final whitespace + brtrimws(s); + e->buffer = s; + e->pos = 0; + e->label_nestlevel = 0; + e->reference_map = refmap; + return e; +} + +inline static int isbacktick(int c) +{ + return (c == '`'); +} + +// Return the next character in the subject, without advancing. +// Return 0 if at the end of the subject. 
+#define peek_char(subj) bchar(subj->buffer, subj->pos) + +// Return true if there are more characters in the subject. +inline static int is_eof(subject* subj) +{ + return (subj->pos >= blength(subj->buffer)); +} + +// Advance the subject. Doesn't check for eof. +#define advance(subj) subj->pos += 1 + +// Take characters while a predicate holds, and return a string. +inline static bstring take_while(subject* subj, int (*f)(int)) +{ + unsigned char c; + int startpos = subj->pos; + int len = 0; + while ((c = peek_char(subj)) && (*f)(c)) { + advance(subj); + len++; + } + return bmidstr(subj->buffer, startpos, len); +} + +// Take one character and return a string, or NULL if eof. +inline static bstring take_one(subject* subj) +{ + int startpos = subj->pos; + if (is_eof(subj)){ + return NULL; + } else { + advance(subj); + return bmidstr(subj->buffer, startpos, 1); + } +} + +// Try to process a backtick code span that began with a +// span of ticks of length openticklength length (already +// parsed). Return 0 if you don't find matching closing +// backticks, otherwise return the position in the subject +// after the closing backticks. +static int scan_to_closing_backticks(subject* subj, int openticklength) +{ + // read non backticks + char c; + while ((c = peek_char(subj)) && c != '`') { + advance(subj); + } + if (is_eof(subj)) { + return 0; // did not find closing ticks, return 0 + } + int numticks = 0; + while (peek_char(subj) == '`') { + advance(subj); + numticks++; + } + if (numticks != openticklength){ + return(scan_to_closing_backticks(subj, openticklength)); + } + return (subj->pos); +} + +// Destructively modify bstring, collapsing consecutive +// space and newline characters into a single space. 
+static int normalize_whitespace(bstring s) +{ + bool last_char_was_space = false; + int pos = 0; + char c; + while ((c = bchar(s, pos))) { + switch (c) { + case ' ': + if (last_char_was_space) { + bdelete(s, pos, 1); + } else { + pos++; + } + last_char_was_space = true; + break; + case '\n': + if (last_char_was_space) { + bdelete(s, pos, 1); + } else { + bdelete(s, pos, 1); + binsertch(s, pos, 1, ' '); + pos++; + } + last_char_was_space = true; + break; + default: + pos++; + last_char_was_space = false; + } + } + return 0; +} + +// Parse backtick code section or raw backticks, return an inline. +// Assumes that the subject has a backtick at the current position. +static inl* handle_backticks(subject *subj) +{ + bstring openticks = take_while(subj, isbacktick); + bstring result; + int ticklength = blength(openticks); + int startpos = subj->pos; + int endpos = scan_to_closing_backticks(subj, ticklength); + if (endpos == 0) { // not found + subj->pos = startpos; // rewind + return make_str(openticks); + } else { + bdestroy(openticks); + result = bmidstr(subj->buffer, startpos, endpos - startpos - ticklength); + btrimws(result); + normalize_whitespace(result); + return make_code(result); + } +} + +// Scan ***, **, or * and return number scanned, or 0. +// Don't advance position. +static int scan_delims(subject* subj, char c, bool * can_open, bool * can_close) +{ + int numdelims = 0; + char char_before, char_after; + int startpos = subj->pos; + + char_before = subj->pos == 0 ? '\n' : bchar(subj->buffer, subj->pos - 1); + while (peek_char(subj) == c) { + numdelims++; + advance(subj); + } + char_after = peek_char(subj); + *can_open = numdelims > 0 && numdelims <= 3 && !isspace(char_after); + *can_close = numdelims > 0 && numdelims <= 3 && !isspace(char_before); + if (c == '_') { + *can_open = *can_open && !isalnum(char_before); + *can_close = *can_close && !isalnum(char_after); + } + subj->pos = startpos; + return numdelims; +} + +// Parse strong/emph or a fallback. 
+// Assumes the subject has '_' or '*' at the current position. +static inl* handle_strong_emph(subject* subj, char c) +{ + bool can_open, can_close; + inl * result = NULL; + inl ** last = malloc(sizeof(inl *)); + inl * new; + inl * il; + inl * first_head = NULL; + inl * first_close = NULL; + int first_close_delims = 0; + int numdelims; + + *last = NULL; + + numdelims = scan_delims(subj, c, &can_open, &can_close); + subj->pos += numdelims; + + new = make_str(bmidstr(subj->buffer, subj->pos - numdelims, numdelims)); + *last = new; + first_head = new; + result = new; + + if (!can_open || numdelims == 0) { + goto done; + } + + switch (numdelims) { + case 1: + while (true) { + numdelims = scan_delims(subj, c, &can_open, &can_close); + if (numdelims >= 1 && can_close) { + subj->pos += 1; + first_head->tag = emph; + bdestroy(first_head->content.literal); + first_head->content.inlines = first_head->next; + first_head->next = NULL; + goto done; + } else { + if (!parse_inline(subj, last)) { + goto done; + } + } + } + break; + case 2: + while (true) { + numdelims = scan_delims(subj, c, &can_open, &can_close); + if (numdelims >= 2 && can_close) { + subj->pos += 2; + first_head->tag = strong; + bdestroy(first_head->content.literal); + first_head->content.inlines = first_head->next; + first_head->next = NULL; + goto done; + } else { + if (!parse_inline(subj, last)) { + goto done; + } + } + } + break; + case 3: + while (true) { + numdelims = scan_delims(subj, c, &can_open, &can_close); + if (can_close && numdelims >= 1 && numdelims <= 3 && + numdelims != first_close_delims) { + new = make_str(bmidstr(subj->buffer, subj->pos, numdelims)); + append_inlines(*last, new); + *last = new; + if (numdelims == 3) { + numdelims = 1; + } + subj->pos += numdelims; + if (first_close) { + first_head->tag = first_close_delims == 1 ? strong : emph; + bdestroy(first_head->content.literal); + first_head->content.inlines = + make_inlines(first_close_delims == 1 ? 
emph : strong, + first_head->next); + + il = first_head->next; + while (il->next && il->next != first_close) { + il = il->next; + } + il->next = NULL; + + first_head->content.inlines->next = first_close->next; + + il = first_head->content.inlines; + while (il->next && il->next != *last) { + il = il->next; + } + il->next = NULL; + free_inlines(*last); + + first_close->next = NULL; + free_inlines(first_close); + first_head->next = NULL; + goto done; + } else { + first_close = *last; + first_close_delims = numdelims; + } + } else { + if (!parse_inline(subj, last)) { + goto done; + } + } + } + break; + default: + goto done; + } + + done: + free(last); + return result; +} + +// Parse backslash-escape or just a backslash, returning an inline. +static inl* handle_backslash(subject *subj) +{ + advance(subj); + unsigned char nextchar = peek_char(subj); + if (ispunct(nextchar)) { // only ascii symbols and newline can be escaped + advance(subj); + return make_str(bformat("%c", nextchar)); + } else if (nextchar == '\n') { + advance(subj); + return make_linebreak(); + } else { + return make_str(bfromcstr("\\")); + } +} + +// Parse an entity or a regular "&" string. +// Assumes the subject has an '&' character at the current position. +static inl* handle_entity(subject* subj) +{ + int match; + inl * result; + match = scan_entity(subj->buffer, subj->pos); + if (match) { + result = make_entity(bmidstr(subj->buffer, subj->pos, match)); + subj->pos += match; + } else { + advance(subj); + result = make_str(bfromcstr("&")); + } + return result; +} + +// Like make_str, but parses entities. +// Returns an inline sequence consisting of str and entity elements. 
+static inl * make_str_with_entities(bstring s) +{ + inl * result = NULL; + inl * new; + int searchpos; + char c; + subject * subj = make_subject(s, NULL); + + while ((c = peek_char(subj))) { + switch (c) { + case '&': + new = handle_entity(subj); + break; + default: + searchpos = bstrchrp(subj->buffer, '&', subj->pos); + if (searchpos == BSTR_ERR) { + searchpos = blength(subj->buffer); + } + new = make_str(bmidstr(subj->buffer, subj->pos, searchpos - subj->pos)); + subj->pos = searchpos; + } + result = append_inlines(result, new); + } + free(subj); + return result; +} + +// Destructively unescape a string: remove backslashes before punctuation chars. +extern int unescape(bstring url) +{ + // remove backslashes before punctuation chars: + int searchpos = 0; + while ((searchpos = bstrchrp(url, '\\', searchpos)) != BSTR_ERR) { + if (ispunct(bchar(url, searchpos + 1))) { + bdelete(url, searchpos, 1); + } else { + searchpos++; + } + } + return 0; +} + +// Clean a URL: remove surrounding whitespace and surrounding <>, +// and remove \ that escape punctuation. +static int clean_url(bstring url) +{ + // remove surrounding <> if any: + int urllength = blength(url); + btrimws(url); + if (bchar(url, 0) == '<' && bchar(url, urllength - 1) == '>') { + bdelete(url, 0, 1); + bdelete(url, urllength - 2, 1); + } + unescape(url); + return 0; +} + +// Clean a title: remove surrounding quotes and remove \ that escape punctuation. +static int clean_title(bstring title) +{ + // remove surrounding quotes if any: + int titlelength = blength(title); + if ((bchar(title, 0) == '\'' && bchar(title, titlelength - 1) == '\'') || + (bchar(title, 0) == '(' && bchar(title, titlelength - 1) == ')') || + (bchar(title, 0) == '"' && bchar(title, titlelength - 1) == '"')) { + bdelete(title, 0, 1); + bdelete(title, titlelength - 2, 1); + } + unescape(title); + return 0; +} + +// Parse an autolink or HTML tag. +// Assumes the subject has a '<' character at the current position. 
+static inl* handle_pointy_brace(subject* subj) +{ + int matchlen = 0; + bstring contents; + inl* result; + + advance(subj); // advance past first < + // first try to match a URL autolink + matchlen = scan_autolink_uri(subj->buffer, subj->pos); + if (matchlen > 0) { + contents = bmidstr(subj->buffer, subj->pos, matchlen - 1); + subj->pos += matchlen; + result = make_link(make_str_with_entities(contents), + bstrcpy(contents), bfromcstr("")); + bdestroy(contents); + return result; + } + // next try to match an email autolink + matchlen = scan_autolink_email(subj->buffer, subj->pos); + if (matchlen > 0) { + contents = bmidstr(subj->buffer, subj->pos, matchlen - 1); + subj->pos += matchlen; + result = make_link(make_str_with_entities(contents), + bformat("mailto:%s", contents->data), + bfromcstr("")); + bdestroy(contents); + return result; + } + // finally, try to match an html tag + matchlen = scan_html_tag(subj->buffer, subj->pos); + if (matchlen > 0) { + contents = bmidstr(subj->buffer, subj->pos, matchlen); + binsertch(contents, 0, 1, '<'); + subj->pos += matchlen; + return make_raw_html(contents); + } else {// if nothing matches, just return the opening <: + return make_str(bfromcstr("<")); + } +} + +// Parse a link label. Returns 1 if successful. +// Unless raw_label is null, it is set to point to the raw contents of the []. +// Assumes the subject has a '[' character at the current position. +// Returns 0 and does not advance if no matching ] is found. +// Note the precedence: code backticks have precedence over label bracket +// markers, which have precedence over *, _, and other inline formatting +// markers. So, 2 below contains a link while 1 does not: +// 1. [a link `with a ](/url)` character +// 2. 
[a link *with emphasized ](/url) text* +static int link_label(subject* subj, bstring* raw_label) +{ + int nestlevel = 0; + inl* tmp = NULL; + bstring raw; + int startpos = subj->pos; + if (subj->label_nestlevel) { + // if we've already checked to the end of the subject + // for a label, even with a different starting [, we + // know we won't find one here and we can just return. + // Note: nestlevel 1 would be: [foo [bar] + // nestlevel 2 would be: [foo [bar [baz] + subj->label_nestlevel--; + return 0; + } + advance(subj); // advance past [ + char c; + while ((c = peek_char(subj)) && (c != ']' || nestlevel > 0)) { + switch (c) { + case '`': + tmp = handle_backticks(subj); + free_inlines(tmp); + break; + case '<': + tmp = handle_pointy_brace(subj); + free_inlines(tmp); + break; + case '[': // nested [] + nestlevel++; + advance(subj); + break; + case ']': // nested [] + nestlevel--; + advance(subj); + break; + case '\\': + advance(subj); + if (ispunct(peek_char(subj))) { + advance(subj); + } + break; + default: + advance(subj); + } + } + if (c == ']') { + if (raw_label != NULL) { + raw = bmidstr(subj->buffer, startpos + 1, subj->pos - (startpos + 1)); + *raw_label = raw; + } + subj->label_nestlevel = 0; + advance(subj); // advance past ] + return 1; + } else { + if (c == 0) { + subj->label_nestlevel = nestlevel; + } + subj->pos = startpos; // rewind + return 0; + } +} + +// Parse a link or the link portion of an image, or return a fallback. 
+static inl* handle_left_bracket(subject* subj) +{ + inl* lab = NULL; + inl* result = NULL; + reference* ref; + int n; + int sps; + int found_label; + int endlabel, starturl, endurl, starttitle, endtitle, endall; + bstring url, title, rawlabel, reflabel; + bstring rawlabel2 = NULL; + found_label = link_label(subj, &rawlabel); + endlabel = subj->pos; + if (found_label) { + if (peek_char(subj) == '(' && + ((sps = scan_spacechars(subj->buffer, subj->pos + 1)) > -1) && + ((n = scan_link_url(subj->buffer, subj->pos + 1 + sps)) > -1)) { + // try to parse an explicit link: + starturl = subj->pos + 1 + sps; // after ( + endurl = starturl + n; + starttitle = endurl + scan_spacechars(subj->buffer, endurl); + // ensure there are spaces btw url and title + endtitle = (starttitle == endurl) ? starttitle : + starttitle + scan_link_title(subj->buffer, starttitle); + endall = endtitle + scan_spacechars(subj->buffer, endtitle); + if (bchar(subj->buffer, endall) == ')') { + subj->pos = endall + 1; + url = bmidstr(subj->buffer, starturl, endurl - starturl); + clean_url(url); + title = bmidstr(subj->buffer, starttitle, endtitle - starttitle); + clean_title(title); + lab = parse_inlines(rawlabel, NULL); + bdestroy(rawlabel); + return make_link(lab, url, title); + } else { + // if we get here, we matched a label but didn't get further: + subj->pos = endlabel; + lab = parse_inlines(rawlabel, subj->reference_map); + bdestroy(rawlabel); + result = append_inlines(make_str(bfromcstr("[")), + append_inlines(lab, + make_str(bfromcstr("]")))); + return result; + } + } else { + // Check for reference link. 
+ // First, see if there's another label: + subj->pos = subj->pos + scan_spacechars(subj->buffer, endlabel); + reflabel = rawlabel; + // if followed by a nonempty link label, we change reflabel to it: + if (peek_char(subj) == '[' && + link_label(subj, &rawlabel2)) { + if (blength(rawlabel2) > 0) { + reflabel = rawlabel2; + } + } else { + subj->pos = endlabel; + } + // lookup rawlabel in subject->reference_map: + ref = lookup_reference(subj->reference_map, reflabel); + if (ref != NULL) { // found + lab = parse_inlines(rawlabel, NULL); + result = make_link(lab, bstrcpy(ref->url), bstrcpy(ref->title)); + } else { + subj->pos = endlabel; + lab = parse_inlines(rawlabel, subj->reference_map); + result = append_inlines(make_str(bfromcstr("[")), + append_inlines(lab, make_str(bfromcstr("]")))); + } + bdestroy(rawlabel); + bdestroy(rawlabel2); + return result; + } + } + // If we fall through to here, it means we didn't match a link: + advance(subj); // advance past [ + return make_str(bfromcstr("[")); +} + +// Parse a hard or soft linebreak, returning an inline. +// Assumes the subject has a newline at the current position. +static inl* handle_newline(subject *subj) +{ + int nlpos = subj->pos; + // skip over newline + advance(subj); + // skip spaces at beginning of line + while (peek_char(subj) == ' ') { + advance(subj); + } + if (nlpos > 1 && + bchar(subj->buffer, nlpos - 1) == ' ' && + bchar(subj->buffer, nlpos - 2) == ' ') { + return make_linebreak(); + } else { + return make_softbreak(); + } +} + +inline static int not_eof(subject* subj) +{ + return !is_eof(subj); +} + +// Parse inlines while a predicate is satisfied. Return inlines. +extern inl* parse_inlines_while(subject* subj, int (*f)(subject*)) +{ + inl* result = NULL; + inl** last = &result; + while ((*f)(subj) && parse_inline(subj, last)) { + } + return result; +} + +// Parse an inline, advancing subject, and add it to last element. +// Adjust tail to point to new last element of list. 
+// Return 0 if no inline can be parsed, 1 otherwise. +extern int parse_inline(subject* subj, inl ** last) +{ + inl* new = NULL; + bstring contents; + bstring special_chars; + unsigned char c; + int endpos; + c = peek_char(subj); + if (c == 0) { + return 0; + } + switch(c){ + case '\n': + new = handle_newline(subj); + break; + case '`': + new = handle_backticks(subj); + break; + case '\\': + new = handle_backslash(subj); + break; + case '&': + new = handle_entity(subj); + break; + case '<': + new = handle_pointy_brace(subj); + break; + case '_': + if (subj->pos > 0 && (isalnum(bchar(subj->buffer, subj->pos - 1)) || + bchar(subj->buffer, subj->pos - 1) == '_')) { + new = make_str(take_one(subj)); + } else { + new = handle_strong_emph(subj, '_'); + } + break; + case '*': + new = handle_strong_emph(subj, '*'); + break; + case '[': + new = handle_left_bracket(subj); + break; + case '!': + advance(subj); + if (peek_char(subj) == '[') { + new = handle_left_bracket(subj); + if (new != NULL && new->tag == link) { + new->tag = image; + } else { + new = append_inlines(make_str(bfromcstr("!")), new); + } + } else { + new = make_str(bfromcstr("!")); + } + break; + default: + // we read until we hit a special character + special_chars = bfromcstr("\n\\`&_*[]<!"); + endpos = binchr(subj->buffer, subj->pos, special_chars); + bdestroy(special_chars); + if (endpos == subj->pos) { + // current char is special: read a 1-character str + contents = take_one(subj); + } else if (endpos == BSTR_ERR) { + // special char not found, take whole rest of buffer: + endpos = subj->buffer->slen; + contents = bmidstr(subj->buffer, subj->pos, endpos - subj->pos); + subj->pos = endpos; + } else { + // take buffer from subj->pos to endpos to str. + contents = bmidstr(subj->buffer, subj->pos, endpos - subj->pos); + subj->pos = endpos; + // if we're at a newline, strip trailing spaces. 
+ if (peek_char(subj) == '\n') { + brtrimws(contents); + } + } + new = make_str(contents); + } + if (*last == NULL) { + *last = new; + } else { + append_inlines(*last, new); + } + return 1; +} + +extern inl* parse_inlines(bstring input, reference** refmap) +{ + subject * subj = make_subject(input, refmap); + inl * result = parse_inlines_while(subj, not_eof); + free(subj); + return result; +} + +// Parse zero or more space characters, including at most one newline. +void spnl(subject* subj) +{ + bool seen_newline = false; + while (peek_char(subj) == ' ' || + (!seen_newline && + (seen_newline = peek_char(subj) == '\n'))) { + advance(subj); + } +} + +// Parse reference. Assumes string begins with '[' character. +// Modify refmap if a reference is encountered. +// Return 0 if no reference found, otherwise position of subject +// after reference is parsed. +extern int parse_reference(bstring input, reference** refmap) +{ + subject * subj = make_subject(input, NULL); + bstring lab = NULL; + bstring url = NULL; + bstring title = NULL; + int matchlen = 0; + int beforetitle; + reference * new = NULL; + int newpos; + + // parse label: + if (!link_label(subj, &lab)) { + free(subj); + return 0; + } + // colon: + if (peek_char(subj) == ':') { + advance(subj); + } else { + free(subj); + bdestroy(lab); + return 0; + } + // parse link url: + spnl(subj); + matchlen = scan_link_url(subj->buffer, subj->pos); + if (matchlen) { + url = bmidstr(subj->buffer, subj->pos, matchlen); + clean_url(url); + subj->pos += matchlen; + } else { + free(subj); + bdestroy(lab); + bdestroy(url); + return 0; + } + // parse optional link_title + beforetitle = subj->pos; + spnl(subj); + matchlen = scan_link_title(subj->buffer, subj->pos); + if (matchlen) { + title = bmidstr(subj->buffer, subj->pos, matchlen); + clean_title(title); + subj->pos += matchlen; + } else { + subj->pos = beforetitle; + title = bfromcstr(""); + } + // parse final spaces and newline: + while (peek_char(subj) == ' ') { + 
advance(subj); + } + if (peek_char(subj) == '\n') { + advance(subj); + } else if (peek_char(subj) != 0) { + free(subj); + bdestroy(lab); + bdestroy(url); + bdestroy(title); + return 0; + } + // insert reference into refmap + new = make_reference(lab, url, title); + add_reference(refmap, new); + + newpos = subj->pos; + free(subj); + bdestroy(lab); + bdestroy(url); + bdestroy(title); + return newpos; +} + diff --git a/src/main.c b/src/main.c new file mode 100644 index 0000000..40a63bc --- /dev/null +++ b/src/main.c @@ -0,0 +1,102 @@ +#include <stdlib.h> +#include <stdio.h> +#include "bstrlib.h" +#include "stmd.h" +#include "debug.h" + +void print_usage() +{ + printf("Usage: stmd [FILE*]\n"); + printf("Options: --help, -h Print usage information\n"); + printf(" --ast Print AST instead of HTML\n"); + printf(" --version Print version\n"); +} + +int main(int argc, char *argv[]) { + int i; + bool ast = false; + int g = 0; + int numfps = 0; + int files[argc]; + + for (i=1; i < argc; i++) { + if (strcmp(argv[i], "--version") == 0) { + printf("stmd %s", VERSION); + printf(" - standard markdown converter (c) 2014 John MacFarlane\n"); + exit(0); + } else if ((strcmp(argv[i], "--help") == 0) || + (strcmp(argv[i], "-h") == 0)) { + print_usage(); + exit(0); + } else if (strcmp(argv[i], "--ast") == 0) { + ast = true; + } else if (*argv[i] == '-') { + print_usage(); + exit(1); + } else { // treat as file argument + files[g] = i; + g++; + } + } + + numfps = g; + bstring s = NULL; + bstring html; + g = 0; + block * cur = make_document(); + int linenum = 1; + extern int errno; + FILE * fp = NULL; + + if (numfps == 0) { + // read from stdin + while ((s = bgets((bNgetc) fgetc, stdin, '\n'))) { + check(incorporate_line(s, linenum, &cur) == 0, + "error incorporating line %d", linenum); + bdestroy(s); + linenum++; + } + } else { + // iterate over input file pointers + for (g=0; g < numfps; g++) { + + fp = fopen(argv[files[g]], "r"); + if (fp == NULL) { + fprintf(stderr, "Error opening file 
%s: %s\n", + argv[files[g]], strerror(errno)); + exit(1); + } + + struct bStream *stream = bsopen((bNread)fread, fp); + if (stream == NULL) { + printf("Error opening stream\n"); + } + while (bsreadln(s, stream, '\n') != BSTR_ERR) { + check(incorporate_line(s, linenum, &cur) == 0, + "error incorporating line %d of %s", linenum, argv[files[g]]); + linenum++; + } + bsclose(stream); + } + } + + while (cur != cur->top) { + finalize(cur, linenum); + cur = cur->parent; + } + check(cur == cur->top, "problems finalizing open containers"); + finalize(cur, linenum); + process_inlines(cur, cur->attributes.refmap); + if (ast) { + print_blocks(cur, 0); + } else { + check(blocks_to_html(cur, &html, false) == 0, "could not format as HTML"); + printf("%s", html->data); + bdestroy(html); + } + free_blocks(cur); + return 0; +error: + return -1; +} + diff --git a/src/print.c b/src/print.c new file mode 100644 index 0000000..a924870 --- /dev/null +++ b/src/print.c @@ -0,0 +1,168 @@ +#include <stdlib.h> +#include <stdio.h> +#include "bstrlib.h" +#include "stmd.h" +#include "debug.h" + +static bstring format_str(bstring s) +{ + int pos = 0; + int len = blength(s); + bstring result = bfromcstr(""); + char c; + bformata(result, "\""); + while (pos < len) { + c = bchar(s, pos); + switch (c) { + case '\n': + bformata(result, "\\n"); + break; + case '"': + bformata(result, "\\\""); + break; + case '\\': + bformata(result, "\\\\"); + break; + default: + bformata(result, "%c", c); + } + pos++; + } + bformata(result, "\""); + return result; +} + +// Functions to pretty-print inline and block lists, for debugging. +// Prettyprint an inline list, for debugging. 
+extern void print_blocks(block* b, int indent) +{ + struct ListData * data; + // format_str returns a freshly allocated bstring; hold each result here + // so it can be destroyed once it has been printed. + bstring content; + bstring info; + while(b != NULL) { + // printf("%3d %3d %3d| ", b->start_line, b->start_column, b->end_line); + for (int i=0; i < indent; i++) { + putchar(' '); + } + switch(b->tag) { + case document: + printf("document\n"); + print_blocks(b->children, indent + 2); + break; + case block_quote: + printf("block_quote\n"); + print_blocks(b->children, indent + 2); + break; + case list_item: + data = &(b->attributes.list_data); + printf("list_item\n"); + print_blocks(b->children, indent + 2); + break; + case list: + data = &(b->attributes.list_data); + if (data->list_type == ordered) { + printf("list (type=ordered tight=%s start=%d delim=%s)\n", + (data->tight ? "true" : "false"), + data->start, + (data->delimiter == parens ? "parens" : "period")); + } else { + printf("list (type=bullet tight=%s bullet_char=%c)\n", + (data->tight ? "true" : "false"), + data->bullet_char); + } + print_blocks(b->children, indent + 2); + break; + case atx_header: + printf("atx_header (level=%d)\n", b->attributes.header_level); + print_inlines(b->inline_content, indent + 2); + break; + case setext_header: + printf("setext_header (level=%d)\n", b->attributes.header_level); + print_inlines(b->inline_content, indent + 2); + break; + case paragraph: + printf("paragraph\n"); + print_inlines(b->inline_content, indent + 2); + break; + case hrule: + printf("hrule\n"); + break; + case indented_code: + content = format_str(b->string_content); + printf("indented_code %s\n", content->data); + bdestroy(content); + break; + case fenced_code: + info = format_str(b->attributes.fenced_code_data.info); + content = format_str(b->string_content); + printf("fenced_code length=%d info=%s %s\n", + b->attributes.fenced_code_data.fence_length, + info->data, + content->data); + bdestroy(info); + bdestroy(content); + break; + case html_block: + content = format_str(b->string_content); + printf("html_block %s\n", content->data); + bdestroy(content); + break; + case reference_def: + printf("reference_def\n"); + break; + default: + log_warn("block type %d not implemented\n", b->tag); + break; + } + b = b->next; + 
} +} + +// Prettyprint an inline list, for debugging. +// Each node is printed on its own line, indented by `indent` spaces; +// the label of a link or image and the children of strong/emph are +// printed recursively with two extra spaces of indentation. +extern void print_inlines(inl* ils, int indent) +{ + // format_str returns a freshly allocated bstring; hold each result here + // so it can be destroyed once it has been printed. + bstring text; + bstring url; + bstring title; + while(ils != NULL) { + /* + // we add 11 extra spaces for the line/column info + for (int i=0; i < 11; i++) { + putchar(' '); + } + putchar('|'); + putchar(' '); + */ + for (int i=0; i < indent; i++) { + putchar(' '); + } + switch(ils->tag) { + case str: + text = format_str(ils->content.literal); + printf("str %s\n", text->data); + bdestroy(text); + break; + case linebreak: + printf("linebreak\n"); + break; + case softbreak: + printf("softbreak\n"); + break; + case code: + text = format_str(ils->content.literal); + printf("code %s\n", text->data); + bdestroy(text); + break; + case raw_html: + text = format_str(ils->content.literal); + printf("html %s\n", text->data); + bdestroy(text); + break; + case entity: + text = format_str(ils->content.literal); + printf("entity %s\n", text->data); + bdestroy(text); + break; + case link: + url = format_str(ils->content.linkable.url); + title = format_str(ils->content.linkable.title); + printf("link url=%s title=%s\n", + url->data, + title->data); + bdestroy(url); + bdestroy(title); + print_inlines(ils->content.linkable.label, indent + 2); + break; + case image: + url = format_str(ils->content.linkable.url); + title = format_str(ils->content.linkable.title); + printf("image url=%s title=%s\n", + url->data, + title->data); + bdestroy(url); + bdestroy(title); + print_inlines(ils->content.linkable.label, indent + 2); + break; + case strong: + printf("strong\n"); + print_inlines(ils->content.linkable.label, indent + 2); + break; + case emph: + printf("emph\n"); + print_inlines(ils->content.linkable.label, indent + 2); + break; + } + ils = ils->next; + } +} diff --git a/src/scanners.h b/src/scanners.h new file mode 100644 index 0000000..71e0520 --- /dev/null +++ b/src/scanners.h @@ -0,0 +1,15 @@ +#include "bstrlib.h" + +int scan_autolink_uri(bstring s, int pos); +int scan_autolink_email(bstring s, int pos); +int scan_html_tag(bstring s, int pos); +int scan_html_block_tag(bstring s, int pos); +int scan_link_url(bstring s, int pos); +int scan_link_title(bstring s, int pos); +int scan_spacechars(bstring s, int pos); +int scan_atx_header_start(bstring s, int pos); +int 
scan_setext_header_line(bstring s, int pos); +int scan_hrule(bstring s, int pos); +int scan_open_code_fence(bstring s, int pos); +int scan_close_code_fence(bstring s, int pos, int len); +int scan_entity(bstring s, int pos); diff --git a/src/scanners.re b/src/scanners.re new file mode 100644 index 0000000..f90238d --- /dev/null +++ b/src/scanners.re @@ -0,0 +1,238 @@ +#include "bstrlib.h" + +/*!re2c + re2c:define:YYCTYPE = "unsigned char"; + re2c:define:YYCURSOR = p; + re2c:define:YYMARKER = marker; + re2c:define:YYCTXMARKER = marker; + re2c:yyfill:enable = 0; + + wordchar = [^\x00-\x20]; + + spacechar = [ \t\n]; + + reg_char = [^\\()\x00-\x20]; + + escaped_char = [\\][!"#$%&'()*+,./:;<=>?@[\\\]^_`{|}~-]; + + tagname = [A-Za-z][A-Za-z0-9]*; + + blocktagname = 'article'|'header'|'aside'|'hgroup'|'blockquote'|'hr'|'body'|'li'|'br'|'map'|'button'|'object'|'canvas'|'ol'|'caption'|'output'|'col'|'p'|'colgroup'|'pre'|'dd'|'progress'|'div'|'section'|'dl'|'table'|'td'|'dt'|'tbody'|'embed'|'textarea'|'fieldset'|'tfoot'|'figcaption'|'th'|'figure'|'thead'|'footer'|'footer'|'tr'|'form'|'ul'|'h1'|'h2'|'h3'|'h4'|'h5'|'h6'|'video'|'script'|'style'; + + attributename = [a-zA-Z_:][a-zA-Z0-9:._-]*; + + unquotedvalue = [^\"'=<>`\x00]+; + singlequotedvalue = ['][^'\x00]*[']; + doublequotedvalue = [\"][^\"\x00]*[\"]; + + attributevalue = unquotedvalue | singlequotedvalue | doublequotedvalue; + + attributevaluespec = spacechar* [=] spacechar* attributevalue; + + attribute = spacechar+ attributename attributevaluespec?; + + opentag = tagname attribute* spacechar* [/]? [>]; + closetag = [/] tagname spacechar* [>]; + + htmlcomment = "!--" ([^-\x00]+ | [-][^-\x00]+)* "-->"; + + processinginstruction = "?" ([^?>\x00]+ | [?][^>\x00])* "?>"; + + declaration = "!" 
[A-Z]+ spacechar+ [^>\x00]* ">"; + + cdata = "![CDATA[" ([^\]\x00]+ | "]" [^\]\x00] | "]]" [^>\x00])* "]]>"; + + htmltag = opentag | closetag | htmlcomment | processinginstruction | + declaration | cdata; + + in_parens_nosp = [(] (reg_char|escaped_char)* [)]; + + in_double_quotes = ["] (escaped_char|[^"\x00])* ["]; + in_single_quotes = ['] (escaped_char|[^'\x00])* [']; + in_parens = [(] (escaped_char|[^)\x00])* [)]; + + scheme = 'coap'|'doi'|'javascript'|'aaa'|'aaas'|'about'|'acap'|'cap'|'cid'|'crid'|'data'|'dav'|'dict'|'dns'|'file'|'ftp'|'geo'|'go'|'gopher'|'h323'|'http'|'https'|'iax'|'icap'|'im'|'imap'|'info'|'ipp'|'iris'|'iris.beep'|'iris.xpc'|'iris.xpcs'|'iris.lwz'|'ldap'|'mailto'|'mid'|'msrp'|'msrps'|'mtqp'|'mupdate'|'news'|'nfs'|'ni'|'nih'|'nntp'|'opaquelocktoken'|'pop'|'pres'|'rtsp'|'service'|'session'|'shttp'|'sieve'|'sip'|'sips'|'sms'|'snmp'|'soap.beep'|'soap.beeps'|'tag'|'tel'|'telnet'|'tftp'|'thismessage'|'tn3270'|'tip'|'tv'|'urn'|'vemmi'|'ws'|'wss'|'xcon'|'xcon-userid'|'xmlrpc.beep'|'xmlrpc.beeps'|'xmpp'|'z39.50r'|'z39.50s'|'adiumxtra'|'afp'|'afs'|'aim'|'apt'|'attachment'|'aw'|'beshare'|'bitcoin'|'bolo'|'callto'|'chrome'|'chrome-extension'|'com-eventbrite-attendee'|'content'|'cvs'|'dlna-playsingle'|'dlna-playcontainer'|'dtn'|'dvb'|'ed2k'|'facetime'|'feed'|'finger'|'fish'|'gg'|'git'|'gizmoproject'|'gtalk'|'hcp'|'icon'|'ipn'|'irc'|'irc6'|'ircs'|'itms'|'jar'|'jms'|'keyparc'|'lastfm'|'ldaps'|'magnet'|'maps'|'market'|'message'|'mms'|'ms-help'|'msnim'|'mumble'|'mvn'|'notes'|'oid'|'palm'|'paparazzi'|'platform'|'proxy'|'psyc'|'query'|'res'|'resource'|'rmi'|'rsync'|'rtmp'|'secondlife'|'sftp'|'sgn'|'skype'|'smb'|'soldat'|'spotify'|'ssh'|'steam'|'svn'|'teamspeak'|'things'|'udp'|'unreal'|'ut2004'|'ventrilo'|'view-source'|'webcal'|'wtai'|'wyciwyg'|'xfire'|'xri'|'ymsgr'; +*/ + +// Try to match URI autolink after first <, returning number of chars matched. 
+extern int scan_autolink_uri(bstring s, int pos) +{ + unsigned char * marker = NULL; + unsigned char * p = &(s->data[pos]); + unsigned char * start = p; +/*!re2c + scheme [:]([^\x00-\x20<>\\]|escaped_char)*[>] { return (p - start); } + .? { return 0; } +*/ +} + +// Try to match email autolink after first <, returning num of chars matched. +extern int scan_autolink_email(bstring s, int pos) +{ + unsigned char * marker = NULL; + unsigned char * p = &(s->data[pos]); + unsigned char * start = p; +/*!re2c + [a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+ + [@] + [a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])? + ([.][a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)* + [>] { return (p - start); } + .? { return 0; } +*/ +} + +// Try to match an HTML tag after first <, returning num of chars matched. +extern int scan_html_tag(bstring s, int pos) +{ + unsigned char * marker = NULL; + unsigned char * p = &(s->data[pos]); + unsigned char * start = p; +/*!re2c + htmltag { return (p - start); } + .? { return 0; } +*/ +} + +// Try to match an HTML block tag including first <, +// returning num of chars matched. +extern int scan_html_block_tag(bstring s, int pos) +{ + unsigned char * marker = NULL; + unsigned char * p = &(s->data[pos]); + unsigned char * start = p; +/*!re2c + [<] [/] blocktagname (spacechar | [>]) { return (p - start); } + [<] blocktagname (spacechar | [/>]) { return (p - start); } + [<] [!?] { return (p - start); } + .? { return 0; } +*/ +} + +// Try to match a URL in a link or reference, return number of chars matched. +// This may optionally be contained in <..>; otherwise +// whitespace and unbalanced right parentheses aren't allowed. +// Newlines aren't ever allowed. 
+extern int scan_link_url(bstring s, int pos) +{ + unsigned char * marker = NULL; + unsigned char * p = &(s->data[pos]); + unsigned char * start = p; +/*!re2c + [ \n]* [<] ([^<>\n\\\x00] | escaped_char | [\\])* [>] { return (p - start); } + [ \n]* (reg_char+ | escaped_char | in_parens_nosp)* { return (p - start); } + .? { return 0; } +*/ +} + +// Try to match a link title (in single quotes, in double quotes, or +// in parentheses), returning number of chars matched. Allow one +// level of internal nesting (quotes within quotes). +extern int scan_link_title(bstring s, int pos) +{ + unsigned char * marker = NULL; + unsigned char * p = &(s->data[pos]); + unsigned char * start = p; +/*!re2c + ["] (escaped_char|[^"\x00])* ["] { return (p - start); } + ['] (escaped_char|[^'\x00])* ['] { return (p - start); } + [(] (escaped_char|[^)\x00])* [)] { return (p - start); } + .? { return 0; } +*/ +} + +// Match space characters, including newlines. +extern int scan_spacechars(bstring s, int pos) +{ + unsigned char * p = &(s->data[pos]); + unsigned char * start = p; +/*!re2c + [ \t\n]* { return (p - start); } + . { return 0; } +*/ +} + +// Match ATX header start. +extern int scan_atx_header_start(bstring s, int pos) +{ + unsigned char * marker = NULL; + unsigned char * p = &(s->data[pos]); + unsigned char * start = p; +/*!re2c + [#]{1,6} ([ ]+|[\n]) { return (p - start); } + .? { return 0; } +*/ +} + +// Match setext header line. Return 1 for level-1 header, +// 2 for level-2, 0 for no match. +extern int scan_setext_header_line(bstring s, int pos) +{ + unsigned char * marker = NULL; + unsigned char * p = &(s->data[pos]); +/*!re2c + [=]+ [ ]* [\n] { return 1; } + [-]+ [ ]* [\n] { return 2; } + .? { return 0; } +*/ +} + +// Scan a horizontal rule line: "...three or more hyphens, asterisks, +// or underscores on a line by themselves. If you wish, you may use +// spaces between the hyphens or asterisks." 
+extern int scan_hrule(bstring s, int pos) +{ + unsigned char * marker = NULL; + unsigned char * p = &(s->data[pos]); + unsigned char * start = p; +/*!re2c + ([*][ ]*){3,} [ \t]* [\n] { return (p - start); } + ([_][ ]*){3,} [ \t]* [\n] { return (p - start); } + ([-][ ]*){3,} [ \t]* [\n] { return (p - start); } + .? { return 0; } +*/ +} + +// Scan an opening code fence. +extern int scan_open_code_fence(bstring s, int pos) +{ + unsigned char * marker = NULL; + unsigned char * p = &(s->data[pos]); + unsigned char * start = p; +/*!re2c + [`]{3,} / [^`\n\x00]*[\n] { return (p - start); } + [~]{3,} / [^~\n\x00]*[\n] { return (p - start); } + .? { return 0; } +*/ +} + +// Scan a closing code fence with length at least len. +// NOTE(review): the action below only accepts a fence whose length is +// strictly greater than len (p - start > len), which does not match +// "at least len" -- confirm what the caller passes, or whether >= was +// intended. +extern int scan_close_code_fence(bstring s, int pos, int len) +{ + unsigned char * marker = NULL; + unsigned char * p = &(s->data[pos]); + unsigned char * start = p; +/*!re2c + ([`]{3,} | [~]{3,}) / spacechar* [\n] + { if (p - start > len) { + return (p - start); + } else { + return 0; + } } + .? { return 0; } +*/ +} + +// Scans an entity. +// Returns number of chars matched. +extern int scan_entity(bstring s, int pos) +{ + unsigned char * marker = NULL; + unsigned char * p = &(s->data[pos]); + unsigned char * start = p; +/*!re2c + [&] ([#] ([Xx][A-Fa-f0-9]{1,8}|[0-9]{1,8}) |[A-Za-z][A-Za-z0-9]{1,31} ) [;] + { return (p - start); } + .? 
{ return 0; } +*/ +} diff --git a/src/stmd.h b/src/stmd.h new file mode 100644 index 0000000..5e34399 --- /dev/null +++ b/src/stmd.h @@ -0,0 +1,121 @@ +#include <stdbool.h> +#include "bstrlib.h" +#include "uthash.h" + +#define VERSION "0.1" +#define CODE_INDENT 4 + +typedef struct Inline { + enum { str, softbreak, linebreak, code, raw_html, entity, + emph, strong, link, image } tag; + union { + bstring literal; + struct Inline* inlines; + struct { struct Inline* label; + bstring url; + bstring title; + } linkable; + } content; + struct Inline* next; +} inl; + +typedef struct Reference { + bstring label; + bstring url; + bstring title; + UT_hash_handle hh; // used by uthash +} reference; + +typedef struct Subject { + bstring buffer; + int pos; + reference** reference_map; + int label_nestlevel; +} subject; + +// Types for blocks + +struct ListData { + enum { bullet, + ordered } list_type; + int marker_offset; + int padding; + int start; + enum { period, + parens } delimiter; + unsigned char bullet_char; + bool tight; +}; + +struct FencedCodeData { + int fence_length; + int fence_offset; + char fence_char; + bstring info; +}; + +typedef struct Block { + enum { document, + block_quote, + list, + list_item, + fenced_code, + indented_code, + html_block, + paragraph, + atx_header, + setext_header, + hrule, + reference_def + } tag; + int start_line; + int start_column; + int end_line; + bool open; + bool last_line_blank; + struct Block* children; + struct Block* last_child; + struct Block* parent; + struct Block* top; + bstring string_content; + inl* inline_content; + union { + struct ListData list_data; + struct FencedCodeData fenced_code_data; + int header_level; + reference** refmap; + } attributes; + struct Block * next; + struct Block * prev; +} block; + +int parse_inline(subject* subj, inl ** last); +inl* parse_inlines(bstring input, reference** refmap); +inl* parse_inlines_while(subject* subj, int (*f)(subject*)); +void free_inlines(inl* e); +int 
parse_reference(bstring input, reference** refmap); +void free_reference(reference *ref); +void free_reference_map(reference **refmap); +reference* make_reference(bstring label, bstring url, bstring title); +reference* lookup_reference(reference** refmap, bstring label); +void add_reference(reference** refmap, reference* ref); +int unescape(bstring s); + +extern block* make_document(); +extern block* add_child(block* parent, + int block_type, int start_line, int start_column); +void free_blocks(block* e); + +// FOR NOW: +int process_inlines(block* cur, reference** refmap); +int incorporate_line(bstring ln, int line_number, block** curptr); +int finalize(block* b, int line_number); + +void print_inlines(inl* ils, int indent); +void print_blocks(block* blk, int indent); + +int blocks_to_html(block* b, bstring* result, bool tight); +int inlines_to_html(inl* b, bstring* result); + +int bdetab(bstring s, int utf8); + diff --git a/src/utf8.c b/src/utf8.c new file mode 100644 index 0000000..4bb3b35 --- /dev/null +++ b/src/utf8.c @@ -0,0 +1,106 @@ +#include <stdlib.h> +#include "bstrlib.h" +#include "debug.h" + +#define advance(s) \ + s++; \ + check(*s >> 6 == 0x02, "UTF-8 decode error on byte %x", *s); + +// Reads a unicode code point from a UTF8-encoded string, and +// puts it in the pointer n. If something illegal +// is encountered, 0xFFFD is emitted. +// Returns a pointer to next position in string, or NULL if no +// more characters remain. 
+extern unsigned char * from_utf8(unsigned char * s, unsigned int *n) +{ + int x = 0; + + if (*s == 0) { + return NULL; + } else if (*s < 0x80) { + x = *s; + } else if (*s >> 5 == 0x06) { + x = *s & 0x1F; + advance(s); + x = (x << 6) + (*s & 0x3F); + } else if (*s >> 4 == 0x0E) { + x = *s & 0x0F; + advance(s); + x = (x << 6) + (*s & 0x3F); + advance(s); + x = (x << 6) + (*s & 0x3F); + } else if (*s >> 3 == 0x1E) { + x = *s & 0x07; + advance(s); + x = (x << 6) + (*s & 0x3F); + advance(s); + x = (x << 6) + (*s & 0x3F); + advance(s); + x = (x << 6) + (*s & 0x3F); + } else if (*s >> 2 == 0x3E) { + x = *s & 0x03; + advance(s); + x = (x << 6) + (*s & 0x3F); + advance(s); + x = (x << 6) + (*s & 0x3F); + advance(s); + x = (x << 6) + (*s & 0x3F); + advance(s); + x = (x << 6) + (*s & 0x3F); + } else { + log_err("UTF-8 decode error on byte %x", *s); + goto error; + } + *n = x; + s++; + return s; + error: + *n = 0xFFFD; + return s; +} + +// Converts the unicode code point c to UTF-8, +// putting the result in dest. Returns 0 on success, -1 on error. +extern int to_utf8(unsigned int c, bstring dest) +{ + if (c < 0x80) { + bconchar(dest, c); + } else if (c < 0x800) { + bconchar(dest, 192 + c/64); + bconchar(dest, 128 + c%64); + } else if (c - 0xd800u < 0x800) { + goto error; + } else if (c < 0x10000) { + bconchar(dest, 224 + c / 4096); + bconchar(dest, 128 + c /64%64); + bconchar(dest, 128 + c%64); + } else if (c < 0x110000) { + bconchar(dest, 240 + c/262144); + bconchar(dest, 128 + c/4096%64); + bconchar(dest, 128 + c/64%64); + bconchar(dest, 128 + c%64); + } else { + goto error; + } + return 0; +error: + return -1; +} + +#define bufpush(x) \ + check(to_utf8(x, buf) == 0, "UTF-8 encode error on code point %04x", x) + +// Returns the case-folded version of the source string, or NULL on error. 
+extern bstring case_fold(bstring source) +{ + unsigned char * s = source->data; + unsigned int c = 0; + bstring buf = bfromcstr(""); + while ((s = from_utf8(s, &c))) { +#include "case_fold_switch.c" + } + return buf; +error: + return NULL; +} + diff --git a/src/utf8.h b/src/utf8.h new file mode 100644 index 0000000..fe59a90 --- /dev/null +++ b/src/utf8.h @@ -0,0 +1,6 @@ +#include <stdlib.h> +#include "bstrlib.h" + +extern unsigned char * from_utf8(unsigned char * s, unsigned int *n); +extern int to_utf8(unsigned int c, bstring dest); +extern bstring case_fold(bstring source); diff --git a/src/uthash.h b/src/uthash.h new file mode 100644 index 0000000..b9bc7e9 --- /dev/null +++ b/src/uthash.h @@ -0,0 +1,948 @@ +/* +Copyright (c) 2003-2013, Troy D. Hanson http://troydhanson.github.com/uthash/ +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifndef UTHASH_H +#define UTHASH_H + +#include <string.h> /* memcmp,strlen */ +#include <stddef.h> /* ptrdiff_t */ +#include <stdlib.h> /* exit() */ + +/* These macros use decltype or the earlier __typeof GNU extension. + As decltype is only available in newer compilers (VS2010 or gcc 4.3+ + when compiling c++ source) this code uses whatever method is needed + or, for VS2008 where neither is available, uses casting workarounds. */ +#ifdef _MSC_VER /* MS compiler */ +#if _MSC_VER >= 1600 && defined(__cplusplus) /* VS2010 or newer in C++ mode */ +#define DECLTYPE(x) (decltype(x)) +#else /* VS2008 or older (or VS2010 in C mode) */ +#define NO_DECLTYPE +#define DECLTYPE(x) +#endif +#else /* GNU, Sun and other compilers */ +#define DECLTYPE(x) (__typeof(x)) +#endif + +#ifdef NO_DECLTYPE +#define DECLTYPE_ASSIGN(dst,src) \ +do { \ + char **_da_dst = (char**)(&(dst)); \ + *_da_dst = (char*)(src); \ +} while(0) +#else +#define DECLTYPE_ASSIGN(dst,src) \ +do { \ + (dst) = DECLTYPE(dst)(src); \ +} while(0) +#endif + +/* a number of the hash function use uint32_t which isn't defined on win32 */ +#ifdef _MSC_VER +typedef unsigned int uint32_t; +typedef unsigned char uint8_t; +#else +#include <inttypes.h> /* uint32_t */ +#endif + +#define UTHASH_VERSION 1.9.8 + +#ifndef uthash_fatal +#define uthash_fatal(msg) exit(-1) /* fatal error (out of memory,etc) */ +#endif +#ifndef uthash_malloc +#define uthash_malloc(sz) malloc(sz) /* malloc fcn */ +#endif +#ifndef uthash_free +#define uthash_free(ptr,sz) free(ptr) /* free fcn */ +#endif + +#ifndef uthash_noexpand_fyi +#define uthash_noexpand_fyi(tbl) /* can be defined to log noexpand */ +#endif +#ifndef uthash_expand_fyi +#define uthash_expand_fyi(tbl) /* can be defined to log expands */ +#endif + +/* initial number of buckets */ +#define HASH_INITIAL_NUM_BUCKETS 32 /* initial number of buckets */ +#define HASH_INITIAL_NUM_BUCKETS_LOG2 5 /* lg2 of initial number of buckets */ +#define HASH_BKT_CAPACITY_THRESH 10 /* expand when 
bucket count reaches */ + +/* calculate the element whose hash handle address is hhe */ +#define ELMT_FROM_HH(tbl,hhp) ((void*)(((char*)(hhp)) - ((tbl)->hho))) + +#define HASH_FIND(hh,head,keyptr,keylen,out) \ +do { \ + unsigned _hf_bkt,_hf_hashv; \ + out=NULL; \ + if (head) { \ + HASH_FCN(keyptr,keylen, (head)->hh.tbl->num_buckets, _hf_hashv, _hf_bkt); \ + if (HASH_BLOOM_TEST((head)->hh.tbl, _hf_hashv)) { \ + HASH_FIND_IN_BKT((head)->hh.tbl, hh, (head)->hh.tbl->buckets[ _hf_bkt ], \ + keyptr,keylen,out); \ + } \ + } \ +} while (0) + +#ifdef HASH_BLOOM +#define HASH_BLOOM_BITLEN (1ULL << HASH_BLOOM) +#define HASH_BLOOM_BYTELEN (HASH_BLOOM_BITLEN/8) + ((HASH_BLOOM_BITLEN%8) ? 1:0) +#define HASH_BLOOM_MAKE(tbl) \ +do { \ + (tbl)->bloom_nbits = HASH_BLOOM; \ + (tbl)->bloom_bv = (uint8_t*)uthash_malloc(HASH_BLOOM_BYTELEN); \ + if (!((tbl)->bloom_bv)) { uthash_fatal( "out of memory"); } \ + memset((tbl)->bloom_bv, 0, HASH_BLOOM_BYTELEN); \ + (tbl)->bloom_sig = HASH_BLOOM_SIGNATURE; \ +} while (0) + +#define HASH_BLOOM_FREE(tbl) \ +do { \ + uthash_free((tbl)->bloom_bv, HASH_BLOOM_BYTELEN); \ +} while (0) + +#define HASH_BLOOM_BITSET(bv,idx) (bv[(idx)/8] |= (1U << ((idx)%8))) +#define HASH_BLOOM_BITTEST(bv,idx) (bv[(idx)/8] & (1U << ((idx)%8))) + +#define HASH_BLOOM_ADD(tbl,hashv) \ + HASH_BLOOM_BITSET((tbl)->bloom_bv, (hashv & (uint32_t)((1ULL << (tbl)->bloom_nbits) - 1))) + +#define HASH_BLOOM_TEST(tbl,hashv) \ + HASH_BLOOM_BITTEST((tbl)->bloom_bv, (hashv & (uint32_t)((1ULL << (tbl)->bloom_nbits) - 1))) + +#else +#define HASH_BLOOM_MAKE(tbl) +#define HASH_BLOOM_FREE(tbl) +#define HASH_BLOOM_ADD(tbl,hashv) +#define HASH_BLOOM_TEST(tbl,hashv) (1) +#define HASH_BLOOM_BYTELEN 0 +#endif + +#define HASH_MAKE_TABLE(hh,head) \ +do { \ + (head)->hh.tbl = (UT_hash_table*)uthash_malloc( \ + sizeof(UT_hash_table)); \ + if (!((head)->hh.tbl)) { uthash_fatal( "out of memory"); } \ + memset((head)->hh.tbl, 0, sizeof(UT_hash_table)); \ + (head)->hh.tbl->tail = &((head)->hh); \ + 
(head)->hh.tbl->num_buckets = HASH_INITIAL_NUM_BUCKETS; \ + (head)->hh.tbl->log2_num_buckets = HASH_INITIAL_NUM_BUCKETS_LOG2; \ + (head)->hh.tbl->hho = (char*)(&(head)->hh) - (char*)(head); \ + (head)->hh.tbl->buckets = (UT_hash_bucket*)uthash_malloc( \ + HASH_INITIAL_NUM_BUCKETS*sizeof(struct UT_hash_bucket)); \ + if (! (head)->hh.tbl->buckets) { uthash_fatal( "out of memory"); } \ + memset((head)->hh.tbl->buckets, 0, \ + HASH_INITIAL_NUM_BUCKETS*sizeof(struct UT_hash_bucket)); \ + HASH_BLOOM_MAKE((head)->hh.tbl); \ + (head)->hh.tbl->signature = HASH_SIGNATURE; \ +} while(0) + +#define HASH_ADD(hh,head,fieldname,keylen_in,add) \ + HASH_ADD_KEYPTR(hh,head,&((add)->fieldname),keylen_in,add) + +#define HASH_REPLACE(hh,head,fieldname,keylen_in,add,replaced) \ +do { \ + replaced=NULL; \ + HASH_FIND(hh,head,&((add)->fieldname),keylen_in,replaced); \ + if (replaced!=NULL) { \ + HASH_DELETE(hh,head,replaced); \ + }; \ + HASH_ADD(hh,head,fieldname,keylen_in,add); \ +} while(0) + +#define HASH_ADD_KEYPTR(hh,head,keyptr,keylen_in,add) \ +do { \ + unsigned _ha_bkt; \ + (add)->hh.next = NULL; \ + (add)->hh.key = (char*)(keyptr); \ + (add)->hh.keylen = (unsigned)(keylen_in); \ + if (!(head)) { \ + head = (add); \ + (head)->hh.prev = NULL; \ + HASH_MAKE_TABLE(hh,head); \ + } else { \ + (head)->hh.tbl->tail->next = (add); \ + (add)->hh.prev = ELMT_FROM_HH((head)->hh.tbl, (head)->hh.tbl->tail); \ + (head)->hh.tbl->tail = &((add)->hh); \ + } \ + (head)->hh.tbl->num_items++; \ + (add)->hh.tbl = (head)->hh.tbl; \ + HASH_FCN(keyptr,keylen_in, (head)->hh.tbl->num_buckets, \ + (add)->hh.hashv, _ha_bkt); \ + HASH_ADD_TO_BKT((head)->hh.tbl->buckets[_ha_bkt],&(add)->hh); \ + HASH_BLOOM_ADD((head)->hh.tbl,(add)->hh.hashv); \ + HASH_EMIT_KEY(hh,head,keyptr,keylen_in); \ + HASH_FSCK(hh,head); \ +} while(0) + +#define HASH_TO_BKT( hashv, num_bkts, bkt ) \ +do { \ + bkt = ((hashv) & ((num_bkts) - 1)); \ +} while(0) + +/* delete "delptr" from the hash table. 
+ * "the usual" patch-up process for the app-order doubly-linked-list. + * The use of _hd_hh_del below deserves special explanation. + * These used to be expressed using (delptr) but that led to a bug + * if someone used the same symbol for the head and deletee, like + * HASH_DELETE(hh,users,users); + * We want that to work, but by changing the head (users) below + * we were forfeiting our ability to further refer to the deletee (users) + * in the patch-up process. Solution: use scratch space to + * copy the deletee pointer, then the latter references are via that + * scratch pointer rather than through the repointed (users) symbol. + */ +#define HASH_DELETE(hh,head,delptr) \ +do { \ + unsigned _hd_bkt; \ + struct UT_hash_handle *_hd_hh_del; \ + if ( ((delptr)->hh.prev == NULL) && ((delptr)->hh.next == NULL) ) { \ + uthash_free((head)->hh.tbl->buckets, \ + (head)->hh.tbl->num_buckets*sizeof(struct UT_hash_bucket) ); \ + HASH_BLOOM_FREE((head)->hh.tbl); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + head = NULL; \ + } else { \ + _hd_hh_del = &((delptr)->hh); \ + if ((delptr) == ELMT_FROM_HH((head)->hh.tbl,(head)->hh.tbl->tail)) { \ + (head)->hh.tbl->tail = \ + (UT_hash_handle*)((ptrdiff_t)((delptr)->hh.prev) + \ + (head)->hh.tbl->hho); \ + } \ + if ((delptr)->hh.prev) { \ + ((UT_hash_handle*)((ptrdiff_t)((delptr)->hh.prev) + \ + (head)->hh.tbl->hho))->next = (delptr)->hh.next; \ + } else { \ + DECLTYPE_ASSIGN(head,(delptr)->hh.next); \ + } \ + if (_hd_hh_del->next) { \ + ((UT_hash_handle*)((ptrdiff_t)_hd_hh_del->next + \ + (head)->hh.tbl->hho))->prev = \ + _hd_hh_del->prev; \ + } \ + HASH_TO_BKT( _hd_hh_del->hashv, (head)->hh.tbl->num_buckets, _hd_bkt); \ + HASH_DEL_IN_BKT(hh,(head)->hh.tbl->buckets[_hd_bkt], _hd_hh_del); \ + (head)->hh.tbl->num_items--; \ + } \ + HASH_FSCK(hh,head); \ +} while (0) + + +/* convenience forms of HASH_FIND/HASH_ADD/HASH_DEL */ +#define HASH_FIND_STR(head,findstr,out) \ + HASH_FIND(hh,head,findstr,strlen(findstr),out) 
+#define HASH_ADD_STR(head,strfield,add) \ + HASH_ADD(hh,head,strfield,strlen(add->strfield),add) +#define HASH_REPLACE_STR(head,strfield,add,replaced) \ + HASH_REPLACE(hh,head,strfield,strlen(add->strfield),add,replaced) +#define HASH_FIND_INT(head,findint,out) \ + HASH_FIND(hh,head,findint,sizeof(int),out) +#define HASH_ADD_INT(head,intfield,add) \ + HASH_ADD(hh,head,intfield,sizeof(int),add) +#define HASH_REPLACE_INT(head,intfield,add,replaced) \ + HASH_REPLACE(hh,head,intfield,sizeof(int),add,replaced) +#define HASH_FIND_PTR(head,findptr,out) \ + HASH_FIND(hh,head,findptr,sizeof(void *),out) +#define HASH_ADD_PTR(head,ptrfield,add) \ + HASH_ADD(hh,head,ptrfield,sizeof(void *),add) +#define HASH_REPLACE_PTR(head,ptrfield,add,replaced) \ + HASH_REPLACE(hh,head,ptrfield,sizeof(void *),add,replaced) +#define HASH_DEL(head,delptr) \ + HASH_DELETE(hh,head,delptr) + +/* HASH_FSCK checks hash integrity on every add/delete when HASH_DEBUG is defined. + * This is for uthash developer only; it compiles away if HASH_DEBUG isn't defined. + */ +#ifdef HASH_DEBUG +#define HASH_OOPS(...) 
do { fprintf(stderr,__VA_ARGS__); exit(-1); } while (0) +#define HASH_FSCK(hh,head) \ +do { \ + unsigned _bkt_i; \ + unsigned _count, _bkt_count; \ + char *_prev; \ + struct UT_hash_handle *_thh; \ + if (head) { \ + _count = 0; \ + for( _bkt_i = 0; _bkt_i < (head)->hh.tbl->num_buckets; _bkt_i++) { \ + _bkt_count = 0; \ + _thh = (head)->hh.tbl->buckets[_bkt_i].hh_head; \ + _prev = NULL; \ + while (_thh) { \ + if (_prev != (char*)(_thh->hh_prev)) { \ + HASH_OOPS("invalid hh_prev %p, actual %p\n", \ + _thh->hh_prev, _prev ); \ + } \ + _bkt_count++; \ + _prev = (char*)(_thh); \ + _thh = _thh->hh_next; \ + } \ + _count += _bkt_count; \ + if ((head)->hh.tbl->buckets[_bkt_i].count != _bkt_count) { \ + HASH_OOPS("invalid bucket count %d, actual %d\n", \ + (head)->hh.tbl->buckets[_bkt_i].count, _bkt_count); \ + } \ + } \ + if (_count != (head)->hh.tbl->num_items) { \ + HASH_OOPS("invalid hh item count %d, actual %d\n", \ + (head)->hh.tbl->num_items, _count ); \ + } \ + /* traverse hh in app order; check next/prev integrity, count */ \ + _count = 0; \ + _prev = NULL; \ + _thh = &(head)->hh; \ + while (_thh) { \ + _count++; \ + if (_prev !=(char*)(_thh->prev)) { \ + HASH_OOPS("invalid prev %p, actual %p\n", \ + _thh->prev, _prev ); \ + } \ + _prev = (char*)ELMT_FROM_HH((head)->hh.tbl, _thh); \ + _thh = ( _thh->next ? (UT_hash_handle*)((char*)(_thh->next) + \ + (head)->hh.tbl->hho) : NULL ); \ + } \ + if (_count != (head)->hh.tbl->num_items) { \ + HASH_OOPS("invalid app item count %d, actual %d\n", \ + (head)->hh.tbl->num_items, _count ); \ + } \ + } \ +} while (0) +#else +#define HASH_FSCK(hh,head) +#endif + +/* When compiled with -DHASH_EMIT_KEYS, length-prefixed keys are emitted to + * the descriptor to which this macro is defined for tuning the hash function. + * The app can #include <unistd.h> to get the prototype for write(2). 
*/ +#ifdef HASH_EMIT_KEYS +#define HASH_EMIT_KEY(hh,head,keyptr,fieldlen) \ +do { \ + unsigned _klen = fieldlen; \ + write(HASH_EMIT_KEYS, &_klen, sizeof(_klen)); \ + write(HASH_EMIT_KEYS, keyptr, fieldlen); \ +} while (0) +#else +#define HASH_EMIT_KEY(hh,head,keyptr,fieldlen) +#endif + +/* default to Jenkins' hash unless overridden, e.g. -DHASH_FUNCTION=HASH_SAX */ +#ifdef HASH_FUNCTION +#define HASH_FCN HASH_FUNCTION +#else +#define HASH_FCN HASH_JEN +#endif + +/* The Bernstein hash function, used in Perl prior to v5.6. Like the other + * hash macros here, it stores the hash in hashv and hashv & (num_bkts-1) + * in bkt. */ +#define HASH_BER(key,keylen,num_bkts,hashv,bkt) \ +do { \ + unsigned _hb_keylen=keylen; \ + char *_hb_key=(char*)(key); \ + (hashv) = 0; \ + while (_hb_keylen--) { (hashv) = ((hashv) * 33) + *_hb_key++; } \ + bkt = (hashv) & (num_bkts-1); \ +} while (0) + + +/* SAX/FNV/OAT/JEN hash functions are macro variants of those listed at + * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx */ +#define HASH_SAX(key,keylen,num_bkts,hashv,bkt) \ +do { \ + unsigned _sx_i; \ + char *_hs_key=(char*)(key); \ + hashv = 0; \ + for(_sx_i=0; _sx_i < keylen; _sx_i++) \ + hashv ^= (hashv << 5) + (hashv >> 2) + _hs_key[_sx_i]; \ + bkt = hashv & (num_bkts-1); \ +} while (0) + +#define HASH_FNV(key,keylen,num_bkts,hashv,bkt) \ +do { \ + unsigned _fn_i; \ + char *_hf_key=(char*)(key); \ + hashv = 2166136261UL; \ + for(_fn_i=0; _fn_i < keylen; _fn_i++) \ + hashv = (hashv * 16777619) ^ _hf_key[_fn_i]; \ + bkt = hashv & (num_bkts-1); \ +} while(0) + +#define HASH_OAT(key,keylen,num_bkts,hashv,bkt) \ +do { \ + unsigned _ho_i; \ + char *_ho_key=(char*)(key); \ + hashv = 0; \ + for(_ho_i=0; _ho_i < keylen; _ho_i++) { \ + hashv += _ho_key[_ho_i]; \ + hashv += (hashv << 10); \ + hashv ^= (hashv >> 6); \ + } \ + hashv += (hashv << 3); \ + hashv ^= (hashv >> 11); \ + hashv += (hashv << 15); \ + bkt = hashv & (num_bkts-1); \ +} while(0) + +#define HASH_JEN_MIX(a,b,c) \ +do { \ + a -= b; a -= c; a ^= ( c >> 13 ); \ + b -= c; b -= a; b ^= ( a << 8 ); \ + c -= a; c 
-= b; c ^= ( b >> 13 ); \ + a -= b; a -= c; a ^= ( c >> 12 ); \ + b -= c; b -= a; b ^= ( a << 16 ); \ + c -= a; c -= b; c ^= ( b >> 5 ); \ + a -= b; a -= c; a ^= ( c >> 3 ); \ + b -= c; b -= a; b ^= ( a << 10 ); \ + c -= a; c -= b; c ^= ( b >> 15 ); \ +} while (0) + +#define HASH_JEN(key,keylen,num_bkts,hashv,bkt) \ +do { \ + unsigned _hj_i,_hj_j,_hj_k; \ + unsigned char *_hj_key=(unsigned char*)(key); \ + hashv = 0xfeedbeef; \ + _hj_i = _hj_j = 0x9e3779b9; \ + _hj_k = (unsigned)(keylen); \ + while (_hj_k >= 12) { \ + _hj_i += (_hj_key[0] + ( (unsigned)_hj_key[1] << 8 ) \ + + ( (unsigned)_hj_key[2] << 16 ) \ + + ( (unsigned)_hj_key[3] << 24 ) ); \ + _hj_j += (_hj_key[4] + ( (unsigned)_hj_key[5] << 8 ) \ + + ( (unsigned)_hj_key[6] << 16 ) \ + + ( (unsigned)_hj_key[7] << 24 ) ); \ + hashv += (_hj_key[8] + ( (unsigned)_hj_key[9] << 8 ) \ + + ( (unsigned)_hj_key[10] << 16 ) \ + + ( (unsigned)_hj_key[11] << 24 ) ); \ + \ + HASH_JEN_MIX(_hj_i, _hj_j, hashv); \ + \ + _hj_key += 12; \ + _hj_k -= 12; \ + } \ + hashv += keylen; \ + switch ( _hj_k ) { \ + case 11: hashv += ( (unsigned)_hj_key[10] << 24 ); \ + case 10: hashv += ( (unsigned)_hj_key[9] << 16 ); \ + case 9: hashv += ( (unsigned)_hj_key[8] << 8 ); \ + case 8: _hj_j += ( (unsigned)_hj_key[7] << 24 ); \ + case 7: _hj_j += ( (unsigned)_hj_key[6] << 16 ); \ + case 6: _hj_j += ( (unsigned)_hj_key[5] << 8 ); \ + case 5: _hj_j += _hj_key[4]; \ + case 4: _hj_i += ( (unsigned)_hj_key[3] << 24 ); \ + case 3: _hj_i += ( (unsigned)_hj_key[2] << 16 ); \ + case 2: _hj_i += ( (unsigned)_hj_key[1] << 8 ); \ + case 1: _hj_i += _hj_key[0]; \ + } \ + HASH_JEN_MIX(_hj_i, _hj_j, hashv); \ + bkt = hashv & (num_bkts-1); \ +} while(0) + +/* The Paul Hsieh hash function */ +#undef get16bits +#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \ + || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__) +#define get16bits(d) (*((const uint16_t *) (d))) +#endif + +#if !defined (get16bits) +#define get16bits(d) 
((((uint32_t)(((const uint8_t *)(d))[1])) << 8) \ + +(uint32_t)(((const uint8_t *)(d))[0]) ) +#endif +#define HASH_SFH(key,keylen,num_bkts,hashv,bkt) \ +do { \ + unsigned char *_sfh_key=(unsigned char*)(key); \ + uint32_t _sfh_tmp, _sfh_len = keylen; \ + \ + int _sfh_rem = _sfh_len & 3; \ + _sfh_len >>= 2; \ + hashv = 0xcafebabe; \ + \ + /* Main loop */ \ + for (;_sfh_len > 0; _sfh_len--) { \ + hashv += get16bits (_sfh_key); \ + _sfh_tmp = (uint32_t)(get16bits (_sfh_key+2)) << 11 ^ hashv; \ + hashv = (hashv << 16) ^ _sfh_tmp; \ + _sfh_key += 2*sizeof (uint16_t); \ + hashv += hashv >> 11; \ + } \ + \ + /* Handle end cases */ \ + switch (_sfh_rem) { \ + case 3: hashv += get16bits (_sfh_key); \ + hashv ^= hashv << 16; \ + hashv ^= (uint32_t)(_sfh_key[sizeof (uint16_t)] << 18); \ + hashv += hashv >> 11; \ + break; \ + case 2: hashv += get16bits (_sfh_key); \ + hashv ^= hashv << 11; \ + hashv += hashv >> 17; \ + break; \ + case 1: hashv += *_sfh_key; \ + hashv ^= hashv << 10; \ + hashv += hashv >> 1; \ + } \ + \ + /* Force "avalanching" of final 127 bits */ \ + hashv ^= hashv << 3; \ + hashv += hashv >> 5; \ + hashv ^= hashv << 4; \ + hashv += hashv >> 17; \ + hashv ^= hashv << 25; \ + hashv += hashv >> 6; \ + bkt = hashv & (num_bkts-1); \ +} while(0) + +#ifdef HASH_USING_NO_STRICT_ALIASING +/* The MurmurHash exploits some CPU's (x86,x86_64) tolerance for unaligned reads. + * For other types of CPU's (e.g. Sparc) an unaligned read causes a bus error. + * MurmurHash uses the faster approach only on CPU's where we know it's safe. 
+ * + * Note the preprocessor built-in defines can be emitted using: + * + * gcc -m64 -dM -E - < /dev/null (on gcc) + * cc -## a.c (where a.c is a simple test file) (Sun Studio) + */ +#if (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86)) +#define MUR_GETBLOCK(p,i) p[i] +#else /* non intel */ +#define MUR_PLUS0_ALIGNED(p) (((unsigned long)p & 0x3) == 0) +#define MUR_PLUS1_ALIGNED(p) (((unsigned long)p & 0x3) == 1) +#define MUR_PLUS2_ALIGNED(p) (((unsigned long)p & 0x3) == 2) +#define MUR_PLUS3_ALIGNED(p) (((unsigned long)p & 0x3) == 3) +#define WP(p) ((uint32_t*)((unsigned long)(p) & ~3UL)) +#if (defined(__BIG_ENDIAN__) || defined(SPARC) || defined(__ppc__) || defined(__ppc64__)) +#define MUR_THREE_ONE(p) ((((*WP(p))&0x00ffffff) << 8) | (((*(WP(p)+1))&0xff000000) >> 24)) +#define MUR_TWO_TWO(p) ((((*WP(p))&0x0000ffff) <<16) | (((*(WP(p)+1))&0xffff0000) >> 16)) +#define MUR_ONE_THREE(p) ((((*WP(p))&0x000000ff) <<24) | (((*(WP(p)+1))&0xffffff00) >> 8)) +#else /* assume little endian non-intel */ +#define MUR_THREE_ONE(p) ((((*WP(p))&0xffffff00) >> 8) | (((*(WP(p)+1))&0x000000ff) << 24)) +#define MUR_TWO_TWO(p) ((((*WP(p))&0xffff0000) >>16) | (((*(WP(p)+1))&0x0000ffff) << 16)) +#define MUR_ONE_THREE(p) ((((*WP(p))&0xff000000) >>24) | (((*(WP(p)+1))&0x00ffffff) << 8)) +#endif +#define MUR_GETBLOCK(p,i) (MUR_PLUS0_ALIGNED(p) ? ((p)[i]) : \ + (MUR_PLUS1_ALIGNED(p) ? MUR_THREE_ONE(p) : \ + (MUR_PLUS2_ALIGNED(p) ? 
MUR_TWO_TWO(p) : \ + MUR_ONE_THREE(p)))) +#endif +#define MUR_ROTL32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +/* Final mixing step that HASH_MUR applies to its 32-bit state. The second + * multiplier carries a 'u' suffix (not 'l') so the multiply is carried out + * in unsigned arithmetic even where long is 64 bits wide; with a signed + * 64-bit long the product could overflow. */ +#define MUR_FMIX(_h) \ +do { \ + _h ^= _h >> 16; \ + _h *= 0x85ebca6b; \ + _h ^= _h >> 13; \ + _h *= 0xc2b2ae35u; \ + _h ^= _h >> 16; \ +} while(0) + +/* Hashes keylen bytes at key: whole 4-byte blocks first, then the 1-3 + * trailing bytes, then MUR_FMIX. Stores the hash in hashv and + * hashv & (num_bkts-1) in bkt. */ +#define HASH_MUR(key,keylen,num_bkts,hashv,bkt) \ +do { \ + const uint8_t *_mur_data = (const uint8_t*)(key); \ + const int _mur_nblocks = (keylen) / 4; \ + uint32_t _mur_h1 = 0xf88D5353; \ + uint32_t _mur_c1 = 0xcc9e2d51; \ + uint32_t _mur_c2 = 0x1b873593; \ + uint32_t _mur_k1 = 0; \ + const uint8_t *_mur_tail; \ + const uint32_t *_mur_blocks = (const uint32_t*)(_mur_data+_mur_nblocks*4); \ + int _mur_i; \ + for(_mur_i = -_mur_nblocks; _mur_i; _mur_i++) { \ + _mur_k1 = MUR_GETBLOCK(_mur_blocks,_mur_i); \ + _mur_k1 *= _mur_c1; \ + _mur_k1 = MUR_ROTL32(_mur_k1,15); \ + _mur_k1 *= _mur_c2; \ + \ + _mur_h1 ^= _mur_k1; \ + _mur_h1 = MUR_ROTL32(_mur_h1,13); \ + _mur_h1 = _mur_h1*5+0xe6546b64; \ + } \ + _mur_tail = (const uint8_t*)(_mur_data + _mur_nblocks*4); \ + _mur_k1=0; \ + switch((keylen) & 3) { \ + case 3: _mur_k1 ^= _mur_tail[2] << 16; /* fall through */ \ + case 2: _mur_k1 ^= _mur_tail[1] << 8; /* fall through */ \ + case 1: _mur_k1 ^= _mur_tail[0]; \ + _mur_k1 *= _mur_c1; \ + _mur_k1 = MUR_ROTL32(_mur_k1,15); \ + _mur_k1 *= _mur_c2; \ + _mur_h1 ^= _mur_k1; \ + } \ + _mur_h1 ^= (keylen); \ + MUR_FMIX(_mur_h1); \ + hashv = _mur_h1; \ + bkt = hashv & (num_bkts-1); \ +} while(0) +#endif /* HASH_USING_NO_STRICT_ALIASING */ + +/* key comparison function; return 0 if keys equal */ +#define HASH_KEYCMP(a,b,len) memcmp(a,b,len) + +/* iterate over items in a known bucket to find desired item */ +#define HASH_FIND_IN_BKT(tbl,hh,head,keyptr,keylen_in,out) \ +do { \ + if (head.hh_head) DECLTYPE_ASSIGN(out,ELMT_FROM_HH(tbl,head.hh_head)); \ + else out=NULL; \ + while (out) { \ + if ((out)->hh.keylen == keylen_in) { \ + if ((HASH_KEYCMP((out)->hh.key,keyptr,keylen_in)) == 0) break; \ + } \ + if ((out)->hh.hh_next) 
DECLTYPE_ASSIGN(out,ELMT_FROM_HH(tbl,(out)->hh.hh_next)); \ + else out = NULL; \ + } \ +} while(0) + +/* add an item to the front of a bucket's chain; bucket expansion is + * triggered once the chain holds (expand_mult+1) * HASH_BKT_CAPACITY_THRESH + * items, unless the table's noexpand flag is 1 */ +#define HASH_ADD_TO_BKT(head,addhh) \ +do { \ + (head).count++; \ + (addhh)->hh_next = (head).hh_head; \ + (addhh)->hh_prev = NULL; \ + if ((head).hh_head) { (head).hh_head->hh_prev = (addhh); } \ + (head).hh_head=(addhh); \ + if ((head).count >= (((head).expand_mult+1) * HASH_BKT_CAPACITY_THRESH) \ + && (addhh)->tbl->noexpand != 1) { \ + HASH_EXPAND_BUCKETS((addhh)->tbl); \ + } \ +} while(0) + +/* remove an item from a given bucket. The body is wrapped in do/while(0) + * so the macro expands to a single statement; invoke it with a trailing + * semicolon. */ +#define HASH_DEL_IN_BKT(hh,head,hh_del) \ +do { \ + (head).count--; \ + if ((head).hh_head == (hh_del)) { \ + (head).hh_head = (hh_del)->hh_next; \ + } \ + if ((hh_del)->hh_prev) { \ + (hh_del)->hh_prev->hh_next = (hh_del)->hh_next; \ + } \ + if ((hh_del)->hh_next) { \ + (hh_del)->hh_next->hh_prev = (hh_del)->hh_prev; \ + } \ +} while(0) + +/* Bucket expansion has the effect of doubling the number of buckets + * and redistributing the items into the new buckets. Ideally the + * items will distribute more or less evenly into the new buckets + * (the extent to which this is true is a measure of the quality of + * the hash function as it applies to the key domain). + * + * With the items distributed into more buckets, the chain length + * (item count) in each bucket is reduced. Thus by expanding buckets + * the hash keeps a bound on the chain length. This bounded chain + * length is the essence of how a hash provides constant time lookup. + * + * The calculation of tbl->ideal_chain_maxlen below deserves some + * explanation. First, keep in mind that we're calculating the ideal + * maximum chain length based on the *new* (doubled) bucket count. + * In fractions this is just n/b (n=number of items,b=new num buckets). + * Since the ideal chain length is an integer, we want to calculate + * ceil(n/b). 
We don't depend on floating point arithmetic in this + * hash, so to calculate ceil(n/b) with integers we could write + * + * ceil(n/b) = (n/b) + ((n%b)?1:0) + * + * and in fact a previous version of this hash did just that. + * But now we have improved things a bit by recognizing that b is + * always a power of two. We keep its base 2 log handy (call it lb), + * so now we can write this with a bit shift and logical AND: + * + * ceil(n/b) = (n>>lb) + ( (n & (b-1)) ? 1:0) + * + */ +#define HASH_EXPAND_BUCKETS(tbl) \ +do { \ + unsigned _he_bkt; \ + unsigned _he_bkt_i; \ + struct UT_hash_handle *_he_thh, *_he_hh_nxt; \ + UT_hash_bucket *_he_new_buckets, *_he_newbkt; \ + _he_new_buckets = (UT_hash_bucket*)uthash_malloc( \ + 2 * tbl->num_buckets * sizeof(struct UT_hash_bucket)); \ + if (!_he_new_buckets) { uthash_fatal( "out of memory"); } \ + memset(_he_new_buckets, 0, \ + 2 * tbl->num_buckets * sizeof(struct UT_hash_bucket)); \ + tbl->ideal_chain_maxlen = \ + (tbl->num_items >> (tbl->log2_num_buckets+1)) + \ + ((tbl->num_items & ((tbl->num_buckets*2)-1)) ? 
1 : 0); \ + tbl->nonideal_items = 0; \ + for(_he_bkt_i = 0; _he_bkt_i < tbl->num_buckets; _he_bkt_i++) \ + { \ + _he_thh = tbl->buckets[ _he_bkt_i ].hh_head; \ + while (_he_thh) { \ + _he_hh_nxt = _he_thh->hh_next; \ + HASH_TO_BKT( _he_thh->hashv, tbl->num_buckets*2, _he_bkt); \ + _he_newbkt = &(_he_new_buckets[ _he_bkt ]); \ + if (++(_he_newbkt->count) > tbl->ideal_chain_maxlen) { \ + tbl->nonideal_items++; \ + _he_newbkt->expand_mult = _he_newbkt->count / \ + tbl->ideal_chain_maxlen; \ + } \ + _he_thh->hh_prev = NULL; \ + _he_thh->hh_next = _he_newbkt->hh_head; \ + if (_he_newbkt->hh_head) _he_newbkt->hh_head->hh_prev = \ + _he_thh; \ + _he_newbkt->hh_head = _he_thh; \ + _he_thh = _he_hh_nxt; \ + } \ + } \ + uthash_free( tbl->buckets, tbl->num_buckets*sizeof(struct UT_hash_bucket) ); \ + tbl->num_buckets *= 2; \ + tbl->log2_num_buckets++; \ + tbl->buckets = _he_new_buckets; \ + tbl->ineff_expands = (tbl->nonideal_items > (tbl->num_items >> 1)) ? \ + (tbl->ineff_expands+1) : 0; \ + if (tbl->ineff_expands > 1) { \ + tbl->noexpand=1; \ + uthash_noexpand_fyi(tbl); \ + } \ + uthash_expand_fyi(tbl); \ +} while(0) + + +/* This is an adaptation of Simon Tatham's O(n log(n)) mergesort */ +/* Note that HASH_SORT assumes the hash handle name to be hh. + * HASH_SRT was added to allow the hash handle name to be passed in. */ +#define HASH_SORT(head,cmpfcn) HASH_SRT(hh,head,cmpfcn) +#define HASH_SRT(hh,head,cmpfcn) \ +do { \ + unsigned _hs_i; \ + unsigned _hs_looping,_hs_nmerges,_hs_insize,_hs_psize,_hs_qsize; \ + struct UT_hash_handle *_hs_p, *_hs_q, *_hs_e, *_hs_list, *_hs_tail; \ + if (head) { \ + _hs_insize = 1; \ + _hs_looping = 1; \ + _hs_list = &((head)->hh); \ + while (_hs_looping) { \ + _hs_p = _hs_list; \ + _hs_list = NULL; \ + _hs_tail = NULL; \ + _hs_nmerges = 0; \ + while (_hs_p) { \ + _hs_nmerges++; \ + _hs_q = _hs_p; \ + _hs_psize = 0; \ + for ( _hs_i = 0; _hs_i < _hs_insize; _hs_i++ ) { \ + _hs_psize++; \ + _hs_q = (UT_hash_handle*)((_hs_q->next) ? 
\ + ((void*)((char*)(_hs_q->next) + \ + (head)->hh.tbl->hho)) : NULL); \ + if (! (_hs_q) ) break; \ + } \ + _hs_qsize = _hs_insize; \ + while ((_hs_psize > 0) || ((_hs_qsize > 0) && _hs_q )) { \ + if (_hs_psize == 0) { \ + _hs_e = _hs_q; \ + _hs_q = (UT_hash_handle*)((_hs_q->next) ? \ + ((void*)((char*)(_hs_q->next) + \ + (head)->hh.tbl->hho)) : NULL); \ + _hs_qsize--; \ + } else if ( (_hs_qsize == 0) || !(_hs_q) ) { \ + _hs_e = _hs_p; \ + if (_hs_p){ \ + _hs_p = (UT_hash_handle*)((_hs_p->next) ? \ + ((void*)((char*)(_hs_p->next) + \ + (head)->hh.tbl->hho)) : NULL); \ + } \ + _hs_psize--; \ + } else if (( \ + cmpfcn(DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl,_hs_p)), \ + DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl,_hs_q))) \ + ) <= 0) { \ + _hs_e = _hs_p; \ + if (_hs_p){ \ + _hs_p = (UT_hash_handle*)((_hs_p->next) ? \ + ((void*)((char*)(_hs_p->next) + \ + (head)->hh.tbl->hho)) : NULL); \ + } \ + _hs_psize--; \ + } else { \ + _hs_e = _hs_q; \ + _hs_q = (UT_hash_handle*)((_hs_q->next) ? \ + ((void*)((char*)(_hs_q->next) + \ + (head)->hh.tbl->hho)) : NULL); \ + _hs_qsize--; \ + } \ + if ( _hs_tail ) { \ + _hs_tail->next = ((_hs_e) ? \ + ELMT_FROM_HH((head)->hh.tbl,_hs_e) : NULL); \ + } else { \ + _hs_list = _hs_e; \ + } \ + if (_hs_e) { \ + _hs_e->prev = ((_hs_tail) ? \ + ELMT_FROM_HH((head)->hh.tbl,_hs_tail) : NULL); \ + } \ + _hs_tail = _hs_e; \ + } \ + _hs_p = _hs_q; \ + } \ + if (_hs_tail){ \ + _hs_tail->next = NULL; \ + } \ + if ( _hs_nmerges <= 1 ) { \ + _hs_looping=0; \ + (head)->hh.tbl->tail = _hs_tail; \ + DECLTYPE_ASSIGN(head,ELMT_FROM_HH((head)->hh.tbl, _hs_list)); \ + } \ + _hs_insize *= 2; \ + } \ + HASH_FSCK(hh,head); \ + } \ +} while (0) + +/* This macro selects items from one hash into another hash. + * The end result is that the selected items have dual presence + * in both hashes. There is no copy of the items made; rather + * they are added into the new hash through a secondary + * hash handle that must be present in the structure. 
*/ +#define HASH_SELECT(hh_dst, dst, hh_src, src, cond) \ +do { \ + unsigned _src_bkt, _dst_bkt; \ + void *_last_elt=NULL, *_elt; \ + UT_hash_handle *_src_hh, *_dst_hh, *_last_elt_hh=NULL; \ + ptrdiff_t _dst_hho = ((char*)(&(dst)->hh_dst) - (char*)(dst)); \ + if (src) { \ + for(_src_bkt=0; _src_bkt < (src)->hh_src.tbl->num_buckets; _src_bkt++) { \ + for(_src_hh = (src)->hh_src.tbl->buckets[_src_bkt].hh_head; \ + _src_hh; \ + _src_hh = _src_hh->hh_next) { \ + _elt = ELMT_FROM_HH((src)->hh_src.tbl, _src_hh); \ + if (cond(_elt)) { \ + _dst_hh = (UT_hash_handle*)(((char*)_elt) + _dst_hho); \ + _dst_hh->key = _src_hh->key; \ + _dst_hh->keylen = _src_hh->keylen; \ + _dst_hh->hashv = _src_hh->hashv; \ + _dst_hh->prev = _last_elt; \ + _dst_hh->next = NULL; \ + if (_last_elt_hh) { _last_elt_hh->next = _elt; } \ + if (!dst) { \ + DECLTYPE_ASSIGN(dst,_elt); \ + HASH_MAKE_TABLE(hh_dst,dst); \ + } else { \ + _dst_hh->tbl = (dst)->hh_dst.tbl; \ + } \ + HASH_TO_BKT(_dst_hh->hashv, _dst_hh->tbl->num_buckets, _dst_bkt); \ + HASH_ADD_TO_BKT(_dst_hh->tbl->buckets[_dst_bkt],_dst_hh); \ + (dst)->hh_dst.tbl->num_items++; \ + _last_elt = _elt; \ + _last_elt_hh = _dst_hh; \ + } \ + } \ + } \ + } \ + HASH_FSCK(hh_dst,dst); \ +} while (0) + +#define HASH_CLEAR(hh,head) \ +do { \ + if (head) { \ + uthash_free((head)->hh.tbl->buckets, \ + (head)->hh.tbl->num_buckets*sizeof(struct UT_hash_bucket)); \ + HASH_BLOOM_FREE((head)->hh.tbl); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + (head)=NULL; \ + } \ +} while(0) + +#define HASH_OVERHEAD(hh,head) \ + (size_t)((((head)->hh.tbl->num_items * sizeof(UT_hash_handle)) + \ + ((head)->hh.tbl->num_buckets * sizeof(UT_hash_bucket)) + \ + (sizeof(UT_hash_table)) + \ + (HASH_BLOOM_BYTELEN))) + +#ifdef NO_DECLTYPE +#define HASH_ITER(hh,head,el,tmp) \ +for((el)=(head), (*(char**)(&(tmp)))=(char*)((head)?(head)->hh.next:NULL); \ + el; (el)=(tmp),(*(char**)(&(tmp)))=(char*)((tmp)?(tmp)->hh.next:NULL)) +#else +#define HASH_ITER(hh,head,el,tmp) \ 
+for((el)=(head),(tmp)=DECLTYPE(el)((head)?(head)->hh.next:NULL); \ + el; (el)=(tmp),(tmp)=DECLTYPE(el)((tmp)?(tmp)->hh.next:NULL)) +#endif + +/* obtain a count of items in the hash; evaluates to 0 when head is NULL */ +#define HASH_COUNT(head) HASH_CNT(hh,head) +#define HASH_CNT(hh,head) ((head)?((head)->hh.tbl->num_items):0) + +typedef struct UT_hash_bucket { + struct UT_hash_handle *hh_head; /* first hash handle in this bucket's chain */ + unsigned count; /* number of hash handles in the chain */ + + /* expand_mult is normally set to 0. In this situation, the max chain length + * threshold is enforced at its default value, HASH_BKT_CAPACITY_THRESH. (If + * the bucket's chain exceeds this length, bucket expansion is triggered). + * However, setting expand_mult to a non-zero value delays bucket expansion + * (that would be triggered by additions to this particular bucket) + * until its chain length reaches a *multiple* of HASH_BKT_CAPACITY_THRESH. + * (The multiplier is simply expand_mult+1). The whole idea of this + * multiplier is to reduce bucket expansions, since they are expensive, in + * situations where we know that a particular bucket tends to be overused. + * It is better to let its chain length grow to a longer yet-still-bounded + * value, than to do an O(n) bucket expansion too often. + */ + unsigned expand_mult; + +} UT_hash_bucket; + +/* random signature used only to find hash tables in external analysis */ +#define HASH_SIGNATURE 0xa0111fe1 +#define HASH_BLOOM_SIGNATURE 0xb12220f2 + +typedef struct UT_hash_table { + UT_hash_bucket *buckets; + unsigned num_buckets, log2_num_buckets; + unsigned num_items; + struct UT_hash_handle *tail; /* tail hh in app order, for fast append */ + ptrdiff_t hho; /* hash handle offset (byte pos of hash handle in element) */ + + /* in an ideal situation (all buckets used equally), no bucket would have + * more than ceil(#items/#buckets) items. that's the ideal chain length. */ + unsigned ideal_chain_maxlen; + + /* nonideal_items is the number of items in the hash whose chain position + * exceeds the ideal chain maxlen. 
these items pay the penalty for an uneven + * hash distribution; reaching them in a chain traversal takes >ideal steps */ + unsigned nonideal_items; + + /* ineffective expands occur when a bucket doubling was performed, but + * afterward, more than half the items in the hash had nonideal chain + * positions. If this happens on two consecutive expansions we inhibit any + * further expansion, as it's not helping; this happens when the hash + * function isn't a good fit for the key domain. When expansion is inhibited + * the hash will still work, albeit no longer in constant time. */ + unsigned ineff_expands, noexpand; + + uint32_t signature; /* used only to find hash tables in external analysis */ +#ifdef HASH_BLOOM + uint32_t bloom_sig; /* used only to test bloom exists in external analysis */ + uint8_t *bloom_bv; + char bloom_nbits; +#endif + +} UT_hash_table; + +typedef struct UT_hash_handle { + struct UT_hash_table *tbl; + void *prev; /* prev element in app order */ + void *next; /* next element in app order */ + struct UT_hash_handle *hh_prev; /* previous hh in bucket order */ + struct UT_hash_handle *hh_next; /* next hh in bucket order */ + void *key; /* ptr to enclosing struct's key */ + unsigned keylen; /* enclosing struct's key len */ + unsigned hashv; /* result of hash-fcn(key) */ +} UT_hash_handle; + +#endif /* UTHASH_H */ diff --git a/template.html b/template.html new file mode 100644 index 0000000..2286c68 --- /dev/null +++ b/template.html @@ -0,0 +1,66 @@ +<!DOCTYPE html> +<html> +<head> +<meta charset="UTF-8"> +<title>$title$</title> +<style type="text/css"> +body { font-family: arial, sans-serif; line-height: 1.4em; max-width: 52em; + margin: 3em; } +div#TOC ul { list-style: none; } +h1 { font-size: 140%; font-weight: bold; border-top: 1px solid gray; padding-top: 0.5em; } +h2 { font-size: 120%; font-weight: bold; } +h3 { font-size: 110%; font-weight: bold; } +h4 { font-size: 100%; font-weight: bold; } +span.space { position: relative; } 
+span.space:after { + content: ""; + position: absolute; + /* create a mark that indicates a space (trick from D. Greenspan) */ + top: 3px; bottom: 3px; left: 1px; right: 1px; + border: 1px solid #999; +} +div.example { overflow: hidden; } +p { text-align: justify; } +pre { padding: 0.5em; margin-left: 0; margin-right: 0; margin-top: 0.2em; + margin-bottom: 0.5em; font-size: 88%; } +pre { + white-space: pre-wrap; /* css-3 */ + white-space: -moz-pre-wrap; /* Mozilla, since 1999 */ + white-space: -pre-wrap; /* Opera 4-6 */ + white-space: -o-pre-wrap; /* Opera 7 */ + word-wrap: break-word; /* Internet Explorer 5.5+ */ +} +code { font-size: 120%; font-family: monospace; } +div.example > pre { float:left; width: 48%; } +div.example > pre.markdown { clear:left; } +pre.tree { font-weight: bold; color: #777; } +pre.markdown { background-color: #E3DEC1;} +pre.html { background-color: #E89F65; } +pre.html span.space:after { + border: 1px solid #666; +} +div.examplenum { font-size: 82%; text-align: left; } +a.footnoteRef > sup:before { + content: "["; +} +a.footnoteRef > sup:after { + content: "]"; +} +a.footnoteRef > sup { + vertical-align: baseline; + font-size: 100%; +} +</style> +</head> +<body> +<h1 class="title">$title$</h1> +<div class="version">Version $version$ ($date$)</div> +<div class="authors"> +$for(author)$<span class="author">$author$</span>$sep$; $endfor$ +</div> +<div id="TOC"> +$toc$ +</div> +$body$ +</body> +</html> diff --git a/template.tex b/template.tex new file mode 100644 index 0000000..d083b72 --- /dev/null +++ b/template.tex @@ -0,0 +1,229 @@ +\documentclass[$if(fontsize)$$fontsize$,$endif$$if(lang)$$lang$,$endif$$if(papersize)$$papersize$,$endif$$for(classoption)$$classoption$$sep$,$endfor$]{$documentclass$} +$if(fontfamily)$ +\usepackage{$fontfamily$} +$else$ +\usepackage{lmodern} +$endif$ +$if(linestretch)$ +\usepackage{setspace} +\setstretch{$linestretch$} +$endif$ +\usepackage{amssymb,amsmath} +\usepackage{ifxetex,ifluatex} 
+\usepackage{fixltx2e} % provides \textsubscript +\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex + \usepackage[T1]{fontenc} + \usepackage[utf8]{inputenc} +$if(euro)$ + \usepackage{eurosym} +$endif$ +\else % if luatex or xelatex + \ifxetex + \usepackage{mathspec} + \usepackage{xltxtra,xunicode} + \else + \usepackage{fontspec} + \fi + \defaultfontfeatures{Mapping=tex-text,Scale=MatchLowercase} + \newcommand{\euro}{€} +$if(mainfont)$ + \setmainfont{$mainfont$} +$endif$ +$if(sansfont)$ + \setsansfont{$sansfont$} +$endif$ +$if(monofont)$ + \setmonofont[Mapping=tex-ansi]{$monofont$} +$endif$ +$if(mathfont)$ + \setmathfont(Digits,Latin,Greek){$mathfont$} +$endif$ +\fi +% use upquote if available, for straight quotes in verbatim environments +\IfFileExists{upquote.sty}{\usepackage{upquote}}{} +% use microtype if available +\IfFileExists{microtype.sty}{\usepackage{microtype}}{} +\usepackage[margin=1in]{geometry} +$if(natbib)$ +\usepackage{natbib} +\bibliographystyle{$if(biblio-style)$$biblio-style$$else$plainnat$endif$} +$endif$ +$if(biblatex)$ +\usepackage{biblatex} +$if(biblio-files)$ +\bibliography{$biblio-files$} +$endif$ +$endif$ +$if(listings)$ +\usepackage{listings} +$endif$ +$if(lhs)$ +\lstnewenvironment{code}{\lstset{language=Haskell,basicstyle=\small\ttfamily}}{} +$endif$ +\usepackage{fancyvrb} +\usepackage{color,framed} +\newcommand{\VerbBar}{|} +\newcommand{\VERB}{\Verb[commandchars=\\\{\}]} +\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\},fontsize=\small} +% Add ',fontsize=\small' for more characters per line +\definecolor{shadecolor}{gray}{1} +\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}} +\newcommand{\NormalTok}[1]{{#1}} +\let\KeywordTok\NormalTok +\let\DataTypeTok\NormalTok +\let\DecValTok\NormalTok +\let\BaseNTok\NormalTok +\let\FloatTok\NormalTok +\let\CharTok\NormalTok +\let\StringTok\NormalTok +\let\CommentTok\NormalTok +\let\OtherTok\NormalTok +\let\AlertTok\NormalTok +\let\FunctionTok\NormalTok 
+\let\RegionMarkerTok\NormalTok +\let\ErrorTok\NormalTok +%\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.00,0.44,0.13}{\textbf{{#1}}}} +%\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.56,0.13,0.00}{{#1}}} +%\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.25,0.63,0.44}{{#1}}} +%\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.25,0.63,0.44}{{#1}}} +%\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.25,0.63,0.44}{{#1}}} +%\newcommand{\CharTok}[1]{\textcolor[rgb]{0.25,0.44,0.63}{{#1}}} +%\newcommand{\StringTok}[1]{\textcolor[rgb]{0.25,0.44,0.63}{{#1}}} +%\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.38,0.63,0.69}{\textit{{#1}}}} +%\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.00,0.44,0.13}{{#1}}} +%\newcommand{\AlertTok}[1]{\textcolor[rgb]{1.00,0.00,0.00}{\textbf{{#1}}}} +%\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.02,0.16,0.49}{{#1}}} +%\newcommand{\RegionMarkerTok}[1]{{#1}} +%\newcommand{\ErrorTok}[1]{\textcolor[rgb]{1.00,0.00,0.00}{\textbf{{#1}}}} +$if(verbatim-in-note)$ +\usepackage{fancyvrb} +$endif$ +$if(tables)$ +\usepackage{longtable,booktabs} +$endif$ +$if(graphics)$ +\usepackage{graphicx} +\makeatletter +\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} +\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} +\makeatother +% Scale images if necessary, so that they will not overflow the page +% margins by default, and it is still possible to overwrite the defaults +% using explicit options in \includegraphics[width, height, ...]{} +\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} +$endif$ +\ifxetex + \usepackage[setpagesize=false, % page size defined by xetex + unicode=false, % unicode breaks when used with xetex + xetex]{hyperref} +\else + \usepackage[unicode=true]{hyperref} +\fi +\hypersetup{breaklinks=true, + bookmarks=true, + pdfauthor={$author-meta$}, + pdftitle={$title-meta$}, + colorlinks=true, + citecolor=$if(citecolor)$$citecolor$$else$blue$endif$, + 
urlcolor=$if(urlcolor)$$urlcolor$$else$blue$endif$, + linkcolor=$if(linkcolor)$$linkcolor$$else$magenta$endif$, + pdfborder={0 0 0}} +\urlstyle{same} % don't use monospace font for urls +$if(links-as-notes)$ +% Make links footnotes instead of hotlinks: +\renewcommand{\href}[2]{#2\footnote{\url{#1}}} +$endif$ +$if(strikeout)$ +\usepackage[normalem]{ulem} +% avoid problems with \sout in headers with hyperref: +\pdfstringdefDisableCommands{\renewcommand{\sout}{}} +$endif$ +\setlength{\parindent}{0pt} +\setlength{\parskip}{6pt plus 2pt minus 1pt} +\setlength{\emergencystretch}{3em} % prevent overfull lines +$if(numbersections)$ +\setcounter{secnumdepth}{5} +$else$ +\setcounter{secnumdepth}{0} +$endif$ +$if(verbatim-in-note)$ +\VerbatimFootnotes % allows verbatim text in footnotes +$endif$ +$if(lang)$ +\ifxetex + \usepackage{polyglossia} + \setmainlanguage{$mainlang$} +\else + \usepackage[$lang$]{babel} +\fi +$endif$ + +\usepackage{titlesec} +\titleformat{\chapter}[hang]{\Huge\bfseries}{\thechapter\ }{0pt}{\Huge\bfseries} + +\usepackage{fancyhdr} +\pagestyle{fancy} +\pagenumbering{arabic} +\lhead{\itshape $title$} +\chead{} +\rhead{\itshape{\nouppercase{\rightmark}}} +\lfoot{v$version$ ($date$)} +\cfoot{} +\rfoot{\thepage} + +$if(title)$ +\title{$title$$if(subtitle)$\\\vspace{0.5em}{\large $subtitle$}$endif$} +$endif$ +$if(author)$ +\author{$for(author)$$author$$sep$ \and $endfor$} +$endif$ +\date{$date$} +$for(header-includes)$ +$header-includes$ +$endfor$ + +\begin{document} +$if(title)$ +\maketitle +$endif$ +$if(abstract)$ +\begin{abstract} +$abstract$ +\end{abstract} +$endif$ + +$for(include-before)$ +$include-before$ + +$endfor$ +$if(toc)$ +{ +\hypersetup{linkcolor=black} +\setcounter{tocdepth}{$toc-depth$} +\tableofcontents +} +$endif$ +$body$ + +$if(natbib)$ +$if(biblio-files)$ +$if(biblio-title)$ +$if(book-class)$ +\renewcommand\bibname{$biblio-title$} +$else$ +\renewcommand\refname{$biblio-title$} +$endif$ +$endif$ +\bibliography{$biblio-files$} + +$endif$ 
+$endif$ +$if(biblatex)$ +\printbibliography$if(biblio-title)$[title=$biblio-title$]$endif$ + +$endif$ +$for(include-after)$ +$include-after$ + +$endfor$ +\end{document} |