- --- semantic-markdown - Pandoc plugin to process semantic hints
- ---
- --- SPDX-FileCopyrightText: 2025 Jonas Smedegaard <dr@jones.dk>
- --- SPDX-License-Identifier: GPL-3.0-or-later
- ---
- --- ## Examples
- ---
- --- The following Markdown text includes semantic annotations
- --- within braced enclosures:
- ---
- --- ```markdown
- --- # {=<#artwork> .:Image} Semantics
- ---
- --- Simple ontological annotation:
- --- [This][painting] is not a [pipe].
- ---
- --- Nested, mixed-use and custom-namespaced annotations:
- --- [[Ceci][painting] n'est pas une [pipe].]{lang=fr bibo:shortDescription}
- ---
- --- [painting]: {wd:Q1061035}
- --- "A painting of a smoking pipe {:depiction}"
- ---
- --- [pipe]: {wd:Q104526}
- --- "A smoking pipe {:depicts}"
- ---
- --- {@default}: foaf
- ---
- --- {bibo}: http://purl.org/ontology/bibo/
- ---
- --- {wd}: http://www.wikidata.org/entity/
- --- ```
- ---
- --- This filter should transform the above text, with the command
- --- `pandoc -L semantic-markdown.lua -t commonmark --wrap=preserve`,
- --- into the below markdown text with semantic annotations as metadata:
- ---
- --- ```markdown
- --- ---
- --- turtle: |
- --- @prefix bibo: http://purl.org/ontology/bibo/
- --- @prefix foaf: http://xmlns.com/foaf/0.1/
- --- @prefix wd: https://www.wikidata.org/entity/
- ---
- --- <#artwork> a foaf:Image ;
- --- foaf:depiction <https://www.wikidata.org/entity/Q1061035> ;
- --- foaf:depicts <https://www.wikidata.org/entity/Q104526> ;
- --- bibo:shortDescription "Ceci n'est pas une pipe."@fr .
- --- ---
- --- # Semantics
- ---
- --- Simple ontological annotation:
- --- [This][painting] is not a [pipe].
- ---
- --- Nested, mixed-use and custom-namespaced annotations:
- --- [[Ceci][painting] n'est pas une [pipe].]{lang=fr}
- ---
- --- [painting]: https://www.wikidata.org/entity/Q1061035
- --- "A painting of a smoking pipe"
- ---
- --- [pipe]: https://www.wikidata.org/entity/Q104526
- --- "A smoking pipe"
- --- ```
- ---
- --- This filter should also transform the above text, with the command
- --- `pandoc -L semantic-markdown.lua -t html --wrap=preserve`,
- --- into the below HTML text with embedded RDFa Lite 1.1 annotations,
- --- modulo wrapping of long lines:
- ---
- --- ```html
- --- <div vocab="http://xmlns.com/foaf/0.1/"
- --- prefix="bibo: http://purl.org/ontology/bibo/"
- --- resource="#artwork" typeof="Image">
- --- <h1>Semantics</h1>
- --- <p>Simple ontological annotation:
- --- <a property="depiction"
- --- href="https://www.wikidata.org/entity/Q1061035"
- --- title="A painting of a smoking pipe">This</a>
- --- is not
- --- a <a property="depicts"
- --- href="https://www.wikidata.org/entity/Q104526"
- --- title="A smoking pipe">pipe</a>.</p>
- ---
- --- <p>Nested, mixed-use and custom-namespaced annotations:
- --- <span lang="fr" property="bibo:shortDescription">
- --- <a property="depiction"
- --- href="https://www.wikidata.org/entity/Q1061035"
- --- title="A painting of a smoking pipe">Ceci</a>
- --- n'est pas
- --- une <a property="depicts"
- --- href="https://www.wikidata.org/entity/Q104526"
- --- title="A smoking pipe">pipe</a>.
- --- </span></p>
- --- </div>
- --- ```
- ---
- --- * v0.0.1
- --- * initial release
- ---
- --- @version 0.0.1
- --- @see <https://source.jones.dk/semantic-markdown/about/>
- --- @see <https://moodle.ruc.dk/course/view.php?id=23505>
- --- @see <https://www.w3.org/TR/rdfa-primer/#using-rdfa>
- --- @see <https://www.ctrl.blog/entry/rdfa-link-attributes.html>
- -- TODO: maybe use topdown traversal
- -- * order of declaring annotations might matter (but should not)
- -- * might enable simpler functions and/or faster processing
- -- @see <https://pandoc.org/lua-filters.html#topdown-traversal>
- -- ensure stable character classes independent of system locale,
- -- so that the Lua patterns used below match the same characters everywhere
- -- @see <https://pandoc.org/lua-filters.html#common-pitfalls>
- os.setlocale 'C' -- top-level statement: runs when the filter file is loaded
- -- flag running with older release of Pandoc (any release before 3.5)
- --
- -- Some Pandoc features,
- -- notably pandoc.List:at() introduced with Pandoc 3.5,
- -- are unavailable in older Pandoc releases still in widespread use
- -- due to complexities of keeping Haskell dependencies in sync.
- -- @see <https://bugs.debian.org/1098377>
- local PANDOC_IS_OLD <const> = PANDOC_VERSION[1] < 3 -- major version below 3
- or (PANDOC_VERSION[1] == 3 and PANDOC_VERSION[2] < 5) -- or 3.0 up to 3.4
- --- pseudo-enum table to track parser enclosure state while scanning inlines
- --- @see <https://stackoverflow.com/a/70529481/18619283>
- local Enclosure = {
- NONE = "0", -- outside any enclosure
- BRACKETED = "1", -- after an opening "[", collecting enclosed content
- BRACKETED_DONE = "2", -- enclosure just closed; only a directly following "{" keeps it
- BRACED = "3", -- after the opening "{", expecting a CURIE and "}"
- }
- -- element types representing content enclosure in Markdown;
- -- looked up by element tag, a listed element counts as an already closed enclosure
- local ElementTypeIsEnclosure = {
- Emph = true,
- Image = true,
- Link = true,
- Strong = true,
- }
- --- CURIE_TYPE_PREFIX - `typeof` or `resource` attribute CURIE hint
- local CURIE_TYPE_PREFIX <const> = "[.=]?" -- Lua pattern: optional "." or "="
- -- TODO: cover non-ASCII Unicode characters
- -- @see <https://www.lua.org/manual/5.4/manual.html#6.5>
- --- character-set fragments and the Lua patterns for CURIE components built from them
- --- @see <https://www.w3.org/TR/2010/NOTE-curie-20101216/>
- local _NAME_START_CHAR <const> = "A-Z_a-z" -- fragment, only meaningful inside "[...]"
- local _NAME_CHAR <const> = _NAME_START_CHAR.."-0-9" -- NOTE(review): relies on "-" after the a-z range being read as a literal hyphen
- local _REF <const> = "[".._NAME_START_CHAR.."][".._NAME_CHAR.."]*" -- reference: one start char, then any name chars
- local CURIE_PREFIX <const> = "[".._NAME_START_CHAR.."_-][".._NAME_CHAR.."]*" -- NOTE(review): first set also admits "-" (and repeats "_"), unlike _REF; confirm intended
- --- CURIE_LONG - CURIE with prefix and reference as set of chars
- local CURIE_LONG <const> = CURIE_PREFIX..":".._REF
- --- CURIE_NO_REF - CURIE with only prefix as set of chars
- local CURIE_NO_REF <const> = CURIE_PREFIX..":"
- --- CURIE_LOCAL - CURIE with only name as set of chars
- local CURIE_LOCAL <const> = ":".._REF
- --- CURIE_DEFAULT - CURIE without prefix or name as char
- local CURIE_DEFAULT <const> = ":" -- none of these patterns is anchored; callers add "^" and the closing delimiter
- -- TODO: CURIE_re - CURIE as `LPeg.re` regex object
- -- TODO: test and replace above curie* patterns
- -- @see <https://pandoc.org/lua-filters.html#global-variables>
- --local CURIE_re <const> = re.compile("("..CURIE_PREFIX..")?:(".._REF..")?")
- -- FIXME: define RDF context same as RDFa
- -- TODO: maybe support overriding context with a JSON-LD URI
- -- @see <https://www.w3.org/2011/rdfa-context/rdfa-1.1>
--- TableEmpty - check whether a Pandoc List holds no elements
---
--- Pandoc lists support plain integer indexing
--- (as used elsewhere in this filter),
--- so probing the first slot is enough.
--- This behaves the same on every Pandoc release:
--- it neither needs pandoc.List:at(), available only since Pandoc 3.5,
--- nor the former workaround of cloning the whole list
--- and popping an element from the copy.
---
--- @param list Pandoc List (or any Lua sequence) to inspect
--- @return true if the list has no first element, false otherwise
local function TableEmpty(list)
  return list[1] == nil
end
- --- Namespaces - process RDF namespace IRI declarations
- ---
- --- Takes as input a list of Para block elements.
- --- For each block matching the pattern for a namespace IRI definition,
- --- the declared namespace is extracted.
- --- Returns an empty paragraph in case of a match,
- --- or nothing (to signal preservation of original content).
- ---
- --- Example:
- ---
- --- ```Markdown
- --- # Annotated paragraph using a custom namespace
- ---
- --- My favorite animal is the [Liger]{ov:preferredAnimal}.
- --- {=<#me> .:Person}
- ---
- --- {ov}: http://open.vocab.org/terms/
- --- ```
- ---
- --- @param blocks Markdown with ontological annotations as Blocks
- --- @returns Markdown without ontological annotations as Blocks
- --- @see <https://pandoc.org/lua-filters.html#type-blocks>
- --- @see <https://www.w3.org/TR/rdf12-concepts/#vocabularies>
--- Namespaces - consume a paragraph that declares an RDF namespace
---
--- A declaration is a paragraph of exactly three inlines:
--- a Str holding a braced prefix followed by a colon (e.g. `{ov}:`),
--- a Space, and a value.
--- The value is accepted when it is the word `@default`
--- (as Str, or as a Cite wrapping a single such inline)
--- or a Str starting with `http:` or `https:` in any letter case.
---
--- @param blocks paragraph element to inspect (only `content` is read)
--- @return an empty table when the paragraph is a declaration,
---   nothing otherwise (so the caller keeps the original paragraph)
local function Namespaces(blocks)
  local inlines = blocks.content
  if #inlines ~= 3 then
    return
  end

  local declared, separator, value = inlines[1], inlines[2], inlines[3]
  if declared.t ~= "Str" or separator.t ~= "Space" then
    return
  end

  -- braced prefix-only CURIE directly followed by a colon
  local declaration_pattern = "^{"..CURIE_TYPE_PREFIX..CURIE_PREFIX.."}:$"
  if not declared.text:match(declaration_pattern) then
    return
  end

  if value.t == "Str" then
    -- default namespace, parsed as commonmark
    if value.text == "@default" then
      -- FIXME: add CURIE to metadata
      return {}
    end
    -- namespace IRI
    -- TODO: relax to match URI syntax without hardcoded protocols
    if value.text:match("^[Hh][Tt][Tt][Pp][Ss]?:") then
      -- FIXME: add CURIE and URI to metadata
      return {}
    end
    return
  end

  -- default namespace, parsed as markdown (where "@default" becomes a Cite)
  if value.t == "Cite"
    and #value.content == 1
    and value.content[1].text == "@default"
  then
    -- FIXME: add CURIE to metadata
    return {}
  end
end
- --- Statements - process inline RDF statements
- ---
- --- Locate and extract ontological annotations
- --- within a [Block] element of a Pandoc Abstract Syntax Tree (AST).
- ---
- --- Markup for ontological annotations is an extension to Markdown
- --- using similar syntax as hypermedia annotations,
- --- but listing RDFa [CURIEs] in a braced enclosure.
- ---
- --- ```ASCII-art
- --- Simple ontological annotation:
- --- "A [map]{foaf:depiction} is not the territory"
- --- | ||\~~~~~~~~~~~~/|
- --- a bc CURIEa d
- ---
- --- Nested and mixed-use annotations:
- --- ["Ceci n'est pas une [pipe]{foaf:depicts lang=fr}"]{dc:description}
- --- | | ||\~~~~~~~~~~/ | ||\~~~~~~~~~~~~/|
- --- a a1 |c1 CURIEa d1bc CURIEb d
- --- b1
- ---
- --- Chained hypermedia and ontological annotations:
- --- "A [map](https://osm.org/){foaf:depiction} is not the territory"
- --- | || ||\~~~~~~~~~~~~/|
- --- a be fc CURIEa d
- ---
- --- Legend:
- --- a-b: bracketed enclosure around content
- --- c-d: braced enclosure around ontological or other annotation
- --- e-f: parenthesized enclosure around hypermedia annotation
- --- ```
- ---
- --- Ontological annotations are parsed and reorganised
- --- using the following algorithm:
- ---
- --- 1. locate pairs of bracketed text and braced text
- --- either adjacent or separated by parenthesized text,
- --- where braced text contains one or more [CURIEs]
- --- 2. for each pair,
- --- 1. add CURIEs in braced text to metadata
- --- 2. add positions of brackets to metadata
- --- 3. delete CURIEs
- --- 4. delete braced enclosure if now structurally empty
- --- 5. delete brackets if now unannotated
- ---
- --- The implementation is inspired by Pandoc [issue#6038].
- ---
- --- @param block Block element whose inline content may hold semantic annotations
- --- @returns the Block stripped of semantic annotations, or nothing if none were found
- --- @see [Block]: <https://pandoc.org/lua-filters.html#type-block>
- --- @see [CURIEs]: <https://www.w3.org/TR/2010/NOTE-curie-20101216/>
- --- @see [issue#6038]: <https://github.com/jgm/pandoc/issues/6038>
- -- TODO: maybe instead as step #5 add/reuse hypermedia anchor
--- Statements - strip inline RDF statements from a block
---
--- Scans the inline content of a block for pairs of
--- a content enclosure (bracketed text, or an Emph, Image, Link
--- or Strong element) directly followed by a braced enclosure
--- whose last token is a CURIE.
--- For each pair the brackets and the braced enclosure are removed
--- and only the enclosed content is kept.
--- Everything that is not part of such a pair is emitted verbatim,
--- in its original order.
---
--- Parsing rules:
---
---  * a backslash and the character after it are copied verbatim
---    and never open or close an enclosure
---  * a "[" or "}" inside a bracketed enclosure disqualifies it;
---    a "[" then starts a new candidate enclosure
---  * a bracketed enclosure not directly followed by "{" is disqualified
---  * a braced enclosure closed by "}" without a trailing CURIE
---    is disqualified
---  * a disqualified or unfinished enclosure is kept as plain text
---
--- @param block Pandoc Block element; its `content` is replaced
---   when at least one statement was found
--- @return the block stripped of statements,
---   or nil when it has no `content` or contains no statement
-- TODO: support nested bracket enclosure
-- TODO: support mixed-use enclosure
--   (braced annotations preceding the CURIE are currently discarded)
-- FIXME: add extracted CURIEs to metadata
local function Statements (block)
  local content = block.content

  -- block types lacking a content field cannot hold statements
  if content == nil then
    return nil
  end

  -- a braced enclosure ends with one of these CURIE forms and "}"
  local curie_patterns = {
    "^"..CURIE_TYPE_PREFIX..CURIE_LONG.."}",
    "^"..CURIE_TYPE_PREFIX..CURIE_NO_REF.."}",
    "^"..CURIE_TYPE_PREFIX..CURIE_LOCAL.."}",
    "^"..CURIE_TYPE_PREFIX..CURIE_DEFAULT.."}",
  }

  -- current enclosure state
  local encl = Enclosure.NONE

  -- amount of detected statements in this block
  local statement_count = 0

  -- elems: output decided so far
  -- elems_unenclosed: pending input in verbatim form (used on rollback)
  -- elems_enclosed: pending content of the candidate enclosure
  local elems = pandoc.List()
  local elems_unenclosed = pandoc.List()
  local elems_enclosed = pandoc.List()

  -- pending text of the Str being scanned, in verbatim and content form
  local chars_unenclosed = ""
  local chars_enclosed = ""

  -- move pending text onto the pending element stacks
  local function push_chars()
    if chars_enclosed ~= "" then
      elems_enclosed:insert(pandoc.Str(chars_enclosed))
    end
    if chars_unenclosed ~= "" then
      elems_unenclosed:insert(pandoc.Str(chars_unenclosed))
    end
    chars_enclosed = ""
    chars_unenclosed = ""
  end

  -- emit all pending input verbatim and forget pending content
  local function flush_raw()
    if chars_unenclosed ~= "" then
      elems_unenclosed:insert(pandoc.Str(chars_unenclosed))
    end
    elems:extend(elems_unenclosed)
    elems_unenclosed = pandoc.List()
    elems_enclosed = pandoc.List()
    chars_unenclosed = ""
    chars_enclosed = ""
  end

  -- give up on the candidate enclosure:
  -- its verbatim form stays pending and continues as plain text
  local function disqualify()
    elems_enclosed = pandoc.List()
    chars_enclosed = ""
    encl = Enclosure.NONE
  end

  -- accept the candidate enclosure:
  -- emit its content and drop brackets, braces and braced text
  local function qualify()
    if chars_enclosed ~= "" then
      elems_enclosed:insert(pandoc.Str(chars_enclosed))
    end
    if not TableEmpty(elems_enclosed) then
      elems:extend(elems_enclosed)
    end
    elems_enclosed = pandoc.List()
    elems_unenclosed = pandoc.List()
    chars_enclosed = ""
    chars_unenclosed = ""
    encl = Enclosure.NONE
    statement_count = statement_count + 1
  end

  for _, el in ipairs(content) do
    if el.t ~= 'Str' then
      if encl == Enclosure.BRACED then
        -- inside braces: part of the annotation, not of the content
        elems_unenclosed:insert(el)
      elseif encl == Enclosure.BRACKETED then
        elems_unenclosed:insert(el)
        elems_enclosed:insert(el)
      else
        if encl == Enclosure.BRACKETED_DONE then
          -- disqualify enclosure not directly followed by brace
          disqualify()
        end
        if ElementTypeIsEnclosure[el.t] then
          -- the element is a complete enclosure of its own:
          -- settle what came before, then await a brace
          flush_raw()
          elems_unenclosed:insert(el)
          elems_enclosed:insert(el)
          encl = Enclosure.BRACKETED_DONE
        else
          elems_unenclosed:insert(el)
        end
      end
    else
      local text = el.text
      local len = #text
      local pos = 1

      -- every pass either advances pos or drops back to state NONE,
      -- and state NONE always advances pos
      while pos <= len do
        if encl == Enclosure.NONE then
          local hit = text:find("[%[\\]", pos)
          if not hit then
            chars_unenclosed = chars_unenclosed..text:sub(pos)
            pos = len + 1
          elseif text:sub(hit, hit) == "\\" then
            -- escaped character: keep verbatim
            chars_unenclosed = chars_unenclosed..text:sub(pos, hit + 1)
            pos = hit + 2
          else
            -- entering bracketed enclosure: settle what came before
            chars_unenclosed = chars_unenclosed..text:sub(pos, hit - 1)
            flush_raw()
            chars_unenclosed = "["
            pos = hit + 1
            encl = Enclosure.BRACKETED
          end

        elseif encl == Enclosure.BRACKETED then
          local hit = text:find("[%[%]}\\]", pos)
          if not hit then
            local run = text:sub(pos)
            chars_unenclosed = chars_unenclosed..run
            chars_enclosed = chars_enclosed..run
            pos = len + 1
          else
            local run = text:sub(pos, hit - 1)
            local char = text:sub(hit, hit)
            chars_unenclosed = chars_unenclosed..run
            chars_enclosed = chars_enclosed..run
            if char == "]" then
              -- exiting bracketed enclosure
              chars_unenclosed = chars_unenclosed.."]"
              pos = hit + 1
              encl = Enclosure.BRACKETED_DONE
            elseif char == "\\" then
              -- escaped character: keep verbatim as content
              local escaped = text:sub(hit, hit + 1)
              chars_unenclosed = chars_unenclosed..escaped
              chars_enclosed = chars_enclosed..escaped
              pos = hit + 2
            else
              -- nested "[" or stray "}": not a valid enclosure,
              -- rescan the character as unenclosed text
              disqualify()
              pos = hit
            end
          end

        elseif encl == Enclosure.BRACKETED_DONE then
          if text:sub(pos, pos) == "{" then
            -- entering braced enclosure
            chars_unenclosed = chars_unenclosed.."{"
            pos = pos + 1
            encl = Enclosure.BRACED
          else
            -- disqualify enclosure not directly followed by brace,
            -- rescan the character as unenclosed text
            disqualify()
          end

        else
          -- in braced enclosure: look for a closing CURIE
          local last
          for _, pattern in ipairs(curie_patterns) do
            local _, stop = text:find(pattern, pos)
            if stop then
              last = stop
              break
            end
          end
          if last then
            qualify()
            -- remains of the Str are scanned as unenclosed text
            pos = last + 1
          else
            local close = text:find("}", pos, true)
            if close then
              -- braces closed without a trailing CURIE
              chars_unenclosed = chars_unenclosed..text:sub(pos, close)
              pos = close + 1
              disqualify()
            else
              -- other annotation; the CURIE may follow in a later Str
              chars_unenclosed = chars_unenclosed..text:sub(pos)
              pos = len + 1
            end
          end
        end
      end

      -- done parsing current Str: push strings to stacks
      push_chars()
    end
  end

  -- keep the block untouched unless it contained complete statements
  if statement_count == 0 then
    return nil
  end

  -- emit trailing unenclosed content and any incomplete enclosure
  flush_raw()
  block.content = elems
  return block
end
- -- First resolve namespace declarations, then statements.
- --
- -- Although this filter is *not* a full RDF parser,
- -- order matters for the parts we do handle --
- -- e.g. namespace resolving is similar to other RDF formats
- -- with detailed documented process ordering.
- --
- -- @see <https://www.w3.org/TR/turtle/#sec-parsing>
-- document metadata, set aside while the document body is processed
local meta = {}

--- Stash the document metadata and hand Pandoc an empty table instead.
---
--- Moves aside MetaBlocks to speed up processing content.
---
--- @param m document metadata
--- @return empty table
--- @see <https://stackoverflow.com/a/47356252/18619283>
local function stash_meta(m)
  meta = m
  return {}
end

--- Hand back the metadata stashed by the first filter.
---
--- @return the stashed metadata, unchanged
-- FIXME: add custom declared namespaces in Meta
-- TODO: maybe add only actively used namespaces
--       (do same as for unused link definitions)
local function restore_meta(_)
  return meta
  --return NamespacesToMeta(meta)
end

-- filters are listed in the order they are meant to run
return {
  { Meta = stash_meta },
  { Para = Namespaces },
  { Block = Statements },
  { Meta = restore_meta },
}
|