--- semantic-markdown - Pandoc plugin to process semantic hints --- --- SPDX-FileCopyrightText: 2025 Jonas Smedegaard --- SPDX-License-Identifier: GPL-3.0-or-later --- --- ## Examples --- --- The following Markdown text includes semantic annotations --- within braced enclosures: --- --- ```markdown --- # {=<#artwork> .:Image} Semantics --- --- Simple ontological annotation: --- [This][painting] is not a [pipe]. --- --- Nested, mixed-use and custom-namespaced annotations: --- [[Ceci][painting] n'est pas une [pipe].]{lang=fr bibo:shortDescription} --- --- [painting]: {wd:Q1061035} --- "A painting of a smoking pipe {:depiction}" --- --- [pipe]: {wd:Q104526} --- "A smoking pipe {:depicts}" --- --- {@default}: foaf --- --- {bibo}: http://purl.org/ontology/bibo/ --- --- {wd}: http://www.wikidata.org/entity/ --- ``` --- --- This filter should transform the above text, with the command --- `pandoc -L semantic-markdown.lua -t commonmark --wrap=preserve`, --- into the below markdown text with semantic annotations as metadata: --- --- ```markdown --- --- --- turtle: | --- @prefix bibo: http://purl.org/ontology/bibo/ --- @prefix foaf: http://xmlns.com/foaf/0.1/ --- @prefix wd: https://www.wikidata.org/entity/ --- --- <#artwork> a foaf:Image ; --- foaf:depiction ; --- foaf:depicts ; --- bibo:shortDescription "Ceci n'est pas une pipe."@fr . --- --- --- # Semantics --- --- Simple ontological annotation: --- [This][painting] is not a [pipe]. 
--- --- Nested, mixed-use and custom-namespaced annotations: --- [[Ceci][painting] n'est pas une [pipe].]{lang=fr} --- --- [painting]: https://www.wikidata.org/entity/Q1061035 --- "A painting of a smoking pipe" --- --- [pipe]: https://www.wikidata.org/entity/Q104526 --- "A smoking pipe" --- ``` --- --- This filter should also transform the above text, with the command --- `pandoc -L semantic-markdown.lua -t html --wrap=preserve`, --- into the below HTML text with embedded RDFa Lite 1.1 annotations, --- modulo wrapping of long lines: --- --- ```html ---
---

Semantics

---

Simple ontological annotation: --- This --- is not --- a pipe.

--- ---

Nested, mixed-use and custom-namespaced annotations: --- --- Ceci --- n'est pas --- une pipe. ---

---
--- ```
---
--- * v0.0.1
---   * initial release
---
--- @version 0.0.1
--- @see
--- @see
--- @see
--- @see

-- TODO: maybe use topdown traversal
-- * order of declaring annotations might matter (but should not)
-- * might enable simpler functions and/or faster processing
-- @see

-- Pin the locale so that character classes in patterns
-- do not vary with the system locale.
-- @see
os.setlocale("C")

--- Enclosure - pseudo-enum of the parser states used while scanning
--- for a bracketed enclosure followed by a braced enclosure
--- @see
local Enclosure = {
  NONE           = "0",
  BRACKETED      = "1",
  BRACKETED_DONE = "2",
  BRACED         = "3",
}

--- ElementTypeIsEnclosure - lookup set of inline element types
--- that by themselves represent a content enclosure in Markdown
local ElementTypeIsEnclosure = {
  Emph   = true,
  Strong = true,
  Link   = true,
  Image  = true,
}

--- curie_type_prefix - optional `typeof` (".") or `resource` ("=") hint
--- in front of a CURIE, as a Lua pattern fragment
local curie_type_prefix = "[.=]?"

-- TODO: cover non-ASCII Unicode characters
-- @see

-- Character sets are kept without surrounding brackets
-- so that they can be spliced into larger sets below.
-- The "-" between "a-z" and "0-9" is a literal hyphen, not a range.
local _name_start_char = "A-Z_a-z"
local _name_char = string.format("%s-0-9", _name_start_char)

-- reference part of a CURIE: one start char, then any name chars
local _ref = string.format("[%s][%s]*", _name_start_char, _name_char)

--- curie_prefix - CURIE prefix component as set of chars
--- @see
local curie_prefix = string.format("[%s_-][%s]*", _name_start_char, _name_char)

--- curie_long - CURIE with prefix and reference as set of chars
local curie_long = string.format("%s:%s", curie_prefix, _ref)

--- curie_no_ref - CURIE with only prefix as set of chars
local curie_no_ref = string.format("%s:", curie_prefix)

--- curie_local - CURIE with only name as set of chars
local curie_local = string.format(":%s", _ref)

--- curie_default - CURIE without prefix or name as char
local curie_default = ":"

-- TODO: curie_re - CURIE as `LPeg.re` regex object
-- TODO: test and replace above curie* patterns
-- @see
--local curie_re = re.compile("("..curie_prefix..")?:(".._ref..")?")

-- FIXME: define RDF context same as RDFa
-- TODO: maybe support overriding context with a JSON-LD URI
-- @see

--- Namespaces - process RDF namespace IRI declarations
---
--- Takes as input a list of Para block elements.
--- For each block matching the pattern for a namespace IRI definition,
--- the declared namespace is extracted.
--- Returns an empty list in case of a match
--- (which asks pandoc to drop the declaring paragraph),
--- or nothing (to signal preservation of original content).
---
--- Example:
---
--- ```Markdown
--- # Annotated paragraph using a custom namespace
---
--- My favorite animal is the [Liger]{ov:preferredAnimal}.
--- {=<#me> .:Person}
---
--- {ov}: http://open.vocab.org/terms/
--- ```
---
--- @param blocks Markdown with ontological annotations as Blocks
--- @returns Markdown without ontological annotations as Blocks
--- @see
--- @see
local function Namespaces(blocks)
  -- a declaration is exactly: braced prefix with colon, a space, one word
  local declaration_pattern = "^{"..curie_type_prefix..curie_prefix.."}:$"

  local inlines = blocks.content
  if #inlines ~= 3 then
    return
  end

  local head, gap, target = inlines[1], inlines[2], inlines[3]
  if head.t ~= "Str" or gap.t ~= "Space" then
    return
  end
  if not head.text:match(declaration_pattern) then
    return
  end

  if target.t == "Str" then
    -- default namespace, as tokenized by the commonmark reader
    if target.text == "@default" then
      -- FIXME: add CURIE to metadata
      return {}
    end

    -- namespace IRI
    -- TODO: relax to match URI syntax without hardcoded protocols
    if target.text:match("^[Hh][Tt][Tt][Pp][Ss]?:") then
      -- FIXME: add CURIE and URI to metadata
      return {}
    end
  elseif target.t == "Cite" then
    -- default namespace, as tokenized by the markdown reader
    -- (where "@default" is read as a citation)
    if #target.content == 1 and target.content[1].text == "@default" then
      -- FIXME: add CURIE to metadata
      return {}
    end
  end
end

--- Statements - process inline RDF statements
---
--- Locate and extract ontological annotations
--- within a [Block] element of a Pandoc Abstract Syntax Tree (AST).
---
--- Markup for ontological annotations is an extension to Markdown
--- using similar syntax as hypermedia annotations,
--- but listing RDFa [CURIEs] in a braced enclosure.
--- --- ```ASCII-art --- Simple ontological annotation: --- "A [map]{foaf:depiction} is not the territory" --- | ||\~~~~~~~~~~~~/| --- a bc CURIEa d --- --- Nested and mixed-use annotations: --- ["Ceci n'est pas une [pipe]{foaf:depicts lang=fr}"]{dc:description} --- | | ||\~~~~~~~~~~/ | ||\~~~~~~~~~~~~/| --- a a1 |c1 CURIEa d1bc CURIEb d --- b1 --- --- Chained hypermedia and ontological annotations: --- "A [map](https://osm.org/){foaf:depiction} is not the territory" --- | || ||\~~~~~~~~~~~~/| --- a be fc CURIEa d --- --- Legend: --- a-b: bracketed enclosure around content --- c-d: braced enclosure around ontological or other annotation --- e-f: parenthesized enclosure around hypermedia annotation --- ``` --- --- Ontological annotations are parsed and reorganised --- using the following algorithm: --- --- 1. locate pairs of bracketed text and braced text --- either adjacent or separated by parenthesized text, --- where braced text contains one or more [CURIEs] --- 2. for each pair, --- 1. add CURIEs in braced text to metadata --- 2. add positions of brackets to metadata --- 3. delete CURIEs --- 4. delete braced enclosure if now structurally empty --- 5. delete brackets if now unannotated --- --- The implementation is inspired by Pandoc [issue#6038]. 
---
--- Known limitations: once an enclosure has been qualified or
--- disqualified inside a Str, the remainder of that Str is kept verbatim
--- and not scanned for further enclosures; the same applies to text
--- following a backslash.  Nested brackets are not supported and are
--- left untouched.
---
--- @param block Block element whose content may hold semantic annotations
--- @returns the block stripped of semantic annotations,
---   or nothing if no annotation was found (to keep the block as-is)
--- @see [Block]:
--- @see [CURIEs]:
--- @see [issue#6038]:

-- TODO: maybe instead as step #5 add/reuse hypermedia anchor
local function Statements (block)
  -- This function is registered for every Block type, but some of them
  -- carry no `content` field (presumably e.g. CodeBlock or
  -- HorizontalRule); iterating a missing field would raise an error.
  if not block.content then
    return
  end

  -- CURIE shapes accepted right after the opening brace,
  -- each anchored at the scan position and closed by the brace
  local curie_patterns = {
    "^"..curie_type_prefix..curie_long.."}",
    "^"..curie_type_prefix..curie_no_ref.."}",
    "^"..curie_type_prefix..curie_local.."}",
    "^"..curie_type_prefix..curie_default.."}",
  }

  -- flags for enclosing stages
  -- TODO: support nested bracket enclosure
  local encl = Enclosure.NONE

  -- amount of detected statements in this block
  local statement_count = 0

  -- stacks of qualified and pending unenclosed/enclosed elements:
  --  * elems: content that is settled
  --  * elems_unenclosed: pending content in its literal form,
  --    used if the current enclosure turns out not to be an annotation
  --  * elems_enclosed: pending content with enclosure markup stripped,
  --    used if the current enclosure turns out to be an annotation
  local elems = pandoc.List()
  local elems_unenclosed = pandoc.List()
  local elems_enclosed = pandoc.List()

  -- settle pending content in its literal form and start over
  local function flush_unenclosed()
    elems:extend(elems_unenclosed)
    elems_unenclosed = pandoc.List()
    elems_enclosed = pandoc.List()
  end

  for _, el in ipairs(block.content) do
    if el.t ~= 'Str' then
      -- non-string element
      elems_unenclosed:insert(el)

      if encl == Enclosure.BRACED or encl == Enclosure.BRACKETED then
        elems_enclosed:insert(el)
      else
        if encl == Enclosure.BRACKETED_DONE then
          -- disqualify bracketing not directly followed by brace
          flush_unenclosed()
          encl = Enclosure.NONE
        end

        -- specific elements represent content enclosure
        if ElementTypeIsEnclosure[el.t] then
          -- The element is kept whether or not an annotation follows,
          -- so settle it (and anything pending before it) right away;
          -- otherwise it would be discarded when the brace qualifies.
          flush_unenclosed()
          encl = Enclosure.BRACKETED_DONE
        end
      end
    else
      local text = el.text
      local pos = 1

      -- strings of pending unenclosed/enclosed chars
      local chars_unenclosed = ""
      local chars_enclosed = ""

      -- enclosure element not directly followed by brace:
      -- disqualify before scanning, so this Str is parsed as unenclosed
      if encl == Enclosure.BRACKETED_DONE and text:sub(1, 1) ~= "{" then
        flush_unenclosed()
        encl = Enclosure.NONE
      end

      -- unenclosed
      -- TODO: accept backslash except immediately before bracket
      if encl == Enclosure.NONE then
        -- always matches, possibly the empty string
        local _, stop, plain = text:find("^([^%[\\]*)")
        pos = stop + 1

        if text:sub(pos, pos) == "[" then
          -- entering bracketed enclosure: qualify unenclosed elements
          flush_unenclosed()
          if plain:len() > 0 then
            elems:insert(pandoc.Str(plain))
          end
          pos = pos + 1
          -- the text before the bracket is settled already,
          -- so the literal form restarts at the bracket
          chars_unenclosed = "["
          chars_enclosed = ""
          encl = Enclosure.BRACKETED
        else
          -- no bracket (end of string or a backslash):
          -- keep the whole string as literal text
          chars_unenclosed = text
          pos = text:len() + 1
        end
      end

      -- in bracketed enclosure
      -- TODO: accept backslash except immediately before bracket/brace
      -- TODO: support nested bracket enclosure
      if encl == Enclosure.BRACKETED then
        -- pos is at most one past the end, so this always matches
        local _, stop, inner = text:find("^([^%[%]}\\]*)", pos)
        pos = stop + 1
        chars_unenclosed = chars_unenclosed..inner
        chars_enclosed = chars_enclosed..inner

        if text:sub(pos, pos) == "]" then
          -- exiting bracketed enclosure
          pos = pos + 1
          chars_unenclosed = chars_unenclosed.."]"
          encl = Enclosure.BRACKETED_DONE
        elseif pos <= text:len() then
          -- unsupported character inside brackets:
          -- disqualify and keep the remaining text verbatim
          flush_unenclosed()
          elems:insert(pandoc.Str(chars_unenclosed..text:sub(pos)))
          chars_unenclosed = ""
          chars_enclosed = ""
          pos = text:len() + 1
          encl = Enclosure.NONE
        end
      end

      -- exited bracketed enclosure
      if encl == Enclosure.BRACKETED_DONE then
        if text:sub(pos, pos) == "{" then
          -- entering braced enclosure
          pos = pos + 1
          chars_unenclosed = chars_unenclosed.."{"
          encl = Enclosure.BRACED
        else
          -- disqualify bracketing not directly followed by brace,
          -- keeping the remaining text verbatim
          flush_unenclosed()
          chars_unenclosed = chars_unenclosed..text:sub(pos)
          if chars_unenclosed:len() > 0 then
            elems:insert(pandoc.Str(chars_unenclosed))
          end
          chars_unenclosed = ""
          chars_enclosed = ""
          pos = text:len() + 1
          encl = Enclosure.NONE
        end
      end

      -- in braced enclosure, leaving it
      -- TODO: support mixed-use enclosure
      if encl == Enclosure.BRACED then
        local stop
        for _, pattern in ipairs(curie_patterns) do
          local _, match_end = text:find(pattern, pos)
          if match_end then
            stop = match_end
            break
          end
        end

        if stop then
          statement_count = statement_count + 1
          pos = stop + 1

          -- TODO: instead recursively call Statements() on remains of Str
          chars_enclosed = chars_enclosed..text:sub(pos)

          -- qualify completed bracketed enclosure
          elems:extend(elems_enclosed)
          elems_enclosed = pandoc.List()
          elems_unenclosed = pandoc.List()
          if chars_enclosed:len() > 0 then
            elems:insert(pandoc.Str(chars_enclosed))
          end
          chars_enclosed = ""
          chars_unenclosed = ""
          encl = Enclosure.NONE
        else
          -- not (yet) a CURIE: remember the literal text, so that
          -- nothing is lost if the enclosure is never completed
          chars_unenclosed = chars_unenclosed..text:sub(pos)
        end
      end

      -- push strings to stacks
      if chars_enclosed:len() > 0 then
        elems_enclosed:insert(pandoc.Str(chars_enclosed))
      end
      if chars_unenclosed:len() > 0 then
        elems_unenclosed:insert(pandoc.Str(chars_unenclosed))
      end
    end
  end

  -- return altered stack if it contains complete enclosures
  if statement_count > 0 then
    -- Whatever is still pending is literal content: either plain
    -- trailing content, or an enclosure that was never completed.
    elems:extend(elems_unenclosed)
    block.content = elems
    return block
  end
end

-- First resolve namespace declarations, then statements.
--
-- Although this filter is *not* a full RDF parser,
-- order matters for the parts we do handle --
-- e.g. namespace resolving is similar to other RDF formats
-- with detailed documented process ordering.
--
-- @see
local meta = {}

return {
  -- move aside MetaBlocks to speed up processing content
  --
  -- @see
  { Meta = function(m) meta = m; return {} end },

  {Para = Namespaces},
  {Block = Statements},

  -- FIXME: add custom declared namespaces in Meta
  -- TODO: maybe add only actively used namespaces
  -- (do same as for unused link definitions)
  { Meta = function(_) return meta; end },
  --{ Meta = function(_) return NamespacesToMeta(meta); end },
}