diff options
| author | Jonas Smedegaard <dr@jones.dk> | 2025-05-24 14:26:51 +0200 |
|---|---|---|
| committer | Jonas Smedegaard <dr@jones.dk> | 2025-05-24 14:26:51 +0200 |
| commit | 7f610808af0a83264d9e2dd52620b01bea4d0091 (patch) | |
| tree | 26bc05ea5ae624f2f214f5ffec1cbc7666792da9 /_extensions | |
| parent | 85447f31197306a540af3b761700ae16ff75a110 (diff) | |
rename filter semantic-markdown -> sem-md; consistently capitalize spec Semantic Markdown
Diffstat (limited to '_extensions')
| l--------- | _extensions/ruc-play/sem-md | 1 | ||||
| -rw-r--r-- | _extensions/ruc-play/semantic-markdown/_extension.yaml | 6 | ||||
| -rw-r--r-- | _extensions/ruc-play/semantic-markdown/semantic-markdown.lua | 602 |
3 files changed, 1 insertions, 608 deletions
diff --git a/_extensions/ruc-play/sem-md b/_extensions/ruc-play/sem-md new file mode 120000 index 0000000..65c97a3 --- /dev/null +++ b/_extensions/ruc-play/sem-md @@ -0,0 +1 @@ +/home/jonas/Projects/PLAY/md/sem-md
\ No newline at end of file diff --git a/_extensions/ruc-play/semantic-markdown/_extension.yaml b/_extensions/ruc-play/semantic-markdown/_extension.yaml deleted file mode 100644 index 76b9a7a..0000000 --- a/_extensions/ruc-play/semantic-markdown/_extension.yaml +++ /dev/null @@ -1,6 +0,0 @@ -name: semantic-markdown -author: Jonas Smedegaard -version: 0.0.1 -contributes: - filters: - - semantic-markdown.lua diff --git a/_extensions/ruc-play/semantic-markdown/semantic-markdown.lua b/_extensions/ruc-play/semantic-markdown/semantic-markdown.lua deleted file mode 100644 index abdb078..0000000 --- a/_extensions/ruc-play/semantic-markdown/semantic-markdown.lua +++ /dev/null @@ -1,602 +0,0 @@ ---- semantic-markdown - Pandoc filter to process semantic hints ---- ---- SPDX-FileCopyrightText: 2025 Jonas Smedegaard <dr@jones.dk> ---- SPDX-License-Identifier: GPL-3.0-or-later ---- ---- ## Examples ---- ---- The following Markdown text includes semantic annotations ---- within braced enclosures: ---- ---- ```markdown ---- # {=<#artwork> .:Image} Semantics ---- ---- Simple ontological annotation: ---- [This][painting] is not a [pipe]. ---- ---- Nested, mixed-use and custom-namespaced annotations: ---- [[Ceci][painting] n'est pas une [pipe].]{lang=fr bibo:shortDescription} ---- ---- [painting]: {wd:Q1061035} ---- "A painting of a smoking pipe {:depiction}" ---- ---- [pipe]: {wd:Q104526} ---- "A smoking pipe {:depicts}" ---- ---- {@default}: foaf ---- ---- {bibo}: http://purl.org/ontology/bibo/ ---- ---- {wd}: http://www.wikidata.org/entity/ ---- ``` ---- ---- This filter should transform the above text, with the command ---- `pandoc -L semantic-markdown.lua -t commonmark --wrap=preserve`, ---- into the below markdown text with semantic annotations as metadata: ---- ---- ```markdown ---- --- ---- turtle: | ---- @prefix bibo: http://purl.org/ontology/bibo/ ---- @prefix foaf: http://xmlns.com/foaf/0.1/ ---- @prefix wd: https://www.wikidata.org/entity/ ---- ---- <#artwork> a foaf:Image ; ---- foaf:depiction <https://www.wikidata.org/entity/Q1061035> ; ---- foaf:depicts <https://www.wikidata.org/entity/Q104526> ; ---- bibo:shortDescription "Ceci n'est pas une pipe."@fr . ---- --- ---- # Semantics ---- ---- Simple ontological annotation: ---- [This][painting] is not a [pipe]. ---- ---- Nested, mixed-use and custom-namespaced annotations: ---- [[Ceci][painting] n'est pas une [pipe].]{lang=fr} ---- ---- [painting]: https://www.wikidata.org/entity/Q1061035 ---- "A painting of a smoking pipe" ---- ---- [pipe]: https://www.wikidata.org/entity/Q104526 ---- "A smoking pipe" ---- ``` ---- ---- This filter should also transform the above text, with the command ---- `pandoc -L semantic-markdown.lua -t html --wrap=preserve`, ---- into the below HTML text with embedded RDFa Lite 1.1 anotations, ---- modulo wrapping of long lines: ---- ---- ```html ---- <div vocab="http://xmlns.com/foaf/0.1/" ---- prefix="bibo: http://purl.org/ontology/bibo/" ---- resource="#artwork" typeof="Image"> ---- <h1>Semantics</h1> ---- <p>Simple ontological annotation: ---- <a property="depiction" ---- href="https://www.wikidata.org/entity/Q1061035" ---- title="A painting of a smoking pipe">This</a> ---- is not ---- a <a property="depicts" ---- href="https://www.wikidata.org/entity/Q104526" ---- title="A smoking pipe">pipe</a>.</p> ---- ---- <p>Nested, mixed-use and custom-namespaced annotations: ---- <span lang="fr" property="bibo:shortDescription"> ---- <a property="depiction" ---- href="https://www.wikidata.org/entity/Q1061035" ---- title="A painting of a smoking pipe">Ceci</a> ---- n'est pas ---- une <a property="depicts" ---- href="https://www.wikidata.org/entity/Q104526" ---- title="A smoking pipe">pipe</a>. ---- </span></p> ---- </div> ---- ``` ---- ---- * v0.0.1 ---- * initial release ---- ---- @version 0.0.1 ---- @see <https://source.jones.dk/semantic-markdown/about/> ---- @see <https://moodle.ruc.dk/course/view.php?id=23505> ---- @see <https://www.w3.org/TR/rdfa-primer/#using-rdfa> ---- @see <https://www.ctrl.blog/entry/rdfa-link-attributes.html> - --- TODO: maybe use topdown traversal --- * order of declaring annotations might matter (but should not) --- * might enable simpler functions and/or faster processing --- @see <https://pandoc.org/lua-filters.html#topdown-traversal> - --- ensure stable character classes independent of system locale --- @see <https://pandoc.org/lua-filters.html#common-pitfalls> -os.setlocale 'C' - --- flag running with older release of Pandoc --- --- Some Pandoc features, --- notably pandoc.List:at() introduced wit Pandoc 3.5, --- are unavailable in older Pandoc releases still in widespread use --- due to complexities of keeping Haskell dependencies in sync. --- @see <https://bugs.debian.org/1098377> -local PANDOC_IS_OLD <const> = PANDOC_VERSION[1] < 3 - or (PANDOC_VERSION[1] == 3 and PANDOC_VERSION[2] < 5) - ---- pseudo-enum table to track parser enclosure state ---- @see <https://stackoverflow.com/a/70529481/18619283> -local Enclosure = { - NONE = "0", - BRACKETED = "1", - BRACKETED_DONE = "2", - BRACED = "3", - BRACED_DONE = "4", -} - --- element types representing content enclosure in Markdown -local ElementTypeIsEnclosure = { - Emph = true, - Image = true, - Link = true, - Strong = true, -} - ---- CURIE_TYPE_PREFIX - `typeof` or `resource` attribute CURIE hint -local CURIE_TYPE_PREFIX <const> = "[.=]?" - --- TODO: cover non-ASCII Unicode characters --- @see <https://www.lua.org/manual/5.4/manual.html#6.5> ---- CURIE_PREFIX - CURIE prefix component as set of chars ---- @see <https://www.w3.org/TR/2010/NOTE-curie-20101216/> -local _NAME_START_CHAR <const> = "A-Z_a-z" -local _NAME_CHAR <const> = _NAME_START_CHAR.."-0-9" -local _REF <const> = "[".._NAME_START_CHAR.."][".._NAME_CHAR.."]*" -local CURIE_PREFIX <const> = "[".._NAME_START_CHAR.."_-][".._NAME_CHAR.."]*" - ---- CURIE_LONG - CURIE with prefix and reference as set of chars -local CURIE_LONG <const> = CURIE_PREFIX..":".._REF - ---- CURIE_NO_REF - CURIE with only prefix as set of chars -local CURIE_NO_REF <const> = CURIE_PREFIX..":" - ---- CURIE_LOCAL - CURIE with only name as set of chars -local CURIE_LOCAL <const> = ":".._REF - ---- CURIE_DEFAULT - CURIE without prefix or name as char -local CURIE_DEFAULT <const> = ":" - --- TODO: CURIE_re - CURIE as `LPeg.re` regex object --- TODO: test and replace above curie* patterns --- @see <https://pandoc.org/lua-filters.html#global-variables> ---local CURIE_re <const> = re.compile("("..CURIE_PREFIX..")?:(".._REF..")?") - --- FIXME: define RDF context same as RDFa --- TODO: maybe support overriding context with a JSON-LD URI --- @see <https://www.w3.org/2011/rdfa-context/rdfa-1.1> - ---- TableEmpty - check if Pandoc List contains any elements ---- ---- Use a workaround for Pandoc releases older than 3.5 ---- where ergonomic function pandoc.List:at() is missing. ---- ---- @param list Pandoc List to inspect ---- @return result of inspection as boolean -local function TableEmpty(list) - if PANDOC_IS_OLD then - local list_clone = list:clone() - return not list_clone:remove() - else - return list:at(1) == nil - end -end - ---- Namespaces - process RDF namespace IRI declarations ---- ---- Takes as input a list of Para block elements. ---- For each block matching the pattern for a namespace IRI definition, ---- the declared namespace is extracted. ---- Returns an empty paragraph in case of a match, ---- or nothing (to signal preservation of original content). ---- ---- Example: ---- ---- ```Markdown ---- # Annotated paragraph using a custom namespace ---- ---- My favorite animal is the [Liger]{ov:preferredAnimal}. ---- {=<#me> .:Person} ---- ---- {ov}: http://open.vocab.org/terms/ ---- ``` ---- ---- @param blocks Markdown with ontological annotations as Blocks ---- @returns Markdown without ontological annotations as Blocks ---- @see <https://pandoc.org/lua-filters.html#type-blocks> ---- @see <https://www.w3.org/TR/rdf12-concepts/#vocabularies> -local function Namespaces(blocks) - - -- paragraph with only a braced prefix-only CURIE, colon and one word - local curie_pattern = "^{"..CURIE_TYPE_PREFIX..CURIE_PREFIX.."}:$" - if #blocks.content == 3 - and blocks.content[1].t == "Str" - and blocks.content[2].t == "Space" - and blocks.content[1].text:match(curie_pattern) - then - local el = blocks.content[3] - - -- default namespace, parsed as commonmark - if el.t == "Str" - and el.text == "@default" - then - -- FIXME: add CURIE to metadata - return {} - end - - -- default namespace, parsed as markdown - if el.t == "Cite" - and #el.content == 1 - and el.content[1].text == "@default" - then - -- FIXME: add CURIE to metadata - return {} - end - - -- namespace - -- TODO: relax to match URI syntax without hardcoded protocols - local proto_pattern = "^[Hh][Tt][Tt][Pp][Ss]?:" - if el.t == "Str" - and el.text:match(proto_pattern) - then - -- FIXME: add CURIE and URI to metadata - return {} - end - end -end - ---- Statements - process inline RDF statements ---- ---- Locate and extract ontological annotations ---- within a [Block] element of a Pandoc Abstract Syntax Tree (AST). ---- ---- Markup for ontological annotations is an extension to Markdown ---- using similar syntax as hypermedia annotations, ---- but listing RDFa [CURIEs] in a braced enclosure. ---- ---- ```ASCII-art ---- Simple ontological annotation: ---- "A [map]{foaf:depiction} is not the territory" ---- | ||\~~~~~~~~~~~~/| ---- a bc CURIEa d ---- ---- Nested and mixed-use annotations: ---- ["Ceci n'est pas une [pipe]{foaf:depicts lang=fr}"]{dc:description} ---- | | ||\~~~~~~~~~~/ | ||\~~~~~~~~~~~~/| ---- a a1 |c1 CURIEa d1bc CURIEb d ---- b1 ---- ---- Chained hypermedia and ontological annotations: ---- "A [map](https://osm.org/){foaf:depiction} is not the territory" ---- | || ||\~~~~~~~~~~~~/| ---- a be fc CURIEa d ---- ---- Legend: ---- a-b: bracketed enclosure around content ---- c-d: braced enclosure around ontological or other annotation ---- e-f: parenthesized enclosure around hypermedia annotation ---- ``` ---- ---- Ontological annotations are parsed and reorganised ---- using the following algorithm: ---- ---- 1. locate pairs of bracketed text and braced text ---- either adjacent or separated by parenthesized text, ---- where braced text contains one or more [CURIEs] ---- 2. for each pair, ---- 1. add CURIEs in braced text to metadata ---- 2. add positions of brackets to metadata ---- 3. delete CURIEs ---- 4. delete braced enclosure if now structurally empty ---- 5. delete brackets if now unannotated ---- ---- The implementation is inspired by Pandoc [issue#6038]. ---- ---- @param inlines Markdown with semantic annotations as Inlines ---- @returns Markdown stripped of semantic annotations as Inlines ---- @see [Block]: <https://pandoc.org/lua-filters.html#type-block> ---- @see [CURIEs]: <https://www.w3.org/TR/2010/NOTE-curie-20101216/> ---- @see [issue#6038]: <https://github.com/jgm/pandoc/issues/6038> --- TODO: maybe instead as step #5 add/reuse hypermedia anchor -local function Statements (block) - - -- flags for enclosing stages - -- TODO: support nested bracket enclosure - local encl = Enclosure.NONE - - -- amount of detected statements in this block - local block_has_diverged = false - - -- stacks of qualified and pending unenclosed/enclosed elements - local elems = pandoc.List() - local elems_unenclosed = pandoc.List() - local elems_enclosed = pandoc.List() - - -- strings of pending unenclosed/enclosed chars - local chars_unenclosed = "" - local chars_enclosed = "" - - for _, el in ipairs(block.content) do - local pos = 1 - - -- non-string element, highest state first to support fall-through - if el.t ~= 'Str' then - if encl == Enclosure.BRACED_DONE then - - -- push post-brace string to stack - -- and disqualify brace-only end-of-block enclosure - -- TODO: parse chars_unenclosed as Str instead - if chars_unenclosed:len() > 0 then - elems_unenclosed:insert(pandoc.Str(chars_unenclosed)) - end - chars_unenclosed = "" - chars_enclosed = "" - elems:extend(elems_unenclosed) - elems_unenclosed = pandoc.List() - elems_enclosed = pandoc.List() - - -- drop space after completed enclosure - -- FIXME: only strip after *completed* enclosure - if el.t ~= "Space" then - encl = Enclosure.NONE - end - - -- fall through to parse element as unenclosed - end - - if encl == Enclosure.BRACED then - elems_unenclosed:insert(el) - elems_enclosed:insert(el) - - goto continue - end - - if encl == Enclosure.BRACKETED_DONE then - - -- disqualify bracketing not directly followed by brace - elems:extend(elems_unenclosed) - elems_unenclosed = pandoc.List() - elems_enclosed = pandoc.List() - encl = Enclosure.NONE - - -- fall through to parse element as unenclosed - end - - if encl == Enclosure.BRACKETED then - elems_unenclosed:insert(el) - elems_enclosed:insert(el) - - goto continue - end - - if encl == Enclosure.NONE then - - -- semantic annotation misparsed as Link - -- TODO: limit to solely CURIEs in target - if el.t == "Link" - and el.target:find("^{.*}$") - then - elems:extend(elems_unenclosed) - elems:extend(el.content) - elems_unenclosed = pandoc.List() - elems_enclosed = pandoc.List() - block_has_diverged = true - - else - elems_unenclosed:insert(el) - end - - -- specific elements represent content enclosure - if ElementTypeIsEnclosure[el.t] then - encl = Enclosure.BRACKETED_DONE - end - end - - goto continue - end - - -- unenclosed immediately after enclosure - if encl == Enclosure.BRACED_DONE then - - -- push post-brace string to stack - -- and disqualify brace-only end-of-block enclosure - -- TODO: parse chars_unenclosed as Str - if chars_unenclosed:len() > 0 then - elems_unenclosed:insert(pandoc.Str(chars_unenclosed)) - end - chars_unenclosed = "" - chars_enclosed = "" - elems:extend(elems_unenclosed) - elems_unenclosed = pandoc.List() - elems_enclosed = pandoc.List() - encl = Enclosure.NONE - end - - -- unenclosed - -- TODO: accept backslash except immediately before bracket - if encl == Enclosure.NONE then - local _, nextpos, s = el.text:find("^([^%[{\\]*)") - pos = nextpos and nextpos + 1 or pos + 1 - chars_unenclosed = chars_unenclosed..s - - -- entering bracketed or braced enclosure - local t = el.text:sub(pos, pos) - if t == "[" or t == "{" then - - -- qualify unenclosed elements - elems:extend(elems_unenclosed) - elems_unenclosed = pandoc.List() - elems_enclosed = pandoc.List() - if chars_unenclosed:len() > 0 then - elems:insert(pandoc.Str(chars_unenclosed)) - end - - pos = pos + 1 - chars_unenclosed = chars_unenclosed..t - chars_enclosed = "" - if t == "[" then - encl = Enclosure.BRACKETED - elseif t == "{" then - encl = Enclosure.BRACED - end - end - end - - -- in bracketed enclosure - -- TODO: accept backslash except immediately before bracket/brace - -- TODO: support nested bracket enclosure - if encl == Enclosure.BRACKETED then - local _, nextpos, s = el.text:find("^([^%[%]}\\]*)", pos) - pos = nextpos and nextpos + 1 or pos + 1 - chars_unenclosed = chars_unenclosed..s - chars_enclosed = chars_enclosed..s - - -- exiting bracketed enclosure - if el.text:sub(pos, pos) == "]" then - pos = pos + 1 - chars_unenclosed = chars_unenclosed.."]" - encl = Enclosure.BRACKETED_DONE - end - end - - -- exited bracketed enclosure - if encl == Enclosure.BRACKETED_DONE then - - -- entering braced enclosure - if el.text:sub(pos, pos) == "{" then - pos = pos + 1 - chars_unenclosed = chars_unenclosed.."{" - encl = Enclosure.BRACED - - -- leaving non-annotation enclosure - else - - -- disqualify bracketing not directly followed by brace - elems:extend(elems_unenclosed) - elems_unenclosed = pandoc.List() - elems_enclosed = pandoc.List() - if chars_unenclosed:len() > 0 then - elems:insert(pandoc.Str(chars_unenclosed)) - chars_unenclosed = "" - end - chars_enclosed = "" - encl = Enclosure.NONE - - end - end - - -- in braced enclosure, leaving it - -- TODO: support mixed-use enclosure - if encl == Enclosure.BRACED then - local curie_pattern1 = "^"..CURIE_TYPE_PREFIX..CURIE_LONG.."}" - local curie_pattern2 = "^"..CURIE_TYPE_PREFIX..CURIE_NO_REF.."}" - local curie_pattern3 = "^"..CURIE_TYPE_PREFIX..CURIE_LOCAL.."}" - local curie_pattern4 = "^"..CURIE_TYPE_PREFIX..CURIE_DEFAULT.."}" - local curie_pattern5 = "^"..CURIE_TYPE_PREFIX.."<[^<>]*>}" - local _, nextpos1 = el.text:find(curie_pattern1, pos) - local _, nextpos2 = el.text:find(curie_pattern2, pos) - local _, nextpos3 = el.text:find(curie_pattern3, pos) - local _, nextpos4 = el.text:find(curie_pattern4, pos) - local _, nextpos5 = el.text:find(curie_pattern5, pos) - local nextpos = nextpos1 or nextpos2 or nextpos3 or nextpos4 or nextpos5 - if nextpos then - if chars_enclosed:len() > 0 then - elems_enclosed:insert(pandoc.Str(chars_enclosed)) - end - - -- qualify completed bracketed enclosure - if not TableEmpty(elems_enclosed) then - elems:extend(elems_enclosed) - - -- qualify braced-only enclosure at beginning of block - elseif (TableEmpty(elems_unenclosed) - and (chars_unenclosed:len() == 0 or chars_unenclosed == "{")) - then - elems:extend(elems_enclosed) - - -- postpone braced-only enclosure maybe at end of block - else - chars_unenclosed = chars_unenclosed..el.text:sub(pos, nextpos) - elems_unenclosed:insert(pandoc.Str(chars_unenclosed)) - chars_unenclosed = el.text:sub(nextpos + 1) - chars_enclosed = el.text:sub(nextpos + 1) - encl = Enclosure.BRACED_DONE - - goto continue - end - - elems_enclosed = pandoc.List() - elems_unenclosed = pandoc.List() - chars_enclosed = "" - chars_unenclosed = el.text:sub(nextpos + 1) - encl = Enclosure.BRACED_DONE - - block_has_diverged = true - end - end - - -- push strings to stacks - if chars_enclosed:len() > 0 then - elems_enclosed:insert(pandoc.Str(chars_enclosed)) - end - if chars_unenclosed:len() > 0 then - elems_unenclosed:insert(pandoc.Str(chars_unenclosed)) - end - chars_unenclosed = "" - chars_enclosed = "" - - -- done parsing current Inline element - ::continue:: - end - - -- qualify brace-only enclosure at end of block - if encl == Enclosure.BRACED_DONE - and not TableEmpty(elems_enclosed) - then - elems:extend(elems_enclosed) - block_has_diverged = true - end - - -- return altered stack if it contains complete enclosures - if block_has_diverged then - - -- disqualify incomplete enclosure - elems:extend(elems_unenclosed) - - block.content = elems - return block - end -end - --- First resolve namespace declarations, then statements. --- --- Although this filter is *not* a full RDF parser, --- order matters for the parts we do handle -- --- e.g. namespace resolving is similar to other RDF formats --- with detailed documented process ordering. --- --- @see <https://www.w3.org/TR/turtle/#sec-parsing> -local meta = {} -return { - - -- move aside MetaBlocks to speed up processing content - -- - -- @see <https://stackoverflow.com/a/47356252/18619283> - { Meta = function(m) meta = m; return {} end }, - - {Para = Namespaces}, - - {Block = Statements}, - - -- FIXME: add custom declared namespaces in Meta - -- TODO: maybe add only actively used namespaces - -- (do same as for unused link definitions) - { Meta = function(_) return meta; end }, - --{ Meta = function(_) return NamespacesToMeta(meta); end }, -} |
