From 7f610808af0a83264d9e2dd52620b01bea4d0091 Mon Sep 17 00:00:00 2001 From: Jonas Smedegaard Date: Sat, 24 May 2025 14:26:51 +0200 Subject: rename filter semantic-markdown -> sem-md; consistently capitalize spec Semantic Markdown --- Makefile | 4 +- _conclusion.qmd | 2 +- _extensions/ruc-play/sem-md | 1 + .../ruc-play/semantic-markdown/_extension.yaml | 6 - .../semantic-markdown/semantic-markdown.lua | 602 --------------------- _intro.qmd | 2 +- _markdown.qmd | 2 +- example/example.qmd | 2 +- report.qmd | 4 +- sem-md/_extension.yaml | 6 + sem-md/sem-md.lua | 602 +++++++++++++++++++++ 11 files changed, 617 insertions(+), 616 deletions(-) create mode 120000 _extensions/ruc-play/sem-md delete mode 100644 _extensions/ruc-play/semantic-markdown/_extension.yaml delete mode 100644 _extensions/ruc-play/semantic-markdown/semantic-markdown.lua create mode 100644 sem-md/_extension.yaml create mode 100644 sem-md/sem-md.lua diff --git a/Makefile b/Makefile index 3e4cefa..956af90 100644 --- a/Makefile +++ b/Makefile @@ -4,9 +4,9 @@ PDF_DOCUMENTS = _site/report.pdf include _make/*.mk -DOCUMENT_APPENDIX_REGEX = Pandoc filter semantic-markdown +DOCUMENT_APPENDIX_REGEX = Pandoc filter sem-md -FILTER = _extensions/ruc-play/semantic-markdown/semantic-markdown.lua +FILTER = _extensions/ruc-play/sem-md/sem-md.lua DIFFTESTS = $(patsubst %.md,%,$(wildcard test/*.md)) diff --git a/_conclusion.qmd b/_conclusion.qmd index 688b343..446a421 100644 --- a/_conclusion.qmd +++ b/_conclusion.qmd @@ -34,7 +34,7 @@ and streamlining of automated document layout. ### Implementation as import extension This project has been implementet as a cleanup filter -for a misparsing of semantic Markdown as regular Markdown +for a misparsing of Semantic Markdown as regular Markdown (see @sec-misparsing). 
This approach was chosen based in an assumption on fitting better with existing uses of Pandoc, diff --git a/_extensions/ruc-play/sem-md b/_extensions/ruc-play/sem-md new file mode 120000 index 0000000..65c97a3 --- /dev/null +++ b/_extensions/ruc-play/sem-md @@ -0,0 +1 @@ +/home/jonas/Projects/PLAY/md/sem-md \ No newline at end of file diff --git a/_extensions/ruc-play/semantic-markdown/_extension.yaml b/_extensions/ruc-play/semantic-markdown/_extension.yaml deleted file mode 100644 index 76b9a7a..0000000 --- a/_extensions/ruc-play/semantic-markdown/_extension.yaml +++ /dev/null @@ -1,6 +0,0 @@ -name: semantic-markdown -author: Jonas Smedegaard -version: 0.0.1 -contributes: - filters: - - semantic-markdown.lua diff --git a/_extensions/ruc-play/semantic-markdown/semantic-markdown.lua b/_extensions/ruc-play/semantic-markdown/semantic-markdown.lua deleted file mode 100644 index abdb078..0000000 --- a/_extensions/ruc-play/semantic-markdown/semantic-markdown.lua +++ /dev/null @@ -1,602 +0,0 @@ ---- semantic-markdown - Pandoc filter to process semantic hints ---- ---- SPDX-FileCopyrightText: 2025 Jonas Smedegaard ---- SPDX-License-Identifier: GPL-3.0-or-later ---- ---- ## Examples ---- ---- The following Markdown text includes semantic annotations ---- within braced enclosures: ---- ---- ```markdown ---- # {=<#artwork> .:Image} Semantics ---- ---- Simple ontological annotation: ---- [This][painting] is not a [pipe]. 
---- ---- Nested, mixed-use and custom-namespaced annotations: ---- [[Ceci][painting] n'est pas une [pipe].]{lang=fr bibo:shortDescription} ---- ---- [painting]: {wd:Q1061035} ---- "A painting of a smoking pipe {:depiction}" ---- ---- [pipe]: {wd:Q104526} ---- "A smoking pipe {:depicts}" ---- ---- {@default}: foaf ---- ---- {bibo}: http://purl.org/ontology/bibo/ ---- ---- {wd}: http://www.wikidata.org/entity/ ---- ``` ---- ---- This filter should transform the above text, with the command ---- `pandoc -L semantic-markdown.lua -t commonmark --wrap=preserve`, ---- into the below markdown text with semantic annotations as metadata: ---- ---- ```markdown ---- --- ---- turtle: | ---- @prefix bibo: http://purl.org/ontology/bibo/ ---- @prefix foaf: http://xmlns.com/foaf/0.1/ ---- @prefix wd: https://www.wikidata.org/entity/ ---- ---- <#artwork> a foaf:Image ; ---- foaf:depiction ; ---- foaf:depicts ; ---- bibo:shortDescription "Ceci n'est pas une pipe."@fr . ---- --- ---- # Semantics ---- ---- Simple ontological annotation: ---- [This][painting] is not a [pipe]. ---- ---- Nested, mixed-use and custom-namespaced annotations: ---- [[Ceci][painting] n'est pas une [pipe].]{lang=fr} ---- ---- [painting]: https://www.wikidata.org/entity/Q1061035 ---- "A painting of a smoking pipe" ---- ---- [pipe]: https://www.wikidata.org/entity/Q104526 ---- "A smoking pipe" ---- ``` ---- ---- This filter should also transform the above text, with the command ---- `pandoc -L semantic-markdown.lua -t html --wrap=preserve`, ---- into the below HTML text with embedded RDFa Lite 1.1 anotations, ---- modulo wrapping of long lines: ---- ---- ```html ----
----

Semantics

----

Simple ontological annotation: ---- This ---- is not ---- a pipe.

---- ----

Nested, mixed-use and custom-namespaced annotations: ---- ---- Ceci ---- n'est pas ---- une pipe. ----

----
---- ``` ---- ---- * v0.0.1 ---- * initial release ---- ---- @version 0.0.1 ---- @see ---- @see ---- @see ---- @see - --- TODO: maybe use topdown traversal --- * order of declaring annotations might matter (but should not) --- * might enable simpler functions and/or faster processing --- @see - --- ensure stable character classes independent of system locale --- @see -os.setlocale 'C' - --- flag running with older release of Pandoc --- --- Some Pandoc features, --- notably pandoc.List:at() introduced wit Pandoc 3.5, --- are unavailable in older Pandoc releases still in widespread use --- due to complexities of keeping Haskell dependencies in sync. --- @see -local PANDOC_IS_OLD = PANDOC_VERSION[1] < 3 - or (PANDOC_VERSION[1] == 3 and PANDOC_VERSION[2] < 5) - ---- pseudo-enum table to track parser enclosure state ---- @see -local Enclosure = { - NONE = "0", - BRACKETED = "1", - BRACKETED_DONE = "2", - BRACED = "3", - BRACED_DONE = "4", -} - --- element types representing content enclosure in Markdown -local ElementTypeIsEnclosure = { - Emph = true, - Image = true, - Link = true, - Strong = true, -} - ---- CURIE_TYPE_PREFIX - `typeof` or `resource` attribute CURIE hint -local CURIE_TYPE_PREFIX = "[.=]?" 
- --- TODO: cover non-ASCII Unicode characters --- @see ---- CURIE_PREFIX - CURIE prefix component as set of chars ---- @see -local _NAME_START_CHAR = "A-Z_a-z" -local _NAME_CHAR = _NAME_START_CHAR.."-0-9" -local _REF = "[".._NAME_START_CHAR.."][".._NAME_CHAR.."]*" -local CURIE_PREFIX = "[".._NAME_START_CHAR.."_-][".._NAME_CHAR.."]*" - ---- CURIE_LONG - CURIE with prefix and reference as set of chars -local CURIE_LONG = CURIE_PREFIX..":".._REF - ---- CURIE_NO_REF - CURIE with only prefix as set of chars -local CURIE_NO_REF = CURIE_PREFIX..":" - ---- CURIE_LOCAL - CURIE with only name as set of chars -local CURIE_LOCAL = ":".._REF - ---- CURIE_DEFAULT - CURIE without prefix or name as char -local CURIE_DEFAULT = ":" - --- TODO: CURIE_re - CURIE as `LPeg.re` regex object --- TODO: test and replace above curie* patterns --- @see ---local CURIE_re = re.compile("("..CURIE_PREFIX..")?:(".._REF..")?") - --- FIXME: define RDF context same as RDFa --- TODO: maybe support overriding context with a JSON-LD URI --- @see - ---- TableEmpty - check if Pandoc List contains any elements ---- ---- Use a workaround for Pandoc releases older than 3.5 ---- where ergonomic function pandoc.List:at() is missing. ---- ---- @param list Pandoc List to inspect ---- @return result of inspection as boolean -local function TableEmpty(list) - if PANDOC_IS_OLD then - local list_clone = list:clone() - return not list_clone:remove() - else - return list:at(1) == nil - end -end - ---- Namespaces - process RDF namespace IRI declarations ---- ---- Takes as input a list of Para block elements. ---- For each block matching the pattern for a namespace IRI definition, ---- the declared namespace is extracted. ---- Returns an empty paragraph in case of a match, ---- or nothing (to signal preservation of original content). ---- ---- Example: ---- ---- ```Markdown ---- # Annotated paragraph using a custom namespace ---- ---- My favorite animal is the [Liger]{ov:preferredAnimal}. 
---- {=<#me> .:Person} ---- ---- {ov}: http://open.vocab.org/terms/ ---- ``` ---- ---- @param blocks Markdown with ontological annotations as Blocks ---- @returns Markdown without ontological annotations as Blocks ---- @see ---- @see -local function Namespaces(blocks) - - -- paragraph with only a braced prefix-only CURIE, colon and one word - local curie_pattern = "^{"..CURIE_TYPE_PREFIX..CURIE_PREFIX.."}:$" - if #blocks.content == 3 - and blocks.content[1].t == "Str" - and blocks.content[2].t == "Space" - and blocks.content[1].text:match(curie_pattern) - then - local el = blocks.content[3] - - -- default namespace, parsed as commonmark - if el.t == "Str" - and el.text == "@default" - then - -- FIXME: add CURIE to metadata - return {} - end - - -- default namespace, parsed as markdown - if el.t == "Cite" - and #el.content == 1 - and el.content[1].text == "@default" - then - -- FIXME: add CURIE to metadata - return {} - end - - -- namespace - -- TODO: relax to match URI syntax without hardcoded protocols - local proto_pattern = "^[Hh][Tt][Tt][Pp][Ss]?:" - if el.t == "Str" - and el.text:match(proto_pattern) - then - -- FIXME: add CURIE and URI to metadata - return {} - end - end -end - ---- Statements - process inline RDF statements ---- ---- Locate and extract ontological annotations ---- within a [Block] element of a Pandoc Abstract Syntax Tree (AST). ---- ---- Markup for ontological annotations is an extension to Markdown ---- using similar syntax as hypermedia annotations, ---- but listing RDFa [CURIEs] in a braced enclosure. 
---- ---- ```ASCII-art ---- Simple ontological annotation: ---- "A [map]{foaf:depiction} is not the territory" ---- | ||\~~~~~~~~~~~~/| ---- a bc CURIEa d ---- ---- Nested and mixed-use annotations: ---- ["Ceci n'est pas une [pipe]{foaf:depicts lang=fr}"]{dc:description} ---- | | ||\~~~~~~~~~~/ | ||\~~~~~~~~~~~~/| ---- a a1 |c1 CURIEa d1bc CURIEb d ---- b1 ---- ---- Chained hypermedia and ontological annotations: ---- "A [map](https://osm.org/){foaf:depiction} is not the territory" ---- | || ||\~~~~~~~~~~~~/| ---- a be fc CURIEa d ---- ---- Legend: ---- a-b: bracketed enclosure around content ---- c-d: braced enclosure around ontological or other annotation ---- e-f: parenthesized enclosure around hypermedia annotation ---- ``` ---- ---- Ontological annotations are parsed and reorganised ---- using the following algorithm: ---- ---- 1. locate pairs of bracketed text and braced text ---- either adjacent or separated by parenthesized text, ---- where braced text contains one or more [CURIEs] ---- 2. for each pair, ---- 1. add CURIEs in braced text to metadata ---- 2. add positions of brackets to metadata ---- 3. delete CURIEs ---- 4. delete braced enclosure if now structurally empty ---- 5. delete brackets if now unannotated ---- ---- The implementation is inspired by Pandoc [issue#6038]. 
---- ---- @param inlines Markdown with semantic annotations as Inlines ---- @returns Markdown stripped of semantic annotations as Inlines ---- @see [Block]: ---- @see [CURIEs]: ---- @see [issue#6038]: --- TODO: maybe instead as step #5 add/reuse hypermedia anchor -local function Statements (block) - - -- flags for enclosing stages - -- TODO: support nested bracket enclosure - local encl = Enclosure.NONE - - -- amount of detected statements in this block - local block_has_diverged = false - - -- stacks of qualified and pending unenclosed/enclosed elements - local elems = pandoc.List() - local elems_unenclosed = pandoc.List() - local elems_enclosed = pandoc.List() - - -- strings of pending unenclosed/enclosed chars - local chars_unenclosed = "" - local chars_enclosed = "" - - for _, el in ipairs(block.content) do - local pos = 1 - - -- non-string element, highest state first to support fall-through - if el.t ~= 'Str' then - if encl == Enclosure.BRACED_DONE then - - -- push post-brace string to stack - -- and disqualify brace-only end-of-block enclosure - -- TODO: parse chars_unenclosed as Str instead - if chars_unenclosed:len() > 0 then - elems_unenclosed:insert(pandoc.Str(chars_unenclosed)) - end - chars_unenclosed = "" - chars_enclosed = "" - elems:extend(elems_unenclosed) - elems_unenclosed = pandoc.List() - elems_enclosed = pandoc.List() - - -- drop space after completed enclosure - -- FIXME: only strip after *completed* enclosure - if el.t ~= "Space" then - encl = Enclosure.NONE - end - - -- fall through to parse element as unenclosed - end - - if encl == Enclosure.BRACED then - elems_unenclosed:insert(el) - elems_enclosed:insert(el) - - goto continue - end - - if encl == Enclosure.BRACKETED_DONE then - - -- disqualify bracketing not directly followed by brace - elems:extend(elems_unenclosed) - elems_unenclosed = pandoc.List() - elems_enclosed = pandoc.List() - encl = Enclosure.NONE - - -- fall through to parse element as unenclosed - end - - if encl == 
Enclosure.BRACKETED then - elems_unenclosed:insert(el) - elems_enclosed:insert(el) - - goto continue - end - - if encl == Enclosure.NONE then - - -- semantic annotation misparsed as Link - -- TODO: limit to solely CURIEs in target - if el.t == "Link" - and el.target:find("^{.*}$") - then - elems:extend(elems_unenclosed) - elems:extend(el.content) - elems_unenclosed = pandoc.List() - elems_enclosed = pandoc.List() - block_has_diverged = true - - else - elems_unenclosed:insert(el) - end - - -- specific elements represent content enclosure - if ElementTypeIsEnclosure[el.t] then - encl = Enclosure.BRACKETED_DONE - end - end - - goto continue - end - - -- unenclosed immediately after enclosure - if encl == Enclosure.BRACED_DONE then - - -- push post-brace string to stack - -- and disqualify brace-only end-of-block enclosure - -- TODO: parse chars_unenclosed as Str - if chars_unenclosed:len() > 0 then - elems_unenclosed:insert(pandoc.Str(chars_unenclosed)) - end - chars_unenclosed = "" - chars_enclosed = "" - elems:extend(elems_unenclosed) - elems_unenclosed = pandoc.List() - elems_enclosed = pandoc.List() - encl = Enclosure.NONE - end - - -- unenclosed - -- TODO: accept backslash except immediately before bracket - if encl == Enclosure.NONE then - local _, nextpos, s = el.text:find("^([^%[{\\]*)") - pos = nextpos and nextpos + 1 or pos + 1 - chars_unenclosed = chars_unenclosed..s - - -- entering bracketed or braced enclosure - local t = el.text:sub(pos, pos) - if t == "[" or t == "{" then - - -- qualify unenclosed elements - elems:extend(elems_unenclosed) - elems_unenclosed = pandoc.List() - elems_enclosed = pandoc.List() - if chars_unenclosed:len() > 0 then - elems:insert(pandoc.Str(chars_unenclosed)) - end - - pos = pos + 1 - chars_unenclosed = chars_unenclosed..t - chars_enclosed = "" - if t == "[" then - encl = Enclosure.BRACKETED - elseif t == "{" then - encl = Enclosure.BRACED - end - end - end - - -- in bracketed enclosure - -- TODO: accept backslash except 
immediately before bracket/brace - -- TODO: support nested bracket enclosure - if encl == Enclosure.BRACKETED then - local _, nextpos, s = el.text:find("^([^%[%]}\\]*)", pos) - pos = nextpos and nextpos + 1 or pos + 1 - chars_unenclosed = chars_unenclosed..s - chars_enclosed = chars_enclosed..s - - -- exiting bracketed enclosure - if el.text:sub(pos, pos) == "]" then - pos = pos + 1 - chars_unenclosed = chars_unenclosed.."]" - encl = Enclosure.BRACKETED_DONE - end - end - - -- exited bracketed enclosure - if encl == Enclosure.BRACKETED_DONE then - - -- entering braced enclosure - if el.text:sub(pos, pos) == "{" then - pos = pos + 1 - chars_unenclosed = chars_unenclosed.."{" - encl = Enclosure.BRACED - - -- leaving non-annotation enclosure - else - - -- disqualify bracketing not directly followed by brace - elems:extend(elems_unenclosed) - elems_unenclosed = pandoc.List() - elems_enclosed = pandoc.List() - if chars_unenclosed:len() > 0 then - elems:insert(pandoc.Str(chars_unenclosed)) - chars_unenclosed = "" - end - chars_enclosed = "" - encl = Enclosure.NONE - - end - end - - -- in braced enclosure, leaving it - -- TODO: support mixed-use enclosure - if encl == Enclosure.BRACED then - local curie_pattern1 = "^"..CURIE_TYPE_PREFIX..CURIE_LONG.."}" - local curie_pattern2 = "^"..CURIE_TYPE_PREFIX..CURIE_NO_REF.."}" - local curie_pattern3 = "^"..CURIE_TYPE_PREFIX..CURIE_LOCAL.."}" - local curie_pattern4 = "^"..CURIE_TYPE_PREFIX..CURIE_DEFAULT.."}" - local curie_pattern5 = "^"..CURIE_TYPE_PREFIX.."<[^<>]*>}" - local _, nextpos1 = el.text:find(curie_pattern1, pos) - local _, nextpos2 = el.text:find(curie_pattern2, pos) - local _, nextpos3 = el.text:find(curie_pattern3, pos) - local _, nextpos4 = el.text:find(curie_pattern4, pos) - local _, nextpos5 = el.text:find(curie_pattern5, pos) - local nextpos = nextpos1 or nextpos2 or nextpos3 or nextpos4 or nextpos5 - if nextpos then - if chars_enclosed:len() > 0 then - elems_enclosed:insert(pandoc.Str(chars_enclosed)) - end - - 
-- qualify completed bracketed enclosure - if not TableEmpty(elems_enclosed) then - elems:extend(elems_enclosed) - - -- qualify braced-only enclosure at beginning of block - elseif (TableEmpty(elems_unenclosed) - and (chars_unenclosed:len() == 0 or chars_unenclosed == "{")) - then - elems:extend(elems_enclosed) - - -- postpone braced-only enclosure maybe at end of block - else - chars_unenclosed = chars_unenclosed..el.text:sub(pos, nextpos) - elems_unenclosed:insert(pandoc.Str(chars_unenclosed)) - chars_unenclosed = el.text:sub(nextpos + 1) - chars_enclosed = el.text:sub(nextpos + 1) - encl = Enclosure.BRACED_DONE - - goto continue - end - - elems_enclosed = pandoc.List() - elems_unenclosed = pandoc.List() - chars_enclosed = "" - chars_unenclosed = el.text:sub(nextpos + 1) - encl = Enclosure.BRACED_DONE - - block_has_diverged = true - end - end - - -- push strings to stacks - if chars_enclosed:len() > 0 then - elems_enclosed:insert(pandoc.Str(chars_enclosed)) - end - if chars_unenclosed:len() > 0 then - elems_unenclosed:insert(pandoc.Str(chars_unenclosed)) - end - chars_unenclosed = "" - chars_enclosed = "" - - -- done parsing current Inline element - ::continue:: - end - - -- qualify brace-only enclosure at end of block - if encl == Enclosure.BRACED_DONE - and not TableEmpty(elems_enclosed) - then - elems:extend(elems_enclosed) - block_has_diverged = true - end - - -- return altered stack if it contains complete enclosures - if block_has_diverged then - - -- disqualify incomplete enclosure - elems:extend(elems_unenclosed) - - block.content = elems - return block - end -end - --- First resolve namespace declarations, then statements. --- --- Although this filter is *not* a full RDF parser, --- order matters for the parts we do handle -- --- e.g. namespace resolving is similar to other RDF formats --- with detailed documented process ordering. 
--- --- @see -local meta = {} -return { - - -- move aside MetaBlocks to speed up processing content - -- - -- @see - { Meta = function(m) meta = m; return {} end }, - - {Para = Namespaces}, - - {Block = Statements}, - - -- FIXME: add custom declared namespaces in Meta - -- TODO: maybe add only actively used namespaces - -- (do same as for unused link definitions) - { Meta = function(_) return meta; end }, - --{ Meta = function(_) return NamespacesToMeta(meta); end }, -} diff --git a/_intro.qmd b/_intro.qmd index f63f79c..5b4a5fb 100644 --- a/_intro.qmd +++ b/_intro.qmd @@ -36,7 +36,7 @@ or "this uses the derogatory meaning of the term". A few years ago, a call was made on a blog to extend Markdown to cover semantic text annotations [@Francart2020]. -This led to a draft specification called "Semantic Markdown" +This led to the draft specification Semantic Markdown [@Smedegaard2022]; no actual implementation was made at the time, however. diff --git a/_markdown.qmd b/_markdown.qmd index f137b17..a82f49f 100644 --- a/_markdown.qmd +++ b/_markdown.qmd @@ -23,7 +23,7 @@ the renderers for `README.md` files at Github and Gitlab -- and because its syntax is thoroughly documented, separately from implementations of parsers of that dialect. -The second analysis covers the Markdown extension semantic-markdown, +The second analysis covers the Markdown extension Semantic Markdown, chosen because it covers semantic text annotation and is the only Markdown extension description that covers it, as far as the author of this paper is aware. 
diff --git a/example/example.qmd b/example/example.qmd index 57e03b5..7fb3421 100644 --- a/example/example.qmd +++ b/example/example.qmd @@ -4,7 +4,7 @@ format: html: minimal: true filters: - - semantic-markdown + - sem-md --- My name is diff --git a/report.qmd b/report.qmd index 409298b..d973730 100644 --- a/report.qmd +++ b/report.qmd @@ -114,9 +114,9 @@ are editorial comments not intended for inclusion in the final version.* \appendix -# Pandoc filter `semantic-markdown` {.appendix} +# Pandoc filter `sem-md` {.appendix} -```{.lua include="_extensions/ruc-play/semantic-markdown/semantic-markdown.lua" code-line-numbers="true"} +```{.lua include="sem-md/sem-md.lua" code-line-numbers="true"} ``` # Markdown syntax as PEG {.appendix #sec-def-peg} diff --git a/sem-md/_extension.yaml b/sem-md/_extension.yaml new file mode 100644 index 0000000..22bfe98 --- /dev/null +++ b/sem-md/_extension.yaml @@ -0,0 +1,6 @@ +name: sem-md +author: Jonas Smedegaard +version: 0.0.1 +contributes: + filters: + - sem-md.lua diff --git a/sem-md/sem-md.lua b/sem-md/sem-md.lua new file mode 100644 index 0000000..4c7ad01 --- /dev/null +++ b/sem-md/sem-md.lua @@ -0,0 +1,602 @@ +--- sem-md - Pandoc filter to process semantic annotations in Markdown +--- +--- SPDX-FileCopyrightText: 2025 Jonas Smedegaard +--- SPDX-License-Identifier: GPL-3.0-or-later +--- +--- ## Examples +--- +--- The following Markdown text includes semantic annotations +--- within braced enclosures: +--- +--- ```markdown +--- # {=<#artwork> .:Image} Semantics +--- +--- Simple ontological annotation: +--- [This][painting] is not a [pipe]. 
+--- +--- Nested, mixed-use and custom-namespaced annotations: +--- [[Ceci][painting] n'est pas une [pipe].]{lang=fr bibo:shortDescription} +--- +--- [painting]: {wd:Q1061035} +--- "A painting of a smoking pipe {:depiction}" +--- +--- [pipe]: {wd:Q104526} +--- "A smoking pipe {:depicts}" +--- +--- {@default}: foaf +--- +--- {bibo}: http://purl.org/ontology/bibo/ +--- +--- {wd}: http://www.wikidata.org/entity/ +--- ``` +--- +--- This filter should transform the above text, with the command +--- `pandoc -L sem-md.lua -t commonmark --wrap=preserve`, +--- into the below markdown text with semantic annotations as metadata: +--- +--- ```markdown +--- --- +--- turtle: | +--- @prefix bibo: http://purl.org/ontology/bibo/ +--- @prefix foaf: http://xmlns.com/foaf/0.1/ +--- @prefix wd: https://www.wikidata.org/entity/ +--- +--- <#artwork> a foaf:Image ; +--- foaf:depiction ; +--- foaf:depicts ; +--- bibo:shortDescription "Ceci n'est pas une pipe."@fr . +--- --- +--- # Semantics +--- +--- Simple ontological annotation: +--- [This][painting] is not a [pipe]. +--- +--- Nested, mixed-use and custom-namespaced annotations: +--- [[Ceci][painting] n'est pas une [pipe].]{lang=fr} +--- +--- [painting]: https://www.wikidata.org/entity/Q1061035 +--- "A painting of a smoking pipe" +--- +--- [pipe]: https://www.wikidata.org/entity/Q104526 +--- "A smoking pipe" +--- ``` +--- +--- This filter should also transform the above text, with the command +--- `pandoc -L sem-md.lua -t html --wrap=preserve`, +--- into the below HTML text with embedded RDFa Lite 1.1 anotations, +--- modulo wrapping of long lines: +--- +--- ```html +---
+---

Semantics

+---

Simple ontological annotation: +--- This +--- is not +--- a pipe.

+--- +---

Nested, mixed-use and custom-namespaced annotations: +--- +--- Ceci +--- n'est pas +--- une pipe. +---

+---
+--- ``` +--- +--- * v0.0.1 +--- * initial release +--- +--- @version 0.0.1 +--- @see +--- @see +--- @see +--- @see + +-- TODO: maybe use topdown traversal +-- * order of declaring annotations might matter (but should not) +-- * might enable simpler functions and/or faster processing +-- @see + +-- ensure stable character classes independent of system locale +-- @see +os.setlocale 'C' + +-- flag running with older release of Pandoc +-- +-- Some Pandoc features, +-- notably pandoc.List:at() introduced wit Pandoc 3.5, +-- are unavailable in older Pandoc releases still in widespread use +-- due to complexities of keeping Haskell dependencies in sync. +-- @see +local PANDOC_IS_OLD = PANDOC_VERSION[1] < 3 + or (PANDOC_VERSION[1] == 3 and PANDOC_VERSION[2] < 5) + +--- pseudo-enum table to track parser enclosure state +--- @see +local Enclosure = { + NONE = "0", + BRACKETED = "1", + BRACKETED_DONE = "2", + BRACED = "3", + BRACED_DONE = "4", +} + +-- element types representing content enclosure in Markdown +local ElementTypeIsEnclosure = { + Emph = true, + Image = true, + Link = true, + Strong = true, +} + +--- CURIE_TYPE_PREFIX - `typeof` or `resource` attribute CURIE hint +local CURIE_TYPE_PREFIX = "[.=]?" 
+ +-- TODO: cover non-ASCII Unicode characters +-- @see +--- CURIE_PREFIX - CURIE prefix component as set of chars +--- @see +local _NAME_START_CHAR = "A-Z_a-z" +local _NAME_CHAR = _NAME_START_CHAR.."-0-9" +local _REF = "[".._NAME_START_CHAR.."][".._NAME_CHAR.."]*" +local CURIE_PREFIX = "[".._NAME_START_CHAR.."_-][".._NAME_CHAR.."]*" + +--- CURIE_LONG - CURIE with prefix and reference as set of chars +local CURIE_LONG = CURIE_PREFIX..":".._REF + +--- CURIE_NO_REF - CURIE with only prefix as set of chars +local CURIE_NO_REF = CURIE_PREFIX..":" + +--- CURIE_LOCAL - CURIE with only name as set of chars +local CURIE_LOCAL = ":".._REF + +--- CURIE_DEFAULT - CURIE without prefix or name as char +local CURIE_DEFAULT = ":" + +-- TODO: CURIE_re - CURIE as `LPeg.re` regex object +-- TODO: test and replace above curie* patterns +-- @see +--local CURIE_re = re.compile("("..CURIE_PREFIX..")?:(".._REF..")?") + +-- FIXME: define RDF context same as RDFa +-- TODO: maybe support overriding context with a JSON-LD URI +-- @see + +--- TableEmpty - check if Pandoc List contains any elements +--- +--- Use a workaround for Pandoc releases older than 3.5 +--- where ergonomic function pandoc.List:at() is missing. +--- +--- @param list Pandoc List to inspect +--- @return result of inspection as boolean +local function TableEmpty(list) + if PANDOC_IS_OLD then + local list_clone = list:clone() + return not list_clone:remove() + else + return list:at(1) == nil + end +end + +--- Namespaces - process RDF namespace IRI declarations +--- +--- Takes as input a list of Para block elements. +--- For each block matching the pattern for a namespace IRI definition, +--- the declared namespace is extracted. +--- Returns an empty paragraph in case of a match, +--- or nothing (to signal preservation of original content). +--- +--- Example: +--- +--- ```Markdown +--- # Annotated paragraph using a custom namespace +--- +--- My favorite animal is the [Liger]{ov:preferredAnimal}. 
+--- {=<#me> .:Person} +--- +--- {ov}: http://open.vocab.org/terms/ +--- ``` +--- +--- @param blocks Markdown with ontological annotations as Blocks +--- @returns Markdown without ontological annotations as Blocks +--- @see +--- @see +local function Namespaces(blocks) + + -- paragraph with only a braced prefix-only CURIE, colon and one word + local curie_pattern = "^{"..CURIE_TYPE_PREFIX..CURIE_PREFIX.."}:$" + if #blocks.content == 3 + and blocks.content[1].t == "Str" + and blocks.content[2].t == "Space" + and blocks.content[1].text:match(curie_pattern) + then + local el = blocks.content[3] + + -- default namespace, parsed as commonmark + if el.t == "Str" + and el.text == "@default" + then + -- FIXME: add CURIE to metadata + return {} + end + + -- default namespace, parsed as markdown + if el.t == "Cite" + and #el.content == 1 + and el.content[1].text == "@default" + then + -- FIXME: add CURIE to metadata + return {} + end + + -- namespace + -- TODO: relax to match URI syntax without hardcoded protocols + local proto_pattern = "^[Hh][Tt][Tt][Pp][Ss]?:" + if el.t == "Str" + and el.text:match(proto_pattern) + then + -- FIXME: add CURIE and URI to metadata + return {} + end + end +end + +--- Statements - process inline RDF statements +--- +--- Locate and extract ontological annotations +--- within a [Block] element of a Pandoc Abstract Syntax Tree (AST). +--- +--- Markup for ontological annotations is an extension to Markdown +--- using similar syntax as hypermedia annotations, +--- but listing RDFa [CURIEs] in a braced enclosure. 
+--- +--- ```ASCII-art +--- Simple ontological annotation: +--- "A [map]{foaf:depiction} is not the territory" +--- | ||\~~~~~~~~~~~~/| +--- a bc CURIEa d +--- +--- Nested and mixed-use annotations: +--- ["Ceci n'est pas une [pipe]{foaf:depicts lang=fr}"]{dc:description} +--- | | ||\~~~~~~~~~~/ | ||\~~~~~~~~~~~~/| +--- a a1 |c1 CURIEa d1bc CURIEb d +--- b1 +--- +--- Chained hypermedia and ontological annotations: +--- "A [map](https://osm.org/){foaf:depiction} is not the territory" +--- | || ||\~~~~~~~~~~~~/| +--- a be fc CURIEa d +--- +--- Legend: +--- a-b: bracketed enclosure around content +--- c-d: braced enclosure around ontological or other annotation +--- e-f: parenthesized enclosure around hypermedia annotation +--- ``` +--- +--- Ontological annotations are parsed and reorganised +--- using the following algorithm: +--- +--- 1. locate pairs of bracketed text and braced text +--- either adjacent or separated by parenthesized text, +--- where braced text contains one or more [CURIEs] +--- 2. for each pair, +--- 1. add CURIEs in braced text to metadata +--- 2. add positions of brackets to metadata +--- 3. delete CURIEs +--- 4. delete braced enclosure if now structurally empty +--- 5. delete brackets if now unannotated +--- +--- The implementation is inspired by Pandoc [issue#6038]. 
---
--- @param block Block element whose `content` holds Inlines
---   with semantic annotations
--- @return the same block with semantic annotations stripped
---   from its content, or nil when nothing was changed
---   (including when the block has no `content` at all)
--- @see [Block]: 
--- @see [CURIEs]: 
--- @see [issue#6038]: 
-- TODO: maybe instead as step #5 add/reuse hypermedia anchor
local function Statements (block)

  -- This function is registered as the generic `Block` handler,
  -- so it is also called for blocks without a `content` property;
  -- iterating over nil would raise an error, so leave those untouched.
  if block.content == nil then
    return nil
  end

  -- flags for enclosing stages
  -- TODO: support nested bracket enclosure
  local encl = Enclosure.NONE

  -- whether any statement was detected (and stripped) in this block
  local block_has_diverged = false

  -- stacks of qualified and pending unenclosed/enclosed elements
  local elems = pandoc.List()
  local elems_unenclosed = pandoc.List()
  local elems_enclosed = pandoc.List()

  -- strings of pending unenclosed/enclosed chars
  local chars_unenclosed = ""
  local chars_enclosed = ""

  for _, el in ipairs(block.content) do
    -- read position within el.text (only used for Str elements)
    local pos = 1

    -- non-string element, highest state first to support fall-through
    if el.t ~= 'Str' then
      if encl == Enclosure.BRACED_DONE then

        -- push post-brace string to stack
        -- and disqualify brace-only end-of-block enclosure
        -- TODO: parse chars_unenclosed as Str instead
        if chars_unenclosed:len() > 0 then
          elems_unenclosed:insert(pandoc.Str(chars_unenclosed))
        end
        chars_unenclosed = ""
        chars_enclosed = ""
        elems:extend(elems_unenclosed)
        elems_unenclosed = pandoc.List()
        elems_enclosed = pandoc.List()

        -- drop space after completed enclosure
        -- (a Space keeps the state, matches no branch below,
        -- and is therefore not pushed to any stack)
        -- FIXME: only strip after *completed* enclosure
        if el.t ~= "Space" then
          encl = Enclosure.NONE
        end

        -- fall through to parse element as unenclosed
      end

      if encl == Enclosure.BRACED then
        elems_unenclosed:insert(el)
        elems_enclosed:insert(el)

        goto continue
      end

      if encl == Enclosure.BRACKETED_DONE then

        -- disqualify bracketing not directly followed by brace
        elems:extend(elems_unenclosed)
        elems_unenclosed = pandoc.List()
        elems_enclosed = pandoc.List()
        encl = Enclosure.NONE

        -- fall through to parse element as unenclosed
      end

      if encl == Enclosure.BRACKETED then
        elems_unenclosed:insert(el)
        elems_enclosed:insert(el)

        goto continue
      end

      if encl == Enclosure.NONE then

        -- semantic annotation misparsed as Link
        -- TODO: limit to solely CURIEs in target
        if el.t == "Link"
          and el.target:find("^{.*}$")
        then
          -- keep the link text, drop the link (its target was the annotation)
          elems:extend(elems_unenclosed)
          elems:extend(el.content)
          elems_unenclosed = pandoc.List()
          elems_enclosed = pandoc.List()
          block_has_diverged = true

        else
          elems_unenclosed:insert(el)
        end

        -- specific elements represent content enclosure
        if ElementTypeIsEnclosure[el.t] then
          encl = Enclosure.BRACKETED_DONE
        end
      end

      goto continue
    end

    -- unenclosed immediately after enclosure
    if encl == Enclosure.BRACED_DONE then

      -- push post-brace string to stack
      -- and disqualify brace-only end-of-block enclosure
      -- TODO: parse chars_unenclosed as Str
      if chars_unenclosed:len() > 0 then
        elems_unenclosed:insert(pandoc.Str(chars_unenclosed))
      end
      chars_unenclosed = ""
      chars_enclosed = ""
      elems:extend(elems_unenclosed)
      elems_unenclosed = pandoc.List()
      elems_enclosed = pandoc.List()
      encl = Enclosure.NONE
    end

    -- unenclosed
    -- TODO: accept backslash except immediately before bracket
    if encl == Enclosure.NONE then
      -- consume leading chars up to the first "[", "{" or backslash
      local _, nextpos, s = el.text:find("^([^%[{\\]*)")
      pos = nextpos and nextpos + 1 or pos + 1
      chars_unenclosed = chars_unenclosed..s

      -- entering bracketed or braced enclosure
      local t = el.text:sub(pos, pos)
      if t == "[" or t == "{" then

        -- qualify unenclosed elements
        elems:extend(elems_unenclosed)
        elems_unenclosed = pandoc.List()
        elems_enclosed = pandoc.List()
        if chars_unenclosed:len() > 0 then
          elems:insert(pandoc.Str(chars_unenclosed))
        end

        pos = pos + 1
        chars_unenclosed = chars_unenclosed..t
        chars_enclosed = ""
        if t == "[" then
          encl = Enclosure.BRACKETED
        elseif t == "{" then
          encl = Enclosure.BRACED
        end
      end
    end

    -- in bracketed enclosure
    -- TODO: accept backslash except immediately before bracket/brace
    -- TODO: support nested bracket enclosure
    if encl == Enclosure.BRACKETED then
      -- consume chars up to the next bracket, closing brace or backslash
      local _, nextpos, s = el.text:find("^([^%[%]}\\]*)", pos)
      pos = nextpos and nextpos + 1 or pos + 1
      chars_unenclosed = chars_unenclosed..s
      chars_enclosed = chars_enclosed..s

      -- exiting bracketed enclosure
      if el.text:sub(pos, pos) == "]" then
        pos = pos + 1
        chars_unenclosed = chars_unenclosed.."]"
        encl = Enclosure.BRACKETED_DONE
      end
    end

    -- exited bracketed enclosure
    if encl == Enclosure.BRACKETED_DONE then

      -- entering braced enclosure
      if el.text:sub(pos, pos) == "{" then
        pos = pos + 1
        chars_unenclosed = chars_unenclosed.."{"
        encl = Enclosure.BRACED

      -- leaving non-annotation enclosure
      else

        -- disqualify bracketing not directly followed by brace
        elems:extend(elems_unenclosed)
        elems_unenclosed = pandoc.List()
        elems_enclosed = pandoc.List()
        if chars_unenclosed:len() > 0 then
          elems:insert(pandoc.Str(chars_unenclosed))
          chars_unenclosed = ""
        end
        chars_enclosed = ""
        encl = Enclosure.NONE

      end
    end

    -- in braced enclosure, leaving it
    -- TODO: support mixed-use enclosure
    if encl == Enclosure.BRACED then
      -- each pattern matches one CURIE form directly followed by "}",
      -- anchored at the current read position
      local curie_pattern1 = "^"..CURIE_TYPE_PREFIX..CURIE_LONG.."}"
      local curie_pattern2 = "^"..CURIE_TYPE_PREFIX..CURIE_NO_REF.."}"
      local curie_pattern3 = "^"..CURIE_TYPE_PREFIX..CURIE_LOCAL.."}"
      local curie_pattern4 = "^"..CURIE_TYPE_PREFIX..CURIE_DEFAULT.."}"
      local curie_pattern5 = "^"..CURIE_TYPE_PREFIX.."<[^<>]*>}"
      local _, nextpos1 = el.text:find(curie_pattern1, pos)
      local _, nextpos2 = el.text:find(curie_pattern2, pos)
      local _, nextpos3 = el.text:find(curie_pattern3, pos)
      local _, nextpos4 = el.text:find(curie_pattern4, pos)
      local _, nextpos5 = el.text:find(curie_pattern5, pos)

      -- end position of the first pattern that matched, if any
      local nextpos = nextpos1 or nextpos2 or nextpos3 or nextpos4 or nextpos5
      if nextpos then
        if chars_enclosed:len() > 0 then
          elems_enclosed:insert(pandoc.Str(chars_enclosed))
        end

        -- qualify completed bracketed enclosure
        if not TableEmpty(elems_enclosed) then
          elems:extend(elems_enclosed)

        -- qualify braced-only enclosure at beginning of block
        elseif (TableEmpty(elems_unenclosed)
          and (chars_unenclosed:len() == 0 or chars_unenclosed == "{"))
        then
          elems:extend(elems_enclosed)

        -- postpone braced-only enclosure maybe at end of block
        else
          chars_unenclosed = chars_unenclosed..el.text:sub(pos, nextpos)
          elems_unenclosed:insert(pandoc.Str(chars_unenclosed))
          chars_unenclosed = el.text:sub(nextpos + 1)
          chars_enclosed = el.text:sub(nextpos + 1)
          encl = Enclosure.BRACED_DONE

          goto continue
        end

        elems_enclosed = pandoc.List()
        elems_unenclosed = pandoc.List()
        chars_enclosed = ""
        chars_unenclosed = el.text:sub(nextpos + 1)
        encl = Enclosure.BRACED_DONE

        block_has_diverged = true
      end
    end

    -- push strings to stacks
    if chars_enclosed:len() > 0 then
      elems_enclosed:insert(pandoc.Str(chars_enclosed))
    end
    if chars_unenclosed:len() > 0 then
      elems_unenclosed:insert(pandoc.Str(chars_unenclosed))
    end
    chars_unenclosed = ""
    chars_enclosed = ""

    -- done parsing current Inline element
    ::continue::
  end

  -- qualify brace-only enclosure at end of block
  if encl == Enclosure.BRACED_DONE
    and not TableEmpty(elems_enclosed)
  then
    elems:extend(elems_enclosed)
    block_has_diverged = true
  end

  -- return altered stack if it contains complete enclosures
  -- (otherwise fall off the end and return nothing)
  if block_has_diverged then

    -- disqualify incomplete enclosure
    elems:extend(elems_unenclosed)

    block.content = elems
    return block
  end
end

-- First resolve namespace declarations, then statements.
--
-- Although this filter is *not* a full RDF parser,
-- order matters for the parts we do handle --
-- e.g. namespace resolving is similar to other RDF formats
-- with detailed documented process ordering.
--
-- @see 

-- holding slot for the document metadata while content filters run
local meta = {}

-- move aside MetaBlocks to speed up processing content:
-- remember the metadata and return an empty table in its place
--
-- @see 
local function stash_meta(m)
  meta = m
  return {}
end

-- put back the metadata that was moved aside by stash_meta
--
-- FIXME: add custom declared namespaces in Meta
-- TODO: maybe add only actively used namespaces
-- (do same as for unused link definitions)
local function restore_meta(_)
  return meta
end

-- filters run in list order: stash metadata, resolve namespaces,
-- process statements, then restore metadata
return {
  { Meta = stash_meta },

  { Para = Namespaces },

  { Block = Statements },

  { Meta = restore_meta },
  --{ Meta = function(_) return NamespacesToMeta(meta); end },
}
-- cgit v1.2.3