aboutsummaryrefslogtreecommitdiff
path: root/_extensions
diff options
context:
space:
mode:
Diffstat (limited to '_extensions')
l--------- _extensions/ruc-play/sem-md 1
-rw-r--r-- _extensions/ruc-play/semantic-markdown/_extension.yaml 6
-rw-r--r-- _extensions/ruc-play/semantic-markdown/semantic-markdown.lua 602
3 files changed, 1 insertions, 608 deletions
diff --git a/_extensions/ruc-play/sem-md b/_extensions/ruc-play/sem-md
new file mode 120000
index 0000000..65c97a3
--- /dev/null
+++ b/_extensions/ruc-play/sem-md
@@ -0,0 +1 @@
+/home/jonas/Projects/PLAY/md/sem-md
\ No newline at end of file
diff --git a/_extensions/ruc-play/semantic-markdown/_extension.yaml b/_extensions/ruc-play/semantic-markdown/_extension.yaml
deleted file mode 100644
index 76b9a7a..0000000
--- a/_extensions/ruc-play/semantic-markdown/_extension.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
-name: semantic-markdown
-author: Jonas Smedegaard
-version: 0.0.1
-contributes:
- filters:
- - semantic-markdown.lua
diff --git a/_extensions/ruc-play/semantic-markdown/semantic-markdown.lua b/_extensions/ruc-play/semantic-markdown/semantic-markdown.lua
deleted file mode 100644
index abdb078..0000000
--- a/_extensions/ruc-play/semantic-markdown/semantic-markdown.lua
+++ /dev/null
@@ -1,602 +0,0 @@
---- semantic-markdown - Pandoc filter to process semantic hints
----
---- SPDX-FileCopyrightText: 2025 Jonas Smedegaard <dr@jones.dk>
---- SPDX-License-Identifier: GPL-3.0-or-later
----
---- ## Examples
----
---- The following Markdown text includes semantic annotations
---- within braced enclosures:
----
---- ```markdown
---- # {=<#artwork> .:Image} Semantics
----
---- Simple ontological annotation:
---- [This][painting] is not a [pipe].
----
---- Nested, mixed-use and custom-namespaced annotations:
---- [[Ceci][painting] n'est pas une [pipe].]{lang=fr bibo:shortDescription}
----
---- [painting]: {wd:Q1061035}
---- "A painting of a smoking pipe {:depiction}"
----
---- [pipe]: {wd:Q104526}
---- "A smoking pipe {:depicts}"
----
---- {@default}: foaf
----
---- {bibo}: http://purl.org/ontology/bibo/
----
---- {wd}: http://www.wikidata.org/entity/
---- ```
----
---- This filter should transform the above text, with the command
---- `pandoc -L semantic-markdown.lua -t commonmark --wrap=preserve`,
---- into the below markdown text with semantic annotations as metadata:
----
---- ```markdown
---- ---
---- turtle: |
---- @prefix bibo: http://purl.org/ontology/bibo/
---- @prefix foaf: http://xmlns.com/foaf/0.1/
---- @prefix wd: https://www.wikidata.org/entity/
----
---- <#artwork> a foaf:Image ;
---- foaf:depiction <https://www.wikidata.org/entity/Q1061035> ;
---- foaf:depicts <https://www.wikidata.org/entity/Q104526> ;
---- bibo:shortDescription "Ceci n'est pas une pipe."@fr .
---- ---
---- # Semantics
----
---- Simple ontological annotation:
---- [This][painting] is not a [pipe].
----
---- Nested, mixed-use and custom-namespaced annotations:
---- [[Ceci][painting] n'est pas une [pipe].]{lang=fr}
----
---- [painting]: https://www.wikidata.org/entity/Q1061035
---- "A painting of a smoking pipe"
----
---- [pipe]: https://www.wikidata.org/entity/Q104526
---- "A smoking pipe"
---- ```
----
---- This filter should also transform the above text, with the command
---- `pandoc -L semantic-markdown.lua -t html --wrap=preserve`,
---- into the below HTML text with embedded RDFa Lite 1.1 annotations,
---- modulo wrapping of long lines:
----
---- ```html
---- <div vocab="http://xmlns.com/foaf/0.1/"
---- prefix="bibo: http://purl.org/ontology/bibo/"
---- resource="#artwork" typeof="Image">
---- <h1>Semantics</h1>
---- <p>Simple ontological annotation:
---- <a property="depiction"
---- href="https://www.wikidata.org/entity/Q1061035"
---- title="A painting of a smoking pipe">This</a>
---- is not
---- a <a property="depicts"
---- href="https://www.wikidata.org/entity/Q104526"
---- title="A smoking pipe">pipe</a>.</p>
----
---- <p>Nested, mixed-use and custom-namespaced annotations:
---- <span lang="fr" property="bibo:shortDescription">
---- <a property="depiction"
---- href="https://www.wikidata.org/entity/Q1061035"
---- title="A painting of a smoking pipe">Ceci</a>
---- n'est pas
---- une <a property="depicts"
---- href="https://www.wikidata.org/entity/Q104526"
---- title="A smoking pipe">pipe</a>.
---- </span></p>
---- </div>
---- ```
----
---- * v0.0.1
---- * initial release
----
---- @version 0.0.1
---- @see <https://source.jones.dk/semantic-markdown/about/>
---- @see <https://moodle.ruc.dk/course/view.php?id=23505>
---- @see <https://www.w3.org/TR/rdfa-primer/#using-rdfa>
---- @see <https://www.ctrl.blog/entry/rdfa-link-attributes.html>
-
--- TODO: maybe use topdown traversal
--- * order of declaring annotations might matter (but should not)
--- * might enable simpler functions and/or faster processing
--- @see <https://pandoc.org/lua-filters.html#topdown-traversal>
-
--- ensure stable character classes independent of system locale,
--- so that Lua patterns used below behave the same everywhere
--- @see <https://pandoc.org/lua-filters.html#common-pitfalls>
-os.setlocale 'C'
-
--- flag running with older release of Pandoc
---
--- True when the running Pandoc is older than release 3.5.
---
--- Some Pandoc features,
--- notably pandoc.List:at() introduced with Pandoc 3.5,
--- are unavailable in older Pandoc releases still in widespread use
--- due to complexities of keeping Haskell dependencies in sync.
--- @see <https://bugs.debian.org/1098377>
-local PANDOC_IS_OLD <const> = PANDOC_VERSION[1] < 3
- or (PANDOC_VERSION[1] == 3 and PANDOC_VERSION[2] < 5)
-
---- pseudo-enum table to track parser enclosure state
----
---- The values are opaque tokens:
---- the parser in Statements only assigns them and compares them for equality.
----
---- * NONE: outside any enclosure
---- * BRACKETED: inside a bracketed `[...]` enclosure
---- * BRACKETED_DONE: right after a closing `]`,
----   or right after an element listed in ElementTypeIsEnclosure
---- * BRACED: inside a braced `{...}` enclosure
---- * BRACED_DONE: right after a braced enclosure recognized as annotation
---- @see <https://stackoverflow.com/a/70529481/18619283>
-local Enclosure = {
- NONE = "0",
- BRACKETED = "1",
- BRACKETED_DONE = "2",
- BRACED = "3",
- BRACED_DONE = "4",
-}
-
--- element types representing content enclosure in Markdown
---
--- Used as a lookup set keyed by element tag name:
--- after such an element the parser state is set to BRACKETED_DONE,
--- i.e. it is treated like a bracketed enclosure that just closed.
-local ElementTypeIsEnclosure = {
- Emph = true,
- Image = true,
- Link = true,
- Strong = true,
-}
-
---- CURIE_TYPE_PREFIX - `typeof` or `resource` attribute CURIE hint
----
---- Lua pattern matching an optional single `.` or `=` before a CURIE.
-local CURIE_TYPE_PREFIX <const> = "[.=]?"
-
--- TODO: cover non-ASCII Unicode characters
--- @see <https://www.lua.org/manual/5.4/manual.html#6.5>
---- CURIE_PREFIX - CURIE prefix component as Lua pattern
----
---- _NAME_START_CHAR and _NAME_CHAR are fragments
---- meant to be placed inside a Lua pattern character set `[...]`;
---- _REF and CURIE_PREFIX are complete Lua patterns built from them.
---- Unlike _REF, CURIE_PREFIX also accepts `-` as its first character.
---- @see <https://www.w3.org/TR/2010/NOTE-curie-20101216/>
-local _NAME_START_CHAR <const> = "A-Z_a-z"
-local _NAME_CHAR <const> = _NAME_START_CHAR.."-0-9"
-local _REF <const> = "[".._NAME_START_CHAR.."][".._NAME_CHAR.."]*"
-local CURIE_PREFIX <const> = "[".._NAME_START_CHAR.."_-][".._NAME_CHAR.."]*"
-
---- CURIE_LONG - CURIE with prefix and reference, as Lua pattern
-local CURIE_LONG <const> = CURIE_PREFIX..":".._REF
-
---- CURIE_NO_REF - CURIE with only prefix, as Lua pattern
-local CURIE_NO_REF <const> = CURIE_PREFIX..":"
-
---- CURIE_LOCAL - CURIE with only name (no prefix), as Lua pattern
-local CURIE_LOCAL <const> = ":".._REF
-
---- CURIE_DEFAULT - CURIE without prefix or name, i.e. a lone colon
-local CURIE_DEFAULT <const> = ":"
-
--- TODO: CURIE_re - CURIE as `LPeg.re` regex object
--- TODO: test and replace above curie* patterns
--- @see <https://pandoc.org/lua-filters.html#global-variables>
---local CURIE_re <const> = re.compile("("..CURIE_PREFIX..")?:(".._REF..")?")
-
--- FIXME: define RDF context same as RDFa
--- TODO: maybe support overriding context with a JSON-LD URI
--- @see <https://www.w3.org/2011/rdfa-context/rdfa-1.1>
-
---- TableEmpty - check if Pandoc List is empty
----
---- On Pandoc 3.5 and newer, the first element is looked up
---- with pandoc.List:at() and compared with nil.
----
---- On older Pandoc releases, where pandoc.List:at() is missing,
---- an element is removed from a clone of the list instead;
---- the clone keeps the caller's list unmodified.
----
---- @param list Pandoc List to inspect
---- @return true if the list holds no elements, false otherwise
-local function TableEmpty(list)
- if PANDOC_IS_OLD then
- local list_clone = list:clone()
- return not list_clone:remove()
- else
- return list:at(1) == nil
- end
-end
-
---- Namespaces - process RDF namespace IRI declarations
----
---- Takes as input a single Para block element
---- (the function is registered as handler for `Para`).
---- A paragraph is treated as a namespace declaration
---- when it consists of exactly three inlines:
---- a Str holding a braced prefix followed by a colon,
---- a Space, and one more inline which is either
---- the word `@default` (as Str, or as sole content of a Cite)
---- or a Str beginning with `http:` or `https:` in any letter case.
----
---- Returns an empty list in case of a match
---- (presumably making Pandoc drop the paragraph -- confirm against
---- the Pandoc Lua filter documentation),
---- or nothing (to signal preservation of original content).
----
---- The declared prefix and IRI are not stored anywhere yet:
---- see the FIXME notes in the function body.
----
---- NOTE(review): the example in the file header declares the default
---- namespace as `{@default}: foaf`, whereas this function tests for
---- `@default` in the position after the colon -- confirm which form
---- is intended.
----
---- Example:
----
---- ```Markdown
---- # Annotated paragraph using a custom namespace
----
---- My favorite animal is the [Liger]{ov:preferredAnimal}.
---- {=<#me> .:Person}
----
---- {ov}: http://open.vocab.org/terms/
---- ```
----
---- @param blocks Para element possibly holding a namespace declaration
---- @returns empty list if the paragraph was a declaration, else nothing
---- @see <https://pandoc.org/lua-filters.html#type-blocks>
---- @see <https://www.w3.org/TR/rdf12-concepts/#vocabularies>
-local function Namespaces(blocks)
-
- -- paragraph with only a braced prefix (optionally led by `.` or `=`), colon and one word
- local curie_pattern = "^{"..CURIE_TYPE_PREFIX..CURIE_PREFIX.."}:$"
- if #blocks.content == 3
- and blocks.content[1].t == "Str"
- and blocks.content[2].t == "Space"
- and blocks.content[1].text:match(curie_pattern)
- then
- local el = blocks.content[3]
-
- -- default namespace, parsed as commonmark (plain word)
- if el.t == "Str"
- and el.text == "@default"
- then
- -- FIXME: add CURIE to metadata
- return {}
- end
-
- -- default namespace, parsed as markdown (word turned into a citation)
- if el.t == "Cite"
- and #el.content == 1
- and el.content[1].text == "@default"
- then
- -- FIXME: add CURIE to metadata
- return {}
- end
-
- -- namespace given as http or https IRI
- -- TODO: relax to match URI syntax without hardcoded protocols
- local proto_pattern = "^[Hh][Tt][Tt][Pp][Ss]?:"
- if el.t == "Str"
- and el.text:match(proto_pattern)
- then
- -- FIXME: add CURIE and URI to metadata
- return {}
- end
- end
-end
-
---- Statements - process inline RDF statements
----
---- Locate and extract ontological annotations
---- within a [Block] element of a Pandoc Abstract Syntax Tree (AST).
----
---- Markup for ontological annotations is an extension to Markdown
---- using similar syntax as hypermedia annotations,
---- but listing RDFa [CURIEs] in a braced enclosure.
----
---- ```ASCII-art
---- Simple ontological annotation:
---- "A [map]{foaf:depiction} is not the territory"
---- | ||\~~~~~~~~~~~~/|
---- a bc CURIEa d
----
---- Nested and mixed-use annotations:
---- ["Ceci n'est pas une [pipe]{foaf:depicts lang=fr}"]{dc:description}
---- | | ||\~~~~~~~~~~/ | ||\~~~~~~~~~~~~/|
---- a a1 |c1 CURIEa d1bc CURIEb d
---- b1
----
---- Chained hypermedia and ontological annotations:
---- "A [map](https://osm.org/){foaf:depiction} is not the territory"
---- | || ||\~~~~~~~~~~~~/|
---- a be fc CURIEa d
----
---- Legend:
---- a-b: bracketed enclosure around content
---- c-d: braced enclosure around ontological or other annotation
---- e-f: parenthesized enclosure around hypermedia annotation
---- ```
----
---- Ontological annotations are parsed and reorganised
---- using the following algorithm:
----
---- 1. locate pairs of bracketed text and braced text
---- either adjacent or separated by parenthesized text,
---- where braced text contains one or more [CURIEs]
---- 2. for each pair,
---- 1. add CURIEs in braced text to metadata
---- 2. add positions of brackets to metadata
---- 3. delete CURIEs
---- 4. delete braced enclosure if now structurally empty
---- 5. delete brackets if now unannotated
----
---- The implementation is inspired by Pandoc [issue#6038].
----
---- @param block Block element whose inline content may hold semantic annotations
---- @returns the Block stripped of semantic annotations, or nothing if none were found
---- @see [Block]: <https://pandoc.org/lua-filters.html#type-block>
---- @see [CURIEs]: <https://www.w3.org/TR/2010/NOTE-curie-20101216/>
---- @see [issue#6038]: <https://github.com/jgm/pandoc/issues/6038>
--- TODO: maybe instead as step #5 add/reuse hypermedia anchor
-local function Statements (block)
-
- -- current parser state, one of the Enclosure values
- -- TODO: support nested bracket enclosure
- local encl = Enclosure.NONE
-
- -- whether any annotation was extracted, i.e. block content gets replaced
- local block_has_diverged = false
-
- -- stacks of qualified and pending unenclosed/enclosed elements
- local elems = pandoc.List()
- local elems_unenclosed = pandoc.List()
- local elems_enclosed = pandoc.List()
-
- -- strings of pending unenclosed/enclosed chars
- local chars_unenclosed = ""
- local chars_enclosed = ""
-
- for _, el in ipairs(block.content) do
- local pos = 1
-
- -- non-string element, highest state first to support fall-through
- if el.t ~= 'Str' then
- if encl == Enclosure.BRACED_DONE then
-
- -- push post-brace string to stack
- -- and disqualify brace-only end-of-block enclosure
- -- TODO: parse chars_unenclosed as Str instead
- if chars_unenclosed:len() > 0 then
- elems_unenclosed:insert(pandoc.Str(chars_unenclosed))
- end
- chars_unenclosed = ""
- chars_enclosed = ""
- elems:extend(elems_unenclosed)
- elems_unenclosed = pandoc.List()
- elems_enclosed = pandoc.List()
-
- -- drop space after completed enclosure: state stays BRACED_DONE, so no branch below stores it
- -- FIXME: only strip after *completed* enclosure
- if el.t ~= "Space" then
- encl = Enclosure.NONE
- end
-
- -- fall through to parse element as unenclosed
- end
-
- if encl == Enclosure.BRACED then
- elems_unenclosed:insert(el)
- elems_enclosed:insert(el)
-
- goto continue
- end
-
- if encl == Enclosure.BRACKETED_DONE then
-
- -- disqualify bracketing not directly followed by brace
- elems:extend(elems_unenclosed)
- elems_unenclosed = pandoc.List()
- elems_enclosed = pandoc.List()
- encl = Enclosure.NONE
-
- -- fall through to parse element as unenclosed
- end
-
- if encl == Enclosure.BRACKETED then
- elems_unenclosed:insert(el)
- elems_enclosed:insert(el)
-
- goto continue
- end
-
- if encl == Enclosure.NONE then
-
- -- semantic annotation misparsed as Link: keep only the link content
- -- TODO: limit to solely CURIEs in target
- if el.t == "Link"
- and el.target:find("^{.*}$")
- then
- elems:extend(elems_unenclosed)
- elems:extend(el.content)
- elems_unenclosed = pandoc.List()
- elems_enclosed = pandoc.List()
- block_has_diverged = true
-
- else
- elems_unenclosed:insert(el)
- end
-
- -- specific elements represent content enclosure
- if ElementTypeIsEnclosure[el.t] then
- encl = Enclosure.BRACKETED_DONE
- end
- end
-
- goto continue
- end
-
- -- unenclosed immediately after enclosure
- if encl == Enclosure.BRACED_DONE then
-
- -- push post-brace string to stack
- -- and disqualify brace-only end-of-block enclosure
- -- TODO: parse chars_unenclosed as Str
- if chars_unenclosed:len() > 0 then
- elems_unenclosed:insert(pandoc.Str(chars_unenclosed))
- end
- chars_unenclosed = ""
- chars_enclosed = ""
- elems:extend(elems_unenclosed)
- elems_unenclosed = pandoc.List()
- elems_enclosed = pandoc.List()
- encl = Enclosure.NONE
- end
-
- -- unenclosed: consume chars up to first bracket, brace or backslash
- -- TODO: accept backslash except immediately before bracket (NOTE(review): text from a backslash onwards appears not to be stored anywhere)
- if encl == Enclosure.NONE then
- local _, nextpos, s = el.text:find("^([^%[{\\]*)")
- pos = nextpos and nextpos + 1 or pos + 1
- chars_unenclosed = chars_unenclosed..s
-
- -- entering bracketed or braced enclosure
- local t = el.text:sub(pos, pos)
- if t == "[" or t == "{" then
-
- -- qualify unenclosed elements
- elems:extend(elems_unenclosed)
- elems_unenclosed = pandoc.List()
- elems_enclosed = pandoc.List()
- if chars_unenclosed:len() > 0 then
- elems:insert(pandoc.Str(chars_unenclosed))
- end
-
- pos = pos + 1
- chars_unenclosed = chars_unenclosed..t
- chars_enclosed = ""
- if t == "[" then
- encl = Enclosure.BRACKETED
- elseif t == "{" then
- encl = Enclosure.BRACED
- end
- end
- end
-
- -- in bracketed enclosure
- -- TODO: accept backslash except immediately before bracket/brace
- -- TODO: support nested bracket enclosure
- if encl == Enclosure.BRACKETED then
- local _, nextpos, s = el.text:find("^([^%[%]}\\]*)", pos)
- pos = nextpos and nextpos + 1 or pos + 1
- chars_unenclosed = chars_unenclosed..s
- chars_enclosed = chars_enclosed..s
-
- -- exiting bracketed enclosure
- if el.text:sub(pos, pos) == "]" then
- pos = pos + 1
- chars_unenclosed = chars_unenclosed.."]"
- encl = Enclosure.BRACKETED_DONE
- end
- end
-
- -- exited bracketed enclosure
- if encl == Enclosure.BRACKETED_DONE then
-
- -- entering braced enclosure
- if el.text:sub(pos, pos) == "{" then
- pos = pos + 1
- chars_unenclosed = chars_unenclosed.."{"
- encl = Enclosure.BRACED
-
- -- leaving non-annotation enclosure
- else
-
- -- disqualify bracketing not directly followed by brace
- elems:extend(elems_unenclosed)
- elems_unenclosed = pandoc.List()
- elems_enclosed = pandoc.List()
- if chars_unenclosed:len() > 0 then
- elems:insert(pandoc.Str(chars_unenclosed))
- chars_unenclosed = ""
- end
- chars_enclosed = ""
- encl = Enclosure.NONE
-
- end
- end
-
- -- in braced enclosure, leaving it if a single CURIE or <IRI> directly precedes the closing brace
- -- TODO: support mixed-use enclosure
- if encl == Enclosure.BRACED then
- local curie_pattern1 = "^"..CURIE_TYPE_PREFIX..CURIE_LONG.."}"
- local curie_pattern2 = "^"..CURIE_TYPE_PREFIX..CURIE_NO_REF.."}"
- local curie_pattern3 = "^"..CURIE_TYPE_PREFIX..CURIE_LOCAL.."}"
- local curie_pattern4 = "^"..CURIE_TYPE_PREFIX..CURIE_DEFAULT.."}"
- local curie_pattern5 = "^"..CURIE_TYPE_PREFIX.."<[^<>]*>}"
- local _, nextpos1 = el.text:find(curie_pattern1, pos)
- local _, nextpos2 = el.text:find(curie_pattern2, pos)
- local _, nextpos3 = el.text:find(curie_pattern3, pos)
- local _, nextpos4 = el.text:find(curie_pattern4, pos)
- local _, nextpos5 = el.text:find(curie_pattern5, pos)
- local nextpos = nextpos1 or nextpos2 or nextpos3 or nextpos4 or nextpos5
- if nextpos then
- if chars_enclosed:len() > 0 then
- elems_enclosed:insert(pandoc.Str(chars_enclosed))
- end
-
- -- qualify completed bracketed enclosure
- if not TableEmpty(elems_enclosed) then
- elems:extend(elems_enclosed)
-
- -- qualify braced-only enclosure at beginning of block
- elseif (TableEmpty(elems_unenclosed)
- and (chars_unenclosed:len() == 0 or chars_unenclosed == "{"))
- then
- elems:extend(elems_enclosed)
-
- -- postpone braced-only enclosure maybe at end of block
- else
- chars_unenclosed = chars_unenclosed..el.text:sub(pos, nextpos)
- elems_unenclosed:insert(pandoc.Str(chars_unenclosed))
- chars_unenclosed = el.text:sub(nextpos + 1)
- chars_enclosed = el.text:sub(nextpos + 1)
- encl = Enclosure.BRACED_DONE
-
- goto continue
- end
-
- elems_enclosed = pandoc.List()
- elems_unenclosed = pandoc.List()
- chars_enclosed = ""
- chars_unenclosed = el.text:sub(nextpos + 1)
- encl = Enclosure.BRACED_DONE
-
- block_has_diverged = true
- end
- end
-
- -- push strings to stacks
- if chars_enclosed:len() > 0 then
- elems_enclosed:insert(pandoc.Str(chars_enclosed))
- end
- if chars_unenclosed:len() > 0 then
- elems_unenclosed:insert(pandoc.Str(chars_unenclosed))
- end
- chars_unenclosed = ""
- chars_enclosed = ""
-
- -- done parsing current Inline element
- ::continue::
- end
-
- -- qualify brace-only enclosure at end of block
- if encl == Enclosure.BRACED_DONE
- and not TableEmpty(elems_enclosed)
- then
- elems:extend(elems_enclosed)
- block_has_diverged = true
- end
-
- -- return altered stack if it contains complete enclosures; otherwise return nothing so the block stays untouched
- if block_has_diverged then
-
- -- disqualify incomplete enclosure
- elems:extend(elems_unenclosed)
-
- block.content = elems
- return block
- end
-end
-
--- First resolve namespace declarations, then statements.
---
--- The returned list holds four separate filters,
--- which Pandoc applies one after another:
--- stash the document metadata, handle namespace declaration paragraphs,
--- handle inline statements in every block, and restore the metadata.
---
--- Although this filter is *not* a full RDF parser,
--- order matters for the parts we do handle --
--- e.g. namespace resolving is similar to other RDF formats
--- with detailed documented process ordering.
---
--- @see <https://www.w3.org/TR/turtle/#sec-parsing>
--- @see <https://pandoc.org/lua-filters.html#lua-filter-structure>
-local meta = {}
-return {
-
- -- move aside MetaBlocks to speed up processing content
- -- (metadata is kept in the local `meta` and replaced by an empty table)
- -- @see <https://stackoverflow.com/a/47356252/18619283>
- { Meta = function(m) meta = m; return {} end },
-
- {Para = Namespaces},
-
- {Block = Statements},
-
- -- FIXME: add custom declared namespaces in Meta (for now the stashed metadata is restored unchanged)
- -- TODO: maybe add only actively used namespaces
- -- (do same as for unused link definitions)
- { Meta = function(_) return meta; end },
- --{ Meta = function(_) return NamespacesToMeta(meta); end },
-}