--- semantic-markdown - Pandoc plugin to process semantic hints
---
--- SPDX-FileCopyrightText: 2025 Jonas Smedegaard <dr@jones.dk>
--- SPDX-License-Identifier: GPL-3.0-or-later
---
--- ## Examples
---
--- Ideally, this text:
---
--- ```Markdown+RDF
--- Simple ontological annotation:
--- [This]{foaf:depiction} is not a pipe.
---
--- Nested, mixed-use and custom-namespaced annotations:
--- [[Ceci]{foaf:depiction} n'est pas une pipe.]{lang=fr bibo:Quote}
---
--- {bibo}: http://purl.org/ontology/bibo/
--- ```
---
--- ...should with this filter be transformed to this text:
---
--- ```Markdown
--- ---
--- turtle: |
---   @prefix bibo: <http://purl.org/ontology/bibo/> .
---
---   _:001 a foaf:depiction .
---   _:002 a foaf:depiction .
---   _:003 a bibo:Quote .
--- ---
--- Simple ontological annotation:
--- This is not a pipe.
---
--- Nested, mixed-use and custom-namespaced annotations:
--- [Ceci n'est pas une pipe.]{lang=fr}
--- ```
---
--- When target document format is html,
--- this filter should ideally produce RDFa 1.1 Lite or Core data.
--- (Lite is *not* a subset of Core as it deviates slightly).
---
--- * v0.0.1
---   * initial release
---
--- @version 0.0.1
--- @see <https://source.jones.dk/semantic-markdown/about/>
--- @see <https://moodle.ruc.dk/course/view.php?id=23505>
--- @see <https://www.w3.org/TR/rdfa-primer/#using-rdfa>
--- @see <https://www.ctrl.blog/entry/rdfa-link-attributes.html>

-- TODO: maybe use topdown traversal
--  * order of declaring annotations might matter (but should not)
--  * might enable simpler functions and/or faster processing
-- @see <https://pandoc.org/lua-filters.html#topdown-traversal>

-- Pin the "C" locale so that the character classes used in the
-- patterns below behave the same regardless of the system locale.
-- @see <https://pandoc.org/lua-filters.html#common-pitfalls>
os.setlocale("C")

-- TODO: cover non-ASCII Unicode characters
-- @see <https://www.lua.org/manual/5.4/manual.html#6.5>

--- Building blocks for the CURIE patterns below.
--- The two `_name*` values are the *contents* of a Lua character set
--- (to be wrapped in square brackets), not complete patterns.
--- @see <https://www.w3.org/TR/rdfa-core/#h-s_curies>
local _name_start_char = "A-Z_a-z"
local _name_char = _name_start_char .. "-0-9"

-- one start character followed by any number of name characters
local _reference = string.format("[%s][%s]*", _name_start_char, _name_char)

-- NOTE(review): the leading set here also admits "-" (and repeats "_");
-- an XML NCName cannot start with "-" - confirm whether this is intended.
local _prefix = string.format("[%s_-][%s]*", _name_start_char, _name_char)

--- curie_long - CURIE with prefix and reference, as Lua pattern
local curie_long = string.format("%s:%s", _prefix, _reference)

--- curie_no_ref - CURIE with only prefix, as Lua pattern
local curie_no_ref = string.format("%s:", _prefix)

--- curie_local - CURIE with only name, as Lua pattern
local curie_local = string.format(":%s", _reference)

--- curie_default - CURIE without prefix or name, as single character
local curie_default = ":"

-- TODO: curie_re - CURIE as `LPeg.re` regex object
-- TODO: test and replace above curie* patterns
-- @see <https://pandoc.org/lua-filters.html#global-variables>
--local curie_re = re.compile("(".._prefix..")?:(".._reference..")?")

-- FIXME: define RDF context same as RDFa
-- TODO: maybe support overriding context with a JSON-LD URI
-- @see <https://www.w3.org/2011/rdfa-context/rdfa-1.1>

--- Namespaces - process RDF namespace IRI declarations
---
--- Registered as the `Para` handler of this filter,
--- so the argument is a single Para block element (despite its name).
--- A paragraph made up of exactly three inlines -
--- a prefix in braces followed by a colon, a space, and an HTTP(S) IRI -
--- is treated as a namespace IRI declaration.
--- Returns an empty list of blocks in case of a match
--- (which removes the paragraph from the document),
--- or nothing (to signal preservation of original content).
---
--- Example:
---
--- ```Markdown
--- # Annotated paragraph using a custom namespace
---
--- My favorite animal is the [Liger]{ov:preferredAnimal}.
---
--- {ov}: http://open.vocab.org/terms/
--- ```
---
--- @param blocks Paragraph possibly declaring a namespace, as Para
--- @returns empty Blocks if the paragraph is a declaration, else nothing
--- @see <https://pandoc.org/lua-filters.html#type-blocks>
--- @see <https://www.w3.org/TR/rdf12-concepts/#vocabularies>
local function Namespaces(blocks)
  local content = blocks.content
  if #content ~= 3 then
    return
  end

  local label, separator, iri = content[1], content[2], content[3]
  if label.t ~= "Str" or separator.t ~= "Space" or iri.t ~= "Str" then
    return
  end

  -- Build the pattern first and pass it in parentheses:
  -- `s:match "lit" .. x` parses as `(s:match "lit") .. x`,
  -- so the full pattern would never be applied
  -- and a failed match would raise a concatenation error.
  local declaration = "^{" .. _prefix .. "}:$"
  if label.text:match(declaration) and iri.text:match("^https?:") then
    -- FIXME: register namespace in Meta
    return pandoc.Blocks {}
  end
end

--- Statements - process inline RDF statements
---
--- This function is a Pandoc hook executed for each Inlines object
--- when iterating through its Abstract Syntax Tree (AST) of a document.
---
--- ```ASCII-art
--- Simple ontological annotation:
--- "A [map]{foaf:depiction} is not the territory"
---    |   ||              |
---    |   |brace_open     brace_close
---    |   bracket_close
---    bracket_open
---
--- Nested and mixed-use annotations:
--- ["[Ceci]{foaf:depiction} n'est pas une pipe"]{lang=fr dc:Text}
--- | |    ||              |
--- | |    |brace_open     brace_close
--- | |    bracket_close
--- | bracket_open
--- ```
---
--- Work in progress: the scan below recognizes annotations
--- but the rebuilt list is not yet returned (see FIXME at the end),
--- so this hook currently always leaves the document unchanged.
---
--- @param inlines Markdown with semantic annotations as Inlines
--- @returns nothing for now; eventually Markdown stripped of
---   semantic annotations as Inlines
--- @see <https://pandoc.org/lua-filters.html#type-inline>
function Statements (inlines)
  -- enclosure state, maintained across inlines
  local bracketed, braced, has_hints = false, false, false
  local new_inlines = {}

  for _, el in ipairs(inlines) do
    if el.t ~= 'Str' then
      -- only string inlines can alter state
      new_inlines[#new_inlines + 1] = el
    else
      local text = el.text
      -- Position in *this* Str where the next marker must begin.
      -- Positions are deliberately not carried over between elements:
      -- an offset found in one Str is meaningless in another.
      local offset = 1

      -- unenclosed: look for an opening bracket
      if not (bracketed or braced) then
        local _, bracket_open = string.find(text, "%[")
        if bracket_open then
          bracketed = true
          -- continue *after* the bracket; the anchored pattern below
          -- cannot match when started on the bracket itself
          offset = bracket_open + 1
        end
      end

      -- inside brackets: look for closing bracket directly followed
      -- by an opening brace
      -- TODO: maybe support nested bracket enclosure
      if bracketed and not braced then
        local _, bracket_close, enclosed =
          string.find(text, "^([^%[%]}]*)%]{", offset)
        if bracket_close then
          braced = true
          new_inlines[#new_inlines + 1] = pandoc.Str(enclosed)
          -- bracket_close is the position of the opening brace
          offset = bracket_close + 1
        end
      end

      -- (ignore space-delimited enclosures: not in spec for inlines)

      -- inside braces: look for a CURIE directly followed
      -- by the closing brace
      -- TODO: support mixed-use enclosure
      -- TODO: cover curie_prefix and curie_local and curie_default
      if braced then
        local _, brace_close =
          string.find(text, "^" .. curie_long .. "}", offset)
        if brace_close then
          has_hints = true
          -- annotation complete: start over for the next one
          bracketed, braced = false, false
          -- TODO: call same function with remains of Str
        end
      end
    end
  end

  -- FIXME: enable once new_inlines is a faithful rewrite of inlines
  -- (unannotated Str elements, text surrounding an annotation and
  -- bracket content spanning several Str elements are not kept yet)
  -- if has_hints then
  --   return pandoc.Inlines(new_inlines)
  -- end
end

-- First resolve namespace declarations, then statements.
--
-- Although this filter is *not* a full RDF parser,
-- e.g. namespace resolving is similar to other RDF formats
-- with detailed documented process ordering.
-- @see <https://www.w3.org/TR/turtle/#sec-parsing>
--
-- The filters in this list are applied one after another, in order.
return {
  {Para = Namespaces},
  {Inlines = Statements},
  -- FIXME: add custom declared namespaces in Meta
  -- TODO: maybe add only actively used namespaces
  -- (do same as for unused link definitions)
  --{Meta = NamespacesToMeta},
}