-- _extensions/ruc-play/semantic-markdown/semantic-markdown.lua



--- semantic-markdown - Pandoc plugin to process semantic hints
---
--- SPDX-FileCopyrightText: 2025  Jonas Smedegaard <dr@jones.dk>
--- SPDX-License-Identifier: GPL-3.0-or-later
---
--- ## Examples
---
--- The following Markdown text includes semantic annotations
--- within braced enclosures:
---
--- ```markdown
--- # {=<#artwork> .:Image} Semantics
---
--- Simple ontological annotation:
--- [This][painting] is not a [pipe].
---
--- Nested, mixed-use and custom-namespaced annotations:
--- [[Ceci][painting] n'est pas une [pipe].]{lang=fr bibo:shortDescription}
---
--- [painting]: {wd:Q1061035}
---   "A painting of a smoking pipe {:depiction}"
---
--- [pipe]: {wd:Q104526}
---   "A smoking pipe {:depicts}"
---
--- {@default}: foaf
---
--- {bibo}: http://purl.org/ontology/bibo/
---
--- {wd}: http://www.wikidata.org/entity/
--- ```
---
--- This filter should transform the above text, with the command
--- `pandoc -L semantic-markdown.lua -t commonmark --wrap=preserve`,
--- into the below markdown text with semantic annotations as metadata:
---
--- ```markdown
--- ---
--- turtle: |
---   @prefix bibo: http://purl.org/ontology/bibo/
---   @prefix foaf: http://xmlns.com/foaf/0.1/
---   @prefix wd: https://www.wikidata.org/entity/
---
---   <#artwork> a foaf:Image ;
---     foaf:depiction <https://www.wikidata.org/entity/Q1061035> ;
---     foaf:depicts <https://www.wikidata.org/entity/Q104526> ;
---     bibo:shortDescription "Ceci n'est pas une pipe."@fr .
--- ---
--- # Semantics
---
--- Simple ontological annotation:
--- [This][painting] is not a [pipe].
---
--- Nested, mixed-use and custom-namespaced annotations:
--- [[Ceci][painting] n'est pas une [pipe].]{lang=fr}
---
--- [painting]: https://www.wikidata.org/entity/Q1061035
---   "A painting of a smoking pipe"
---
--- [pipe]: https://www.wikidata.org/entity/Q104526
---   "A smoking pipe"
--- ```
---
--- This filter should also transform the above text, with the command
--- `pandoc -L semantic-markdown.lua -t html --wrap=preserve`,
--- into the below HTML text with embedded RDFa Lite 1.1 annotations,
--- modulo wrapping of long lines:
---
--- ```html
--- <div vocab="http://xmlns.com/foaf/0.1/"
--- prefix="bibo: http://purl.org/ontology/bibo/"
--- resource="#artwork" typeof="Image">
--- <h1>Semantics</h1>
--- <p>Simple ontological annotation:
--- <a property="depiction"
--- href="https://www.wikidata.org/entity/Q1061035"
--- title="A painting of a smoking pipe">This</a>
--- is not
--- a <a property="depicts"
--- href="https://www.wikidata.org/entity/Q104526"
--- title="A smoking pipe">pipe</a>.</p>
---
--- <p>Nested, mixed-use and custom-namespaced annotations:
--- <span lang="fr" property="bibo:shortDescription">
--- <a property="depiction"
--- href="https://www.wikidata.org/entity/Q1061035"
--- title="A painting of a smoking pipe">Ceci</a>
--- n'est pas
--- une <a property="depicts"
--- href="https://www.wikidata.org/entity/Q104526"
--- title="A smoking pipe">pipe</a>.
--- </span></p>
--- </div>
--- ```
---
--- * v0.0.1
---   * initial release
---
--- @version 0.0.1
--- @see <https://source.jones.dk/semantic-markdown/about/>
--- @see <https://moodle.ruc.dk/course/view.php?id=23505>
--- @see <https://www.w3.org/TR/rdfa-primer/#using-rdfa>
--- @see <https://www.ctrl.blog/entry/rdfa-link-attributes.html>

-- TODO: maybe use topdown traversal
--  * order of declaring annotations might matter (but should not)
--  * might enable simpler functions and/or faster processing
-- @see <https://pandoc.org/lua-filters.html#topdown-traversal>

-- ensure stable character classes independent of system locale
-- @see <https://pandoc.org/lua-filters.html#common-pitfalls>
os.setlocale("C")

-- flag running with older release of Pandoc
--
-- Some Pandoc features,
-- notably pandoc.List:at() introduced with Pandoc 3.5,
-- are unavailable in older Pandoc releases still in widespread use
-- due to complexities of keeping Haskell dependencies in sync.
-- @see <https://bugs.debian.org/1098377>
-- true when the running Pandoc is older than 3.5,
-- expressed as "not (3.5 or newer)"
local PANDOC_IS_OLD <const> = not (
  PANDOC_VERSION[1] > 3
  or (PANDOC_VERSION[1] == 3 and PANDOC_VERSION[2] >= 5)
)

--- pseudo-enum table to track parser enclosure state
--- @see <https://stackoverflow.com/a/70529481/18619283>
local Enclosure = {
  -- not inside any enclosure (initial state, and after each decision)
  NONE           = '0',
  -- after an opening "[" whose closing "]" has not been seen yet
  BRACKETED      = '1',
  -- directly after a closing "]" or after an enclosure element
  BRACKETED_DONE = '2',
  -- after a "{" directly following a completed enclosure
  BRACED         = '3',
}

-- element types representing content enclosure in Markdown
-- lookup set keyed by element tag; listed tags map to true,
-- any other tag yields nil
local ElementTypeIsEnclosure = {}
for _, tag in ipairs { "Emph", "Image", "Link", "Strong" } do
  ElementTypeIsEnclosure[tag] = true
end

--- CURIE_TYPE_PREFIX - `typeof` or `resource` attribute CURIE hint
local CURIE_TYPE_PREFIX <const> = "[.=]?"

-- TODO: cover non-ASCII Unicode characters
-- @see <https://www.lua.org/manual/5.4/manual.html#6.5>
-- Building blocks, written as Lua pattern fragments:
--  * _NAME_START_CHAR and _NAME_CHAR are bodies of character sets,
--    i.e. meant to be placed between square brackets
--  * _REF is a complete pattern for the reference part of a CURIE
local _NAME_START_CHAR <const> = "A-Z_a-z"
local _NAME_CHAR <const> = ("%s-0-9"):format(_NAME_START_CHAR)
local _REF <const> = ("[%s][%s]*"):format(_NAME_START_CHAR, _NAME_CHAR)

--- CURIE_PREFIX - CURIE prefix component as set of chars
--- @see <https://www.w3.org/TR/2010/NOTE-curie-20101216/>
-- NOTE(review): the first set also admits a leading "-",
-- which an NCName may not start with -- confirm this is intended
local CURIE_PREFIX <const> =
  ("[%s_-][%s]*"):format(_NAME_START_CHAR, _NAME_CHAR)

--- CURIE_LONG - CURIE with prefix and reference as set of chars
local CURIE_LONG <const> = table.concat { CURIE_PREFIX, ":", _REF }

--- CURIE_NO_REF - CURIE with only prefix as set of chars
local CURIE_NO_REF <const> = table.concat { CURIE_PREFIX, ":" }

--- CURIE_LOCAL - CURIE with only name as set of chars
local CURIE_LOCAL <const> = table.concat { ":", _REF }

--- CURIE_DEFAULT - CURIE without prefix or name as char
local CURIE_DEFAULT <const> = ":"

-- TODO: CURIE_re - CURIE as `LPeg.re` regex object
-- TODO: test and replace above curie* patterns
-- @see <https://pandoc.org/lua-filters.html#global-variables>
--local CURIE_re <const> = re.compile("("..CURIE_PREFIX..")?:(".._REF..")?")

-- FIXME: define RDF context same as RDFa
-- TODO: maybe support overriding context with a JSON-LD URI
-- @see <https://www.w3.org/2011/rdfa-context/rdfa-1.1>

--- TableEmpty - check if Pandoc List contains any elements
---
--- Inspects the first sequence slot directly
--- instead of calling pandoc.List:at() (missing before Pandoc 3.5),
--- so a single code path serves all Pandoc releases
--- and no copy of the list is made just to test for emptiness.
---
--- @param list  Pandoc List to inspect
--- @return      true if the list has no elements, false otherwise
local function TableEmpty(list)

  -- a Pandoc List is a Lua sequence: it is empty iff slot 1 is unset
  return list[1] == nil
end

--- Namespaces - process RDF namespace IRI declarations
---
--- Takes as input a single Para block element.
--- If the block matches the pattern for a namespace IRI definition,
--- the declaration is recognised
--- (storing it in metadata is not yet implemented, see FIXME notes).
--- Returns an empty list in case of a match (removing the paragraph),
--- or nothing (to signal preservation of original content).
---
--- Example:
---
--- ```Markdown
--- # Annotated paragraph using a custom namespace
---
--- My favorite animal is the [Liger]{ov:preferredAnimal}.
--- {=<#me> .:Person}
---
--- {ov}: http://open.vocab.org/terms/
--- ```
---
--- @param blocks  Para block that may hold a namespace declaration
--- @return        empty list to drop a declaration, otherwise nothing
--- @see <https://pandoc.org/lua-filters.html#type-blocks>
--- @see <https://www.w3.org/TR/rdf12-concepts/#vocabularies>
local function Namespaces(blocks)
  local inlines = blocks.content

  -- a declaration consists of exactly three inlines:
  -- a braced prefix-only CURIE with trailing colon, a space, one element
  if #inlines ~= 3 then
    return
  end
  local label, separator, el = inlines[1], inlines[2], inlines[3]
  if label.t ~= "Str" or separator.t ~= "Space" then
    return
  end
  local curie_pattern = "^{"..CURIE_TYPE_PREFIX..CURIE_PREFIX.."}:$"
  if not label.text:match(curie_pattern) then
    return
  end

  if el.t == "Str" then

    -- default namespace, parsed as commonmark
    -- FIXME: add CURIE to metadata
    if el.text == "@default" then
      return {}
    end

    -- namespace
    -- TODO: relax to match URI syntax without hardcoded protocols
    -- FIXME: add CURIE and URI to metadata
    if el.text:match("^[Hh][Tt][Tt][Pp][Ss]?:") then
      return {}
    end

  elseif el.t == "Cite" then

    -- default namespace, parsed as markdown
    -- FIXME: add CURIE to metadata
    if #el.content == 1 and el.content[1].text == "@default" then
      return {}
    end
  end

  -- anything else is ordinary content: return nothing to keep it
end

--- Statements - process inline RDF statements
---
--- Locate and extract ontological annotations
--- within a [Block] element of a Pandoc Abstract Syntax Tree (AST).
---
--- Markup for ontological annotations is an extension to Markdown
--- using similar syntax as hypermedia annotations,
--- but listing RDFa [CURIEs] in a braced enclosure.
---
--- ```ASCII-art
--- Simple ontological annotation:
--- "A [map]{foaf:depiction} is not the territory"
---    |   ||\~~~~~~~~~~~~/|
---    a   bc    CURIEa    d
---
--- Nested and mixed-use annotations:
--- ["Ceci n'est pas une [pipe]{foaf:depicts lang=fr}"]{dc:description}
--- |                    |    ||\~~~~~~~~~~/        | ||\~~~~~~~~~~~~/|
--- a                    a1   |c1  CURIEa           d1bc    CURIEb    d
---                           b1
---
--- Chained hypermedia and ontological annotations:
--- "A [map](https://osm.org/){foaf:depiction} is not the territory"
---    |   ||                ||\~~~~~~~~~~~~/|
---    a   be                fc    CURIEa    d
---
--- Legend:
---  a-b: bracketed enclosure around content
---  c-d: braced enclosure around ontological or other annotation
---  e-f: parenthesized enclosure around hypermedia annotation
--- ```
---
--- Ontological annotations are parsed and reorganised
--- using the following algorithm:
---
---  1. locate pairs of bracketed text and braced text
---     either adjacent or separated by parenthesized text,
---     where braced text contains one or more [CURIEs]
---  2. for each pair,
---    1. add CURIEs in braced text to metadata
---    2. add positions of brackets to metadata
---    3. delete CURIEs
---    4. delete braced enclosure if now structurally empty
---    5. delete brackets if now unannotated
---
--- The implementation is inspired by Pandoc [issue#6038].
---
--- @param block  Block element whose inline content may hold semantic annotations
--- @return       Block stripped of semantic annotations, or nothing if none were found
--- @see [Block]: <https://pandoc.org/lua-filters.html#type-block>
--- @see [CURIEs]: <https://www.w3.org/TR/2010/NOTE-curie-20101216/>
--- @see [issue#6038]: <https://github.com/jgm/pandoc/issues/6038>
-- TODO: maybe instead as step #5 add/reuse hypermedia anchor
local function Statements (block)

  -- This function is registered as the generic `Block` handler,
  -- so it is also called for block types without a `content` field
  -- (e.g. CodeBlock or HorizontalRule).
  -- Leave those untouched instead of failing in ipairs().
  local content = block.content
  if content == nil then
    return
  end

  -- flags for enclosing stages
  -- TODO: support nested bracket enclosure
  local encl = Enclosure.NONE

  -- amount of detected statements in this block
  local statement_count = 0

  -- stacks of qualified and pending unenclosed/enclosed elements
  --  * elems: elements already decided for output
  --  * elems_unenclosed: pending elements in raw form,
  --    emitted if the pending enclosure is disqualified
  --    (in state NONE: plain content not yet flushed)
  --  * elems_enclosed: pending content of the current enclosure,
  --    emitted if the enclosure is qualified by a braced CURIE
  local elems = pandoc.List()
  local elems_unenclosed = pandoc.List()
  local elems_enclosed = pandoc.List()

  -- strings of pending unenclosed/enclosed chars
  -- (both are empty at the start of each Str element)
  local chars_unenclosed = ""
  local chars_enclosed = ""

  -- patterns for a CURIE closing a braced enclosure,
  -- tried in this order; built once since they never change
  local closing_curie_patterns = {
    "^"..CURIE_TYPE_PREFIX..CURIE_LONG.."}",
    "^"..CURIE_TYPE_PREFIX..CURIE_NO_REF.."}",
    "^"..CURIE_TYPE_PREFIX..CURIE_LOCAL.."}",
    "^"..CURIE_TYPE_PREFIX..CURIE_DEFAULT.."}",
  }

  for _, el in ipairs(content) do
    local pos = 1

    -- non-string element, highest state first to support fall-through
    if el.t ~= 'Str' then

      if encl == Enclosure.BRACED then
        -- NOTE(review): elements inside braces are also collected
        -- as enclosed content -- confirm whether that is intended
        elems_unenclosed:insert(el)
        elems_enclosed:insert(el)

        goto continue
      end

      if encl == Enclosure.BRACKETED_DONE then

        -- disqualify bracketing not directly followed by brace
        elems:extend(elems_unenclosed)
        elems_unenclosed = pandoc.List()
        elems_enclosed = pandoc.List()
        encl = Enclosure.NONE

        -- fall through to parse element as unenclosed
      end

      if encl == Enclosure.BRACKETED then
        elems_unenclosed:insert(el)
        elems_enclosed:insert(el)

        goto continue
      end

      -- unenclosed (the only state left at this point)
      if ElementTypeIsEnclosure[el.t] then

        -- specific elements represent content enclosure:
        -- qualify what came before, and keep the element itself pending
        -- so it survives both a following braced CURIE (qualified)
        -- and anything else (disqualified)
        elems:extend(elems_unenclosed)
        elems_unenclosed = pandoc.List()
        elems_enclosed = pandoc.List()
        elems_unenclosed:insert(el)
        elems_enclosed:insert(el)
        encl = Enclosure.BRACKETED_DONE
      else
        elems_unenclosed:insert(el)
      end

      goto continue
    end

    -- unenclosed
    -- TODO: accept backslash except immediately before bracket
    if encl == Enclosure.NONE then
      local _, nextpos, s = el.text:find("^([^%[\\]*)")
      pos = nextpos and nextpos + 1 or pos + 1
      chars_unenclosed = chars_unenclosed..s

      -- entering bracketed enclosure
      if el.text:sub(pos, pos) == "[" then

        -- qualify unenclosed elements
        elems:extend(elems_unenclosed)
        elems_unenclosed = pandoc.List()
        elems_enclosed = pandoc.List()
        if chars_unenclosed:len() > 0 then
          elems:insert(pandoc.Str(chars_unenclosed))
        end

        -- the text before the bracket is now emitted,
        -- so the raw buffer restarts at the bracket
        -- (otherwise that text is repeated on disqualification)
        pos = pos + 1
        chars_unenclosed = "["
        chars_enclosed = ""
        encl = Enclosure.BRACKETED

      -- scan stopped at an unsupported char (backslash), or at the end
      else

        -- keep remains of the string verbatim instead of dropping them
        chars_unenclosed = chars_unenclosed..el.text:sub(pos)
        pos = el.text:len() + 1
      end
    end

    -- in bracketed enclosure
    -- TODO: accept backslash except immediately before bracket/brace
    -- TODO: support nested bracket enclosure
    if encl == Enclosure.BRACKETED then
      local _, nextpos, s = el.text:find("^([^%[%]}\\]*)", pos)
      pos = nextpos and nextpos + 1 or pos + 1
      chars_unenclosed = chars_unenclosed..s
      chars_enclosed = chars_enclosed..s

      -- exiting bracketed enclosure
      if el.text:sub(pos, pos) == "]" then
        pos = pos + 1
        chars_unenclosed = chars_unenclosed.."]"
        encl = Enclosure.BRACKETED_DONE

      -- scan stopped at an unsupported char (bracket, brace, backslash)
      elseif pos <= el.text:len() then

        -- keep remains of the string verbatim, as raw text and as content,
        -- so nothing is lost whether or not the enclosure qualifies later
        local rest = el.text:sub(pos)
        chars_unenclosed = chars_unenclosed..rest
        chars_enclosed = chars_enclosed..rest
        pos = el.text:len() + 1
      end
    end

    -- exited bracketed enclosure
    if encl == Enclosure.BRACKETED_DONE then

      -- entering braced enclosure
      if el.text:sub(pos, pos) == "{" then
        pos = pos + 1
        chars_unenclosed = chars_unenclosed.."{"
        encl = Enclosure.BRACED

      -- leaving non-annotation enclosure
      else

        -- disqualify bracketing not directly followed by brace
        elems:extend(elems_unenclosed)
        elems_unenclosed = pandoc.List()
        elems_enclosed = pandoc.List()

        -- keep remains of the string verbatim instead of dropping them
        -- TODO: instead recursively parse remains of Str
        chars_unenclosed = chars_unenclosed..el.text:sub(pos)
        pos = el.text:len() + 1

        if chars_unenclosed:len() > 0 then
          elems:insert(pandoc.Str(chars_unenclosed))
          chars_unenclosed = ""
        end
        chars_enclosed = ""
        encl = Enclosure.NONE

      end
    end

    -- in braced enclosure, leaving it
    -- TODO: support mixed-use enclosure
    if encl == Enclosure.BRACED then

      -- end position of first matching pattern, if any
      local nextpos
      for _, pattern in ipairs(closing_curie_patterns) do
        local _, last = el.text:find(pattern, pos)
        if last then
          nextpos = last
          break
        end
      end

      if nextpos then
        if chars_enclosed:len() > 0 then
          elems_enclosed:insert(pandoc.Str(chars_enclosed))
        end

        -- qualify completed bracketed enclosure
        if not TableEmpty(elems_enclosed) then
          elems:extend(elems_enclosed)
        end

        elems_enclosed = pandoc.List()
        elems_unenclosed = pandoc.List()
        chars_enclosed = ""
        chars_unenclosed = ""
        encl = Enclosure.NONE

        statement_count = statement_count + 1
        pos = nextpos + 1

        -- TODO: instead recursively parse remains of Str
        chars_unenclosed = chars_unenclosed..el.text:sub(pos)

      -- no closing CURIE here: enclosure stays open
      else

        -- remember raw text, to restore it if never qualified
        chars_unenclosed = chars_unenclosed..el.text:sub(pos)
      end
    end

    -- push strings to stacks
    if chars_enclosed:len() > 0 then
      elems_enclosed:insert(pandoc.Str(chars_enclosed))
    end
    if chars_unenclosed:len() > 0 then
      elems_unenclosed:insert(pandoc.Str(chars_unenclosed))
    end
    chars_unenclosed = ""
    chars_enclosed = ""

    -- done parsing current Inline element
    ::continue::
  end

  -- return altered stack if it contains complete enclosures
  if statement_count > 0 then

    -- flush whatever is still pending:
    -- raw text of an incomplete (disqualified) enclosure,
    -- or plain content following the last statement
    elems:extend(elems_unenclosed)

    block.content = elems
    return block
  end
end

-- First resolve namespace declarations, then statements.
--
-- Although this filter is *not* a full RDF parser,
-- order matters for the parts we do handle --
-- e.g. namespace resolving is similar to other RDF formats
-- with detailed documented process ordering.
--
-- @see <https://www.w3.org/TR/turtle/#sec-parsing>
-- document metadata, set aside while content is processed
local meta = {}

-- remember the document metadata and replace it with an empty table
local function stash_meta(m)
  meta = m
  return {}
end

-- hand back the metadata remembered by stash_meta
local function restore_meta(_)
  return meta
end

return {

  -- move aside MetaBlocks to speed up processing content
  --
  -- @see <https://stackoverflow.com/a/47356252/18619283>
  { Meta = stash_meta },

  { Para = Namespaces },

  { Block = Statements },

  -- FIXME: add custom declared namespaces in Meta
  -- TODO: maybe add only actively used namespaces
  -- (do same as for unused link definitions)
  { Meta = restore_meta },
  --{ Meta = function(_) return NamespacesToMeta(meta); end },
}