--- semantic-markdown - Pandoc plugin to process semantic hints
---
--- SPDX-FileCopyrightText: 2025  Jonas Smedegaard <dr@jones.dk>
--- SPDX-License-Identifier: GPL-3.0-or-later
---
--- ## Examples
---
--- Ideally, this text:
---
--- ```Markdown+RDF
--- Simple ontological annotation:
--- [This]{foaf:depiction} is not a pipe.
---
--- Nested, mixed-use and custom-namespaced annotations:
--- [[Ceci]{foaf:depiction} n'est pas une pipe.]{lang=fr bibo:Quote}
---
--- {bibo}: http://purl.org/ontology/bibo/
--- ```
---
--- ...should with this filter be transformed to this text:
---
--- ```Markdown
--- ---
--- turtle: |
---   @prefix bibo: http://purl.org/ontology/bibo/
---
---   _:001 a foaf:depiction .
---   _:002 a foaf:depiction .
---   _:003 a bibo:Quote .
--- ---
--- Simple ontological annotation:
--- This is not a pipe.
---
--- Nested, mixed-use and custom-namespaced annotations:
--- [Ceci n'est pas une pipe.]{lang=fr}
--- ```
---
--- When target document format is html,
--- this filter should ideally produce RDFa 1.1 Lite or Core data.
--- (Lite is *not* a subset of Core as it deviates slightly).
---
--- * v0.0.1
---   * initial release
---
--- @version 0.0.1
--- @see <https://source.jones.dk/semantic-markdown/about/>
--- @see <https://moodle.ruc.dk/course/view.php?id=23505>
--- @see <https://www.w3.org/TR/rdfa-primer/#using-rdfa>
--- @see <https://www.ctrl.blog/entry/rdfa-link-attributes.html>

-- TODO: maybe use topdown traversal
--  * order of declaring annotations might matter (but should not)
--  * might enable simpler functions and/or faster processing
-- @see <https://pandoc.org/lua-filters.html#topdown-traversal>

-- Pin the "C" locale for all categories,
-- to keep character classes stable whatever the system locale is.
-- @see <https://pandoc.org/lua-filters.html#common-pitfalls>
os.setlocale("C", "all")

--- pseudo-enum table to track parser enclosure state
---
--- The values are opaque string tags:
--- the parser in Statements() only compares them for (in)equality.
--- @see <https://stackoverflow.com/a/70529481/18619283>
local Enclosure = {
  -- outside of any enclosure
  NONE = "0",
  -- opening "[" was consumed, closing "]" not yet seen
  BRACKETED = "1",
  -- closing "]" was just consumed; only a directly following "{"
  -- continues the annotation
  BRACKETED_DONE = "2",
  -- opening "{" was consumed, closing "}" not yet seen
  BRACED = "3",
}

-- TODO: cover non-ASCII Unicode characters
-- @see <https://www.lua.org/manual/5.4/manual.html#6.5>
--- CURIE components as Lua patterns (not regular expressions),
--- composed by string concatenation from the pieces below
--- @see <https://www.w3.org/TR/2010/NOTE-curie-20101216/>
-- body of a character class: ASCII letters and underscore
local _name_start_char = "A-Z_a-z"
-- body of a character class: as above, plus hyphen and ASCII digits
-- NOTE(review): the hyphen between "a-z" and "0-9" is meant literally;
-- confirm that the Lua version in use does not read it as a range
local _name_char = _name_start_char.."-0-9"
--- _ref - one name start character, then any number of name characters
local _ref = "[".._name_start_char.."][".._name_char.."]*"
--- curie_prefix - CURIE prefix component as Lua pattern
-- Same shape as _ref, but the first character class additionally lists
-- "_-", which adds the hyphen as a possible first character
-- (the underscore is already part of _name_start_char).
-- NOTE(review): the CURIE spec defines prefix as an NCName,
-- which presumably rules out a leading hyphen -- confirm intent
local curie_prefix = "[".._name_start_char.."_-][".._name_char.."]*"

--- curie_long - CURIE with prefix and reference as Lua pattern
--- (e.g. `foaf:depiction`)
local curie_long = curie_prefix..":".._ref

--- curie_no_ref - CURIE with only prefix as Lua pattern
--- (e.g. `foaf:`)
local curie_no_ref = curie_prefix..":"

--- curie_local - CURIE with only name as Lua pattern
--- (e.g. `:depiction`)
local curie_local = ":".._ref

--- curie_default - CURIE without prefix or name, i.e. a lone colon
local curie_default = ":"

-- TODO: curie_re - CURIE as `LPeg.re` regex object
-- TODO: test and replace above curie* patterns
-- (the line below is disabled and is not valid `re` syntax as written,
-- since it embeds the Lua patterns from above)
-- @see <https://pandoc.org/lua-filters.html#global-variables>
--local curie_re = re.compile("("..curie_prefix..")?:(".._ref..")?")

-- FIXME: define RDF context same as RDFa
-- TODO: maybe support overriding context with a JSON-LD URI
-- @see <https://www.w3.org/2011/rdfa-context/rdfa-1.1>

--- Namespaces - process RDF namespace IRI declarations
---
--- Filter function for a single Para block element.
--- A paragraph is recognised as a namespace declaration
--- when it consists of exactly three inlines:
---
---  1. a Str holding a braced CURIE prefix followed by a colon,
---  2. a Space,
---  3. either a Str starting with `http:` or `https:`,
---     or a Cite whose only content element has the text `@default`
---     (which is how a default namespace gets parsed).
---
--- Returns an empty list in case of a match,
--- or nothing (to signal preservation of original content).
--- The declared namespace is not yet stored anywhere (see FIXMEs).
---
--- Example:
---
--- ```Markdown
--- # Annotated paragraph using a custom namespace
---
--- My favorite animal is the [Liger]{ov:preferredAnimal}.
---
--- {ov}: http://open.vocab.org/terms/
--- ```
---
--- @param blocks  Para block, possibly a namespace declaration
--- @returns       empty list for a declaration, otherwise nothing
--- @see <https://pandoc.org/lua-filters.html#type-blocks>
--- @see <https://www.w3.org/TR/rdf12-concepts/#vocabularies>
local function Namespaces(blocks)
  local inlines = blocks.content

  -- a declaration is exactly: label, space, target
  if #inlines ~= 3 then
    return
  end
  local label, gap, target = inlines[1], inlines[2], inlines[3]
  if label.t ~= "Str" or gap.t ~= "Space" then
    return
  end

  -- label must be only a braced prefix-only CURIE and a colon
  if not label.text:match("^{"..curie_prefix.."}:$") then
    return
  end

  -- default namespace, misparsed as a citation
  if target.t == "Cite" then
    local cited = target.content

    -- TODO: maybe check case-insensitively
    if #cited == 1 and cited[1].text == "@default" then
      -- FIXME: add CURIE to metadata
      return {}
    end
    return
  end

  -- namespace
  -- TODO: maybe check case-insensitively
  -- TODO: relax to match URI syntax without hardcoded protocols
  if target.t == "Str" and target.text:match("^https?:") then
    -- FIXME: add CURIE and URI to metadata
    return {}
  end
end

--- Statements - process inline RDF statements
---
--- Locate and extract ontological annotations
--- within a [Block] element of a Pandoc Abstract Syntax Tree (AST).
---
--- Markup for ontological annotations is an extension to Markdown
--- using similar syntax as hypermedia annotations,
--- but listing RDFa [CURIEs] in a braced enclosure.
---
--- ```ASCII-art
--- Simple ontological annotation:
--- "A [map]{foaf:depiction} is not the territory"
---    |   ||\~~~~~~~~~~~~/|
---    a   bc    CURIEa    d
---
--- Nested and mixed-use annotations:
--- ["[Ceci]{foaf:depiction} n'est pas une pipe"]{lang=fr dc:Text}
--- | |    ||\~~~~~~~~~~~~/|                    ||        \~~~~~/|
--- a a1   |c1   CURIEa    d1                   bc        CURIEb d
---        b1
---
--- Chained hypermedia and ontological annotations:
--- "A [map](https://osm.org/){foaf:depiction} is not the territory"
---    |   ||                ||\~~~~~~~~~~~~/|
---    a   be                fc    CURIEa    d
---
--- Legend:
---  a-b: bracketed enclosure around content
---  c-d: braced enclosure around ontological or other annotation
---  e-f: parenthesized enclosure around hypermedia annotation
--- ```
---
--- Ontological annotations are parsed and reorganised
--- using the following algorithm:
---
---  1. locate pairs of bracketed text and braced text
---     either adjacent or separated by parenthesized text,
---     where braced text contains one or more [CURIEs]
---  2. for each pair,
---    1. add CURIEs in braced text to metadata
---    2. add positions of brackets to metadata
---    3. delete CURIEs
---    4. delete braced enclosure if now structurally empty
---    5. delete brackets if now unannotated
---
--- Current state of the implementation:
---
---  * only the simple form is handled:
---    a bracketed enclosure directly followed by a braced enclosure
---    holding exactly one CURIE
---  * steps 2.1 and 2.2 are not implemented:
---    statements are counted and their markup deleted,
---    but nothing is added to metadata
---  * when a bracketed enclosure is not directly followed by "{",
---    everything collected so far for this block is discarded
---    and the remains of that Str are skipped (see TODOs below)
---  * a braced enclosure without a recognised CURIE is never left,
---    so later inlines of the block are dropped from the result
---
--- The implementation is inspired by Pandoc [issue#6038].
---
--- @param block  block element with semantic annotations in its content
--- @returns      Blocks holding one Para stripped of semantic annotations,
---               or nothing if no statement was found
---               (to signal preservation of original content)
--- @see [Block]: <https://pandoc.org/lua-filters.html#type-block>
--- @see [CURIEs]: <https://www.w3.org/TR/2010/NOTE-curie-20101216/>
--- @see [issue#6038]: <https://github.com/jgm/pandoc/issues/6038>
-- TODO: maybe instead as step #5 add/reuse hypermedia anchor
function Statements (block)

  -- flags for enclosing stages
  -- TODO: support nested bracket enclosure
  local enclosure = Enclosure.NONE

  -- amount of detected statements in this block
  local statement_count = 0

  -- inlines to emit in place of the original content
  local stack = {}

  -- patterns for a single CURIE up to and including the closing brace,
  -- in order of precedence
  local braced_curies = {
    "^"..curie_long.."}",
    "^"..curie_no_ref.."}",
    "^"..curie_local.."}",
    "^"..curie_default.."}",
  }

  for _, el in ipairs(block.content) do

    -- position in el.text of the next unparsed character
    local pos = 1

    -- text kept from this Str, stripped of annotation markup
    local stack_next = ""

    -- non-string element
    if el.t ~= 'Str' then

      -- TODO: support mixed-use braced enclosure
      if enclosure ~= Enclosure.BRACED then
        table.insert(stack, el)
      end
      goto continue
    end

    -- unenclosed
    -- TODO: support backslash except immediately before bracket
    if enclosure == Enclosure.NONE then

      -- leading text free of "[" and backslash;
      -- may be empty, but the pattern itself cannot fail
      local _, lead_end, lead = el.text:find("^([^%[\\]*)")
      local open_at = lead_end + 1
      if el.text:sub(open_at, open_at) == "[" then

        -- entering bracketed enclosure
        pos = open_at + 1
        stack_next = stack_next..lead
        enclosure = Enclosure.BRACKETED

      -- staying unenclosed
      else
        table.insert(stack, el)
        goto continue
      end
    end

    -- in bracketed enclosure
    -- TODO: support backslash except immediately before bracket/brace
    -- TODO: support nested bracket enclosure
    if enclosure == Enclosure.BRACKETED then

      -- text free of brackets, closing brace and backslash;
      -- may be empty, but the pattern itself cannot fail
      local _, inner_end, inner = el.text:find("^([^%[%]}\\]*)", pos)
      local close_at = inner_end + 1
      stack_next = stack_next..inner

      -- exiting bracketed enclosure
      if el.text:sub(close_at, close_at) == "]" then
        pos = close_at + 1
        enclosure = Enclosure.BRACKETED_DONE
      end
    end

    -- exited bracketed enclosure
    if enclosure == Enclosure.BRACKETED_DONE then

      -- entering braced enclosure
      if el.text:sub(pos, pos) == "{" then
        pos = pos + 1
        enclosure = Enclosure.BRACED

      -- leaving non-annotation enclosure
      else
        enclosure = Enclosure.NONE

        -- TODO: clear only back to entering this bracketed enclosure
        stack = {}

        -- TODO: parse remains of Str
        goto continue
      end
    end

    -- in braced enclosure, leaving it
    -- TODO: support mixed-use enclosure
    if enclosure == Enclosure.BRACED then

      -- position of the closing brace, if a CURIE fills the enclosure;
      -- kept local so that a match in an earlier element or block
      -- cannot be mistaken for a match in this one
      local brace_at
      for _, curie in ipairs(braced_curies) do
        local _, last = el.text:find(curie, pos)
        if last then
          brace_at = last
          break
        end
      end
      if brace_at then
        statement_count = statement_count + 1
        pos = brace_at + 1

        -- TODO: instead recursively call Statements() on remains of Str
        stack_next = stack_next..el.text:sub(pos)

        enclosure = Enclosure.NONE
      end
    end

    -- push any string collected from above parsing to stack
    if stack_next:len() > 0 then
      table.insert(stack, pandoc.Str(stack_next))
    end

    -- done parsing current Inline element
    ::continue::
  end
  if statement_count > 0 then
    return pandoc.Blocks {pandoc.Para(stack)}
  end
end

-- Filter passes, listed in the order they are to run:
-- first resolve namespace declarations, then statements.
--
-- Although this filter is *not* a full RDF parser,
-- order matters for the parts we do handle --
-- e.g. namespace resolving is similar to other RDF formats
-- with detailed documented process ordering.
--
-- @see <https://www.w3.org/TR/turtle/#sec-parsing>

-- document metadata, parked here while content is processed
local meta = {}

-- move aside MetaBlocks to speed up processing content
--
-- @see <https://stackoverflow.com/a/47356252/18619283>
local function stash_meta(m)
  meta = m
  return {}
end

-- put back the metadata parked by stash_meta()
--
-- FIXME: add custom declared namespaces in Meta
-- TODO: maybe add only actively used namespaces
-- (do same as for unused link definitions)
local function restore_meta(_)
  return meta
end

return {
  { Meta = stash_meta },
  { Para = Namespaces },
  { Para = Statements },
  { Meta = restore_meta },
  --{ Meta = function(_) return NamespacesToMeta(meta); end },
}