aboutsummaryrefslogtreecommitdiff
path: root/_extensions/ruc-play/semantic-markdown/semantic-markdown.lua
blob: 7ae46554eab2aed31c8e82c6bf1cc9543aaf9233 (plain)
  1. --- semantic-markdown - Pandoc plugin to process semantic hints
  2. ---
  3. --- SPDX-FileCopyrightText: 2025 Jonas Smedegaard <dr@jones.dk>
  4. --- SPDX-License-Identifier: GPL-3.0-or-later
  5. ---
  6. --- ## Examples
  7. ---
  8. --- Ideally, this text:
  9. ---
  10. --- ```Markdown+RDF
  11. --- Simple ontological annotation:
  12. --- [This]{foaf:depiction} is not a pipe.
  13. ---
  14. --- Nested, mixed-use and custom-namespaced annotations:
  15. --- [[Ceci]{foaf:depiction} n'est pas une pipe.]{lang=fr bibo:Quote}
  16. ---
  17. --- {bibo}: http://purl.org/ontology/bibo/
  18. --- ```
  19. ---
  20. --- ...should with this filter be transformed to this text:
  21. ---
  22. --- ```Markdown
  23. --- ---
  24. --- turtle: |
  25. --- @prefix bibo: http://purl.org/ontology/bibo/
  26. ---
  27. --- _:001 a foaf:depiction .
  28. --- _:002 a foaf:depiction .
  29. --- _:003 a bibo:Quote .
  30. --- ---
  31. --- Simple ontological annotation:
  32. --- This is not a pipe.
  33. ---
  34. --- Nested, mixed-use and custom-namespaced annotations:
  35. --- [Ceci n'est pas une pipe.]{lang=fr}
  36. --- ```
  37. ---
  38. --- When target document format is html,
  39. --- this filter should ideally produce RDFa 1.1 Lite or Core data.
  40. --- (Lite is *not* a subset of Core as it deviates slightly).
  41. ---
  42. --- * v0.0.1
  43. --- * initial release
  44. ---
  45. --- @version 0.0.1
  46. --- @see <https://source.jones.dk/semantic-markdown/about/>
  47. --- @see <https://moodle.ruc.dk/course/view.php?id=23505>
  48. --- @see <https://www.w3.org/TR/rdfa-primer/#using-rdfa>
  49. --- @see <https://www.ctrl.blog/entry/rdfa-link-attributes.html>
  50. -- TODO: maybe use topdown traversal
  51. -- * order of declaring annotations might matter (but should not)
  52. -- * might enable simpler functions and/or faster processing
  53. -- @see <https://pandoc.org/lua-filters.html#topdown-traversal>
  -- Pin the process locale to "C" before any pattern matching takes place,
  -- so that Lua character classes behave the same on every system.
  -- @see <https://pandoc.org/lua-filters.html#common-pitfalls>
  os.setlocale("C")
  --- Enclosure - pseudo-enum of parser enclosure states
  ---
  --- Values are opaque tokens: compare them only against each other.
  --- @see <https://stackoverflow.com/a/70529481/18619283>
  local Enclosure = {
    NONE = "0",           -- outside any enclosure
    BRACKETED = "1",      -- inside a bracketed enclosure
    BRACKETED_DONE = "2", -- bracketed enclosure has just been exited
    BRACED = "3",         -- inside a braced enclosure
  }
  --- ElementTypeIsBracketing - set of inline element tags
  --- that represent a bracket enclosure in Markdown
  ---
  --- Look up by element tag; tags not listed yield nil (falsy).
  local ElementTypeIsBracketing = {
    Link = true,
    Image = true,
  }
  -- TODO: cover non-ASCII Unicode characters
  -- @see <https://www.lua.org/manual/5.4/manual.html#6.5>

  -- Building blocks for the CURIE patterns below.
  -- `_name_start_char` and `_name_char` are the *contents* of a Lua
  -- character set (no surrounding brackets), so they can be combined.
  -- The hyphen is kept as the last item of the set,
  -- where it can never be read as a range operator.
  local _name_start_char = "A-Z_a-z"
  local _name_char = _name_start_char.."0-9-"
  local _ref = "[".._name_start_char.."][".._name_char.."]*"

  --- curie_prefix - CURIE prefix component as set of chars
  ---
  --- A prefix is an NCName: it starts with a letter or underscore
  --- (never with a hyphen), followed by letters, digits, "_" or "-".
  --- @see <https://www.w3.org/TR/2010/NOTE-curie-20101216/>
  local curie_prefix = "[".._name_start_char.."][".._name_char.."]*"

  --- curie_long - CURIE with prefix and reference as set of chars
  local curie_long = curie_prefix..":".._ref

  --- curie_no_ref - CURIE with only prefix as set of chars
  local curie_no_ref = curie_prefix..":"

  --- curie_local - CURIE with only name as set of chars
  local curie_local = ":".._ref

  --- curie_default - CURIE without prefix or name as char
  local curie_default = ":"

  -- TODO: curie_re - CURIE as `LPeg.re` regex object
  -- TODO: test and replace above curie* patterns
  -- @see <https://pandoc.org/lua-filters.html#global-variables>
  --local curie_re = re.compile("("..curie_prefix..")?:(".._ref..")?")
  90. -- FIXME: define RDF context same as RDFa
  91. -- TODO: maybe support overriding context with a JSON-LD URI
  92. -- @see <https://www.w3.org/2011/rdfa-context/rdfa-1.1>
  --- Namespaces - process RDF namespace IRI declarations
  ---
  --- Meant to be used as a `Para` filter function,
  --- i.e. it receives one paragraph at a time.
  --- A paragraph counts as a namespace declaration
  --- when it consists of exactly three inlines:
  ---
  --- 1. a string holding a braced CURIE prefix and a colon, e.g. `{ov}:`
  --- 2. a space
  --- 3. either the keyword `@default` or a string starting with
  ---    `http:` or `https:` (matched case-insensitively)
  ---
  --- A declaration paragraph is removed from the document.
  --- Any other paragraph is left as it is.
  ---
  --- Example:
  ---
  --- ```Markdown
  --- # Annotated paragraph using a custom namespace
  ---
  --- My favorite animal is the [Liger]{ov:preferredAnimal}.
  ---
  --- {ov}: http://open.vocab.org/terms/
  --- ```
  ---
  --- @param blocks a single Para element (despite the plural name)
  --- @returns an empty list to delete a declaration paragraph,
  ---   or nothing to signal that the paragraph is kept unchanged
  --- @see <https://pandoc.org/lua-filters.html#type-para>
  --- @see <https://www.w3.org/TR/rdf12-concepts/#vocabularies>
  local function Namespaces(blocks)
    local inlines = blocks.content

    -- a declaration is exactly: prefix, space, target
    if #inlines ~= 3 then
      return
    end
    local head, gap, target = inlines[1], inlines[2], inlines[3]
    if head.t ~= "Str" or gap.t ~= "Space" then
      return
    end

    -- first inline must be only a braced prefix-only CURIE and a colon
    if not head.text:match("^{"..curie_prefix.."}:$") then
      return
    end

    if target.t == "Str" then
      -- default namespace, parsed as commonmark
      if target.text == "@default" then
        -- FIXME: add CURIE to metadata
        return {}
      end

      -- namespace
      -- TODO: relax to match URI syntax without hardcoded protocols
      if target.text:match("^[Hh][Tt][Tt][Pp][Ss]?:") then
        -- FIXME: add CURIE and URI to metadata
        return {}
      end
    elseif target.t == "Cite" then
      -- default namespace, parsed as markdown
      -- (where "@default" is read as a citation)
      if #target.content == 1
        and target.content[1].text == "@default"
      then
        -- FIXME: add CURIE to metadata
        return {}
      end
    end
  end
  --- Statements - process inline RDF statements
  ---
  --- Locate and strip ontological annotations
  --- among the direct inline children of a [Block] element
  --- of a Pandoc Abstract Syntax Tree (AST).
  ---
  --- Markup for ontological annotations is an extension to Markdown
  --- using similar syntax as hypermedia annotations,
  --- but listing RDFa [CURIEs] in a braced enclosure.
  ---
  --- ```ASCII-art
  --- Simple ontological annotation:
  --- "A [map]{foaf:depiction} is not the territory"
  ---    |   ||\~~~~~~~~~~~~/|
  ---    a   bc    CURIEa    d
  ---
  --- Nested and mixed-use annotations:
  --- ["[Ceci]{foaf:depiction} n'est pas une pipe"]{lang=fr dc:Text}
  --- | |    ||\~~~~~~~~~~~~/|                    ||        \~~~~~/|
  --- a a1   |c1   CURIEa    d1                   bc        CURIEb d
  ---        b1
  ---
  --- Chained hypermedia and ontological annotations:
  --- "A [map](https://osm.org/){foaf:depiction} is not the territory"
  ---    |   ||                ||\~~~~~~~~~~~~/|
  ---    a   be                fc    CURIEa    d
  ---
  --- Legend:
  --- a-b: bracketed enclosure around content
  --- c-d: braced enclosure around ontological or other annotation
  --- e-f: parenthesized enclosure around hypermedia annotation
  --- ```
  ---
  --- The intended algorithm is:
  ---
  --- 1. locate pairs of bracketed text and braced text
  ---    either adjacent or separated by parenthesized text,
  ---    where braced text contains one or more [CURIEs]
  --- 2. for each pair,
  ---    1. add CURIEs in braced text to metadata
  ---    2. add positions of brackets to metadata
  ---    3. delete CURIEs
  ---    4. delete braced enclosure if now structurally empty
  ---    5. delete brackets if now unannotated
  ---
  --- What is implemented so far:
  ---
  --- * `[text]{CURIE}` becomes `text`;
  ---   the bracketed text may span several inlines,
  ---   the braced part must be a single CURIE within one string
  --- * a Link or Image directly followed by `{CURIE}`
  ---   keeps the element and loses the braced part
  --- * at most one annotation is recognised per string;
  ---   text following the closing brace is kept as-is
  --- * anything that turns out not to be such an annotation
  ---   (nested brackets, mixed-use braces like `{lang=fr dc:Text}`,
  ---   brackets without a brace, unterminated enclosures)
  ---   is restored exactly as it was found, never dropped
  --- * nothing is written to metadata yet
  ---
  --- The implementation is inspired by Pandoc [issue#6038].
  ---
  --- @param block element whose `content` holds the inlines to scan
  --- @returns Blocks holding one Para with annotations stripped
  ---   if at least one statement was found, otherwise nothing
  ---   (to signal preservation of original content)
  --- @see [Block]: <https://pandoc.org/lua-filters.html#type-block>
  --- @see [CURIEs]: <https://www.w3.org/TR/2010/NOTE-curie-20101216/>
  --- @see [issue#6038]: <https://github.com/jgm/pandoc/issues/6038>
  -- TODO: maybe instead as step #5 add/reuse hypermedia anchor
  local function Statements (block)
    -- parser state, carried over from one inline to the next
    -- TODO: support nested bracket enclosure
    local enclosure = Enclosure.NONE
    -- amount of detected statements in this block
    local statement_count = 0
    -- rewritten inlines, in document order
    local stack = {}
    -- rollback data for the enclosure currently being parsed:
    -- size of `stack` when the enclosure was opened,
    -- and the original inlines consumed since then
    local mark = 0
    local pending = {}

    -- accepted forms of a CURIE, most specific first
    local curie_forms = {curie_long, curie_no_ref, curie_local, curie_default}

    -- Return the index of the "}" that closes a lone CURIE
    -- starting at `init` in `text`, or nil if there is none.
    local function curie_end (text, init)
      for _, form in ipairs(curie_forms) do
        local _, stop = text:find("^"..form.."}", init)
        if stop then
          return stop
        end
      end
      return nil
    end

    -- Give up on the current enclosure:
    -- replace whatever was collected since it was opened
    -- with the untouched original inlines, and return to unenclosed.
    local function rollback ()
      for i = #stack, mark + 1, -1 do
        stack[i] = nil
      end
      for _, original in ipairs(pending) do
        stack[#stack + 1] = original
      end
      pending = {}
      enclosure = Enclosure.NONE
    end

    for _, el in ipairs(block.content) do
      if el.t ~= "Str" then
        -- non-string element
        if enclosure == Enclosure.BRACKETED then
          -- part of the bracketed content: keep, but remember original
          pending[#pending + 1] = el
        else
          -- a preceding link or image was not directly followed by brace
          enclosure = Enclosure.NONE
          pending = {}
          -- specific elements represent a complete bracketing
          if ElementTypeIsBracketing[el.t] then
            mark = #stack
            pending = {el}
            enclosure = Enclosure.BRACKETED_DONE
          end
        end
        stack[#stack + 1] = el
      else
        local text = el.text
        local pos = 1
        local stack_next = ""
        -- false once rollback() has already re-added this element
        local emit = true

        -- directly after a link or image:
        -- only a leading brace can continue the annotation,
        -- otherwise parse this string as ordinary unenclosed text
        if enclosure == Enclosure.BRACKETED_DONE then
          if text:sub(1, 1) == "{" then
            pending[#pending + 1] = el
          else
            pending = {}
            enclosure = Enclosure.NONE
          end
        end

        -- unenclosed
        -- TODO: accept backslash except immediately before bracket
        if enclosure == Enclosure.NONE then
          local _, x, s = text:find("^([^%[\\]*)")
          if text:sub(x + 1, x + 1) == "[" then
            -- entering bracketed enclosure
            mark = #stack
            pending = {}
            pos = x + 2
            stack_next = s
            enclosure = Enclosure.BRACKETED
          else
            -- staying unenclosed
            stack_next = text
          end
        end

        -- in bracketed enclosure
        -- TODO: accept backslash except immediately before bracket/brace
        -- TODO: support nested bracket enclosure
        if enclosure == Enclosure.BRACKETED then
          pending[#pending + 1] = el
          local _, x, s = text:find("^([^%[%]}\\]*)", pos)
          pos = x + 1
          stack_next = stack_next..s
          if text:sub(pos, pos) == "]" then
            -- exiting bracketed enclosure
            pos = pos + 1
            enclosure = Enclosure.BRACKETED_DONE
          elseif pos <= #text then
            -- stopped at nested bracket, stray brace or backslash:
            -- unsupported, so keep the original text
            rollback()
            emit = false
          end
          -- otherwise the enclosure continues in the next inline
        end

        -- exited bracketed enclosure: a braced lone CURIE must follow
        -- TODO: support mixed-use braced enclosure
        if enclosure == Enclosure.BRACKETED_DONE then
          local close = text:sub(pos, pos) == "{"
            and curie_end(text, pos + 1)
          if close then
            statement_count = statement_count + 1
            -- TODO: instead recursively call Statements() on remains of Str
            stack_next = stack_next..text:sub(close + 1)
            pending = {}
            enclosure = Enclosure.NONE
          else
            -- not an annotation after all: keep the original text
            rollback()
            emit = false
          end
        end

        -- push any string collected from above parsing to stack
        if emit and stack_next ~= "" then
          stack[#stack + 1] = pandoc.Str(stack_next)
        end
      end
    end

    -- enclosure still open at end of block: keep the original text
    if enclosure ~= Enclosure.NONE then
      rollback()
    end

    if statement_count > 0 then
      return pandoc.Blocks {pandoc.Para(stack)}
    end
  end
  -- Filter list: namespace declarations are resolved before statements.
  --
  -- Although this filter is *not* a full RDF parser,
  -- order matters for the parts we do handle --
  -- e.g. namespace resolving is similar to other RDF formats
  -- with detailed documented process ordering.
  --
  -- @see <https://www.w3.org/TR/turtle/#sec-parsing>

  -- document metadata, parked here while the content is processed
  local meta = {}

  -- Stash the document metadata and hand back an empty replacement.
  --
  -- move aside MetaBlocks to speed up processing content
  --
  -- @see <https://stackoverflow.com/a/47356252/18619283>
  local function stash_meta (m)
    meta = m
    return {}
  end

  -- Put the stashed metadata back in place.
  --
  -- FIXME: add custom declared namespaces in Meta
  -- TODO: maybe add only actively used namespaces
  -- (do same as for unused link definitions)
  local function restore_meta (_)
    return meta
  end

  return {
    { Meta = stash_meta },
    { Para = Namespaces },
    { Para = Statements },
    { Meta = restore_meta },
    --{ Meta = function(_) return NamespacesToMeta(meta); end },
  }