aboutsummaryrefslogtreecommitdiff
path: root/_extensions/ruc-play/semantic-markdown/semantic-markdown.lua
blob: 92d9fa7bcd9866187503cad7c3faa2af2e1f31cf (plain)
  1. --- semantic-markdown - Pandoc plugin to process semantic hints
  2. ---
  3. --- SPDX-FileCopyrightText: 2025 Jonas Smedegaard <dr@jones.dk>
  4. --- SPDX-License-Identifier: GPL-3.0-or-later
  5. ---
  6. --- ## Examples
  7. ---
  8. --- Ideally, this text:
  9. ---
  10. --- ```Markdown+RDF
  11. --- Simple ontological annotation:
  12. --- [This]{foaf:depiction} is not a pipe.
  13. ---
  14. --- Nested, mixed-use and custom-namespaced annotations:
  15. --- [[Ceci]{foaf:depiction} n'est pas une pipe.]{lang=fr bibo:Quote}
  16. ---
  17. --- {bibo}: http://purl.org/ontology/bibo/
  18. --- ```
  19. ---
  20. --- ...should with this filter be transformed to this text:
  21. ---
  22. --- ```Markdown
  23. --- ---
  24. --- turtle: |
  25. --- @prefix bibo: http://purl.org/ontology/bibo/
  26. ---
  27. --- _:001 a foaf:depiction .
  28. --- _:002 a foaf:depiction .
  29. --- _:003 a bibo:Quote .
  30. --- ---
  31. --- Simple ontological annotation:
  32. --- This is not a pipe.
  33. ---
  34. --- Nested, mixed-use and custom-namespaced annotations:
  35. --- [Ceci n'est pas une pipe.]{lang=fr}
  36. --- ```
  37. ---
  38. --- When target document format is html,
  39. --- this filter should ideally produce RDFa 1.1 Lite or Core data.
  40. --- (Lite is *not* a subset of Core as it deviates slightly).
  41. ---
  42. --- * v0.0.1
  43. --- * initial release
  44. ---
  45. --- @version 0.0.1
  46. --- @see <https://source.jones.dk/semantic-markdown/about/>
  47. --- @see <https://moodle.ruc.dk/course/view.php?id=23505>
  48. --- @see <https://www.w3.org/TR/rdfa-primer/#using-rdfa>
  49. --- @see <https://www.ctrl.blog/entry/rdfa-link-attributes.html>
  50. -- TODO: maybe use topdown traversal
  51. -- * order of declaring annotations might matter (but should not)
  52. -- * might enable simpler functions and/or faster processing
  53. -- @see <https://pandoc.org/lua-filters.html#topdown-traversal>
  -- Pin the "C" locale so that character classes in patterns
  -- stay stable independent of the system locale.
  -- @see <https://pandoc.org/lua-filters.html#common-pitfalls>
  os.setlocale("C")

  -- TODO: cover non-ASCII Unicode characters
  -- @see <https://www.lua.org/manual/5.4/manual.html#6.5>

  -- Building blocks: the two name strings are *contents* of a character
  -- set, the remaining ones are complete Lua pattern fragments.
  local _name_start_char = "A-Z_a-z"
  local _name_char = _name_start_char .. "-0-9"
  local _reference = ("[%s][%s]*"):format(_name_start_char, _name_char)
  local _prefix = ("[%s_-][%s]*"):format(_name_start_char, _name_char)

  --- curie_long - CURIE with prefix and reference as set of chars
  --- @see <https://www.w3.org/TR/rdfa-core/#h-s_curies>
  local curie_long = table.concat { _prefix, ":", _reference }

  --- curie_no_ref - CURIE with only prefix as set of chars
  local curie_no_ref = table.concat { _prefix, ":" }

  --- curie_local - CURIE with only name as set of chars
  local curie_local = table.concat { ":", _reference }

  --- curie_default - CURIE without prefix or name as char
  local curie_default = ":"
  72. -- TODO: curie_re - CURIE as `LPeg.re` regex object
  73. -- TODO: test and replace above curie* patterns
  74. -- @see <https://pandoc.org/lua-filters.html#global-variables>
  75. --local curie_re = re.compile("(".._prefix..")?:(".._reference..")?")
  76. -- FIXME: define RDF context same as RDFa
  77. -- TODO: maybe support overriding context with a JSON-LD URI
  78. -- @see <https://www.w3.org/2011/rdfa-context/rdfa-1.1>
  79. --- Namespaces - process RDF namespace IRI declarations
  80. ---
  81. --- Takes as input a single Para block element.
  82. --- If it matches the pattern for a namespace IRI definition,
  83. --- the declared namespace is extracted.
  84. --- Returns an empty list (removing the paragraph) in case of a match,
  85. --- or nothing (to signal preservation of original content).
  86. ---
  87. --- Example:
  88. ---
  89. --- ```Markdown
  90. --- # Annotated paragraph using a custom namespace
  91. ---
  92. --- My favorite animal is the [Liger]{ov:preferredAnimal}.
  93. ---
  94. --- {ov}: http://open.vocab.org/terms/
  95. --- ```
  96. ---
  97. --- @param blocks Para element, possibly a namespace declaration
  98. --- @returns empty list on a match, otherwise nothing
  99. --- @see <https://pandoc.org/lua-filters.html#type-blocks>
  100. --- @see <https://www.w3.org/TR/rdf12-concepts/#vocabularies>
  local function Namespaces(blocks)
    -- Despite the parameter name, the Para filter hands over one paragraph;
    -- its inlines are what gets inspected.
    -- Returns an empty list (dropping the paragraph) when it is a namespace
    -- declaration, and nil (keeping it unchanged) otherwise.
    local inlines = blocks.content

    -- paragraph with only a braced prefix, colon and one word
    if #inlines ~= 3 then
      return nil
    end
    local label, gap, target = inlines[1], inlines[2], inlines[3]
    if label.t ~= "Str" or gap.t ~= "Space" then
      return nil
    end

    -- The pattern must be assembled inside the call parentheses:
    -- `s:match "lit"..x` parses as `(s:match "lit")..x`, which raises
    -- "attempt to concatenate a nil value" for every paragraph that does
    -- not start with a brace, and accepts any text that does.
    -- Accepts the documented form `{ov}:` as well as the CURIE form `{ov:}:`.
    if not label.text:match("^{" .. _prefix .. ":?}:$") then
      return nil
    end

    -- default namespace, misparsed as a citation
    if target.t == "Cite"
      and #target.content == 1
      -- TODO: maybe check case-insensitively
      and target.content[1].text == "@default"
    then
      -- FIXME: add CURIE to metadata
      return {}
    end

    -- namespace
    if target.t == "Str"
      -- TODO: maybe check case-insensitively
      -- TODO: relax to match URI syntax without hardcoded protocols
      and target.text:match("^https?:")
    then
      -- FIXME: add CURIE and URI to metadata
      return {}
    end

    return nil
  end
  128. --- Statements - process inline RDF statements
  129. ---
  130. --- Locate and extract ontological annotations
  131. --- within a [Block] element of a Pandoc Abstract Syntax Tree (AST).
  132. ---
  133. --- Markup for ontological annotations is an extension to Markdown
  134. --- using similar syntax as hypermedia annotations,
  135. --- but listing RDFa [CURIEs] in a braced enclosure.
  136. ---
  137. --- ```ASCII-art
  138. --- Simple ontological annotation:
  139. --- "A [map]{foaf:depiction} is not the territory"
  140. --- | ||\~~~~~~~~~~~~/|
  141. --- a bc CURIEa d
  142. ---
  143. --- Nested and mixed-use annotations:
  144. --- ["[Ceci]{foaf:depiction} n'est pas une pipe"]{lang=fr dc:Text}
  145. --- | | ||\~~~~~~~~~~~~/| || \~~~~~/|
  146. --- a a1 |c1 CURIEa d1 bc CURIEb d
  147. --- b1
  148. ---
  149. --- Chained hypermedia and ontological annotations:
  150. --- "A [map](https://osm.org/){foaf:depiction} is not the territory"
  151. --- | || ||\~~~~~~~~~~~~/|
  152. --- a be fc CURIEa d
  153. ---
  154. --- Legend:
  155. --- a-b: bracketed enclosure around content
  156. --- c-d: braced enclosure around ontological or other annotation
  157. --- e-f: parenthesized enclosure around hypermedia annotation
  158. --- ```
  159. ---
  160. --- Ontological annotations are parsed and reorganised
  161. --- using the following algorithm:
  162. ---
  163. --- 1. locate pairs of bracketed text and braced text
  164. --- either adjacent or separated by parenthesized text,
  165. --- where braced text contains one or more [CURIEs]
  166. --- 2. for each pair,
  167. --- 1. add CURIEs in braced text to metadata
  168. --- 2. add positions of brackets to metadata
  169. --- 3. delete CURIEs
  170. --- 4. delete braced enclosure if now structurally empty
  171. --- 5. delete brackets if now unannotated
  172. ---
  173. --- The implementation is inspired by Pandoc [issue#6038].
  174. ---
  175. --- @param block Block element with semantic annotations
  176. --- @returns Block content stripped of semantic annotations, or nothing if none were found
  177. --- @see [Block]: <https://pandoc.org/lua-filters.html#type-block>
  178. --- @see [CURIEs]: <https://www.w3.org/TR/rdfa-core/#s_curies>
  179. --- @see [issue#6038]: <https://github.com/jgm/pandoc/issues/6038>
  180. -- TODO: maybe instead as step #5 add/reuse hypermedia anchor
  function Statements (block)
    -- Returns the block with annotations stripped from its inlines,
    -- or nil (block left unchanged) when no statement was found.
    --
    -- Known limits, unchanged from earlier revisions:
    -- TODO: support nested bracket enclosure
    -- TODO: support mixed-use braced enclosure
    --       (non-CURIE tokens inside the braces are discarded)
    -- TODO: support backslash except immediately before bracket/brace
    -- TODO: parse remains of Str after a closing brace
    --       (they are kept as plain text, but not scanned again)

    -- This handler is registered for every Block; elements without
    -- a content list (e.g. CodeBlock, HorizontalRule) have nothing to scan.
    local content = block.content
    if not content then
      return nil
    end

    -- flags for enclosing stages
    local bracketed, braced = false, false

    -- amount of detected statements in this block
    local statement_count = 0

    -- inlines to emit in place of the original content
    local stack = {}

    -- where the open enclosure began: index into content, and the size
    -- of the stack at that moment, so the enclosure can be undone
    local open_index, open_depth = 0, 0

    -- accepted CURIE forms, in order of preference
    local forms = { curie_long, curie_no_ref, curie_local, curie_default }

    -- Position of the closing brace if a CURIE followed by "}" starts
    -- exactly at init, otherwise nil.
    local function annotation_end (text, init)
      for _, curie in ipairs(forms) do
        local _, last = text:find("^" .. curie .. "}", init)
        if last then
          return last
        end
      end
      return nil
    end

    -- Undo an enclosure that turned out not to be an annotation:
    -- drop what was emitted for it and put back the original inlines
    -- from its opening bracket up to and including index last.
    local function restore (last)
      for n = #stack, open_depth + 1, -1 do
        stack[n] = nil
      end
      for n = open_index, last do
        stack[#stack + 1] = content[n]
      end
      bracketed, braced = false, false
    end

    for i, el in ipairs(content) do
      if el.t ~= "Str" then
        -- inlines between the braces belong to the annotation, not the text
        if not braced then
          stack[#stack + 1] = el
        end
      else
        local text = el.text
        -- scan position within text
        local pos = 1
        -- text of this element that survives stripping
        local kept = ""
        -- true once this element has been emitted as-is
        local settled = false

        -- unenclosed
        if not (bracketed or braced) then
          local _, stop, lead = text:find("^([^%[\\]*)")
          if text:sub(stop + 1, stop + 1) == "[" then
            -- entering bracketed enclosure
            bracketed = true
            open_index, open_depth = i, #stack
            kept = lead
            pos = stop + 2
          else
            -- staying unenclosed
            stack[#stack + 1] = el
            settled = true
          end
        end

        -- in bracketed enclosure
        if not settled and bracketed and not braced then
          local _, stop, inner = text:find("^([^%[%]}\\]*)", pos)
          local mark = stop + 1
          local found = text:sub(mark, mark)
          if found == "" then
            -- element exhausted, enclosure continues in the next inline
            kept = kept .. inner
          elseif found == "]" and text:sub(mark + 1, mark + 1) == "{" then
            -- entering braced enclosure
            braced = true
            kept = kept .. inner
            pos = mark + 2
          else
            -- leaving non-annotation enclosure (plain "[text]"),
            -- or hitting a construct that is not supported yet:
            -- keep the original inlines instead of discarding them
            restore(i)
            settled = true
          end
        end

        -- in braced enclosure, leaving it
        if not settled and braced then
          local last = annotation_end(text, pos)
          if last then
            statement_count = statement_count + 1
            -- keep whatever follows the brace, e.g. punctuation
            kept = kept .. text:sub(last + 1)
            bracketed, braced = false, false
          elseif text:find("}", pos, true) then
            -- braces closed without ending in a CURIE: not an annotation
            restore(i)
            settled = true
          end
        end

        -- end of element, push collected string to stack
        if not settled and kept ~= "" then
          stack[#stack + 1] = pandoc.Str(kept)
        end
      end
    end

    -- enclosure never closed: put its original inlines back
    if bracketed or braced then
      restore(#content)
    end

    if statement_count > 0 then
      -- reuse the block itself so its type (Para, Header, ...) is preserved
      block.content = stack
      return block
    end
    return nil
  end
  285. -- First resolve namespace declarations, then statements.
  286. --
  287. -- Although this filter is *not* a full RDF parser,
  288. -- order matters for the parts we do handle --
  289. -- e.g. namespace resolving is similar to other RDF formats
  290. -- with detailed documented process ordering.
  291. --
  292. -- @see <https://www.w3.org/TR/turtle/#sec-parsing>
  local meta = {}

  -- Move aside the metadata to speed up processing content:
  -- remember it and hand an empty table to the following filters.
  --
  -- @see <https://stackoverflow.com/a/47356252/18619283>
  local function stash_meta (m)
    meta = m
    return {}
  end

  -- Put the remembered metadata back once the content was processed.
  --
  -- FIXME: add custom declared namespaces in Meta
  -- TODO: maybe add only actively used namespaces
  -- (do same as for unused link definitions)
  local function restore_meta (_)
    return meta
  end

  return {
    { Meta = stash_meta },
    { Para = Namespaces },
    { Block = Statements },
    { Meta = restore_meta },
    --{ Meta = function(_) return NamespacesToMeta(meta); end },
  }