aboutsummaryrefslogtreecommitdiff
path: root/_extensions/ruc-play/semantic-markdown/semantic-markdown.lua
blob: 59195c2761c82dd46df9a1b1e285ad7d4f5b5c36 (plain)
  1. --- semantic-markdown - Pandoc plugin to process semantic hints
  2. ---
  3. --- SPDX-FileCopyrightText: 2025 Jonas Smedegaard <dr@jones.dk>
  4. --- SPDX-License-Identifier: GPL-3.0-or-later
  5. ---
  6. --- ## Examples
  7. ---
  8. --- The following Markdown text includes semantic annotations
  9. --- within braced enclosures:
  10. ---
  11. --- ```markdown
  12. --- # {=<#artwork> .:Image} Semantics
  13. ---
  14. --- Simple ontological annotation:
  15. --- [This][painting] is not a [pipe].
  16. ---
  17. --- Nested, mixed-use and custom-namespaced annotations:
  18. --- [[Ceci][painting] n'est pas une [pipe].]{lang=fr bibo:shortDescription}
  19. ---
  20. --- [painting]: {wd:Q1061035}
  21. --- "A painting of a smoking pipe {:depiction}"
  22. ---
  23. --- [pipe]: {wd:Q104526}
  24. --- "A smoking pipe {:depicts}"
  25. ---
  26. --- {@default}: foaf
  27. ---
  28. --- {bibo}: http://purl.org/ontology/bibo/
  29. ---
  30. --- {wd}: http://www.wikidata.org/entity/
  31. --- ```
  32. ---
  33. --- This filter should transform the above text, with the command
  34. --- `pandoc -L semantic-markdown.lua -t commonmark --wrap=preserve`,
  35. --- into the below markdown text with semantic annotations as metadata:
  36. ---
  37. --- ```markdown
  38. --- ---
  39. --- turtle: |
  40. --- @prefix bibo: http://purl.org/ontology/bibo/
  41. --- @prefix foaf: http://xmlns.com/foaf/0.1/
  42. --- @prefix wd: https://www.wikidata.org/entity/
  43. ---
  44. --- <#artwork> a foaf:Image ;
  45. --- foaf:depiction <https://www.wikidata.org/entity/Q1061035> ;
  46. --- foaf:depicts <https://www.wikidata.org/entity/Q104526> ;
  47. --- bibo:shortDescription "Ceci n'est pas une pipe."@fr .
  48. --- ---
  49. --- # Semantics
  50. ---
  51. --- Simple ontological annotation:
  52. --- [This][painting] is not a [pipe].
  53. ---
  54. --- Nested, mixed-use and custom-namespaced annotations:
  55. --- [[Ceci][painting] n'est pas une [pipe].]{lang=fr}
  56. ---
  57. --- [painting]: https://www.wikidata.org/entity/Q1061035
  58. --- "A painting of a smoking pipe"
  59. ---
  60. --- [pipe]: https://www.wikidata.org/entity/Q104526
  61. --- "A smoking pipe"
  62. --- ```
  63. ---
  64. --- This filter should also transform the above text, with the command
  65. --- `pandoc -L semantic-markdown.lua -t html --wrap=preserve`,
  66. --- into the below HTML text with embedded RDFa Lite 1.1 annotations,
  67. --- modulo wrapping of long lines:
  68. ---
  69. --- ```html
  70. --- <div vocab="http://xmlns.com/foaf/0.1/"
  71. --- prefix="bibo: http://purl.org/ontology/bibo/"
  72. --- resource="#artwork" typeof="Image">
  73. --- <h1>Semantics</h1>
  74. --- <p>Simple ontological annotation:
  75. --- <a property="depiction"
  76. --- href="https://www.wikidata.org/entity/Q1061035"
  77. --- title="A painting of a smoking pipe">This</a>
  78. --- is not
  79. --- a <a property="depicts"
  80. --- href="https://www.wikidata.org/entity/Q104526"
  81. --- title="A smoking pipe">pipe</a>.</p>
  82. ---
  83. --- <p>Nested, mixed-use and custom-namespaced annotations:
  84. --- <span lang="fr" property="bibo:shortDescription">
  85. --- <a property="depiction"
  86. --- href="https://www.wikidata.org/entity/Q1061035"
  87. --- title="A painting of a smoking pipe">Ceci</a>
  88. --- n'est pas
  89. --- une <a property="depicts"
  90. --- href="https://www.wikidata.org/entity/Q104526"
  91. --- title="A smoking pipe">pipe</a>.
  92. --- </span></p>
  93. --- </div>
  94. --- ```
  95. ---
  96. --- * v0.0.1
  97. --- * initial release
  98. ---
  99. --- @version 0.0.1
  100. --- @see <https://source.jones.dk/semantic-markdown/about/>
  101. --- @see <https://moodle.ruc.dk/course/view.php?id=23505>
  102. --- @see <https://www.w3.org/TR/rdfa-primer/#using-rdfa>
  103. --- @see <https://www.ctrl.blog/entry/rdfa-link-attributes.html>
  104. -- TODO: maybe use topdown traversal
  105. -- * order of declaring annotations might matter (but should not)
  106. -- * might enable simpler functions and/or faster processing
  107. -- @see <https://pandoc.org/lua-filters.html#topdown-traversal>
  108. -- ensure stable character classes independent of system locale
  109. -- @see <https://pandoc.org/lua-filters.html#common-pitfalls>
  110. os.setlocale 'C'
  111. -- flag running with older release of Pandoc
  112. --
  113. -- Some Pandoc features,
  114. -- notably pandoc.List:at() introduced wit Pandoc 3.5,
  115. -- are unavailable in older Pandoc releases still in widespread use
  116. -- due to complexities of keeping Haskell dependencies in sync.
  117. -- @see <https://bugs.debian.org/1098377>
  118. local PANDOC_IS_OLD <const> = PANDOC_VERSION[1] < 3
  119. or (PANDOC_VERSION[1] == 3 and PANDOC_VERSION[2] < 5)
  120. --- pseudo-enum table to track parser enclosure state
  121. --- @see <https://stackoverflow.com/a/70529481/18619283>
  122. local Enclosure = {
  123. NONE = "0",
  124. BRACKETED = "1",
  125. BRACKETED_DONE = "2",
  126. BRACED = "3",
  127. }
  128. -- element types representing content enclosure in Markdown
  129. local ElementTypeIsEnclosure = {
  130. Emph = true,
  131. Image = true,
  132. Link = true,
  133. Strong = true,
  134. }
  135. --- CURIE_TYPE_PREFIX - `typeof` or `resource` attribute CURIE hint
  136. local CURIE_TYPE_PREFIX <const> = "[.=]?"
  137. -- TODO: cover non-ASCII Unicode characters
  138. -- @see <https://www.lua.org/manual/5.4/manual.html#6.5>
  139. --- CURIE_PREFIX - CURIE prefix component as set of chars
  140. --- @see <https://www.w3.org/TR/2010/NOTE-curie-20101216/>
  141. local _NAME_START_CHAR <const> = "A-Z_a-z"
  142. local _NAME_CHAR <const> = _NAME_START_CHAR.."-0-9"
  143. local _REF <const> = "[".._NAME_START_CHAR.."][".._NAME_CHAR.."]*"
  144. local CURIE_PREFIX <const> = "[".._NAME_START_CHAR.."_-][".._NAME_CHAR.."]*"
  145. --- CURIE_LONG - CURIE with prefix and reference as set of chars
  146. local CURIE_LONG <const> = CURIE_PREFIX..":".._REF
  147. --- CURIE_NO_REF - CURIE with only prefix as set of chars
  148. local CURIE_NO_REF <const> = CURIE_PREFIX..":"
  149. --- CURIE_LOCAL - CURIE with only name as set of chars
  150. local CURIE_LOCAL <const> = ":".._REF
  151. --- CURIE_DEFAULT - CURIE without prefix or name as char
  152. local CURIE_DEFAULT <const> = ":"
  153. -- TODO: CURIE_re - CURIE as `LPeg.re` regex object
  154. -- TODO: test and replace above curie* patterns
  155. -- @see <https://pandoc.org/lua-filters.html#global-variables>
  156. --local CURIE_re <const> = re.compile("("..CURIE_PREFIX..")?:(".._REF..")?")
  157. -- FIXME: define RDF context same as RDFa
  158. -- TODO: maybe support overriding context with a JSON-LD URI
  159. -- @see <https://www.w3.org/2011/rdfa-context/rdfa-1.1>
  160. --- TableEmpty - check if Pandoc List contains any elements
  161. ---
  162. --- Use a workaround for Pandoc releases older than 3.5
  163. --- where ergonomic function pandoc.List:at() is missing.
  164. ---
  165. --- @param list Pandoc List to inspect
  166. --- @return result of inspection as boolean
  167. local function TableEmpty(list)
  168. if PANDOC_IS_OLD then
  169. local list_clone = list:clone()
  170. return not list_clone:remove()
  171. else
  172. return list:at(1) == nil
  173. end
  174. end
  175. --- Namespaces - process RDF namespace IRI declarations
  176. ---
  177. --- Takes as input a list of Para block elements.
  178. --- For each block matching the pattern for a namespace IRI definition,
  179. --- the declared namespace is extracted.
  180. --- Returns an empty paragraph in case of a match,
  181. --- or nothing (to signal preservation of original content).
  182. ---
  183. --- Example:
  184. ---
  185. --- ```Markdown
  186. --- # Annotated paragraph using a custom namespace
  187. ---
  188. --- My favorite animal is the [Liger]{ov:preferredAnimal}.
  189. --- {=<#me> .:Person}
  190. ---
  191. --- {ov}: http://open.vocab.org/terms/
  192. --- ```
  193. ---
  194. --- @param blocks Markdown with ontological annotations as Blocks
  195. --- @returns Markdown without ontological annotations as Blocks
  196. --- @see <https://pandoc.org/lua-filters.html#type-blocks>
  197. --- @see <https://www.w3.org/TR/rdf12-concepts/#vocabularies>
  198. local function Namespaces(blocks)
  199. -- paragraph with only a braced prefix-only CURIE, colon and one word
  200. local curie_pattern = "^{"..CURIE_TYPE_PREFIX..CURIE_PREFIX.."}:$"
  201. if #blocks.content == 3
  202. and blocks.content[1].t == "Str"
  203. and blocks.content[2].t == "Space"
  204. and blocks.content[1].text:match(curie_pattern)
  205. then
  206. local el = blocks.content[3]
  207. -- default namespace, parsed as commonmark
  208. if el.t == "Str"
  209. and el.text == "@default"
  210. then
  211. -- FIXME: add CURIE to metadata
  212. return {}
  213. end
  214. -- default namespace, parsed as markdown
  215. if el.t == "Cite"
  216. and #el.content == 1
  217. and el.content[1].text == "@default"
  218. then
  219. -- FIXME: add CURIE to metadata
  220. return {}
  221. end
  222. -- namespace
  223. -- TODO: relax to match URI syntax without hardcoded protocols
  224. local proto_pattern = "^[Hh][Tt][Tt][Pp][Ss]?:"
  225. if el.t == "Str"
  226. and el.text:match(proto_pattern)
  227. then
  228. -- FIXME: add CURIE and URI to metadata
  229. return {}
  230. end
  231. end
  232. end
  233. --- Statements - process inline RDF statements
  234. ---
  235. --- Locate and extract ontological annotations
  236. --- within a [Block] element of a Pandoc Abstract Syntax Tree (AST).
  237. ---
  238. --- Markup for ontological annotations is an extension to Markdown
  239. --- using similar syntax as hypermedia annotations,
  240. --- but listing RDFa [CURIEs] in a braced enclosure.
  241. ---
  242. --- ```ASCII-art
  243. --- Simple ontological annotation:
  244. --- "A [map]{foaf:depiction} is not the territory"
  245. ---    |   ||\~~~~~~~~~~~~/|
  246. ---    a   bc    CURIEa    d
  247. ---
  248. --- Nested and mixed-use annotations:
  249. --- ["Ceci n'est pas une [pipe]{foaf:depicts lang=fr}"]{dc:description}
  250. --- |                    |    ||\~~~~~~~~~~/        | ||\~~~~~~~~~~~~/|
  251. --- a                    a1   |c1  CURIEa           d1bc    CURIEb    d
  252. ---                           b1
  253. ---
  254. --- Chained hypermedia and ontological annotations:
  255. --- "A [map](https://osm.org/){foaf:depiction} is not the territory"
  256. ---    |   ||                ||\~~~~~~~~~~~~/|
  257. ---    a   be                fc    CURIEa    d
  258. ---
  259. --- Legend:
  260. --- a-b: bracketed enclosure around content
  261. --- c-d: braced enclosure around ontological or other annotation
  262. --- e-f: parenthesized enclosure around hypermedia annotation
  263. --- ```
  264. ---
  265. --- Ontological annotations are parsed and reorganised
  266. --- using the following algorithm:
  267. ---
  268. --- 1. locate pairs of bracketed text and braced text
  269. --- either adjacent or separated by parenthesized text,
  270. --- where braced text contains one or more [CURIEs]
  271. --- 2. for each pair,
  272. --- 1. add CURIEs in braced text to metadata
  273. --- 2. add positions of brackets to metadata
  274. --- 3. delete CURIEs
  275. --- 4. delete braced enclosure if now structurally empty
  276. --- 5. delete brackets if now unannotated
  277. ---
  278. --- The implementation is inspired by Pandoc [issue#6038].
  279. ---
  280. --- @param inlines Markdown with semantic annotations as Inlines
  281. --- @returns Markdown stripped of semantic annotations as Inlines
  282. --- @see [Block]: <https://pandoc.org/lua-filters.html#type-block>
  283. --- @see [CURIEs]: <https://www.w3.org/TR/2010/NOTE-curie-20101216/>
  284. --- @see [issue#6038]: <https://github.com/jgm/pandoc/issues/6038>
  285. -- TODO: maybe instead as step #5 add/reuse hypermedia anchor
  286. local function Statements (block)
  287. -- flags for enclosing stages
  288. -- TODO: support nested bracket enclosure
  289. local encl = Enclosure.NONE
  290. -- amount of detected statements in this block
  291. local statement_count = 0
  292. -- stacks of qualified and pending unenclosed/enclosed elements
  293. local elems = pandoc.List()
  294. local elems_unenclosed = pandoc.List()
  295. local elems_enclosed = pandoc.List()
  296. -- strings of pending unenclosed/enclosed chars
  297. local chars_unenclosed = ""
  298. local chars_enclosed = ""
  299. for _, el in ipairs(block.content) do
  300. local pos = 1
  301. -- non-string element, highest state first to support fall-through
  302. if el.t ~= 'Str' then
  303. elems_unenclosed:insert(el)
  304. if encl == Enclosure.BRACED then
  305. elems_enclosed:insert(el)
  306. goto continue
  307. end
  308. if encl == Enclosure.BRACKETED_DONE then
  309. -- disqualify bracketing not directly followed by brace
  310. elems:extend(elems_unenclosed)
  311. elems_unenclosed = pandoc.List()
  312. elems_enclosed = pandoc.List()
  313. encl = Enclosure.NONE
  314. -- fall through to parse element as unenclosed
  315. end
  316. if encl == Enclosure.BRACKETED then
  317. elems_enclosed:insert(el)
  318. goto continue
  319. end
  320. if encl == Enclosure.NONE then
  321. -- specific elements represent content enclosure
  322. if ElementTypeIsEnclosure[el.t] then
  323. encl = Enclosure.BRACKETED_DONE
  324. end
  325. end
  326. goto continue
  327. end
  328. -- unenclosed
  329. -- TODO: accept backslash except immediately before bracket
  330. if encl == Enclosure.NONE then
  331. local _, nextpos, s = el.text:find("^([^%[\\]*)")
  332. pos = nextpos and nextpos + 1 or pos + 1
  333. chars_unenclosed = chars_unenclosed..s
  334. -- entering bracketed enclosure
  335. if el.text:sub(pos, pos) == "[" then
  336. -- qualify unenclosed elements
  337. elems:extend(elems_unenclosed)
  338. elems_unenclosed = pandoc.List()
  339. elems_enclosed = pandoc.List()
  340. if chars_unenclosed:len() > 0 then
  341. elems:insert(pandoc.Str(chars_unenclosed))
  342. end
  343. pos = pos + 1
  344. chars_unenclosed = chars_unenclosed.."["
  345. chars_enclosed = ""
  346. encl = Enclosure.BRACKETED
  347. end
  348. end
  349. -- in bracketed enclosure
  350. -- TODO: accept backslash except immediately before bracket/brace
  351. -- TODO: support nested bracket enclosure
  352. if encl == Enclosure.BRACKETED then
  353. local _, nextpos, s = el.text:find("^([^%[%]}\\]*)", pos)
  354. pos = nextpos and nextpos + 1 or pos + 1
  355. chars_unenclosed = chars_unenclosed..s
  356. chars_enclosed = chars_enclosed..s
  357. -- exiting bracketed enclosure
  358. if el.text:sub(pos, pos) == "]" then
  359. pos = pos + 1
  360. chars_unenclosed = chars_unenclosed.."]"
  361. encl = Enclosure.BRACKETED_DONE
  362. end
  363. end
  364. -- exited bracketed enclosure
  365. if encl == Enclosure.BRACKETED_DONE then
  366. -- entering braced enclosure
  367. if el.text:sub(pos, pos) == "{" then
  368. pos = pos + 1
  369. chars_unenclosed = chars_unenclosed.."{"
  370. encl = Enclosure.BRACED
  371. -- leaving non-annotation enclosure
  372. else
  373. -- disqualify bracketing not directly followed by brace
  374. elems:extend(elems_unenclosed)
  375. elems_unenclosed = pandoc.List()
  376. elems_enclosed = pandoc.List()
  377. if chars_unenclosed:len() > 0 then
  378. elems:insert(pandoc.Str(chars_unenclosed))
  379. chars_unenclosed = ""
  380. end
  381. chars_enclosed = ""
  382. encl = Enclosure.NONE
  383. end
  384. end
  385. -- in braced enclosure, leaving it
  386. -- TODO: support mixed-use enclosure
  387. if encl == Enclosure.BRACED then
  388. local curie_pattern1 = "^"..CURIE_TYPE_PREFIX..CURIE_LONG.."}"
  389. local curie_pattern2 = "^"..CURIE_TYPE_PREFIX..CURIE_NO_REF.."}"
  390. local curie_pattern3 = "^"..CURIE_TYPE_PREFIX..CURIE_LOCAL.."}"
  391. local curie_pattern4 = "^"..CURIE_TYPE_PREFIX..CURIE_DEFAULT.."}"
  392. local _, nextpos1 = el.text:find(curie_pattern1, pos)
  393. local _, nextpos2 = el.text:find(curie_pattern2, pos)
  394. local _, nextpos3 = el.text:find(curie_pattern3, pos)
  395. local _, nextpos4 = el.text:find(curie_pattern4, pos)
  396. local nextpos = nextpos1 or nextpos2 or nextpos3 or nextpos4
  397. if nextpos then
  398. if chars_enclosed:len() > 0 then
  399. elems_enclosed:insert(pandoc.Str(chars_enclosed))
  400. end
  401. -- qualify completed bracketed enclosure
  402. if not TableEmpty(elems_enclosed) then
  403. elems:extend(elems_enclosed)
  404. end
  405. elems_enclosed = pandoc.List()
  406. elems_unenclosed = pandoc.List()
  407. chars_enclosed = ""
  408. chars_unenclosed = ""
  409. encl = Enclosure.NONE
  410. statement_count = statement_count + 1
  411. pos = nextpos + 1
  412. -- TODO: instead recursively parse remains of Str
  413. chars_unenclosed = chars_unenclosed..el.text:sub(pos)
  414. end
  415. end
  416. -- push strings to stacks
  417. if chars_enclosed:len() > 0 then
  418. elems_enclosed:insert(pandoc.Str(chars_enclosed))
  419. end
  420. if chars_unenclosed:len() > 0 then
  421. elems_unenclosed:insert(pandoc.Str(chars_unenclosed))
  422. end
  423. chars_unenclosed = ""
  424. chars_enclosed = ""
  425. -- done parsing current Inline element
  426. ::continue::
  427. end
  428. -- return altered stack if it contains complete enclosures
  429. if statement_count > 0 then
  430. -- disqualify incomplete enclosure
  431. if encl ~= Enclosure.NONE then
  432. elems:extend(elems_unenclosed)
  433. end
  434. block.content = elems
  435. return block
  436. end
  437. end
  438. -- First resolve namespace declarations, then statements.
  439. --
  440. -- Although this filter is *not* a full RDF parser,
  441. -- order matters for the parts we do handle --
  442. -- e.g. namespace resolving is similar to other RDF formats
  443. -- with detailed documented process ordering.
  444. --
  445. -- @see <https://www.w3.org/TR/turtle/#sec-parsing>
  446. local meta = {}
  447. return {
  448. -- move aside MetaBlocks to speed up processing content
  449. --
  450. -- @see <https://stackoverflow.com/a/47356252/18619283>
  451. { Meta = function(m) meta = m; return {} end },
  452. {Para = Namespaces},
  453. {Block = Statements},
  454. -- FIXME: add custom declared namespaces in Meta
  455. -- TODO: maybe add only actively used namespaces
  456. -- (do same as for unused link definitions)
  457. { Meta = function(_) return meta; end },
  458. --{ Meta = function(_) return NamespacesToMeta(meta); end },
  459. }