aboutsummaryrefslogtreecommitdiff
path: root/_extensions/ruc-play/semantic-markdown/semantic-markdown.lua
blob: f076950f4a5ad8ef103537c30190d2ecd7ecfcd0 (plain)
  1. --- semantic-markdown - Pandoc plugin to process semantic hints
  2. ---
  3. --- SPDX-FileCopyrightText: 2025 Jonas Smedegaard <dr@jones.dk>
  4. --- SPDX-License-Identifier: GPL-3.0-or-later
  5. ---
  6. --- ## Examples
  7. ---
  8. --- The following Markdown text includes semantic annotations
  9. --- within braced enclosures:
  10. ---
  11. --- ```markdown
  12. --- # {=<#artwork> .:Image} Semantics
  13. ---
  14. --- Simple ontological annotation:
  15. --- [This][painting] is not a [pipe].
  16. ---
  17. --- Nested, mixed-use and custom-namespaced annotations:
  18. --- [[Ceci][painting] n'est pas une [pipe].]{lang=fr bibo:shortDescription}
  19. ---
  20. --- [painting]: {wd:Q1061035}
  21. --- "A painting of a smoking pipe {:depiction}"
  22. ---
  23. --- [pipe]: {wd:Q104526}
  24. --- "A smoking pipe {:depicts}"
  25. ---
  26. --- {@default}: foaf
  27. ---
  28. --- {bibo}: http://purl.org/ontology/bibo/
  29. ---
  30. --- {wd}: http://www.wikidata.org/entity/
  31. --- ```
  32. ---
  33. --- This filter should transform the above text, with the command
  34. --- `pandoc -L semantic-markdown.lua -t commonmark --wrap=preserve`,
  35. --- into the below markdown text with semantic annotations as metadata:
  36. ---
  37. --- ```markdown
  38. --- ---
  39. --- turtle: |
  40. --- @prefix bibo: http://purl.org/ontology/bibo/
  41. --- @prefix foaf: http://xmlns.com/foaf/0.1/
  42. --- @prefix wd: https://www.wikidata.org/entity/
  43. ---
  44. --- <#artwork> a foaf:Image ;
  45. --- foaf:depiction <https://www.wikidata.org/entity/Q1061035> ;
  46. --- foaf:depicts <https://www.wikidata.org/entity/Q104526> ;
  47. --- bibo:shortDescription "Ceci n'est pas une pipe."@fr .
  48. --- ---
  49. --- # Semantics
  50. ---
  51. --- Simple ontological annotation:
  52. --- [This][painting] is not a [pipe].
  53. ---
  54. --- Nested, mixed-use and custom-namespaced annotations:
  55. --- [[Ceci][painting] n'est pas une [pipe].]{lang=fr}
  56. ---
  57. --- [painting]: https://www.wikidata.org/entity/Q1061035
  58. --- "A painting of a smoking pipe"
  59. ---
  60. --- [pipe]: https://www.wikidata.org/entity/Q104526
  61. --- "A smoking pipe"
  62. --- ```
  63. ---
  64. --- This filter should also transform the above text, with the command
  65. --- `pandoc -L semantic-markdown.lua -t html --wrap=preserve`,
--- into the below HTML text with embedded RDFa Lite 1.1 annotations,
  67. --- modulo wrapping of long lines:
  68. ---
  69. --- ```html
  70. --- <div vocab="http://xmlns.com/foaf/0.1/"
  71. --- prefix="bibo: http://purl.org/ontology/bibo/"
  72. --- resource="#artwork" typeof="Image">
  73. --- <h1>Semantics</h1>
  74. --- <p>Simple ontological annotation:
  75. --- <a property="depiction"
  76. --- href="https://www.wikidata.org/entity/Q1061035"
  77. --- title="A painting of a smoking pipe">This</a>
  78. --- is not
  79. --- a <a property="depicts"
  80. --- href="https://www.wikidata.org/entity/Q104526"
  81. --- title="A smoking pipe">pipe</a>.</p>
  82. ---
  83. --- <p>Nested, mixed-use and custom-namespaced annotations:
  84. --- <span lang="fr" property="bibo:shortDescription">
  85. --- <a property="depiction"
  86. --- href="https://www.wikidata.org/entity/Q1061035"
  87. --- title="A painting of a smoking pipe">Ceci</a>
  88. --- n'est pas
  89. --- une <a property="depicts"
  90. --- href="https://www.wikidata.org/entity/Q104526"
  91. --- title="A smoking pipe">pipe</a>.
  92. --- </span></p>
  93. --- </div>
  94. --- ```
  95. ---
  96. --- * v0.0.1
  97. --- * initial release
  98. ---
  99. --- @version 0.0.1
  100. --- @see <https://source.jones.dk/semantic-markdown/about/>
  101. --- @see <https://moodle.ruc.dk/course/view.php?id=23505>
  102. --- @see <https://www.w3.org/TR/rdfa-primer/#using-rdfa>
  103. --- @see <https://www.ctrl.blog/entry/rdfa-link-attributes.html>
  104. -- TODO: maybe use topdown traversal
  105. -- * order of declaring annotations might matter (but should not)
  106. -- * might enable simpler functions and/or faster processing
  107. -- @see <https://pandoc.org/lua-filters.html#topdown-traversal>
  108. -- ensure stable character classes independent of system locale
  109. -- @see <https://pandoc.org/lua-filters.html#common-pitfalls>
  110. os.setlocale 'C'
  111. --- pseudo-enum table to track parser enclosure state
  112. --- @see <https://stackoverflow.com/a/70529481/18619283>
  113. local Enclosure = {
  114. NONE = "0",
  115. BRACKETED = "1",
  116. BRACKETED_DONE = "2",
  117. BRACED = "3",
  118. }
  119. -- element types representing content enclosure in Markdown
  120. local ElementTypeIsEnclosure = {
  121. Emph = true,
  122. Image = true,
  123. Link = true,
  124. Strong = true,
  125. }
  126. --- curie_type_prefix - `typeof` or `resource` attribute CURIE hint
  127. local curie_type_prefix = "[.=]?"
  128. -- TODO: cover non-ASCII Unicode characters
  129. -- @see <https://www.lua.org/manual/5.4/manual.html#6.5>
  130. --- curie_prefix - CURIE prefix component as set of chars
  131. --- @see <https://www.w3.org/TR/2010/NOTE-curie-20101216/>
  132. local _name_start_char = "A-Z_a-z"
  133. local _name_char = _name_start_char.."-0-9"
  134. local _ref = "[".._name_start_char.."][".._name_char.."]*"
  135. local curie_prefix = "[".._name_start_char.."_-][".._name_char.."]*"
  136. --- curie_long - CURIE with prefix and reference as set of chars
  137. local curie_long = curie_prefix..":".._ref
  138. --- curie_no_ref - CURIE with only prefix as set of chars
  139. local curie_no_ref = curie_prefix..":"
  140. --- curie_local - CURIE with only name as set of chars
  141. local curie_local = ":".._ref
  142. --- curie_default - CURIE without prefix or name as char
  143. local curie_default = ":"
  144. -- TODO: curie_re - CURIE as `LPeg.re` regex object
  145. -- TODO: test and replace above curie* patterns
  146. -- @see <https://pandoc.org/lua-filters.html#global-variables>
  147. --local curie_re = re.compile("("..curie_prefix..")?:(".._ref..")?")
  148. -- FIXME: define RDF context same as RDFa
  149. -- TODO: maybe support overriding context with a JSON-LD URI
  150. -- @see <https://www.w3.org/2011/rdfa-context/rdfa-1.1>
  151. --- Namespaces - process RDF namespace IRI declarations
  152. ---
  153. --- Takes as input a list of Para block elements.
  154. --- For each block matching the pattern for a namespace IRI definition,
  155. --- the declared namespace is extracted.
  156. --- Returns an empty paragraph in case of a match,
  157. --- or nothing (to signal preservation of original content).
  158. ---
  159. --- Example:
  160. ---
  161. --- ```Markdown
  162. --- # Annotated paragraph using a custom namespace
  163. ---
  164. --- My favorite animal is the [Liger]{ov:preferredAnimal}.
  165. --- {=<#me> .:Person}
  166. ---
  167. --- {ov}: http://open.vocab.org/terms/
  168. --- ```
  169. ---
  170. --- @param blocks Markdown with ontological annotations as Blocks
  171. --- @returns Markdown without ontological annotations as Blocks
  172. --- @see <https://pandoc.org/lua-filters.html#type-blocks>
  173. --- @see <https://www.w3.org/TR/rdf12-concepts/#vocabularies>
  174. local function Namespaces(blocks)
  175. -- paragraph with only a braced prefix-only CURIE, colon and one word
  176. local curie_pattern = "^{"..curie_type_prefix..curie_prefix.."}:$"
  177. if #blocks.content == 3
  178. and blocks.content[1].t == "Str"
  179. and blocks.content[2].t == "Space"
  180. and blocks.content[1].text:match(curie_pattern)
  181. then
  182. local el = blocks.content[3]
  183. -- default namespace, parsed as commonmark
  184. if el.t == "Str"
  185. and el.text == "@default"
  186. then
  187. -- FIXME: add CURIE to metadata
  188. return {}
  189. end
  190. -- default namespace, parsed as markdown
  191. if el.t == "Cite"
  192. and #el.content == 1
  193. and el.content[1].text == "@default"
  194. then
  195. -- FIXME: add CURIE to metadata
  196. return {}
  197. end
  198. -- namespace
  199. -- TODO: relax to match URI syntax without hardcoded protocols
  200. local proto_pattern = "^[Hh][Tt][Tt][Pp][Ss]?:"
  201. if el.t == "Str"
  202. and el.text:match(proto_pattern)
  203. then
  204. -- FIXME: add CURIE and URI to metadata
  205. return {}
  206. end
  207. end
  208. end
  209. --- Statements - process inline RDF statements
  210. ---
  211. --- Locate and extract ontological annotations
  212. --- within a [Block] element of a Pandoc Abstract Syntax Tree (AST).
  213. ---
  214. --- Markup for ontological annotations is an extension to Markdown
  215. --- using similar syntax as hypermedia annotations,
  216. --- but listing RDFa [CURIEs] in a braced enclosure.
  217. ---
  218. --- ```ASCII-art
  219. --- Simple ontological annotation:
  220. --- "A [map]{foaf:depiction} is not the territory"
  221. --- | ||\~~~~~~~~~~~~/|
  222. --- a bc CURIEa d
  223. ---
  224. --- Nested and mixed-use annotations:
  225. --- ["Ceci n'est pas une [pipe]{foaf:depicts lang=fr}"]{dc:description}
  226. --- | | ||\~~~~~~~~~~/ | ||\~~~~~~~~~~~~/|
  227. --- a a1 |c1 CURIEa d1bc CURIEb d
  228. --- b1
  229. ---
  230. --- Chained hypermedia and ontological annotations:
  231. --- "A [map](https://osm.org/){foaf:depiction} is not the territory"
  232. --- | || ||\~~~~~~~~~~~~/|
  233. --- a be fc CURIEa d
  234. ---
  235. --- Legend:
  236. --- a-b: bracketed enclosure around content
  237. --- c-d: braced enclosure around ontological or other annotation
  238. --- e-f: parenthesized enclosure around hypermedia annotation
  239. --- ```
  240. ---
  241. --- Ontological annotations are parsed and reorganised
  242. --- using the following algorithm:
  243. ---
  244. --- 1. locate pairs of bracketed text and braced text
  245. --- either adjacent or separated by parenthesized text,
  246. --- where braced text contains one or more [CURIEs]
  247. --- 2. for each pair,
  248. --- 1. add CURIEs in braced text to metadata
  249. --- 2. add positions of brackets to metadata
  250. --- 3. delete CURIEs
  251. --- 4. delete braced enclosure if now structurally empty
  252. --- 5. delete brackets if now unannotated
  253. ---
  254. --- The implementation is inspired by Pandoc [issue#6038].
  255. ---
  256. --- @param inlines Markdown with semantic annotations as Inlines
  257. --- @returns Markdown stripped of semantic annotations as Inlines
  258. --- @see [Block]: <https://pandoc.org/lua-filters.html#type-block>
  259. --- @see [CURIEs]: <https://www.w3.org/TR/2010/NOTE-curie-20101216/>
  260. --- @see [issue#6038]: <https://github.com/jgm/pandoc/issues/6038>
  261. -- TODO: maybe instead as step #5 add/reuse hypermedia anchor
  262. local function Statements (block)
  263. -- flags for enclosing stages
  264. -- TODO: support nested bracket enclosure
  265. local encl = Enclosure.NONE
  266. -- amount of detected statements in this block
  267. local statement_count = 0
  268. -- stacks of qualified and pending unenclosed/enclosed elements
  269. local elems = pandoc.List()
  270. local elems_unenclosed = pandoc.List()
  271. local elems_enclosed = pandoc.List()
  272. for _, el in ipairs(block.content) do
  273. local pos = 1
  274. -- strings of pending unenclosed/enclosed chars
  275. local chars_unenclosed = ""
  276. local chars_enclosed = ""
  277. -- non-string element, highest state first to support fall-through
  278. if el.t ~= 'Str' then
  279. elems_unenclosed:insert(el)
  280. if encl == Enclosure.BRACED then
  281. elems_enclosed:insert(el)
  282. goto continue
  283. end
  284. if encl == Enclosure.BRACKETED_DONE then
  285. -- disqualify bracketing not directly followed by brace
  286. elems:extend(elems_unenclosed)
  287. elems_unenclosed = pandoc.List()
  288. elems_enclosed = pandoc.List()
  289. encl = Enclosure.NONE
  290. -- fall through to parse element as unenclosed
  291. end
  292. if encl == Enclosure.BRACKETED then
  293. elems_enclosed:insert(el)
  294. goto continue
  295. end
  296. if encl == Enclosure.NONE then
  297. -- specific elements represent content enclosure
  298. if ElementTypeIsEnclosure[el.t] then
  299. encl = Enclosure.BRACKETED_DONE
  300. end
  301. end
  302. goto continue
  303. end
  304. -- unenclosed
  305. -- TODO: accept backslash except immediately before bracket
  306. if encl == Enclosure.NONE then
  307. local _, nextpos, s = el.text:find("^([^%[\\]*)")
  308. pos = nextpos and nextpos + 1 or pos + 1
  309. chars_unenclosed = chars_unenclosed..s
  310. -- entering bracketed enclosure
  311. if el.text:sub(pos, pos) == "[" then
  312. -- qualify unenclosed elements
  313. elems:extend(elems_unenclosed)
  314. elems_unenclosed = pandoc.List()
  315. elems_enclosed = pandoc.List()
  316. if chars_unenclosed:len() > 0 then
  317. elems:insert(pandoc.Str(chars_unenclosed))
  318. end
  319. pos = pos + 1
  320. chars_unenclosed = chars_unenclosed.."["
  321. chars_enclosed = ""
  322. encl = Enclosure.BRACKETED
  323. end
  324. end
  325. -- in bracketed enclosure
  326. -- TODO: accept backslash except immediately before bracket/brace
  327. -- TODO: support nested bracket enclosure
  328. if encl == Enclosure.BRACKETED then
  329. local _, nextpos, s = el.text:find("^([^%[%]}\\]*)", pos)
  330. pos = nextpos and nextpos + 1 or pos + 1
  331. chars_unenclosed = chars_unenclosed..s
  332. chars_enclosed = chars_enclosed..s
  333. -- exiting bracketed enclosure
  334. if el.text:sub(pos, pos) == "]" then
  335. pos = pos + 1
  336. chars_unenclosed = chars_unenclosed.."]"
  337. encl = Enclosure.BRACKETED_DONE
  338. end
  339. end
  340. -- exited bracketed enclosure
  341. if encl == Enclosure.BRACKETED_DONE then
  342. -- entering braced enclosure
  343. if el.text:sub(pos, pos) == "{" then
  344. pos = pos + 1
  345. chars_unenclosed = chars_unenclosed.."{"
  346. encl = Enclosure.BRACED
  347. -- leaving non-annotation enclosure
  348. else
  349. -- disqualify bracketing not directly followed by brace
  350. elems:extend(elems_unenclosed)
  351. elems_unenclosed = pandoc.List()
  352. elems_enclosed = pandoc.List()
  353. if chars_unenclosed:len() > 0 then
  354. elems:insert(pandoc.Str(chars_unenclosed))
  355. chars_unenclosed = ""
  356. end
  357. chars_enclosed = ""
  358. encl = Enclosure.NONE
  359. end
  360. end
  361. -- in braced enclosure, leaving it
  362. -- TODO: support mixed-use enclosure
  363. if encl == Enclosure.BRACED then
  364. local curie_pattern1 = "^"..curie_type_prefix..curie_long.."}"
  365. local curie_pattern2 = "^"..curie_type_prefix..curie_no_ref.."}"
  366. local curie_pattern3 = "^"..curie_type_prefix..curie_local.."}"
  367. local curie_pattern4 = "^"..curie_type_prefix..curie_default.."}"
  368. local _, nextpos1 = el.text:find(curie_pattern1, pos)
  369. local _, nextpos2 = el.text:find(curie_pattern2, pos)
  370. local _, nextpos3 = el.text:find(curie_pattern3, pos)
  371. local _, nextpos4 = el.text:find(curie_pattern4, pos)
  372. local nextpos = nextpos1 or nextpos2 or nextpos3 or nextpos4
  373. if nextpos then
  374. statement_count = statement_count + 1
  375. pos = nextpos + 1
  376. -- TODO: instead recursively call Statements() on remains of Str
  377. chars_enclosed = chars_enclosed..el.text:sub(pos)
  378. -- qualify completed bracketed enclosure
  379. elems:extend(elems_enclosed)
  380. elems_enclosed = pandoc.List()
  381. elems_unenclosed = pandoc.List()
  382. if chars_enclosed:len() > 0 then
  383. elems:insert(pandoc.Str(chars_enclosed))
  384. chars_enclosed = ""
  385. end
  386. chars_unenclosed = ""
  387. encl = Enclosure.NONE
  388. end
  389. end
  390. -- push strings to stacks
  391. if chars_enclosed:len() > 0 then
  392. elems_enclosed:insert(pandoc.Str(chars_enclosed))
  393. end
  394. if chars_unenclosed:len() > 0 then
  395. elems_unenclosed:insert(pandoc.Str(chars_unenclosed))
  396. end
  397. -- done parsing current Inline element
  398. ::continue::
  399. end
  400. -- return altered stack if it contains complete enclosures
  401. if statement_count > 0 then
  402. -- disqualify incomplete enclosure
  403. if encl ~= Enclosure.NONE then
  404. elems:extend(elems_unenclosed)
  405. end
  406. block.content = elems
  407. return block
  408. end
  409. end
  410. -- First resolve namespace declarations, then statements.
  411. --
  412. -- Although this filter is *not* a full RDF parser,
  413. -- order matters for the parts we do handle --
  414. -- e.g. namespace resolving is similar to other RDF formats
  415. -- with detailed documented process ordering.
  416. --
  417. -- @see <https://www.w3.org/TR/turtle/#sec-parsing>
  418. local meta = {}
  419. return {
  420. -- move aside MetaBlocks to speed up processing content
  421. --
  422. -- @see <https://stackoverflow.com/a/47356252/18619283>
  423. { Meta = function(m) meta = m; return {} end },
  424. {Para = Namespaces},
  425. {Block = Statements},
  426. -- FIXME: add custom declared namespaces in Meta
  427. -- TODO: maybe add only actively used namespaces
  428. -- (do same as for unused link definitions)
  429. { Meta = function(_) return meta; end },
  430. --{ Meta = function(_) return NamespacesToMeta(meta); end },
  431. }