blob: 0586ade3a40179ffc4a8173a13f1e510e9cb1a5d (
plain)
- # Make snippet for extracting RDF data from PDF documents
- #
- # Copyright 2024, Jonas Smedegaard <dr@jones.dk>
- # SPDX-License-Identifier: GPL-3+
- #
- # Setup:
- # In main Makefile...
- # * set variable PDF_DOCUMENTS or use simple and slow default
- # * set variable BASE_IRI or use default file URI
- # * include this make snippet
- #
- # Dependencies:
- # * pdfinfo (e.g. Debian package poppler-utils)
- # * rapper (e.g. Debian package raptor2-utils)
- # * perl v5.36.0 or newer
- # list of relative paths to PDF documents
- #PDF_DOCUMENTS = \
- # main_paper.pdf \
- # promo_article.pdf \
- # research/deep/superconductors/report.pdf \
- # research/deep/fringe/index.pdf
- # Fall back to every PDF in the current directory,
- # but only when PDF_DOCUMENTS has not been defined anywhere
- # (main Makefile, command line or environment).
- # The assignment stays recursive, so the glob is re-evaluated on each use.
- ifeq ($(origin PDF_DOCUMENTS),undefined)
- PDF_DOCUMENTS = $(wildcard *.pdf)
- endif
- # Extract the XMP metadata packet from a PDF document
- # and print it on stdout as RDF/Turtle,
- # the human-friendlier serialization.
- #
- # Usage: make turtle-from-<relative/path/to/document.pdf>
- #
- # The PDF is declared as prerequisite,
- # so that make stops early and names the file
- # when a listed document neither exists nor can be built.
- # perl slurps the whole packet (-g)
- # and strips the xpacket and x:xmpmeta wrappers,
- # replacing the opening wrapper with an XML declaration,
- # which leaves plain RDF/XML for rapper to parse.
- # The base IRI handed to rapper is BASE_IRI
- # followed by the directory part of the PDF path.
- # Path and IRI are single-quoted
- # to shield shell metacharacters (e.g. & ; ? parentheses).
- #
- # NOTE(review): no default for BASE_IRI is visible in this snippet --
- # confirm that the including Makefile provides one.
- $(PDF_DOCUMENTS:%=turtle-from-%): turtle-from-%: %
- 	@pdfinfo -meta '$<' \
- 		| perl -gp \
- 			-e 's,\s*<\?xpacket [^>]+>\s*,,;' \
- 			-e 's,<x:xmpmeta [^>]+>,<?xml version="1.0" encoding="utf-8"?>,;' \
- 			-e 's,\s*</x:xmpmeta>,,;' \
- 			-e 's,\s*<\?xpacket [^>]+>,,;' \
- 		| rapper -q -i rdfxml -o turtle - '$(BASE_IRI)$(dir $*)'
- # the targets only print to stdout and never create a file of that name
- .PHONY: $(PDF_DOCUMENTS:%=turtle-from-%)
|