summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Jonas Smedegaard <dr@jones.dk> 2011-07-14 20:12:18 +0200
committer Jonas Smedegaard <dr@jones.dk> 2011-07-14 20:12:18 +0200
commit a437311bce168de6fad29f55543ef6574aa5fc6d (patch)
tree f9108a3770f3e35af551ae603fc0adbbac6aeccf
parent 4de7c4cd72951b7c623debf8d363358768861432 (diff)
Add linkedin2rdf.pl and (slower but slightly more greedy) any2rdf.pl, and use the latter in mkfoaf.sh.
-rwxr-xr-x foaf/any2rdf.pl 41
-rwxr-xr-x foaf/linkedin2rdf.pl 43
-rwxr-xr-x foaf/mkfoaf.sh 15
3 files changed, 94 insertions, 5 deletions
diff --git a/foaf/any2rdf.pl b/foaf/any2rdf.pl
new file mode 100755
index 0000000..7fa156a
--- /dev/null
+++ b/foaf/any2rdf.pl
@@ -0,0 +1,41 @@
+#!/usr/bin/perl -w
+# Usage: any2rdf.pl FILE BASE_URI -- feed the markup in FILE to HTML::Data::Parser and print the resulting model as RDF/XML.
+use RDF::Trine;
+use HTML::Data::Parser;
+use RDF::TrineShortcuts;
+use File::Slurp;
+
+my $markup = read_file( $ARGV[0] );  # first argument: path of the file to read
+my $base_uri = $ARGV[1];  # second argument: handed to the parser as base URI
+
+# hint: locate common abbreviations at http://prefix.cc/
+# (prefix => namespace URI pairs, passed to rdf_string() as "namespaces")
+my $NAMESPACES= {
+ rdfs => 'http://www.w3.org/2000/01/rdf-schema#',
+ dc => 'http://purl.org/dc/terms/',
+ foaf => 'http://xmlns.com/foaf/0.1/',
+ xhtml => 'http://www.w3.org/1999/xhtml/vocab#',
+ vcard => 'http://www.w3.org/2006/vcard/ns#',
+ vcardx => 'http://buzzword.org.uk/rdf/vcardx#',
+ hcterms => 'http://purl.org/uF/hCard/terms/',
+ ical => 'http://www.w3.org/2002/12/cal/icaltzd#',
+ cv => 'http://purl.org/captsolo/resume-rdf/0.2/cv#',
+ hresume => 'http://ontologi.es/hresume#',
+# TODO: somehow make RDF::Trine abbreviate labels
+# 'xsd:dateTime' => 'http://www.w3.org/2001/XMLSchema#dateTime',
+# 'geo:SpatialThing' => 'http://www.w3.org/2003/01/geo/wgs84_pos#SpatialThing',
+};
+
+my $parser = HTML::Data::Parser->new;
+my $model = RDF::Trine::Model->temporary_model;  # filled by parse_into_model() below
+my $writer = RDF::Trine::Serializer->new('RDFXML');  # NOTE(review): $writer is never used below; rdf_string() produces the output
+
+$parser->parse_into_model($base_uri, $markup, $model);  # arguments: base URI, markup, target model
+my $output = rdf_string($model,'RDFXML',
+ namespaces => { %$NAMESPACES },  # shallow copy of the prefix map
+);
+
+# TODO: somehow make RDF::Trine generate stable IDs (workaround: cut "B" + 32 hex chars from each rdf:nodeID, keeping its 0NNN tail)
+$output =~ s/(?<=rdf:nodeID=")B[a-f0-9]{32}(?=0[0-9]{3}")//g;
+
+print $output;
diff --git a/foaf/linkedin2rdf.pl b/foaf/linkedin2rdf.pl
new file mode 100755
index 0000000..3e4eb24
--- /dev/null
+++ b/foaf/linkedin2rdf.pl
@@ -0,0 +1,43 @@
+#!/usr/bin/perl -w
+# Usage: linkedin2rdf.pl FILE BASE_URI -- parse the markup in FILE with HTML::Microformats and print its model as RDF/XML.
+use HTML::Microformats;
+use RDF::TrineShortcuts;
+use File::Slurp;
+
+my $markup = read_file( $ARGV[0] );  # first argument: path of the file to read
+my $base_uri = $ARGV[1];  # second argument: handed to new_document() as base URI
+
+# hint: locate common abbreviations at http://prefix.cc/
+# (prefix => namespace URI pairs, passed to rdf_string() as "namespaces")
+my $NAMESPACES= {
+ rdfs => 'http://www.w3.org/2000/01/rdf-schema#',
+ dc => 'http://purl.org/dc/terms/',
+ foaf => 'http://xmlns.com/foaf/0.1/',
+ xhtml => 'http://www.w3.org/1999/xhtml/vocab#',
+ vcard => 'http://www.w3.org/2006/vcard/ns#',
+ vcardx => 'http://buzzword.org.uk/rdf/vcardx#',
+ hcterms => 'http://purl.org/uF/hCard/terms/',
+ ical => 'http://www.w3.org/2002/12/cal/icaltzd#',
+ cv => 'http://purl.org/captsolo/resume-rdf/0.2/cv#',
+ hresume => 'http://ontologi.es/hresume#',
+# TODO: somehow make RDF::Trine abbreviate labels
+# 'xsd:dateTime' => 'http://www.w3.org/2001/XMLSchema#dateTime',
+# 'geo:SpatialThing' => 'http://www.w3.org/2003/01/geo/wgs84_pos#SpatialThing',
+};
+
+my $doc = HTML::Microformats
+ ->new_document($markup, $base_uri)
+# ->assume_profile(qw(hResume hCard hCalendar))
+ ->assume_all_profiles  # presumably enables every supported profile instead of the list above -- confirm in HTML::Microformats docs
+;
+$doc->parse_microformats;
+#print $doc->serialise_model(as => 'RDFXML');
+my $model = $doc->model;  # serialised via rdf_string() so the prefix map can be supplied
+my $output = rdf_string($model,'RDFXML',
+ namespaces => { %$NAMESPACES },  # shallow copy of the prefix map
+);
+
+# TODO: somehow make RDF::Trine generate stable IDs (workaround: cut "B" + 32 hex chars from each rdf:nodeID, keeping its 0NNN tail)
+$output =~ s/(?<=rdf:nodeID=")B[a-f0-9]{32}(?=0[0-9]{3}")//g;
+
+print $output;
diff --git a/foaf/mkfoaf.sh b/foaf/mkfoaf.sh
index 7423e5b..33142da 100755
--- a/foaf/mkfoaf.sh
+++ b/foaf/mkfoaf.sh
@@ -38,15 +38,20 @@ linkedin2foaf() {
# TODO: support homepage as fallback for accountName
# id=$(perl -0 -ne '/foaf:accountServiceHomepage\s+<http:\/\/www.linkedin.com\/>\s+;\s+foaf:(?:homepage\s+<(?=http)|accountName\s+")([^<"\s]+)/ and print $1 and exit;' "$inpath") #'
- id=$(perl -0 -ne '/^:me.*?foaf:accountServiceHomepage\s+<http:\/\/www.linkedin.com\/>\s+;\s+foaf:accountName\s+"([^<"\s]+)/ms and print $1 and exit;' "$inpath") #'
+# id=$(perl -0 -ne '/^:me.*?foaf:accountServiceHomepage\s+<http:\/\/www.linkedin.com\/>\s+;\s+foaf:accountName\s+"([^<"\s]+)/ms and print $1 and exit;' "$inpath") #'
+# id=62345396
+ id=jonassm
[ -n "$id" ] || exit1 "Failed to resolve LinkedIn account name."
+ uri="http://www.linkedin.com/in/$id"
mkdir -p "$outdir"
# work around unescaped &'s in linkedin pages
-# xsltproc --html "$bindir/linkedin2foaf.xsl" "http://www.linkedin.com/in/$id" > "$outpath"
- wget -q -O "$tmppath" "http://www.linkedin.com/in/$id"
- perl -i -pe 's/&([a-zA-Z0-9]+=|\s)/&amp;$1/g' "$tmppath"
- xsltproc --html "$xsltdir/linkedin2foaf.xsl" "$tmppath" > "$outpath"
+# xsltproc --html "$bindir/linkedin2foaf.xsl" "$uri" > "$outpath"
+ wget -q -O "$tmppath" "$uri"
+# perl -i -pe 's/&([a-zA-Z0-9]+=|\s)/&amp;$1/g' "$tmppath"
+# xsltproc --html "$xsltdir/linkedin2foaf.xsl" "$tmppath" > "$outpath"
+# perl "$bindir/linkedin2rdf.pl" "$tmppath" "$uri" > "$outpath"
+ perl "$bindir/any2rdf.pl" "$tmppath" "$uri" > "$outpath"
rm -f "$tmppath"
foafsign "$outpath"
}