From a437311bce168de6fad29f55543ef6574aa5fc6d Mon Sep 17 00:00:00 2001
From: Jonas Smedegaard
Date: Thu, 14 Jul 2011 20:12:18 +0200
Subject: Add linkedin2rdf.pl and (slower but slightly more greedy) any2rdf.pl, and use the latter in mkfoaf.sh.

---
# Review notes. Free-form text placed here, between the "---" line and the
# first "diff --git" header, is not part of the commit and is ignored when
# the patch is applied.
#
# Purpose: add two Perl converters that turn a saved HTML page into RDF/XML,
# and switch linkedin2foaf() in mkfoaf.sh from the XSLT pipeline to one of
# them.
#
# foaf/any2rdf.pl (new)
#   Usage as called from mkfoaf.sh:  any2rdf.pl <html-file> <base-uri>
#   Slurps the file named by $ARGV[0], parses it with HTML::Data::Parser
#   against the base URI in $ARGV[1] into a temporary RDF::Trine model,
#   serialises the model with rdf_string() as 'RDFXML' using the prefix
#   table in $NAMESPACES, post-processes the node IDs (see below) and prints
#   the result to standard output.
#
# foaf/linkedin2rdf.pl (new)
#   Same arguments, same prefix table, same post-processing and output; the
#   model comes from HTML::Microformats (new_document, assume_all_profiles,
#   parse_microformats) instead of HTML::Data::Parser.
#
# Node ID post-processing (both scripts)
#   The final substitution looks inside rdf:nodeID="..." values and deletes
#   a leading "B" followed by 32 lowercase hex characters whenever what
#   follows is "0" plus three digits and the closing quote, so only that
#   four-character suffix remains.  Per the TODO next to it, this is a
#   stand-in for stable IDs.
#
# foaf/mkfoaf.sh, linkedin2foaf()
#   The account-name lookup is commented out and id is set to a literal.
#   The page is fetched once with wget from the new $uri variable, the
#   in-place ampersand workaround and the xsltproc call are commented out,
#   and "$tmppath" is converted by any2rdf.pl (a linkedin2rdf.pl call is
#   left commented out as the alternative).
#
# NOTE(review), code:
#   - any2rdf.pl assigns $writer (RDF::Trine::Serializer->new('RDFXML')) but
#     never uses it; the output is produced by rdf_string().
#   - Both scripts use $ARGV[0] and $ARGV[1] without checking that they were
#     supplied, and enable warnings via -w but not "use strict".
#   - With id=jonassm hard-coded, the '[ -n "$id" ] || exit1 ...' guard can
#     no longer fire, and the function only works for that one account.
#   - After the node ID substitution the remaining IDs start with a digit.
#     RDF/XML requires rdf:nodeID values to be XML NCNames, which cannot
#     start with a digit -- TODO confirm that the consumers of this output
#     accept them.
#   - $NAMESPACES is copied verbatim into both scripts.
#
# NOTE(review), this copy of the patch:
#   - Line breaks and indentation were reconstructed from a copy with
#     collapsed whitespace; tabs versus spaces are a guess -- confirm
#     against the repository before applying.
#   - The mkfoaf.sh hunk header counts 15 old / 20 new lines, but only 8 of
#     the 10 context lines are visible in that copy.  The other two are
#     assumed to be blank lines; their positions below (first line of the
#     hunk, and directly before "mkdir") are guesses.
#   - In the mkfoaf.sh regexes, "\s+\s+;" and the replacement "&$1" look as
#     if angle-bracketed text and an entity were lost in extraction
#     (presumably a <...> URI and "&amp;") -- verify against the repository.
#
 foaf/any2rdf.pl      | 41 +++++++++++++++++++++++++++++++++++++++++
 foaf/linkedin2rdf.pl | 43 +++++++++++++++++++++++++++++++++++++++++++
 foaf/mkfoaf.sh       | 15 ++++++++++-----
 3 files changed, 94 insertions(+), 5 deletions(-)
 create mode 100755 foaf/any2rdf.pl
 create mode 100755 foaf/linkedin2rdf.pl

diff --git a/foaf/any2rdf.pl b/foaf/any2rdf.pl
new file mode 100755
index 0000000..7fa156a
--- /dev/null
+++ b/foaf/any2rdf.pl
@@ -0,0 +1,41 @@
+#!/usr/bin/perl -w
+
+use RDF::Trine;
+use HTML::Data::Parser;
+use RDF::TrineShortcuts;
+use File::Slurp;
+
+my $markup = read_file( $ARGV[0] );
+my $base_uri = $ARGV[1];
+
+# hint: locate common abbreviations at http://prefix.cc/
+#
+my $NAMESPACES= {
+	rdfs => 'http://www.w3.org/2000/01/rdf-schema#',
+	dc => 'http://purl.org/dc/terms/',
+	foaf => 'http://xmlns.com/foaf/0.1/',
+	xhtml => 'http://www.w3.org/1999/xhtml/vocab#',
+	vcard => 'http://www.w3.org/2006/vcard/ns#',
+	vcardx => 'http://buzzword.org.uk/rdf/vcardx#',
+	hcterms => 'http://purl.org/uF/hCard/terms/',
+	ical => 'http://www.w3.org/2002/12/cal/icaltzd#',
+	cv => 'http://purl.org/captsolo/resume-rdf/0.2/cv#',
+	hresume => 'http://ontologi.es/hresume#',
+# TODO: somehow make RDF::Trine abbreviate labels
+#	'xsd:dateTime' => 'http://www.w3.org/2001/XMLSchema#dateTime',
+#	'geo:SpatialThing' => 'http://www.w3.org/2003/01/geo/wgs84_pos#SpatialThing',
+};
+
+my $parser = HTML::Data::Parser->new;
+my $model = RDF::Trine::Model->temporary_model;
+my $writer = RDF::Trine::Serializer->new('RDFXML');
+
+$parser->parse_into_model($base_uri, $markup, $model);
+my $output = rdf_string($model,'RDFXML',
+	namespaces => { %$NAMESPACES },
+);
+
+# TODO: somehow make RDF::Trine generate stable IDs
+$output =~ s/(?<=rdf:nodeID=")B[a-f0-9]{32}(?=0[0-9]{3}")//g;
+
+print $output;
diff --git a/foaf/linkedin2rdf.pl b/foaf/linkedin2rdf.pl
new file mode 100755
index 0000000..3e4eb24
--- /dev/null
+++ b/foaf/linkedin2rdf.pl
@@ -0,0 +1,43 @@
+#!/usr/bin/perl -w
+
+use HTML::Microformats;
+use RDF::TrineShortcuts;
+use File::Slurp;
+
+my $markup = read_file( $ARGV[0] );
+my $base_uri = $ARGV[1];
+
+# hint: locate common abbreviations at http://prefix.cc/
+#
+my $NAMESPACES= {
+	rdfs => 'http://www.w3.org/2000/01/rdf-schema#',
+	dc => 'http://purl.org/dc/terms/',
+	foaf => 'http://xmlns.com/foaf/0.1/',
+	xhtml => 'http://www.w3.org/1999/xhtml/vocab#',
+	vcard => 'http://www.w3.org/2006/vcard/ns#',
+	vcardx => 'http://buzzword.org.uk/rdf/vcardx#',
+	hcterms => 'http://purl.org/uF/hCard/terms/',
+	ical => 'http://www.w3.org/2002/12/cal/icaltzd#',
+	cv => 'http://purl.org/captsolo/resume-rdf/0.2/cv#',
+	hresume => 'http://ontologi.es/hresume#',
+# TODO: somehow make RDF::Trine abbreviate labels
+#	'xsd:dateTime' => 'http://www.w3.org/2001/XMLSchema#dateTime',
+#	'geo:SpatialThing' => 'http://www.w3.org/2003/01/geo/wgs84_pos#SpatialThing',
+};
+
+my $doc = HTML::Microformats
+	->new_document($markup, $base_uri)
+#	->assume_profile(qw(hResume hCard hCalendar))
+	->assume_all_profiles
+;
+$doc->parse_microformats;
+#print $doc->serialise_model(as => 'RDFXML');
+my $model = $doc->model;
+my $output = rdf_string($model,'RDFXML',
+	namespaces => { %$NAMESPACES },
+);
+
+# TODO: somehow make RDF::Trine generate stable IDs
+$output =~ s/(?<=rdf:nodeID=")B[a-f0-9]{32}(?=0[0-9]{3}")//g;
+
+print $output;
diff --git a/foaf/mkfoaf.sh b/foaf/mkfoaf.sh
index 7423e5b..33142da 100755
--- a/foaf/mkfoaf.sh
+++ b/foaf/mkfoaf.sh
@@ -38,15 +38,20 @@ linkedin2foaf() {
 
 	# TODO: support homepage as fallback for accountName
 #	id=$(perl -0 -ne '/foaf:accountServiceHomepage\s+\s+;\s+foaf:(?:homepage\s+<(?=http)|accountName\s+")([^<"\s]+)/ and print $1 and exit;' "$inpath") #'
-	id=$(perl -0 -ne '/^:me.*?foaf:accountServiceHomepage\s+\s+;\s+foaf:accountName\s+"([^<"\s]+)/ms and print $1 and exit;' "$inpath") #'
+#	id=$(perl -0 -ne '/^:me.*?foaf:accountServiceHomepage\s+\s+;\s+foaf:accountName\s+"([^<"\s]+)/ms and print $1 and exit;' "$inpath") #'
+#	id=62345396
+	id=jonassm
 	[ -n "$id" ] || exit1 "Failed to resolve LinkedIn account name."
+	uri="http://www.linkedin.com/in/$id"
 
 	mkdir -p "$outdir"
 	# work around unescaped &'s in linkedin pages
-#	xsltproc --html "$bindir/linkedin2foaf.xsl" "http://www.linkedin.com/in/$id" > "$outpath"
-	wget -q -O "$tmppath" "http://www.linkedin.com/in/$id"
-	perl -i -pe 's/&([a-zA-Z0-9]+=|\s)/&$1/g' "$tmppath"
-	xsltproc --html "$xsltdir/linkedin2foaf.xsl" "$tmppath" > "$outpath"
+#	xsltproc --html "$bindir/linkedin2foaf.xsl" "$uri" > "$outpath"
+	wget -q -O "$tmppath" "$uri"
+#	perl -i -pe 's/&([a-zA-Z0-9]+=|\s)/&$1/g' "$tmppath"
+#	xsltproc --html "$xsltdir/linkedin2foaf.xsl" "$tmppath" > "$outpath"
+#	perl "$bindir/linkedin2rdf.pl" "$tmppath" "$uri" > "$outpath"
+	perl "$bindir/any2rdf.pl" "$tmppath" "$uri" > "$outpath"
 	rm -f "$tmppath"
 	foafsign "$outpath"
 }
-- 
cgit v1.2.3