diff options
author | Joey Hess <joey@gnu.kitenet.net> | 2009-05-19 13:07:47 -0400 |
---|---|---|
committer | Joey Hess <joey@gnu.kitenet.net> | 2009-05-19 13:07:47 -0400 |
commit | ef003f48f4a3fe8fb67fda62c70a299b07d75976 (patch) | |
tree | cb5972026e6beed91b5eca2fa4962790244062bd /doc/tips/importing_posts_from_wordpress | |
parent | 53b1c6f559c1d09fbdbc28c8e4d5090dd455cd26 (diff) | |
parent | 4c5987d150b26f638494638f7861fb7646542a37 (diff) |
Merge branch 'master' into po
Conflicts:
debian/changelog
Diffstat (limited to 'doc/tips/importing_posts_from_wordpress')
-rw-r--r-- | doc/tips/importing_posts_from_wordpress/ikiwiki-wordpress-import.mdwn | 136 |
1 file changed, 128 insertions, 8 deletions
diff --git a/doc/tips/importing_posts_from_wordpress/ikiwiki-wordpress-import.mdwn b/doc/tips/importing_posts_from_wordpress/ikiwiki-wordpress-import.mdwn index 5d7a266ec..0c0527f2c 100644 --- a/doc/tips/importing_posts_from_wordpress/ikiwiki-wordpress-import.mdwn +++ b/doc/tips/importing_posts_from_wordpress/ikiwiki-wordpress-import.mdwn @@ -2,7 +2,7 @@ I modified the script a bit so categories and tags would actually show up in the output file. - +----- <pre> #!/usr/bin/env python @@ -29,7 +29,7 @@ I modified the script a bit so categories and tags would actually show up in the Usage: run --help as an argument with this script. Notes: - I added some extra bits to include the [[!tag foo]] stuff in the post, + I added some extra bits to include the \[[!tag foo]] stuff in the post, as it wasn't before, at all. I'll diff the versions out so you can see the mess I made :). @@ -66,7 +66,7 @@ def main(name, email, subdir, branch='master'): commit_msg = """Importing WordPress post "%s" [%s]""" % (x.title.string, x.guid.string) timestamp = time.mktime(time.strptime(x.find('wp:post_date_gmt').string, "%Y-%m-%d %H:%M:%S")) - content = '[[!meta title="%s"]]\n\n' % (x.title.string.replace('"', r'\"')) + content = '\[[!meta title="%s"]]\n\n' % (x.title.string.replace('"', r'\"')) content += x.find('content:encoded').string.replace('\r\n', '\n') # categories = x.findAll('category') @@ -76,8 +76,120 @@ def main(name, email, subdir, branch='master'): """ We do it differently here because we have duplicates otherwise. Take a look: - <category><![CDATA[Health]]></category> - <category domain="category" nicename="health"><![CDATA[Health]]></category> + <category><![CDATA[Health]]></category> + <category domain="category" nicename="health"><![CDATA[Health]]></category> + + If we do the what original did, we end up with all tags and cats doubled. + Therefore we only pick out nicename="foo". Our 'True' below is our 'foo'. 
+ I'd much rather have the value of 'nicename', and tried, but my + python skillz are extremely limited.... + """ + categories = x.findAll('category', nicename=True) + if categories: + content += "\n" + for cat in categories: + # remove 'tags/' because we have a 'tagbase' set. + # your choice: 'tag', or 'taglink' + # content += "\n\[[!tag %s]]" % (cat.string.replace(' ', '-')) + content += "\n\[[!taglink %s]]" % (cat.string.replace(' ', '-')) + # print >>sys.stderr, cat.string.replace(' ', '-') + + # moved this thing down + data = content.encode('ascii', 'html_replace') + print "commit refs/heads/%s" % branch + print "committer %s <%s> %d +0000" % (name, email, timestamp) + print "data %d" % len(commit_msg) + print commit_msg + print "M 644 inline %s" % os.path.join(subdir, "%s.mdwn" % stub) + print "data %d" % len(data) + print data + +if __name__ == "__main__": + if len(sys.argv) not in (4, 5): + print >>sys.stderr, "%s: usage: %s name email subdir [branch] < wordpress-export.xml | git-fast-import " % (sys.argv[0], sys.argv[0]) + else: + main(*sys.argv[1:]) + +</pre> +----- + +I have another version of the script, which uses the `timestamp` from the script, and inserts that as a \[[!meta date="foodate"]]. I'm posting it here just in case I happen to be doing something to the httpd. + +(Hopefully I've escaped everything properly; if I missed something, check the source.) + +----- +<pre> +#!/usr/bin/env python + +""" + Purpose: + Wordpress-to-Ikiwiki import tool + + Copyright: + Copyright (C) 2007 Chris Lamb <chris@chris-lamb.co.uk> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + + Usage: run --help as an argument with this script. + + Notes: + I added some extra bits to include the \[[!tag foo]] stuff in the post, + as it wasn't before, at all. I'll diff the versions out so you can see + the mess I made :). + +""" + +import os, sys +import time +import re + +from datetime import datetime +from BeautifulSoup import BeautifulSoup + +import codecs, htmlentitydefs + +codecs.register_error('html_replace', lambda x: (''.join([u'&%s;' \ + % htmlentitydefs.codepoint2name[ord(c)] for c in x.object[x.start:x.end]]), x.end)) + +def main(name, email, subdir, branch='master'): + soup = BeautifulSoup(sys.stdin.read()) + + # Regular expression to match stub in URL. + stub_pattern = re.compile(r'.*\/(.+)\/$') + + for x in soup.findAll('item'): + # Ignore draft posts + if x.find('wp:status').string != 'publish': continue + + match = stub_pattern.match(x.guid.string) + if match: + stub = match.groups()[0] + else: + # Fall back to our own stubs + stub = re.sub(r'[^a-zA-Z0-9_]', '-', x.title.string).lower() + + commit_msg = """Importing WordPress post "%s" [%s]""" % (x.title.string, x.guid.string) + timestamp = time.mktime(time.strptime(x.find('wp:post_date_gmt').string, "%Y-%m-%d %H:%M:%S")) + content = '\[[!meta title="%s"]]\n' % (x.title.string.replace('"', r'\"')) + content += "\[[!meta date=\"%s\"]]\n" % datetime.fromtimestamp(timestamp) + content += x.find('content:encoded').string.replace('\r\n', '\n') + + """ + We do it differently here because we have duplicates otherwise. + Take a look: + <category><![CDATA[Health]]></category> + <category domain="category" nicename="health"><![CDATA[Health]]></category> If we do the what original did, we end up with all tags and cats doubled. Therefore we only pick out nicename="foo". Our 'True' below is our 'foo'. 
@@ -90,14 +202,15 @@ def main(name, email, subdir, branch='master'): for cat in categories: # remove 'tags/' because we have a 'tagbase' set. # your choice: 'tag', or 'taglink' - # content += "\n[[!tag %s]]" % (cat.string.replace(' ', '-')) - content += "\n[[!taglink %s]]" % (cat.string.replace(' ', '-')) + # content += "\n\[[!tag %s]]" % (cat.string.replace(' ', '-')) + content += "\n\[[!taglink %s]]" % (cat.string.replace(' ', '-')) + # this is just debugging, and for fun # print >>sys.stderr, cat.string.replace(' ', '-') # moved this thing down data = content.encode('ascii', 'html_replace') print "commit refs/heads/%s" % branch - print "committer %s <%s> %d +0000" % (name, email, timestamp) + print "committer %s <%s> %d +0000" % (name, email, timestamp) print "data %d" % len(commit_msg) print commit_msg print "M 644 inline %s" % os.path.join(subdir, "%s.mdwn" % stub) @@ -111,3 +224,10 @@ if __name__ == "__main__": main(*sys.argv[1:]) </pre> +----- + + +[[!tag wordpress]] +[[!tag python]] +[[!tag conversion]] +[[!tag ikiwiki]] |