import codecs import os from urlparse import urlparse from urllib import urlopen from lxml import etree, html from mako.template import Template from nikola.plugin_categories import Command from nikola import utils links = {} class CommandImportWordpress(Command): """Import a wordpress dump.""" name = "import_wordpress" def run(self, fname=None): # Parse the data if fname is None: print "Usage: nikola import_wordpress wordpress_dump.xml" return context = {} with open(fname) as fd: xml = [] for line in fd: # These explode etree and are useless if ' %s" % (url, dst_path) with open(dst_path, 'wb+') as fd: fd.write(urlopen(url).read()) dst_url = '/'.join(dst_path.split(os.sep)[2:]) links[link] = '/' + dst_url links[url] = '/' + dst_url return def import_item(item): """Takes an item from the feed and creates a post file.""" title = get_text_tag(item, 'title', 'NO TITLE') # link is something like http://foo.com/2012/09/01/hello-world/ # So, take the path, utils.slugify it, and that's our slug slug = utils.slugify(urlparse(get_text_tag(item, 'link', None)).path) description = get_text_tag(item, 'description', '') post_date = get_text_tag(item, '{http://wordpress.org/export/1.2/}post_date', None) post_type = get_text_tag(item, '{http://wordpress.org/export/1.2/}post_type', 'post') status = get_text_tag(item, '{http://wordpress.org/export/1.2/}status', 'publish') content = get_text_tag(item, '{http://purl.org/rss/1.0/modules/content/}encoded', '') tags = [] if status != 'publish': tags.append('draft') for tag in item.findall('category'): text = tag.text if text == 'Uncategorized': continue tags.append(text) if post_type == 'attachment': return elif post_type == 'post': out_folder = 'posts' else: out_folder = 'stories' # Write metadata with codecs.open(os.path.join('new_site', out_folder, slug + '.meta'), "w+", "utf8") as fd: fd.write(u'%s\n' % title) fd.write(u'%s\n' % slug) fd.write(u'%s\n' % post_date) fd.write(u'%s\n' % ','.join(tags)) fd.write(u'\n') fd.write(u'%s\n' % description) with open(os.path.join( 'new_site', out_folder, slug + '.wp'), "wb+") as fd: if content.strip(): try: doc = html.document_fromstring(content) doc.rewrite_links(replacer) fd.write(html.tostring(doc, encoding='utf8')) except: import pdb pdb.set_trace()