Imported Upstream version 5 (upstream/5)

author: Agustin Henze <tin@sluc.org.ar> 2012-12-12 20:15:48 -0300
committer: Agustin Henze <tin@sluc.org.ar> 2012-12-12 20:15:48 -0300
commit: 0f2c04e70a0ffdd0892d6970cafbcd952d221db5 (patch)
tree: d36f7747c4b9cb5c5e00cae5b137d22214b1c7be /nikola/plugins/command_import_wordpress.py
parent: ca1f5a392261a7c6b82b5ac1015427605909d8c9 (diff)
1 files changed, 163 insertions, 0 deletions
diff --git a/nikola/plugins/command_import_wordpress.py b/nikola/plugins/command_import_wordpress.py
new file mode 100644
index 0000000..e75d022
--- /dev/null
+++ b/nikola/plugins/command_import_wordpress.py
@@ -0,0 +1,163 @@
+import codecs
+import os
+from urlparse import urlparse
+from urllib import urlopen
+
+from lxml import etree, html
+from mako.template import Template
+
+from nikola.plugin_categories import Command
+from nikola import utils
+
+# URL rewrite table shared by the import helpers: import_attachment() maps
+# each attachment's link and original URL to its local path, and replacer()
+# looks links up here when post bodies are rewritten.
+links = {}
+
+
+class CommandImportWordpress(Command):
+    """Import a WordPress XML dump into a freshly generated Nikola site."""
+
+    name = "import_wordpress"
+
+    def run(self, fname=None):
+        """Build ``new_site`` from the WordPress dump at ``fname``.
+
+        Prints a usage message and returns when ``fname`` is omitted.
+        Otherwise the site skeleton is created with ``nikola init``,
+        ``new_site/conf.py`` is rendered from the sample-site template
+        using the blog metadata found in the dump, and every ``<item>``
+        is handed to ``import_attachment`` and then ``import_item``.
+        """
+        if fname is None:
+            print("Usage: nikola import_wordpress wordpress_dump.xml")
+            return
+
+        # Parse the data
+        with open(fname) as fd:
+            # <atom:link rel=...> lines explode etree and are useless, so
+            # they are dropped.  Every line still carries its own line
+            # ending, hence the empty-string join: joining with '\n'
+            # doubled each newline inside the post bodies.
+            xml = ''.join(
+                line for line in fd if '<atom:link rel=' not in line)
+
+        tree = etree.fromstring(xml)
+        channel = tree.find('channel')
+        wp_ns = '{http://wordpress.org/export/1.2/}'
+
+        context = {}
+        context['DEFAULT_LANG'] = get_text_tag(channel, 'language', 'en')[:2]
+        context['BLOG_TITLE'] = get_text_tag(
+            channel, 'title', 'PUT TITLE HERE')
+        context['BLOG_DESCRIPTION'] = get_text_tag(
+            channel, 'description', 'PUT DESCRIPTION HERE')
+        context['BLOG_URL'] = get_text_tag(channel, 'link', '#')
+
+        author = channel.find(wp_ns + 'author')
+        if author is None:
+            # The dump has no author element: keep the placeholders
+            # instead of failing on a lookup against None.
+            context['BLOG_EMAIL'] = "joe@example.com"
+            context['BLOG_AUTHOR'] = "Joe Example"
+        else:
+            context['BLOG_EMAIL'] = get_text_tag(
+                author, wp_ns + 'author_email', "joe@example.com")
+            context['BLOG_AUTHOR'] = get_text_tag(
+                author, wp_ns + 'author_display_name', "Joe Example")
+
+        # These two values are pasted verbatim into the generated conf.py,
+        # which is why they are source snippets rather than Python objects.
+        context['POST_PAGES'] = '''(
+            ("posts/*.wp", "posts", "post.tmpl", True),
+            ("stories/*.wp", "stories", "story.tmpl", False),
+        )'''
+        context['POST_COMPILERS'] = '''{
+        "rest": ('.txt', '.rst'),
+        "markdown": ('.md', '.mdown', '.markdown', '.wp'),
+        "html": ('.html', '.htm')
+        }
+        '''
+
+        # Generate base site
+        os.system('nikola init new_site')
+        conf_template = Template(filename=os.path.join(
+            os.path.dirname(utils.__file__), 'data', 'samplesite', 'conf.py.in'))
+        with codecs.open(os.path.join('new_site', 'conf.py'),
+                         'w+', 'utf8') as fd:
+            fd.write(conf_template.render(**context))
+
+        # Import posts.  Attachments go first so that the link table is
+        # complete before any post body is rewritten.
+        items = channel.findall('item')
+        for item in items:
+            import_attachment(item)
+        for item in items:
+            import_item(item)
+
+
+def replacer(dst):
+    """Return the rewritten target for ``dst``, or ``dst`` if none is known."""
+    try:
+        return links[dst]
+    except KeyError:
+        # Not a link recorded in the table: leave it untouched.
+        return dst
+
+
+def get_text_tag(tag, name, default):
+    """Return the text of the first ``name`` child of ``tag``.
+
+    ``default`` is returned when ``tag`` itself is None, when it has no
+    such child, or when the child is empty (its text is None), so callers
+    always get a usable value back.
+    """
+    if tag is None:
+        return default
+    child = tag.find(name)
+    if child is None or child.text is None:
+        return default
+    return child.text
+
+
+def import_attachment(item):
+    """Download the file behind an attachment item and record its new URL.
+
+    Items whose post type is not ``attachment`` are ignored.  The file is
+    stored under ``new_site/files`` using the path of its original URL,
+    and both the attachment URL and the item link are registered in
+    ``links`` so they can be rewritten to the local copy.
+    """
+    wp_ns = '{http://wordpress.org/export/1.2/}'
+    if get_text_tag(item, wp_ns + 'post_type', 'post') != 'attachment':
+        return
+
+    url = get_text_tag(item, wp_ns + 'attachment_url', 'foo')
+    # NOTE(review): the link is looked up in the WordPress namespace here,
+    # while import_item reads a plain 'link' child -- confirm which one
+    # the dumps actually contain.
+    link = get_text_tag(item, wp_ns + 'link', 'foo')
+
+    path = urlparse(url).path
+    dst_path = os.path.join(*(['new_site', 'files'] + path.split('/')))
+    dst_dir = os.path.dirname(dst_path)
+    if not os.path.isdir(dst_dir):
+        os.makedirs(dst_dir)
+
+    print("Downloading %s => %s" % (url, dst_path))
+    # Fetch first and always close the connection; the destination is only
+    # opened once the data is in hand, so a failed download does not leave
+    # an empty file behind.
+    response = urlopen(url)
+    try:
+        data = response.read()
+    finally:
+        response.close()
+    with open(dst_path, 'wb+') as fd:
+        fd.write(data)
+
+    # Drop the leading 'new_site/files' components to get the site URL.
+    dst_url = '/' + '/'.join(dst_path.split(os.sep)[2:])
+    links[link] = dst_url
+    links[url] = dst_url
+
+
+def import_item(item):
+    """Take an item from the feed and create its post files.
+
+    Attachments are skipped.  Items of type ``post`` go to
+    ``new_site/posts``, everything else to ``new_site/stories``.  Two
+    files are written per item: ``<slug>.meta`` (title, slug, date, tags,
+    a blank line, description) and ``<slug>.wp`` (the body, with its links
+    passed through ``replacer``).
+    """
+    wp_ns = '{http://wordpress.org/export/1.2/}'
+    post_type = get_text_tag(item, wp_ns + 'post_type', 'post')
+    if post_type == 'attachment':
+        # Nothing to write for attachments, so bail out before doing any
+        # work on the slug or the tags.
+        return
+    out_folder = 'posts' if post_type == 'post' else 'stories'
+
+    title = get_text_tag(item, 'title', 'NO TITLE')
+    # link is something like http://foo.com/2012/09/01/hello-world/
+    # So, take the path, utils.slugify it, and that's our slug
+    slug = utils.slugify(urlparse(get_text_tag(item, 'link', None)).path)
+    description = get_text_tag(item, 'description', '')
+    post_date = get_text_tag(item, wp_ns + 'post_date', None)
+    status = get_text_tag(item, wp_ns + 'status', 'publish')
+    content = get_text_tag(
+        item, '{http://purl.org/rss/1.0/modules/content/}encoded', '')
+
+    tags = []
+    if status != 'publish':
+        tags.append('draft')
+    for category in item.findall('category'):
+        text = category.text
+        # Empty categories would break the join below; 'Uncategorized' is
+        # deliberately left out of the tag list.
+        if not text or text == 'Uncategorized':
+            continue
+        tags.append(text)
+
+    # Write metadata
+    meta_path = os.path.join('new_site', out_folder, slug + '.meta')
+    with codecs.open(meta_path, "w+", "utf8") as fd:
+        fd.write(u'%s\n' % title)
+        fd.write(u'%s\n' % slug)
+        fd.write(u'%s\n' % post_date)
+        fd.write(u'%s\n' % ','.join(tags))
+        fd.write(u'\n')
+        fd.write(u'%s\n' % description)
+
+    # Write the body.  The file is created even when there is no content.
+    body_path = os.path.join('new_site', out_folder, slug + '.wp')
+    with open(body_path, "wb+") as fd:
+        if content and content.strip():
+            try:
+                doc = html.document_fromstring(content)
+                doc.rewrite_links(replacer)
+                data = html.tostring(doc, encoding='utf8')
+            except (etree.LxmlError, ValueError) as exc:
+                # Markup lxml cannot parse: report it and keep the
+                # original text rather than dropping into a debugger.
+                print("Could not rewrite links in %s, keeping the "
+                      "original markup: %s" % (slug, exc))
+                data = content.encode('utf8')
+            fd.write(data)
author	Agustin Henze <tin@sluc.org.ar>	2012-12-12 20:15:48 -0300
committer	Agustin Henze <tin@sluc.org.ar>	2012-12-12 20:15:48 -0300
commit	0f2c04e70a0ffdd0892d6970cafbcd952d221db5 (patch)
tree	d36f7747c4b9cb5c5e00cae5b137d22214b1c7be /nikola/plugins/command_import_wordpress.py
parent	ca1f5a392261a7c6b82b5ac1015427605909d8c9 (diff)