about summary refs log tree commit diff stats
path: root/nikola/plugins/command_import_wordpress.py
diff options
context:
space:
mode:
Diffstat (limited to 'nikola/plugins/command_import_wordpress.py')
-rw-r--r-- nikola/plugins/command_import_wordpress.py 166
1 files changed, 121 insertions, 45 deletions
diff --git a/nikola/plugins/command_import_wordpress.py b/nikola/plugins/command_import_wordpress.py
index 1552da4..07028d8 100644
--- a/nikola/plugins/command_import_wordpress.py
+++ b/nikola/plugins/command_import_wordpress.py
@@ -25,20 +25,23 @@
from __future__ import unicode_literals, print_function
import codecs
import csv
+import datetime
import os
import re
+from optparse import OptionParser
+
try:
from urlparse import urlparse
except ImportError:
- from urllib.parse import urlparse
+ from urllib.parse import urlparse # NOQA
-from lxml import etree, html, builder
+from lxml import etree, html
from mako.template import Template
try:
import requests
except ImportError:
- requests = None
+ requests = None # NOQA
from nikola.plugin_categories import Command
from nikola import utils
@@ -85,9 +88,14 @@ class CommandImportWordpress(Command):
return redirections
- @staticmethod
- def generate_base_site(context):
- os.system('nikola init new_site')
+ def generate_base_site(self):
+ if not os.path.exists(self.output_folder):
+ os.system('nikola init --empty %s' % (self.output_folder, ))
+ else:
+ self.import_into_existing_site = True
+ print('The folder %s already exists - assuming that this is a '
+ 'already existing nikola site.' % self.output_folder)
+
conf_template = Template(filename=os.path.join(
os.path.dirname(utils.__file__), 'conf.py.in'))
@@ -128,14 +136,18 @@ class CommandImportWordpress(Command):
@staticmethod
def download_url_content_to_file(url, dst_path):
- with open(dst_path, 'wb+') as fd:
- fd.write(requests.get(url).content)
+ try:
+ with open(dst_path, 'wb+') as fd:
+ fd.write(requests.get(url).content)
+ except requests.exceptions.ConnectionError as err:
+ print("Downloading %s to %s failed: %s" % (url, dst_path, err))
def import_attachment(self, item, wordpress_namespace):
- url = get_text_tag(item, '{%s}attachment_url' % wordpress_namespace, 'foo')
+ url = get_text_tag(
+ item, '{%s}attachment_url' % wordpress_namespace, 'foo')
link = get_text_tag(item, '{%s}link' % wordpress_namespace, 'foo')
path = urlparse(url).path
- dst_path = os.path.join(*(['new_site', 'files']
+ dst_path = os.path.join(*([self.output_folder, 'files']
+ list(path.split('/'))))
dst_dir = os.path.dirname(dst_path)
if not os.path.isdir(dst_dir):
@@ -147,23 +159,32 @@ class CommandImportWordpress(Command):
links[url] = '/' + dst_url
@staticmethod
- def write_content(filename, content):
+ def transform_sourcecode(content):
+ new_content = re.sub('\[sourcecode language="([^"]+)"\]',
+ "\n~~~~~~~~~~~~{.\\1}\n", content)
+ new_content = new_content.replace('[/sourcecode]',
+ "\n~~~~~~~~~~~~\n")
+ return new_content
+
+ @staticmethod
+ def transform_caption(content):
+ new_caption = re.sub(r'\[/caption\]', '', content)
+ new_caption = re.sub(r'\[caption.*\]', '', new_caption)
+
+ return new_caption
+
+ @classmethod
+ def transform_content(cls, content):
+ new_content = cls.transform_sourcecode(content)
+ return cls.transform_caption(new_content)
+
+ @classmethod
+ def write_content(cls, filename, content):
+ doc = html.document_fromstring(content)
+ doc.rewrite_links(replacer)
+
with open(filename, "wb+") as fd:
- if content.strip():
- # Handle sourcecode pseudo-tags
- content = re.sub('\[sourcecode language="([^"]+)"\]',
- "\n~~~~~~~~~~~~{.\\1}\n", content)
- content = content.replace('[/sourcecode]', "\n~~~~~~~~~~~~\n")
- doc = html.document_fromstring(content)
- doc.rewrite_links(replacer)
- # Replace H1 elements with H2 elements
- for tag in doc.findall('.//h1'):
- if not tag.text:
- print("Failed to fix bad title: %r" %
- html.tostring(tag))
- else:
- tag.getparent().replace(tag, builder.E.h2(tag.text))
- fd.write(html.tostring(doc, encoding='utf8'))
+ fd.write(html.tostring(doc, encoding='utf8'))
@staticmethod
def write_metadata(filename, title, slug, post_date, description, tags):
@@ -186,22 +207,30 @@ class CommandImportWordpress(Command):
link = get_text_tag(item, 'link', None)
slug = utils.slugify(urlparse(link).path)
if not slug: # it happens if the post has no "nice" URL
- slug = get_text_tag(item, '{%s}post_name' % wordpress_namespace, None)
+ slug = get_text_tag(
+ item, '{%s}post_name' % wordpress_namespace, None)
if not slug: # it *may* happen
- slug = get_text_tag(item, '{%s}post_id' % wordpress_namespace, None)
+ slug = get_text_tag(
+ item, '{%s}post_id' % wordpress_namespace, None)
if not slug: # should never happen
print("Error converting post:", title)
return
description = get_text_tag(item, 'description', '')
- post_date = get_text_tag(item, '{%s}post_date' % wordpress_namespace, None)
- status = get_text_tag(item, '{%s}status' % wordpress_namespace, 'publish')
+ post_date = get_text_tag(
+ item, '{%s}post_date' % wordpress_namespace, None)
+ status = get_text_tag(
+ item, '{%s}status' % wordpress_namespace, 'publish')
content = get_text_tag(
item, '{http://purl.org/rss/1.0/modules/content/}encoded', '')
tags = []
if status != 'publish':
tags.append('draft')
+ is_draft = True
+ else:
+ is_draft = False
+
for tag in item.findall('category'):
text = tag.text
if text == 'Uncategorized':
@@ -211,17 +240,28 @@ class CommandImportWordpress(Command):
self.url_map[link] = self.context['BLOG_URL'] + '/' + \
out_folder + '/' + slug + '.html'
- self.write_metadata(os.path.join('new_site', out_folder,
- slug + '.meta'),
- title, slug, post_date, description, tags)
- self.write_content(
- os.path.join('new_site', out_folder, slug + '.wp'), content)
+ if is_draft and self.exclude_drafts:
+ print('Draft "%s" will not be imported.' % (title, ))
+ elif content.strip():
+ # If no content is found, no files are written.
+ content = self.transform_content(content)
+
+ self.write_metadata(os.path.join(self.output_folder, out_folder,
+ slug + '.meta'),
+ title, slug, post_date, description, tags)
+ self.write_content(
+ os.path.join(self.output_folder, out_folder, slug + '.wp'),
+ content)
+ else:
+ print('Not going to import "%s" because it seems to contain'
+ ' no content.' % (title, ))
def process_item(self, item):
# The namespace usually is something like:
# http://wordpress.org/export/1.2/
wordpress_namespace = item.nsmap['wp']
- post_type = get_text_tag(item, '{%s}post_type' % wordpress_namespace, 'post')
+ post_type = get_text_tag(
+ item, '{%s}post_type' % wordpress_namespace, 'post')
if post_type == 'attachment':
self.import_attachment(item, wordpress_namespace)
@@ -241,32 +281,68 @@ class CommandImportWordpress(Command):
for item in url_map.items():
csv_writer.writerow(item)
+ def get_configuration_output_path(self):
+ if not self.import_into_existing_site:
+ filename = 'conf.py'
+ else:
+ filename = 'conf.py.wordpress_import-%s' % datetime.datetime.now(
+ ).strftime('%Y%m%d_%H%M%s')
+ config_output_path = os.path.join(self.output_folder, filename)
+ print('Configuration will be written to: %s' % config_output_path)
+
+ return config_output_path
+
@staticmethod
def write_configuration(filename, rendered_template):
with codecs.open(filename, 'w+', 'utf8') as fd:
fd.write(rendered_template)
- def run(self, fname=None):
+ def run(self, *arguments):
+ """Import a Wordpress blog from an export file into a Nikola site."""
# Parse the data
if requests is None:
- print('To use the import_wordpress command, you have to install the "requests" package.')
+ print('To use the import_wordpress command,'
+ ' you have to install the "requests" package.')
return
- if fname is None:
- print("Usage: nikola import_wordpress wordpress_dump.xml")
+
+ parser = OptionParser(usage="nikola %s [options] "
+ "wordpress_export_file" % self.name)
+ parser.add_option('-f', '--filename', dest='filename',
+ help='WordPress export file from which the import '
+ 'made.')
+ parser.add_option('-o', '--output-folder', dest='output_folder',
+ default='new_site', help='The location into which '
+ 'the imported content will be written')
+ parser.add_option('-d', '--no-drafts', dest='exclude_drafts',
+ default=False, action="store_true", help='Do not '
+ 'import drafts.')
+
+ (options, args) = parser.parse_args(list(arguments))
+
+ if not options.filename and args:
+ options.filename = args[0]
+
+ if not options.filename:
+ parser.print_usage()
return
+ self.wordpress_export_file = options.filename
+ self.output_folder = options.output_folder
+ self.import_into_existing_site = False
+ self.exclude_drafts = options.exclude_drafts
self.url_map = {}
- channel = self.get_channel_from_file(fname)
+ channel = self.get_channel_from_file(self.wordpress_export_file)
self.context = self.populate_context(channel)
- conf_template = self.generate_base_site(self.context)
+ conf_template = self.generate_base_site()
self.context['REDIRECTIONS'] = self.configure_redirections(
self.url_map)
self.import_posts(channel)
self.write_urlmap_csv(
- os.path.join('new_site', 'url_map.csv'), self.url_map)
- self.write_configuration(os.path.join(
- 'new_site', 'conf.py'), conf_template.render(**self.context))
+ os.path.join(self.output_folder, 'url_map.csv'), self.url_map)
+
+ self.write_configuration(self.get_configuration_output_path(
+ ), conf_template.render(**self.context))
def replacer(dst):