1 files changed, 121 insertions, 45 deletions
diff --git a/nikola/plugins/command_import_wordpress.py b/nikola/plugins/command_import_wordpress.py
index 1552da4..07028d8 100644
--- a/nikola/plugins/command_import_wordpress.py
+++ b/nikola/plugins/command_import_wordpress.py
@@ -25,20 +25,23 @@
 from __future__ import unicode_literals, print_function
 import codecs
 import csv
+import datetime
 import os
 import re
+from optparse import OptionParser
+
 try:
     from urlparse import urlparse
 except ImportError:
-    from urllib.parse import urlparse
+    from urllib.parse import urlparse  # NOQA
 
-from lxml import etree, html, builder
+from lxml import etree, html
 from mako.template import Template
 
 try:
     import requests
 except ImportError:
-    requests = None
+    requests = None  # NOQA
 
 from nikola.plugin_categories import Command
 from nikola import utils
@@ -85,9 +88,14 @@ class CommandImportWordpress(Command):
 
         return redirections
 
-    @staticmethod
-    def generate_base_site(context):
-        os.system('nikola init new_site')
+    def generate_base_site(self):
+        if not os.path.exists(self.output_folder):
+            os.system('nikola init --empty %s' % (self.output_folder, ))
+        else:
+            self.import_into_existing_site = True
+            print('The folder %s already exists - assuming that this is a '
+                  'already existing nikola site.' % self.output_folder)
+
         conf_template = Template(filename=os.path.join(
             os.path.dirname(utils.__file__), 'conf.py.in'))
 
@@ -128,14 +136,18 @@ class CommandImportWordpress(Command):
 
     @staticmethod
     def download_url_content_to_file(url, dst_path):
-        with open(dst_path, 'wb+') as fd:
-            fd.write(requests.get(url).content)
+        try:
+            with open(dst_path, 'wb+') as fd:
+                fd.write(requests.get(url).content)
+        except requests.exceptions.ConnectionError as err:
+            print("Downloading %s to %s failed: %s" % (url, dst_path, err))
 
     def import_attachment(self, item, wordpress_namespace):
-        url = get_text_tag(item, '{%s}attachment_url' % wordpress_namespace, 'foo')
+        url = get_text_tag(
+            item, '{%s}attachment_url' % wordpress_namespace, 'foo')
         link = get_text_tag(item, '{%s}link' % wordpress_namespace, 'foo')
         path = urlparse(url).path
-        dst_path = os.path.join(*(['new_site', 'files']
+        dst_path = os.path.join(*([self.output_folder, 'files']
                                   + list(path.split('/'))))
         dst_dir = os.path.dirname(dst_path)
         if not os.path.isdir(dst_dir):
@@ -147,23 +159,32 @@ class CommandImportWordpress(Command):
         links[url] = '/' + dst_url
 
     @staticmethod
-    def write_content(filename, content):
+    def transform_sourcecode(content):
+        """Convert [sourcecode] shortcodes into ~~~ fenced code blocks."""
+        # Raw pattern: '\[' in a plain literal is an invalid string escape.
+        new_content = re.sub(r'\[sourcecode language="([^"]+)"\]',
+                             "\n~~~~~~~~~~~~{.\\1}\n", content)
+        return new_content.replace('[/sourcecode]', "\n~~~~~~~~~~~~\n")
+
+    @staticmethod
+    def transform_caption(content):
+        """Strip WordPress [caption ...] and [/caption] shortcode tags."""
+        without_closing = re.sub(r'\[/caption\]', '', content)
+        # '[^\]]*' stops at the tag's own ']'; '.*' ran on to the last one.
+        return re.sub(r'\[caption[^\]]*\]', '', without_closing)
+
+    @classmethod
+    def transform_content(cls, content):
+        """Run the sourcecode transformation, then the caption one."""
+        return cls.transform_caption(cls.transform_sourcecode(content))
+
+    @classmethod
+    def write_content(cls, filename, content):
+        """Rewrite links in *content* and save it to *filename* as UTF-8."""
+        doc = html.document_fromstring(content)
+        doc.rewrite_links(replacer)
         with open(filename, "wb+") as fd:
-            if content.strip():
-                # Handle sourcecode pseudo-tags
-                content = re.sub('\[sourcecode language="([^"]+)"\]',
-                                 "\n~~~~~~~~~~~~{.\\1}\n", content)
-                content = content.replace('[/sourcecode]', "\n~~~~~~~~~~~~\n")
-                doc = html.document_fromstring(content)
-                doc.rewrite_links(replacer)
-                # Replace H1 elements with H2 elements
-                for tag in doc.findall('.//h1'):
-                    if not tag.text:
-                        print("Failed to fix bad title: %r" %
-                              html.tostring(tag))
-                    else:
-                        tag.getparent().replace(tag, builder.E.h2(tag.text))
-                fd.write(html.tostring(doc, encoding='utf8'))
+            fd.write(html.tostring(doc, encoding='utf8'))
 
     @staticmethod
     def write_metadata(filename, title, slug, post_date, description, tags):
@@ -186,22 +207,30 @@ class CommandImportWordpress(Command):
         link = get_text_tag(item, 'link', None)
         slug = utils.slugify(urlparse(link).path)
         if not slug:  # it happens if the post has no "nice" URL
-            slug = get_text_tag(item, '{%s}post_name' % wordpress_namespace, None)
+            slug = get_text_tag(
+                item, '{%s}post_name' % wordpress_namespace, None)
         if not slug:  # it *may* happen
-            slug = get_text_tag(item, '{%s}post_id' % wordpress_namespace, None)
+            slug = get_text_tag(
+                item, '{%s}post_id' % wordpress_namespace, None)
         if not slug:  # should never happen
             print("Error converting post:", title)
             return
 
         description = get_text_tag(item, 'description', '')
-        post_date = get_text_tag(item, '{%s}post_date' % wordpress_namespace, None)
-        status = get_text_tag(item, '{%s}status' % wordpress_namespace, 'publish')
+        post_date = get_text_tag(
+            item, '{%s}post_date' % wordpress_namespace, None)
+        status = get_text_tag(
+            item, '{%s}status' % wordpress_namespace, 'publish')
         content = get_text_tag(
             item, '{http://purl.org/rss/1.0/modules/content/}encoded', '')
 
         tags = []
         if status != 'publish':
             tags.append('draft')
+            is_draft = True
+        else:
+            is_draft = False
+
         for tag in item.findall('category'):
             text = tag.text
             if text == 'Uncategorized':
@@ -211,17 +240,28 @@ class CommandImportWordpress(Command):
         self.url_map[link] = self.context['BLOG_URL'] + '/' + \
             out_folder + '/' + slug + '.html'
 
-        self.write_metadata(os.path.join('new_site', out_folder,
-                                         slug + '.meta'),
-                            title, slug, post_date, description, tags)
-        self.write_content(
-            os.path.join('new_site', out_folder, slug + '.wp'), content)
+        if is_draft and self.exclude_drafts:
+            print('Draft "%s" will not be imported.' % (title, ))
+        elif content.strip():
+            # If no content is found, no files are written.
+            content = self.transform_content(content)
+
+            self.write_metadata(os.path.join(self.output_folder, out_folder,
+                                             slug + '.meta'),
+                                title, slug, post_date, description, tags)
+            self.write_content(
+                os.path.join(self.output_folder, out_folder, slug + '.wp'),
+                content)
+        else:
+            print('Not going to import "%s" because it seems to contain'
+                  ' no content.' % (title, ))
 
     def process_item(self, item):
         # The namespace usually is something like:
         # http://wordpress.org/export/1.2/
         wordpress_namespace = item.nsmap['wp']
-        post_type = get_text_tag(item, '{%s}post_type' % wordpress_namespace, 'post')
+        post_type = get_text_tag(
+            item, '{%s}post_type' % wordpress_namespace, 'post')
 
         if post_type == 'attachment':
             self.import_attachment(item, wordpress_namespace)
@@ -241,32 +281,68 @@ class CommandImportWordpress(Command):
             for item in url_map.items():
                 csv_writer.writerow(item)
 
+    def get_configuration_output_path(self):
+        """Return the path the generated configuration is written to."""
+        filename = 'conf.py'
+        if self.import_into_existing_site:
+            # Never clobber an existing conf.py: use a timestamped name.
+            stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+            filename = 'conf.py.wordpress_import-%s' % stamp
+        config_output_path = os.path.join(self.output_folder, filename)
+        print('Configuration will be written to: %s' % config_output_path)
+        return config_output_path
+
     @staticmethod
     def write_configuration(filename, rendered_template):
         with codecs.open(filename, 'w+', 'utf8') as fd:
             fd.write(rendered_template)
 
-    def run(self, fname=None):
+    def run(self, *arguments):
+        """Import a Wordpress blog from an export file into a Nikola site."""
         # Parse the data
         if requests is None:
-            print('To use the import_wordpress command, you have to install the "requests" package.')
+            print('To use the import_wordpress command,'
+                  ' you have to install the "requests" package.')
             return
-        if fname is None:
-            print("Usage: nikola import_wordpress wordpress_dump.xml")
+
+        parser = OptionParser(usage="nikola %s [options] "
+                              "wordpress_export_file" % self.name)
+        parser.add_option('-f', '--filename', dest='filename',
+                          help='WordPress export file from which the import '
+                          'is made.')
+        parser.add_option('-o', '--output-folder', dest='output_folder',
+                          default='new_site', help='The location into which '
+                          'the imported content will be written')
+        parser.add_option('-d', '--no-drafts', dest='exclude_drafts',
+                          default=False, action="store_true", help='Do not '
+                          'import drafts.')
+
+        (options, args) = parser.parse_args(list(arguments))
+
+        if not options.filename and args:
+            options.filename = args[0]
+
+        if not options.filename:
+            parser.print_usage()
             return
 
+        self.wordpress_export_file = options.filename
+        self.output_folder = options.output_folder
+        self.import_into_existing_site = False
+        self.exclude_drafts = options.exclude_drafts
         self.url_map = {}
-        channel = self.get_channel_from_file(fname)
+        channel = self.get_channel_from_file(self.wordpress_export_file)
         self.context = self.populate_context(channel)
-        conf_template = self.generate_base_site(self.context)
+        conf_template = self.generate_base_site()
         self.context['REDIRECTIONS'] = self.configure_redirections(
             self.url_map)
 
         self.import_posts(channel)
         self.write_urlmap_csv(
-            os.path.join('new_site', 'url_map.csv'), self.url_map)
-        self.write_configuration(os.path.join(
-            'new_site', 'conf.py'), conf_template.render(**self.context))
+            os.path.join(self.output_folder, 'url_map.csv'), self.url_map)
+
+        self.write_configuration(self.get_configuration_output_path(
+        ), conf_template.render(**self.context))
 
 
 def replacer(dst):