1 files changed, 140 insertions, 37 deletions
diff --git a/nikola/plugins/command/import_wordpress.py b/nikola/plugins/command/import_wordpress.py
index 69ef144..0b48583 100644
--- a/nikola/plugins/command/import_wordpress.py
+++ b/nikola/plugins/command/import_wordpress.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright © 2012-2015 Roberto Alsina and others.
+# Copyright © 2012-2016 Roberto Alsina and others.
 
 # Permission is hereby granted, free of charge, to any
 # person obtaining a copy of this software and associated
@@ -38,6 +38,11 @@ from lxml import etree
 from collections import defaultdict
 
 try:
+    import html2text
+except ImportError:
+    html2text = None  # optional dependency; its absence is reported when a Markdown option is requested
+
+try:
     from urlparse import urlparse
     from urllib import unquote
 except ImportError:
@@ -170,6 +175,20 @@ class CommandImportWordpress(Command, ImportMixin):
             'help': "Export comments as .wpcomment files",
         },
         {
+            'name': 'html2text',
+            'long': 'html2text',
+            'default': False,
+            'type': bool,
+            'help': "Uses html2text (needs to be installed with pip) to transform WordPress posts to MarkDown during import",
+        },
+        {
+            'name': 'transform_to_markdown',
+            'long': 'transform-to-markdown',
+            'default': False,
+            'type': bool,
+            'help': "Uses WordPress page compiler to transform WordPress posts to HTML and then use html2text to transform them to MarkDown during import",
+        },
+        {
             'name': 'transform_to_html',
             'long': 'transform-to-html',
             'default': False,
@@ -191,14 +210,35 @@ class CommandImportWordpress(Command, ImportMixin):
             'help': "Automatically installs the WordPress page compiler (either locally or in the new site) if required by other options.\nWarning: the compiler is GPL software!",
         },
         {
-            'name': 'tag_saniziting_strategy',
-            'long': 'tag-saniziting-strategy',
+            'name': 'tag_sanitizing_strategy',
+            'long': 'tag-sanitizing-strategy',
             'default': 'first',
             'help': 'lower: Convert all tag and category names to lower case\nfirst: Keep first spelling of tag or category name',
         },
+        {
+            'name': 'one_file',
+            'long': 'one-file',
+            'default': False,
+            'type': bool,
+            'help': "Save imported posts in the more modern one-file format.",
+        },
     ]
     all_tags = set([])
 
+    def _get_compiler(self):
+        """Return the WordPress page compiler if one was found, else the markdown compiler (None if absent)."""
+        self._find_wordpress_compiler()
+        if self.wordpress_page_compiler is not None:
+            return self.wordpress_page_compiler
+        markdown = self.site.plugin_manager.getPluginByName('markdown', 'PageCompiler')
+        if markdown is None:
+            LOGGER.error("Can't find markdown post compiler.")
+            return None
+        if not markdown.is_activated:  # activate on first use and bind the plugin to this site
+            self.site.plugin_manager.activatePluginByName(markdown.name)
+            markdown.plugin_object.set_site(self.site)
+        return markdown.plugin_object
+
     def _find_wordpress_compiler(self):
         """Find WordPress compiler plugin."""
         if self.wordpress_page_compiler is not None:
@@ -223,6 +263,8 @@ class CommandImportWordpress(Command, ImportMixin):
                         'putting these arguments before the filename if you '
                         'are running into problems.'.format(args))
 
+        self.onefile = options.get('one_file', False)
+
         self.import_into_existing_site = False
         self.url_map = {}
         self.timezone = None
@@ -239,6 +281,9 @@ class CommandImportWordpress(Command, ImportMixin):
         self.export_categories_as_categories = options.get('export_categories_as_categories', False)
         self.export_comments = options.get('export_comments', False)
 
+        self.html2text = options.get('html2text', False)
+        self.transform_to_markdown = options.get('transform_to_markdown', False)
+
         self.transform_to_html = options.get('transform_to_html', False)
         self.use_wordpress_compiler = options.get('use_wordpress_compiler', False)
         self.install_wordpress_compiler = options.get('install_wordpress_compiler', False)
@@ -257,10 +302,18 @@ class CommandImportWordpress(Command, ImportMixin):
         self.separate_qtranslate_content = options.get('separate_qtranslate_content')
         self.translations_pattern = options.get('translations_pattern')
 
-        if self.transform_to_html and self.use_wordpress_compiler:
-            LOGGER.warn("It does not make sense to combine --transform-to-html with --use-wordpress-compiler, as the first converts all posts to HTML and the latter option affects zero posts.")
+        count = (1 if self.html2text else 0) + (1 if self.transform_to_html else 0) + (1 if self.transform_to_markdown else 0)
+        if count > 1:
+            LOGGER.error("You can use at most one of the options --html2text, --transform-to-html and --transform-to-markdown.")
+            return False
+        if (self.html2text or self.transform_to_html or self.transform_to_markdown) and self.use_wordpress_compiler:
+            LOGGER.warn("It does not make sense to combine --use-wordpress-compiler with any of --html2text, --transform-to-html and --transform-to-markdown, as the latter convert all posts to HTML and the first option then affects zero posts.")
+
+        if (self.html2text or self.transform_to_markdown) and not html2text:
+            LOGGER.error("You need to install html2text via 'pip install html2text' before you can use the --html2text and --transform-to-markdown options.")
+            return False
 
-        if self.transform_to_html:
+        if self.transform_to_html or self.transform_to_markdown:
             self._find_wordpress_compiler()
             if not self.wordpress_page_compiler and self.install_wordpress_compiler:
                 if not install_plugin(self.site, 'wordpress_compiler', output_dir='plugins'):  # local install
@@ -334,7 +387,7 @@ class CommandImportWordpress(Command, ImportMixin):
         self.context['TRANSLATIONS'] = format_default_translations_config(
             self.extra_languages)
         self.context['REDIRECTIONS'] = self.configure_redirections(
-            self.url_map)
+            self.url_map, self.base_dir)
         if self.timezone:
             self.context['TIMEZONE'] = self.timezone
         if self.export_categories_as_categories:
@@ -350,7 +403,7 @@ class CommandImportWordpress(Command, ImportMixin):
                     tag_str = tag
             except AttributeError:
                 tag_str = tag
-            tag = utils.slugify(tag_str)
+            tag = utils.slugify(tag_str, self.lang)
             src_url = '{}tag/{}'.format(self.context['SITE_URL'], tag)
             dst_url = self.site.link('tag', tag)
             if src_url != dst_url:
@@ -382,7 +435,7 @@ class CommandImportWordpress(Command, ImportMixin):
                 if b'<atom:link rel=' in line:
                     continue
                 xml.append(line)
-        return b'\n'.join(xml)
+        return b''.join(xml)
 
     @classmethod
     def get_channel_from_file(cls, filename):
@@ -396,7 +449,8 @@ class CommandImportWordpress(Command, ImportMixin):
         wordpress_namespace = channel.nsmap['wp']
 
         context = SAMPLE_CONF.copy()
-        context['DEFAULT_LANG'] = get_text_tag(channel, 'language', 'en')[:2]
+        self.lang = get_text_tag(channel, 'language', 'en')[:2]
+        context['DEFAULT_LANG'] = self.lang
         context['TRANSLATIONS_PATTERN'] = DEFAULT_TRANSLATIONS_PATTERN
         context['BLOG_TITLE'] = get_text_tag(channel, 'title',
                                              'PUT TITLE HERE')
@@ -428,7 +482,7 @@ class CommandImportWordpress(Command, ImportMixin):
         PAGES = '(\n'
         for extension in extensions:
             POSTS += '    ("posts/*.{0}", "posts", "post.tmpl"),\n'.format(extension)
-            PAGES += '    ("stories/*.{0}", "stories", "story.tmpl"),\n'.format(extension)
+            PAGES += '    ("pages/*.{0}", "pages", "story.tmpl"),\n'.format(extension)
         POSTS += ')\n'
         PAGES += ')\n'
         context['POSTS'] = POSTS
@@ -446,9 +500,6 @@ class CommandImportWordpress(Command, ImportMixin):
 
     def download_url_content_to_file(self, url, dst_path):
         """Download some content (attachments) to a file."""
-        if self.no_downloads:
-            return
-
         try:
             request = requests.get(url, auth=self.auth)
             if request.status_code >= 400:
@@ -468,10 +519,13 @@ class CommandImportWordpress(Command, ImportMixin):
                             'foo')
         path = urlparse(url).path
         dst_path = os.path.join(*([self.output_folder, 'files'] + list(path.split('/'))))
-        dst_dir = os.path.dirname(dst_path)
-        utils.makedirs(dst_dir)
-        LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
-        self.download_url_content_to_file(url, dst_path)
+        if self.no_downloads:
+            LOGGER.info("Skipping downloading {0} => {1}".format(url, dst_path))
+        else:
+            dst_dir = os.path.dirname(dst_path)
+            utils.makedirs(dst_dir)
+            LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
+            self.download_url_content_to_file(url, dst_path)
         dst_url = '/'.join(dst_path.split(os.sep)[2:])
         links[link] = '/' + dst_url
         links[url] = '/' + dst_url
@@ -517,6 +571,8 @@ class CommandImportWordpress(Command, ImportMixin):
 
                     if meta_key in metadata:
                         image_meta = metadata[meta_key]
+                        if not image_meta:
+                            continue
                         dst_meta = {}
 
                         def add(our_key, wp_key, is_int=False, ignore_zero=False, is_float=False):
@@ -562,15 +618,18 @@ class CommandImportWordpress(Command, ImportMixin):
                         meta = {}
                         meta['size'] = size.decode('utf-8')
                         if width_key in metadata[size_key][size] and height_key in metadata[size_key][size]:
-                            meta['width'] = metadata[size_key][size][width_key]
-                            meta['height'] = metadata[size_key][size][height_key]
+                            meta['width'] = int(metadata[size_key][size][width_key])
+                            meta['height'] = int(metadata[size_key][size][height_key])
 
                         path = urlparse(url).path
                         dst_path = os.path.join(*([self.output_folder, 'files'] + list(path.split('/'))))
-                        dst_dir = os.path.dirname(dst_path)
-                        utils.makedirs(dst_dir)
-                        LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
-                        self.download_url_content_to_file(url, dst_path)
+                        if self.no_downloads:
+                            LOGGER.info("Skipping downloading {0} => {1}".format(url, dst_path))
+                        else:
+                            dst_dir = os.path.dirname(dst_path)
+                            utils.makedirs(dst_dir)
+                            LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
+                            self.download_url_content_to_file(url, dst_path)
                         dst_url = '/'.join(dst_path.split(os.sep)[2:])
                         links[url] = '/' + dst_url
 
@@ -638,10 +697,10 @@ class CommandImportWordpress(Command, ImportMixin):
         return content
 
     @staticmethod
-    def transform_caption(content):
+    def transform_caption(content, use_html=False):
         """Transform captions."""
-        new_caption = re.sub(r'\[/caption\]', '', content)
-        new_caption = re.sub(r'\[caption.*\]', '', new_caption)
+        new_caption = re.sub(r'\[/caption\]', '</h1>' if use_html else '', content)
+        new_caption = re.sub(r'\[caption.*?\]', '<h1>' if use_html else '', new_caption)  # non-greedy: stop at the tag's own "]" so text between captions survives
 
         return new_caption
 
@@ -664,6 +723,26 @@ class CommandImportWordpress(Command, ImportMixin):
                 except TypeError:  # old versions of the plugin don't support the additional argument
                     content = self.wordpress_page_compiler.compile_to_string(content)
                 return content, 'html', True
+            elif self.transform_to_markdown:
+                # First convert to HTML with WordPress plugin
+                additional_data = {}
+                if attachments is not None:
+                    additional_data['attachments'] = attachments
+                try:
+                    content = self.wordpress_page_compiler.compile_to_string(content, additional_data=additional_data)
+                except TypeError:  # old versions of the plugin don't support the additional argument
+                    content = self.wordpress_page_compiler.compile_to_string(content)
+                # Now convert to MarkDown with html2text
+                h = html2text.HTML2Text()
+                content = h.handle(content)
+                return content, 'md', False
+            elif self.html2text:
+                # TODO: what to do with [code] blocks?
+                # content = self.transform_code(content)
+                content = self.transform_caption(content, use_html=True)
+                h = html2text.HTML2Text()
+                content = h.handle(content)
+                return content, 'md', False
             elif self.use_wordpress_compiler:
                 return content, 'wp', False
             else:
@@ -781,6 +860,12 @@ class CommandImportWordpress(Command, ImportMixin):
             out_folder = 'posts'
 
         title = get_text_tag(item, 'title', 'NO TITLE')
+
+        # Titles can contain line breaks, particularly when they are
+        # created by third-party tools that post to WordPress.
+        # Handle Windows-style and Unix-style line endings.
+        title = title.replace('\r\n', ' ').replace('\n', ' ')
+
         # link is something like http://foo.com/2012/09/01/hello-world/
         # So, take the path, utils.slugify it, and that's our slug
         link = get_text_tag(item, 'link', None)
@@ -813,7 +898,7 @@ class CommandImportWordpress(Command, ImportMixin):
         else:
             if len(pathlist) > 1:
                 out_folder = os.path.join(*([out_folder] + pathlist[:-1]))
-            slug = utils.slugify(pathlist[-1])
+            slug = utils.slugify(pathlist[-1], self.lang)
 
         description = get_text_tag(item, 'description', '')
         post_date = get_text_tag(
@@ -928,14 +1013,32 @@ class CommandImportWordpress(Command, ImportMixin):
                     meta_slug = slug
                 tags, other_meta = self._create_metadata(status, excerpt, tags, categories,
                                                          post_name=os.path.join(out_folder, slug))
-                self.write_metadata(os.path.join(self.output_folder, out_folder,
-                                                 out_meta_filename),
-                                    title, meta_slug, post_date, description, tags, **other_meta)
-                self.write_content(
-                    os.path.join(self.output_folder,
-                                 out_folder, out_content_filename),
-                    content,
-                    rewrite_html)
+
+                meta = {
+                    "title": title,
+                    "slug": meta_slug,
+                    "date": post_date,
+                    "description": description,
+                    "tags": ','.join(tags),
+                }
+                meta.update(other_meta)
+                if self.onefile:
+                    self.write_post(
+                        os.path.join(self.output_folder,
+                                     out_folder, out_content_filename),
+                        content,
+                        meta,
+                        self._get_compiler(),
+                        rewrite_html)
+                else:
+                    self.write_metadata(os.path.join(self.output_folder, out_folder,
+                                                     out_meta_filename),
+                                        title, meta_slug, post_date, description, tags, **other_meta)
+                    self.write_content(
+                        os.path.join(self.output_folder,
+                                     out_folder, out_content_filename),
+                        content,
+                        rewrite_html)
 
             if self.export_comments:
                 comments = []
@@ -995,7 +1098,7 @@ class CommandImportWordpress(Command, ImportMixin):
             if post_type == 'post':
                 out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'posts', attachments)
             else:
-                out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'stories', attachments)
+                out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'pages', attachments)
             # Process attachment data
             if attachments is not None:
                 # If post was exported, store data