1 files changed, 337 insertions, 143 deletions
diff --git a/nikola/plugins/command/import_wordpress.py b/nikola/plugins/command/import_wordpress.py
index a652ec8..5e2aee6 100644
--- a/nikola/plugins/command/import_wordpress.py
+++ b/nikola/plugins/command/import_wordpress.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright © 2012-2015 Roberto Alsina and others.
+# Copyright © 2012-2020 Roberto Alsina and others.
 
 # Permission is hereby granted, free of charge, to any
 # person obtaining a copy of this software and associated
@@ -26,41 +26,45 @@
 
 """Import a WordPress dump."""
 
-from __future__ import unicode_literals, print_function
-import os
-import re
-import sys
 import datetime
 import io
 import json
+import os
+import re
+import sys
+from collections import defaultdict
+from urllib.parse import urlparse, unquote
+
 import requests
 from lxml import etree
-from collections import defaultdict
+
+from nikola.plugin_categories import Command
+from nikola import utils, hierarchy_utils
+from nikola.nikola import DEFAULT_TRANSLATIONS_PATTERN
+from nikola.utils import req_missing
+from nikola.plugins.basic_import import ImportMixin, links
+from nikola.plugins.command.init import (
+    SAMPLE_CONF, prepare_config,
+    format_default_translations_config,
+    get_default_translations_dict
+)
 
 try:
-    from urlparse import urlparse
-    from urllib import unquote
+    import html2text
 except ImportError:
-    from urllib.parse import urlparse, unquote  # NOQA
+    html2text = None
 
 try:
     import phpserialize
 except ImportError:
-    phpserialize = None  # NOQA
+    phpserialize = None
 
-from nikola.plugin_categories import Command
-from nikola import utils
-from nikola.utils import req_missing
-from nikola.plugins.basic_import import ImportMixin, links
-from nikola.nikola import DEFAULT_TRANSLATIONS_PATTERN
-from nikola.plugins.command.init import SAMPLE_CONF, prepare_config, format_default_translations_config
-
-LOGGER = utils.get_logger('import_wordpress', utils.STDERR_HANDLER)
+LOGGER = utils.get_logger('import_wordpress')
 
 
 def install_plugin(site, plugin_name, output_dir=None, show_install_notes=False):
     """Install a Nikola plugin."""
-    LOGGER.notice("Installing plugin '{0}'".format(plugin_name))
+    LOGGER.info("Installing plugin '{0}'".format(plugin_name))
     # Get hold of the 'plugin' plugin
     plugin_installer_info = site.plugin_manager.getPluginByName('plugin', 'Command')
     if plugin_installer_info is None:
@@ -88,7 +92,6 @@ def install_plugin(site, plugin_name, output_dir=None, show_install_notes=False)
 
 
 class CommandImportWordpress(Command, ImportMixin):
-
     """Import a WordPress dump."""
 
     name = "import_wordpress"
@@ -144,15 +147,22 @@ class CommandImportWordpress(Command, ImportMixin):
             'long': 'qtranslate',
             'default': False,
             'type': bool,
-            'help': "Look for translations generated by qtranslate plugin",
-            # WARNING: won't recover translated titles that actually
-            # don't seem to be part of the wordpress XML export at the
-            # time of writing :(
+            'help': """Look for translations generated by qtranslate plugin.
+WARNING: a default wordpress export won't allow to recover title translations.
+For this to be possible consider applying the hack suggested at
+https://github.com/qtranslate/qtranslate-xt/issues/199 :
+
+In wp-admin/includes/export.php change
+`echo apply_filters( 'the_title_rss', $post->post_title );
+
+to
+`echo apply_filters( 'the_title_export', $post->post_title );
+"""
         },
         {
             'name': 'translations_pattern',
             'long': 'translations_pattern',
-            'default': None,
+            'default': DEFAULT_TRANSLATIONS_PATTERN,
             'type': str,
             'help': "The pattern for translation files names",
         },
@@ -171,6 +181,20 @@ class CommandImportWordpress(Command, ImportMixin):
             'help': "Export comments as .wpcomment files",
         },
         {
+            'name': 'html2text',
+            'long': 'html2text',
+            'default': False,
+            'type': bool,
+            'help': "Uses html2text (needs to be installed with pip) to transform WordPress posts to MarkDown during import",
+        },
+        {
+            'name': 'transform_to_markdown',
+            'long': 'transform-to-markdown',
+            'default': False,
+            'type': bool,
+            'help': "Uses WordPress page compiler to transform WordPress posts to HTML and then use html2text to transform them to MarkDown during import",
+        },
+        {
             'name': 'transform_to_html',
             'long': 'transform-to-html',
             'default': False,
@@ -191,9 +215,36 @@ class CommandImportWordpress(Command, ImportMixin):
             'type': bool,
             'help': "Automatically installs the WordPress page compiler (either locally or in the new site) if required by other options.\nWarning: the compiler is GPL software!",
         },
+        {
+            'name': 'tag_sanitizing_strategy',
+            'long': 'tag-sanitizing-strategy',
+            'default': 'first',
+            'help': 'lower: Convert all tag and category names to lower case\nfirst: Keep first spelling of tag or category name',
+        },
+        {
+            'name': 'one_file',
+            'long': 'one-file',
+            'default': False,
+            'type': bool,
+            'help': "Save imported posts in the more modern one-file format.",
+        },
     ]
     all_tags = set([])
 
+    def _get_compiler(self):
+        """Return the WordPress page compiler if it can be found, else the markdown compiler."""
+        self._find_wordpress_compiler()
+        if self.wordpress_page_compiler is not None:
+            return self.wordpress_page_compiler
+        markdown_info = self.site.plugin_manager.getPluginByName('markdown', 'PageCompiler')
+        if markdown_info is None:
+            LOGGER.error("Can't find markdown post compiler.")
+            return None
+        if not markdown_info.is_activated:
+            self.site.plugin_manager.activatePluginByName(markdown_info.name)
+            markdown_info.plugin_object.set_site(self.site)
+        return markdown_info.plugin_object
+
     def _find_wordpress_compiler(self):
         """Find WordPress compiler plugin."""
         if self.wordpress_page_compiler is not None:
@@ -214,9 +265,11 @@ class CommandImportWordpress(Command, ImportMixin):
             options['output_folder'] = args.pop(0)
 
         if args:
-            LOGGER.warn('You specified additional arguments ({0}). Please consider '
-                        'putting these arguments before the filename if you '
-                        'are running into problems.'.format(args))
+            LOGGER.warning('You specified additional arguments ({0}). Please consider '
+                           'putting these arguments before the filename if you '
+                           'are running into problems.'.format(args))
+
+        self.onefile = options.get('one_file', False)
 
         self.import_into_existing_site = False
         self.url_map = {}
@@ -234,11 +287,16 @@ class CommandImportWordpress(Command, ImportMixin):
         self.export_categories_as_categories = options.get('export_categories_as_categories', False)
         self.export_comments = options.get('export_comments', False)
 
+        self.html2text = options.get('html2text', False)
+        self.transform_to_markdown = options.get('transform_to_markdown', False)
+
         self.transform_to_html = options.get('transform_to_html', False)
         self.use_wordpress_compiler = options.get('use_wordpress_compiler', False)
         self.install_wordpress_compiler = options.get('install_wordpress_compiler', False)
         self.wordpress_page_compiler = None
 
+        self.tag_saniziting_strategy = options.get('tag_sanitizing_strategy', 'first')  # key must match the declared option name
+
         self.auth = None
         if options.get('download_auth') is not None:
             username_password = options.get('download_auth')
@@ -250,10 +308,18 @@ class CommandImportWordpress(Command, ImportMixin):
         self.separate_qtranslate_content = options.get('separate_qtranslate_content')
         self.translations_pattern = options.get('translations_pattern')
 
-        if self.transform_to_html and self.use_wordpress_compiler:
-            LOGGER.warn("It does not make sense to combine --transform-to-html with --use-wordpress-compiler, as the first converts all posts to HTML and the latter option affects zero posts.")
+        count = (1 if self.html2text else 0) + (1 if self.transform_to_html else 0) + (1 if self.transform_to_markdown else 0)
+        if count > 1:
+            LOGGER.error("You can use at most one of the options --html2text, --transform-to-html and --transform-to-markdown.")
+            return False
+        if (self.html2text or self.transform_to_html or self.transform_to_markdown) and self.use_wordpress_compiler:
+            LOGGER.warning("It does not make sense to combine --use-wordpress-compiler with any of --html2text, --transform-to-html and --transform-to-markdown, as the latter convert all posts to HTML and the first option then affects zero posts.")
+
+        if (self.html2text or self.transform_to_markdown) and not html2text:
+            LOGGER.error("You need to install html2text via 'pip install html2text' before you can use the --html2text and --transform-to-markdown options.")
+            return False
 
-        if self.transform_to_html:
+        if self.transform_to_html or self.transform_to_markdown:
             self._find_wordpress_compiler()
             if not self.wordpress_page_compiler and self.install_wordpress_compiler:
                 if not install_plugin(self.site, 'wordpress_compiler', output_dir='plugins'):  # local install
@@ -279,14 +345,14 @@ class CommandImportWordpress(Command, ImportMixin):
                 # cat_id = get_text_tag(cat, '{{{0}}}term_id'.format(wordpress_namespace), None)
                 cat_slug = get_text_tag(cat, '{{{0}}}category_nicename'.format(wordpress_namespace), None)
                 cat_parent_slug = get_text_tag(cat, '{{{0}}}category_parent'.format(wordpress_namespace), None)
-                cat_name = get_text_tag(cat, '{{{0}}}cat_name'.format(wordpress_namespace), None)
+                cat_name = utils.html_unescape(get_text_tag(cat, '{{{0}}}cat_name'.format(wordpress_namespace), None))
                 cat_path = [cat_name]
                 if cat_parent_slug in cat_map:
                     cat_path = cat_map[cat_parent_slug] + cat_path
                 cat_map[cat_slug] = cat_path
             self._category_paths = dict()
             for cat, path in cat_map.items():
-                self._category_paths[cat] = utils.join_hierarchical_category_path(path)
+                self._category_paths[cat] = hierarchy_utils.join_hierarchical_category_path(path)
 
     def _execute(self, options={}, args=[]):
         """Import a WordPress blog from an export file into a Nikola site."""
@@ -313,21 +379,16 @@ class CommandImportWordpress(Command, ImportMixin):
             if phpserialize is None:
                 req_missing(['phpserialize'], 'import WordPress dumps without --no-downloads')
 
-        channel = self.get_channel_from_file(self.wordpress_export_file)
+        export_file_preprocessor = modernize_qtranslate_tags if self.separate_qtranslate_content else None
+        channel = self.get_channel_from_file(self.wordpress_export_file, export_file_preprocessor)
         self._prepare(channel)
         conf_template = self.generate_base_site()
 
-        # If user  has specified a custom pattern for translation files we
-        # need to fix the config
-        if self.translations_pattern:
-            self.context['TRANSLATIONS_PATTERN'] = self.translations_pattern
-
         self.import_posts(channel)
-
         self.context['TRANSLATIONS'] = format_default_translations_config(
             self.extra_languages)
         self.context['REDIRECTIONS'] = self.configure_redirections(
-            self.url_map)
+            self.url_map, self.base_dir)
         if self.timezone:
             self.context['TIMEZONE'] = self.timezone
         if self.export_categories_as_categories:
@@ -337,10 +398,13 @@ class CommandImportWordpress(Command, ImportMixin):
         # Add tag redirects
         for tag in self.all_tags:
             try:
-                tag_str = tag.decode('utf8')
+                if isinstance(tag, bytes):
+                    tag_str = tag.decode('utf8', 'replace')
+                else:
+                    tag_str = tag
             except AttributeError:
                 tag_str = tag
-            tag = utils.slugify(tag_str)
+            tag = utils.slugify(tag_str, self.lang)
             src_url = '{}tag/{}'.format(self.context['SITE_URL'], tag)
             dst_url = self.site.link('tag', tag)
             if src_url != dst_url:
@@ -357,9 +421,9 @@ class CommandImportWordpress(Command, ImportMixin):
                 if not install_plugin(self.site, 'wordpress_compiler', output_dir=os.path.join(self.output_folder, 'plugins')):
                     return False
             else:
-                LOGGER.warn("Make sure to install the WordPress page compiler via")
-                LOGGER.warn("    nikola plugin -i wordpress_compiler")
-                LOGGER.warn("in your imported blog's folder ({0}), if you haven't installed it system-wide or user-wide. Otherwise, your newly imported blog won't compile.".format(self.output_folder))
+                LOGGER.warning("Make sure to install the WordPress page compiler via")
+                LOGGER.warning("    nikola plugin -i wordpress_compiler")
+                LOGGER.warning("in your imported blog's folder ({0}), if you haven't installed it system-wide or user-wide. Otherwise, your newly imported blog won't compile.".format(self.output_folder))
 
     @classmethod
     def read_xml_file(cls, filename):
@@ -372,12 +436,19 @@ class CommandImportWordpress(Command, ImportMixin):
                 if b'<atom:link rel=' in line:
                     continue
                 xml.append(line)
-        return b'\n'.join(xml)
+        return b''.join(xml)
 
     @classmethod
-    def get_channel_from_file(cls, filename):
-        """Get channel from XML file."""
-        tree = etree.fromstring(cls.read_xml_file(filename))
+    def get_channel_from_file(cls, filename, xml_preprocessor=None):
+        """Get channel from XML file.
+
+        The optional 'xml_preprocessor' callable is given the XML read from the file
+        and must return the XML to parse (e.g. to normalize tags injected by a WP plugin).
+        """
+        xml_string = cls.read_xml_file(filename)
+        if xml_preprocessor:
+            xml_string = xml_preprocessor(xml_string)
+        tree = etree.fromstring(xml_string)
         channel = tree.find('channel')
         return channel
 
@@ -386,8 +457,12 @@ class CommandImportWordpress(Command, ImportMixin):
         wordpress_namespace = channel.nsmap['wp']
 
         context = SAMPLE_CONF.copy()
-        context['DEFAULT_LANG'] = get_text_tag(channel, 'language', 'en')[:2]
-        context['TRANSLATIONS_PATTERN'] = DEFAULT_TRANSLATIONS_PATTERN
+        self.lang = get_text_tag(channel, 'language', 'en')[:2]
+        context['DEFAULT_LANG'] = self.lang
+        # Use the translation file pattern from the options (the option itself
+        # defaults to DEFAULT_TRANSLATIONS_PATTERN).
+        context['TRANSLATIONS_PATTERN'] = self.translations_pattern
+
         context['BLOG_TITLE'] = get_text_tag(channel, 'title',
                                              'PUT TITLE HERE')
         context['BLOG_DESCRIPTION'] = get_text_tag(
@@ -418,17 +493,17 @@ class CommandImportWordpress(Command, ImportMixin):
         PAGES = '(\n'
         for extension in extensions:
             POSTS += '    ("posts/*.{0}", "posts", "post.tmpl"),\n'.format(extension)
-            PAGES += '    ("stories/*.{0}", "stories", "story.tmpl"),\n'.format(extension)
+            PAGES += '    ("pages/*.{0}", "pages", "page.tmpl"),\n'.format(extension)
         POSTS += ')\n'
         PAGES += ')\n'
         context['POSTS'] = POSTS
         context['PAGES'] = PAGES
         COMPILERS = '{\n'
-        COMPILERS += '''    "rest": ('.txt', '.rst'),''' + '\n'
-        COMPILERS += '''    "markdown": ('.md', '.mdown', '.markdown'),''' + '\n'
-        COMPILERS += '''    "html": ('.html', '.htm'),''' + '\n'
+        COMPILERS += '''    "rest": ['.txt', '.rst'],''' + '\n'
+        COMPILERS += '''    "markdown": ['.md', '.mdown', '.markdown'],''' + '\n'
+        COMPILERS += '''    "html": ['.html', '.htm'],''' + '\n'
         if self.use_wordpress_compiler:
-            COMPILERS += '''    "wordpress": ('.wp'),''' + '\n'
+            COMPILERS += '''    "wordpress": ['.wp'],''' + '\n'
         COMPILERS += '}'
         context['COMPILERS'] = COMPILERS
 
@@ -436,18 +511,15 @@ class CommandImportWordpress(Command, ImportMixin):
 
     def download_url_content_to_file(self, url, dst_path):
         """Download some content (attachments) to a file."""
-        if self.no_downloads:
-            return
-
         try:
             request = requests.get(url, auth=self.auth)
             if request.status_code >= 400:
-                LOGGER.warn("Downloading {0} to {1} failed with HTTP status code {2}".format(url, dst_path, request.status_code))
+                LOGGER.warning("Downloading {0} to {1} failed with HTTP status code {2}".format(url, dst_path, request.status_code))
                 return
             with open(dst_path, 'wb+') as fd:
                 fd.write(request.content)
         except requests.exceptions.ConnectionError as err:
-            LOGGER.warn("Downloading {0} to {1} failed: {2}".format(url, dst_path, err))
+            LOGGER.warning("Downloading {0} to {1} failed: {2}".format(url, dst_path, err))
 
     def import_attachment(self, item, wordpress_namespace):
         """Import an attachment to the site."""
@@ -458,10 +530,13 @@ class CommandImportWordpress(Command, ImportMixin):
                             'foo')
         path = urlparse(url).path
         dst_path = os.path.join(*([self.output_folder, 'files'] + list(path.split('/'))))
-        dst_dir = os.path.dirname(dst_path)
-        utils.makedirs(dst_dir)
-        LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
-        self.download_url_content_to_file(url, dst_path)
+        if self.no_downloads:
+            LOGGER.info("Skipping downloading {0} => {1}".format(url, dst_path))
+        else:
+            dst_dir = os.path.dirname(dst_path)
+            utils.makedirs(dst_dir)
+            LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
+            self.download_url_content_to_file(url, dst_path)
         dst_url = '/'.join(dst_path.split(os.sep)[2:])
         links[link] = '/' + dst_url
         links[url] = '/' + dst_url
@@ -485,14 +560,7 @@ class CommandImportWordpress(Command, ImportMixin):
                     # that the export should give you the power to insert
                     # your blogging into another site or system its not.
                     # Why don't they just use JSON?
-                    if sys.version_info[0] == 2:
-                        try:
-                            metadata = phpserialize.loads(utils.sys_encode(meta_value.text))
-                        except ValueError:
-                            # local encoding might be wrong sometimes
-                            metadata = phpserialize.loads(meta_value.text.encode('utf-8'))
-                    else:
-                        metadata = phpserialize.loads(meta_value.text.encode('utf-8'))
+                    metadata = phpserialize.loads(meta_value.text.encode('utf-8'))
 
                     meta_key = b'image_meta'
                     size_key = b'sizes'
@@ -507,6 +575,8 @@ class CommandImportWordpress(Command, ImportMixin):
 
                     if meta_key in metadata:
                         image_meta = metadata[meta_key]
+                        if not image_meta:
+                            continue
                         dst_meta = {}
 
                         def add(our_key, wp_key, is_int=False, ignore_zero=False, is_float=False):
@@ -517,6 +587,9 @@ class CommandImportWordpress(Command, ImportMixin):
                                     if ignore_zero and value == 0:
                                         return
                                 elif is_float:
+                                    # in some locales (like fr) and for old posts there may be a comma here.
+                                    if isinstance(value, bytes):
+                                        value = value.replace(b",", b".")
                                     value = float(value)
                                     if ignore_zero and value == 0:
                                         return
@@ -552,15 +625,18 @@ class CommandImportWordpress(Command, ImportMixin):
                         meta = {}
                         meta['size'] = size.decode('utf-8')
                         if width_key in metadata[size_key][size] and height_key in metadata[size_key][size]:
-                            meta['width'] = metadata[size_key][size][width_key]
-                            meta['height'] = metadata[size_key][size][height_key]
+                            meta['width'] = int(metadata[size_key][size][width_key])
+                            meta['height'] = int(metadata[size_key][size][height_key])
 
                         path = urlparse(url).path
                         dst_path = os.path.join(*([self.output_folder, 'files'] + list(path.split('/'))))
-                        dst_dir = os.path.dirname(dst_path)
-                        utils.makedirs(dst_dir)
-                        LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
-                        self.download_url_content_to_file(url, dst_path)
+                        if self.no_downloads:
+                            LOGGER.info("Skipping downloading {0} => {1}".format(url, dst_path))
+                        else:
+                            dst_dir = os.path.dirname(dst_path)
+                            utils.makedirs(dst_dir)
+                            LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
+                            self.download_url_content_to_file(url, dst_path)
                         dst_url = '/'.join(dst_path.split(os.sep)[2:])
                         links[url] = '/' + dst_url
 
@@ -604,7 +680,7 @@ class CommandImportWordpress(Command, ImportMixin):
 
     def transform_code(self, content):
         """Transform code blocks."""
-        # http://en.support.wordpress.com/code/posting-source-code/. There are
+        # https://en.support.wordpress.com/code/posting-source-code/. There are
         # a ton of things not supported here. We only do a basic [code
         # lang="x"] -> ```x translation, and remove quoted html entities (<,
         # >, &, and ").
@@ -628,10 +704,10 @@ class CommandImportWordpress(Command, ImportMixin):
         return content
 
     @staticmethod
-    def transform_caption(content):
+    def transform_caption(content, use_html=False):
         """Transform captions."""
-        new_caption = re.sub(r'\[/caption\]', '', content)
-        new_caption = re.sub(r'\[caption.*\]', '', new_caption)
+        new_caption = re.sub(r'\[/caption\]', '</h1>' if use_html else '', content)
+        new_caption = re.sub(r'\[caption.*\]', '<h1>' if use_html else '', new_caption)
 
         return new_caption
 
@@ -654,6 +730,26 @@ class CommandImportWordpress(Command, ImportMixin):
                 except TypeError:  # old versions of the plugin don't support the additional argument
                     content = self.wordpress_page_compiler.compile_to_string(content)
                 return content, 'html', True
+            elif self.transform_to_markdown:
+                # First convert to HTML with WordPress plugin
+                additional_data = {}
+                if attachments is not None:
+                    additional_data['attachments'] = attachments
+                try:
+                    content = self.wordpress_page_compiler.compile_to_string(content, additional_data=additional_data)
+                except TypeError:  # old versions of the plugin don't support the additional argument
+                    content = self.wordpress_page_compiler.compile_to_string(content)
+                # Now convert to MarkDown with html2text
+                h = html2text.HTML2Text()
+                content = h.handle(content)
+                return content, 'md', False
+            elif self.html2text:
+                # TODO: what to do with [code] blocks?
+                # content = self.transform_code(content)
+                content = self.transform_caption(content, use_html=True)
+                h = html2text.HTML2Text()
+                content = h.handle(content)
+                return content, 'md', False
             elif self.use_wordpress_compiler:
                 return content, 'wp', False
             else:
@@ -686,7 +782,7 @@ class CommandImportWordpress(Command, ImportMixin):
         elif approved == 'spam' or approved == 'trash':
             pass
         else:
-            LOGGER.warn("Unknown comment approved status: " + str(approved))
+            LOGGER.warning("Unknown comment approved status: {0}".format(approved))
         parent = int(get_text_tag(comment, "{{{0}}}comment_parent".format(wordpress_namespace), 0))
         if parent == 0:
             parent = None
@@ -724,6 +820,16 @@ class CommandImportWordpress(Command, ImportMixin):
             write_header_line(fd, "wordpress_user_id", comment["user_id"])
             fd.write(('\n' + comment['content']).encode('utf8'))
 
+    def _create_meta_and_content_filenames(self, slug, extension, lang, default_language, translations_config):
+        """Return the (meta, content) filenames for a slug, using translation candidates for non-default languages."""
+        meta_name = slug + '.meta'
+        content_name = slug + '.' + extension
+        if not lang or lang == default_language:
+            return meta_name, content_name
+        # Non-default language: derive both names via the translation config.
+        return tuple(utils.get_translation_candidate(translations_config, name, lang)
+                     for name in (meta_name, content_name))
+
     def _create_metadata(self, status, excerpt, tags, categories, post_name=None):
         """Create post metadata."""
         other_meta = {'wp-status': status}
@@ -735,24 +841,48 @@ class CommandImportWordpress(Command, ImportMixin):
                 if text in self._category_paths:
                     cats.append(self._category_paths[text])
                 else:
-                    cats.append(utils.join_hierarchical_category_path([text]))
+                    cats.append(hierarchy_utils.join_hierarchical_category_path([utils.html_unescape(text)]))
             other_meta['categories'] = ','.join(cats)
             if len(cats) > 0:
                 other_meta['category'] = cats[0]
                 if len(cats) > 1:
-                    LOGGER.warn(('Post "{0}" has more than one category! ' +
-                                 'Will only use the first one.').format(post_name))
-            tags_cats = tags
+                    LOGGER.warning(('Post "{0}" has more than one category! ' +
+                                    'Will only use the first one.').format(post_name))
+            tags_cats = [utils.html_unescape(tag) for tag in tags]
         else:
-            tags_cats = tags + categories
+            tags_cats = [utils.html_unescape(tag) for tag in tags + categories]
         return tags_cats, other_meta
 
+    _tag_sanitize_map = {True: {}, False: {}}  # keyed by is_category: first spelling seen per lower-cased name
+
+    def _sanitize(self, tag, is_category):
+        """Return the canonical spelling of a tag or category name for the chosen strategy."""
+        strategy = self.tag_saniziting_strategy
+        if strategy == 'lower':
+            return tag.lower()
+        if strategy != 'first':
+            # Reject an unknown strategy on the first call, not only once a name repeats.
+            LOGGER.error("Unknown tag sanitizing strategy '{0}'!".format(strategy))
+            sys.exit(1)
+        # 'first': every later spelling is replaced by the first one recorded for this name.
+        seen = self._tag_sanitize_map[is_category]
+        first_spelling = seen.setdefault(tag.lower(), [tag])[0]
+        if tag != first_spelling:
+            LOGGER.warning("Changing spelling of {0} name '{1}' to {2}.".format('category' if is_category else 'tag', tag, first_spelling))
+        return first_spelling
+
     def import_postpage_item(self, item, wordpress_namespace, out_folder=None, attachments=None):
         """Take an item from the feed and creates a post file."""
         if out_folder is None:
             out_folder = 'posts'
 
         title = get_text_tag(item, 'title', 'NO TITLE')
+
+        # titles can have line breaks in them, particularly when they are
+        # created by third-party tools that post to Wordpress.
+        # Handle windows-style and unix-style line endings.
+        title = title.replace('\r\n', ' ').replace('\n', ' ')
+
         # link is something like http://foo.com/2012/09/01/hello-world/
         # So, take the path, utils.slugify it, and that's our slug
         link = get_text_tag(item, 'link', None)
@@ -760,7 +890,10 @@ class CommandImportWordpress(Command, ImportMixin):
         path = unquote(parsed.path.strip('/'))
 
         try:
-            path = path.decode('utf8')
+            if isinstance(path, bytes):
+                path = path.decode('utf8', 'replace')
+            else:
+                path = path
         except AttributeError:
             pass
 
@@ -782,7 +915,7 @@ class CommandImportWordpress(Command, ImportMixin):
         else:
             if len(pathlist) > 1:
                 out_folder = os.path.join(*([out_folder] + pathlist[:-1]))
-            slug = utils.slugify(pathlist[-1])
+            slug = utils.slugify(pathlist[-1], self.lang)
 
         description = get_text_tag(item, 'description', '')
         post_date = get_text_tag(
@@ -809,17 +942,19 @@ class CommandImportWordpress(Command, ImportMixin):
 
         tags = []
         categories = []
+        post_status = 'published'
+        has_math = "no"
         if status == 'trash':
-            LOGGER.warn('Trashed post "{0}" will not be imported.'.format(title))
+            LOGGER.warning('Trashed post "{0}" will not be imported.'.format(title))
             return False
         elif status == 'private':
-            tags.append('private')
             is_draft = False
             is_private = True
+            post_status = 'private'
         elif status != 'publish':
-            tags.append('draft')
             is_draft = True
             is_private = False
+            post_status = 'draft'
         else:
             is_draft = False
             is_private = False
@@ -831,14 +966,23 @@ class CommandImportWordpress(Command, ImportMixin):
                 type = tag.attrib['domain']
             if text == 'Uncategorized' and type == 'category':
                 continue
-            self.all_tags.add(text)
             if type == 'category':
-                categories.append(type)
+                categories.append(text)
             else:
                 tags.append(text)
 
         if '$latex' in content:
-            tags.append('mathjax')
+            has_math = "yes"
+
+        for i, cat in enumerate(categories[:]):
+            cat = self._sanitize(cat, True)
+            categories[i] = cat
+            self.all_tags.add(cat)
+
+        for i, tag in enumerate(tags[:]):
+            tag = self._sanitize(tag, False)
+            tags[i] = tag
+            self.all_tags.add(tag)
 
         # Find post format if it's there
         post_format = 'wp'
@@ -849,53 +993,75 @@ class CommandImportWordpress(Command, ImportMixin):
                 post_format = 'wp'
 
         if is_draft and self.exclude_drafts:
-            LOGGER.notice('Draft "{0}" will not be imported.'.format(title))
+            LOGGER.warning('Draft "{0}" will not be imported.'.format(title))
             return False
         elif is_private and self.exclude_privates:
-            LOGGER.notice('Private post "{0}" will not be imported.'.format(title))
+            LOGGER.warning('Private post "{0}" will not be imported.'.format(title))
             return False
         elif content.strip() or self.import_empty_items:
             # If no content is found, no files are written.
             self.url_map[link] = (self.context['SITE_URL'] +
                                   out_folder.rstrip('/') + '/' + slug +
                                   '.html').replace(os.sep, '/')
-            if hasattr(self, "separate_qtranslate_content") \
-               and self.separate_qtranslate_content:
-                content_translations = separate_qtranslate_content(content)
+            default_language = self.context["DEFAULT_LANG"]
+            if self.separate_qtranslate_content:
+                content_translations = separate_qtranslate_tagged_langs(content)
+                title_translations = separate_qtranslate_tagged_langs(title)
             else:
                 content_translations = {"": content}
-            default_language = self.context["DEFAULT_LANG"]
+                title_translations = {"": title}
+            # in case of mismatch between the languages found in the title and in the content
+            default_title = title_translations.get(default_language, title)
+            extra_languages = [lang for lang in content_translations.keys() if lang not in ("", default_language)]
+            for extra_lang in extra_languages:
+                self.extra_languages.add(extra_lang)
+            translations_dict = get_default_translations_dict(default_language, extra_languages)
+            current_translations_config = {
+                "DEFAULT_LANG": default_language,
+                "TRANSLATIONS": translations_dict,
+                "TRANSLATIONS_PATTERN": self.context["TRANSLATIONS_PATTERN"]
+            }
             for lang, content in content_translations.items():
                 try:
                     content, extension, rewrite_html = self.transform_content(content, post_format, attachments)
-                except:
+                except Exception:
                     LOGGER.error(('Cannot interpret post "{0}" (language {1}) with post ' +
                                   'format {2}!').format(os.path.join(out_folder, slug), lang, post_format))
                     return False
-                if lang:
-                    out_meta_filename = slug + '.meta'
-                    if lang == default_language:
-                        out_content_filename = slug + '.' + extension
-                    else:
-                        out_content_filename \
-                            = utils.get_translation_candidate(self.context,
-                                                              slug + "." + extension, lang)
-                        self.extra_languages.add(lang)
-                    meta_slug = slug
-                else:
-                    out_meta_filename = slug + '.meta'
-                    out_content_filename = slug + '.' + extension
-                    meta_slug = slug
+
+                out_meta_filename, out_content_filename = self._create_meta_and_content_filenames(
+                    slug, extension, lang, default_language, current_translations_config)
+
                 tags, other_meta = self._create_metadata(status, excerpt, tags, categories,
                                                          post_name=os.path.join(out_folder, slug))
-                self.write_metadata(os.path.join(self.output_folder, out_folder,
-                                                 out_meta_filename),
-                                    title, meta_slug, post_date, description, tags, **other_meta)
-                self.write_content(
-                    os.path.join(self.output_folder,
-                                 out_folder, out_content_filename),
-                    content,
-                    rewrite_html)
+                current_title = title_translations.get(lang, default_title)
+                meta = {
+                    "title": current_title,
+                    "slug": slug,
+                    "date": post_date,
+                    "description": description,
+                    "tags": ','.join(tags),
+                    "status": post_status,
+                    "has_math": has_math,
+                }
+                meta.update(other_meta)
+                if self.onefile:
+                    self.write_post(
+                        os.path.join(self.output_folder,
+                                     out_folder, out_content_filename),
+                        content,
+                        meta,
+                        self._get_compiler(),
+                        rewrite_html)
+                else:
+                    self.write_metadata(os.path.join(self.output_folder, out_folder,
+                                                     out_meta_filename),
+                                        current_title, slug, post_date, description, tags, **other_meta)
+                    self.write_content(
+                        os.path.join(self.output_folder,
+                                     out_folder, out_content_filename),
+                        content,
+                        rewrite_html)
 
             if self.export_comments:
                 comments = []
@@ -905,13 +1071,13 @@ class CommandImportWordpress(Command, ImportMixin):
                         comments.append(comment)
 
                 for comment in comments:
-                    comment_filename = slug + "." + str(comment['id']) + ".wpcomment"
+                    comment_filename = "{0}.{1}.wpcomment".format(slug, comment['id'])
                     self._write_comment(os.path.join(self.output_folder, out_folder, comment_filename), comment)
 
             return (out_folder, slug)
         else:
-            LOGGER.warn(('Not going to import "{0}" because it seems to contain'
-                         ' no content.').format(title))
+            LOGGER.warning(('Not going to import "{0}" because it seems to contain'
+                            ' no content.').format(title))
             return False
 
     def _extract_item_info(self, item):
@@ -937,7 +1103,7 @@ class CommandImportWordpress(Command, ImportMixin):
             if parent_id is not None and int(parent_id) != 0:
                 self.attachments[int(parent_id)][post_id] = data
             else:
-                LOGGER.warn("Attachment #{0} ({1}) has no parent!".format(post_id, data['files']))
+                LOGGER.warning("Attachment #{0} ({1}) has no parent!".format(post_id, data['files']))
 
     def write_attachments_info(self, path, attachments):
         """Write attachments info file."""
@@ -955,7 +1121,7 @@ class CommandImportWordpress(Command, ImportMixin):
             if post_type == 'post':
                 out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'posts', attachments)
             else:
-                out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'stories', attachments)
+                out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'pages', attachments)
             # Process attachment data
             if attachments is not None:
                 # If post was exported, store data
@@ -975,8 +1141,8 @@ class CommandImportWordpress(Command, ImportMixin):
             self.process_item_if_post_or_page(item)
         # Assign attachments to posts
         for post_id in self.attachments:
-            LOGGER.warn(("Found attachments for post or page #{0}, but didn't find post or page. " +
-                         "(Attachments: {1})").format(post_id, [e['files'][0] for e in self.attachments[post_id].values()]))
+            LOGGER.warning(("Found attachments for post or page #{0}, but didn't find post or page. " +
+                            "(Attachments: {1})").format(post_id, [e['files'][0] for e in self.attachments[post_id].values()]))
 
 
 def get_text_tag(tag, name, default):
@@ -990,15 +1156,20 @@ def get_text_tag(tag, name, default):
         return default
 
 
-def separate_qtranslate_content(text):
-    """Parse the content of a wordpress post or page and separate qtranslate languages.
+def separate_qtranslate_tagged_langs(text):
+    """Parse the content of a wordpress post or page and separate languages.
+
+    For qtranslateX tags: [:LL]blabla[:]
 
-    qtranslate tags: <!--:LL-->blabla<!--:-->
+    Note: qtranslate* plugins had a troubled history and used various
+    tags over time; the 'modernize_qtranslate_tags' function must be
+    applied first for this function to handle most of the legacy
+    cases.
     """
-    # TODO: uniformize qtranslate tags <!--/en--> => <!--:-->
-    qt_start = "<!--:"
-    qt_end = "-->"
-    qt_end_with_lang_len = 5
+    qt_start = "[:"
+    qt_end = "]"
+    qt_end_len = len(qt_end)
+    qt_end_with_lang_len = qt_end_len + 2
     qt_chunks = text.split(qt_start)
     content_by_lang = {}
     common_txt_list = []
@@ -1010,9 +1181,9 @@ def separate_qtranslate_content(text):
             # be some piece of common text or tags, or just nothing
             lang = ""  # default language
             c = c.lstrip(qt_end)
-            if not c:
+            if not c.strip():
                 continue
-        elif c[2:].startswith(qt_end):
+        elif c[2:qt_end_with_lang_len].startswith(qt_end):
             # a language specific section (with language code at the begining)
             lang = c[:2]
             c = c[qt_end_with_lang_len:]
@@ -1033,3 +1204,26 @@ def separate_qtranslate_content(text):
     for l in content_by_lang.keys():
         content_by_lang[l] = " ".join(content_by_lang[l])
     return content_by_lang
+
+
+def modernize_qtranslate_tags(xml_bytes):
+    """Uniformize the "tags" used by various versions of qtranslate.
+
+    The resulting byte string will only contain one set of qtranslate tags
+    (namely [:LG] and [:]), older ones being converted to new ones. The text
+    of each <title>...</title> pair found on one line gets &, <, > escaped.
+    """
+    old_start_lang = re.compile(b"<!--:?(\\w{2})-->")  # legacy openers: <!--:LG--> or <!--LG-->
+    new_start_lang = b"[:\\1]"  # \1 re-inserts the captured two-character language code
+    old_end_lang = re.compile(b"<!--(/\\w{2}|:)-->")  # legacy closers: <!--/LG--> or <!--:-->
+    new_end_lang = b"[:]"
+    title_match = re.compile(b"<title>(.*?)</title>")  # non-greedy; "." does not cross newlines
+    modern_starts = old_start_lang.sub(new_start_lang, xml_bytes)
+    modernized_bytes = old_end_lang.sub(new_end_lang, modern_starts)
+
+    def title_escape(match):  # rebuild one <title> element with its text escaped
+        title = match.group(1)
+        title = title.replace(b"&", b"&amp;").replace(b"<", b"&lt;").replace(b">", b"&gt;")
+        return b"<title>" + title + b"</title>"
+    fixed_bytes = title_match.sub(title_escape, modernized_bytes)  # NOTE(review): an existing entity such as &amp; would be escaped again - confirm titles are raw
+    return fixed_bytes