1 files changed, 167 insertions, 116 deletions
diff --git a/nikola/plugins/command/import_wordpress.py b/nikola/plugins/command/import_wordpress.py
index 0b48583..5e2aee6 100644
--- a/nikola/plugins/command/import_wordpress.py
+++ b/nikola/plugins/command/import_wordpress.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright © 2012-2016 Roberto Alsina and others.
+# Copyright © 2012-2020 Roberto Alsina and others.
 
 # Permission is hereby granted, free of charge, to any
 # person obtaining a copy of this software and associated
@@ -26,46 +26,45 @@
 
 """Import a WordPress dump."""
 
-from __future__ import unicode_literals, print_function
-import os
-import re
-import sys
 import datetime
 import io
 import json
+import os
+import re
+import sys
+from collections import defaultdict
+from urllib.parse import urlparse, unquote
+
 import requests
 from lxml import etree
-from collections import defaultdict
 
-try:
-    import html2text
-except:
-    html2text = None
+from nikola.plugin_categories import Command
+from nikola import utils, hierarchy_utils
+from nikola.nikola import DEFAULT_TRANSLATIONS_PATTERN
+from nikola.utils import req_missing
+from nikola.plugins.basic_import import ImportMixin, links
+from nikola.plugins.command.init import (
+    SAMPLE_CONF, prepare_config,
+    format_default_translations_config,
+    get_default_translations_dict
+)
 
 try:
-    from urlparse import urlparse
-    from urllib import unquote
+    import html2text
 except ImportError:
-    from urllib.parse import urlparse, unquote  # NOQA
+    html2text = None
 
 try:
     import phpserialize
 except ImportError:
-    phpserialize = None  # NOQA
+    phpserialize = None
 
-from nikola.plugin_categories import Command
-from nikola import utils
-from nikola.utils import req_missing, unicode_str
-from nikola.plugins.basic_import import ImportMixin, links
-from nikola.nikola import DEFAULT_TRANSLATIONS_PATTERN
-from nikola.plugins.command.init import SAMPLE_CONF, prepare_config, format_default_translations_config
-
-LOGGER = utils.get_logger('import_wordpress', utils.STDERR_HANDLER)
+LOGGER = utils.get_logger('import_wordpress')
 
 
 def install_plugin(site, plugin_name, output_dir=None, show_install_notes=False):
     """Install a Nikola plugin."""
-    LOGGER.notice("Installing plugin '{0}'".format(plugin_name))
+    LOGGER.info("Installing plugin '{0}'".format(plugin_name))
     # Get hold of the 'plugin' plugin
     plugin_installer_info = site.plugin_manager.getPluginByName('plugin', 'Command')
     if plugin_installer_info is None:
@@ -148,15 +147,22 @@ class CommandImportWordpress(Command, ImportMixin):
             'long': 'qtranslate',
             'default': False,
             'type': bool,
-            'help': "Look for translations generated by qtranslate plugin",
-            # WARNING: won't recover translated titles that actually
-            # don't seem to be part of the wordpress XML export at the
-            # time of writing :(
+            'help': """Look for translations generated by qtranslate plugin.
+WARNING: a default wordpress export won't allow to recover title translations.
+For this to be possible consider applying the hack suggested at
+https://github.com/qtranslate/qtranslate-xt/issues/199 :
+
+In wp-admin/includes/export.php change
+`echo apply_filters( 'the_title_rss', $post->post_title );
+
+to
+`echo apply_filters( 'the_title_export', $post->post_title );
+"""
         },
         {
             'name': 'translations_pattern',
             'long': 'translations_pattern',
-            'default': None,
+            'default': DEFAULT_TRANSLATIONS_PATTERN,
             'type': str,
             'help': "The pattern for translation files names",
         },
@@ -259,9 +265,9 @@ class CommandImportWordpress(Command, ImportMixin):
             options['output_folder'] = args.pop(0)
 
         if args:
-            LOGGER.warn('You specified additional arguments ({0}). Please consider '
-                        'putting these arguments before the filename if you '
-                        'are running into problems.'.format(args))
+            LOGGER.warning('You specified additional arguments ({0}). Please consider '
+                           'putting these arguments before the filename if you '
+                           'are running into problems.'.format(args))
 
         self.onefile = options.get('one_file', False)
 
@@ -307,7 +313,7 @@ class CommandImportWordpress(Command, ImportMixin):
             LOGGER.error("You can use at most one of the options --html2text, --transform-to-html and --transform-to-markdown.")
             return False
         if (self.html2text or self.transform_to_html or self.transform_to_markdown) and self.use_wordpress_compiler:
-            LOGGER.warn("It does not make sense to combine --use-wordpress-compiler with any of --html2text, --transform-to-html and --transform-to-markdown, as the latter convert all posts to HTML and the first option then affects zero posts.")
+            LOGGER.warning("It does not make sense to combine --use-wordpress-compiler with any of --html2text, --transform-to-html and --transform-to-markdown, as the latter convert all posts to HTML and the first option then affects zero posts.")
 
         if (self.html2text or self.transform_to_markdown) and not html2text:
             LOGGER.error("You need to install html2text via 'pip install html2text' before you can use the --html2text and --transform-to-markdown options.")
@@ -339,14 +345,14 @@ class CommandImportWordpress(Command, ImportMixin):
                 # cat_id = get_text_tag(cat, '{{{0}}}term_id'.format(wordpress_namespace), None)
                 cat_slug = get_text_tag(cat, '{{{0}}}category_nicename'.format(wordpress_namespace), None)
                 cat_parent_slug = get_text_tag(cat, '{{{0}}}category_parent'.format(wordpress_namespace), None)
-                cat_name = get_text_tag(cat, '{{{0}}}cat_name'.format(wordpress_namespace), None)
+                cat_name = utils.html_unescape(get_text_tag(cat, '{{{0}}}cat_name'.format(wordpress_namespace), None))
                 cat_path = [cat_name]
                 if cat_parent_slug in cat_map:
                     cat_path = cat_map[cat_parent_slug] + cat_path
                 cat_map[cat_slug] = cat_path
             self._category_paths = dict()
             for cat, path in cat_map.items():
-                self._category_paths[cat] = utils.join_hierarchical_category_path(path)
+                self._category_paths[cat] = hierarchy_utils.join_hierarchical_category_path(path)
 
     def _execute(self, options={}, args=[]):
         """Import a WordPress blog from an export file into a Nikola site."""
@@ -373,17 +379,12 @@ class CommandImportWordpress(Command, ImportMixin):
             if phpserialize is None:
                 req_missing(['phpserialize'], 'import WordPress dumps without --no-downloads')
 
-        channel = self.get_channel_from_file(self.wordpress_export_file)
+        export_file_preprocessor = modernize_qtranslate_tags if self.separate_qtranslate_content else None
+        channel = self.get_channel_from_file(self.wordpress_export_file, export_file_preprocessor)
         self._prepare(channel)
         conf_template = self.generate_base_site()
 
-        # If user  has specified a custom pattern for translation files we
-        # need to fix the config
-        if self.translations_pattern:
-            self.context['TRANSLATIONS_PATTERN'] = self.translations_pattern
-
         self.import_posts(channel)
-
         self.context['TRANSLATIONS'] = format_default_translations_config(
             self.extra_languages)
         self.context['REDIRECTIONS'] = self.configure_redirections(
@@ -397,7 +398,7 @@ class CommandImportWordpress(Command, ImportMixin):
         # Add tag redirects
         for tag in self.all_tags:
             try:
-                if isinstance(tag, utils.bytes_str):
+                if isinstance(tag, bytes):
                     tag_str = tag.decode('utf8', 'replace')
                 else:
                     tag_str = tag
@@ -420,9 +421,9 @@ class CommandImportWordpress(Command, ImportMixin):
                 if not install_plugin(self.site, 'wordpress_compiler', output_dir=os.path.join(self.output_folder, 'plugins')):
                     return False
             else:
-                LOGGER.warn("Make sure to install the WordPress page compiler via")
-                LOGGER.warn("    nikola plugin -i wordpress_compiler")
-                LOGGER.warn("in your imported blog's folder ({0}), if you haven't installed it system-wide or user-wide. Otherwise, your newly imported blog won't compile.".format(self.output_folder))
+                LOGGER.warning("Make sure to install the WordPress page compiler via")
+                LOGGER.warning("    nikola plugin -i wordpress_compiler")
+                LOGGER.warning("in your imported blog's folder ({0}), if you haven't installed it system-wide or user-wide. Otherwise, your newly imported blog won't compile.".format(self.output_folder))
 
     @classmethod
     def read_xml_file(cls, filename):
@@ -438,9 +439,16 @@ class CommandImportWordpress(Command, ImportMixin):
         return b''.join(xml)
 
     @classmethod
-    def get_channel_from_file(cls, filename):
-        """Get channel from XML file."""
-        tree = etree.fromstring(cls.read_xml_file(filename))
+    def get_channel_from_file(cls, filename, xml_preprocessor=None):
+        """Get channel from XML file.
+
+        If given, 'xml_preprocessor' is called on the raw XML and its result is parsed
+        instead (typically to deal with variations in tags injected by some WP plugin).
+        """
+        xml_string = cls.read_xml_file(filename)
+        if xml_preprocessor:
+            xml_string = xml_preprocessor(xml_string)
+        tree = etree.fromstring(xml_string)
         channel = tree.find('channel')
         return channel
 
@@ -451,7 +459,10 @@ class CommandImportWordpress(Command, ImportMixin):
         context = SAMPLE_CONF.copy()
         self.lang = get_text_tag(channel, 'language', 'en')[:2]
         context['DEFAULT_LANG'] = self.lang
-        context['TRANSLATIONS_PATTERN'] = DEFAULT_TRANSLATIONS_PATTERN
+        # If the user has specified a custom pattern for translation files, we
+        # need to fix the config
+        context['TRANSLATIONS_PATTERN'] = self.translations_pattern
+
         context['BLOG_TITLE'] = get_text_tag(channel, 'title',
                                              'PUT TITLE HERE')
         context['BLOG_DESCRIPTION'] = get_text_tag(
@@ -482,17 +493,17 @@ class CommandImportWordpress(Command, ImportMixin):
         PAGES = '(\n'
         for extension in extensions:
             POSTS += '    ("posts/*.{0}", "posts", "post.tmpl"),\n'.format(extension)
-            PAGES += '    ("pages/*.{0}", "pages", "story.tmpl"),\n'.format(extension)
+            PAGES += '    ("pages/*.{0}", "pages", "page.tmpl"),\n'.format(extension)
         POSTS += ')\n'
         PAGES += ')\n'
         context['POSTS'] = POSTS
         context['PAGES'] = PAGES
         COMPILERS = '{\n'
-        COMPILERS += '''    "rest": ('.txt', '.rst'),''' + '\n'
-        COMPILERS += '''    "markdown": ('.md', '.mdown', '.markdown'),''' + '\n'
-        COMPILERS += '''    "html": ('.html', '.htm'),''' + '\n'
+        COMPILERS += '''    "rest": ['.txt', '.rst'],''' + '\n'
+        COMPILERS += '''    "markdown": ['.md', '.mdown', '.markdown'],''' + '\n'
+        COMPILERS += '''    "html": ['.html', '.htm'],''' + '\n'
         if self.use_wordpress_compiler:
-            COMPILERS += '''    "wordpress": ('.wp'),''' + '\n'
+            COMPILERS += '''    "wordpress": ['.wp'],''' + '\n'
         COMPILERS += '}'
         context['COMPILERS'] = COMPILERS
 
@@ -503,12 +514,12 @@ class CommandImportWordpress(Command, ImportMixin):
         try:
             request = requests.get(url, auth=self.auth)
             if request.status_code >= 400:
-                LOGGER.warn("Downloading {0} to {1} failed with HTTP status code {2}".format(url, dst_path, request.status_code))
+                LOGGER.warning("Downloading {0} to {1} failed with HTTP status code {2}".format(url, dst_path, request.status_code))
                 return
             with open(dst_path, 'wb+') as fd:
                 fd.write(request.content)
         except requests.exceptions.ConnectionError as err:
-            LOGGER.warn("Downloading {0} to {1} failed: {2}".format(url, dst_path, err))
+            LOGGER.warning("Downloading {0} to {1} failed: {2}".format(url, dst_path, err))
 
     def import_attachment(self, item, wordpress_namespace):
         """Import an attachment to the site."""
@@ -549,14 +560,7 @@ class CommandImportWordpress(Command, ImportMixin):
                     # that the export should give you the power to insert
                     # your blogging into another site or system its not.
                     # Why don't they just use JSON?
-                    if sys.version_info[0] == 2:
-                        try:
-                            metadata = phpserialize.loads(utils.sys_encode(meta_value.text))
-                        except ValueError:
-                            # local encoding might be wrong sometimes
-                            metadata = phpserialize.loads(meta_value.text.encode('utf-8'))
-                    else:
-                        metadata = phpserialize.loads(meta_value.text.encode('utf-8'))
+                    metadata = phpserialize.loads(meta_value.text.encode('utf-8'))
 
                     meta_key = b'image_meta'
                     size_key = b'sizes'
@@ -583,6 +587,9 @@ class CommandImportWordpress(Command, ImportMixin):
                                     if ignore_zero and value == 0:
                                         return
                                 elif is_float:
+                                    # In some locales (like fr) and for old posts the decimal separator may be a comma.
+                                    if isinstance(value, bytes):
+                                        value = value.replace(b",", b".")
                                     value = float(value)
                                     if ignore_zero and value == 0:
                                         return
@@ -775,7 +782,7 @@ class CommandImportWordpress(Command, ImportMixin):
         elif approved == 'spam' or approved == 'trash':
             pass
         else:
-            LOGGER.warn("Unknown comment approved status: {0}".format(approved))
+            LOGGER.warning("Unknown comment approved status: {0}".format(approved))
         parent = int(get_text_tag(comment, "{{{0}}}comment_parent".format(wordpress_namespace), 0))
         if parent == 0:
             parent = None
@@ -796,7 +803,7 @@ class CommandImportWordpress(Command, ImportMixin):
             """Write comment header line."""
             if header_content is None:
                 return
-            header_content = unicode_str(header_content).replace('\n', ' ')
+            header_content = str(header_content).replace('\n', ' ')
             line = '.. ' + header_field + ': ' + header_content + '\n'
             fd.write(line.encode('utf8'))
 
@@ -813,6 +820,16 @@ class CommandImportWordpress(Command, ImportMixin):
             write_header_line(fd, "wordpress_user_id", comment["user_id"])
             fd.write(('\n' + comment['content']).encode('utf8'))
 
+    def _create_meta_and_content_filenames(self, slug, extension, lang, default_language, translations_config):
+        """Return the (meta, content) file names for a post, translated for non-default languages."""
+        meta_name, content_name = slug + '.meta', slug + '.' + extension
+        if not lang or lang == default_language:
+            # Untranslated content (empty lang) and the default language keep the plain names.
+            return meta_name, content_name
+        translated_meta = utils.get_translation_candidate(translations_config, meta_name, lang)
+        translated_content = utils.get_translation_candidate(translations_config, content_name, lang)
+        return translated_meta, translated_content
+
     def _create_metadata(self, status, excerpt, tags, categories, post_name=None):
         """Create post metadata."""
         other_meta = {'wp-status': status}
@@ -824,16 +841,16 @@ class CommandImportWordpress(Command, ImportMixin):
                 if text in self._category_paths:
                     cats.append(self._category_paths[text])
                 else:
-                    cats.append(utils.join_hierarchical_category_path([text]))
+                    cats.append(hierarchy_utils.join_hierarchical_category_path([utils.html_unescape(text)]))
             other_meta['categories'] = ','.join(cats)
             if len(cats) > 0:
                 other_meta['category'] = cats[0]
                 if len(cats) > 1:
-                    LOGGER.warn(('Post "{0}" has more than one category! ' +
-                                 'Will only use the first one.').format(post_name))
-            tags_cats = tags
+                    LOGGER.warning(('Post "{0}" has more than one category! ' +
+                                    'Will only use the first one.').format(post_name))
+            tags_cats = [utils.html_unescape(tag) for tag in tags]
         else:
-            tags_cats = tags + categories
+            tags_cats = [utils.html_unescape(tag) for tag in tags + categories]
         return tags_cats, other_meta
 
     _tag_sanitize_map = {True: {}, False: {}}
@@ -847,7 +864,7 @@ class CommandImportWordpress(Command, ImportMixin):
         previous = self._tag_sanitize_map[is_category][tag.lower()]
         if self.tag_saniziting_strategy == 'first':
             if tag != previous[0]:
-                LOGGER.warn("Changing spelling of {0} name '{1}' to {2}.".format('category' if is_category else 'tag', tag, previous[0]))
+                LOGGER.warning("Changing spelling of {0} name '{1}' to {2}.".format('category' if is_category else 'tag', tag, previous[0]))
             return previous[0]
         else:
             LOGGER.error("Unknown tag sanitizing strategy '{0}'!".format(self.tag_saniziting_strategy))
@@ -873,7 +890,7 @@ class CommandImportWordpress(Command, ImportMixin):
         path = unquote(parsed.path.strip('/'))
 
         try:
-            if isinstance(path, utils.bytes_str):
+            if isinstance(path, bytes):
                 path = path.decode('utf8', 'replace')
             else:
                 path = path
@@ -925,17 +942,19 @@ class CommandImportWordpress(Command, ImportMixin):
 
         tags = []
         categories = []
+        post_status = 'published'
+        has_math = "no"
         if status == 'trash':
-            LOGGER.warn('Trashed post "{0}" will not be imported.'.format(title))
+            LOGGER.warning('Trashed post "{0}" will not be imported.'.format(title))
             return False
         elif status == 'private':
-            tags.append('private')
             is_draft = False
             is_private = True
+            post_status = 'private'
         elif status != 'publish':
-            tags.append('draft')
             is_draft = True
             is_private = False
+            post_status = 'draft'
         else:
             is_draft = False
             is_private = False
@@ -953,7 +972,7 @@ class CommandImportWordpress(Command, ImportMixin):
                 tags.append(text)
 
         if '$latex' in content:
-            tags.append('mathjax')
+            has_math = "yes"
 
         for i, cat in enumerate(categories[:]):
             cat = self._sanitize(cat, True)
@@ -974,52 +993,56 @@ class CommandImportWordpress(Command, ImportMixin):
                 post_format = 'wp'
 
         if is_draft and self.exclude_drafts:
-            LOGGER.notice('Draft "{0}" will not be imported.'.format(title))
+            LOGGER.warning('Draft "{0}" will not be imported.'.format(title))
             return False
         elif is_private and self.exclude_privates:
-            LOGGER.notice('Private post "{0}" will not be imported.'.format(title))
+            LOGGER.warning('Private post "{0}" will not be imported.'.format(title))
             return False
         elif content.strip() or self.import_empty_items:
             # If no content is found, no files are written.
             self.url_map[link] = (self.context['SITE_URL'] +
                                   out_folder.rstrip('/') + '/' + slug +
                                   '.html').replace(os.sep, '/')
-            if hasattr(self, "separate_qtranslate_content") \
-               and self.separate_qtranslate_content:
-                content_translations = separate_qtranslate_content(content)
+            default_language = self.context["DEFAULT_LANG"]
+            if self.separate_qtranslate_content:
+                content_translations = separate_qtranslate_tagged_langs(content)
+                title_translations = separate_qtranslate_tagged_langs(title)
             else:
                 content_translations = {"": content}
-            default_language = self.context["DEFAULT_LANG"]
+                title_translations = {"": title}
+            # in case of mismatch between the languages found in the title and in the content
+            default_title = title_translations.get(default_language, title)
+            extra_languages = [lang for lang in content_translations.keys() if lang not in ("", default_language)]
+            for extra_lang in extra_languages:
+                self.extra_languages.add(extra_lang)
+            translations_dict = get_default_translations_dict(default_language, extra_languages)
+            current_translations_config = {
+                "DEFAULT_LANG": default_language,
+                "TRANSLATIONS": translations_dict,
+                "TRANSLATIONS_PATTERN": self.context["TRANSLATIONS_PATTERN"]
+            }
             for lang, content in content_translations.items():
                 try:
                     content, extension, rewrite_html = self.transform_content(content, post_format, attachments)
-                except:
+                except Exception:
                     LOGGER.error(('Cannot interpret post "{0}" (language {1}) with post ' +
                                   'format {2}!').format(os.path.join(out_folder, slug), lang, post_format))
                     return False
-                if lang:
-                    out_meta_filename = slug + '.meta'
-                    if lang == default_language:
-                        out_content_filename = slug + '.' + extension
-                    else:
-                        out_content_filename \
-                            = utils.get_translation_candidate(self.context,
-                                                              slug + "." + extension, lang)
-                        self.extra_languages.add(lang)
-                    meta_slug = slug
-                else:
-                    out_meta_filename = slug + '.meta'
-                    out_content_filename = slug + '.' + extension
-                    meta_slug = slug
+
+                out_meta_filename, out_content_filename = self._create_meta_and_content_filenames(
+                    slug, extension, lang, default_language, current_translations_config)
+
                 tags, other_meta = self._create_metadata(status, excerpt, tags, categories,
                                                          post_name=os.path.join(out_folder, slug))
-
+                current_title = title_translations.get(lang, default_title)
                 meta = {
-                    "title": title,
-                    "slug": meta_slug,
+                    "title": current_title,
+                    "slug": slug,
                     "date": post_date,
                     "description": description,
                     "tags": ','.join(tags),
+                    "status": post_status,
+                    "has_math": has_math,
                 }
                 meta.update(other_meta)
                 if self.onefile:
@@ -1033,7 +1056,7 @@ class CommandImportWordpress(Command, ImportMixin):
                 else:
                     self.write_metadata(os.path.join(self.output_folder, out_folder,
                                                      out_meta_filename),
-                                        title, meta_slug, post_date, description, tags, **other_meta)
+                                        current_title, slug, post_date, description, tags, **other_meta)
                     self.write_content(
                         os.path.join(self.output_folder,
                                      out_folder, out_content_filename),
@@ -1053,8 +1076,8 @@ class CommandImportWordpress(Command, ImportMixin):
 
             return (out_folder, slug)
         else:
-            LOGGER.warn(('Not going to import "{0}" because it seems to contain'
-                         ' no content.').format(title))
+            LOGGER.warning(('Not going to import "{0}" because it seems to contain'
+                            ' no content.').format(title))
             return False
 
     def _extract_item_info(self, item):
@@ -1080,7 +1103,7 @@ class CommandImportWordpress(Command, ImportMixin):
             if parent_id is not None and int(parent_id) != 0:
                 self.attachments[int(parent_id)][post_id] = data
             else:
-                LOGGER.warn("Attachment #{0} ({1}) has no parent!".format(post_id, data['files']))
+                LOGGER.warning("Attachment #{0} ({1}) has no parent!".format(post_id, data['files']))
 
     def write_attachments_info(self, path, attachments):
         """Write attachments info file."""
@@ -1118,8 +1141,8 @@ class CommandImportWordpress(Command, ImportMixin):
             self.process_item_if_post_or_page(item)
         # Assign attachments to posts
         for post_id in self.attachments:
-            LOGGER.warn(("Found attachments for post or page #{0}, but didn't find post or page. " +
-                         "(Attachments: {1})").format(post_id, [e['files'][0] for e in self.attachments[post_id].values()]))
+            LOGGER.warning(("Found attachments for post or page #{0}, but didn't find post or page. " +
+                            "(Attachments: {1})").format(post_id, [e['files'][0] for e in self.attachments[post_id].values()]))
 
 
 def get_text_tag(tag, name, default):
@@ -1133,15 +1156,20 @@ def get_text_tag(tag, name, default):
         return default
 
 
-def separate_qtranslate_content(text):
-    """Parse the content of a wordpress post or page and separate qtranslate languages.
+def separate_qtranslate_tagged_langs(text):
+    """Parse the content of a wordpress post or page and separate languages.
+
+    For qtranslateX tags: [:LL]blabla[:]
 
-    qtranslate tags: <!--:LL-->blabla<!--:-->
+    Note: qtranslate* plugins had a troubled history and used various
+    tags over time, application of the 'modernize_qtranslate_tags'
+    function is required for this function to handle most of the legacy
+    cases.
     """
-    # TODO: uniformize qtranslate tags <!--/en--> => <!--:-->
-    qt_start = "<!--:"
-    qt_end = "-->"
-    qt_end_with_lang_len = 5
+    qt_start = "[:"
+    qt_end = "]"
+    qt_end_len = len(qt_end)
+    qt_end_with_lang_len = qt_end_len + 2
     qt_chunks = text.split(qt_start)
     content_by_lang = {}
     common_txt_list = []
@@ -1153,9 +1181,9 @@ def separate_qtranslate_content(text):
             # be some piece of common text or tags, or just nothing
             lang = ""  # default language
             c = c.lstrip(qt_end)
-            if not c:
+            if not c.strip():
                 continue
-        elif c[2:].startswith(qt_end):
+        elif c[2:qt_end_with_lang_len].startswith(qt_end):
             # a language specific section (with language code at the begining)
             lang = c[:2]
             c = c[qt_end_with_lang_len:]
@@ -1176,3 +1204,26 @@ def separate_qtranslate_content(text):
     for l in content_by_lang.keys():
         content_by_lang[l] = " ".join(content_by_lang[l])
     return content_by_lang
+
+
+def modernize_qtranslate_tags(xml_bytes):
+    """Convert legacy qtranslate tags in an XML byte string to the modern form.
+
+    <!--:LL--> and <!--LL--> become [:LL]; <!--:--> and <!--/LL--> become [:].
+    The text of each <title> element is then XML-escaped (&, <, >), except
+    when it is a CDATA section, which must be kept verbatim to stay valid.
+    """
+    old_start_lang = re.compile(b"<!--:?(\\w{2})-->")
+    old_end_lang = re.compile(b"<!--(/\\w{2}|:)-->")
+    title_match = re.compile(b"<title>(.*?)</title>")
+    modern_starts = old_start_lang.sub(b"[:\\1]", xml_bytes)
+    modernized_bytes = old_end_lang.sub(b"[:]", modern_starts)
+
+    def title_escape(match):
+        title = match.group(1)
+        if title.strip().startswith(b"<![CDATA[") and title.strip().endswith(b"]]>"):
+            # Escaping the CDATA delimiters would turn them into literal title text.
+            return match.group(0)
+        title = title.replace(b"&", b"&amp;").replace(b"<", b"&lt;").replace(b">", b"&gt;")
+        return b"<title>" + title + b"</title>"
+    return title_match.sub(title_escape, modernized_bytes)