aboutsummaryrefslogtreecommitdiffstats
path: root/nikola/plugins/command/import_wordpress.py
diff options
context:
space:
mode:
Diffstat (limited to 'nikola/plugins/command/import_wordpress.py')
-rw-r--r--nikola/plugins/command/import_wordpress.py283
1 files changed, 167 insertions, 116 deletions
diff --git a/nikola/plugins/command/import_wordpress.py b/nikola/plugins/command/import_wordpress.py
index 0b48583..5e2aee6 100644
--- a/nikola/plugins/command/import_wordpress.py
+++ b/nikola/plugins/command/import_wordpress.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright © 2012-2016 Roberto Alsina and others.
+# Copyright © 2012-2020 Roberto Alsina and others.
# Permission is hereby granted, free of charge, to any
# person obtaining a copy of this software and associated
@@ -26,46 +26,45 @@
"""Import a WordPress dump."""
-from __future__ import unicode_literals, print_function
-import os
-import re
-import sys
import datetime
import io
import json
+import os
+import re
+import sys
+from collections import defaultdict
+from urllib.parse import urlparse, unquote
+
import requests
from lxml import etree
-from collections import defaultdict
-try:
- import html2text
-except:
- html2text = None
+from nikola.plugin_categories import Command
+from nikola import utils, hierarchy_utils
+from nikola.nikola import DEFAULT_TRANSLATIONS_PATTERN
+from nikola.utils import req_missing
+from nikola.plugins.basic_import import ImportMixin, links
+from nikola.plugins.command.init import (
+ SAMPLE_CONF, prepare_config,
+ format_default_translations_config,
+ get_default_translations_dict
+)
try:
- from urlparse import urlparse
- from urllib import unquote
+ import html2text
except ImportError:
- from urllib.parse import urlparse, unquote # NOQA
+ html2text = None
try:
import phpserialize
except ImportError:
- phpserialize = None # NOQA
+ phpserialize = None
-from nikola.plugin_categories import Command
-from nikola import utils
-from nikola.utils import req_missing, unicode_str
-from nikola.plugins.basic_import import ImportMixin, links
-from nikola.nikola import DEFAULT_TRANSLATIONS_PATTERN
-from nikola.plugins.command.init import SAMPLE_CONF, prepare_config, format_default_translations_config
-
-LOGGER = utils.get_logger('import_wordpress', utils.STDERR_HANDLER)
+LOGGER = utils.get_logger('import_wordpress')
def install_plugin(site, plugin_name, output_dir=None, show_install_notes=False):
"""Install a Nikola plugin."""
- LOGGER.notice("Installing plugin '{0}'".format(plugin_name))
+ LOGGER.info("Installing plugin '{0}'".format(plugin_name))
# Get hold of the 'plugin' plugin
plugin_installer_info = site.plugin_manager.getPluginByName('plugin', 'Command')
if plugin_installer_info is None:
@@ -148,15 +147,22 @@ class CommandImportWordpress(Command, ImportMixin):
'long': 'qtranslate',
'default': False,
'type': bool,
- 'help': "Look for translations generated by qtranslate plugin",
- # WARNING: won't recover translated titles that actually
- # don't seem to be part of the wordpress XML export at the
- # time of writing :(
+ 'help': """Look for translations generated by qtranslate plugin.
+WARNING: a default wordpress export won't allow to recover title translations.
+For this to be possible consider applying the hack suggested at
+https://github.com/qtranslate/qtranslate-xt/issues/199 :
+
+In wp-admin/includes/export.php change
+`echo apply_filters( 'the_title_rss', $post->post_title );
+
+to
+`echo apply_filters( 'the_title_export', $post->post_title );
+"""
},
{
'name': 'translations_pattern',
'long': 'translations_pattern',
- 'default': None,
+ 'default': DEFAULT_TRANSLATIONS_PATTERN,
'type': str,
'help': "The pattern for translation files names",
},
@@ -259,9 +265,9 @@ class CommandImportWordpress(Command, ImportMixin):
options['output_folder'] = args.pop(0)
if args:
- LOGGER.warn('You specified additional arguments ({0}). Please consider '
- 'putting these arguments before the filename if you '
- 'are running into problems.'.format(args))
+ LOGGER.warning('You specified additional arguments ({0}). Please consider '
+ 'putting these arguments before the filename if you '
+ 'are running into problems.'.format(args))
self.onefile = options.get('one_file', False)
@@ -307,7 +313,7 @@ class CommandImportWordpress(Command, ImportMixin):
LOGGER.error("You can use at most one of the options --html2text, --transform-to-html and --transform-to-markdown.")
return False
if (self.html2text or self.transform_to_html or self.transform_to_markdown) and self.use_wordpress_compiler:
- LOGGER.warn("It does not make sense to combine --use-wordpress-compiler with any of --html2text, --transform-to-html and --transform-to-markdown, as the latter convert all posts to HTML and the first option then affects zero posts.")
+ LOGGER.warning("It does not make sense to combine --use-wordpress-compiler with any of --html2text, --transform-to-html and --transform-to-markdown, as the latter convert all posts to HTML and the first option then affects zero posts.")
if (self.html2text or self.transform_to_markdown) and not html2text:
LOGGER.error("You need to install html2text via 'pip install html2text' before you can use the --html2text and --transform-to-markdown options.")
@@ -339,14 +345,14 @@ class CommandImportWordpress(Command, ImportMixin):
# cat_id = get_text_tag(cat, '{{{0}}}term_id'.format(wordpress_namespace), None)
cat_slug = get_text_tag(cat, '{{{0}}}category_nicename'.format(wordpress_namespace), None)
cat_parent_slug = get_text_tag(cat, '{{{0}}}category_parent'.format(wordpress_namespace), None)
- cat_name = get_text_tag(cat, '{{{0}}}cat_name'.format(wordpress_namespace), None)
+ cat_name = utils.html_unescape(get_text_tag(cat, '{{{0}}}cat_name'.format(wordpress_namespace), None))
cat_path = [cat_name]
if cat_parent_slug in cat_map:
cat_path = cat_map[cat_parent_slug] + cat_path
cat_map[cat_slug] = cat_path
self._category_paths = dict()
for cat, path in cat_map.items():
- self._category_paths[cat] = utils.join_hierarchical_category_path(path)
+ self._category_paths[cat] = hierarchy_utils.join_hierarchical_category_path(path)
def _execute(self, options={}, args=[]):
"""Import a WordPress blog from an export file into a Nikola site."""
@@ -373,17 +379,12 @@ class CommandImportWordpress(Command, ImportMixin):
if phpserialize is None:
req_missing(['phpserialize'], 'import WordPress dumps without --no-downloads')
- channel = self.get_channel_from_file(self.wordpress_export_file)
+ export_file_preprocessor = modernize_qtranslate_tags if self.separate_qtranslate_content else None
+ channel = self.get_channel_from_file(self.wordpress_export_file, export_file_preprocessor)
self._prepare(channel)
conf_template = self.generate_base_site()
- # If user has specified a custom pattern for translation files we
- # need to fix the config
- if self.translations_pattern:
- self.context['TRANSLATIONS_PATTERN'] = self.translations_pattern
-
self.import_posts(channel)
-
self.context['TRANSLATIONS'] = format_default_translations_config(
self.extra_languages)
self.context['REDIRECTIONS'] = self.configure_redirections(
@@ -397,7 +398,7 @@ class CommandImportWordpress(Command, ImportMixin):
# Add tag redirects
for tag in self.all_tags:
try:
- if isinstance(tag, utils.bytes_str):
+ if isinstance(tag, bytes):
tag_str = tag.decode('utf8', 'replace')
else:
tag_str = tag
@@ -420,9 +421,9 @@ class CommandImportWordpress(Command, ImportMixin):
if not install_plugin(self.site, 'wordpress_compiler', output_dir=os.path.join(self.output_folder, 'plugins')):
return False
else:
- LOGGER.warn("Make sure to install the WordPress page compiler via")
- LOGGER.warn(" nikola plugin -i wordpress_compiler")
- LOGGER.warn("in your imported blog's folder ({0}), if you haven't installed it system-wide or user-wide. Otherwise, your newly imported blog won't compile.".format(self.output_folder))
+ LOGGER.warning("Make sure to install the WordPress page compiler via")
+ LOGGER.warning(" nikola plugin -i wordpress_compiler")
+ LOGGER.warning("in your imported blog's folder ({0}), if you haven't installed it system-wide or user-wide. Otherwise, your newly imported blog won't compile.".format(self.output_folder))
@classmethod
def read_xml_file(cls, filename):
@@ -438,9 +439,16 @@ class CommandImportWordpress(Command, ImportMixin):
return b''.join(xml)
@classmethod
- def get_channel_from_file(cls, filename):
- """Get channel from XML file."""
- tree = etree.fromstring(cls.read_xml_file(filename))
+ def get_channel_from_file(cls, filename, xml_preprocessor=None):
+ """Get channel from XML file.
+
+ An optional 'xml_preprocessor' allows to modify the xml
+ (typically to deal with variations in tags injected by some WP plugin)
+ """
+ xml_string = cls.read_xml_file(filename)
+ if xml_preprocessor:
+ xml_string = xml_preprocessor(xml_string)
+ tree = etree.fromstring(xml_string)
channel = tree.find('channel')
return channel
@@ -451,7 +459,10 @@ class CommandImportWordpress(Command, ImportMixin):
context = SAMPLE_CONF.copy()
self.lang = get_text_tag(channel, 'language', 'en')[:2]
context['DEFAULT_LANG'] = self.lang
- context['TRANSLATIONS_PATTERN'] = DEFAULT_TRANSLATIONS_PATTERN
+ # If user has specified a custom pattern for translation files we
+ # need to fix the config
+ context['TRANSLATIONS_PATTERN'] = self.translations_pattern
+
context['BLOG_TITLE'] = get_text_tag(channel, 'title',
'PUT TITLE HERE')
context['BLOG_DESCRIPTION'] = get_text_tag(
@@ -482,17 +493,17 @@ class CommandImportWordpress(Command, ImportMixin):
PAGES = '(\n'
for extension in extensions:
POSTS += ' ("posts/*.{0}", "posts", "post.tmpl"),\n'.format(extension)
- PAGES += ' ("pages/*.{0}", "pages", "story.tmpl"),\n'.format(extension)
+ PAGES += ' ("pages/*.{0}", "pages", "page.tmpl"),\n'.format(extension)
POSTS += ')\n'
PAGES += ')\n'
context['POSTS'] = POSTS
context['PAGES'] = PAGES
COMPILERS = '{\n'
- COMPILERS += ''' "rest": ('.txt', '.rst'),''' + '\n'
- COMPILERS += ''' "markdown": ('.md', '.mdown', '.markdown'),''' + '\n'
- COMPILERS += ''' "html": ('.html', '.htm'),''' + '\n'
+ COMPILERS += ''' "rest": ['.txt', '.rst'],''' + '\n'
+ COMPILERS += ''' "markdown": ['.md', '.mdown', '.markdown'],''' + '\n'
+ COMPILERS += ''' "html": ['.html', '.htm'],''' + '\n'
if self.use_wordpress_compiler:
- COMPILERS += ''' "wordpress": ('.wp'),''' + '\n'
+ COMPILERS += ''' "wordpress": ['.wp'],''' + '\n'
COMPILERS += '}'
context['COMPILERS'] = COMPILERS
@@ -503,12 +514,12 @@ class CommandImportWordpress(Command, ImportMixin):
try:
request = requests.get(url, auth=self.auth)
if request.status_code >= 400:
- LOGGER.warn("Downloading {0} to {1} failed with HTTP status code {2}".format(url, dst_path, request.status_code))
+ LOGGER.warning("Downloading {0} to {1} failed with HTTP status code {2}".format(url, dst_path, request.status_code))
return
with open(dst_path, 'wb+') as fd:
fd.write(request.content)
except requests.exceptions.ConnectionError as err:
- LOGGER.warn("Downloading {0} to {1} failed: {2}".format(url, dst_path, err))
+ LOGGER.warning("Downloading {0} to {1} failed: {2}".format(url, dst_path, err))
def import_attachment(self, item, wordpress_namespace):
"""Import an attachment to the site."""
@@ -549,14 +560,7 @@ class CommandImportWordpress(Command, ImportMixin):
# that the export should give you the power to insert
# your blogging into another site or system its not.
# Why don't they just use JSON?
- if sys.version_info[0] == 2:
- try:
- metadata = phpserialize.loads(utils.sys_encode(meta_value.text))
- except ValueError:
- # local encoding might be wrong sometimes
- metadata = phpserialize.loads(meta_value.text.encode('utf-8'))
- else:
- metadata = phpserialize.loads(meta_value.text.encode('utf-8'))
+ metadata = phpserialize.loads(meta_value.text.encode('utf-8'))
meta_key = b'image_meta'
size_key = b'sizes'
@@ -583,6 +587,9 @@ class CommandImportWordpress(Command, ImportMixin):
if ignore_zero and value == 0:
return
elif is_float:
+ # in some locales (like fr) and for old posts there may be a comma here.
+ if isinstance(value, bytes):
+ value = value.replace(b",", b".")
value = float(value)
if ignore_zero and value == 0:
return
@@ -775,7 +782,7 @@ class CommandImportWordpress(Command, ImportMixin):
elif approved == 'spam' or approved == 'trash':
pass
else:
- LOGGER.warn("Unknown comment approved status: {0}".format(approved))
+ LOGGER.warning("Unknown comment approved status: {0}".format(approved))
parent = int(get_text_tag(comment, "{{{0}}}comment_parent".format(wordpress_namespace), 0))
if parent == 0:
parent = None
@@ -796,7 +803,7 @@ class CommandImportWordpress(Command, ImportMixin):
"""Write comment header line."""
if header_content is None:
return
- header_content = unicode_str(header_content).replace('\n', ' ')
+ header_content = str(header_content).replace('\n', ' ')
line = '.. ' + header_field + ': ' + header_content + '\n'
fd.write(line.encode('utf8'))
@@ -813,6 +820,16 @@ class CommandImportWordpress(Command, ImportMixin):
write_header_line(fd, "wordpress_user_id", comment["user_id"])
fd.write(('\n' + comment['content']).encode('utf8'))
+ def _create_meta_and_content_filenames(self, slug, extension, lang, default_language, translations_config):
+ out_meta_filename = slug + '.meta'
+ out_content_filename = slug + '.' + extension
+ if lang and lang != default_language:
+ out_meta_filename = utils.get_translation_candidate(translations_config,
+ out_meta_filename, lang)
+ out_content_filename = utils.get_translation_candidate(translations_config,
+ out_content_filename, lang)
+ return out_meta_filename, out_content_filename
+
def _create_metadata(self, status, excerpt, tags, categories, post_name=None):
"""Create post metadata."""
other_meta = {'wp-status': status}
@@ -824,16 +841,16 @@ class CommandImportWordpress(Command, ImportMixin):
if text in self._category_paths:
cats.append(self._category_paths[text])
else:
- cats.append(utils.join_hierarchical_category_path([text]))
+ cats.append(hierarchy_utils.join_hierarchical_category_path([utils.html_unescape(text)]))
other_meta['categories'] = ','.join(cats)
if len(cats) > 0:
other_meta['category'] = cats[0]
if len(cats) > 1:
- LOGGER.warn(('Post "{0}" has more than one category! ' +
- 'Will only use the first one.').format(post_name))
- tags_cats = tags
+ LOGGER.warning(('Post "{0}" has more than one category! ' +
+ 'Will only use the first one.').format(post_name))
+ tags_cats = [utils.html_unescape(tag) for tag in tags]
else:
- tags_cats = tags + categories
+ tags_cats = [utils.html_unescape(tag) for tag in tags + categories]
return tags_cats, other_meta
_tag_sanitize_map = {True: {}, False: {}}
@@ -847,7 +864,7 @@ class CommandImportWordpress(Command, ImportMixin):
previous = self._tag_sanitize_map[is_category][tag.lower()]
if self.tag_saniziting_strategy == 'first':
if tag != previous[0]:
- LOGGER.warn("Changing spelling of {0} name '{1}' to {2}.".format('category' if is_category else 'tag', tag, previous[0]))
+ LOGGER.warning("Changing spelling of {0} name '{1}' to {2}.".format('category' if is_category else 'tag', tag, previous[0]))
return previous[0]
else:
LOGGER.error("Unknown tag sanitizing strategy '{0}'!".format(self.tag_saniziting_strategy))
@@ -873,7 +890,7 @@ class CommandImportWordpress(Command, ImportMixin):
path = unquote(parsed.path.strip('/'))
try:
- if isinstance(path, utils.bytes_str):
+ if isinstance(path, bytes):
path = path.decode('utf8', 'replace')
else:
path = path
@@ -925,17 +942,19 @@ class CommandImportWordpress(Command, ImportMixin):
tags = []
categories = []
+ post_status = 'published'
+ has_math = "no"
if status == 'trash':
- LOGGER.warn('Trashed post "{0}" will not be imported.'.format(title))
+ LOGGER.warning('Trashed post "{0}" will not be imported.'.format(title))
return False
elif status == 'private':
- tags.append('private')
is_draft = False
is_private = True
+ post_status = 'private'
elif status != 'publish':
- tags.append('draft')
is_draft = True
is_private = False
+ post_status = 'draft'
else:
is_draft = False
is_private = False
@@ -953,7 +972,7 @@ class CommandImportWordpress(Command, ImportMixin):
tags.append(text)
if '$latex' in content:
- tags.append('mathjax')
+ has_math = "yes"
for i, cat in enumerate(categories[:]):
cat = self._sanitize(cat, True)
@@ -974,52 +993,56 @@ class CommandImportWordpress(Command, ImportMixin):
post_format = 'wp'
if is_draft and self.exclude_drafts:
- LOGGER.notice('Draft "{0}" will not be imported.'.format(title))
+ LOGGER.warning('Draft "{0}" will not be imported.'.format(title))
return False
elif is_private and self.exclude_privates:
- LOGGER.notice('Private post "{0}" will not be imported.'.format(title))
+ LOGGER.warning('Private post "{0}" will not be imported.'.format(title))
return False
elif content.strip() or self.import_empty_items:
# If no content is found, no files are written.
self.url_map[link] = (self.context['SITE_URL'] +
out_folder.rstrip('/') + '/' + slug +
'.html').replace(os.sep, '/')
- if hasattr(self, "separate_qtranslate_content") \
- and self.separate_qtranslate_content:
- content_translations = separate_qtranslate_content(content)
+ default_language = self.context["DEFAULT_LANG"]
+ if self.separate_qtranslate_content:
+ content_translations = separate_qtranslate_tagged_langs(content)
+ title_translations = separate_qtranslate_tagged_langs(title)
else:
content_translations = {"": content}
- default_language = self.context["DEFAULT_LANG"]
+ title_translations = {"": title}
+ # in case of mistmatch between the languages found in the title and in the content
+ default_title = title_translations.get(default_language, title)
+ extra_languages = [lang for lang in content_translations.keys() if lang not in ("", default_language)]
+ for extra_lang in extra_languages:
+ self.extra_languages.add(extra_lang)
+ translations_dict = get_default_translations_dict(default_language, extra_languages)
+ current_translations_config = {
+ "DEFAULT_LANG": default_language,
+ "TRANSLATIONS": translations_dict,
+ "TRANSLATIONS_PATTERN": self.context["TRANSLATIONS_PATTERN"]
+ }
for lang, content in content_translations.items():
try:
content, extension, rewrite_html = self.transform_content(content, post_format, attachments)
- except:
+ except Exception:
LOGGER.error(('Cannot interpret post "{0}" (language {1}) with post ' +
'format {2}!').format(os.path.join(out_folder, slug), lang, post_format))
return False
- if lang:
- out_meta_filename = slug + '.meta'
- if lang == default_language:
- out_content_filename = slug + '.' + extension
- else:
- out_content_filename \
- = utils.get_translation_candidate(self.context,
- slug + "." + extension, lang)
- self.extra_languages.add(lang)
- meta_slug = slug
- else:
- out_meta_filename = slug + '.meta'
- out_content_filename = slug + '.' + extension
- meta_slug = slug
+
+ out_meta_filename, out_content_filename = self._create_meta_and_content_filenames(
+ slug, extension, lang, default_language, current_translations_config)
+
tags, other_meta = self._create_metadata(status, excerpt, tags, categories,
post_name=os.path.join(out_folder, slug))
-
+ current_title = title_translations.get(lang, default_title)
meta = {
- "title": title,
- "slug": meta_slug,
+ "title": current_title,
+ "slug": slug,
"date": post_date,
"description": description,
"tags": ','.join(tags),
+ "status": post_status,
+ "has_math": has_math,
}
meta.update(other_meta)
if self.onefile:
@@ -1033,7 +1056,7 @@ class CommandImportWordpress(Command, ImportMixin):
else:
self.write_metadata(os.path.join(self.output_folder, out_folder,
out_meta_filename),
- title, meta_slug, post_date, description, tags, **other_meta)
+ current_title, slug, post_date, description, tags, **other_meta)
self.write_content(
os.path.join(self.output_folder,
out_folder, out_content_filename),
@@ -1053,8 +1076,8 @@ class CommandImportWordpress(Command, ImportMixin):
return (out_folder, slug)
else:
- LOGGER.warn(('Not going to import "{0}" because it seems to contain'
- ' no content.').format(title))
+ LOGGER.warning(('Not going to import "{0}" because it seems to contain'
+ ' no content.').format(title))
return False
def _extract_item_info(self, item):
@@ -1080,7 +1103,7 @@ class CommandImportWordpress(Command, ImportMixin):
if parent_id is not None and int(parent_id) != 0:
self.attachments[int(parent_id)][post_id] = data
else:
- LOGGER.warn("Attachment #{0} ({1}) has no parent!".format(post_id, data['files']))
+ LOGGER.warning("Attachment #{0} ({1}) has no parent!".format(post_id, data['files']))
def write_attachments_info(self, path, attachments):
"""Write attachments info file."""
@@ -1118,8 +1141,8 @@ class CommandImportWordpress(Command, ImportMixin):
self.process_item_if_post_or_page(item)
# Assign attachments to posts
for post_id in self.attachments:
- LOGGER.warn(("Found attachments for post or page #{0}, but didn't find post or page. " +
- "(Attachments: {1})").format(post_id, [e['files'][0] for e in self.attachments[post_id].values()]))
+ LOGGER.warning(("Found attachments for post or page #{0}, but didn't find post or page. " +
+ "(Attachments: {1})").format(post_id, [e['files'][0] for e in self.attachments[post_id].values()]))
def get_text_tag(tag, name, default):
@@ -1133,15 +1156,20 @@ def get_text_tag(tag, name, default):
return default
-def separate_qtranslate_content(text):
- """Parse the content of a wordpress post or page and separate qtranslate languages.
+def separate_qtranslate_tagged_langs(text):
+ """Parse the content of a wordpress post or page and separate languages.
+
+ For qtranslateX tags: [:LL]blabla[:]
- qtranslate tags: <!--:LL-->blabla<!--:-->
+ Note: qtranslate* plugins had a troubled history and used various
+ tags over time, application of the 'modernize_qtranslate_tags'
+ function is required for this function to handle most of the legacy
+ cases.
"""
- # TODO: uniformize qtranslate tags <!--/en--> => <!--:-->
- qt_start = "<!--:"
- qt_end = "-->"
- qt_end_with_lang_len = 5
+ qt_start = "[:"
+ qt_end = "]"
+ qt_end_len = len(qt_end)
+ qt_end_with_lang_len = qt_end_len + 2
qt_chunks = text.split(qt_start)
content_by_lang = {}
common_txt_list = []
@@ -1153,9 +1181,9 @@ def separate_qtranslate_content(text):
# be some piece of common text or tags, or just nothing
lang = "" # default language
c = c.lstrip(qt_end)
- if not c:
+ if not c.strip():
continue
- elif c[2:].startswith(qt_end):
+ elif c[2:qt_end_with_lang_len].startswith(qt_end):
# a language specific section (with language code at the begining)
lang = c[:2]
c = c[qt_end_with_lang_len:]
@@ -1176,3 +1204,26 @@ def separate_qtranslate_content(text):
for l in content_by_lang.keys():
content_by_lang[l] = " ".join(content_by_lang[l])
return content_by_lang
+
+
+def modernize_qtranslate_tags(xml_bytes):
+ """
+ Uniformize the "tag" used by various version of qtranslate.
+
+ The resulting byte string will only contain one set of qtranslate tags
+ (namely [:LG] and [:]), older ones being converted to new ones.
+ """
+ old_start_lang = re.compile(b"<!--:?(\\w{2})-->")
+ new_start_lang = b"[:\\1]"
+ old_end_lang = re.compile(b"<!--(/\\w{2}|:)-->")
+ new_end_lang = b"[:]"
+ title_match = re.compile(b"<title>(.*?)</title>")
+ modern_starts = old_start_lang.sub(new_start_lang, xml_bytes)
+ modernized_bytes = old_end_lang.sub(new_end_lang, modern_starts)
+
+ def title_escape(match):
+ title = match.group(1)
+ title = title.replace(b"&", b"&amp;").replace(b"<", b"&lt;").replace(b">", b"&gt;")
+ return b"<title>" + title + b"</title>"
+ fixed_bytes = title_match.sub(title_escape, modernized_bytes)
+ return fixed_bytes