diff options
Diffstat (limited to 'nikola/plugins/command/import_wordpress.py')
| -rw-r--r-- | nikola/plugins/command/import_wordpress.py | 283 |
1 files changed, 167 insertions, 116 deletions
diff --git a/nikola/plugins/command/import_wordpress.py b/nikola/plugins/command/import_wordpress.py index 0b48583..5e2aee6 100644 --- a/nikola/plugins/command/import_wordpress.py +++ b/nikola/plugins/command/import_wordpress.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright © 2012-2016 Roberto Alsina and others. +# Copyright © 2012-2020 Roberto Alsina and others. # Permission is hereby granted, free of charge, to any # person obtaining a copy of this software and associated @@ -26,46 +26,45 @@ """Import a WordPress dump.""" -from __future__ import unicode_literals, print_function -import os -import re -import sys import datetime import io import json +import os +import re +import sys +from collections import defaultdict +from urllib.parse import urlparse, unquote + import requests from lxml import etree -from collections import defaultdict -try: - import html2text -except: - html2text = None +from nikola.plugin_categories import Command +from nikola import utils, hierarchy_utils +from nikola.nikola import DEFAULT_TRANSLATIONS_PATTERN +from nikola.utils import req_missing +from nikola.plugins.basic_import import ImportMixin, links +from nikola.plugins.command.init import ( + SAMPLE_CONF, prepare_config, + format_default_translations_config, + get_default_translations_dict +) try: - from urlparse import urlparse - from urllib import unquote + import html2text except ImportError: - from urllib.parse import urlparse, unquote # NOQA + html2text = None try: import phpserialize except ImportError: - phpserialize = None # NOQA + phpserialize = None -from nikola.plugin_categories import Command -from nikola import utils -from nikola.utils import req_missing, unicode_str -from nikola.plugins.basic_import import ImportMixin, links -from nikola.nikola import DEFAULT_TRANSLATIONS_PATTERN -from nikola.plugins.command.init import SAMPLE_CONF, prepare_config, format_default_translations_config - -LOGGER = utils.get_logger('import_wordpress', utils.STDERR_HANDLER) +LOGGER = utils.get_logger('import_wordpress') def install_plugin(site, plugin_name, output_dir=None, show_install_notes=False): """Install a Nikola plugin.""" - LOGGER.notice("Installing plugin '{0}'".format(plugin_name)) + LOGGER.info("Installing plugin '{0}'".format(plugin_name)) # Get hold of the 'plugin' plugin plugin_installer_info = site.plugin_manager.getPluginByName('plugin', 'Command') if plugin_installer_info is None: @@ -148,15 +147,22 @@ class CommandImportWordpress(Command, ImportMixin): 'long': 'qtranslate', 'default': False, 'type': bool, - 'help': "Look for translations generated by qtranslate plugin", - # WARNING: won't recover translated titles that actually - # don't seem to be part of the wordpress XML export at the - # time of writing :( + 'help': """Look for translations generated by qtranslate plugin. +WARNING: a default wordpress export won't allow to recover title translations. +For this to be possible consider applying the hack suggested at +https://github.com/qtranslate/qtranslate-xt/issues/199 : + +In wp-admin/includes/export.php change +`echo apply_filters( 'the_title_rss', $post->post_title ); + +to +`echo apply_filters( 'the_title_export', $post->post_title ); +""" }, { 'name': 'translations_pattern', 'long': 'translations_pattern', - 'default': None, + 'default': DEFAULT_TRANSLATIONS_PATTERN, 'type': str, 'help': "The pattern for translation files names", }, @@ -259,9 +265,9 @@ class CommandImportWordpress(Command, ImportMixin): options['output_folder'] = args.pop(0) if args: - LOGGER.warn('You specified additional arguments ({0}). Please consider ' - 'putting these arguments before the filename if you ' - 'are running into problems.'.format(args)) + LOGGER.warning('You specified additional arguments ({0}). Please consider ' + 'putting these arguments before the filename if you ' + 'are running into problems.'.format(args)) self.onefile = options.get('one_file', False) @@ -307,7 +313,7 @@ class CommandImportWordpress(Command, ImportMixin): LOGGER.error("You can use at most one of the options --html2text, --transform-to-html and --transform-to-markdown.") return False if (self.html2text or self.transform_to_html or self.transform_to_markdown) and self.use_wordpress_compiler: - LOGGER.warn("It does not make sense to combine --use-wordpress-compiler with any of --html2text, --transform-to-html and --transform-to-markdown, as the latter convert all posts to HTML and the first option then affects zero posts.") + LOGGER.warning("It does not make sense to combine --use-wordpress-compiler with any of --html2text, --transform-to-html and --transform-to-markdown, as the latter convert all posts to HTML and the first option then affects zero posts.") if (self.html2text or self.transform_to_markdown) and not html2text: LOGGER.error("You need to install html2text via 'pip install html2text' before you can use the --html2text and --transform-to-markdown options.") @@ -339,14 +345,14 @@ class CommandImportWordpress(Command, ImportMixin): # cat_id = get_text_tag(cat, '{{{0}}}term_id'.format(wordpress_namespace), None) cat_slug = get_text_tag(cat, '{{{0}}}category_nicename'.format(wordpress_namespace), None) cat_parent_slug = get_text_tag(cat, '{{{0}}}category_parent'.format(wordpress_namespace), None) - cat_name = get_text_tag(cat, '{{{0}}}cat_name'.format(wordpress_namespace), None) + cat_name = utils.html_unescape(get_text_tag(cat, '{{{0}}}cat_name'.format(wordpress_namespace), None)) cat_path = [cat_name] if cat_parent_slug in cat_map: cat_path = cat_map[cat_parent_slug] + cat_path cat_map[cat_slug] = cat_path self._category_paths = dict() for cat, path in cat_map.items(): - self._category_paths[cat] = utils.join_hierarchical_category_path(path) + self._category_paths[cat] = hierarchy_utils.join_hierarchical_category_path(path) def _execute(self, options={}, args=[]): """Import a WordPress blog from an export file into a Nikola site.""" @@ -373,17 +379,12 @@ class CommandImportWordpress(Command, ImportMixin): if phpserialize is None: req_missing(['phpserialize'], 'import WordPress dumps without --no-downloads') - channel = self.get_channel_from_file(self.wordpress_export_file) + export_file_preprocessor = modernize_qtranslate_tags if self.separate_qtranslate_content else None + channel = self.get_channel_from_file(self.wordpress_export_file, export_file_preprocessor) self._prepare(channel) conf_template = self.generate_base_site() - # If user has specified a custom pattern for translation files we - # need to fix the config - if self.translations_pattern: - self.context['TRANSLATIONS_PATTERN'] = self.translations_pattern - self.import_posts(channel) - self.context['TRANSLATIONS'] = format_default_translations_config( self.extra_languages) self.context['REDIRECTIONS'] = self.configure_redirections( @@ -397,7 +398,7 @@ class CommandImportWordpress(Command, ImportMixin): # Add tag redirects for tag in self.all_tags: try: - if isinstance(tag, utils.bytes_str): + if isinstance(tag, bytes): tag_str = tag.decode('utf8', 'replace') else: tag_str = tag @@ -420,9 +421,9 @@ class CommandImportWordpress(Command, ImportMixin): if not install_plugin(self.site, 'wordpress_compiler', output_dir=os.path.join(self.output_folder, 'plugins')): return False else: - LOGGER.warn("Make sure to install the WordPress page compiler via") - LOGGER.warn(" nikola plugin -i wordpress_compiler") - LOGGER.warn("in your imported blog's folder ({0}), if you haven't installed it system-wide or user-wide. Otherwise, your newly imported blog won't compile.".format(self.output_folder)) + LOGGER.warning("Make sure to install the WordPress page compiler via") + LOGGER.warning(" nikola plugin -i wordpress_compiler") + LOGGER.warning("in your imported blog's folder ({0}), if you haven't installed it system-wide or user-wide. Otherwise, your newly imported blog won't compile.".format(self.output_folder)) @classmethod def read_xml_file(cls, filename): @@ -438,9 +439,16 @@ class CommandImportWordpress(Command, ImportMixin): return b''.join(xml) @classmethod - def get_channel_from_file(cls, filename): - """Get channel from XML file.""" - tree = etree.fromstring(cls.read_xml_file(filename)) + def get_channel_from_file(cls, filename, xml_preprocessor=None): + """Get channel from XML file. + + An optional 'xml_preprocessor' allows to modify the xml + (typically to deal with variations in tags injected by some WP plugin) + """ + xml_string = cls.read_xml_file(filename) + if xml_preprocessor: + xml_string = xml_preprocessor(xml_string) + tree = etree.fromstring(xml_string) channel = tree.find('channel') return channel @@ -451,7 +459,10 @@ class CommandImportWordpress(Command, ImportMixin): context = SAMPLE_CONF.copy() self.lang = get_text_tag(channel, 'language', 'en')[:2] context['DEFAULT_LANG'] = self.lang - context['TRANSLATIONS_PATTERN'] = DEFAULT_TRANSLATIONS_PATTERN + # If user has specified a custom pattern for translation files we + # need to fix the config + context['TRANSLATIONS_PATTERN'] = self.translations_pattern + context['BLOG_TITLE'] = get_text_tag(channel, 'title', 'PUT TITLE HERE') context['BLOG_DESCRIPTION'] = get_text_tag( @@ -482,17 +493,17 @@ class CommandImportWordpress(Command, ImportMixin): PAGES = '(\n' for extension in extensions: POSTS += ' ("posts/*.{0}", "posts", "post.tmpl"),\n'.format(extension) - PAGES += ' ("pages/*.{0}", "pages", "story.tmpl"),\n'.format(extension) + PAGES += ' ("pages/*.{0}", "pages", "page.tmpl"),\n'.format(extension) POSTS += ')\n' PAGES += ')\n' context['POSTS'] = POSTS context['PAGES'] = PAGES COMPILERS = '{\n' - COMPILERS += ''' "rest": ('.txt', '.rst'),''' + '\n' - COMPILERS += ''' "markdown": ('.md', '.mdown', '.markdown'),''' + '\n' - COMPILERS += ''' "html": ('.html', '.htm'),''' + '\n' + COMPILERS += ''' "rest": ['.txt', '.rst'],''' + '\n' + COMPILERS += ''' "markdown": ['.md', '.mdown', '.markdown'],''' + '\n' + COMPILERS += ''' "html": ['.html', '.htm'],''' + '\n' if self.use_wordpress_compiler: - COMPILERS += ''' "wordpress": ('.wp'),''' + '\n' + COMPILERS += ''' "wordpress": ['.wp'],''' + '\n' COMPILERS += '}' context['COMPILERS'] = COMPILERS @@ -503,12 +514,12 @@ class CommandImportWordpress(Command, ImportMixin): try: request = requests.get(url, auth=self.auth) if request.status_code >= 400: - LOGGER.warn("Downloading {0} to {1} failed with HTTP status code {2}".format(url, dst_path, request.status_code)) + LOGGER.warning("Downloading {0} to {1} failed with HTTP status code {2}".format(url, dst_path, request.status_code)) return with open(dst_path, 'wb+') as fd: fd.write(request.content) except requests.exceptions.ConnectionError as err: - LOGGER.warn("Downloading {0} to {1} failed: {2}".format(url, dst_path, err)) + LOGGER.warning("Downloading {0} to {1} failed: {2}".format(url, dst_path, err)) def import_attachment(self, item, wordpress_namespace): """Import an attachment to the site.""" @@ -549,14 +560,7 @@ class CommandImportWordpress(Command, ImportMixin): # that the export should give you the power to insert # your blogging into another site or system its not. # Why don't they just use JSON? - if sys.version_info[0] == 2: - try: - metadata = phpserialize.loads(utils.sys_encode(meta_value.text)) - except ValueError: - # local encoding might be wrong sometimes - metadata = phpserialize.loads(meta_value.text.encode('utf-8')) - else: - metadata = phpserialize.loads(meta_value.text.encode('utf-8')) + metadata = phpserialize.loads(meta_value.text.encode('utf-8')) meta_key = b'image_meta' size_key = b'sizes' @@ -583,6 +587,9 @@ class CommandImportWordpress(Command, ImportMixin): if ignore_zero and value == 0: return elif is_float: + # in some locales (like fr) and for old posts there may be a comma here. + if isinstance(value, bytes): + value = value.replace(b",", b".") value = float(value) if ignore_zero and value == 0: return @@ -775,7 +782,7 @@ class CommandImportWordpress(Command, ImportMixin): elif approved == 'spam' or approved == 'trash': pass else: - LOGGER.warn("Unknown comment approved status: {0}".format(approved)) + LOGGER.warning("Unknown comment approved status: {0}".format(approved)) parent = int(get_text_tag(comment, "{{{0}}}comment_parent".format(wordpress_namespace), 0)) if parent == 0: parent = None @@ -796,7 +803,7 @@ class CommandImportWordpress(Command, ImportMixin): """Write comment header line.""" if header_content is None: return - header_content = unicode_str(header_content).replace('\n', ' ') + header_content = str(header_content).replace('\n', ' ') line = '.. ' + header_field + ': ' + header_content + '\n' fd.write(line.encode('utf8')) @@ -813,6 +820,16 @@ class CommandImportWordpress(Command, ImportMixin): write_header_line(fd, "wordpress_user_id", comment["user_id"]) fd.write(('\n' + comment['content']).encode('utf8')) + def _create_meta_and_content_filenames(self, slug, extension, lang, default_language, translations_config): + out_meta_filename = slug + '.meta' + out_content_filename = slug + '.' + extension + if lang and lang != default_language: + out_meta_filename = utils.get_translation_candidate(translations_config, + out_meta_filename, lang) + out_content_filename = utils.get_translation_candidate(translations_config, + out_content_filename, lang) + return out_meta_filename, out_content_filename + def _create_metadata(self, status, excerpt, tags, categories, post_name=None): """Create post metadata.""" other_meta = {'wp-status': status} @@ -824,16 +841,16 @@ class CommandImportWordpress(Command, ImportMixin): if text in self._category_paths: cats.append(self._category_paths[text]) else: - cats.append(utils.join_hierarchical_category_path([text])) + cats.append(hierarchy_utils.join_hierarchical_category_path([utils.html_unescape(text)])) other_meta['categories'] = ','.join(cats) if len(cats) > 0: other_meta['category'] = cats[0] if len(cats) > 1: - LOGGER.warn(('Post "{0}" has more than one category! ' + - 'Will only use the first one.').format(post_name)) - tags_cats = tags + LOGGER.warning(('Post "{0}" has more than one category! ' + + 'Will only use the first one.').format(post_name)) + tags_cats = [utils.html_unescape(tag) for tag in tags] else: - tags_cats = tags + categories + tags_cats = [utils.html_unescape(tag) for tag in tags + categories] return tags_cats, other_meta _tag_sanitize_map = {True: {}, False: {}} @@ -847,7 +864,7 @@ class CommandImportWordpress(Command, ImportMixin): previous = self._tag_sanitize_map[is_category][tag.lower()] if self.tag_saniziting_strategy == 'first': if tag != previous[0]: - LOGGER.warn("Changing spelling of {0} name '{1}' to {2}.".format('category' if is_category else 'tag', tag, previous[0])) + LOGGER.warning("Changing spelling of {0} name '{1}' to {2}.".format('category' if is_category else 'tag', tag, previous[0])) return previous[0] else: LOGGER.error("Unknown tag sanitizing strategy '{0}'!".format(self.tag_saniziting_strategy)) @@ -873,7 +890,7 @@ class CommandImportWordpress(Command, ImportMixin): path = unquote(parsed.path.strip('/')) try: - if isinstance(path, utils.bytes_str): + if isinstance(path, bytes): path = path.decode('utf8', 'replace') else: path = path @@ -925,17 +942,19 @@ class CommandImportWordpress(Command, ImportMixin): tags = [] categories = [] + post_status = 'published' + has_math = "no" if status == 'trash': - LOGGER.warn('Trashed post "{0}" will not be imported.'.format(title)) + LOGGER.warning('Trashed post "{0}" will not be imported.'.format(title)) return False elif status == 'private': - tags.append('private') is_draft = False is_private = True + post_status = 'private' elif status != 'publish': - tags.append('draft') is_draft = True is_private = False + post_status = 'draft' else: is_draft = False is_private = False @@ -953,7 +972,7 @@ class CommandImportWordpress(Command, ImportMixin): tags.append(text) if '$latex' in content: - tags.append('mathjax') + has_math = "yes" for i, cat in enumerate(categories[:]): cat = self._sanitize(cat, True) @@ -974,52 +993,56 @@ class CommandImportWordpress(Command, ImportMixin): post_format = 'wp' if is_draft and self.exclude_drafts: - LOGGER.notice('Draft "{0}" will not be imported.'.format(title)) + LOGGER.warning('Draft "{0}" will not be imported.'.format(title)) return False elif is_private and self.exclude_privates: - LOGGER.notice('Private post "{0}" will not be imported.'.format(title)) + LOGGER.warning('Private post "{0}" will not be imported.'.format(title)) return False elif content.strip() or self.import_empty_items: # If no content is found, no files are written. self.url_map[link] = (self.context['SITE_URL'] + out_folder.rstrip('/') + '/' + slug + '.html').replace(os.sep, '/') - if hasattr(self, "separate_qtranslate_content") \ - and self.separate_qtranslate_content: - content_translations = separate_qtranslate_content(content) + default_language = self.context["DEFAULT_LANG"] + if self.separate_qtranslate_content: + content_translations = separate_qtranslate_tagged_langs(content) + title_translations = separate_qtranslate_tagged_langs(title) else: content_translations = {"": content} - default_language = self.context["DEFAULT_LANG"] + title_translations = {"": title} + # in case of mistmatch between the languages found in the title and in the content + default_title = title_translations.get(default_language, title) + extra_languages = [lang for lang in content_translations.keys() if lang not in ("", default_language)] + for extra_lang in extra_languages: + self.extra_languages.add(extra_lang) + translations_dict = get_default_translations_dict(default_language, extra_languages) + current_translations_config = { + "DEFAULT_LANG": default_language, + "TRANSLATIONS": translations_dict, + "TRANSLATIONS_PATTERN": self.context["TRANSLATIONS_PATTERN"] + } for lang, content in content_translations.items(): try: content, extension, rewrite_html = self.transform_content(content, post_format, attachments) - except: + except Exception: LOGGER.error(('Cannot interpret post "{0}" (language {1}) with post ' + 'format {2}!').format(os.path.join(out_folder, slug), lang, post_format)) return False - if lang: - out_meta_filename = slug + '.meta' - if lang == default_language: - out_content_filename = slug + '.' + extension - else: - out_content_filename \ - = utils.get_translation_candidate(self.context, - slug + "." + extension, lang) - self.extra_languages.add(lang) - meta_slug = slug - else: - out_meta_filename = slug + '.meta' - out_content_filename = slug + '.' + extension - meta_slug = slug + + out_meta_filename, out_content_filename = self._create_meta_and_content_filenames( + slug, extension, lang, default_language, current_translations_config) + tags, other_meta = self._create_metadata(status, excerpt, tags, categories, post_name=os.path.join(out_folder, slug)) - + current_title = title_translations.get(lang, default_title) meta = { - "title": title, - "slug": meta_slug, + "title": current_title, + "slug": slug, "date": post_date, "description": description, "tags": ','.join(tags), + "status": post_status, + "has_math": has_math, } meta.update(other_meta) if self.onefile: @@ -1033,7 +1056,7 @@ class CommandImportWordpress(Command, ImportMixin): else: self.write_metadata(os.path.join(self.output_folder, out_folder, out_meta_filename), - title, meta_slug, post_date, description, tags, **other_meta) + current_title, slug, post_date, description, tags, **other_meta) self.write_content( os.path.join(self.output_folder, out_folder, out_content_filename), @@ -1053,8 +1076,8 @@ class CommandImportWordpress(Command, ImportMixin): return (out_folder, slug) else: - LOGGER.warn(('Not going to import "{0}" because it seems to contain' - ' no content.').format(title)) + LOGGER.warning(('Not going to import "{0}" because it seems to contain' + ' no content.').format(title)) return False def _extract_item_info(self, item): @@ -1080,7 +1103,7 @@ class CommandImportWordpress(Command, ImportMixin): if parent_id is not None and int(parent_id) != 0: self.attachments[int(parent_id)][post_id] = data else: - LOGGER.warn("Attachment #{0} ({1}) has no parent!".format(post_id, data['files'])) + LOGGER.warning("Attachment #{0} ({1}) has no parent!".format(post_id, data['files'])) def write_attachments_info(self, path, attachments): """Write attachments info file.""" @@ -1118,8 +1141,8 @@ class CommandImportWordpress(Command, ImportMixin): self.process_item_if_post_or_page(item) # Assign attachments to posts for post_id in self.attachments: - LOGGER.warn(("Found attachments for post or page #{0}, but didn't find post or page. " + - "(Attachments: {1})").format(post_id, [e['files'][0] for e in self.attachments[post_id].values()])) + LOGGER.warning(("Found attachments for post or page #{0}, but didn't find post or page. " + + "(Attachments: {1})").format(post_id, [e['files'][0] for e in self.attachments[post_id].values()])) def get_text_tag(tag, name, default): @@ -1133,15 +1156,20 @@ def get_text_tag(tag, name, default): return default -def separate_qtranslate_content(text): - """Parse the content of a wordpress post or page and separate qtranslate languages. +def separate_qtranslate_tagged_langs(text): + """Parse the content of a wordpress post or page and separate languages. + + For qtranslateX tags: [:LL]blabla[:] - qtranslate tags: <!--:LL-->blabla<!--:--> + Note: qtranslate* plugins had a troubled history and used various + tags over time, application of the 'modernize_qtranslate_tags' + function is required for this function to handle most of the legacy + cases. """ - # TODO: uniformize qtranslate tags <!--/en--> => <!--:--> - qt_start = "<!--:" - qt_end = "-->" - qt_end_with_lang_len = 5 + qt_start = "[:" + qt_end = "]" + qt_end_len = len(qt_end) + qt_end_with_lang_len = qt_end_len + 2 qt_chunks = text.split(qt_start) content_by_lang = {} common_txt_list = [] @@ -1153,9 +1181,9 @@ def separate_qtranslate_content(text): # be some piece of common text or tags, or just nothing lang = "" # default language c = c.lstrip(qt_end) - if not c: + if not c.strip(): continue - elif c[2:].startswith(qt_end): + elif c[2:qt_end_with_lang_len].startswith(qt_end): # a language specific section (with language code at the begining) lang = c[:2] c = c[qt_end_with_lang_len:] @@ -1176,3 +1204,26 @@ def separate_qtranslate_content(text): for l in content_by_lang.keys(): content_by_lang[l] = " ".join(content_by_lang[l]) return content_by_lang + + +def modernize_qtranslate_tags(xml_bytes): + """ + Uniformize the "tag" used by various version of qtranslate. + + The resulting byte string will only contain one set of qtranslate tags + (namely [:LG] and [:]), older ones being converted to new ones. + """ + old_start_lang = re.compile(b"<!--:?(\\w{2})-->") + new_start_lang = b"[:\\1]" + old_end_lang = re.compile(b"<!--(/\\w{2}|:)-->") + new_end_lang = b"[:]" + title_match = re.compile(b"<title>(.*?)</title>") + modern_starts = old_start_lang.sub(new_start_lang, xml_bytes) + modernized_bytes = old_end_lang.sub(new_end_lang, modern_starts) + + def title_escape(match): + title = match.group(1) + title = title.replace(b"&", b"&").replace(b"<", b"<").replace(b">", b">") + return b"<title>" + title + b"</title>" + fixed_bytes = title_match.sub(title_escape, modernized_bytes) + return fixed_bytes |
