diff options
Diffstat (limited to 'nikola/plugins/command/import_wordpress.py')
| -rw-r--r-- | nikola/plugins/command/import_wordpress.py | 480 |
1 files changed, 337 insertions, 143 deletions
diff --git a/nikola/plugins/command/import_wordpress.py b/nikola/plugins/command/import_wordpress.py index a652ec8..5e2aee6 100644 --- a/nikola/plugins/command/import_wordpress.py +++ b/nikola/plugins/command/import_wordpress.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright © 2012-2015 Roberto Alsina and others. +# Copyright © 2012-2020 Roberto Alsina and others. # Permission is hereby granted, free of charge, to any # person obtaining a copy of this software and associated @@ -26,41 +26,45 @@ """Import a WordPress dump.""" -from __future__ import unicode_literals, print_function -import os -import re -import sys import datetime import io import json +import os +import re +import sys +from collections import defaultdict +from urllib.parse import urlparse, unquote + import requests from lxml import etree -from collections import defaultdict + +from nikola.plugin_categories import Command +from nikola import utils, hierarchy_utils +from nikola.nikola import DEFAULT_TRANSLATIONS_PATTERN +from nikola.utils import req_missing +from nikola.plugins.basic_import import ImportMixin, links +from nikola.plugins.command.init import ( + SAMPLE_CONF, prepare_config, + format_default_translations_config, + get_default_translations_dict +) try: - from urlparse import urlparse - from urllib import unquote + import html2text except ImportError: - from urllib.parse import urlparse, unquote # NOQA + html2text = None try: import phpserialize except ImportError: - phpserialize = None # NOQA + phpserialize = None -from nikola.plugin_categories import Command -from nikola import utils -from nikola.utils import req_missing -from nikola.plugins.basic_import import ImportMixin, links -from nikola.nikola import DEFAULT_TRANSLATIONS_PATTERN -from nikola.plugins.command.init import SAMPLE_CONF, prepare_config, format_default_translations_config - -LOGGER = utils.get_logger('import_wordpress', utils.STDERR_HANDLER) +LOGGER = utils.get_logger('import_wordpress') def install_plugin(site, 
plugin_name, output_dir=None, show_install_notes=False): """Install a Nikola plugin.""" - LOGGER.notice("Installing plugin '{0}'".format(plugin_name)) + LOGGER.info("Installing plugin '{0}'".format(plugin_name)) # Get hold of the 'plugin' plugin plugin_installer_info = site.plugin_manager.getPluginByName('plugin', 'Command') if plugin_installer_info is None: @@ -88,7 +92,6 @@ def install_plugin(site, plugin_name, output_dir=None, show_install_notes=False) class CommandImportWordpress(Command, ImportMixin): - """Import a WordPress dump.""" name = "import_wordpress" @@ -144,15 +147,22 @@ class CommandImportWordpress(Command, ImportMixin): 'long': 'qtranslate', 'default': False, 'type': bool, - 'help': "Look for translations generated by qtranslate plugin", - # WARNING: won't recover translated titles that actually - # don't seem to be part of the wordpress XML export at the - # time of writing :( + 'help': """Look for translations generated by qtranslate plugin. +WARNING: a default wordpress export won't allow to recover title translations. 
+For this to be possible consider applying the hack suggested at +https://github.com/qtranslate/qtranslate-xt/issues/199 : + +In wp-admin/includes/export.php change +`echo apply_filters( 'the_title_rss', $post->post_title ); + +to +`echo apply_filters( 'the_title_export', $post->post_title ); +""" }, { 'name': 'translations_pattern', 'long': 'translations_pattern', - 'default': None, + 'default': DEFAULT_TRANSLATIONS_PATTERN, 'type': str, 'help': "The pattern for translation files names", }, @@ -171,6 +181,20 @@ class CommandImportWordpress(Command, ImportMixin): 'help': "Export comments as .wpcomment files", }, { + 'name': 'html2text', + 'long': 'html2text', + 'default': False, + 'type': bool, + 'help': "Uses html2text (needs to be installed with pip) to transform WordPress posts to MarkDown during import", + }, + { + 'name': 'transform_to_markdown', + 'long': 'transform-to-markdown', + 'default': False, + 'type': bool, + 'help': "Uses WordPress page compiler to transform WordPress posts to HTML and then use html2text to transform them to MarkDown during import", + }, + { 'name': 'transform_to_html', 'long': 'transform-to-html', 'default': False, @@ -191,9 +215,36 @@ class CommandImportWordpress(Command, ImportMixin): 'type': bool, 'help': "Automatically installs the WordPress page compiler (either locally or in the new site) if required by other options.\nWarning: the compiler is GPL software!", }, + { + 'name': 'tag_sanitizing_strategy', + 'long': 'tag-sanitizing-strategy', + 'default': 'first', + 'help': 'lower: Convert all tag and category names to lower case\nfirst: Keep first spelling of tag or category name', + }, + { + 'name': 'one_file', + 'long': 'one-file', + 'default': False, + 'type': bool, + 'help': "Save imported posts in the more modern one-file format.", + }, ] all_tags = set([]) + def _get_compiler(self): + """Return whatever compiler we will use.""" + self._find_wordpress_compiler() + if self.wordpress_page_compiler is not None: + return 
self.wordpress_page_compiler + plugin_info = self.site.plugin_manager.getPluginByName('markdown', 'PageCompiler') + if plugin_info is not None: + if not plugin_info.is_activated: + self.site.plugin_manager.activatePluginByName(plugin_info.name) + plugin_info.plugin_object.set_site(self.site) + return plugin_info.plugin_object + else: + LOGGER.error("Can't find markdown post compiler.") + def _find_wordpress_compiler(self): """Find WordPress compiler plugin.""" if self.wordpress_page_compiler is not None: @@ -214,9 +265,11 @@ class CommandImportWordpress(Command, ImportMixin): options['output_folder'] = args.pop(0) if args: - LOGGER.warn('You specified additional arguments ({0}). Please consider ' - 'putting these arguments before the filename if you ' - 'are running into problems.'.format(args)) + LOGGER.warning('You specified additional arguments ({0}). Please consider ' + 'putting these arguments before the filename if you ' + 'are running into problems.'.format(args)) + + self.onefile = options.get('one_file', False) self.import_into_existing_site = False self.url_map = {} @@ -234,11 +287,16 @@ class CommandImportWordpress(Command, ImportMixin): self.export_categories_as_categories = options.get('export_categories_as_categories', False) self.export_comments = options.get('export_comments', False) + self.html2text = options.get('html2text', False) + self.transform_to_markdown = options.get('transform_to_markdown', False) + self.transform_to_html = options.get('transform_to_html', False) self.use_wordpress_compiler = options.get('use_wordpress_compiler', False) self.install_wordpress_compiler = options.get('install_wordpress_compiler', False) self.wordpress_page_compiler = None + self.tag_saniziting_strategy = options.get('tag_saniziting_strategy', 'first') + self.auth = None if options.get('download_auth') is not None: username_password = options.get('download_auth') @@ -250,10 +308,18 @@ class CommandImportWordpress(Command, ImportMixin): 
self.separate_qtranslate_content = options.get('separate_qtranslate_content') self.translations_pattern = options.get('translations_pattern') - if self.transform_to_html and self.use_wordpress_compiler: - LOGGER.warn("It does not make sense to combine --transform-to-html with --use-wordpress-compiler, as the first converts all posts to HTML and the latter option affects zero posts.") + count = (1 if self.html2text else 0) + (1 if self.transform_to_html else 0) + (1 if self.transform_to_markdown else 0) + if count > 1: + LOGGER.error("You can use at most one of the options --html2text, --transform-to-html and --transform-to-markdown.") + return False + if (self.html2text or self.transform_to_html or self.transform_to_markdown) and self.use_wordpress_compiler: + LOGGER.warning("It does not make sense to combine --use-wordpress-compiler with any of --html2text, --transform-to-html and --transform-to-markdown, as the latter convert all posts to HTML and the first option then affects zero posts.") + + if (self.html2text or self.transform_to_markdown) and not html2text: + LOGGER.error("You need to install html2text via 'pip install html2text' before you can use the --html2text and --transform-to-markdown options.") + return False - if self.transform_to_html: + if self.transform_to_html or self.transform_to_markdown: self._find_wordpress_compiler() if not self.wordpress_page_compiler and self.install_wordpress_compiler: if not install_plugin(self.site, 'wordpress_compiler', output_dir='plugins'): # local install @@ -279,14 +345,14 @@ class CommandImportWordpress(Command, ImportMixin): # cat_id = get_text_tag(cat, '{{{0}}}term_id'.format(wordpress_namespace), None) cat_slug = get_text_tag(cat, '{{{0}}}category_nicename'.format(wordpress_namespace), None) cat_parent_slug = get_text_tag(cat, '{{{0}}}category_parent'.format(wordpress_namespace), None) - cat_name = get_text_tag(cat, '{{{0}}}cat_name'.format(wordpress_namespace), None) + cat_name = 
utils.html_unescape(get_text_tag(cat, '{{{0}}}cat_name'.format(wordpress_namespace), None)) cat_path = [cat_name] if cat_parent_slug in cat_map: cat_path = cat_map[cat_parent_slug] + cat_path cat_map[cat_slug] = cat_path self._category_paths = dict() for cat, path in cat_map.items(): - self._category_paths[cat] = utils.join_hierarchical_category_path(path) + self._category_paths[cat] = hierarchy_utils.join_hierarchical_category_path(path) def _execute(self, options={}, args=[]): """Import a WordPress blog from an export file into a Nikola site.""" @@ -313,21 +379,16 @@ class CommandImportWordpress(Command, ImportMixin): if phpserialize is None: req_missing(['phpserialize'], 'import WordPress dumps without --no-downloads') - channel = self.get_channel_from_file(self.wordpress_export_file) + export_file_preprocessor = modernize_qtranslate_tags if self.separate_qtranslate_content else None + channel = self.get_channel_from_file(self.wordpress_export_file, export_file_preprocessor) self._prepare(channel) conf_template = self.generate_base_site() - # If user has specified a custom pattern for translation files we - # need to fix the config - if self.translations_pattern: - self.context['TRANSLATIONS_PATTERN'] = self.translations_pattern - self.import_posts(channel) - self.context['TRANSLATIONS'] = format_default_translations_config( self.extra_languages) self.context['REDIRECTIONS'] = self.configure_redirections( - self.url_map) + self.url_map, self.base_dir) if self.timezone: self.context['TIMEZONE'] = self.timezone if self.export_categories_as_categories: @@ -337,10 +398,13 @@ class CommandImportWordpress(Command, ImportMixin): # Add tag redirects for tag in self.all_tags: try: - tag_str = tag.decode('utf8') + if isinstance(tag, bytes): + tag_str = tag.decode('utf8', 'replace') + else: + tag_str = tag except AttributeError: tag_str = tag - tag = utils.slugify(tag_str) + tag = utils.slugify(tag_str, self.lang) src_url = '{}tag/{}'.format(self.context['SITE_URL'], tag) 
dst_url = self.site.link('tag', tag) if src_url != dst_url: @@ -357,9 +421,9 @@ class CommandImportWordpress(Command, ImportMixin): if not install_plugin(self.site, 'wordpress_compiler', output_dir=os.path.join(self.output_folder, 'plugins')): return False else: - LOGGER.warn("Make sure to install the WordPress page compiler via") - LOGGER.warn(" nikola plugin -i wordpress_compiler") - LOGGER.warn("in your imported blog's folder ({0}), if you haven't installed it system-wide or user-wide. Otherwise, your newly imported blog won't compile.".format(self.output_folder)) + LOGGER.warning("Make sure to install the WordPress page compiler via") + LOGGER.warning(" nikola plugin -i wordpress_compiler") + LOGGER.warning("in your imported blog's folder ({0}), if you haven't installed it system-wide or user-wide. Otherwise, your newly imported blog won't compile.".format(self.output_folder)) @classmethod def read_xml_file(cls, filename): @@ -372,12 +436,19 @@ class CommandImportWordpress(Command, ImportMixin): if b'<atom:link rel=' in line: continue xml.append(line) - return b'\n'.join(xml) + return b''.join(xml) @classmethod - def get_channel_from_file(cls, filename): - """Get channel from XML file.""" - tree = etree.fromstring(cls.read_xml_file(filename)) + def get_channel_from_file(cls, filename, xml_preprocessor=None): + """Get channel from XML file. 
+ + An optional 'xml_preprocessor' allows to modify the xml + (typically to deal with variations in tags injected by some WP plugin) + """ + xml_string = cls.read_xml_file(filename) + if xml_preprocessor: + xml_string = xml_preprocessor(xml_string) + tree = etree.fromstring(xml_string) channel = tree.find('channel') return channel @@ -386,8 +457,12 @@ class CommandImportWordpress(Command, ImportMixin): wordpress_namespace = channel.nsmap['wp'] context = SAMPLE_CONF.copy() - context['DEFAULT_LANG'] = get_text_tag(channel, 'language', 'en')[:2] - context['TRANSLATIONS_PATTERN'] = DEFAULT_TRANSLATIONS_PATTERN + self.lang = get_text_tag(channel, 'language', 'en')[:2] + context['DEFAULT_LANG'] = self.lang + # If user has specified a custom pattern for translation files we + # need to fix the config + context['TRANSLATIONS_PATTERN'] = self.translations_pattern + context['BLOG_TITLE'] = get_text_tag(channel, 'title', 'PUT TITLE HERE') context['BLOG_DESCRIPTION'] = get_text_tag( @@ -418,17 +493,17 @@ class CommandImportWordpress(Command, ImportMixin): PAGES = '(\n' for extension in extensions: POSTS += ' ("posts/*.{0}", "posts", "post.tmpl"),\n'.format(extension) - PAGES += ' ("stories/*.{0}", "stories", "story.tmpl"),\n'.format(extension) + PAGES += ' ("pages/*.{0}", "pages", "page.tmpl"),\n'.format(extension) POSTS += ')\n' PAGES += ')\n' context['POSTS'] = POSTS context['PAGES'] = PAGES COMPILERS = '{\n' - COMPILERS += ''' "rest": ('.txt', '.rst'),''' + '\n' - COMPILERS += ''' "markdown": ('.md', '.mdown', '.markdown'),''' + '\n' - COMPILERS += ''' "html": ('.html', '.htm'),''' + '\n' + COMPILERS += ''' "rest": ['.txt', '.rst'],''' + '\n' + COMPILERS += ''' "markdown": ['.md', '.mdown', '.markdown'],''' + '\n' + COMPILERS += ''' "html": ['.html', '.htm'],''' + '\n' if self.use_wordpress_compiler: - COMPILERS += ''' "wordpress": ('.wp'),''' + '\n' + COMPILERS += ''' "wordpress": ['.wp'],''' + '\n' COMPILERS += '}' context['COMPILERS'] = COMPILERS @@ -436,18 +511,15 @@ 
class CommandImportWordpress(Command, ImportMixin): def download_url_content_to_file(self, url, dst_path): """Download some content (attachments) to a file.""" - if self.no_downloads: - return - try: request = requests.get(url, auth=self.auth) if request.status_code >= 400: - LOGGER.warn("Downloading {0} to {1} failed with HTTP status code {2}".format(url, dst_path, request.status_code)) + LOGGER.warning("Downloading {0} to {1} failed with HTTP status code {2}".format(url, dst_path, request.status_code)) return with open(dst_path, 'wb+') as fd: fd.write(request.content) except requests.exceptions.ConnectionError as err: - LOGGER.warn("Downloading {0} to {1} failed: {2}".format(url, dst_path, err)) + LOGGER.warning("Downloading {0} to {1} failed: {2}".format(url, dst_path, err)) def import_attachment(self, item, wordpress_namespace): """Import an attachment to the site.""" @@ -458,10 +530,13 @@ class CommandImportWordpress(Command, ImportMixin): 'foo') path = urlparse(url).path dst_path = os.path.join(*([self.output_folder, 'files'] + list(path.split('/')))) - dst_dir = os.path.dirname(dst_path) - utils.makedirs(dst_dir) - LOGGER.info("Downloading {0} => {1}".format(url, dst_path)) - self.download_url_content_to_file(url, dst_path) + if self.no_downloads: + LOGGER.info("Skipping downloading {0} => {1}".format(url, dst_path)) + else: + dst_dir = os.path.dirname(dst_path) + utils.makedirs(dst_dir) + LOGGER.info("Downloading {0} => {1}".format(url, dst_path)) + self.download_url_content_to_file(url, dst_path) dst_url = '/'.join(dst_path.split(os.sep)[2:]) links[link] = '/' + dst_url links[url] = '/' + dst_url @@ -485,14 +560,7 @@ class CommandImportWordpress(Command, ImportMixin): # that the export should give you the power to insert # your blogging into another site or system its not. # Why don't they just use JSON? 
- if sys.version_info[0] == 2: - try: - metadata = phpserialize.loads(utils.sys_encode(meta_value.text)) - except ValueError: - # local encoding might be wrong sometimes - metadata = phpserialize.loads(meta_value.text.encode('utf-8')) - else: - metadata = phpserialize.loads(meta_value.text.encode('utf-8')) + metadata = phpserialize.loads(meta_value.text.encode('utf-8')) meta_key = b'image_meta' size_key = b'sizes' @@ -507,6 +575,8 @@ class CommandImportWordpress(Command, ImportMixin): if meta_key in metadata: image_meta = metadata[meta_key] + if not image_meta: + continue dst_meta = {} def add(our_key, wp_key, is_int=False, ignore_zero=False, is_float=False): @@ -517,6 +587,9 @@ class CommandImportWordpress(Command, ImportMixin): if ignore_zero and value == 0: return elif is_float: + # in some locales (like fr) and for old posts there may be a comma here. + if isinstance(value, bytes): + value = value.replace(b",", b".") value = float(value) if ignore_zero and value == 0: return @@ -552,15 +625,18 @@ class CommandImportWordpress(Command, ImportMixin): meta = {} meta['size'] = size.decode('utf-8') if width_key in metadata[size_key][size] and height_key in metadata[size_key][size]: - meta['width'] = metadata[size_key][size][width_key] - meta['height'] = metadata[size_key][size][height_key] + meta['width'] = int(metadata[size_key][size][width_key]) + meta['height'] = int(metadata[size_key][size][height_key]) path = urlparse(url).path dst_path = os.path.join(*([self.output_folder, 'files'] + list(path.split('/')))) - dst_dir = os.path.dirname(dst_path) - utils.makedirs(dst_dir) - LOGGER.info("Downloading {0} => {1}".format(url, dst_path)) - self.download_url_content_to_file(url, dst_path) + if self.no_downloads: + LOGGER.info("Skipping downloading {0} => {1}".format(url, dst_path)) + else: + dst_dir = os.path.dirname(dst_path) + utils.makedirs(dst_dir) + LOGGER.info("Downloading {0} => {1}".format(url, dst_path)) + self.download_url_content_to_file(url, dst_path) 
dst_url = '/'.join(dst_path.split(os.sep)[2:]) links[url] = '/' + dst_url @@ -604,7 +680,7 @@ class CommandImportWordpress(Command, ImportMixin): def transform_code(self, content): """Transform code blocks.""" - # http://en.support.wordpress.com/code/posting-source-code/. There are + # https://en.support.wordpress.com/code/posting-source-code/. There are # a ton of things not supported here. We only do a basic [code # lang="x"] -> ```x translation, and remove quoted html entities (<, # >, &, and "). @@ -628,10 +704,10 @@ class CommandImportWordpress(Command, ImportMixin): return content @staticmethod - def transform_caption(content): + def transform_caption(content, use_html=False): """Transform captions.""" - new_caption = re.sub(r'\[/caption\]', '', content) - new_caption = re.sub(r'\[caption.*\]', '', new_caption) + new_caption = re.sub(r'\[/caption\]', '</h1>' if use_html else '', content) + new_caption = re.sub(r'\[caption.*\]', '<h1>' if use_html else '', new_caption) return new_caption @@ -654,6 +730,26 @@ class CommandImportWordpress(Command, ImportMixin): except TypeError: # old versions of the plugin don't support the additional argument content = self.wordpress_page_compiler.compile_to_string(content) return content, 'html', True + elif self.transform_to_markdown: + # First convert to HTML with WordPress plugin + additional_data = {} + if attachments is not None: + additional_data['attachments'] = attachments + try: + content = self.wordpress_page_compiler.compile_to_string(content, additional_data=additional_data) + except TypeError: # old versions of the plugin don't support the additional argument + content = self.wordpress_page_compiler.compile_to_string(content) + # Now convert to MarkDown with html2text + h = html2text.HTML2Text() + content = h.handle(content) + return content, 'md', False + elif self.html2text: + # TODO: what to do with [code] blocks? 
+ # content = self.transform_code(content) + content = self.transform_caption(content, use_html=True) + h = html2text.HTML2Text() + content = h.handle(content) + return content, 'md', False elif self.use_wordpress_compiler: return content, 'wp', False else: @@ -686,7 +782,7 @@ class CommandImportWordpress(Command, ImportMixin): elif approved == 'spam' or approved == 'trash': pass else: - LOGGER.warn("Unknown comment approved status: " + str(approved)) + LOGGER.warning("Unknown comment approved status: {0}".format(approved)) parent = int(get_text_tag(comment, "{{{0}}}comment_parent".format(wordpress_namespace), 0)) if parent == 0: parent = None @@ -724,6 +820,16 @@ class CommandImportWordpress(Command, ImportMixin): write_header_line(fd, "wordpress_user_id", comment["user_id"]) fd.write(('\n' + comment['content']).encode('utf8')) + def _create_meta_and_content_filenames(self, slug, extension, lang, default_language, translations_config): + out_meta_filename = slug + '.meta' + out_content_filename = slug + '.' + extension + if lang and lang != default_language: + out_meta_filename = utils.get_translation_candidate(translations_config, + out_meta_filename, lang) + out_content_filename = utils.get_translation_candidate(translations_config, + out_content_filename, lang) + return out_meta_filename, out_content_filename + def _create_metadata(self, status, excerpt, tags, categories, post_name=None): """Create post metadata.""" other_meta = {'wp-status': status} @@ -735,24 +841,48 @@ class CommandImportWordpress(Command, ImportMixin): if text in self._category_paths: cats.append(self._category_paths[text]) else: - cats.append(utils.join_hierarchical_category_path([text])) + cats.append(hierarchy_utils.join_hierarchical_category_path([utils.html_unescape(text)])) other_meta['categories'] = ','.join(cats) if len(cats) > 0: other_meta['category'] = cats[0] if len(cats) > 1: - LOGGER.warn(('Post "{0}" has more than one category! 
' + - 'Will only use the first one.').format(post_name)) - tags_cats = tags + LOGGER.warning(('Post "{0}" has more than one category! ' + + 'Will only use the first one.').format(post_name)) + tags_cats = [utils.html_unescape(tag) for tag in tags] else: - tags_cats = tags + categories + tags_cats = [utils.html_unescape(tag) for tag in tags + categories] return tags_cats, other_meta + _tag_sanitize_map = {True: {}, False: {}} + + def _sanitize(self, tag, is_category): + if self.tag_saniziting_strategy == 'lower': + return tag.lower() + if tag.lower() not in self._tag_sanitize_map[is_category]: + self._tag_sanitize_map[is_category][tag.lower()] = [tag] + return tag + previous = self._tag_sanitize_map[is_category][tag.lower()] + if self.tag_saniziting_strategy == 'first': + if tag != previous[0]: + LOGGER.warning("Changing spelling of {0} name '{1}' to {2}.".format('category' if is_category else 'tag', tag, previous[0])) + return previous[0] + else: + LOGGER.error("Unknown tag sanitizing strategy '{0}'!".format(self.tag_saniziting_strategy)) + sys.exit(1) + return tag + def import_postpage_item(self, item, wordpress_namespace, out_folder=None, attachments=None): """Take an item from the feed and creates a post file.""" if out_folder is None: out_folder = 'posts' title = get_text_tag(item, 'title', 'NO TITLE') + + # titles can have line breaks in them, particularly when they are + # created by third-party tools that post to Wordpress. + # Handle windows-style and unix-style line endings. 
+ title = title.replace('\r\n', ' ').replace('\n', ' ') + # link is something like http://foo.com/2012/09/01/hello-world/ # So, take the path, utils.slugify it, and that's our slug link = get_text_tag(item, 'link', None) @@ -760,7 +890,10 @@ class CommandImportWordpress(Command, ImportMixin): path = unquote(parsed.path.strip('/')) try: - path = path.decode('utf8') + if isinstance(path, bytes): + path = path.decode('utf8', 'replace') + else: + path = path except AttributeError: pass @@ -782,7 +915,7 @@ class CommandImportWordpress(Command, ImportMixin): else: if len(pathlist) > 1: out_folder = os.path.join(*([out_folder] + pathlist[:-1])) - slug = utils.slugify(pathlist[-1]) + slug = utils.slugify(pathlist[-1], self.lang) description = get_text_tag(item, 'description', '') post_date = get_text_tag( @@ -809,17 +942,19 @@ class CommandImportWordpress(Command, ImportMixin): tags = [] categories = [] + post_status = 'published' + has_math = "no" if status == 'trash': - LOGGER.warn('Trashed post "{0}" will not be imported.'.format(title)) + LOGGER.warning('Trashed post "{0}" will not be imported.'.format(title)) return False elif status == 'private': - tags.append('private') is_draft = False is_private = True + post_status = 'private' elif status != 'publish': - tags.append('draft') is_draft = True is_private = False + post_status = 'draft' else: is_draft = False is_private = False @@ -831,14 +966,23 @@ class CommandImportWordpress(Command, ImportMixin): type = tag.attrib['domain'] if text == 'Uncategorized' and type == 'category': continue - self.all_tags.add(text) if type == 'category': - categories.append(type) + categories.append(text) else: tags.append(text) if '$latex' in content: - tags.append('mathjax') + has_math = "yes" + + for i, cat in enumerate(categories[:]): + cat = self._sanitize(cat, True) + categories[i] = cat + self.all_tags.add(cat) + + for i, tag in enumerate(tags[:]): + tag = self._sanitize(tag, False) + tags[i] = tag + self.all_tags.add(tag) # Find 
post format if it's there post_format = 'wp' @@ -849,53 +993,75 @@ class CommandImportWordpress(Command, ImportMixin): post_format = 'wp' if is_draft and self.exclude_drafts: - LOGGER.notice('Draft "{0}" will not be imported.'.format(title)) + LOGGER.warning('Draft "{0}" will not be imported.'.format(title)) return False elif is_private and self.exclude_privates: - LOGGER.notice('Private post "{0}" will not be imported.'.format(title)) + LOGGER.warning('Private post "{0}" will not be imported.'.format(title)) return False elif content.strip() or self.import_empty_items: # If no content is found, no files are written. self.url_map[link] = (self.context['SITE_URL'] + out_folder.rstrip('/') + '/' + slug + '.html').replace(os.sep, '/') - if hasattr(self, "separate_qtranslate_content") \ - and self.separate_qtranslate_content: - content_translations = separate_qtranslate_content(content) + default_language = self.context["DEFAULT_LANG"] + if self.separate_qtranslate_content: + content_translations = separate_qtranslate_tagged_langs(content) + title_translations = separate_qtranslate_tagged_langs(title) else: content_translations = {"": content} - default_language = self.context["DEFAULT_LANG"] + title_translations = {"": title} + # in case of mistmatch between the languages found in the title and in the content + default_title = title_translations.get(default_language, title) + extra_languages = [lang for lang in content_translations.keys() if lang not in ("", default_language)] + for extra_lang in extra_languages: + self.extra_languages.add(extra_lang) + translations_dict = get_default_translations_dict(default_language, extra_languages) + current_translations_config = { + "DEFAULT_LANG": default_language, + "TRANSLATIONS": translations_dict, + "TRANSLATIONS_PATTERN": self.context["TRANSLATIONS_PATTERN"] + } for lang, content in content_translations.items(): try: content, extension, rewrite_html = self.transform_content(content, post_format, attachments) - except: + 
except Exception: LOGGER.error(('Cannot interpret post "{0}" (language {1}) with post ' + 'format {2}!').format(os.path.join(out_folder, slug), lang, post_format)) return False - if lang: - out_meta_filename = slug + '.meta' - if lang == default_language: - out_content_filename = slug + '.' + extension - else: - out_content_filename \ - = utils.get_translation_candidate(self.context, - slug + "." + extension, lang) - self.extra_languages.add(lang) - meta_slug = slug - else: - out_meta_filename = slug + '.meta' - out_content_filename = slug + '.' + extension - meta_slug = slug + + out_meta_filename, out_content_filename = self._create_meta_and_content_filenames( + slug, extension, lang, default_language, current_translations_config) + tags, other_meta = self._create_metadata(status, excerpt, tags, categories, post_name=os.path.join(out_folder, slug)) - self.write_metadata(os.path.join(self.output_folder, out_folder, - out_meta_filename), - title, meta_slug, post_date, description, tags, **other_meta) - self.write_content( - os.path.join(self.output_folder, - out_folder, out_content_filename), - content, - rewrite_html) + current_title = title_translations.get(lang, default_title) + meta = { + "title": current_title, + "slug": slug, + "date": post_date, + "description": description, + "tags": ','.join(tags), + "status": post_status, + "has_math": has_math, + } + meta.update(other_meta) + if self.onefile: + self.write_post( + os.path.join(self.output_folder, + out_folder, out_content_filename), + content, + meta, + self._get_compiler(), + rewrite_html) + else: + self.write_metadata(os.path.join(self.output_folder, out_folder, + out_meta_filename), + current_title, slug, post_date, description, tags, **other_meta) + self.write_content( + os.path.join(self.output_folder, + out_folder, out_content_filename), + content, + rewrite_html) if self.export_comments: comments = [] @@ -905,13 +1071,13 @@ class CommandImportWordpress(Command, ImportMixin): comments.append(comment) 
for comment in comments: - comment_filename = slug + "." + str(comment['id']) + ".wpcomment" + comment_filename = "{0}.{1}.wpcomment".format(slug, comment['id']) self._write_comment(os.path.join(self.output_folder, out_folder, comment_filename), comment) return (out_folder, slug) else: - LOGGER.warn(('Not going to import "{0}" because it seems to contain' - ' no content.').format(title)) + LOGGER.warning(('Not going to import "{0}" because it seems to contain' + ' no content.').format(title)) return False def _extract_item_info(self, item): @@ -937,7 +1103,7 @@ class CommandImportWordpress(Command, ImportMixin): if parent_id is not None and int(parent_id) != 0: self.attachments[int(parent_id)][post_id] = data else: - LOGGER.warn("Attachment #{0} ({1}) has no parent!".format(post_id, data['files'])) + LOGGER.warning("Attachment #{0} ({1}) has no parent!".format(post_id, data['files'])) def write_attachments_info(self, path, attachments): """Write attachments info file.""" @@ -955,7 +1121,7 @@ class CommandImportWordpress(Command, ImportMixin): if post_type == 'post': out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'posts', attachments) else: - out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'stories', attachments) + out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'pages', attachments) # Process attachment data if attachments is not None: # If post was exported, store data @@ -975,8 +1141,8 @@ class CommandImportWordpress(Command, ImportMixin): self.process_item_if_post_or_page(item) # Assign attachments to posts for post_id in self.attachments: - LOGGER.warn(("Found attachments for post or page #{0}, but didn't find post or page. " + - "(Attachments: {1})").format(post_id, [e['files'][0] for e in self.attachments[post_id].values()])) + LOGGER.warning(("Found attachments for post or page #{0}, but didn't find post or page. 
" + + "(Attachments: {1})").format(post_id, [e['files'][0] for e in self.attachments[post_id].values()])) def get_text_tag(tag, name, default): @@ -990,15 +1156,20 @@ def get_text_tag(tag, name, default): return default -def separate_qtranslate_content(text): - """Parse the content of a wordpress post or page and separate qtranslate languages. +def separate_qtranslate_tagged_langs(text): + """Parse the content of a wordpress post or page and separate languages. + + For qtranslateX tags: [:LL]blabla[:] - qtranslate tags: <!--:LL-->blabla<!--:--> + Note: qtranslate* plugins had a troubled history and used various + tags over time, application of the 'modernize_qtranslate_tags' + function is required for this function to handle most of the legacy + cases. """ - # TODO: uniformize qtranslate tags <!--/en--> => <!--:--> - qt_start = "<!--:" - qt_end = "-->" - qt_end_with_lang_len = 5 + qt_start = "[:" + qt_end = "]" + qt_end_len = len(qt_end) + qt_end_with_lang_len = qt_end_len + 2 qt_chunks = text.split(qt_start) content_by_lang = {} common_txt_list = [] @@ -1010,9 +1181,9 @@ def separate_qtranslate_content(text): # be some piece of common text or tags, or just nothing lang = "" # default language c = c.lstrip(qt_end) - if not c: + if not c.strip(): continue - elif c[2:].startswith(qt_end): + elif c[2:qt_end_with_lang_len].startswith(qt_end): # a language specific section (with language code at the begining) lang = c[:2] c = c[qt_end_with_lang_len:] @@ -1033,3 +1204,26 @@ def separate_qtranslate_content(text): for l in content_by_lang.keys(): content_by_lang[l] = " ".join(content_by_lang[l]) return content_by_lang + + +def modernize_qtranslate_tags(xml_bytes): + """ + Uniformize the "tag" used by various version of qtranslate. + + The resulting byte string will only contain one set of qtranslate tags + (namely [:LG] and [:]), older ones being converted to new ones. 
+ """ + old_start_lang = re.compile(b"<!--:?(\\w{2})-->") + new_start_lang = b"[:\\1]" + old_end_lang = re.compile(b"<!--(/\\w{2}|:)-->") + new_end_lang = b"[:]" + title_match = re.compile(b"<title>(.*?)</title>") + modern_starts = old_start_lang.sub(new_start_lang, xml_bytes) + modernized_bytes = old_end_lang.sub(new_end_lang, modern_starts) + + def title_escape(match): + title = match.group(1) + title = title.replace(b"&", b"&amp;").replace(b"<", b"&lt;").replace(b">", b"&gt;") + return b"<title>" + title + b"</title>" + fixed_bytes = title_match.sub(title_escape, modernized_bytes) + return fixed_bytes |
