aboutsummaryrefslogtreecommitdiffstats
path: root/nikola/plugins/command/import_wordpress.py
diff options
context:
space:
mode:
Diffstat (limited to 'nikola/plugins/command/import_wordpress.py')
-rw-r--r--nikola/plugins/command/import_wordpress.py480
1 files changed, 337 insertions, 143 deletions
diff --git a/nikola/plugins/command/import_wordpress.py b/nikola/plugins/command/import_wordpress.py
index a652ec8..5e2aee6 100644
--- a/nikola/plugins/command/import_wordpress.py
+++ b/nikola/plugins/command/import_wordpress.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright © 2012-2015 Roberto Alsina and others.
+# Copyright © 2012-2020 Roberto Alsina and others.
# Permission is hereby granted, free of charge, to any
# person obtaining a copy of this software and associated
@@ -26,41 +26,45 @@
"""Import a WordPress dump."""
-from __future__ import unicode_literals, print_function
-import os
-import re
-import sys
import datetime
import io
import json
+import os
+import re
+import sys
+from collections import defaultdict
+from urllib.parse import urlparse, unquote
+
import requests
from lxml import etree
-from collections import defaultdict
+
+from nikola.plugin_categories import Command
+from nikola import utils, hierarchy_utils
+from nikola.nikola import DEFAULT_TRANSLATIONS_PATTERN
+from nikola.utils import req_missing
+from nikola.plugins.basic_import import ImportMixin, links
+from nikola.plugins.command.init import (
+ SAMPLE_CONF, prepare_config,
+ format_default_translations_config,
+ get_default_translations_dict
+)
try:
- from urlparse import urlparse
- from urllib import unquote
+ import html2text
except ImportError:
- from urllib.parse import urlparse, unquote # NOQA
+ html2text = None
try:
import phpserialize
except ImportError:
- phpserialize = None # NOQA
+ phpserialize = None
-from nikola.plugin_categories import Command
-from nikola import utils
-from nikola.utils import req_missing
-from nikola.plugins.basic_import import ImportMixin, links
-from nikola.nikola import DEFAULT_TRANSLATIONS_PATTERN
-from nikola.plugins.command.init import SAMPLE_CONF, prepare_config, format_default_translations_config
-
-LOGGER = utils.get_logger('import_wordpress', utils.STDERR_HANDLER)
+LOGGER = utils.get_logger('import_wordpress')
def install_plugin(site, plugin_name, output_dir=None, show_install_notes=False):
"""Install a Nikola plugin."""
- LOGGER.notice("Installing plugin '{0}'".format(plugin_name))
+ LOGGER.info("Installing plugin '{0}'".format(plugin_name))
# Get hold of the 'plugin' plugin
plugin_installer_info = site.plugin_manager.getPluginByName('plugin', 'Command')
if plugin_installer_info is None:
@@ -88,7 +92,6 @@ def install_plugin(site, plugin_name, output_dir=None, show_install_notes=False)
class CommandImportWordpress(Command, ImportMixin):
-
"""Import a WordPress dump."""
name = "import_wordpress"
@@ -144,15 +147,22 @@ class CommandImportWordpress(Command, ImportMixin):
'long': 'qtranslate',
'default': False,
'type': bool,
- 'help': "Look for translations generated by qtranslate plugin",
- # WARNING: won't recover translated titles that actually
- # don't seem to be part of the wordpress XML export at the
- # time of writing :(
+ 'help': """Look for translations generated by qtranslate plugin.
+WARNING: a default wordpress export won't allow to recover title translations.
+For this to be possible consider applying the hack suggested at
+https://github.com/qtranslate/qtranslate-xt/issues/199 :
+
+In wp-admin/includes/export.php change
+`echo apply_filters( 'the_title_rss', $post->post_title );
+
+to
+`echo apply_filters( 'the_title_export', $post->post_title );
+"""
},
{
'name': 'translations_pattern',
'long': 'translations_pattern',
- 'default': None,
+ 'default': DEFAULT_TRANSLATIONS_PATTERN,
'type': str,
'help': "The pattern for translation files names",
},
@@ -171,6 +181,20 @@ class CommandImportWordpress(Command, ImportMixin):
'help': "Export comments as .wpcomment files",
},
{
+ 'name': 'html2text',
+ 'long': 'html2text',
+ 'default': False,
+ 'type': bool,
+ 'help': "Uses html2text (needs to be installed with pip) to transform WordPress posts to MarkDown during import",
+ },
+ {
+ 'name': 'transform_to_markdown',
+ 'long': 'transform-to-markdown',
+ 'default': False,
+ 'type': bool,
+ 'help': "Uses WordPress page compiler to transform WordPress posts to HTML and then use html2text to transform them to MarkDown during import",
+ },
+ {
'name': 'transform_to_html',
'long': 'transform-to-html',
'default': False,
@@ -191,9 +215,36 @@ class CommandImportWordpress(Command, ImportMixin):
'type': bool,
'help': "Automatically installs the WordPress page compiler (either locally or in the new site) if required by other options.\nWarning: the compiler is GPL software!",
},
+ {
+ 'name': 'tag_sanitizing_strategy',
+ 'long': 'tag-sanitizing-strategy',
+ 'default': 'first',
+ 'help': 'lower: Convert all tag and category names to lower case\nfirst: Keep first spelling of tag or category name',
+ },
+ {
+ 'name': 'one_file',
+ 'long': 'one-file',
+ 'default': False,
+ 'type': bool,
+ 'help': "Save imported posts in the more modern one-file format.",
+ },
]
all_tags = set([])
+ def _get_compiler(self):
+ """Return whatever compiler we will use."""
+ self._find_wordpress_compiler()
+ if self.wordpress_page_compiler is not None:
+ return self.wordpress_page_compiler
+ plugin_info = self.site.plugin_manager.getPluginByName('markdown', 'PageCompiler')
+ if plugin_info is not None:
+ if not plugin_info.is_activated:
+ self.site.plugin_manager.activatePluginByName(plugin_info.name)
+ plugin_info.plugin_object.set_site(self.site)
+ return plugin_info.plugin_object
+ else:
+ LOGGER.error("Can't find markdown post compiler.")
+
def _find_wordpress_compiler(self):
"""Find WordPress compiler plugin."""
if self.wordpress_page_compiler is not None:
@@ -214,9 +265,11 @@ class CommandImportWordpress(Command, ImportMixin):
options['output_folder'] = args.pop(0)
if args:
- LOGGER.warn('You specified additional arguments ({0}). Please consider '
- 'putting these arguments before the filename if you '
- 'are running into problems.'.format(args))
+ LOGGER.warning('You specified additional arguments ({0}). Please consider '
+ 'putting these arguments before the filename if you '
+ 'are running into problems.'.format(args))
+
+ self.onefile = options.get('one_file', False)
self.import_into_existing_site = False
self.url_map = {}
@@ -234,11 +287,16 @@ class CommandImportWordpress(Command, ImportMixin):
self.export_categories_as_categories = options.get('export_categories_as_categories', False)
self.export_comments = options.get('export_comments', False)
+ self.html2text = options.get('html2text', False)
+ self.transform_to_markdown = options.get('transform_to_markdown', False)
+
self.transform_to_html = options.get('transform_to_html', False)
self.use_wordpress_compiler = options.get('use_wordpress_compiler', False)
self.install_wordpress_compiler = options.get('install_wordpress_compiler', False)
self.wordpress_page_compiler = None
+ self.tag_saniziting_strategy = options.get('tag_saniziting_strategy', 'first')
+
self.auth = None
if options.get('download_auth') is not None:
username_password = options.get('download_auth')
@@ -250,10 +308,18 @@ class CommandImportWordpress(Command, ImportMixin):
self.separate_qtranslate_content = options.get('separate_qtranslate_content')
self.translations_pattern = options.get('translations_pattern')
- if self.transform_to_html and self.use_wordpress_compiler:
- LOGGER.warn("It does not make sense to combine --transform-to-html with --use-wordpress-compiler, as the first converts all posts to HTML and the latter option affects zero posts.")
+ count = (1 if self.html2text else 0) + (1 if self.transform_to_html else 0) + (1 if self.transform_to_markdown else 0)
+ if count > 1:
+ LOGGER.error("You can use at most one of the options --html2text, --transform-to-html and --transform-to-markdown.")
+ return False
+ if (self.html2text or self.transform_to_html or self.transform_to_markdown) and self.use_wordpress_compiler:
+ LOGGER.warning("It does not make sense to combine --use-wordpress-compiler with any of --html2text, --transform-to-html and --transform-to-markdown, as the latter convert all posts to HTML and the first option then affects zero posts.")
+
+ if (self.html2text or self.transform_to_markdown) and not html2text:
+ LOGGER.error("You need to install html2text via 'pip install html2text' before you can use the --html2text and --transform-to-markdown options.")
+ return False
- if self.transform_to_html:
+ if self.transform_to_html or self.transform_to_markdown:
self._find_wordpress_compiler()
if not self.wordpress_page_compiler and self.install_wordpress_compiler:
if not install_plugin(self.site, 'wordpress_compiler', output_dir='plugins'): # local install
@@ -279,14 +345,14 @@ class CommandImportWordpress(Command, ImportMixin):
# cat_id = get_text_tag(cat, '{{{0}}}term_id'.format(wordpress_namespace), None)
cat_slug = get_text_tag(cat, '{{{0}}}category_nicename'.format(wordpress_namespace), None)
cat_parent_slug = get_text_tag(cat, '{{{0}}}category_parent'.format(wordpress_namespace), None)
- cat_name = get_text_tag(cat, '{{{0}}}cat_name'.format(wordpress_namespace), None)
+ cat_name = utils.html_unescape(get_text_tag(cat, '{{{0}}}cat_name'.format(wordpress_namespace), None))
cat_path = [cat_name]
if cat_parent_slug in cat_map:
cat_path = cat_map[cat_parent_slug] + cat_path
cat_map[cat_slug] = cat_path
self._category_paths = dict()
for cat, path in cat_map.items():
- self._category_paths[cat] = utils.join_hierarchical_category_path(path)
+ self._category_paths[cat] = hierarchy_utils.join_hierarchical_category_path(path)
def _execute(self, options={}, args=[]):
"""Import a WordPress blog from an export file into a Nikola site."""
@@ -313,21 +379,16 @@ class CommandImportWordpress(Command, ImportMixin):
if phpserialize is None:
req_missing(['phpserialize'], 'import WordPress dumps without --no-downloads')
- channel = self.get_channel_from_file(self.wordpress_export_file)
+ export_file_preprocessor = modernize_qtranslate_tags if self.separate_qtranslate_content else None
+ channel = self.get_channel_from_file(self.wordpress_export_file, export_file_preprocessor)
self._prepare(channel)
conf_template = self.generate_base_site()
- # If user has specified a custom pattern for translation files we
- # need to fix the config
- if self.translations_pattern:
- self.context['TRANSLATIONS_PATTERN'] = self.translations_pattern
-
self.import_posts(channel)
-
self.context['TRANSLATIONS'] = format_default_translations_config(
self.extra_languages)
self.context['REDIRECTIONS'] = self.configure_redirections(
- self.url_map)
+ self.url_map, self.base_dir)
if self.timezone:
self.context['TIMEZONE'] = self.timezone
if self.export_categories_as_categories:
@@ -337,10 +398,13 @@ class CommandImportWordpress(Command, ImportMixin):
# Add tag redirects
for tag in self.all_tags:
try:
- tag_str = tag.decode('utf8')
+ if isinstance(tag, bytes):
+ tag_str = tag.decode('utf8', 'replace')
+ else:
+ tag_str = tag
except AttributeError:
tag_str = tag
- tag = utils.slugify(tag_str)
+ tag = utils.slugify(tag_str, self.lang)
src_url = '{}tag/{}'.format(self.context['SITE_URL'], tag)
dst_url = self.site.link('tag', tag)
if src_url != dst_url:
@@ -357,9 +421,9 @@ class CommandImportWordpress(Command, ImportMixin):
if not install_plugin(self.site, 'wordpress_compiler', output_dir=os.path.join(self.output_folder, 'plugins')):
return False
else:
- LOGGER.warn("Make sure to install the WordPress page compiler via")
- LOGGER.warn(" nikola plugin -i wordpress_compiler")
- LOGGER.warn("in your imported blog's folder ({0}), if you haven't installed it system-wide or user-wide. Otherwise, your newly imported blog won't compile.".format(self.output_folder))
+ LOGGER.warning("Make sure to install the WordPress page compiler via")
+ LOGGER.warning(" nikola plugin -i wordpress_compiler")
+ LOGGER.warning("in your imported blog's folder ({0}), if you haven't installed it system-wide or user-wide. Otherwise, your newly imported blog won't compile.".format(self.output_folder))
@classmethod
def read_xml_file(cls, filename):
@@ -372,12 +436,19 @@ class CommandImportWordpress(Command, ImportMixin):
if b'<atom:link rel=' in line:
continue
xml.append(line)
- return b'\n'.join(xml)
+ return b''.join(xml)
@classmethod
def get_channel_from_file(cls, filename, xml_preprocessor=None):
    """Get channel from XML file.

    The file is read with ``cls.read_xml_file``.  When given,
    ``xml_preprocessor`` is called with the raw XML and its return value
    is parsed instead (typically to deal with variations in tags
    injected by some WP plugin).

    Returns whatever ``find('channel')`` yields on the parsed root
    element.
    """
    raw_xml = cls.read_xml_file(filename)
    if xml_preprocessor:
        raw_xml = xml_preprocessor(raw_xml)
    return etree.fromstring(raw_xml).find('channel')
@@ -386,8 +457,12 @@ class CommandImportWordpress(Command, ImportMixin):
wordpress_namespace = channel.nsmap['wp']
context = SAMPLE_CONF.copy()
- context['DEFAULT_LANG'] = get_text_tag(channel, 'language', 'en')[:2]
- context['TRANSLATIONS_PATTERN'] = DEFAULT_TRANSLATIONS_PATTERN
+ self.lang = get_text_tag(channel, 'language', 'en')[:2]
+ context['DEFAULT_LANG'] = self.lang
+ # If user has specified a custom pattern for translation files we
+ # need to fix the config
+ context['TRANSLATIONS_PATTERN'] = self.translations_pattern
+
context['BLOG_TITLE'] = get_text_tag(channel, 'title',
'PUT TITLE HERE')
context['BLOG_DESCRIPTION'] = get_text_tag(
@@ -418,17 +493,17 @@ class CommandImportWordpress(Command, ImportMixin):
PAGES = '(\n'
for extension in extensions:
POSTS += ' ("posts/*.{0}", "posts", "post.tmpl"),\n'.format(extension)
- PAGES += ' ("stories/*.{0}", "stories", "story.tmpl"),\n'.format(extension)
+ PAGES += ' ("pages/*.{0}", "pages", "page.tmpl"),\n'.format(extension)
POSTS += ')\n'
PAGES += ')\n'
context['POSTS'] = POSTS
context['PAGES'] = PAGES
COMPILERS = '{\n'
- COMPILERS += ''' "rest": ('.txt', '.rst'),''' + '\n'
- COMPILERS += ''' "markdown": ('.md', '.mdown', '.markdown'),''' + '\n'
- COMPILERS += ''' "html": ('.html', '.htm'),''' + '\n'
+ COMPILERS += ''' "rest": ['.txt', '.rst'],''' + '\n'
+ COMPILERS += ''' "markdown": ['.md', '.mdown', '.markdown'],''' + '\n'
+ COMPILERS += ''' "html": ['.html', '.htm'],''' + '\n'
if self.use_wordpress_compiler:
- COMPILERS += ''' "wordpress": ('.wp'),''' + '\n'
+ COMPILERS += ''' "wordpress": ['.wp'],''' + '\n'
COMPILERS += '}'
context['COMPILERS'] = COMPILERS
@@ -436,18 +511,15 @@ class CommandImportWordpress(Command, ImportMixin):
def download_url_content_to_file(self, url, dst_path):
    """Download some content (attachments) to a file.

    ``url`` is fetched with ``requests.get`` using ``self.auth`` as
    credentials and the response body is written to ``dst_path``.

    Downloading is best-effort: a failed request or an HTTP status code
    of 400 or above is logged as a warning and nothing is written.
    """
    try:
        response = requests.get(url, auth=self.auth)
    except requests.exceptions.RequestException as err:
        # RequestException is the base class of ConnectionError, timeouts,
        # invalid URLs/schemas, redirect loops, ...: a single broken
        # attachment must not abort the whole import.
        LOGGER.warning("Downloading {0} to {1} failed: {2}".format(url, dst_path, err))
        return

    if response.status_code >= 400:
        LOGGER.warning("Downloading {0} to {1} failed with HTTP status code {2}".format(url, dst_path, response.status_code))
        return

    with open(dst_path, 'wb+') as fd:
        fd.write(response.content)
def import_attachment(self, item, wordpress_namespace):
"""Import an attachment to the site."""
@@ -458,10 +530,13 @@ class CommandImportWordpress(Command, ImportMixin):
'foo')
path = urlparse(url).path
dst_path = os.path.join(*([self.output_folder, 'files'] + list(path.split('/'))))
- dst_dir = os.path.dirname(dst_path)
- utils.makedirs(dst_dir)
- LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
- self.download_url_content_to_file(url, dst_path)
+ if self.no_downloads:
+ LOGGER.info("Skipping downloading {0} => {1}".format(url, dst_path))
+ else:
+ dst_dir = os.path.dirname(dst_path)
+ utils.makedirs(dst_dir)
+ LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
+ self.download_url_content_to_file(url, dst_path)
dst_url = '/'.join(dst_path.split(os.sep)[2:])
links[link] = '/' + dst_url
links[url] = '/' + dst_url
@@ -485,14 +560,7 @@ class CommandImportWordpress(Command, ImportMixin):
# that the export should give you the power to insert
# your blogging into another site or system its not.
# Why don't they just use JSON?
- if sys.version_info[0] == 2:
- try:
- metadata = phpserialize.loads(utils.sys_encode(meta_value.text))
- except ValueError:
- # local encoding might be wrong sometimes
- metadata = phpserialize.loads(meta_value.text.encode('utf-8'))
- else:
- metadata = phpserialize.loads(meta_value.text.encode('utf-8'))
+ metadata = phpserialize.loads(meta_value.text.encode('utf-8'))
meta_key = b'image_meta'
size_key = b'sizes'
@@ -507,6 +575,8 @@ class CommandImportWordpress(Command, ImportMixin):
if meta_key in metadata:
image_meta = metadata[meta_key]
+ if not image_meta:
+ continue
dst_meta = {}
def add(our_key, wp_key, is_int=False, ignore_zero=False, is_float=False):
@@ -517,6 +587,9 @@ class CommandImportWordpress(Command, ImportMixin):
if ignore_zero and value == 0:
return
elif is_float:
+ # in some locales (like fr) and for old posts there may be a comma here.
+ if isinstance(value, bytes):
+ value = value.replace(b",", b".")
value = float(value)
if ignore_zero and value == 0:
return
@@ -552,15 +625,18 @@ class CommandImportWordpress(Command, ImportMixin):
meta = {}
meta['size'] = size.decode('utf-8')
if width_key in metadata[size_key][size] and height_key in metadata[size_key][size]:
- meta['width'] = metadata[size_key][size][width_key]
- meta['height'] = metadata[size_key][size][height_key]
+ meta['width'] = int(metadata[size_key][size][width_key])
+ meta['height'] = int(metadata[size_key][size][height_key])
path = urlparse(url).path
dst_path = os.path.join(*([self.output_folder, 'files'] + list(path.split('/'))))
- dst_dir = os.path.dirname(dst_path)
- utils.makedirs(dst_dir)
- LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
- self.download_url_content_to_file(url, dst_path)
+ if self.no_downloads:
+ LOGGER.info("Skipping downloading {0} => {1}".format(url, dst_path))
+ else:
+ dst_dir = os.path.dirname(dst_path)
+ utils.makedirs(dst_dir)
+ LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
+ self.download_url_content_to_file(url, dst_path)
dst_url = '/'.join(dst_path.split(os.sep)[2:])
links[url] = '/' + dst_url
@@ -604,7 +680,7 @@ class CommandImportWordpress(Command, ImportMixin):
def transform_code(self, content):
"""Transform code blocks."""
- # http://en.support.wordpress.com/code/posting-source-code/. There are
+ # https://en.support.wordpress.com/code/posting-source-code/. There are
# a ton of things not supported here. We only do a basic [code
# lang="x"] -> ```x translation, and remove quoted html entities (<,
# >, &, and ").
@@ -628,10 +704,10 @@ class CommandImportWordpress(Command, ImportMixin):
return content
@staticmethod
- def transform_caption(content):
+ def transform_caption(content, use_html=False):
"""Transform captions."""
- new_caption = re.sub(r'\[/caption\]', '', content)
- new_caption = re.sub(r'\[caption.*\]', '', new_caption)
+ new_caption = re.sub(r'\[/caption\]', '</h1>' if use_html else '', content)
+ new_caption = re.sub(r'\[caption.*\]', '<h1>' if use_html else '', new_caption)
return new_caption
@@ -654,6 +730,26 @@ class CommandImportWordpress(Command, ImportMixin):
except TypeError: # old versions of the plugin don't support the additional argument
content = self.wordpress_page_compiler.compile_to_string(content)
return content, 'html', True
+ elif self.transform_to_markdown:
+ # First convert to HTML with WordPress plugin
+ additional_data = {}
+ if attachments is not None:
+ additional_data['attachments'] = attachments
+ try:
+ content = self.wordpress_page_compiler.compile_to_string(content, additional_data=additional_data)
+ except TypeError: # old versions of the plugin don't support the additional argument
+ content = self.wordpress_page_compiler.compile_to_string(content)
+ # Now convert to MarkDown with html2text
+ h = html2text.HTML2Text()
+ content = h.handle(content)
+ return content, 'md', False
+ elif self.html2text:
+ # TODO: what to do with [code] blocks?
+ # content = self.transform_code(content)
+ content = self.transform_caption(content, use_html=True)
+ h = html2text.HTML2Text()
+ content = h.handle(content)
+ return content, 'md', False
elif self.use_wordpress_compiler:
return content, 'wp', False
else:
@@ -686,7 +782,7 @@ class CommandImportWordpress(Command, ImportMixin):
elif approved == 'spam' or approved == 'trash':
pass
else:
- LOGGER.warn("Unknown comment approved status: " + str(approved))
+ LOGGER.warning("Unknown comment approved status: {0}".format(approved))
parent = int(get_text_tag(comment, "{{{0}}}comment_parent".format(wordpress_namespace), 0))
if parent == 0:
parent = None
@@ -724,6 +820,16 @@ class CommandImportWordpress(Command, ImportMixin):
write_header_line(fd, "wordpress_user_id", comment["user_id"])
fd.write(('\n' + comment['content']).encode('utf8'))
+ def _create_meta_and_content_filenames(self, slug, extension, lang, default_language, translations_config):
+ out_meta_filename = slug + '.meta'
+ out_content_filename = slug + '.' + extension
+ if lang and lang != default_language:
+ out_meta_filename = utils.get_translation_candidate(translations_config,
+ out_meta_filename, lang)
+ out_content_filename = utils.get_translation_candidate(translations_config,
+ out_content_filename, lang)
+ return out_meta_filename, out_content_filename
+
def _create_metadata(self, status, excerpt, tags, categories, post_name=None):
"""Create post metadata."""
other_meta = {'wp-status': status}
@@ -735,24 +841,48 @@ class CommandImportWordpress(Command, ImportMixin):
if text in self._category_paths:
cats.append(self._category_paths[text])
else:
- cats.append(utils.join_hierarchical_category_path([text]))
+ cats.append(hierarchy_utils.join_hierarchical_category_path([utils.html_unescape(text)]))
other_meta['categories'] = ','.join(cats)
if len(cats) > 0:
other_meta['category'] = cats[0]
if len(cats) > 1:
- LOGGER.warn(('Post "{0}" has more than one category! ' +
- 'Will only use the first one.').format(post_name))
- tags_cats = tags
+ LOGGER.warning(('Post "{0}" has more than one category! ' +
+ 'Will only use the first one.').format(post_name))
+ tags_cats = [utils.html_unescape(tag) for tag in tags]
else:
- tags_cats = tags + categories
+ tags_cats = [utils.html_unescape(tag) for tag in tags + categories]
return tags_cats, other_meta
+ _tag_sanitize_map = {True: {}, False: {}}
+
+ def _sanitize(self, tag, is_category):
+ if self.tag_saniziting_strategy == 'lower':
+ return tag.lower()
+ if tag.lower() not in self._tag_sanitize_map[is_category]:
+ self._tag_sanitize_map[is_category][tag.lower()] = [tag]
+ return tag
+ previous = self._tag_sanitize_map[is_category][tag.lower()]
+ if self.tag_saniziting_strategy == 'first':
+ if tag != previous[0]:
+ LOGGER.warning("Changing spelling of {0} name '{1}' to {2}.".format('category' if is_category else 'tag', tag, previous[0]))
+ return previous[0]
+ else:
+ LOGGER.error("Unknown tag sanitizing strategy '{0}'!".format(self.tag_saniziting_strategy))
+ sys.exit(1)
+ return tag
+
def import_postpage_item(self, item, wordpress_namespace, out_folder=None, attachments=None):
"""Take an item from the feed and creates a post file."""
if out_folder is None:
out_folder = 'posts'
title = get_text_tag(item, 'title', 'NO TITLE')
+
+ # titles can have line breaks in them, particularly when they are
+ # created by third-party tools that post to Wordpress.
+ # Handle windows-style and unix-style line endings.
+ title = title.replace('\r\n', ' ').replace('\n', ' ')
+
# link is something like http://foo.com/2012/09/01/hello-world/
# So, take the path, utils.slugify it, and that's our slug
link = get_text_tag(item, 'link', None)
@@ -760,7 +890,10 @@ class CommandImportWordpress(Command, ImportMixin):
path = unquote(parsed.path.strip('/'))
try:
- path = path.decode('utf8')
+ if isinstance(path, bytes):
+ path = path.decode('utf8', 'replace')
+ else:
+ path = path
except AttributeError:
pass
@@ -782,7 +915,7 @@ class CommandImportWordpress(Command, ImportMixin):
else:
if len(pathlist) > 1:
out_folder = os.path.join(*([out_folder] + pathlist[:-1]))
- slug = utils.slugify(pathlist[-1])
+ slug = utils.slugify(pathlist[-1], self.lang)
description = get_text_tag(item, 'description', '')
post_date = get_text_tag(
@@ -809,17 +942,19 @@ class CommandImportWordpress(Command, ImportMixin):
tags = []
categories = []
+ post_status = 'published'
+ has_math = "no"
if status == 'trash':
- LOGGER.warn('Trashed post "{0}" will not be imported.'.format(title))
+ LOGGER.warning('Trashed post "{0}" will not be imported.'.format(title))
return False
elif status == 'private':
- tags.append('private')
is_draft = False
is_private = True
+ post_status = 'private'
elif status != 'publish':
- tags.append('draft')
is_draft = True
is_private = False
+ post_status = 'draft'
else:
is_draft = False
is_private = False
@@ -831,14 +966,23 @@ class CommandImportWordpress(Command, ImportMixin):
type = tag.attrib['domain']
if text == 'Uncategorized' and type == 'category':
continue
- self.all_tags.add(text)
if type == 'category':
- categories.append(type)
+ categories.append(text)
else:
tags.append(text)
if '$latex' in content:
- tags.append('mathjax')
+ has_math = "yes"
+
+ for i, cat in enumerate(categories[:]):
+ cat = self._sanitize(cat, True)
+ categories[i] = cat
+ self.all_tags.add(cat)
+
+ for i, tag in enumerate(tags[:]):
+ tag = self._sanitize(tag, False)
+ tags[i] = tag
+ self.all_tags.add(tag)
# Find post format if it's there
post_format = 'wp'
@@ -849,53 +993,75 @@ class CommandImportWordpress(Command, ImportMixin):
post_format = 'wp'
if is_draft and self.exclude_drafts:
- LOGGER.notice('Draft "{0}" will not be imported.'.format(title))
+ LOGGER.warning('Draft "{0}" will not be imported.'.format(title))
return False
elif is_private and self.exclude_privates:
- LOGGER.notice('Private post "{0}" will not be imported.'.format(title))
+ LOGGER.warning('Private post "{0}" will not be imported.'.format(title))
return False
elif content.strip() or self.import_empty_items:
# If no content is found, no files are written.
self.url_map[link] = (self.context['SITE_URL'] +
out_folder.rstrip('/') + '/' + slug +
'.html').replace(os.sep, '/')
- if hasattr(self, "separate_qtranslate_content") \
- and self.separate_qtranslate_content:
- content_translations = separate_qtranslate_content(content)
+ default_language = self.context["DEFAULT_LANG"]
+ if self.separate_qtranslate_content:
+ content_translations = separate_qtranslate_tagged_langs(content)
+ title_translations = separate_qtranslate_tagged_langs(title)
else:
content_translations = {"": content}
- default_language = self.context["DEFAULT_LANG"]
+ title_translations = {"": title}
+ # in case of mismatch between the languages found in the title and in the content
+ default_title = title_translations.get(default_language, title)
+ extra_languages = [lang for lang in content_translations.keys() if lang not in ("", default_language)]
+ for extra_lang in extra_languages:
+ self.extra_languages.add(extra_lang)
+ translations_dict = get_default_translations_dict(default_language, extra_languages)
+ current_translations_config = {
+ "DEFAULT_LANG": default_language,
+ "TRANSLATIONS": translations_dict,
+ "TRANSLATIONS_PATTERN": self.context["TRANSLATIONS_PATTERN"]
+ }
for lang, content in content_translations.items():
try:
content, extension, rewrite_html = self.transform_content(content, post_format, attachments)
- except:
+ except Exception:
LOGGER.error(('Cannot interpret post "{0}" (language {1}) with post ' +
'format {2}!').format(os.path.join(out_folder, slug), lang, post_format))
return False
- if lang:
- out_meta_filename = slug + '.meta'
- if lang == default_language:
- out_content_filename = slug + '.' + extension
- else:
- out_content_filename \
- = utils.get_translation_candidate(self.context,
- slug + "." + extension, lang)
- self.extra_languages.add(lang)
- meta_slug = slug
- else:
- out_meta_filename = slug + '.meta'
- out_content_filename = slug + '.' + extension
- meta_slug = slug
+
+ out_meta_filename, out_content_filename = self._create_meta_and_content_filenames(
+ slug, extension, lang, default_language, current_translations_config)
+
tags, other_meta = self._create_metadata(status, excerpt, tags, categories,
post_name=os.path.join(out_folder, slug))
- self.write_metadata(os.path.join(self.output_folder, out_folder,
- out_meta_filename),
- title, meta_slug, post_date, description, tags, **other_meta)
- self.write_content(
- os.path.join(self.output_folder,
- out_folder, out_content_filename),
- content,
- rewrite_html)
+ current_title = title_translations.get(lang, default_title)
+ meta = {
+ "title": current_title,
+ "slug": slug,
+ "date": post_date,
+ "description": description,
+ "tags": ','.join(tags),
+ "status": post_status,
+ "has_math": has_math,
+ }
+ meta.update(other_meta)
+ if self.onefile:
+ self.write_post(
+ os.path.join(self.output_folder,
+ out_folder, out_content_filename),
+ content,
+ meta,
+ self._get_compiler(),
+ rewrite_html)
+ else:
+ self.write_metadata(os.path.join(self.output_folder, out_folder,
+ out_meta_filename),
+ current_title, slug, post_date, description, tags, **other_meta)
+ self.write_content(
+ os.path.join(self.output_folder,
+ out_folder, out_content_filename),
+ content,
+ rewrite_html)
if self.export_comments:
comments = []
@@ -905,13 +1071,13 @@ class CommandImportWordpress(Command, ImportMixin):
comments.append(comment)
for comment in comments:
- comment_filename = slug + "." + str(comment['id']) + ".wpcomment"
+ comment_filename = "{0}.{1}.wpcomment".format(slug, comment['id'])
self._write_comment(os.path.join(self.output_folder, out_folder, comment_filename), comment)
return (out_folder, slug)
else:
- LOGGER.warn(('Not going to import "{0}" because it seems to contain'
- ' no content.').format(title))
+ LOGGER.warning(('Not going to import "{0}" because it seems to contain'
+ ' no content.').format(title))
return False
def _extract_item_info(self, item):
@@ -937,7 +1103,7 @@ class CommandImportWordpress(Command, ImportMixin):
if parent_id is not None and int(parent_id) != 0:
self.attachments[int(parent_id)][post_id] = data
else:
- LOGGER.warn("Attachment #{0} ({1}) has no parent!".format(post_id, data['files']))
+ LOGGER.warning("Attachment #{0} ({1}) has no parent!".format(post_id, data['files']))
def write_attachments_info(self, path, attachments):
"""Write attachments info file."""
@@ -955,7 +1121,7 @@ class CommandImportWordpress(Command, ImportMixin):
if post_type == 'post':
out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'posts', attachments)
else:
- out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'stories', attachments)
+ out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'pages', attachments)
# Process attachment data
if attachments is not None:
# If post was exported, store data
@@ -975,8 +1141,8 @@ class CommandImportWordpress(Command, ImportMixin):
self.process_item_if_post_or_page(item)
# Assign attachments to posts
for post_id in self.attachments:
- LOGGER.warn(("Found attachments for post or page #{0}, but didn't find post or page. " +
- "(Attachments: {1})").format(post_id, [e['files'][0] for e in self.attachments[post_id].values()]))
+ LOGGER.warning(("Found attachments for post or page #{0}, but didn't find post or page. " +
+ "(Attachments: {1})").format(post_id, [e['files'][0] for e in self.attachments[post_id].values()]))
def get_text_tag(tag, name, default):
@@ -990,15 +1156,20 @@ def get_text_tag(tag, name, default):
return default
-def separate_qtranslate_content(text):
- """Parse the content of a wordpress post or page and separate qtranslate languages.
+def separate_qtranslate_tagged_langs(text):
+ """Parse the content of a wordpress post or page and separate languages.
+
+ For qtranslateX tags: [:LL]blabla[:]
- qtranslate tags: <!--:LL-->blabla<!--:-->
+ Note: qtranslate* plugins had a troubled history and used various
+ tags over time, application of the 'modernize_qtranslate_tags'
+ function is required for this function to handle most of the legacy
+ cases.
"""
- # TODO: uniformize qtranslate tags <!--/en--> => <!--:-->
- qt_start = "<!--:"
- qt_end = "-->"
- qt_end_with_lang_len = 5
+ qt_start = "[:"
+ qt_end = "]"
+ qt_end_len = len(qt_end)
+ qt_end_with_lang_len = qt_end_len + 2
qt_chunks = text.split(qt_start)
content_by_lang = {}
common_txt_list = []
@@ -1010,9 +1181,9 @@ def separate_qtranslate_content(text):
# be some piece of common text or tags, or just nothing
lang = "" # default language
c = c.lstrip(qt_end)
- if not c:
+ if not c.strip():
continue
- elif c[2:].startswith(qt_end):
+ elif c[2:qt_end_with_lang_len].startswith(qt_end):
# a language specific section (with language code at the begining)
lang = c[:2]
c = c[qt_end_with_lang_len:]
@@ -1033,3 +1204,26 @@ def separate_qtranslate_content(text):
for l in content_by_lang.keys():
content_by_lang[l] = " ".join(content_by_lang[l])
return content_by_lang
+
+
def modernize_qtranslate_tags(xml_bytes):
    """
    Uniformize the "tags" used by the various versions of qtranslate.

    Language start markers ``<!--:LG-->`` / ``<!--LG-->`` become ``[:LG]``
    and end markers ``<!--:-->`` / ``<!--/LG-->`` become ``[:]``, so the
    resulting byte string only contains one set of qtranslate tags.

    Afterwards ``&``, ``<`` and ``>`` inside every ``<title>...</title>``
    element found on a single line are replaced by their XML entities.
    """
    def _escape_title(found):
        inner = found.group(1)
        # '&' has to go first so the entities added below stay intact.
        for raw, entity in ((b"&", b"&amp;"), (b"<", b"&lt;"), (b">", b"&gt;")):
            inner = inner.replace(raw, entity)
        return b"<title>" + inner + b"</title>"

    converted = re.sub(b"<!--:?(\\w{2})-->", b"[:\\1]", xml_bytes)
    converted = re.sub(b"<!--(/\\w{2}|:)-->", b"[:]", converted)
    return re.sub(b"<title>(.*?)</title>", _escape_title, converted)