about summary refs log tree commit diff stats
path: root/nikola/plugins/command/import_wordpress.py
diff options
context:
space:
mode:
Diffstat (limited to 'nikola/plugins/command/import_wordpress.py')
-rw-r--r-- nikola/plugins/command/import_wordpress.py 233
1 files changed, 188 insertions, 45 deletions
diff --git a/nikola/plugins/command/import_wordpress.py b/nikola/plugins/command/import_wordpress.py
index a652ec8..0b48583 100644
--- a/nikola/plugins/command/import_wordpress.py
+++ b/nikola/plugins/command/import_wordpress.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright © 2012-2015 Roberto Alsina and others.
+# Copyright © 2012-2016 Roberto Alsina and others.
# Permission is hereby granted, free of charge, to any
# person obtaining a copy of this software and associated
@@ -38,6 +38,11 @@ from lxml import etree
from collections import defaultdict
try:
+ import html2text
+except:
+ html2text = None
+
+try:
from urlparse import urlparse
from urllib import unquote
except ImportError:
@@ -50,7 +55,7 @@ except ImportError:
from nikola.plugin_categories import Command
from nikola import utils
-from nikola.utils import req_missing
+from nikola.utils import req_missing, unicode_str
from nikola.plugins.basic_import import ImportMixin, links
from nikola.nikola import DEFAULT_TRANSLATIONS_PATTERN
from nikola.plugins.command.init import SAMPLE_CONF, prepare_config, format_default_translations_config
@@ -88,7 +93,6 @@ def install_plugin(site, plugin_name, output_dir=None, show_install_notes=False)
class CommandImportWordpress(Command, ImportMixin):
-
"""Import a WordPress dump."""
name = "import_wordpress"
@@ -171,6 +175,20 @@ class CommandImportWordpress(Command, ImportMixin):
'help': "Export comments as .wpcomment files",
},
{
+ 'name': 'html2text',
+ 'long': 'html2text',
+ 'default': False,
+ 'type': bool,
+ 'help': "Uses html2text (needs to be installed with pip) to transform WordPress posts to MarkDown during import",
+ },
+ {
+ 'name': 'transform_to_markdown',
+ 'long': 'transform-to-markdown',
+ 'default': False,
+ 'type': bool,
+ 'help': "Uses WordPress page compiler to transform WordPress posts to HTML and then use html2text to transform them to MarkDown during import",
+ },
+ {
'name': 'transform_to_html',
'long': 'transform-to-html',
'default': False,
@@ -191,9 +209,36 @@ class CommandImportWordpress(Command, ImportMixin):
'type': bool,
'help': "Automatically installs the WordPress page compiler (either locally or in the new site) if required by other options.\nWarning: the compiler is GPL software!",
},
+ {
+ 'name': 'tag_sanitizing_strategy',
+ 'long': 'tag-sanitizing-strategy',
+ 'default': 'first',
+ 'help': 'lower: Convert all tag and category names to lower case\nfirst: Keep first spelling of tag or category name',
+ },
+ {
+ 'name': 'one_file',
+ 'long': 'one-file',
+ 'default': False,
+ 'type': bool,
+ 'help': "Save imported posts in the more modern one-file format.",
+ },
]
all_tags = set([])
+ def _get_compiler(self):
+ """Return whatever compiler we will use."""
+ self._find_wordpress_compiler()
+ if self.wordpress_page_compiler is not None:
+ return self.wordpress_page_compiler
+ plugin_info = self.site.plugin_manager.getPluginByName('markdown', 'PageCompiler')
+ if plugin_info is not None:
+ if not plugin_info.is_activated:
+ self.site.plugin_manager.activatePluginByName(plugin_info.name)
+ plugin_info.plugin_object.set_site(self.site)
+ return plugin_info.plugin_object
+ else:
+ LOGGER.error("Can't find markdown post compiler.")
+
def _find_wordpress_compiler(self):
"""Find WordPress compiler plugin."""
if self.wordpress_page_compiler is not None:
@@ -218,6 +263,8 @@ class CommandImportWordpress(Command, ImportMixin):
'putting these arguments before the filename if you '
'are running into problems.'.format(args))
+ self.onefile = options.get('one_file', False)
+
self.import_into_existing_site = False
self.url_map = {}
self.timezone = None
@@ -234,11 +281,16 @@ class CommandImportWordpress(Command, ImportMixin):
self.export_categories_as_categories = options.get('export_categories_as_categories', False)
self.export_comments = options.get('export_comments', False)
+ self.html2text = options.get('html2text', False)
+ self.transform_to_markdown = options.get('transform_to_markdown', False)
+
self.transform_to_html = options.get('transform_to_html', False)
self.use_wordpress_compiler = options.get('use_wordpress_compiler', False)
self.install_wordpress_compiler = options.get('install_wordpress_compiler', False)
self.wordpress_page_compiler = None
+ self.tag_saniziting_strategy = options.get('tag_sanitizing_strategy', 'first')
+
self.auth = None
if options.get('download_auth') is not None:
username_password = options.get('download_auth')
@@ -250,10 +302,18 @@ class CommandImportWordpress(Command, ImportMixin):
self.separate_qtranslate_content = options.get('separate_qtranslate_content')
self.translations_pattern = options.get('translations_pattern')
- if self.transform_to_html and self.use_wordpress_compiler:
- LOGGER.warn("It does not make sense to combine --transform-to-html with --use-wordpress-compiler, as the first converts all posts to HTML and the latter option affects zero posts.")
+ count = (1 if self.html2text else 0) + (1 if self.transform_to_html else 0) + (1 if self.transform_to_markdown else 0)
+ if count > 1:
+ LOGGER.error("You can use at most one of the options --html2text, --transform-to-html and --transform-to-markdown.")
+ return False
+ if (self.html2text or self.transform_to_html or self.transform_to_markdown) and self.use_wordpress_compiler:
+ LOGGER.warn("It does not make sense to combine --use-wordpress-compiler with any of --html2text, --transform-to-html and --transform-to-markdown, as the latter convert all posts to HTML and the first option then affects zero posts.")
+
+ if (self.html2text or self.transform_to_markdown) and not html2text:
+ LOGGER.error("You need to install html2text via 'pip install html2text' before you can use the --html2text and --transform-to-markdown options.")
+ return False
- if self.transform_to_html:
+ if self.transform_to_html or self.transform_to_markdown:
self._find_wordpress_compiler()
if not self.wordpress_page_compiler and self.install_wordpress_compiler:
if not install_plugin(self.site, 'wordpress_compiler', output_dir='plugins'): # local install
@@ -327,7 +387,7 @@ class CommandImportWordpress(Command, ImportMixin):
self.context['TRANSLATIONS'] = format_default_translations_config(
self.extra_languages)
self.context['REDIRECTIONS'] = self.configure_redirections(
- self.url_map)
+ self.url_map, self.base_dir)
if self.timezone:
self.context['TIMEZONE'] = self.timezone
if self.export_categories_as_categories:
@@ -337,10 +397,13 @@ class CommandImportWordpress(Command, ImportMixin):
# Add tag redirects
for tag in self.all_tags:
try:
- tag_str = tag.decode('utf8')
+ if isinstance(tag, utils.bytes_str):
+ tag_str = tag.decode('utf8', 'replace')
+ else:
+ tag_str = tag
except AttributeError:
tag_str = tag
- tag = utils.slugify(tag_str)
+ tag = utils.slugify(tag_str, self.lang)
src_url = '{}tag/{}'.format(self.context['SITE_URL'], tag)
dst_url = self.site.link('tag', tag)
if src_url != dst_url:
@@ -372,7 +435,7 @@ class CommandImportWordpress(Command, ImportMixin):
if b'<atom:link rel=' in line:
continue
xml.append(line)
- return b'\n'.join(xml)
+ return b''.join(xml)
@classmethod
def get_channel_from_file(cls, filename):
@@ -386,7 +449,8 @@ class CommandImportWordpress(Command, ImportMixin):
wordpress_namespace = channel.nsmap['wp']
context = SAMPLE_CONF.copy()
- context['DEFAULT_LANG'] = get_text_tag(channel, 'language', 'en')[:2]
+ self.lang = get_text_tag(channel, 'language', 'en')[:2]
+ context['DEFAULT_LANG'] = self.lang
context['TRANSLATIONS_PATTERN'] = DEFAULT_TRANSLATIONS_PATTERN
context['BLOG_TITLE'] = get_text_tag(channel, 'title',
'PUT TITLE HERE')
@@ -418,7 +482,7 @@ class CommandImportWordpress(Command, ImportMixin):
PAGES = '(\n'
for extension in extensions:
POSTS += ' ("posts/*.{0}", "posts", "post.tmpl"),\n'.format(extension)
- PAGES += ' ("stories/*.{0}", "stories", "story.tmpl"),\n'.format(extension)
+ PAGES += ' ("pages/*.{0}", "pages", "story.tmpl"),\n'.format(extension)
POSTS += ')\n'
PAGES += ')\n'
context['POSTS'] = POSTS
@@ -436,9 +500,6 @@ class CommandImportWordpress(Command, ImportMixin):
def download_url_content_to_file(self, url, dst_path):
"""Download some content (attachments) to a file."""
- if self.no_downloads:
- return
-
try:
request = requests.get(url, auth=self.auth)
if request.status_code >= 400:
@@ -458,10 +519,13 @@ class CommandImportWordpress(Command, ImportMixin):
'foo')
path = urlparse(url).path
dst_path = os.path.join(*([self.output_folder, 'files'] + list(path.split('/'))))
- dst_dir = os.path.dirname(dst_path)
- utils.makedirs(dst_dir)
- LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
- self.download_url_content_to_file(url, dst_path)
+ if self.no_downloads:
+ LOGGER.info("Skipping downloading {0} => {1}".format(url, dst_path))
+ else:
+ dst_dir = os.path.dirname(dst_path)
+ utils.makedirs(dst_dir)
+ LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
+ self.download_url_content_to_file(url, dst_path)
dst_url = '/'.join(dst_path.split(os.sep)[2:])
links[link] = '/' + dst_url
links[url] = '/' + dst_url
@@ -507,6 +571,8 @@ class CommandImportWordpress(Command, ImportMixin):
if meta_key in metadata:
image_meta = metadata[meta_key]
+ if not image_meta:
+ continue
dst_meta = {}
def add(our_key, wp_key, is_int=False, ignore_zero=False, is_float=False):
@@ -552,15 +618,18 @@ class CommandImportWordpress(Command, ImportMixin):
meta = {}
meta['size'] = size.decode('utf-8')
if width_key in metadata[size_key][size] and height_key in metadata[size_key][size]:
- meta['width'] = metadata[size_key][size][width_key]
- meta['height'] = metadata[size_key][size][height_key]
+ meta['width'] = int(metadata[size_key][size][width_key])
+ meta['height'] = int(metadata[size_key][size][height_key])
path = urlparse(url).path
dst_path = os.path.join(*([self.output_folder, 'files'] + list(path.split('/'))))
- dst_dir = os.path.dirname(dst_path)
- utils.makedirs(dst_dir)
- LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
- self.download_url_content_to_file(url, dst_path)
+ if self.no_downloads:
+ LOGGER.info("Skipping downloading {0} => {1}".format(url, dst_path))
+ else:
+ dst_dir = os.path.dirname(dst_path)
+ utils.makedirs(dst_dir)
+ LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
+ self.download_url_content_to_file(url, dst_path)
dst_url = '/'.join(dst_path.split(os.sep)[2:])
links[url] = '/' + dst_url
@@ -604,7 +673,7 @@ class CommandImportWordpress(Command, ImportMixin):
def transform_code(self, content):
"""Transform code blocks."""
- # http://en.support.wordpress.com/code/posting-source-code/. There are
+ # https://en.support.wordpress.com/code/posting-source-code/. There are
# a ton of things not supported here. We only do a basic [code
# lang="x"] -> ```x translation, and remove quoted html entities (<,
# >, &, and ").
@@ -628,10 +697,10 @@ class CommandImportWordpress(Command, ImportMixin):
return content
@staticmethod
- def transform_caption(content):
+ def transform_caption(content, use_html=False):
"""Transform captions."""
- new_caption = re.sub(r'\[/caption\]', '', content)
- new_caption = re.sub(r'\[caption.*\]', '', new_caption)
+ new_caption = re.sub(r'\[/caption\]', '</h1>' if use_html else '', content)
+ new_caption = re.sub(r'\[caption.*\]', '<h1>' if use_html else '', new_caption)
return new_caption
@@ -654,6 +723,26 @@ class CommandImportWordpress(Command, ImportMixin):
except TypeError: # old versions of the plugin don't support the additional argument
content = self.wordpress_page_compiler.compile_to_string(content)
return content, 'html', True
+ elif self.transform_to_markdown:
+ # First convert to HTML with WordPress plugin
+ additional_data = {}
+ if attachments is not None:
+ additional_data['attachments'] = attachments
+ try:
+ content = self.wordpress_page_compiler.compile_to_string(content, additional_data=additional_data)
+ except TypeError: # old versions of the plugin don't support the additional argument
+ content = self.wordpress_page_compiler.compile_to_string(content)
+ # Now convert to Markdown with html2text
+ h = html2text.HTML2Text()
+ content = h.handle(content)
+ return content, 'md', False
+ elif self.html2text:
+ # TODO: what to do with [code] blocks?
+ # content = self.transform_code(content)
+ content = self.transform_caption(content, use_html=True)
+ h = html2text.HTML2Text()
+ content = h.handle(content)
+ return content, 'md', False
elif self.use_wordpress_compiler:
return content, 'wp', False
else:
@@ -686,7 +775,7 @@ class CommandImportWordpress(Command, ImportMixin):
elif approved == 'spam' or approved == 'trash':
pass
else:
- LOGGER.warn("Unknown comment approved status: " + str(approved))
+ LOGGER.warn("Unknown comment approved status: {0}".format(approved))
parent = int(get_text_tag(comment, "{{{0}}}comment_parent".format(wordpress_namespace), 0))
if parent == 0:
parent = None
@@ -707,7 +796,7 @@ class CommandImportWordpress(Command, ImportMixin):
"""Write comment header line."""
if header_content is None:
return
- header_content = str(header_content).replace('\n', ' ')
+ header_content = unicode_str(header_content).replace('\n', ' ')
line = '.. ' + header_field + ': ' + header_content + '\n'
fd.write(line.encode('utf8'))
@@ -747,12 +836,36 @@ class CommandImportWordpress(Command, ImportMixin):
tags_cats = tags + categories
return tags_cats, other_meta
+ _tag_sanitize_map = {True: {}, False: {}}
+
+ def _sanitize(self, tag, is_category):
+ if self.tag_saniziting_strategy == 'lower':
+ return tag.lower()
+ if tag.lower() not in self._tag_sanitize_map[is_category]:
+ self._tag_sanitize_map[is_category][tag.lower()] = [tag]
+ return tag
+ previous = self._tag_sanitize_map[is_category][tag.lower()]
+ if self.tag_saniziting_strategy == 'first':
+ if tag != previous[0]:
+ LOGGER.warn("Changing spelling of {0} name '{1}' to {2}.".format('category' if is_category else 'tag', tag, previous[0]))
+ return previous[0]
+ else:
+ LOGGER.error("Unknown tag sanitizing strategy '{0}'!".format(self.tag_saniziting_strategy))
+ sys.exit(1)
+ return tag
+
def import_postpage_item(self, item, wordpress_namespace, out_folder=None, attachments=None):
"""Take an item from the feed and creates a post file."""
if out_folder is None:
out_folder = 'posts'
title = get_text_tag(item, 'title', 'NO TITLE')
+
+ # Titles can contain line breaks, particularly when they are
+ # created by third-party tools that post to WordPress.
+ # Handle both Windows-style and Unix-style line endings.
+ title = title.replace('\r\n', ' ').replace('\n', ' ')
+
# link is something like http://foo.com/2012/09/01/hello-world/
# So, take the path, utils.slugify it, and that's our slug
link = get_text_tag(item, 'link', None)
@@ -760,7 +873,10 @@ class CommandImportWordpress(Command, ImportMixin):
path = unquote(parsed.path.strip('/'))
try:
- path = path.decode('utf8')
+ if isinstance(path, utils.bytes_str):
+ path = path.decode('utf8', 'replace')
+ else:
+ path = path
except AttributeError:
pass
@@ -782,7 +898,7 @@ class CommandImportWordpress(Command, ImportMixin):
else:
if len(pathlist) > 1:
out_folder = os.path.join(*([out_folder] + pathlist[:-1]))
- slug = utils.slugify(pathlist[-1])
+ slug = utils.slugify(pathlist[-1], self.lang)
description = get_text_tag(item, 'description', '')
post_date = get_text_tag(
@@ -831,15 +947,24 @@ class CommandImportWordpress(Command, ImportMixin):
type = tag.attrib['domain']
if text == 'Uncategorized' and type == 'category':
continue
- self.all_tags.add(text)
if type == 'category':
- categories.append(type)
+ categories.append(text)
else:
tags.append(text)
if '$latex' in content:
tags.append('mathjax')
+ for i, cat in enumerate(categories[:]):
+ cat = self._sanitize(cat, True)
+ categories[i] = cat
+ self.all_tags.add(cat)
+
+ for i, tag in enumerate(tags[:]):
+ tag = self._sanitize(tag, False)
+ tags[i] = tag
+ self.all_tags.add(tag)
+
# Find post format if it's there
post_format = 'wp'
format_tag = [x for x in item.findall('*//{%s}meta_key' % wordpress_namespace) if x.text == '_tc_post_format']
@@ -888,14 +1013,32 @@ class CommandImportWordpress(Command, ImportMixin):
meta_slug = slug
tags, other_meta = self._create_metadata(status, excerpt, tags, categories,
post_name=os.path.join(out_folder, slug))
- self.write_metadata(os.path.join(self.output_folder, out_folder,
- out_meta_filename),
- title, meta_slug, post_date, description, tags, **other_meta)
- self.write_content(
- os.path.join(self.output_folder,
- out_folder, out_content_filename),
- content,
- rewrite_html)
+
+ meta = {
+ "title": title,
+ "slug": meta_slug,
+ "date": post_date,
+ "description": description,
+ "tags": ','.join(tags),
+ }
+ meta.update(other_meta)
+ if self.onefile:
+ self.write_post(
+ os.path.join(self.output_folder,
+ out_folder, out_content_filename),
+ content,
+ meta,
+ self._get_compiler(),
+ rewrite_html)
+ else:
+ self.write_metadata(os.path.join(self.output_folder, out_folder,
+ out_meta_filename),
+ title, meta_slug, post_date, description, tags, **other_meta)
+ self.write_content(
+ os.path.join(self.output_folder,
+ out_folder, out_content_filename),
+ content,
+ rewrite_html)
if self.export_comments:
comments = []
@@ -905,7 +1048,7 @@ class CommandImportWordpress(Command, ImportMixin):
comments.append(comment)
for comment in comments:
- comment_filename = slug + "." + str(comment['id']) + ".wpcomment"
+ comment_filename = "{0}.{1}.wpcomment".format(slug, comment['id'])
self._write_comment(os.path.join(self.output_folder, out_folder, comment_filename), comment)
return (out_folder, slug)
@@ -955,7 +1098,7 @@ class CommandImportWordpress(Command, ImportMixin):
if post_type == 'post':
out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'posts', attachments)
else:
- out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'stories', attachments)
+ out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'pages', attachments)
# Process attachment data
if attachments is not None:
# If post was exported, store data