aboutsummaryrefslogtreecommitdiffstats
path: root/nikola/plugins/command/import_wordpress.py
diff options
context:
space:
mode:
author: Dererk <dererk@satellogic.com> 2016-11-15 14:18:46 -0300
committer: Dererk <dererk@satellogic.com> 2016-11-15 14:18:46 -0300
commit: ffb671c61a24a9086343b54bad080e145ff33fc5 (patch)
tree: 2c5291f7a34edf4afdc8e07887a148291bfa3fa1 /nikola/plugins/command/import_wordpress.py
parent: 4e3224c012df9f74f010eb92203520515e8537b9 (diff)
New upstream version 7.8.1 (upstream/7.8.1)
Diffstat (limited to 'nikola/plugins/command/import_wordpress.py')
-rw-r--r-- nikola/plugins/command/import_wordpress.py 177
1 files changed, 140 insertions, 37 deletions
diff --git a/nikola/plugins/command/import_wordpress.py b/nikola/plugins/command/import_wordpress.py
index 69ef144..0b48583 100644
--- a/nikola/plugins/command/import_wordpress.py
+++ b/nikola/plugins/command/import_wordpress.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright © 2012-2015 Roberto Alsina and others.
+# Copyright © 2012-2016 Roberto Alsina and others.
# Permission is hereby granted, free of charge, to any
# person obtaining a copy of this software and associated
@@ -38,6 +38,11 @@ from lxml import etree
from collections import defaultdict
try:
+ import html2text
+except:
+ html2text = None
+
+try:
from urlparse import urlparse
from urllib import unquote
except ImportError:
@@ -170,6 +175,20 @@ class CommandImportWordpress(Command, ImportMixin):
'help': "Export comments as .wpcomment files",
},
{
+ 'name': 'html2text',
+ 'long': 'html2text',
+ 'default': False,
+ 'type': bool,
+ 'help': "Uses html2text (needs to be installed with pip) to transform WordPress posts to MarkDown during import",
+ },
+ {
+ 'name': 'transform_to_markdown',
+ 'long': 'transform-to-markdown',
+ 'default': False,
+ 'type': bool,
+ 'help': "Uses WordPress page compiler to transform WordPress posts to HTML and then use html2text to transform them to MarkDown during import",
+ },
+ {
'name': 'transform_to_html',
'long': 'transform-to-html',
'default': False,
@@ -191,14 +210,35 @@ class CommandImportWordpress(Command, ImportMixin):
'help': "Automatically installs the WordPress page compiler (either locally or in the new site) if required by other options.\nWarning: the compiler is GPL software!",
},
{
- 'name': 'tag_saniziting_strategy',
- 'long': 'tag-saniziting-strategy',
+ 'name': 'tag_sanitizing_strategy',
+ 'long': 'tag-sanitizing-strategy',
'default': 'first',
'help': 'lower: Convert all tag and category names to lower case\nfirst: Keep first spelling of tag or category name',
},
+ {
+ 'name': 'one_file',
+ 'long': 'one-file',
+ 'default': False,
+ 'type': bool,
+ 'help': "Save imported posts in the more modern one-file format.",
+ },
]
all_tags = set([])
+ def _get_compiler(self):
+ """Return whatever compiler we will use."""
+ self._find_wordpress_compiler()
+ if self.wordpress_page_compiler is not None:
+ return self.wordpress_page_compiler
+ plugin_info = self.site.plugin_manager.getPluginByName('markdown', 'PageCompiler')
+ if plugin_info is not None:
+ if not plugin_info.is_activated:
+ self.site.plugin_manager.activatePluginByName(plugin_info.name)
+ plugin_info.plugin_object.set_site(self.site)
+ return plugin_info.plugin_object
+ else:
+ LOGGER.error("Can't find markdown post compiler.")
+
def _find_wordpress_compiler(self):
"""Find WordPress compiler plugin."""
if self.wordpress_page_compiler is not None:
@@ -223,6 +263,8 @@ class CommandImportWordpress(Command, ImportMixin):
'putting these arguments before the filename if you '
'are running into problems.'.format(args))
+ self.onefile = options.get('one_file', False)
+
self.import_into_existing_site = False
self.url_map = {}
self.timezone = None
@@ -239,6 +281,9 @@ class CommandImportWordpress(Command, ImportMixin):
self.export_categories_as_categories = options.get('export_categories_as_categories', False)
self.export_comments = options.get('export_comments', False)
+ self.html2text = options.get('html2text', False)
+ self.transform_to_markdown = options.get('transform_to_markdown', False)
+
self.transform_to_html = options.get('transform_to_html', False)
self.use_wordpress_compiler = options.get('use_wordpress_compiler', False)
self.install_wordpress_compiler = options.get('install_wordpress_compiler', False)
@@ -257,10 +302,18 @@ class CommandImportWordpress(Command, ImportMixin):
self.separate_qtranslate_content = options.get('separate_qtranslate_content')
self.translations_pattern = options.get('translations_pattern')
- if self.transform_to_html and self.use_wordpress_compiler:
- LOGGER.warn("It does not make sense to combine --transform-to-html with --use-wordpress-compiler, as the first converts all posts to HTML and the latter option affects zero posts.")
+ count = (1 if self.html2text else 0) + (1 if self.transform_to_html else 0) + (1 if self.transform_to_markdown else 0)
+ if count > 1:
+ LOGGER.error("You can use at most one of the options --html2text, --transform-to-html and --transform-to-markdown.")
+ return False
+ if (self.html2text or self.transform_to_html or self.transform_to_markdown) and self.use_wordpress_compiler:
+ LOGGER.warn("It does not make sense to combine --use-wordpress-compiler with any of --html2text, --transform-to-html and --transform-to-markdown, as the latter convert all posts to HTML and the first option then affects zero posts.")
+
+ if (self.html2text or self.transform_to_markdown) and not html2text:
+ LOGGER.error("You need to install html2text via 'pip install html2text' before you can use the --html2text and --transform-to-markdown options.")
+ return False
- if self.transform_to_html:
+ if self.transform_to_html or self.transform_to_markdown:
self._find_wordpress_compiler()
if not self.wordpress_page_compiler and self.install_wordpress_compiler:
if not install_plugin(self.site, 'wordpress_compiler', output_dir='plugins'): # local install
@@ -334,7 +387,7 @@ class CommandImportWordpress(Command, ImportMixin):
self.context['TRANSLATIONS'] = format_default_translations_config(
self.extra_languages)
self.context['REDIRECTIONS'] = self.configure_redirections(
- self.url_map)
+ self.url_map, self.base_dir)
if self.timezone:
self.context['TIMEZONE'] = self.timezone
if self.export_categories_as_categories:
@@ -350,7 +403,7 @@ class CommandImportWordpress(Command, ImportMixin):
tag_str = tag
except AttributeError:
tag_str = tag
- tag = utils.slugify(tag_str)
+ tag = utils.slugify(tag_str, self.lang)
src_url = '{}tag/{}'.format(self.context['SITE_URL'], tag)
dst_url = self.site.link('tag', tag)
if src_url != dst_url:
@@ -382,7 +435,7 @@ class CommandImportWordpress(Command, ImportMixin):
if b'<atom:link rel=' in line:
continue
xml.append(line)
- return b'\n'.join(xml)
+ return b''.join(xml)
@classmethod
def get_channel_from_file(cls, filename):
@@ -396,7 +449,8 @@ class CommandImportWordpress(Command, ImportMixin):
wordpress_namespace = channel.nsmap['wp']
context = SAMPLE_CONF.copy()
- context['DEFAULT_LANG'] = get_text_tag(channel, 'language', 'en')[:2]
+ self.lang = get_text_tag(channel, 'language', 'en')[:2]
+ context['DEFAULT_LANG'] = self.lang
context['TRANSLATIONS_PATTERN'] = DEFAULT_TRANSLATIONS_PATTERN
context['BLOG_TITLE'] = get_text_tag(channel, 'title',
'PUT TITLE HERE')
@@ -428,7 +482,7 @@ class CommandImportWordpress(Command, ImportMixin):
PAGES = '(\n'
for extension in extensions:
POSTS += ' ("posts/*.{0}", "posts", "post.tmpl"),\n'.format(extension)
- PAGES += ' ("stories/*.{0}", "stories", "story.tmpl"),\n'.format(extension)
+ PAGES += ' ("pages/*.{0}", "pages", "story.tmpl"),\n'.format(extension)
POSTS += ')\n'
PAGES += ')\n'
context['POSTS'] = POSTS
@@ -446,9 +500,6 @@ class CommandImportWordpress(Command, ImportMixin):
def download_url_content_to_file(self, url, dst_path):
"""Download some content (attachments) to a file."""
- if self.no_downloads:
- return
-
try:
request = requests.get(url, auth=self.auth)
if request.status_code >= 400:
@@ -468,10 +519,13 @@ class CommandImportWordpress(Command, ImportMixin):
'foo')
path = urlparse(url).path
dst_path = os.path.join(*([self.output_folder, 'files'] + list(path.split('/'))))
- dst_dir = os.path.dirname(dst_path)
- utils.makedirs(dst_dir)
- LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
- self.download_url_content_to_file(url, dst_path)
+ if self.no_downloads:
+ LOGGER.info("Skipping downloading {0} => {1}".format(url, dst_path))
+ else:
+ dst_dir = os.path.dirname(dst_path)
+ utils.makedirs(dst_dir)
+ LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
+ self.download_url_content_to_file(url, dst_path)
dst_url = '/'.join(dst_path.split(os.sep)[2:])
links[link] = '/' + dst_url
links[url] = '/' + dst_url
@@ -517,6 +571,8 @@ class CommandImportWordpress(Command, ImportMixin):
if meta_key in metadata:
image_meta = metadata[meta_key]
+ if not image_meta:
+ continue
dst_meta = {}
def add(our_key, wp_key, is_int=False, ignore_zero=False, is_float=False):
@@ -562,15 +618,18 @@ class CommandImportWordpress(Command, ImportMixin):
meta = {}
meta['size'] = size.decode('utf-8')
if width_key in metadata[size_key][size] and height_key in metadata[size_key][size]:
- meta['width'] = metadata[size_key][size][width_key]
- meta['height'] = metadata[size_key][size][height_key]
+ meta['width'] = int(metadata[size_key][size][width_key])
+ meta['height'] = int(metadata[size_key][size][height_key])
path = urlparse(url).path
dst_path = os.path.join(*([self.output_folder, 'files'] + list(path.split('/'))))
- dst_dir = os.path.dirname(dst_path)
- utils.makedirs(dst_dir)
- LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
- self.download_url_content_to_file(url, dst_path)
+ if self.no_downloads:
+ LOGGER.info("Skipping downloading {0} => {1}".format(url, dst_path))
+ else:
+ dst_dir = os.path.dirname(dst_path)
+ utils.makedirs(dst_dir)
+ LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
+ self.download_url_content_to_file(url, dst_path)
dst_url = '/'.join(dst_path.split(os.sep)[2:])
links[url] = '/' + dst_url
@@ -638,10 +697,10 @@ class CommandImportWordpress(Command, ImportMixin):
return content
@staticmethod
- def transform_caption(content):
+ def transform_caption(content, use_html=False):
"""Transform captions."""
- new_caption = re.sub(r'\[/caption\]', '', content)
- new_caption = re.sub(r'\[caption.*\]', '', new_caption)
+ new_caption = re.sub(r'\[/caption\]', '</h1>' if use_html else '', content)
+ new_caption = re.sub(r'\[caption.*\]', '<h1>' if use_html else '', new_caption)
return new_caption
@@ -664,6 +723,26 @@ class CommandImportWordpress(Command, ImportMixin):
except TypeError: # old versions of the plugin don't support the additional argument
content = self.wordpress_page_compiler.compile_to_string(content)
return content, 'html', True
+ elif self.transform_to_markdown:
+ # First convert to HTML with WordPress plugin
+ additional_data = {}
+ if attachments is not None:
+ additional_data['attachments'] = attachments
+ try:
+ content = self.wordpress_page_compiler.compile_to_string(content, additional_data=additional_data)
+ except TypeError: # old versions of the plugin don't support the additional argument
+ content = self.wordpress_page_compiler.compile_to_string(content)
+ # Now convert to MarkDown with html2text
+ h = html2text.HTML2Text()
+ content = h.handle(content)
+ return content, 'md', False
+ elif self.html2text:
+ # TODO: what to do with [code] blocks?
+ # content = self.transform_code(content)
+ content = self.transform_caption(content, use_html=True)
+ h = html2text.HTML2Text()
+ content = h.handle(content)
+ return content, 'md', False
elif self.use_wordpress_compiler:
return content, 'wp', False
else:
@@ -781,6 +860,12 @@ class CommandImportWordpress(Command, ImportMixin):
out_folder = 'posts'
title = get_text_tag(item, 'title', 'NO TITLE')
+
+ # titles can have line breaks in them, particularly when they are
+ # created by third-party tools that post to Wordpress.
+ # Handle windows-style and unix-style line endings.
+ title = title.replace('\r\n', ' ').replace('\n', ' ')
+
# link is something like http://foo.com/2012/09/01/hello-world/
# So, take the path, utils.slugify it, and that's our slug
link = get_text_tag(item, 'link', None)
@@ -813,7 +898,7 @@ class CommandImportWordpress(Command, ImportMixin):
else:
if len(pathlist) > 1:
out_folder = os.path.join(*([out_folder] + pathlist[:-1]))
- slug = utils.slugify(pathlist[-1])
+ slug = utils.slugify(pathlist[-1], self.lang)
description = get_text_tag(item, 'description', '')
post_date = get_text_tag(
@@ -928,14 +1013,32 @@ class CommandImportWordpress(Command, ImportMixin):
meta_slug = slug
tags, other_meta = self._create_metadata(status, excerpt, tags, categories,
post_name=os.path.join(out_folder, slug))
- self.write_metadata(os.path.join(self.output_folder, out_folder,
- out_meta_filename),
- title, meta_slug, post_date, description, tags, **other_meta)
- self.write_content(
- os.path.join(self.output_folder,
- out_folder, out_content_filename),
- content,
- rewrite_html)
+
+ meta = {
+ "title": title,
+ "slug": meta_slug,
+ "date": post_date,
+ "description": description,
+ "tags": ','.join(tags),
+ }
+ meta.update(other_meta)
+ if self.onefile:
+ self.write_post(
+ os.path.join(self.output_folder,
+ out_folder, out_content_filename),
+ content,
+ meta,
+ self._get_compiler(),
+ rewrite_html)
+ else:
+ self.write_metadata(os.path.join(self.output_folder, out_folder,
+ out_meta_filename),
+ title, meta_slug, post_date, description, tags, **other_meta)
+ self.write_content(
+ os.path.join(self.output_folder,
+ out_folder, out_content_filename),
+ content,
+ rewrite_html)
if self.export_comments:
comments = []
@@ -995,7 +1098,7 @@ class CommandImportWordpress(Command, ImportMixin):
if post_type == 'post':
out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'posts', attachments)
else:
- out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'stories', attachments)
+ out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'pages', attachments)
# Process attachment data
if attachments is not None:
# If post was exported, store data