diff options
Diffstat (limited to 'nikola/plugins/command/import_wordpress.py')
| -rw-r--r-- | nikola/plugins/command/import_wordpress.py | 233 |
1 files changed, 188 insertions, 45 deletions
diff --git a/nikola/plugins/command/import_wordpress.py b/nikola/plugins/command/import_wordpress.py index a652ec8..0b48583 100644 --- a/nikola/plugins/command/import_wordpress.py +++ b/nikola/plugins/command/import_wordpress.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright © 2012-2015 Roberto Alsina and others. +# Copyright © 2012-2016 Roberto Alsina and others. # Permission is hereby granted, free of charge, to any # person obtaining a copy of this software and associated @@ -38,6 +38,11 @@ from lxml import etree from collections import defaultdict try: + import html2text +except: + html2text = None + +try: from urlparse import urlparse from urllib import unquote except ImportError: @@ -50,7 +55,7 @@ except ImportError: from nikola.plugin_categories import Command from nikola import utils -from nikola.utils import req_missing +from nikola.utils import req_missing, unicode_str from nikola.plugins.basic_import import ImportMixin, links from nikola.nikola import DEFAULT_TRANSLATIONS_PATTERN from nikola.plugins.command.init import SAMPLE_CONF, prepare_config, format_default_translations_config @@ -88,7 +93,6 @@ def install_plugin(site, plugin_name, output_dir=None, show_install_notes=False) class CommandImportWordpress(Command, ImportMixin): - """Import a WordPress dump.""" name = "import_wordpress" @@ -171,6 +175,20 @@ class CommandImportWordpress(Command, ImportMixin): 'help': "Export comments as .wpcomment files", }, { + 'name': 'html2text', + 'long': 'html2text', + 'default': False, + 'type': bool, + 'help': "Uses html2text (needs to be installed with pip) to transform WordPress posts to MarkDown during import", + }, + { + 'name': 'transform_to_markdown', + 'long': 'transform-to-markdown', + 'default': False, + 'type': bool, + 'help': "Uses WordPress page compiler to transform WordPress posts to HTML and then use html2text to transform them to MarkDown during import", + }, + { 'name': 'transform_to_html', 'long': 'transform-to-html', 'default': False, @@ -191,9 +209,36 @@ class CommandImportWordpress(Command, ImportMixin): 'type': bool, 'help': "Automatically installs the WordPress page compiler (either locally or in the new site) if required by other options.\nWarning: the compiler is GPL software!", }, + { + 'name': 'tag_sanitizing_strategy', + 'long': 'tag-sanitizing-strategy', + 'default': 'first', + 'help': 'lower: Convert all tag and category names to lower case\nfirst: Keep first spelling of tag or category name', + }, + { + 'name': 'one_file', + 'long': 'one-file', + 'default': False, + 'type': bool, + 'help': "Save imported posts in the more modern one-file format.", + }, ] all_tags = set([]) + def _get_compiler(self): + """Return whatever compiler we will use.""" + self._find_wordpress_compiler() + if self.wordpress_page_compiler is not None: + return self.wordpress_page_compiler + plugin_info = self.site.plugin_manager.getPluginByName('markdown', 'PageCompiler') + if plugin_info is not None: + if not plugin_info.is_activated: + self.site.plugin_manager.activatePluginByName(plugin_info.name) + plugin_info.plugin_object.set_site(self.site) + return plugin_info.plugin_object + else: + LOGGER.error("Can't find markdown post compiler.") + def _find_wordpress_compiler(self): """Find WordPress compiler plugin.""" if self.wordpress_page_compiler is not None: @@ -218,6 +263,8 @@ class CommandImportWordpress(Command, ImportMixin): 'putting these arguments before the filename if you ' 'are running into problems.'.format(args)) + self.onefile = options.get('one_file', False) + self.import_into_existing_site = False self.url_map = {} self.timezone = None @@ -234,11 +281,16 @@ class CommandImportWordpress(Command, ImportMixin): self.export_categories_as_categories = options.get('export_categories_as_categories', False) self.export_comments = options.get('export_comments', False) + self.html2text = options.get('html2text', False) + self.transform_to_markdown = options.get('transform_to_markdown', False) + self.transform_to_html = options.get('transform_to_html', False) self.use_wordpress_compiler = options.get('use_wordpress_compiler', False) self.install_wordpress_compiler = options.get('install_wordpress_compiler', False) self.wordpress_page_compiler = None + self.tag_saniziting_strategy = options.get('tag_saniziting_strategy', 'first') + self.auth = None if options.get('download_auth') is not None: username_password = options.get('download_auth') @@ -250,10 +302,18 @@ class CommandImportWordpress(Command, ImportMixin): self.separate_qtranslate_content = options.get('separate_qtranslate_content') self.translations_pattern = options.get('translations_pattern') - if self.transform_to_html and self.use_wordpress_compiler: - LOGGER.warn("It does not make sense to combine --transform-to-html with --use-wordpress-compiler, as the first converts all posts to HTML and the latter option affects zero posts.") + count = (1 if self.html2text else 0) + (1 if self.transform_to_html else 0) + (1 if self.transform_to_markdown else 0) + if count > 1: + LOGGER.error("You can use at most one of the options --html2text, --transform-to-html and --transform-to-markdown.") + return False + if (self.html2text or self.transform_to_html or self.transform_to_markdown) and self.use_wordpress_compiler: + LOGGER.warn("It does not make sense to combine --use-wordpress-compiler with any of --html2text, --transform-to-html and --transform-to-markdown, as the latter convert all posts to HTML and the first option then affects zero posts.") + + if (self.html2text or self.transform_to_markdown) and not html2text: + LOGGER.error("You need to install html2text via 'pip install html2text' before you can use the --html2text and --transform-to-markdown options.") + return False - if self.transform_to_html: + if self.transform_to_html or self.transform_to_markdown: self._find_wordpress_compiler() if not self.wordpress_page_compiler and self.install_wordpress_compiler: if not install_plugin(self.site, 'wordpress_compiler', output_dir='plugins'): # local install @@ -327,7 +387,7 @@ class CommandImportWordpress(Command, ImportMixin): self.context['TRANSLATIONS'] = format_default_translations_config( self.extra_languages) self.context['REDIRECTIONS'] = self.configure_redirections( - self.url_map) + self.url_map, self.base_dir) if self.timezone: self.context['TIMEZONE'] = self.timezone if self.export_categories_as_categories: @@ -337,10 +397,13 @@ class CommandImportWordpress(Command, ImportMixin): # Add tag redirects for tag in self.all_tags: try: - tag_str = tag.decode('utf8') + if isinstance(tag, utils.bytes_str): + tag_str = tag.decode('utf8', 'replace') + else: + tag_str = tag except AttributeError: tag_str = tag - tag = utils.slugify(tag_str) + tag = utils.slugify(tag_str, self.lang) src_url = '{}tag/{}'.format(self.context['SITE_URL'], tag) dst_url = self.site.link('tag', tag) if src_url != dst_url: @@ -372,7 +435,7 @@ class CommandImportWordpress(Command, ImportMixin): if b'<atom:link rel=' in line: continue xml.append(line) - return b'\n'.join(xml) + return b''.join(xml) @classmethod def get_channel_from_file(cls, filename): @@ -386,7 +449,8 @@ class CommandImportWordpress(Command, ImportMixin): wordpress_namespace = channel.nsmap['wp'] context = SAMPLE_CONF.copy() - context['DEFAULT_LANG'] = get_text_tag(channel, 'language', 'en')[:2] + self.lang = get_text_tag(channel, 'language', 'en')[:2] + context['DEFAULT_LANG'] = self.lang context['TRANSLATIONS_PATTERN'] = DEFAULT_TRANSLATIONS_PATTERN context['BLOG_TITLE'] = get_text_tag(channel, 'title', 'PUT TITLE HERE') @@ -418,7 +482,7 @@ class CommandImportWordpress(Command, ImportMixin): PAGES = '(\n' for extension in extensions: POSTS += ' ("posts/*.{0}", "posts", "post.tmpl"),\n'.format(extension) - PAGES += ' ("stories/*.{0}", "stories", "story.tmpl"),\n'.format(extension) + PAGES += ' ("pages/*.{0}", "pages", "story.tmpl"),\n'.format(extension) POSTS += ')\n' PAGES += ')\n' context['POSTS'] = POSTS @@ -436,9 +500,6 @@ class CommandImportWordpress(Command, ImportMixin): def download_url_content_to_file(self, url, dst_path): """Download some content (attachments) to a file.""" - if self.no_downloads: - return - try: request = requests.get(url, auth=self.auth) if request.status_code >= 400: @@ -458,10 +519,13 @@ class CommandImportWordpress(Command, ImportMixin): 'foo') path = urlparse(url).path dst_path = os.path.join(*([self.output_folder, 'files'] + list(path.split('/')))) - dst_dir = os.path.dirname(dst_path) - utils.makedirs(dst_dir) - LOGGER.info("Downloading {0} => {1}".format(url, dst_path)) - self.download_url_content_to_file(url, dst_path) + if self.no_downloads: + LOGGER.info("Skipping downloading {0} => {1}".format(url, dst_path)) + else: + dst_dir = os.path.dirname(dst_path) + utils.makedirs(dst_dir) + LOGGER.info("Downloading {0} => {1}".format(url, dst_path)) + self.download_url_content_to_file(url, dst_path) dst_url = '/'.join(dst_path.split(os.sep)[2:]) links[link] = '/' + dst_url links[url] = '/' + dst_url @@ -507,6 +571,8 @@ class CommandImportWordpress(Command, ImportMixin): if meta_key in metadata: image_meta = metadata[meta_key] + if not image_meta: + continue dst_meta = {} def add(our_key, wp_key, is_int=False, ignore_zero=False, is_float=False): @@ -552,15 +618,18 @@ class CommandImportWordpress(Command, ImportMixin): meta = {} meta['size'] = size.decode('utf-8') if width_key in metadata[size_key][size] and height_key in metadata[size_key][size]: - meta['width'] = metadata[size_key][size][width_key] - meta['height'] = metadata[size_key][size][height_key] + meta['width'] = int(metadata[size_key][size][width_key]) + meta['height'] = int(metadata[size_key][size][height_key]) path = urlparse(url).path dst_path = os.path.join(*([self.output_folder, 'files'] + list(path.split('/')))) - dst_dir = os.path.dirname(dst_path) - utils.makedirs(dst_dir) - LOGGER.info("Downloading {0} => {1}".format(url, dst_path)) - self.download_url_content_to_file(url, dst_path) + if self.no_downloads: + LOGGER.info("Skipping downloading {0} => {1}".format(url, dst_path)) + else: + dst_dir = os.path.dirname(dst_path) + utils.makedirs(dst_dir) + LOGGER.info("Downloading {0} => {1}".format(url, dst_path)) + self.download_url_content_to_file(url, dst_path) dst_url = '/'.join(dst_path.split(os.sep)[2:]) links[url] = '/' + dst_url @@ -604,7 +673,7 @@ class CommandImportWordpress(Command, ImportMixin): def transform_code(self, content): """Transform code blocks.""" - # http://en.support.wordpress.com/code/posting-source-code/. There are + # https://en.support.wordpress.com/code/posting-source-code/. There are # a ton of things not supported here. We only do a basic [code # lang="x"] -> ```x translation, and remove quoted html entities (<, # >, &, and "). @@ -628,10 +697,10 @@ class CommandImportWordpress(Command, ImportMixin): return content @staticmethod - def transform_caption(content): + def transform_caption(content, use_html=False): """Transform captions.""" - new_caption = re.sub(r'\[/caption\]', '', content) - new_caption = re.sub(r'\[caption.*\]', '', new_caption) + new_caption = re.sub(r'\[/caption\]', '</h1>' if use_html else '', content) + new_caption = re.sub(r'\[caption.*\]', '<h1>' if use_html else '', new_caption) return new_caption @@ -654,6 +723,26 @@ class CommandImportWordpress(Command, ImportMixin): except TypeError: # old versions of the plugin don't support the additional argument content = self.wordpress_page_compiler.compile_to_string(content) return content, 'html', True + elif self.transform_to_markdown: + # First convert to HTML with WordPress plugin + additional_data = {} + if attachments is not None: + additional_data['attachments'] = attachments + try: + content = self.wordpress_page_compiler.compile_to_string(content, additional_data=additional_data) + except TypeError: # old versions of the plugin don't support the additional argument + content = self.wordpress_page_compiler.compile_to_string(content) + # Now convert to MarkDown with html2text + h = html2text.HTML2Text() + content = h.handle(content) + return content, 'md', False + elif self.html2text: + # TODO: what to do with [code] blocks? + # content = self.transform_code(content) + content = self.transform_caption(content, use_html=True) + h = html2text.HTML2Text() + content = h.handle(content) + return content, 'md', False elif self.use_wordpress_compiler: return content, 'wp', False else: @@ -686,7 +775,7 @@ class CommandImportWordpress(Command, ImportMixin): elif approved == 'spam' or approved == 'trash': pass else: - LOGGER.warn("Unknown comment approved status: " + str(approved)) + LOGGER.warn("Unknown comment approved status: {0}".format(approved)) parent = int(get_text_tag(comment, "{{{0}}}comment_parent".format(wordpress_namespace), 0)) if parent == 0: parent = None @@ -707,7 +796,7 @@ class CommandImportWordpress(Command, ImportMixin): """Write comment header line.""" if header_content is None: return - header_content = str(header_content).replace('\n', ' ') + header_content = unicode_str(header_content).replace('\n', ' ') line = '.. ' + header_field + ': ' + header_content + '\n' fd.write(line.encode('utf8')) @@ -747,12 +836,36 @@ class CommandImportWordpress(Command, ImportMixin): tags_cats = tags + categories return tags_cats, other_meta + _tag_sanitize_map = {True: {}, False: {}} + + def _sanitize(self, tag, is_category): + if self.tag_saniziting_strategy == 'lower': + return tag.lower() + if tag.lower() not in self._tag_sanitize_map[is_category]: + self._tag_sanitize_map[is_category][tag.lower()] = [tag] + return tag + previous = self._tag_sanitize_map[is_category][tag.lower()] + if self.tag_saniziting_strategy == 'first': + if tag != previous[0]: + LOGGER.warn("Changing spelling of {0} name '{1}' to {2}.".format('category' if is_category else 'tag', tag, previous[0])) + return previous[0] + else: + LOGGER.error("Unknown tag sanitizing strategy '{0}'!".format(self.tag_saniziting_strategy)) + sys.exit(1) + return tag + def import_postpage_item(self, item, wordpress_namespace, out_folder=None, attachments=None): """Take an item from the feed and creates a post file.""" if out_folder is None: out_folder = 'posts' title = get_text_tag(item, 'title', 'NO TITLE') + + # titles can have line breaks in them, particularly when they are + # created by third-party tools that post to Wordpress. + # Handle windows-style and unix-style line endings. + title = title.replace('\r\n', ' ').replace('\n', ' ') + # link is something like http://foo.com/2012/09/01/hello-world/ # So, take the path, utils.slugify it, and that's our slug link = get_text_tag(item, 'link', None) @@ -760,7 +873,10 @@ class CommandImportWordpress(Command, ImportMixin): path = unquote(parsed.path.strip('/')) try: - path = path.decode('utf8') + if isinstance(path, utils.bytes_str): + path = path.decode('utf8', 'replace') + else: + path = path except AttributeError: pass @@ -782,7 +898,7 @@ class CommandImportWordpress(Command, ImportMixin): else: if len(pathlist) > 1: out_folder = os.path.join(*([out_folder] + pathlist[:-1])) - slug = utils.slugify(pathlist[-1]) + slug = utils.slugify(pathlist[-1], self.lang) description = get_text_tag(item, 'description', '') post_date = get_text_tag( @@ -831,15 +947,24 @@ class CommandImportWordpress(Command, ImportMixin): type = tag.attrib['domain'] if text == 'Uncategorized' and type == 'category': continue - self.all_tags.add(text) if type == 'category': - categories.append(type) + categories.append(text) else: tags.append(text) if '$latex' in content: tags.append('mathjax') + for i, cat in enumerate(categories[:]): + cat = self._sanitize(cat, True) + categories[i] = cat + self.all_tags.add(cat) + + for i, tag in enumerate(tags[:]): + tag = self._sanitize(tag, False) + tags[i] = tag + self.all_tags.add(tag) + # Find post format if it's there post_format = 'wp' format_tag = [x for x in item.findall('*//{%s}meta_key' % wordpress_namespace) if x.text == '_tc_post_format'] @@ -888,14 +1013,32 @@ class CommandImportWordpress(Command, ImportMixin): meta_slug = slug tags, other_meta = self._create_metadata(status, excerpt, tags, categories, post_name=os.path.join(out_folder, slug)) - self.write_metadata(os.path.join(self.output_folder, out_folder, - out_meta_filename), - title, meta_slug, post_date, description, tags, **other_meta) - self.write_content( - os.path.join(self.output_folder, - out_folder, out_content_filename), - content, - rewrite_html) + + meta = { + "title": title, + "slug": meta_slug, + "date": post_date, + "description": description, + "tags": ','.join(tags), + } + meta.update(other_meta) + if self.onefile: + self.write_post( + os.path.join(self.output_folder, + out_folder, out_content_filename), + content, + meta, + self._get_compiler(), + rewrite_html) + else: + self.write_metadata(os.path.join(self.output_folder, out_folder, + out_meta_filename), + title, meta_slug, post_date, description, tags, **other_meta) + self.write_content( + os.path.join(self.output_folder, + out_folder, out_content_filename), + content, + rewrite_html) if self.export_comments: comments = [] @@ -905,7 +1048,7 @@ class CommandImportWordpress(Command, ImportMixin): comments.append(comment) for comment in comments: - comment_filename = slug + "." + str(comment['id']) + ".wpcomment" + comment_filename = "{0}.{1}.wpcomment".format(slug, comment['id']) self._write_comment(os.path.join(self.output_folder, out_folder, comment_filename), comment) return (out_folder, slug) @@ -955,7 +1098,7 @@ class CommandImportWordpress(Command, ImportMixin): if post_type == 'post': out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'posts', attachments) else: - out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'stories', attachments) + out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'pages', attachments) # Process attachment data if attachments is not None: # If post was exported, store data |
