diff options
Diffstat (limited to 'nikola/plugins/command/import_wordpress.py')
| -rw-r--r-- | nikola/plugins/command/import_wordpress.py | 177 |
1 files changed, 140 insertions, 37 deletions
diff --git a/nikola/plugins/command/import_wordpress.py b/nikola/plugins/command/import_wordpress.py index 69ef144..0b48583 100644 --- a/nikola/plugins/command/import_wordpress.py +++ b/nikola/plugins/command/import_wordpress.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright © 2012-2015 Roberto Alsina and others. +# Copyright © 2012-2016 Roberto Alsina and others. # Permission is hereby granted, free of charge, to any # person obtaining a copy of this software and associated @@ -38,6 +38,11 @@ from lxml import etree from collections import defaultdict try: + import html2text +except: + html2text = None + +try: from urlparse import urlparse from urllib import unquote except ImportError: @@ -170,6 +175,20 @@ class CommandImportWordpress(Command, ImportMixin): 'help': "Export comments as .wpcomment files", }, { + 'name': 'html2text', + 'long': 'html2text', + 'default': False, + 'type': bool, + 'help': "Uses html2text (needs to be installed with pip) to transform WordPress posts to MarkDown during import", + }, + { + 'name': 'transform_to_markdown', + 'long': 'transform-to-markdown', + 'default': False, + 'type': bool, + 'help': "Uses WordPress page compiler to transform WordPress posts to HTML and then use html2text to transform them to MarkDown during import", + }, + { 'name': 'transform_to_html', 'long': 'transform-to-html', 'default': False, @@ -191,14 +210,35 @@ class CommandImportWordpress(Command, ImportMixin): 'help': "Automatically installs the WordPress page compiler (either locally or in the new site) if required by other options.\nWarning: the compiler is GPL software!", }, { - 'name': 'tag_saniziting_strategy', - 'long': 'tag-saniziting-strategy', + 'name': 'tag_sanitizing_strategy', + 'long': 'tag-sanitizing-strategy', 'default': 'first', 'help': 'lower: Convert all tag and category names to lower case\nfirst: Keep first spelling of tag or category name', }, + { + 'name': 'one_file', + 'long': 'one-file', + 'default': False, + 'type': bool, + 'help': "Save imported posts in the more modern one-file format.", + }, ] all_tags = set([]) + def _get_compiler(self): + """Return whatever compiler we will use.""" + self._find_wordpress_compiler() + if self.wordpress_page_compiler is not None: + return self.wordpress_page_compiler + plugin_info = self.site.plugin_manager.getPluginByName('markdown', 'PageCompiler') + if plugin_info is not None: + if not plugin_info.is_activated: + self.site.plugin_manager.activatePluginByName(plugin_info.name) + plugin_info.plugin_object.set_site(self.site) + return plugin_info.plugin_object + else: + LOGGER.error("Can't find markdown post compiler.") + def _find_wordpress_compiler(self): """Find WordPress compiler plugin.""" if self.wordpress_page_compiler is not None: @@ -223,6 +263,8 @@ class CommandImportWordpress(Command, ImportMixin): 'putting these arguments before the filename if you ' 'are running into problems.'.format(args)) + self.onefile = options.get('one_file', False) + self.import_into_existing_site = False self.url_map = {} self.timezone = None @@ -239,6 +281,9 @@ class CommandImportWordpress(Command, ImportMixin): self.export_categories_as_categories = options.get('export_categories_as_categories', False) self.export_comments = options.get('export_comments', False) + self.html2text = options.get('html2text', False) + self.transform_to_markdown = options.get('transform_to_markdown', False) + self.transform_to_html = options.get('transform_to_html', False) self.use_wordpress_compiler = options.get('use_wordpress_compiler', False) self.install_wordpress_compiler = options.get('install_wordpress_compiler', False) @@ -257,10 +302,18 @@ class CommandImportWordpress(Command, ImportMixin): self.separate_qtranslate_content = options.get('separate_qtranslate_content') self.translations_pattern = options.get('translations_pattern') - if self.transform_to_html and self.use_wordpress_compiler: - LOGGER.warn("It does not make sense to combine --transform-to-html with --use-wordpress-compiler, as the first converts all posts to HTML and the latter option affects zero posts.") + count = (1 if self.html2text else 0) + (1 if self.transform_to_html else 0) + (1 if self.transform_to_markdown else 0) + if count > 1: + LOGGER.error("You can use at most one of the options --html2text, --transform-to-html and --transform-to-markdown.") + return False + if (self.html2text or self.transform_to_html or self.transform_to_markdown) and self.use_wordpress_compiler: + LOGGER.warn("It does not make sense to combine --use-wordpress-compiler with any of --html2text, --transform-to-html and --transform-to-markdown, as the latter convert all posts to HTML and the first option then affects zero posts.") + + if (self.html2text or self.transform_to_markdown) and not html2text: + LOGGER.error("You need to install html2text via 'pip install html2text' before you can use the --html2text and --transform-to-markdown options.") + return False - if self.transform_to_html: + if self.transform_to_html or self.transform_to_markdown: self._find_wordpress_compiler() if not self.wordpress_page_compiler and self.install_wordpress_compiler: if not install_plugin(self.site, 'wordpress_compiler', output_dir='plugins'): # local install @@ -334,7 +387,7 @@ class CommandImportWordpress(Command, ImportMixin): self.context['TRANSLATIONS'] = format_default_translations_config( self.extra_languages) self.context['REDIRECTIONS'] = self.configure_redirections( - self.url_map) + self.url_map, self.base_dir) if self.timezone: self.context['TIMEZONE'] = self.timezone if self.export_categories_as_categories: @@ -350,7 +403,7 @@ class CommandImportWordpress(Command, ImportMixin): tag_str = tag except AttributeError: tag_str = tag - tag = utils.slugify(tag_str) + tag = utils.slugify(tag_str, self.lang) src_url = '{}tag/{}'.format(self.context['SITE_URL'], tag) dst_url = self.site.link('tag', tag) if src_url != dst_url: @@ -382,7 +435,7 @@ class CommandImportWordpress(Command, ImportMixin): if b'<atom:link rel=' in line: continue xml.append(line) - return b'\n'.join(xml) + return b''.join(xml) @classmethod def get_channel_from_file(cls, filename): @@ -396,7 +449,8 @@ class CommandImportWordpress(Command, ImportMixin): wordpress_namespace = channel.nsmap['wp'] context = SAMPLE_CONF.copy() - context['DEFAULT_LANG'] = get_text_tag(channel, 'language', 'en')[:2] + self.lang = get_text_tag(channel, 'language', 'en')[:2] + context['DEFAULT_LANG'] = self.lang context['TRANSLATIONS_PATTERN'] = DEFAULT_TRANSLATIONS_PATTERN context['BLOG_TITLE'] = get_text_tag(channel, 'title', 'PUT TITLE HERE') @@ -428,7 +482,7 @@ class CommandImportWordpress(Command, ImportMixin): PAGES = '(\n' for extension in extensions: POSTS += ' ("posts/*.{0}", "posts", "post.tmpl"),\n'.format(extension) - PAGES += ' ("stories/*.{0}", "stories", "story.tmpl"),\n'.format(extension) + PAGES += ' ("pages/*.{0}", "pages", "story.tmpl"),\n'.format(extension) POSTS += ')\n' PAGES += ')\n' context['POSTS'] = POSTS @@ -446,9 +500,6 @@ class CommandImportWordpress(Command, ImportMixin): def download_url_content_to_file(self, url, dst_path): """Download some content (attachments) to a file.""" - if self.no_downloads: - return - try: request = requests.get(url, auth=self.auth) if request.status_code >= 400: @@ -468,10 +519,13 @@ class CommandImportWordpress(Command, ImportMixin): 'foo') path = urlparse(url).path dst_path = os.path.join(*([self.output_folder, 'files'] + list(path.split('/')))) - dst_dir = os.path.dirname(dst_path) - utils.makedirs(dst_dir) - LOGGER.info("Downloading {0} => {1}".format(url, dst_path)) - self.download_url_content_to_file(url, dst_path) + if self.no_downloads: + LOGGER.info("Skipping downloading {0} => {1}".format(url, dst_path)) + else: + dst_dir = os.path.dirname(dst_path) + utils.makedirs(dst_dir) + LOGGER.info("Downloading {0} => {1}".format(url, dst_path)) + self.download_url_content_to_file(url, dst_path) dst_url = '/'.join(dst_path.split(os.sep)[2:]) links[link] = '/' + dst_url links[url] = '/' + dst_url @@ -517,6 +571,8 @@ class CommandImportWordpress(Command, ImportMixin): if meta_key in metadata: image_meta = metadata[meta_key] + if not image_meta: + continue dst_meta = {} def add(our_key, wp_key, is_int=False, ignore_zero=False, is_float=False): @@ -562,15 +618,18 @@ class CommandImportWordpress(Command, ImportMixin): meta = {} meta['size'] = size.decode('utf-8') if width_key in metadata[size_key][size] and height_key in metadata[size_key][size]: - meta['width'] = metadata[size_key][size][width_key] - meta['height'] = metadata[size_key][size][height_key] + meta['width'] = int(metadata[size_key][size][width_key]) + meta['height'] = int(metadata[size_key][size][height_key]) path = urlparse(url).path dst_path = os.path.join(*([self.output_folder, 'files'] + list(path.split('/')))) - dst_dir = os.path.dirname(dst_path) - utils.makedirs(dst_dir) - LOGGER.info("Downloading {0} => {1}".format(url, dst_path)) - self.download_url_content_to_file(url, dst_path) + if self.no_downloads: + LOGGER.info("Skipping downloading {0} => {1}".format(url, dst_path)) + else: + dst_dir = os.path.dirname(dst_path) + utils.makedirs(dst_dir) + LOGGER.info("Downloading {0} => {1}".format(url, dst_path)) + self.download_url_content_to_file(url, dst_path) dst_url = '/'.join(dst_path.split(os.sep)[2:]) links[url] = '/' + dst_url @@ -638,10 +697,10 @@ class CommandImportWordpress(Command, ImportMixin): return content @staticmethod - def transform_caption(content): + def transform_caption(content, use_html=False): """Transform captions.""" - new_caption = re.sub(r'\[/caption\]', '', content) - new_caption = re.sub(r'\[caption.*\]', '', new_caption) + new_caption = re.sub(r'\[/caption\]', '</h1>' if use_html else '', content) + new_caption = re.sub(r'\[caption.*\]', '<h1>' if use_html else '', new_caption) return new_caption @@ -664,6 +723,26 @@ class CommandImportWordpress(Command, ImportMixin): except TypeError: # old versions of the plugin don't support the additional argument content = self.wordpress_page_compiler.compile_to_string(content) return content, 'html', True + elif self.transform_to_markdown: + # First convert to HTML with WordPress plugin + additional_data = {} + if attachments is not None: + additional_data['attachments'] = attachments + try: + content = self.wordpress_page_compiler.compile_to_string(content, additional_data=additional_data) + except TypeError: # old versions of the plugin don't support the additional argument + content = self.wordpress_page_compiler.compile_to_string(content) + # Now convert to MarkDown with html2text + h = html2text.HTML2Text() + content = h.handle(content) + return content, 'md', False + elif self.html2text: + # TODO: what to do with [code] blocks? + # content = self.transform_code(content) + content = self.transform_caption(content, use_html=True) + h = html2text.HTML2Text() + content = h.handle(content) + return content, 'md', False elif self.use_wordpress_compiler: return content, 'wp', False else: @@ -781,6 +860,12 @@ class CommandImportWordpress(Command, ImportMixin): out_folder = 'posts' title = get_text_tag(item, 'title', 'NO TITLE') + + # titles can have line breaks in them, particularly when they are + # created by third-party tools that post to Wordpress. + # Handle windows-style and unix-style line endings. + title = title.replace('\r\n', ' ').replace('\n', ' ') + # link is something like http://foo.com/2012/09/01/hello-world/ # So, take the path, utils.slugify it, and that's our slug link = get_text_tag(item, 'link', None) @@ -813,7 +898,7 @@ class CommandImportWordpress(Command, ImportMixin): else: if len(pathlist) > 1: out_folder = os.path.join(*([out_folder] + pathlist[:-1])) - slug = utils.slugify(pathlist[-1]) + slug = utils.slugify(pathlist[-1], self.lang) description = get_text_tag(item, 'description', '') post_date = get_text_tag( @@ -928,14 +1013,32 @@ class CommandImportWordpress(Command, ImportMixin): meta_slug = slug tags, other_meta = self._create_metadata(status, excerpt, tags, categories, post_name=os.path.join(out_folder, slug)) - self.write_metadata(os.path.join(self.output_folder, out_folder, - out_meta_filename), - title, meta_slug, post_date, description, tags, **other_meta) - self.write_content( - os.path.join(self.output_folder, - out_folder, out_content_filename), - content, - rewrite_html) + + meta = { + "title": title, + "slug": meta_slug, + "date": post_date, + "description": description, + "tags": ','.join(tags), + } + meta.update(other_meta) + if self.onefile: + self.write_post( + os.path.join(self.output_folder, + out_folder, out_content_filename), + content, + meta, + self._get_compiler(), + rewrite_html) + else: + self.write_metadata(os.path.join(self.output_folder, out_folder, + out_meta_filename), + title, meta_slug, post_date, description, tags, **other_meta) + self.write_content( + os.path.join(self.output_folder, + out_folder, out_content_filename), + content, + rewrite_html) if self.export_comments: comments = [] @@ -995,7 +1098,7 @@ class CommandImportWordpress(Command, ImportMixin): if post_type == 'post': out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'posts', attachments) else: - out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'stories', attachments) + out_folder_slug = self.import_postpage_item(item, wordpress_namespace, 'pages', attachments) # Process attachment data if attachments is not None: # If post was exported, store data |
