# Copyright (c) 2012 Roberto Alsina y otros. # Permission is hereby granted, free of charge, to any # person obtaining a copy of this software and associated # documentation files (the "Software"), to deal in the # Software without restriction, including without limitation # the rights to use, copy, modify, merge, publish, # distribute, sublicense, and/or sell copies of the # Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice # shall be included in all copies or substantial portions of # the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY # KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE # WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR # PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS # OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR # OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. from __future__ import unicode_literals, print_function import codecs import csv import datetime import os from optparse import OptionParser import time try: from urlparse import urlparse except ImportError: from urllib.parse import urlparse # NOQA try: import feedparser except ImportError: feedparser = None # NOQA from lxml import html from mako.template import Template from nikola.plugin_categories import Command from nikola import utils links = {} class CommandImportBlogger(Command): """Import a blogger dump.""" name = "import_blogger" @classmethod def get_channel_from_file(cls, filename): return feedparser.parse(filename) @staticmethod def configure_redirections(url_map): redirections = [] for k, v in url_map.items(): # remove the initial "/" because src is a relative file path src = (urlparse(k).path + 'index.html')[1:] dst = (urlparse(v).path) if src == 'index.html': print("Can't do a redirect for: %r" % k) else: redirections.append((src, dst)) return redirections def generate_base_site(self): if not os.path.exists(self.output_folder): os.system('nikola init --empty %s' % (self.output_folder, )) else: self.import_into_existing_site = True print('The folder %s already exists - assuming that this is a ' 'already existing nikola site.' % self.output_folder) conf_template = Template(filename=os.path.join( os.path.dirname(utils.__file__), 'conf.py.in')) return conf_template @staticmethod def populate_context(channel): context = {} context['DEFAULT_LANG'] = 'en' # blogger doesn't include the language # in the dump context['BLOG_TITLE'] = channel.feed.title context['BLOG_DESCRIPTION'] = '' # Missing in the dump context['BLOG_URL'] = channel.feed.link.rstrip('/') context['BLOG_EMAIL'] = channel.feed.author_detail.email context['BLOG_AUTHOR'] = channel.feed.author_detail.name context['POST_PAGES'] = '''( ("posts/*.html", "posts", "post.tmpl", True), ("stories/*.html", "stories", "story.tmpl", False), )''' context['POST_COMPILERS'] = '''{ "rest": ('.txt', '.rst'), "markdown": ('.md', '.mdown', '.markdown', '.wp'), "html": ('.html', '.htm') } ''' return context @classmethod def transform_content(cls, content): # No transformations yet return content @classmethod def write_content(cls, filename, content): doc = html.document_fromstring(content) doc.rewrite_links(replacer) with open(filename, "wb+") as fd: fd.write(html.tostring(doc, encoding='utf8')) @staticmethod def write_metadata(filename, title, slug, post_date, description, tags): with codecs.open(filename, "w+", "utf8") as fd: fd.write('%s\n' % title) fd.write('%s\n' % slug) fd.write('%s\n' % post_date) fd.write('%s\n' % ','.join(tags)) fd.write('\n') fd.write('%s\n' % description) def import_item(self, item, out_folder=None): """Takes an item from the feed and creates a post file.""" if out_folder is None: out_folder = 'posts' # link is something like http://foo.com/2012/09/01/hello-world/ # So, take the path, utils.slugify it, and that's our slug link = item.link link_path = urlparse(link).path title = item.title # blogger supports empty titles, which Nikola doesn't if not title: print("Warning: Empty title in post with URL %s. Using NO_TITLE " "as placeholder, please fix." % link) title = "NO_TITLE" if link_path.lower().endswith('.html'): link_path = link_path[:-5] slug = utils.slugify(link_path) if not slug: # should never happen print("Error converting post:", title) return description = '' post_date = datetime.datetime.fromtimestamp(time.mktime( item.published_parsed)) for candidate in item.content: if candidate.type == 'text/html': content = candidate.value break # FIXME: handle attachments tags = [] for tag in item.tags: if tag.scheme == 'http://www.blogger.com/atom/ns#': tags.append(tag.term) if item.get('app_draft'): tags.append('draft') is_draft = True else: is_draft = False self.url_map[link] = self.context['BLOG_URL'] + '/' + \ out_folder + '/' + slug + '.html' if is_draft and self.exclude_drafts: print('Draft "%s" will not be imported.' % (title, )) elif content.strip(): # If no content is found, no files are written. content = self.transform_content(content) self.write_metadata(os.path.join(self.output_folder, out_folder, slug + '.meta'), title, slug, post_date, description, tags) self.write_content( os.path.join(self.output_folder, out_folder, slug + '.html'), content) else: print('Not going to import "%s" because it seems to contain' ' no content.' % (title, )) def process_item(self, item): post_type = item.tags[0].term if post_type == 'http://schemas.google.com/blogger/2008/kind#post': self.import_item(item, 'posts') elif post_type == 'http://schemas.google.com/blogger/2008/kind#page': self.import_item(item, 'stories') elif post_type == ('http://schemas.google.com/blogger/2008/kind' '#settings'): # Ignore settings pass elif post_type == ('http://schemas.google.com/blogger/2008/kind' '#template'): # Ignore template pass elif post_type == ('http://schemas.google.com/blogger/2008/kind' '#comment'): # FIXME: not importing comments. Does blogger support "pages"? pass else: print("Unknown post_type:", post_type) def import_posts(self, channel): for item in channel.entries: self.process_item(item) @staticmethod def write_urlmap_csv(output_file, url_map): with codecs.open(output_file, 'w+', 'utf8') as fd: csv_writer = csv.writer(fd) for item in url_map.items(): csv_writer.writerow(item) def get_configuration_output_path(self): if not self.import_into_existing_site: filename = 'conf.py' else: filename = 'conf.py.wordpress_import-%s' % datetime.datetime.now( ).strftime('%Y%m%d_%H%M%s') config_output_path = os.path.join(self.output_folder, filename) print('Configuration will be written to: %s' % config_output_path) return config_output_path @staticmethod def write_configuration(filename, rendered_template): with codecs.open(filename, 'w+', 'utf8') as fd: fd.write(rendered_template) def run(self, *arguments): """Import a Wordpress blog from an export file into a Nikola site.""" # Parse the data if feedparser is None: print('To use the import_blogger command,' ' you have to install the "feedparser" package.') return parser = OptionParser( usage="nikola %s [options] blogger_export_file" % self.name) parser.add_option('-f', '--filename', dest='filename', help='Blogger export file from which the import is ' 'made.') parser.add_option('-o', '--output-folder', dest='output_folder', default='new_site', help='The location into which the imported content ' 'will be written') parser.add_option('-d', '--no-drafts', dest='exclude_drafts', default=False, action="store_true", help='Do not ' 'import drafts.') (options, args) = parser.parse_args(list(arguments)) if not options.filename and args: options.filename = args[0] if not options.filename: parser.print_usage() return self.blogger_export_file = options.filename self.output_folder = options.output_folder self.import_into_existing_site = False self.exclude_drafts = options.exclude_drafts self.url_map = {} channel = self.get_channel_from_file(self.blogger_export_file) self.context = self.populate_context(channel) conf_template = self.generate_base_site() self.context['REDIRECTIONS'] = self.configure_redirections( self.url_map) self.import_posts(channel) self.write_urlmap_csv( os.path.join(self.output_folder, 'url_map.csv'), self.url_map) self.write_configuration(self.get_configuration_output_path( ), conf_template.render(**self.context)) def replacer(dst): return links.get(dst, dst)