diff options
Diffstat (limited to 'nikola/plugins/command_import_blogger.py')
| -rw-r--r-- | nikola/plugins/command_import_blogger.py | 300 |
1 files changed, 300 insertions, 0 deletions
diff --git a/nikola/plugins/command_import_blogger.py b/nikola/plugins/command_import_blogger.py new file mode 100644 index 0000000..aea210a --- /dev/null +++ b/nikola/plugins/command_import_blogger.py @@ -0,0 +1,300 @@ +# Copyright (c) 2012 Roberto Alsina y otros. + +# Permission is hereby granted, free of charge, to any +# person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the +# Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the +# Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice +# shall be included in all copies or substantial portions of +# the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS +# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import unicode_literals, print_function +import codecs +import csv +import datetime +import os +from optparse import OptionParser +import time + +try: + from urlparse import urlparse +except ImportError: + from urllib.parse import urlparse # NOQA + +try: + import feedparser +except ImportError: + feedparser = None # NOQA +from lxml import html +from mako.template import Template + +from nikola.plugin_categories import Command +from nikola import utils + +links = {} + + +class CommandImportBlogger(Command): + """Import a blogger dump.""" + + name = "import_blogger" + + @classmethod + def get_channel_from_file(cls, filename): + return feedparser.parse(filename) + + @staticmethod + def configure_redirections(url_map): + redirections = [] + for k, v in url_map.items(): + # remove the initial "/" because src is a relative file path + src = (urlparse(k).path + 'index.html')[1:] + dst = (urlparse(v).path) + if src == 'index.html': + print("Can't do a redirect for: %r" % k) + else: + redirections.append((src, dst)) + + return redirections + + def generate_base_site(self): + if not os.path.exists(self.output_folder): + os.system('nikola init --empty %s' % (self.output_folder, )) + else: + self.import_into_existing_site = True + print('The folder %s already exists - assuming that this is a ' + 'already existing nikola site.' % self.output_folder) + + conf_template = Template(filename=os.path.join( + os.path.dirname(utils.__file__), 'conf.py.in')) + + return conf_template + + @staticmethod + def populate_context(channel): + context = {} + context['DEFAULT_LANG'] = 'en' # blogger doesn't include the language + # in the dump + context['BLOG_TITLE'] = channel.feed.title + + context['BLOG_DESCRIPTION'] = '' # Missing in the dump + context['BLOG_URL'] = channel.feed.link.rstrip('/') + context['BLOG_EMAIL'] = channel.feed.author_detail.email + context['BLOG_AUTHOR'] = channel.feed.author_detail.name + context['POST_PAGES'] = '''( + ("posts/*.html", "posts", "post.tmpl", True), + ("stories/*.html", "stories", "story.tmpl", False), + )''' + context['POST_COMPILERS'] = '''{ + "rest": ('.txt', '.rst'), + "markdown": ('.md', '.mdown', '.markdown', '.wp'), + "html": ('.html', '.htm') + } + ''' + + return context + + @classmethod + def transform_content(cls, content): + # No transformations yet + return content + + @classmethod + def write_content(cls, filename, content): + doc = html.document_fromstring(content) + doc.rewrite_links(replacer) + + with open(filename, "wb+") as fd: + fd.write(html.tostring(doc, encoding='utf8')) + + @staticmethod + def write_metadata(filename, title, slug, post_date, description, tags): + with codecs.open(filename, "w+", "utf8") as fd: + fd.write('%s\n' % title) + fd.write('%s\n' % slug) + fd.write('%s\n' % post_date) + fd.write('%s\n' % ','.join(tags)) + fd.write('\n') + fd.write('%s\n' % description) + + def import_item(self, item, out_folder=None): + """Takes an item from the feed and creates a post file.""" + if out_folder is None: + out_folder = 'posts' + + # link is something like http://foo.com/2012/09/01/hello-world/ + # So, take the path, utils.slugify it, and that's our slug + link = item.link + link_path = urlparse(link).path + + title = item.title + + # blogger supports empty titles, which Nikola doesn't + if not title: + print("Warning: Empty title in post with URL %s. Using NO_TITLE " + "as placeholder, please fix." % link) + title = "NO_TITLE" + + if link_path.lower().endswith('.html'): + link_path = link_path[:-5] + + slug = utils.slugify(link_path) + + if not slug: # should never happen + print("Error converting post:", title) + return + + description = '' + post_date = datetime.datetime.fromtimestamp(time.mktime( + item.published_parsed)) + + for candidate in item.content: + if candidate.type == 'text/html': + content = candidate.value + break + # FIXME: handle attachments + + tags = [] + for tag in item.tags: + if tag.scheme == 'http://www.blogger.com/atom/ns#': + tags.append(tag.term) + + if item.get('app_draft'): + tags.append('draft') + is_draft = True + else: + is_draft = False + + self.url_map[link] = self.context['BLOG_URL'] + '/' + \ + out_folder + '/' + slug + '.html' + + if is_draft and self.exclude_drafts: + print('Draft "%s" will not be imported.' % (title, )) + elif content.strip(): + # If no content is found, no files are written. + content = self.transform_content(content) + + self.write_metadata(os.path.join(self.output_folder, out_folder, + slug + '.meta'), + title, slug, post_date, description, tags) + self.write_content( + os.path.join(self.output_folder, out_folder, slug + '.html'), + content) + else: + print('Not going to import "%s" because it seems to contain' + ' no content.' % (title, )) + + def process_item(self, item): + post_type = item.tags[0].term + + if post_type == 'http://schemas.google.com/blogger/2008/kind#post': + self.import_item(item, 'posts') + elif post_type == 'http://schemas.google.com/blogger/2008/kind#page': + self.import_item(item, 'stories') + elif post_type == ('http://schemas.google.com/blogger/2008/kind' + '#settings'): + # Ignore settings + pass + elif post_type == ('http://schemas.google.com/blogger/2008/kind' + '#template'): + # Ignore template + pass + elif post_type == ('http://schemas.google.com/blogger/2008/kind' + '#comment'): + # FIXME: not importing comments. Does blogger support "pages"? + pass + else: + print("Unknown post_type:", post_type) + + def import_posts(self, channel): + for item in channel.entries: + self.process_item(item) + + @staticmethod + def write_urlmap_csv(output_file, url_map): + with codecs.open(output_file, 'w+', 'utf8') as fd: + csv_writer = csv.writer(fd) + for item in url_map.items(): + csv_writer.writerow(item) + + def get_configuration_output_path(self): + if not self.import_into_existing_site: + filename = 'conf.py' + else: + filename = 'conf.py.wordpress_import-%s' % datetime.datetime.now( + ).strftime('%Y%m%d_%H%M%s') + config_output_path = os.path.join(self.output_folder, filename) + print('Configuration will be written to: %s' % config_output_path) + + return config_output_path + + @staticmethod + def write_configuration(filename, rendered_template): + with codecs.open(filename, 'w+', 'utf8') as fd: + fd.write(rendered_template) + + def run(self, *arguments): + """Import a Wordpress blog from an export file into a Nikola site.""" + # Parse the data + if feedparser is None: + print('To use the import_blogger command,' + ' you have to install the "feedparser" package.') + return + + parser = OptionParser( + usage="nikola %s [options] blogger_export_file" % self.name) + parser.add_option('-f', '--filename', dest='filename', + help='Blogger export file from which the import is ' + 'made.') + parser.add_option('-o', '--output-folder', dest='output_folder', + default='new_site', + help='The location into which the imported content ' + 'will be written') + parser.add_option('-d', '--no-drafts', dest='exclude_drafts', + default=False, action="store_true", help='Do not ' + 'import drafts.') + + (options, args) = parser.parse_args(list(arguments)) + + if not options.filename and args: + options.filename = args[0] + + if not options.filename: + parser.print_usage() + return + + self.blogger_export_file = options.filename + self.output_folder = options.output_folder + self.import_into_existing_site = False + self.exclude_drafts = options.exclude_drafts + self.url_map = {} + channel = self.get_channel_from_file(self.blogger_export_file) + self.context = self.populate_context(channel) + conf_template = self.generate_base_site() + self.context['REDIRECTIONS'] = self.configure_redirections( + self.url_map) + + self.import_posts(channel) + self.write_urlmap_csv( + os.path.join(self.output_folder, 'url_map.csv'), self.url_map) + + self.write_configuration(self.get_configuration_output_path( + ), conf_template.render(**self.context)) + + +def replacer(dst): + return links.get(dst, dst) |
