diff options
Diffstat (limited to 'nikola/plugins/command_import_wordpress.py')
| -rw-r--r-- | nikola/plugins/command_import_wordpress.py | 166 |
1 files changed, 121 insertions, 45 deletions
diff --git a/nikola/plugins/command_import_wordpress.py b/nikola/plugins/command_import_wordpress.py index 1552da4..07028d8 100644 --- a/nikola/plugins/command_import_wordpress.py +++ b/nikola/plugins/command_import_wordpress.py @@ -25,20 +25,23 @@ from __future__ import unicode_literals, print_function import codecs import csv +import datetime import os import re +from optparse import OptionParser + try: from urlparse import urlparse except ImportError: - from urllib.parse import urlparse + from urllib.parse import urlparse # NOQA -from lxml import etree, html, builder +from lxml import etree, html from mako.template import Template try: import requests except ImportError: - requests = None + requests = None # NOQA from nikola.plugin_categories import Command from nikola import utils @@ -85,9 +88,14 @@ class CommandImportWordpress(Command): return redirections - @staticmethod - def generate_base_site(context): - os.system('nikola init new_site') + def generate_base_site(self): + if not os.path.exists(self.output_folder): + os.system('nikola init --empty %s' % (self.output_folder, )) + else: + self.import_into_existing_site = True + print('The folder %s already exists - assuming that this is a ' + 'already existing nikola site.' % self.output_folder) + conf_template = Template(filename=os.path.join( os.path.dirname(utils.__file__), 'conf.py.in')) @@ -128,14 +136,18 @@ class CommandImportWordpress(Command): @staticmethod def download_url_content_to_file(url, dst_path): - with open(dst_path, 'wb+') as fd: - fd.write(requests.get(url).content) + try: + with open(dst_path, 'wb+') as fd: + fd.write(requests.get(url).content) + except requests.exceptions.ConnectionError as err: + print("Downloading %s to %s failed: %s" % (url, dst_path, err)) def import_attachment(self, item, wordpress_namespace): - url = get_text_tag(item, '{%s}attachment_url' % wordpress_namespace, 'foo') + url = get_text_tag( + item, '{%s}attachment_url' % wordpress_namespace, 'foo') link = get_text_tag(item, '{%s}link' % wordpress_namespace, 'foo') path = urlparse(url).path - dst_path = os.path.join(*(['new_site', 'files'] + dst_path = os.path.join(*([self.output_folder, 'files'] + list(path.split('/')))) dst_dir = os.path.dirname(dst_path) if not os.path.isdir(dst_dir): @@ -147,23 +159,32 @@ class CommandImportWordpress(Command): links[url] = '/' + dst_url @staticmethod - def write_content(filename, content): + def transform_sourcecode(content): + new_content = re.sub('\[sourcecode language="([^"]+)"\]', + "\n~~~~~~~~~~~~{.\\1}\n", content) + new_content = new_content.replace('[/sourcecode]', + "\n~~~~~~~~~~~~\n") + return new_content + + @staticmethod + def transform_caption(content): + new_caption = re.sub(r'\[/caption\]', '', content) + new_caption = re.sub(r'\[caption.*\]', '', new_caption) + + return new_caption + + @classmethod + def transform_content(cls, content): + new_content = cls.transform_sourcecode(content) + return cls.transform_caption(new_content) + + @classmethod + def write_content(cls, filename, content): + doc = html.document_fromstring(content) + doc.rewrite_links(replacer) + with open(filename, "wb+") as fd: - if content.strip(): - # Handle sourcecode pseudo-tags - content = re.sub('\[sourcecode language="([^"]+)"\]', - "\n~~~~~~~~~~~~{.\\1}\n", content) - content = content.replace('[/sourcecode]', "\n~~~~~~~~~~~~\n") - doc = html.document_fromstring(content) - doc.rewrite_links(replacer) - # Replace H1 elements with H2 elements - for tag in doc.findall('.//h1'): - if not tag.text: - print("Failed to fix bad title: %r" % - html.tostring(tag)) - else: - tag.getparent().replace(tag, builder.E.h2(tag.text)) - fd.write(html.tostring(doc, encoding='utf8')) + fd.write(html.tostring(doc, encoding='utf8')) @staticmethod def write_metadata(filename, title, slug, post_date, description, tags): @@ -186,22 +207,30 @@ class CommandImportWordpress(Command): link = get_text_tag(item, 'link', None) slug = utils.slugify(urlparse(link).path) if not slug: # it happens if the post has no "nice" URL - slug = get_text_tag(item, '{%s}post_name' % wordpress_namespace, None) + slug = get_text_tag( + item, '{%s}post_name' % wordpress_namespace, None) if not slug: # it *may* happen - slug = get_text_tag(item, '{%s}post_id' % wordpress_namespace, None) + slug = get_text_tag( + item, '{%s}post_id' % wordpress_namespace, None) if not slug: # should never happen print("Error converting post:", title) return description = get_text_tag(item, 'description', '') - post_date = get_text_tag(item, '{%s}post_date' % wordpress_namespace, None) - status = get_text_tag(item, '{%s}status' % wordpress_namespace, 'publish') + post_date = get_text_tag( + item, '{%s}post_date' % wordpress_namespace, None) + status = get_text_tag( + item, '{%s}status' % wordpress_namespace, 'publish') content = get_text_tag( item, '{http://purl.org/rss/1.0/modules/content/}encoded', '') tags = [] if status != 'publish': tags.append('draft') + is_draft = True + else: + is_draft = False + for tag in item.findall('category'): text = tag.text if text == 'Uncategorized': @@ -211,17 +240,28 @@ class CommandImportWordpress(Command): self.url_map[link] = self.context['BLOG_URL'] + '/' + \ out_folder + '/' + slug + '.html' - self.write_metadata(os.path.join('new_site', out_folder, - slug + '.meta'), - title, slug, post_date, description, tags) - self.write_content( - os.path.join('new_site', out_folder, slug + '.wp'), content) + if is_draft and self.exclude_drafts: + print('Draft "%s" will not be imported.' % (title, )) + elif content.strip(): + # If no content is found, no files are written. + content = self.transform_content(content) + + self.write_metadata(os.path.join(self.output_folder, out_folder, + slug + '.meta'), + title, slug, post_date, description, tags) + self.write_content( + os.path.join(self.output_folder, out_folder, slug + '.wp'), + content) + else: + print('Not going to import "%s" because it seems to contain' + ' no content.' % (title, )) def process_item(self, item): # The namespace usually is something like: # http://wordpress.org/export/1.2/ wordpress_namespace = item.nsmap['wp'] - post_type = get_text_tag(item, '{%s}post_type' % wordpress_namespace, 'post') + post_type = get_text_tag( + item, '{%s}post_type' % wordpress_namespace, 'post') if post_type == 'attachment': self.import_attachment(item, wordpress_namespace) @@ -241,32 +281,68 @@ class CommandImportWordpress(Command): for item in url_map.items(): csv_writer.writerow(item) + def get_configuration_output_path(self): + if not self.import_into_existing_site: + filename = 'conf.py' + else: + filename = 'conf.py.wordpress_import-%s' % datetime.datetime.now( + ).strftime('%Y%m%d_%H%M%s') + config_output_path = os.path.join(self.output_folder, filename) + print('Configuration will be written to: %s' % config_output_path) + + return config_output_path + @staticmethod def write_configuration(filename, rendered_template): with codecs.open(filename, 'w+', 'utf8') as fd: fd.write(rendered_template) - def run(self, fname=None): + def run(self, *arguments): + """Import a Wordpress blog from an export file into a Nikola site.""" # Parse the data if requests is None: - print('To use the import_wordpress command, you have to install the "requests" package.') + print('To use the import_wordpress command,' + ' you have to install the "requests" package.') return - if fname is None: - print("Usage: nikola import_wordpress wordpress_dump.xml") + + parser = OptionParser(usage="nikola %s [options] " + "wordpress_export_file" % self.name) + parser.add_option('-f', '--filename', dest='filename', + help='WordPress export file from which the import ' + 'made.') + parser.add_option('-o', '--output-folder', dest='output_folder', + default='new_site', help='The location into which ' + 'the imported content will be written') + parser.add_option('-d', '--no-drafts', dest='exclude_drafts', + default=False, action="store_true", help='Do not ' + 'import drafts.') + + (options, args) = parser.parse_args(list(arguments)) + + if not options.filename and args: + options.filename = args[0] + + if not options.filename: + parser.print_usage() return + self.wordpress_export_file = options.filename + self.output_folder = options.output_folder + self.import_into_existing_site = False + self.exclude_drafts = options.exclude_drafts self.url_map = {} - channel = self.get_channel_from_file(fname) + channel = self.get_channel_from_file(self.wordpress_export_file) self.context = self.populate_context(channel) - conf_template = self.generate_base_site(self.context) + conf_template = self.generate_base_site() self.context['REDIRECTIONS'] = self.configure_redirections( self.url_map) self.import_posts(channel) self.write_urlmap_csv( - os.path.join('new_site', 'url_map.csv'), self.url_map) - self.write_configuration(os.path.join( - 'new_site', 'conf.py'), conf_template.render(**self.context)) + os.path.join(self.output_folder, 'url_map.csv'), self.url_map) + + self.write_configuration(self.get_configuration_output_path( + ), conf_template.render(**self.context)) def replacer(dst): |
