diff options
Diffstat (limited to 'nikola/plugins/command_import_wordpress.py')
| -rw-r--r-- | nikola/plugins/command_import_wordpress.py | 240 |
1 files changed, 151 insertions, 89 deletions
diff --git a/nikola/plugins/command_import_wordpress.py b/nikola/plugins/command_import_wordpress.py index 07028d8..e7ecca0 100644 --- a/nikola/plugins/command_import_wordpress.py +++ b/nikola/plugins/command_import_wordpress.py @@ -28,7 +28,6 @@ import csv import datetime import os import re -from optparse import OptionParser try: from urlparse import urlparse @@ -53,9 +52,104 @@ class CommandImportWordpress(Command): """Import a wordpress dump.""" name = "import_wordpress" + needs_config = False + doc_usage = "[options] wordpress_export_file" + doc_purpose = "Import a wordpress dump." + cmd_options = [ + { + 'name': 'output_folder', + 'long': 'output-folder', + 'short': 'o', + 'default': 'new_site', + 'help': 'Location to write imported content.' + }, + { + 'name': 'exclude_drafts', + 'long': 'no-drafts', + 'short': 'd', + 'default': False, + 'type': bool, + 'help': "Don't import drafts", + }, + { + 'name': 'squash_newlines', + 'long': 'squash-newlines', + 'default': False, + 'type': bool, + 'help': "Shorten multiple newlines in a row to only two newlines", + }, + { + 'name': 'no_downloads', + 'long': 'no-downloads', + 'default': False, + 'type': bool, + 'help': "Do not try to download files for the import", + }, + ] + + def _execute(self, options={}, args=[]): + """Import a Wordpress blog from an export file into a Nikola site.""" + # Parse the data + print(options, args) + if requests is None: + print('To use the import_wordpress command,' + ' you have to install the "requests" package.') + return - @staticmethod - def read_xml_file(filename): + if not args: + print(self.help()) + return + + options['filename'] = args[0] + + if len(args) > 1: + options['output_folder'] = args[1] + + self.wordpress_export_file = options['filename'] + self.squash_newlines = options.get('squash_newlines', False) + self.no_downloads = options.get('no_downloads', False) + self.output_folder = options.get('output_folder', 'new_site') + self.import_into_existing_site = False + self.exclude_drafts = options.get('exclude_drafts', False) + self.url_map = {} + channel = self.get_channel_from_file(self.wordpress_export_file) + self.context = self.populate_context(channel) + conf_template = self.generate_base_site() + + self.import_posts(channel) + + self.context['REDIRECTIONS'] = self.configure_redirections( + self.url_map) + self.write_urlmap_csv( + os.path.join(self.output_folder, 'url_map.csv'), self.url_map) + rendered_template = conf_template.render(**self.context) + rendered_template = re.sub('# REDIRECTIONS = ', 'REDIRECTIONS = ', + rendered_template) + self.write_configuration(self.get_configuration_output_path(), + rendered_template) + + @classmethod + def _glue_xml_lines(cls, xml): + new_xml = xml[0] + previous_line_ended_in_newline = new_xml.endswith(b'\n') + previous_line_was_indentet = False + for line in xml[1:]: + if (re.match(b'^[ \t]+', line) and previous_line_ended_in_newline): + new_xml = b''.join((new_xml, line)) + previous_line_was_indentet = True + elif previous_line_was_indentet: + new_xml = b''.join((new_xml, line)) + previous_line_was_indentet = False + else: + new_xml = b'\n'.join((new_xml, line)) + previous_line_was_indentet = False + + previous_line_ended_in_newline = line.endswith(b'\n') + + return new_xml + + @classmethod + def read_xml_file(cls, filename): xml = [] with open(filename, 'rb') as fd: @@ -64,9 +158,8 @@ class CommandImportWordpress(Command): if b'<atom:link rel=' in line: continue xml.append(line) - xml = b'\n'.join(xml) - return xml + return cls._glue_xml_lines(xml) @classmethod def get_channel_from_file(cls, filename): @@ -82,7 +175,7 @@ class CommandImportWordpress(Command): src = (urlparse(k).path + 'index.html')[1:] dst = (urlparse(v).path) if src == 'index.html': - print("Can't do a redirect for: %r" % k) + print("Can't do a redirect for: {0!r}".format(k)) else: redirections.append((src, dst)) @@ -90,11 +183,11 @@ class CommandImportWordpress(Command): def generate_base_site(self): if not os.path.exists(self.output_folder): - os.system('nikola init --empty %s' % (self.output_folder, )) + os.system('nikola init ' + self.output_folder) else: self.import_into_existing_site = True - print('The folder %s already exists - assuming that this is a ' - 'already existing nikola site.' % self.output_folder) + print('The folder {0} already exists - assuming that this is a ' + 'already existing nikola site.'.format(self.output_folder)) conf_template = Template(filename=os.path.join( os.path.dirname(utils.__file__), 'conf.py.in')) @@ -111,15 +204,16 @@ class CommandImportWordpress(Command): 'PUT TITLE HERE') context['BLOG_DESCRIPTION'] = get_text_tag( channel, 'description', 'PUT DESCRIPTION HERE') - context['BLOG_URL'] = get_text_tag(channel, 'link', '#') - author = channel.find('{%s}author' % wordpress_namespace) + context['SITE_URL'] = get_text_tag(channel, 'link', '#') + context['BASE_URL'] = get_text_tag(channel, 'link', '#') + author = channel.find('{{{0}}}author'.format(wordpress_namespace)) context['BLOG_EMAIL'] = get_text_tag( author, - '{%s}author_email' % wordpress_namespace, + '{{{0}}}author_email'.format(wordpress_namespace), "joe@example.com") context['BLOG_AUTHOR'] = get_text_tag( author, - '{%s}author_display_name' % wordpress_namespace, + '{{{0}}}author_display_name'.format(wordpress_namespace), "Joe Example") context['POST_PAGES'] = '''( ("posts/*.wp", "posts", "post.tmpl", True), @@ -134,25 +228,29 @@ class CommandImportWordpress(Command): return context - @staticmethod - def download_url_content_to_file(url, dst_path): + def download_url_content_to_file(self, url, dst_path): + if self.no_downloads: + return + try: with open(dst_path, 'wb+') as fd: fd.write(requests.get(url).content) except requests.exceptions.ConnectionError as err: - print("Downloading %s to %s failed: %s" % (url, dst_path, err)) + print("Downloading {0} to {1} failed: {2}".format(url, dst_path, + err)) def import_attachment(self, item, wordpress_namespace): url = get_text_tag( - item, '{%s}attachment_url' % wordpress_namespace, 'foo') - link = get_text_tag(item, '{%s}link' % wordpress_namespace, 'foo') + item, '{{{0}}}attachment_url'.format(wordpress_namespace), 'foo') + link = get_text_tag(item, '{{{0}}}link'.format(wordpress_namespace), + 'foo') path = urlparse(url).path dst_path = os.path.join(*([self.output_folder, 'files'] + list(path.split('/')))) dst_dir = os.path.dirname(dst_path) if not os.path.isdir(dst_dir): os.makedirs(dst_dir) - print("Downloading %s => %s" % (url, dst_path)) + print("Downloading {0} => {1}".format(url, dst_path)) self.download_url_content_to_file(url, dst_path) dst_url = '/'.join(dst_path.split(os.sep)[2:]) links[link] = '/' + dst_url @@ -173,10 +271,18 @@ class CommandImportWordpress(Command): return new_caption - @classmethod - def transform_content(cls, content): - new_content = cls.transform_sourcecode(content) - return cls.transform_caption(new_content) + def transform_multiple_newlines(self, content): + """Replaces multiple newlines with only two.""" + if self.squash_newlines: + return re.sub(r'\n{3,}', r'\n\n', content) + else: + return content + + def transform_content(self, content): + new_content = self.transform_sourcecode(content) + new_content = self.transform_caption(new_content) + new_content = self.transform_multiple_newlines(new_content) + return new_content @classmethod def write_content(cls, filename, content): @@ -188,13 +294,16 @@ class CommandImportWordpress(Command): @staticmethod def write_metadata(filename, title, slug, post_date, description, tags): + if not description: + description = "" + with codecs.open(filename, "w+", "utf8") as fd: - fd.write('%s\n' % title) - fd.write('%s\n' % slug) - fd.write('%s\n' % post_date) - fd.write('%s\n' % ','.join(tags)) + fd.write('{0}\n'.format(title)) + fd.write('{0}\n'.format(slug)) + fd.write('{0}\n'.format(post_date)) + fd.write('{0}\n'.format(','.join(tags))) fd.write('\n') - fd.write('%s\n' % description) + fd.write('{0}\n'.format(description)) def import_item(self, item, wordpress_namespace, out_folder=None): """Takes an item from the feed and creates a post file.""" @@ -208,19 +317,19 @@ class CommandImportWordpress(Command): slug = utils.slugify(urlparse(link).path) if not slug: # it happens if the post has no "nice" URL slug = get_text_tag( - item, '{%s}post_name' % wordpress_namespace, None) + item, '{{{0}}}post_name'.format(wordpress_namespace), None) if not slug: # it *may* happen slug = get_text_tag( - item, '{%s}post_id' % wordpress_namespace, None) + item, '{{{0}}}post_id'.format(wordpress_namespace), None) if not slug: # should never happen print("Error converting post:", title) return description = get_text_tag(item, 'description', '') post_date = get_text_tag( - item, '{%s}post_date' % wordpress_namespace, None) + item, '{{{0}}}post_date'.format(wordpress_namespace), None) status = get_text_tag( - item, '{%s}status' % wordpress_namespace, 'publish') + item, '{{{0}}}status'.format(wordpress_namespace), 'publish') content = get_text_tag( item, '{http://purl.org/rss/1.0/modules/content/}encoded', '') @@ -237,13 +346,13 @@ class CommandImportWordpress(Command): continue tags.append(text) - self.url_map[link] = self.context['BLOG_URL'] + '/' + \ - out_folder + '/' + slug + '.html' - if is_draft and self.exclude_drafts: - print('Draft "%s" will not be imported.' % (title, )) + print('Draft "{0}" will not be imported.'.format(title)) elif content.strip(): # If no content is found, no files are written. + self.url_map[link] = self.context['SITE_URL'] + '/' + \ + out_folder + '/' + slug + '.html' + content = self.transform_content(content) self.write_metadata(os.path.join(self.output_folder, out_folder, @@ -253,15 +362,15 @@ class CommandImportWordpress(Command): os.path.join(self.output_folder, out_folder, slug + '.wp'), content) else: - print('Not going to import "%s" because it seems to contain' - ' no content.' % (title, )) + print('Not going to import "{0}" because it seems to contain' + ' no content.'.format(title)) def process_item(self, item): # The namespace usually is something like: # http://wordpress.org/export/1.2/ wordpress_namespace = item.nsmap['wp'] post_type = get_text_tag( - item, '{%s}post_type' % wordpress_namespace, 'post') + item, '{{{0}}}post_type'.format(wordpress_namespace), 'post') if post_type == 'attachment': self.import_attachment(item, wordpress_namespace) @@ -285,10 +394,10 @@ class CommandImportWordpress(Command): if not self.import_into_existing_site: filename = 'conf.py' else: - filename = 'conf.py.wordpress_import-%s' % datetime.datetime.now( - ).strftime('%Y%m%d_%H%M%s') + filename = 'conf.py.wordpress_import-{0}'.format( + datetime.datetime.now().strftime('%Y%m%d_%H%M%s')) config_output_path = os.path.join(self.output_folder, filename) - print('Configuration will be written to: %s' % config_output_path) + print('Configuration will be written to:', config_output_path) return config_output_path @@ -297,53 +406,6 @@ class CommandImportWordpress(Command): with codecs.open(filename, 'w+', 'utf8') as fd: fd.write(rendered_template) - def run(self, *arguments): - """Import a Wordpress blog from an export file into a Nikola site.""" - # Parse the data - if requests is None: - print('To use the import_wordpress command,' - ' you have to install the "requests" package.') - return - - parser = OptionParser(usage="nikola %s [options] " - "wordpress_export_file" % self.name) - parser.add_option('-f', '--filename', dest='filename', - help='WordPress export file from which the import ' - 'made.') - parser.add_option('-o', '--output-folder', dest='output_folder', - default='new_site', help='The location into which ' - 'the imported content will be written') - parser.add_option('-d', '--no-drafts', dest='exclude_drafts', - default=False, action="store_true", help='Do not ' - 'import drafts.') - - (options, args) = parser.parse_args(list(arguments)) - - if not options.filename and args: - options.filename = args[0] - - if not options.filename: - parser.print_usage() - return - - self.wordpress_export_file = options.filename - self.output_folder = options.output_folder - self.import_into_existing_site = False - self.exclude_drafts = options.exclude_drafts - self.url_map = {} - channel = self.get_channel_from_file(self.wordpress_export_file) - self.context = self.populate_context(channel) - conf_template = self.generate_base_site() - self.context['REDIRECTIONS'] = self.configure_redirections( - self.url_map) - - self.import_posts(channel) - self.write_urlmap_csv( - os.path.join(self.output_folder, 'url_map.csv'), self.url_map) - - self.write_configuration(self.get_configuration_output_path( - ), conf_template.render(**self.context)) - def replacer(dst): return links.get(dst, dst) |
