diff options
Diffstat (limited to 'nikola/plugins/command/import_wordpress.py')
| -rw-r--r-- | nikola/plugins/command/import_wordpress.py | 60 |
1 files changed, 50 insertions, 10 deletions
diff --git a/nikola/plugins/command/import_wordpress.py b/nikola/plugins/command/import_wordpress.py index a652ec8..69ef144 100644 --- a/nikola/plugins/command/import_wordpress.py +++ b/nikola/plugins/command/import_wordpress.py @@ -50,7 +50,7 @@ except ImportError: from nikola.plugin_categories import Command from nikola import utils -from nikola.utils import req_missing +from nikola.utils import req_missing, unicode_str from nikola.plugins.basic_import import ImportMixin, links from nikola.nikola import DEFAULT_TRANSLATIONS_PATTERN from nikola.plugins.command.init import SAMPLE_CONF, prepare_config, format_default_translations_config @@ -88,7 +88,6 @@ def install_plugin(site, plugin_name, output_dir=None, show_install_notes=False) class CommandImportWordpress(Command, ImportMixin): - """Import a WordPress dump.""" name = "import_wordpress" @@ -191,6 +190,12 @@ class CommandImportWordpress(Command, ImportMixin): 'type': bool, 'help': "Automatically installs the WordPress page compiler (either locally or in the new site) if required by other options.\nWarning: the compiler is GPL software!", }, + { + 'name': 'tag_saniziting_strategy', + 'long': 'tag-saniziting-strategy', + 'default': 'first', + 'help': 'lower: Convert all tag and category names to lower case\nfirst: Keep first spelling of tag or category name', + }, ] all_tags = set([]) @@ -239,6 +244,8 @@ class CommandImportWordpress(Command, ImportMixin): self.install_wordpress_compiler = options.get('install_wordpress_compiler', False) self.wordpress_page_compiler = None + self.tag_saniziting_strategy = options.get('tag_saniziting_strategy', 'first') + self.auth = None if options.get('download_auth') is not None: username_password = options.get('download_auth') @@ -337,7 +344,10 @@ class CommandImportWordpress(Command, ImportMixin): # Add tag redirects for tag in self.all_tags: try: - tag_str = tag.decode('utf8') + if isinstance(tag, utils.bytes_str): + tag_str = tag.decode('utf8', 'replace') + else: + tag_str = tag except AttributeError: tag_str = tag tag = utils.slugify(tag_str) @@ -604,7 +614,7 @@ class CommandImportWordpress(Command, ImportMixin): def transform_code(self, content): """Transform code blocks.""" - # http://en.support.wordpress.com/code/posting-source-code/. There are + # https://en.support.wordpress.com/code/posting-source-code/. There are # a ton of things not supported here. We only do a basic [code # lang="x"] -> ```x translation, and remove quoted html entities (<, # >, &, and "). @@ -686,7 +696,7 @@ class CommandImportWordpress(Command, ImportMixin): elif approved == 'spam' or approved == 'trash': pass else: - LOGGER.warn("Unknown comment approved status: " + str(approved)) + LOGGER.warn("Unknown comment approved status: {0}".format(approved)) parent = int(get_text_tag(comment, "{{{0}}}comment_parent".format(wordpress_namespace), 0)) if parent == 0: parent = None @@ -707,7 +717,7 @@ class CommandImportWordpress(Command, ImportMixin): """Write comment header line.""" if header_content is None: return - header_content = str(header_content).replace('\n', ' ') + header_content = unicode_str(header_content).replace('\n', ' ') line = '.. ' + header_field + ': ' + header_content + '\n' fd.write(line.encode('utf8')) @@ -747,6 +757,24 @@ class CommandImportWordpress(Command, ImportMixin): tags_cats = tags + categories return tags_cats, other_meta + _tag_sanitize_map = {True: {}, False: {}} + + def _sanitize(self, tag, is_category): + if self.tag_saniziting_strategy == 'lower': + return tag.lower() + if tag.lower() not in self._tag_sanitize_map[is_category]: + self._tag_sanitize_map[is_category][tag.lower()] = [tag] + return tag + previous = self._tag_sanitize_map[is_category][tag.lower()] + if self.tag_saniziting_strategy == 'first': + if tag != previous[0]: + LOGGER.warn("Changing spelling of {0} name '{1}' to {2}.".format('category' if is_category else 'tag', tag, previous[0])) + return previous[0] + else: + LOGGER.error("Unknown tag sanitizing strategy '{0}'!".format(self.tag_saniziting_strategy)) + sys.exit(1) + return tag + def import_postpage_item(self, item, wordpress_namespace, out_folder=None, attachments=None): """Take an item from the feed and creates a post file.""" if out_folder is None: @@ -760,7 +788,10 @@ class CommandImportWordpress(Command, ImportMixin): path = unquote(parsed.path.strip('/')) try: - path = path.decode('utf8') + if isinstance(path, utils.bytes_str): + path = path.decode('utf8', 'replace') + else: + path = path except AttributeError: pass @@ -831,15 +862,24 @@ class CommandImportWordpress(Command, ImportMixin): type = tag.attrib['domain'] if text == 'Uncategorized' and type == 'category': continue - self.all_tags.add(text) if type == 'category': - categories.append(type) + categories.append(text) else: tags.append(text) if '$latex' in content: tags.append('mathjax') + for i, cat in enumerate(categories[:]): + cat = self._sanitize(cat, True) + categories[i] = cat + self.all_tags.add(cat) + + for i, tag in enumerate(tags[:]): + tag = self._sanitize(tag, False) + tags[i] = tag + self.all_tags.add(tag) + # Find post format if it's there post_format = 'wp' format_tag = [x for x in item.findall('*//{%s}meta_key' % wordpress_namespace) if x.text == '_tc_post_format'] @@ -905,7 +945,7 @@ class CommandImportWordpress(Command, ImportMixin): comments.append(comment) for comment in comments: - comment_filename = slug + "." + str(comment['id']) + ".wpcomment" + comment_filename = "{0}.{1}.wpcomment".format(slug, comment['id']) self._write_comment(os.path.join(self.output_folder, out_folder, comment_filename), comment) return (out_folder, slug) |
