Diffstat (limited to 'nikola/plugins/command/import_wordpress.py')
| -rw-r--r-- | nikola/plugins/command/import_wordpress.py | 139 |
1 file changed, 119 insertions, 20 deletions
diff --git a/nikola/plugins/command/import_wordpress.py b/nikola/plugins/command/import_wordpress.py
index 0c9915a..b567c77 100644
--- a/nikola/plugins/command/import_wordpress.py
+++ b/nikola/plugins/command/import_wordpress.py
@@ -50,6 +50,8 @@ from nikola.plugin_categories import Command
 from nikola import utils
 from nikola.utils import req_missing
 from nikola.plugins.basic_import import ImportMixin, links
+from nikola.nikola import DEFAULT_TRANSLATIONS_PATTERN
+from nikola.plugins.command.init import SAMPLE_CONF, prepare_config
 
 LOGGER = utils.get_logger('import_wordpress', utils.STDERR_HANDLER)
 
@@ -84,6 +86,23 @@ class CommandImportWordpress(Command, ImportMixin):
             'type': bool,
             'help': "Do not try to download files for the import",
         },
+        {
+            'name': 'separate_qtranslate_content',
+            'long': 'qtranslate',
+            'default': False,
+            'type': bool,
+            'help': "Look for translations generated by qtranslate plugin",
+            # WARNING: won't recover translated titles that actually
+            # don't seem to be part of the wordpress XML export at the
+            # time of writing :(
+        },
+        {
+            'name': 'translations_pattern',
+            'long': 'translations_pattern',
+            'default': None,
+            'type': str,
+            'help': "The pattern for translation files names",
+        },
     ]
 
     def _execute(self, options={}, args=[]):
@@ -114,6 +133,9 @@ class CommandImportWordpress(Command, ImportMixin):
         self.exclude_drafts = options.get('exclude_drafts', False)
         self.no_downloads = options.get('no_downloads', False)
 
+        self.separate_qtranslate_content = options.get('separate_qtranslate_content')
+        self.translations_pattern = options.get('translations_pattern')
+
         if not self.no_downloads:
             def show_info_about_mising_module(modulename):
                 LOGGER.error(
@@ -135,15 +157,21 @@ class CommandImportWordpress(Command, ImportMixin):
         self.context = self.populate_context(channel)
         conf_template = self.generate_base_site()
 
+        # If user has specified a custom pattern for translation files we
+        # need to fix the config
+        if self.translations_pattern:
+            self.context['TRANSLATIONS_PATTERN'] = self.translations_pattern
+
         self.import_posts(channel)
 
         self.context['REDIRECTIONS'] = self.configure_redirections(
             self.url_map)
         self.write_urlmap_csv(
             os.path.join(self.output_folder, 'url_map.csv'), self.url_map)
-        rendered_template = conf_template.render(**self.context)
+        rendered_template = conf_template.render(**prepare_config(self.context))
         rendered_template = re.sub('# REDIRECTIONS = ', 'REDIRECTIONS = ',
                                    rendered_template)
+
         if self.timezone:
             rendered_template = re.sub('# TIMEZONE = \'UTC\'',
                                        'TIMEZONE = \'' + self.timezone + '\'',
@@ -194,8 +222,9 @@ class CommandImportWordpress(Command, ImportMixin):
     def populate_context(channel):
         wordpress_namespace = channel.nsmap['wp']
 
-        context = {}
+        context = SAMPLE_CONF.copy()
         context['DEFAULT_LANG'] = get_text_tag(channel, 'language', 'en')[:2]
+        context['TRANSLATIONS_PATTERN'] = DEFAULT_TRANSLATIONS_PATTERN
         context['BLOG_TITLE'] = get_text_tag(channel, 'title',
                                              'PUT TITLE HERE')
         context['BLOG_DESCRIPTION'] = get_text_tag(
@@ -205,9 +234,10 @@ class CommandImportWordpress(Command, ImportMixin):
         base_site_url = channel.find('{{{0}}}author'.format(wordpress_namespace))
 
         context['BASE_URL'] = get_text_tag(base_site_url, None,
-                                           "http://foo.com")
+                                           "http://foo.com/")
+        if not context['BASE_URL'].endswith('/'):
+            context['BASE_URL'] += '/'
         context['SITE_URL'] = context['BASE_URL']
-        context['THEME'] = 'bootstrap3'
 
         author = channel.find('{{{0}}}author'.format(wordpress_namespace))
         context['BLOG_EMAIL'] = get_text_tag(
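The trailing-slash normalization of BASE_URL above is what lets a later hunk build url_map entries without inserting a separator after SITE_URL. A minimal sketch of the resulting URL construction, using example values rather than data from a real export:

    # Illustration only, with made-up values; not part of the patch.
    site_url = "http://foo.com"            # as read from the WordPress export
    if not site_url.endswith('/'):         # normalization added in populate_context()
        site_url += '/'

    out_folder, slug = "posts", "hello-world"
    url = site_url + out_folder + '/' + slug + '.html'
    print(url)                             # http://foo.com/posts/hello-world.html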
@@ -253,7 +283,7 @@ class CommandImportWordpress(Command, ImportMixin):
                                       + list(path.split('/'))))
             dst_dir = os.path.dirname(dst_path)
             utils.makedirs(dst_dir)
-            LOGGER.notice("Downloading {0} => {1}".format(url, dst_path))
+            LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
             self.download_url_content_to_file(url, dst_path)
             dst_url = '/'.join(dst_path.split(os.sep)[2:])
             links[link] = '/' + dst_url
@@ -288,7 +318,7 @@ class CommandImportWordpress(Command, ImportMixin):
         # your blogging into another site or system its not.
         # Why don't they just use JSON?
         if sys.version_info[0] == 2:
-            metadata = phpserialize.loads(meta_value.text)
+            metadata = phpserialize.loads(utils.sys_encode(meta_value.text))
             size_key = 'sizes'
             file_key = 'file'
         else:
@@ -307,7 +337,7 @@ class CommandImportWordpress(Command, ImportMixin):
                                       + list(path.split('/'))))
             dst_dir = os.path.dirname(dst_path)
             utils.makedirs(dst_dir)
-            LOGGER.notice("Downloading {0} => {1}".format(url, dst_path))
+            LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
             self.download_url_content_to_file(url, dst_path)
             dst_url = '/'.join(dst_path.split(os.sep)[2:])
             links[url] = '/' + dst_url
@@ -350,14 +380,17 @@ class CommandImportWordpress(Command, ImportMixin):
         # link is something like http://foo.com/2012/09/01/hello-world/
         # So, take the path, utils.slugify it, and that's our slug
         link = get_text_tag(item, 'link', None)
-        path = unquote(urlparse(link).path)
+        path = unquote(urlparse(link).path.strip('/'))
 
         # In python 2, path is a str. slug requires a unicode
         # object. According to wikipedia, unquoted strings will
         # usually be UTF8
         if isinstance(path, utils.bytes_str):
             path = path.decode('utf8')
-        slug = utils.slugify(path)
+        pathlist = path.split('/')
+        if len(pathlist) > 1:
+            out_folder = os.path.join(*([out_folder] + pathlist[:-1]))
+        slug = utils.slugify(pathlist[-1])
         if not slug:  # it happens if the post has no "nice" URL
             slug = get_text_tag(
                 item, '{{{0}}}post_name'.format(wordpress_namespace), None)
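The reworked slug handling above turns the directory part of a WordPress permalink into nested output folders and keeps only the last segment as the slug. A small standalone sketch of that derivation, assuming an example permalink and skipping utils.slugify():

    # Sketch only; mirrors the logic in the hunk above with example data.
    import os
    from urllib.parse import urlparse, unquote   # Python 3; the patch itself keeps 2.x support

    out_folder = 'posts'
    link = 'http://foo.com/2012/09/01/hello-world/'
    path = unquote(urlparse(link).path.strip('/'))    # '2012/09/01/hello-world'
    pathlist = path.split('/')
    if len(pathlist) > 1:
        out_folder = os.path.join(*([out_folder] + pathlist[:-1]))
    slug = pathlist[-1]                               # the real code runs utils.slugify() here
    print(out_folder, slug)                           # posts/2012/09/01 hello-world (on POSIX)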
@@ -395,21 +428,43 @@ class CommandImportWordpress(Command, ImportMixin):
                     continue
                 tags.append(text)
 
+            if '$latex' in content:
+                tags.append('mathjax')
+
             if is_draft and self.exclude_drafts:
                 LOGGER.notice('Draft "{0}" will not be imported.'.format(title))
             elif content.strip():
                 # If no content is found, no files are written.
-                self.url_map[link] = self.context['SITE_URL'] + '/' + \
-                    out_folder + '/' + slug + '.html'
-
-                content = self.transform_content(content)
-
-                self.write_metadata(os.path.join(self.output_folder, out_folder,
-                                                 slug + '.meta'),
-                                    title, slug, post_date, description, tags)
-                self.write_content(
-                    os.path.join(self.output_folder, out_folder, slug + '.wp'),
-                    content)
+                self.url_map[link] = (self.context['SITE_URL'] + out_folder + '/' +
+                                      slug + '.html')
+                if hasattr(self, "separate_qtranslate_content") \
+                   and self.separate_qtranslate_content:
+                    content_translations = separate_qtranslate_content(content)
+                else:
+                    content_translations = {"": content}
+                default_language = self.context["DEFAULT_LANG"]
+                for lang, content in content_translations.items():
+                    if lang:
+                        out_meta_filename = slug + '.meta'
+                        if lang == default_language:
+                            out_content_filename = slug + '.wp'
+                        else:
+                            out_content_filename \
+                                = utils.get_translation_candidate(self.context,
+                                                                  slug + ".wp", lang)
+                        meta_slug = slug
+                    else:
+                        out_meta_filename = slug + '.meta'
+                        out_content_filename = slug + '.wp'
+                        meta_slug = slug
+                    content = self.transform_content(content)
+                    self.write_metadata(os.path.join(self.output_folder, out_folder,
+                                                     out_meta_filename),
+                                        title, meta_slug, post_date, description, tags)
+                    self.write_content(
+                        os.path.join(self.output_folder,
+                                     out_folder, out_content_filename),
+                        content)
             else:
                 LOGGER.warn('Not going to import "{0}" because it seems to contain'
                             ' no content.'.format(title))
@@ -441,3 +496,47 @@ def get_text_tag(tag, name, default):
         return t.text
     else:
         return default
+
+
+def separate_qtranslate_content(text):
+    """Parse the content of a wordpress post or page and separate
+    the various language specific contents when they are delimited
+    with qtranslate tags: <!--:LL-->blabla<!--:-->"""
+    # TODO: uniformize qtranslate tags <!--/en--> => <!--:-->
+    qt_start = "<!--:"
+    qt_end = "-->"
+    qt_end_with_lang_len = 5
+    qt_chunks = text.split(qt_start)
+    content_by_lang = {}
+    common_txt_list = []
+    for c in qt_chunks:
+        if not c.strip():
+            continue
+        if c.startswith(qt_end):
+            # just after the end of a language specific section, there may
+            # be some piece of common text or tags, or just nothing
+            lang = ""  # default language
+            c = c.lstrip(qt_end)
+            if not c:
+                continue
+        elif c[2:].startswith(qt_end):
+            # a language specific section (with language code at the begining)
+            lang = c[:2]
+            c = c[qt_end_with_lang_len:]
+        else:
+            # nowhere specific (maybe there is no language section in the
+            # currently parsed content)
+            lang = ""  # default language
+        if not lang:
+            common_txt_list.append(c)
+            for l in content_by_lang.keys():
+                content_by_lang[l].append(c)
+        else:
+            content_by_lang[lang] = content_by_lang.get(lang, common_txt_list) + [c]
+    # in case there was no language specific section, just add the text
+    if common_txt_list and not content_by_lang:
+        content_by_lang[""] = common_txt_list
+    # Format back the list to simple text
+    for l in content_by_lang.keys():
+        content_by_lang[l] = " ".join(content_by_lang[l])
+    return content_by_lang
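For context, a rough usage sketch of the new separate_qtranslate_content() helper and of the translated file names the import is expected to produce; the sample markup and the pattern value below are assumptions for illustration, not taken from a real export:

    # Usage sketch, assuming this patch is applied and Nikola's import
    # dependencies are installed; the markup is a made-up qtranslate sample.
    from nikola.plugins.command.import_wordpress import separate_qtranslate_content

    sample = "<!--:en-->Hello<!--:--><!--:fr-->Bonjour<!--:-->"
    assert separate_qtranslate_content(sample) == {'en': 'Hello', 'fr': 'Bonjour'}

    # For a non-default language the body is then written to a name produced by
    # utils.get_translation_candidate(); with a TRANSLATIONS_PATTERN such as
    # "{path}.{lang}.{ext}" (example value) the French content of a post with
    # slug "hello-world" would land in "hello-world.fr.wp" beside "hello-world.wp".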
