aboutsummaryrefslogtreecommitdiffstats
path: root/nikola/plugins/command/import_wordpress.py
diff options
context:
space:
mode:
Diffstat (limited to 'nikola/plugins/command/import_wordpress.py')
-rw-r--r--nikola/plugins/command/import_wordpress.py139
1 files changed, 119 insertions, 20 deletions
diff --git a/nikola/plugins/command/import_wordpress.py b/nikola/plugins/command/import_wordpress.py
index 0c9915a..b567c77 100644
--- a/nikola/plugins/command/import_wordpress.py
+++ b/nikola/plugins/command/import_wordpress.py
@@ -50,6 +50,8 @@ from nikola.plugin_categories import Command
from nikola import utils
from nikola.utils import req_missing
from nikola.plugins.basic_import import ImportMixin, links
+from nikola.nikola import DEFAULT_TRANSLATIONS_PATTERN
+from nikola.plugins.command.init import SAMPLE_CONF, prepare_config
LOGGER = utils.get_logger('import_wordpress', utils.STDERR_HANDLER)
@@ -84,6 +86,23 @@ class CommandImportWordpress(Command, ImportMixin):
'type': bool,
'help': "Do not try to download files for the import",
},
+ {
+ 'name': 'separate_qtranslate_content',
+ 'long': 'qtranslate',
+ 'default': False,
+ 'type': bool,
+ 'help': "Look for translations generated by qtranslate plugin",
+ # WARNING: won't recover translated titles that actually
+ # don't seem to be part of the wordpress XML export at the
+ # time of writing :(
+ },
+ {
+ 'name': 'translations_pattern',
+ 'long': 'translations_pattern',
+ 'default': None,
+ 'type': str,
+ 'help': "The pattern for translation files names",
+ },
]
def _execute(self, options={}, args=[]):
@@ -114,6 +133,9 @@ class CommandImportWordpress(Command, ImportMixin):
self.exclude_drafts = options.get('exclude_drafts', False)
self.no_downloads = options.get('no_downloads', False)
+ self.separate_qtranslate_content = options.get('separate_qtranslate_content')
+ self.translations_pattern = options.get('translations_pattern')
+
if not self.no_downloads:
def show_info_about_mising_module(modulename):
LOGGER.error(
@@ -135,15 +157,21 @@ class CommandImportWordpress(Command, ImportMixin):
self.context = self.populate_context(channel)
conf_template = self.generate_base_site()
+ # If user has specified a custom pattern for translation files we
+ # need to fix the config
+ if self.translations_pattern:
+ self.context['TRANSLATIONS_PATTERN'] = self.translations_pattern
+
self.import_posts(channel)
self.context['REDIRECTIONS'] = self.configure_redirections(
self.url_map)
self.write_urlmap_csv(
os.path.join(self.output_folder, 'url_map.csv'), self.url_map)
- rendered_template = conf_template.render(**self.context)
+ rendered_template = conf_template.render(**prepare_config(self.context))
rendered_template = re.sub('# REDIRECTIONS = ', 'REDIRECTIONS = ',
rendered_template)
+
if self.timezone:
rendered_template = re.sub('# TIMEZONE = \'UTC\'',
'TIMEZONE = \'' + self.timezone + '\'',
@@ -194,8 +222,9 @@ class CommandImportWordpress(Command, ImportMixin):
def populate_context(channel):
wordpress_namespace = channel.nsmap['wp']
- context = {}
+ context = SAMPLE_CONF.copy()
context['DEFAULT_LANG'] = get_text_tag(channel, 'language', 'en')[:2]
+ context['TRANSLATIONS_PATTERN'] = DEFAULT_TRANSLATIONS_PATTERN
context['BLOG_TITLE'] = get_text_tag(channel, 'title',
'PUT TITLE HERE')
context['BLOG_DESCRIPTION'] = get_text_tag(
@@ -205,9 +234,10 @@ class CommandImportWordpress(Command, ImportMixin):
base_site_url = channel.find('{{{0}}}author'.format(wordpress_namespace))
context['BASE_URL'] = get_text_tag(base_site_url,
None,
- "http://foo.com")
+ "http://foo.com/")
+ if not context['BASE_URL'].endswith('/'):
+ context['BASE_URL'] += '/'
context['SITE_URL'] = context['BASE_URL']
- context['THEME'] = 'bootstrap3'
author = channel.find('{{{0}}}author'.format(wordpress_namespace))
context['BLOG_EMAIL'] = get_text_tag(
@@ -253,7 +283,7 @@ class CommandImportWordpress(Command, ImportMixin):
+ list(path.split('/'))))
dst_dir = os.path.dirname(dst_path)
utils.makedirs(dst_dir)
- LOGGER.notice("Downloading {0} => {1}".format(url, dst_path))
+ LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
self.download_url_content_to_file(url, dst_path)
dst_url = '/'.join(dst_path.split(os.sep)[2:])
links[link] = '/' + dst_url
@@ -288,7 +318,7 @@ class CommandImportWordpress(Command, ImportMixin):
# your blogging into another site or system its not.
# Why don't they just use JSON?
if sys.version_info[0] == 2:
- metadata = phpserialize.loads(meta_value.text)
+ metadata = phpserialize.loads(utils.sys_encode(meta_value.text))
size_key = 'sizes'
file_key = 'file'
else:
@@ -307,7 +337,7 @@ class CommandImportWordpress(Command, ImportMixin):
+ list(path.split('/'))))
dst_dir = os.path.dirname(dst_path)
utils.makedirs(dst_dir)
- LOGGER.notice("Downloading {0} => {1}".format(url, dst_path))
+ LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
self.download_url_content_to_file(url, dst_path)
dst_url = '/'.join(dst_path.split(os.sep)[2:])
links[url] = '/' + dst_url
@@ -350,14 +380,17 @@ class CommandImportWordpress(Command, ImportMixin):
# link is something like http://foo.com/2012/09/01/hello-world/
# So, take the path, utils.slugify it, and that's our slug
link = get_text_tag(item, 'link', None)
- path = unquote(urlparse(link).path)
+ path = unquote(urlparse(link).path.strip('/'))
# In python 2, path is a str. slug requires a unicode
# object. According to wikipedia, unquoted strings will
# usually be UTF8
if isinstance(path, utils.bytes_str):
path = path.decode('utf8')
- slug = utils.slugify(path)
+ pathlist = path.split('/')
+ if len(pathlist) > 1:
+ out_folder = os.path.join(*([out_folder] + pathlist[:-1]))
+ slug = utils.slugify(pathlist[-1])
if not slug: # it happens if the post has no "nice" URL
slug = get_text_tag(
item, '{{{0}}}post_name'.format(wordpress_namespace), None)
@@ -395,21 +428,43 @@ class CommandImportWordpress(Command, ImportMixin):
continue
tags.append(text)
+ if '$latex' in content:
+ tags.append('mathjax')
+
if is_draft and self.exclude_drafts:
LOGGER.notice('Draft "{0}" will not be imported.'.format(title))
elif content.strip():
# If no content is found, no files are written.
- self.url_map[link] = self.context['SITE_URL'] + '/' + \
- out_folder + '/' + slug + '.html'
-
- content = self.transform_content(content)
-
- self.write_metadata(os.path.join(self.output_folder, out_folder,
- slug + '.meta'),
- title, slug, post_date, description, tags)
- self.write_content(
- os.path.join(self.output_folder, out_folder, slug + '.wp'),
- content)
+ self.url_map[link] = (self.context['SITE_URL'] + out_folder + '/'
+ + slug + '.html')
+ if hasattr(self, "separate_qtranslate_content") \
+ and self.separate_qtranslate_content:
+ content_translations = separate_qtranslate_content(content)
+ else:
+ content_translations = {"": content}
+ default_language = self.context["DEFAULT_LANG"]
+ for lang, content in content_translations.items():
+ if lang:
+ out_meta_filename = slug + '.meta'
+ if lang == default_language:
+ out_content_filename = slug + '.wp'
+ else:
+ out_content_filename \
+ = utils.get_translation_candidate(self.context,
+ slug + ".wp", lang)
+ meta_slug = slug
+ else:
+ out_meta_filename = slug + '.meta'
+ out_content_filename = slug + '.wp'
+ meta_slug = slug
+ content = self.transform_content(content)
+ self.write_metadata(os.path.join(self.output_folder, out_folder,
+ out_meta_filename),
+ title, meta_slug, post_date, description, tags)
+ self.write_content(
+ os.path.join(self.output_folder,
+ out_folder, out_content_filename),
+ content)
else:
LOGGER.warn('Not going to import "{0}" because it seems to contain'
' no content.'.format(title))
@@ -441,3 +496,47 @@ def get_text_tag(tag, name, default):
return t.text
else:
return default
+
+
+def separate_qtranslate_content(text):
+ """Parse the content of a wordpress post or page and separate
+ the various language specific contents when they are delimited
+ with qtranslate tags: <!--:LL-->blabla<!--:-->"""
+ # TODO: normalize qtranslate tags <!--/en--> => <!--:-->
+ qt_start = "<!--:"
+ qt_end = "-->"
+ qt_end_with_lang_len = 5
+ qt_chunks = text.split(qt_start)
+ content_by_lang = {}
+ common_txt_list = []
+ for c in qt_chunks:
+ if not c.strip():
+ continue
+ if c.startswith(qt_end):
+ # just after the end of a language specific section, there may
+ # be some piece of common text or tags, or just nothing
+ lang = "" # default language
+ c = c.lstrip(qt_end)
+ if not c:
+ continue
+ elif c[2:].startswith(qt_end):
+ # a language specific section (with language code at the beginning)
+ lang = c[:2]
+ c = c[qt_end_with_lang_len:]
+ else:
+ # nowhere specific (maybe there is no language section in the
+ # currently parsed content)
+ lang = "" # default language
+ if not lang:
+ common_txt_list.append(c)
+ for l in content_by_lang.keys():
+ content_by_lang[l].append(c)
+ else:
+ content_by_lang[lang] = content_by_lang.get(lang, common_txt_list) + [c]
+ # in case there was no language specific section, just add the text
+ if common_txt_list and not content_by_lang:
+ content_by_lang[""] = common_txt_list
+ # Format back the list to simple text
+ for l in content_by_lang.keys():
+ content_by_lang[l] = " ".join(content_by_lang[l])
+ return content_by_lang