Diffstat (limited to 'nikola/plugins/command/import_wordpress.py')
| -rw-r--r-- | nikola/plugins/command/import_wordpress.py | 139 |
1 file changed, 119 insertions, 20 deletions
diff --git a/nikola/plugins/command/import_wordpress.py b/nikola/plugins/command/import_wordpress.py
index 0c9915a..b567c77 100644
--- a/nikola/plugins/command/import_wordpress.py
+++ b/nikola/plugins/command/import_wordpress.py
@@ -50,6 +50,8 @@ from nikola.plugin_categories import Command
 from nikola import utils
 from nikola.utils import req_missing
 from nikola.plugins.basic_import import ImportMixin, links
+from nikola.nikola import DEFAULT_TRANSLATIONS_PATTERN
+from nikola.plugins.command.init import SAMPLE_CONF, prepare_config
 
 LOGGER = utils.get_logger('import_wordpress', utils.STDERR_HANDLER)
 
@@ -84,6 +86,23 @@ class CommandImportWordpress(Command, ImportMixin):
             'type': bool,
             'help': "Do not try to download files for the import",
         },
+        {
+            'name': 'separate_qtranslate_content',
+            'long': 'qtranslate',
+            'default': False,
+            'type': bool,
+            'help': "Look for translations generated by qtranslate plugin",
+            # WARNING: won't recover translated titles that actually
+            # don't seem to be part of the wordpress XML export at the
+            # time of writing :(
+        },
+        {
+            'name': 'translations_pattern',
+            'long': 'translations_pattern',
+            'default': None,
+            'type': str,
+            'help': "The pattern for translation files names",
+        },
     ]
 
     def _execute(self, options={}, args=[]):
@@ -114,6 +133,9 @@ class CommandImportWordpress(Command, ImportMixin):
         self.exclude_drafts = options.get('exclude_drafts', False)
         self.no_downloads = options.get('no_downloads', False)
 
+        self.separate_qtranslate_content = options.get('separate_qtranslate_content')
+        self.translations_pattern = options.get('translations_pattern')
+
         if not self.no_downloads:
             def show_info_about_mising_module(modulename):
                 LOGGER.error(
@@ -135,15 +157,21 @@ class CommandImportWordpress(Command, ImportMixin):
         self.context = self.populate_context(channel)
         conf_template = self.generate_base_site()
 
+        # If user has specified a custom pattern for translation files we
+        # need to fix the config
+        if self.translations_pattern:
+            self.context['TRANSLATIONS_PATTERN'] = self.translations_pattern
+
         self.import_posts(channel)
 
         self.context['REDIRECTIONS'] = self.configure_redirections(
             self.url_map)
         self.write_urlmap_csv(
             os.path.join(self.output_folder, 'url_map.csv'), self.url_map)
-        rendered_template = conf_template.render(**self.context)
+        rendered_template = conf_template.render(**prepare_config(self.context))
         rendered_template = re.sub('# REDIRECTIONS = ', 'REDIRECTIONS = ',
                                    rendered_template)
+
         if self.timezone:
             rendered_template = re.sub('# TIMEZONE = \'UTC\'',
                                        'TIMEZONE = \'' + self.timezone + '\'',
@@ -194,8 +222,9 @@ class CommandImportWordpress(Command, ImportMixin):
     def populate_context(channel):
         wordpress_namespace = channel.nsmap['wp']
 
-        context = {}
+        context = SAMPLE_CONF.copy()
         context['DEFAULT_LANG'] = get_text_tag(channel, 'language', 'en')[:2]
+        context['TRANSLATIONS_PATTERN'] = DEFAULT_TRANSLATIONS_PATTERN
         context['BLOG_TITLE'] = get_text_tag(channel, 'title',
                                              'PUT TITLE HERE')
         context['BLOG_DESCRIPTION'] = get_text_tag(
@@ -205,9 +234,10 @@ class CommandImportWordpress(Command, ImportMixin):
         base_site_url = channel.find('{{{0}}}author'.format(wordpress_namespace))
 
         context['BASE_URL'] = get_text_tag(base_site_url, None,
-                                           "http://foo.com")
+                                           "http://foo.com/")
+        if not context['BASE_URL'].endswith('/'):
+            context['BASE_URL'] += '/'
         context['SITE_URL'] = context['BASE_URL']
-        context['THEME'] = 'bootstrap3'
 
         author = channel.find('{{{0}}}author'.format(wordpress_namespace))
         context['BLOG_EMAIL'] = get_text_tag(
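The trailing-slash normalization of BASE_URL above is what lets a later hunk build url_map entries without inserting a separator after SITE_URL. A minimal sketch of the resulting URL construction, using example values rather than data from a real export:

    # Illustration only, with made-up values; not part of the patch.
    site_url = "http://foo.com"            # as read from the WordPress export
    if not site_url.endswith('/'):         # normalization added in populate_context()
        site_url += '/'

    out_folder, slug = "posts", "hello-world"
    url = site_url + out_folder + '/' + slug + '.html'
    print(url)                             # http://foo.com/posts/hello-world.html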
@@ -253,7 +283,7 @@ class CommandImportWordpress(Command, ImportMixin):
                                       + list(path.split('/'))))
             dst_dir = os.path.dirname(dst_path)
             utils.makedirs(dst_dir)
-            LOGGER.notice("Downloading {0} => {1}".format(url, dst_path))
+            LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
             self.download_url_content_to_file(url, dst_path)
             dst_url = '/'.join(dst_path.split(os.sep)[2:])
             links[link] = '/' + dst_url
@@ -288,7 +318,7 @@ class CommandImportWordpress(Command, ImportMixin):
         # your blogging into another site or system its not.
         # Why don't they just use JSON?
         if sys.version_info[0] == 2:
-            metadata = phpserialize.loads(meta_value.text)
+            metadata = phpserialize.loads(utils.sys_encode(meta_value.text))
             size_key = 'sizes'
             file_key = 'file'
         else:
@@ -307,7 +337,7 @@ class CommandImportWordpress(Command, ImportMixin):
                                       + list(path.split('/'))))
             dst_dir = os.path.dirname(dst_path)
             utils.makedirs(dst_dir)
-            LOGGER.notice("Downloading {0} => {1}".format(url, dst_path))
+            LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
             self.download_url_content_to_file(url, dst_path)
             dst_url = '/'.join(dst_path.split(os.sep)[2:])
             links[url] = '/' + dst_url
@@ -350,14 +380,17 @@ class CommandImportWordpress(Command, ImportMixin):
         # link is something like http://foo.com/2012/09/01/hello-world/
         # So, take the path, utils.slugify it, and that's our slug
         link = get_text_tag(item, 'link', None)
-        path = unquote(urlparse(link).path)
+        path = unquote(urlparse(link).path.strip('/'))
 
         # In python 2, path is a str. slug requires a unicode
         # object. According to wikipedia, unquoted strings will
         # usually be UTF8
         if isinstance(path, utils.bytes_str):
             path = path.decode('utf8')
-        slug = utils.slugify(path)
+        pathlist = path.split('/')
+        if len(pathlist) > 1:
+            out_folder = os.path.join(*([out_folder] + pathlist[:-1]))
+        slug = utils.slugify(pathlist[-1])
         if not slug:  # it happens if the post has no "nice" URL
             slug = get_text_tag(
                 item, '{{{0}}}post_name'.format(wordpress_namespace), None)
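The reworked slug handling above turns the directory part of a WordPress permalink into nested output folders and keeps only the last segment as the slug. A small standalone sketch of that derivation, assuming an example permalink and skipping utils.slugify():

    # Sketch only; mirrors the logic in the hunk above with example data.
    import os
    from urllib.parse import urlparse, unquote   # Python 3; the patch itself keeps 2.x support

    out_folder = 'posts'
    link = 'http://foo.com/2012/09/01/hello-world/'
    path = unquote(urlparse(link).path.strip('/'))    # '2012/09/01/hello-world'
    pathlist = path.split('/')
    if len(pathlist) > 1:
        out_folder = os.path.join(*([out_folder] + pathlist[:-1]))
    slug = pathlist[-1]                               # the real code runs utils.slugify() here
    print(out_folder, slug)                           # posts/2012/09/01 hello-world (on POSIX)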
@@ -395,21 +428,43 @@ class CommandImportWordpress(Command, ImportMixin):
                     continue
                 tags.append(text)
 
+            if '$latex' in content:
+                tags.append('mathjax')
+
             if is_draft and self.exclude_drafts:
                 LOGGER.notice('Draft "{0}" will not be imported.'.format(title))
             elif content.strip():
                 # If no content is found, no files are written.
-                self.url_map[link] = self.context['SITE_URL'] + '/' + \
-                    out_folder + '/' + slug + '.html'
-
-                content = self.transform_content(content)
-
-                self.write_metadata(os.path.join(self.output_folder, out_folder,
-                                                 slug + '.meta'),
-                                    title, slug, post_date, description, tags)
-                self.write_content(
-                    os.path.join(self.output_folder, out_folder, slug + '.wp'),
-                    content)
+                self.url_map[link] = (self.context['SITE_URL'] + out_folder + '/' +
+                                      slug + '.html')
+                if hasattr(self, "separate_qtranslate_content") \
+                   and self.separate_qtranslate_content:
+                    content_translations = separate_qtranslate_content(content)
+                else:
+                    content_translations = {"": content}
+                default_language = self.context["DEFAULT_LANG"]
+                for lang, content in content_translations.items():
+                    if lang:
+                        out_meta_filename = slug + '.meta'
+                        if lang == default_language:
+                            out_content_filename = slug + '.wp'
+                        else:
+                            out_content_filename \
+                                = utils.get_translation_candidate(self.context,
+                                                                  slug + ".wp", lang)
+                        meta_slug = slug
+                    else:
+                        out_meta_filename = slug + '.meta'
+                        out_content_filename = slug + '.wp'
+                        meta_slug = slug
+                    content = self.transform_content(content)
+                    self.write_metadata(os.path.join(self.output_folder, out_folder,
+                                                     out_meta_filename),
+                                        title, meta_slug, post_date, description, tags)
+                    self.write_content(
+                        os.path.join(self.output_folder,
+                                     out_folder, out_content_filename),
+                        content)
             else:
                 LOGGER.warn('Not going to import "{0}" because it seems to contain'
                             ' no content.'.format(title))
@@ -441,3 +496,47 @@ def get_text_tag(tag, name, default):
         return t.text
     else:
         return default
+
+
+def separate_qtranslate_content(text):
+    """Parse the content of a wordpress post or page and separate
+    the various language specific contents when they are delimited
+    with qtranslate tags: <!--:LL-->blabla<!--:-->"""
+    # TODO: uniformize qtranslate tags <!--/en--> => <!--:-->
+    qt_start = "<!--:"
+    qt_end = "-->"
+    qt_end_with_lang_len = 5
+    qt_chunks = text.split(qt_start)
+    content_by_lang = {}
+    common_txt_list = []
+    for c in qt_chunks:
+        if not c.strip():
+            continue
+        if c.startswith(qt_end):
+            # just after the end of a language specific section, there may
+            # be some piece of common text or tags, or just nothing
+            lang = ""  # default language
+            c = c.lstrip(qt_end)
+            if not c:
+                continue
+        elif c[2:].startswith(qt_end):
+            # a language specific section (with language code at the begining)
+            lang = c[:2]
+            c = c[qt_end_with_lang_len:]
+        else:
+            # nowhere specific (maybe there is no language section in the
+            # currently parsed content)
+            lang = ""  # default language
+        if not lang:
+            common_txt_list.append(c)
+            for l in content_by_lang.keys():
+                content_by_lang[l].append(c)
+        else:
+            content_by_lang[lang] = content_by_lang.get(lang, common_txt_list) + [c]
+    # in case there was no language specific section, just add the text
+    if common_txt_list and not content_by_lang:
+        content_by_lang[""] = common_txt_list
+    # Format back the list to simple text
+    for l in content_by_lang.keys():
+        content_by_lang[l] = " ".join(content_by_lang[l])
+    return content_by_lang
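For context, a rough usage sketch of the new separate_qtranslate_content() helper and of the translated file names the import is expected to produce; the sample markup and the pattern value below are assumptions for illustration, not taken from a real export:

    # Usage sketch, assuming this patch is applied and Nikola's import
    # dependencies are installed; the markup is a made-up qtranslate sample.
    from nikola.plugins.command.import_wordpress import separate_qtranslate_content

    sample = "<!--:en-->Hello<!--:--><!--:fr-->Bonjour<!--:-->"
    assert separate_qtranslate_content(sample) == {'en': 'Hello', 'fr': 'Bonjour'}

    # For a non-default language the body is then written to a name produced by
    # utils.get_translation_candidate(); with a TRANSLATIONS_PATTERN such as
    # "{path}.{lang}.{ext}" (example value) the French content of a post with
    # slug "hello-world" would land in "hello-world.fr.wp" beside "hello-world.wp".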
