# -*- coding: utf-8 -*-

# Copyright © 2012-2014 Roberto Alsina and others.

# Permission is hereby granted, free of charge, to any
# person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the
# Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the
# Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice
# shall be included in all copies or substantial portions of
# the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

from __future__ import print_function
import os
import re
import sys
try:
    from urllib import unquote
    from urlparse import urlparse, urljoin, urldefrag
except ImportError:
    from urllib.parse import unquote, urlparse, urljoin, urldefrag  # NOQA

import lxml.html

from nikola.plugin_categories import Command
from nikola.utils import get_logger


def real_scan_files(site):
    task_fnames = set([])
    real_fnames = set([])
    output_folder = site.config['OUTPUT_FOLDER']
    # First check that all targets are generated in the right places
    for task in os.popen('nikola list --all', 'r').readlines():
        task = task.strip()
        if output_folder in task and ':' in task:
            fname = task.split(':', 1)[-1]
            task_fnames.add(fname)
    # And now check that there are no non-target files
    for root, dirs, files in os.walk(output_folder, followlinks=True):
        for src_name in files:
            fname = os.path.join(root, src_name)
            real_fnames.add(fname)

    only_on_output = list(real_fnames - task_fnames)

    only_on_input = list(task_fnames - real_fnames)

    return (only_on_output, only_on_input)


def fs_relpath_from_url_path(url_path):
    """Expects as input an urlparse(s).path"""
    url_path = unquote(url_path)
    # in windows relative paths don't begin with os.sep
    if sys.platform == 'win32' and len(url_path):
        url_path = url_path[1:].replace('/', '\\')
    return url_path


class CommandCheck(Command):
    """Check the generated site."""

    name = "check"
    logger = None

    doc_usage = "-l [--find-sources] | -f"
    doc_purpose = "check links and files in the generated site"
    cmd_options = [
        {
            'name': 'links',
            'short': 'l',
            'long': 'check-links',
            'type': bool,
            'default': False,
            'help': 'Check for dangling links',
        },
        {
            'name': 'files',
            'short': 'f',
            'long': 'check-files',
            'type': bool,
            'default': False,
            'help': 'Check for unknown (orphaned and not generated) files',
        },
        {
            'name': 'clean',
            'long': 'clean-files',
            'type': bool,
            'default': False,
            'help': 'Remove all unknown files, use with caution',
        },
        {
            'name': 'find_sources',
            'long': 'find-sources',
            'type': bool,
            'default': False,
            'help': 'List possible source files for files with broken links.',
        },
        {
            'name': 'verbose',
            'long': 'verbose',
            'short': 'v',
            'type': bool,
            'default': False,
            'help': 'Be more verbose.',
        },
    ]

    def _execute(self, options, args):
        """Check the generated site."""

        self.logger = get_logger('check', self.site.loghandlers)

        if not options['links'] and not options['files'] and not options['clean']:
            print(self.help())
            return False
        if options['verbose']:
            self.logger.level = 1
        else:
            self.logger.level = 4
        if options['links']:
            failure = self.scan_links(options['find_sources'])
        if options['files']:
            failure = self.scan_files()
        if options['clean']:
            failure = self.clean_files()
        if failure:
            sys.exit(1)

    existing_targets = set([])

    def analyze(self, task, find_sources=False):
        rv = False
        self.whitelist = [re.compile(x) for x in self.site.config['LINK_CHECK_WHITELIST']]
        base_url = urlparse(self.site.config['BASE_URL'])
        self.existing_targets.add(self.site.config['SITE_URL'])
        self.existing_targets.add(self.site.config['BASE_URL'])
        url_type = self.site.config['URL_TYPE']
        if url_type == 'absolute':
            url_netloc_to_root = urlparse(self.site.config['SITE_URL']).path
        try:
            filename = task.split(":")[-1]
            d = lxml.html.fromstring(open(filename).read())
            for l in d.iterlinks():
                target = l[0].attrib[l[1]]
                if target == "#":
                    continue
                target, _ = urldefrag(target)
                parsed = urlparse(target)

                # Absolute links when using only paths, skip.
                if (parsed.scheme or target.startswith('//')) and url_type in ('rel_path', 'full_path'):
                    continue

                # Absolute links to other domains, skip
                if (parsed.scheme or target.startswith('//')) and parsed.netloc != base_url.netloc:
                    continue

                if url_type == 'rel_path':
                    target_filename = os.path.abspath(
                        os.path.join(os.path.dirname(filename), unquote(target)))

                elif url_type in ('full_path', 'absolute'):
                    if url_type == 'absolute':
                        # convert to 'full_path' case, ie url relative to root
                        url_rel_path = target.path[len(url_netloc_to_root):]
                    else:
                        url_rel_path = target.path
                    if url_rel_path == '' or url_rel_path.endswith('/'):
                        url_rel_path = urljoin(url_rel_path, self.site.config['INDEX_FILE'])
                    fs_rel_path = fs_relpath_from_url_path(url_rel_path)
                    target_filename = os.path.join(self.site.config['OUTPUT_FOLDER'], fs_rel_path)

                if any(re.match(x, target_filename) for x in self.whitelist):
                    continue
                elif target_filename not in self.existing_targets:
                    if os.path.exists(target_filename):
                        self.logger.notice("Good link {0} => {1}".format(target, target_filename))
                        self.existing_targets.add(target_filename)
                    else:
                        rv = True
                        self.logger.warn("Broken link in {0}: {1}".format(filename, target))
                        if find_sources:
                            self.logger.warn("Possible sources:")
                            self.logger.warn(os.popen('nikola list --deps ' + task, 'r').read())
                            self.logger.warn("===============================\n")
        except Exception as exc:
            self.logger.error("Error with: {0} {1}".format(filename, exc))
        return rv

    def scan_links(self, find_sources=False):
        self.logger.info("Checking Links:")
        self.logger.info("===============\n")
        self.logger.notice("{0} mode".format(self.site.config['URL_TYPE']))
        failure = False
        for task in os.popen('nikola list --all', 'r').readlines():
            task = task.strip()
            if task.split(':')[0] in (
                    'render_tags', 'render_archive',
                    'render_galleries', 'render_indexes',
                    'render_pages'
                    'render_site') and '.html' in task:
                if self.analyze(task, find_sources):
                    failure = True
        if not failure:
            self.logger.info("All links checked.")
        return failure

    def scan_files(self):
        failure = False
        self.logger.info("Checking Files:")
        self.logger.info("===============\n")
        only_on_output, only_on_input = real_scan_files(self.site)

        # Ignore folders
        only_on_output = [p for p in only_on_output if not os.path.isdir(p)]
        only_on_input = [p for p in only_on_input if not os.path.isdir(p)]

        if only_on_output:
            only_on_output.sort()
            self.logger.warn("Files from unknown origins (orphans):")
            for f in only_on_output:
                self.logger.warn(f)
            failure = True
        if only_on_input:
            only_on_input.sort()
            self.logger.warn("Files not generated:")
            for f in only_on_input:
                self.logger.warn(f)
        if not failure:
            self.logger.info("All files checked.")
        return failure

    def clean_files(self):
        only_on_output, _ = real_scan_files(self.site)
        for f in only_on_output:
            os.unlink(f)
        return True