author     Agustin Henze <tin@sluc.org.ar>    2013-05-30 17:41:06 -0300
committer  Agustin Henze <tin@sluc.org.ar>    2013-05-30 17:41:06 -0300
commit     0c4dfdec5b55b6064dccc38bbfb0a7c0699c895a (patch)
tree       a6707225ccc559f7edf50ddd3fdc7fc85145c921 /nikola/plugins/task_sitemap
parent     8b14a1e5b2ca574fdd4fd2377567ec98a110d4b6 (diff)
Imported Upstream version 5.4.4
Diffstat (limited to 'nikola/plugins/task_sitemap')
-rw-r--r--  nikola/plugins/task_sitemap/__init__.py     |   94
-rw-r--r--  nikola/plugins/task_sitemap/sitemap_gen.py  | 2137
2 files changed, 52 insertions, 2179 deletions
diff --git a/nikola/plugins/task_sitemap/__init__.py b/nikola/plugins/task_sitemap/__init__.py
index 9d89070..044e0e3 100644
--- a/nikola/plugins/task_sitemap/__init__.py
+++ b/nikola/plugins/task_sitemap/__init__.py
@@ -22,72 +22,82 @@
 # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
-from __future__ import print_function, absolute_import
+from __future__ import print_function, absolute_import, unicode_literals
+import codecs
+import datetime
 import os
-import sys
-import tempfile
+try:
+    from urlparse import urljoin
+except ImportError:
+    from urllib.parse import urljoin  # NOQA
 
 from nikola.plugin_categories import LateTask
 from nikola.utils import config_changed
-from nikola.plugins.task_sitemap import sitemap_gen
+
+header = """<?xml version="1.0" encoding="UTF-8"?>
+<urlset
+    xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
+                http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
+"""
+
+url_format = """    <url>
+        <loc>{0}</loc>
+        <lastmod>{1}</lastmod>
+        <priority>0.5000</priority>
+    </url>
+"""
+
+get_lastmod = lambda p: datetime.datetime.fromtimestamp(os.stat(p).st_mtime).isoformat().split('T')[0]
 
 
 class Sitemap(LateTask):
-    """Copy theme assets into output."""
+    """Generate google sitemap."""
 
     name = "sitemap"
 
     def gen_tasks(self):
-        if sys.version_info[0] == 3:
-            print("sitemap generation is not available for python 3")
-            yield {
-                'basename': 'sitemap',
-                'name': 'sitemap',
-                'actions': [],
-            }
-            return
         """Generate Google sitemap."""
         kw = {
             "base_url": self.site.config["BASE_URL"],
             "site_url": self.site.config["SITE_URL"],
             "output_folder": self.site.config["OUTPUT_FOLDER"],
+            "mapped_extensions": self.site.config.get('MAPPED_EXTENSIONS', ['.html', '.htm'])
         }
-        output_path = os.path.abspath(kw['output_folder'])
-        sitemap_path = os.path.join(output_path, "sitemap.xml.gz")
+        output_path = kw['output_folder']
+        sitemap_path = os.path.join(output_path, "sitemap.xml")
 
         def sitemap():
-            # Generate config
-            config_data = """<?xml version="1.0" encoding="UTF-8"?>
-            <site
-              base_url="{0}"
-              store_into="{1}"
-              verbose="1" >
-              <directory path="{2}" url="{3}" />
-              <filter action="drop" type="wildcard" pattern="*~" />
-              <filter action="drop" type="regexp" pattern="/\.[^/]*" />
-            </site>""".format(kw["site_url"], sitemap_path, output_path,
-                              kw["base_url"])
-            config_file = tempfile.NamedTemporaryFile(delete=False)
-            config_file.write(config_data.encode('utf8'))
-            config_file.close()
+            with codecs.open(sitemap_path, 'wb+', 'utf8') as outf:
+                output = kw['output_folder']
+                base_url = kw['base_url']
+                mapped_exts = kw['mapped_extensions']
+                outf.write(header)
+                locs = {}
+                for root, dirs, files in os.walk(output):
+                    path = os.path.relpath(root, output)
+                    path = path.replace(os.sep, '/') + '/'
+                    lastmod = get_lastmod(root)
+                    loc = urljoin(base_url, path)
+                    locs[loc] = url_format.format(loc, lastmod)
+                    for fname in files:
+                        if os.path.splitext(fname)[-1] in mapped_exts:
+                            real_path = os.path.join(root, fname)
+                            path = os.path.relpath(real_path, output)
+                            path = path.replace(os.sep, '/')
+                            lastmod = get_lastmod(real_path)
+                            loc = urljoin(base_url, path)
+                            locs[loc] = url_format.format(loc, lastmod)
 
-            # Generate sitemap
-            sitemap = sitemap_gen.CreateSitemapFromFile(config_file.name, True)
-            if not sitemap:
-                sitemap_gen.output.Log('Configuration file errors -- exiting.',
-                                       0)
-            else:
-                sitemap.Generate()
-                sitemap_gen.output.Log('Number of errors: {0}'.format(
-                    sitemap_gen.output.num_errors), 1)
-                sitemap_gen.output.Log('Number of warnings: {0}'.format(
-                    sitemap_gen.output.num_warns), 1)
-            os.unlink(config_file.name)
+                for k in sorted(locs.keys()):
+                    outf.write(locs[k])
+                outf.write("</urlset>")
 
         yield {
             "basename": "sitemap",
-            "name": os.path.join(kw['output_folder'], "sitemap.xml.gz"),
+            "name": sitemap_path,
             "targets": [sitemap_path],
             "actions": [(sitemap,)],
             "uptodate": [config_changed(kw)],
diff --git a/nikola/plugins/task_sitemap/sitemap_gen.py b/nikola/plugins/task_sitemap/sitemap_gen.py
deleted file mode 100644
index 898325a..0000000
--- a/nikola/plugins/task_sitemap/sitemap_gen.py
+++ /dev/null
@@ -1,2137 +0,0 @@
-#!/usr/bin/env python
-#
-# Copyright (c) 2004, 2005 Google Inc.
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in
-# the documentation and/or other materials provided with the
-# distribution.
-#
-# * Neither the name of Google nor the names of its contributors may
-# be used to endorse or promote products derived from this software
-# without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-#
-#
-# The sitemap_gen.py script is written in Python 2.2 and released to
-# the open source community for continuous improvements under the BSD
-# 2.0 new license, which can be found at:
-#
-# http://www.opensource.org/licenses/bsd-license.php
-#
-from __future__ import print_function
-
-__usage__ = \
- """A simple script to automatically produce sitemaps for a webserver,
-in the Google Sitemap Protocol (GSP).
-
-Usage: python sitemap_gen.py --config=config.xml [--help] [--testing]
- --config=config.xml, specifies config file location
- --help, displays usage message
- --testing, specified when user is experimenting
-"""
-
-import fnmatch
-import glob
-import gzip
-import os
-import re
-import stat
-import sys
-import time
-import urllib
-import xml.sax
-
-try:
- import md5
-except ImportError:
- md5 = None # NOQA
- import hashlib
-
-try:
- from urlparse import urlsplit, urlunsplit, urljoin
-except ImportError:
- from urllib.parse import urlsplit, urlunsplit, urljoin # NOQA
-
-try:
- from urllib import quote as urllib_quote
- from urllib import FancyURLopener
- from urllib import urlopen
-except ImportError:
- from urllib.parse import quote as urllib_quote # NOQA
- from urllib.request import FancyURLopener # NOQA
- from urllib.request import urlopen # NOQA
-
-
-if sys.version_info[0] == 3:
- # Python 3
- bytes_str = bytes
- unicode_str = str
- unichr = chr
-else:
- bytes_str = str
- unicode_str = unicode # NOQA
-
-# Text encodings
-ENC_ASCII = 'ASCII'
-ENC_UTF8 = 'UTF-8'
-ENC_IDNA = 'IDNA'
-ENC_ASCII_LIST = ['ASCII', 'US-ASCII', 'US', 'IBM367', 'CP367', 'ISO646-US'
- 'ISO_646.IRV:1991', 'ISO-IR-6', 'ANSI_X3.4-1968',
- 'ANSI_X3.4-1986', 'CPASCII']
-ENC_DEFAULT_LIST = ['ISO-8859-1', 'ISO-8859-2', 'ISO-8859-5']
-
-# Available Sitemap types
-SITEMAP_TYPES = ['web', 'mobile', 'news']
-
-# General Sitemap tags
-GENERAL_SITEMAP_TAGS = ['loc', 'changefreq', 'priority', 'lastmod']
-
-# News specific tags
-NEWS_SPECIFIC_TAGS = ['keywords', 'publication_date', 'stock_tickers']
-
-# News Sitemap tags
-NEWS_SITEMAP_TAGS = GENERAL_SITEMAP_TAGS + NEWS_SPECIFIC_TAGS
-
-# Maximum number of urls in each sitemap, before next Sitemap is created
-MAXURLS_PER_SITEMAP = 50000
-
-# Suffix on a Sitemap index file
-SITEINDEX_SUFFIX = '_index.xml'
-
-# Regular expressions tried for extracting URLs from access logs.
-ACCESSLOG_CLF_PATTERN = re.compile(
- r'.+\s+"([^\s]+)\s+([^\s]+)\s+HTTP/\d+\.\d+"\s+200\s+.*'
-)
-
-# Match patterns for lastmod attributes
-DATE_PATTERNS = list(map(re.compile, [
- r'^\d\d\d\d$',
- r'^\d\d\d\d-\d\d$',
- r'^\d\d\d\d-\d\d-\d\d$',
- r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\dZ$',
- r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\d[+-]\d\d:\d\d$',
- r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?Z$',
- r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?[+-]\d\d:\d\d$',
-]))
-
-# Match patterns for changefreq attributes
-CHANGEFREQ_PATTERNS = [
- 'always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'
-]
-
-# XML formats
-GENERAL_SITEINDEX_HEADER = \
- '<?xml version="1.0" encoding="UTF-8"?>\n' \
- '<sitemapindex\n' \
- ' xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"\n' \
- ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n' \
- ' xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9\n' \
- ' http://www.sitemaps.org/schemas/sitemap/0.9/' \
- 'siteindex.xsd">\n'
-
-NEWS_SITEINDEX_HEADER = \
- '<?xml version="1.0" encoding="UTF-8"?>\n' \
- '<sitemapindex\n' \
- ' xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"\n' \
- ' xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"\n' \
- ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n' \
- ' xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9\n' \
- ' http://www.sitemaps.org/schemas/sitemap/0.9/' \
- 'siteindex.xsd">\n'
-
-SITEINDEX_FOOTER = '</sitemapindex>\n'
-SITEINDEX_ENTRY = \
- ' <sitemap>\n' \
- ' <loc>%(loc)s</loc>\n' \
- ' <lastmod>%(lastmod)s</lastmod>\n' \
- ' </sitemap>\n'
-GENERAL_SITEMAP_HEADER = \
- '<?xml version="1.0" encoding="UTF-8"?>\n' \
- '<urlset\n' \
- ' xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"\n' \
- ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n' \
- ' xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9\n' \
- ' http://www.sitemaps.org/schemas/sitemap/0.9/' \
- 'sitemap.xsd">\n'
-
-NEWS_SITEMAP_HEADER = \
- '<?xml version="1.0" encoding="UTF-8"?>\n' \
- '<urlset\n' \
- ' xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"\n' \
- ' xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"\n' \
- ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n' \
- ' xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9\n' \
- ' http://www.sitemaps.org/schemas/sitemap/0.9/' \
- 'sitemap.xsd">\n'
-
-SITEMAP_FOOTER = '</urlset>\n'
-SITEURL_XML_PREFIX = ' <url>\n'
-SITEURL_XML_SUFFIX = ' </url>\n'
-
-NEWS_TAG_XML_PREFIX = ' <news:news>\n'
-NEWS_TAG_XML_SUFFIX = ' </news:news>\n'
-
-# Search engines to notify with the updated sitemaps
-#
-# This list is very non-obvious in what's going on. Here's the gist:
-# Each item in the list is a 6-tuple of items. The first 5 are "almost"
-# the same as the input arguments to urlparse.urlunsplit():
-# 0 - schema
-# 1 - netloc
-# 2 - path
-# 3 - query <-- EXCEPTION: specify a query map rather than a string
-# 4 - fragment
-# Additionally, add item 5:
-# 5 - query attribute that should be set to the new Sitemap URL
-# Clear as mud, I know.
-NOTIFICATION_SITES = [
- ('http', 'www.google.com', 'webmasters/sitemaps/ping', {}, '', 'sitemap'),
-]
-
-
-def get_hash(text):
- if md5 is not None:
- return md5.new(text).digest()
- else:
- m = hashlib.md5()
- m.update(text.encode('utf8'))
- return m.digest()
-
-
-class Error(Exception):
- """
- Base exception class. In this module we tend not to use our own exception
- types for very much, but they come in very handy on XML parsing with SAX.
- """
- pass
-# end class Error
-
-
-class SchemaError(Error):
- """Failure to process an XML file according to the schema we know."""
- pass
-# end class SchemeError
-
-
-class Encoder:
- """
- Manages wide-character/narrow-character conversions for just about all
- text that flows into or out of the script.
-
- You should always use this class for string coercion, as opposed to
- letting Python handle coercions automatically. Reason: Python
- usually assumes ASCII (7-bit) as a default narrow character encoding,
- which is not the kind of data we generally deal with.
-
- General high-level methodologies used in sitemap_gen:
-
- [PATHS]
- File system paths may be wide or narrow, depending on platform.
- This works fine, just be aware of it and be very careful to not
- mix them. That is, if you have to pass several file path arguments
- into a library call, make sure they are all narrow or all wide.
- This class has MaybeNarrowPath() which should be called on every
- file system path you deal with.
-
- [URLS]
- URL locations are stored in Narrow form, already escaped. This has the
- benefit of keeping escaping and encoding as close as possible to the format
- we read them in. The downside is we may end up with URLs that have
- intermingled encodings -- the root path may be encoded in one way
- while the filename is encoded in another. This is obviously wrong, but
- it should hopefully be an issue hit by very few users. The workaround
- from the user level (assuming they notice) is to specify a default_encoding
- parameter in their config file.
-
- [OTHER]
- Other text, such as attributes of the URL class, configuration options,
- etc, are generally stored in Unicode for simplicity.
- """
-
- def __init__(self):
- self._user = None # User-specified default encoding
- self._learned = [] # Learned default encodings
- self._widefiles = False # File system can be wide
-
- # Can the file system be Unicode?
- try:
- self._widefiles = os.path.supports_unicode_filenames
- except AttributeError:
- try:
- self._widefiles = sys.getwindowsversion(
- ) == os.VER_PLATFORM_WIN32_NT
- except AttributeError:
- pass
-
- # Try to guess a working default
- try:
- encoding = sys.getfilesystemencoding()
- if encoding and not (encoding.upper() in ENC_ASCII_LIST):
- self._learned = [encoding]
- except AttributeError:
- pass
-
- if not self._learned:
- encoding = sys.getdefaultencoding()
- if encoding and not (encoding.upper() in ENC_ASCII_LIST):
- self._learned = [encoding]
-
- # If we had no guesses, start with some European defaults
- if not self._learned:
- self._learned = ENC_DEFAULT_LIST
- # end def __init__
-
- def SetUserEncoding(self, encoding):
- self._user = encoding
- # end def SetUserEncoding
-
- def NarrowText(self, text, encoding):
- """ Narrow a piece of arbitrary text """
- if isinstance(text, bytes_str):
- return text
-
- # Try the passed in preference
- if encoding:
- try:
- result = text.encode(encoding)
- if not encoding in self._learned:
- self._learned.append(encoding)
- return result
- except UnicodeError:
- pass
- except LookupError:
- output.Warn('Unknown encoding: %s' % encoding)
-
- # Try the user preference
- if self._user:
- try:
- return text.encode(self._user)
- except UnicodeError:
- pass
- except LookupError:
- temp = self._user
- self._user = None
- output.Warn('Unknown default_encoding: %s' % temp)
-
- # Look through learned defaults, knock any failing ones out of the list
- while self._learned:
- try:
- return text.encode(self._learned[0])
- except:
- del self._learned[0]
-
- # When all other defaults are exhausted, use UTF-8
- try:
- return text.encode(ENC_UTF8)
- except UnicodeError:
- pass
-
- # Something is seriously wrong if we get to here
- return text.encode(ENC_ASCII, 'ignore')
- # end def NarrowText
-
- def MaybeNarrowPath(self, text):
- """ Paths may be allowed to stay wide """
- if self._widefiles:
- return text
- return self.NarrowText(text, None)
- # end def MaybeNarrowPath
-
- def WidenText(self, text, encoding):
- """ Widen a piece of arbitrary text """
- if not isinstance(text, bytes_str):
- return text
-
- # Try the passed in preference
- if encoding:
- try:
- result = unicode_str(text, encoding)
- if not encoding in self._learned:
- self._learned.append(encoding)
- return result
- except UnicodeError:
- pass
- except LookupError:
- output.Warn('Unknown encoding: %s' % encoding)
-
- # Try the user preference
- if self._user:
- try:
- return unicode_str(text, self._user)
- except UnicodeError:
- pass
- except LookupError:
- temp = self._user
- self._user = None
- output.Warn('Unknown default_encoding: %s' % temp)
-
- # Look through learned defaults, knock any failing ones out of the list
- while self._learned:
- try:
- return unicode_str(text, self._learned[0])
- except:
- del self._learned[0]
-
- # When all other defaults are exhausted, use UTF-8
- try:
- return unicode_str(text, ENC_UTF8)
- except UnicodeError:
- pass
-
- # Getting here means it wasn't UTF-8 and we had no working default.
- # We really don't have anything "right" we can do anymore.
- output.Warn('Unrecognized encoding in text: %s' % text)
- if not self._user:
- output.Warn('You may need to set a default_encoding in your '
- 'configuration file.')
- return text.decode(ENC_ASCII, 'ignore')
- # end def WidenText
-# end class Encoder
-encoder = Encoder()
-
-
-class Output:
- """
- Exposes logging functionality, and tracks how many errors
- we have thus output.
-
- Logging levels should be used as thus:
- Fatal -- extremely sparingly
- Error -- config errors, entire blocks of user 'intention' lost
- Warn -- individual URLs lost
- Log(,0) -- Un-suppressable text that's not an error
- Log(,1) -- touched files, major actions
- Log(,2) -- parsing notes, filtered or duplicated URLs
- Log(,3) -- each accepted URL
- """
-
- def __init__(self):
- self.num_errors = 0 # Count of errors
- self.num_warns = 0 # Count of warnings
-
- self._errors_shown = {} # Shown errors
- self._warns_shown = {} # Shown warnings
- self._verbose = 0 # Level of verbosity
- # end def __init__
-
- def Log(self, text, level):
- """ Output a blurb of diagnostic text, if the verbose level allows it """
- if text:
- text = encoder.NarrowText(text, None)
- if self._verbose >= level:
- print(text)
- # end def Log
-
- def Warn(self, text):
- """ Output and count a warning. Suppress duplicate warnings. """
- if text:
- text = encoder.NarrowText(text, None)
- hash = get_hash(text)
- if not hash in self._warns_shown:
- self._warns_shown[hash] = 1
- print('[WARNING] ' + text)
- else:
- self.Log('(suppressed) [WARNING] ' + text, 3)
- self.num_warns = self.num_warns + 1
- # end def Warn
-
- def Error(self, text):
- """ Output and count an error. Suppress duplicate errors. """
- if text:
- text = encoder.NarrowText(text, None)
- hash = get_hash(text)
- if not hash in self._errors_shown:
- self._errors_shown[hash] = 1
- print('[ERROR] ' + text)
- else:
- self.Log('(suppressed) [ERROR] ' + text, 3)
- self.num_errors = self.num_errors + 1
- # end def Error
-
- def Fatal(self, text):
- """ Output an error and terminate the program. """
- if text:
- text = encoder.NarrowText(text, None)
- print('[FATAL] ' + text)
- else:
- print('Fatal error.')
- sys.exit(1)
- # end def Fatal
-
- def SetVerbose(self, level):
- """ Sets the verbose level. """
- try:
- if not isinstance(level, int):
- level = int(level)
- if (level >= 0) and (level <= 3):
- self._verbose = level
- return
- except ValueError:
- pass
- self.Error(
- 'Verbose level (%s) must be between 0 and 3 inclusive.' % level)
- # end def SetVerbose
-# end class Output
-output = Output()
-
-
-class URL(object):
- """ URL is a smart structure grouping together the properties we
- care about for a single web reference. """
- __slots__ = 'loc', 'lastmod', 'changefreq', 'priority'
-
- def __init__(self):
- self.loc = None # URL -- in Narrow characters
- self.lastmod = None # ISO8601 timestamp of last modify
- self.changefreq = None # Text term for update frequency
- self.priority = None # Float between 0 and 1 (inc)
- # end def __init__
-
- def __cmp__(self, other):
- if self.loc < other.loc:
- return -1
- if self.loc > other.loc:
- return 1
- return 0
- # end def __cmp__
-
- def TrySetAttribute(self, attribute, value):
- """ Attempt to set the attribute to the value, with a pretty try
- block around it. """
- if attribute == 'loc':
- self.loc = self.Canonicalize(value)
- else:
- try:
- setattr(self, attribute, value)
- except AttributeError:
- output.Warn('Unknown URL attribute: %s' % attribute)
- # end def TrySetAttribute
-
- def IsAbsolute(loc):
- """ Decide if the URL is absolute or not """
- if not loc:
- return False
- narrow = encoder.NarrowText(loc, None)
- (scheme, netloc, path, query, frag) = urlsplit(narrow)
- if (not scheme) or (not netloc):
- return False
- return True
- # end def IsAbsolute
- IsAbsolute = staticmethod(IsAbsolute)
-
- def Canonicalize(loc):
- """ Do encoding and canonicalization on a URL string """
- if not loc:
- return loc
-
- # Let the encoder try to narrow it
- narrow = encoder.NarrowText(loc, None)
-
- # Escape components individually
- (scheme, netloc, path, query, frag) = urlsplit(narrow)
- unr = '-._~'
- sub = '!$&\'()*+,;='
- netloc = urllib_quote(netloc, unr + sub + '%:@/[]')
- path = urllib_quote(path, unr + sub + '%:@/')
- query = urllib_quote(query, unr + sub + '%:@/?')
- frag = urllib_quote(frag, unr + sub + '%:@/?')
-
- # Try built-in IDNA encoding on the netloc
- try:
- (ignore, widenetloc, ignore, ignore, ignore) = urlsplit(loc)
- for c in widenetloc:
- if c >= unichr(128):
- netloc = widenetloc.encode(ENC_IDNA)
- netloc = urllib_quote(netloc, unr + sub + '%:@/[]')
- break
- except UnicodeError:
- # urlsplit must have failed, based on implementation differences in the
- # library. There is not much we can do here, except ignore it.
- pass
- except LookupError:
- output.Warn('An International Domain Name (IDN) is being used, but this '
- 'version of Python does not have support for IDNA encoding. '
- ' (IDNA support was introduced in Python 2.3) The encoding '
- 'we have used instead is wrong and will probably not yield '
- 'valid URLs.')
- bad_netloc = False
- if '%' in netloc:
- bad_netloc = True
-
- # Put it all back together
- narrow = urlunsplit((scheme, netloc, path, query, frag))
-
- # I let '%' through. Fix any that aren't pre-existing escapes.
- HEXDIG = '0123456789abcdefABCDEF'
- list = narrow.split('%')
- narrow = list[0]
- del list[0]
- for item in list:
- if (len(item) >= 2) and (item[0] in HEXDIG) and (item[1] in HEXDIG):
- narrow = narrow + '%' + item
- else:
- narrow = narrow + '%25' + item
-
- # Issue a warning if this is a bad URL
- if bad_netloc:
- output.Warn('Invalid characters in the host or domain portion of a URL: '
- + narrow)
-
- return narrow
- # end def Canonicalize
- Canonicalize = staticmethod(Canonicalize)
-
- def VerifyDate(self, date, metatag):
- """Verify the date format is valid"""
- match = False
- if date:
- date = date.upper()
- for pattern in DATE_PATTERNS:
- match = pattern.match(date)
- if match:
- return True
- if not match:
- output.Warn('The value for %s does not appear to be in ISO8601 '
- 'format on URL: %s' % (metatag, self.loc))
- return False
- # end of VerifyDate
-
- def Validate(self, base_url, allow_fragment):
- """ Verify the data in this URL is well-formed, and override if not. """
- assert isinstance(base_url, bytes_str)
-
- # Test (and normalize) the ref
- if not self.loc:
- output.Warn('Empty URL')
- return False
- if allow_fragment:
- self.loc = urljoin(base_url, self.loc)
- if not self.loc.startswith(base_url):
- output.Warn('Discarded URL for not starting with the base_url: %s' %
- self.loc)
- self.loc = None
- return False
-
- # Test the lastmod
- if self.lastmod:
- if not self.VerifyDate(self.lastmod, "lastmod"):
- self.lastmod = None
-
- # Test the changefreq
- if self.changefreq:
- match = False
- self.changefreq = self.changefreq.lower()
- for pattern in CHANGEFREQ_PATTERNS:
- if self.changefreq == pattern:
- match = True
- break
- if not match:
- output.Warn('Changefreq "%s" is not a valid change frequency on URL '
- ': %s' % (self.changefreq, self.loc))
- self.changefreq = None
-
- # Test the priority
- if self.priority:
- priority = -1.0
- try:
- priority = float(self.priority)
- except ValueError:
- pass
- if (priority < 0.0) or (priority > 1.0):
- output.Warn('Priority "%s" is not a number between 0 and 1 inclusive '
- 'on URL: %s' % (self.priority, self.loc))
- self.priority = None
-
- return True
- # end def Validate
-
- def MakeHash(self):
- """ Provides a uniform way of hashing URLs """
- if not self.loc:
- return None
- if self.loc.endswith('/'):
- return get_hash(self.loc[:-1])
- return get_hash(self.loc)
- # end def MakeHash
-
- def Log(self, prefix='URL', level=3):
- """ Dump the contents, empty or not, to the log. """
- out = prefix + ':'
-
- for attribute in self.__slots__:
- value = getattr(self, attribute)
- if not value:
- value = ''
- out = out + (' %s=[%s]' % (attribute, value))
-
- output.Log('%s' % encoder.NarrowText(out, None), level)
- # end def Log
-
- def WriteXML(self, file):
- """ Dump non-empty contents to the output file, in XML format. """
- if not self.loc:
- return
- out = SITEURL_XML_PREFIX
-
- for attribute in self.__slots__:
- value = getattr(self, attribute)
- if value:
- if isinstance(value, unicode_str):
- value = encoder.NarrowText(value, None)
- elif not isinstance(value, bytes_str):
- value = str(value)
- value = xml.sax.saxutils.escape(value)
- out = out + (' <%s>%s</%s>\n' % (attribute, value, attribute))
-
- out = out + SITEURL_XML_SUFFIX
- file.write(out)
- # end def WriteXML
-# end class URL
-
-
-class NewsURL(URL):
- """ NewsURL is a subclass of URL with News-Sitemap specific properties. """
- __slots__ = 'loc', 'lastmod', 'changefreq', 'priority', 'publication_date', \
- 'keywords', 'stock_tickers'
-
- def __init__(self):
- URL.__init__(self)
- self.publication_date = None # ISO8601 timestamp of publication date
- self.keywords = None # Text keywords
- self.stock_tickers = None # Text stock
- # end def __init__
-
- def Validate(self, base_url, allow_fragment):
- """ Verify the data in this News URL is well-formed, and override if not. """
- assert isinstance(base_url, bytes_str)
-
- if not URL.Validate(self, base_url, allow_fragment):
- return False
-
- if not URL.VerifyDate(self, self.publication_date, "publication_date"):
- self.publication_date = None
-
- return True
- # end def Validate
-
- def WriteXML(self, file):
- """ Dump non-empty contents to the output file, in XML format. """
- if not self.loc:
- return
- out = SITEURL_XML_PREFIX
-
- # printed_news_tag indicates if news-specific metatags are present
- printed_news_tag = False
- for attribute in self.__slots__:
- value = getattr(self, attribute)
- if value:
- if isinstance(value, unicode_str):
- value = encoder.NarrowText(value, None)
- elif not isinstance(value, bytes_str):
- value = str(value)
- value = xml.sax.saxutils.escape(value)
- if attribute in NEWS_SPECIFIC_TAGS:
- if not printed_news_tag:
- printed_news_tag = True
- out = out + NEWS_TAG_XML_PREFIX
- out = out + (' <news:%s>%s</news:%s>\n' %
- (attribute, value, attribute))
- else:
- out = out + (' <%s>%s</%s>\n' % (
- attribute, value, attribute))
-
- if printed_news_tag:
- out = out + NEWS_TAG_XML_SUFFIX
- out = out + SITEURL_XML_SUFFIX
- file.write(out)
- # end def WriteXML
-# end class NewsURL
-
-
-class Filter:
- """
- A filter on the stream of URLs we find. A filter is, in essence,
- a wildcard applied to the stream. You can think of this as an
- operator that returns a tri-state when given a URL:
-
- True -- this URL is to be included in the sitemap
- None -- this URL is undecided
- False -- this URL is to be dropped from the sitemap
- """
-
- def __init__(self, attributes):
- self._wildcard = None # Pattern for wildcard match
- self._regexp = None # Pattern for regexp match
- self._pass = False # "Drop" filter vs. "Pass" filter
-
- if not ValidateAttributes('FILTER', attributes,
- ('pattern', 'type', 'action')):
- return
-
- # Check error count on the way in
- num_errors = output.num_errors
-
- # Fetch the attributes
- pattern = attributes.get('pattern')
- type = attributes.get('type', 'wildcard')
- action = attributes.get('action', 'drop')
- if type:
- type = type.lower()
- if action:
- action = action.lower()
-
- # Verify the attributes
- if not pattern:
- output.Error('On a filter you must specify a "pattern" to match')
- elif (not type) or ((type != 'wildcard') and (type != 'regexp')):
- output.Error('On a filter you must specify either \'type="wildcard"\' '
- 'or \'type="regexp"\'')
- elif (action != 'pass') and (action != 'drop'):
- output.Error('If you specify a filter action, it must be either '
- '\'action="pass"\' or \'action="drop"\'')
-
- # Set the rule
- if action == 'drop':
- self._pass = False
- elif action == 'pass':
- self._pass = True
-
- if type == 'wildcard':
- self._wildcard = pattern
- elif type == 'regexp':
- try:
- self._regexp = re.compile(pattern)
- except re.error:
- output.Error('Bad regular expression: %s' % pattern)
-
- # Log the final results iff we didn't add any errors
- if num_errors == output.num_errors:
- output.Log('Filter: %s any URL that matches %s "%s"' %
- (action, type, pattern), 2)
- # end def __init__
-
- def Apply(self, url):
- """ Process the URL, as above. """
- if (not url) or (not url.loc):
- return None
-
- if self._wildcard:
- if fnmatch.fnmatchcase(url.loc, self._wildcard):
- return self._pass
- return None
-
- if self._regexp:
- if self._regexp.search(url.loc):
- return self._pass
- return None
-
- assert False # unreachable
- # end def Apply
-# end class Filter
-
-
-class InputURL:
- """
- Each Input class knows how to yield a set of URLs from a data source.
-
- This one handles a single URL, manually specified in the config file.
- """
-
- def __init__(self, attributes):
- self._url = None # The lonely URL
-
- if not ValidateAttributes('URL', attributes,
- ('href', 'lastmod', 'changefreq', 'priority')):
- return
-
- url = URL()
- for attr in attributes.keys():
- if attr == 'href':
- url.TrySetAttribute('loc', attributes[attr])
- else:
- url.TrySetAttribute(attr, attributes[attr])
-
- if not url.loc:
- output.Error('Url entries must have an href attribute.')
- return
-
- self._url = url
- output.Log('Input: From URL "%s"' % self._url.loc, 2)
- # end def __init__
-
- def ProduceURLs(self, consumer):
- """ Produces URLs from our data source, hands them in to the consumer. """
- if self._url:
- consumer(self._url, True)
- # end def ProduceURLs
-# end class InputURL
-
-
-class InputURLList:
- """
- Each Input class knows how to yield a set of URLs from a data source.
-
- This one handles a text file with a list of URLs
- """
-
- def __init__(self, attributes):
- self._path = None # The file path
- self._encoding = None # Encoding of that file
-
- if not ValidateAttributes('URLLIST', attributes, ('path', 'encoding')):
- return
-
- self._path = attributes.get('path')
- self._encoding = attributes.get('encoding', ENC_UTF8)
- if self._path:
- self._path = encoder.MaybeNarrowPath(self._path)
- if os.path.isfile(self._path):
- output.Log('Input: From URLLIST "%s"' % self._path, 2)
- else:
- output.Error('Can not locate file: %s' % self._path)
- self._path = None
- else:
- output.Error('Urllist entries must have a "path" attribute.')
- # end def __init__
-
- def ProduceURLs(self, consumer):
- """ Produces URLs from our data source, hands them in to the consumer. """
-
- # Open the file
- (frame, file) = OpenFileForRead(self._path, 'URLLIST')
- if not file:
- return
-
- # Iterate lines
- linenum = 0
- for line in file.readlines():
- linenum = linenum + 1
-
- # Strip comments and empty lines
- if self._encoding:
- line = encoder.WidenText(line, self._encoding)
- line = line.strip()
- if (not line) or line[0] == '#':
- continue
-
- # Split the line on space
- url = URL()
- cols = line.split(' ')
- for i in range(0, len(cols)):
- cols[i] = cols[i].strip()
- url.TrySetAttribute('loc', cols[0])
-
- # Extract attributes from the other columns
- for i in range(1, len(cols)):
- if cols[i]:
- try:
- (attr_name, attr_val) = cols[i].split('=', 1)
- url.TrySetAttribute(attr_name, attr_val)
- except ValueError:
- output.Warn('Line %d: Unable to parse attribute: %s' %
- (linenum, cols[i]))
-
- # Pass it on
- consumer(url, False)
-
- file.close()
- if frame:
- frame.close()
- # end def ProduceURLs
-# end class InputURLList
-
-
-class InputNewsURLList:
- """
- Each Input class knows how to yield a set of URLs from a data source.
-
- This one handles a text file with a list of News URLs and their metadata
- """
-
- def __init__(self, attributes):
- self._path = None # The file path
- self._encoding = None # Encoding of that file
- self._tag_order = [] # Order of URL metadata
-
- if not ValidateAttributes('URLLIST', attributes, ('path', 'encoding', 'tag_order')):
- return
-
- self._path = attributes.get('path')
- self._encoding = attributes.get('encoding', ENC_UTF8)
- self._tag_order = attributes.get('tag_order')
-
- if self._path:
- self._path = encoder.MaybeNarrowPath(self._path)
- if os.path.isfile(self._path):
- output.Log('Input: From URLLIST "%s"' % self._path, 2)
- else:
- output.Error('Can not locate file: %s' % self._path)
- self._path = None
- else:
- output.Error('Urllist entries must have a "path" attribute.')
-
- # parse tag_order into an array
- # tag_order_ascii created for more readable logging
- tag_order_ascii = []
- if self._tag_order:
- self._tag_order = self._tag_order.split(",")
- for i in range(0, len(self._tag_order)):
- element = self._tag_order[i].strip().lower()
- self._tag_order[i] = element
- tag_order_ascii.append(element.encode('ascii'))
- output.Log(
- 'Input: From URLLIST tag order is "%s"' % tag_order_ascii, 0)
- else:
- output.Error('News Urllist configuration file must contain tag_order '
- 'to define Sitemap metatags.')
-
- # verify all tag_order inputs are valid
- tag_order_dict = {}
- for tag in self._tag_order:
- tag_order_dict[tag] = ""
- if not ValidateAttributes('URLLIST', tag_order_dict,
- NEWS_SITEMAP_TAGS):
- return
-
- # loc tag must be present
- loc_tag = False
- for tag in self._tag_order:
- if tag == 'loc':
- loc_tag = True
- break
- if not loc_tag:
- output.Error('News Urllist tag_order in configuration file '
- 'does not contain "loc" value: %s' % tag_order_ascii)
- # end def __init__
-
- def ProduceURLs(self, consumer):
- """ Produces URLs from our data source, hands them in to the consumer. """
-
- # Open the file
- (frame, file) = OpenFileForRead(self._path, 'URLLIST')
- if not file:
- return
-
- # Iterate lines
- linenum = 0
- for line in file.readlines():
- linenum = linenum + 1
-
- # Strip comments and empty lines
- if self._encoding:
- line = encoder.WidenText(line, self._encoding)
- line = line.strip()
- if (not line) or line[0] == '#':
- continue
-
- # Split the line on tabs
- url = NewsURL()
- cols = line.split('\t')
- for i in range(0, len(cols)):
- cols[i] = cols[i].strip()
-
- for i in range(0, len(cols)):
- if cols[i]:
- attr_value = cols[i]
- if i < len(self._tag_order):
- attr_name = self._tag_order[i]
- try:
- url.TrySetAttribute(attr_name, attr_value)
- except ValueError:
- output.Warn('Line %d: Unable to parse attribute: %s' %
- (linenum, cols[i]))
-
- # Pass it on
- consumer(url, False)
-
- file.close()
- if frame:
- frame.close()
- # end def ProduceURLs
-# end class InputNewsURLList
-
-
-class InputDirectory:
- """
- Each Input class knows how to yield a set of URLs from a data source.
-
- This one handles a directory that acts as base for walking the filesystem.
- """
-
- def __init__(self, attributes, base_url):
- self._path = None # The directory
- self._url = None # The URL equivalent
- self._default_file = None
- self._remove_empty_directories = False
-
- if not ValidateAttributes('DIRECTORY', attributes, ('path', 'url',
- 'default_file', 'remove_empty_directories')):
- return
-
- # Prep the path -- it MUST end in a sep
- path = attributes.get('path')
- if not path:
- output.Error('Directory entries must have both "path" and "url" '
- 'attributes')
- return
- path = encoder.MaybeNarrowPath(path)
- if not path.endswith(os.sep):
- path = path + os.sep
- if not os.path.isdir(path):
- output.Error('Can not locate directory: %s' % path)
- return
-
- # Prep the URL -- it MUST end in a sep
- url = attributes.get('url')
- if not url:
- output.Error('Directory entries must have both "path" and "url" '
- 'attributes')
- return
- url = URL.Canonicalize(url)
- if not url.endswith('/'):
- url = url + '/'
- if not url.startswith(base_url):
- url = urljoin(base_url, url)
- if not url.startswith(base_url):
- output.Error('The directory URL "%s" is not relative to the '
- 'base_url: %s' % (url, base_url))
- return
-
- # Prep the default file -- it MUST be just a filename
- file = attributes.get('default_file')
- if file:
- file = encoder.MaybeNarrowPath(file)
- if os.sep in file:
- output.Error('The default_file "%s" can not include path information.'
- % file)
- file = None
-
- # Prep the remove_empty_directories -- default is false
- remove_empty_directories = attributes.get('remove_empty_directories')
- if remove_empty_directories:
- if (remove_empty_directories == '1') or \
- (remove_empty_directories.lower() == 'true'):
- remove_empty_directories = True
- elif (remove_empty_directories == '0') or \
- (remove_empty_directories.lower() == 'false'):
- remove_empty_directories = False
- # otherwise the user set a non-default value
- else:
- output.Error('Configuration file remove_empty_directories '
- 'value is not recognized. Value must be true or false.')
- return
- else:
- remove_empty_directories = False
-
- self._path = path
- self._url = url
- self._default_file = file
- self._remove_empty_directories = remove_empty_directories
-
- if file:
- output.Log('Input: From DIRECTORY "%s" (%s) with default file "%s"'
- % (path, url, file), 2)
- else:
- output.Log('Input: From DIRECTORY "%s" (%s) with no default file'
- % (path, url), 2)
- # end def __init__
-
- def ProduceURLs(self, consumer):
- """ Produces URLs from our data source, hands them in to the consumer. """
- if not self._path:
- return
-
- root_path = self._path
- root_URL = self._url
- root_file = self._default_file
- remove_empty_directories = self._remove_empty_directories
-
- def HasReadPermissions(path):
- """ Verifies a given path has read permissions. """
- stat_info = os.stat(path)
- mode = stat_info[stat.ST_MODE]
- if mode & stat.S_IREAD:
- return True
- else:
- return None
-
- def PerFile(dirpath, name):
- """
- Called once per file.
- Note that 'name' will occasionally be None -- for a directory itself
- """
- # Pull a timestamp
- url = URL()
- isdir = False
- try:
- if name:
- path = os.path.join(dirpath, name)
- else:
- path = dirpath
- isdir = os.path.isdir(path)
- time = None
- if isdir and root_file:
- file = os.path.join(path, root_file)
- try:
- time = os.stat(file)[stat.ST_MTIME]
- except OSError:
- pass
- if not time:
- time = os.stat(path)[stat.ST_MTIME]
- url.lastmod = TimestampISO8601(time)
- except OSError:
- pass
- except ValueError:
- pass
-
- # Build a URL
- middle = dirpath[len(root_path):]
- if os.sep != '/':
- middle = middle.replace(os.sep, '/')
- if middle:
- middle = middle + '/'
- if name:
- middle = middle + name
- if isdir:
- middle = middle + '/'
- url.TrySetAttribute(
- 'loc', root_URL + encoder.WidenText(middle, None))
-
- # Suppress default files. (All the way down here so we can log
- # it.)
- if name and (root_file == name):
- url.Log(prefix='IGNORED (default file)', level=2)
- return
-
- # Suppress directories when remove_empty_directories="true"
- try:
- if isdir:
- if HasReadPermissions(path):
- if remove_empty_directories == 'true' and \
- len(os.listdir(path)) == 0:
- output.Log(
- 'IGNORED empty directory %s' % str(path), level=1)
- return
- elif path == self._path:
- output.Error('IGNORED configuration file directory input %s due '
- 'to file permissions' % self._path)
- else:
- output.Log('IGNORED files within directory %s due to file '
- 'permissions' % str(path), level=0)
- except OSError:
- pass
- except ValueError:
- pass
-
- consumer(url, False)
- # end def PerFile
-
- def PerDirectory(ignore, dirpath, namelist):
- """
- Called once per directory with a list of all the contained files/dirs.
- """
- ignore = ignore # Avoid warnings of an unused parameter
-
- if not dirpath.startswith(root_path):
- output.Warn('Unable to decide what the root path is for directory: '
- '%s' % dirpath)
- return
-
- for name in namelist:
- PerFile(dirpath, name)
- # end def PerDirectory
-
- output.Log('Walking DIRECTORY "%s"' % self._path, 1)
- PerFile(self._path, None)
- os.path.walk(self._path, PerDirectory, None)
- # end def ProduceURLs
-# end class InputDirectory
-
-
-class InputAccessLog:
- """
- Each Input class knows how to yield a set of URLs from a data source.
-
- This one handles access logs. It's non-trivial in that we want to
- auto-detect log files in the Common Logfile Format (as used by Apache,
- for instance) and the Extended Log File Format (as used by IIS, for
- instance).
- """
-
- def __init__(self, attributes):
- self._path = None # The file path
- self._encoding = None # Encoding of that file
- self._is_elf = False # Extended Log File Format?
- self._is_clf = False # Common Logfile Format?
- self._elf_status = -1 # ELF field: '200'
- self._elf_method = -1 # ELF field: 'HEAD'
- self._elf_uri = -1 # ELF field: '/foo?bar=1'
- self._elf_urifrag1 = -1 # ELF field: '/foo'
- self._elf_urifrag2 = -1 # ELF field: 'bar=1'
-
- if not ValidateAttributes('ACCESSLOG', attributes, ('path', 'encoding')):
- return
-
- self._path = attributes.get('path')
- self._encoding = attributes.get('encoding', ENC_UTF8)
- if self._path:
- self._path = encoder.MaybeNarrowPath(self._path)
- if os.path.isfile(self._path):
- output.Log('Input: From ACCESSLOG "%s"' % self._path, 2)
- else:
- output.Error('Can not locate file: %s' % self._path)
- self._path = None
- else:
- output.Error('Accesslog entries must have a "path" attribute.')
- # end def __init__
-
- def RecognizeELFLine(self, line):
- """ Recognize the Fields directive that heads an ELF file """
- if not line.startswith('#Fields:'):
- return False
- fields = line.split(' ')
- del fields[0]
- for i in range(0, len(fields)):
- field = fields[i].strip()
- if field == 'sc-status':
- self._elf_status = i
- elif field == 'cs-method':
- self._elf_method = i
- elif field == 'cs-uri':
- self._elf_uri = i
- elif field == 'cs-uri-stem':
- self._elf_urifrag1 = i
- elif field == 'cs-uri-query':
- self._elf_urifrag2 = i
- output.Log('Recognized an Extended Log File Format file.', 2)
- return True
- # end def RecognizeELFLine
-
- def GetELFLine(self, line):
- """ Fetch the requested URL from an ELF line """
- fields = line.split(' ')
- count = len(fields)
-
- # Verify status was Ok
- if self._elf_status >= 0:
- if self._elf_status >= count:
- return None
- if not fields[self._elf_status].strip() == '200':
- return None
-
- # Verify method was HEAD or GET
- if self._elf_method >= 0:
- if self._elf_method >= count:
- return None
- if not fields[self._elf_method].strip() in ('HEAD', 'GET'):
- return None
-
- # Pull the full URL if we can
- if self._elf_uri >= 0:
- if self._elf_uri >= count:
- return None
- url = fields[self._elf_uri].strip()
- if url != '-':
- return url
-
- # Put together a fragmentary URL
- if self._elf_urifrag1 >= 0:
- if self._elf_urifrag1 >= count or self._elf_urifrag2 >= count:
- return None
- urlfrag1 = fields[self._elf_urifrag1].strip()
- urlfrag2 = None
- if self._elf_urifrag2 >= 0:
- urlfrag2 = fields[self._elf_urifrag2]
- if urlfrag1 and (urlfrag1 != '-'):
- if urlfrag2 and (urlfrag2 != '-'):
- urlfrag1 = urlfrag1 + '?' + urlfrag2
- return urlfrag1
-
- return None
- # end def GetELFLine
-
- def RecognizeCLFLine(self, line):
- """ Try to tokenize a logfile line according to CLF pattern and see if
- it works. """
- match = ACCESSLOG_CLF_PATTERN.match(line)
- recognize = match and (match.group(1) in ('HEAD', 'GET'))
- if recognize:
- output.Log('Recognized a Common Logfile Format file.', 2)
- return recognize
- # end def RecognizeCLFLine
-
- def GetCLFLine(self, line):
- """ Fetch the requested URL from a CLF line """
- match = ACCESSLOG_CLF_PATTERN.match(line)
- if match:
- request = match.group(1)
- if request in ('HEAD', 'GET'):
- return match.group(2)
- return None
- # end def GetCLFLine
-
- def ProduceURLs(self, consumer):
- """ Produces URLs from our data source, hands them in to the consumer. """
-
- # Open the file
- (frame, file) = OpenFileForRead(self._path, 'ACCESSLOG')
- if not file:
- return
-
- # Iterate lines
- for line in file.readlines():
- if self._encoding:
- line = encoder.WidenText(line, self._encoding)
- line = line.strip()
-
- # If we don't know the format yet, try them both
- if (not self._is_clf) and (not self._is_elf):
- self._is_elf = self.RecognizeELFLine(line)
- self._is_clf = self.RecognizeCLFLine(line)
-
- # Digest the line
- match = None
- if self._is_elf:
- match = self.GetELFLine(line)
- elif self._is_clf:
- match = self.GetCLFLine(line)
- if not match:
- continue
-
- # Pass it on
- url = URL()
- url.TrySetAttribute('loc', match)
- consumer(url, True)
-
- file.close()
- if frame:
- frame.close()
- # end def ProduceURLs
-# end class InputAccessLog
-
-
-class FilePathGenerator:
- """
- This class generates filenames in a series, upon request.
- You can request any iteration number at any time, you don't
- have to go in order.
-
- Example of iterations for '/path/foo.xml.gz':
- 0 --> /path/foo.xml.gz
- 1 --> /path/foo1.xml.gz
- 2 --> /path/foo2.xml.gz
- _index.xml --> /path/foo_index.xml
- """
-
- def __init__(self):
- self.is_gzip = False # Is this a GZIP file?
-
- self._path = None # '/path/'
- self._prefix = None # 'foo'
- self._suffix = None # '.xml.gz'
- # end def __init__
-
- def Preload(self, path):
- """ Splits up a path into forms ready for recombination. """
- path = encoder.MaybeNarrowPath(path)
-
- # Get down to a base name
- path = os.path.normpath(path)
- base = os.path.basename(path).lower()
- if not base:
- output.Error('Couldn\'t parse the file path: %s' % path)
- return False
- lenbase = len(base)
-
- # Recognize extension
- lensuffix = 0
- compare_suffix = ['.xml', '.xml.gz', '.gz']
- for suffix in compare_suffix:
- if base.endswith(suffix):
- lensuffix = len(suffix)
- break
- if not lensuffix:
- output.Error('The path "%s" doesn\'t end in a supported file '
- 'extension.' % path)
- return False
- self.is_gzip = suffix.endswith('.gz')
-
- # Split the original path
- lenpath = len(path)
- self._path = path[:lenpath - lenbase]
- self._prefix = path[lenpath - lenbase:lenpath - lensuffix]
- self._suffix = path[lenpath - lensuffix:]
-
- return True
- # end def Preload
-
- def GeneratePath(self, instance):
- """ Generates the iterations, as described above. """
- prefix = self._path + self._prefix
- if isinstance(instance, int):
- if instance:
- return '%s%d%s' % (prefix, instance, self._suffix)
- return prefix + self._suffix
- return prefix + instance
- # end def GeneratePath
-
- def GenerateURL(self, instance, root_url):
- """ Generates iterations, but as a URL instead of a path. """
- prefix = root_url + self._prefix
- retval = None
- if isinstance(instance, int):
- if instance:
- retval = '%s%d%s' % (prefix, instance, self._suffix)
- else:
- retval = prefix + self._suffix
- else:
- retval = prefix + instance
- return URL.Canonicalize(retval)
- # end def GenerateURL
-
- def GenerateWildURL(self, root_url):
- """ Generates a wildcard that should match all our iterations """
- prefix = URL.Canonicalize(root_url + self._prefix)
- temp = URL.Canonicalize(prefix + self._suffix)
- suffix = temp[len(prefix):]
- return prefix + '*' + suffix
- # end def GenerateURL
-# end class FilePathGenerator
-
-
-class PerURLStatistics:
- """ Keep track of some simple per-URL statistics, like file extension. """
-
- def __init__(self):
- self._extensions = {} # Count of extension instances
- # end def __init__
-
- def Consume(self, url):
- """ Log some stats for the URL. At the moment, that means extension. """
- if url and url.loc:
- (scheme, netloc, path, query, frag) = urlsplit(url.loc)
- if not path:
- return
-
- # Recognize directories
- if path.endswith('/'):
- if '/' in self._extensions:
- self._extensions['/'] = self._extensions['/'] + 1
- else:
- self._extensions['/'] = 1
- return
-
- # Strip to a filename
- i = path.rfind('/')
- if i >= 0:
- assert i < len(path)
- path = path[i:]
-
- # Find extension
- i = path.rfind('.')
- if i > 0:
- assert i < len(path)
- ext = path[i:].lower()
- if ext in self._extensions:
- self._extensions[ext] = self._extensions[ext] + 1
- else:
- self._extensions[ext] = 1
- else:
- if '(no extension)' in self._extensions:
- self._extensions['(no extension)'] = self._extensions[
- '(no extension)'] + 1
- else:
- self._extensions['(no extension)'] = 1
- # end def Consume
-
- def Log(self):
- """ Dump out stats to the output. """
- if len(self._extensions):
- output.Log('Count of file extensions on URLs:', 1)
- set = sorted(self._extensions.keys())
- for ext in set:
- output.Log(' %7d %s' % (self._extensions[ext], ext), 1)
- # end def Log
-
-
-class Sitemap(xml.sax.handler.ContentHandler):
- """
- This is the big workhorse class that processes your inputs and spits
- out sitemap files. It is built as a SAX handler for set up purposes.
- That is, it processes an XML stream to bring itself up.
- """
-
- def __init__(self, suppress_notify):
- xml.sax.handler.ContentHandler.__init__(self)
- self._filters = [] # Filter objects
- self._inputs = [] # Input objects
- self._urls = {} # Maps URLs to count of dups
- self._set = [] # Current set of URLs
- self._filegen = None # Path generator for output files
- self._wildurl1 = None # Sitemap URLs to filter out
- self._wildurl2 = None # Sitemap URLs to filter out
- self._sitemaps = 0 # Number of output files
- # We init _dup_max to 2 so the default priority is 0.5 instead of 1.0
- self._dup_max = 2 # Max number of duplicate URLs
- self._stat = PerURLStatistics() # Some simple stats
- self._in_site = False # SAX: are we in a Site node?
- self._in_Site_ever = False # SAX: were we ever in a Site?
-
- self._default_enc = None # Best encoding to try on URLs
- self._base_url = None # Prefix to all valid URLs
- self._store_into = None # Output filepath
- self._sitemap_type = None # Sitemap type (web, mobile or news)
- self._suppress = suppress_notify # Suppress notify of servers
- # end def __init__
-
- def ValidateBasicConfig(self):
- """ Verifies (and cleans up) the basic user-configurable options. """
- all_good = True
-
- if self._default_enc:
- encoder.SetUserEncoding(self._default_enc)
-
- # Canonicalize the base_url
- if all_good and not self._base_url:
- output.Error('A site needs a "base_url" attribute.')
- all_good = False
- if all_good and not URL.IsAbsolute(self._base_url):
- output.Error('The "base_url" must be absolute, not relative: %s' %
- self._base_url)
- all_good = False
- if all_good:
- self._base_url = URL.Canonicalize(self._base_url)
- if not self._base_url.endswith('/'):
- self._base_url = self._base_url + '/'
- output.Log('BaseURL is set to: %s' % self._base_url, 2)
-
- # Load store_into into a generator
- if all_good:
- if self._store_into:
- self._filegen = FilePathGenerator()
- if not self._filegen.Preload(self._store_into):
- all_good = False
- else:
- output.Error('A site needs a "store_into" attribute.')
- all_good = False
-
- # Ask the generator for patterns on what its output will look like
- if all_good:
- self._wildurl1 = self._filegen.GenerateWildURL(self._base_url)
- self._wildurl2 = self._filegen.GenerateURL(SITEINDEX_SUFFIX,
- self._base_url)
-
- # Unify various forms of False
- if all_good:
- if self._suppress:
- if (isinstance(self._suppress, bytes_str)) or (isinstance(self._suppress, unicode_str)):
- if (self._suppress == '0') or (self._suppress.lower() == 'false'):
- self._suppress = False
-
- # Clean up the sitemap_type
- if all_good:
- match = False
- # If sitemap_type is not specified, default to web sitemap
- if not self._sitemap_type:
- self._sitemap_type = 'web'
- else:
- self._sitemap_type = self._sitemap_type.lower()
- for pattern in SITEMAP_TYPES:
- if self._sitemap_type == pattern:
- match = True
- break
- if not match:
- output.Error('The "sitemap_type" value must be "web", "mobile" '
- 'or "news": %s' % self._sitemap_type)
- all_good = False
- output.Log('The Sitemap type is %s Sitemap.' %
- self._sitemap_type.upper(), 0)
-
- # Done
- if not all_good:
- output.Log('See "example_config.xml" for more information.', 0)
- return all_good
- # end def ValidateBasicConfig
-
- def Generate(self):
- """ Run over all the Inputs and ask them to Produce """
- # Run the inputs
- for input in self._inputs:
- input.ProduceURLs(self.ConsumeURL)
-
- # Do last flushes
- if len(self._set):
- self.FlushSet()
- if not self._sitemaps:
- output.Warn('No URLs were recorded, writing an empty sitemap.')
- self.FlushSet()
-
- # Write an index as needed
- if self._sitemaps > 1:
- self.WriteIndex()
-
- # Notify
- self.NotifySearch()
-
- # Dump stats
- self._stat.Log()
- # end def Generate
-
- def ConsumeURL(self, url, allow_fragment):
- """
- All per-URL processing comes together here, regardless of Input.
- Here we run filters, remove duplicates, spill to disk as needed, etc.
-
- """
- if not url:
- return
-
- # Validate
- if not url.Validate(self._base_url, allow_fragment):
- return
-
- # Run filters
- accept = None
- for filter in self._filters:
- accept = filter.Apply(url)
- if accept is not None:
- break
- if not (accept or (accept is None)):
- url.Log(prefix='FILTERED', level=2)
- return
-
- # Ignore our out output URLs
- if fnmatch.fnmatchcase(url.loc, self._wildurl1) or fnmatch.fnmatchcase(
- url.loc, self._wildurl2):
- url.Log(prefix='IGNORED (output file)', level=2)
- return
-
- # Note the sighting
- hash = url.MakeHash()
- if hash in self._urls:
- dup = self._urls[hash]
- if dup > 0:
- dup = dup + 1
- self._urls[hash] = dup
- if self._dup_max < dup:
- self._dup_max = dup
- url.Log(prefix='DUPLICATE')
- return
-
- # Acceptance -- add to set
- self._urls[hash] = 1
- self._set.append(url)
- self._stat.Consume(url)
- url.Log()
-
- # Flush the set if needed
- if len(self._set) >= MAXURLS_PER_SITEMAP:
- self.FlushSet()
- # end def ConsumeURL
-
- def FlushSet(self):
- """
- Flush the current set of URLs to the output. This is a little
- slow because we like to sort them all and normalize the priorities
- before dumping.
- """
-
- # Determine what Sitemap header to use (News or General)
- if self._sitemap_type == 'news':
- sitemap_header = NEWS_SITEMAP_HEADER
- else:
- sitemap_header = GENERAL_SITEMAP_HEADER
-
- # Sort and normalize
- output.Log('Sorting and normalizing collected URLs.', 1)
- self._set.sort()
- for url in self._set:
- hash = url.MakeHash()
- dup = self._urls[hash]
- if dup > 0:
- self._urls[hash] = -1
- if not url.priority:
- url.priority = '%.4f' % (float(dup) / float(self._dup_max))
-
- # Get the filename we're going to write to
- filename = self._filegen.GeneratePath(self._sitemaps)
- if not filename:
- output.Fatal('Unexpected: Couldn\'t generate output filename.')
- self._sitemaps = self._sitemaps + 1
- output.Log('Writing Sitemap file "%s" with %d URLs' %
- (filename, len(self._set)), 1)
-
- # Write to it
- frame = None
- file = None
-
- try:
- if self._filegen.is_gzip:
- basename = os.path.basename(filename)
- frame = open(filename, 'wb')
- file = gzip.GzipFile(
- fileobj=frame, filename=basename, mode='wt')
- else:
- file = open(filename, 'wt')
-
- file.write(sitemap_header)
- for url in self._set:
- url.WriteXML(file)
- file.write(SITEMAP_FOOTER)
-
- file.close()
- if frame:
- frame.close()
-
- frame = None
- file = None
- except IOError:
- output.Fatal('Couldn\'t write out to file: %s' % filename)
- os.chmod(filename, 0o0644)
-
- # Flush
- self._set = []
- # end def FlushSet
-
- def WriteIndex(self):
- """ Write the master index of all Sitemap files """
- # Make a filename
- filename = self._filegen.GeneratePath(SITEINDEX_SUFFIX)
- if not filename:
- output.Fatal(
- 'Unexpected: Couldn\'t generate output index filename.')
- output.Log('Writing index file "%s" with %d Sitemaps' %
- (filename, self._sitemaps), 1)
-
- # Determine what Sitemap index header to use (News or General)
- if self._sitemap_type == 'news':
- sitemap_index_header = NEWS_SITEMAP_HEADER
- else:
- sitemap_index_header = GENERAL_SITEMAP_HEADER
-
- # Make a lastmod time
- lastmod = TimestampISO8601(time.time())
-
- # Write to it
- try:
- fd = open(filename, 'wt')
- fd.write(sitemap_index_header)
-
- for mapnumber in range(0, self._sitemaps):
- # Write the entry
- mapurl = self._filegen.GenerateURL(mapnumber, self._base_url)
- mapattributes = {'loc': mapurl, 'lastmod': lastmod}
- fd.write(SITEINDEX_ENTRY % mapattributes)
-
- fd.write(SITEINDEX_FOOTER)
-
- fd.close()
- fd = None
- except IOError:
- output.Fatal('Couldn\'t write out to file: %s' % filename)
- os.chmod(filename, 0o0644)
- # end def WriteIndex
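SITEINDEX_ENTRY is a %-style template filled from a dict, one <sitemap> element per generated file. The real constant is defined earlier in this module; its shape is roughly the following sketch:

entry_template = (          # assumed shape, for illustration only
    '  <sitemap>\n'
    '    <loc>%(loc)s</loc>\n'
    '    <lastmod>%(lastmod)s</lastmod>\n'
    '  </sitemap>\n'
)
print(entry_template % {'loc': 'http://example.com/sitemap1.xml.gz',
                        'lastmod': '2013-05-30T17:41:06Z'})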
-
- def NotifySearch(self):
- """ Send notification of the new Sitemap(s) to the search engines. """
- if self._suppress:
- output.Log('Search engine notification is suppressed.', 1)
- return
-
- output.Log('Notifying search engines.', 1)
-
- # Override the urllib's opener class with one that doesn't ignore 404s
- class ExceptionURLopener(FancyURLopener):
- def http_error_default(self, url, fp, errcode, errmsg, headers):
- output.Log('HTTP error %d: %s' % (errcode, errmsg), 2)
- raise IOError
- # end def http_error_default
- # end class ExceptionURLopener
- if sys.version_info[0] == 3:
- old_opener = urllib.request._urlopener
- urllib.request._urlopener = ExceptionURLopener()
- else:
- old_opener = urllib._urlopener
- urllib._urlopener = ExceptionURLopener()
-
- # Build the URL we want to send in
- if self._sitemaps > 1:
- url = self._filegen.GenerateURL(SITEINDEX_SUFFIX, self._base_url)
- else:
- url = self._filegen.GenerateURL(0, self._base_url)
-
- # Test if we can hit it ourselves
- try:
- u = urlopen(url)
- u.close()
- except IOError:
- output.Error('When attempting to access our generated Sitemap at the '
- 'following URL:\n %s\n we failed to read it. Please '
- 'verify that the store_into path you specified in\n'
- ' your configuration file is web-accessible. Consult '
- 'the FAQ for more\n information.' % url)
- output.Warn('Proceeding to notify with an unverifiable URL.')
-
- # Cycle through notifications
- # To understand this, see the comment near the NOTIFICATION_SITES
- # comment
- for ping in NOTIFICATION_SITES:
- query_map = ping[3]
- query_attr = ping[5]
- query_map[query_attr] = url
- query = urllib.urlencode(query_map)
- notify = urlunsplit((ping[0], ping[1], ping[2], query, ping[4]))
-
- # Send the notification
- output.Log('Notifying: %s' % ping[1], 0)
- output.Log('Notification URL: %s' % notify, 2)
- try:
- u = urlopen(notify)
- u.read()
- u.close()
- except IOError:
- output.Warn('Cannot contact: %s' % ping[1])
-
- if old_opener:
- if sys.version_info[0] == 3:
- urllib.request._urlopener = old_opener
- else:
- urllib._urlopener = old_opener
- # end def NotifySearch
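Each NOTIFICATION_SITES entry is a six-field tuple: the first five are urlunsplit() parts, with the query expressed as a dict, and the sixth names the query attribute that receives the sitemap URL. A sketch of how one ping URL is assembled, against a made-up endpoint:

try:                                     # Python 2
    from urllib import urlencode
    from urlparse import urlunsplit
except ImportError:                      # Python 3
    from urllib.parse import urlencode, urlunsplit

# (scheme, netloc, path, query_map, fragment, attr_for_sitemap_url)
ping = ('http', 'ping.example.com', '/ping', {}, '', 'sitemap')  # hypothetical
query_map = ping[3]
query_map[ping[5]] = 'http://mysite.example/sitemap.xml'
notify = urlunsplit((ping[0], ping[1], ping[2], urlencode(query_map), ping[4]))
# notify == 'http://ping.example.com/ping?sitemap=http%3A%2F%2Fmysite.example%2Fsitemap.xml'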
-
- def startElement(self, tag, attributes):
- """ SAX processing, called per node in the config stream. """
- if tag == 'site':
- if self._in_site:
- output.Error('Cannot nest Site entries in the configuration.')
- else:
- self._in_site = True
-
- if not ValidateAttributes('SITE', attributes,
- ('verbose', 'default_encoding', 'base_url', 'store_into',
- 'suppress_search_engine_notify', 'sitemap_type')):
- return
-
- verbose = attributes.get('verbose', 0)
- if verbose:
- output.SetVerbose(verbose)
-
- self._default_enc = attributes.get('default_encoding')
- self._base_url = attributes.get('base_url')
- self._store_into = attributes.get('store_into')
- self._sitemap_type = attributes.get('sitemap_type')
- if not self._suppress:
- self._suppress = attributes.get(
- 'suppress_search_engine_notify',
- False)
- self.ValidateBasicConfig()
- elif tag == 'filter':
- self._filters.append(Filter(attributes))
-
- elif tag == 'url':
- self._inputs.append(InputURL(attributes))
-
- elif tag == 'urllist':
- for attributeset in ExpandPathAttribute(attributes, 'path'):
- if self._sitemap_type == 'news':
- self._inputs.append(InputNewsURLList(attributeset))
- else:
- self._inputs.append(InputURLList(attributeset))
-
- elif tag == 'directory':
- self._inputs.append(InputDirectory(attributes, self._base_url))
-
- elif tag == 'accesslog':
- for attributeset in ExpandPathAttribute(attributes, 'path'):
- self._inputs.append(InputAccessLog(attributeset))
- else:
- output.Error('Unrecognized tag in the configuration: %s' % tag)
- # end def startElement
-
- def endElement(self, tag):
- """ SAX processing, called per node in the config stream. """
- if tag == 'site':
- assert self._in_site
- self._in_site = False
- self._in_site_ever = True
- # end def endElement
-
- def endDocument(self):
- """ End of SAX, verify we can proceed. """
- if not self._in_site_ever:
- output.Error('The configuration must specify a "site" element.')
- else:
- if not self._inputs:
- output.Warn('There were no inputs to generate a sitemap from.')
- # end def endDocument
-# end class Sitemap
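Because Sitemap implements the startElement/endElement/endDocument hooks, it can be handed straight to xml.sax.parse() as the content handler (see CreateSitemapFromFile below). A toy handler showing the same contract:

import xml.sax

class CountingHandler(xml.sax.ContentHandler):
    """Counts element names via the same SAX hooks Sitemap implements."""

    def __init__(self):
        xml.sax.ContentHandler.__init__(self)
        self.counts = {}

    def startElement(self, tag, attributes):
        self.counts[tag] = self.counts.get(tag, 0) + 1

    def endDocument(self):
        print('saw elements: %s' % self.counts)

xml.sax.parseString(b'<site><url/><url/></site>', CountingHandler())
# -> saw elements: {'site': 1, 'url': 2}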
-
-
-def ValidateAttributes(tag, attributes, goodattributes):
- """ Makes sure 'attributes' does not contain any attribute not
- listed in 'goodattributes' """
- all_good = True
- for attr in attributes.keys():
- if attr not in goodattributes:
- output.Error('Unknown %s attribute: %s' % (tag, attr))
- all_good = False
- return all_good
-# end def ValidateAttributes
-
-
-def ExpandPathAttribute(src, attrib):
- """ Given a dictionary of attributes, return a list of dictionaries
- with all the same attributes except for the one named attrib.
- That one is treated as a file path and expanded, via glob, into
- all of its matching paths. """
- # Do the path expansion. On any error, just return the source dictionary.
- path = src.get(attrib)
- if not path:
- return [src]
- path = encoder.MaybeNarrowPath(path)
- pathlist = glob.glob(path)
- if not pathlist:
- return [src]
-
- # If this isn't actually a dictionary, make it one
- if not isinstance(src, dict):
- tmp = {}
- for key in src.keys():
- tmp[key] = src[key]
- src = tmp
- # Create N new dictionaries
- retval = []
- for path in pathlist:
- dst = src.copy()
- dst[attrib] = path
- retval.append(dst)
-
- return retval
-# end def ExpandPathAttribute
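In effect, a single <urllist path="urls*.txt"/> element becomes one input per matching file. The same expansion, condensed (attribute values hypothetical):

import glob

src = {'path': 'urls*.txt', 'encoding': 'UTF-8'}    # hypothetical attributes
matches = glob.glob(src['path']) or [src['path']]   # no match: keep the literal path
expanded = [dict(src, path=p) for p in matches]     # one shallow copy per match
# e.g. [{'path': 'urls1.txt', 'encoding': 'UTF-8'},
#       {'path': 'urls2.txt', 'encoding': 'UTF-8'}]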
-
-
-def OpenFileForRead(path, logtext):
- """ Opens a text file, be it GZip or plain """
-
- frame = None
- file = None
-
- if not path:
- return (frame, file)
-
- try:
- if path.endswith('.gz'):
- frame = open(path, 'rb')
- file = gzip.GzipFile(fileobj=frame, mode='r')  # 't' modes break GzipFile on Python 3
- else:
- file = open(path, 'rt')
-
- if logtext:
- output.Log('Opened %s file: %s' % (logtext, path), 1)
- else:
- output.Log('Opened file: %s' % path, 1)
- except IOError:
- output.Error('Cannot open file: %s' % path)
-
- return (frame, file)
-# end def OpenFileForRead
-
-
-def TimestampISO8601(t):
- """Seconds since epoch (1970-01-01) --> ISO 8601 time string."""
- return time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(t))
-# end def TimestampISO8601
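A quick check of the format it produces:

import time

print(time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(0)))
# -> 1970-01-01T00:00:00Z  (the trailing Z is correct because gmtime is UTC)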
-
-
-def CreateSitemapFromFile(configpath, suppress_notify):
- """ Sets up a new Sitemap object from the specified configuration file. """
-
- # Remember error count on the way in
- num_errors = output.num_errors
-
- # Rev up SAX to parse the config
- sitemap = Sitemap(suppress_notify)
- try:
- output.Log('Reading configuration file: %s' % configpath, 0)
- xml.sax.parse(configpath, sitemap)
- except IOError:
- output.Error('Cannot read configuration file: %s' % configpath)
- except xml.sax._exceptions.SAXParseException as e:
- output.Error('XML error in the config file (line %d, column %d): %s' %
- (e._linenum, e._colnum, e.getMessage()))
- except xml.sax._exceptions.SAXReaderNotAvailable:
- output.Error('Some installs of Python 2.2 did not include complete support'
- ' for XML.\n Please try upgrading your version of Python'
- ' and re-running the script.')
-
- # If we added any errors, return no sitemap
- if num_errors == output.num_errors:
- return sitemap
- return None
-# end def CreateSitemapFromFile
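Note the error-count sentinel: the function snapshots output.num_errors on entry and returns the sitemap only if nothing incremented the counter, signalling failure without exceptions. The pattern in isolation, with a hypothetical stand-in logger:

class Log(object):
    """Minimal stand-in for this module's 'output' logger."""
    num_errors = 0

    def Error(self, msg):
        self.num_errors += 1
        print('ERROR: %s' % msg)

log = Log()

def build(config_path):
    before = log.num_errors                # snapshot on the way in
    if not config_path.endswith('.xml'):
        log.Error('not an XML config: %s' % config_path)
    result = object()                      # stand-in for the parsed Sitemap
    return result if log.num_errors == before else None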
-
-
-def ProcessCommandFlags(args):
- """
- Parse command-line flags per the specified usage and pick off key/value pairs.
- Flags of the form "--key=value" are stored as flags[key] = value;
- bare "--option" flags are stored as flags[option] = option.
- """
-
- flags = {}
- rkeyval = r'--(?P<key>\S*)[=](?P<value>\S*)' # --key=val
- roption = r'--(?P<option>\S*)' # --key
- r = '(' + rkeyval + ')|(' + roption + ')'
- rc = re.compile(r)
- for a in args:
- try:
- rcg = rc.search(a).groupdict()
- # groupdict() always contains every named group, so test for None
- if rcg['key'] is not None:
- flags[rcg['key']] = rcg['value']
- if rcg['option'] is not None:
- flags[rcg['option']] = rcg['option']
- except AttributeError:
- return None
- return flags
-# end def ProcessCommandFlags
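With the None guards above, a typical invocation parses like this (file name hypothetical):

flags = ProcessCommandFlags(['--config=site.xml', '--testing'])
# -> {'config': 'site.xml', 'testing': 'testing'}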
-
-
-#
-# __main__
-#
-
-if __name__ == '__main__':
- flags = ProcessCommandFlags(sys.argv[1:])
- if not flags or 'config' not in flags or 'help' in flags:
- output.Log(__usage__, 0)
- else:
- suppress_notify = 'testing' in flags
- sitemap = CreateSitemapFromFile(flags['config'], suppress_notify)
- if not sitemap:
- output.Log('Configuration file errors -- exiting.', 0)
- else:
- sitemap.Generate()
- output.Log('Number of errors: %d' % output.num_errors, 1)
- output.Log('Number of warnings: %d' % output.num_warns, 1)