Diffstat (limited to 'nikola/plugins/task_sitemap/sitemap_gen.py')
| -rw-r--r-- (was -rwxr-xr-x) | nikola/plugins/task_sitemap/sitemap_gen.py | 3955 |
1 file changed, 1927 insertions, 2028 deletions
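The bulk of this patch replaces Python-2-only modules with guarded imports and small compatibility aliases so the script runs under both Python 2 and Python 3. A condensed, standalone sketch of that pattern follows; the names get_hash, bytes_str and unicode_str match the patch, while the surrounding scaffolding is illustrative only and not part of the change itself:

    import sys

    try:                        # Python 2: the legacy md5 module may exist
        import md5
    except ImportError:         # Python 3: fall back to hashlib
        md5 = None
        import hashlib

    try:                        # Python 2 module locations
        from urlparse import urlsplit, urlunsplit, urljoin
    except ImportError:         # Python 3 module locations
        from urllib.parse import urlsplit, urlunsplit, urljoin

    if sys.version_info[0] == 3:
        bytes_str = bytes       # "narrow" (byte) text
        unicode_str = str       # "wide" (unicode) text
        unichr = chr
    else:
        bytes_str = str
        unicode_str = unicode   # noqa: F821 -- only defined on Python 2


    def get_hash(text):
        """Digest helper used by the patch to suppress duplicate warnings/URLs."""
        if md5 is not None:
            return md5.new(text).digest()
        m = hashlib.md5()
        m.update(text.encode('utf8'))
        return m.digest()

The rest of the diff applies this same approach throughout: isinstance() checks against bytes_str/unicode_str replace the removed types module, and get_hash() replaces direct hashlib.md5() calls.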
diff --git a/nikola/plugins/task_sitemap/sitemap_gen.py b/nikola/plugins/task_sitemap/sitemap_gen.py index eef2b0b..a877c24 100755..100644 --- a/nikola/plugins/task_sitemap/sitemap_gen.py +++ b/nikola/plugins/task_sitemap/sitemap_gen.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# flake8: noqa # # Copyright (c) 2004, 2005 Google Inc. # All rights reserved. @@ -43,7 +42,7 @@ from __future__ import print_function __usage__ = \ -"""A simple script to automatically produce sitemaps for a webserver, + """A simple script to automatically produce sitemaps for a webserver, in the Google Sitemap Protocol (GSP). Usage: python sitemap_gen.py --config=config.xml [--help] [--testing] @@ -52,41 +51,68 @@ Usage: python sitemap_gen.py --config=config.xml [--help] [--testing] --testing, specified when user is experimenting """ -# Please be careful that all syntax used in this file can be parsed on -# Python 1.5 -- this version check is not evaluated until after the -# entire file has been parsed. -import sys -if sys.hexversion < 0x02020000: - print('This script requires Python 2.2 or later.') - print('Currently run with version: %s' % sys.version) - sys.exit(1) - import fnmatch import glob import gzip -import hashlib import os import re import stat +import sys import time -import types import urllib import xml.sax try: - from urlparse import urlparse, urlsplit, urlunsplit + import md5 except ImportError: - from urllib.parse import urlparse, urlsplit, urlunsplit + md5 = None # NOQA + import hashlib + +try: + from urlparse import urlsplit, urlunsplit, urljoin +except ImportError: + from urllib.parse import urlsplit, urlunsplit, urljoin # NOQA + +try: + from urllib import quote as urllib_quote + from urllib import FancyURLopener + from urllib import urlopen +except ImportError: + from urllib.parse import quote as urllib_quote # NOQA + from urllib.request import FancyURLopener # NOQA + from urllib.request import urlopen # NOQA + + +if sys.version_info[0] == 3: + # Python 3 + bytes_str = bytes + unicode_str = str + unichr = chr +else: + bytes_str = str + unicode_str = unicode # Text encodings ENC_ASCII = 'ASCII' -ENC_UTF8 = 'UTF-8' -ENC_IDNA = 'IDNA' +ENC_UTF8 = 'UTF-8' +ENC_IDNA = 'IDNA' ENC_ASCII_LIST = ['ASCII', 'US-ASCII', 'US', 'IBM367', 'CP367', 'ISO646-US' 'ISO_646.IRV:1991', 'ISO-IR-6', 'ANSI_X3.4-1968', - 'ANSI_X3.4-1986', 'CPASCII' ] + 'ANSI_X3.4-1986', 'CPASCII'] ENC_DEFAULT_LIST = ['ISO-8859-1', 'ISO-8859-2', 'ISO-8859-5'] +# Available Sitemap types +SITEMAP_TYPES = ['web', 'mobile', 'news'] + +# General Sitemap tags +GENERAL_SITEMAP_TAGS = ['loc', 'changefreq', 'priority', 'lastmod'] + +# News specific tags +NEWS_SPECIFIC_TAGS = ['keywords', 'publication_date', 'stock_tickers'] + +# News Sitemap tags +NEWS_SITEMAP_TAGS = GENERAL_SITEMAP_TAGS + NEWS_SPECIFIC_TAGS + # Maximum number of urls in each sitemap, before next Sitemap is created MAXURLS_PER_SITEMAP = 50000 @@ -95,53 +121,77 @@ SITEINDEX_SUFFIX = '_index.xml' # Regular expressions tried for extracting URLs from access logs. 
ACCESSLOG_CLF_PATTERN = re.compile( - r'.+\s+"([^\s]+)\s+([^\s]+)\s+HTTP/\d+\.\d+"\s+200\s+.*' - ) + r'.+\s+"([^\s]+)\s+([^\s]+)\s+HTTP/\d+\.\d+"\s+200\s+.*' +) # Match patterns for lastmod attributes -LASTMOD_PATTERNS = map(re.compile, [ - r'^\d\d\d\d$', - r'^\d\d\d\d-\d\d$', - r'^\d\d\d\d-\d\d-\d\d$', - r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\dZ$', - r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\d[+-]\d\d:\d\d$', - r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?Z$', - r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?[+-]\d\d:\d\d$', - ]) +DATE_PATTERNS = list(map(re.compile, [ + r'^\d\d\d\d$', + r'^\d\d\d\d-\d\d$', + r'^\d\d\d\d-\d\d-\d\d$', + r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\dZ$', + r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\d[+-]\d\d:\d\d$', + r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?Z$', + r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?[+-]\d\d:\d\d$', +])) # Match patterns for changefreq attributes CHANGEFREQ_PATTERNS = [ - 'always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never' - ] + 'always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never' +] # XML formats -SITEINDEX_HEADER = \ - '<?xml version="1.0" encoding="UTF-8"?>\n' \ - '<?xml-stylesheet type="text/xsl" href="gss.xsl"?>\n' \ - '<sitemapindex\n' \ - ' xmlns="http://www.google.com/schemas/sitemap/0.84"\n' \ - ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n' \ - ' xsi:schemaLocation="http://www.google.com/schemas/sitemap/0.84\n' \ - ' http://www.google.com/schemas/sitemap/0.84/' \ - 'siteindex.xsd">\n' -SITEINDEX_FOOTER = '</sitemapindex>\n' -SITEINDEX_ENTRY = \ - ' <sitemap>\n' \ - ' <loc>%(loc)s</loc>\n' \ - ' <lastmod>%(lastmod)s</lastmod>\n' \ - ' </sitemap>\n' -SITEMAP_HEADER = \ - '<?xml version="1.0" encoding="UTF-8"?>\n' \ - '<urlset\n' \ - ' xmlns="http://www.google.com/schemas/sitemap/0.84"\n' \ - ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n' \ - ' xsi:schemaLocation="http://www.google.com/schemas/sitemap/0.84\n' \ - ' http://www.google.com/schemas/sitemap/0.84/' \ - 'sitemap.xsd">\n' -SITEMAP_FOOTER = '</urlset>\n' +GENERAL_SITEINDEX_HEADER = \ + '<?xml version="1.0" encoding="UTF-8"?>\n' \ + '<sitemapindex\n' \ + ' xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"\n' \ + ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n' \ + ' xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9\n' \ + ' http://www.sitemaps.org/schemas/sitemap/0.9/' \ + 'siteindex.xsd">\n' + +NEWS_SITEINDEX_HEADER = \ + '<?xml version="1.0" encoding="UTF-8"?>\n' \ + '<sitemapindex\n' \ + ' xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"\n' \ + ' xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"\n' \ + ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n' \ + ' xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9\n' \ + ' http://www.sitemaps.org/schemas/sitemap/0.9/' \ + 'siteindex.xsd">\n' + +SITEINDEX_FOOTER = '</sitemapindex>\n' +SITEINDEX_ENTRY = \ + ' <sitemap>\n' \ + ' <loc>%(loc)s</loc>\n' \ + ' <lastmod>%(lastmod)s</lastmod>\n' \ + ' </sitemap>\n' +GENERAL_SITEMAP_HEADER = \ + '<?xml version="1.0" encoding="UTF-8"?>\n' \ + '<urlset\n' \ + ' xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"\n' \ + ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n' \ + ' xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9\n' \ + ' http://www.sitemaps.org/schemas/sitemap/0.9/' \ + 'sitemap.xsd">\n' + +NEWS_SITEMAP_HEADER = \ + '<?xml version="1.0" encoding="UTF-8"?>\n' \ + '<urlset\n' \ + ' xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"\n' \ + ' 
xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"\n' \ + ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n' \ + ' xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9\n' \ + ' http://www.sitemaps.org/schemas/sitemap/0.9/' \ + 'sitemap.xsd">\n' + +SITEMAP_FOOTER = '</urlset>\n' SITEURL_XML_PREFIX = ' <url>\n' SITEURL_XML_SUFFIX = ' </url>\n' +NEWS_TAG_XML_PREFIX = ' <news:news>\n' +NEWS_TAG_XML_SUFFIX = ' </news:news>\n' + # Search engines to notify with the updated sitemaps # # This list is very non-obvious in what's going on. Here's the gist: @@ -156,2067 +206,1916 @@ SITEURL_XML_SUFFIX = ' </url>\n' # 5 - query attribute that should be set to the new Sitemap URL # Clear as mud, I know. NOTIFICATION_SITES = [ - ('http', 'www.google.com', 'webmasters/sitemaps/ping', {}, '', 'sitemap') - ] + ('http', 'www.google.com', 'webmasters/sitemaps/ping', {}, '', 'sitemap'), +] + + +def get_hash(text): + if md5 is not None: + return md5.new(text).digest() + else: + m = hashlib.md5() + m.update(text.encode('utf8')) + return m.digest() class Error(Exception): - """ - Base exception class. In this module we tend not to use our own exception - types for very much, but they come in very handy on XML parsing with SAX. - """ - pass -#end class Error + """ + Base exception class. In this module we tend not to use our own exception + types for very much, but they come in very handy on XML parsing with SAX. + """ + pass +# end class Error class SchemaError(Error): - """Failure to process an XML file according to the schema we know.""" - pass -#end class SchemeError + """Failure to process an XML file according to the schema we know.""" + pass +# end class SchemeError class Encoder: - """ - Manages wide-character/narrow-character conversions for just about all - text that flows into or out of the script. - - You should always use this class for string coercion, as opposed to - letting Python handle coercions automatically. Reason: Python - usually assumes ASCII (7-bit) as a default narrow character encoding, - which is not the kind of data we generally deal with. - - General high-level methodologies used in sitemap_gen: - - [PATHS] - File system paths may be wide or narrow, depending on platform. - This works fine, just be aware of it and be very careful to not - mix them. That is, if you have to pass several file path arguments - into a library call, make sure they are all narrow or all wide. - This class has MaybeNarrowPath() which should be called on every - file system path you deal with. - - [URLS] - URL locations are stored in Narrow form, already escaped. This has the - benefit of keeping escaping and encoding as close as possible to the format - we read them in. The downside is we may end up with URLs that have - intermingled encodings -- the root path may be encoded in one way - while the filename is encoded in another. This is obviously wrong, but - it should hopefully be an issue hit by very few users. The workaround - from the user level (assuming they notice) is to specify a default_encoding - parameter in their config file. - - [OTHER] - Other text, such as attributes of the URL class, configuration options, - etc, are generally stored in Unicode for simplicity. - """ - - def __init__(self): - self._user = None # User-specified default encoding - self._learned = [] # Learned default encodings - self._widefiles = False # File system can be wide - - # Can the file system be Unicode? 
- try: - self._widefiles = os.path.supports_unicode_filenames - except AttributeError: - try: - self._widefiles = sys.getwindowsversion() == os.VER_PLATFORM_WIN32_NT - except AttributeError: - pass - - # Try to guess a working default - try: - encoding = sys.getfilesystemencoding() - if encoding and not (encoding.upper() in ENC_ASCII_LIST): - self._learned = [ encoding ] - except AttributeError: - pass - - if not self._learned: - encoding = sys.getdefaultencoding() - if encoding and not (encoding.upper() in ENC_ASCII_LIST): - self._learned = [ encoding ] - - # If we had no guesses, start with some European defaults - if not self._learned: - self._learned = ENC_DEFAULT_LIST - #end def __init__ - - def SetUserEncoding(self, encoding): - self._user = encoding - #end def SetUserEncoding - - def NarrowText(self, text, encoding): - """ Narrow a piece of arbitrary text """ - if type(text) != types.UnicodeType: - return text - - # Try the passed in preference - if encoding: - try: - result = text.encode(encoding) - if not encoding in self._learned: - self._learned.append(encoding) - return result - except UnicodeError: - pass - except LookupError: - output.Warn('Unknown encoding: %s' % encoding) - - # Try the user preference - if self._user: - try: - return text.encode(self._user) - except UnicodeError: - pass - except LookupError: - temp = self._user - self._user = None - output.Warn('Unknown default_encoding: %s' % temp) - - # Look through learned defaults, knock any failing ones out of the list - while self._learned: - try: - return text.encode(self._learned[0]) - except: - del self._learned[0] - - # When all other defaults are exhausted, use UTF-8 - try: - return text.encode(ENC_UTF8) - except UnicodeError: - pass - - # Something is seriously wrong if we get to here - return text.encode(ENC_ASCII, 'ignore') - #end def NarrowText - - def MaybeNarrowPath(self, text): - """ Paths may be allowed to stay wide """ - if self._widefiles: - return text - return self.NarrowText(text, None) - #end def MaybeNarrowPath - - def WidenText(self, text, encoding): - """ Widen a piece of arbitrary text """ - if type(text) != types.StringType: - return text - - # Try the passed in preference - if encoding: - try: - result = unicode(text, encoding) - if not encoding in self._learned: - self._learned.append(encoding) - return result - except UnicodeError: - pass - except LookupError: - output.Warn('Unknown encoding: %s' % encoding) - - # Try the user preference - if self._user: - try: - return unicode(text, self._user) - except UnicodeError: - pass - except LookupError: - temp = self._user - self._user = None - output.Warn('Unknown default_encoding: %s' % temp) - - # Look through learned defaults, knock any failing ones out of the list - while self._learned: - try: - return unicode(text, self._learned[0]) - except: - del self._learned[0] - - # When all other defaults are exhausted, use UTF-8 - try: - return unicode(text, ENC_UTF8) - except UnicodeError: - pass - - # Getting here means it wasn't UTF-8 and we had no working default. - # We really don't have anything "right" we can do anymore. - output.Warn('Unrecognized encoding in text: %s' % text) - if not self._user: - output.Warn('You may need to set a default_encoding in your ' - 'configuration file.') - return text.decode(ENC_ASCII, 'ignore') - #end def WidenText -#end class Encoder + """ + Manages wide-character/narrow-character conversions for just about all + text that flows into or out of the script. 
+ + You should always use this class for string coercion, as opposed to + letting Python handle coercions automatically. Reason: Python + usually assumes ASCII (7-bit) as a default narrow character encoding, + which is not the kind of data we generally deal with. + + General high-level methodologies used in sitemap_gen: + + [PATHS] + File system paths may be wide or narrow, depending on platform. + This works fine, just be aware of it and be very careful to not + mix them. That is, if you have to pass several file path arguments + into a library call, make sure they are all narrow or all wide. + This class has MaybeNarrowPath() which should be called on every + file system path you deal with. + + [URLS] + URL locations are stored in Narrow form, already escaped. This has the + benefit of keeping escaping and encoding as close as possible to the format + we read them in. The downside is we may end up with URLs that have + intermingled encodings -- the root path may be encoded in one way + while the filename is encoded in another. This is obviously wrong, but + it should hopefully be an issue hit by very few users. The workaround + from the user level (assuming they notice) is to specify a default_encoding + parameter in their config file. + + [OTHER] + Other text, such as attributes of the URL class, configuration options, + etc, are generally stored in Unicode for simplicity. + """ + + def __init__(self): + self._user = None # User-specified default encoding + self._learned = [] # Learned default encodings + self._widefiles = False # File system can be wide + + # Can the file system be Unicode? + try: + self._widefiles = os.path.supports_unicode_filenames + except AttributeError: + try: + self._widefiles = sys.getwindowsversion( + ) == os.VER_PLATFORM_WIN32_NT + except AttributeError: + pass + + # Try to guess a working default + try: + encoding = sys.getfilesystemencoding() + if encoding and not (encoding.upper() in ENC_ASCII_LIST): + self._learned = [encoding] + except AttributeError: + pass + + if not self._learned: + encoding = sys.getdefaultencoding() + if encoding and not (encoding.upper() in ENC_ASCII_LIST): + self._learned = [encoding] + + # If we had no guesses, start with some European defaults + if not self._learned: + self._learned = ENC_DEFAULT_LIST + # end def __init__ + + def SetUserEncoding(self, encoding): + self._user = encoding + # end def SetUserEncoding + + def NarrowText(self, text, encoding): + """ Narrow a piece of arbitrary text """ + if isinstance(text, bytes_str): + return text + + # Try the passed in preference + if encoding: + try: + result = text.encode(encoding) + if not encoding in self._learned: + self._learned.append(encoding) + return result + except UnicodeError: + pass + except LookupError: + output.Warn('Unknown encoding: %s' % encoding) + + # Try the user preference + if self._user: + try: + return text.encode(self._user) + except UnicodeError: + pass + except LookupError: + temp = self._user + self._user = None + output.Warn('Unknown default_encoding: %s' % temp) + + # Look through learned defaults, knock any failing ones out of the list + while self._learned: + try: + return text.encode(self._learned[0]) + except: + del self._learned[0] + + # When all other defaults are exhausted, use UTF-8 + try: + return text.encode(ENC_UTF8) + except UnicodeError: + pass + + # Something is seriously wrong if we get to here + return text.encode(ENC_ASCII, 'ignore') + # end def NarrowText + + def MaybeNarrowPath(self, text): + """ Paths may be allowed to stay wide 
""" + if self._widefiles: + return text + return self.NarrowText(text, None) + # end def MaybeNarrowPath + + def WidenText(self, text, encoding): + """ Widen a piece of arbitrary text """ + if not isinstance(text, bytes_str): + return text + + # Try the passed in preference + if encoding: + try: + result = unicode_str(text, encoding) + if not encoding in self._learned: + self._learned.append(encoding) + return result + except UnicodeError: + pass + except LookupError: + output.Warn('Unknown encoding: %s' % encoding) + + # Try the user preference + if self._user: + try: + return unicode_str(text, self._user) + except UnicodeError: + pass + except LookupError: + temp = self._user + self._user = None + output.Warn('Unknown default_encoding: %s' % temp) + + # Look through learned defaults, knock any failing ones out of the list + while self._learned: + try: + return unicode_str(text, self._learned[0]) + except: + del self._learned[0] + + # When all other defaults are exhausted, use UTF-8 + try: + return unicode_str(text, ENC_UTF8) + except UnicodeError: + pass + + # Getting here means it wasn't UTF-8 and we had no working default. + # We really don't have anything "right" we can do anymore. + output.Warn('Unrecognized encoding in text: %s' % text) + if not self._user: + output.Warn('You may need to set a default_encoding in your ' + 'configuration file.') + return text.decode(ENC_ASCII, 'ignore') + # end def WidenText +# end class Encoder encoder = Encoder() class Output: - """ - Exposes logging functionality, and tracks how many errors - we have thus output. - - Logging levels should be used as thus: - Fatal -- extremely sparingly - Error -- config errors, entire blocks of user 'intention' lost - Warn -- individual URLs lost - Log(,0) -- Un-suppressable text that's not an error - Log(,1) -- touched files, major actions - Log(,2) -- parsing notes, filtered or duplicated URLs - Log(,3) -- each accepted URL - """ - - def __init__(self): - self.num_errors = 0 # Count of errors - self.num_warns = 0 # Count of warnings - - self._errors_shown = {} # Shown errors - self._warns_shown = {} # Shown warnings - self._verbose = 0 # Level of verbosity - #end def __init__ - - def Log(self, text, level): - """ Output a blurb of diagnostic text, if the verbose level allows it """ - if text: - text = encoder.NarrowText(text, None) - if self._verbose >= level: - print(text) - #end def Log - - def Warn(self, text): - """ Output and count a warning. Suppress duplicate warnings. """ - if text: - text = encoder.NarrowText(text, None) - hash = hashlib.md5(text).digest() - if not self._warns_shown.has_key(hash): - self._warns_shown[hash] = 1 - print('[WARNING] ' + text) - else: - self.Log('(suppressed) [WARNING] ' + text, 3) - self.num_warns = self.num_warns + 1 - #end def Warn - - def Error(self, text): - """ Output and count an error. Suppress duplicate errors. """ - if text: - text = encoder.NarrowText(text, None) - hash = hashlib.md5(text).digest() - if not self._errors_shown.has_key(hash): - self._errors_shown[hash] = 1 - print('[ERROR] ' + text) - else: - self.Log('(suppressed) [ERROR] ' + text, 3) - self.num_errors = self.num_errors + 1 - #end def Error - - def Fatal(self, text): - """ Output an error and terminate the program. """ - if text: - text = encoder.NarrowText(text, None) - print('[FATAL] ' + text) - else: - print('Fatal error.') - sys.exit(1) - #end def Fatal + """ + Exposes logging functionality, and tracks how many errors + we have thus output. 
+ + Logging levels should be used as thus: + Fatal -- extremely sparingly + Error -- config errors, entire blocks of user 'intention' lost + Warn -- individual URLs lost + Log(,0) -- Un-suppressable text that's not an error + Log(,1) -- touched files, major actions + Log(,2) -- parsing notes, filtered or duplicated URLs + Log(,3) -- each accepted URL + """ - def SetVerbose(self, level): - """ Sets the verbose level. """ - try: - if type(level) != types.IntType: - level = int(level) - if (level >= 0) and (level <= 3): - self._verbose = level - return - except ValueError: - pass - self.Error('Verbose level (%s) must be between 0 and 3 inclusive.' % level) - #end def SetVerbose -#end class Output + def __init__(self): + self.num_errors = 0 # Count of errors + self.num_warns = 0 # Count of warnings + + self._errors_shown = {} # Shown errors + self._warns_shown = {} # Shown warnings + self._verbose = 0 # Level of verbosity + # end def __init__ + + def Log(self, text, level): + """ Output a blurb of diagnostic text, if the verbose level allows it """ + if text: + text = encoder.NarrowText(text, None) + if self._verbose >= level: + print(text) + # end def Log + + def Warn(self, text): + """ Output and count a warning. Suppress duplicate warnings. """ + if text: + text = encoder.NarrowText(text, None) + hash = get_hash(text) + if not hash in self._warns_shown: + self._warns_shown[hash] = 1 + print('[WARNING] ' + text) + else: + self.Log('(suppressed) [WARNING] ' + text, 3) + self.num_warns = self.num_warns + 1 + # end def Warn + + def Error(self, text): + """ Output and count an error. Suppress duplicate errors. """ + if text: + text = encoder.NarrowText(text, None) + hash = get_hash(text) + if not hash in self._errors_shown: + self._errors_shown[hash] = 1 + print('[ERROR] ' + text) + else: + self.Log('(suppressed) [ERROR] ' + text, 3) + self.num_errors = self.num_errors + 1 + # end def Error + + def Fatal(self, text): + """ Output an error and terminate the program. """ + if text: + text = encoder.NarrowText(text, None) + print('[FATAL] ' + text) + else: + print('Fatal error.') + sys.exit(1) + # end def Fatal + + def SetVerbose(self, level): + """ Sets the verbose level. """ + try: + if not isinstance(level, int): + level = int(level) + if (level >= 0) and (level <= 3): + self._verbose = level + return + except ValueError: + pass + self.Error( + 'Verbose level (%s) must be between 0 and 3 inclusive.' % level) + # end def SetVerbose +# end class Output output = Output() class URL(object): - """ URL is a smart structure grouping together the properties we - care about for a single web reference. """ - __slots__ = 'loc', 'lastmod', 'changefreq', 'priority' - - def __init__(self): - self.loc = None # URL -- in Narrow characters - self.lastmod = None # ISO8601 timestamp of last modify - self.changefreq = None # Text term for update frequency - self.priority = None # Float between 0 and 1 (inc) - #end def __init__ - - def __cmp__(self, other): - if self.loc < other.loc: - return -1 - if self.loc > other.loc: - return 1 - return 0 - #end def __cmp__ - - def TrySetAttribute(self, attribute, value): - """ Attempt to set the attribute to the value, with a pretty try - block around it. 
""" - if attribute == 'loc': - self.loc = self.Canonicalize(value) - else: - try: - setattr(self, attribute, value) - except AttributeError: - output.Warn('Unknown URL attribute: %s' % attribute) - #end def TrySetAttribute - - def IsAbsolute(loc): - """ Decide if the URL is absolute or not """ - if not loc: - return False - narrow = encoder.NarrowText(loc, None) - (scheme, netloc, path, query, frag) = urlsplit(narrow) - if (not scheme) or (not netloc): - return False - return True - #end def IsAbsolute - IsAbsolute = staticmethod(IsAbsolute) - - def Canonicalize(loc): - """ Do encoding and canonicalization on a URL string """ - if not loc: - return loc - - # Let the encoder try to narrow it - narrow = encoder.NarrowText(loc, None) - - # Escape components individually - (scheme, netloc, path, query, frag) = urlsplit(narrow) - unr = '-._~' - sub = '!$&\'()*+,;=' - netloc = urllib.quote(netloc, unr + sub + '%:@/[]') - path = urllib.quote(path, unr + sub + '%:@/') - query = urllib.quote(query, unr + sub + '%:@/?') - frag = urllib.quote(frag, unr + sub + '%:@/?') - - # Try built-in IDNA encoding on the netloc - try: - (ignore, widenetloc, ignore, ignore, ignore) = urlsplit(loc) - for c in widenetloc: - if c >= unichr(128): - netloc = widenetloc.encode(ENC_IDNA) - netloc = urllib.quote(netloc, unr + sub + '%:@/[]') - break - except UnicodeError: - # urlsplit must have failed, based on implementation differences in the - # library. There is not much we can do here, except ignore it. - pass - except LookupError: - output.Warn('An International Domain Name (IDN) is being used, but this ' - 'version of Python does not have support for IDNA encoding. ' - ' (IDNA support was introduced in Python 2.3) The encoding ' - 'we have used instead is wrong and will probably not yield ' - 'valid URLs.') - bad_netloc = False - if '%' in netloc: - bad_netloc = True - - # Put it all back together - narrow = urlunsplit((scheme, netloc, path, query, frag)) - - # I let '%' through. Fix any that aren't pre-existing escapes. - HEXDIG = '0123456789abcdefABCDEF' - list = narrow.split('%') - narrow = list[0] - del list[0] - for item in list: - if (len(item) >= 2) and (item[0] in HEXDIG) and (item[1] in HEXDIG): - narrow = narrow + '%' + item - else: - narrow = narrow + '%25' + item - - # Issue a warning if this is a bad URL - if bad_netloc: - output.Warn('Invalid characters in the host or domain portion of a URL: ' - + narrow) - - return narrow - #end def Canonicalize - Canonicalize = staticmethod(Canonicalize) - - def Validate(self, base_url, allow_fragment): - """ Verify the data in this URL is well-formed, and override if not. 
""" - assert type(base_url) == types.StringType - - # Test (and normalize) the ref - if not self.loc: - output.Warn('Empty URL') - return False - if allow_fragment: - self.loc = urlparse.urljoin(base_url, self.loc) - if not self.loc.startswith(base_url): - output.Warn('Discarded URL for not starting with the base_url: %s' % - self.loc) - self.loc = None - return False - - # Test the lastmod - if self.lastmod: - match = False - self.lastmod = self.lastmod.upper() - for pattern in LASTMOD_PATTERNS: - match = pattern.match(self.lastmod) - if match: - break - if not match: - output.Warn('Lastmod "%s" does not appear to be in ISO8601 format on ' - 'URL: %s' % (self.lastmod, self.loc)) - self.lastmod = None - - # Test the changefreq - if self.changefreq: - match = False - self.changefreq = self.changefreq.lower() - for pattern in CHANGEFREQ_PATTERNS: - if self.changefreq == pattern: - match = True - break - if not match: - output.Warn('Changefreq "%s" is not a valid change frequency on URL ' - ': %s' % (self.changefreq, self.loc)) - self.changefreq = None - - # Test the priority - if self.priority: - priority = -1.0 - try: - priority = float(self.priority) - except ValueError: - pass - if (priority < 0.0) or (priority > 1.0): - output.Warn('Priority "%s" is not a number between 0 and 1 inclusive ' - 'on URL: %s' % (self.priority, self.loc)) - self.priority = None - - return True - #end def Validate - - def MakeHash(self): - """ Provides a uniform way of hashing URLs """ - if not self.loc: - return None - if self.loc.endswith('/'): - return hashlib.md5(self.loc[:-1]).digest() - return hashlib.md5(self.loc).digest() - #end def MakeHash - - def Log(self, prefix='URL', level=3): - """ Dump the contents, empty or not, to the log. """ - out = prefix + ':' - - for attribute in self.__slots__: - value = getattr(self, attribute) - if not value: - value = '' - out = out + (' %s=[%s]' % (attribute, value)) - - output.Log('%s' % encoder.NarrowText(out, None), level) - #end def Log - - def WriteXML(self, file): - """ Dump non-empty contents to the output file, in XML format. """ - if not self.loc: - return - out = SITEURL_XML_PREFIX - - for attribute in self.__slots__: - value = getattr(self, attribute) - if value: - if type(value) == types.UnicodeType: - value = encoder.NarrowText(value, None) - elif type(value) != types.StringType: - value = str(value) - value = xml.sax.saxutils.escape(value) - out = out + (' <%s>%s</%s>\n' % (attribute, value, attribute)) - - out = out + SITEURL_XML_SUFFIX - file.write(out) - #end def WriteXML -#end class URL + """ URL is a smart structure grouping together the properties we + care about for a single web reference. """ + __slots__ = 'loc', 'lastmod', 'changefreq', 'priority' + def __init__(self): + self.loc = None # URL -- in Narrow characters + self.lastmod = None # ISO8601 timestamp of last modify + self.changefreq = None # Text term for update frequency + self.priority = None # Float between 0 and 1 (inc) + # end def __init__ + + def __cmp__(self, other): + if self.loc < other.loc: + return -1 + if self.loc > other.loc: + return 1 + return 0 + # end def __cmp__ + + def TrySetAttribute(self, attribute, value): + """ Attempt to set the attribute to the value, with a pretty try + block around it. 
""" + if attribute == 'loc': + self.loc = self.Canonicalize(value) + else: + try: + setattr(self, attribute, value) + except AttributeError: + output.Warn('Unknown URL attribute: %s' % attribute) + # end def TrySetAttribute + + def IsAbsolute(loc): + """ Decide if the URL is absolute or not """ + if not loc: + return False + narrow = encoder.NarrowText(loc, None) + (scheme, netloc, path, query, frag) = urlsplit(narrow) + if (not scheme) or (not netloc): + return False + return True + # end def IsAbsolute + IsAbsolute = staticmethod(IsAbsolute) + + def Canonicalize(loc): + """ Do encoding and canonicalization on a URL string """ + if not loc: + return loc + + # Let the encoder try to narrow it + narrow = encoder.NarrowText(loc, None) + + # Escape components individually + (scheme, netloc, path, query, frag) = urlsplit(narrow) + unr = '-._~' + sub = '!$&\'()*+,;=' + netloc = urllib_quote(netloc, unr + sub + '%:@/[]') + path = urllib_quote(path, unr + sub + '%:@/') + query = urllib_quote(query, unr + sub + '%:@/?') + frag = urllib_quote(frag, unr + sub + '%:@/?') + + # Try built-in IDNA encoding on the netloc + try: + (ignore, widenetloc, ignore, ignore, ignore) = urlsplit(loc) + for c in widenetloc: + if c >= unichr(128): + netloc = widenetloc.encode(ENC_IDNA) + netloc = urllib_quote(netloc, unr + sub + '%:@/[]') + break + except UnicodeError: + # urlsplit must have failed, based on implementation differences in the + # library. There is not much we can do here, except ignore it. + pass + except LookupError: + output.Warn('An International Domain Name (IDN) is being used, but this ' + 'version of Python does not have support for IDNA encoding. ' + ' (IDNA support was introduced in Python 2.3) The encoding ' + 'we have used instead is wrong and will probably not yield ' + 'valid URLs.') + bad_netloc = False + if '%' in netloc: + bad_netloc = True + + # Put it all back together + narrow = urlunsplit((scheme, netloc, path, query, frag)) + + # I let '%' through. Fix any that aren't pre-existing escapes. + HEXDIG = '0123456789abcdefABCDEF' + list = narrow.split('%') + narrow = list[0] + del list[0] + for item in list: + if (len(item) >= 2) and (item[0] in HEXDIG) and (item[1] in HEXDIG): + narrow = narrow + '%' + item + else: + narrow = narrow + '%25' + item + + # Issue a warning if this is a bad URL + if bad_netloc: + output.Warn('Invalid characters in the host or domain portion of a URL: ' + + narrow) + + return narrow + # end def Canonicalize + Canonicalize = staticmethod(Canonicalize) + + def VerifyDate(self, date, metatag): + """Verify the date format is valid""" + match = False + if date: + date = date.upper() + for pattern in DATE_PATTERNS: + match = pattern.match(date) + if match: + return True + if not match: + output.Warn('The value for %s does not appear to be in ISO8601 ' + 'format on URL: %s' % (metatag, self.loc)) + return False + # end of VerifyDate + + def Validate(self, base_url, allow_fragment): + """ Verify the data in this URL is well-formed, and override if not. 
""" + assert isinstance(base_url, bytes_str) + + # Test (and normalize) the ref + if not self.loc: + output.Warn('Empty URL') + return False + if allow_fragment: + self.loc = urljoin(base_url, self.loc) + if not self.loc.startswith(base_url): + output.Warn('Discarded URL for not starting with the base_url: %s' % + self.loc) + self.loc = None + return False + + # Test the lastmod + if self.lastmod: + if not self.VerifyDate(self.lastmod, "lastmod"): + self.lastmod = None + + # Test the changefreq + if self.changefreq: + match = False + self.changefreq = self.changefreq.lower() + for pattern in CHANGEFREQ_PATTERNS: + if self.changefreq == pattern: + match = True + break + if not match: + output.Warn('Changefreq "%s" is not a valid change frequency on URL ' + ': %s' % (self.changefreq, self.loc)) + self.changefreq = None + + # Test the priority + if self.priority: + priority = -1.0 + try: + priority = float(self.priority) + except ValueError: + pass + if (priority < 0.0) or (priority > 1.0): + output.Warn('Priority "%s" is not a number between 0 and 1 inclusive ' + 'on URL: %s' % (self.priority, self.loc)) + self.priority = None -class Filter: - """ - A filter on the stream of URLs we find. A filter is, in essence, - a wildcard applied to the stream. You can think of this as an - operator that returns a tri-state when given a URL: - - True -- this URL is to be included in the sitemap - None -- this URL is undecided - False -- this URL is to be dropped from the sitemap - """ - - def __init__(self, attributes): - self._wildcard = None # Pattern for wildcard match - self._regexp = None # Pattern for regexp match - self._pass = False # "Drop" filter vs. "Pass" filter - - if not ValidateAttributes('FILTER', attributes, - ('pattern', 'type', 'action')): - return - - # Check error count on the way in - num_errors = output.num_errors + return True + # end def Validate + + def MakeHash(self): + """ Provides a uniform way of hashing URLs """ + if not self.loc: + return None + if self.loc.endswith('/'): + return get_hash(self.loc[:-1]) + return get_hash(self.loc) + # end def MakeHash + + def Log(self, prefix='URL', level=3): + """ Dump the contents, empty or not, to the log. """ + out = prefix + ':' + + for attribute in self.__slots__: + value = getattr(self, attribute) + if not value: + value = '' + out = out + (' %s=[%s]' % (attribute, value)) + + output.Log('%s' % encoder.NarrowText(out, None), level) + # end def Log + + def WriteXML(self, file): + """ Dump non-empty contents to the output file, in XML format. """ + if not self.loc: + return + out = SITEURL_XML_PREFIX + + for attribute in self.__slots__: + value = getattr(self, attribute) + if value: + if isinstance(value, unicode_str): + value = encoder.NarrowText(value, None) + elif not isinstance(value, bytes_str): + value = str(value) + value = xml.sax.saxutils.escape(value) + out = out + (' <%s>%s</%s>\n' % (attribute, value, attribute)) + + out = out + SITEURL_XML_SUFFIX + file.write(out) + # end def WriteXML +# end class URL + + +class NewsURL(URL): + """ NewsURL is a subclass of URL with News-Sitemap specific properties. 
""" + __slots__ = 'loc', 'lastmod', 'changefreq', 'priority', 'publication_date', \ + 'keywords', 'stock_tickers' - # Fetch the attributes - pattern = attributes.get('pattern') - type = attributes.get('type', 'wildcard') - action = attributes.get('action', 'drop') - if type: - type = type.lower() - if action: - action = action.lower() - - # Verify the attributes - if not pattern: - output.Error('On a filter you must specify a "pattern" to match') - elif (not type) or ((type != 'wildcard') and (type != 'regexp')): - output.Error('On a filter you must specify either \'type="wildcard"\' ' - 'or \'type="regexp"\'') - elif (action != 'pass') and (action != 'drop'): - output.Error('If you specify a filter action, it must be either ' - '\'action="pass"\' or \'action="drop"\'') - - # Set the rule - if action == 'drop': - self._pass = False - elif action == 'pass': - self._pass = True - - if type == 'wildcard': - self._wildcard = pattern - elif type == 'regexp': - try: - self._regexp = re.compile(pattern) - except re.error: - output.Error('Bad regular expression: %s' % pattern) - - # Log the final results iff we didn't add any errors - if num_errors == output.num_errors: - output.Log('Filter: %s any URL that matches %s "%s"' % - (action, type, pattern), 2) - #end def __init__ + def __init__(self): + URL.__init__(self) + self.publication_date = None # ISO8601 timestamp of publication date + self.keywords = None # Text keywords + self.stock_tickers = None # Text stock + # end def __init__ - def Apply(self, url): - """ Process the URL, as above. """ - if (not url) or (not url.loc): - return None + def Validate(self, base_url, allow_fragment): + """ Verify the data in this News URL is well-formed, and override if not. """ + assert isinstance(base_url, bytes_str) - if self._wildcard: - if fnmatch.fnmatchcase(url.loc, self._wildcard): - return self._pass - return None + if not URL.Validate(self, base_url, allow_fragment): + return False - if self._regexp: - if self._regexp.search(url.loc): - return self._pass - return None + if not URL.VerifyDate(self, self.publication_date, "publication_date"): + self.publication_date = None - assert False # unreachable - #end def Apply -#end class Filter + return True + # end def Validate + + def WriteXML(self, file): + """ Dump non-empty contents to the output file, in XML format. """ + if not self.loc: + return + out = SITEURL_XML_PREFIX + + # printed_news_tag indicates if news-specific metatags are present + printed_news_tag = False + for attribute in self.__slots__: + value = getattr(self, attribute) + if value: + if isinstance(value, unicode_str): + value = encoder.NarrowText(value, None) + elif not isinstance(value, bytes_str): + value = str(value) + value = xml.sax.saxutils.escape(value) + if attribute in NEWS_SPECIFIC_TAGS: + if not printed_news_tag: + printed_news_tag = True + out = out + NEWS_TAG_XML_PREFIX + out = out + (' <news:%s>%s</news:%s>\n' % + (attribute, value, attribute)) + else: + out = out + (' <%s>%s</%s>\n' % ( + attribute, value, attribute)) + + if printed_news_tag: + out = out + NEWS_TAG_XML_SUFFIX + out = out + SITEURL_XML_SUFFIX + file.write(out) + # end def WriteXML +# end class NewsURL -class InputURL: - """ - Each Input class knows how to yield a set of URLs from a data source. +class Filter: + """ + A filter on the stream of URLs we find. A filter is, in essence, + a wildcard applied to the stream. 
You can think of this as an + operator that returns a tri-state when given a URL: - This one handles a single URL, manually specified in the config file. - """ + True -- this URL is to be included in the sitemap + None -- this URL is undecided + False -- this URL is to be dropped from the sitemap + """ - def __init__(self, attributes): - self._url = None # The lonely URL + def __init__(self, attributes): + self._wildcard = None # Pattern for wildcard match + self._regexp = None # Pattern for regexp match + self._pass = False # "Drop" filter vs. "Pass" filter + + if not ValidateAttributes('FILTER', attributes, + ('pattern', 'type', 'action')): + return + + # Check error count on the way in + num_errors = output.num_errors + + # Fetch the attributes + pattern = attributes.get('pattern') + type = attributes.get('type', 'wildcard') + action = attributes.get('action', 'drop') + if type: + type = type.lower() + if action: + action = action.lower() + + # Verify the attributes + if not pattern: + output.Error('On a filter you must specify a "pattern" to match') + elif (not type) or ((type != 'wildcard') and (type != 'regexp')): + output.Error('On a filter you must specify either \'type="wildcard"\' ' + 'or \'type="regexp"\'') + elif (action != 'pass') and (action != 'drop'): + output.Error('If you specify a filter action, it must be either ' + '\'action="pass"\' or \'action="drop"\'') + + # Set the rule + if action == 'drop': + self._pass = False + elif action == 'pass': + self._pass = True + + if type == 'wildcard': + self._wildcard = pattern + elif type == 'regexp': + try: + self._regexp = re.compile(pattern) + except re.error: + output.Error('Bad regular expression: %s' % pattern) + + # Log the final results iff we didn't add any errors + if num_errors == output.num_errors: + output.Log('Filter: %s any URL that matches %s "%s"' % + (action, type, pattern), 2) + # end def __init__ + + def Apply(self, url): + """ Process the URL, as above. """ + if (not url) or (not url.loc): + return None + + if self._wildcard: + if fnmatch.fnmatchcase(url.loc, self._wildcard): + return self._pass + return None + + if self._regexp: + if self._regexp.search(url.loc): + return self._pass + return None + + assert False # unreachable + # end def Apply +# end class Filter - if not ValidateAttributes('URL', attributes, - ('href', 'lastmod', 'changefreq', 'priority')): - return - url = URL() - for attr in attributes.keys(): - if attr == 'href': - url.TrySetAttribute('loc', attributes[attr]) - else: - url.TrySetAttribute(attr, attributes[attr]) +class InputURL: + """ + Each Input class knows how to yield a set of URLs from a data source. - if not url.loc: - output.Error('Url entries must have an href attribute.') - return + This one handles a single URL, manually specified in the config file. + """ - self._url = url - output.Log('Input: From URL "%s"' % self._url.loc, 2) - #end def __init__ + def __init__(self, attributes): + self._url = None # The lonely URL - def ProduceURLs(self, consumer): - """ Produces URLs from our data source, hands them in to the consumer. 
""" - if self._url: - consumer(self._url, True) - #end def ProduceURLs -#end class InputURL + if not ValidateAttributes('URL', attributes, + ('href', 'lastmod', 'changefreq', 'priority')): + return + url = URL() + for attr in attributes.keys(): + if attr == 'href': + url.TrySetAttribute('loc', attributes[attr]) + else: + url.TrySetAttribute(attr, attributes[attr]) -class InputURLList: - """ - Each Input class knows how to yield a set of URLs from a data source. - - This one handles a text file with a list of URLs - """ - - def __init__(self, attributes): - self._path = None # The file path - self._encoding = None # Encoding of that file - - if not ValidateAttributes('URLLIST', attributes, ('path', 'encoding')): - return - - self._path = attributes.get('path') - self._encoding = attributes.get('encoding', ENC_UTF8) - if self._path: - self._path = encoder.MaybeNarrowPath(self._path) - if os.path.isfile(self._path): - output.Log('Input: From URLLIST "%s"' % self._path, 2) - else: - output.Error('Can not locate file: %s' % self._path) - self._path = None - else: - output.Error('Urllist entries must have a "path" attribute.') - #end def __init__ - - def ProduceURLs(self, consumer): - """ Produces URLs from our data source, hands them in to the consumer. """ - - # Open the file - (frame, file) = OpenFileForRead(self._path, 'URLLIST') - if not file: - return - - # Iterate lines - linenum = 0 - for line in file.readlines(): - linenum = linenum + 1 - - # Strip comments and empty lines - if self._encoding: - line = encoder.WidenText(line, self._encoding) - line = line.strip() - if (not line) or line[0] == '#': - continue - - # Split the line on space - url = URL() - cols = line.split(' ') - for i in range(0,len(cols)): - cols[i] = cols[i].strip() - url.TrySetAttribute('loc', cols[0]) - - # Extract attributes from the other columns - for i in range(1,len(cols)): - if cols[i]: - try: - (attr_name, attr_val) = cols[i].split('=', 1) - url.TrySetAttribute(attr_name, attr_val) - except ValueError: - output.Warn('Line %d: Unable to parse attribute: %s' % - (linenum, cols[i])) - - # Pass it on - consumer(url, False) - - file.close() - if frame: - frame.close() - #end def ProduceURLs -#end class InputURLList + if not url.loc: + output.Error('Url entries must have an href attribute.') + return + self._url = url + output.Log('Input: From URL "%s"' % self._url.loc, 2) + # end def __init__ -class InputDirectory: - """ - Each Input class knows how to yield a set of URLs from a data source. + def ProduceURLs(self, consumer): + """ Produces URLs from our data source, hands them in to the consumer. """ + if self._url: + consumer(self._url, True) + # end def ProduceURLs +# end class InputURL - This one handles a directory that acts as base for walking the filesystem. - """ - def __init__(self, attributes, base_url): - self._path = None # The directory - self._url = None # The URL equivelant - self._default_file = None +class InputURLList: + """ + Each Input class knows how to yield a set of URLs from a data source. 
- if not ValidateAttributes('DIRECTORY', attributes, ('path', 'url', - 'default_file')): - return + This one handles a text file with a list of URLs + """ - # Prep the path -- it MUST end in a sep - path = attributes.get('path') - if not path: - output.Error('Directory entries must have both "path" and "url" ' - 'attributes') - return - path = encoder.MaybeNarrowPath(path) - if not path.endswith(os.sep): - path = path + os.sep - if not os.path.isdir(path): - output.Error('Can not locate directory: %s' % path) - return - - # Prep the URL -- it MUST end in a sep - url = attributes.get('url') - if not url: - output.Error('Directory entries must have both "path" and "url" ' - 'attributes') - return - url = URL.Canonicalize(url) - if not url.endswith('/'): - url = url + '/' - if not url.startswith(base_url): - url = urlparse.urljoin(base_url, url) - if not url.startswith(base_url): - output.Error('The directory URL "%s" is not relative to the ' - 'base_url: %s' % (url, base_url)) - return - - # Prep the default file -- it MUST be just a filename - file = attributes.get('default_file') - if file: - file = encoder.MaybeNarrowPath(file) - if os.sep in file: - output.Error('The default_file "%s" can not include path information.' - % file) - file = None + def __init__(self, attributes): + self._path = None # The file path + self._encoding = None # Encoding of that file + + if not ValidateAttributes('URLLIST', attributes, ('path', 'encoding')): + return + + self._path = attributes.get('path') + self._encoding = attributes.get('encoding', ENC_UTF8) + if self._path: + self._path = encoder.MaybeNarrowPath(self._path) + if os.path.isfile(self._path): + output.Log('Input: From URLLIST "%s"' % self._path, 2) + else: + output.Error('Can not locate file: %s' % self._path) + self._path = None + else: + output.Error('Urllist entries must have a "path" attribute.') + # end def __init__ + + def ProduceURLs(self, consumer): + """ Produces URLs from our data source, hands them in to the consumer. """ + + # Open the file + (frame, file) = OpenFileForRead(self._path, 'URLLIST') + if not file: + return + + # Iterate lines + linenum = 0 + for line in file.readlines(): + linenum = linenum + 1 + + # Strip comments and empty lines + if self._encoding: + line = encoder.WidenText(line, self._encoding) + line = line.strip() + if (not line) or line[0] == '#': + continue + + # Split the line on space + url = URL() + cols = line.split(' ') + for i in range(0, len(cols)): + cols[i] = cols[i].strip() + url.TrySetAttribute('loc', cols[0]) + + # Extract attributes from the other columns + for i in range(1, len(cols)): + if cols[i]: + try: + (attr_name, attr_val) = cols[i].split('=', 1) + url.TrySetAttribute(attr_name, attr_val) + except ValueError: + output.Warn('Line %d: Unable to parse attribute: %s' % + (linenum, cols[i])) + + # Pass it on + consumer(url, False) + + file.close() + if frame: + frame.close() + # end def ProduceURLs +# end class InputURLList + + +class InputNewsURLList: + """ + Each Input class knows how to yield a set of URLs from a data source. 
- self._path = path - self._url = url - self._default_file = file - if file: - output.Log('Input: From DIRECTORY "%s" (%s) with default file "%s"' - % (path, url, file), 2) - else: - output.Log('Input: From DIRECTORY "%s" (%s) with no default file' - % (path, url), 2) - #end def __init__ + This one handles a text file with a list of News URLs and their metadata + """ - def ProduceURLs(self, consumer): - """ Produces URLs from our data source, hands them in to the consumer. """ - if not self._path: - return + def __init__(self, attributes): + self._path = None # The file path + self._encoding = None # Encoding of that file + self._tag_order = [] # Order of URL metadata + + if not ValidateAttributes('URLLIST', attributes, ('path', 'encoding', 'tag_order')): + return + + self._path = attributes.get('path') + self._encoding = attributes.get('encoding', ENC_UTF8) + self._tag_order = attributes.get('tag_order') + + if self._path: + self._path = encoder.MaybeNarrowPath(self._path) + if os.path.isfile(self._path): + output.Log('Input: From URLLIST "%s"' % self._path, 2) + else: + output.Error('Can not locate file: %s' % self._path) + self._path = None + else: + output.Error('Urllist entries must have a "path" attribute.') + + # parse tag_order into an array + # tag_order_ascii created for more readable logging + tag_order_ascii = [] + if self._tag_order: + self._tag_order = self._tag_order.split(",") + for i in range(0, len(self._tag_order)): + element = self._tag_order[i].strip().lower() + self._tag_order[i] = element + tag_order_ascii.append(element.encode('ascii')) + output.Log( + 'Input: From URLLIST tag order is "%s"' % tag_order_ascii, 0) + else: + output.Error('News Urllist configuration file must contain tag_order ' + 'to define Sitemap metatags.') + + # verify all tag_order inputs are valid + tag_order_dict = {} + for tag in self._tag_order: + tag_order_dict[tag] = "" + if not ValidateAttributes('URLLIST', tag_order_dict, + NEWS_SITEMAP_TAGS): + return + + # loc tag must be present + loc_tag = False + for tag in self._tag_order: + if tag == 'loc': + loc_tag = True + break + if not loc_tag: + output.Error('News Urllist tag_order in configuration file ' + 'does not contain "loc" value: %s' % tag_order_ascii) + # end def __init__ + + def ProduceURLs(self, consumer): + """ Produces URLs from our data source, hands them in to the consumer. """ + + # Open the file + (frame, file) = OpenFileForRead(self._path, 'URLLIST') + if not file: + return + + # Iterate lines + linenum = 0 + for line in file.readlines(): + linenum = linenum + 1 + + # Strip comments and empty lines + if self._encoding: + line = encoder.WidenText(line, self._encoding) + line = line.strip() + if (not line) or line[0] == '#': + continue + + # Split the line on tabs + url = NewsURL() + cols = line.split('\t') + for i in range(0, len(cols)): + cols[i] = cols[i].strip() + + for i in range(0, len(cols)): + if cols[i]: + attr_value = cols[i] + if i < len(self._tag_order): + attr_name = self._tag_order[i] + try: + url.TrySetAttribute(attr_name, attr_value) + except ValueError: + output.Warn('Line %d: Unable to parse attribute: %s' % + (linenum, cols[i])) + + # Pass it on + consumer(url, False) + + file.close() + if frame: + frame.close() + # end def ProduceURLs +# end class InputNewsURLList - root_path = self._path - root_URL = self._url - root_file = "index.html" - def DecideFilename(name): - assert "/" not in name +class InputDirectory: + """ + Each Input class knows how to yield a set of URLs from a data source. 
- if name in ( "robots.txt, " ): - return False + This one handles a directory that acts as base for walking the filesystem. + """ - if ".thumbnail." in name: - return False + def __init__(self, attributes, base_url): + self._path = None # The directory + self._url = None # The URL equivalent + self._default_file = None + self._remove_empty_directories = False + + if not ValidateAttributes('DIRECTORY', attributes, ('path', 'url', + 'default_file', 'remove_empty_directories')): + return + + # Prep the path -- it MUST end in a sep + path = attributes.get('path') + if not path: + output.Error('Directory entries must have both "path" and "url" ' + 'attributes') + return + path = encoder.MaybeNarrowPath(path) + if not path.endswith(os.sep): + path = path + os.sep + if not os.path.isdir(path): + output.Error('Can not locate directory: %s' % path) + return + + # Prep the URL -- it MUST end in a sep + url = attributes.get('url') + if not url: + output.Error('Directory entries must have both "path" and "url" ' + 'attributes') + return + url = URL.Canonicalize(url) + if not url.endswith('/'): + url = url + '/' + if not url.startswith(base_url): + url = urljoin(base_url, url) + if not url.startswith(base_url): + output.Error('The directory URL "%s" is not relative to the ' + 'base_url: %s' % (url, base_url)) + return + + # Prep the default file -- it MUST be just a filename + file = attributes.get('default_file') + if file: + file = encoder.MaybeNarrowPath(file) + if os.sep in file: + output.Error('The default_file "%s" can not include path information.' + % file) + file = None + + # Prep the remove_empty_directories -- default is false + remove_empty_directories = attributes.get('remove_empty_directories') + if remove_empty_directories: + if (remove_empty_directories == '1') or \ + (remove_empty_directories.lower() == 'true'): + remove_empty_directories = True + elif (remove_empty_directories == '0') or \ + (remove_empty_directories.lower() == 'false'): + remove_empty_directories = False + # otherwise the user set a non-default value + else: + output.Error('Configuration file remove_empty_directories ' + 'value is not recognized. Value must be true or false.') + return + else: + remove_empty_directories = False - if re.match( r"google[a-f0-9]+.html", name ): - return False + self._path = path + self._url = url + self._default_file = file + self._remove_empty_directories = remove_empty_directories - return not re.match( r"^index(\-\d+)?.html$", name ) + if file: + output.Log('Input: From DIRECTORY "%s" (%s) with default file "%s"' + % (path, url, file), 2) + else: + output.Log('Input: From DIRECTORY "%s" (%s) with no default file' + % (path, url), 2) + # end def __init__ + + def ProduceURLs(self, consumer): + """ Produces URLs from our data source, hands them in to the consumer. """ + if not self._path: + return + + root_path = self._path + root_URL = self._url + root_file = self._default_file + remove_empty_directories = self._remove_empty_directories + + def HasReadPermissions(path): + """ Verifies a given path has read permissions. """ + stat_info = os.stat(path) + mode = stat_info[stat.ST_MODE] + if mode & stat.S_IREAD: + return True + else: + return None + + def PerFile(dirpath, name): + """ + Called once per file. 
+ Note that 'name' will occasionally be None -- for a directory itself + """ + # Pull a timestamp + url = URL() + isdir = False + try: + if name: + path = os.path.join(dirpath, name) + else: + path = dirpath + isdir = os.path.isdir(path) + time = None + if isdir and root_file: + file = os.path.join(path, root_file) + try: + time = os.stat(file)[stat.ST_MTIME] + except OSError: + pass + if not time: + time = os.stat(path)[stat.ST_MTIME] + url.lastmod = TimestampISO8601(time) + except OSError: + pass + except ValueError: + pass + + # Build a URL + middle = dirpath[len(root_path):] + if os.sep != '/': + middle = middle.replace(os.sep, '/') + if middle: + middle = middle + '/' + if name: + middle = middle + name + if isdir: + middle = middle + '/' + url.TrySetAttribute( + 'loc', root_URL + encoder.WidenText(middle, None)) + + # Suppress default files. (All the way down here so we can log + # it.) + if name and (root_file == name): + url.Log(prefix='IGNORED (default file)', level=2) + return + + # Suppress directories when remove_empty_directories="true" + try: + if isdir: + if HasReadPermissions(path): + if remove_empty_directories == 'true' and \ + len(os.listdir(path)) == 0: + output.Log( + 'IGNORED empty directory %s' % str(path), level=1) + return + elif path == self._path: + output.Error('IGNORED configuration file directory input %s due ' + 'to file permissions' % self._path) + else: + output.Log('IGNORED files within directory %s due to file ' + 'permissions' % str(path), level=0) + except OSError: + pass + except ValueError: + pass + + consumer(url, False) + # end def PerFile + + def PerDirectory(ignore, dirpath, namelist): + """ + Called once per directory with a list of all the contained files/dirs. + """ + ignore = ignore # Avoid warnings of an unused parameter + + if not dirpath.startswith(root_path): + output.Warn('Unable to decide what the root path is for directory: ' + '%s' % dirpath) + return + + for name in namelist: + PerFile(dirpath, name) + # end def PerDirectory + + output.Log('Walking DIRECTORY "%s"' % self._path, 1) + PerFile(self._path, None) + os.path.walk(self._path, PerDirectory, None) + # end def ProduceURLs +# end class InputDirectory - def DecideDirectory(dirpath): - subpath = dirpath[len(root_path):] - assert not subpath.startswith( "/" ), subpath +class InputAccessLog: + """ + Each Input class knows how to yield a set of URLs from a data source. - for remove in ( "assets", ): - if subpath == remove or subpath.startswith( remove + os.path.sep ): - return False - else: - return True + This one handles access logs. It's non-trivial in that we want to + auto-detect log files in the Common Logfile Format (as used by Apache, + for instance) and the Extended Log File Format (as used by IIS, for + instance). + """ - def PerFile(dirpath, name): - """ - Called once per file. - Note that 'name' will occasionally be None -- for a directory itself - """ - if not DecideDirectory(dirpath): - return - - if name is not None and not DecideFilename(name): - return - - # Pull a timestamp - url = URL() - isdir = False - try: - if name: - path = os.path.join(dirpath, name) + def __init__(self, attributes): + self._path = None # The file path + self._encoding = None # Encoding of that file + self._is_elf = False # Extended Log File Format? + self._is_clf = False # Common Logfile Format? 
+ self._elf_status = -1 # ELF field: '200' + self._elf_method = -1 # ELF field: 'HEAD' + self._elf_uri = -1 # ELF field: '/foo?bar=1' + self._elf_urifrag1 = -1 # ELF field: '/foo' + self._elf_urifrag2 = -1 # ELF field: 'bar=1' + + if not ValidateAttributes('ACCESSLOG', attributes, ('path', 'encoding')): + return + + self._path = attributes.get('path') + self._encoding = attributes.get('encoding', ENC_UTF8) + if self._path: + self._path = encoder.MaybeNarrowPath(self._path) + if os.path.isfile(self._path): + output.Log('Input: From ACCESSLOG "%s"' % self._path, 2) + else: + output.Error('Can not locate file: %s' % self._path) + self._path = None else: - path = dirpath - isdir = os.path.isdir(path) - time = None - if isdir and root_file: - file = os.path.join(path, root_file) - try: - time = os.stat(file)[stat.ST_MTIME]; - except OSError: - pass - if not time: - time = os.stat(path)[stat.ST_MTIME]; - url.lastmod = TimestampISO8601(time) - except OSError: - pass - except ValueError: - pass - - # Build a URL - middle = dirpath[len(root_path):] - if os.sep != '/': - middle = middle.replace(os.sep, '/') - if middle: - middle = middle + '/' - if name: - middle = middle + name - if isdir: - middle = middle + '/' - url.TrySetAttribute('loc', root_URL + encoder.WidenText(middle, None)) - - # Suppress default files. (All the way down here so we can log it.) - if name and (root_file == name): - url.Log(prefix='IGNORED (default file)', level=2) - return - - consumer(url, False) - #end def PerFile - - def PerDirectory(ignore, dirpath, namelist): - """ - Called once per directory with a list of all the contained files/dirs. - """ - ignore = ignore # Avoid warnings of an unused parameter - - if not dirpath.startswith(root_path): - output.Warn('Unable to decide what the root path is for directory: ' - '%s' % dirpath) - return - - if not DecideDirectory(dirpath): - return - - for name in namelist: - PerFile(dirpath, name) - #end def PerDirectory - - output.Log('Walking DIRECTORY "%s"' % self._path, 1) - PerFile(self._path, None) - os.path.walk(self._path, PerDirectory, None) - #end def ProduceURLs -#end class InputDirectory - + output.Error('Accesslog entries must have a "path" attribute.') + # end def __init__ + + def RecognizeELFLine(self, line): + """ Recognize the Fields directive that heads an ELF file """ + if not line.startswith('#Fields:'): + return False + fields = line.split(' ') + del fields[0] + for i in range(0, len(fields)): + field = fields[i].strip() + if field == 'sc-status': + self._elf_status = i + elif field == 'cs-method': + self._elf_method = i + elif field == 'cs-uri': + self._elf_uri = i + elif field == 'cs-uri-stem': + self._elf_urifrag1 = i + elif field == 'cs-uri-query': + self._elf_urifrag2 = i + output.Log('Recognized an Extended Log File Format file.', 2) + return True + # end def RecognizeELFLine + + def GetELFLine(self, line): + """ Fetch the requested URL from an ELF line """ + fields = line.split(' ') + count = len(fields) + + # Verify status was Ok + if self._elf_status >= 0: + if self._elf_status >= count: + return None + if not fields[self._elf_status].strip() == '200': + return None + + # Verify method was HEAD or GET + if self._elf_method >= 0: + if self._elf_method >= count: + return None + if not fields[self._elf_method].strip() in ('HEAD', 'GET'): + return None + + # Pull the full URL if we can + if self._elf_uri >= 0: + if self._elf_uri >= count: + return None + url = fields[self._elf_uri].strip() + if url != '-': + return url + + # Put together a fragmentary URL + 
if self._elf_urifrag1 >= 0: + if self._elf_urifrag1 >= count or self._elf_urifrag2 >= count: + return None + urlfrag1 = fields[self._elf_urifrag1].strip() + urlfrag2 = None + if self._elf_urifrag2 >= 0: + urlfrag2 = fields[self._elf_urifrag2] + if urlfrag1 and (urlfrag1 != '-'): + if urlfrag2 and (urlfrag2 != '-'): + urlfrag1 = urlfrag1 + '?' + urlfrag2 + return urlfrag1 -class InputAccessLog: - """ - Each Input class knows how to yield a set of URLs from a data source. - - This one handles access logs. It's non-trivial in that we want to - auto-detect log files in the Common Logfile Format (as used by Apache, - for instance) and the Extended Log File Format (as used by IIS, for - instance). - """ - - def __init__(self, attributes): - self._path = None # The file path - self._encoding = None # Encoding of that file - self._is_elf = False # Extended Log File Format? - self._is_clf = False # Common Logfile Format? - self._elf_status = -1 # ELF field: '200' - self._elf_method = -1 # ELF field: 'HEAD' - self._elf_uri = -1 # ELF field: '/foo?bar=1' - self._elf_urifrag1 = -1 # ELF field: '/foo' - self._elf_urifrag2 = -1 # ELF field: 'bar=1' - - if not ValidateAttributes('ACCESSLOG', attributes, ('path', 'encoding')): - return - - self._path = attributes.get('path') - self._encoding = attributes.get('encoding', ENC_UTF8) - if self._path: - self._path = encoder.MaybeNarrowPath(self._path) - if os.path.isfile(self._path): - output.Log('Input: From ACCESSLOG "%s"' % self._path, 2) - else: - output.Error('Can not locate file: %s' % self._path) - self._path = None - else: - output.Error('Accesslog entries must have a "path" attribute.') - #end def __init__ - - def RecognizeELFLine(self, line): - """ Recognize the Fields directive that heads an ELF file """ - if not line.startswith('#Fields:'): - return False - fields = line.split(' ') - del fields[0] - for i in range(0, len(fields)): - field = fields[i].strip() - if field == 'sc-status': - self._elf_status = i - elif field == 'cs-method': - self._elf_method = i - elif field == 'cs-uri': - self._elf_uri = i - elif field == 'cs-uri-stem': - self._elf_urifrag1 = i - elif field == 'cs-uri-query': - self._elf_urifrag2 = i - output.Log('Recognized an Extended Log File Format file.', 2) - return True - #end def RecognizeELFLine - - def GetELFLine(self, line): - """ Fetch the requested URL from an ELF line """ - fields = line.split(' ') - count = len(fields) - - # Verify status was Ok - if self._elf_status >= 0: - if self._elf_status >= count: return None - if not fields[self._elf_status].strip() == '200': + # end def GetELFLine + + def RecognizeCLFLine(self, line): + """ Try to tokenize a logfile line according to CLF pattern and see if + it works. """ + match = ACCESSLOG_CLF_PATTERN.match(line) + recognize = match and (match.group(1) in ('HEAD', 'GET')) + if recognize: + output.Log('Recognized a Common Logfile Format file.', 2) + return recognize + # end def RecognizeCLFLine + + def GetCLFLine(self, line): + """ Fetch the requested URL from a CLF line """ + match = ACCESSLOG_CLF_PATTERN.match(line) + if match: + request = match.group(1) + if request in ('HEAD', 'GET'): + return match.group(2) return None + # end def GetCLFLine + + def ProduceURLs(self, consumer): + """ Produces URLs from our data source, hands them in to the consumer. 
""" + + # Open the file + (frame, file) = OpenFileForRead(self._path, 'ACCESSLOG') + if not file: + return + + # Iterate lines + for line in file.readlines(): + if self._encoding: + line = encoder.WidenText(line, self._encoding) + line = line.strip() + + # If we don't know the format yet, try them both + if (not self._is_clf) and (not self._is_elf): + self._is_elf = self.RecognizeELFLine(line) + self._is_clf = self.RecognizeCLFLine(line) + + # Digest the line + match = None + if self._is_elf: + match = self.GetELFLine(line) + elif self._is_clf: + match = self.GetCLFLine(line) + if not match: + continue + + # Pass it on + url = URL() + url.TrySetAttribute('loc', match) + consumer(url, True) + + file.close() + if frame: + frame.close() + # end def ProduceURLs +# end class InputAccessLog - # Verify method was HEAD or GET - if self._elf_method >= 0: - if self._elf_method >= count: - return None - if not fields[self._elf_method].strip() in ('HEAD', 'GET'): - return None - # Pull the full URL if we can - if self._elf_uri >= 0: - if self._elf_uri >= count: - return None - url = fields[self._elf_uri].strip() - if url != '-': - return url +class FilePathGenerator: + """ + This class generates filenames in a series, upon request. + You can request any iteration number at any time, you don't + have to go in order. + + Example of iterations for '/path/foo.xml.gz': + 0 --> /path/foo.xml.gz + 1 --> /path/foo1.xml.gz + 2 --> /path/foo2.xml.gz + _index.xml --> /path/foo_index.xml + """ - # Put together a fragmentary URL - if self._elf_urifrag1 >= 0: - if self._elf_urifrag1 >= count or self._elf_urifrag2 >= count: - return None - urlfrag1 = fields[self._elf_urifrag1].strip() - urlfrag2 = None - if self._elf_urifrag2 >= 0: - urlfrag2 = fields[self._elf_urifrag2] - if urlfrag1 and (urlfrag1 != '-'): - if urlfrag2 and (urlfrag2 != '-'): - urlfrag1 = urlfrag1 + '?' + urlfrag2 - return urlfrag1 + def __init__(self): + self.is_gzip = False # Is this a GZIP file? + + self._path = None # '/path/' + self._prefix = None # 'foo' + self._suffix = None # '.xml.gz' + # end def __init__ + + def Preload(self, path): + """ Splits up a path into forms ready for recombination. """ + path = encoder.MaybeNarrowPath(path) + + # Get down to a base name + path = os.path.normpath(path) + base = os.path.basename(path).lower() + if not base: + output.Error('Couldn\'t parse the file path: %s' % path) + return False + lenbase = len(base) + + # Recognize extension + lensuffix = 0 + compare_suffix = ['.xml', '.xml.gz', '.gz'] + for suffix in compare_suffix: + if base.endswith(suffix): + lensuffix = len(suffix) + break + if not lensuffix: + output.Error('The path "%s" doesn\'t end in a supported file ' + 'extension.' % path) + return False + self.is_gzip = suffix.endswith('.gz') + + # Split the original path + lenpath = len(path) + self._path = path[:lenpath - lenbase] + self._prefix = path[lenpath - lenbase:lenpath - lensuffix] + self._suffix = path[lenpath - lensuffix:] - return None - #end def GetELFLine - - def RecognizeCLFLine(self, line): - """ Try to tokenize a logfile line according to CLF pattern and see if - it works. 
""" - match = ACCESSLOG_CLF_PATTERN.match(line) - recognize = match and (match.group(1) in ('HEAD', 'GET')) - if recognize: - output.Log('Recognized a Common Logfile Format file.', 2) - return recognize - #end def RecognizeCLFLine - - def GetCLFLine(self, line): - """ Fetch the requested URL from a CLF line """ - match = ACCESSLOG_CLF_PATTERN.match(line) - if match: - request = match.group(1) - if request in ('HEAD', 'GET'): - return match.group(2) - return None - #end def GetCLFLine - - def ProduceURLs(self, consumer): - """ Produces URLs from our data source, hands them in to the consumer. """ - - # Open the file - (frame, file) = OpenFileForRead(self._path, 'ACCESSLOG') - if not file: - return - - # Iterate lines - for line in file.readlines(): - if self._encoding: - line = encoder.WidenText(line, self._encoding) - line = line.strip() - - # If we don't know the format yet, try them both - if (not self._is_clf) and (not self._is_elf): - self._is_elf = self.RecognizeELFLine(line) - self._is_clf = self.RecognizeCLFLine(line) - - # Digest the line - match = None - if self._is_elf: - match = self.GetELFLine(line) - elif self._is_clf: - match = self.GetCLFLine(line) - if not match: - continue - - # Pass it on - url = URL() - url.TrySetAttribute('loc', match) - consumer(url, True) - - file.close() - if frame: - frame.close() - #end def ProduceURLs -#end class InputAccessLog - - -class InputSitemap(xml.sax.handler.ContentHandler): - - """ - Each Input class knows how to yield a set of URLs from a data source. - - This one handles Sitemap files and Sitemap index files. For the sake - of simplicity in design (and simplicity in interfacing with the SAX - package), we do not handle these at the same time, recursively. Instead - we read an index file completely and make a list of Sitemap files, then - go back and process each Sitemap. - """ - - class _ContextBase(object): - - """Base class for context handlers in our SAX processing. A context - handler is a class that is responsible for understanding one level of - depth in the XML schema. The class knows what sub-tags are allowed, - and doing any processing specific for the tag we're in. - - This base class is the API filled in by specific context handlers, - all defined below. - """ + return True + # end def Preload + + def GeneratePath(self, instance): + """ Generates the iterations, as described above. """ + prefix = self._path + self._prefix + if isinstance(instance, int): + if instance: + return '%s%d%s' % (prefix, instance, self._suffix) + return prefix + self._suffix + return prefix + instance + # end def GeneratePath + + def GenerateURL(self, instance, root_url): + """ Generates iterations, but as a URL instead of a path. 
""" + prefix = root_url + self._prefix + retval = None + if isinstance(instance, int): + if instance: + retval = '%s%d%s' % (prefix, instance, self._suffix) + else: + retval = prefix + self._suffix + else: + retval = prefix + instance + return URL.Canonicalize(retval) + # end def GenerateURL - def __init__(self, subtags): - """Initialize with a sequence of the sub-tags that would be valid in - this context.""" - self._allowed_tags = subtags # Sequence of sub-tags we can have - self._last_tag = None # Most recent seen sub-tag - #end def __init__ - - def AcceptTag(self, tag): - """Returns True iff opening a sub-tag is valid in this context.""" - valid = tag in self._allowed_tags - if valid: - self._last_tag = tag - else: - self._last_tag = None - return valid - #end def AcceptTag - - def AcceptText(self, text): - """Returns True iff a blurb of text is valid in this context.""" - return False - #end def AcceptText - - def Open(self): - """The context is opening. Do initialization.""" - pass - #end def Open - - def Close(self): - """The context is closing. Return our result, if any.""" - pass - #end def Close - - def Return(self, result): - """We're returning to this context after handling a sub-tag. This - method is called with the result data from the sub-tag that just - closed. Here in _ContextBase, if we ever see a result it means - the derived child class forgot to override this method.""" - if result: - raise NotImplementedError - #end def Return - #end class _ContextBase - - class _ContextUrlSet(_ContextBase): - - """Context handler for the document node in a Sitemap.""" + def GenerateWildURL(self, root_url): + """ Generates a wildcard that should match all our iterations """ + prefix = URL.Canonicalize(root_url + self._prefix) + temp = URL.Canonicalize(prefix + self._suffix) + suffix = temp[len(prefix):] + return prefix + '*' + suffix + # end def GenerateURL +# end class FilePathGenerator - def __init__(self): - InputSitemap._ContextBase.__init__(self, ('url',)) - #end def __init__ - #end class _ContextUrlSet - - class _ContextUrl(_ContextBase): - - """Context handler for a URL node in a Sitemap.""" - - def __init__(self, consumer): - """Initialize this context handler with the callable consumer that - wants our URLs.""" - InputSitemap._ContextBase.__init__(self, URL.__slots__) - self._url = None # The URL object we're building - self._consumer = consumer # Who wants to consume it - #end def __init__ - - def Open(self): - """Initialize the URL.""" - assert not self._url - self._url = URL() - #end def Open - - def Close(self): - """Pass the URL to the consumer and reset it to None.""" - assert self._url - self._consumer(self._url, False) - self._url = None - #end def Close - - def Return(self, result): - """A value context has closed, absorb the data it gave us.""" - assert self._url - if result: - self._url.TrySetAttribute(self._last_tag, result) - #end def Return - #end class _ContextUrl - - class _ContextSitemapIndex(_ContextBase): - - """Context handler for the document node in an index file.""" - def __init__(self): - InputSitemap._ContextBase.__init__(self, ('sitemap',)) - self._loclist = [] # List of accumulated Sitemap URLs - #end def __init__ - - def Open(self): - """Just a quick verify of state.""" - assert not self._loclist - #end def Open - - def Close(self): - """Return our list of accumulated URLs.""" - if self._loclist: - temp = self._loclist - self._loclist = [] - return temp - #end def Close - - def Return(self, result): - """Getting a new loc URL, add it to the 
collection.""" - if result: - self._loclist.append(result) - #end def Return - #end class _ContextSitemapIndex - - class _ContextSitemap(_ContextBase): - - """Context handler for a Sitemap entry in an index file.""" +class PerURLStatistics: + """ Keep track of some simple per-URL statistics, like file extension. """ def __init__(self): - InputSitemap._ContextBase.__init__(self, ('loc', 'lastmod')) - self._loc = None # The URL to the Sitemap - #end def __init__ - - def Open(self): - """Just a quick verify of state.""" - assert not self._loc - #end def Open - - def Close(self): - """Return our URL to our parent.""" - if self._loc: - temp = self._loc - self._loc = None - return temp - output.Warn('In the Sitemap index file, a "sitemap" entry had no "loc".') - #end def Close - - def Return(self, result): - """A value has closed. If it was a 'loc', absorb it.""" - if result and (self._last_tag == 'loc'): - self._loc = result - #end def Return - #end class _ContextSitemap - - class _ContextValue(_ContextBase): - - """Context handler for a single value. We return just the value. The - higher level context has to remember what tag led into us.""" + self._extensions = {} # Count of extension instances + # end def __init__ + + def Consume(self, url): + """ Log some stats for the URL. At the moment, that means extension. """ + if url and url.loc: + (scheme, netloc, path, query, frag) = urlsplit(url.loc) + if not path: + return + + # Recognize directories + if path.endswith('/'): + if '/' in self._extensions: + self._extensions['/'] = self._extensions['/'] + 1 + else: + self._extensions['/'] = 1 + return + + # Strip to a filename + i = path.rfind('/') + if i >= 0: + assert i < len(path) + path = path[i:] + + # Find extension + i = path.rfind('.') + if i > 0: + assert i < len(path) + ext = path[i:].lower() + if ext in self._extensions: + self._extensions[ext] = self._extensions[ext] + 1 + else: + self._extensions[ext] = 1 + else: + if '(no extension)' in self._extensions: + self._extensions['(no extension)'] = self._extensions[ + '(no extension)'] + 1 + else: + self._extensions['(no extension)'] = 1 + # end def Consume + + def Log(self): + """ Dump out stats to the output. 
""" + if len(self._extensions): + output.Log('Count of file extensions on URLs:', 1) + set = sorted(self._extensions.keys()) + for ext in set: + output.Log(' %7d %s' % (self._extensions[ext], ext), 1) + # end def Log - def __init__(self): - InputSitemap._ContextBase.__init__(self, ()) - self._text = None - #end def __init__ - - def AcceptText(self, text): - """Allow all text, adding it to our buffer.""" - if self._text: - self._text = self._text + text - else: - self._text = text - return True - #end def AcceptText - - def Open(self): - """Initialize our buffer.""" - self._text = None - #end def Open - - def Close(self): - """Return what's in our buffer.""" - text = self._text - self._text = None - if text: - text = text.strip() - return text - #end def Close - #end class _ContextValue - - def __init__(self, attributes): - """Initialize with a dictionary of attributes from our entry in the - config file.""" - xml.sax.handler.ContentHandler.__init__(self) - self._pathlist = None # A list of files - self._current = -1 # Current context in _contexts - self._contexts = None # The stack of contexts we allow - self._contexts_idx = None # ...contexts for index files - self._contexts_stm = None # ...contexts for Sitemap files - - if not ValidateAttributes('SITEMAP', attributes, ['path']): - return - - # Init the first file path - path = attributes.get('path') - if path: - path = encoder.MaybeNarrowPath(path) - if os.path.isfile(path): - output.Log('Input: From SITEMAP "%s"' % path, 2) - self._pathlist = [path] - else: - output.Error('Can not locate file "%s"' % path) - else: - output.Error('Sitemap entries must have a "path" attribute.') - #end def __init__ - - def ProduceURLs(self, consumer): - """In general: Produces URLs from our data source, hand them to the - callable consumer. - - In specific: Iterate over our list of paths and delegate the actual - processing to helper methods. This is a complexity no other data source - needs to suffer. We are unique in that we can have files that tell us - to bring in other files. - - Note the decision to allow an index file or not is made in this method. - If we call our parser with (self._contexts == None) the parser will - grab whichever context stack can handle the file. IE: index is allowed. - If instead we set (self._contexts = ...) before parsing, the parser - will only use the stack we specify. IE: index not allowed. - """ - # Set up two stacks of contexts - self._contexts_idx = [InputSitemap._ContextSitemapIndex(), - InputSitemap._ContextSitemap(), - InputSitemap._ContextValue()] - - self._contexts_stm = [InputSitemap._ContextUrlSet(), - InputSitemap._ContextUrl(consumer), - InputSitemap._ContextValue()] - - # Process the first file - assert self._pathlist - path = self._pathlist[0] - self._contexts = None # We allow an index file here - self._ProcessFile(path) - - # Iterate over remaining files - self._contexts = self._contexts_stm # No index files allowed - for path in self._pathlist[1:]: - self._ProcessFile(path) - #end def ProduceURLs - - def _ProcessFile(self, path): - """Do per-file reading/parsing/consuming for the file path passed in.""" - assert path - - # Open our file - (frame, file) = OpenFileForRead(path, 'SITEMAP') - if not file: - return - - # Rev up the SAX engine - try: - self._current = -1 - xml.sax.parse(file, self) - except SchemaError: - output.Error('An error in file "%s" made us abort reading the Sitemap.' 
- % path) - except IOError: - output.Error('Cannot read from file "%s"' % path) - except xml.sax._exceptions.SAXParseException as e: - output.Error('XML error in the file "%s" (line %d, column %d): %s' % - (path, e._linenum, e._colnum, e.getMessage())) - - # Clean up - file.close() - if frame: - frame.close() - #end def _ProcessFile - - def _MungeLocationListIntoFiles(self, urllist): - """Given a list of URLs, munge them into our self._pathlist property. - We do this by assuming all the files live in the same directory as - the first file in the existing pathlist. That is, we assume a - Sitemap index points to Sitemaps only in the same directory. This - is not true in general, but will be true for any output produced - by this script. - """ - assert self._pathlist - path = self._pathlist[0] - path = os.path.normpath(path) - dir = os.path.dirname(path) - wide = False - if type(path) == types.UnicodeType: - wide = True - - for url in urllist: - url = URL.Canonicalize(url) - output.Log('Index points to Sitemap file at: %s' % url, 2) - (scheme, netloc, path, query, frag) = urlsplit(url) - file = os.path.basename(path) - file = urllib.unquote(file) - if wide: - file = encoder.WidenText(file) - if dir: - file = dir + os.sep + file - if file: - self._pathlist.append(file) - output.Log('Will attempt to read Sitemap file: %s' % file, 1) - #end def _MungeLocationListIntoFiles - - def startElement(self, tag, attributes): - """SAX processing, called per node in the config stream. - As long as the new tag is legal in our current context, this - becomes an Open call on one context deeper. - """ - # If this is the document node, we may have to look for a context stack - if (self._current < 0) and not self._contexts: - assert self._contexts_idx and self._contexts_stm - if tag == 'urlset': - self._contexts = self._contexts_stm - elif tag == 'sitemapindex': - self._contexts = self._contexts_idx - output.Log('File is a Sitemap index.', 2) - else: - output.Error('The document appears to be neither a Sitemap nor a ' - 'Sitemap index.') - raise SchemaError - - # Display a kinder error on a common mistake - if (self._current < 0) and (self._contexts == self._contexts_stm) and ( - tag == 'sitemapindex'): - output.Error('A Sitemap index can not refer to another Sitemap index.') - raise SchemaError - - # Verify no unexpected attributes - if attributes: - text = '' - for attr in attributes.keys(): - # The document node will probably have namespaces - if self._current < 0: - if attr.find('xmlns') >= 0: - continue - if attr.find('xsi') >= 0: - continue - if text: - text = text + ', ' - text = text + attr - if text: - output.Warn('Did not expect any attributes on any tag, instead tag ' - '"%s" had attributes: %s' % (tag, text)) - - # Switch contexts - if (self._current < 0) or (self._contexts[self._current].AcceptTag(tag)): - self._current = self._current + 1 - assert self._current < len(self._contexts) - self._contexts[self._current].Open() - else: - output.Error('Can not accept tag "%s" where it appears.' % tag) - raise SchemaError - #end def startElement - - def endElement(self, tag): - """SAX processing, called per node in the config stream. - This becomes a call to Close on one context followed by a call - to Return on the previous. 
+ +class Sitemap(xml.sax.handler.ContentHandler): """ - tag = tag # Avoid warning on unused argument - assert self._current >= 0 - retval = self._contexts[self._current].Close() - self._current = self._current - 1 - if self._current >= 0: - self._contexts[self._current].Return(retval) - elif retval and (self._contexts == self._contexts_idx): - self._MungeLocationListIntoFiles(retval) - #end def endElement - - def characters(self, text): - """SAX processing, called when text values are read. Important to - note that one single text value may be split across multiple calls - of this method. + This is the big workhorse class that processes your inputs and spits + out sitemap files. It is built as a SAX handler for set up purposes. + That is, it processes an XML stream to bring itself up. """ - if (self._current < 0) or ( - not self._contexts[self._current].AcceptText(text)): - if text.strip(): - output.Error('Can not accept text "%s" where it appears.' % text) - raise SchemaError - #end def characters -#end class InputSitemap - - -class FilePathGenerator: - """ - This class generates filenames in a series, upon request. - You can request any iteration number at any time, you don't - have to go in order. - - Example of iterations for '/path/foo.xml.gz': - 0 --> /path/foo.xml.gz - 1 --> /path/foo1.xml.gz - 2 --> /path/foo2.xml.gz - _index.xml --> /path/foo_index.xml - """ - - def __init__(self): - self.is_gzip = False # Is this a GZIP file? - - self._path = None # '/path/' - self._prefix = None # 'foo' - self._suffix = None # '.xml.gz' - #end def __init__ - - def Preload(self, path): - """ Splits up a path into forms ready for recombination. """ - path = encoder.MaybeNarrowPath(path) - - # Get down to a base name - path = os.path.normpath(path) - base = os.path.basename(path).lower() - if not base: - output.Error('Couldn\'t parse the file path: %s' % path) - return False - lenbase = len(base) - - # Recognize extension - lensuffix = 0 - compare_suffix = ['.xml', '.xml.gz', '.gz'] - for suffix in compare_suffix: - if base.endswith(suffix): - lensuffix = len(suffix) - break - if not lensuffix: - output.Error('The path "%s" doesn\'t end in a supported file ' - 'extension.' % path) - return False - self.is_gzip = suffix.endswith('.gz') - - # Split the original path - lenpath = len(path) - self._path = path[:lenpath-lenbase] - self._prefix = path[lenpath-lenbase:lenpath-lensuffix] - self._suffix = path[lenpath-lensuffix:] - - return True - #end def Preload - - def GeneratePath(self, instance): - """ Generates the iterations, as described above. """ - prefix = self._path + self._prefix - if type(instance) == types.IntType: - if instance: - return '%s%d%s' % (prefix, instance, self._suffix) - return prefix + self._suffix - return prefix + instance - #end def GeneratePath - - def GenerateURL(self, instance, root_url): - """ Generates iterations, but as a URL instead of a path. 
""" - prefix = root_url + self._prefix - retval = None - if type(instance) == types.IntType: - if instance: - retval = '%s%d%s' % (prefix, instance, self._suffix) - else: - retval = prefix + self._suffix - else: - retval = prefix + instance - return URL.Canonicalize(retval) - #end def GenerateURL - def GenerateWildURL(self, root_url): - """ Generates a wildcard that should match all our iterations """ - prefix = URL.Canonicalize(root_url + self._prefix) - temp = URL.Canonicalize(prefix + self._suffix) - suffix = temp[len(prefix):] - return prefix + '*' + suffix - #end def GenerateURL -#end class FilePathGenerator + def __init__(self, suppress_notify): + xml.sax.handler.ContentHandler.__init__(self) + self._filters = [] # Filter objects + self._inputs = [] # Input objects + self._urls = {} # Maps URLs to count of dups + self._set = [] # Current set of URLs + self._filegen = None # Path generator for output files + self._wildurl1 = None # Sitemap URLs to filter out + self._wildurl2 = None # Sitemap URLs to filter out + self._sitemaps = 0 # Number of output files + # We init _dup_max to 2 so the default priority is 0.5 instead of 1.0 + self._dup_max = 2 # Max number of duplicate URLs + self._stat = PerURLStatistics() # Some simple stats + self._in_site = False # SAX: are we in a Site node? + self._in_Site_ever = False # SAX: were we ever in a Site? + + self._default_enc = None # Best encoding to try on URLs + self._base_url = None # Prefix to all valid URLs + self._store_into = None # Output filepath + self._sitemap_type = None # Sitemap type (web, mobile or news) + self._suppress = suppress_notify # Suppress notify of servers + # end def __init__ + + def ValidateBasicConfig(self): + """ Verifies (and cleans up) the basic user-configurable options. """ + all_good = True + + if self._default_enc: + encoder.SetUserEncoding(self._default_enc) + + # Canonicalize the base_url + if all_good and not self._base_url: + output.Error('A site needs a "base_url" attribute.') + all_good = False + if all_good and not URL.IsAbsolute(self._base_url): + output.Error('The "base_url" must be absolute, not relative: %s' % + self._base_url) + all_good = False + if all_good: + self._base_url = URL.Canonicalize(self._base_url) + if not self._base_url.endswith('/'): + self._base_url = self._base_url + '/' + output.Log('BaseURL is set to: %s' % self._base_url, 2) + + # Load store_into into a generator + if all_good: + if self._store_into: + self._filegen = FilePathGenerator() + if not self._filegen.Preload(self._store_into): + all_good = False + else: + output.Error('A site needs a "store_into" attribute.') + all_good = False + + # Ask the generator for patterns on what its output will look like + if all_good: + self._wildurl1 = self._filegen.GenerateWildURL(self._base_url) + self._wildurl2 = self._filegen.GenerateURL(SITEINDEX_SUFFIX, + self._base_url) + + # Unify various forms of False + if all_good: + if self._suppress: + if (isinstance(self._suppress, bytes_str)) or (isinstance(self._suppress, unicode_str)): + if (self._suppress == '0') or (self._suppress.lower() == 'false'): + self._suppress = False + + # Clean up the sitemap_type + if all_good: + match = False + # If sitemap_type is not specified, default to web sitemap + if not self._sitemap_type: + self._sitemap_type = 'web' + else: + self._sitemap_type = self._sitemap_type.lower() + for pattern in SITEMAP_TYPES: + if self._sitemap_type == pattern: + match = True + break + if not match: + output.Error('The "sitemap_type" value must be "web", "mobile" ' + 'or 
"news": %s' % self._sitemap_type) + all_good = False + output.Log('The Sitemap type is %s Sitemap.' % + self._sitemap_type.upper(), 0) + + # Done + if not all_good: + output.Log('See "example_config.xml" for more information.', 0) + return all_good + # end def ValidateBasicConfig + + def Generate(self): + """ Run over all the Inputs and ask them to Produce """ + # Run the inputs + for input in self._inputs: + input.ProduceURLs(self.ConsumeURL) + + # Do last flushes + if len(self._set): + self.FlushSet() + if not self._sitemaps: + output.Warn('No URLs were recorded, writing an empty sitemap.') + self.FlushSet() + + # Write an index as needed + if self._sitemaps > 1: + self.WriteIndex() + + # Notify + self.NotifySearch() + + # Dump stats + self._stat.Log() + # end def Generate + + def ConsumeURL(self, url, allow_fragment): + """ + All per-URL processing comes together here, regardless of Input. + Here we run filters, remove duplicates, spill to disk as needed, etc. + + """ + if not url: + return + + # Validate + if not url.Validate(self._base_url, allow_fragment): + return + + # Run filters + accept = None + for filter in self._filters: + accept = filter.Apply(url) + if accept is not None: + break + if not (accept or (accept is None)): + url.Log(prefix='FILTERED', level=2) + return + + # Ignore our out output URLs + if fnmatch.fnmatchcase(url.loc, self._wildurl1) or fnmatch.fnmatchcase( + url.loc, self._wildurl2): + url.Log(prefix='IGNORED (output file)', level=2) + return + + # Note the sighting + hash = url.MakeHash() + if hash in self._urls: + dup = self._urls[hash] + if dup > 0: + dup = dup + 1 + self._urls[hash] = dup + if self._dup_max < dup: + self._dup_max = dup + url.Log(prefix='DUPLICATE') + return + + # Acceptance -- add to set + self._urls[hash] = 1 + self._set.append(url) + self._stat.Consume(url) + url.Log() + + # Flush the set if needed + if len(self._set) >= MAXURLS_PER_SITEMAP: + self.FlushSet() + # end def ConsumeURL + + def FlushSet(self): + """ + Flush the current set of URLs to the output. This is a little + slow because we like to sort them all and normalize the priorities + before dumping. 
+ """ + + # Determine what Sitemap header to use (News or General) + if self._sitemap_type == 'news': + sitemap_header = NEWS_SITEMAP_HEADER + else: + sitemap_header = GENERAL_SITEMAP_HEADER + + # Sort and normalize + output.Log('Sorting and normalizing collected URLs.', 1) + self._set.sort() + for url in self._set: + hash = url.MakeHash() + dup = self._urls[hash] + if dup > 0: + self._urls[hash] = -1 + if not url.priority: + url.priority = '%.4f' % (float(dup) / float(self._dup_max)) + + # Get the filename we're going to write to + filename = self._filegen.GeneratePath(self._sitemaps) + if not filename: + output.Fatal('Unexpected: Couldn\'t generate output filename.') + self._sitemaps = self._sitemaps + 1 + output.Log('Writing Sitemap file "%s" with %d URLs' % + (filename, len(self._set)), 1) + + # Write to it + frame = None + file = None + try: + if self._filegen.is_gzip: + basename = os.path.basename(filename) + frame = open(filename, 'wb') + file = gzip.GzipFile( + fileobj=frame, filename=basename, mode='wt') + else: + file = open(filename, 'wt') + + file.write(sitemap_header) + for url in self._set: + url.WriteXML(file) + file.write(SITEMAP_FOOTER) + + file.close() + if frame: + frame.close() + + frame = None + file = None + except IOError: + output.Fatal('Couldn\'t write out to file: %s' % filename) + os.chmod(filename, 0o0644) + + # Flush + self._set = [] + # end def FlushSet + + def WriteIndex(self): + """ Write the master index of all Sitemap files """ + # Make a filename + filename = self._filegen.GeneratePath(SITEINDEX_SUFFIX) + if not filename: + output.Fatal( + 'Unexpected: Couldn\'t generate output index filename.') + output.Log('Writing index file "%s" with %d Sitemaps' % + (filename, self._sitemaps), 1) + + # Determine what Sitemap index header to use (News or General) + if self._sitemap_type == 'news': + sitemap_index_header = NEWS_SITEMAP_HEADER + else: + sitemap_index_header = GENERAL_SITEMAP_HEADER + + # Make a lastmod time + lastmod = TimestampISO8601(time.time()) + + # Write to it + try: + fd = open(filename, 'wt') + fd.write(sitemap_index_header) + + for mapnumber in range(0, self._sitemaps): + # Write the entry + mapurl = self._filegen.GenerateURL(mapnumber, self._base_url) + mapattributes = {'loc': mapurl, 'lastmod': lastmod} + fd.write(SITEINDEX_ENTRY % mapattributes) + + fd.write(SITEINDEX_FOOTER) + + fd.close() + fd = None + except IOError: + output.Fatal('Couldn\'t write out to file: %s' % filename) + os.chmod(filename, 0o0644) + # end def WriteIndex + + def NotifySearch(self): + """ Send notification of the new Sitemap(s) to the search engines. """ + if self._suppress: + output.Log('Search engine notification is suppressed.', 1) + return + + output.Log('Notifying search engines.', 1) + + # Override the urllib's opener class with one that doesn't ignore 404s + class ExceptionURLopener(FancyURLopener): + def http_error_default(self, url, fp, errcode, errmsg, headers): + output.Log('HTTP error %d: %s' % (errcode, errmsg), 2) + raise IOError + # end def http_error_default + # end class ExceptionURLOpener + if sys.version_info[0] == 3: + old_opener = urllib.request._urlopener + urllib.request._urlopener = ExceptionURLopener() + else: + old_opener = urllib._urlopener + urllib._urlopener = ExceptionURLopener() -class PerURLStatistics: - """ Keep track of some simple per-URL statistics, like file extension. 
""" - - def __init__(self): - self._extensions = {} # Count of extension instances - #end def __init__ - - def Consume(self, url): - """ Log some stats for the URL. At the moment, that means extension. """ - if url and url.loc: - (scheme, netloc, path, query, frag) = urlsplit(url.loc) - if not path: - return - - # Recognize directories - if path.endswith('/'): - if self._extensions.has_key('/'): - self._extensions['/'] = self._extensions['/'] + 1 + # Build the URL we want to send in + if self._sitemaps > 1: + url = self._filegen.GenerateURL(SITEINDEX_SUFFIX, self._base_url) else: - self._extensions['/'] = 1 - return - - # Strip to a filename - i = path.rfind('/') - if i >= 0: - assert i < len(path) - path = path[i:] - - # Find extension - i = path.rfind('.') - if i > 0: - assert i < len(path) - ext = path[i:].lower() - if self._extensions.has_key(ext): - self._extensions[ext] = self._extensions[ext] + 1 + url = self._filegen.GenerateURL(0, self._base_url) + + # Test if we can hit it ourselves + try: + u = urlopen(url) + u.close() + except IOError: + output.Error('When attempting to access our generated Sitemap at the ' + 'following URL:\n %s\n we failed to read it. Please ' + 'verify the store_into path you specified in\n' + ' your configuration file is web-accessable. Consult ' + 'the FAQ for more\n information.' % url) + output.Warn('Proceeding to notify with an unverifyable URL.') + + # Cycle through notifications + # To understand this, see the comment near the NOTIFICATION_SITES + # comment + for ping in NOTIFICATION_SITES: + query_map = ping[3] + query_attr = ping[5] + query_map[query_attr] = url + query = urllib.urlencode(query_map) + notify = urlunsplit((ping[0], ping[1], ping[2], query, ping[4])) + + # Send the notification + output.Log('Notifying: %s' % ping[1], 0) + output.Log('Notification URL: %s' % notify, 2) + try: + u = urlopen(notify) + u.read() + u.close() + except IOError: + output.Warn('Cannot contact: %s' % ping[1]) + + if old_opener: + if sys.version_info[0] == 3: + urllib.request._urlopener = old_opener + else: + urllib._urlopener = old_opener + # end def NotifySearch + + def startElement(self, tag, attributes): + """ SAX processing, called per node in the config stream. 
""" + if tag == 'site': + if self._in_site: + output.Error('Can not nest Site entries in the configuration.') + else: + self._in_site = True + + if not ValidateAttributes('SITE', attributes, + ('verbose', 'default_encoding', 'base_url', 'store_into', + 'suppress_search_engine_notify', 'sitemap_type')): + return + + verbose = attributes.get('verbose', 0) + if verbose: + output.SetVerbose(verbose) + + self._default_enc = attributes.get('default_encoding') + self._base_url = attributes.get('base_url') + self._store_into = attributes.get('store_into') + self._sitemap_type = attributes.get('sitemap_type') + if not self._suppress: + self._suppress = attributes.get( + 'suppress_search_engine_notify', + False) + self.ValidateBasicConfig() + elif tag == 'filter': + self._filters.append(Filter(attributes)) + + elif tag == 'url': + print(type(attributes)) + self._inputs.append(InputURL(attributes)) + + elif tag == 'urllist': + for attributeset in ExpandPathAttribute(attributes, 'path'): + if self._sitemap_type == 'news': + self._inputs.append(InputNewsURLList(attributeset)) + else: + self._inputs.append(InputURLList(attributeset)) + + elif tag == 'directory': + self._inputs.append(InputDirectory(attributes, self._base_url)) + + elif tag == 'accesslog': + for attributeset in ExpandPathAttribute(attributes, 'path'): + self._inputs.append(InputAccessLog(attributeset)) else: - self._extensions[ext] = 1 - else: - if self._extensions.has_key('(no extension)'): - self._extensions['(no extension)'] = self._extensions[ - '(no extension)'] + 1 + output.Error('Unrecognized tag in the configuration: %s' % tag) + # end def startElement + + def endElement(self, tag): + """ SAX processing, called per node in the config stream. """ + if tag == 'site': + assert self._in_site + self._in_site = False + self._in_site_ever = True + # end def endElement + + def endDocument(self): + """ End of SAX, verify we can proceed. """ + if not self._in_site_ever: + output.Error('The configuration must specify a "site" element.') else: - self._extensions['(no extension)'] = 1 - #end def Consume - - def Log(self): - """ Dump out stats to the output. """ - if len(self._extensions): - output.Log('Count of file extensions on URLs:', 1) - set = self._extensions.keys() - set.sort() - for ext in set: - output.Log(' %7d %s' % (self._extensions[ext], ext), 1) - #end def Log + if not self._inputs: + output.Warn('There were no inputs to generate a sitemap from.') + # end def endDocument +# end class Sitemap -class Sitemap(xml.sax.handler.ContentHandler): - """ - This is the big workhorse class that processes your inputs and spits - out sitemap files. It is built as a SAX handler for set up purposes. - That is, it processes an XML stream to bring itself up. - """ - - def __init__(self, suppress_notify): - xml.sax.handler.ContentHandler.__init__(self) - self._filters = [] # Filter objects - self._inputs = [] # Input objects - self._urls = {} # Maps URLs to count of dups - self._set = [] # Current set of URLs - self._filegen = None # Path generator for output files - self._wildurl1 = None # Sitemap URLs to filter out - self._wildurl2 = None # Sitemap URLs to filter out - self._sitemaps = 0 # Number of output files - # We init _dup_max to 2 so the default priority is 0.5 instead of 1.0 - self._dup_max = 2 # Max number of duplicate URLs - self._stat = PerURLStatistics() # Some simple stats - self._in_site = False # SAX: are we in a Site node? - self._in_Site_ever = False # SAX: were we ever in a Site? 
- - self._default_enc = None # Best encoding to try on URLs - self._base_url = None # Prefix to all valid URLs - self._store_into = None # Output filepath - self._suppress = suppress_notify # Suppress notify of servers - #end def __init__ - - def ValidateBasicConfig(self): - """ Verifies (and cleans up) the basic user-configurable options. """ - all_good = True - if self._default_enc: - encoder.SetUserEncoding(self._default_enc) - - # Canonicalize the base_url - if all_good and not self._base_url: - output.Error('A site needs a "base_url" attribute.') - all_good = False - if all_good and not URL.IsAbsolute(self._base_url): - output.Error('The "base_url" must be absolute, not relative: %s' % - self._base_url) - all_good = False - if all_good: - self._base_url = URL.Canonicalize(self._base_url) - if not self._base_url.endswith('/'): - self._base_url = self._base_url + '/' - output.Log('BaseURL is set to: %s' % self._base_url, 2) - - # Load store_into into a generator - if all_good: - if self._store_into: - self._filegen = FilePathGenerator() - if not self._filegen.Preload(self._store_into): - all_good = False - else: - output.Error('A site needs a "store_into" attribute.') - all_good = False - - # Ask the generator for patterns on what its output will look like - if all_good: - self._wildurl1 = self._filegen.GenerateWildURL(self._base_url) - self._wildurl2 = self._filegen.GenerateURL(SITEINDEX_SUFFIX, - self._base_url) - - # Unify various forms of False - if all_good: - if self._suppress: - if (type(self._suppress) == types.StringType) or (type(self._suppress) - == types.UnicodeType): - if (self._suppress == '0') or (self._suppress.lower() == 'false'): - self._suppress = False - - # Done - if not all_good: - output.Log('See "example_config.xml" for more information.', 0) +def ValidateAttributes(tag, attributes, goodattributes): + """ Makes sure 'attributes' does not contain any attribute not + listed in 'goodattributes' """ + all_good = True + for attr in attributes.keys(): + if not attr in goodattributes: + output.Error('Unknown %s attribute: %s' % (tag, attr)) + all_good = False return all_good - #end def ValidateBasicConfig - - def Generate(self): - """ Run over all the Inputs and ask them to Produce """ - # Run the inputs - for input in self._inputs: - input.ProduceURLs(self.ConsumeURL) - - # Do last flushes - if len(self._set): - self.FlushSet() - if not self._sitemaps: - output.Warn('No URLs were recorded, writing an empty sitemap.') - self.FlushSet() +# end def ValidateAttributes - # Write an index as needed - if self._sitemaps > 1: - self.WriteIndex() - # Notify - self.NotifySearch() +def ExpandPathAttribute(src, attrib): + """ Given a dictionary of attributes, return a list of dictionaries + with all the same attributes except for the one named attrib. + That one, we treat as a file path and expand into all its possible + variations. """ + # Do the path expansion. On any error, just return the source dictionary. 
+ path = src.get(attrib) + if not path: + return [src] + path = encoder.MaybeNarrowPath(path) + pathlist = glob.glob(path) + if not pathlist: + return [src] + + # If this isn't actually a dictionary, make it one + if not isinstance(src, dict): + tmp = {} + for key in src.keys(): + tmp[key] = src[key] + src = tmp + # Create N new dictionaries + retval = [] + for path in pathlist: + dst = src.copy() + dst[attrib] = path + retval.append(dst) + + return retval +# end def ExpandPathAttribute - # Dump stats - self._stat.Log() - #end def Generate - def ConsumeURL(self, url, allow_fragment): - """ - All per-URL processing comes together here, regardless of Input. - Here we run filters, remove duplicates, spill to disk as needed, etc. - """ - if not url: - return - - # Validate - if not url.Validate(self._base_url, allow_fragment): - return - - # Run filters - accept = None - for filter in self._filters: - accept = filter.Apply(url) - if accept != None: - break - if not (accept or (accept == None)): - url.Log(prefix='FILTERED', level=2) - return - - # Ignore our out output URLs - if fnmatch.fnmatchcase(url.loc, self._wildurl1) or fnmatch.fnmatchcase( - url.loc, self._wildurl2): - url.Log(prefix='IGNORED (output file)', level=2) - return - - # Note the sighting - hash = url.MakeHash() - if self._urls.has_key(hash): - dup = self._urls[hash] - if dup > 0: - dup = dup + 1 - self._urls[hash] = dup - if self._dup_max < dup: - self._dup_max = dup - url.Log(prefix='DUPLICATE') - return - - # Acceptance -- add to set - self._urls[hash] = 1 - self._set.append(url) - self._stat.Consume(url) - url.Log() - - # Flush the set if needed - if len(self._set) >= MAXURLS_PER_SITEMAP: - self.FlushSet() - #end def ConsumeURL - - def FlushSet(self): - """ - Flush the current set of URLs to the output. This is a little - slow because we like to sort them all and normalize the priorities - before dumping. 
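ExpandPathAttribute above turns one attribute dictionary with a wildcard path into one dictionary per matching file. A small sketch, assuming it runs inside this module and that glob.glob finds the (invented) files on disk:

    attrs = {'path': 'logs/access*.log', 'encoding': 'UTF-8'}
    for attributeset in ExpandPathAttribute(attrs, 'path'):
        print(attributeset)
    # e.g. {'path': 'logs/access1.log', 'encoding': 'UTF-8'}
    #      {'path': 'logs/access2.log', 'encoding': 'UTF-8'}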
- """ +def OpenFileForRead(path, logtext): + """ Opens a text file, be it GZip or plain """ - # Sort and normalize - output.Log('Sorting and normalizing collected URLs.', 1) - self._set.sort() - for url in self._set: - hash = url.MakeHash() - dup = self._urls[hash] - if dup > 0: - self._urls[hash] = -1 - if not url.priority: - url.priority = '%.4f' % (float(dup) / float(self._dup_max)) - - # Get the filename we're going to write to - filename = self._filegen.GeneratePath(self._sitemaps) - if not filename: - output.Fatal('Unexpected: Couldn\'t generate output filename.') - self._sitemaps = self._sitemaps + 1 - output.Log('Writing Sitemap file "%s" with %d URLs' % - (filename, len(self._set)), 1) - - # Write to it frame = None - file = None - - try: - if self._filegen.is_gzip: - basename = os.path.basename(filename); - frame = open(filename, 'wb') - file = gzip.GzipFile(fileobj=frame, filename=basename, mode='wt') - else: - file = open(filename, 'wt') - - file.write(SITEMAP_HEADER) - for url in self._set: - url.WriteXML(file) - file.write(SITEMAP_FOOTER) - - file.close() - if frame: - frame.close() - - frame = None - file = None - except IOError: - output.Fatal('Couldn\'t write out to file: %s' % filename) - os.chmod(filename, 0o0644) - - # Flush - self._set = [] - #end def FlushSet - - def WriteIndex(self): - """ Write the master index of all Sitemap files """ - # Make a filename - filename = self._filegen.GeneratePath(SITEINDEX_SUFFIX) - if not filename: - output.Fatal('Unexpected: Couldn\'t generate output index filename.') - output.Log('Writing index file "%s" with %d Sitemaps' % - (filename, self._sitemaps), 1) - - # Make a lastmod time - lastmod = TimestampISO8601(time.time()) - - # Write to it - try: - fd = open(filename, 'wt') - fd.write(SITEINDEX_HEADER) + file = None - for mapnumber in range(0,self._sitemaps): - # Write the entry - mapurl = self._filegen.GenerateURL(mapnumber, self._base_url) - mapattributes = { 'loc' : mapurl, 'lastmod' : lastmod } - fd.write(SITEINDEX_ENTRY % mapattributes) - - fd.write(SITEINDEX_FOOTER) - - fd.close() - fd = None - except IOError: - output.Fatal('Couldn\'t write out to file: %s' % filename) - os.chmod(filename, 0o0644) - #end def WriteIndex - - def NotifySearch(self): - """ Send notification of the new Sitemap(s) to the search engines. """ - if self._suppress: - output.Log('Search engine notification is suppressed.', 1) - return - - output.Log('Notifying search engines.', 1) - - # Override the urllib's opener class with one that doesn't ignore 404s - class ExceptionURLopener(urllib.FancyURLopener): - def http_error_default(self, url, fp, errcode, errmsg, headers): - output.Log('HTTP error %d: %s' % (errcode, errmsg), 2) - raise IOError - #end def http_error_default - #end class ExceptionURLOpener - old_opener = urllib._urlopener - urllib._urlopener = ExceptionURLopener() - - # Build the URL we want to send in - if self._sitemaps > 1: - url = self._filegen.GenerateURL(SITEINDEX_SUFFIX, self._base_url) - else: - url = self._filegen.GenerateURL(0, self._base_url) + if not path: + return (frame, file) - # Test if we can hit it ourselves try: - u = urllib.urlopen(url) - u.close() - except IOError: - output.Error('When attempting to access our generated Sitemap at the ' - 'following URL:\n %s\n we failed to read it. Please ' - 'verify the store_into path you specified in\n' - ' your configuration file is web-accessable. Consult ' - 'the FAQ for more\n information.' 
% url) - output.Warn('Proceeding to notify with an unverifyable URL.') - - # Cycle through notifications - # To understand this, see the comment near the NOTIFICATION_SITES comment - for ping in NOTIFICATION_SITES: - query_map = ping[3] - query_attr = ping[5] - query_map[query_attr] = url - query = urllib.urlencode(query_map) - notify = urlunsplit((ping[0], ping[1], ping[2], query, ping[4])) - - # Send the notification - output.Log('Notifying: %s' % ping[1], 1) - output.Log('Notification URL: %s' % notify, 2) - try: - u = urllib.urlopen(notify) - u.read() - u.close() - except IOError: - output.Warn('Cannot contact: %s' % ping[1]) - - if old_opener: - urllib._urlopener = old_opener - #end def NotifySearch - - def startElement(self, tag, attributes): - """ SAX processing, called per node in the config stream. """ - - if tag == 'site': - if self._in_site: - output.Error('Can not nest Site entries in the configuration.') - else: - self._in_site = True - - if not ValidateAttributes('SITE', attributes, - ('verbose', 'default_encoding', 'base_url', 'store_into', - 'suppress_search_engine_notify')): - return - - verbose = attributes.get('verbose', 0) - if verbose: - output.SetVerbose(verbose) - - self._default_enc = attributes.get('default_encoding') - self._base_url = attributes.get('base_url') - self._store_into = attributes.get('store_into') - if not self._suppress: - self._suppress = attributes.get('suppress_search_engine_notify', - False) - self.ValidateBasicConfig() - - elif tag == 'filter': - self._filters.append(Filter(attributes)) - - elif tag == 'url': - self._inputs.append(InputURL(attributes)) - - elif tag == 'urllist': - for attributeset in ExpandPathAttribute(attributes, 'path'): - self._inputs.append(InputURLList(attributeset)) - - elif tag == 'directory': - self._inputs.append(InputDirectory(attributes, self._base_url)) - - elif tag == 'accesslog': - for attributeset in ExpandPathAttribute(attributes, 'path'): - self._inputs.append(InputAccessLog(attributeset)) - - elif tag == 'sitemap': - for attributeset in ExpandPathAttribute(attributes, 'path'): - self._inputs.append(InputSitemap(attributeset)) - - else: - output.Error('Unrecognized tag in the configuration: %s' % tag) - #end def startElement - - def endElement(self, tag): - """ SAX processing, called per node in the config stream. """ - if tag == 'site': - assert self._in_site - self._in_site = False - self._in_site_ever = True - #end def endElement - - def endDocument(self): - """ End of SAX, verify we can proceed. """ - if not self._in_site_ever: - output.Error('The configuration must specify a "site" element.') - else: - if not self._inputs: - output.Warn('There were no inputs to generate a sitemap from.') - #end def endDocument -#end class Sitemap - - -def ValidateAttributes(tag, attributes, goodattributes): - """ Makes sure 'attributes' does not contain any attribute not - listed in 'goodattributes' """ - all_good = True - for attr in attributes.keys(): - if not attr in goodattributes: - output.Error('Unknown %s attribute: %s' % (tag, attr)) - all_good = False - return all_good -#end def ValidateAttributes + if path.endswith('.gz'): + frame = open(path, 'rb') + file = gzip.GzipFile(fileobj=frame, mode='rt') + else: + file = open(path, 'rt') -def ExpandPathAttribute(src, attrib): - """ Given a dictionary of attributes, return a list of dictionaries - with all the same attributes except for the one named attrib. - That one, we treat as a file path and expand into all its possible - variations. """ - # Do the path expansion. 
On any error, just return the source dictionary. - path = src.get(attrib) - if not path: - return [src] - path = encoder.MaybeNarrowPath(path); - pathlist = glob.glob(path) - if not pathlist: - return [src] - - # If this isn't actually a dictionary, make it one - if type(src) != types.DictionaryType: - tmp = {} - for key in src.keys(): - tmp[key] = src[key] - src = tmp - - # Create N new dictionaries - retval = [] - for path in pathlist: - dst = src.copy() - dst[attrib] = path - retval.append(dst) - - return retval -#end def ExpandPathAttribute + if logtext: + output.Log('Opened %s file: %s' % (logtext, path), 1) + else: + output.Log('Opened file: %s' % path, 1) + except IOError: + output.Error('Can not open file: %s' % path) -def OpenFileForRead(path, logtext): - """ Opens a text file, be it GZip or plain """ + return (frame, file) +# end def OpenFileForRead - frame = None - file = None - if not path: - return (frame, file) +def TimestampISO8601(t): + """Seconds since epoch (1970-01-01) --> ISO 8601 time string.""" + return time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(t)) +# end def TimestampISO8601 - try: - if path.endswith('.gz'): - frame = open(path, 'rb') - file = gzip.GzipFile(fileobj=frame, mode='rt') - else: - file = open(path, 'rt') - if logtext: - output.Log('Opened %s file: %s' % (logtext, path), 1) - else: - output.Log('Opened file: %s' % path, 1) - except IOError: - output.Error('Can not open file: %s' % path) +def CreateSitemapFromFile(configpath, suppress_notify): + """ Sets up a new Sitemap object from the specified configuration file. """ - return (frame, file) -#end def OpenFileForRead + # Remember error count on the way in + num_errors = output.num_errors -def TimestampISO8601(t): - """Seconds since epoch (1970-01-01) --> ISO 8601 time string.""" - return time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(t)) -#end def TimestampISO8601 + # Rev up SAX to parse the config + sitemap = Sitemap(suppress_notify) + try: + output.Log('Reading configuration file: %s' % configpath, 0) + xml.sax.parse(configpath, sitemap) + except IOError: + output.Error('Cannot read configuration file: %s' % configpath) + except xml.sax._exceptions.SAXParseException as e: + output.Error('XML error in the config file (line %d, column %d): %s' % + (e._linenum, e._colnum, e.getMessage())) + except xml.sax._exceptions.SAXReaderNotAvailable: + output.Error('Some installs of Python 2.2 did not include complete support' + ' for XML.\n Please try upgrading your version of Python' + ' and re-running the script.') + + # If we added any errors, return no sitemap + if num_errors == output.num_errors: + return sitemap + return None +# end def CreateSitemapFromFile -def CreateSitemapFromFile(configpath, suppress_notify): - """ Sets up a new Sitemap object from the specified configuration file. 
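For example, the TimestampISO8601 helper just above renders seconds-since-the-epoch values into the UTC lastmod format used throughout the generated files:

    print(TimestampISO8601(0))           # 1970-01-01T00:00:00Z
    print(TimestampISO8601(1087257600))  # 2004-06-15T00:00:00Z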
""" - - # Remember error count on the way in - num_errors = output.num_errors - - # Rev up SAX to parse the config - sitemap = Sitemap(suppress_notify) - try: - output.Log('Reading configuration file: %s' % configpath, 0) - xml.sax.parse(configpath, sitemap) - except IOError: - output.Error('Cannot read configuration file: %s' % configpath) - except xml.sax._exceptions.SAXParseException as e: - output.Error('XML error in the config file (line %d, column %d): %s' % - (e._linenum, e._colnum, e.getMessage())) - except xml.sax._exceptions.SAXReaderNotAvailable: - output.Error('Some installs of Python 2.2 did not include complete support' - ' for XML.\n Please try upgrading your version of Python' - ' and re-running the script.') - - # If we added any errors, return no sitemap - if num_errors == output.num_errors: - return sitemap - return None -#end def CreateSitemapFromFile def ProcessCommandFlags(args): - """ - Parse command line flags per specified usage, pick off key, value pairs - All flags of type "--key=value" will be processed as __flags[key] = value, - "--option" will be processed as __flags[option] = option - """ - - flags = {} - rkeyval = '--(?P<key>\S*)[=](?P<value>\S*)' # --key=val - roption = '--(?P<option>\S*)' # --key - r = '(' + rkeyval + ')|(' + roption + ')' - rc = re.compile(r) - for a in args: - try: - rcg = rc.search(a).groupdict() - if rcg.has_key('key'): - flags[rcg['key']] = rcg['value'] - if rcg.has_key('option'): - flags[rcg['option']] = rcg['option'] - except AttributeError: - return None - return flags -#end def ProcessCommandFlags + """ + Parse command line flags per specified usage, pick off key, value pairs + All flags of type "--key=value" will be processed as __flags[key] = value, + "--option" will be processed as __flags[option] = option + """ + + flags = {} + rkeyval = '--(?P<key>\S*)[=](?P<value>\S*)' # --key=val + roption = '--(?P<option>\S*)' # --key + r = '(' + rkeyval + ')|(' + roption + ')' + rc = re.compile(r) + for a in args: + try: + rcg = rc.search(a).groupdict() + if 'key' in rcg: + flags[rcg['key']] = rcg['value'] + if 'option' in rcg: + flags[rcg['option']] = rcg['option'] + except AttributeError: + return None + return flags +# end def ProcessCommandFlags # @@ -2224,15 +2123,15 @@ def ProcessCommandFlags(args): # if __name__ == '__main__': - flags = ProcessCommandFlags(sys.argv[1:]) - if not flags or not flags.has_key('config') or flags.has_key('help'): - output.Log(__usage__, 0) - else: - suppress_notify = flags.has_key('testing') - sitemap = CreateSitemapFromFile(flags['config'], suppress_notify) - if not sitemap: - output.Log('Configuration file errors -- exiting.', 0) + flags = ProcessCommandFlags(sys.argv[1:]) + if not flags or not 'config' in flags or 'help' in flags: + output.Log(__usage__, 0) else: - sitemap.Generate() - output.Log('Number of errors: %d' % output.num_errors, 1) - output.Log('Number of warnings: %d' % output.num_warns, 1) + suppress_notify = 'testing' in flags + sitemap = CreateSitemapFromFile(flags['config'], suppress_notify) + if not sitemap: + output.Log('Configuration file errors -- exiting.', 0) + else: + sitemap.Generate() + output.Log('Number of errors: %d' % output.num_errors, 1) + output.Log('Number of warnings: %d' % output.num_warns, 1) |
