# -*- coding: utf-8 -*- # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extractors for https://hatenablog.com""" from .common import Extractor, Message from .. import text BASE_PATTERN = ( r"(?:hatenablog:https?://([^/?#]+)|(?:https?://)?" r"([\w-]+\.(?:hatenablog\.(?:com|jp)" r"|hatenadiary\.com|hateblo\.jp)))" ) QUERY_RE = r"(?:\?([^#]*))?(?:#.*)?$" class HatenablogExtractor(Extractor): """Base class for HatenaBlog extractors""" category = "hatenablog" directory_fmt = ("{category}", "{domain}") filename_fmt = "{category}_{domain}_{entry}_{num:>02}.{extension}" archive_fmt = "{filename}" def __init__(self, match): Extractor.__init__(self, match) self.domain = match[1] or match[2] def _init(self): self._find_img = text.re(r']+)').finditer def _handle_article(self, article: str): extr = text.extract_from(article) date = self.parse_datetime_iso(extr('