diff options
Diffstat (limited to 'gallery_dl/extractor/hatenablog.py')
| -rw-r--r-- | gallery_dl/extractor/hatenablog.py | 167 |
1 file changed, 167 insertions(+), 0 deletions(-)
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://hatenablog.com"""

import re
from .common import Extractor, Message
from .. import text


# group 1 captures the host of a "hatenablog:" prefixed URL,
# group 2 a recognized HatenaBlog domain
BASE_PATTERN = (
    r"(?:hatenablog:https?://([^/?#]+)|(?:https?://)?"
    r"([\w-]+\.(?:hatenablog\.(?:com|jp)"
    r"|hatenadiary\.com|hateblo\.jp)))"
)
# group 4: optional query string (without '?'), fragment discarded
QUERY_RE = r"(?:\?([^#]*))?(?:#.*)?$"


class HatenablogExtractor(Extractor):
    """Base class for HatenaBlog extractors"""
    category = "hatenablog"
    directory_fmt = ("{category}", "{domain}")
    filename_fmt = "{category}_{domain}_{entry}_{num:>02}.{extension}"
    # FIX: this was the constant "(unknown)", which gave every file the
    # same archive key, so all but the first download were skipped as
    # duplicates.  Key on the unique filename instead.
    archive_fmt = "{filename}"

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.domain = match.group(1) or match.group(2)

    def _init(self):
        # matcher for '<img ...' attribute strings inside entry content
        self._find_img = re.compile(r'<img +([^>]+)').finditer

    def _handle_article(self, article: str):
        """Yield a Directory message and one Url message per image
        found in a single '<article>' body.
        """
        extr = text.extract_from(article)
        date = text.parse_datetime(extr('<time datetime="', '"'))
        entry_link = text.unescape(extr('<a href="', '"'))
        # the path component after "/entry/" uniquely names the entry
        entry = entry_link.partition("/entry/")[2]
        title = text.unescape(extr('>', '<'))
        content = extr(
            '<div class="entry-content hatenablog-entry">', '</div>')

        # keep only images hosted on Hatena Fotolife
        images = []
        for img in self._find_img(content):
            attributes = img.group(1)
            if 'class="hatena-fotolife"' not in attributes:
                continue
            image = text.unescape(text.extr(attributes, 'src="', '"'))
            images.append(image)

        data = {
            "domain": self.domain,
            "date": date,
            "entry": entry,
            "title": title,
            "count": len(images),
        }
        yield Message.Directory, data
        for data["num"], url in enumerate(images, 1):
            yield Message.Url, url, text.nameext_from_url(url, data)


class HatenablogEntriesExtractor(HatenablogExtractor):
    """Base class for a list of entries"""
    # query parameter names (besides "page") forwarded to the site
    allowed_parameters = ()

    def __init__(self, match):
        HatenablogExtractor.__init__(self, match)
        self.path = match.group(3)
        # drop all query parameters except pagination and the whitelist
        self.query = {key: value for key, value in text.parse_query(
            match.group(4)).items() if self._acceptable_query(key)}

    def _init(self):
        HatenablogExtractor._init(self)
        self._find_pager_url = re.compile(
            r' class="pager-next">\s*<a href="([^"]+)').search

    def items(self):
        url = "https://" + self.domain + self.path
        query = self.query

        while url:
            page = self.request(url, params=query).text

            extr = text.extract_from(page)
            attributes = extr('<body ', '>')
            if "page-archive" in attributes:
                # archive pages only list entry links; enqueue them
                yield from self._handle_partial_articles(extr)
            else:
                # regular pages embed the complete entries
                yield from self._handle_full_articles(extr)

            match = self._find_pager_url(page)
            url = text.unescape(match.group(1)) if match else None
            # the pager URL already carries its own query string
            query = None

    def _handle_partial_articles(self, extr):
        """Queue one HatenablogEntryExtractor per archive listing."""
        while True:
            section = extr('<section class="archive-entry', '</section>')
            if not section:
                break

            url = "hatenablog:" + text.unescape(text.extr(
                section, '<a class="entry-title-link" href="', '"'))
            data = {"_extractor": HatenablogEntryExtractor}
            yield Message.Queue, url, data

    def _handle_full_articles(self, extr):
        """Extract files from every complete '<article>' on the page."""
        while True:
            attributes = extr('<article ', '>')
            if not attributes:
                break
            if "no-entry" in attributes:
                continue

            article = extr('', '</article>')
            yield from self._handle_article(article)

    def _acceptable_query(self, key):
        return key == "page" or key in self.allowed_parameters


class HatenablogEntryExtractor(HatenablogExtractor):
    """Extractor for a single entry URL"""
    subcategory = "entry"
    pattern = BASE_PATTERN + r"/entry/([^?#]+)" + QUERY_RE
    example = "https://BLOG.hatenablog.com/entry/PATH"

    def __init__(self, match):
        HatenablogExtractor.__init__(self, match)
        self.path = match.group(3)

    def items(self):
        url = "https://" + self.domain + "/entry/" + self.path
        page = self.request(url).text

        extr = text.extract_from(page)
        while True:
            attributes = extr('<article ', '>')
            if not attributes:
                # FIX: previously an empty match fell through and handed
                # "" to _handle_article, emitting a Directory message
                # with empty metadata for pages without a real article
                return
            if "no-entry" in attributes:
                continue
            article = extr('', '</article>')
            yield from self._handle_article(article)
            return


class HatenablogHomeExtractor(HatenablogEntriesExtractor):
    """Extractor for a blog's home page"""
    subcategory = "home"
    pattern = BASE_PATTERN + r"(/?)" + QUERY_RE
    example = "https://BLOG.hatenablog.com"


class HatenablogArchiveExtractor(HatenablogEntriesExtractor):
    """Extractor for a blog's archive page"""
    subcategory = "archive"
    pattern = (BASE_PATTERN + r"(/archive(?:/\d+(?:/\d+(?:/\d+)?)?"
               r"|/category/[^?#]+)?)" + QUERY_RE)
    example = "https://BLOG.hatenablog.com/archive/2024"


class HatenablogSearchExtractor(HatenablogEntriesExtractor):
    """Extractor for a blog's search results"""
    subcategory = "search"
    pattern = BASE_PATTERN + r"(/search)" + QUERY_RE
    example = "https://BLOG.hatenablog.com/search?q=QUERY"
    allowed_parameters = ("q",)
