summaryrefslogtreecommitdiffstats
path: root/gallery_dl/extractor/hatenablog.py
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/extractor/hatenablog.py')
-rw-r--r--gallery_dl/extractor/hatenablog.py167
1 files changed, 167 insertions, 0 deletions
diff --git a/gallery_dl/extractor/hatenablog.py b/gallery_dl/extractor/hatenablog.py
new file mode 100644
index 0000000..792f666
--- /dev/null
+++ b/gallery_dl/extractor/hatenablog.py
@@ -0,0 +1,167 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hatenablog.com"""
+
+import re
+from .common import Extractor, Message
+from .. import text
+
+
# URL prefix shared by every extractor pattern in this module.
# Exactly one of its two capture groups participates in a match:
#   group 1 - host of an explicitly prefixed 'hatenablog:http(s)://HOST' URL
#   group 2 - host on one of the listed Hatena blog domains
BASE_PATTERN = (
    r"(?:"
    r"hatenablog:https?://([^/?#]+)"
    r"|"
    r"(?:https?://)?"
    r"([\w-]+\.(?:hatenablog\.(?:com|jp)|hatenadiary\.com|hateblo\.jp))"
    r")"
)

# URL suffix: optional query string (captured without the '?'),
# optional fragment (ignored), then end of input.
QUERY_RE = r"(?:\?([^#]*))?" r"(?:#.*)?" r"$"
+
+
class HatenablogExtractor(Extractor):
    """Base class for HatenaBlog extractors"""
    category = "hatenablog"
    directory_fmt = ("{category}", "{domain}")
    filename_fmt = "{category}_{domain}_{entry}_{num:>02}.{extension}"
    archive_fmt = "{filename}"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # BASE_PATTERN captures the host in group 1 ('hatenablog:' prefixed
        # URLs) or in group 2 (known Hatena blog domains)
        self.domain = match.group(1) or match.group(2)

    def _init(self):
        # captures the attribute text of every <img> tag
        img_tag = re.compile(r'<img +([^>]+)')
        self._find_img = img_tag.finditer

    def _handle_article(self, article: str):
        """Yield a Directory message and one Url message per image

        'article' is the HTML found inside a single <article> element.
        Only <img> tags carrying class="hatena-fotolife" are collected.
        """
        # fields are read with successive extr() calls, so the order of
        # the calls below follows the order of the markers being searched
        extr = text.extract_from(article)
        date = text.parse_datetime(extr('<time datetime="', '"'))
        link = text.unescape(extr('<a href="', '"'))
        title = text.unescape(extr('>', '<'))
        content = extr(
            '<div class="entry-content hatenablog-entry">', '</div>')

        # everything after "/entry/" in the entry link
        # (empty string when the link has no such part)
        entry = link.partition("/entry/")[2]

        urls = [
            text.unescape(text.extr(img.group(1), 'src="', '"'))
            for img in self._find_img(content)
            if 'class="hatena-fotolife"' in img.group(1)
        ]

        data = {
            "domain": self.domain,
            "date": date,
            "entry": entry,
            "title": title,
            "count": len(urls),
        }
        yield Message.Directory, data

        # the same 'data' dict is updated and passed on for every image
        for num, url in enumerate(urls, 1):
            data["num"] = num
            yield Message.Url, url, text.nameext_from_url(url, data)
+
+
class HatenablogEntriesExtractor(HatenablogExtractor):
    """Base class for a list of entries"""
    # query parameter names (besides "page") that subclasses want to keep
    allowed_parameters = ()

    def __init__(self, match):
        HatenablogExtractor.__init__(self, match)
        self.path = match.group(3)

        # keep only the query parameters accepted by _acceptable_query()
        params = text.parse_query(match.group(4))
        self.query = {
            name: params[name]
            for name in params
            if self._acceptable_query(name)
        }

    def _init(self):
        HatenablogExtractor._init(self)
        # captures the href of the link inside the "pager-next" element
        pager = re.compile(r' class="pager-next">\s*<a href="([^"]+)')
        self._find_pager_url = pager.search

    def items(self):
        next_url = "https://" + self.domain + self.path
        # query parameters are only sent with the first request;
        # later requests use the pager URL as found in the page
        params = self.query

        while next_url:
            page = self.request(next_url, params=params).text
            params = None

            extr = text.extract_from(page)
            body_attrs = extr('<body ', '>')
            if "page-archive" in body_attrs:
                handler = self._handle_partial_articles
            else:
                handler = self._handle_full_articles
            yield from handler(extr)

            pager = self._find_pager_url(page)
            if pager:
                next_url = text.unescape(pager.group(1))
            else:
                next_url = None

    def _handle_partial_articles(self, extr):
        """Queue the entry URL of every 'archive-entry' section"""
        section = extr('<section class="archive-entry', '</section>')
        while section:
            href = text.extr(
                section, '<a class="entry-title-link" href="', '"')
            # 'hatenablog:' prefix matches BASE_PATTERN's first alternative
            url = "hatenablog:" + text.unescape(href)
            yield Message.Queue, url, {"_extractor": HatenablogEntryExtractor}

            section = extr('<section class="archive-entry', '</section>')

    def _handle_full_articles(self, extr):
        """Process every <article> element except 'no-entry' ones"""
        attributes = extr('<article ', '>')
        while attributes:
            if "no-entry" not in attributes:
                yield from self._handle_article(extr('', '</article>'))
            attributes = extr('<article ', '>')

    def _acceptable_query(self, key):
        """Return True if query parameter 'key' should be kept"""
        if key == "page":
            return True
        return key in self.allowed_parameters
+
+
class HatenablogEntryExtractor(HatenablogExtractor):
    """Extractor for a single entry URL"""
    subcategory = "entry"
    pattern = BASE_PATTERN + r"/entry/([^?#]+)" + QUERY_RE
    example = "https://BLOG.hatenablog.com/entry/PATH"

    def __init__(self, match):
        HatenablogExtractor.__init__(self, match)
        # everything after "/entry/" up to the query string or fragment
        self.path = match.group(3)

    def items(self):
        """Yield the messages for the first real <article> of the entry page

        Articles whose opening tag contains "no-entry" are skipped.
        Nothing is yielded when the page has no (further) <article>
        element, the same stop condition used by
        HatenablogEntriesExtractor._handle_full_articles().
        """
        url = "https://" + self.domain + "/entry/" + self.path
        page = self.request(url).text

        extr = text.extract_from(page)
        while True:
            attributes = extr('<article ', '>')
            if not attributes:
                # no article found: do not hand an empty string to
                # _handle_article(), which would emit a Directory
                # message with empty metadata
                return
            if "no-entry" in attributes:
                continue
            yield from self._handle_article(extr('', '</article>'))
            return
+
+
class HatenablogHomeExtractor(HatenablogEntriesExtractor):
    """Extractor for a blog's home page"""
    subcategory = "home"
    example = "https://BLOG.hatenablog.com"
    # group 3 (used as request path): an optional trailing slash
    pattern = (BASE_PATTERN +
               r"(/?)" +
               QUERY_RE)
+
+
class HatenablogArchiveExtractor(HatenablogEntriesExtractor):
    """Extractor for a blog's archive page"""
    subcategory = "archive"
    example = "https://BLOG.hatenablog.com/archive/2024"
    # group 3 (used as request path): "/archive", optionally followed by
    # one to three numeric path segments or by "/category/NAME"
    pattern = (BASE_PATTERN +
               r"(/archive"
               r"(?:/\d+(?:/\d+(?:/\d+)?)?|/category/[^?#]+)?"
               r")" +
               QUERY_RE)
+
+
class HatenablogSearchExtractor(HatenablogEntriesExtractor):
    """Extractor for a blog's search results"""
    subcategory = "search"
    example = "https://BLOG.hatenablog.com/search?q=QUERY"
    # keep the search term "q" in addition to the always accepted "page"
    allowed_parameters = ("q",)
    # group 3 (used as request path): the literal "/search"
    pattern = (BASE_PATTERN +
               r"(/search)" +
               QUERY_RE)