summary | refs | log | tree | commit | diff | stats
path: root/gallery_dl/extractor/blogger.py
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/extractor/blogger.py')
-rw-r--r-- gallery_dl/extractor/blogger.py 178
1 files changed, 178 insertions, 0 deletions
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
new file mode 100644
index 0000000..31bbaf8
--- /dev/null
+++ b/gallery_dl/extractor/blogger.py
@@ -0,0 +1,178 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for Blogger blogs"""
+
+from .common import Extractor, Message
+from .. import text
+import re
+
# URL prefix shared by all Blogger extractors.
#   group 1: host name following an explicit "blogger:" prefix
#            (used for blogs served from a custom domain)
#   group 2: "<name>.blogspot.com" host name
# The blogspot label is restricted to word characters and hyphens so that
# slashes, colons or query strings can never become part of the blog name.
BASE_PATTERN = (
    r"(?:blogger:(?:https?://)?([^/]+)|"
    r"(?:https?://)?([\w-]+\.blogspot\.com))")
+
+
class BloggerExtractor(Extractor):
    """Shared logic for all Blogger extractors

    Subclasses choose the posts to process by overriding posts();
    items() emits a directory message for every post whose content
    references at least one '*.bp.blogspot.com' image, followed by
    one URL message per image.
    """
    category = "blogger"
    directory_fmt = ("{category}", "{blog[name]}",
                     "{post[date]:%Y-%m-%d} {post[title]}")
    filename_fmt = "{num:>03}.{extension}"
    archive_fmt = "{post[id]}_{num}"
    root = "https://www.blogger.com"

    def __init__(self, match):
        super().__init__(match)
        # group 1: host given with a 'blogger:' prefix
        # group 2: '<name>.blogspot.com' host
        prefixed_host, blogspot_host = match.group(1, 2)
        self.blog = prefixed_host or blogspot_host
        self.api = BloggerAPI(self)

    def items(self):
        """Yield version, directory and URL messages for all posts"""
        yield Message.Version, 1

        # 'src' attributes of images embedded in the post HTML
        image_sources = re.compile(
            r'src="(https?://\d+\.bp\.blogspot\.com/[^"]+)"')
        # '/s<number>/' path segment; replaced with '/s0/' below
        size_segment = re.compile(r"/s\d+/")

        blog = self.api.blog_by_url("http://" + self.blog)
        # collapse the nested counter objects into their totals
        for key in ("pages", "posts"):
            blog[key] = blog[key]["totalItems"]
        blog["date"] = text.parse_datetime(blog["published"])
        blog.pop("selfLink")

        for post in self.posts(blog):
            html = post["content"]
            sources = image_sources.findall(html)
            if not sources:
                # nothing to download from this post
                continue

            # flatten nested objects and drop unneeded entries
            post["author"] = post["author"]["displayName"]
            post["replies"] = post["replies"]["totalItems"]
            post["content"] = text.remove_html(html)
            post["date"] = text.parse_datetime(post["published"])
            for key in ("selfLink", "blog"):
                del post[key]

            yield Message.Directory, {"blog": blog, "post": post}

            for num, src in enumerate(sources, 1):
                url = size_segment.sub("/s0/", src)
                url = url.replace("http:", "https:", 1)
                metadata = {
                    "blog": blog,
                    "post": post,
                    "url" : url,
                    "num" : num,
                }
                yield Message.Url, url, text.nameext_from_url(url, metadata)

    def posts(self, blog):
        """Return an iterable of the post objects to process

        Hook for subclasses; 'blog' is the blog object prepared by items().
        """
+
+
class BloggerPostExtractor(BloggerExtractor):
    """Extractor for the images of one individual blog post"""
    subcategory = "post"
    pattern = BASE_PATTERN + r"(/\d{4}/\d\d/[^/?&#]+\.html)"
    test = (
        ("https://julianbphotography.blogspot.com/2010/12/moon-rise.html", {
            "url": "9928429fb62f712eb4de80f53625eccecc614aae",
            "pattern": r"https://3.bp.blogspot.com/.*/s0/Icy-Moonrise-.*.jpg",
            "keyword": {
                "blog": {
                    "date"       : "type:datetime",
                    "description": "",
                    "id"         : "5623928067739466034",
                    "kind"       : "blogger#blog",
                    "locale"     : dict,
                    "name"       : "Julian Bunker Photography",
                    "pages"      : int,
                    "posts"      : int,
                    "published"  : "2010-11-21T10:19:42-08:00",
                    "updated"    : str,
                    "url"        : "http://www.julianbunker.com/",
                },
                "post": {
                    "author"     : "Julian Bunker",
                    "content"    : str,
                    "date"       : "type:datetime",
                    "etag"       : str,
                    "id"         : "6955139236418998998",
                    "kind"       : "blogger#post",
                    "published"  : "2010-12-25T17:08:00-08:00",
                    "replies"    : "0",
                    "title"      : "Moon Rise",
                    "updated"    : "2011-12-06T05:21:24-08:00",
                    "url"        : "re:.+/2010/12/moon-rise.html$",
                },
                "num": int,
                "url": str,
            },
        }),
        ("blogger:http://www.julianbunker.com/2010/12/moon-rise.html", {
            "url": "9928429fb62f712eb4de80f53625eccecc614aae",
        }),
    )

    def __init__(self, match):
        super().__init__(match)
        # third group: the '/YYYY/MM/<name>.html' part of the URL
        self.path = match.group(3)

    def posts(self, blog):
        """Return a 1-tuple holding the post found at the stored path"""
        post = self.api.post_by_path(blog["id"], self.path)
        return (post,)
+
+
class BloggerBlogExtractor(BloggerExtractor):
    """Extractor for every post of a Blogger blog"""
    subcategory = "blog"
    pattern = BASE_PATTERN + "/?$"
    test = (
        ("https://julianbphotography.blogspot.com/", {
            "range": "1-25",
            "count": 25,
            "pattern": r"https://\d\.bp\.blogspot\.com/.*/s0/[^.]+\.jpg",
        }),
        ("blogger:http://www.julianbunker.com/", {
            "range": "1-25",
            "count": 25,
        }),
    )

    def posts(self, blog):
        """Return an iterable over the post objects of 'blog'"""
        blog_id = blog["id"]
        return self.api.blog_posts(blog_id)
+
+
class BloggerAPI():
    """Minimal interface for the Blogger v3 API

    Ref: https://developers.google.com/blogger
    """
    # key used when the extractor's "api-key" option is not set
    API_KEY = "AIzaSyCN9ax34oMMyM07g_M-5pjeDp_312eITK8"

    def __init__(self, extractor):
        """Bind the API to 'extractor'

        The extractor performs the HTTP requests (request()) and
        supplies the "api-key" option (config()).
        """
        self.extractor = extractor
        self.api_key = extractor.config("api-key", self.API_KEY)

    def blog_by_url(self, url):
        """Return the blog object for the blog located at 'url'"""
        return self._call("blogs/byurl", {"url": url})

    def blog_posts(self, blog_id):
        """Return a generator over all post objects of a blog"""
        return self._pagination("blogs/{}/posts".format(blog_id), {})

    def post_by_path(self, blog_id, path):
        """Return the post object at 'path' of the given blog"""
        endpoint = "blogs/{}/posts/bypath".format(blog_id)
        return self._call(endpoint, {"path": path})

    def _call(self, endpoint, params):
        """Request 'endpoint' and return the decoded JSON response

        The API key is added to 'params' in place.
        """
        url = "https://www.googleapis.com/blogger/v3/" + endpoint
        params["key"] = self.api_key
        return self.extractor.request(url, params=params).json()

    def _pagination(self, endpoint, params):
        """Yield the items of every result page of 'endpoint'

        Pages are requested until a response carries no "nextPageToken".
        """
        while True:
            data = self._call(endpoint, params)
            # a page without results may lack the "items" key entirely;
            # treat it as empty instead of failing with a KeyError
            yield from data.get("items") or ()

            if "nextPageToken" not in data:
                return
            # 'params' is reused for the next request
            params["pageToken"] = data["nextPageToken"]