summary refs log tree commit diff stats
path: root/gallery_dl/extractor/blogger.py
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/extractor/blogger.py')
-rw-r--r-- gallery_dl/extractor/blogger.py 48
1 files changed, 36 insertions, 12 deletions
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index 31bbaf8..2657b5d 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,6 +10,7 @@
from .common import Extractor, Message
from .. import text
+import json
import re
BASE_PATTERN = (
@@ -28,6 +29,7 @@ class BloggerExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
+ self.videos = self.config("videos", True)
self.blog = match.group(1) or match.group(2)
self.api = BloggerAPI(self)
@@ -41,24 +43,41 @@ class BloggerExtractor(Extractor):
del blog["selfLink"]
sub = re.compile(r"/s\d+/").sub
- findall = re.compile(
- r'src="(https?://\d+\.bp\.blogspot\.com/[^"]+)"').findall
+ findall_image = re.compile(
+ r'src="(https?://\d+\.bp\.blogspot\.com/[^"]+)').findall
+ findall_video = re.compile(
+ r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall
for post in self.posts(blog):
- images = findall(post["content"])
- if not images:
+ content = post["content"]
+
+ files = findall_image(content)
+ for idx, url in enumerate(files):
+ files[idx] = sub("/s0/", url).replace("http:", "https:", 1)
+
+ if self.videos and 'id="BLOG_video-' in content:
+ page = self.request(post["url"]).text
+ for url in findall_video(page):
+ page = self.request(url).text
+ video_config = json.loads(text.extract(
+ page, 'var VIDEO_CONFIG =', '\n')[0])
+ files.append(max(
+ video_config["streams"],
+ key=lambda x: x["format_id"],
+ )["play_url"])
+
+ if not files:
continue
post["author"] = post["author"]["displayName"]
post["replies"] = post["replies"]["totalItems"]
- post["content"] = text.remove_html(post["content"])
+ post["content"] = text.remove_html(content)
post["date"] = text.parse_datetime(post["published"])
del post["selfLink"]
del post["blog"]
yield Message.Directory, {"blog": blog, "post": post}
- for num, url in enumerate(images, 1):
- url = sub("/s0/", url).replace("http:", "https:", 1)
+ for num, url in enumerate(files, 1):
yield Message.Url, url, text.nameext_from_url(url, {
"blog": blog,
"post": post,
@@ -80,7 +99,7 @@ class BloggerPostExtractor(BloggerExtractor):
"pattern": r"https://3.bp.blogspot.com/.*/s0/Icy-Moonrise-.*.jpg",
"keyword": {
"blog": {
- "date" : "type:datetime",
+ "date" : "dt:2010-11-21 18:19:42",
"description": "",
"id" : "5623928067739466034",
"kind" : "blogger#blog",
@@ -95,7 +114,7 @@ class BloggerPostExtractor(BloggerExtractor):
"post": {
"author" : "Julian Bunker",
"content" : str,
- "date" : "type:datetime",
+ "date" : "dt:2010-12-26 01:08:00",
"etag" : str,
"id" : "6955139236418998998",
"kind" : "blogger#post",
@@ -112,6 +131,11 @@ class BloggerPostExtractor(BloggerExtractor):
("blogger:http://www.julianbunker.com/2010/12/moon-rise.html", {
"url": "9928429fb62f712eb4de80f53625eccecc614aae",
}),
+ # video (#587)
+ (("http://cfnmscenesinmovies.blogspot.com/2011/11/"
+ "cfnm-scene-jenna-fischer-in-office.html"), {
+ "pattern": r"https://.+\.googlevideo\.com/videoplayback",
+ }),
)
def __init__(self, match):
@@ -171,8 +195,8 @@ class BloggerAPI():
def _pagination(self, endpoint, params):
while True:
data = self._call(endpoint, params)
- yield from data["items"]
-
+ if "items" in data:
+ yield from data["items"]
if "nextPageToken" not in data:
return
params["pageToken"] = data["nextPageToken"]