diff options
Diffstat (limited to 'gallery_dl/extractor/blogger.py')
| -rw-r--r-- | gallery_dl/extractor/blogger.py | 48 |
1 files changed, 36 insertions, 12 deletions
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py index 31bbaf8..2657b5d 100644 --- a/gallery_dl/extractor/blogger.py +++ b/gallery_dl/extractor/blogger.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019 Mike Fährmann +# Copyright 2019-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,6 +10,7 @@ from .common import Extractor, Message from .. import text +import json import re BASE_PATTERN = ( @@ -28,6 +29,7 @@ class BloggerExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) + self.videos = self.config("videos", True) self.blog = match.group(1) or match.group(2) self.api = BloggerAPI(self) @@ -41,24 +43,41 @@ class BloggerExtractor(Extractor): del blog["selfLink"] sub = re.compile(r"/s\d+/").sub - findall = re.compile( - r'src="(https?://\d+\.bp\.blogspot\.com/[^"]+)"').findall + findall_image = re.compile( + r'src="(https?://\d+\.bp\.blogspot\.com/[^"]+)').findall + findall_video = re.compile( + r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall for post in self.posts(blog): - images = findall(post["content"]) - if not images: + content = post["content"] + + files = findall_image(content) + for idx, url in enumerate(files): + files[idx] = sub("/s0/", url).replace("http:", "https:", 1) + + if self.videos and 'id="BLOG_video-' in content: + page = self.request(post["url"]).text + for url in findall_video(page): + page = self.request(url).text + video_config = json.loads(text.extract( + page, 'var VIDEO_CONFIG =', '\n')[0]) + files.append(max( + video_config["streams"], + key=lambda x: x["format_id"], + )["play_url"]) + + if not files: continue post["author"] = post["author"]["displayName"] post["replies"] = post["replies"]["totalItems"] - post["content"] = text.remove_html(post["content"]) + post["content"] = text.remove_html(content) post["date"] = text.parse_datetime(post["published"]) del post["selfLink"] del post["blog"] yield Message.Directory, {"blog": blog, "post": post} - for num, url in enumerate(images, 1): - url = sub("/s0/", url).replace("http:", "https:", 1) + for num, url in enumerate(files, 1): yield Message.Url, url, text.nameext_from_url(url, { "blog": blog, "post": post, @@ -80,7 +99,7 @@ class BloggerPostExtractor(BloggerExtractor): "pattern": r"https://3.bp.blogspot.com/.*/s0/Icy-Moonrise-.*.jpg", "keyword": { "blog": { - "date" : "type:datetime", + "date" : "dt:2010-11-21 18:19:42", "description": "", "id" : "5623928067739466034", "kind" : "blogger#blog", @@ -95,7 +114,7 @@ class BloggerPostExtractor(BloggerExtractor): "post": { "author" : "Julian Bunker", "content" : str, - "date" : "type:datetime", + "date" : "dt:2010-12-26 01:08:00", "etag" : str, "id" : "6955139236418998998", "kind" : "blogger#post", @@ -112,6 +131,11 @@ class BloggerPostExtractor(BloggerExtractor): ("blogger:http://www.julianbunker.com/2010/12/moon-rise.html", { "url": "9928429fb62f712eb4de80f53625eccecc614aae", }), + # video (#587) + (("http://cfnmscenesinmovies.blogspot.com/2011/11/" + "cfnm-scene-jenna-fischer-in-office.html"), { + "pattern": r"https://.+\.googlevideo\.com/videoplayback", + }), ) def __init__(self, match): @@ -171,8 +195,8 @@ class BloggerAPI(): def _pagination(self, endpoint, params): while True: data = self._call(endpoint, params) - yield from data["items"] - + if "items" in data: + yield from data["items"] if "nextPageToken" not in data: return params["pageToken"] = data["nextPageToken"] |
