aboutsummaryrefslogtreecommitdiffstats
path: root/gallery_dl
diff options
context:
space:
mode:
authorLibravatarUnit 193 <unit193@unit193.net>2022-09-22 19:43:53 -0400
committerLibravatarUnit 193 <unit193@unit193.net>2022-09-22 19:43:53 -0400
commite6b82556343116256be047ab7099bedd9063f66a (patch)
tree884c0435863d130ec967163b82a2638ff1bd9505 /gallery_dl
parenta768930761f7f20587ae40a8cacca0e55c85290a (diff)
New upstream version 1.23.1.upstream/1.23.1
Diffstat (limited to 'gallery_dl')
-rw-r--r--gallery_dl/__init__.py4
-rw-r--r--gallery_dl/extractor/__init__.py1
-rw-r--r--gallery_dl/extractor/bunkr.py12
-rw-r--r--gallery_dl/extractor/common.py3
-rw-r--r--gallery_dl/extractor/exhentai.py16
-rw-r--r--gallery_dl/extractor/flickr.py32
-rw-r--r--gallery_dl/extractor/hotleak.py228
-rw-r--r--gallery_dl/extractor/instagram.py6
-rw-r--r--gallery_dl/extractor/paheal.py30
-rw-r--r--gallery_dl/extractor/poipiku.py13
-rw-r--r--gallery_dl/extractor/reddit.py4
-rw-r--r--gallery_dl/extractor/redgifs.py1
-rw-r--r--gallery_dl/extractor/smugmug.py6
-rw-r--r--gallery_dl/extractor/tumblr.py81
-rw-r--r--gallery_dl/extractor/twitter.py77
-rw-r--r--gallery_dl/extractor/zerochan.py55
-rw-r--r--gallery_dl/postprocessor/zip.py19
-rw-r--r--gallery_dl/version.py2
18 files changed, 488 insertions, 102 deletions
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py
index 329e7ab..7504fa4 100644
--- a/gallery_dl/__init__.py
+++ b/gallery_dl/__init__.py
@@ -96,9 +96,9 @@ def parse_inputfile(file, log):
else:
# url
if " #" in line:
- line = line.partition(" #")[0]
+ line = line.partition(" #")[0].rstrip()
elif "\t#" in line:
- line = line.partition("\t#")[0]
+ line = line.partition("\t#")[0].rstrip()
if gconf or lconf:
yield util.ExtendedUrl(line, gconf, lconf)
gconf = []
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 9e4507a..fed6998 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -55,6 +55,7 @@ modules = [
"hentaihere",
"hiperdex",
"hitomi",
+ "hotleak",
"idolcomplex",
"imagebam",
"imagechest",
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 3091f57..2502411 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -37,6 +37,11 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
"pattern": r"https://media-files\.bunkr\.is/_-RnHoW69L\.mp4",
"content": "80e61d1dbc5896ae7ef9a28734c747b28b320471",
}),
+ # cdn4
+ ("https://bunkr.is/a/iXTTc1o2", {
+ "pattern": r"https://(cdn|media-files)4\.bunkr\.is/",
+ "content": "da29aae371b7adc8c5ef8e6991b66b69823791e8",
+ }),
("https://bunkr.to/a/Lktg9Keq"),
)
@@ -66,9 +71,8 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
data = json.loads(text.extract(
self.request(url).text,
'id="__NEXT_DATA__" type="application/json">', '<')[0])
- props = data["props"]["pageProps"]
- album = props["album"]
- files = props["files"]
+ album = data["props"]["pageProps"]["album"]
+ files = album["files"]
except Exception as exc:
self.log.debug(exc.__class__.__name__, exc)
self.root = self.root.replace("bunkr", "app.bunkr", 1)
@@ -77,7 +81,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
for file in files:
name = file["name"]
cdn = file["cdn"]
- if name.endswith(".mp4"):
+ if name.endswith((".mp4", ".m4v", ".mov")):
cdn = cdn.replace("//cdn", "//media-files")
file["file"] = cdn + "/" + name
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 1b41101..f7ee51f 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -152,7 +152,8 @@ class Extractor():
server = response.headers.get("Server")
if server and server.startswith("cloudflare"):
if code == 503 and \
- b"jschl-answer" in response.content:
+ (b"_cf_chl_opt" in response.content or
+ b"jschl-answer" in response.content):
self.log.warning("Cloudflare IUAM challenge")
break
if code == 403 and \
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 2720691..01ba03a 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -219,7 +219,6 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
if self.limits:
self._check_limits(data)
if "/fullimg.php" in url:
- data["extension"] = ""
data["_http_validate"] = _validate_response
else:
data["_http_validate"] = None
@@ -328,8 +327,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
data["image_token"] = self.key["start"] = extr('var startkey="', '";')
self.key["show"] = extr('var showkey="', '";')
- if iurl.endswith("g/509.gif"):
- self._report_limits(data)
+ self._check_509(iurl, data)
return url, text.nameext_from_url(iurl, data)
def images_from_api(self):
@@ -365,8 +363,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
data["num"] = request["page"]
data["image_token"] = imgkey
- if imgurl.endswith("g/509.gif"):
- self._report_limits(data)
+ self._check_509(imgurl, data)
yield url, text.nameext_from_url(imgurl, data)
request["imgkey"] = nextkey
@@ -385,6 +382,15 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
if self._remaining <= 0:
self._report_limits(data)
+    def _check_509(self, url, data):
+        """Call _report_limits() when 'url' ends in a known 509.gif path"""
+        # known 509.gif locations:
+        #   https://exhentai.org/img/509.gif
+        #   https://ehgt.org/g/509.gif
+        suffixes = ("hentai.org/img/509.gif", "ehgt.org/g/509.gif")
+        if not url.endswith(suffixes):
+            return
+        # log the offending URL before handing over to the limit report
+        self.log.debug(url)
+        self._report_limits(data)
+
def _update_limits(self):
url = "https://e-hentai.org/home.php"
cookies = {
diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py
index 2bd8c6b..e85d68a 100644
--- a/gallery_dl/extractor/flickr.py
+++ b/gallery_dl/extractor/flickr.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2020 Mike Fährmann
+# Copyright 2017-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -11,6 +11,8 @@
from .common import Extractor, Message
from .. import text, oauth, util, exception
+BASE_PATTERN = r"(?:https?://)?(?:www\.|secure\.|m\.)?flickr\.com"
+
class FlickrExtractor(Extractor):
"""Base class for flickr extractors"""
@@ -55,7 +57,7 @@ class FlickrImageExtractor(FlickrExtractor):
"""Extractor for individual images from flickr.com"""
subcategory = "image"
pattern = (r"(?:https?://)?(?:"
- r"(?:(?:www\.|m\.)?flickr\.com/photos/[^/]+/"
+ r"(?:(?:www\.|secure\.|m\.)?flickr\.com/photos/[^/?#]+/"
r"|[\w-]+\.static\.?flickr\.com/(?:\d+/)+)(\d+)"
r"|flic\.kr/p/([A-Za-z1-9]+))")
test = (
@@ -77,6 +79,10 @@ class FlickrImageExtractor(FlickrExtractor):
"width": 1024,
},
}),
+ ("https://secure.flickr.com/photos/departingyyz/16089302239"),
+ ("https://m.flickr.com/photos/departingyyz/16089302239"),
+ ("https://flickr.com/photos/departingyyz/16089302239"),
+
("https://www.flickr.com/photos/145617051@N08/46733161535", {
"count": 1,
"keyword": {"media": "video"},
@@ -132,8 +138,7 @@ class FlickrAlbumExtractor(FlickrExtractor):
directory_fmt = ("{category}", "{user[username]}",
"Albums", "{album[id]} {album[title]}")
archive_fmt = "a_{album[id]}_{id}"
- pattern = (r"(?:https?://)?(?:www\.)?flickr\.com/"
- r"photos/([^/]+)/(?:album|set)s(?:/(\d+))?")
+ pattern = BASE_PATTERN + r"/photos/([^/?#]+)/(?:album|set)s(?:/(\d+))?"
test = (
(("https://www.flickr.com/photos/shona_s/albums/72157633471741607"), {
"pattern": FlickrImageExtractor.pattern,
@@ -143,6 +148,8 @@ class FlickrAlbumExtractor(FlickrExtractor):
"pattern": pattern,
"count": 2,
}),
+ ("https://secure.flickr.com/photos/shona_s/albums"),
+ ("https://m.flickr.com/photos/shona_s/albums"),
)
def __init__(self, match):
@@ -180,8 +187,7 @@ class FlickrGalleryExtractor(FlickrExtractor):
directory_fmt = ("{category}", "{user[username]}",
"Galleries", "{gallery[gallery_id]} {gallery[title]}")
archive_fmt = "g_{gallery[id]}_{id}"
- pattern = (r"(?:https?://)?(?:www\.)?flickr\.com/"
- r"photos/([^/]+)/galleries/(\d+)")
+ pattern = BASE_PATTERN + r"/photos/([^/?#]+)/galleries/(\d+)"
test = (("https://www.flickr.com/photos/flickr/"
"galleries/72157681572514792/"), {
"pattern": FlickrImageExtractor.pattern,
@@ -206,7 +212,7 @@ class FlickrGroupExtractor(FlickrExtractor):
subcategory = "group"
directory_fmt = ("{category}", "Groups", "{group[groupname]}")
archive_fmt = "G_{group[nsid]}_{id}"
- pattern = r"(?:https?://)?(?:www\.)?flickr\.com/groups/([^/]+)"
+ pattern = BASE_PATTERN + r"/groups/([^/?#]+)"
test = ("https://www.flickr.com/groups/bird_headshots/", {
"pattern": FlickrImageExtractor.pattern,
"count": "> 150",
@@ -224,7 +230,7 @@ class FlickrUserExtractor(FlickrExtractor):
"""Extractor for the photostream of a flickr user"""
subcategory = "user"
archive_fmt = "u_{user[nsid]}_{id}"
- pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/?$"
+ pattern = BASE_PATTERN + r"/photos/([^/?#]+)/?$"
test = ("https://www.flickr.com/photos/shona_s/", {
"pattern": FlickrImageExtractor.pattern,
"count": 28,
@@ -239,7 +245,7 @@ class FlickrFavoriteExtractor(FlickrExtractor):
subcategory = "favorite"
directory_fmt = ("{category}", "{user[username]}", "Favorites")
archive_fmt = "f_{user[nsid]}_{id}"
- pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/favorites"
+ pattern = BASE_PATTERN + r"/photos/([^/?#]+)/favorites"
test = ("https://www.flickr.com/photos/shona_s/favorites", {
"pattern": FlickrImageExtractor.pattern,
"count": 4,
@@ -254,7 +260,7 @@ class FlickrSearchExtractor(FlickrExtractor):
subcategory = "search"
directory_fmt = ("{category}", "Search", "{search[text]}")
archive_fmt = "s_{search}_{id}"
- pattern = r"(?:https?://)?(?:www\.)?flickr\.com/search/?\?([^#]+)"
+ pattern = BASE_PATTERN + r"/search/?\?([^#]+)"
test = (
("https://flickr.com/search/?text=mountain"),
("https://flickr.com/search/?text=tree%20cloud%20house"
@@ -275,7 +281,11 @@ class FlickrSearchExtractor(FlickrExtractor):
class FlickrAPI(oauth.OAuth1API):
- """Minimal interface for the flickr API"""
+ """Minimal interface for the flickr API
+
+ https://www.flickr.com/services/api/
+ """
+
API_URL = "https://api.flickr.com/services/rest/"
API_KEY = "ac4fd7aa98585b9eee1ba761c209de68"
API_SECRET = "3adb0f568dc68393"
diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py
new file mode 100644
index 0000000..d6575cf
--- /dev/null
+++ b/gallery_dl/extractor/hotleak.py
@@ -0,0 +1,228 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hotleak.vip/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?hotleak\.vip"
+
+
+class HotleakExtractor(Extractor):
+    """Common functionality shared by all hotleak extractors"""
+    category = "hotleak"
+    root = "https://hotleak.vip"
+    directory_fmt = ("{category}", "{creator}")
+    filename_fmt = "{creator}_{id}.{extension}"
+    archive_fmt = "{type}_{creator}_{id}"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        # every request of this session carries the site root as Referer
+        self.session.headers["Referer"] = self.root
+
+    def items(self):
+        """Yield a Directory and a Url message for each post from posts()"""
+        for entry in self.posts():
+            yield Message.Directory, entry
+            yield Message.Url, entry["url"], entry
+
+    def posts(self):
+        """Return an iterable of post dicts (empty unless overridden)"""
+        return ()
+
+    def _pagination(self, url, params):
+        """Yield the first link of each 'movie-item' article on every page
+
+        'params' is a query string; its 'page' value (default 1) selects
+        the first page to request. Iteration ends with the first page
+        that contains no '</article>'.
+        """
+        query = text.parse_query(params)
+        query["page"] = text.parse_int(query.get("page"), 1)
+
+        while True:
+            html = self.request(url, params=query).text
+            if "</article>" not in html:
+                break
+
+            articles = text.extract_iter(
+                html, '<article class="movie-item', '</article>')
+            for article in articles:
+                yield text.extract(article, '<a href="', '"')[0]
+
+            query["page"] += 1
+
+
+class HotleakPostExtractor(HotleakExtractor):
+    """Extractor for individual posts on hotleak"""
+    subcategory = "post"
+    # The lookahead only rejects the reserved path segments themselves
+    # ("hot", "creators", "videos", "photos"); creator names that merely
+    # start with one of these words (e.g. "hotwife") are still accepted.
+    pattern = (BASE_PATTERN +
+               r"/(?!(?:hot|creators|videos|photos)(?:$|[/?#]))"
+               r"([^/]+)/(photo|video)/(\d+)")
+    test = (
+        ("https://hotleak.vip/kaiyakawaii/photo/1617145", {
+            "pattern": r"https://hotleak\.vip/storage/images/3625"
+                       r"/1617145/fefdd5988dfcf6b98cc9e11616018868\.jpg",
+            "keyword": {
+                "id": 1617145,
+                "creator": "kaiyakawaii",
+                "type": "photo",
+                "filename": "fefdd5988dfcf6b98cc9e11616018868",
+                "extension": "jpg",
+            },
+        }),
+        ("https://hotleak.vip/lilmochidoll/video/1625538", {
+            "pattern": r"ytdl:https://cdn8-leak\.camhdxx\.com"
+                       r"/1661/1625538/index\.m3u8",
+            "keyword": {
+                "id": 1625538,
+                "creator": "lilmochidoll",
+                "type": "video",
+                "filename": "index",
+                "extension": "mp4",
+            },
+        }),
+    )
+
+    def __init__(self, match):
+        HotleakExtractor.__init__(self, match)
+        # groups: creator name, "photo" or "video", numeric post ID
+        self.creator, self.type, self.id = match.groups()
+
+    def posts(self):
+        """Return a 1-tuple with the metadata dict of the requested post"""
+        url = "{}/{}/{}/{}".format(
+            self.root, self.creator, self.type, self.id)
+        page = self.request(url).text
+        # restrict all further lookups to the post's own media container
+        page = text.extract(
+            page, '<div class="movie-image thumb">', '</article>')[0]
+        data = {
+            "id"     : text.parse_int(self.id),
+            "creator": self.creator,
+            "type"   : self.type,
+        }
+
+        if self.type == "photo":
+            data["url"] = text.extract(page, 'data-src="', '"')[0]
+            text.nameext_from_url(data["url"], data)
+
+        elif self.type == "video":
+            # the stream URL sits in an HTML-escaped '"src":"..."' value;
+            # the "ytdl:" prefix is added to the resulting download URL
+            data["url"] = "ytdl:" + text.extract(
+                text.unescape(page), '"src":"', '"')[0]
+            text.nameext_from_url(data["url"], data)
+            # override the extension derived from the stream URL
+            data["extension"] = "mp4"
+
+        return (data,)
+
+
+class HotleakCreatorExtractor(HotleakExtractor):
+    """Extractor for all posts from a hotleak creator"""
+    subcategory = "creator"
+    # The lookahead only rejects the reserved path segments themselves
+    # ("hot", "creators", "videos", "photos"); creator names that merely
+    # start with one of these words (e.g. "hotwife") are still accepted.
+    pattern = (BASE_PATTERN +
+               r"/(?!(?:hot|creators|videos|photos)(?:$|[/?#]))"
+               r"([^/?#]+)/?$")
+    test = (
+        ("https://hotleak.vip/kaiyakawaii", {
+            "range": "1-200",
+            "count": 200,
+        }),
+        ("https://hotleak.vip/stellaviolet", {
+            "count": "> 600"
+        }),
+        ("https://hotleak.vip/doesnotexist", {
+            "exception": exception.NotFoundError,
+        }),
+    )
+
+    def __init__(self, match):
+        HotleakExtractor.__init__(self, match)
+        self.creator = match.group(1)
+
+    def posts(self):
+        """Return a generator over all posts of the creator"""
+        url = "{}/{}".format(self.root, self.creator)
+        return self._pagination(url)
+
+    def _pagination(self, url):
+        """Yield one metadata dict per post from the paginated JSON listing
+
+        Pages are requested with an 'X-Requested-With: XMLHttpRequest'
+        header, starting at page 1, until a page returns no posts.
+        """
+        headers = {"X-Requested-With": "XMLHttpRequest"}
+        params = {"page": 1}
+
+        while True:
+            try:
+                response = self.request(
+                    url, headers=headers, params=params, notfound="creator")
+            except exception.HttpError as exc:
+                if exc.response.status_code == 429:
+                    # rate limited: wait until the advertised reset time
+                    # and retry the same page
+                    self.wait(
+                        until=exc.response.headers.get("X-RateLimit-Reset"))
+                    continue
+                # any other HTTP error must not fall through to the code
+                # below, where 'response' would be unbound or stale
+                raise
+
+            posts = response.json()
+            if not posts:
+                return
+
+            for post in posts:
+                # build a fresh dict per post so that consumers never see
+                # values left over from a previously yielded post
+                data = {
+                    "creator": self.creator,
+                    "id"     : text.parse_int(post["id"]),
+                }
+
+                if post["type"] == 0:
+                    data["type"] = "photo"
+                    data["url"] = self.root + "/storage/" + post["image"]
+                    text.nameext_from_url(data["url"], data)
+
+                elif post["type"] == 1:
+                    data["type"] = "video"
+                    data["url"] = "ytdl:" + post["stream_url_play"]
+                    text.nameext_from_url(data["url"], data)
+                    data["extension"] = "mp4"
+
+                else:
+                    # no known way to build a URL for this post type
+                    self.log.debug("Skipping post %s (unknown type %r)",
+                                   data["id"], post["type"])
+                    continue
+
+                yield data
+            params["page"] += 1
+
+
+class HotleakCategoryExtractor(HotleakExtractor):
+    """Extractor for hotleak categories"""
+    subcategory = "category"
+    # The lookahead after the category name requires the path segment to
+    # end there, so "/hotwife" is not mistaken for the "hot" category.
+    pattern = (BASE_PATTERN +
+               r"/(hot|creators|videos|photos)(?=$|[/?#])"
+               r"(?:/?\?([^#]+))?")
+    test = (
+        ("https://hotleak.vip/photos", {
+            "pattern": HotleakPostExtractor.pattern,
+            "range": "1-50",
+            "count": 50,
+        }),
+        ("https://hotleak.vip/videos"),
+        ("https://hotleak.vip/creators", {
+            "pattern": HotleakCreatorExtractor.pattern,
+            "range": "1-50",
+            "count": 50,
+        }),
+        ("https://hotleak.vip/hot"),
+    )
+
+    def __init__(self, match):
+        HotleakExtractor.__init__(self, match)
+        # groups: category name and the optional query string (or None)
+        self._category, self.params = match.groups()
+
+    def items(self):
+        """Yield a Queue message for every link of the category listing
+
+        Links from "hot" and "creators" are delegated to
+        HotleakCreatorExtractor, links from "videos" and "photos" to
+        HotleakPostExtractor.
+        """
+        url = "{}/{}".format(self.root, self._category)
+
+        if self._category in ("hot", "creators"):
+            data = {"_extractor": HotleakCreatorExtractor}
+        elif self._category in ("videos", "photos"):
+            data = {"_extractor": HotleakPostExtractor}
+
+        for item in self._pagination(url, self.params):
+            yield Message.Queue, item, data
+
+
+class HotleakSearchExtractor(HotleakExtractor):
+    """Extractor for the results of a hotleak search query"""
+    subcategory = "search"
+    pattern = BASE_PATTERN + r"/search(?:/?\?([^#]+))"
+    test = (
+        ("https://hotleak.vip/search?search=gallery-dl", {
+            "count": 0,
+        }),
+        ("https://hotleak.vip/search?search=hannah", {
+            "count": "> 30",
+        }),
+    )
+
+    def __init__(self, match):
+        HotleakExtractor.__init__(self, match)
+        # query string of the search URL
+        self.params = match.group(1)
+
+    def items(self):
+        """Queue every result link for HotleakCreatorExtractor"""
+        search_url = self.root + "/search"
+        queue_data = {"_extractor": HotleakCreatorExtractor}
+        for link in self._pagination(search_url, self.params):
+            yield Message.Queue, link, queue_data
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index d56af8b..8c98d2e 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -264,6 +264,12 @@ class InstagramExtractor(Extractor):
"post_id": reel_id,
"post_shortcode": shortcode_from_id(reel_id),
}
+
+ if "title" in post:
+ data["highlight_title"] = post["title"]
+ if "created_at" in post:
+ data["date"] = text.parse_timestamp(post.get("created_at"))
+
else:
data = {
"post_id" : post["pk"],
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index 0a6a6d3..56e3b39 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -49,7 +49,8 @@ class PahealExtractor(Extractor):
"id" : post_id,
"tags" : extr(": ", "<"),
"md5" : extr("/_thumbs/", "/"),
- "file_url": extr("id='main_image' src='", "'"),
+ "file_url": (extr("id='main_image' src='", "'") or
+ extr("<source src='", "'")),
"uploader": text.unquote(extr(
"class='username' href='/user/", "'")),
"date" : text.parse_datetime(
@@ -59,8 +60,10 @@ class PahealExtractor(Extractor):
}
dimensions, size, ext = extr("Info</th><td>", ">").split(" // ")
- post["width"], _, post["height"] = dimensions.partition("x")
+ post["width"], _, height = dimensions.partition("x")
post["size"] = text.parse_bytes(size[:-1])
+ post["height"], _, duration = height.partition(", ")
+ post["duration"] = text.parse_float(duration[:-1])
return post
@@ -111,10 +114,12 @@ class PahealTagExtractor(PahealExtractor):
tags, data, date = data.split("\n")
dimensions, size, ext = data.split(" // ")
width, _, height = dimensions.partition("x")
+ height, _, duration = height.partition(", ")
return {
"id": pid, "md5": md5, "file_url": url,
"width": width, "height": height,
+ "duration": text.parse_float(duration[:-1]),
"tags": text.unescape(tags),
"size": text.parse_bytes(size[:-1]),
"date": text.parse_datetime(date, "%B %d, %Y; %H:%M"),
@@ -163,6 +168,27 @@ class PahealPostExtractor(PahealExtractor):
"width": 1200,
},
}),
+ # video
+ ("https://rule34.paheal.net/post/view/3864982", {
+ "pattern": r"https://[\w]+\.paheal\.net/_images/7629fc0ff77e32637d"
+ r"de5bf4f992b2cb/3864982%20-%20Metal_Gear%20Metal_Gear_"
+ r"Solid_V%20Quiet%20Vg_erotica%20animated%20webm\.webm",
+ "keyword": {
+ "date": "dt:2020-09-06 01:59:03",
+ "duration": 30.0,
+ "extension": "webm",
+ "height": 2500,
+ "id": 3864982,
+ "md5": "7629fc0ff77e32637dde5bf4f992b2cb",
+ "size": 18454938,
+ "source": "https://twitter.com/VG_Worklog"
+ "/status/1302407696294055936",
+ "tags": "Metal_Gear Metal_Gear_Solid_V Quiet "
+ "Vg_erotica animated webm",
+ "uploader": "justausername",
+ "width": 1768,
+ },
+ }),
)
def __init__(self, match):
diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py
index 8203885..4283081 100644
--- a/gallery_dl/extractor/poipiku.py
+++ b/gallery_dl/extractor/poipiku.py
@@ -42,6 +42,7 @@ class PoipikuExtractor(Extractor):
'<h2 class="UserInfoUserName">', '</').rpartition(">")[2]),
"description": text.unescape(extr(
'class="IllustItemDesc" >', '<')),
+ "_http_headers": {"Referer": post_url},
}
yield Message.Directory, post
@@ -54,7 +55,8 @@ class PoipikuExtractor(Extractor):
elif thumb.startswith(("//img.poipiku.com/img/", "/img/")):
continue
post["num"] += 1
- url = text.ensure_http_scheme(thumb[:-8])
+ url = text.ensure_http_scheme(thumb[:-8]).replace(
+ "//img.", "//img-org.", 1)
yield Message.Url, url, text.nameext_from_url(url, post)
if not extr('> show all', '<'):
@@ -80,7 +82,8 @@ class PoipikuExtractor(Extractor):
for thumb in text.extract_iter(
page, 'class="IllustItemThumbImg" src="', '"'):
post["num"] += 1
- url = text.ensure_http_scheme(thumb[:-8])
+ url = text.ensure_http_scheme(thumb[:-8]).replace(
+ "//img.", "//img-org.", 1)
yield Message.Url, url, text.nameext_from_url(url, post)
@@ -91,7 +94,7 @@ class PoipikuUserExtractor(PoipikuExtractor):
r"(\d+)/?(?:$|[?&#])")
test = (
("https://poipiku.com/25049/", {
- "pattern": r"https://img\.poipiku\.com/user_img\d+/000025049"
+ "pattern": r"https://img-org\.poipiku\.com/user_img\d+/000025049"
r"/\d+_\w+\.(jpe?g|png)$",
"range": "1-10",
"count": 10,
@@ -131,7 +134,7 @@ class PoipikuPostExtractor(PoipikuExtractor):
pattern = BASE_PATTERN + r"/(\d+)/(\d+)"
test = (
("https://poipiku.com/25049/5864576.html", {
- "pattern": r"https://img\.poipiku\.com/user_img\d+/000025049"
+ "pattern": r"https://img-org\.poipiku\.com/user_img\d+/000025049"
r"/005864576_EWN1Y65gQ\.png$",
"keyword": {
"count": "1",
@@ -146,7 +149,7 @@ class PoipikuPostExtractor(PoipikuExtractor):
},
}),
("https://poipiku.com/2166245/6411749.html", {
- "pattern": r"https://img\.poipiku\.com/user_img\d+/002166245"
+ "pattern": r"https://img-org\.poipiku\.com/user_img\d+/002166245"
r"/006411749_\w+\.jpeg$",
"count": 4,
"keyword": {
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index d35e24e..954a84f 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -46,10 +46,10 @@ class RedditExtractor(Extractor):
submission["created_utc"])
yield Message.Directory, submission
visited.add(submission["id"])
- url = submission["url"]
submission["num"] = 0
- if url.startswith("https://i.redd.it/"):
+ url = submission["url"]
+ if url and url.startswith("https://i.redd.it/"):
text.nameext_from_url(url, submission)
yield Message.Url, url, submission
diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py
index 2c3ed44..3a4fb0e 100644
--- a/gallery_dl/extractor/redgifs.py
+++ b/gallery_dl/extractor/redgifs.py
@@ -53,6 +53,7 @@ class RedgifsExtractor(Extractor):
for fmt in self.formats:
url = urls.get(fmt)
if url:
+ url = url.replace("//thumbs2.", "//thumbs3.", 1)
text.nameext_from_url(url, gif)
yield url
diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py
index 4010da3..2264fe4 100644
--- a/gallery_dl/extractor/smugmug.py
+++ b/gallery_dl/extractor/smugmug.py
@@ -209,9 +209,9 @@ class SmugmugPathExtractor(SmugmugExtractor):
class SmugmugAPI(oauth.OAuth1API):
"""Minimal interface for the smugmug API v2"""
API_DOMAIN = "api.smugmug.com"
- API_KEY = "DFqxg4jf7GrtsQ5PnbNB8899zKfnDrdK"
- API_SECRET = ("fknV35p9r9BwZC4XbTzvCXpcSJRdD83S"
- "9nMFQm25ndGBzNPnwRDbRnnVBvqt4xTq")
+ API_KEY = "RCVHDGjcbc4Fhzq4qzqLdZmvwmwB6LM2"
+ API_SECRET = ("jGrdndvJqhTx8XSNs7TFTSSthhZHq92d"
+ "dMpbpDpkDVNM7TDgnvLFMtfB5Mg5kH73")
HEADERS = {"Accept": "application/json"}
def album(self, album_id, expands=None):
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index b694fa0..6f53881 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -14,25 +14,6 @@ from datetime import datetime, timedelta
import re
-def _original_inline_image(url):
- return re.sub(
- (r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
- r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)"),
- r"https://\1_1280.\2", url
- )
-
-
-def _original_video(url):
- return re.sub(
- (r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
- r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)"),
- r"https://\1.\2", url
- )
-
-
-POST_TYPES = frozenset((
- "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
-
BASE_PATTERN = (
r"(?:tumblr:(?:https?://)?([^/]+)|"
r"(?:https?://)?"
@@ -40,6 +21,9 @@ BASE_PATTERN = (
r"([\w-]+\.tumblr\.com)))"
)
+POST_TYPES = frozenset((
+ "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
+
class TumblrExtractor(Extractor):
"""Base class for tumblr extractors"""
@@ -79,6 +63,18 @@ class TumblrExtractor(Extractor):
def items(self):
blog = None
+ # pre-compile regular expressions
+ self._sub_video = re.compile(
+ r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
+ r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)").sub
+ if self.inline:
+ self._sub_image = re.compile(
+ r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
+ r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)").sub
+ self._subn_orig_image = re.compile(r"/s\d+x\d+/").subn
+ _findall_image = re.compile('<img src="([^"]+)"').findall
+ _findall_video = re.compile('<source src="([^"]+)"').findall
+
for post in self.posts():
if self.date_min > post["timestamp"]:
return
@@ -120,7 +116,7 @@ class TumblrExtractor(Extractor):
if self.original and "/s2048x3072/" in photo["url"] and (
photo["width"] == 2048 or photo["height"] == 3072):
- photo["url"] = self._original_image(photo["url"])
+ photo["url"] = self._original_photo(photo["url"])
del photo["original_size"]
del photo["alt_sizes"]
@@ -134,17 +130,18 @@ class TumblrExtractor(Extractor):
url = post.get("video_url") # type "video"
if url:
- posts.append(self._prepare(_original_video(url), post.copy()))
+ posts.append(self._prepare(
+ self._original_video(url), post.copy()))
if self.inline and "reblog" in post: # inline media
# only "chat" posts are missing a "reblog" key in their
# API response, but they can't contain images/videos anyway
body = post["reblog"]["comment"] + post["reblog"]["tree_html"]
- for url in re.findall('<img src="([^"]+)"', body):
- url = _original_inline_image(url)
+ for url in _findall_image(body):
+ url = self._original_inline_image(url)
posts.append(self._prepare_image(url, post.copy()))
- for url in re.findall('<source src="([^"]+)"', body):
- url = _original_video(url)
+ for url in _findall_video(body):
+ url = self._original_video(url)
posts.append(self._prepare(url, post.copy()))
if self.external: # external links
@@ -220,8 +217,21 @@ class TumblrExtractor(Extractor):
def _skip_reblog_same_blog(self, post):
return self.blog != post.get("reblogged_root_uuid")
- def _original_image(self, url):
- url = url.replace("/s2048x3072/", "/s99999x99999/", 1)
+    def _original_photo(self, url):
+        """Swap '/s2048x3072/' for '/s99999x99999/' and refresh the URL"""
+        resized = url.replace("/s2048x3072/", "/s99999x99999/", 1)
+        return self._update_image_token(resized)
+
+    def _original_inline_image(self, url):
+        """Return the URL to use for an inline image
+
+        With 'original' enabled, the first '/s<W>x<H>/' path segment is
+        replaced by '/s99999x99999/' and the result is passed through
+        _update_image_token(). URLs without such a segment, or any URL
+        when 'original' is disabled, are rewritten to their '_1280'
+        variant instead.
+        """
+        if self.original:
+            replaced, hits = self._subn_orig_image("/s99999x99999/", url, 1)
+            if hits:
+                return self._update_image_token(replaced)
+        return self._sub_image(r"https://\1_1280.\2", url)
+
+    def _original_video(self, url):
+        """Strip the '_<digits>' suffix from a tumblr video file URL"""
+        original_url = self._sub_video(r"https://\1.\2", url)
+        return original_url
+
+ def _update_image_token(self, url):
headers = {"Accept": "text/html,*/*;q=0.8"}
response = self.request(url, headers=headers)
return text.extract(response.text, '" src="', '"')[0]
@@ -305,6 +315,14 @@ class TumblrPostExtractor(TumblrExtractor):
("https://mikf123.tumblr.com/post/181022380064/chat-post", {
"count": 0,
}),
+ ("https://kichatundk.tumblr.com/post/654953419288821760", {
+ "count": 2, # high-quality images (#1846)
+ "content": "d6fcc7b6f750d835d55c7f31fa3b63be26c9f89b",
+ }),
+ ("https://hameru-is-cool.tumblr.com/post/639261855227002880", {
+ "count": 2, # high-quality images (#1344)
+ "content": "6bc19a42787e46e1bba2ef4aeef5ca28fcd3cd34",
+ }),
("https://mikf123.tumblr.com/image/689860196535762944", {
"pattern": r"^https://\d+\.media\.tumblr\.com"
r"/134791621559a79793563b636b5fe2c6"
@@ -446,10 +464,8 @@ class TumblrAPI(oauth.OAuth1API):
# daily rate limit
if response.headers.get("x-ratelimit-perday-remaining") == "0":
+ self.log.info("Daily API rate limit exceeded")
reset = response.headers.get("x-ratelimit-perday-reset")
- t = (datetime.now() + timedelta(seconds=float(reset))).time()
-
- self.log.error("Daily API rate limit exceeded")
api_key = self.api_key or self.session.auth.consumer_key
if api_key == self.API_KEY:
@@ -459,6 +475,11 @@ class TumblrAPI(oauth.OAuth1API):
"ter/docs/configuration.rst#extractortumblra"
"pi-key--api-secret")
+ if self.extractor.config("ratelimit") == "wait":
+ self.extractor.wait(seconds=reset)
+ return self._call(blog, endpoint, params)
+
+ t = (datetime.now() + timedelta(seconds=float(reset))).time()
raise exception.StopExtraction(
"Aborting - Rate limit will reset at %s",
"{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second))
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 0df4ea2..ba0597e 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -41,6 +41,7 @@ class TwitterExtractor(Extractor):
self.quoted = self.config("quoted", False)
self.videos = self.config("videos", True)
self.cards = self.config("cards", False)
+ self.cards_blacklist = self.config("cards-blacklist")
self._user = self._user_obj = None
self._user_cache = {}
self._init_sizes()
@@ -154,8 +155,11 @@ class TwitterExtractor(Extractor):
})
elif "media_url_https" in media:
url = media["media_url_https"]
- base, _, fmt = url.rpartition(".")
- base += "?format=" + fmt + "&name="
+ if url[-4] == ".":
+ base, _, fmt = url.rpartition(".")
+ base += "?format=" + fmt + "&name="
+ else:
+ base = url.rpartition("=")[0] + "="
files.append(text.nameext_from_url(url, {
"url" : base + self._size_image,
"width" : width,
@@ -174,15 +178,23 @@ class TwitterExtractor(Extractor):
card = tweet["card"]
if "legacy" in card:
card = card["legacy"]
- name = card["name"]
+
+ name = card["name"].rpartition(":")[2]
+ bvals = card["binding_values"]
+ if isinstance(bvals, list):
+ bvals = {bval["key"]: bval["value"]
+ for bval in card["binding_values"]}
+
+ cbl = self.cards_blacklist
+ if cbl:
+ if name in cbl:
+ return
+ if "vanity_url" in bvals:
+ domain = bvals["vanity_url"]["string_value"]
+ if domain in cbl or name + ":" + domain in cbl:
+ return
if name in ("summary", "summary_large_image"):
- bvals = card["binding_values"]
- if isinstance(bvals, list):
- bvals = {
- bval["key"]: bval["value"]
- for bval in card["binding_values"]
- }
for prefix in ("photo_image_full_size_",
"summary_photo_image_",
"thumbnail_image_"):
@@ -199,19 +211,9 @@ class TwitterExtractor(Extractor):
files.append(value)
return
elif name == "unified_card":
- bvals = card["binding_values"]
- if isinstance(bvals, list):
- for bval in card["binding_values"]:
- if bval["key"] == "unified_card":
- bval = bval["value"]["string_value"]
- break
- else:
- bval = bvals["unified_card"]["string_value"]
- data = json.loads(bval)
- if data.get("type") == "image_carousel_website":
- self._extract_media(
- tweet, data["media_entities"].values(), files)
- return
+ data = json.loads(bvals["unified_card"]["string_value"])
+ self._extract_media(tweet, data["media_entities"].values(), files)
+ return
if self.cards == "ytdl":
tweet_id = tweet.get("rest_id") or tweet["id_str"]
@@ -735,16 +737,33 @@ class TwitterTweetExtractor(TwitterExtractor):
"options": (("cards", True),),
"pattern": r"https://pbs.twimg.com/card_img/\d+/",
}),
- # unified_card with image_carousel_website
+ # unified_card image_website (#2875)
+ ("https://twitter.com/i/web/status/1561674543323910144", {
+ "options": (("cards", True),),
+ "pattern": r"https://pbs\.twimg\.com/media/F.+=jpg",
+ }),
+ # unified_card image_carousel_website
("https://twitter.com/doax_vv_staff/status/1479438945662685184", {
"options": (("cards", True),),
"pattern": r"https://pbs\.twimg\.com/media/F.+=png",
"count": 6,
}),
+ # unified_card video_website (#2875)
+ ("https://twitter.com/bang_dream_1242/status/1561548715348746241", {
+ "options": (("cards", True),),
+ "pattern": r"https://video\.twimg\.com/amplify_video"
+ r"/1560607284333449216/vid/720x720/\w+\.mp4",
+ }),
# unified_card without type
("https://twitter.com/i/web/status/1466183847628865544", {
"count": 0,
}),
+ # 'cards-blacklist' option
+ ("https://twitter.com/i/web/status/1571141912295243776", {
+ "options": (("cards", "ytdl"),
+ ("cards-blacklist", ("twitch.tv",))),
+ "count": 0,
+ }),
# original retweets (#1026)
("https://twitter.com/jessica_3978/status/1296304589591810048", {
"options": (("retweets", "original"),),
@@ -776,12 +795,20 @@ class TwitterTweetExtractor(TwitterExtractor):
# age-restricted (#2354)
("https://twitter.com/mightbecursed/status/1492954264909479936", {
"options": (("syndication", True),),
+ "keywords": {"date": "dt:2022-02-13 20:10:09"},
"count": 1,
}),
# media alt texts / descriptions (#2617)
("https://twitter.com/my0nruri/status/1528379296041299968", {
"keyword": {"description": "oc"}
}),
+ # '?format=...&name=...'-style URLs
+ ("https://twitter.com/poco_dandy/status/1150646424461176832", {
+ "options": (("cards", True),),
+ "pattern": r"https://pbs.twimg.com/card_img/157\d+/\w+"
+ r"\?format=(jpg|png)&name=orig$",
+ "range": "1-2",
+ }),
)
def __init__(self, match):
@@ -1442,6 +1469,10 @@ class TwitterAPI():
else:
retweet_id = None
+ tweet["created_at"] = text.parse_datetime(
+ tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ").strftime(
+ "%a %b %d %H:%M:%S +0000 %Y")
+
if "video" in tweet:
video = tweet["video"]
video["variants"] = (max(
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index 2b5acd8..72cf438 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -11,6 +11,8 @@
from .booru import BooruExtractor
from ..cache import cache
from .. import text, exception
+from xml.etree import ElementTree
+
BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net"
@@ -54,7 +56,7 @@ class ZerochanExtractor(BooruExtractor):
return response.cookies
- def _parse_entry_page(self, entry_id):
+ def _parse_entry_html(self, entry_id):
url = "{}/{}".format(self.root, entry_id)
extr = text.extract_from(self.request(url).text)
@@ -66,10 +68,26 @@ class ZerochanExtractor(BooruExtractor):
'"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y"),
"width" : extr('"width": "', ' '),
"height": extr('"height": "', ' '),
- "size" : extr('"contentSize": "', 'B'),
+ "size" : text.parse_bytes(extr('"contentSize": "', 'B')),
"path" : text.split_html(extr(
'class="breadcrumbs', '</p>'))[3::2],
- "tags" : extr('alt="Tags: ', '"').split(", ")
+ "tags" : extr('alt="Tags: Anime, ', '"').split(", ")
+ }
+
+ def _parse_entry_xml(self, entry_id):
+ url = "{}/{}?xml".format(self.root, entry_id)
+ item = ElementTree.fromstring(self.request(url).text)[0][-1]
+ # content = item[4].attrib
+
+ return {
+ # "id" : entry_id,
+ # "file_url": content["url"],
+ # "width" : content["width"],
+ # "height": content["height"],
+ # "size" : content["filesize"],
+ "name" : item[2].text,
+ "tags" : item[5].text.lstrip().split(", "),
+ "md5" : item[6].text,
}
@@ -105,6 +123,7 @@ class ZerochanTagExtractor(ZerochanExtractor):
url = self.root + "/" + self.search_tag
params = text.parse_query(self.query)
params["p"] = text.parse_int(params.get("p"), 1)
+ metadata = self.config("metadata")
while True:
page = self.request(url, params=params).text
@@ -115,15 +134,22 @@ class ZerochanTagExtractor(ZerochanExtractor):
post = extr('<li class="', '>')
if not post:
break
- yield {
- "id" : extr('href="/', '"'),
- "name" : extr('alt="', '"'),
- "width" : extr('title="', 'x'),
- "height": extr('', ' '),
- "size" : extr('', 'B'),
- "file_url": "https://static." + extr(
- '<a href="https://static.', '"'),
- }
+
+ if metadata:
+ entry_id = extr('href="/', '"')
+ post = self._parse_entry_html(entry_id)
+ post.update(self._parse_entry_xml(entry_id))
+ yield post
+ else:
+ yield {
+ "id" : extr('href="/', '"'),
+ "name" : extr('alt="', '"'),
+ "width" : extr('title="', 'x'),
+ "height": extr('', ' '),
+ "size" : extr('', 'B'),
+ "file_url": "https://static." + extr(
+ '<a href="https://static.', '"'),
+ }
if 'rel="next"' not in page:
break
@@ -153,4 +179,7 @@ class ZerochanImageExtractor(ZerochanExtractor):
self.image_id = match.group(1)
def posts(self):
- return (self._parse_entry_page(self.image_id),)
+ post = self._parse_entry_html(self.image_id)
+ if self.config("metadata"):
+ post.update(self._parse_entry_xml(self.image_id))
+ return (post,)
diff --git a/gallery_dl/postprocessor/zip.py b/gallery_dl/postprocessor/zip.py
index ff97add..4f376fe 100644
--- a/gallery_dl/postprocessor/zip.py
+++ b/gallery_dl/postprocessor/zip.py
@@ -26,6 +26,7 @@ class ZipPP(PostProcessor):
def __init__(self, job, options):
PostProcessor.__init__(self, job)
self.delete = not options.get("keep-files", False)
+ self.files = options.get("files")
ext = "." + options.get("extension", "zip")
algorithm = options.get("compression", "store")
if algorithm not in self.COMPRESSION_ALGORITHMS:
@@ -56,6 +57,9 @@ class ZipPP(PostProcessor):
# 'NameToInfo' is not officially documented, but it's available
# for all supported Python versions and using it directly is a lot
# faster than calling getinfo()
+ if self.files:
+ self.write_extra(pathfmt, zfile, self.files)
+ self.files = None
if pathfmt.filename not in zfile.NameToInfo:
zfile.write(pathfmt.temppath, pathfmt.filename)
pathfmt.delete = self.delete
@@ -69,6 +73,21 @@ class ZipPP(PostProcessor):
with self.open() as zfile:
self.write(pathfmt, zfile)
+ def write_extra(self, pathfmt, zfile, files):
+ for path in map(util.expand_path, files):
+ if not os.path.isabs(path):
+ path = os.path.join(pathfmt.realdirectory, path)
+ try:
+ zfile.write(path, os.path.basename(path))
+ except OSError as exc:
+ self.log.warning(
+ "Unable to write %s to %s", path, zfile.filename)
+ self.log.debug("%s: %s", exc, exc.__class__.__name__)
+ pass
+ else:
+ if self.delete:
+ util.remove_file(path)
+
def finalize(self, pathfmt, status):
if self.zfile:
self.zfile.close()
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index d12d088..ce018fe 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.23.0"
+__version__ = "1.23.1"