Diffstat:
 CHANGELOG.md                      |  25
 PKG-INFO                          |  10
 README.rst                        |   8
 data/man/gallery-dl.1             |   2
 data/man/gallery-dl.conf.5        |  96
 docs/gallery-dl.conf              |   3
 gallery_dl.egg-info/PKG-INFO      |  10
 gallery_dl.egg-info/SOURCES.txt   |   1
 gallery_dl/__init__.py            |   4
 gallery_dl/extractor/__init__.py  |   1
 gallery_dl/extractor/bunkr.py     |  12
 gallery_dl/extractor/common.py    |   3
 gallery_dl/extractor/exhentai.py  |  16
 gallery_dl/extractor/flickr.py    |  32
 gallery_dl/extractor/hotleak.py   | 228
 gallery_dl/extractor/instagram.py |   6
 gallery_dl/extractor/paheal.py    |  30
 gallery_dl/extractor/poipiku.py   |  13
 gallery_dl/extractor/reddit.py    |   4
 gallery_dl/extractor/redgifs.py   |   1
 gallery_dl/extractor/smugmug.py   |   6
 gallery_dl/extractor/tumblr.py    |  81
 gallery_dl/extractor/twitter.py   |  77
 gallery_dl/extractor/zerochan.py  |  55
 gallery_dl/postprocessor/zip.py   |  19
 gallery_dl/version.py             |   2
 test/test_postprocessor.py        |  16
 27 files changed, 632 insertions(+), 129 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 61987d9..4f4fdf9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,30 @@
# Changelog
+## 1.23.1 - 2022-09-18
+### Additions
+- [flickr] add support for `secure.flickr.com` URLs ([#2910](https://github.com/mikf/gallery-dl/issues/2910))
+- [hotleak] add hotleak extractors ([#2890](https://github.com/mikf/gallery-dl/issues/2890), [#2909](https://github.com/mikf/gallery-dl/issues/2909))
+- [instagram] add `highlight_title` and `date` metadata for highlight downloads ([#2879](https://github.com/mikf/gallery-dl/issues/2879))
+- [paheal] add support for videos ([#2892](https://github.com/mikf/gallery-dl/issues/2892))
+- [twitter] add general support for unified cards ([#2875](https://github.com/mikf/gallery-dl/issues/2875))
+- [twitter] implement `cards-blacklist` option ([#2875](https://github.com/mikf/gallery-dl/issues/2875))
+- [tumblr] fetch high-quality inline images ([#2877](https://github.com/mikf/gallery-dl/issues/2877))
+- [tumblr] implement `ratelimit` option ([#2919](https://github.com/mikf/gallery-dl/issues/2919))
+- [zerochan] add `metadata` option ([#2861](https://github.com/mikf/gallery-dl/issues/2861))
+- [postprocessor:zip] implement `files` option ([#2872](https://github.com/mikf/gallery-dl/issues/2872))
+### Fixes
+- [bunkr] fix extraction ([#2903](https://github.com/mikf/gallery-dl/issues/2903))
+- [bunkr] use `media-files` servers for `m4v` and `mov` downloads ([#2925](https://github.com/mikf/gallery-dl/issues/2925))
+- [exhentai] improve 509.gif detection ([#2901](https://github.com/mikf/gallery-dl/issues/2901))
+- [exhentai] guess extension for original files ([#2842](https://github.com/mikf/gallery-dl/issues/2842))
+- [poipiku] use `img-org.poipiku.com` as image domain ([#2796](https://github.com/mikf/gallery-dl/issues/2796))
+- [reddit] prevent exception with empty submission URLs ([#2913](https://github.com/mikf/gallery-dl/issues/2913))
+- [redgifs] fix download URLs ([#2884](https://github.com/mikf/gallery-dl/issues/2884))
+- [smugmug] update default API credentials ([#2881](https://github.com/mikf/gallery-dl/issues/2881))
+- [twitter] provide proper `date` for syndication results ([#2920](https://github.com/mikf/gallery-dl/issues/2920))
+- [twitter] fix new-style `/card_img/` URLs
+- remove all whitespace before comments after input file URLs ([#2808](https://github.com/mikf/gallery-dl/issues/2808))
+
## 1.23.0 - 2022-08-28
### Changes
- [twitter] update `user` and `author` metadata fields
diff --git a/PKG-INFO b/PKG-INFO
index 60a798f..b15426c 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gallery_dl
-Version: 1.23.0
+Version: 1.23.1
Summary: Command-line program to download image galleries and collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -99,8 +99,8 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.0/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.0/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.1/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.1/gallery-dl.bin>`__
| Executables built from the latest commit can be found at
| https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
@@ -166,11 +166,11 @@ Get the direct URL of an image from a site supporting authentication with userna
gallery-dl -g -u "<username>" -p "<password>" "https://twitter.com/i/web/status/604341487988576256"
-Filter manga chapters by language and chapter number:
+Filter manga chapters by chapter number and language:
.. code:: bash
- gallery-dl --chapter-filter "lang == 'fr' and 10 <= chapter < 20" "https://mangadex.org/title/2354/"
+ gallery-dl --chapter-filter "10 <= chapter < 20" -o "lang=fr" "https://mangadex.org/title/59793dd0-a2d8-41a2-9758-8197287a8539"
| Search a remote resource for URLs and download images from them:
diff --git a/README.rst b/README.rst
index 2b45b27..813d6d8 100644
--- a/README.rst
+++ b/README.rst
@@ -66,8 +66,8 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.0/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.0/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.1/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.1/gallery-dl.bin>`__
| Executables built from the latest commit can be found at
| https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
@@ -133,11 +133,11 @@ Get the direct URL of an image from a site supporting authentication with userna
gallery-dl -g -u "<username>" -p "<password>" "https://twitter.com/i/web/status/604341487988576256"
-Filter manga chapters by language and chapter number:
+Filter manga chapters by chapter number and language:
.. code:: bash
- gallery-dl --chapter-filter "lang == 'fr' and 10 <= chapter < 20" "https://mangadex.org/title/2354/"
+ gallery-dl --chapter-filter "10 <= chapter < 20" -o "lang=fr" "https://mangadex.org/title/59793dd0-a2d8-41a2-9758-8197287a8539"
| Search a remote resource for URLs and download images from them:
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index d4efeed..e76a380 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2022-08-28" "1.23.0" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2022-09-18" "1.23.1" "gallery-dl Manual"
.\" disable hyphenation
.nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index 642cb78..f465d84 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2022-08-28" "1.23.0" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2022-09-18" "1.23.1" "gallery-dl Manual"
.\" disable hyphenation
.nh
.\" disable justification (adjust text to left margin only)
@@ -2575,13 +2575,30 @@ Search posts for inline images and videos.
\f[I]true\f[]
.IP "Description:" 4
-Download full-resolution \f[I]photo\f[] images.
+Download full-resolution \f[I]photo\f[] and \f[I]inline\f[] images.
For each photo with "maximum" resolution
-(width equal to 2048 or height equal to 3072),
+(width equal to 2048 or height equal to 3072)
+or each inline image,
use an extra HTTP request to find the URL to its full-resolution version.
+.SS extractor.tumblr.ratelimit
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"abort"\f[]
+
+.IP "Description:" 4
+Selects how to handle exceeding the daily API rate limit.
+
+.br
+* \f[I]"abort"\f[]: Raise an error and stop extraction
+.br
+* \f[I]"wait"\f[]: Wait until rate limit reset
+
+
.SS extractor.tumblr.reblogs
.IP "Type:" 6
\f[I]bool\f[] or \f[I]string\f[]
@@ -2664,6 +2681,26 @@ Controls how to handle \f[I]Twitter Cards\f[].
* \f[I]"ytdl"\f[]: Additionally download video content from unsupported cards using \f[I]youtube-dl\f[]
+.SS extractor.twitter.cards-blacklist
+.IP "Type:" 6
+\f[I]list\f[] of \f[I]strings\f[]
+
+.IP "Example:" 4
+["summary", "youtube.com", "player:twitch.tv"]
+
+.IP "Description:" 4
+List of card types to ignore.
+
+Possible values are
+
+.br
+* card names
+.br
+* card domains
+.br
+* \f[I]<card name>:<card domain>\f[]
+
+
.SS extractor.twitter.conversations
.IP "Type:" 6
\f[I]bool\f[]
@@ -2672,8 +2709,11 @@ Controls how to handle \f[I]Twitter Cards\f[].
\f[I]false\f[]
.IP "Description:" 4
-Fetch media from all Tweets and replies in a \f[I]conversation
-<https://help.twitter.com/en/using-twitter/twitter-conversations>\f[].
+For input URLs pointing to a single Tweet,
+e.g. https://twitter.com/i/web/status/<TweetID>,
+fetch media from all Tweets and replies in this \f[I]conversation
+<https://help.twitter.com/en/using-twitter/twitter-conversations>\f[]
+or thread.
.SS extractor.twitter.csrf
@@ -2692,6 +2732,25 @@ Controls how to handle Cross Site Request Forgery (CSRF) tokens.
* \f[I]"cookies"\f[]: Use token given by the \f[I]ct0\f[] cookie if present.
+.SS extractor.twitter.expand
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+For each Tweet, return *all* Tweets from that initial Tweet's
+conversation or thread, i.e. *expand* all Twitter threads.
+
+Going through a timeline with this option enabled is essentially the same
+as running \f[I]gallery-dl https://twitter.com/i/web/status/<TweetID>\f[]
+with the \f[I]conversations\f[] option enabled
+for each Tweet in said timeline.
+
+Note: This requires at least 1 additional API call per initial Tweet.
+
+
.SS extractor.twitter.size
.IP "Type:" 6
\f[I]list\f[] of \f[I]strings\f[]
@@ -3140,6 +3199,19 @@ Additional options specified as youtube-dl command-line arguments.
Location of a youtube-dl configuration file to load options from.
+.SS extractor.zerochan.metadata
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Extract additional metadata (date, md5, tags, ...)
+
+Note: This requires 1-2 additional HTTP requests for each post.
+
+
.SS extractor.[booru].tags
.IP "Type:" 6
\f[I]bool\f[]
@@ -4225,6 +4297,20 @@ to prevent it from only being displayed for a very short amount of time.
Filename extension for the created ZIP archive.
+.SS zip.files
+.IP "Type:" 6
+\f[I]list\f[] of \f[I]Path\f[]
+
+.IP "Example:" 4
+["info.json"]
+
+.IP "Description:" 4
+List of extra files to be added to a ZIP archive.
+
+Note: Relative paths are relative to the current
+\f[I]download directory\f[].
+
+
.SS zip.keep-files
.IP "Type:" 6
\f[I]bool\f[]
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index 1e485ee..6ba50f2 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -341,7 +341,8 @@
"zerochan":
{
"username": null,
- "password": null
+ "password": null,
+ "metadata": false
},
"booru":
{
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index 6b9d68b..ea2164a 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gallery-dl
-Version: 1.23.0
+Version: 1.23.1
Summary: Command-line program to download image galleries and collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -99,8 +99,8 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.0/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.0/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.1/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.1/gallery-dl.bin>`__
| Executables built from the latest commit can be found at
| https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
@@ -166,11 +166,11 @@ Get the direct URL of an image from a site supporting authentication with userna
gallery-dl -g -u "<username>" -p "<password>" "https://twitter.com/i/web/status/604341487988576256"
-Filter manga chapters by language and chapter number:
+Filter manga chapters by chapter number and language:
.. code:: bash
- gallery-dl --chapter-filter "lang == 'fr' and 10 <= chapter < 20" "https://mangadex.org/title/2354/"
+ gallery-dl --chapter-filter "10 <= chapter < 20" -o "lang=fr" "https://mangadex.org/title/59793dd0-a2d8-41a2-9758-8197287a8539"
| Search a remote resource for URLs and download images from them:
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index 5f5084b..73cc80b 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -92,6 +92,7 @@ gallery_dl/extractor/hentaihand.py
gallery_dl/extractor/hentaihere.py
gallery_dl/extractor/hiperdex.py
gallery_dl/extractor/hitomi.py
+gallery_dl/extractor/hotleak.py
gallery_dl/extractor/idolcomplex.py
gallery_dl/extractor/imagebam.py
gallery_dl/extractor/imagechest.py
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py
index 329e7ab..7504fa4 100644
--- a/gallery_dl/__init__.py
+++ b/gallery_dl/__init__.py
@@ -96,9 +96,9 @@ def parse_inputfile(file, log):
else:
# url
if " #" in line:
- line = line.partition(" #")[0]
+ line = line.partition(" #")[0].rstrip()
elif "\t#" in line:
- line = line.partition("\t#")[0]
+ line = line.partition("\t#")[0].rstrip()
if gconf or lconf:
yield util.ExtendedUrl(line, gconf, lconf)
gconf = []
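
The `.rstrip()` added above completes the input-file fix from the changelog (#2808): when a ` #comment` or `\t#comment` is cut off the end of a URL, the whitespace that preceded the marker is now trimmed as well. A minimal standalone sketch of the same partition-and-rstrip logic (the helper name is ours, not part of gallery-dl):

```python
def strip_inline_comment(line):
    # Cut everything from the first " #" or "\t#" onward, then drop
    # the whitespace that sat in front of the comment marker
    if " #" in line:
        line = line.partition(" #")[0].rstrip()
    elif "\t#" in line:
        line = line.partition("\t#")[0].rstrip()
    return line

assert strip_inline_comment(
    "https://example.org/gallery   # a note") == "https://example.org/gallery"
```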
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 9e4507a..fed6998 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -55,6 +55,7 @@ modules = [
"hentaihere",
"hiperdex",
"hitomi",
+ "hotleak",
"idolcomplex",
"imagebam",
"imagechest",
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 3091f57..2502411 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -37,6 +37,11 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
"pattern": r"https://media-files\.bunkr\.is/_-RnHoW69L\.mp4",
"content": "80e61d1dbc5896ae7ef9a28734c747b28b320471",
}),
+ # cdn4
+ ("https://bunkr.is/a/iXTTc1o2", {
+ "pattern": r"https://(cdn|media-files)4\.bunkr\.is/",
+ "content": "da29aae371b7adc8c5ef8e6991b66b69823791e8",
+ }),
("https://bunkr.to/a/Lktg9Keq"),
)
@@ -66,9 +71,8 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
data = json.loads(text.extract(
self.request(url).text,
'id="__NEXT_DATA__" type="application/json">', '<')[0])
- props = data["props"]["pageProps"]
- album = props["album"]
- files = props["files"]
+ album = data["props"]["pageProps"]["album"]
+ files = album["files"]
except Exception as exc:
self.log.debug(exc.__class__.__name__, exc)
self.root = self.root.replace("bunkr", "app.bunkr", 1)
@@ -77,7 +81,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
for file in files:
name = file["name"]
cdn = file["cdn"]
- if name.endswith(".mp4"):
+ if name.endswith((".mp4", ".m4v", ".mov")):
cdn = cdn.replace("//cdn", "//media-files")
file["file"] = cdn + "/" + name
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 1b41101..f7ee51f 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -152,7 +152,8 @@ class Extractor():
server = response.headers.get("Server")
if server and server.startswith("cloudflare"):
if code == 503 and \
- b"jschl-answer" in response.content:
+ (b"_cf_chl_opt" in response.content or
+ b"jschl-answer" in response.content):
self.log.warning("Cloudflare IUAM challenge")
break
if code == 403 and \
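
With the extra `_cf_chl_opt` marker, a 503 from a Cloudflare-fronted site is recognized as an IUAM challenge when either known marker string appears in the body. The detection condition in isolation, sketched with plain stand-in arguments:

```python
def is_cloudflare_challenge(status, headers, content):
    # Cloudflare "Server" header plus a 503 whose body contains
    # one of the two known challenge markers
    server = headers.get("Server", "")
    return (server.startswith("cloudflare") and status == 503 and
            (b"_cf_chl_opt" in content or b"jschl-answer" in content))
```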
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 2720691..01ba03a 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -219,7 +219,6 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
if self.limits:
self._check_limits(data)
if "/fullimg.php" in url:
- data["extension"] = ""
data["_http_validate"] = _validate_response
else:
data["_http_validate"] = None
@@ -328,8 +327,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
data["image_token"] = self.key["start"] = extr('var startkey="', '";')
self.key["show"] = extr('var showkey="', '";')
- if iurl.endswith("g/509.gif"):
- self._report_limits(data)
+ self._check_509(iurl, data)
return url, text.nameext_from_url(iurl, data)
def images_from_api(self):
@@ -365,8 +363,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
data["num"] = request["page"]
data["image_token"] = imgkey
- if imgurl.endswith("g/509.gif"):
- self._report_limits(data)
+ self._check_509(imgurl, data)
yield url, text.nameext_from_url(imgurl, data)
request["imgkey"] = nextkey
@@ -385,6 +382,15 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
if self._remaining <= 0:
self._report_limits(data)
+ def _check_509(self, url, data):
+ # full 509.gif URLs
+ # - https://exhentai.org/img/509.gif
+ # - https://ehgt.org/g/509.gif
+ if url.endswith(("hentai.org/img/509.gif",
+ "ehgt.org/g/509.gif")):
+ self.log.debug(url)
+ self._report_limits(data)
+
def _update_limits(self):
url = "https://e-hentai.org/home.php"
cookies = {
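
`_check_509` replaces the two inline `g/509.gif` checks and matches both full 509.gif URLs listed in its comment; `str.endswith` with a tuple keeps it a single call:

```python
def is_509(url):
    # Both known locations of the image-limit placeholder
    return url.endswith(("hentai.org/img/509.gif", "ehgt.org/g/509.gif"))

assert is_509("https://exhentai.org/img/509.gif")
assert is_509("https://ehgt.org/g/509.gif")
assert not is_509("https://exhentai.org/img/someimage.gif")
```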
diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py
index 2bd8c6b..e85d68a 100644
--- a/gallery_dl/extractor/flickr.py
+++ b/gallery_dl/extractor/flickr.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2020 Mike Fährmann
+# Copyright 2017-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -11,6 +11,8 @@
from .common import Extractor, Message
from .. import text, oauth, util, exception
+BASE_PATTERN = r"(?:https?://)?(?:www\.|secure\.|m\.)?flickr\.com"
+
class FlickrExtractor(Extractor):
"""Base class for flickr extractors"""
@@ -55,7 +57,7 @@ class FlickrImageExtractor(FlickrExtractor):
"""Extractor for individual images from flickr.com"""
subcategory = "image"
pattern = (r"(?:https?://)?(?:"
- r"(?:(?:www\.|m\.)?flickr\.com/photos/[^/]+/"
+ r"(?:(?:www\.|secure\.|m\.)?flickr\.com/photos/[^/?#]+/"
r"|[\w-]+\.static\.?flickr\.com/(?:\d+/)+)(\d+)"
r"|flic\.kr/p/([A-Za-z1-9]+))")
test = (
@@ -77,6 +79,10 @@ class FlickrImageExtractor(FlickrExtractor):
"width": 1024,
},
}),
+ ("https://secure.flickr.com/photos/departingyyz/16089302239"),
+ ("https://m.flickr.com/photos/departingyyz/16089302239"),
+ ("https://flickr.com/photos/departingyyz/16089302239"),
+
("https://www.flickr.com/photos/145617051@N08/46733161535", {
"count": 1,
"keyword": {"media": "video"},
@@ -132,8 +138,7 @@ class FlickrAlbumExtractor(FlickrExtractor):
directory_fmt = ("{category}", "{user[username]}",
"Albums", "{album[id]} {album[title]}")
archive_fmt = "a_{album[id]}_{id}"
- pattern = (r"(?:https?://)?(?:www\.)?flickr\.com/"
- r"photos/([^/]+)/(?:album|set)s(?:/(\d+))?")
+ pattern = BASE_PATTERN + r"/photos/([^/?#]+)/(?:album|set)s(?:/(\d+))?"
test = (
(("https://www.flickr.com/photos/shona_s/albums/72157633471741607"), {
"pattern": FlickrImageExtractor.pattern,
@@ -143,6 +148,8 @@ class FlickrAlbumExtractor(FlickrExtractor):
"pattern": pattern,
"count": 2,
}),
+ ("https://secure.flickr.com/photos/shona_s/albums"),
+ ("https://m.flickr.com/photos/shona_s/albums"),
)
def __init__(self, match):
@@ -180,8 +187,7 @@ class FlickrGalleryExtractor(FlickrExtractor):
directory_fmt = ("{category}", "{user[username]}",
"Galleries", "{gallery[gallery_id]} {gallery[title]}")
archive_fmt = "g_{gallery[id]}_{id}"
- pattern = (r"(?:https?://)?(?:www\.)?flickr\.com/"
- r"photos/([^/]+)/galleries/(\d+)")
+ pattern = BASE_PATTERN + r"/photos/([^/?#]+)/galleries/(\d+)"
test = (("https://www.flickr.com/photos/flickr/"
"galleries/72157681572514792/"), {
"pattern": FlickrImageExtractor.pattern,
@@ -206,7 +212,7 @@ class FlickrGroupExtractor(FlickrExtractor):
subcategory = "group"
directory_fmt = ("{category}", "Groups", "{group[groupname]}")
archive_fmt = "G_{group[nsid]}_{id}"
- pattern = r"(?:https?://)?(?:www\.)?flickr\.com/groups/([^/]+)"
+ pattern = BASE_PATTERN + r"/groups/([^/?#]+)"
test = ("https://www.flickr.com/groups/bird_headshots/", {
"pattern": FlickrImageExtractor.pattern,
"count": "> 150",
@@ -224,7 +230,7 @@ class FlickrUserExtractor(FlickrExtractor):
"""Extractor for the photostream of a flickr user"""
subcategory = "user"
archive_fmt = "u_{user[nsid]}_{id}"
- pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/?$"
+ pattern = BASE_PATTERN + r"/photos/([^/?#]+)/?$"
test = ("https://www.flickr.com/photos/shona_s/", {
"pattern": FlickrImageExtractor.pattern,
"count": 28,
@@ -239,7 +245,7 @@ class FlickrFavoriteExtractor(FlickrExtractor):
subcategory = "favorite"
directory_fmt = ("{category}", "{user[username]}", "Favorites")
archive_fmt = "f_{user[nsid]}_{id}"
- pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/favorites"
+ pattern = BASE_PATTERN + r"/photos/([^/?#]+)/favorites"
test = ("https://www.flickr.com/photos/shona_s/favorites", {
"pattern": FlickrImageExtractor.pattern,
"count": 4,
@@ -254,7 +260,7 @@ class FlickrSearchExtractor(FlickrExtractor):
subcategory = "search"
directory_fmt = ("{category}", "Search", "{search[text]}")
archive_fmt = "s_{search}_{id}"
- pattern = r"(?:https?://)?(?:www\.)?flickr\.com/search/?\?([^#]+)"
+ pattern = BASE_PATTERN + r"/search/?\?([^#]+)"
test = (
("https://flickr.com/search/?text=mountain"),
("https://flickr.com/search/?text=tree%20cloud%20house"
@@ -275,7 +281,11 @@ class FlickrSearchExtractor(FlickrExtractor):
class FlickrAPI(oauth.OAuth1API):
- """Minimal interface for the flickr API"""
+ """Minimal interface for the flickr API
+
+ https://www.flickr.com/services/api/
+ """
+
API_URL = "https://api.flickr.com/services/rest/"
API_KEY = "ac4fd7aa98585b9eee1ba761c209de68"
API_SECRET = "3adb0f568dc68393"
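
Hoisting the host part into `BASE_PATTERN` gives every flickr extractor the `www.`, `secure.`, and `m.` subdomains at once (#2910), and `[^/?#]+` stops the captured path segment at query strings and fragments. A quick check of the consolidated pattern against representative URLs:

```python
import re

BASE_PATTERN = r"(?:https?://)?(?:www\.|secure\.|m\.)?flickr\.com"
user_pattern = re.compile(BASE_PATTERN + r"/photos/([^/?#]+)/?$")

for url in ("https://www.flickr.com/photos/shona_s/",
            "https://secure.flickr.com/photos/shona_s",
            "m.flickr.com/photos/shona_s"):
    assert user_pattern.match(url).group(1) == "shona_s"
```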
diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py
new file mode 100644
index 0000000..d6575cf
--- /dev/null
+++ b/gallery_dl/extractor/hotleak.py
@@ -0,0 +1,228 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hotleak.vip/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?hotleak\.vip"
+
+
+class HotleakExtractor(Extractor):
+ """Base class for hotleak extractors"""
+ category = "hotleak"
+ directory_fmt = ("{category}", "{creator}",)
+ filename_fmt = "{creator}_{id}.{extension}"
+ archive_fmt = "{type}_{creator}_{id}"
+ root = "https://hotleak.vip"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.session.headers["Referer"] = self.root
+
+ def items(self):
+ for post in self.posts():
+ yield Message.Directory, post
+ yield Message.Url, post["url"], post
+
+ def posts(self):
+ """Return an iterable containing relevant posts"""
+ return ()
+
+ def _pagination(self, url, params):
+ params = text.parse_query(params)
+ params["page"] = text.parse_int(params.get("page"), 1)
+
+ while True:
+ page = self.request(url, params=params).text
+ if "</article>" not in page:
+ return
+
+ for item in text.extract_iter(
+ page, '<article class="movie-item', '</article>'):
+ yield text.extract(item, '<a href="', '"')[0]
+
+ params["page"] += 1
+
+
+class HotleakPostExtractor(HotleakExtractor):
+ """Extractor for individual posts on hotleak"""
+ subcategory = "post"
+ pattern = (BASE_PATTERN + r"/(?!hot|creators|videos|photos)"
+ r"([^/]+)/(photo|video)/(\d+)")
+ test = (
+ ("https://hotleak.vip/kaiyakawaii/photo/1617145", {
+ "pattern": r"https://hotleak\.vip/storage/images/3625"
+ r"/1617145/fefdd5988dfcf6b98cc9e11616018868\.jpg",
+ "keyword": {
+ "id": 1617145,
+ "creator": "kaiyakawaii",
+ "type": "photo",
+ "filename": "fefdd5988dfcf6b98cc9e11616018868",
+ "extension": "jpg",
+ },
+ }),
+ ("https://hotleak.vip/lilmochidoll/video/1625538", {
+ "pattern": r"ytdl:https://cdn8-leak\.camhdxx\.com"
+ r"/1661/1625538/index\.m3u8",
+ "keyword": {
+ "id": 1625538,
+ "creator": "lilmochidoll",
+ "type": "video",
+ "filename": "index",
+ "extension": "mp4",
+ },
+ }),
+ )
+
+ def __init__(self, match):
+ HotleakExtractor.__init__(self, match)
+ self.creator, self.type, self.id = match.groups()
+
+ def posts(self):
+ url = "{}/{}/{}/{}".format(
+ self.root, self.creator, self.type, self.id)
+ page = self.request(url).text
+ page = text.extract(
+ page, '<div class="movie-image thumb">', '</article>')[0]
+ data = {
+ "id" : text.parse_int(self.id),
+ "creator": self.creator,
+ "type" : self.type,
+ }
+
+ if self.type == "photo":
+ data["url"] = text.extract(page, 'data-src="', '"')[0]
+ text.nameext_from_url(data["url"], data)
+
+ elif self.type == "video":
+ data["url"] = "ytdl:" + text.extract(
+ text.unescape(page), '"src":"', '"')[0]
+ text.nameext_from_url(data["url"], data)
+ data["extension"] = "mp4"
+
+ return (data,)
+
+
+class HotleakCreatorExtractor(HotleakExtractor):
+ """Extractor for all posts from a hotleak creator"""
+ subcategory = "creator"
+ pattern = BASE_PATTERN + r"/(?!hot|creators|videos|photos)([^/?#]+)/?$"
+ test = (
+ ("https://hotleak.vip/kaiyakawaii", {
+ "range": "1-200",
+ "count": 200,
+ }),
+ ("https://hotleak.vip/stellaviolet", {
+ "count": "> 600"
+ }),
+ ("https://hotleak.vip/doesnotexist", {
+ "exception": exception.NotFoundError,
+ }),
+ )
+
+ def __init__(self, match):
+ HotleakExtractor.__init__(self, match)
+ self.creator = match.group(1)
+
+ def posts(self):
+ url = "{}/{}".format(self.root, self.creator)
+ return self._pagination(url)
+
+ def _pagination(self, url):
+ headers = {"X-Requested-With": "XMLHttpRequest"}
+ params = {"page": 1}
+
+ while True:
+ try:
+ response = self.request(
+ url, headers=headers, params=params, notfound="creator")
+ except exception.HttpError as exc:
+ if exc.response.status_code == 429:
+ self.wait(
+ until=exc.response.headers.get("X-RateLimit-Reset"))
+ continue
+
+ posts = response.json()
+ if not posts:
+ return
+
+ data = {"creator": self.creator}
+ for post in posts:
+ data["id"] = text.parse_int(post["id"])
+
+ if post["type"] == 0:
+ data["type"] = "photo"
+ data["url"] = self.root + "/storage/" + post["image"]
+ text.nameext_from_url(data["url"], data)
+
+ elif post["type"] == 1:
+ data["type"] = "video"
+ data["url"] = "ytdl:" + post["stream_url_play"]
+ text.nameext_from_url(data["url"], data)
+ data["extension"] = "mp4"
+
+ yield data
+ params["page"] += 1
+
+
+class HotleakCategoryExtractor(HotleakExtractor):
+ """Extractor for hotleak categories"""
+ subcategory = "category"
+ pattern = BASE_PATTERN + r"/(hot|creators|videos|photos)(?:/?\?([^#]+))?"
+ test = (
+ ("https://hotleak.vip/photos", {
+ "pattern": HotleakPostExtractor.pattern,
+ "range": "1-50",
+ "count": 50,
+ }),
+ ("https://hotleak.vip/videos"),
+ ("https://hotleak.vip/creators", {
+ "pattern": HotleakCreatorExtractor.pattern,
+ "range": "1-50",
+ "count": 50,
+ }),
+ ("https://hotleak.vip/hot"),
+ )
+
+ def __init__(self, match):
+ HotleakExtractor.__init__(self, match)
+ self._category, self.params = match.groups()
+
+ def items(self):
+ url = "{}/{}".format(self.root, self._category)
+
+ if self._category in ("hot", "creators"):
+ data = {"_extractor": HotleakCreatorExtractor}
+ elif self._category in ("videos", "photos"):
+ data = {"_extractor": HotleakPostExtractor}
+
+ for item in self._pagination(url, self.params):
+ yield Message.Queue, item, data
+
+
+class HotleakSearchExtractor(HotleakExtractor):
+ """Extractor for hotleak search results"""
+ subcategory = "search"
+ pattern = BASE_PATTERN + r"/search(?:/?\?([^#]+))"
+ test = (
+ ("https://hotleak.vip/search?search=gallery-dl", {
+ "count": 0,
+ }),
+ ("https://hotleak.vip/search?search=hannah", {
+ "count": "> 30",
+ }),
+ )
+
+ def __init__(self, match):
+ HotleakExtractor.__init__(self, match)
+ self.params = match.group(1)
+
+ def items(self):
+ data = {"_extractor": HotleakCreatorExtractor}
+ for creator in self._pagination(self.root + "/search", self.params):
+ yield Message.Queue, creator, data
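
`HotleakCreatorExtractor._pagination` pages through a JSON endpoint and, on an HTTP 429, waits for the time announced in the `X-RateLimit-Reset` header before retrying the same page. The retry shape in isolation, sketched with `requests` and the header treated as a seconds-until-reset value (an assumption; gallery-dl's own `request()`/`wait()` helpers wrap this):

```python
import time
import requests

def fetch_json_page(url, params):
    # Retry the same page after a 429 instead of giving up;
    # header semantics assumed to be "seconds until reset"
    while True:
        response = requests.get(
            url, params=params,
            headers={"X-Requested-With": "XMLHttpRequest"})
        if response.status_code == 429:
            time.sleep(float(response.headers.get("X-RateLimit-Reset", 60)))
            continue
        response.raise_for_status()
        return response.json()
```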
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index d56af8b..8c98d2e 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -264,6 +264,12 @@ class InstagramExtractor(Extractor):
"post_id": reel_id,
"post_shortcode": shortcode_from_id(reel_id),
}
+
+ if "title" in post:
+ data["highlight_title"] = post["title"]
+ if "created_at" in post:
+ data["date"] = text.parse_timestamp(post.get("created_at"))
+
else:
data = {
"post_id" : post["pk"],
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index 0a6a6d3..56e3b39 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -49,7 +49,8 @@ class PahealExtractor(Extractor):
"id" : post_id,
"tags" : extr(": ", "<"),
"md5" : extr("/_thumbs/", "/"),
- "file_url": extr("id='main_image' src='", "'"),
+ "file_url": (extr("id='main_image' src='", "'") or
+ extr("<source src='", "'")),
"uploader": text.unquote(extr(
"class='username' href='/user/", "'")),
"date" : text.parse_datetime(
@@ -59,8 +60,10 @@ class PahealExtractor(Extractor):
}
dimensions, size, ext = extr("Info</th><td>", ">").split(" // ")
- post["width"], _, post["height"] = dimensions.partition("x")
+ post["width"], _, height = dimensions.partition("x")
post["size"] = text.parse_bytes(size[:-1])
+ post["height"], _, duration = height.partition(", ")
+ post["duration"] = text.parse_float(duration[:-1])
return post
@@ -111,10 +114,12 @@ class PahealTagExtractor(PahealExtractor):
tags, data, date = data.split("\n")
dimensions, size, ext = data.split(" // ")
width, _, height = dimensions.partition("x")
+ height, _, duration = height.partition(", ")
return {
"id": pid, "md5": md5, "file_url": url,
"width": width, "height": height,
+ "duration": text.parse_float(duration[:-1]),
"tags": text.unescape(tags),
"size": text.parse_bytes(size[:-1]),
"date": text.parse_datetime(date, "%B %d, %Y; %H:%M"),
@@ -163,6 +168,27 @@ class PahealPostExtractor(PahealExtractor):
"width": 1200,
},
}),
+ # video
+ ("https://rule34.paheal.net/post/view/3864982", {
+ "pattern": r"https://[\w]+\.paheal\.net/_images/7629fc0ff77e32637d"
+ r"de5bf4f992b2cb/3864982%20-%20Metal_Gear%20Metal_Gear_"
+ r"Solid_V%20Quiet%20Vg_erotica%20animated%20webm\.webm",
+ "keyword": {
+ "date": "dt:2020-09-06 01:59:03",
+ "duration": 30.0,
+ "extension": "webm",
+ "height": 2500,
+ "id": 3864982,
+ "md5": "7629fc0ff77e32637dde5bf4f992b2cb",
+ "size": 18454938,
+ "source": "https://twitter.com/VG_Worklog"
+ "/status/1302407696294055936",
+ "tags": "Metal_Gear Metal_Gear_Solid_V Quiet "
+ "Vg_erotica animated webm",
+ "uploader": "justausername",
+ "width": 1768,
+ },
+ }),
)
def __init__(self, match):
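
The paheal changes split an optional duration out of the height field: for videos, the site's info string looks like `1768x2500, 30.0s // 17.6MB // webm` (per the test case above), so the height segment is partitioned once more on `", "`. The full parse as a standalone sketch:

```python
def parse_info(info):
    # "WIDTHxHEIGHT[, DURATIONs] // SIZE // EXT"
    dimensions, size, ext = info.split(" // ")
    width, _, height = dimensions.partition("x")
    height, _, duration = height.partition(", ")
    return {
        "width"    : int(width),
        "height"   : int(height),
        "duration" : float(duration[:-1]) if duration else 0.0,  # strip "s"
        "size"     : size,
        "extension": ext,
    }

print(parse_info("1768x2500, 30.0s // 17.6MB // webm"))
# {'width': 1768, 'height': 2500, 'duration': 30.0,
#  'size': '17.6MB', 'extension': 'webm'}
```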
diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py
index 8203885..4283081 100644
--- a/gallery_dl/extractor/poipiku.py
+++ b/gallery_dl/extractor/poipiku.py
@@ -42,6 +42,7 @@ class PoipikuExtractor(Extractor):
'<h2 class="UserInfoUserName">', '</').rpartition(">")[2]),
"description": text.unescape(extr(
'class="IllustItemDesc" >', '<')),
+ "_http_headers": {"Referer": post_url},
}
yield Message.Directory, post
@@ -54,7 +55,8 @@ class PoipikuExtractor(Extractor):
elif thumb.startswith(("//img.poipiku.com/img/", "/img/")):
continue
post["num"] += 1
- url = text.ensure_http_scheme(thumb[:-8])
+ url = text.ensure_http_scheme(thumb[:-8]).replace(
+ "//img.", "//img-org.", 1)
yield Message.Url, url, text.nameext_from_url(url, post)
if not extr('> show all', '<'):
@@ -80,7 +82,8 @@ class PoipikuExtractor(Extractor):
for thumb in text.extract_iter(
page, 'class="IllustItemThumbImg" src="', '"'):
post["num"] += 1
- url = text.ensure_http_scheme(thumb[:-8])
+ url = text.ensure_http_scheme(thumb[:-8]).replace(
+ "//img.", "//img-org.", 1)
yield Message.Url, url, text.nameext_from_url(url, post)
@@ -91,7 +94,7 @@ class PoipikuUserExtractor(PoipikuExtractor):
r"(\d+)/?(?:$|[?&#])")
test = (
("https://poipiku.com/25049/", {
- "pattern": r"https://img\.poipiku\.com/user_img\d+/000025049"
+ "pattern": r"https://img-org\.poipiku\.com/user_img\d+/000025049"
r"/\d+_\w+\.(jpe?g|png)$",
"range": "1-10",
"count": 10,
@@ -131,7 +134,7 @@ class PoipikuPostExtractor(PoipikuExtractor):
pattern = BASE_PATTERN + r"/(\d+)/(\d+)"
test = (
("https://poipiku.com/25049/5864576.html", {
- "pattern": r"https://img\.poipiku\.com/user_img\d+/000025049"
+ "pattern": r"https://img-org\.poipiku\.com/user_img\d+/000025049"
r"/005864576_EWN1Y65gQ\.png$",
"keyword": {
"count": "1",
@@ -146,7 +149,7 @@ class PoipikuPostExtractor(PoipikuExtractor):
},
}),
("https://poipiku.com/2166245/6411749.html", {
- "pattern": r"https://img\.poipiku\.com/user_img\d+/002166245"
+ "pattern": r"https://img-org\.poipiku\.com/user_img\d+/002166245"
r"/006411749_\w+\.jpeg$",
"count": 4,
"keyword": {
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index d35e24e..954a84f 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -46,10 +46,10 @@ class RedditExtractor(Extractor):
submission["created_utc"])
yield Message.Directory, submission
visited.add(submission["id"])
- url = submission["url"]
submission["num"] = 0
- if url.startswith("https://i.redd.it/"):
+ url = submission["url"]
+ if url and url.startswith("https://i.redd.it/"):
text.nameext_from_url(url, submission)
yield Message.Url, url, submission
diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py
index 2c3ed44..3a4fb0e 100644
--- a/gallery_dl/extractor/redgifs.py
+++ b/gallery_dl/extractor/redgifs.py
@@ -53,6 +53,7 @@ class RedgifsExtractor(Extractor):
for fmt in self.formats:
url = urls.get(fmt)
if url:
+ url = url.replace("//thumbs2.", "//thumbs3.", 1)
text.nameext_from_url(url, gif)
yield url
diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py
index 4010da3..2264fe4 100644
--- a/gallery_dl/extractor/smugmug.py
+++ b/gallery_dl/extractor/smugmug.py
@@ -209,9 +209,9 @@ class SmugmugPathExtractor(SmugmugExtractor):
class SmugmugAPI(oauth.OAuth1API):
"""Minimal interface for the smugmug API v2"""
API_DOMAIN = "api.smugmug.com"
- API_KEY = "DFqxg4jf7GrtsQ5PnbNB8899zKfnDrdK"
- API_SECRET = ("fknV35p9r9BwZC4XbTzvCXpcSJRdD83S"
- "9nMFQm25ndGBzNPnwRDbRnnVBvqt4xTq")
+ API_KEY = "RCVHDGjcbc4Fhzq4qzqLdZmvwmwB6LM2"
+ API_SECRET = ("jGrdndvJqhTx8XSNs7TFTSSthhZHq92d"
+ "dMpbpDpkDVNM7TDgnvLFMtfB5Mg5kH73")
HEADERS = {"Accept": "application/json"}
def album(self, album_id, expands=None):
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index b694fa0..6f53881 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -14,25 +14,6 @@ from datetime import datetime, timedelta
import re
-def _original_inline_image(url):
- return re.sub(
- (r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
- r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)"),
- r"https://\1_1280.\2", url
- )
-
-
-def _original_video(url):
- return re.sub(
- (r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
- r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)"),
- r"https://\1.\2", url
- )
-
-
-POST_TYPES = frozenset((
- "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
-
BASE_PATTERN = (
r"(?:tumblr:(?:https?://)?([^/]+)|"
r"(?:https?://)?"
@@ -40,6 +21,9 @@ BASE_PATTERN = (
r"([\w-]+\.tumblr\.com)))"
)
+POST_TYPES = frozenset((
+ "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
+
class TumblrExtractor(Extractor):
"""Base class for tumblr extractors"""
@@ -79,6 +63,18 @@ class TumblrExtractor(Extractor):
def items(self):
blog = None
+ # pre-compile regular expressions
+ self._sub_video = re.compile(
+ r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
+ r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)").sub
+ if self.inline:
+ self._sub_image = re.compile(
+ r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
+ r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)").sub
+ self._subn_orig_image = re.compile(r"/s\d+x\d+/").subn
+ _findall_image = re.compile('<img src="([^"]+)"').findall
+ _findall_video = re.compile('<source src="([^"]+)"').findall
+
for post in self.posts():
if self.date_min > post["timestamp"]:
return
@@ -120,7 +116,7 @@ class TumblrExtractor(Extractor):
if self.original and "/s2048x3072/" in photo["url"] and (
photo["width"] == 2048 or photo["height"] == 3072):
- photo["url"] = self._original_image(photo["url"])
+ photo["url"] = self._original_photo(photo["url"])
del photo["original_size"]
del photo["alt_sizes"]
@@ -134,17 +130,18 @@ class TumblrExtractor(Extractor):
url = post.get("video_url") # type "video"
if url:
- posts.append(self._prepare(_original_video(url), post.copy()))
+ posts.append(self._prepare(
+ self._original_video(url), post.copy()))
if self.inline and "reblog" in post: # inline media
# only "chat" posts are missing a "reblog" key in their
# API response, but they can't contain images/videos anyway
body = post["reblog"]["comment"] + post["reblog"]["tree_html"]
- for url in re.findall('<img src="([^"]+)"', body):
- url = _original_inline_image(url)
+ for url in _findall_image(body):
+ url = self._original_inline_image(url)
posts.append(self._prepare_image(url, post.copy()))
- for url in re.findall('<source src="([^"]+)"', body):
- url = _original_video(url)
+ for url in _findall_video(body):
+ url = self._original_video(url)
posts.append(self._prepare(url, post.copy()))
if self.external: # external links
@@ -220,8 +217,21 @@ class TumblrExtractor(Extractor):
def _skip_reblog_same_blog(self, post):
return self.blog != post.get("reblogged_root_uuid")
- def _original_image(self, url):
- url = url.replace("/s2048x3072/", "/s99999x99999/", 1)
+ def _original_photo(self, url):
+ return self._update_image_token(
+ url.replace("/s2048x3072/", "/s99999x99999/", 1))
+
+ def _original_inline_image(self, url):
+ if self.original:
+ url, n = self._subn_orig_image("/s99999x99999/", url, 1)
+ if n:
+ return self._update_image_token(url)
+ return self._sub_image(r"https://\1_1280.\2", url)
+
+ def _original_video(self, url):
+ return self._sub_video(r"https://\1.\2", url)
+
+ def _update_image_token(self, url):
headers = {"Accept": "text/html,*/*;q=0.8"}
response = self.request(url, headers=headers)
return text.extract(response.text, '" src="', '"')[0]
@@ -305,6 +315,14 @@ class TumblrPostExtractor(TumblrExtractor):
("https://mikf123.tumblr.com/post/181022380064/chat-post", {
"count": 0,
}),
+ ("https://kichatundk.tumblr.com/post/654953419288821760", {
+ "count": 2, # high-quality images (#1846)
+ "content": "d6fcc7b6f750d835d55c7f31fa3b63be26c9f89b",
+ }),
+ ("https://hameru-is-cool.tumblr.com/post/639261855227002880", {
+ "count": 2, # high-quality images (#1344)
+ "content": "6bc19a42787e46e1bba2ef4aeef5ca28fcd3cd34",
+ }),
("https://mikf123.tumblr.com/image/689860196535762944", {
"pattern": r"^https://\d+\.media\.tumblr\.com"
r"/134791621559a79793563b636b5fe2c6"
@@ -446,10 +464,8 @@ class TumblrAPI(oauth.OAuth1API):
# daily rate limit
if response.headers.get("x-ratelimit-perday-remaining") == "0":
+ self.log.info("Daily API rate limit exceeded")
reset = response.headers.get("x-ratelimit-perday-reset")
- t = (datetime.now() + timedelta(seconds=float(reset))).time()
-
- self.log.error("Daily API rate limit exceeded")
api_key = self.api_key or self.session.auth.consumer_key
if api_key == self.API_KEY:
@@ -459,6 +475,11 @@ class TumblrAPI(oauth.OAuth1API):
"ter/docs/configuration.rst#extractortumblra"
"pi-key--api-secret")
+ if self.extractor.config("ratelimit") == "wait":
+ self.extractor.wait(seconds=reset)
+ return self._call(blog, endpoint, params)
+
+ t = (datetime.now() + timedelta(seconds=float(reset))).time()
raise exception.StopExtraction(
"Aborting - Rate limit will reset at %s",
"{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second))
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 0df4ea2..ba0597e 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -41,6 +41,7 @@ class TwitterExtractor(Extractor):
self.quoted = self.config("quoted", False)
self.videos = self.config("videos", True)
self.cards = self.config("cards", False)
+ self.cards_blacklist = self.config("cards-blacklist")
self._user = self._user_obj = None
self._user_cache = {}
self._init_sizes()
@@ -154,8 +155,11 @@ class TwitterExtractor(Extractor):
})
elif "media_url_https" in media:
url = media["media_url_https"]
- base, _, fmt = url.rpartition(".")
- base += "?format=" + fmt + "&name="
+ if url[-4] == ".":
+ base, _, fmt = url.rpartition(".")
+ base += "?format=" + fmt + "&name="
+ else:
+ base = url.rpartition("=")[0] + "="
files.append(text.nameext_from_url(url, {
"url" : base + self._size_image,
"width" : width,
@@ -174,15 +178,23 @@ class TwitterExtractor(Extractor):
card = tweet["card"]
if "legacy" in card:
card = card["legacy"]
- name = card["name"]
+
+ name = card["name"].rpartition(":")[2]
+ bvals = card["binding_values"]
+ if isinstance(bvals, list):
+ bvals = {bval["key"]: bval["value"]
+ for bval in card["binding_values"]}
+
+ cbl = self.cards_blacklist
+ if cbl:
+ if name in cbl:
+ return
+ if "vanity_url" in bvals:
+ domain = bvals["vanity_url"]["string_value"]
+ if domain in cbl or name + ":" + domain in cbl:
+ return
if name in ("summary", "summary_large_image"):
- bvals = card["binding_values"]
- if isinstance(bvals, list):
- bvals = {
- bval["key"]: bval["value"]
- for bval in card["binding_values"]
- }
for prefix in ("photo_image_full_size_",
"summary_photo_image_",
"thumbnail_image_"):
@@ -199,19 +211,9 @@ class TwitterExtractor(Extractor):
files.append(value)
return
elif name == "unified_card":
- bvals = card["binding_values"]
- if isinstance(bvals, list):
- for bval in card["binding_values"]:
- if bval["key"] == "unified_card":
- bval = bval["value"]["string_value"]
- break
- else:
- bval = bvals["unified_card"]["string_value"]
- data = json.loads(bval)
- if data.get("type") == "image_carousel_website":
- self._extract_media(
- tweet, data["media_entities"].values(), files)
- return
+ data = json.loads(bvals["unified_card"]["string_value"])
+ self._extract_media(tweet, data["media_entities"].values(), files)
+ return
if self.cards == "ytdl":
tweet_id = tweet.get("rest_id") or tweet["id_str"]
@@ -735,16 +737,33 @@ class TwitterTweetExtractor(TwitterExtractor):
"options": (("cards", True),),
"pattern": r"https://pbs.twimg.com/card_img/\d+/",
}),
- # unified_card with image_carousel_website
+ # unified_card image_website (#2875)
+ ("https://twitter.com/i/web/status/1561674543323910144", {
+ "options": (("cards", True),),
+ "pattern": r"https://pbs\.twimg\.com/media/F.+=jpg",
+ }),
+ # unified_card image_carousel_website
("https://twitter.com/doax_vv_staff/status/1479438945662685184", {
"options": (("cards", True),),
"pattern": r"https://pbs\.twimg\.com/media/F.+=png",
"count": 6,
}),
+ # unified_card video_website (#2875)
+ ("https://twitter.com/bang_dream_1242/status/1561548715348746241", {
+ "options": (("cards", True),),
+ "pattern": r"https://video\.twimg\.com/amplify_video"
+ r"/1560607284333449216/vid/720x720/\w+\.mp4",
+ }),
# unified_card without type
("https://twitter.com/i/web/status/1466183847628865544", {
"count": 0,
}),
+ # 'cards-blacklist' option
+ ("https://twitter.com/i/web/status/1571141912295243776", {
+ "options": (("cards", "ytdl"),
+ ("cards-blacklist", ("twitch.tv",))),
+ "count": 0,
+ }),
# original retweets (#1026)
("https://twitter.com/jessica_3978/status/1296304589591810048", {
"options": (("retweets", "original"),),
@@ -776,12 +795,20 @@ class TwitterTweetExtractor(TwitterExtractor):
# age-restricted (#2354)
("https://twitter.com/mightbecursed/status/1492954264909479936", {
"options": (("syndication", True),),
+ "keywords": {"date": "dt:2022-02-13 20:10:09"},
"count": 1,
}),
# media alt texts / descriptions (#2617)
("https://twitter.com/my0nruri/status/1528379296041299968", {
"keyword": {"description": "oc"}
}),
+ # '?format=...&name=...'-style URLs
+ ("https://twitter.com/poco_dandy/status/1150646424461176832", {
+ "options": (("cards", True),),
+ "pattern": r"https://pbs.twimg.com/card_img/157\d+/\w+"
+ r"\?format=(jpg|png)&name=orig$",
+ "range": "1-2",
+ }),
)
def __init__(self, match):
@@ -1442,6 +1469,10 @@ class TwitterAPI():
else:
retweet_id = None
+ tweet["created_at"] = text.parse_datetime(
+ tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ").strftime(
+ "%a %b %d %H:%M:%S +0000 %Y")
+
if "video" in tweet:
video = tweet["video"]
video["variants"] = (max(
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index 2b5acd8..72cf438 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -11,6 +11,8 @@
from .booru import BooruExtractor
from ..cache import cache
from .. import text, exception
+from xml.etree import ElementTree
+
BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net"
@@ -54,7 +56,7 @@ class ZerochanExtractor(BooruExtractor):
return response.cookies
- def _parse_entry_page(self, entry_id):
+ def _parse_entry_html(self, entry_id):
url = "{}/{}".format(self.root, entry_id)
extr = text.extract_from(self.request(url).text)
@@ -66,10 +68,26 @@ class ZerochanExtractor(BooruExtractor):
'"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y"),
"width" : extr('"width": "', ' '),
"height": extr('"height": "', ' '),
- "size" : extr('"contentSize": "', 'B'),
+ "size" : text.parse_bytes(extr('"contentSize": "', 'B')),
"path" : text.split_html(extr(
'class="breadcrumbs', '</p>'))[3::2],
- "tags" : extr('alt="Tags: ', '"').split(", ")
+ "tags" : extr('alt="Tags: Anime, ', '"').split(", ")
+ }
+
+ def _parse_entry_xml(self, entry_id):
+ url = "{}/{}?xml".format(self.root, entry_id)
+ item = ElementTree.fromstring(self.request(url).text)[0][-1]
+ # content = item[4].attrib
+
+ return {
+ # "id" : entry_id,
+ # "file_url": content["url"],
+ # "width" : content["width"],
+ # "height": content["height"],
+ # "size" : content["filesize"],
+ "name" : item[2].text,
+ "tags" : item[5].text.lstrip().split(", "),
+ "md5" : item[6].text,
}
@@ -105,6 +123,7 @@ class ZerochanTagExtractor(ZerochanExtractor):
url = self.root + "/" + self.search_tag
params = text.parse_query(self.query)
params["p"] = text.parse_int(params.get("p"), 1)
+ metadata = self.config("metadata")
while True:
page = self.request(url, params=params).text
@@ -115,15 +134,22 @@ class ZerochanTagExtractor(ZerochanExtractor):
post = extr('<li class="', '>')
if not post:
break
- yield {
- "id" : extr('href="/', '"'),
- "name" : extr('alt="', '"'),
- "width" : extr('title="', 'x'),
- "height": extr('', ' '),
- "size" : extr('', 'B'),
- "file_url": "https://static." + extr(
- '<a href="https://static.', '"'),
- }
+
+ if metadata:
+ entry_id = extr('href="/', '"')
+ post = self._parse_entry_html(entry_id)
+ post.update(self._parse_entry_xml(entry_id))
+ yield post
+ else:
+ yield {
+ "id" : extr('href="/', '"'),
+ "name" : extr('alt="', '"'),
+ "width" : extr('title="', 'x'),
+ "height": extr('', ' '),
+ "size" : extr('', 'B'),
+ "file_url": "https://static." + extr(
+ '<a href="https://static.', '"'),
+ }
if 'rel="next"' not in page:
break
@@ -153,4 +179,7 @@ class ZerochanImageExtractor(ZerochanExtractor):
self.image_id = match.group(1)
def posts(self):
- return (self._parse_entry_page(self.image_id),)
+ post = self._parse_entry_html(self.image_id)
+ if self.config("metadata"):
+ post.update(self._parse_entry_xml(self.image_id))
+ return (post,)
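
With `metadata` enabled, each zerochan post costs up to two extra requests: the HTML entry page plus the `?xml` endpoint, whose RSS-style `<item>` children are read out by position with `ElementTree`. A rough standalone version of that positional parse; the sample feed below only mimics the child order the code indexes into and is an assumption, not the site's actual schema:

```python
from xml.etree import ElementTree

SAMPLE = """<rss><channel>
  <title>sample</title>
  <item>
    <child0/><child1/>
    <name>Example Entry</name>
    <child3/><child4/>
    <tags> Tag A, Tag B</tags>
    <md5>0123456789abcdef</md5>
  </item>
</channel></rss>"""

item = ElementTree.fromstring(SAMPLE)[0][-1]    # last child of <channel>
post = {
    "name": item[2].text,                       # third child
    "tags": item[5].text.lstrip().split(", "),  # sixth child
    "md5" : item[6].text,                       # seventh child
}
assert post == {"name": "Example Entry",
                "tags": ["Tag A", "Tag B"],
                "md5": "0123456789abcdef"}
```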
diff --git a/gallery_dl/postprocessor/zip.py b/gallery_dl/postprocessor/zip.py
index ff97add..4f376fe 100644
--- a/gallery_dl/postprocessor/zip.py
+++ b/gallery_dl/postprocessor/zip.py
@@ -26,6 +26,7 @@ class ZipPP(PostProcessor):
def __init__(self, job, options):
PostProcessor.__init__(self, job)
self.delete = not options.get("keep-files", False)
+ self.files = options.get("files")
ext = "." + options.get("extension", "zip")
algorithm = options.get("compression", "store")
if algorithm not in self.COMPRESSION_ALGORITHMS:
@@ -56,6 +57,9 @@ class ZipPP(PostProcessor):
# 'NameToInfo' is not officially documented, but it's available
# for all supported Python versions and using it directly is a lot
# faster than calling getinfo()
+ if self.files:
+ self.write_extra(pathfmt, zfile, self.files)
+ self.files = None
if pathfmt.filename not in zfile.NameToInfo:
zfile.write(pathfmt.temppath, pathfmt.filename)
pathfmt.delete = self.delete
@@ -69,6 +73,21 @@ class ZipPP(PostProcessor):
with self.open() as zfile:
self.write(pathfmt, zfile)
+ def write_extra(self, pathfmt, zfile, files):
+ for path in map(util.expand_path, files):
+ if not os.path.isabs(path):
+ path = os.path.join(pathfmt.realdirectory, path)
+ try:
+ zfile.write(path, os.path.basename(path))
+ except OSError as exc:
+ self.log.warning(
+ "Unable to write %s to %s", path, zfile.filename)
+ self.log.debug("%s: %s", exc, exc.__class__.__name__)
+ pass
+ else:
+ if self.delete:
+ util.remove_file(path)
+
def finalize(self, pathfmt, status):
if self.zfile:
self.zfile.close()
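
`write_extra` adds the configured extra files to the archive exactly once (note `self.files = None` after the first call), resolves relative paths against the current download directory, stores each entry under its basename, and demotes write failures to warnings. The path handling in isolation, sketched with plain `os`/`zipfile` (gallery-dl's `util.expand_path` also expands environment variables):

```python
import os
import zipfile

def write_extra(zfile, directory, files):
    # Add extra files to an open ZipFile; relative paths resolve
    # against the download directory, entries are named by basename
    for path in files:
        path = os.path.expanduser(path)
        if not os.path.isabs(path):
            path = os.path.join(directory, path)
        try:
            zfile.write(path, os.path.basename(path))
        except OSError as exc:
            print("unable to write", path, "-", exc)

with zipfile.ZipFile("archive.zip", "w") as zfile:
    write_extra(zfile, "/tmp/downloads", ["info.json"])
```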
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index d12d088..ce018fe 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.23.0"
+__version__ = "1.23.1"
diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py
index 42babd3..af8b0af 100644
--- a/test/test_postprocessor.py
+++ b/test/test_postprocessor.py
@@ -452,9 +452,11 @@ class ZipTest(BasePostprocessorTest):
self.assertTrue(pp.args[0].endswith("/test.cbz"))
def test_zip_write(self):
- pp = self._create()
-
with tempfile.NamedTemporaryFile("w", dir=self.dir.name) as file:
+ pp = self._create({"files": [file.name, "_info_.json"],
+ "keep-files": True})
+
+ filename = os.path.basename(file.name)
file.write("foobar\n")
# write dummy file with 3 different names
@@ -466,18 +468,19 @@ class ZipTest(BasePostprocessorTest):
self._trigger()
nti = pp.zfile.NameToInfo
- self.assertEqual(len(nti), i+1)
+ self.assertEqual(len(nti), i+2)
self.assertIn(name, nti)
# check file contents
- self.assertEqual(len(nti), 3)
+ self.assertEqual(len(nti), 4)
self.assertIn("file0.ext", nti)
self.assertIn("file1.ext", nti)
self.assertIn("file2.ext", nti)
+ self.assertIn(filename, nti)
# write the last file a second time (will be skipped)
self._trigger()
- self.assertEqual(len(pp.zfile.NameToInfo), 3)
+ self.assertEqual(len(pp.zfile.NameToInfo), 4)
# close file
self._trigger(("finalize",), 0)
@@ -485,10 +488,11 @@ class ZipTest(BasePostprocessorTest):
# reopen to check persistence
with zipfile.ZipFile(pp.zfile.filename) as file:
nti = file.NameToInfo
- self.assertEqual(len(pp.zfile.NameToInfo), 3)
+ self.assertEqual(len(pp.zfile.NameToInfo), 4)
self.assertIn("file0.ext", nti)
self.assertIn("file1.ext", nti)
self.assertIn("file2.ext", nti)
+ self.assertIn(filename, nti)
os.unlink(pp.zfile.filename)