author    Unit 193 <unit193@unit193.net>  2023-07-15 17:08:47 -0400
committer Unit 193 <unit193@unit193.net>  2023-07-15 17:08:47 -0400
commit    e2f67519f8c1750a71aab3dc56b8345fff21bac5 (patch)
tree      26770e9b79821f2fa10ed3b07a4669f857db8071
parent    ef30b1fa552fd4ceebdd14bbcc16f30f430883f8 (diff)

New upstream version 1.25.8 (upstream/1.25.8)
-rw-r--r--  CHANGELOG.md                          |  36
-rw-r--r--  PKG-INFO                              |   6
-rw-r--r--  README.rst                            |   4
-rw-r--r--  data/man/gallery-dl.1                 |   2
-rw-r--r--  data/man/gallery-dl.conf.5            |  36
-rw-r--r--  docs/gallery-dl.conf                  |   5
-rw-r--r--  gallery_dl.egg-info/PKG-INFO          |   6
-rw-r--r--  gallery_dl.egg-info/SOURCES.txt       |   2
-rw-r--r--  gallery_dl/extractor/__init__.py      |   2
-rw-r--r--  gallery_dl/extractor/bcy.py           | 206
-rw-r--r--  gallery_dl/extractor/bunkr.py         |  15
-rw-r--r--  gallery_dl/extractor/common.py        |   6
-rw-r--r--  gallery_dl/extractor/erome.py         |   4
-rw-r--r--  gallery_dl/extractor/fantia.py        |   4
-rw-r--r--  gallery_dl/extractor/gelbooru_v01.py  |  14
-rw-r--r--  gallery_dl/extractor/gfycat.py        |  68
-rw-r--r--  gallery_dl/extractor/jpgfish.py       |  15
-rw-r--r--  gallery_dl/extractor/lineblog.py      |  73
-rw-r--r--  gallery_dl/extractor/mangaread.py     |   5
-rw-r--r--  gallery_dl/extractor/naverwebtoon.py  |   2
-rw-r--r--  gallery_dl/extractor/newgrounds.py    |   5
-rw-r--r--  gallery_dl/extractor/paheal.py        |  67
-rw-r--r--  gallery_dl/extractor/philomena.py     | 116
-rw-r--r--  gallery_dl/extractor/pornhub.py       |   3
-rw-r--r--  gallery_dl/extractor/reddit.py        |  64
-rw-r--r--  gallery_dl/extractor/seiga.py         |   9
-rw-r--r--  gallery_dl/extractor/slideshare.py    |  59
-rw-r--r--  gallery_dl/extractor/twibooru.py      |   5
-rw-r--r--  gallery_dl/extractor/twitter.py       |  81
-rw-r--r--  gallery_dl/extractor/weibo.py         |   2
-rw-r--r--  gallery_dl/extractor/wikifeet.py      |   9
-rw-r--r--  gallery_dl/version.py                 |   2
-rw-r--r--  test/test_results.py                  |   2
33 files changed, 455 insertions(+), 480 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b71b404..53034fa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,41 @@
# Changelog
+## 1.25.8 - 2023-07-15
+### Changes
+- update default User-Agent header to Firefox 115 ESR
+### Additions
+- [gfycat] support `@me` user ([#3770](https://github.com/mikf/gallery-dl/issues/3770), [#4271](https://github.com/mikf/gallery-dl/issues/4271))
+- [gfycat] implement login support ([#3770](https://github.com/mikf/gallery-dl/issues/3770), [#4271](https://github.com/mikf/gallery-dl/issues/4271))
+- [reddit] notify users about registering an OAuth application ([#4292](https://github.com/mikf/gallery-dl/issues/4292))
+- [twitter] add `ratelimit` option ([#4251](https://github.com/mikf/gallery-dl/issues/4251))
+- [twitter] use `TweetResultByRestId` endpoint that allows accessing single Tweets without login ([#4250](https://github.com/mikf/gallery-dl/issues/4250))
+### Fixes
+- [bunkr] use `.la` TLD for `media-files12` servers ([#4147](https://github.com/mikf/gallery-dl/issues/4147), [#4276](https://github.com/mikf/gallery-dl/issues/4276))
+- [erome] ignore duplicate album IDs
+- [fantia] send `X-Requested-With` header ([#4273](https://github.com/mikf/gallery-dl/issues/4273))
+- [gelbooru_v01] fix `source` metadata ([#4302](https://github.com/mikf/gallery-dl/issues/4302), [#4303](https://github.com/mikf/gallery-dl/issues/4303))
+- [gelbooru_v01] update `vidyart` domain
+- [jpgfish] update domain to `jpeg.pet`
+- [mangaread] fix `tags` metadata extraction
+- [naverwebtoon] fix `comic` metadata extraction
+- [newgrounds] extract & pass auth token during login ([#4268](https://github.com/mikf/gallery-dl/issues/4268))
+- [paheal] fix extraction ([#4262](https://github.com/mikf/gallery-dl/issues/4262), [#4293](https://github.com/mikf/gallery-dl/issues/4293))
+- [paheal] unescape `source`
+- [philomena] fix `--range` ([#4288](https://github.com/mikf/gallery-dl/issues/4288))
+- [philomena] handle `429 Too Many Requests` errors ([#4288](https://github.com/mikf/gallery-dl/issues/4288))
+- [pornhub] set `accessAgeDisclaimerPH` cookie ([#4301](https://github.com/mikf/gallery-dl/issues/4301))
+- [reddit] use 0.6s delay between API requests ([#4292](https://github.com/mikf/gallery-dl/issues/4292))
+- [seiga] set `skip_fetish_warning` cookie ([#4242](https://github.com/mikf/gallery-dl/issues/4242))
+- [slideshare] fix extraction
+- [twitter] fix `following` extractor not getting all users ([#4287](https://github.com/mikf/gallery-dl/issues/4287))
+- [twitter] use GraphQL search endpoint by default ([#4264](https://github.com/mikf/gallery-dl/issues/4264))
+- [twitter] do not treat missing `TimelineAddEntries` instruction as fatal ([#4278](https://github.com/mikf/gallery-dl/issues/4278))
+- [weibo] fix cursor based pagination
+- [wikifeet] fix `tag` extraction ([#4289](https://github.com/mikf/gallery-dl/issues/4289), [#4291](https://github.com/mikf/gallery-dl/issues/4291))
+### Removals
+- [bcy] remove module
+- [lineblog] remove module
+
## 1.25.7 - 2023-07-02
### Additions
- [flickr] add 'exif' option
diff --git a/PKG-INFO b/PKG-INFO
index ff9ab3f..953bc56 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gallery_dl
-Version: 1.25.7
+Version: 1.25.8
Summary: Command-line program to download image galleries and collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -109,9 +109,9 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.25.7/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.25.8/gallery-dl.exe>`__
(Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.25.7/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.25.8/gallery-dl.bin>`__
Nightly Builds
diff --git a/README.rst b/README.rst
index 86dd58d..51e239c 100644
--- a/README.rst
+++ b/README.rst
@@ -72,9 +72,9 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.25.7/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.25.8/gallery-dl.exe>`__
(Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.25.7/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.25.8/gallery-dl.bin>`__
Nightly Builds
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index 16a4bba..84fd161 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2023-07-02" "1.25.7" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2023-07-15" "1.25.8" "gallery-dl Manual"
.\" disable hyphenation
.nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index 2cba623..5fa271b 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2023-07-02" "1.25.7" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2023-07-15" "1.25.8" "gallery-dl Manual"
.\" disable hyphenation
.nh
.\" disable justification (adjust text to left margin only)
@@ -460,6 +460,8 @@ and optional for
.br
* \f[I]exhentai\f[]
.br
+* \f[I]gfycat\f[]
+.br
* \f[I]idolcomplex\f[]
.br
* \f[I]imgbb\f[]
@@ -646,7 +648,7 @@ or a \f[I]list\f[] with IP and explicit port number as elements.
\f[I]string\f[]
.IP "Default:" 9
-\f[I]"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0"\f[]
+\f[I]"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0"\f[]
.IP "Description:" 4
User-Agent header value to be used for HTTP requests.
@@ -3687,6 +3689,22 @@ If this option is enabled, gallery-dl will try to fetch
a quoted (original) Tweet when it sees the Tweet which quotes it.
+.SS extractor.twitter.ratelimit
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"wait"\f[]
+
+.IP "Description:" 4
+Selects how to handle exceeding the API rate limit.
+
+.br
+* \f[I]"abort"\f[]: Raise an error and stop extraction
+.br
+* \f[I]"wait"\f[]: Wait until rate limit reset
+
+
.SS extractor.twitter.replies
.IP "Type:" 6
\f[I]bool\f[]
@@ -3727,17 +3745,15 @@ will be taken from the original Tweets, not the Retweets.
\f[I]string\f[]
.IP "Default:" 9
-\f[I]"auto"\f[]
+\f[I]"graphql"\f[]
.IP "Description:" 4
Selects the API endpoint used to retrieve search results.
.br
-* \f[I]"rest"\f[]: Legacy REST endpoint - returns a \f[I]403 Forbidden\f[] error when not logged in
+* \f[I]"graphql"\f[]: GraphQL endpoint
.br
-* \f[I]"graphql"\f[]: New GraphQL endpoint
-.br
-* \f[I]"auto"\f[]: \f[I]"rest"\f[] when logged in, \f[I]"graphql"\f[] otherwise
+* \f[I]"rest"\f[]: Legacy REST endpoint
.SS extractor.twitter.timeline.strategy
@@ -5822,6 +5838,12 @@ as \f[I]"client-id"\f[]
\f[I]user-agent\f[] and replace \f[I]<application name>\f[] and \f[I]<username>\f[]
accordingly (see Reddit's
\f[I]API access rules\f[])
+.br
+* clear your \f[I]cache\f[] to delete any remaining
+\f[I]access-token\f[] entries. (\f[I]gallery-dl --clear-cache reddit\f[])
+.br
+* get a \f[I]refresh-token\f[] for the
+new \f[I]client-id\f[] (\f[I]gallery-dl oauth:reddit\f[])
.SS extractor.smugmug.api-key & .api-secret
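The two Twitter options documented in the hunks above can be combined in a
gallery-dl config file. A minimal sketch, with both values chosen for
illustration (the documented defaults are "wait" and "graphql"):

    {
        "extractor": {
            "twitter": {
                "ratelimit": "abort",
                "search-endpoint": "graphql"
            }
        }
    }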
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index 902d0a2..b5efc73 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -10,7 +10,7 @@
"proxy": null,
"skip": true,
- "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
+ "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0",
"retries": 4,
"timeout": 30.0,
"verify": true,
@@ -261,6 +261,9 @@
},
"reddit":
{
+ "client-id": null,
+ "user-agent": null,
+ "refresh-token": null,
"comments": 0,
"morecomments": false,
"date-min": 0,
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index d008254..00db3b4 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gallery-dl
-Version: 1.25.7
+Version: 1.25.8
Summary: Command-line program to download image galleries and collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -109,9 +109,9 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.25.7/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.25.8/gallery-dl.exe>`__
(Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.25.7/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.25.8/gallery-dl.bin>`__
Nightly Builds
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index 44fbd22..355a3f0 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -55,7 +55,6 @@ gallery_dl/extractor/architizer.py
gallery_dl/extractor/artstation.py
gallery_dl/extractor/aryion.py
gallery_dl/extractor/bbc.py
-gallery_dl/extractor/bcy.py
gallery_dl/extractor/behance.py
gallery_dl/extractor/blogger.py
gallery_dl/extractor/booru.py
@@ -123,7 +122,6 @@ gallery_dl/extractor/komikcast.py
gallery_dl/extractor/lensdump.py
gallery_dl/extractor/lexica.py
gallery_dl/extractor/lightroom.py
-gallery_dl/extractor/lineblog.py
gallery_dl/extractor/livedoor.py
gallery_dl/extractor/lolisafe.py
gallery_dl/extractor/luscious.py
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index a344fe4..fa56bfb 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -24,7 +24,6 @@ modules = [
"artstation",
"aryion",
"bbc",
- "bcy",
"behance",
"blogger",
"bunkr",
@@ -85,7 +84,6 @@ modules = [
"lensdump",
"lexica",
"lightroom",
- "lineblog",
"livedoor",
"luscious",
"lynxchan",
diff --git a/gallery_dl/extractor/bcy.py b/gallery_dl/extractor/bcy.py
deleted file mode 100644
index d6adb4e..0000000
--- a/gallery_dl/extractor/bcy.py
+++ /dev/null
@@ -1,206 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2020-2023 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://bcy.net/"""
-
-from .common import Extractor, Message
-from .. import text, util, exception
-import re
-
-
-class BcyExtractor(Extractor):
- """Base class for bcy extractors"""
- category = "bcy"
- directory_fmt = ("{category}", "{user[id]} {user[name]}")
- filename_fmt = "{post[id]} {id}.{extension}"
- archive_fmt = "{post[id]}_{id}"
- root = "https://bcy.net"
-
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.item_id = match.group(1)
- self.session.headers["Referer"] = self.root + "/"
-
- def items(self):
- sub = re.compile(r"^https?://p\d+-bcy"
- r"(?:-sign\.bcyimg\.com|\.byteimg\.com/img)"
- r"/banciyuan").sub
- iroot = "https://img-bcy-qn.pstatp.com"
- noop = self.config("noop")
-
- for post in self.posts():
- if not post["image_list"]:
- continue
-
- multi = None
- tags = post.get("post_tags") or ()
- data = {
- "user": {
- "id" : post["uid"],
- "name" : post["uname"],
- "avatar" : sub(iroot, post["avatar"].partition("~")[0]),
- },
- "post": {
- "id" : text.parse_int(post["item_id"]),
- "tags" : [t["tag_name"] for t in tags],
- "date" : text.parse_timestamp(post["ctime"]),
- "parody" : post["work"],
- "content": post["plain"],
- "likes" : post["like_count"],
- "shares" : post["share_count"],
- "replies": post["reply_count"],
- },
- }
-
- yield Message.Directory, data
- for data["num"], image in enumerate(post["image_list"], 1):
- data["id"] = image["mid"]
- data["width"] = image["w"]
- data["height"] = image["h"]
-
- url = image["path"].partition("~")[0]
- text.nameext_from_url(url, data)
-
- # full-resolution image without watermark
- if data["extension"]:
- if not url.startswith(iroot):
- url = sub(iroot, url)
- data["filter"] = ""
- yield Message.Url, url, data
-
- # watermarked image & low quality noop filter
- else:
- if multi is None:
- multi = self._data_from_post(
- post["item_id"])["post_data"]["multi"]
- image = multi[data["num"] - 1]
-
- if image["origin"]:
- data["filter"] = "watermark"
- yield Message.Url, image["origin"], data
-
- if noop:
- data["extension"] = ""
- data["filter"] = "noop"
- yield Message.Url, image["original_path"], data
-
- def posts(self):
- """Returns an iterable with all relevant 'post' objects"""
-
- def _data_from_post(self, post_id):
- url = "{}/item/detail/{}".format(self.root, post_id)
- page = self.request(url, notfound="post").text
- data = (text.extr(page, 'JSON.parse("', '");')
- .replace('\\\\u002F', '/')
- .replace('\\"', '"'))
- try:
- return util.json_loads(data)["detail"]
- except ValueError:
- return util.json_loads(data.replace('\\"', '"'))["detail"]
-
-
-class BcyUserExtractor(BcyExtractor):
- """Extractor for user timelines"""
- subcategory = "user"
- pattern = r"(?:https?://)?bcy\.net/u/(\d+)"
- test = (
- ("https://bcy.net/u/1933712", {
- "pattern": r"https://img-bcy-qn.pstatp.com/\w+/\d+/post/\w+/.+jpg",
- "count": ">= 20",
- }),
- ("https://bcy.net/u/109282764041", {
- "pattern": r"https://p\d-bcy-sign\.bcyimg\.com/banciyuan/[0-9a-f]+"
- r"~tplv-bcyx-yuan-logo-v1:.+\.image",
- "range": "1-25",
- "count": 25,
- }),
- )
-
- def posts(self):
- url = self.root + "/apiv3/user/selfPosts"
- params = {"uid": self.item_id, "since": None}
-
- while True:
- data = self.request(url, params=params).json()
-
- try:
- items = data["data"]["items"]
- except KeyError:
- return
- if not items:
- return
-
- for item in items:
- yield item["item_detail"]
- params["since"] = item["since"]
-
-
-class BcyPostExtractor(BcyExtractor):
- """Extractor for individual posts"""
- subcategory = "post"
- pattern = r"(?:https?://)?bcy\.net/item/detail/(\d+)"
- test = (
- ("https://bcy.net/item/detail/6355835481002893070", {
- "url": "301202375e61fd6e0e2e35de6c3ac9f74885dec3",
- "count": 1,
- "keyword": {
- "user": {
- "id" : 1933712,
- "name" : "wukloo",
- "avatar" : "re:https://img-bcy-qn.pstatp.com/Public/",
- },
- "post": {
- "id" : 6355835481002893070,
- "tags" : list,
- "date" : "dt:2016-11-22 08:47:46",
- "parody" : "东方PROJECT",
- "content": "re:根据微博的建议稍微做了点修改",
- "likes" : int,
- "shares" : int,
- "replies": int,
- },
- "id": 8330182,
- "num": 1,
- "width" : 3000,
- "height": 1687,
- "filename": "712e0780b09011e696f973c3d1568337",
- "extension": "jpg",
- },
- }),
- # only watermarked images available
- ("https://bcy.net/item/detail/6950136331708144648", {
- "pattern": r"https://p\d-bcy-sign\.bcyimg\.com/banciyuan/[0-9a-f]+"
- r"~tplv-bcyx-yuan-logo-v1:.+\.image",
- "count": 10,
- "keyword": {"filter": "watermark"},
- }),
- # deleted
- ("https://bcy.net/item/detail/6780546160802143237", {
- "exception": exception.NotFoundError,
- "count": 0,
- }),
- # only visible to logged in users
- ("https://bcy.net/item/detail/6747523535150783495", {
- "count": 0,
- }),
- # JSON decode error (#3321)
- ("https://bcy.net/item/detail/7166939271872388110", {
- "count": 0,
- }),
- )
-
- def posts(self):
- try:
- data = self._data_from_post(self.item_id)
- except KeyError:
- return ()
- post = data["post_data"]
- post["image_list"] = post["multi"]
- post["plain"] = text.parse_unicode_escapes(post["plain"])
- post.update(data["detail_user"])
- return (post,)
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 5c8c530..35b2752 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -52,6 +52,11 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
"num": int,
},
}),
+ # cdn12 .ru TLD (#4147)
+ ("https://bunkrr.su/a/j1G29CnD", {
+ "pattern": r"https://(cdn12.bunkr.ru|media-files12.bunkr.la)/\w+",
+ "count": 8,
+ }),
("https://bunkrr.su/a/Lktg9Keq"),
("https://bunkr.la/a/Lktg9Keq"),
("https://bunkr.su/a/Lktg9Keq"),
@@ -87,10 +92,12 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
url = text.unescape(url)
if url.endswith((".mp4", ".m4v", ".mov", ".webm", ".mkv", ".ts",
".zip", ".rar", ".7z")):
- append({"file": url.replace("://cdn", "://media-files", 1),
- "_http_headers": headers})
- else:
- append({"file": url})
+ if url.startswith("https://cdn12."):
+ url = ("https://media-files12.bunkr.la" +
+ url[url.find("/", 14):])
+ else:
+ url = url.replace("://cdn", "://media-files", 1)
+ append({"file": url, "_http_headers": headers})
return files, {
"album_id" : self.album_id,
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 50d1026..5c9b157 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -286,7 +286,7 @@ class Extractor():
useragent = self.config("user-agent")
if useragent is None:
useragent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; "
- "rv:102.0) Gecko/20100101 Firefox/102.0")
+ "rv:115.0) Gecko/20100101 Firefox/115.0")
elif useragent == "browser":
useragent = _browser_useragent()
headers["User-Agent"] = useragent
@@ -805,8 +805,8 @@ _browser_cookies = {}
HTTP_HEADERS = {
"firefox": (
- ("User-Agent", "Mozilla/5.0 ({}; rv:102.0) "
- "Gecko/20100101 Firefox/102.0"),
+ ("User-Agent", "Mozilla/5.0 ({}; rv:115.0) "
+ "Gecko/20100101 Firefox/115.0"),
("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,"
"image/avif,image/webp,*/*;q=0.8"),
("Accept-Language", "en-US,en;q=0.5"),
diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py
index 03307f8..709bc57 100644
--- a/gallery_dl/extractor/erome.py
+++ b/gallery_dl/extractor/erome.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021-2022 Mike Fährmann
+# Copyright 2021-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -80,7 +80,7 @@ class EromeExtractor(Extractor):
for params["page"] in itertools.count(1):
page = self.request(url, params=params).text
- album_ids = EromeAlbumExtractor.pattern.findall(page)
+ album_ids = EromeAlbumExtractor.pattern.findall(page)[::2]
yield from album_ids
if len(album_ids) < 36:
diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py
index 35c4cc4..f92b904 100644
--- a/gallery_dl/extractor/fantia.py
+++ b/gallery_dl/extractor/fantia.py
@@ -23,6 +23,7 @@ class FantiaExtractor(Extractor):
self.headers = {
"Accept" : "application/json, text/plain, */*",
"Referer": self.root,
+ "X-Requested-With": "XMLHttpRequest",
}
_empty_plan = {
"id" : 0,
@@ -68,7 +69,8 @@ class FantiaExtractor(Extractor):
def _pagination(self, url):
params = {"page": 1}
- headers = self.headers
+ headers = self.headers.copy()
+ del headers["X-Requested-With"]
while True:
page = self.request(url, params=params, headers=headers).text
diff --git a/gallery_dl/extractor/gelbooru_v01.py b/gallery_dl/extractor/gelbooru_v01.py
index c4f32a4..b6fbcb6 100644
--- a/gallery_dl/extractor/gelbooru_v01.py
+++ b/gallery_dl/extractor/gelbooru_v01.py
@@ -27,7 +27,7 @@ class GelbooruV01Extractor(booru.BooruExtractor):
"uploader" : extr('By: ', ' <'),
"width" : extr('Size: ', 'x'),
"height" : extr('', ' <'),
- "source" : extr('Source: <a href="', '"'),
+ "source" : extr('Source: ', ' <'),
"rating" : (extr('Rating: ', '<') or "?")[0].lower(),
"score" : extr('Score: ', ' <'),
"file_url" : extr('<img alt="img" src="', '"'),
@@ -78,9 +78,9 @@ BASE_PATTERN = GelbooruV01Extractor.update({
"root": "https://drawfriends.booru.org",
"pattern": r"drawfriends\.booru\.org",
},
- "vidyart": {
- "root": "https://vidyart.booru.org",
- "pattern": r"vidyart\.booru\.org",
+ "vidyart2": {
+ "root": "https://vidyart2.booru.org",
+ "pattern": r"vidyart2\.booru\.org",
},
})
@@ -106,7 +106,7 @@ class GelbooruV01TagExtractor(GelbooruV01Extractor):
"count": 25,
}),
("https://drawfriends.booru.org/index.php?page=post&s=list&tags=all"),
- ("https://vidyart.booru.org/index.php?page=post&s=list&tags=all"),
+ ("https://vidyart2.booru.org/index.php?page=post&s=list&tags=all"),
)
def __init__(self, match):
@@ -141,7 +141,7 @@ class GelbooruV01FavoriteExtractor(GelbooruV01Extractor):
"count": 4,
}),
("https://drawfriends.booru.org/index.php?page=favorites&s=view&id=1"),
- ("https://vidyart.booru.org/index.php?page=favorites&s=view&id=1"),
+ ("https://vidyart2.booru.org/index.php?page=favorites&s=view&id=1"),
)
def __init__(self, match):
@@ -193,7 +193,7 @@ class GelbooruV01PostExtractor(GelbooruV01Extractor):
},
}),
("https://drawfriends.booru.org/index.php?page=post&s=view&id=107474"),
- ("https://vidyart.booru.org/index.php?page=post&s=view&id=383111"),
+ ("https://vidyart2.booru.org/index.php?page=post&s=view&id=39168"),
)
def __init__(self, match):
diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py
index 0ccd7fa..ccebdf9 100644
--- a/gallery_dl/extractor/gfycat.py
+++ b/gallery_dl/extractor/gfycat.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2022 Mike Fährmann
+# Copyright 2017-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,6 +10,7 @@
from .common import Extractor, Message
from .. import text, exception
+from ..cache import cache
class GfycatExtractor(Extractor):
@@ -80,6 +81,8 @@ class GfycatUserExtractor(GfycatExtractor):
})
def gfycats(self):
+ if self.key == "me":
+ return GfycatAPI(self).me()
return GfycatAPI(self).user(self.key)
@@ -219,15 +222,8 @@ class GfycatAPI():
def __init__(self, extractor):
self.extractor = extractor
-
- def gfycat(self, gfycat_id):
- endpoint = "/v1/gfycats/" + gfycat_id
- return self._call(endpoint)["gfyItem"]
-
- def user(self, user):
- endpoint = "/v1/users/{}/gfycats".format(user.lower())
- params = {"count": 100}
- return self._pagination(endpoint, params)
+ self.headers = {}
+ self.username, self.password = extractor._get_auth_info()
def collection(self, user, collection):
endpoint = "/v1/users/{}/collections/{}/gfycats".format(
@@ -240,14 +236,64 @@ class GfycatAPI():
params = {"count": 100}
return self._pagination(endpoint, params, "gfyCollections")
+ def gfycat(self, gfycat_id):
+ endpoint = "/v1/gfycats/" + gfycat_id
+ return self._call(endpoint)["gfyItem"]
+
+ def me(self):
+ endpoint = "/v1/me/gfycats"
+ params = {"count": 100}
+ return self._pagination(endpoint, params)
+
def search(self, query):
endpoint = "/v1/gfycats/search"
params = {"search_text": query, "count": 150}
return self._pagination(endpoint, params)
+ def user(self, user):
+ endpoint = "/v1/users/{}/gfycats".format(user.lower())
+ params = {"count": 100}
+ return self._pagination(endpoint, params)
+
+ def authenticate(self):
+ self.headers["Authorization"] = \
+ self._authenticate_impl(self.username, self.password)
+
+ @cache(maxage=3600, keyarg=1)
+ def _authenticate_impl(self, username, password):
+ self.extractor.log.info("Logging in as %s", username)
+
+ url = "https://weblogin.gfycat.com/oauth/webtoken"
+ headers = {"Origin": "https://gfycat.com"}
+ data = {
+ "access_key": "Anr96uuqt9EdamSCwK4txKPjMsf2"
+ "M95Rfa5FLLhPFucu8H5HTzeutyAa",
+ }
+ response = self.extractor.request(
+ url, method="POST", headers=headers, json=data).json()
+
+ url = "https://weblogin.gfycat.com/oauth/weblogin"
+ headers["authorization"] = "Bearer " + response["access_token"]
+ data = {
+ "grant_type": "password",
+ "username" : username,
+ "password" : password,
+ }
+ response = self.extractor.request(
+ url, method="POST", headers=headers, json=data, fatal=None).json()
+
+ if "errorMessage" in response:
+ raise exception.AuthenticationError(
+ response["errorMessage"]["description"])
+ return "Bearer " + response["access_token"]
+
def _call(self, endpoint, params=None):
+ if self.username:
+ self.authenticate()
+
url = self.API_ROOT + endpoint
- return self.extractor.request(url, params=params).json()
+ return self.extractor.request(
+ url, params=params, headers=self.headers).json()
def _pagination(self, endpoint, params, key="gfycats"):
while True:
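Assuming the usual gfycat user-page URL form, the new login support plus the
`@me` pseudo-user from this release's changelog should allow fetching one's
own uploads roughly like this (credentials are placeholders):

    {
        "extractor": {
            "gfycat": {
                "username": "your-username",
                "password": "your-password"
            }
        }
    }

    $ gallery-dl https://gfycat.com/@me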
diff --git a/gallery_dl/extractor/jpgfish.py b/gallery_dl/extractor/jpgfish.py
index b8d425a..39208e5 100644
--- a/gallery_dl/extractor/jpgfish.py
+++ b/gallery_dl/extractor/jpgfish.py
@@ -4,18 +4,18 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://jpg.pet/"""
+"""Extractors for https://jpeg.pet/"""
from .common import Extractor, Message
from .. import text
-BASE_PATTERN = r"(?:https?://)?jpg\.(?:pet|fish(?:ing)?|church)"
+BASE_PATTERN = r"(?:https?://)?jpe?g\.(?:pet|fish(?:ing)?|church)"
class JpgfishExtractor(Extractor):
"""Base class for jpgfish extractors"""
category = "jpgfish"
- root = "https://jpg.pet"
+ root = "https://jpeg.pet"
directory_fmt = ("{category}", "{user}", "{album}",)
archive_fmt = "{id}"
@@ -36,7 +36,7 @@ class JpgfishImageExtractor(JpgfishExtractor):
subcategory = "image"
pattern = BASE_PATTERN + r"/img/((?:[^/?#]+\.)?(\w+))"
test = (
- ("https://jpg.pet/img/funnymeme.LecXGS", {
+ ("https://jpeg.pet/img/funnymeme.LecXGS", {
"pattern": r"https://simp3\.jpg\.church/images/funnymeme\.jpg",
"content": "098e5e9b17ad634358426e0ffd1c93871474d13c",
"keyword": {
@@ -52,6 +52,7 @@ class JpgfishImageExtractor(JpgfishExtractor):
"pattern": r"https://simp2\.jpg\.church/hannahowo_00457\.jpg",
"keyword": {"album": "401-500"},
}),
+ ("https://jpg.pet/img/funnymeme.LecXGS"),
("https://jpg.fishing/img/funnymeme.LecXGS"),
("https://jpg.fish/img/funnymeme.LecXGS"),
("https://jpg.church/img/funnymeme.LecXGS"),
@@ -83,7 +84,7 @@ class JpgfishAlbumExtractor(JpgfishExtractor):
subcategory = "album"
pattern = BASE_PATTERN + r"/a(?:lbum)?/([^/?#]+)(/sub)?"
test = (
- ("https://jpg.pet/album/CDilP/?sort=date_desc&page=1", {
+ ("https://jpeg.pet/album/CDilP/?sort=date_desc&page=1", {
"count": 2,
}),
("https://jpg.fishing/a/gunggingnsk.N9OOI", {
@@ -95,6 +96,7 @@ class JpgfishAlbumExtractor(JpgfishExtractor):
("https://jpg.church/a/hannahowo.aNTdH/sub", {
"count": 606,
}),
+ ("https://jpg.pet/album/CDilP/?sort=date_desc&page=1"),
)
def __init__(self, match):
@@ -120,12 +122,13 @@ class JpgfishUserExtractor(JpgfishExtractor):
subcategory = "user"
pattern = BASE_PATTERN + r"/(?!img|a(?:lbum)?)([^/?#]+)(/albums)?"
test = (
- ("https://jpg.pet/exearco", {
+ ("https://jpeg.pet/exearco", {
"count": 3,
}),
("https://jpg.church/exearco/albums", {
"count": 1,
}),
+ ("https://jpg.pet/exearco"),
("https://jpg.fishing/exearco"),
("https://jpg.fish/exearco"),
("https://jpg.church/exearco"),
diff --git a/gallery_dl/extractor/lineblog.py b/gallery_dl/extractor/lineblog.py
deleted file mode 100644
index adb27a8..0000000
--- a/gallery_dl/extractor/lineblog.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2019-2020 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://www.lineblog.me/"""
-
-from .livedoor import LivedoorBlogExtractor, LivedoorPostExtractor
-from .. import text
-
-
-class LineblogBase():
- """Base class for lineblog extractors"""
- category = "lineblog"
- root = "https://lineblog.me"
-
- def _images(self, post):
- imgs = []
- body = post.pop("body")
-
- for num, img in enumerate(text.extract_iter(body, "<img ", ">"), 1):
- src = text.extr(img, 'src="', '"')
- alt = text.extr(img, 'alt="', '"')
-
- if not src:
- continue
- if src.startswith("https://obs.line-scdn.") and src.count("/") > 3:
- src = src.rpartition("/")[0]
-
- imgs.append(text.nameext_from_url(alt or src, {
- "url" : src,
- "num" : num,
- "hash": src.rpartition("/")[2],
- "post": post,
- }))
-
- return imgs
-
-
-class LineblogBlogExtractor(LineblogBase, LivedoorBlogExtractor):
- """Extractor for a user's blog on lineblog.me"""
- pattern = r"(?:https?://)?lineblog\.me/(\w+)/?(?:$|[?#])"
- test = ("https://lineblog.me/mamoru_miyano/", {
- "range": "1-20",
- "count": 20,
- "pattern": r"https://obs.line-scdn.net/[\w-]+$",
- "keyword": {
- "post": {
- "categories" : tuple,
- "date" : "type:datetime",
- "description": str,
- "id" : int,
- "tags" : list,
- "title" : str,
- "user" : "mamoru_miyano"
- },
- "filename": str,
- "hash" : r"re:\w{32,}",
- "num" : int,
- },
- })
-
-
-class LineblogPostExtractor(LineblogBase, LivedoorPostExtractor):
- """Extractor for blog posts on lineblog.me"""
- pattern = r"(?:https?://)?lineblog\.me/(\w+)/archives/(\d+)"
- test = ("https://lineblog.me/mamoru_miyano/archives/1919150.html", {
- "url": "24afeb4044c554f80c374b52bf8109c6f1c0c757",
- "keyword": "76a38e2c0074926bd3362f66f9fc0e6c41591dcb",
- })
diff --git a/gallery_dl/extractor/mangaread.py b/gallery_dl/extractor/mangaread.py
index 49d4d7d..74c239e 100644
--- a/gallery_dl/extractor/mangaread.py
+++ b/gallery_dl/extractor/mangaread.py
@@ -87,7 +87,8 @@ class MangareadChapterExtractor(MangareadBase, ChapterExtractor):
)
def metadata(self, page):
- data = {"tags": list(text.extract_iter(page, "class>", "<"))}
+ tags = text.extr(page, 'class="wp-manga-tags-list">', '</div>')
+ data = {"tags": list(text.split_html(tags)[::2])}
info = text.extr(page, '<h1 id="chapter-heading">', "</h1>")
if not info:
raise exception.NotFoundError("chapter")
@@ -148,7 +149,7 @@ class MangareadMangaExtractor(MangareadBase, MangaExtractor):
}
}),
("https://www.mangaread.org/manga/doesnotexist", {
- "exception": exception.NotFoundError,
+ "exception": exception.HttpError,
}),
)
diff --git a/gallery_dl/extractor/naverwebtoon.py b/gallery_dl/extractor/naverwebtoon.py
index d6292af..cafe4f7 100644
--- a/gallery_dl/extractor/naverwebtoon.py
+++ b/gallery_dl/extractor/naverwebtoon.py
@@ -91,7 +91,7 @@ class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor):
return {
"title_id": self.title_id,
"episode" : self.episode,
- "comic" : extr("titleName: '", "'"),
+ "comic" : extr('titleName: "', '"'),
"tags" : [t.strip() for t in text.extract_iter(
extr("tagList: [", "}],"), '"tagName":"', '"')],
"title" : extr('"subtitle":"', '"'),
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 5d100a4..e047f3d 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -87,14 +87,15 @@ class NewgroundsExtractor(Extractor):
if response.history and response.url.endswith("/social"):
return self.session.cookies
+ page = response.text
headers = {"Origin": self.root, "Referer": url}
- url = text.urljoin(self.root, text.extr(
- response.text, 'action="', '"'))
+ url = text.urljoin(self.root, text.extr(page, 'action="', '"'))
data = {
"username": username,
"password": password,
"remember": "1",
"login" : "1",
+ "auth" : text.extr(page, 'name="auth" value="', '"'),
}
response = self.request(url, method="POST", headers=headers, data=data)
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index f0a50c8..1fa571c 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -55,8 +55,8 @@ class PahealExtractor(Extractor):
"class='username' href='/user/", "'")),
"date" : text.parse_datetime(
extr("datetime='", "'"), "%Y-%m-%dT%H:%M:%S%z"),
- "source" : text.extract(
- extr(">Source&nbsp;Link<", "</td>"), "href='", "'")[0],
+ "source" : text.unescape(text.extr(
+ extr(">Source&nbsp;Link<", "</td>"), "href='", "'")),
}
dimensions, size, ext = extr("Info</th><td>", ">").split(" // ")
@@ -74,10 +74,34 @@ class PahealTagExtractor(PahealExtractor):
directory_fmt = ("{category}", "{search_tags}")
pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net"
r"/post/list/([^/?#]+)")
- test = ("https://rule34.paheal.net/post/list/Ayane_Suzuki/1", {
- "pattern": r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20",
- "count": ">= 15"
- })
+ test = (
+ ("https://rule34.paheal.net/post/list/Ayane_Suzuki/1", {
+ "pattern": r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20",
+ "count": ">= 15"
+ }),
+ ("https://rule34.paheal.net/post/list/Ayane_Suzuki/1", {
+ "range": "1",
+ "options": (("metadata", True),),
+ "keyword": {
+ "date": "dt:2018-01-07 07:04:05",
+ "duration": 0.0,
+ "extension": "jpg",
+ "filename": "2446128 - Ayane_Suzuki Idolmaster "
+ "idolmaster_dearly_stars Zanzi",
+ "height": 768,
+ "id": 2446128,
+ "md5": "b0ceda9d860df1d15b60293a7eb465c1",
+ "search_tags": "Ayane_Suzuki",
+ "size": 205312,
+ "source": "https://www.pixiv.net/member_illust.php"
+ "?mode=medium&illust_id=19957280",
+ "tags": "Ayane_Suzuki Idolmaster "
+ "idolmaster_dearly_stars Zanzi",
+ "uploader": "XXXname",
+ "width": 1024,
+ },
+ }),
+ )
per_page = 70
def __init__(self, match):
@@ -96,8 +120,9 @@ class PahealTagExtractor(PahealExtractor):
url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum)
page = self.request(url).text
+ pos = page.find("id='image-list'")
for post in text.extract_iter(
- page, '<img id="thumb_', 'Only</a>'):
+ page, "<img id='thumb_", "Only</a>", pos):
yield self._extract_data(post)
if ">Next<" not in page:
@@ -106,10 +131,10 @@ class PahealTagExtractor(PahealExtractor):
@staticmethod
def _extract_data(post):
- pid , pos = text.extract(post, '', '"')
- data, pos = text.extract(post, 'title="', '"', pos)
- md5 , pos = text.extract(post, '/_thumbs/', '/', pos)
- url , pos = text.extract(post, '<a href="', '"', pos)
+ pid , pos = text.extract(post, "", "'")
+ data, pos = text.extract(post, "title='", "'", pos)
+ md5 , pos = text.extract(post, "/_thumbs/", "/", pos)
+ url , pos = text.extract(post, "<a href='", "'", pos)
tags, data, date = data.split("\n")
dimensions, size, ext = data.split(" // ")
@@ -126,7 +151,7 @@ class PahealTagExtractor(PahealExtractor):
}
def _extract_data_ex(self, post):
- pid = post[:post.index('"')]
+ pid = post[:post.index("'")]
return self._extract_post(pid)
@@ -139,19 +164,19 @@ class PahealPostExtractor(PahealExtractor):
("https://rule34.paheal.net/post/view/481609", {
"pattern": r"https://tulip\.paheal\.net/_images"
r"/bbdc1c33410c2cdce7556c7990be26b7/481609%20-%20"
- r"Azumanga_Daioh%20Osaka%20Vuvuzela%20inanimate\.jpg",
+ r"Azumanga_Daioh%20inanimate%20Osaka%20Vuvuzela\.jpg",
"content": "7b924bcf150b352ac75c9d281d061e174c851a11",
"keyword": {
"date": "dt:2010-06-17 15:40:23",
"extension": "jpg",
"file_url": "re:https://tulip.paheal.net/_images/bbdc1c33410c",
- "filename": "481609 - Azumanga_Daioh Osaka Vuvuzela inanimate",
+ "filename": "481609 - Azumanga_Daioh inanimate Osaka Vuvuzela",
"height": 660,
"id": 481609,
"md5": "bbdc1c33410c2cdce7556c7990be26b7",
"size": 157389,
- "source": None,
- "tags": "Azumanga_Daioh Osaka Vuvuzela inanimate",
+ "source": "",
+ "tags": "Azumanga_Daioh inanimate Osaka Vuvuzela",
"uploader": "CaptainButtface",
"width": 614,
},
@@ -163,7 +188,7 @@ class PahealPostExtractor(PahealExtractor):
"md5": "b39edfe455a0381110c710d6ed2ef57d",
"size": 758989,
"source": "http://www.furaffinity.net/view/4057821/",
- "tags": "Vuvuzela inanimate thelost-dragon",
+ "tags": "inanimate thelost-dragon Vuvuzela",
"uploader": "leacheate_soup",
"width": 1200,
},
@@ -171,8 +196,8 @@ class PahealPostExtractor(PahealExtractor):
# video
("https://rule34.paheal.net/post/view/3864982", {
"pattern": r"https://[\w]+\.paheal\.net/_images/7629fc0ff77e32637d"
- r"de5bf4f992b2cb/3864982%20-%20Metal_Gear%20Metal_Gear_"
- r"Solid_V%20Quiet%20Vg_erotica%20animated%20webm\.webm",
+ r"de5bf4f992b2cb/3864982%20-%20animated%20Metal_Gear%20"
+ r"Metal_Gear_Solid_V%20Quiet%20Vg_erotica%20webm\.webm",
"keyword": {
"date": "dt:2020-09-06 01:59:03",
"duration": 30.0,
@@ -183,8 +208,8 @@ class PahealPostExtractor(PahealExtractor):
"size": 18454938,
"source": "https://twitter.com/VG_Worklog"
"/status/1302407696294055936",
- "tags": "Metal_Gear Metal_Gear_Solid_V Quiet "
- "Vg_erotica animated webm",
+ "tags": "animated Metal_Gear Metal_Gear_Solid_V "
+ "Quiet Vg_erotica webm",
"uploader": "justausername",
"width": 1768,
},
diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py
index df85b96..e718828 100644
--- a/gallery_dl/extractor/philomena.py
+++ b/gallery_dl/extractor/philomena.py
@@ -19,39 +19,19 @@ class PhilomenaExtractor(BooruExtractor):
filename_fmt = "{filename}.{extension}"
archive_fmt = "{id}"
request_interval = 1.0
+ page_start = 1
per_page = 50
+ def __init__(self, match):
+ BooruExtractor.__init__(self, match)
+ self.api = PhilomenaAPI(self)
+
_file_url = operator.itemgetter("view_url")
@staticmethod
def _prepare(post):
post["date"] = text.parse_datetime(post["created_at"])
- def _pagination(self, url, params):
- params["page"] = 1
- params["per_page"] = self.per_page
-
- api_key = self.config("api-key")
- if api_key:
- params["key"] = api_key
-
- filter_id = self.config("filter")
- if filter_id:
- params["filter_id"] = filter_id
- elif not api_key:
- try:
- params["filter_id"] = INSTANCES[self.category]["filter_id"]
- except (KeyError, TypeError):
- params["filter_id"] = "2"
-
- while True:
- data = self.request(url, params=params).json()
- yield from data["images"]
-
- if len(data["images"]) < self.per_page:
- return
- params["page"] += 1
-
INSTANCES = {
"derpibooru": {
@@ -146,8 +126,7 @@ class PhilomenaPostExtractor(PhilomenaExtractor):
self.image_id = match.group(match.lastindex)
def posts(self):
- url = self.root + "/api/v1/json/images/" + self.image_id
- return (self.request(url).json()["image"],)
+ return (self.api.image(self.image_id),)
class PhilomenaSearchExtractor(PhilomenaExtractor):
@@ -201,8 +180,7 @@ class PhilomenaSearchExtractor(PhilomenaExtractor):
return {"search_tags": self.params.get("q", "")}
def posts(self):
- url = self.root + "/api/v1/json/search/images"
- return self._pagination(url, self.params)
+ return self.api.search(self.params)
class PhilomenaGalleryExtractor(PhilomenaExtractor):
@@ -239,15 +217,81 @@ class PhilomenaGalleryExtractor(PhilomenaExtractor):
self.gallery_id = match.group(match.lastindex)
def metadata(self):
- url = self.root + "/api/v1/json/search/galleries"
- params = {"q": "id:" + self.gallery_id}
- galleries = self.request(url, params=params).json()["galleries"]
- if not galleries:
+ try:
+ return {"gallery": self.api.gallery(self.gallery_id)}
+ except IndexError:
raise exception.NotFoundError("gallery")
- return {"gallery": galleries[0]}
def posts(self):
gallery_id = "gallery_id:" + self.gallery_id
- url = self.root + "/api/v1/json/search/images"
params = {"sd": "desc", "sf": gallery_id, "q": gallery_id}
- return self._pagination(url, params)
+ return self.api.search(params)
+
+
+class PhilomenaAPI():
+ """Interface for the Philomena API
+
+ https://www.derpibooru.org/pages/api
+ """
+
+ def __init__(self, extractor):
+ self.extractor = extractor
+ self.root = extractor.root + "/api"
+
+ def gallery(self, gallery_id):
+ endpoint = "/v1/json/search/galleries"
+ params = {"q": "id:" + gallery_id}
+ return self._call(endpoint, params)["galleries"][0]
+
+ def image(self, image_id):
+ endpoint = "/v1/json/images/" + image_id
+ return self._call(endpoint)["image"]
+
+ def search(self, params):
+ endpoint = "/v1/json/search/images"
+ return self._pagination(endpoint, params)
+
+ def _call(self, endpoint, params=None):
+ url = self.root + endpoint
+
+ while True:
+ response = self.extractor.request(url, params=params, fatal=None)
+
+ if response.status_code < 400:
+ return response.json()
+
+ if response.status_code == 429:
+ self.extractor.wait(seconds=600)
+ continue
+
+ # error
+ self.extractor.log.debug(response.content)
+ raise exception.StopExtraction(
+ "%s %s", response.status_code, response.reason)
+
+ def _pagination(self, endpoint, params):
+ extr = self.extractor
+
+ api_key = extr.config("api-key")
+ if api_key:
+ params["key"] = api_key
+
+ filter_id = extr.config("filter")
+ if filter_id:
+ params["filter_id"] = filter_id
+ elif not api_key:
+ try:
+ params["filter_id"] = INSTANCES[extr.category]["filter_id"]
+ except (KeyError, TypeError):
+ params["filter_id"] = "2"
+
+ params["page"] = extr.page_start
+ params["per_page"] = extr.per_page
+
+ while True:
+ data = self._call(endpoint, params)
+ yield from data["images"]
+
+ if len(data["images"]) < extr.per_page:
+ return
+ params["page"] += 1
diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py
index f19e33c..fa4efa0 100644
--- a/gallery_dl/extractor/pornhub.py
+++ b/gallery_dl/extractor/pornhub.py
@@ -58,6 +58,9 @@ class PornhubGalleryExtractor(PornhubExtractor):
self._first = None
def items(self):
+ self.session.cookies.set(
+ "accessAgeDisclaimerPH", "1", domain=".pornhub.com")
+
data = self.metadata()
yield Message.Directory, data
for num, image in enumerate(self.images(), 1):
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 9a57dcf..54b162b 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -20,6 +20,7 @@ class RedditExtractor(Extractor):
filename_fmt = "{id}{num:? //>02} {title[:220]}.{extension}"
archive_fmt = "{filename}"
cookiedomain = ".reddit.com"
+ request_interval = 0.6
def items(self):
self.api = RedditAPI(self)
@@ -377,6 +378,18 @@ class RedditAPI():
self.client_id = client_id
self.headers = {"User-Agent": config("user-agent")}
+ if self.client_id == self.CLIENT_ID:
+ client_id = self.client_id
+ self._warn_429 = True
+ kind = "default"
+ else:
+ client_id = client_id[:5] + "*" * (len(client_id)-5)
+ self._warn_429 = False
+ kind = "custom"
+
+ self.log.debug(
+ "Using %s API credentials (client-id %s)", kind, client_id)
+
token = config("refresh-token")
if token is None or token == "cache":
key = "#" + self.client_id
@@ -463,28 +476,39 @@ class RedditAPI():
def _call(self, endpoint, params):
url = "https://oauth.reddit.com" + endpoint
params["raw_json"] = "1"
- self.authenticate()
- response = self.extractor.request(
- url, params=params, headers=self.headers, fatal=None)
- remaining = response.headers.get("x-ratelimit-remaining")
- if remaining and float(remaining) < 2:
- self.extractor.wait(seconds=response.headers["x-ratelimit-reset"])
- return self._call(endpoint, params)
+ while True:
+ self.authenticate()
+ response = self.extractor.request(
+ url, params=params, headers=self.headers, fatal=None)
+
+ remaining = response.headers.get("x-ratelimit-remaining")
+ if remaining and float(remaining) < 2:
+ if self._warn_429:
+ self._warn_429 = False
+ self.log.info(
+ "Register your own OAuth application and use its "
+ "credentials to prevent this error: "
+ "https://github.com/mikf/gallery-dl/blob/master"
+ "/docs/configuration.rst"
+ "#extractorredditclient-id--user-agent")
+ self.extractor.wait(
+ seconds=response.headers["x-ratelimit-reset"])
+ continue
- try:
- data = response.json()
- except ValueError:
- raise exception.StopExtraction(text.remove_html(response.text))
-
- if "error" in data:
- if data["error"] == 403:
- raise exception.AuthorizationError()
- if data["error"] == 404:
- raise exception.NotFoundError()
- self.log.debug(data)
- raise exception.StopExtraction(data.get("message"))
- return data
+ try:
+ data = response.json()
+ except ValueError:
+ raise exception.StopExtraction(text.remove_html(response.text))
+
+ if "error" in data:
+ if data["error"] == 403:
+ raise exception.AuthorizationError()
+ if data["error"] == 404:
+ raise exception.NotFoundError()
+ self.log.debug(data)
+ raise exception.StopExtraction(data.get("message"))
+ return data
def _pagination(self, endpoint, params):
id_min = self._parse_id("id-min", 0)
diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py
index 7b8d2a3..711435e 100644
--- a/gallery_dl/extractor/seiga.py
+++ b/gallery_dl/extractor/seiga.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2022 Mike Fährmann
+# Copyright 2016-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -164,6 +164,10 @@ class SeigaImageExtractor(SeigaExtractor):
("https://seiga.nicovideo.jp/seiga/im123", {
"exception": exception.NotFoundError,
}),
+ ("https://seiga.nicovideo.jp/seiga/im10877923", {
+ "pattern": r"https://lohas\.nicoseiga\.jp/priv/5936a2a6c860a600e46"
+ r"5e0411c0822e0b510e286/1688757110/10877923",
+ }),
("https://seiga.nicovideo.jp/image/source/5977527"),
("https://sp.seiga.nicovideo.jp/seiga/#!/im5977527"),
("https://lohas.nicoseiga.jp/thumb/5977527i"),
@@ -182,6 +186,9 @@ class SeigaImageExtractor(SeigaExtractor):
return num
def get_images(self):
+ self.session.cookies.set(
+ "skip_fetish_warning", "1", domain="seiga.nicovideo.jp")
+
url = "{}/seiga/im{}".format(self.root, self.image_id)
page = self.request(url, notfound="image").text
diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py
index bea457f..3521298 100644
--- a/gallery_dl/extractor/slideshare.py
+++ b/gallery_dl/extractor/slideshare.py
@@ -30,21 +30,20 @@ class SlidesharePresentationExtractor(GalleryExtractor):
"count": 19,
"content": "2b6a191eab60b3978fdacfecf2da302dd45bc108",
"keyword": {
- "comments": "0",
"description": "Get Started with SlideShare - "
"A Beginngers Guide for Creators",
- "likes": r"re:\d{3,}",
+ "likes": int,
"presentation": "get-started-with-slide-share",
- "published": "dt:2015-05-20 00:00:00",
+ "date": "dt:2015-05-20 17:38:21",
"title": "Getting Started With SlideShare",
"user": "Slideshare",
- "views": r"re:\d{7,}",
+ "views": int,
},
}),
# long title and description
(("https://www.slideshare.net/pragmaticsolutions/warum-sie-nicht-ihren"
"-mitarbeitenden-ndern-sollten-sondern-ihr-managementsystem"), {
- "url": "cf70ca99f57f61affab47ebf8583eb564b21e3a7",
+ "url": "d8952260f8bec337dd809a958ec8091350393f6b",
"keyword": {
"title": "Warum Sie nicht Ihren Mitarbeitenden ändern "
"sollten, sondern Ihr Managementsystem",
@@ -58,7 +57,7 @@ class SlidesharePresentationExtractor(GalleryExtractor):
# mobile URL
(("https://www.slideshare.net"
"/mobile/uqudent/introduction-to-fixed-prosthodontics"), {
- "url": "43eda2adf4dd221a251c8df794dfb82649e94647",
+ "url": "72c431cb1eccbb6794f608ecbbc01d52e8768159",
}),
)
@@ -69,43 +68,31 @@ class SlidesharePresentationExtractor(GalleryExtractor):
GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
- extr = text.extract_from(page)
- descr = extr('<meta name="description" content="', '"')
- comments = extr('content="UserComments:', '"')
- likes = extr('content="UserLikes:', '"')
- views = extr('content="UserPageVisits:', '"')
- title = extr('<span class="j-title-breadcrumb">', '</span>')
- published = extr('<div class="metadata-item">', '</div>')
-
- if descr.endswith("…"):
- alt_descr = extr('slideshow-description-text"', '</p>')
- if alt_descr:
- descr = text.remove_html(alt_descr.partition(">")[2]).strip()
+ data = util.json_loads(text.extr(
+ page, 'id="__NEXT_DATA__" type="application/json">', '</script>'))
+ self.slideshow = slideshow = data["props"]["pageProps"]["slideshow"]
return {
- "user": self.user,
+ "user" : slideshow["username"],
"presentation": self.presentation,
- "title": text.unescape(title.strip()),
- "description": text.unescape(descr),
- "views": views,
- "likes": likes,
- "comments": comments,
- "published": text.parse_datetime(
- published.strip(), "%b. %d, %Y"),
+ "title" : slideshow["title"].strip(),
+ "description" : slideshow["description"].strip(),
+ "views" : slideshow["views"],
+ "likes" : slideshow["likes"],
+ "date" : text.parse_datetime(
+ slideshow["createdAt"], "%Y-%m-%d %H:%M:%S %Z"),
}
- @staticmethod
- def images(page):
- data = util.json_loads(text.extract(
- page, "xtend(true, slideshare_object.slideshow_config, ", ");")[0])
+ def images(self, page):
+ parts = self.slideshow["slideImages"][0]["baseUrl"].split("/")
- # useing 'stripped_title' here is technically wrong, but it works all
- # the same, slideshare doesn't seem to care what characters go there
- begin = "https://image.slidesharecdn.com/{}/95/{}-".format(
- data["ppt_location"], data["stripped_title"])
- end = "-1024.jpg?cb=" + str(data["timestamp"])
+ begin = "{}/95/{}-".format(
+ "/".join(parts[:4]),
+ self.slideshow["strippedTitle"],
+ )
+ end = "-1024.jpg?" + parts[-1].rpartition("?")[2]
return [
(begin + str(n) + end, None)
- for n in range(1, data["slide_count"]+1)
+ for n in range(1, self.slideshow["totalSlides"]+1)
]
diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py
index 30bf2f1..a8acd31 100644
--- a/gallery_dl/extractor/twibooru.py
+++ b/gallery_dl/extractor/twibooru.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2022 Mike Fährmann
+# Copyright 2022-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -22,6 +22,7 @@ class TwibooruExtractor(BooruExtractor):
filename_fmt = "{id}_{filename}.{extension}"
archive_fmt = "{id}"
request_interval = 6.05
+ page_start = 1
per_page = 50
root = "https://twibooru.org"
@@ -230,7 +231,7 @@ class TwibooruAPI():
elif not api_key:
params["filter_id"] = "2"
- params["page"] = 1
+ params["page"] = extr.page_start
params["per_page"] = per_page = extr.per_page
while True:
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 10db974..7b9a2e4 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -510,13 +510,13 @@ class TwitterTimelineExtractor(TwitterExtractor):
if not self.textonly:
# try to search for media-only tweets
tweet = None
- for tweet in self.api.search_adaptive(query + " filter:links"):
+ for tweet in self.api.search_timeline(query + " filter:links"):
yield tweet
if tweet is not None:
return
# yield unfiltered search results
- yield from self.api.search_adaptive(query)
+ yield from self.api.search_timeline(query)
def _select_tweet_source(self):
strategy = self.config("strategy")
@@ -693,7 +693,7 @@ class TwitterSearchExtractor(TwitterExtractor):
except KeyError:
pass
- return self.api.search_adaptive(query)
+ return self.api.search_timeline(query)
class TwitterHashtagExtractor(TwitterExtractor):
@@ -929,16 +929,15 @@ Your reaction.""",
def _tweets_single(self, tweet_id):
tweets = []
- for tweet in self.api.tweet_detail(tweet_id):
- if tweet["rest_id"] == tweet_id or \
- tweet.get("_retweet_id_str") == tweet_id:
- if self._user_obj is None:
- self._assign_user(tweet["core"]["user_results"]["result"])
- tweets.append(tweet)
+ tweet = self.api.tweet_result_by_rest_id(tweet_id)
+ self._assign_user(tweet["core"]["user_results"]["result"])
- tweet_id = tweet["legacy"].get("quoted_status_id_str")
- if not tweet_id:
- break
+ while True:
+ tweets.append(tweet)
+ tweet_id = tweet["legacy"].get("quoted_status_id_str")
+ if not tweet_id:
+ break
+ tweet = self.api.tweet_result_by_rest_id(tweet_id)
return tweets
@@ -1087,8 +1086,8 @@ class TwitterAPI():
auth_token = cookies.get("auth_token", domain=cookiedomain)
search = extractor.config("search-endpoint")
- if search == "graphql" or not auth_token and search in ("auto", None):
- self.search_adaptive = self.search_timeline
+ if search == "rest":
+ self.search_timeline = self.search_adaptive
self.headers = {
"Accept": "*/*",
@@ -1179,6 +1178,46 @@ class TwitterAPI():
"responsive_web_enhance_cards_enabled": False,
}
+ def tweet_result_by_rest_id(self, tweet_id):
+ endpoint = "/graphql/2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId"
+ params = {
+ "variables": self._json_dumps({
+ "tweetId": tweet_id,
+ "withCommunity": False,
+ "includePromotedContent": False,
+ "withVoice": False,
+ }),
+ "features": self._json_dumps({
+ "creator_subscriptions_tweet_preview_api_enabled": True,
+ "tweetypie_unmention_optimization_enabled": True,
+ "responsive_web_edit_tweet_api_enabled": True,
+ "graphql_is_translatable_rweb_tweet_is_translatable_enabled":
+ True,
+ "view_counts_everywhere_api_enabled": True,
+ "longform_notetweets_consumption_enabled": True,
+ "responsive_web_twitter_article_tweet_consumption_enabled":
+ False,
+ "tweet_awards_web_tipping_enabled": False,
+ "freedom_of_speech_not_reach_fetch_enabled": True,
+ "standardized_nudges_misinfo": True,
+ "tweet_with_visibility_results_prefer_gql_"
+ "limited_actions_policy_enabled": True,
+ "longform_notetweets_rich_text_read_enabled": True,
+ "longform_notetweets_inline_media_enabled": True,
+ "responsive_web_graphql_exclude_directive_enabled": True,
+ "verified_phone_label_enabled": False,
+ "responsive_web_media_download_video_enabled": False,
+ "responsive_web_graphql_skip_user_profile_"
+ "image_extensions_enabled": False,
+ "responsive_web_graphql_timeline_navigation_enabled": True,
+ "responsive_web_enhance_cards_enabled": False,
+ }),
+ "fieldToggles": self._json_dumps({
+ "withArticleRichContentState": False,
+ }),
+ }
+ return self._call(endpoint, params)["data"]["tweetResult"]["result"]
+
def tweet_detail(self, tweet_id):
endpoint = "/graphql/JlLZj42Ltr2qwjasw-l5lQ/TweetDetail"
variables = {
@@ -1439,6 +1478,9 @@ class TwitterAPI():
if response.status_code == 429:
# rate limit exceeded
+ if self.extractor.config("ratelimit") == "abort":
+ raise exception.StopExtraction("Rate limit exceeded")
+
until = response.headers.get("x-rate-limit-reset")
seconds = None if until else 60
self.extractor.wait(until=until, seconds=seconds)
@@ -1592,7 +1634,9 @@ class TwitterAPI():
if entry["entryId"].startswith("cursor-bottom-"):
cursor = entry["content"]["value"]
if entries is None:
- raise KeyError()
+ if not cursor:
+ return
+ entries = ()
except LookupError:
extr.log.debug(data)
@@ -1730,7 +1774,7 @@ class TwitterAPI():
"features" : self._json_dumps(self.features_pagination)}
while True:
- cursor = entry = stop = None
+ cursor = entry = None
params["variables"] = self._json_dumps(variables)
data = self._call(endpoint, params)["data"]
@@ -1759,11 +1803,8 @@ class TwitterAPI():
yield user
elif entry["entryId"].startswith("cursor-bottom-"):
cursor = entry["content"]["value"]
- elif instr["type"] == "TimelineTerminateTimeline":
- if instr["direction"] == "Bottom":
- stop = True
- if stop or not cursor or not entry:
+ if not cursor or cursor.startswith(("-1|", "0|")) or not entry:
return
variables["cursor"] = cursor
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index 805aa53..5a3adc8 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -170,6 +170,8 @@ class WeiboExtractor(Extractor):
yield from statuses
if "next_cursor" in data: # videos, newvideo
+ if data["next_cursor"] == -1:
+ return
params["cursor"] = data["next_cursor"]
elif "page" in params: # home, article
params["page"] += 1
diff --git a/gallery_dl/extractor/wikifeet.py b/gallery_dl/extractor/wikifeet.py
index 662e08b..5f02e94 100644
--- a/gallery_dl/extractor/wikifeet.py
+++ b/gallery_dl/extractor/wikifeet.py
@@ -32,7 +32,7 @@ class WikifeetGalleryExtractor(GalleryExtractor):
"pid" : int,
"width" : int,
"height" : int,
- "shoesize" : "7.5 US",
+ "shoesize" : "9 US",
"type" : "women",
"tags" : list,
},
@@ -50,7 +50,7 @@ class WikifeetGalleryExtractor(GalleryExtractor):
"pid" : int,
"width" : int,
"height" : int,
- "shoesize" : "[NOT SET]",
+ "shoesize" : "4 US",
"type" : "women",
"tags" : list,
},
@@ -111,7 +111,10 @@ class WikifeetGalleryExtractor(GalleryExtractor):
"pid" : data["pid"],
"width" : data["pw"],
"height": data["ph"],
- "tags" : [tagmap[tag] for tag in data["tags"]],
+ "tags" : [
+ tagmap[tag]
+ for tag in data["tags"] if tag in tagmap
+ ],
})
for data in util.json_loads(text.extr(page, "['gdata'] = ", ";"))
]
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 9438d73..f2a3111 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.25.7"
+__version__ = "1.25.8"
diff --git a/test/test_results.py b/test/test_results.py
index 03a17c4..3c7d284 100644
--- a/test/test_results.py
+++ b/test/test_results.py
@@ -325,7 +325,7 @@ def setup_test_config():
for category in ("danbooru", "atfbooru", "aibooru", "e621", "e926",
"instagram", "twitter", "subscribestar", "deviantart",
"inkbunny", "tapas", "pillowfort", "mangadex",
- "vipergirls"):
+ "vipergirls", "gfycat"):
config.set(("extractor", category), "username", None)
config.set(("extractor", "mastodon.social"), "access-token",