Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--  gallery_dl/extractor/4archive.py         |   2
-rw-r--r--  gallery_dl/extractor/8chan.py            |   3
-rw-r--r--  gallery_dl/extractor/artstation.py       |   1
-rw-r--r--  gallery_dl/extractor/bluesky.py          |   8
-rw-r--r--  gallery_dl/extractor/cien.py             |  86
-rw-r--r--  gallery_dl/extractor/common.py           |  85
-rw-r--r--  gallery_dl/extractor/deviantart.py       |   5
-rw-r--r--  gallery_dl/extractor/exhentai.py         |  10
-rw-r--r--  gallery_dl/extractor/foolfuuka.py        |  24
-rw-r--r--  gallery_dl/extractor/furaffinity.py      |   2
-rw-r--r--  gallery_dl/extractor/gelbooru.py         |  35
-rw-r--r--  gallery_dl/extractor/gelbooru_v02.py     |  31
-rw-r--r--  gallery_dl/extractor/hiperdex.py         |  20
-rw-r--r--  gallery_dl/extractor/hotleak.py          |   1
-rw-r--r--  gallery_dl/extractor/imgur.py            |   6
-rw-r--r--  gallery_dl/extractor/inkbunny.py         |  13
-rw-r--r--  gallery_dl/extractor/kemonoparty.py      |  50
-rw-r--r--  gallery_dl/extractor/mastodon.py         |  71
-rw-r--r--  gallery_dl/extractor/newgrounds.py       |  47
-rw-r--r--  gallery_dl/extractor/oauth.py            |  12
-rw-r--r--  gallery_dl/extractor/patreon.py          |   5
-rw-r--r--  gallery_dl/extractor/pixeldrain.py       |  16
-rw-r--r--  gallery_dl/extractor/pixiv.py            |  19
-rw-r--r--  gallery_dl/extractor/poipiku.py          |   8
-rw-r--r--  gallery_dl/extractor/readcomiconline.py  |   5
-rw-r--r--  gallery_dl/extractor/reddit.py           |  49
-rw-r--r--  gallery_dl/extractor/seiga.py            |  58
-rw-r--r--  gallery_dl/extractor/slideshare.py       |  12
-rw-r--r--  gallery_dl/extractor/subscribestar.py    |   2
-rw-r--r--  gallery_dl/extractor/tapas.py            |  15
-rw-r--r--  gallery_dl/extractor/tcbscans.py         |   2
-rw-r--r--  gallery_dl/extractor/tumblr.py           |   6
-rw-r--r--  gallery_dl/extractor/twitter.py          |  321
-rw-r--r--  gallery_dl/extractor/vsco.py             |  30
-rw-r--r--  gallery_dl/extractor/wikimedia.py        |  20
35 files changed, 790 insertions(+), 290 deletions(-)
diff --git a/gallery_dl/extractor/4archive.py b/gallery_dl/extractor/4archive.py
index d198369..948a605 100644
--- a/gallery_dl/extractor/4archive.py
+++ b/gallery_dl/extractor/4archive.py
@@ -64,7 +64,7 @@ class _4archiveThreadExtractor(Extractor):
data = {
"name": extr('class="name">', "</span>"),
"date": text.parse_datetime(
- extr('class="dateTime postNum" >', "<").strip(),
+ extr('class="dateTime postNum">', "<").strip(),
"%Y-%m-%d %H:%M:%S"),
"no" : text.parse_int(extr('href="#p', '"')),
}
diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py
index fc16f43..a4b0997 100644
--- a/gallery_dl/extractor/8chan.py
+++ b/gallery_dl/extractor/8chan.py
@@ -26,6 +26,9 @@ class _8chanExtractor(Extractor):
self.root = "https://8chan." + match.group(1)
Extractor.__init__(self, match)
+ def _init(self):
+ self.cookies.set("TOS", "1", domain=self.root.rpartition("/")[2])
+
@memcache()
def cookies_prepare(self):
# fetch captcha cookies
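The new _init hook pre-sets the TOS consent cookie for whichever 8chan domain the URL matched. As a minimal standalone sketch of the domain derivation (the example domain is assumed):

    # rpartition("/") splits on the last slash of the root URL,
    # so index [2] yields the bare hostname used as the cookie domain.
    root = "https://8chan.moe"  # assumed example value of self.root
    domain = root.rpartition("/")[2]
    print(domain)  # -> "8chan.moe"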
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
index 49fde7b..ce1a78d 100644
--- a/gallery_dl/extractor/artstation.py
+++ b/gallery_dl/extractor/artstation.py
@@ -22,6 +22,7 @@ class ArtstationExtractor(Extractor):
directory_fmt = ("{category}", "{userinfo[username]}")
archive_fmt = "{asset[id]}"
browser = "firefox"
+ tls12 = False
root = "https://www.artstation.com"
def __init__(self, match):
diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py
index 84c3187..c97bf65 100644
--- a/gallery_dl/extractor/bluesky.py
+++ b/gallery_dl/extractor/bluesky.py
@@ -317,7 +317,7 @@ class BlueskyAPI():
def get_author_feed(self, actor, filter="posts_and_author_threads"):
endpoint = "app.bsky.feed.getAuthorFeed"
params = {
- "actor" : self._did_from_actor(actor),
+ "actor" : self._did_from_actor(actor, True),
"filter": filter,
"limit" : "100",
}
@@ -327,7 +327,7 @@ class BlueskyAPI():
endpoint = "app.bsky.feed.getFeed"
params = {
"feed" : "at://{}/app.bsky.feed.generator/{}".format(
- self._did_from_actor(actor, False), feed),
+ self._did_from_actor(actor), feed),
"limit": "100",
}
return self._pagination(endpoint, params)
@@ -344,7 +344,7 @@ class BlueskyAPI():
endpoint = "app.bsky.feed.getListFeed"
params = {
"list" : "at://{}/app.bsky.graph.list/{}".format(
- self._did_from_actor(actor, False), list),
+ self._did_from_actor(actor), list),
"limit": "100",
}
return self._pagination(endpoint, params)
@@ -391,7 +391,7 @@ class BlueskyAPI():
}
return self._pagination(endpoint, params, "posts")
- def _did_from_actor(self, actor, user_did=True):
+ def _did_from_actor(self, actor, user_did=False):
if actor.startswith("did:"):
did = actor
else:
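The _did_from_actor default flips to user_did=False, so only get_author_feed still resolves to the account's own DID while feed and list lookups use the repository owner's. For reference, a self-contained sketch of resolving a handle to a DID over public XRPC; the endpoint is part of the AT Protocol, but its use as this helper's mechanism is an assumption:

    import requests

    def did_from_actor(actor, service="https://public.api.bsky.app"):
        # DIDs pass through unchanged; handles get resolved via XRPC.
        if actor.startswith("did:"):
            return actor
        url = service + "/xrpc/com.atproto.identity.resolveHandle"
        response = requests.get(url, params={"handle": actor})
        response.raise_for_status()
        return response.json()["did"]  # e.g. "did:plc:..."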
diff --git a/gallery_dl/extractor/cien.py b/gallery_dl/extractor/cien.py
new file mode 100644
index 0000000..a9ccab5
--- /dev/null
+++ b/gallery_dl/extractor/cien.py
@@ -0,0 +1,86 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://ci-en.net/"""
+
+from .common import Extractor, Message
+from .. import text, util
+
+BASE_PATTERN = r"(?:https?://)?ci-en\.(?:net|dlsite\.com)"
+
+
+class CienExtractor(Extractor):
+ category = "cien"
+ root = "https://ci-en.net"
+
+ def __init__(self, match):
+ self.root = text.root_from_url(match.group(0))
+ Extractor.__init__(self, match)
+
+ def _pagination_articles(self, url, params):
+ data = {"extractor": CienArticleExtractor}
+ params["page"] = text.parse_int(params.get("page"), 1)
+
+ while True:
+ page = self.request(url, params=params).text
+
+ for card in text.extract_iter(
+ page, ' class="c-cardCase-item', '</div>'):
+ article_url = text.extr(card, ' href="', '"')
+ yield Message.Queue, article_url, data
+
+ if ' rel="next"' not in page:
+ return
+ params["page"] += 1
+
+
+class CienArticleExtractor(CienExtractor):
+ subcategory = "article"
+ pattern = BASE_PATTERN + r"/creator/(\d+)/article/(\d+)"
+ example = "https://ci-en.net/creator/123/article/12345"
+
+ def items(self):
+ url = "{}/creator/{}/article/{}".format(
+ self.root, self.groups[0], self.groups[1])
+ page = self.request(url, notfound="article").text
+ return
+ yield 1
+
+
+class CienCreatorExtractor(CienExtractor):
+ subcategory = "creator"
+ pattern = BASE_PATTERN + r"/creator/(\d+)(?:/article(?:\?([^#]+))?)?/?$"
+ example = "https://ci-en.net/creator/123"
+
+ def items(self):
+ url = "{}/creator/{}/article".format(self.root, self.groups[0])
+ params = text.parse_query(self.groups[1])
+ params["mode"] = "list"
+ return self._pagination_articles(url, params)
+
+
+class CienRecentExtractor(CienExtractor):
+ subcategory = "recent"
+ pattern = BASE_PATTERN + r"/mypage/recent(?:\?([^#]+))?"
+ example = "https://ci-en.net/mypage/recent"
+
+ def items(self):
+ url = self.root + "/mypage/recent"
+ params = text.parse_query(self.groups[0])
+ return self._pagination_articles(url, params)
+
+
+class CienFollowingExtractor(CienExtractor):
+ subcategory = "following"
+ pattern = BASE_PATTERN + r"/mypage/subscription(/following)?"
+ example = "https://ci-en.net/mypage/subscription"
+
+ def items(self):
+ url = self.root + "/mypage/recent"
+ params = text.parse_query(self.groups[0])
+ return self._pagination_articles(url, params)
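All three listing extractors funnel into _pagination_articles, which walks numbered pages until the markup stops advertising a rel="next" link. The loop in isolation, with a stand-in fetch callable:

    def paginate(fetch, params):
        # fetch(params) -> HTML text of one listing page
        params["page"] = int(params.get("page") or 1)
        while True:
            page = fetch(params)
            yield page
            if ' rel="next"' not in page:
                return
            params["page"] += 1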
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index d14e13a..8771261 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -14,6 +14,7 @@ import ssl
import time
import netrc
import queue
+import getpass
import logging
import datetime
import requests
@@ -21,6 +22,7 @@ import threading
from requests.adapters import HTTPAdapter
from .message import Message
from .. import config, text, util, cache, exception
+urllib3 = requests.packages.urllib3
class Extractor():
@@ -45,6 +47,8 @@ class Extractor():
def __init__(self, match):
self.log = logging.getLogger(self.category)
self.url = match.string
+ self.match = match
+ self.groups = match.groups()
self._cfgpath = ("extractor", self.category, self.subcategory)
self._parentdir = ""
@@ -168,22 +172,25 @@ class Extractor():
requests.exceptions.ChunkedEncodingError,
requests.exceptions.ContentDecodingError) as exc:
msg = exc
+ code = 0
except (requests.exceptions.RequestException) as exc:
raise exception.HttpError(exc)
else:
code = response.status_code
if self._write_pages:
self._dump_response(response)
- if 200 <= code < 400 or fatal is None and \
- (400 <= code < 500) or not fatal and \
- (400 <= code < 429 or 431 <= code < 500):
+ if (
+ code < 400 or
+ code < 500 and (not fatal and code != 429 or fatal is None)
+ ):
if encoding:
response.encoding = encoding
return response
if notfound and code == 404:
raise exception.NotFoundError(notfound)
- msg = "'{} {}' for '{}'".format(code, response.reason, url)
+ msg = "'{} {}' for '{}'".format(
+ code, response.reason, response.url)
server = response.headers.get("Server")
if server and server.startswith("cloudflare") and \
code in (403, 503):
@@ -194,7 +201,10 @@ class Extractor():
if b'name="captcha-bypass"' in content:
self.log.warning("Cloudflare CAPTCHA")
break
- if code not in retry_codes and code < 500:
+
+ if code == 429 and self._interval_429:
+ pass
+ elif code not in retry_codes and code < 500:
break
finally:
@@ -204,20 +214,24 @@ class Extractor():
if tries > retries:
break
+ seconds = tries
if self._interval:
- seconds = self._interval()
- if seconds < tries:
- seconds = tries
+ s = self._interval()
+ if seconds < s:
+ seconds = s
+ if code == 429 and self._interval_429:
+ s = self._interval_429()
+ if seconds < s:
+ seconds = s
+ self.wait(seconds=seconds, reason="429 Too Many Requests")
else:
- seconds = tries
-
- self.sleep(seconds, "retry")
+ self.sleep(seconds, "retry")
tries += 1
raise exception.HttpError(msg, response)
def wait(self, seconds=None, until=None, adjust=1.0,
- reason="rate limit reset"):
+ reason="rate limit"):
now = time.time()
if seconds:
@@ -240,7 +254,7 @@ class Extractor():
if reason:
t = datetime.datetime.fromtimestamp(until).time()
isotime = "{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second)
- self.log.info("Waiting until %s for %s.", isotime, reason)
+ self.log.info("Waiting until %s (%s)", isotime, reason)
time.sleep(seconds)
def sleep(self, seconds, reason):
@@ -248,6 +262,15 @@ class Extractor():
seconds, reason)
time.sleep(seconds)
+ def input(self, prompt, echo=True):
+ if echo:
+ try:
+ return input(prompt)
+ except (EOFError, OSError):
+ return None
+ else:
+ return getpass.getpass(prompt)
+
def _get_auth_info(self):
"""Return authentication information as (username, password) tuple"""
username = self.config("username")
@@ -280,6 +303,9 @@ class Extractor():
self.config("sleep-request", self.request_interval),
self.request_interval_min,
)
+ self._interval_429 = util.build_duration_func(
+ self.config("sleep-429", 60),
+ )
if self._retries < 0:
self._retries = float("inf")
@@ -439,9 +465,11 @@ class Extractor():
if not path:
return
+ path_tmp = path + ".tmp"
try:
- with open(path, "w") as fp:
+ with open(path_tmp, "w") as fp:
util.cookiestxt_store(fp, self.cookies)
+ os.replace(path_tmp, path)
except OSError as exc:
self.log.warning("cookies: %s", exc)
@@ -599,7 +627,7 @@ class GalleryExtractor(Extractor):
def __init__(self, match, url=None):
Extractor.__init__(self, match)
- self.gallery_url = self.root + match.group(1) if url is None else url
+ self.gallery_url = self.root + self.groups[0] if url is None else url
def items(self):
self.login()
@@ -674,7 +702,7 @@ class MangaExtractor(Extractor):
def __init__(self, match, url=None):
Extractor.__init__(self, match)
- self.manga_url = url or self.root + match.group(1)
+ self.manga_url = self.root + self.groups[0] if url is None else url
if self.config("chapter-reverse", False):
self.reverse = not self.reverse
@@ -736,17 +764,18 @@ class BaseExtractor(Extractor):
instances = ()
def __init__(self, match):
- if not self.category:
- self._init_category(match)
Extractor.__init__(self, match)
+ if not self.category:
+ self._init_category()
+ self._cfgpath = ("extractor", self.category, self.subcategory)
- def _init_category(self, match):
- for index, group in enumerate(match.groups()):
+ def _init_category(self):
+ for index, group in enumerate(self.groups):
if group is not None:
if index:
self.category, self.root, info = self.instances[index-1]
if not self.root:
- self.root = text.root_from_url(match.group(0))
+ self.root = text.root_from_url(self.match.group(0))
self.config_instance = info.get
else:
self.root = group
@@ -806,12 +835,12 @@ def _build_requests_adapter(ssl_options, ssl_ciphers, source_address):
pass
if ssl_options or ssl_ciphers:
- ssl_context = ssl.create_default_context()
- if ssl_options:
- ssl_context.options |= ssl_options
- if ssl_ciphers:
- ssl_context.set_ecdh_curve("prime256v1")
- ssl_context.set_ciphers(ssl_ciphers)
+ ssl_context = urllib3.connection.create_urllib3_context(
+ options=ssl_options or None, ciphers=ssl_ciphers)
+ if requests.__version__ > "2.31":
+ # https://github.com/psf/requests/pull/6731
+ ssl_context.load_default_certs()
+ ssl_context.check_hostname = False
else:
ssl_context = None
@@ -931,8 +960,6 @@ SSL_CIPHERS = {
}
-urllib3 = requests.packages.urllib3
-
# detect brotli support
try:
BROTLI = urllib3.response.brotli is not None
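With the new sleep-429 option, the delay before a retry becomes the maximum of the linear per-try backoff, the configured request interval, and (for 429 responses) the 429-specific interval. A standalone sketch of that selection, where the interval arguments stand in for util.build_duration_func results:

    def retry_delay(tries, code=0, interval=None, interval_429=None):
        seconds = tries              # base delay grows with each attempt
        if interval:
            seconds = max(seconds, interval())
        if code == 429 and interval_429:
            seconds = max(seconds, interval_429())
        return seconds

    # retry_delay(2, 429, lambda: 1.5, lambda: 60.0) -> 60.0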
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index ca8acaa..993885a 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -1457,9 +1457,8 @@ class DeviantartOAuthAPI():
self.log.info(
"Register your own OAuth application and use its "
"credentials to prevent this error: "
- "https://github.com/mikf/gallery-dl/blob/master/do"
- "cs/configuration.rst#extractordeviantartclient-id"
- "--client-secret")
+ "https://gdl-org.github.io/docs/configuration.html"
+ "#extractor-deviantart-client-id-client-secret")
else:
if log:
self.log.error(msg)
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index acad95c..1805403 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -50,7 +50,7 @@ class ExhentaiExtractor(Extractor):
def request(self, url, **kwargs):
response = Extractor.request(self, url, **kwargs)
- if response.history and response.headers.get("Content-Length") == "0":
+ if "Cache-Control" not in response.headers and not response.content:
self.log.info("blank page")
raise exception.AuthorizationError()
return response
@@ -95,7 +95,11 @@ class ExhentaiExtractor(Extractor):
self.cookies.clear()
response = self.request(url, method="POST", headers=headers, data=data)
- if b"You are now logged in as:" not in response.content:
+ content = response.content
+ if b"You are now logged in as:" not in content:
+ if b"The captcha was not entered correctly" in content:
+ raise exception.AuthenticationError(
+ "CAPTCHA required. Use cookies instead.")
raise exception.AuthenticationError()
# collect more cookies
@@ -437,7 +441,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
raise exception.AuthorizationError()
if page.startswith(("Key missing", "Gallery not found")):
raise exception.NotFoundError("gallery")
- if "hentai.org/mpv/" in page:
+ if page.count("hentai.org/mpv/") > 1:
self.log.warning("Enabled Multi-Page Viewer is not supported")
return page
diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py
index 715abcb..85dd896 100644
--- a/gallery_dl/extractor/foolfuuka.py
+++ b/gallery_dl/extractor/foolfuuka.py
@@ -117,8 +117,8 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor):
def __init__(self, match):
FoolfuukaExtractor.__init__(self, match)
- self.board = match.group(match.lastindex-1)
- self.thread = match.group(match.lastindex)
+ self.board = self.groups[-2]
+ self.thread = self.groups[-1]
self.data = None
def metadata(self):
@@ -140,20 +140,22 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor):
class FoolfuukaBoardExtractor(FoolfuukaExtractor):
"""Base extractor for FoolFuuka based boards/archives"""
subcategory = "board"
- pattern = BASE_PATTERN + r"/([^/?#]+)/\d*$"
+ pattern = BASE_PATTERN + r"/([^/?#]+)(?:/(?:page/)?(\d*))?$"
example = "https://archived.moe/a/"
def __init__(self, match):
FoolfuukaExtractor.__init__(self, match)
- self.board = match.group(match.lastindex)
+ self.board = self.groups[-2]
+ self.page = self.groups[-1]
def items(self):
index_base = "{}/_/api/chan/index/?board={}&page=".format(
self.root, self.board)
thread_base = "{}/{}/thread/".format(self.root, self.board)
- for page in itertools.count(1):
- with self.request(index_base + format(page)) as response:
+ page = self.page
+ for pnum in itertools.count(text.parse_int(page, 1)):
+ with self.request(index_base + format(pnum)) as response:
try:
threads = response.json()
except ValueError:
@@ -167,6 +169,9 @@ class FoolfuukaBoardExtractor(FoolfuukaExtractor):
thread["_extractor"] = FoolfuukaThreadExtractor
yield Message.Queue, thread["url"], thread
+ if page:
+ return
+
class FoolfuukaSearchExtractor(FoolfuukaExtractor):
"""Base extractor for search results on FoolFuuka based boards/archives"""
@@ -179,17 +184,16 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor):
def __init__(self, match):
FoolfuukaExtractor.__init__(self, match)
self.params = params = {}
- args = match.group(match.lastindex).split("/")
- key = None
- for arg in args:
+ key = None
+ for arg in self.groups[-1].split("/"):
if key:
params[key] = text.unescape(arg)
key = None
else:
key = arg
- board = match.group(match.lastindex-1)
+ board = self.groups[-2]
if board != "_":
params["boards"] = board
diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py
index 56721d0..6040187 100644
--- a/gallery_dl/extractor/furaffinity.py
+++ b/gallery_dl/extractor/furaffinity.py
@@ -11,7 +11,7 @@
from .common import Extractor, Message
from .. import text, util
-BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?furaffinity\.net"
+BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?(?:f[ux]|f?xfu)raffinity\.net"
class FuraffinityExtractor(Extractor):
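The widened BASE_PATTERN accepts the usual embed-fix mirror spellings alongside the canonical host. A quick check of what it matches:

    import re

    BASE_PATTERN = (r"(?:https?://)?(?:www\.|sfw\.)?"
                    r"(?:f[ux]|f?xfu)raffinity\.net")
    for host in ("furaffinity.net", "fxraffinity.net",
                 "xfuraffinity.net", "fxfuraffinity.net"):
        print(host, bool(re.match(BASE_PATTERN + r"$", host)))  # all True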
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index 2459a61..37c776e 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -51,19 +51,44 @@ class GelbooruBase():
params["pid"] = self.page_start
params["limit"] = self.per_page
limit = self.per_page // 2
+ pid = False
+
+ if "tags" in params:
+ tags = params["tags"].split()
+ op = "<"
+ id = False
+
+ for tag in tags:
+ if tag.startswith("sort:"):
+ if tag == "sort:id:asc":
+ op = ">"
+ elif tag == "sort:id" or tag.startswith("sort:id:"):
+ op = "<"
+ else:
+ pid = True
+ elif tag.startswith("id:"):
+ id = True
+
+ if not pid:
+ if id:
+ tag = "id:" + op
+ tags = [t for t in tags if not t.startswith(tag)]
+ tags = "{} id:{}".format(" ".join(tags), op)
while True:
posts = self._api_request(params)
- for post in posts:
- yield post
+ yield from posts
if len(posts) < limit:
return
- if "pid" in params:
- del params["pid"]
- params["tags"] = "{} id:<{}".format(self.tags, post["id"])
+ if pid:
+ params["pid"] += 1
+ else:
+ if "pid" in params:
+ del params["pid"]
+ params["tags"] = tags + str(posts[-1]["id"])
def _pagination_html(self, params):
url = self.root + "/index.php"
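Gelbooru caps how deep pid-based paging can go, so unless the query itself demands page numbers (a non-id sort: tag), pagination switches to an id cursor: drop pid and constrain follow-up requests with id:<LAST (or id:>LAST when sorting ascending). The cursor strategy in isolation, with a stand-in API call:

    def paginate_by_id(api_request, tags, limit=100, ascending=False):
        # api_request(params) -> list of posts ordered by id
        op = ">" if ascending else "<"
        params = {"tags": tags, "limit": limit}
        while True:
            posts = api_request(params)
            yield from posts
            if len(posts) < limit // 2:
                return
            # replace pid-style paging with an id cursor on the last post
            params["tags"] = "{} id:{}{}".format(tags, op, posts[-1]["id"])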
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index 7ab6d02..8d8b8ad 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -25,7 +25,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
self.api_root = self.config_instance("api_root") or self.root
if self.category == "realbooru":
- self._file_url = self._file_url_realbooru
+ self.items = self._items_realbooru
self._tags = self._tags_realbooru
def _api_request(self, params):
@@ -124,6 +124,35 @@ class GelbooruV02Extractor(booru.BooruExtractor):
self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2])
return url
+ def _items_realbooru(self):
+ from .common import Message
+ data = self.metadata()
+
+ for post in self.posts():
+ try:
+ html = self._html(post)
+ fallback = post["file_url"]
+ url = post["file_url"] = text.rextract(
+ html, 'href="', '"', html.index(">Original<"))[0]
+ except Exception:
+ self.log.debug("Unable to fetch download URL for post %s "
+ "(md5: %s)", post.get("id"), post.get("md5"))
+ continue
+
+ text.nameext_from_url(url, post)
+ post.update(data)
+ self._prepare(post)
+ self._tags(post, html)
+
+ path = url.rpartition("/")[0]
+ post["_fallback"] = (
+ "{}/{}.{}".format(path, post["md5"], post["extension"]),
+ fallback,
+ )
+
+ yield Message.Directory, post
+ yield Message.Url, url, post
+
def _tags_realbooru(self, post, page):
tag_container = text.extr(page, 'id="tagLink"', '</div>')
tags = collections.defaultdict(list)
diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py
index aadce6c..4a9759f 100644
--- a/gallery_dl/extractor/hiperdex.py
+++ b/gallery_dl/extractor/hiperdex.py
@@ -6,7 +6,7 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://hiperdex.com/"""
+"""Extractors for https://hiperdex.top/"""
from .common import ChapterExtractor, MangaExtractor
from .. import text
@@ -14,18 +14,18 @@ from ..cache import memcache
import re
BASE_PATTERN = (r"((?:https?://)?(?:www\.)?"
- r"(?:1st)?hiperdex\d?\.(?:com|net|info))")
+ r"(?:1st)?hiperdex\d?\.(?:com|net|info|top))")
class HiperdexBase():
"""Base class for hiperdex extractors"""
category = "hiperdex"
- root = "https://hiperdex.com"
+ root = "https://hiperdex.top"
@memcache(keyarg=1)
def manga_data(self, manga, page=None):
if not page:
- url = "{}/mangas/{}/".format(self.root, manga)
+ url = "{}/manga/{}/".format(self.root, manga)
page = self.request(url).text
extr = text.extract_from(page)
@@ -67,9 +67,9 @@ class HiperdexBase():
class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
- """Extractor for manga chapters from hiperdex.com"""
+ """Extractor for hiperdex manga chapters"""
pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+)/([^/?#]+))"
- example = "https://hiperdex.com/mangas/MANGA/CHAPTER/"
+ example = "https://hiperdex.top/manga/MANGA/CHAPTER/"
def __init__(self, match):
root, path, self.manga, self.chapter = match.groups()
@@ -88,10 +88,10 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
class HiperdexMangaExtractor(HiperdexBase, MangaExtractor):
- """Extractor for manga from hiperdex.com"""
+ """Extractor for hiperdex manga"""
chapterclass = HiperdexChapterExtractor
pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+))/?$"
- example = "https://hiperdex.com/mangas/MANGA/"
+ example = "https://hiperdex.top/manga/MANGA/"
def __init__(self, match):
root, path, self.manga = match.groups()
@@ -121,13 +121,13 @@ class HiperdexMangaExtractor(HiperdexBase, MangaExtractor):
class HiperdexArtistExtractor(HiperdexBase, MangaExtractor):
- """Extractor for an artists's manga on hiperdex.com"""
+ """Extractor for an artists's manga on hiperdex"""
subcategory = "artist"
categorytransfer = False
chapterclass = HiperdexMangaExtractor
reverse = False
pattern = BASE_PATTERN + r"(/manga-a(?:rtist|uthor)/(?:[^/?#]+))"
- example = "https://hiperdex.com/manga-artist/NAME/"
+ example = "https://hiperdex.top/manga-artist/NAME/"
def __init__(self, match):
self.root = text.ensure_http_scheme(match.group(1))
diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py
index 6d3184d..a2b51be 100644
--- a/gallery_dl/extractor/hotleak.py
+++ b/gallery_dl/extractor/hotleak.py
@@ -23,6 +23,7 @@ class HotleakExtractor(Extractor):
def items(self):
for post in self.posts():
+ post["_http_expected_status"] = (404,)
yield Message.Directory, post
yield Message.Url, post["url"], post
diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py
index 86b1edd..481fb1e 100644
--- a/gallery_dl/extractor/imgur.py
+++ b/gallery_dl/extractor/imgur.py
@@ -68,7 +68,7 @@ class ImgurImageExtractor(ImgurExtractor):
filename_fmt = "{category}_{id}{title:?_//}.{extension}"
archive_fmt = "{id}"
pattern = (BASE_PATTERN + r"/(?!gallery|search)"
- r"(?:r/\w+/)?(\w{7}|\w{5})[sbtmlh]?")
+ r"(?:r/\w+/)?(?:[^/?#]+-)?(\w{7}|\w{5})[sbtmlh]?")
example = "https://imgur.com/abcdefg"
def items(self):
@@ -93,7 +93,7 @@ class ImgurAlbumExtractor(ImgurExtractor):
directory_fmt = ("{category}", "{album[id]}{album[title]:? - //}")
filename_fmt = "{category}_{album[id]}_{num:>03}_{id}.{extension}"
archive_fmt = "{album[id]}_{id}"
- pattern = BASE_PATTERN + r"/a/(\w{7}|\w{5})"
+ pattern = BASE_PATTERN + r"/a/(?:[^/?#]+-)?(\w{7}|\w{5})"
example = "https://imgur.com/a/abcde"
def items(self):
@@ -126,7 +126,7 @@ class ImgurAlbumExtractor(ImgurExtractor):
class ImgurGalleryExtractor(ImgurExtractor):
"""Extractor for imgur galleries"""
subcategory = "gallery"
- pattern = BASE_PATTERN + r"/(?:gallery|t/\w+)/(\w{7}|\w{5})"
+ pattern = BASE_PATTERN + r"/(?:gallery|t/\w+)/(?:[^/?#]+-)?(\w{7}|\w{5})"
example = "https://imgur.com/gallery/abcde"
def items(self):
diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py
index 62586af..2ae8cbe 100644
--- a/gallery_dl/extractor/inkbunny.py
+++ b/gallery_dl/extractor/inkbunny.py
@@ -330,15 +330,18 @@ class InkbunnyAPI():
def _call(self, endpoint, params):
url = "https://inkbunny.net/api_" + endpoint + ".php"
params["sid"] = self.session_id
- data = self.extractor.request(url, params=params).json()
- if "error_code" in data:
+ while True:
+ data = self.extractor.request(url, params=params).json()
+
+ if "error_code" not in data:
+ return data
+
if str(data["error_code"]) == "2":
self.authenticate(invalidate=True)
- return self._call(endpoint, params)
- raise exception.StopExtraction(data.get("error_message"))
+ continue
- return data
+ raise exception.StopExtraction(data.get("error_message"))
def _pagination_search(self, params):
params["page"] = 1
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 9c77b7a..b0c24de 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -57,7 +57,7 @@ class KemonopartyExtractor(Extractor):
generators = self._build_file_generators(self.config("files"))
duplicates = self.config("duplicates")
comments = self.config("comments")
- username = dms = None
+ username = dms = announcements = None
# prevent files from being sent with gzip compression
headers = {"Accept-Encoding": "identity"}
@@ -68,6 +68,8 @@ class KemonopartyExtractor(Extractor):
'<meta name="artist_name" content="', '"')[0])
if self.config("dms"):
dms = True
+ if self.config("announcements"):
+ announcements = True
posts = self.posts()
max_posts = self.config("max-posts")
@@ -80,7 +82,7 @@ class KemonopartyExtractor(Extractor):
self.root, post["service"], post["user"], post["id"])
post["_http_headers"] = headers
post["date"] = self._parse_datetime(
- post["published"] or post["added"])
+ post.get("published") or post.get("added") or "")
if username:
post["username"] = username
@@ -88,8 +90,12 @@ class KemonopartyExtractor(Extractor):
post["comments"] = self._extract_comments(post)
if dms is not None:
if dms is True:
- dms = self._extract_dms(post)
+ dms = self._extract_cards(post, "dms")
post["dms"] = dms
+ if announcements is not None:
+ if announcements is True:
+ announcements = self._extract_cards(post, "announcements")
+ post["announcements"] = announcements
files = []
hashes = set()
@@ -156,7 +162,7 @@ class KemonopartyExtractor(Extractor):
def _file(self, post):
file = post["file"]
- if not file:
+ if not file or "path" not in file:
return ()
file["type"] = "file"
return (file,)
@@ -200,21 +206,21 @@ class KemonopartyExtractor(Extractor):
})
return comments
- def _extract_dms(self, post):
- url = "{}/{}/user/{}/dms".format(
- self.root, post["service"], post["user"])
+ def _extract_cards(self, post, type):
+ url = "{}/{}/user/{}/{}".format(
+ self.root, post["service"], post["user"], type)
page = self.request(url).text
- dms = []
- for dm in text.extract_iter(page, "<article", "</article>"):
- footer = text.extr(dm, "<footer", "</footer>")
- dms.append({
+ cards = []
+ for card in text.extract_iter(page, "<article", "</article>"):
+ footer = text.extr(card, "<footer", "</footer>")
+ cards.append({
"body": text.unescape(text.extr(
- dm, "<pre>", "</pre></",
+ card, "<pre>", "</pre></",
).strip()),
- "date": text.extr(footer, 'Published: ', '\n'),
+ "date": text.extr(footer, ': ', '\n'),
})
- return dms
+ return cards
def _parse_datetime(self, date_string):
if len(date_string) > 19:
@@ -494,7 +500,8 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor):
def __init__(self, match):
KemonopartyExtractor.__init__(self, match)
- self.favorites = (text.parse_query(match.group(3)).get("type") or
+ self.params = text.parse_query(match.group(3))
+ self.favorites = (self.params.get("type") or
self.config("favorites") or
"artist")
@@ -502,9 +509,17 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor):
self._prepare_ddosguard_cookies()
self.login()
+ sort = self.params.get("sort")
+ order = self.params.get("order") or "desc"
+
if self.favorites == "artist":
users = self.request(
self.root + "/api/v1/account/favorites?type=artist").json()
+
+ if not sort:
+ sort = "updated"
+ users.sort(key=lambda x: x[sort], reverse=(order == "desc"))
+
for user in users:
user["_extractor"] = KemonopartyUserExtractor
url = "{}/{}/user/{}".format(
@@ -514,6 +529,11 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor):
elif self.favorites == "post":
posts = self.request(
self.root + "/api/v1/account/favorites?type=post").json()
+
+ if not sort:
+ sort = "faved_seq"
+ posts.sort(key=lambda x: x[sort], reverse=(order == "desc"))
+
for post in posts:
post["_extractor"] = KemonopartyPostExtractor
url = "{}/{}/user/{}/post/{}".format(
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
index 030d7d1..cb7f701 100644
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@@ -29,6 +29,7 @@ class MastodonExtractor(BaseExtractor):
self.instance = self.root.partition("://")[2]
self.reblogs = self.config("reblogs", False)
self.replies = self.config("replies", True)
+ self.cards = self.config("cards", False)
def items(self):
for status in self.statuses():
@@ -48,6 +49,17 @@ class MastodonExtractor(BaseExtractor):
if status["reblog"]:
attachments.extend(status["reblog"]["media_attachments"])
+ if self.cards:
+ card = status.get("card")
+ if card:
+ url = card.get("image")
+ if url:
+ card["weburl"] = card.get("url")
+ card["url"] = url
+ card["id"] = "card" + "".join(
+ url.split("/")[6:-2]).lstrip("0")
+ attachments.append(card)
+
status["instance"] = self.instance
acct = status["account"]["acct"]
status["instance_remote"] = \
@@ -120,6 +132,7 @@ class MastodonUserExtractor(MastodonExtractor):
api.account_id_by_username(self.item),
only_media=(
not self.reblogs and
+ not self.cards and
not self.config("text-posts", False)
),
exclude_replies=not self.replies,
@@ -136,6 +149,36 @@ class MastodonBookmarkExtractor(MastodonExtractor):
return MastodonAPI(self).account_bookmarks()
+class MastodonFavoriteExtractor(MastodonExtractor):
+ """Extractor for mastodon favorites"""
+ subcategory = "favorite"
+ pattern = BASE_PATTERN + r"/favourites"
+ example = "https://mastodon.social/favourites"
+
+ def statuses(self):
+ return MastodonAPI(self).account_favorites()
+
+
+class MastodonListExtractor(MastodonExtractor):
+ """Extractor for mastodon lists"""
+ subcategory = "list"
+ pattern = BASE_PATTERN + r"/lists/(\w+)"
+ example = "https://mastodon.social/lists/12345"
+
+ def statuses(self):
+ return MastodonAPI(self).timelines_list(self.item)
+
+
+class MastodonHashtagExtractor(MastodonExtractor):
+ """Extractor for mastodon hashtags"""
+ subcategory = "hashtag"
+ pattern = BASE_PATTERN + r"/tags/(\w+)"
+ example = "https://mastodon.social/tags/NAME"
+
+ def statuses(self):
+ return MastodonAPI(self).timelines_tag(self.item)
+
+
class MastodonFollowingExtractor(MastodonExtractor):
"""Extractor for followed mastodon users"""
subcategory = "following"
@@ -205,37 +248,55 @@ class MastodonAPI():
raise exception.NotFoundError("account")
def account_bookmarks(self):
+ """Statuses the user has bookmarked"""
endpoint = "/v1/bookmarks"
return self._pagination(endpoint, None)
+ def account_favorites(self):
+ """Statuses the user has favourited"""
+ endpoint = "/v1/favourites"
+ return self._pagination(endpoint, None)
+
def account_following(self, account_id):
+ """Accounts which the given account is following"""
endpoint = "/v1/accounts/{}/following".format(account_id)
return self._pagination(endpoint, None)
def account_lookup(self, username):
+ """Quickly lookup a username to see if it is available"""
endpoint = "/v1/accounts/lookup"
params = {"acct": username}
return self._call(endpoint, params).json()
def account_search(self, query, limit=40):
- """Search for accounts"""
+ """Search for matching accounts by username or display name"""
endpoint = "/v1/accounts/search"
params = {"q": query, "limit": limit}
return self._call(endpoint, params).json()
def account_statuses(self, account_id, only_media=True,
exclude_replies=False):
- """Fetch an account's statuses"""
+ """Statuses posted to the given account"""
endpoint = "/v1/accounts/{}/statuses".format(account_id)
- params = {"only_media" : "1" if only_media else "0",
- "exclude_replies": "1" if exclude_replies else "0"}
+ params = {"only_media" : "true" if only_media else "false",
+ "exclude_replies": "true" if exclude_replies else "false"}
return self._pagination(endpoint, params)
def status(self, status_id):
- """Fetch a status"""
+ """Obtain information about a status"""
endpoint = "/v1/statuses/" + status_id
return self._call(endpoint).json()
+ def timelines_list(self, list_id):
+ """View statuses in the given list timeline"""
+ endpoint = "/v1/timelines/list/" + list_id
+ return self._pagination(endpoint, None)
+
+ def timelines_tag(self, hashtag):
+ """View public statuses containing the given hashtag"""
+ endpoint = "/v1/timelines/tag/" + hashtag
+ return self._pagination(endpoint, None)
+
def _call(self, endpoint, params=None):
if endpoint.startswith("http"):
url = endpoint
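The new favorite, list, and hashtag extractors map onto stock Mastodon REST endpoints, all paginated the same way: keep following the rel="next" URL from the Link response header. A self-contained sketch against a generic instance (instance URL and token are placeholders):

    import requests

    def mastodon_api(instance, endpoint, token, params=None):
        # endpoint e.g. "/api/v1/favourites" or "/api/v1/timelines/tag/NAME"
        url = instance + endpoint
        headers = {"Authorization": "Bearer " + token}
        while url:
            response = requests.get(url, params=params, headers=headers)
            response.raise_for_status()
            yield from response.json()
            # subsequent pages come from the Link header's rel="next" URL
            url = response.links.get("next", {}).get("url")
            params = None  # the next-URL already carries its parameters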
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 4cdcf87..7ac3a3a 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -102,30 +102,55 @@ class NewgroundsExtractor(Extractor):
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)
- url = self.root + "/passport/"
+ url = self.root + "/passport"
response = self.request(url)
if response.history and response.url.endswith("/social"):
return self.cookies
page = response.text
- headers = {"Origin": self.root, "Referer": url}
+ headers = {
+ "Accept": "application/json, text/javascript, */*; q=0.01",
+ "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
+ "X-Requested-With": "XMLHttpRequest",
+ "Origin": self.root,
+ "Referer": url,
+ }
url = text.urljoin(self.root, text.extr(page, 'action="', '"'))
data = {
- "username": username,
- "password": password,
- "remember": "1",
- "login" : "1",
"auth" : text.extr(page, 'name="auth" value="', '"'),
+ "remember": "1",
+ "username": username,
+ "password": str(password),
+ "code" : "",
+ "codehint": "------",
+ "mfaCheck": "1",
}
- response = self.request(url, method="POST", headers=headers, data=data)
- if not response.history:
- raise exception.AuthenticationError()
+ while True:
+ response = self.request(
+ url, method="POST", headers=headers, data=data)
+ result = response.json()
+
+ if result.get("success"):
+ break
+ if "errors" in result:
+ raise exception.AuthenticationError(
+ '"' + '", "'.join(result["errors"]) + '"')
+
+ if result.get("requiresMfa"):
+ data["code"] = self.input("Verification Code: ")
+ data["codehint"] = " "
+ elif result.get("requiresEmailMfa"):
+ email = result.get("obfuscatedEmail")
+ prompt = "Email Verification Code ({}): ".format(email)
+ data["code"] = self.input(prompt)
+ data["codehint"] = " "
+
+ data.pop("mfaCheck", None)
return {
cookie.name: cookie.value
- for cookie in response.history[0].cookies
- if cookie.expires and cookie.domain == self.cookies_domain
+ for cookie in response.cookies
}
def extract_post(self, post_url):
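The login flow is now a loop that answers the endpoint's JSON MFA challenges until it reports success, prompting for codes via the new Extractor.input helper. Schematically, with a stand-in post callable:

    def login_with_mfa(post, data, prompt=input):
        # post(data) -> parsed JSON response of the login endpoint
        while True:
            result = post(data)
            if result.get("success"):
                return result
            if "errors" in result:
                raise ValueError('"' + '", "'.join(result["errors"]) + '"')
            if result.get("requiresMfa"):
                data["code"] = prompt("Verification Code: ")
            elif result.get("requiresEmailMfa"):
                data["code"] = prompt("Email Verification Code: ")
            data.pop("mfaCheck", None)  # only sent on the first attempt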
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index 8c8a5a9..5571575 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -110,7 +110,7 @@ class OAuthBase(Extractor):
# get a request token
params = {"oauth_callback": self.redirect_uri}
- data = self.session.get(request_token_url, params=params).text
+ data = self.request(request_token_url, params=params).text
data = text.parse_query(data)
self.session.auth.token_secret = data["oauth_token_secret"]
@@ -120,7 +120,7 @@ class OAuthBase(Extractor):
data = self.open(authorize_url, params)
# exchange the request token for an access token
- data = self.session.get(access_token_url, params=data).text
+ data = self.request(access_token_url, params=data).text
data = text.parse_query(data)
token = data["oauth_token"]
token_secret = data["oauth_token_secret"]
@@ -189,7 +189,8 @@ class OAuthBase(Extractor):
data["client_id"] = client_id
data["client_secret"] = client_secret
- data = self.session.post(token_url, data=data, auth=auth).json()
+ data = self.request(
+ token_url, method="POST", data=data, auth=auth).json()
# check token response
if "error" in data:
@@ -386,7 +387,7 @@ class OAuthMastodon(OAuthBase):
"redirect_uris": self.redirect_uri,
"scopes": "read",
}
- data = self.session.post(url, data=data).json()
+ data = self.request(url, method="POST", data=data).json()
if "client_id" not in data or "client_secret" not in data:
raise exception.StopExtraction(
@@ -441,7 +442,8 @@ class OAuthPixiv(OAuthBase):
"redirect_uri" : "https://app-api.pixiv.net"
"/web/v1/users/auth/pixiv/callback",
}
- data = self.session.post(url, headers=headers, data=data).json()
+ data = self.request(
+ url, method="POST", headers=headers, data=data).json()
if "error" in data:
stdout_write("\n{}\n".format(data))
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 62d11f2..eb6d677 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -263,8 +263,9 @@ class PatreonExtractor(Extractor):
page, 'id="__NEXT_DATA__" type="application/json">', '</script')
if data:
try:
- return (util.json_loads(data)["props"]["pageProps"]
- ["bootstrapEnvelope"]["bootstrap"])
+ data = util.json_loads(data)
+ env = data["props"]["pageProps"]["bootstrapEnvelope"]
+ return env.get("pageBootstrap") or env["bootstrap"]
except Exception as exc:
self.log.debug("%s: %s", exc.__class__.__name__, exc)
diff --git a/gallery_dl/extractor/pixeldrain.py b/gallery_dl/extractor/pixeldrain.py
index 5cfdc43..83f3577 100644
--- a/gallery_dl/extractor/pixeldrain.py
+++ b/gallery_dl/extractor/pixeldrain.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2023 Mike Fährmann
+# Copyright 2023-2024 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -59,12 +59,13 @@ class PixeldrainAlbumExtractor(PixeldrainExtractor):
directory_fmt = ("{category}",
"{album[date]:%Y-%m-%d} {album[title]} ({album[id]})")
filename_fmt = "{num:>03} {filename[:230]} ({id}).{extension}"
- pattern = BASE_PATTERN + r"/(?:l|api/list)/(\w+)"
+ pattern = BASE_PATTERN + r"/(?:l|api/list)/(\w+)(?:#item=(\d+))?"
example = "https://pixeldrain.com/l/abcdefgh"
def __init__(self, match):
Extractor.__init__(self, match)
self.album_id = match.group(1)
+ self.file_index = match.group(2)
def items(self):
url = "{}/api/list/{}".format(self.root, self.album_id)
@@ -74,11 +75,20 @@ class PixeldrainAlbumExtractor(PixeldrainExtractor):
album["count"] = album["file_count"]
album["date"] = self.parse_datetime(album["date_created"])
+ if self.file_index:
+ idx = text.parse_int(self.file_index)
+ try:
+ files = (files[idx],)
+ except LookupError:
+ files = ()
+ else:
+ idx = 0
+
del album["files"]
del album["file_count"]
yield Message.Directory, {"album": album}
- for num, file in enumerate(files, 1):
+ for num, file in enumerate(files, idx+1):
file["album"] = album
file["num"] = num
file["url"] = url = "{}/api/file/{}?download".format(
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index 862a7db..d732894 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -104,8 +104,9 @@ class PixivExtractor(Extractor):
elif work["page_count"] == 1:
url = meta_single_page["original_image_url"]
if url == url_sanity:
- self.log.debug("Skipping 'sanity_level' warning (%s)",
- work["id"])
+ self.log.warning(
+ "Unable to download work %s ('sanity_level' warning)",
+ work["id"])
continue
work["date_url"] = self._date_from_url(url)
yield Message.Url, url, text.nameext_from_url(url, work)
@@ -619,6 +620,7 @@ class PixivNovelExtractor(PixivExtractor):
meta_user = self.config("metadata")
meta_bookmark = self.config("metadata-bookmark")
embeds = self.config("embeds")
+ covers = self.config("covers")
if embeds:
headers = {
@@ -658,6 +660,19 @@ class PixivNovelExtractor(PixivExtractor):
novel["extension"] = "txt"
yield Message.Url, "text:" + content, novel
+ if covers:
+ path = novel["image_urls"]["large"].partition("/img/")[2]
+ url = ("https://i.pximg.net/novel-cover-original/img/" +
+ path.rpartition(".")[0].replace("_master1200", ""))
+ novel["date_url"] = self._date_from_url(url)
+ novel["num"] += 1
+ novel["suffix"] = "_p{:02}".format(novel["num"])
+ novel["_fallback"] = (url + ".png",)
+ url_jpg = url + ".jpg"
+ text.nameext_from_url(url_jpg, novel)
+ yield Message.Url, url_jpg, novel
+ del novel["_fallback"]
+
if embeds:
desktop = False
illusts = {}
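Novel covers are derived from the image_urls["large"] thumbnail: its /img/... path is grafted onto the novel-cover-original tree, the _master1200 suffix dropped, and .jpg tried first with .png as fallback since the original extension is unknown. The derivation in isolation (the sample URL shape is an assumption):

    def cover_urls(large_url):
        # e.g. ".../novel-cover-master/img/2024/01/01/00/00/00/123_master1200.jpg"
        path = large_url.partition("/img/")[2]
        base = ("https://i.pximg.net/novel-cover-original/img/" +
                path.rpartition(".")[0].replace("_master1200", ""))
        return base + ".jpg", base + ".png"  # primary URL, fallback URL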
diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py
index f42016f..bd22283 100644
--- a/gallery_dl/extractor/poipiku.py
+++ b/gallery_dl/extractor/poipiku.py
@@ -23,6 +23,12 @@ class PoipikuExtractor(Extractor):
archive_fmt = "{post_id}_{num}"
request_interval = (0.5, 1.5)
+ def _init(self):
+ self.cookies.set(
+ "LANG", "en", domain="poipiku.com")
+ self.cookies.set(
+ "POIPIKU_CONTENTS_VIEW_MODE", "1", domain="poipiku.com")
+
def items(self):
password = self.config("password", "")
@@ -59,7 +65,7 @@ class PoipikuExtractor(Extractor):
"//img.", "//img-org.", 1)
yield Message.Url, url, text.nameext_from_url(url, post)
- if not extr(' show all(+', '<'):
+ if not extr('ShowAppendFile', '<'):
continue
url = self.root + "/f/ShowAppendFileF.jsp"
diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py
index 3569860..115de9a 100644
--- a/gallery_dl/extractor/readcomiconline.py
+++ b/gallery_dl/extractor/readcomiconline.py
@@ -35,10 +35,7 @@ class ReadcomiconlineBase():
self.log.warning(
"Redirect to \n%s\nVisit this URL in your browser, solve "
"the CAPTCHA, and press ENTER to continue", response.url)
- try:
- input()
- except (EOFError, OSError):
- pass
+ self.input()
else:
raise exception.StopExtraction(
"Redirect to \n%s\nVisit this URL in your browser and "
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index e099c7e..ce602f6 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -74,8 +74,8 @@ class RedditExtractor(Extractor):
yield Message.Url, url, submission
elif "gallery_data" in media:
- for submission["num"], url in enumerate(
- self._extract_gallery(media), 1):
+ for url in self._extract_gallery(media):
+ submission["num"] += 1
text.nameext_from_url(url, submission)
yield Message.Url, url, submission
@@ -99,7 +99,10 @@ class RedditExtractor(Extractor):
urls.append((url, submission))
for comment in comments:
html = comment["body_html"] or ""
- if ' href="' in html:
+ href = (' href="' in html)
+ media = ("media_metadata" in comment)
+
+ if media or href:
comment["date"] = text.parse_timestamp(
comment["created_utc"])
if submission:
@@ -107,6 +110,14 @@ class RedditExtractor(Extractor):
data["comment"] = comment
else:
data = comment
+
+ if media:
+ for embed in self._extract_embed(comment):
+ submission["num"] += 1
+ text.nameext_from_url(embed, submission)
+ yield Message.Url, embed, submission
+
+ if href:
for url in text.extract_iter(html, ' href="', '"'):
urls.append((url, data))
@@ -118,6 +129,7 @@ class RedditExtractor(Extractor):
if url.startswith((
"https://www.reddit.com/message/compose",
"https://reddit.com/message/compose",
+ "https://preview.redd.it/",
)):
continue
@@ -172,6 +184,27 @@ class RedditExtractor(Extractor):
submission["id"], item["media_id"])
self.log.debug(src)
+ def _extract_embed(self, submission):
+ meta = submission["media_metadata"]
+ if not meta:
+ return
+
+ for mid, data in meta.items():
+ if data["status"] != "valid" or "s" not in data:
+ self.log.warning(
+ "embed %s: skipping item %s (status: %s)",
+ submission["id"], mid, data.get("status"))
+ continue
+ src = data["s"]
+ url = src.get("u") or src.get("gif") or src.get("mp4")
+ if url:
+ yield url.partition("?")[0].replace("/preview.", "/i.", 1)
+ else:
+ self.log.error(
+ "embed %s: unable to fetch download URL for item %s",
+ submission["id"], mid)
+ self.log.debug(src)
+
def _extract_video_ytdl(self, submission):
return "https://www.reddit.com" + submission["permalink"]
@@ -454,14 +487,14 @@ class RedditAPI():
remaining = response.headers.get("x-ratelimit-remaining")
if remaining and float(remaining) < 2:
- if self._warn_429:
- self._warn_429 = False
+ self.log.warning("API rate limit exceeded")
+ if self._warn_429 and self.client_id == self.CLIENT_ID:
self.log.info(
"Register your own OAuth application and use its "
"credentials to prevent this error: "
- "https://github.com/mikf/gallery-dl/blob/master"
- "/docs/configuration.rst"
- "#extractorredditclient-id--user-agent")
+ "https://gdl-org.github.io/docs/configuration.html"
+ "#extractor-reddit-client-id-user-agent")
+ self._warn_429 = False
self.extractor.wait(
seconds=response.headers["x-ratelimit-reset"])
continue
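Comments can now carry inline media under media_metadata; each valid entry exposes its sources under "s", and preview URLs are rewritten to the direct i.redd.it host. The extraction reduced to one mapping:

    def embed_urls(media_metadata):
        for mid, data in media_metadata.items():
            if data.get("status") != "valid" or "s" not in data:
                continue  # skip failed or still-processing items
            src = data["s"]
            url = src.get("u") or src.get("gif") or src.get("mp4")
            if url:
                # strip the query string, swap the preview host
                yield url.partition("?")[0].replace("/preview.", "/i.", 1)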
diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py
index edfe1dc..23ba340 100644
--- a/gallery_dl/extractor/seiga.py
+++ b/gallery_dl/extractor/seiga.py
@@ -10,6 +10,7 @@
from .common import Extractor, Message
from .. import text, util, exception
+from ..cache import cache
class SeigaExtractor(Extractor):
@@ -17,6 +18,7 @@ class SeigaExtractor(Extractor):
category = "seiga"
archive_fmt = "{image_id}"
cookies_domain = ".nicovideo.jp"
+ cookies_names = ("user_session",)
root = "https://seiga.nicovideo.jp"
def __init__(self, match):
@@ -24,8 +26,7 @@ class SeigaExtractor(Extractor):
self.start_image = 0
def items(self):
- if not self.cookies_check(("user_session",)):
- raise exception.StopExtraction("'user_session' cookie required")
+ self.login()
images = iter(self.get_images())
data = next(images)
@@ -50,6 +51,59 @@ class SeigaExtractor(Extractor):
"HTTP redirect to login page (%s)", location.partition("?")[0])
return location.replace("/o/", "/priv/", 1)
+ def login(self):
+ if self.cookies_check(self.cookies_names):
+ return
+
+ username, password = self._get_auth_info()
+ if username:
+ return self.cookies_update(self._login_impl(username, password))
+
+ raise exception.AuthorizationError(
+ "username & password or 'user_session' cookie required")
+
+ @cache(maxage=365*86400, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+
+ root = "https://account.nicovideo.jp"
+ response = self.request(root + "/login?site=seiga")
+ page = response.text
+
+ data = {
+ "mail_tel": username,
+ "password": password,
+ }
+ url = root + text.unescape(text.extr(page, '<form action="', '"'))
+ response = self.request(url, method="POST", data=data)
+
+ if "message=cant_login" in response.url:
+ raise exception.AuthenticationError()
+
+ if "/mfa" in response.url:
+ page = response.text
+ email = text.extr(page, 'class="userAccount">', "<")
+ code = self.input("Email Confirmation Code ({}): ".format(email))
+
+ data = {
+ "otp": code,
+ "loginBtn": "Login",
+ "device_name": "gdl",
+ }
+ url = root + text.unescape(text.extr(page, '<form action="', '"'))
+ response = self.request(url, method="POST", data=data)
+
+ if not response.history and \
+ b"Confirmation code is incorrect" in response.content:
+ raise exception.AuthenticationError(
+ "Incorrect Confirmation Code")
+
+ return {
+ cookie.name: cookie.value
+ for cookie in self.cookies
+ if cookie.expires and cookie.domain == self.cookies_domain
+ }
+
class SeigaUserExtractor(SeigaExtractor):
"""Extractor for images of a user from seiga.nicovideo.jp"""
diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py
index b56ed27..e5e7a6b 100644
--- a/gallery_dl/extractor/slideshare.py
+++ b/gallery_dl/extractor/slideshare.py
@@ -47,13 +47,13 @@ class SlidesharePresentationExtractor(GalleryExtractor):
}
def images(self, page):
- parts = self.slideshow["slideImages"][0]["baseUrl"].split("/")
-
- begin = "{}/95/{}-".format(
- "/".join(parts[:4]),
- self.slideshow["strippedTitle"],
+ slides = self.slideshow["slides"]
+ begin = "{}/{}/95/{}-".format(
+ slides["host"],
+ slides["imageLocation"],
+ slides["title"],
)
- end = "-1024.jpg?" + parts[-1].rpartition("?")[2]
+ end = "-1024.jpg"
return [
(begin + str(n) + end, None)
diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py
index d4adfed..0abb3ab 100644
--- a/gallery_dl/extractor/subscribestar.py
+++ b/gallery_dl/extractor/subscribestar.py
@@ -43,6 +43,8 @@ class SubscribestarExtractor(Extractor):
item.update(data)
item["num"] = num
text.nameext_from_url(item.get("name") or item["url"], item)
+ if item["url"][0] == "/":
+ item["url"] = self.root + item["url"]
yield Message.Url, item["url"], item
def posts(self):
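Host-relative attachment paths are now resolved against the site root before being yielded. In isolation (the root value is assumed):

    def absolutize(url, root="https://subscribestar.adult"):
        return root + url if url.startswith("/") else url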
diff --git a/gallery_dl/extractor/tapas.py b/gallery_dl/extractor/tapas.py
index 0a9df20..167953d 100644
--- a/gallery_dl/extractor/tapas.py
+++ b/gallery_dl/extractor/tapas.py
@@ -151,3 +151,18 @@ class TapasEpisodeExtractor(TapasExtractor):
def episode_ids(self):
return (self.episode_id,)
+
+
+class TapasCreatorExtractor(TapasExtractor):
+ subcategory = "creator"
+ pattern = BASE_PATTERN + r"/(?!series|episode)([^/?#]+)"
+ example = "https://tapas.io/CREATOR"
+
+ def items(self):
+ url = "{}/{}/series".format(self.root, self.groups[0])
+ page = self.request(url).text
+ page = text.extr(page, '<ul class="content-list-wrap', "</ul>")
+
+ data = {"_extractor": TapasSeriesExtractor}
+ for path in text.extract_iter(page, ' href="', '"'):
+ yield Message.Queue, self.root + path, data
diff --git a/gallery_dl/extractor/tcbscans.py b/gallery_dl/extractor/tcbscans.py
index a3ef26c..de6f3ee 100644
--- a/gallery_dl/extractor/tcbscans.py
+++ b/gallery_dl/extractor/tcbscans.py
@@ -30,7 +30,7 @@ class TcbscansChapterExtractor(ChapterExtractor):
page, 'font-bold mt-8">', "</h1>").rpartition(" - Chapter ")
chapter, sep, minor = chapter.partition(".")
return {
- "manga": text.unescape(manga),
+ "manga": text.unescape(manga).strip(),
"chapter": text.parse_int(chapter),
"chapter_minor": sep + minor,
"lang": "en", "language": "English",
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index fee0145..c34910f 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -447,9 +447,9 @@ class TumblrAPI(oauth.OAuth1API):
if api_key == self.API_KEY:
self.log.info(
"Register your own OAuth application and use its "
- "credentials to prevent this error: https://githu"
- "b.com/mikf/gallery-dl/blob/master/docs/configurat"
- "ion.rst#extractortumblrapi-key--api-secret")
+ "credentials to prevent this error: "
+ "https://gdl-org.github.io/docs/configuration.html"
+ "#extractor-tumblr-api-key-api-secret")
if self.extractor.config("ratelimit") == "wait":
self.extractor.wait(seconds=reset)
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index a5bd984..ff77828 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -6,17 +6,18 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://twitter.com/"""
+"""Extractors for https://x.com/"""
from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache, memcache
import itertools
+import random
import json
import re
BASE_PATTERN = (r"(?:https?://)?(?:www\.|mobile\.)?"
- r"(?:(?:[fv]x)?twitter|(?:fixup)?x)\.com")
+ r"(?:(?:[fv]x)?twitter|(?:fix(?:up|v))?x)\.com")
class TwitterExtractor(Extractor):
@@ -25,9 +26,9 @@ class TwitterExtractor(Extractor):
directory_fmt = ("{category}", "{user[name]}")
filename_fmt = "{tweet_id}_{num}.{extension}"
archive_fmt = "{tweet_id}_{retweet_id}_{num}"
- cookies_domain = ".twitter.com"
+ cookies_domain = ".x.com"
cookies_names = ("auth_token",)
- root = "https://twitter.com"
+ root = "https://x.com"
browser = "firefox"
def __init__(self, match):
@@ -243,8 +244,8 @@ class TwitterExtractor(Extractor):
# collect URLs from entities
for url in tweet["entities"].get("urls") or ():
- url = url["expanded_url"]
- if "//twitpic.com/" not in url or "/photos/" in url:
+ url = url.get("expanded_url") or url.get("url") or ""
+ if not url or "//twitpic.com/" not in url or "/photos/" in url:
continue
if url.startswith("http:"):
url = "https" + url[4:]
@@ -336,12 +337,20 @@ class TwitterExtractor(Extractor):
urls = entities.get("urls")
if urls:
for url in urls:
- content = content.replace(url["url"], url["expanded_url"])
+ try:
+ content = content.replace(url["url"], url["expanded_url"])
+ except KeyError:
+ pass
txt, _, tco = content.rpartition(" ")
tdata["content"] = txt if tco.startswith("https://t.co/") else content
if "birdwatch_pivot" in tweet:
- tdata["birdwatch"] = tweet["birdwatch_pivot"]["subtitle"]["text"]
+ try:
+ tdata["birdwatch"] = \
+ tweet["birdwatch_pivot"]["subtitle"]["text"]
+ except KeyError:
+ self.log.debug("Unable to extract 'birdwatch' note from %s",
+ tweet["birdwatch_pivot"])
if "in_reply_to_screen_name" in legacy:
tdata["reply_to"] = legacy["in_reply_to_screen_name"]
if "quoted_by" in legacy:
@@ -398,7 +407,10 @@ class TwitterExtractor(Extractor):
urls = entities["description"].get("urls")
if urls:
for url in urls:
- descr = descr.replace(url["url"], url["expanded_url"])
+ try:
+ descr = descr.replace(url["url"], url["expanded_url"])
+ except KeyError:
+ pass
udata["description"] = descr
if "url" in entities:
@@ -483,7 +495,13 @@ class TwitterExtractor(Extractor):
username, password = self._get_auth_info()
if username:
- self.cookies_update(_login_impl(self, username, password))
+ return self.cookies_update(_login_impl(self, username, password))
+
+ for cookie in self.cookies:
+ if cookie.domain == ".twitter.com":
+ self.cookies.set(
+ cookie.name, cookie.value, domain=self.cookies_domain,
+ expires=cookie.expires, secure=cookie.secure)
class TwitterUserExtractor(TwitterExtractor):
@@ -491,7 +509,7 @@ class TwitterUserExtractor(TwitterExtractor):
subcategory = "user"
pattern = (BASE_PATTERN + r"/(?!search)(?:([^/?#]+)/?(?:$|[?#])"
r"|i(?:/user/|ntent/user\?user_id=)(\d+))")
- example = "https://twitter.com/USER"
+ example = "https://x.com/USER"
def __init__(self, match):
TwitterExtractor.__init__(self, match)
@@ -519,7 +537,7 @@ class TwitterTimelineExtractor(TwitterExtractor):
"""Extractor for a Twitter user timeline"""
subcategory = "timeline"
pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/timeline(?!\w)"
- example = "https://twitter.com/USER/timeline"
+ example = "https://x.com/USER/timeline"
def tweets(self):
# yield initial batch of (media) tweets
@@ -566,7 +584,7 @@ class TwitterTweetsExtractor(TwitterExtractor):
"""Extractor for Tweets from a user's Tweets timeline"""
subcategory = "tweets"
pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/tweets(?!\w)"
- example = "https://twitter.com/USER/tweets"
+ example = "https://x.com/USER/tweets"
def tweets(self):
return self.api.user_tweets(self.user)
@@ -576,7 +594,7 @@ class TwitterRepliesExtractor(TwitterExtractor):
"""Extractor for Tweets from a user's timeline including replies"""
subcategory = "replies"
pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/with_replies(?!\w)"
- example = "https://twitter.com/USER/with_replies"
+ example = "https://x.com/USER/with_replies"
def tweets(self):
return self.api.user_tweets_and_replies(self.user)
@@ -586,7 +604,7 @@ class TwitterMediaExtractor(TwitterExtractor):
"""Extractor for Tweets from a user's Media timeline"""
subcategory = "media"
pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/media(?!\w)"
- example = "https://twitter.com/USER/media"
+ example = "https://x.com/USER/media"
def tweets(self):
return self.api.user_media(self.user)
@@ -596,7 +614,7 @@ class TwitterLikesExtractor(TwitterExtractor):
"""Extractor for liked tweets"""
subcategory = "likes"
pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/likes(?!\w)"
- example = "https://twitter.com/USER/likes"
+ example = "https://x.com/USER/likes"
def metadata(self):
return {"user_likes": self.user}
@@ -609,7 +627,7 @@ class TwitterBookmarkExtractor(TwitterExtractor):
"""Extractor for bookmarked tweets"""
subcategory = "bookmark"
pattern = BASE_PATTERN + r"/i/bookmarks()"
- example = "https://twitter.com/i/bookmarks"
+ example = "https://x.com/i/bookmarks"
def tweets(self):
return self.api.user_bookmarks()
@@ -625,7 +643,7 @@ class TwitterListExtractor(TwitterExtractor):
"""Extractor for Twitter lists"""
subcategory = "list"
pattern = BASE_PATTERN + r"/i/lists/(\d+)/?$"
- example = "https://twitter.com/i/lists/12345"
+ example = "https://x.com/i/lists/12345"
def tweets(self):
return self.api.list_latest_tweets_timeline(self.user)
@@ -635,7 +653,7 @@ class TwitterListMembersExtractor(TwitterExtractor):
"""Extractor for members of a Twitter list"""
subcategory = "list-members"
pattern = BASE_PATTERN + r"/i/lists/(\d+)/members"
- example = "https://twitter.com/i/lists/12345/members"
+ example = "https://x.com/i/lists/12345/members"
def items(self):
self.login()
@@ -646,7 +664,7 @@ class TwitterFollowingExtractor(TwitterExtractor):
"""Extractor for followed users"""
subcategory = "following"
pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/following(?!\w)"
- example = "https://twitter.com/USER/following"
+ example = "https://x.com/USER/following"
def items(self):
self.login()
@@ -657,7 +675,7 @@ class TwitterSearchExtractor(TwitterExtractor):
"""Extractor for Twitter search results"""
subcategory = "search"
pattern = BASE_PATTERN + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)"
- example = "https://twitter.com/search?q=QUERY"
+ example = "https://x.com/search?q=QUERY"
def metadata(self):
return {"search": text.unquote(self.user)}
@@ -688,7 +706,7 @@ class TwitterHashtagExtractor(TwitterExtractor):
"""Extractor for Twitter hashtags"""
subcategory = "hashtag"
pattern = BASE_PATTERN + r"/hashtag/([^/?#]+)"
- example = "https://twitter.com/hashtag/NAME"
+ example = "https://x.com/hashtag/NAME"
def items(self):
url = "{}/search?q=%23{}".format(self.root, self.user)
@@ -700,7 +718,7 @@ class TwitterCommunityExtractor(TwitterExtractor):
"""Extractor for a Twitter community"""
subcategory = "community"
pattern = BASE_PATTERN + r"/i/communities/(\d+)"
- example = "https://twitter.com/i/communities/12345"
+ example = "https://x.com/i/communities/12345"
def tweets(self):
if self.textonly:
@@ -712,7 +730,7 @@ class TwitterCommunitiesExtractor(TwitterExtractor):
"""Extractor for followed Twitter communities"""
subcategory = "communities"
pattern = BASE_PATTERN + r"/([^/?#]+)/communities/?$"
- example = "https://twitter.com/i/communities"
+ example = "https://x.com/i/communities"
def tweets(self):
return self.api.communities_main_page_timeline(self.user)
@@ -724,7 +742,7 @@ class TwitterEventExtractor(TwitterExtractor):
directory_fmt = ("{category}", "Events",
"{event[id]} {event[short_title]}")
pattern = BASE_PATTERN + r"/i/events/(\d+)"
- example = "https://twitter.com/i/events/12345"
+ example = "https://x.com/i/events/12345"
def metadata(self):
return {"event": self.api.live_event(self.user)}
@@ -736,8 +754,9 @@ class TwitterEventExtractor(TwitterExtractor):
class TwitterTweetExtractor(TwitterExtractor):
"""Extractor for individual tweets"""
subcategory = "tweet"
- pattern = BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)/?$"
- example = "https://twitter.com/USER/status/12345"
+ pattern = (BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)"
+ r"/?(?:$|\?|#|photo/|video/)")
+ example = "https://x.com/USER/status/12345"
def __init__(self, match):
TwitterExtractor.__init__(self, match)
@@ -817,7 +836,7 @@ class TwitterQuotesExtractor(TwitterExtractor):
"""Extractor for quotes of a Tweet"""
subcategory = "quotes"
pattern = BASE_PATTERN + r"/(?:[^/?#]+|i/web)/status/(\d+)/quotes"
- example = "https://twitter.com/USER/status/12345/quotes"
+ example = "https://x.com/USER/status/12345/quotes"
def items(self):
url = "{}/search?q=quoted_tweet_id:{}".format(self.root, self.user)
@@ -830,7 +849,7 @@ class TwitterAvatarExtractor(TwitterExtractor):
filename_fmt = "avatar {date}.{extension}"
archive_fmt = "AV_{user[id]}_{date}"
pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/photo"
- example = "https://twitter.com/USER/photo"
+ example = "https://x.com/USER/photo"
def tweets(self):
self.api._user_id_by_screen_name(self.user)
@@ -852,7 +871,7 @@ class TwitterBackgroundExtractor(TwitterExtractor):
filename_fmt = "background {date}.{extension}"
archive_fmt = "BG_{user[id]}_{date}"
pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/header_photo"
- example = "https://twitter.com/USER/header_photo"
+ example = "https://x.com/USER/header_photo"
def tweets(self):
self.api._user_id_by_screen_name(self.user)
@@ -899,7 +918,7 @@ class TwitterAPI():
self.extractor = extractor
self.log = extractor.log
- self.root = "https://twitter.com/i/api"
+ self.root = "https://x.com/i/api"
self._nsfw_warning = True
self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode
@@ -919,7 +938,7 @@ class TwitterAPI():
self.headers = {
"Accept": "*/*",
- "Referer": "https://twitter.com/",
+ "Referer": extractor.root + "/",
"content-type": "application/json",
"x-guest-token": None,
"x-twitter-auth-type": "OAuth2Session" if auth_token else None,
@@ -1262,7 +1281,7 @@ class TwitterAPI():
endpoint = "/1.1/guest/activate.json"
self.log.info("Requesting guest token")
return str(self._call(
- endpoint, None, "POST", False, "https://api.twitter.com",
+ endpoint, None, "POST", False, "https://api.x.com",
)["guest_token"])
def _authenticate_guest(self):
@@ -1288,63 +1307,72 @@ class TwitterAPI():
if csrf_token:
self.headers["x-csrf-token"] = csrf_token
- if response.status_code < 400:
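+ # pre-empt the rate limit: the lower the remaining request budget,
+ # the more likely we already wait here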
+ remaining = int(response.headers.get("x-rate-limit-remaining", 6))
+ if remaining < 6 and remaining <= random.randrange(1, 6):
+ self._handle_ratelimit(response)
+ continue
+
+ try:
data = response.json()
+ except ValueError:
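+ # wrap a non-JSON body in the regular 'errors' structure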
+ data = {"errors": ({"message": response.text},)}
+
+ errors = data.get("errors")
+ if not errors:
+ return data
+
+ retry = False
+ for error in errors:
+ msg = error.get("message") or "Unspecified"
+ self.log.debug("API error: '%s'", msg)
+
+ if "this account is temporarily locked" in msg:
+ msg = "Account temporarily locked"
+ if self.extractor.config("locked") != "wait":
+ raise exception.AuthorizationError(msg)
+ self.log.warning(msg)
+ self.extractor.input("Press ENTER to retry.")
+ retry = True
+
+ elif "Could not authenticate you" in msg:
+ if not self.extractor.config("relogin", True):
+ continue
- errors = data.get("errors")
- if not errors:
- return data
+ username, password = self.extractor._get_auth_info()
+ if not username:
+ continue
- retry = False
- for error in errors:
- msg = error.get("message") or "Unspecified"
- self.log.debug("API error: '%s'", msg)
+ _login_impl.invalidate(username)
+ self.extractor.cookies_update(
+ _login_impl(self.extractor, username, password))
+ self.__init__(self.extractor)
+ retry = True
- if "this account is temporarily locked" in msg:
- msg = "Account temporarily locked"
- if self.extractor.config("locked") != "wait":
- raise exception.AuthorizationError(msg)
- self.log.warning("%s. Press ENTER to retry.", msg)
- try:
- input()
- except (EOFError, OSError):
- pass
- retry = True
-
- elif msg.lower().startswith("timeout"):
- retry = True
+ elif msg.lower().startswith("timeout"):
+ retry = True
- if not retry:
- return data
- elif self.headers["x-twitter-auth-type"]:
+ if retry:
+ if self.headers["x-twitter-auth-type"]:
self.log.debug("Retrying API request")
continue
+ else:
+ # fall through to "Login Required"
+ response.status_code = 404
- # fall through to "Login Required"
- response.status_code = 404
-
- if response.status_code == 429:
- # rate limit exceeded
- if self.extractor.config("ratelimit") == "abort":
- raise exception.StopExtraction("Rate limit exceeded")
-
- until = response.headers.get("x-rate-limit-reset")
- seconds = None if until else 60
- self.extractor.wait(until=until, seconds=seconds)
- continue
-
- if response.status_code in (403, 404) and \
+ if response.status_code < 400:
+ return data
+ elif response.status_code in (403, 404) and \
not self.headers["x-twitter-auth-type"]:
raise exception.AuthorizationError("Login required")
+ elif response.status_code == 429:
+ self._handle_ratelimit(response)
+ continue
# error
try:
- data = response.json()
- errors = ", ".join(e["message"] for e in data["errors"])
- except ValueError:
- errors = response.text
+ errors = ", ".join(e["message"] for e in errors)
except Exception:
- errors = data.get("errors", "")
+ pass
raise exception.StopExtraction(
"%s %s (%s)", response.status_code, response.reason, errors)
@@ -1680,6 +1708,13 @@ class TwitterAPI():
return
variables["cursor"] = cursor
+ def _handle_ratelimit(self, response):
+ if self.extractor.config("ratelimit") == "abort":
+ raise exception.StopExtraction("Rate limit exceeded")
+
+ until = response.headers.get("x-rate-limit-reset")
+ self.extractor.wait(until=until, seconds=None if until else 60)
+
def _process_tombstone(self, entry, tombstone):
text = (tombstone.get("richText") or tombstone["text"])["text"]
tweet_id = entry["entryId"].rpartition("-")[2]
@@ -1695,22 +1730,22 @@ class TwitterAPI():
@cache(maxage=365*86400, keyarg=1)
def _login_impl(extr, username, password):
- import re
- import random
+ def process(data, params=None):
+ response = extr.request(
+ url, params=params, headers=headers, json=data,
+ method="POST", fatal=None)
- if re.fullmatch(r"[\w.%+-]+@[\w.-]+\.\w{2,}", username):
- extr.log.warning(
- "Login with email is no longer possible. "
- "You need to provide your username or phone number instead.")
-
- def process(response):
try:
data = response.json()
except ValueError:
data = {"errors": ({"message": "Invalid response"},)}
else:
if response.status_code < 400:
- return data["flow_token"]
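+ # a successful response carries the flow token
+ # and the name of the next subtask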
+ try:
+ return (data["flow_token"],
+ data["subtasks"][0]["subtask_id"])
+ except LookupError:
+ pass
errors = []
for error in data.get("errors") or ():
@@ -1719,9 +1754,13 @@ def _login_impl(extr, username, password):
extr.log.debug(response.text)
raise exception.AuthenticationError(", ".join(errors))
- extr.cookies.clear()
+ cookies = extr.cookies
+ cookies.clear()
api = TwitterAPI(extr)
api._authenticate_guest()
+
+ url = "https://api.x.com/1.1/onboarding/task.json"
+ params = {"flow_name": "login"}
headers = api.headers
extr.log.info("Logging in as %s", username)
@@ -1778,31 +1817,18 @@ def _login_impl(extr, username, password):
"web_modal": 1,
},
}
- url = "https://api.twitter.com/1.1/onboarding/task.json?flow_name=login"
- response = extr.request(url, method="POST", headers=headers, json=data)
- data = {
- "flow_token": process(response),
- "subtask_inputs": [
- {
- "subtask_id": "LoginJsInstrumentationSubtask",
+ flow_token, subtask = process(data, params)
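+ # step through the onboarding flow one subtask at a time
+ # until an 'auth_token' cookie is set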
+ while not cookies.get("auth_token"):
+ if subtask == "LoginJsInstrumentationSubtask":
+ data = {
"js_instrumentation": {
"response": "{}",
"link": "next_link",
},
- },
- ],
- }
- url = "https://api.twitter.com/1.1/onboarding/task.json"
- response = extr.request(
- url, method="POST", headers=headers, json=data, fatal=None)
-
- # username
- data = {
- "flow_token": process(response),
- "subtask_inputs": [
- {
- "subtask_id": "LoginEnterUserIdentifierSSO",
+ }
+ elif subtask == "LoginEnterUserIdentifierSSO":
+ data = {
"settings_list": {
"setting_responses": [
{
@@ -1814,48 +1840,61 @@ def _login_impl(extr, username, password):
],
"link": "next_link",
},
- },
- ],
- }
- # url = "https://api.twitter.com/1.1/onboarding/task.json"
- extr.sleep(random.uniform(2.0, 4.0), "login (username)")
- response = extr.request(
- url, method="POST", headers=headers, json=data, fatal=None)
-
- # password
- data = {
- "flow_token": process(response),
- "subtask_inputs": [
- {
- "subtask_id": "LoginEnterPassword",
+ }
+ elif subtask == "LoginEnterPassword":
+ data = {
"enter_password": {
"password": password,
"link": "next_link",
},
- },
- ],
- }
- # url = "https://api.twitter.com/1.1/onboarding/task.json"
- extr.sleep(random.uniform(2.0, 4.0), "login (password)")
- response = extr.request(
- url, method="POST", headers=headers, json=data, fatal=None)
-
- # account duplication check ?
- data = {
- "flow_token": process(response),
- "subtask_inputs": [
- {
- "subtask_id": "AccountDuplicationCheck",
+ }
+ elif subtask == "LoginEnterAlternateIdentifierSubtask":
+ alt = extr.input(
+ "Alternate Identifier (username, email, phone number): ")
+ data = {
+ "enter_text": {
+ "text": alt,
+ "link": "next_link",
+ },
+ }
+ elif subtask == "LoginTwoFactorAuthChallenge":
+ data = {
+ "enter_text": {
+ "text": extr.input("2FA Token: "),
+ "link": "next_link",
+ },
+ }
+ elif subtask == "LoginAcid":
+ data = {
+ "enter_text": {
+ "text": extr.input("Email Verification Code: "),
+ "link": "next_link",
+ },
+ }
+ elif subtask == "AccountDuplicationCheck":
+ data = {
"check_logged_in_account": {
"link": "AccountDuplicationCheck_false",
},
- },
- ],
- }
- # url = "https://api.twitter.com/1.1/onboarding/task.json"
- response = extr.request(
- url, method="POST", headers=headers, json=data, fatal=None)
- process(response)
+ }
+ elif subtask == "ArkoseLogin":
+ raise exception.AuthenticationError("Login requires CAPTCHA")
+ elif subtask == "DenyLoginSubtask":
+ raise exception.AuthenticationError("Login rejected as suspicious")
+ elif subtask == "LoginSuccessSubtask":
+ raise exception.AuthenticationError("No auth token cookie")
+ else:
+ raise exception.StopExtraction("Unrecognized subtask %s", subtask)
+
+ inputs = {"subtask_id": subtask}
+ inputs.update(data)
+ data = {
+ "flow_token": flow_token,
+ "subtask_inputs": [inputs],
+ }
+
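+ # brief random delay between subtask submissions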
+ extr.sleep(random.uniform(1.0, 3.0), "login ({})".format(subtask))
+ flow_token, subtask = process(data)
return {
cookie.name: cookie.value
diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py
index 41141c6..c112f4a 100644
--- a/gallery_dl/extractor/vsco.py
+++ b/gallery_dl/extractor/vsco.py
@@ -46,6 +46,8 @@ class VscoExtractor(Extractor):
url = "https://image-{}.vsco.co/{}".format(cdn, path)
elif cdn.isdecimal():
url = "https://image.vsco.co/" + base
+ elif img["responsive_url"].startswith("http"):
+ url = img["responsive_url"]
else:
url = "https://" + img["responsive_url"]
@@ -238,6 +240,34 @@ class VscoSpacesExtractor(VscoExtractor):
yield Message.Queue, url, space
+class VscoAvatarExtractor(VscoExtractor):
+ """Extractor for vsco.co user avatars"""
+ subcategory = "avatar"
+ pattern = USER_PATTERN + r"/avatar"
+ example = "https://vsco.co/USER/avatar"
+
+ def images(self):
+ url = "{}/{}/gallery".format(self.root, self.user)
+ page = self.request(url).text
+ piid = text.extr(page, '"profileImageId":"', '"')
+
+ url = "https://im.vsco.co/" + piid
+ # needs GET request, since HEAD does not redirect to full URL
+ response = self.request(url, allow_redirects=False)
+
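+ # assemble a minimal image object compatible with VscoExtractor.images()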
+ return ({
+ "_id" : piid,
+ "is_video" : False,
+ "grid_name" : "",
+ "upload_date" : 0,
+ "responsive_url": response.headers["Location"],
+ "video_url" : "",
+ "image_meta" : None,
+ "width" : 0,
+ "height" : 0,
+ },)
+
+
class VscoImageExtractor(VscoExtractor):
"""Extractor for individual images on vsco.co"""
subcategory = "image"
diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py
index ac00682..9370cfb 100644
--- a/gallery_dl/extractor/wikimedia.py
+++ b/gallery_dl/extractor/wikimedia.py
@@ -27,9 +27,9 @@ class WikimediaExtractor(BaseExtractor):
if self.category == "wikimedia":
self.category = self.root.split(".")[-2]
- elif self.category == "fandom":
- self.category = \
- "fandom-" + self.root.partition(".")[0].rpartition("/")[2]
+ elif self.category in ("fandom", "wikigg"):
+ self.category = "{}-{}".format(
+ self.category, self.root.partition(".")[0].rpartition("/")[2])
if path.startswith("wiki/"):
path = path[5:]
@@ -69,14 +69,18 @@ class WikimediaExtractor(BaseExtractor):
def items(self):
for info in self._pagination(self.params):
- image = info["imageinfo"][0]
+ try:
+ image = info["imageinfo"][0]
+ except LookupError:
+ self.log.debug("Missing 'imageinfo' for %s", info)
+ continue
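+ # 'metadata' and 'commonmetadata' may be null instead of a list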
image["metadata"] = {
m["name"]: m["value"]
- for m in image["metadata"]}
+ for m in image["metadata"] or ()}
image["commonmetadata"] = {
m["name"]: m["value"]
- for m in image["commonmetadata"]}
+ for m in image["commonmetadata"] or ()}
filename = image["canonicaltitle"]
image["filename"], _, image["extension"] = \
@@ -148,6 +152,10 @@ BASE_PATTERN = WikimediaExtractor.update({
"root": None,
"pattern": r"[\w-]+\.fandom\.com",
},
+ "wikigg": {
+ "root": None,
+ "pattern": r"\w+\.wiki\.gg",
+ },
"mariowiki": {
"root": "https://www.mariowiki.com",
"pattern": r"(?:www\.)?mariowiki\.com",