Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--  gallery_dl/extractor/__init__.py     |   2
-rw-r--r--  gallery_dl/extractor/cien.py         |  86
-rw-r--r--  gallery_dl/extractor/common.py       |  33
-rw-r--r--  gallery_dl/extractor/deviantart.py   |  11
-rw-r--r--  gallery_dl/extractor/fanbox.py       |  12
-rw-r--r--  gallery_dl/extractor/hentainexus.py  | 176
-rw-r--r--  gallery_dl/extractor/hitomi.py       |   1
-rw-r--r--  gallery_dl/extractor/instagram.py    |   2
-rw-r--r--  gallery_dl/extractor/kemonoparty.py  |   6
-rw-r--r--  gallery_dl/extractor/newgrounds.py   |  68
-rw-r--r--  gallery_dl/extractor/nijie.py        |  11
-rw-r--r--  gallery_dl/extractor/nitter.py       |  20
-rw-r--r--  gallery_dl/extractor/oauth.py        |   6
-rw-r--r--  gallery_dl/extractor/philomena.py    |   7
-rw-r--r--  gallery_dl/extractor/photobucket.py  | 145
-rw-r--r--  gallery_dl/extractor/shimmie2.py     |   4
-rw-r--r--  gallery_dl/extractor/skeb.py         |  19
-rw-r--r--  gallery_dl/extractor/speakerdeck.py  |  44
-rw-r--r--  gallery_dl/extractor/szurubooru.py   |  12
-rw-r--r--  gallery_dl/extractor/tcbscans.py     |  19
-rw-r--r--  gallery_dl/extractor/twibooru.py     |   7
-rw-r--r--  gallery_dl/extractor/twitter.py      |  30
-rw-r--r--  gallery_dl/extractor/vichan.py       |   4
23 files changed, 363 insertions(+), 362 deletions(-)
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 591e6a8..6aff1f3 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -62,6 +62,7 @@ modules = [
"hentaifox",
"hentaihand",
"hentaihere",
+ "hentainexus",
"hiperdex",
"hitomi",
"hotleak",
@@ -113,7 +114,6 @@ modules = [
"paheal",
"patreon",
"philomena",
- "photobucket",
"photovogue",
"picarto",
"piczel",
diff --git a/gallery_dl/extractor/cien.py b/gallery_dl/extractor/cien.py
deleted file mode 100644
index a9ccab5..0000000
--- a/gallery_dl/extractor/cien.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2024 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://ci-en.net/"""
-
-from .common import Extractor, Message
-from .. import text, util
-
-BASE_PATTERN = r"(?:https?://)?ci-en\.(?:net|dlsite\.com)"
-
-
-class CienExtractor(Extractor):
- category = "cien"
- root = "https://ci-en.net"
-
- def __init__(self, match):
- self.root = text.root_from_url(match.group(0))
- Extractor.__init__(self, match)
-
- def _pagination_articles(self, url, params):
- data = {"extractor": CienArticleExtractor}
- params["page"] = text.parse_int(params.get("page"), 1)
-
- while True:
- page = self.request(url, params=params).text
-
- for card in text.extract_iter(
- page, ' class="c-cardCase-item', '</div>'):
- article_url = text.extr(card, ' href="', '"')
- yield Message.Queue, article_url, data
-
- if ' rel="next"' not in page:
- return
- params["page"] += 1
-
-
-class CienArticleExtractor(CienExtractor):
- subcategory = "article"
- pattern = BASE_PATTERN + r"/creator/(\d+)/article/(\d+)"
- example = "https://ci-en.net/creator/123/article/12345"
-
- def items(self):
- url = "{}/creator/{}/article/{}".format(
- self.root, self.groups[0], self.groups[1])
- page = self.request(url, notfound="article").text
- return
- yield 1
-
-
-class CienCreatorExtractor(CienExtractor):
- subcategory = "creator"
- pattern = BASE_PATTERN + r"/creator/(\d+)(?:/article(?:\?([^#]+))?)?/?$"
- example = "https://ci-en.net/creator/123"
-
- def items(self):
- url = "{}/creator/{}/article".format(self.root, self.groups[0])
- params = text.parse_query(self.groups[1])
- params["mode"] = "list"
- return self._pagination_articles(url, params)
-
-
-class CienRecentExtractor(CienExtractor):
- subcategory = "recent"
- pattern = BASE_PATTERN + r"/mypage/recent(?:\?([^#]+))?"
- example = "https://ci-en.net/mypage/recent"
-
- def items(self):
- url = self.root + "/mypage/recent"
- params = text.parse_query(self.groups[0])
- return self._pagination_articles(url, params)
-
-
-class CienFollowingExtractor(CienExtractor):
- subcategory = "following"
- pattern = BASE_PATTERN + r"/mypage/subscription(/following)?"
- example = "https://ci-en.net/mypage/subscription"
-
- def items(self):
- url = self.root + "/mypage/recent"
- params = text.parse_query(self.groups[0])
- return self._pagination_articles(url, params)
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 8771261..d7a41bc 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -11,6 +11,7 @@
import os
import re
import ssl
+import sys
import time
import netrc
import queue
@@ -42,6 +43,7 @@ class Extractor():
browser = None
request_interval = 0.0
request_interval_min = 0.0
+ request_interval_429 = 60.0
request_timestamp = 0.0
def __init__(self, match):
@@ -202,7 +204,9 @@ class Extractor():
self.log.warning("Cloudflare CAPTCHA")
break
- if code == 429 and self._interval_429:
+ if code == 429 and self._handle_429(response):
+ continue
+ elif code == 429 and self._interval_429:
pass
elif code not in retry_codes and code < 500:
break
@@ -230,6 +234,8 @@ class Extractor():
raise exception.HttpError(msg, response)
+ _handle_429 = util.false
+
def wait(self, seconds=None, until=None, adjust=1.0,
reason="rate limit"):
now = time.time()
@@ -263,6 +269,8 @@ class Extractor():
time.sleep(seconds)
def input(self, prompt, echo=True):
+ self._check_input_allowed(prompt)
+
if echo:
try:
return input(prompt)
@@ -271,13 +279,30 @@ class Extractor():
else:
return getpass.getpass(prompt)
+ def _check_input_allowed(self, prompt=""):
+ input = self.config("input")
+
+ if input is None:
+ try:
+ input = sys.stdin.isatty()
+ except Exception:
+ input = False
+
+ if not input:
+ raise exception.StopExtraction(
+ "User input required (%s)", prompt.strip(" :"))
+
def _get_auth_info(self):
"""Return authentication information as (username, password) tuple"""
username = self.config("username")
password = None
if username:
- password = self.config("password") or util.LazyPrompt()
+ password = self.config("password")
+ if not password:
+ self._check_input_allowed("password")
+ password = util.LazyPrompt()
+
elif self.config("netrc", False):
try:
info = netrc.netrc().authenticators(self.category)
@@ -304,7 +329,7 @@ class Extractor():
self.request_interval_min,
)
self._interval_429 = util.build_duration_func(
- self.config("sleep-429", 60),
+ self.config("sleep-429", self.request_interval_429),
)
if self._retries < 0:
@@ -837,7 +862,7 @@ def _build_requests_adapter(ssl_options, ssl_ciphers, source_address):
if ssl_options or ssl_ciphers:
ssl_context = urllib3.connection.create_urllib3_context(
options=ssl_options or None, ciphers=ssl_ciphers)
- if requests.__version__ > "2.31":
+ if not requests.__version__ < "2.32":
# https://github.com/psf/requests/pull/6731
ssl_context.load_default_certs()
ssl_context.check_hostname = False
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 993885a..2199cc8 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -1730,15 +1730,16 @@ class DeviantartEclipseAPI():
url = "{}/{}/about".format(self.extractor.root, user)
page = self.request(url).text
- gruserid, pos = text.extract(page, ' data-userid="', '"')
+ gruser_id = text.extr(page, ' data-userid="', '"')
- pos = page.find('\\"type\\":\\"watching\\"', pos)
+ pos = page.find('\\"name\\":\\"watching\\"')
if pos < 0:
- raise exception.NotFoundError("module")
- moduleid = text.rextract(page, '\\"id\\":', ',', pos)[0].strip('" ')
+ raise exception.NotFoundError("'watching' module ID")
+ module_id = text.rextract(
+ page, '\\"id\\":', ',', pos)[0].strip('" ')
self._fetch_csrf_token(page)
- return gruserid, moduleid
+ return gruser_id, module_id
def _fetch_csrf_token(self, page=None):
if page is None:
diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py
index 2223403..d81fd0b 100644
--- a/gallery_dl/extractor/fanbox.py
+++ b/gallery_dl/extractor/fanbox.py
@@ -113,7 +113,17 @@ class FanboxExtractor(Extractor):
post["user"] = self._get_user_data(post["creatorId"])
if self._meta_plan:
plans = self._get_plan_data(post["creatorId"])
- post["plan"] = plans[post["feeRequired"]]
+ fee = post["feeRequired"]
+ try:
+ post["plan"] = plans[fee]
+ except KeyError:
+ fees = [f for f in plans if f >= fee]
+ if fees:
+ plan = plans[min(fees)]
+ else:
+ plan = plans[0].copy()
+ plan["fee"] = fee
+ post["plan"] = plans[fee] = plan
return content_body, post
diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py
new file mode 100644
index 0000000..97b7844
--- /dev/null
+++ b/gallery_dl/extractor/hentainexus.py
@@ -0,0 +1,176 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019-2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hentainexus.com/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text, util
+import binascii
+
+
+class HentainexusGalleryExtractor(GalleryExtractor):
+ """Extractor for hentainexus galleries"""
+ category = "hentainexus"
+ root = "https://hentainexus.com"
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?hentainexus\.com"
+ r"/(?:view|read)/(\d+)")
+ example = "https://hentainexus.com/view/12345"
+
+ def __init__(self, match):
+ self.gallery_id = match.group(1)
+ url = "{}/view/{}".format(self.root, self.gallery_id)
+ GalleryExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ rmve = text.remove_html
+ extr = text.extract_from(page)
+ data = {
+ "gallery_id": text.parse_int(self.gallery_id),
+ "cover" : extr('"og:image" content="', '"'),
+ "title" : extr('<h1 class="title">', '</h1>'),
+ }
+
+ for key in ("Artist", "Book", "Circle", "Event", "Language",
+ "Magazine", "Parody", "Publisher", "Description"):
+ value = rmve(extr('viewcolumn">' + key + '</td>', '</td>'))
+ value, sep, rest = value.rpartition(" (")
+ data[key.lower()] = value if sep else rest
+
+ data["tags"] = tags = []
+ for k in text.extract_iter(page, '<a href="/?q=tag:', '"'):
+ tags.append(text.unquote(k).strip('"').replace("+", " "))
+
+ if not data["language"]:
+ data["language"] = "English"
+ data["lang"] = util.language_to_code(data["language"])
+
+ if "doujin" in data["tags"]:
+ data["type"] = "Doujinshi"
+ elif "illustration" in data["tags"]:
+ data["type"] = "Illustration"
+ else:
+ data["type"] = "Manga"
+ data["title_conventional"] = self._join_title(data)
+ return data
+
+ def images(self, _):
+ url = "{}/read/{}".format(self.root, self.gallery_id)
+ page = self.request(url).text
+ imgs = util.json_loads(self._decode(text.extr(
+ page, 'initReader("', '"')))
+
+ headers = None
+ if not self.config("original", True):
+ headers = {"Accept": "image/webp,*/*"}
+ for img in imgs:
+ img["_http_headers"] = headers
+
+ return [
+ (img["image"], img)
+ for img in imgs
+ ]
+
+ @staticmethod
+ def _decode(data):
+ # https://hentainexus.com/static/js/reader.min.js?r=22
+ hostname = "hentainexus.com"
+ primes = (2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53)
+ blob = list(binascii.a2b_base64(data))
+ for i in range(0, len(hostname)):
+ blob[i] = blob[i] ^ ord(hostname[i])
+
+ key = blob[0:64]
+
+ C = 0
+ for k in key:
+ C = C ^ k
+ for _ in range(8):
+ if C & 1:
+ C = C >> 1 ^ 0xc
+ else:
+ C = C >> 1
+ k = primes[C & 0x7]
+
+ x = 0
+ S = list(range(256))
+ for i in range(256):
+ x = (x + S[i] + key[i % len(key)]) % 256
+ S[i], S[x] = S[x], S[i]
+
+ result = ""
+ a = c = m = x = 0
+ for n in range(64, len(blob)):
+ a = (a + k) % 256
+ x = (c + S[(x + S[a]) % 256]) % 256
+ c = (c + a + S[a]) % 256
+
+ S[a], S[x] = S[x], S[a]
+ m = S[(x + S[(a + S[(m + c) % 256]) % 256]) % 256]
+ result += chr(blob[n] ^ m)
+
+ return result
+
+ @staticmethod
+ def _join_title(data):
+ event = data['event']
+ artist = data['artist']
+ circle = data['circle']
+ title = data['title']
+ parody = data['parody']
+ book = data['book']
+ magazine = data['magazine']
+
+ # a few galleries have a large number of artists or parodies,
+ # which get replaced with "Various" in the title string
+ if artist.count(',') >= 3:
+ artist = 'Various'
+ if parody.count(',') >= 3:
+ parody = 'Various'
+
+ jt = ''
+ if event:
+ jt += '({}) '.format(event)
+ if circle:
+ jt += '[{} ({})] '.format(circle, artist)
+ else:
+ jt += '[{}] '.format(artist)
+ jt += title
+ if parody.lower() != 'original work':
+ jt += ' ({})'.format(parody)
+ if book:
+ jt += ' ({})'.format(book)
+ if magazine:
+ jt += ' ({})'.format(magazine)
+ return jt
+
+
+class HentainexusSearchExtractor(Extractor):
+ """Extractor for hentainexus search results"""
+ category = "hentainexus"
+ subcategory = "search"
+ root = "https://hentainexus.com"
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?hentainexus\.com"
+ r"(?:/page/\d+)?/?(?:\?(q=[^/?#]+))?$")
+ example = "https://hentainexus.com/?q=QUERY"
+
+ def items(self):
+ params = text.parse_query(self.groups[0])
+ data = {"_extractor": HentainexusGalleryExtractor}
+ path = "/"
+
+ while path:
+ page = self.request(self.root + path, params=params).text
+ extr = text.extract_from(page)
+
+ while True:
+ gallery_id = extr('<a href="/view/', '"')
+ if not gallery_id:
+ break
+ yield Message.Queue, self.root + "/view/" + gallery_id, data
+
+ path = extr('class="pagination-next" href="', '"')
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index 88f5708..9b74700 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -58,6 +58,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
return {
"gallery_id": text.parse_int(info["id"]),
"title" : info["title"],
+ "title_jpn" : info.get("japanese_title") or "",
"type" : info["type"].capitalize(),
"language" : language,
"lang" : util.language_to_code(language),
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 9c2b1de..f7a5cc7 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -168,6 +168,7 @@ class InstagramExtractor(Extractor):
"likes": post.get("like_count", 0),
"pinned": post.get("timeline_pinned_user_ids", ()),
"date": text.parse_timestamp(post.get("taken_at")),
+ "liked": post.get("has_liked", False),
}
caption = post["caption"]
@@ -270,6 +271,7 @@ class InstagramExtractor(Extractor):
"typename" : typename,
"date" : text.parse_timestamp(post["taken_at_timestamp"]),
"likes" : post["edge_media_preview_like"]["count"],
+ "liked" : post.get("viewer_has_liked", False),
"pinned" : pinned,
"owner_id" : owner["id"],
"username" : owner.get("username"),
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index b0c24de..6f2d5f3 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -518,7 +518,8 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor):
if not sort:
sort = "updated"
- users.sort(key=lambda x: x[sort], reverse=(order == "desc"))
+ users.sort(key=lambda x: x[sort] or util.NONE,
+ reverse=(order == "desc"))
for user in users:
user["_extractor"] = KemonopartyUserExtractor
@@ -532,7 +533,8 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor):
if not sort:
sort = "faved_seq"
- posts.sort(key=lambda x: x[sort], reverse=(order == "desc"))
+ posts.sort(key=lambda x: x[sort] or util.NONE,
+ reverse=(order == "desc"))
for post in posts:
post["_extractor"] = KemonopartyPostExtractor
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 7ac3a3a..ecd6619 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -12,6 +12,7 @@ from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache
import itertools
+import re
class NewgroundsExtractor(Extractor):
@@ -33,10 +34,16 @@ class NewgroundsExtractor(Extractor):
def _init(self):
self.flash = self.config("flash", True)
- fmt = self.config("format", "original")
- self.format = (True if not fmt or fmt == "original" else
- fmt if isinstance(fmt, int) else
- text.parse_int(fmt.rstrip("p")))
+ fmt = self.config("format")
+ if not fmt or fmt == "original":
+ self.format = ("mp4", "webm", "m4v", "mov", "mkv",
+ 1080, 720, 360)
+ elif isinstance(fmt, (list, tuple)):
+ self.format = fmt
+ else:
+ self._video_formats = self._video_formats_limit
+ self.format = (fmt if isinstance(fmt, int) else
+ text.parse_int(fmt.rstrip("p")))
def items(self):
self.login()
@@ -266,7 +273,7 @@ class NewgroundsExtractor(Extractor):
if src:
src = src.replace("\\/", "/")
- fallback = ()
+ formats = ()
date = text.parse_datetime(extr(
'itemprop="datePublished" content="', '"'))
else:
@@ -276,23 +283,8 @@ class NewgroundsExtractor(Extractor):
"X-Requested-With": "XMLHttpRequest",
}
sources = self.request(url, headers=headers).json()["sources"]
-
- if self.format is True:
- src = sources["360p"][0]["src"].replace(".360p.", ".")
- formats = sources
- else:
- formats = []
- for fmt, src in sources.items():
- width = text.parse_int(fmt.rstrip("p"))
- if width <= self.format:
- formats.append((width, src))
- if formats:
- formats.sort(reverse=True)
- src, formats = formats[0][1][0]["src"], formats[1:]
- else:
- src = ""
-
- fallback = self._video_fallback(formats)
+ formats = self._video_formats(sources)
+ src = next(formats, "")
date = text.parse_timestamp(src.rpartition("?")[2])
return {
@@ -306,15 +298,33 @@ class NewgroundsExtractor(Extractor):
"rating" : extr('class="rated-', '"'),
"index" : text.parse_int(index),
"_index" : index,
- "_fallback" : fallback,
+ "_fallback" : formats,
}
- @staticmethod
- def _video_fallback(formats):
- if isinstance(formats, dict):
- formats = list(formats.items())
- formats.sort(key=lambda fmt: text.parse_int(fmt[0].rstrip("p")),
- reverse=True)
+ def _video_formats(self, sources):
+ src = sources["360p"][0]["src"]
+ sub = re.compile(r"\.360p\.\w+").sub
+
+ for fmt in self.format:
+ try:
+ if isinstance(fmt, int):
+ yield sources[str(fmt) + "p"][0]["src"]
+ elif fmt in sources:
+ yield sources[fmt][0]["src"]
+ else:
+ yield sub("." + fmt, src, 1)
+ except Exception as exc:
+ self.log.debug("Video format '%s' not available (%s: %s)",
+ fmt, exc.__class__.__name__, exc)
+
+ def _video_formats_limit(self, sources):
+ formats = []
+ for fmt, src in sources.items():
+ width = text.parse_int(fmt.rstrip("p"))
+ if width <= self.format:
+ formats.append((width, src))
+
+ formats.sort(reverse=True)
for fmt in formats:
yield fmt[1][0]["src"]
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
index c50c013..60cca22 100644
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -56,7 +56,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
data["user_id"] = data["artist_id"]
data["user_name"] = data["artist_name"]
- urls = list(self._extract_images(image_id, page))
+ urls = self._extract_images(image_id, page)
data["count"] = len(urls)
yield Message.Directory, data
@@ -113,11 +113,14 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
# multiple images
url = "{}/view_popup.php?id={}".format(self.root, image_id)
page = self.request(url).text
- yield from text.extract_iter(
- page, 'href="javascript:void(0);"><img src="', '"')
+ return [
+ text.extr(media, ' src="', '"')
+ for media in text.extract_iter(
+ page, 'href="javascript:void(0);"><', '>')
+ ]
else:
pos = page.find('id="view-center"') + 1
- yield text.extract(page, 'itemprop="image" src="', '"', pos)[0]
+ return (text.extr(page, 'itemprop="image" src="', '"', pos),)
@staticmethod
def _extract_user_name(page):
diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py
index 2bce597..cfc8861 100644
--- a/gallery_dl/extractor/nitter.py
+++ b/gallery_dl/extractor/nitter.py
@@ -234,26 +234,6 @@ class NitterExtractor(BaseExtractor):
BASE_PATTERN = NitterExtractor.update({
- "nitter.net": {
- "root": "https://nitter.net",
- "pattern": r"nitter\.net",
- },
- "nitter.1d4.us": {
- "root": "https://nitter.1d4.us",
- "pattern": r"nitter\.1d4\.us",
- },
- "nitter.kavin.rocks": {
- "root": "https://nitter.kavin.rocks",
- "pattern": r"nitter\.kavin\.rocks",
- },
- "nitter.unixfox.eu": {
- "root": "https://nitter.unixfox.eu",
- "pattern": r"nitter\.unixfox\.eu",
- },
- "nitter.it": {
- "root": "https://nitter.it",
- "pattern": r"nitter\.it",
- },
})
USER_PATTERN = BASE_PATTERN + r"/(i(?:/user/|d:)(\d+)|[^/?#]+)"
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index 5571575..9d025d5 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -424,7 +424,7 @@ class OAuthPixiv(OAuthBase):
"code_challenge_method": "S256",
"client": "pixiv-android",
}
- code = self.open(url, params, self._input)
+ code = self.open(url, params, self._input_code)
url = "https://oauth.secure.pixiv.net/auth/token"
headers = {
@@ -459,7 +459,7 @@ class OAuthPixiv(OAuthBase):
stdout_write(self._generate_message(("refresh-token",), (token,)))
- def _input(self):
+ def _input_code(self):
stdout_write("""\
1) Open your browser's Developer Tools (F12) and switch to the Network tab
2) Login
@@ -471,5 +471,5 @@ class OAuthPixiv(OAuthBase):
like the entire URL or several query parameters.
""")
- code = input("code: ")
+ code = self.input("code: ")
return code.rpartition("=")[2].strip()
diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py
index 339646f..150efed 100644
--- a/gallery_dl/extractor/philomena.py
+++ b/gallery_dl/extractor/philomena.py
@@ -24,8 +24,13 @@ class PhilomenaExtractor(BooruExtractor):
def _init(self):
self.api = PhilomenaAPI(self)
+ if not self.config("svg", True):
+ self._file_url = operator.itemgetter("view_url")
- _file_url = operator.itemgetter("view_url")
+ def _file_url(self, post):
+ if post["format"] == "svg":
+ return post["view_url"].rpartition(".")[0] + ".svg"
+ return post["view_url"]
@staticmethod
def _prepare(post):
diff --git a/gallery_dl/extractor/photobucket.py b/gallery_dl/extractor/photobucket.py
deleted file mode 100644
index a01c9fe..0000000
--- a/gallery_dl/extractor/photobucket.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2019-2023 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://photobucket.com/"""
-
-from .common import Extractor, Message
-from .. import text, exception
-import binascii
-import json
-
-
-class PhotobucketAlbumExtractor(Extractor):
- """Extractor for albums on photobucket.com"""
- category = "photobucket"
- subcategory = "album"
- directory_fmt = ("{category}", "{username}", "{location}")
- filename_fmt = "{offset:>03}{pictureId:?_//}_{titleOrFilename}.{extension}"
- archive_fmt = "{id}"
- pattern = (r"(?:https?://)?((?:[\w-]+\.)?photobucket\.com)"
- r"/user/[^/?&#]+/library(?:/[^?&#]*)?")
- example = "https://s123.photobucket.com/user/USER/library"
-
- def __init__(self, match):
- self.root = "https://" + match.group(1)
- Extractor.__init__(self, match)
-
- def _init(self):
- self.session.headers["Referer"] = self.url
-
- def items(self):
- for image in self.images():
- image["titleOrFilename"] = text.unescape(image["titleOrFilename"])
- image["title"] = text.unescape(image["title"])
- image["extension"] = image["ext"]
- yield Message.Directory, image
- yield Message.Url, image["fullsizeUrl"], image
-
- if self.config("subalbums", True):
- for album in self.subalbums():
- album["_extractor"] = PhotobucketAlbumExtractor
- yield Message.Queue, album["url"], album
-
- def images(self):
- """Yield all images of the current album"""
- url = self.url
- params = {"sort": "3", "page": 1}
-
- while True:
- page = self.request(url, params=params).text
- json_data = text.extract(page, "collectionData:", ",\n")[0]
- if not json_data:
- msg = text.extr(page, 'libraryPrivacyBlock">', "</div>")
- msg = ' ("{}")'.format(text.remove_html(msg)) if msg else ""
- self.log.error("Unable to get JSON data%s", msg)
- return
- data = json.loads(json_data)
-
- yield from data["items"]["objects"]
-
- if data["total"] <= data["offset"] + data["pageSize"]:
- self.album_path = data["currentAlbumPath"]
- return
- params["page"] += 1
-
- def subalbums(self):
- """Return all subalbum objects"""
- url = self.root + "/component/Albums-SubalbumList"
- params = {
- "albumPath": self.album_path,
- "fetchSubAlbumsOnly": "true",
- "deferCollapsed": "true",
- "json": "1",
- }
-
- data = self.request(url, params=params).json()
- return data["body"].get("subAlbums", ())
-
-
-class PhotobucketImageExtractor(Extractor):
- """Extractor for individual images from photobucket.com"""
- category = "photobucket"
- subcategory = "image"
- directory_fmt = ("{category}", "{username}")
- filename_fmt = "{pictureId:?/_/}{titleOrFilename}.{extension}"
- archive_fmt = "{username}_{id}"
- pattern = (r"(?:https?://)?(?:[\w-]+\.)?photobucket\.com"
- r"(?:/gallery/user/([^/?&#]+)/media/([^/?&#]+)"
- r"|/user/([^/?&#]+)/media/[^?&#]+\.html)")
- example = "https://s123.photobucket.com/user/USER/media/NAME.EXT.html"
-
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.user = match.group(1) or match.group(3)
- self.media_id = match.group(2)
-
- def _init(self):
- self.session.headers["Referer"] = self.url
-
- def items(self):
- url = "https://photobucket.com/galleryd/search.php"
- params = {"userName": self.user, "searchTerm": "", "ref": ""}
-
- if self.media_id:
- params["mediaId"] = self.media_id
- else:
- params["url"] = self.url
-
- # retry API call up to 5 times, since it can randomly fail
- tries = 0
- while tries < 5:
- data = self.request(url, method="POST", params=params).json()
- image = data["mediaDocuments"]
- if "message" not in image:
- break # success
- tries += 1
- self.log.debug(image["message"])
- else:
- raise exception.StopExtraction(image["message"])
-
- # adjust metadata entries to be at least somewhat similar
- # to what the 'album' extractor provides
- if "media" in image:
- image = image["media"][image["mediaIndex"]]
- image["albumView"] = data["mediaDocuments"]["albumView"]
- image["username"] = image["ownerId"]
- else:
- image["fileUrl"] = image.pop("imageUrl")
-
- image.setdefault("title", "")
- image.setdefault("description", "")
- name, _, ext = image["fileUrl"].rpartition("/")[2].rpartition(".")
- image["ext"] = image["extension"] = ext
- image["titleOrFilename"] = image["title"] or name
- image["tags"] = image.pop("clarifaiTagList", [])
-
- mtype, _, mid = binascii.a2b_base64(image["id"]).partition(b":")
- image["pictureId"] = mid.decode() if mtype == b"mediaId" else ""
-
- yield Message.Directory, image
- yield Message.Url, image["fileUrl"], image
diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py
index 67f38c4..a68f0db 100644
--- a/gallery_dl/extractor/shimmie2.py
+++ b/gallery_dl/extractor/shimmie2.py
@@ -92,6 +92,10 @@ BASE_PATTERN = Shimmie2Extractor.update({
"root": "https://rule34hentai.net",
"pattern": r"rule34hentai\.net",
},
+ "vidyapics": {
+ "root": "https://vidya.pics",
+ "pattern": r"vidya\.pics",
+ },
}) + r"/(?:index\.php\?q=/?)?"
diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py
index 38a2d16..6ec44ba 100644
--- a/gallery_dl/extractor/skeb.py
+++ b/gallery_dl/extractor/skeb.py
@@ -7,7 +7,7 @@
"""Extractors for https://skeb.jp/"""
from .common import Extractor, Message
-from .. import text, exception
+from .. import text
import itertools
@@ -31,14 +31,15 @@ class SkebExtractor(Extractor):
if "Authorization" not in self.session.headers:
self.headers["Authorization"] = "Bearer null"
- def request(self, url, **kwargs):
- while True:
- try:
- return Extractor.request(self, url, **kwargs)
- except exception.HttpError as exc:
- if exc.status == 429 and "request_key" in exc.response.cookies:
- continue
- raise
+ def _handle_429(self, response):
+ if "request_key" in response.cookies:
+ return True
+
+ request_key = text.extr(
+ response.text, "request_key=", ";")
+ if request_key:
+ self.cookies.set("request_key", request_key, domain="skeb.jp")
+ return True
def items(self):
metadata = self.metadata()
diff --git a/gallery_dl/extractor/speakerdeck.py b/gallery_dl/extractor/speakerdeck.py
index e44fdae..3210fd8 100644
--- a/gallery_dl/extractor/speakerdeck.py
+++ b/gallery_dl/extractor/speakerdeck.py
@@ -8,45 +8,35 @@
"""Extractors for https://speakerdeck.com/"""
-from .common import Extractor, Message
+from .common import GalleryExtractor
from .. import text
+import re
-class SpeakerdeckPresentationExtractor(Extractor):
+class SpeakerdeckPresentationExtractor(GalleryExtractor):
"""Extractor for images from a presentation on speakerdeck.com"""
category = "speakerdeck"
subcategory = "presentation"
directory_fmt = ("{category}", "{user}")
filename_fmt = "{presentation}-{num:>02}.{extension}"
archive_fmt = "{presentation}_{num}"
+ root = "https://speakerdeck.com"
pattern = r"(?:https?://)?(?:www\.)?speakerdeck\.com/([^/?#]+)/([^/?#]+)"
example = "https://speakerdeck.com/USER/PRESENTATION"
def __init__(self, match):
- Extractor.__init__(self, match)
+ GalleryExtractor.__init__(self, match, "")
self.user, self.presentation = match.groups()
- self.presentation_id = None
-
- def items(self):
- data = self.get_job_metadata()
- imgs = self.get_image_urls()
- data["count"] = len(imgs)
- yield Message.Directory, data
- for data["num"], url in enumerate(imgs, 1):
- yield Message.Url, url, text.nameext_from_url(url, data)
-
- def get_job_metadata(self):
- """Collect metadata for extractor-job"""
- url = "https://speakerdeck.com/oembed.json"
+
+ def metadata(self, _):
+ url = self.root + "/oembed.json"
params = {
- "url": "https://speakerdeck.com/" + self.user +
- "/" + self.presentation,
+ "url": "{}/{}/{}".format(self.root, self.user, self.presentation),
}
-
data = self.request(url, params=params).json()
- self.presentation_id, pos = \
- text.extract(data["html"], 'src="//speakerdeck.com/player/', '"')
+ self.presentation_id = text.extr(
+ data["html"], 'src="//speakerdeck.com/player/', '"')
return {
"user": self.user,
@@ -56,8 +46,10 @@ class SpeakerdeckPresentationExtractor(Extractor):
"author": data["author_name"],
}
- def get_image_urls(self):
- """Extract and return a list of all image-urls"""
- page = self.request("https://speakerdeck.com/player/" +
- self.presentation_id).text
- return list(text.extract_iter(page, 'js-sd-slide" data-url="', '"'))
+ def images(self, _):
+ url = "{}/player/{}".format(self.root, self.presentation_id)
+ page = re.sub(r"\s+", " ", self.request(url).text)
+ return [
+ (url, None)
+ for url in text.extract_iter(page, 'js-sd-slide" data-url="', '"')
+ ]
diff --git a/gallery_dl/extractor/szurubooru.py b/gallery_dl/extractor/szurubooru.py
index 08cccab..bba1ece 100644
--- a/gallery_dl/extractor/szurubooru.py
+++ b/gallery_dl/extractor/szurubooru.py
@@ -98,13 +98,13 @@ class SzurubooruTagExtractor(SzurubooruExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}_{version}"
- pattern = BASE_PATTERN + r"/posts/query=([^/?#]+)"
+ pattern = BASE_PATTERN + r"/posts(?:/query=([^/?#]*))?"
example = "https://booru.foalcon.com/posts/query=TAG"
def __init__(self, match):
SzurubooruExtractor.__init__(self, match)
- query = match.group(match.lastindex)
- self.query = text.unquote(query.replace("+", " "))
+ query = self.groups[-1]
+ self.query = text.unquote(query.replace("+", " ")) if query else ""
def metadata(self):
return {"search_tags": self.query}
@@ -119,9 +119,5 @@ class SzurubooruPostExtractor(SzurubooruExtractor):
pattern = BASE_PATTERN + r"/post/(\d+)"
example = "https://booru.foalcon.com/post/12345"
- def __init__(self, match):
- SzurubooruExtractor.__init__(self, match)
- self.post_id = match.group(match.lastindex)
-
def posts(self):
- return (self._api_request("/post/" + self.post_id),)
+ return (self._api_request("/post/" + self.groups[-1]),)
diff --git a/gallery_dl/extractor/tcbscans.py b/gallery_dl/extractor/tcbscans.py
index de6f3ee..71431ad 100644
--- a/gallery_dl/extractor/tcbscans.py
+++ b/gallery_dl/extractor/tcbscans.py
@@ -4,19 +4,23 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://tcbscans.com/"""
+"""Extractors for https://tcbscans.me/"""
from .common import ChapterExtractor, MangaExtractor
from .. import text
-BASE_PATTERN = r"(?:https?://)?(?:tcbscans|onepiecechapters)\.com"
+BASE_PATTERN = (r"(?:https?://)?(?:tcb(?:-backup\.bihar-mirchi|scans)"
+ r"|onepiecechapters)\.(?:com|me)")
class TcbscansChapterExtractor(ChapterExtractor):
category = "tcbscans"
- root = "https://tcbscans.com"
pattern = BASE_PATTERN + r"(/chapters/\d+/[^/?#]+)"
- example = "https://tcbscans.com/chapters/12345/MANGA-chapter-123"
+ example = "https://tcbscans.me/chapters/12345/MANGA-chapter-123"
+
+ def __init__(self, match):
+ self.root = text.root_from_url(match.group(0))
+ ChapterExtractor.__init__(self, match)
def images(self, page):
return [
@@ -39,10 +43,13 @@ class TcbscansChapterExtractor(ChapterExtractor):
class TcbscansMangaExtractor(MangaExtractor):
category = "tcbscans"
- root = "https://tcbscans.com"
chapterclass = TcbscansChapterExtractor
pattern = BASE_PATTERN + r"(/mangas/\d+/[^/?#]+)"
- example = "https://tcbscans.com/mangas/123/MANGA"
+ example = "https://tcbscans.me/mangas/123/MANGA"
+
+ def __init__(self, match):
+ self.root = text.root_from_url(match.group(0))
+ MangaExtractor.__init__(self, match)
def chapters(self, page):
data = {
diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py
index f57f479..a725a2c 100644
--- a/gallery_dl/extractor/twibooru.py
+++ b/gallery_dl/extractor/twibooru.py
@@ -28,8 +28,13 @@ class TwibooruExtractor(BooruExtractor):
def _init(self):
self.api = TwibooruAPI(self)
+ if not self.config("svg", True):
+ self._file_url = operator.itemgetter("view_url")
- _file_url = operator.itemgetter("view_url")
+ def _file_url(self, post):
+ if post["format"] == "svg":
+ return post["view_url"].rpartition(".")[0] + ".svg"
+ return post["view_url"]
@staticmethod
def _prepare(post):
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index ff77828..ec098aa 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -36,6 +36,7 @@ class TwitterExtractor(Extractor):
self.user = match.group(1)
def _init(self):
+ self.unavailable = self.config("unavailable", False)
self.textonly = self.config("text-tweets", False)
self.retweets = self.config("retweets", False)
self.replies = self.config("replies", True)
@@ -143,6 +144,15 @@ class TwitterExtractor(Extractor):
def _extract_media(self, tweet, entities, files):
for media in entities:
+
+ if "ext_media_availability" in media:
+ ext = media["ext_media_availability"]
+ if ext.get("status") == "Unavailable":
+ self.log.warning("Media unavailable (%s - '%s')",
+ tweet["id_str"], ext.get("reason"))
+ if not self.unavailable:
+ continue
+
descr = media.get("ext_alt_text")
width = media["original_info"].get("width", 0)
height = media["original_info"].get("height", 0)
@@ -1709,11 +1719,16 @@ class TwitterAPI():
variables["cursor"] = cursor
def _handle_ratelimit(self, response):
- if self.extractor.config("ratelimit") == "abort":
+ rl = self.extractor.config("ratelimit")
+ if rl == "abort":
raise exception.StopExtraction("Rate limit exceeded")
-
- until = response.headers.get("x-rate-limit-reset")
- self.extractor.wait(until=until, seconds=None if until else 60)
+ elif rl and isinstance(rl, str) and rl.startswith("wait:"):
+ until = None
+ seconds = text.parse_float(rl.partition(":")[2]) or 60.0
+ else:
+ until = response.headers.get("x-rate-limit-reset")
+ seconds = None if until else 60.0
+ self.extractor.wait(until=until, seconds=seconds)
def _process_tombstone(self, entry, tombstone):
text = (tombstone.get("richText") or tombstone["text"])["text"]
@@ -1849,7 +1864,7 @@ def _login_impl(extr, username, password):
},
}
elif subtask == "LoginEnterAlternateIdentifierSubtask":
- alt = extr.input(
+ alt = extr.config("username_alt") or extr.input(
"Alternate Identifier (username, email, phone number): ")
data = {
"enter_text": {
@@ -1881,8 +1896,9 @@ def _login_impl(extr, username, password):
raise exception.AuthenticationError("Login requires CAPTCHA")
elif subtask == "DenyLoginSubtask":
raise exception.AuthenticationError("Login rejected as suspicious")
- elif subtask == "ArkoseLogin":
- raise exception.AuthenticationError("No auth token cookie")
+ elif subtask == "LoginSuccessSubtask":
+ raise exception.AuthenticationError(
+ "No 'auth_token' cookie received")
else:
raise exception.StopExtraction("Unrecognized subtask %s", subtask)
diff --git a/gallery_dl/extractor/vichan.py b/gallery_dl/extractor/vichan.py
index 79d7916..654c451 100644
--- a/gallery_dl/extractor/vichan.py
+++ b/gallery_dl/extractor/vichan.py
@@ -22,10 +22,6 @@ BASE_PATTERN = VichanExtractor.update({
"root": "https://8kun.top",
"pattern": r"8kun\.top",
},
- "wikieat": {
- "root": "https://wikieat.club",
- "pattern": r"wikieat\.club",
- },
"smugloli": {
"root": None,
"pattern": r"smuglo(?:\.li|li\.net)",