author     Unit 193 <unit193@unit193.net>  2025-12-20 05:49:04 -0500
committer  Unit 193 <unit193@unit193.net>  2025-12-20 05:49:04 -0500
commit     a24ec1647aeac35a63b744ea856011ad6e06be3b (patch)
tree       ae94416de786aeddd05d99559098f7f16bb103a6 /gallery_dl
parent     33f8a8a37a9cba738ef25fb99955f0730da9eb48 (diff)

New upstream version 1.31.1 (upstream/1.31.1)
Diffstat (limited to 'gallery_dl')
-rw-r--r--  gallery_dl/__init__.py | 4
-rw-r--r--  gallery_dl/actions.py | 5
-rw-r--r--  gallery_dl/cookies.py | 2
-rw-r--r--  gallery_dl/downloader/__init__.py | 2
-rw-r--r--  gallery_dl/downloader/common.py | 11
-rw-r--r--  gallery_dl/downloader/http.py | 18
-rw-r--r--  gallery_dl/downloader/ytdl.py | 295
-rw-r--r--  gallery_dl/dt.py | 115
-rw-r--r--  gallery_dl/extractor/2ch.py | 18
-rw-r--r--  gallery_dl/extractor/2chan.py | 2
-rw-r--r--  gallery_dl/extractor/2chen.py | 73
-rw-r--r--  gallery_dl/extractor/35photo.py | 2
-rw-r--r--  gallery_dl/extractor/4archive.py | 11
-rw-r--r--  gallery_dl/extractor/4chan.py | 2
-rw-r--r--  gallery_dl/extractor/4chanarchives.py | 2
-rw-r--r--  gallery_dl/extractor/500px.py | 12
-rw-r--r--  gallery_dl/extractor/8chan.py | 13
-rw-r--r--  gallery_dl/extractor/8muses.py | 5
-rw-r--r--  gallery_dl/extractor/__init__.py | 18
-rw-r--r--  gallery_dl/extractor/adultempire.py | 2
-rw-r--r--  gallery_dl/extractor/agnph.py | 10
-rw-r--r--  gallery_dl/extractor/ao3.py | 38
-rw-r--r--  gallery_dl/extractor/arcalive.py | 13
-rw-r--r--  gallery_dl/extractor/arena.py | 89
-rw-r--r--  gallery_dl/extractor/artstation.py | 7
-rw-r--r--  gallery_dl/extractor/aryion.py | 117
-rw-r--r--  gallery_dl/extractor/audiochan.py | 158
-rw-r--r--  gallery_dl/extractor/batoto.py | 17
-rw-r--r--  gallery_dl/extractor/bbc.py | 7
-rw-r--r--  gallery_dl/extractor/behance.py | 4
-rw-r--r--  gallery_dl/extractor/bellazon.py | 47
-rw-r--r--  gallery_dl/extractor/bilibili.py | 2
-rw-r--r--  gallery_dl/extractor/blogger.py | 20
-rw-r--r--  gallery_dl/extractor/bluesky.py | 45
-rw-r--r--  gallery_dl/extractor/booru.py | 2
-rw-r--r--  gallery_dl/extractor/boosty.py | 30
-rw-r--r--  gallery_dl/extractor/booth.py | 5
-rw-r--r--  gallery_dl/extractor/bunkr.py | 37
-rw-r--r--  gallery_dl/extractor/catbox.py | 4
-rw-r--r--  gallery_dl/extractor/cfake.py | 149
-rw-r--r--  gallery_dl/extractor/chevereto.py | 73
-rw-r--r--  gallery_dl/extractor/cien.py | 14
-rw-r--r--  gallery_dl/extractor/civitai.py | 138
-rw-r--r--  gallery_dl/extractor/comedywildlifephoto.py | 51
-rw-r--r--  gallery_dl/extractor/comick.py | 12
-rw-r--r--  gallery_dl/extractor/comicvine.py | 2
-rw-r--r--  gallery_dl/extractor/common.py | 47
-rw-r--r--  gallery_dl/extractor/cyberdrop.py | 20
-rw-r--r--  gallery_dl/extractor/cyberfile.py | 58
-rw-r--r--  gallery_dl/extractor/danbooru.py | 58
-rw-r--r--  gallery_dl/extractor/dankefuerslesen.py | 6
-rw-r--r--  gallery_dl/extractor/desktopography.py | 8
-rw-r--r--  gallery_dl/extractor/deviantart.py | 50
-rw-r--r--  gallery_dl/extractor/directlink.py | 2
-rw-r--r--  gallery_dl/extractor/discord.py | 18
-rw-r--r--  gallery_dl/extractor/dynastyscans.py | 22
-rw-r--r--  gallery_dl/extractor/e621.py | 40
-rw-r--r--  gallery_dl/extractor/eporner.py | 54
-rw-r--r--  gallery_dl/extractor/erome.py | 16
-rw-r--r--  gallery_dl/extractor/everia.py | 20
-rw-r--r--  gallery_dl/extractor/exhentai.py | 22
-rw-r--r--  gallery_dl/extractor/facebook.py | 49
-rw-r--r--  gallery_dl/extractor/fanbox.py | 63
-rw-r--r--  gallery_dl/extractor/fansly.py | 26
-rw-r--r--  gallery_dl/extractor/fantia.py | 4
-rw-r--r--  gallery_dl/extractor/fapachi.py | 2
-rw-r--r--  gallery_dl/extractor/fapello.py | 16
-rw-r--r--  gallery_dl/extractor/fikfap.py | 105
-rw-r--r--  gallery_dl/extractor/fitnakedgirls.py | 208
-rw-r--r--  gallery_dl/extractor/flickr.py | 43
-rw-r--r--  gallery_dl/extractor/foolfuuka.py | 10
-rw-r--r--  gallery_dl/extractor/foolslide.py | 6
-rw-r--r--  gallery_dl/extractor/furaffinity.py | 24
-rw-r--r--  gallery_dl/extractor/furry34.py | 9
-rw-r--r--  gallery_dl/extractor/gelbooru.py | 18
-rw-r--r--  gallery_dl/extractor/gelbooru_v01.py | 9
-rw-r--r--  gallery_dl/extractor/gelbooru_v02.py | 14
-rw-r--r--  gallery_dl/extractor/generic.py | 10
-rw-r--r--  gallery_dl/extractor/girlsreleased.py | 10
-rw-r--r--  gallery_dl/extractor/girlswithmuscle.py | 15
-rw-r--r--  gallery_dl/extractor/gofile.py | 14
-rw-r--r--  gallery_dl/extractor/hatenablog.py | 20
-rw-r--r--  gallery_dl/extractor/hentai2read.py | 2
-rw-r--r--  gallery_dl/extractor/hentaicosplays.py | 2
-rw-r--r--  gallery_dl/extractor/hentaifoundry.py | 37
-rw-r--r--  gallery_dl/extractor/hentaihand.py | 3
-rw-r--r--  gallery_dl/extractor/hentaihere.py | 2
-rw-r--r--  gallery_dl/extractor/hiperdex.py | 10
-rw-r--r--  gallery_dl/extractor/hitomi.py | 2
-rw-r--r--  gallery_dl/extractor/hotleak.py | 10
-rw-r--r--  gallery_dl/extractor/idolcomplex.py | 9
-rw-r--r--  gallery_dl/extractor/imagebam.py | 14
-rw-r--r--  gallery_dl/extractor/imagechest.py | 10
-rw-r--r--  gallery_dl/extractor/imagefap.py | 20
-rw-r--r--  gallery_dl/extractor/imagehosts.py | 101
-rw-r--r--  gallery_dl/extractor/imgbb.py | 5
-rw-r--r--  gallery_dl/extractor/imgbox.py | 11
-rw-r--r--  gallery_dl/extractor/imgpile.py | 2
-rw-r--r--  gallery_dl/extractor/imgth.py | 2
-rw-r--r--  gallery_dl/extractor/imgur.py | 29
-rw-r--r--  gallery_dl/extractor/imhentai.py | 9
-rw-r--r--  gallery_dl/extractor/inkbunny.py | 24
-rw-r--r--  gallery_dl/extractor/instagram.py | 88
-rw-r--r--  gallery_dl/extractor/issuu.py | 6
-rw-r--r--  gallery_dl/extractor/itaku.py | 64
-rw-r--r--  gallery_dl/extractor/itchio.py | 2
-rw-r--r--  gallery_dl/extractor/iwara.py | 15
-rw-r--r--  gallery_dl/extractor/jschan.py | 6
-rw-r--r--  gallery_dl/extractor/kabeuchi.py | 5
-rw-r--r--  gallery_dl/extractor/keenspot.py | 2
-rw-r--r--  gallery_dl/extractor/kemono.py | 102
-rw-r--r--  gallery_dl/extractor/khinsider.py | 2
-rw-r--r--  gallery_dl/extractor/komikcast.py | 10
-rw-r--r--  gallery_dl/extractor/koofr.py | 55
-rw-r--r--  gallery_dl/extractor/leakgallery.py | 10
-rw-r--r--  gallery_dl/extractor/lensdump.py | 9
-rw-r--r--  gallery_dl/extractor/lexica.py | 2
-rw-r--r--  gallery_dl/extractor/lightroom.py | 2
-rw-r--r--  gallery_dl/extractor/livedoor.py | 4
-rw-r--r--  gallery_dl/extractor/lofter.py | 4
-rw-r--r--  gallery_dl/extractor/lolisafe.py | 4
-rw-r--r--  gallery_dl/extractor/luscious.py | 6
-rw-r--r--  gallery_dl/extractor/lynxchan.py | 6
-rw-r--r--  gallery_dl/extractor/madokami.py | 5
-rw-r--r--  gallery_dl/extractor/mangadex.py | 22
-rw-r--r--  gallery_dl/extractor/mangafox.py | 8
-rw-r--r--  gallery_dl/extractor/mangahere.py | 4
-rw-r--r--  gallery_dl/extractor/manganelo.py | 16
-rw-r--r--  gallery_dl/extractor/mangapark.py | 12
-rw-r--r--  gallery_dl/extractor/mangaread.py | 4
-rw-r--r--  gallery_dl/extractor/mangataro.py | 6
-rw-r--r--  gallery_dl/extractor/mangoxo.py | 4
-rw-r--r--  gallery_dl/extractor/mastodon.py | 25
-rw-r--r--  gallery_dl/extractor/message.py | 9
-rw-r--r--  gallery_dl/extractor/misskey.py | 80
-rw-r--r--  gallery_dl/extractor/moebooru.py | 23
-rw-r--r--  gallery_dl/extractor/motherless.py | 43
-rw-r--r--  gallery_dl/extractor/myhentaigallery.py | 43
-rw-r--r--  gallery_dl/extractor/myportfolio.py | 2
-rw-r--r--  gallery_dl/extractor/naverblog.py | 11
-rw-r--r--  gallery_dl/extractor/naverchzzk.py | 12
-rw-r--r--  gallery_dl/extractor/naverwebtoon.py | 4
-rw-r--r--  gallery_dl/extractor/nekohouse.py | 12
-rw-r--r--  gallery_dl/extractor/newgrounds.py | 37
-rw-r--r--  gallery_dl/extractor/nijie.py | 31
-rw-r--r--  gallery_dl/extractor/nitter.py | 20
-rw-r--r--  gallery_dl/extractor/noop.py | 6
-rw-r--r--  gallery_dl/extractor/nozomi.py | 9
-rw-r--r--  gallery_dl/extractor/nudostar.py | 6
-rw-r--r--  gallery_dl/extractor/oauth.py | 32
-rw-r--r--  gallery_dl/extractor/okporn.py | 39
-rw-r--r--  gallery_dl/extractor/paheal.py | 7
-rw-r--r--  gallery_dl/extractor/patreon.py | 62
-rw-r--r--  gallery_dl/extractor/pexels.py | 13
-rw-r--r--  gallery_dl/extractor/philomena.py | 9
-rw-r--r--  gallery_dl/extractor/photovogue.py | 7
-rw-r--r--  gallery_dl/extractor/picarto.py | 5
-rw-r--r--  gallery_dl/extractor/picazor.py | 59
-rw-r--r--  gallery_dl/extractor/pictoa.py | 6
-rw-r--r--  gallery_dl/extractor/piczel.py | 13
-rw-r--r--  gallery_dl/extractor/pillowfort.py | 16
-rw-r--r--  gallery_dl/extractor/pinterest.py | 22
-rw-r--r--  gallery_dl/extractor/pixeldrain.py | 24
-rw-r--r--  gallery_dl/extractor/pixiv.py | 81
-rw-r--r--  gallery_dl/extractor/pixnet.py | 12
-rw-r--r--  gallery_dl/extractor/plurk.py | 13
-rw-r--r--  gallery_dl/extractor/poipiku.py | 2
-rw-r--r--  gallery_dl/extractor/poringa.py | 8
-rw-r--r--  gallery_dl/extractor/pornhub.py | 17
-rw-r--r--  gallery_dl/extractor/pornpics.py | 38
-rw-r--r--  gallery_dl/extractor/pornstarstube.py | 43
-rw-r--r--  gallery_dl/extractor/postmill.py | 28
-rw-r--r--  gallery_dl/extractor/rawkuma.py | 63
-rw-r--r--  gallery_dl/extractor/reactor.py | 14
-rw-r--r--  gallery_dl/extractor/readcomiconline.py | 4
-rw-r--r--  gallery_dl/extractor/realbooru.py | 39
-rw-r--r--  gallery_dl/extractor/recursive.py | 4
-rw-r--r--  gallery_dl/extractor/redbust.py | 186
-rw-r--r--  gallery_dl/extractor/reddit.py | 36
-rw-r--r--  gallery_dl/extractor/redgifs.py | 4
-rw-r--r--  gallery_dl/extractor/rule34us.py | 6
-rw-r--r--  gallery_dl/extractor/rule34vault.py | 9
-rw-r--r--  gallery_dl/extractor/rule34xyz.py | 9
-rw-r--r--  gallery_dl/extractor/s3ndpics.py | 8
-rw-r--r--  gallery_dl/extractor/saint.py | 8
-rw-r--r--  gallery_dl/extractor/sankaku.py | 22
-rw-r--r--  gallery_dl/extractor/sankakucomplex.py | 10
-rw-r--r--  gallery_dl/extractor/schalenetwork.py | 25
-rw-r--r--  gallery_dl/extractor/scrolller.py | 8
-rw-r--r--  gallery_dl/extractor/seiga.py | 4
-rw-r--r--  gallery_dl/extractor/sexcom.py | 54
-rw-r--r--  gallery_dl/extractor/shimmie2.py | 18
-rw-r--r--  gallery_dl/extractor/shopify.py | 6
-rw-r--r--  gallery_dl/extractor/simpcity.py | 186
-rw-r--r--  gallery_dl/extractor/simplyhentai.py | 8
-rw-r--r--  gallery_dl/extractor/sizebooru.py | 4
-rw-r--r--  gallery_dl/extractor/skeb.py | 18
-rw-r--r--  gallery_dl/extractor/slickpic.py | 6
-rw-r--r--  gallery_dl/extractor/slideshare.py | 5
-rw-r--r--  gallery_dl/extractor/smugmug.py | 8
-rw-r--r--  gallery_dl/extractor/soundgasm.py | 6
-rw-r--r--  gallery_dl/extractor/speakerdeck.py | 4
-rw-r--r--  gallery_dl/extractor/steamgriddb.py | 12
-rw-r--r--  gallery_dl/extractor/subscribestar.py | 28
-rw-r--r--  gallery_dl/extractor/sxypix.py | 39
-rw-r--r--  gallery_dl/extractor/szurubooru.py | 7
-rw-r--r--  gallery_dl/extractor/tapas.py | 10
-rw-r--r--  gallery_dl/extractor/tcbscans.py | 4
-rw-r--r--  gallery_dl/extractor/telegraph.py | 5
-rw-r--r--  gallery_dl/extractor/tenor.py | 17
-rw-r--r--  gallery_dl/extractor/thehentaiworld.py | 7
-rw-r--r--  gallery_dl/extractor/tiktok.py | 47
-rw-r--r--  gallery_dl/extractor/tmohentai.py | 2
-rw-r--r--  gallery_dl/extractor/toyhouse.py | 6
-rw-r--r--  gallery_dl/extractor/tsumino.py | 2
-rw-r--r--  gallery_dl/extractor/tumblr.py | 37
-rw-r--r--  gallery_dl/extractor/tumblrgallery.py | 6
-rw-r--r--  gallery_dl/extractor/tungsten.py | 4
-rw-r--r--  gallery_dl/extractor/twibooru.py | 13
-rw-r--r--  gallery_dl/extractor/twitter.py | 252
-rw-r--r--  gallery_dl/extractor/unsplash.py | 14
-rw-r--r--  gallery_dl/extractor/uploadir.py | 2
-rw-r--r--  gallery_dl/extractor/urlgalleries.py | 4
-rw-r--r--  gallery_dl/extractor/urlshortener.py | 2
-rw-r--r--  gallery_dl/extractor/vanillarock.py | 6
-rw-r--r--  gallery_dl/extractor/vichan.py | 6
-rw-r--r--  gallery_dl/extractor/vipergirls.py | 10
-rw-r--r--  gallery_dl/extractor/vk.py | 16
-rw-r--r--  gallery_dl/extractor/vsco.py | 24
-rw-r--r--  gallery_dl/extractor/wallhaven.py | 5
-rw-r--r--  gallery_dl/extractor/wallpapercave.py | 6
-rw-r--r--  gallery_dl/extractor/warosu.py | 4
-rw-r--r--  gallery_dl/extractor/weasyl.py | 29
-rw-r--r--  gallery_dl/extractor/webmshare.py | 4
-rw-r--r--  gallery_dl/extractor/webtoons.py | 10
-rw-r--r--  gallery_dl/extractor/weebcentral.py | 8
-rw-r--r--  gallery_dl/extractor/weebdex.py | 132
-rw-r--r--  gallery_dl/extractor/weibo.py | 12
-rw-r--r--  gallery_dl/extractor/wikiart.py | 10
-rw-r--r--  gallery_dl/extractor/wikifeet.py | 4
-rw-r--r--  gallery_dl/extractor/wikimedia.py | 98
-rw-r--r--  gallery_dl/extractor/xasiat.py | 25
-rw-r--r--  gallery_dl/extractor/xenforo.py | 348
-rw-r--r--  gallery_dl/extractor/xfolio.py | 8
-rw-r--r--  gallery_dl/extractor/xhamster.py | 8
-rw-r--r--  gallery_dl/extractor/xvideos.py | 4
-rw-r--r--  gallery_dl/extractor/yiffverse.py | 9
-rw-r--r--  gallery_dl/extractor/ytdl.py | 2
-rw-r--r--  gallery_dl/extractor/zerochan.py | 8
-rw-r--r--  gallery_dl/formatter.py | 86
-rw-r--r--  gallery_dl/job.py | 171
-rw-r--r--  gallery_dl/option.py | 18
-rw-r--r--  gallery_dl/output.py | 56
-rw-r--r--  gallery_dl/path.py | 95
-rw-r--r--  gallery_dl/postprocessor/__init__.py | 2
-rw-r--r--  gallery_dl/postprocessor/exec.py | 9
-rw-r--r--  gallery_dl/postprocessor/metadata.py | 10
-rw-r--r--  gallery_dl/postprocessor/mtime.py | 7
-rw-r--r--  gallery_dl/postprocessor/ugoira.py | 6
-rw-r--r--  gallery_dl/text.py | 65
-rw-r--r--  gallery_dl/update.py | 2
-rw-r--r--  gallery_dl/util.py | 74
-rw-r--r--  gallery_dl/version.py | 2
-rw-r--r--  gallery_dl/ytdl.py | 6
264 files changed, 4651 insertions, 2593 deletions
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py
index fdcb6d0..98f8c12 100644
--- a/gallery_dl/__init__.py
+++ b/gallery_dl/__init__.py
@@ -284,14 +284,14 @@ def main():
# unsupported file logging handler
if handler := output.setup_logging_handler(
- "unsupportedfile", fmt="{message}"):
+ "unsupportedfile", fmt="{message}", defer=True):
ulog = job.Job.ulog = logging.getLogger("unsupported")
ulog.addHandler(handler)
ulog.propagate = False
# error file logging handler
if handler := output.setup_logging_handler(
- "errorfile", fmt="{message}", mode="a"):
+ "errorfile", fmt="{message}", mode="a", defer=True):
elog = input_manager.err = logging.getLogger("errorfile")
elog.addHandler(handler)
elog.propagate = False
diff --git a/gallery_dl/actions.py b/gallery_dl/actions.py
index 971c4d9..5d2f645 100644
--- a/gallery_dl/actions.py
+++ b/gallery_dl/actions.py
@@ -148,6 +148,11 @@ class LoggerAdapter():
if cond(msg):
action(args)
+ def traceback(self, exc):
+ if self.logger.isEnabledFor(logging.DEBUG):
+ self.logger._log(
+ logging.DEBUG, "", None, exc_info=exc, extra=self.extra)
+
def _level_to_int(level):
try:
diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py
index ba719ac..26f8244 100644
--- a/gallery_dl/cookies.py
+++ b/gallery_dl/cookies.py
@@ -119,7 +119,7 @@ def load_cookies_webkit(browser_name, profile=None, domain=None):
for page_size in page_sizes:
_webkit_parse_cookies_page(p.read_bytes(page_size), cookies)
_log_info("Extracted %s cookies from %s",
- browser_name.capitalize(), len(cookies))
+ len(cookies), browser_name.capitalize())
return cookies
diff --git a/gallery_dl/downloader/__init__.py b/gallery_dl/downloader/__init__.py
index e1b936e..79dc5cb 100644
--- a/gallery_dl/downloader/__init__.py
+++ b/gallery_dl/downloader/__init__.py
@@ -27,7 +27,7 @@ def find(scheme):
scheme = "http"
if scheme in modules: # prevent unwanted imports
try:
- module = __import__(scheme, globals(), None, (), 1)
+ module = __import__(scheme, globals(), None, None, 1)
except ImportError:
pass
else:
diff --git a/gallery_dl/downloader/common.py b/gallery_dl/downloader/common.py
index 7cd8d10..66996f7 100644
--- a/gallery_dl/downloader/common.py
+++ b/gallery_dl/downloader/common.py
@@ -31,8 +31,15 @@ class DownloaderBase():
self.partdir = self.config("part-directory")
if self.partdir:
- self.partdir = util.expand_path(self.partdir)
- os.makedirs(self.partdir, exist_ok=True)
+ if isinstance(self.partdir, dict):
+ self.partdir = [
+ (util.compile_filter(expr) if expr else util.true,
+ util.expand_path(pdir))
+ for expr, pdir in self.partdir.items()
+ ]
+ else:
+ self.partdir = util.expand_path(self.partdir)
+ os.makedirs(self.partdir, exist_ok=True)
proxies = self.config("proxy", util.SENTINEL)
if proxies is util.SENTINEL:
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index 248bf70..703dcca 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -95,7 +95,7 @@ class HttpDownloader(DownloaderBase):
except Exception as exc:
if self.downloading:
output.stderr_write("\n")
- self.log.debug("", exc_info=exc)
+ self.log.traceback(exc)
raise
finally:
# remove file from incomplete downloads
@@ -230,6 +230,10 @@ class HttpDownloader(DownloaderBase):
# check file size
size = text.parse_int(size, None)
if size is not None:
+ if not size:
+ self.release_conn(response)
+ self.log.warning("Empty file")
+ return False
if self.minsize and size < self.minsize:
self.release_conn(response)
self.log.warning(
@@ -342,9 +346,15 @@ class HttpDownloader(DownloaderBase):
raise
# check file size
- if size and fp.tell() < size:
- msg = f"file size mismatch ({fp.tell()} < {size})"
- output.stderr_write("\n")
+ if size and (fsize := fp.tell()) < size:
+ if (segmented := kwdict.get("_http_segmented")) and \
+ segmented is True or segmented == fsize:
+ tries -= 1
+ msg = "Resuming segmented download"
+ output.stdout_write("\r")
+ else:
+ msg = f"file size mismatch ({fsize} < {size})"
+ output.stderr_write("\n")
continue
break
diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py
index a56a6be..e9b3294 100644
--- a/gallery_dl/downloader/ytdl.py
+++ b/gallery_dl/downloader/ytdl.py
@@ -22,9 +22,9 @@ class YoutubeDLDownloader(DownloaderBase):
DownloaderBase.__init__(self, job)
extractor = job.extractor
- retries = self.config("retries", extractor._retries)
+ self.retries = self.config("retries", extractor._retries)
self.ytdl_opts = {
- "retries": retries+1 if retries >= 0 else float("inf"),
+ "retries": self.retries+1 if self.retries >= 0 else float("inf"),
"socket_timeout": self.config("timeout", extractor._timeout),
"nocheckcertificate": not self.config("verify", extractor._verify),
"proxy": self.proxies.get("http") if self.proxies else None,
@@ -39,17 +39,25 @@ class YoutubeDLDownloader(DownloaderBase):
def download(self, url, pathfmt):
kwdict = pathfmt.kwdict
+ tries = 0
- ytdl_instance = kwdict.pop("_ytdl_instance", None)
- if not ytdl_instance:
+ if ytdl_instance := kwdict.pop("_ytdl_instance", None):
+ # 'ytdl' extractor
+ self._prepare(ytdl_instance)
+ info_dict = kwdict.pop("_ytdl_info_dict")
+ else:
+ # other extractors
ytdl_instance = self.ytdl_instance
if not ytdl_instance:
try:
module = ytdl.import_module(self.config("module"))
except (ImportError, SyntaxError) as exc:
- self.log.error("Cannot import module '%s'",
- getattr(exc, "name", ""))
- self.log.debug("", exc_info=exc)
+ if exc.__context__:
+ self.log.error("Cannot import yt-dlp or youtube-dl")
+ else:
+ self.log.error("Cannot import module '%s'",
+ getattr(exc, "name", ""))
+ self.log.traceback(exc)
self.download = lambda u, p: False
return False
@@ -63,6 +71,8 @@ class YoutubeDLDownloader(DownloaderBase):
module, self, self.ytdl_opts)
if self.outtmpl == "default":
self.outtmpl = module.DEFAULT_OUTTMPL
+ self._prepare(ytdl_instance)
+
if self.forward_cookies:
self.log.debug("Forwarding cookies to %s",
ytdl_instance.__module__)
@@ -70,45 +80,150 @@ class YoutubeDLDownloader(DownloaderBase):
for cookie in self.session.cookies:
set_cookie(cookie)
- if "__gdl_initialize" in ytdl_instance.params:
- del ytdl_instance.params["__gdl_initialize"]
+ url = url[5:]
+ manifest = kwdict.get("_ytdl_manifest")
+ while True:
+ tries += 1
+ self.error = None
+ try:
+ if manifest is None:
+ info_dict = self._extract_url(
+ ytdl_instance, url)
+ else:
+ info_dict = self._extract_manifest(
+ ytdl_instance, url, kwdict)
+ except Exception as exc:
+ self.log.traceback(exc)
+ cls = exc.__class__
+ if cls.__module__ == "builtins":
+ tries = False
+ msg = f"{cls.__name__}: {exc}"
+ else:
+ if self.error is not None:
+ msg = self.error
+ elif not info_dict:
+ msg = "Empty 'info_dict' data"
+ else:
+ break
+
+ if tries:
+ self.log.error("%s (%s/%s)", msg, tries, self.retries+1)
+ else:
+ self.log.error(msg)
+ return False
+ if tries > self.retries:
+ return False
- if self.progress is not None:
- ytdl_instance.add_progress_hook(self._progress_hook)
- if rlf := ytdl_instance.params.pop("__gdl_ratelimit_func", False):
- self.rate_dyn = rlf
+ if extra := kwdict.get("_ytdl_extra"):
+ info_dict.update(extra)
- info_dict = kwdict.pop("_ytdl_info_dict", None)
- if not info_dict:
- url = url[5:]
+ while True:
+ tries += 1
+ self.error = None
try:
- if manifest := kwdict.pop("_ytdl_manifest", None):
- info_dict = self._extract_manifest(
- ytdl_instance, url, manifest,
- kwdict.pop("_ytdl_manifest_data", None),
- kwdict.pop("_ytdl_manifest_headers", None),
- kwdict.pop("_ytdl_manifest_cookies", None))
+ if "entries" in info_dict:
+ success = self._download_playlist(
+ ytdl_instance, pathfmt, info_dict)
else:
- info_dict = self._extract_info(ytdl_instance, url)
+ success = self._download_video(
+ ytdl_instance, pathfmt, info_dict)
except Exception as exc:
- self.log.debug("", exc_info=exc)
- self.log.warning("%s: %s", exc.__class__.__name__, exc)
+ self.log.traceback(exc)
+ cls = exc.__class__
+ if cls.__module__ == "builtins":
+ tries = False
+ msg = f"{cls.__name__}: {exc}"
+ else:
+ if self.error is not None:
+ msg = self.error
+ elif not success:
+ msg = "Error"
+ else:
+ break
- if not info_dict:
+ if tries:
+ self.log.error("%s (%s/%s)", msg, tries, self.retries+1)
+ else:
+ self.log.error(msg)
return False
+ if tries > self.retries:
+ return False
+ return True
+
+ def _extract_url(self, ytdl, url):
+ return ytdl.extract_info(url, download=False)
+
+ def _extract_manifest(self, ytdl, url, kwdict):
+ extr = ytdl.get_info_extractor("Generic")
+ video_id = extr._generic_id(url)
+
+ if cookies := kwdict.get("_ytdl_manifest_cookies"):
+ if isinstance(cookies, dict):
+ cookies = cookies.items()
+ set_cookie = ytdl.cookiejar.set_cookie
+ for name, value in cookies:
+ set_cookie(Cookie(
+ 0, name, value, None, False,
+ "", False, False, "/", False,
+ False, None, False, None, None, {},
+ ))
+
+ type = kwdict["_ytdl_manifest"]
+ data = kwdict.get("_ytdl_manifest_data")
+ headers = kwdict.get("_ytdl_manifest_headers")
+ if type == "hls":
+ if data is None:
+ try:
+ fmts, subs = extr._extract_m3u8_formats_and_subtitles(
+ url, video_id, "mp4", headers=headers)
+ except AttributeError:
+ fmts = extr._extract_m3u8_formats(
+ url, video_id, "mp4", headers=headers)
+ subs = None
+ else:
+ try:
+ fmts, subs = extr._parse_m3u8_formats_and_subtitles(
+ data, url, "mp4", headers=headers)
+ except AttributeError:
+ fmts = extr._parse_m3u8_formats(
+ data, url, "mp4", headers=headers)
+ subs = None
- if "entries" in info_dict:
- index = kwdict.get("_ytdl_index")
- if index is None:
- return self._download_playlist(
- ytdl_instance, pathfmt, info_dict)
+ elif type == "dash":
+ if data is None:
+ try:
+ fmts, subs = extr._extract_mpd_formats_and_subtitles(
+ url, video_id, headers=headers)
+ except AttributeError:
+ fmts = extr._extract_mpd_formats(
+ url, video_id, headers=headers)
+ subs = None
else:
- info_dict = info_dict["entries"][index]
+ if isinstance(data, str):
+ data = ElementTree.fromstring(data)
+ try:
+ fmts, subs = extr._parse_mpd_formats_and_subtitles(
+ data, mpd_id="dash")
+ except AttributeError:
+ fmts = extr._parse_mpd_formats(
+ data, mpd_id="dash")
+ subs = None
- if extra := kwdict.get("_ytdl_extra"):
- info_dict.update(extra)
+ else:
+ raise ValueError(f"Unsupported manifest type '{type}'")
- return self._download_video(ytdl_instance, pathfmt, info_dict)
+ if headers:
+ for fmt in fmts:
+ fmt["http_headers"] = headers
+
+ info_dict = {
+ "extractor": "",
+ "id" : video_id,
+ "title" : video_id,
+ "formats" : fmts,
+ "subtitles": subs,
+ }
+ return ytdl.process_ie_result(info_dict, download=False)
def _download_video(self, ytdl_instance, pathfmt, info_dict):
if "url" in info_dict:
@@ -161,12 +276,7 @@ class YoutubeDLDownloader(DownloaderBase):
path = pathfmt.realpath.replace("%", "%%")
self._set_outtmpl(ytdl_instance, path)
- try:
- ytdl_instance.process_info(info_dict)
- except Exception as exc:
- self.log.debug("", exc_info=exc)
- return False
-
+ ytdl_instance.process_info(info_dict)
pathfmt.temppath = info_dict.get("filepath") or info_dict["_filename"]
return True
@@ -188,78 +298,20 @@ class YoutubeDLDownloader(DownloaderBase):
ytdl_instance.process_info(entry)
status = True
except Exception as exc:
- self.log.debug("", exc_info=exc)
+ self.log.traceback(exc)
self.log.error("%s: %s", exc.__class__.__name__, exc)
return status
- def _extract_info(self, ytdl, url):
- return ytdl.extract_info(url, download=False)
-
- def _extract_manifest(self, ytdl, url, manifest_type, manifest_data=None,
- headers=None, cookies=None):
- extr = ytdl.get_info_extractor("Generic")
- video_id = extr._generic_id(url)
-
- if cookies is not None:
- if isinstance(cookies, dict):
- cookies = cookies.items()
- set_cookie = ytdl.cookiejar.set_cookie
- for name, value in cookies:
- set_cookie(Cookie(
- 0, name, value, None, False,
- "", False, False, "/", False,
- False, None, False, None, None, {},
- ))
+ def _prepare(self, ytdl_instance):
+ if "__gdl_initialize" not in ytdl_instance.params:
+ return
- if manifest_type == "hls":
- if manifest_data is None:
- try:
- fmts, subs = extr._extract_m3u8_formats_and_subtitles(
- url, video_id, "mp4", headers=headers)
- except AttributeError:
- fmts = extr._extract_m3u8_formats(
- url, video_id, "mp4", headers=headers)
- subs = None
- else:
- try:
- fmts, subs = extr._parse_m3u8_formats_and_subtitles(
- url, video_id, "mp4")
- except AttributeError:
- fmts = extr._parse_m3u8_formats(url, video_id, "mp4")
- subs = None
-
- elif manifest_type == "dash":
- if manifest_data is None:
- try:
- fmts, subs = extr._extract_mpd_formats_and_subtitles(
- url, video_id, headers=headers)
- except AttributeError:
- fmts = extr._extract_mpd_formats(
- url, video_id, headers=headers)
- subs = None
- else:
- if isinstance(manifest_data, str):
- manifest_data = ElementTree.fromstring(manifest_data)
- try:
- fmts, subs = extr._parse_mpd_formats_and_subtitles(
- manifest_data, mpd_id="dash")
- except AttributeError:
- fmts = extr._parse_mpd_formats(
- manifest_data, mpd_id="dash")
- subs = None
-
- else:
- self.log.error("Unsupported manifest type '%s'", manifest_type)
- return None
-
- info_dict = {
- "extractor": "",
- "id" : video_id,
- "title" : video_id,
- "formats" : fmts,
- "subtitles": subs,
- }
- return ytdl.process_ie_result(info_dict, download=False)
+ del ytdl_instance.params["__gdl_initialize"]
+ if self.progress is not None:
+ ytdl_instance.add_progress_hook(self._progress_hook)
+ if rlf := ytdl_instance.params.pop("__gdl_ratelimit_func", False):
+ self.rate_dyn = rlf
+ ytdl_instance.params["logger"] = LoggerAdapter(self, ytdl_instance)
def _progress_hook(self, info):
if info["status"] == "downloading" and \
@@ -284,6 +336,31 @@ class YoutubeDLDownloader(DownloaderBase):
ytdl_instance.params["outtmpl"] = {"default": outtmpl}
+class LoggerAdapter():
+ __slots__ = ("obj", "log")
+
+ def __init__(self, obj, ytdl_instance):
+ self.obj = obj
+ self.log = ytdl_instance.params.get("logger")
+
+ def debug(self, msg):
+ if self.log is not None:
+ if msg[0] == "[":
+ msg = msg[msg.find("]")+2:]
+ self.log.debug(msg)
+
+ def warning(self, msg):
+ if self.log is not None:
+ if "WARNING:" in msg:
+ msg = msg[msg.find(" ")+1:]
+ self.log.warning(msg)
+
+ def error(self, msg):
+ if "ERROR:" in msg:
+ msg = msg[msg.find(" ")+1:]
+ self.obj.error = msg
+
+
def compatible_formats(formats):
"""Returns True if 'formats' are compatible for merge"""
video_ext = formats[0].get("ext")
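[Note: the new LoggerAdapter above implements the three-method logger protocol (debug/warning/error) that yt-dlp accepts via its "logger" option, stripping message tags and stashing error text in self.obj.error so the retry loops can report it. A minimal sketch of that protocol in isolation; the CaptureLogger name and example URL are illustrative, and the snippet assumes yt-dlp is installed.]

class CaptureLogger:
    # yt-dlp only requires debug(), warning() and error() methods
    # on whatever object is passed as the "logger" option
    def __init__(self):
        self.error_msg = None  # last captured "ERROR: ..." text

    def debug(self, msg):
        # yt-dlp prefixes component tags like "[download] ..."
        if msg.startswith("["):
            msg = msg[msg.find("]") + 2:]
        print("debug:", msg)

    def warning(self, msg):
        print("warning:", msg)

    def error(self, msg):
        # strip the leading "ERROR: " tag, as the adapter above does
        if "ERROR:" in msg:
            msg = msg[msg.find(" ") + 1:]
        self.error_msg = msg


if __name__ == "__main__":
    import yt_dlp

    log = CaptureLogger()
    with yt_dlp.YoutubeDL({"logger": log}) as ydl:
        try:
            ydl.extract_info("https://example.org/video", download=False)
        except Exception:
            pass
    print("captured error:", log.error_msg)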
diff --git a/gallery_dl/dt.py b/gallery_dl/dt.py
new file mode 100644
index 0000000..b37ebf3
--- /dev/null
+++ b/gallery_dl/dt.py
@@ -0,0 +1,115 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Date/Time utilities"""
+
+import sys
+import time
+from datetime import datetime, date, timedelta, timezone # noqa F401
+
+
+class NullDatetime(datetime):
+
+ def __bool__(self):
+ return False
+
+ def __str__(self):
+ return "[Invalid DateTime]"
+
+ def __format__(self, format_spec):
+ return "[Invalid DateTime]"
+
+
+NONE = NullDatetime(1, 1, 1)
+EPOCH = datetime(1970, 1, 1)
+SECOND = timedelta(0, 1)
+
+
+def normalize(dt):
+ # if (o := dt.utcoffset()) is not None:
+ # return dt.replace(tzinfo=None, microsecond=0) - o
+ if dt.tzinfo is not None:
+ return dt.astimezone(timezone.utc).replace(tzinfo=None, microsecond=0)
+ if dt.microsecond:
+ return dt.replace(microsecond=0)
+ return dt
+
+
+def convert(value):
+ """Convert 'value' to a naive UTC datetime object"""
+ if not value:
+ return NONE
+ if isinstance(value, datetime):
+ return normalize(value)
+ if isinstance(value, str) and (dt := parse_iso(value)) is not NONE:
+ return dt
+ return parse_ts(value)
+
+
+def parse(dt_string, format):
+ """Parse 'dt_string' according to 'format'"""
+ try:
+ return normalize(datetime.strptime(dt_string, format))
+ except Exception:
+ return NONE
+
+
+if sys.hexversion < 0x30c0000:
+ # Python <= 3.11
+ def parse_iso(dt_string):
+ """Parse 'dt_string' as ISO 8601 value"""
+ try:
+ if dt_string[-1] == "Z":
+ # compat for Python < 3.11
+ dt_string = dt_string[:-1]
+ elif dt_string[-5] in "+-":
+ # compat for Python < 3.11
+ dt_string = f"{dt_string[:-2]}:{dt_string[-2:]}"
+ return normalize(datetime.fromisoformat(dt_string))
+ except Exception:
+ return NONE
+
+ from_ts = datetime.utcfromtimestamp
+ now = datetime.utcnow
+
+else:
+ # Python >= 3.12
+ def parse_iso(dt_string):
+ """Parse 'dt_string' as ISO 8601 value"""
+ try:
+ return normalize(datetime.fromisoformat(dt_string))
+ except Exception:
+ return NONE
+
+ def from_ts(ts=None):
+ """Convert Unix timestamp to naive UTC datetime"""
+ Y, m, d, H, M, S, _, _, _ = time.gmtime(ts)
+ return datetime(Y, m, d, H, M, S)
+
+ now = from_ts
+
+
+def parse_ts(ts, default=NONE):
+ """Create a datetime object from a Unix timestamp"""
+ try:
+ return from_ts(int(ts))
+ except Exception:
+ return default
+
+
+def to_ts(dt):
+ """Convert naive UTC datetime to Unix timestamp"""
+ return (dt - EPOCH) / SECOND
+
+
+def to_ts_string(dt):
+ """Convert naive UTC datetime to Unix timestamp string"""
+ try:
+ return str((dt - EPOCH) // SECOND)
+ except Exception:
+ return ""
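[Note: the new gallery_dl/dt.py module above normalizes every datetime to naive UTC and returns a falsy NONE sentinel instead of raising on bad input. A short usage sketch based solely on the functions defined in the diff, assuming gallery_dl 1.31.1 is importable.]

from gallery_dl import dt

# ISO 8601 strings (with "Z" or numeric offsets) become naive UTC datetimes
d = dt.parse_iso("2025-12-20T05:49:04-05:00")
print(d)                   # 2025-12-20 10:49:04

# Unix timestamps round-trip through parse_ts() / to_ts_string()
d = dt.parse_ts("0")
print(d)                   # 1970-01-01 00:00:00
print(dt.to_ts_string(d))  # 0

# invalid input yields the falsy NONE sentinel instead of an exception
bad = dt.parse("not a date", "%Y-%m-%d")
print(bool(bad), bad)      # False [Invalid DateTime]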
diff --git a/gallery_dl/extractor/2ch.py b/gallery_dl/extractor/2ch.py
index 912a251..1f17c99 100644
--- a/gallery_dl/extractor/2ch.py
+++ b/gallery_dl/extractor/2ch.py
@@ -4,28 +4,28 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://2ch.su/"""
+"""Extractors for https://2ch.org/"""
from .common import Extractor, Message
from .. import text, util
-BASE_PATTERN = r"(?:https?://)?2ch\.(su|life|hk)"
+BASE_PATTERN = r"(?:https?://)?2ch\.(org|su|life|hk)"
class _2chThreadExtractor(Extractor):
"""Extractor for 2ch threads"""
category = "2ch"
subcategory = "thread"
- root = "https://2ch.su"
+ root = "https://2ch.org"
directory_fmt = ("{category}", "{board}", "{thread} {title}")
filename_fmt = "{tim}{filename:? //}.{extension}"
archive_fmt = "{board}_{thread}_{tim}"
pattern = rf"{BASE_PATTERN}/([^/?#]+)/res/(\d+)"
- example = "https://2ch.su/a/res/12345.html"
+ example = "https://2ch.org/a/res/12345.html"
def __init__(self, match):
tld = match[1]
- self.root = f"https://2ch.{'su' if tld == 'hk' else tld}"
+ self.root = f"https://2ch.{'org' if tld == 'hk' else tld}"
Extractor.__init__(self, match)
def items(self):
@@ -42,11 +42,11 @@ class _2chThreadExtractor(Extractor):
"title" : text.unescape(title)[:50],
}
- yield Message.Directory, thread
+ yield Message.Directory, "", thread
for post in posts:
if files := post.get("files"):
post["post_name"] = post["name"]
- post["date"] = text.parse_timestamp(post["timestamp"])
+ post["date"] = self.parse_timestamp(post["timestamp"])
del post["files"]
del post["name"]
@@ -65,9 +65,9 @@ class _2chBoardExtractor(Extractor):
"""Extractor for 2ch boards"""
category = "2ch"
subcategory = "board"
- root = "https://2ch.su"
+ root = "https://2ch.org"
pattern = rf"{BASE_PATTERN}/([^/?#]+)/?$"
- example = "https://2ch.su/a/"
+ example = "https://2ch.org/a/"
def __init__(self, match):
tld = match[1]
diff --git a/gallery_dl/extractor/2chan.py b/gallery_dl/extractor/2chan.py
index 9927b5a..0e250c9 100644
--- a/gallery_dl/extractor/2chan.py
+++ b/gallery_dl/extractor/2chan.py
@@ -31,7 +31,7 @@ class _2chanThreadExtractor(Extractor):
f"/{self.board}/res/{self.thread}.htm")
page = self.request(url).text
data = self.metadata(page)
- yield Message.Directory, data
+ yield Message.Directory, "", data
for post in self.posts(page):
if "filename" not in post:
continue
diff --git a/gallery_dl/extractor/2chen.py b/gallery_dl/extractor/2chen.py
index ee3510c..4456fd6 100644
--- a/gallery_dl/extractor/2chen.py
+++ b/gallery_dl/extractor/2chen.py
@@ -1,40 +1,55 @@
# -*- coding: utf-8 -*-
+# Copyright 2022-2025 Mike Fährmann
+#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://sturdychan.help/"""
+"""Extractors for 2chen boards"""
-from .common import Extractor, Message
+from .common import BaseExtractor, Message
from .. import text
-BASE_PATTERN = r"(?:https?://)?(?:sturdychan.help|2chen\.(?:moe|club))"
+class _2chenExtractor(BaseExtractor):
+ basecategory = "2chen"
-class _2chenThreadExtractor(Extractor):
+
+BASE_PATTERN = _2chenExtractor.update({
+ "sturdychan": {
+ "root": "https://sturdychan.help",
+ "pattern": r"(?:sturdychan\.help|2chen\.(?:moe|club))",
+ },
+ "schan": {
+ "root": "https://schan.help/",
+ "pattern": r"schan\.help",
+ },
+})
+
+
+class _2chenThreadExtractor(_2chenExtractor):
"""Extractor for 2chen threads"""
- category = "2chen"
subcategory = "thread"
- root = "https://sturdychan.help"
directory_fmt = ("{category}", "{board}", "{thread} {title}")
filename_fmt = "{time} {filename}.{extension}"
- archive_fmt = "{board}_{thread}_{hash}_{time}"
- pattern = BASE_PATTERN + r"/([^/?#]+)/(\d+)"
+ archive_fmt = "{board}_{thread}_{no}_{time}"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/(\d+)"
example = "https://sturdychan.help/a/12345/"
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.board, self.thread = match.groups()
-
def items(self):
- url = f"{self.root}/{self.board}/{self.thread}"
+ board = self.groups[-2]
+ thread = self.kwdict["thread"] = self.groups[-1]
+ url = f"{self.root}/{board}/{thread}"
page = self.request(url, encoding="utf-8", notfound="thread").text
- data = self.metadata(page)
- yield Message.Directory, data
- for post in self.posts(page):
+ self.kwdict["board"], pos = text.extract(
+ page, 'class="board">/', '/<')
+ self.kwdict["title"] = text.unescape(text.extract(
+ page, "<h3>", "</h3>", pos)[0])
+ yield Message.Directory, "", {}
+ for post in self.posts(page):
url = post["url"]
if not url:
continue
@@ -42,20 +57,10 @@ class _2chenThreadExtractor(Extractor):
url = self.root + url
post["url"] = url = url.partition("?")[0]
- post.update(data)
post["time"] = text.parse_int(post["date"].timestamp())
yield Message.Url, url, text.nameext_from_url(
post["filename"], post)
- def metadata(self, page):
- board, pos = text.extract(page, 'class="board">/', '/<')
- title = text.extract(page, "<h3>", "</h3>", pos)[0]
- return {
- "board" : board,
- "thread": self.thread,
- "title" : text.unescape(title),
- }
-
def posts(self, page):
"""Return iterable with relevant posts"""
return map(self.parse, text.extract_iter(
@@ -65,31 +70,25 @@ class _2chenThreadExtractor(Extractor):
extr = text.extract_from(post)
return {
"name" : text.unescape(extr("<span>", "</span>")),
- "date" : text.parse_datetime(
+ "date" : self.parse_datetime(
extr("<time", "<").partition(">")[2],
"%d %b %Y (%a) %H:%M:%S"
),
"no" : extr('href="#p', '"'),
- "url" : extr('</a><a href="', '"'),
"filename": text.unescape(extr('download="', '"')),
+ "url" : text.extr(extr("<figure>", "</"), 'href="', '"'),
"hash" : extr('data-hash="', '"'),
}
-class _2chenBoardExtractor(Extractor):
+class _2chenBoardExtractor(_2chenExtractor):
"""Extractor for 2chen boards"""
- category = "2chen"
subcategory = "board"
- root = "https://sturdychan.help"
- pattern = BASE_PATTERN + r"/([^/?#]+)(?:/catalog|/?$)"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)(?:/catalog|/?$)"
example = "https://sturdychan.help/a/"
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.board = match[1]
-
def items(self):
- url = f"{self.root}/{self.board}/catalog"
+ url = f"{self.root}/{self.groups[-1]}/catalog"
page = self.request(url, notfound="board").text
data = {"_extractor": _2chenThreadExtractor}
for thread in text.extract_iter(
diff --git a/gallery_dl/extractor/35photo.py b/gallery_dl/extractor/35photo.py
index ec5f0cb..154295e 100644
--- a/gallery_dl/extractor/35photo.py
+++ b/gallery_dl/extractor/35photo.py
@@ -29,7 +29,7 @@ class _35photoExtractor(Extractor):
url = photo["url"]
if first:
first = False
- yield Message.Directory, photo
+ yield Message.Directory, "", photo
yield Message.Url, url, text.nameext_from_url(url, photo)
def metadata(self):
diff --git a/gallery_dl/extractor/4archive.py b/gallery_dl/extractor/4archive.py
index 4c43464..a6dedde 100644
--- a/gallery_dl/extractor/4archive.py
+++ b/gallery_dl/extractor/4archive.py
@@ -7,7 +7,7 @@
"""Extractors for https://4archive.org/"""
from .common import Extractor, Message
-from .. import text, util
+from .. import text, dt
class _4archiveThreadExtractor(Extractor):
@@ -37,8 +37,8 @@ class _4archiveThreadExtractor(Extractor):
for post in posts:
post.update(data)
- post["time"] = int(util.datetime_to_timestamp(post["date"]))
- yield Message.Directory, post
+ post["time"] = int(dt.to_ts(post["date"]))
+ yield Message.Directory, "", post
if "url" in post:
yield Message.Url, post["url"], text.nameext_from_url(
post["filename"], post)
@@ -61,10 +61,9 @@ class _4archiveThreadExtractor(Extractor):
extr = text.extract_from(post)
data = {
"name": extr('class="name">', "</span>"),
- "date": text.parse_datetime(
+ "date": self.parse_datetime_iso(
(extr('class="dateTime">', "<") or
- extr('class="dateTime postNum" >', "<")).strip(),
- "%Y-%m-%d %H:%M:%S"),
+ extr('class="dateTime postNum" >', "<")).strip()),
"no" : text.parse_int(extr(">Post No.", "<")),
}
if 'class="file"' in post:
diff --git a/gallery_dl/extractor/4chan.py b/gallery_dl/extractor/4chan.py
index d81f305..ba24899 100644
--- a/gallery_dl/extractor/4chan.py
+++ b/gallery_dl/extractor/4chan.py
@@ -38,7 +38,7 @@ class _4chanThreadExtractor(Extractor):
"title" : text.unescape(title)[:50],
}
- yield Message.Directory, data
+ yield Message.Directory, "", data
for post in posts:
if "filename" in post:
post.update(data)
diff --git a/gallery_dl/extractor/4chanarchives.py b/gallery_dl/extractor/4chanarchives.py
index c187b41..16f4b39 100644
--- a/gallery_dl/extractor/4chanarchives.py
+++ b/gallery_dl/extractor/4chanarchives.py
@@ -40,7 +40,7 @@ class _4chanarchivesThreadExtractor(Extractor):
for post in posts:
post.update(data)
- yield Message.Directory, post
+ yield Message.Directory, "", post
if "url" in post:
yield Message.Url, post["url"], post
diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py
index d1ac503..b74bc90 100644
--- a/gallery_dl/extractor/500px.py
+++ b/gallery_dl/extractor/500px.py
@@ -31,7 +31,7 @@ class _500pxExtractor(Extractor):
photo["extension"] = photo["image_format"]
if data:
photo.update(data)
- yield Message.Directory, photo
+ yield Message.Directory, "", photo
yield Message.Url, url, photo
def metadata(self):
@@ -92,7 +92,7 @@ class _500pxExtractor(Extractor):
class _500pxUserExtractor(_500pxExtractor):
"""Extractor for photos from a user's photostream on 500px.com"""
subcategory = "user"
- pattern = BASE_PATTERN + r"/(?!photo/|liked)(?:p/)?([^/?#]+)/?(?:$|[?#])"
+ pattern = rf"{BASE_PATTERN}/(?!photo/|liked)(?:p/)?([^/?#]+)/?(?:$|[?#])"
example = "https://500px.com/USER"
def __init__(self, match):
@@ -121,8 +121,8 @@ class _500pxGalleryExtractor(_500pxExtractor):
"""Extractor for photo galleries on 500px.com"""
subcategory = "gallery"
directory_fmt = ("{category}", "{user[username]}", "{gallery[name]}")
- pattern = (BASE_PATTERN + r"/(?!photo/)(?:p/)?"
- r"([^/?#]+)/galleries/([^/?#]+)")
+ pattern = (rf"{BASE_PATTERN}/(?!photo/)(?:p/)?"
+ rf"([^/?#]+)/galleries/([^/?#]+)")
example = "https://500px.com/USER/galleries/GALLERY"
def __init__(self, match):
@@ -178,7 +178,7 @@ class _500pxGalleryExtractor(_500pxExtractor):
class _500pxFavoriteExtractor(_500pxExtractor):
"""Extractor for favorite 500px photos"""
subcategory = "favorite"
- pattern = BASE_PATTERN + r"/liked/?$"
+ pattern = rf"{BASE_PATTERN}/liked/?$"
example = "https://500px.com/liked"
def photos(self):
@@ -202,7 +202,7 @@ class _500pxFavoriteExtractor(_500pxExtractor):
class _500pxImageExtractor(_500pxExtractor):
"""Extractor for individual images from 500px.com"""
subcategory = "image"
- pattern = BASE_PATTERN + r"/photo/(\d+)"
+ pattern = rf"{BASE_PATTERN}/photo/(\d+)"
example = "https://500px.com/photo/12345/TITLE"
def __init__(self, match):
diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py
index 0385067..3230182 100644
--- a/gallery_dl/extractor/8chan.py
+++ b/gallery_dl/extractor/8chan.py
@@ -9,9 +9,8 @@
"""Extractors for https://8chan.moe/"""
from .common import Extractor, Message
-from .. import text, util
+from .. import text, dt
from ..cache import memcache
-from datetime import timedelta
import itertools
BASE_PATTERN = r"(?:https?://)?8chan\.(moe|se|cc)"
@@ -44,7 +43,7 @@ class _8chanExtractor(Extractor):
def cookies_prepare(self):
# fetch captcha cookies
# (necessary to download without getting interrupted)
- now = util.datetime_utcnow()
+ now = dt.now()
url = self.root + "/captcha.js"
params = {"d": now.strftime("%a %b %d %Y %H:%M:%S GMT+0000 (UTC)")}
self.request(url, params=params).content
@@ -57,7 +56,7 @@ class _8chanExtractor(Extractor):
if cookie.domain.endswith(domain):
cookie.expires = None
if cookie.name == "captchaexpiration":
- cookie.value = (now + timedelta(30, 300)).strftime(
+ cookie.value = (now + dt.timedelta(30, 300)).strftime(
"%a, %d %b %Y %H:%M:%S GMT")
return self.cookies
@@ -70,7 +69,7 @@ class _8chanThreadExtractor(_8chanExtractor):
"{threadId} {subject[:50]}")
filename_fmt = "{postId}{num:?-//} {filename[:200]}.{extension}"
archive_fmt = "{boardUri}_{postId}_{num}"
- pattern = BASE_PATTERN + r"/([^/?#]+)/(?:res|last)/(\d+)"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/(?:res|last)/(\d+)"
example = "https://8chan.moe/a/res/12345.html"
def items(self):
@@ -92,7 +91,7 @@ class _8chanThreadExtractor(_8chanExtractor):
# download files
posts = thread.pop("posts", ())
- yield Message.Directory, thread
+ yield Message.Directory, "", thread
for post in itertools.chain((thread,), posts):
files = post.pop("files", ())
if not files:
@@ -108,7 +107,7 @@ class _8chanThreadExtractor(_8chanExtractor):
class _8chanBoardExtractor(_8chanExtractor):
"""Extractor for 8chan boards"""
subcategory = "board"
- pattern = BASE_PATTERN + r"/([^/?#]+)/(?:(\d+)\.html)?$"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/(?:(\d+)\.html)?$"
example = "https://8chan.moe/a/"
def items(self):
diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py
index 120cd8a..a8d8b44 100644
--- a/gallery_dl/extractor/8muses.py
+++ b/gallery_dl/extractor/8muses.py
@@ -40,7 +40,7 @@ class _8musesAlbumExtractor(Extractor):
if images := data.get("pictures"):
count = len(images)
album = self._make_album(data["album"])
- yield Message.Directory, {"album": album, "count": count}
+ yield Message.Directory, "", {"album": album, "count": count}
for num, image in enumerate(images, 1):
url = self.root + "/image/fl/" + image["publicUri"]
img = {
@@ -85,8 +85,7 @@ class _8musesAlbumExtractor(Extractor):
"parent" : text.parse_int(album["parentId"]),
"views" : text.parse_int(album["numberViews"]),
"likes" : text.parse_int(album["numberLikes"]),
- "date" : text.parse_datetime(
- album["updatedAt"], "%Y-%m-%dT%H:%M:%S.%fZ"),
+ "date" : self.parse_datetime_iso(album["updatedAt"]),
}
def _unobfuscate(self, data):
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index c7e33c8..64134d0 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -26,8 +26,10 @@ modules = [
"ao3",
"arcalive",
"architizer",
+ "arena",
"artstation",
"aryion",
+ "audiochan",
"batoto",
"bbc",
"behance",
@@ -39,9 +41,11 @@ modules = [
"booth",
"bunkr",
"catbox",
+ "cfake",
"chevereto",
"cien",
"civitai",
+ "comedywildlifephoto",
"comick",
"comicvine",
"cyberdrop",
@@ -54,6 +58,7 @@ modules = [
"discord",
"dynastyscans",
"e621",
+ "eporner",
"erome",
"everia",
"exhentai",
@@ -63,6 +68,8 @@ modules = [
"fantia",
"fapello",
"fapachi",
+ "fikfap",
+ "fitnakedgirls",
"flickr",
"furaffinity",
"furry34",
@@ -106,6 +113,7 @@ modules = [
"kemono",
"khinsider",
"komikcast",
+ "koofr",
"leakgallery",
"lensdump",
"lexica",
@@ -140,12 +148,14 @@ modules = [
"nozomi",
"nsfwalbum",
"nudostar",
+ "okporn",
"paheal",
"patreon",
"pexels",
"philomena",
"photovogue",
"picarto",
+ "picazor",
"pictoa",
"piczel",
"pillowfort",
@@ -158,12 +168,12 @@ modules = [
"poringa",
"pornhub",
"pornpics",
+ "pornstarstube",
"postmill",
"rawkuma",
"reactor",
"readcomiconline",
"realbooru",
- "redbust",
"reddit",
"redgifs",
"rule34us",
@@ -179,7 +189,6 @@ modules = [
"senmanga",
"sexcom",
"shimmie2",
- "simpcity",
"simplyhentai",
"sizebooru",
"skeb",
@@ -190,6 +199,7 @@ modules = [
"speakerdeck",
"steamgriddb",
"subscribestar",
+ "sxypix",
"szurubooru",
"tapas",
"tcbscans",
@@ -221,11 +231,13 @@ modules = [
"webmshare",
"webtoons",
"weebcentral",
+ "weebdex",
"weibo",
"wikiart",
"wikifeet",
"wikimedia",
"xasiat",
+ "xenforo",
"xfolio",
"xhamster",
"xvideos",
@@ -299,7 +311,7 @@ def _list_classes():
def _modules_internal():
globals_ = globals()
for module_name in modules:
- yield __import__(module_name, globals_, None, (), 1)
+ yield __import__(module_name, globals_, None, None, 1)
def _modules_path(path, files):
diff --git a/gallery_dl/extractor/adultempire.py b/gallery_dl/extractor/adultempire.py
index 3249ae6..e9adf97 100644
--- a/gallery_dl/extractor/adultempire.py
+++ b/gallery_dl/extractor/adultempire.py
@@ -33,7 +33,7 @@ class AdultempireGalleryExtractor(GalleryExtractor):
"gallery_id": text.parse_int(self.gallery_id),
"title" : text.unescape(extr('title="', '"')),
"studio" : extr(">studio</small>", "<").strip(),
- "date" : text.parse_datetime(extr(
+ "date" : self.parse_datetime(extr(
">released</small>", "<").strip(), "%m/%d/%Y"),
"actors" : sorted(text.split_html(extr(
'<ul class="item-details item-cast-list ', '</ul>'))[1:]),
diff --git a/gallery_dl/extractor/agnph.py b/gallery_dl/extractor/agnph.py
index 5bb1835..55b17c7 100644
--- a/gallery_dl/extractor/agnph.py
+++ b/gallery_dl/extractor/agnph.py
@@ -9,7 +9,7 @@
"""Extractors for https://agn.ph/"""
from . import booru
-from .. import text, util
+from .. import text
import collections
BASE_PATTERN = r"(?:https?://)?agn\.ph"
@@ -33,7 +33,7 @@ class AgnphExtractor(booru.BooruExtractor):
self.cookies.set("confirmed_age", "true", domain="agn.ph")
def _prepare(self, post):
- post["date"] = text.parse_timestamp(post["created_at"])
+ post["date"] = self.parse_timestamp(post["created_at"])
post["status"] = post["status"].strip()
post["has_children"] = ("true" in post["has_children"])
@@ -70,7 +70,7 @@ class AgnphExtractor(booru.BooruExtractor):
return
tags = collections.defaultdict(list)
- pattern = util.re(r'class="(.)typetag">([^<]+)')
+ pattern = text.re(r'class="(.)typetag">([^<]+)')
for tag_type, tag_name in pattern.findall(tag_container):
tags[tag_type].append(text.unquote(tag_name).replace(" ", "_"))
for key, value in tags.items():
@@ -81,7 +81,7 @@ class AgnphTagExtractor(AgnphExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
- pattern = BASE_PATTERN + r"/gallery/post/(?:\?([^#]+))?$"
+ pattern = rf"{BASE_PATTERN}/gallery/post/(?:\?([^#]+))?$"
example = "https://agn.ph/gallery/post/?search=TAG"
def __init__(self, match):
@@ -99,7 +99,7 @@ class AgnphTagExtractor(AgnphExtractor):
class AgnphPostExtractor(AgnphExtractor):
subcategory = "post"
archive_fmt = "{id}"
- pattern = BASE_PATTERN + r"/gallery/post/show/(\d+)"
+ pattern = rf"{BASE_PATTERN}/gallery/post/show/(\d+)"
example = "https://agn.ph/gallery/post/show/12345/"
def posts(self):
diff --git a/gallery_dl/extractor/ao3.py b/gallery_dl/extractor/ao3.py
index 60380c4..716492e 100644
--- a/gallery_dl/extractor/ao3.py
+++ b/gallery_dl/extractor/ao3.py
@@ -118,7 +118,7 @@ class Ao3WorkExtractor(Ao3Extractor):
directory_fmt = ("{category}", "{author}")
filename_fmt = "{id} {title}.{extension}"
archive_fmt = "{id}.{extension}"
- pattern = BASE_PATTERN + r"/works/(\d+)"
+ pattern = rf"{BASE_PATTERN}/works/(\d+)"
example = "https://archiveofourown.org/works/12345"
def _init(self):
@@ -182,11 +182,11 @@ class Ao3WorkExtractor(Ao3Extractor):
extr('<dd class="freeform tags">', "</dd>")),
"lang" : extr('<dd class="language" lang="', '"'),
"series" : extr('<dd class="series">', "</dd>"),
- "date" : text.parse_datetime(
- extr('<dd class="published">', "<"), "%Y-%m-%d"),
- "date_completed": text.parse_datetime(
- extr('>Completed:</dt><dd class="status">', "<"), "%Y-%m-%d"),
- "date_updated" : text.parse_timestamp(
+ "date" : self.parse_datetime_iso(extr(
+ '<dd class="published">', "<")),
+ "date_completed": self.parse_datetime_iso(extr(
+ '>Completed:</dt><dd class="status">', "<")),
+ "date_updated" : self.parse_timestamp(
path.rpartition("updated_at=")[2]),
"words" : text.parse_int(
extr('<dd class="words">', "<").replace(",", "")),
@@ -220,7 +220,7 @@ class Ao3WorkExtractor(Ao3Extractor):
else:
data["series"] = None
- yield Message.Directory, data
+ yield Message.Directory, "", data
for fmt in self.formats:
try:
url = text.urljoin(self.root, fmts[fmt])
@@ -233,28 +233,28 @@ class Ao3WorkExtractor(Ao3Extractor):
class Ao3SeriesExtractor(Ao3Extractor):
"""Extractor for AO3 works of a series"""
subcategory = "series"
- pattern = BASE_PATTERN + r"(/series/(\d+))"
+ pattern = rf"{BASE_PATTERN}(/series/(\d+))"
example = "https://archiveofourown.org/series/12345"
class Ao3TagExtractor(Ao3Extractor):
"""Extractor for AO3 works by tag"""
subcategory = "tag"
- pattern = BASE_PATTERN + r"(/tags/([^/?#]+)/works(?:/?\?.+)?)"
+ pattern = rf"{BASE_PATTERN}(/tags/([^/?#]+)/works(?:/?\?.+)?)"
example = "https://archiveofourown.org/tags/TAG/works"
class Ao3SearchExtractor(Ao3Extractor):
"""Extractor for AO3 search results"""
subcategory = "search"
- pattern = BASE_PATTERN + r"(/works/search/?\?.+)"
+ pattern = rf"{BASE_PATTERN}(/works/search/?\?.+)"
example = "https://archiveofourown.org/works/search?work_search[query]=air"
class Ao3UserExtractor(Dispatch, Ao3Extractor):
"""Extractor for an AO3 user profile"""
- pattern = (BASE_PATTERN + r"/users/([^/?#]+(?:/pseuds/[^/?#]+)?)"
- r"(?:/profile)?/?(?:$|\?|#)")
+ pattern = (rf"{BASE_PATTERN}/users/([^/?#]+(?:/pseuds/[^/?#]+)?)"
+ rf"(?:/profile)?/?(?:$|\?|#)")
example = "https://archiveofourown.org/users/USER"
def items(self):
@@ -269,16 +269,16 @@ class Ao3UserExtractor(Dispatch, Ao3Extractor):
class Ao3UserWorksExtractor(Ao3Extractor):
"""Extractor for works of an AO3 user"""
subcategory = "user-works"
- pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
- r"works(?:/?\?.+)?)")
+ pattern = (rf"{BASE_PATTERN}(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
+ rf"works(?:/?\?.+)?)")
example = "https://archiveofourown.org/users/USER/works"
class Ao3UserSeriesExtractor(Ao3Extractor):
"""Extractor for series of an AO3 user"""
subcategory = "user-series"
- pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
- r"series(?:/?\?.+)?)")
+ pattern = (rf"{BASE_PATTERN}(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
+ rf"series(?:/?\?.+)?)")
example = "https://archiveofourown.org/users/USER/series"
def items(self):
@@ -297,8 +297,8 @@ class Ao3UserSeriesExtractor(Ao3Extractor):
class Ao3UserBookmarkExtractor(Ao3Extractor):
"""Extractor for bookmarked works of an AO3 user"""
subcategory = "user-bookmark"
- pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
- r"bookmarks(?:/?\?.+)?)")
+ pattern = (rf"{BASE_PATTERN}(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
+ rf"bookmarks(?:/?\?.+)?)")
example = "https://archiveofourown.org/users/USER/bookmarks"
def items(self):
@@ -308,7 +308,7 @@ class Ao3UserBookmarkExtractor(Ao3Extractor):
class Ao3SubscriptionsExtractor(Ao3Extractor):
"""Extractor for your AO3 account's subscriptions"""
subcategory = "subscriptions"
- pattern = BASE_PATTERN + r"(/users/([^/?#]+)/subscriptions(?:/?\?.+)?)"
+ pattern = rf"{BASE_PATTERN}(/users/([^/?#]+)/subscriptions(?:/?\?.+)?)"
example = "https://archiveofourown.org/users/USER/subscriptions"
def items(self):
diff --git a/gallery_dl/extractor/arcalive.py b/gallery_dl/extractor/arcalive.py
index 1df7e0f..f950d14 100644
--- a/gallery_dl/extractor/arcalive.py
+++ b/gallery_dl/extractor/arcalive.py
@@ -36,7 +36,7 @@ class ArcalivePostExtractor(ArcaliveExtractor):
directory_fmt = ("{category}", "{boardSlug}")
filename_fmt = "{id}_{num}{title:? //[b:230]}.{extension}"
archive_fmt = "{id}_{num}"
- pattern = BASE_PATTERN + r"/b/(?:\w+)/(\d+)"
+ pattern = rf"{BASE_PATTERN}/b/(?:\w+)/(\d+)"
example = "https://arca.live/b/breaking/123456789"
def items(self):
@@ -49,13 +49,12 @@ class ArcalivePostExtractor(ArcaliveExtractor):
files = self._extract_files(post)
post["count"] = len(files)
- post["date"] = text.parse_datetime(
- post["createdAt"][:19], "%Y-%m-%dT%H:%M:%S")
+ post["date"] = self.parse_datetime_iso(post["createdAt"][:19])
post["post_url"] = post_url = \
f"{self.root}/b/{post['boardSlug']}/{post['id']}"
post["_http_headers"] = {"Referer": post_url + "?p=1"}
- yield Message.Directory, post
+ yield Message.Directory, "", post
for post["num"], file in enumerate(files, 1):
post.update(file)
url = file["url"]
@@ -64,7 +63,7 @@ class ArcalivePostExtractor(ArcaliveExtractor):
def _extract_files(self, post):
files = []
- for video, media in util.re(r"<(?:img|vide(o)) ([^>]+)").findall(
+ for video, media in text.re(r"<(?:img|vide(o)) ([^>]+)").findall(
post["content"]):
if not self.emoticons and 'class="arca-emoticon"' in media:
continue
@@ -116,7 +115,7 @@ class ArcalivePostExtractor(ArcaliveExtractor):
class ArcaliveBoardExtractor(ArcaliveExtractor):
"""Extractor for an arca.live board's posts"""
subcategory = "board"
- pattern = BASE_PATTERN + r"/b/([^/?#]+)/?(?:\?([^#]+))?$"
+ pattern = rf"{BASE_PATTERN}/b/([^/?#]+)/?(?:\?([^#]+))?$"
example = "https://arca.live/b/breaking"
def articles(self):
@@ -128,7 +127,7 @@ class ArcaliveBoardExtractor(ArcaliveExtractor):
class ArcaliveUserExtractor(ArcaliveExtractor):
"""Extractor for an arca.live users's posts"""
subcategory = "user"
- pattern = BASE_PATTERN + r"/u/@([^/?#]+)/?(?:\?([^#]+))?$"
+ pattern = rf"{BASE_PATTERN}/u/@([^/?#]+)/?(?:\?([^#]+))?$"
example = "https://arca.live/u/@USER"
def articles(self):
diff --git a/gallery_dl/extractor/arena.py b/gallery_dl/extractor/arena.py
new file mode 100644
index 0000000..ada2fa1
--- /dev/null
+++ b/gallery_dl/extractor/arena.py
@@ -0,0 +1,89 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractor for https://are.na/"""
+
+from .common import GalleryExtractor
+
+
+class ArenaChannelExtractor(GalleryExtractor):
+ """Extractor for are.na channels"""
+ category = "arena"
+ subcategory = "channel"
+ root = "https://are.na"
+ directory_fmt = ("{category}", "{user[full_name]} ({user[id]})",
+ "{channel[title]} ({channel[id]})")
+ filename_fmt = "{num:>03}{block[id]:? //}.{extension}"
+ archive_fmt = "{channel[id]}/{block[id]}"
+ pattern = r"(?:https?://)?(?:www\.)?are\.na/[^/?#]+/([^/?#]+)"
+ example = "https://are.na/evan-collins-1522646491/cassette-futurism"
+
+ def metadata(self, page):
+ channel = self.request_json(
+ f"https://api.are.na/v2/channels/{self.groups[0]}")
+
+ channel["date"] = self.parse_datetime_iso(
+ channel["created_at"])
+ channel["date_updated"] = self.parse_datetime_iso(
+ channel["updated_at"])
+ channel.pop("contents", None)
+
+ return {
+ "count" : channel.get("length"),
+ "user" : channel.pop("user", None),
+ "owner" : channel.pop("owner", None),
+ "channel": channel,
+ }
+
+ def images(self, page):
+ api = f"https://api.are.na/v2/channels/{self.groups[0]}/contents"
+ limit = 100
+ params = {"page": 1, "per": limit}
+
+ while True:
+ data = self.request_json(api, params=params)
+
+ contents = data.get("contents")
+ if not contents:
+ return
+
+ for block in contents:
+ url = None
+
+ # Attachments (e.g., PDFs, files)
+ if attachment := block.get("attachment"):
+ url = attachment.get("url")
+
+ # Images
+ elif image := block.get("image"):
+ # Prefer original image
+ if original := image.get("original"):
+ url = original.get("url")
+ # Fallback to display/large image if present
+ elif display := image.get("display"):
+ url = display.get("url")
+ elif large := image.get("large"):
+ url = large.get("url")
+
+ # Some Links/Channels may not have downloadable media
+ if not url:
+ continue
+
+ block["date"] = self.parse_datetime_iso(
+ block["created_at"])
+ block["date_updated"] = self.parse_datetime_iso(
+ block["updated_at"])
+
+ yield url, {
+ "block" : block,
+ "source": block.pop("source", None),
+ }
+
+ if len(contents) < limit:
+ return
+ params["page"] += 1
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
index fdb92c4..f1b55ce 100644
--- a/gallery_dl/extractor/artstation.py
+++ b/gallery_dl/extractor/artstation.py
@@ -47,7 +47,7 @@ class ArtstationExtractor(Extractor):
asset.update(data)
adict = asset["asset"]
asset["num"] = num
- yield Message.Directory, asset
+ yield Message.Directory, "", asset
if adict["has_embedded_player"]:
if url := self._extract_embed(asset):
@@ -126,8 +126,7 @@ class ArtstationExtractor(Extractor):
data["title"] = text.unescape(data["title"])
data["description"] = text.unescape(text.remove_html(
data["description"]))
- data["date"] = text.parse_datetime(
- data["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ data["date"] = self.parse_datetime_iso(data["created_at"])
assets = data["assets"]
del data["assets"]
@@ -334,7 +333,7 @@ class ArtstationChallengeExtractor(ArtstationExtractor):
update_url = f"{self.root}/contests/submission_updates.json"
challenge = self.request_json(challenge_url)
- yield Message.Directory, {"challenge": challenge}
+ yield Message.Directory, "", {"challenge": challenge}
params = {"sorting": self.sorting}
for submission in self._pagination(submission_url, params):
diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py
index 38b8ee4..5e5d1f2 100644
--- a/gallery_dl/extractor/aryion.py
+++ b/gallery_dl/extractor/aryion.py
@@ -9,10 +9,9 @@
"""Extractors for https://aryion.com/"""
from .common import Extractor, Message
-from .. import text, util, exception
+from .. import text, util, dt, exception
from ..cache import cache
from email.utils import parsedate_tz
-from datetime import datetime
BASE_PATTERN = r"(?:https?://)?(?:www\.)?aryion\.com/g4"
@@ -20,7 +19,7 @@ BASE_PATTERN = r"(?:https?://)?(?:www\.)?aryion\.com/g4"
class AryionExtractor(Extractor):
"""Base class for aryion extractors"""
category = "aryion"
- directory_fmt = ("{category}", "{user!l}", "{path:J - }")
+ directory_fmt = ("{category}", "{user!l}", "{path:I}")
filename_fmt = "{id} {title}.{extension}"
archive_fmt = "{id}"
cookies_domain = ".aryion.com"
@@ -64,7 +63,7 @@ class AryionExtractor(Extractor):
if post := self._parse_post(post_id):
if data:
post.update(data)
- yield Message.Directory, post
+ yield Message.Directory, "", post
yield Message.Url, post["url"], post
elif post is False and self.recursive:
base = self.root + "/g4/view/"
@@ -78,20 +77,20 @@ class AryionExtractor(Extractor):
def metadata(self):
"""Return general metadata"""
- def _pagination_params(self, url, params=None, needle=None):
+ def _pagination_params(self, url, params=None, needle=None, quote="'"):
if params is None:
params = {"p": 1}
else:
params["p"] = text.parse_int(params.get("p"), 1)
if needle is None:
- needle = "class='gallery-item' id='"
+ needle = "class='gallery-item' id=" + quote
while True:
page = self.request(url, params=params).text
cnt = 0
- for post_id in text.extract_iter(page, needle, "'"):
+ for post_id in text.extract_iter(page, needle, quote):
cnt += 1
yield post_id
@@ -109,6 +108,42 @@ class AryionExtractor(Extractor):
return
url = self.root + text.rextr(page, "href='", "'", pos)
+ def _pagination_folders(self, url, folder=None, seen=None):
+ if folder is None:
+ self.kwdict["folder"] = ""
+ else:
+ url = f"{url}/{folder}"
+ self.kwdict["folder"] = folder = text.unquote(folder)
+ self.log.debug("Descending into folder '%s'", folder)
+
+ params = {"p": 1}
+ while True:
+ page = self.request(url, params=params).text
+
+ cnt = 0
+ for item in text.extract_iter(
+ page, "<li class='gallery-item", "</li>"):
+ cnt += 1
+ if text.extr(item, 'data-item-type="', '"') == "Folders":
+ folder = text.extr(item, "href='", "'").rpartition("/")[2]
+ if seen is None:
+ seen = set()
+ if folder not in seen:
+ seen.add(folder)
+ if self.recursive:
+ yield from self._pagination_folders(
+ url, folder, seen)
+ else:
+ self.log.debug("Skipping folder '%s'", folder)
+ else:
+ yield text.extr(item, "data-item-id='", "'")
+
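+ # full pages hold 40 items; a shorter page without a 'Next >>' link is the last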
+ if cnt < 40 and ">Next &gt;&gt;<" not in page:
+ break
+ params["p"] += 1
+
+ self.kwdict["folder"] = ""
+
def _parse_post(self, post_id):
url = f"{self.root}/g4/data.php?id={post_id}"
with self.request(url, method="HEAD", fatal=False) as response:
@@ -154,9 +189,11 @@ class AryionExtractor(Extractor):
"user" : self.user or artist,
"title" : title,
"artist": artist,
+ "description": text.unescape(extr(
+ 'property="og:description" content="', '"')),
"path" : text.split_html(extr(
"cookiecrumb'>", '</span'))[4:-1:2],
- "date" : datetime(*parsedate_tz(lmod)[:6]),
+ "date" : dt.datetime(*parsedate_tz(lmod)[:6]),
"size" : text.parse_int(clen),
"views" : text.parse_int(extr("Views</b>:", "<").replace(",", "")),
"width" : text.parse_int(extr("Resolution</b>:", "x")),
@@ -164,8 +201,6 @@ class AryionExtractor(Extractor):
"comments" : text.parse_int(extr("Comments</b>:", "<")),
"favorites": text.parse_int(extr("Favorites</b>:", "<")),
"tags" : text.split_html(extr("class='taglist'>", "</span>")),
- "description": text.unescape(text.remove_html(extr(
- "<p>", "</p>"), "", "")),
"filename" : fname,
"extension": ext,
"_http_lastmodified": lmod,
@@ -176,14 +211,11 @@ class AryionGalleryExtractor(AryionExtractor):
"""Extractor for a user's gallery on eka's portal"""
subcategory = "gallery"
categorytransfer = True
- pattern = BASE_PATTERN + r"/(?:gallery/|user/|latest.php\?name=)([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/(?:gallery/|user/|latest.php\?name=)([^/?#]+)"
example = "https://aryion.com/g4/gallery/USER"
- def __init__(self, match):
- AryionExtractor.__init__(self, match)
- self.offset = 0
-
def _init(self):
+ self.offset = 0
self.recursive = self.config("recursive", True)
def skip(self, num):
@@ -204,15 +236,34 @@ class AryionGalleryExtractor(AryionExtractor):
class AryionFavoriteExtractor(AryionExtractor):
"""Extractor for a user's favorites gallery"""
subcategory = "favorite"
- directory_fmt = ("{category}", "{user!l}", "favorites")
+ directory_fmt = ("{category}", "{user!l}", "favorites", "{folder}")
archive_fmt = "f_{user}_{id}"
- categorytransfer = True
- pattern = BASE_PATTERN + r"/favorites/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/favorites/([^/?#]+)(?:/([^?#]+))?"
example = "https://aryion.com/g4/favorites/USER"
+ def _init(self):
+ self.recursive = self.config("recursive", True)
+
def posts(self):
url = f"{self.root}/g4/favorites/{self.user}"
- return self._pagination_params(url, None, "data-item-id='")
+ return self._pagination_folders(url, self.groups[1])
+
+
+class AryionWatchExtractor(AryionExtractor):
+ """Extractor for your watched users and tags"""
+ subcategory = "watch"
+ directory_fmt = ("{category}", "{user!l}",)
+ pattern = rf"{BASE_PATTERN}/messagepage\.php()"
+ example = "https://aryion.com/g4/messagepage.php"
+
+ def posts(self):
+ if not self.cookies_check(self.cookies_names):
+ raise exception.AuthRequired(
+ ("username & password", "authenticated cookies"),
+ "watched Submissions")
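+ # switch the message page to its 'plain' style before parsing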
+ self.cookies.set("g4p_msgpage_style", "plain", domain="aryion.com")
+ url = self.root + "/g4/messagepage.php"
+ return self._pagination_params(url, None, 'data-item-id="', '"')
class AryionTagExtractor(AryionExtractor):
@@ -220,7 +271,7 @@ class AryionTagExtractor(AryionExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "tags", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
- pattern = BASE_PATTERN + r"/tags\.php\?([^#]+)"
+ pattern = rf"{BASE_PATTERN}/tags\.php\?([^#]+)"
example = "https://aryion.com/g4/tags.php?tag=TAG"
def _init(self):
@@ -235,10 +286,34 @@ class AryionTagExtractor(AryionExtractor):
return self._pagination_params(url, self.params)
+class AryionSearchExtractor(AryionExtractor):
+ """Extractor for searches on eka's portal"""
+ subcategory = "search"
+ directory_fmt = ("{category}", "searches", "{search[prefix]}"
+ "{search[q]|search[tags]|search[user]}")
+ archive_fmt = ("s_{search[prefix]}"
+ "{search[q]|search[tags]|search[user]}_{id}")
+ pattern = rf"{BASE_PATTERN}/search\.php\?([^#]+)"
+ example = "https://aryion.com/g4/search.php?q=TEXT&tags=TAGS&user=USER"
+
+ def metadata(self):
+ params = text.parse_query(self.user)
+ return {"search": {
+ **params,
+ "prefix": ("" if params.get("q") else
+ "t_" if params.get("tags") else
+ "u_" if params.get("user") else ""),
+ }}
+
+ def posts(self):
+ url = f"{self.root}/g4/search.php?{self.user}"
+ return self._pagination_next(url)
+
+
class AryionPostExtractor(AryionExtractor):
"""Extractor for individual posts on eka's portal"""
subcategory = "post"
- pattern = BASE_PATTERN + r"/view/(\d+)"
+ pattern = rf"{BASE_PATTERN}/view/(\d+)"
example = "https://aryion.com/g4/view/12345"
def posts(self):
diff --git a/gallery_dl/extractor/audiochan.py b/gallery_dl/extractor/audiochan.py
new file mode 100644
index 0000000..b708ce7
--- /dev/null
+++ b/gallery_dl/extractor/audiochan.py
@@ -0,0 +1,158 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://audiochan.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?audiochan\.com"
+
+
+class AudiochanExtractor(Extractor):
+ """Base class for audiochan extractors"""
+ category = "audiochan"
+ root = "https://audiochan.com"
+ root_api = "https://api.audiochan.com"
+ directory_fmt = ("{category}", "{user[display_name]}")
+ filename_fmt = "{title} ({slug}).{extension}"
+ archive_fmt = "{audioFile[id]}"
+
+ def _init(self):
+ self.user = False
+ self.headers_api = {
+ "content-type" : "application/json",
+ "Origin" : self.root,
+ "Sec-Fetch-Dest" : "empty",
+ "Sec-Fetch-Mode" : "cors",
+ "Sec-Fetch-Site" : "same-site",
+ }
+ self.headers_dl = {
+ "Accept": "audio/webm,audio/ogg,audio/wav,audio/*;q=0.9,"
+ "application/ogg;q=0.7,video/*;q=0.6,*/*;q=0.5",
+ "Sec-Fetch-Dest" : "audio",
+ "Sec-Fetch-Mode" : "no-cors",
+ "Sec-Fetch-Site" : "same-site",
+ "Accept-Encoding": "identity",
+ }
+
+ def items(self):
+ for post in self.posts():
+ file = post["audioFile"]
+
+ post["_http_headers"] = self.headers_dl
+ post["date"] = self.parse_datetime_iso(file["created_at"])
+ post["date_updated"] = self.parse_datetime_iso(file["updated_at"])
+ post["description"] = self._extract_description(
+ post["description"])
+
+ tags = []
+ for tag in post["tags"]:
+ if "tag" in tag:
+ tag = tag["tag"]
+ tags.append(f"{tag['category']}:{tag['name']}")
+ post["tags"] = tags
+
+ if self.user:
+ post["user"] = post["credits"][0]["user"]
+
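+ # without a direct file URL, fall back to the segmented stream URL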
+ if not (url := file["url"]):
+ post["_http_segmented"] = 600000
+ url = file["stream_url"]
+
+ yield Message.Directory, "", post
+ text.nameext_from_name(file["filename"], post)
+ yield Message.Url, url, post
+
+ def request_api(self, endpoint, params=None):
+ url = self.root_api + endpoint
+ return self.request_json(url, params=params, headers=self.headers_api)
+
+ def _pagination(self, endpoint, params, key=None):
+ params["page"] = 1
+ params["limit"] = "12"
+
+ while True:
+ data = self.request_api(endpoint, params)
+ if key is not None:
+ data = data[key]
+
+ yield from data["data"]
+
+ if not data["has_more"]:
+ break
+ params["page"] += 1
+
+ def _extract_description(self, description, texts=None):
+ if texts is None:
+ texts = []
+
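+ # recursively collect 'text' leaves from nested 'content' nodes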
+ if "text" in description:
+ texts.append(description["text"])
+ elif "content" in description:
+ for desc in description["content"]:
+ self._extract_description(desc, texts)
+
+ return texts
+
+
+class AudiochanAudioExtractor(AudiochanExtractor):
+ subcategory = "audio"
+ pattern = rf"{BASE_PATTERN}/a/([^/?#]+)"
+ example = "https://audiochan.com/a/SLUG"
+
+ def posts(self):
+ self.user = True
+ audio = self.request_api("/audios/slug/" + self.groups[0])
+ return (audio,)
+
+
+class AudiochanUserExtractor(AudiochanExtractor):
+ subcategory = "user"
+ pattern = rf"{BASE_PATTERN}/u/([^/?#]+)"
+ example = "https://audiochan.com/u/USER"
+
+ def posts(self):
+ endpoint = "/users/" + self.groups[0]
+ self.kwdict["user"] = self.request_api(endpoint)["data"]
+
+ params = {
+ "sfw_only": "false",
+ "sort" : "new",
+ }
+ return self._pagination(endpoint + "/audios", params)
+
+
+class AudiochanCollectionExtractor(AudiochanExtractor):
+ subcategory = "collection"
+ pattern = rf"{BASE_PATTERN}/c/([^/?#]+)"
+ example = "https://audiochan.com/c/SLUG"
+
+ def posts(self):
+ slug = self.groups[0]
+ endpoint = "/collections/" + slug
+ self.kwdict["collection"] = col = self.request_api(endpoint)
+ col.pop("audios", None)
+ col.pop("items", None)
+
+ endpoint = f"/collections/slug/{slug}/items"
+ return self._pagination(endpoint, {})
+
+
+class AudiochanSearchExtractor(AudiochanExtractor):
+ subcategory = "search"
+ pattern = rf"{BASE_PATTERN}/search/?\?([^#]+)"
+ example = "https://audiochan.com/search?q=QUERY"
+
+ def posts(self):
+ self.user = True
+ endpoint = "/search"
+ params = text.parse_query(self.groups[0])
+ params["sfw_only"] = "false"
+ self.kwdict["search_tags"] = params.get("q")
+ return self._pagination(endpoint, params, "audios")
diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py
index a7d1b78..f8e803b 100644
--- a/gallery_dl/extractor/batoto.py
+++ b/gallery_dl/extractor/batoto.py
@@ -80,7 +80,7 @@ class BatotoBase():
class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
"""Extractor for batoto manga chapters"""
archive_fmt = "{chapter_id}_{page}"
- pattern = BASE_PATTERN + r"/(?:title/[^/?#]+|chapter)/(\d+)"
+ pattern = rf"{BASE_PATTERN}/(?:title/[^/?#]+|chapter)/(\d+)"
example = "https://xbato.org/title/12345-MANGA/54321"
def __init__(self, match):
@@ -104,7 +104,7 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
info = text.remove_html(extr('link-hover">', "</"))
info = text.unescape(info)
- match = util.re(
+ match = text.re(
r"(?i)(?:(?:Volume|S(?:eason)?)\s*(\d+)\s+)?"
r"(?:Chapter|Episode)\s*(\d+)([\w.]*)").match(info)
if match:
@@ -123,7 +123,7 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
"chapter_minor" : minor,
"chapter_string": info,
"chapter_id" : text.parse_int(self.chapter_id),
- "date" : text.parse_timestamp(extr(' time="', '"')[:-3]),
+ "date" : self.parse_timestamp(extr(' time="', '"')[:-3]),
}
def images(self, page):
@@ -139,8 +139,8 @@ class BatotoMangaExtractor(BatotoBase, MangaExtractor):
"""Extractor for batoto manga"""
reverse = False
chapterclass = BatotoChapterExtractor
- pattern = (BASE_PATTERN +
- r"/(?:title/(\d+)[^/?#]*|series/(\d+)(?:/[^/?#]*)?)/?$")
+ pattern = (rf"{BASE_PATTERN}"
+ rf"/(?:title/(\d+)[^/?#]*|series/(\d+)(?:/[^/?#]*)?)/?$")
example = "https://xbato.org/title/12345-MANGA/"
def __init__(self, match):
@@ -167,8 +167,7 @@ class BatotoMangaExtractor(BatotoBase, MangaExtractor):
data["chapter"] = text.parse_int(chapter)
data["chapter_minor"] = sep + minor
- data["date"] = text.parse_datetime(
- extr('time="', '"'), "%Y-%m-%dT%H:%M:%S.%fZ")
+ data["date"] = self.parse_datetime_iso(extr('time="', '"'))
url = f"{self.root}/title/{href}"
results.append((url, data.copy()))
@@ -188,9 +187,9 @@ def _manga_info(self, manga_id, page=None):
"manga" : data["name"][1],
"manga_id" : text.parse_int(manga_id),
"manga_slug" : data["slug"][1],
- "manga_date" : text.parse_timestamp(
+ "manga_date" : self.parse_timestamp(
data["dateCreate"][1] // 1000),
- "manga_date_updated": text.parse_timestamp(
+ "manga_date_updated": self.parse_timestamp(
data["dateUpdate"][1] / 1000),
"author" : json_list(data["authors"]),
"artist" : json_list(data["artists"]),
diff --git a/gallery_dl/extractor/bbc.py b/gallery_dl/extractor/bbc.py
index 8efb3db..cb357d1 100644
--- a/gallery_dl/extractor/bbc.py
+++ b/gallery_dl/extractor/bbc.py
@@ -18,11 +18,10 @@ class BbcGalleryExtractor(GalleryExtractor):
"""Extractor for a programme gallery on bbc.co.uk"""
category = "bbc"
root = "https://www.bbc.co.uk"
- directory_fmt = ("{category}", "{path[0]}", "{path[1]}", "{path[2]}",
- "{path[3:]:J - /}")
+ directory_fmt = ("{category}", "{path:I}")
filename_fmt = "{num:>02}.{extension}"
archive_fmt = "{programme}_{num}"
- pattern = BASE_PATTERN + r"[^/?#]+(?!/galleries)(?:/[^/?#]+)?)$"
+ pattern = rf"{BASE_PATTERN}[^/?#]+(?!/galleries)(?:/[^/?#]+)?)$"
example = "https://www.bbc.co.uk/programmes/PATH"
def metadata(self, page):
@@ -72,7 +71,7 @@ class BbcProgrammeExtractor(Extractor):
category = "bbc"
subcategory = "programme"
root = "https://www.bbc.co.uk"
- pattern = BASE_PATTERN + r"[^/?#]+/galleries)(?:/?\?page=(\d+))?"
+ pattern = rf"{BASE_PATTERN}[^/?#]+/galleries)(?:/?\?page=(\d+))?"
example = "https://www.bbc.co.uk/programmes/ID/galleries"
def items(self):
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py
index 4a7c074..bb0562d 100644
--- a/gallery_dl/extractor/behance.py
+++ b/gallery_dl/extractor/behance.py
@@ -67,7 +67,7 @@ class BehanceExtractor(Extractor):
tags = [tag["title"] for tag in tags]
data["tags"] = tags
- data["date"] = text.parse_timestamp(
+ data["date"] = self.parse_timestamp(
data.get("publishedOn") or data.get("conceived_on") or 0)
if creator := data.get("creator"):
@@ -109,7 +109,7 @@ class BehanceGalleryExtractor(BehanceExtractor):
imgs = self.get_images(data)
data["count"] = len(imgs)
- yield Message.Directory, data
+ yield Message.Directory, "", data
for data["num"], (url, module) in enumerate(imgs, 1):
data["module"] = module
data["extension"] = (module.get("extension") or
diff --git a/gallery_dl/extractor/bellazon.py b/gallery_dl/extractor/bellazon.py
index ce50a91..33f4ad3 100644
--- a/gallery_dl/extractor/bellazon.py
+++ b/gallery_dl/extractor/bellazon.py
@@ -46,8 +46,8 @@ class BellazonExtractor(Extractor):
data = {"post": post}
post["count"] = data["count"] = len(urls)
- yield Message.Directory, data
- data["num"] = 0
+ yield Message.Directory, "", data
+ data["num"] = data["num_internal"] = data["num_external"] = 0
for info, url, url_img in urls:
url = text.unescape(url or url_img)
@@ -59,27 +59,35 @@ class BellazonExtractor(Extractor):
):
continue
data["num"] += 1
+ data["num_internal"] += 1
if not (alt := text.extr(info, ' alt="', '"')) or (
alt.startswith("post-") and "_thumb." in alt):
- name = url
+ dc = text.nameext_from_url(url, data.copy())
else:
- name = text.unescape(alt)
+ dc = data.copy()
+ dc["name"] = name = text.unescape(alt)
+ dc["filename"] = name.partition(".")[0]
- dc = text.nameext_from_url(name, data.copy())
dc["id"] = text.extr(info, 'data-fileid="', '"')
if ext := text.extr(info, 'data-fileext="', '"'):
dc["extension"] = ext
elif "/core/interface/file/attachment.php" in url:
if not dc["id"]:
- dc["id"] = url.rpartition("?id=")[2]
+ dc["id"] = \
+ url.rpartition("?id=")[2].partition("&")[0]
if name := text.extr(info, ">", "<").strip():
- text.nameext_from_url(name, dc)
+ dc["name"] = name = text.unescape(name)
+ text.nameext_from_name(name, dc)
+ else:
+ dc["extension"] = text.ext_from_url(url)
if url[0] == "/":
url = f"https:{url}"
yield Message.Url, url, dc
else:
+ data["num"] += 1
+ data["num_external"] += 1
yield Message.Queue, url, data
def _pagination(self, base, pnum=None):
@@ -106,7 +114,7 @@ class BellazonExtractor(Extractor):
def _pagination_reverse(self, base, pnum=None):
base = f"{self.root}{base}"
- url = f"{base}/page/9999/" # force redirect to highest page number
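+ # without an explicit page number, request page 9999 and let the
+ # site redirect to the highest existing page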
+ url = f"{base}/page/{'9999' if pnum is None else pnum}/"
with self.request(url) as response:
parts = response.url.rsplit("/", 3)
pnum = text.parse_int(parts[2]) if parts[1] == "page" else 1
@@ -130,7 +138,7 @@ class BellazonExtractor(Extractor):
author = schema["author"]
stats = schema["interactionStatistic"]
url_t = schema["url"]
- url_a = author["url"]
+ url_a = author.get("url") or ""
path = text.split_html(text.extr(
page, '<nav class="ipsBreadcrumb', "</nav>"))[2:-1]
@@ -141,8 +149,8 @@ class BellazonExtractor(Extractor):
"title": schema["headline"],
"views": stats[0]["userInteractionCount"],
"posts": stats[1]["userInteractionCount"],
- "date" : text.parse_datetime(schema["datePublished"]),
- "date_updated": text.parse_datetime(schema["dateModified"]),
+ "date" : self.parse_datetime_iso(schema["datePublished"]),
+ "date_updated": self.parse_datetime_iso(schema["dateModified"]),
"description" : text.unescape(schema["text"]).strip(),
"section" : path[-2],
"author" : author["name"],
@@ -151,8 +159,12 @@ class BellazonExtractor(Extractor):
thread["id"], _, thread["slug"] = \
url_t.rsplit("/", 2)[1].partition("-")
- thread["author_id"], _, thread["author_slug"] = \
- url_a.rsplit("/", 2)[1].partition("-")
+
+ if url_a:
+ thread["author_id"], _, thread["author_slug"] = \
+ url_a.rsplit("/", 2)[1].partition("-")
+ else:
+ thread["author_id"] = thread["author_slug"] = ""
return thread
@@ -162,15 +174,18 @@ class BellazonExtractor(Extractor):
post = {
"id": extr('id="elComment_', '"'),
"author_url": extr(" href='", "'"),
- "date": text.parse_datetime(extr("datetime='", "'")),
+ "date": self.parse_datetime_iso(extr("datetime='", "'")),
"content": extr("<!-- Post content -->", "\n\t\t</div>"),
}
if (pos := post["content"].find(">")) >= 0:
post["content"] = post["content"][pos+1:].strip()
- post["author_id"], _, post["author_slug"] = \
- post["author_url"].rsplit("/", 2)[1].partition("-")
+ if url_a := post["author_url"]:
+ post["author_id"], _, post["author_slug"] = \
+ url_a.rsplit("/", 2)[1].partition("-")
+ else:
+ post["author_id"] = post["author_slug"] = ""
return post
diff --git a/gallery_dl/extractor/bilibili.py b/gallery_dl/extractor/bilibili.py
index 3f0acff..fe10150 100644
--- a/gallery_dl/extractor/bilibili.py
+++ b/gallery_dl/extractor/bilibili.py
@@ -74,7 +74,7 @@ class BilibiliArticleExtractor(BilibiliExtractor):
pass
article["count"] = len(pics)
- yield Message.Directory, article
+ yield Message.Directory, "", article
for article["num"], pic in enumerate(pics, 1):
url = pic["url"]
article.update(pic)
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index af43446..766272f 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -13,7 +13,7 @@ from .. import text, util
def original(url):
- return (util.re(r"(/|=)(?:[sw]\d+|w\d+-h\d+)(?=/|$)")
+ return (text.re(r"(/|=)(?:[sw]\d+|w\d+-h\d+)(?=/|$)")
.sub(r"\1s0", url)
.replace("http:", "https:", 1))
@@ -32,7 +32,7 @@ class BloggerExtractor(BaseExtractor):
self.videos = self.config("videos", True)
if self.videos:
- self.findall_video = util.re(
+ self.findall_video = text.re(
r"""src=["'](https?://www\.blogger\.com"""
r"""/video\.g\?token=[^"']+)""").findall
@@ -40,10 +40,10 @@ class BloggerExtractor(BaseExtractor):
blog = self.api.blog_by_url("http://" + self.blog)
blog["pages"] = blog["pages"]["totalItems"]
blog["posts"] = blog["posts"]["totalItems"]
- blog["date"] = text.parse_datetime(blog["published"])
+ blog["date"] = self.parse_datetime_iso(blog["published"])
del blog["selfLink"]
- findall_image = util.re(
+ findall_image = text.re(
r'src="(https?://(?:'
r'blogger\.googleusercontent\.com/img|'
r'lh\d+(?:-\w+)?\.googleusercontent\.com|'
@@ -65,14 +65,14 @@ class BloggerExtractor(BaseExtractor):
post["author"] = post["author"]["displayName"]
post["replies"] = post["replies"]["totalItems"]
post["content"] = text.remove_html(content)
- post["date"] = text.parse_datetime(post["published"])
+ post["date"] = self.parse_datetime_iso(post["published"])
del post["selfLink"]
del post["blog"]
data = {"blog": blog, "post": post}
if metadata:
data.update(metadata)
- yield Message.Directory, data
+ yield Message.Directory, "", data
for data["num"], url in enumerate(files, 1):
data["url"] = url
@@ -117,7 +117,7 @@ BASE_PATTERN = BloggerExtractor.update({
class BloggerPostExtractor(BloggerExtractor):
"""Extractor for a single blog post"""
subcategory = "post"
- pattern = BASE_PATTERN + r"(/\d\d\d\d/\d\d/[^/?#]+\.html)"
+ pattern = rf"{BASE_PATTERN}(/\d\d\d\d/\d\d/[^/?#]+\.html)"
example = "https://BLOG.blogspot.com/1970/01/TITLE.html"
def posts(self, blog):
@@ -127,7 +127,7 @@ class BloggerPostExtractor(BloggerExtractor):
class BloggerBlogExtractor(BloggerExtractor):
"""Extractor for an entire Blogger blog"""
subcategory = "blog"
- pattern = BASE_PATTERN + r"/?$"
+ pattern = rf"{BASE_PATTERN}/?$"
example = "https://BLOG.blogspot.com/"
def posts(self, blog):
@@ -137,7 +137,7 @@ class BloggerBlogExtractor(BloggerExtractor):
class BloggerSearchExtractor(BloggerExtractor):
"""Extractor for Blogger search resuls"""
subcategory = "search"
- pattern = BASE_PATTERN + r"/search/?\?q=([^&#]+)"
+ pattern = rf"{BASE_PATTERN}/search/?\?q=([^&#]+)"
example = "https://BLOG.blogspot.com/search?q=QUERY"
def metadata(self):
@@ -151,7 +151,7 @@ class BloggerSearchExtractor(BloggerExtractor):
class BloggerLabelExtractor(BloggerExtractor):
"""Extractor for Blogger posts by label"""
subcategory = "label"
- pattern = BASE_PATTERN + r"/search/label/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/search/label/([^/?#]+)"
example = "https://BLOG.blogspot.com/search/label/LABEL"
def metadata(self):
diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py
index e8c5707..c981608 100644
--- a/gallery_dl/extractor/bluesky.py
+++ b/gallery_dl/extractor/bluesky.py
@@ -14,7 +14,7 @@ from ..cache import cache, memcache
BASE_PATTERN = (r"(?:https?://)?"
r"(?:(?:www\.)?(?:c|[fv]x)?bs[ky]y[ex]?\.app|main\.bsky\.dev)")
-USER_PATTERN = BASE_PATTERN + r"/profile/([^/?#]+)"
+USER_PATTERN = rf"{BASE_PATTERN}/profile/([^/?#]+)"
class BlueskyExtractor(Extractor):
@@ -60,7 +60,7 @@ class BlueskyExtractor(Extractor):
self._prepare(post)
files = self._extract_files(post)
- yield Message.Directory, post
+ yield Message.Directory, "", post
if files:
did = post["author"]["did"]
base = (f"{self.api.service_endpoint(did)}/xrpc"
@@ -135,8 +135,7 @@ class BlueskyExtractor(Extractor):
post["instance"] = self.instance
post["post_id"] = self._pid(post)
- post["date"] = text.parse_datetime(
- post["createdAt"][:19], "%Y-%m-%dT%H:%M:%S")
+ post["date"] = self.parse_datetime_iso(post["createdAt"][:19])
def _extract_files(self, post):
if "embed" not in post:
@@ -217,7 +216,7 @@ class BlueskyExtractor(Extractor):
class BlueskyUserExtractor(Dispatch, BlueskyExtractor):
- pattern = USER_PATTERN + r"$"
+ pattern = rf"{USER_PATTERN}$"
example = "https://bsky.app/profile/HANDLE"
def items(self):
@@ -238,7 +237,7 @@ class BlueskyUserExtractor(Dispatch, BlueskyExtractor):
class BlueskyPostsExtractor(BlueskyExtractor):
subcategory = "posts"
- pattern = USER_PATTERN + r"/posts"
+ pattern = rf"{USER_PATTERN}/posts"
example = "https://bsky.app/profile/HANDLE/posts"
def posts(self):
@@ -248,7 +247,7 @@ class BlueskyPostsExtractor(BlueskyExtractor):
class BlueskyRepliesExtractor(BlueskyExtractor):
subcategory = "replies"
- pattern = USER_PATTERN + r"/replies"
+ pattern = rf"{USER_PATTERN}/replies"
example = "https://bsky.app/profile/HANDLE/replies"
def posts(self):
@@ -258,7 +257,7 @@ class BlueskyRepliesExtractor(BlueskyExtractor):
class BlueskyMediaExtractor(BlueskyExtractor):
subcategory = "media"
- pattern = USER_PATTERN + r"/media"
+ pattern = rf"{USER_PATTERN}/media"
example = "https://bsky.app/profile/HANDLE/media"
def posts(self):
@@ -268,7 +267,7 @@ class BlueskyMediaExtractor(BlueskyExtractor):
class BlueskyVideoExtractor(BlueskyExtractor):
subcategory = "video"
- pattern = USER_PATTERN + r"/video"
+ pattern = rf"{USER_PATTERN}/video"
example = "https://bsky.app/profile/HANDLE/video"
def posts(self):
@@ -278,7 +277,7 @@ class BlueskyVideoExtractor(BlueskyExtractor):
class BlueskyLikesExtractor(BlueskyExtractor):
subcategory = "likes"
- pattern = USER_PATTERN + r"/likes"
+ pattern = rf"{USER_PATTERN}/likes"
example = "https://bsky.app/profile/HANDLE/likes"
def posts(self):
@@ -289,7 +288,7 @@ class BlueskyLikesExtractor(BlueskyExtractor):
class BlueskyFeedExtractor(BlueskyExtractor):
subcategory = "feed"
- pattern = USER_PATTERN + r"/feed/([^/?#]+)"
+ pattern = rf"{USER_PATTERN}/feed/([^/?#]+)"
example = "https://bsky.app/profile/HANDLE/feed/NAME"
def posts(self):
@@ -299,7 +298,7 @@ class BlueskyFeedExtractor(BlueskyExtractor):
class BlueskyListExtractor(BlueskyExtractor):
subcategory = "list"
- pattern = USER_PATTERN + r"/lists/([^/?#]+)"
+ pattern = rf"{USER_PATTERN}/lists/([^/?#]+)"
example = "https://bsky.app/profile/HANDLE/lists/ID"
def posts(self):
@@ -309,7 +308,7 @@ class BlueskyListExtractor(BlueskyExtractor):
class BlueskyFollowingExtractor(BlueskyExtractor):
subcategory = "following"
- pattern = USER_PATTERN + r"/follows"
+ pattern = rf"{USER_PATTERN}/follows"
example = "https://bsky.app/profile/HANDLE/follows"
def items(self):
@@ -321,7 +320,7 @@ class BlueskyFollowingExtractor(BlueskyExtractor):
class BlueskyPostExtractor(BlueskyExtractor):
subcategory = "post"
- pattern = USER_PATTERN + r"/post/([^/?#]+)"
+ pattern = rf"{USER_PATTERN}/post/([^/?#]+)"
example = "https://bsky.app/profile/HANDLE/post/ID"
def posts(self):
@@ -331,19 +330,19 @@ class BlueskyPostExtractor(BlueskyExtractor):
class BlueskyInfoExtractor(BlueskyExtractor):
subcategory = "info"
- pattern = USER_PATTERN + r"/info"
+ pattern = rf"{USER_PATTERN}/info"
example = "https://bsky.app/profile/HANDLE/info"
def items(self):
self._metadata_user = True
self.api._did_from_actor(self.groups[0])
- return iter(((Message.Directory, self._user),))
+ return iter(((Message.Directory, "", self._user),))
class BlueskyAvatarExtractor(BlueskyExtractor):
subcategory = "avatar"
filename_fmt = "avatar_{post_id}.{extension}"
- pattern = USER_PATTERN + r"/avatar"
+ pattern = rf"{USER_PATTERN}/avatar"
example = "https://bsky.app/profile/HANDLE/avatar"
def posts(self):
@@ -353,7 +352,7 @@ class BlueskyAvatarExtractor(BlueskyExtractor):
class BlueskyBackgroundExtractor(BlueskyExtractor):
subcategory = "background"
filename_fmt = "background_{post_id}.{extension}"
- pattern = USER_PATTERN + r"/ba(?:nner|ckground)"
+ pattern = rf"{USER_PATTERN}/ba(?:nner|ckground)"
example = "https://bsky.app/profile/HANDLE/banner"
def posts(self):
@@ -362,7 +361,7 @@ class BlueskyBackgroundExtractor(BlueskyExtractor):
class BlueskySearchExtractor(BlueskyExtractor):
subcategory = "search"
- pattern = BASE_PATTERN + r"/search(?:/|\?q=)(.+)"
+ pattern = rf"{BASE_PATTERN}/search(?:/|\?q=)(.+)"
example = "https://bsky.app/search?q=QUERY"
def posts(self):
@@ -372,7 +371,7 @@ class BlueskySearchExtractor(BlueskyExtractor):
class BlueskyHashtagExtractor(BlueskyExtractor):
subcategory = "hashtag"
- pattern = BASE_PATTERN + r"/hashtag/([^/?#]+)(?:/(top|latest))?"
+ pattern = rf"{BASE_PATTERN}/hashtag/([^/?#]+)(?:/(top|latest))?"
example = "https://bsky.app/hashtag/NAME"
def posts(self):
@@ -382,7 +381,7 @@ class BlueskyHashtagExtractor(BlueskyExtractor):
class BlueskyBookmarkExtractor(BlueskyExtractor):
subcategory = "bookmark"
- pattern = BASE_PATTERN + r"/saved"
+ pattern = rf"{BASE_PATTERN}/saved"
example = "https://bsky.app/saved"
def posts(self):
@@ -401,7 +400,9 @@ class BlueskyAPI():
self.headers = {"Accept": "application/json"}
self.username, self.password = extractor._get_auth_info()
- if self.username:
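+ # the 'api-server' option overrides the default API root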
+ if srv := extractor.config("api-server", False):
+ self.root = srv.rstrip("/")
+ elif self.username:
self.root = "https://bsky.social"
else:
self.root = "https://api.bsky.app"
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index ae455bf..4858a4b 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -57,7 +57,7 @@ class BooruExtractor(BaseExtractor):
post.update(data)
self._prepare(post)
- yield Message.Directory, post
+ yield Message.Directory, "", post
yield Message.Url, url, post
def skip(self, num):
diff --git a/gallery_dl/extractor/boosty.py b/gallery_dl/extractor/boosty.py
index 22f3259..5add768 100644
--- a/gallery_dl/extractor/boosty.py
+++ b/gallery_dl/extractor/boosty.py
@@ -49,6 +49,9 @@ class BoostyExtractor(Extractor):
self.videos = videos
def items(self):
+ headers = self.api.headers.copy()
+ del headers["Accept"]
+
for post in self.posts():
if not post.get("hasAccess"):
self.log.warning("Not allowed to access post %s", post["id"])
@@ -61,9 +64,10 @@ class BoostyExtractor(Extractor):
"post" : post,
"user" : post.pop("user", None),
"count": len(files),
+ "_http_headers": headers,
}
- yield Message.Directory, data
+ yield Message.Directory, "", data
for data["num"], file in enumerate(files, 1):
data["file"] = file
url = file["url"]
@@ -78,7 +82,7 @@ class BoostyExtractor(Extractor):
post["links"] = links = []
if "createdAt" in post:
- post["date"] = text.parse_timestamp(post["createdAt"])
+ post["date"] = self.parse_timestamp(post["createdAt"])
for block in post["data"]:
try:
@@ -159,7 +163,7 @@ class BoostyExtractor(Extractor):
class BoostyUserExtractor(BoostyExtractor):
"""Extractor for boosty.to user profiles"""
subcategory = "user"
- pattern = BASE_PATTERN + r"/([^/?#]+)(?:\?([^#]+))?$"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)(?:\?([^#]+))?$"
example = "https://boosty.to/USER"
def posts(self):
@@ -175,7 +179,7 @@ class BoostyMediaExtractor(BoostyExtractor):
subcategory = "media"
directory_fmt = "{category}", "{user[blogUrl]} ({user[id]})", "media"
filename_fmt = "{post[id]}_{num}.{extension}"
- pattern = BASE_PATTERN + r"/([^/?#]+)/media/([^/?#]+)(?:\?([^#]+))?"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/media/([^/?#]+)(?:\?([^#]+))?"
example = "https://boosty.to/USER/media/all"
def posts(self):
@@ -188,7 +192,7 @@ class BoostyMediaExtractor(BoostyExtractor):
class BoostyFeedExtractor(BoostyExtractor):
"""Extractor for your boosty.to subscription feed"""
subcategory = "feed"
- pattern = BASE_PATTERN + r"/(?:\?([^#]+))?(?:$|#)"
+ pattern = rf"{BASE_PATTERN}/(?:\?([^#]+))?(?:$|#)"
example = "https://boosty.to/"
def posts(self):
@@ -199,7 +203,7 @@ class BoostyFeedExtractor(BoostyExtractor):
class BoostyPostExtractor(BoostyExtractor):
"""Extractor for boosty.to posts"""
subcategory = "post"
- pattern = BASE_PATTERN + r"/([^/?#]+)/posts/([0-9a-f-]+)"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/posts/([0-9a-f-]+)"
example = "https://boosty.to/USER/posts/01234567-89ab-cdef-0123-456789abcd"
def posts(self):
@@ -212,7 +216,7 @@ class BoostyPostExtractor(BoostyExtractor):
class BoostyFollowingExtractor(BoostyExtractor):
"""Extractor for your boosty.to subscribed users"""
subcategory = "following"
- pattern = BASE_PATTERN + r"/app/settings/subscriptions"
+ pattern = rf"{BASE_PATTERN}/app/settings/subscriptions"
example = "https://boosty.to/app/settings/subscriptions"
def items(self):
@@ -227,7 +231,7 @@ class BoostyDirectMessagesExtractor(BoostyExtractor):
subcategory = "direct-messages"
directory_fmt = ("{category}", "{user[blogUrl]} ({user[id]})",
"Direct Messages")
- pattern = BASE_PATTERN + r"/app/messages/?\?dialogId=(\d+)"
+ pattern = rf"{BASE_PATTERN}/app/messages/?\?dialogId=(\d+)"
example = "https://boosty.to/app/messages?dialogId=12345"
def items(self):
@@ -260,7 +264,7 @@ class BoostyDirectMessagesExtractor(BoostyExtractor):
"count": len(files),
}
- yield Message.Directory, data
+ yield Message.Directory, "", data
for data["num"], file in enumerate(files, 1):
data["file"] = file
url = file["url"]
@@ -280,8 +284,12 @@ class BoostyAPI():
if not access_token:
if auth := self.extractor.cookies.get("auth", domain=".boosty.to"):
- access_token = text.extr(
- text.unquote(auth), '"accessToken":"', '"')
+ auth = text.unquote(auth)
+ access_token = text.extr(auth, '"accessToken":"', '"')
+ if expires := text.extr(auth, '"expiresAt":', ','):
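+ # 'expiresAt' holds a millisecond timestamp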
+ import time
+ if text.parse_int(expires) < time.time() * 1000:
+ extractor.log.warning("'auth' cookie tokens expired")
if access_token:
self.headers["Authorization"] = "Bearer " + access_token
diff --git a/gallery_dl/extractor/booth.py b/gallery_dl/extractor/booth.py
index 0fcb1cb..3c000b1 100644
--- a/gallery_dl/extractor/booth.py
+++ b/gallery_dl/extractor/booth.py
@@ -70,8 +70,7 @@ class BoothItemExtractor(BoothExtractor):
url + ".json", headers=headers, interval=False)
item["booth_category"] = item.pop("category", None)
- item["date"] = text.parse_datetime(
- item["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ item["date"] = self.parse_datetime_iso(item["published_at"])
item["tags"] = [t["name"] for t in item["tags"]]
shop = item["shop"]
@@ -84,7 +83,7 @@ class BoothItemExtractor(BoothExtractor):
item["count"] = 0
shop["uuid"] = util.NONE
- yield Message.Directory, item
+ yield Message.Directory, "", item
for num, file in enumerate(files, 1):
url = file["url"]
file["num"] = num
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 14ebc48..ed9cd0f 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -11,6 +11,7 @@
from .common import Extractor
from .lolisafe import LolisafeAlbumExtractor
from .. import text, util, config, exception
+from ..cache import memcache
import random
if config.get(("extractor", "bunkr"), "tlds"):
@@ -63,7 +64,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
root_dl = "https://get.bunkrr.su"
root_api = "https://apidl.bunkr.ru"
archive_fmt = "{album_id}_{id|id_url|slug}"
- pattern = BASE_PATTERN + r"/a/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/a/([^/?#]+)"
example = "https://bunkr.si/a/ID"
def __init__(self, match):
@@ -167,7 +168,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
item, 'name: "', ".")
file["size"] = text.parse_int(text.extr(
item, "size: ", " ,\n"))
- file["date"] = text.parse_datetime(text.extr(
+ file["date"] = self.parse_datetime(text.extr(
item, 'timestamp: "', '"'), "%H:%M:%S %d/%m/%Y")
yield file
@@ -176,6 +177,10 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
except Exception as exc:
self.log.error("%s: %s", exc.__class__.__name__, exc)
self.log.debug("%s", item, exc_info=exc)
+ if isinstance(exc, exception.HttpError) and \
+ exc.status == 400 and \
+ exc.response.url.startswith(self.root_api):
+ raise exception.AbortExtraction("Album deleted")
def _extract_file(self, data_id):
referer = f"{self.root_dl}/file/{data_id}"
@@ -211,7 +216,7 @@ class BunkrMediaExtractor(BunkrAlbumExtractor):
"""Extractor for bunkr.si media links"""
subcategory = "media"
directory_fmt = ("{category}",)
- pattern = BASE_PATTERN + r"(/[fvid]/[^/?#]+)"
+ pattern = rf"{BASE_PATTERN}(/[fvid]/[^/?#]+)"
example = "https://bunkr.si/f/FILENAME"
def fetch_album(self, album_id):
@@ -227,10 +232,26 @@ class BunkrMediaExtractor(BunkrAlbumExtractor):
self.log.error("%s: %s", exc.__class__.__name__, exc)
return (), {}
+ album_id, album_name, album_size = self._album_info(text.extr(
+ page, ' href="../a/', '"'))
return (file,), {
- "album_id" : "",
- "album_name" : "",
- "album_size" : -1,
- "description": "",
- "count" : 1,
+ "album_id" : album_id,
+ "album_name": album_name,
+ "album_size": album_size,
+ "count" : 1,
}
+
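+ # memoized per album ID, so multiple media links from the same
+ # album request its page only once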
+ @memcache(keyarg=1)
+ def _album_info(self, album_id):
+ if album_id:
+ try:
+ page = self.request(f"{self.root}/a/{album_id}").text
+ return (
+ album_id,
+ text.unescape(text.unescape(text.extr(
+ page, 'property="og:title" content="', '"'))),
+ text.extr(page, '<span class="font-semibold">(', ')'),
+ )
+ except Exception:
+ pass
+ return album_id, "", -1
diff --git a/gallery_dl/extractor/catbox.py b/gallery_dl/extractor/catbox.py
index 22f7a97..2066839 100644
--- a/gallery_dl/extractor/catbox.py
+++ b/gallery_dl/extractor/catbox.py
@@ -28,7 +28,7 @@ class CatboxAlbumExtractor(GalleryExtractor):
return {
"album_id" : self.page_url.rpartition("/")[2],
"album_name" : text.unescape(extr("<h1>", "<")),
- "date" : text.parse_datetime(extr(
+ "date" : self.parse_datetime(extr(
"<p>Created ", "<"), "%B %d %Y"),
"description": text.unescape(extr("<p>", "<")),
}
@@ -52,5 +52,5 @@ class CatboxFileExtractor(Extractor):
def items(self):
url = text.ensure_http_scheme(self.url)
file = text.nameext_from_url(url, {"url": url})
- yield Message.Directory, file
+ yield Message.Directory, "", file
yield Message.Url, url, file
diff --git a/gallery_dl/extractor/cfake.py b/gallery_dl/extractor/cfake.py
new file mode 100644
index 0000000..4c37455
--- /dev/null
+++ b/gallery_dl/extractor/cfake.py
@@ -0,0 +1,149 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://cfake.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?cfake\.com"
+
+
+class CfakeExtractor(Extractor):
+ """Base class for cfake extractors"""
+ category = "cfake"
+ root = "https://cfake.com"
+ directory_fmt = ("{category}", "{type}", "{type_name} ({type_id})")
+ filename_fmt = "{category}_{type_name}_{id}.{extension}"
+ archive_fmt = "{id}"
+
+ def items(self):
+ type, type_name, type_id, sub_id, pnum = self.groups
+
+ if type.endswith("ies"):
+ type = type[:-3] + "y"
+
+ kwdict = self.kwdict
+ kwdict["type"] = type
+ kwdict["type_id"] = text.parse_int(type_id)
+ kwdict["type_name"] = text.unquote(type_name).replace("_", " ")
+ kwdict["sub_id"] = text.parse_int(sub_id)
+ kwdict["page"] = pnum = text.parse_int(pnum, 1)
+ yield Message.Directory, "", {}
+
+ base = f"{self.root}/images/{type}/{type_name}/{type_id}"
+ if sub_id:
+ base = f"{base}/{sub_id}"
+
+ while True:
+ url = base if pnum < 2 else f"{base}/p{pnum}"
+ page = self.request(url).text
+
+ # Extract and yield images
+ num = 0
+ for image in self._extract_images(page):
+ num += 1
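+ # gallery pages list 50 thumbnails; derive a global image index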
+ image["num"] = num + (pnum - 1) * 50
+ url = image["url"]
+ yield Message.Url, url, text.nameext_from_url(url, image)
+
+ # Check for next page
+ if not num or not (pnum := self._check_pagination(page)):
+ return
+ kwdict["page"] = pnum
+
+ def _extract_images(self, page):
+ """Extract image URLs and metadata from a gallery page"""
+ for item in text.extract_iter(
+ page, '<a href="javascript:showimage(', '</div></div>'):
+
+ # Extract image path from showimage call
+ # Format: big.php?show=2025/filename.jpg&id_picture=...
+ show_param = text.extr(item, "show=", "&")
+ if not show_param:
+ continue
+
+ # Extract metadata
+ picture_id = text.extr(item, "id_picture=", "&")
+ name_param = text.extr(item, "p_name=", "'")
+
+ # Extract date
+ date = text.extr(item, 'id="date_vignette">', '</div>')
+
+ # Extract rating
+ rating_text = text.extr(item, 'class="current-rating"', '</li>')
+ rating = text.extr(rating_text, 'width:', 'px')
+
+ # Convert thumbnail path to full image path
+ # show_param is like "2025/filename.jpg"
+ image_url = f"{self.root}/medias/photos/{show_param}"
+
+ yield {
+ "url": image_url,
+ "id": text.parse_int(picture_id) if picture_id else 0,
+ "name": text.unescape(name_param) if name_param else "",
+ "date": date,
+ "rating": rating,
+ }
+
+ def _check_pagination(self, page):
+ """Check if there are more pages and return next page number"""
+ # Look for current page indicator
+ # Format: id="num_page_current" ><a href=".../ p1">1</a>
+ current_section = text.extr(
+ page, 'id="num_page_current"', '</div>')
+ if not current_section:
+ return None
+
+ # Extract current page number from the link text
+ current_page_str = text.extr(current_section, '">', '</a>')
+ if not current_page_str:
+ return None
+
+ current_page = text.parse_int(current_page_str)
+ if not current_page:
+ return None
+
+ next_page = current_page + 1
+
+ # Check if next page link exists anywhere in the page
+ # Look for href="/images/.../pN" pattern
+ if f'/p{next_page}"' in page or f'/p{next_page} ' in page:
+ return next_page
+
+ return None
+
+
+class CfakeCelebrityExtractor(CfakeExtractor):
+ """Extractor for celebrity image galleries from cfake.com"""
+ subcategory = "celebrity"
+ pattern = (BASE_PATTERN + r"/images/(celebrity)"
+ r"/([^/?#]+)/(\d+)()(?:/p(\d+))?")
+ example = "https://cfake.com/images/celebrity/NAME/123"
+
+
+class CfakeCategoryExtractor(CfakeExtractor):
+ """Extractor for category image galleries from cfake.com"""
+ subcategory = "category"
+ pattern = (BASE_PATTERN + r"/images/(categories)"
+ r"/([^/?#]+)/(\d+)()(?:/p(\d+))?")
+ example = "https://cfake.com/images/categories/NAME/123"
+
+
+class CfakeCreatedExtractor(CfakeExtractor):
+ """Extractor for 'created' image galleries from cfake.com"""
+ subcategory = "created"
+ pattern = (BASE_PATTERN + r"/images/(created)"
+ r"/([^/?#]+)/(\d+)/(\d+)(?:/p(\d+))?")
+ example = "https://cfake.com/images/created/NAME/12345/123"
+
+
+class CfakeCountryExtractor(CfakeExtractor):
+ """Extractor for country image galleries from cfake.com"""
+ subcategory = "country"
+ pattern = (BASE_PATTERN + r"/images/(country)"
+ r"/([^/?#]+)/(\d+)/(\d+)(?:/p(\d+))?")
+ example = "https://cfake.com/images/country/NAME/12345/123"
diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py
index 1552899..9a766d0 100644
--- a/gallery_dl/extractor/chevereto.py
+++ b/gallery_dl/extractor/chevereto.py
@@ -17,14 +17,17 @@ class CheveretoExtractor(BaseExtractor):
basecategory = "chevereto"
directory_fmt = ("{category}", "{user}", "{album}")
archive_fmt = "{id}"
+ parent = True
def _init(self):
self.path = self.groups[-1]
- def _pagination(self, url):
- while True:
- page = self.request(url).text
+ def _pagination(self, url, callback=None):
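+ # 'callback', if given, receives the first page's HTML,
+ # e.g. to extract album metadata before pagination starts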
+ page = self.request(url).text
+ if callback is not None:
+ callback(page)
+ while True:
for item in text.extract_iter(
page, '<div class="list-item-image ', 'image-container'):
yield text.urljoin(self.root, text.extr(
@@ -35,12 +38,13 @@ class CheveretoExtractor(BaseExtractor):
return
if url[0] == "/":
url = self.root + url
+ page = self.request(url).text
BASE_PATTERN = CheveretoExtractor.update({
"jpgfish": {
- "root": "https://jpg6.su",
- "pattern": r"(?:www\.)?jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)",
+ "root": "https://jpg7.cr",
+ "pattern": r"(?:www\.)?jpe?g\d?\.(?:cr|su|pet|fish(?:ing)?|church)",
},
"imagepond": {
"root": "https://imagepond.net",
@@ -56,8 +60,8 @@ BASE_PATTERN = CheveretoExtractor.update({
class CheveretoImageExtractor(CheveretoExtractor):
"""Extractor for chevereto images"""
subcategory = "image"
- pattern = BASE_PATTERN + r"(/im(?:g|age)/[^/?#]+)"
- example = "https://jpg2.su/img/TITLE.ID"
+ pattern = rf"{BASE_PATTERN}(/im(?:g|age)/[^/?#]+)"
+ example = "https://jpg7.cr/img/TITLE.ID"
def items(self):
url = self.root + self.path
@@ -74,25 +78,27 @@ class CheveretoImageExtractor(CheveretoExtractor):
url, b"seltilovessimpcity@simpcityhatesscrapers",
fromhex=True)
+ album_url, _, album_name = extr("Added to <a", "</a>").rpartition(">")
file = {
"id" : self.path.rpartition("/")[2].rpartition(".")[2],
"url" : url,
- "album": text.remove_html(extr(
- "Added to <a", "</a>").rpartition(">")[2]),
- "date" : text.parse_datetime(extr(
- '<span title="', '"'), "%Y-%m-%d %H:%M:%S"),
+ "album": text.remove_html(album_name),
+ "date" : self.parse_datetime_iso(extr('<span title="', '"')),
"user" : extr('username: "', '"'),
}
+ file["album_slug"], _, file["album_id"] = text.rextr(
+ album_url, "/", '"').rpartition(".")
+
text.nameext_from_url(file["url"], file)
- yield Message.Directory, file
+ yield Message.Directory, "", file
yield Message.Url, file["url"], file
class CheveretoVideoExtractor(CheveretoExtractor):
"""Extractor for chevereto videos"""
subcategory = "video"
- pattern = BASE_PATTERN + r"(/video/[^/?#]+)"
+ pattern = rf"{BASE_PATTERN}(/video/[^/?#]+)"
example = "https://imagepond.net/video/TITLE.ID"
def items(self):
@@ -114,13 +120,17 @@ class CheveretoVideoExtractor(CheveretoExtractor):
'property="video:height" content="', '"')),
"duration" : extr(
'class="far fa-clock"></i>', "—"),
- "album": text.remove_html(extr(
- "Added to <a", "</a>").rpartition(">")[2]),
- "date" : text.parse_datetime(extr(
- '<span title="', '"'), "%Y-%m-%d %H:%M:%S"),
+ "album" : extr(
+ "Added to <a", "</a>"),
+ "date" : self.parse_datetime_iso(extr('<span title="', '"')),
"user" : extr('username: "', '"'),
}
+ album_url, _, album_name = file["album"].rpartition(">")
+ file["album"] = text.remove_html(album_name)
+ file["album_slug"], _, file["album_id"] = text.rextr(
+ album_url, "/", '"').rpartition(".")
+
try:
min, _, sec = file["duration"].partition(":")
file["duration"] = int(min) * 60 + int(sec)
@@ -128,15 +138,15 @@ class CheveretoVideoExtractor(CheveretoExtractor):
pass
text.nameext_from_url(file["url"], file)
- yield Message.Directory, file
+ yield Message.Directory, "", file
yield Message.Url, file["url"], file
class CheveretoAlbumExtractor(CheveretoExtractor):
"""Extractor for chevereto albums"""
subcategory = "album"
- pattern = BASE_PATTERN + r"(/a(?:lbum)?/[^/?#]+(?:/sub)?)"
- example = "https://jpg2.su/album/TITLE.ID"
+ pattern = rf"{BASE_PATTERN}(/a(?:lbum)?/[^/?#]+(?:/sub)?)"
+ example = "https://jpg7.cr/album/TITLE.ID"
def items(self):
url = self.root + self.path
@@ -148,16 +158,31 @@ class CheveretoAlbumExtractor(CheveretoExtractor):
else:
albums = (url,)
+ kwdict = self.kwdict
for album in albums:
- for item_url in self._pagination(album):
+ for kwdict["num"], item_url in enumerate(self._pagination(
+ album, self._extract_metadata_album), 1):
data = data_video if "/video/" in item_url else data_image
yield Message.Queue, item_url, data
+ def _extract_metadata_album(self, page):
+ url, pos = text.extract(
+ page, 'property="og:url" content="', '"')
+ title, pos = text.extract(
+ page, 'property="og:title" content="', '"', pos)
+
+ kwdict = self.kwdict
+ kwdict["album_slug"], _, kwdict["album_id"] = \
+ url[url.rfind("/")+1:].rpartition(".")
+ kwdict["album"] = text.unescape(title)
+ kwdict["count"] = text.parse_int(text.extract(
+ page, 'data-text="image-count">', "<", pos)[0])
+
class CheveretoCategoryExtractor(CheveretoExtractor):
"""Extractor for chevereto galleries"""
subcategory = "category"
- pattern = BASE_PATTERN + r"(/category/[^/?#]+)"
+ pattern = rf"{BASE_PATTERN}(/category/[^/?#]+)"
example = "https://imglike.com/category/TITLE"
def items(self):
@@ -169,8 +194,8 @@ class CheveretoCategoryExtractor(CheveretoExtractor):
class CheveretoUserExtractor(CheveretoExtractor):
"""Extractor for chevereto users"""
subcategory = "user"
- pattern = BASE_PATTERN + r"(/[^/?#]+(?:/albums)?)"
- example = "https://jpg2.su/USER"
+ pattern = rf"{BASE_PATTERN}(/[^/?#]+(?:/albums)?)"
+ example = "https://jpg7.cr/USER"
def items(self):
url = self.root + self.path
diff --git a/gallery_dl/extractor/cien.py b/gallery_dl/extractor/cien.py
index 45e5dab..c68af2e 100644
--- a/gallery_dl/extractor/cien.py
+++ b/gallery_dl/extractor/cien.py
@@ -34,7 +34,7 @@ class CienExtractor(Extractor):
page = self.request(url, params=params).text
for card in text.extract_iter(
- page, ' class="c-cardCase-item', '</div>'):
+ page, ' class="c-cardCase-item', '</figure>'):
article_url = text.extr(card, ' href="', '"')
yield Message.Queue, article_url, data
@@ -48,7 +48,7 @@ class CienArticleExtractor(CienExtractor):
filename_fmt = "{num:>02} {filename}.{extension}"
directory_fmt = ("{category}", "{author[name]}", "{post_id} {name}")
archive_fmt = "{post_id}_{num}"
- pattern = BASE_PATTERN + r"/creator/(\d+)/article/(\d+)"
+ pattern = rf"{BASE_PATTERN}/creator/(\d+)/article/(\d+)"
example = "https://ci-en.net/creator/123/article/12345"
def items(self):
@@ -61,7 +61,7 @@ class CienArticleExtractor(CienExtractor):
post["post_url"] = url
post["post_id"] = text.parse_int(post_id)
post["count"] = len(files)
- post["date"] = text.parse_datetime(post["datePublished"])
+ post["date"] = self.parse_datetime_iso(post["datePublished"])
try:
post["author"]["id"] = text.parse_int(author_id)
@@ -70,7 +70,7 @@ class CienArticleExtractor(CienExtractor):
except Exception:
pass
- yield Message.Directory, post
+ yield Message.Directory, "", post
for post["num"], file in enumerate(files, 1):
post.update(file)
if "extension" not in file:
@@ -160,7 +160,7 @@ class CienArticleExtractor(CienExtractor):
class CienCreatorExtractor(CienExtractor):
subcategory = "creator"
- pattern = BASE_PATTERN + r"/creator/(\d+)(?:/article(?:\?([^#]+))?)?/?$"
+ pattern = rf"{BASE_PATTERN}/creator/(\d+)(?:/article(?:\?([^#]+))?)?/?$"
example = "https://ci-en.net/creator/123"
def items(self):
@@ -172,7 +172,7 @@ class CienCreatorExtractor(CienExtractor):
class CienRecentExtractor(CienExtractor):
subcategory = "recent"
- pattern = BASE_PATTERN + r"/mypage/recent(?:\?([^#]+))?"
+ pattern = rf"{BASE_PATTERN}/mypage/recent(?:\?([^#]+))?"
example = "https://ci-en.net/mypage/recent"
def items(self):
@@ -183,7 +183,7 @@ class CienRecentExtractor(CienExtractor):
class CienFollowingExtractor(CienExtractor):
subcategory = "following"
- pattern = BASE_PATTERN + r"/mypage/subscription(/following)?"
+ pattern = rf"{BASE_PATTERN}/mypage/subscription(/following)?"
example = "https://ci-en.net/mypage/subscription"
def items(self):
diff --git a/gallery_dl/extractor/civitai.py b/gallery_dl/extractor/civitai.py
index 26ee3fd..742c561 100644
--- a/gallery_dl/extractor/civitai.py
+++ b/gallery_dl/extractor/civitai.py
@@ -15,7 +15,7 @@ import itertools
import time
BASE_PATTERN = r"(?:https?://)?civitai\.com"
-USER_PATTERN = BASE_PATTERN + r"/user/([^/?#]+)"
+USER_PATTERN = rf"{BASE_PATTERN}/user/([^/?#]+)"
class CivitaiExtractor(Extractor):
@@ -61,13 +61,14 @@ class CivitaiExtractor(Extractor):
if isinstance(metadata, str):
metadata = metadata.split(",")
elif not isinstance(metadata, (list, tuple)):
- metadata = ("generation", "version", "post")
+ metadata = {"generation", "version", "post", "tags"}
self._meta_generation = ("generation" in metadata)
self._meta_version = ("version" in metadata)
self._meta_post = ("post" in metadata)
+ self._meta_tags = ("tags" in metadata)
else:
self._meta_generation = self._meta_version = self._meta_post = \
- False
+ self._meta_tags = False
def items(self):
if models := self.models():
@@ -86,8 +87,7 @@ class CivitaiExtractor(Extractor):
images = self.api.images_post(post["id"])
post = self.api.post(post["id"])
- post["date"] = text.parse_datetime(
- post["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ post["date"] = self.parse_datetime_iso(post["publishedAt"])
data = {
"post": post,
"user": post.pop("user"),
@@ -96,7 +96,7 @@ class CivitaiExtractor(Extractor):
data["model"], data["version"] = \
self._extract_meta_version(post)
- yield Message.Directory, data
+ yield Message.Directory, "", data
for file in self._image_results(images):
file.update(data)
yield Message.Url, file["url"], file
@@ -111,8 +111,9 @@ class CivitaiExtractor(Extractor):
}
if self._meta_generation:
- data["generation"] = \
- self._extract_meta_generation(file)
+ data["generation"] = self._extract_meta_generation(file)
+ if self._meta_tags:
+ data["tags"] = self._extract_meta_tags(file)
if self._meta_version:
data["model"], data["version"] = \
self._extract_meta_version(file, False)
@@ -122,8 +123,7 @@ class CivitaiExtractor(Extractor):
data["post"] = post = self._extract_meta_post(file)
if post:
post.pop("user", None)
- file["date"] = text.parse_datetime(
- file["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ file["date"] = self.parse_datetime_iso(file["createdAt"])
data["url"] = url = self._url(file)
text.nameext_from_url(url, data)
@@ -131,7 +131,7 @@ class CivitaiExtractor(Extractor):
data["extension"] = (
self._video_ext if file.get("type") == "video" else
self._image_ext)
- yield Message.Directory, data
+ yield Message.Directory, "", data
yield Message.Url, url, data
return
@@ -180,10 +180,11 @@ class CivitaiExtractor(Extractor):
if "id" not in file and data["filename"].isdecimal():
file["id"] = text.parse_int(data["filename"])
if "date" not in file:
- file["date"] = text.parse_datetime(
- file["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ file["date"] = self.parse_datetime_iso(file["createdAt"])
if self._meta_generation:
file["generation"] = self._extract_meta_generation(file)
+ if self._meta_tags:
+ file["tags"] = self._extract_meta_tags(file)
yield data
def _image_reactions(self):
@@ -211,16 +212,21 @@ class CivitaiExtractor(Extractor):
try:
return self.api.image_generationdata(image["id"])
except Exception as exc:
- return self.log.debug("", exc_info=exc)
+ return self.log.traceback(exc)
def _extract_meta_post(self, image):
try:
post = self.api.post(image["postId"])
- post["date"] = text.parse_datetime(
- post["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ post["date"] = self.parse_datetime_iso(post["publishedAt"])
return post
except Exception as exc:
- return self.log.debug("", exc_info=exc)
+ return self.log.traceback(exc)
+
+ def _extract_meta_tags(self, image):
+ try:
+ return self.api.tag_getvotabletags(image["id"])
+ except Exception as exc:
+ return self.log.traceback(exc)
def _extract_meta_version(self, item, is_post=True):
try:
@@ -228,7 +234,7 @@ class CivitaiExtractor(Extractor):
version = self.api.model_version(version_id).copy()
return version.pop("model", None), version
except Exception as exc:
- self.log.debug("", exc_info=exc)
+ self.log.traceback(exc)
return None, None
def _extract_version_id(self, item, is_post=True):
@@ -252,7 +258,7 @@ class CivitaiModelExtractor(CivitaiExtractor):
directory_fmt = ("{category}", "{user[username]}",
"{model[id]}{model[name]:? //}",
"{version[id]}{version[name]:? //}")
- pattern = BASE_PATTERN + r"/models/(\d+)(?:/?\?modelVersionId=(\d+))?"
+ pattern = rf"{BASE_PATTERN}/models/(\d+)(?:/?\?modelVersionId=(\d+))?"
example = "https://civitai.com/models/12345/TITLE"
def items(self):
@@ -278,8 +284,7 @@ class CivitaiModelExtractor(CivitaiExtractor):
versions = (version,)
for version in versions:
- version["date"] = text.parse_datetime(
- version["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ version["date"] = self.parse_datetime_iso(version["createdAt"])
data = {
"model" : model,
@@ -287,7 +292,7 @@ class CivitaiModelExtractor(CivitaiExtractor):
"user" : user,
}
- yield Message.Directory, data
+ yield Message.Directory, "", data
for file in self._extract_files(model, version, user):
file.update(data)
yield Message.Url, file["url"], file
@@ -342,9 +347,9 @@ class CivitaiModelExtractor(CivitaiExtractor):
params = {
"modelVersionId": version["id"],
"prioritizedUserIds": (user["id"],),
- "period": "AllTime",
- "sort": "Most Reactions",
- "limit": 20,
+ "period" : self.api._param_period(),
+ "sort" : self.api._param_sort(),
+ "limit" : 20,
"pending": True,
}
images = self.api.images(params, defaults=False)
@@ -370,7 +375,7 @@ class CivitaiModelExtractor(CivitaiExtractor):
class CivitaiImageExtractor(CivitaiExtractor):
subcategory = "image"
- pattern = BASE_PATTERN + r"/images/(\d+)"
+ pattern = rf"{BASE_PATTERN}/images/(\d+)"
example = "https://civitai.com/images/12345"
def images(self):
@@ -381,7 +386,7 @@ class CivitaiCollectionExtractor(CivitaiExtractor):
subcategory = "collection"
directory_fmt = ("{category}", "{user_collection[username]}",
"collections", "{collection[id]}{collection[name]:? //}")
- pattern = BASE_PATTERN + r"/collections/(\d+)"
+ pattern = rf"{BASE_PATTERN}/collections/(\d+)"
example = "https://civitai.com/collections/12345"
def images(self):
@@ -391,8 +396,8 @@ class CivitaiCollectionExtractor(CivitaiExtractor):
params = {
"collectionId" : cid,
- "period" : "AllTime",
- "sort" : "Newest",
+ "period" : self.api._param_period(),
+ "sort" : self.api._param_sort(),
"browsingLevel" : self.api.nsfw,
"include" : ("cosmetics",),
}
@@ -403,7 +408,7 @@ class CivitaiPostExtractor(CivitaiExtractor):
subcategory = "post"
directory_fmt = ("{category}", "{username|user[username]}", "posts",
"{post[id]}{post[title]:? //}")
- pattern = BASE_PATTERN + r"/posts/(\d+)"
+ pattern = rf"{BASE_PATTERN}/posts/(\d+)"
example = "https://civitai.com/posts/12345"
def posts(self):
@@ -412,7 +417,7 @@ class CivitaiPostExtractor(CivitaiExtractor):
class CivitaiTagExtractor(CivitaiExtractor):
subcategory = "tag"
- pattern = BASE_PATTERN + r"/tag/([^/?&#]+)"
+ pattern = rf"{BASE_PATTERN}/tag/([^/?&#]+)"
example = "https://civitai.com/tag/TAG"
def models(self):
@@ -422,7 +427,7 @@ class CivitaiTagExtractor(CivitaiExtractor):
class CivitaiSearchModelsExtractor(CivitaiExtractor):
subcategory = "search-models"
- pattern = BASE_PATTERN + r"/search/models\?([^#]+)"
+ pattern = rf"{BASE_PATTERN}/search/models\?([^#]+)"
example = "https://civitai.com/search/models?query=QUERY"
def models(self):
@@ -433,7 +438,7 @@ class CivitaiSearchModelsExtractor(CivitaiExtractor):
class CivitaiSearchImagesExtractor(CivitaiExtractor):
subcategory = "search-images"
- pattern = BASE_PATTERN + r"/search/images\?([^#]+)"
+ pattern = rf"{BASE_PATTERN}/search/images\?([^#]+)"
example = "https://civitai.com/search/images?query=QUERY"
def images(self):
@@ -444,7 +449,7 @@ class CivitaiSearchImagesExtractor(CivitaiExtractor):
class CivitaiModelsExtractor(CivitaiExtractor):
subcategory = "models"
- pattern = BASE_PATTERN + r"/models(?:/?\?([^#]+))?(?:$|#)"
+ pattern = rf"{BASE_PATTERN}/models(?:/?\?([^#]+))?(?:$|#)"
example = "https://civitai.com/models"
def models(self):
@@ -454,7 +459,7 @@ class CivitaiModelsExtractor(CivitaiExtractor):
class CivitaiImagesExtractor(CivitaiExtractor):
subcategory = "images"
- pattern = BASE_PATTERN + r"/images(?:/?\?([^#]+))?(?:$|#)"
+ pattern = rf"{BASE_PATTERN}/images(?:/?\?([^#]+))?(?:$|#)"
example = "https://civitai.com/images"
def images(self):
@@ -465,7 +470,7 @@ class CivitaiImagesExtractor(CivitaiExtractor):
class CivitaiVideosExtractor(CivitaiExtractor):
subcategory = "videos"
- pattern = BASE_PATTERN + r"/videos(?:/?\?([^#]+))?(?:$|#)"
+ pattern = rf"{BASE_PATTERN}/videos(?:/?\?([^#]+))?(?:$|#)"
example = "https://civitai.com/videos"
def images(self):
@@ -476,7 +481,7 @@ class CivitaiVideosExtractor(CivitaiExtractor):
class CivitaiPostsExtractor(CivitaiExtractor):
subcategory = "posts"
- pattern = BASE_PATTERN + r"/posts(?:/?\?([^#]+))?(?:$|#)"
+ pattern = rf"{BASE_PATTERN}/posts(?:/?\?([^#]+))?(?:$|#)"
example = "https://civitai.com/posts"
def posts(self):
@@ -485,7 +490,7 @@ class CivitaiPostsExtractor(CivitaiExtractor):
class CivitaiUserExtractor(Dispatch, CivitaiExtractor):
- pattern = USER_PATTERN + r"/?(?:$|\?|#)"
+ pattern = rf"{USER_PATTERN}/?(?:$|\?|#)"
example = "https://civitai.com/user/USER"
def items(self):
@@ -501,7 +506,7 @@ class CivitaiUserExtractor(Dispatch, CivitaiExtractor):
class CivitaiUserModelsExtractor(CivitaiExtractor):
subcategory = "user-models"
- pattern = USER_PATTERN + r"/models/?(?:\?([^#]+))?"
+ pattern = rf"{USER_PATTERN}/models/?(?:\?([^#]+))?"
example = "https://civitai.com/user/USER/models"
def models(self):
@@ -515,7 +520,7 @@ class CivitaiUserPostsExtractor(CivitaiExtractor):
subcategory = "user-posts"
directory_fmt = ("{category}", "{username|user[username]}", "posts",
"{post[id]}{post[title]:? //}")
- pattern = USER_PATTERN + r"/posts/?(?:\?([^#]+))?"
+ pattern = rf"{USER_PATTERN}/posts/?(?:\?([^#]+))?"
example = "https://civitai.com/user/USER/posts"
def posts(self):
@@ -527,7 +532,7 @@ class CivitaiUserPostsExtractor(CivitaiExtractor):
class CivitaiUserImagesExtractor(CivitaiExtractor):
subcategory = "user-images"
- pattern = USER_PATTERN + r"/images/?(?:\?([^#]+))?"
+ pattern = rf"{USER_PATTERN}/images/?(?:\?([^#]+))?"
example = "https://civitai.com/user/USER/images"
def __init__(self, match):
@@ -548,7 +553,7 @@ class CivitaiUserImagesExtractor(CivitaiExtractor):
class CivitaiUserVideosExtractor(CivitaiExtractor):
subcategory = "user-videos"
directory_fmt = ("{category}", "{username|user[username]}", "videos")
- pattern = USER_PATTERN + r"/videos/?(?:\?([^#]+))?"
+ pattern = rf"{USER_PATTERN}/videos/?(?:\?([^#]+))?"
example = "https://civitai.com/user/USER/videos"
def __init__(self, match):
@@ -567,7 +572,7 @@ class CivitaiUserVideosExtractor(CivitaiExtractor):
class CivitaiUserCollectionsExtractor(CivitaiExtractor):
subcategory = "user-collections"
- pattern = USER_PATTERN + r"/collections/?(?:\?([^#]+))?"
+ pattern = rf"{USER_PATTERN}/collections/?(?:\?([^#]+))?"
example = "https://civitai.com/user/USER/collections"
def items(self):
@@ -586,16 +591,15 @@ class CivitaiGeneratedExtractor(CivitaiExtractor):
subcategory = "generated"
filename_fmt = "{filename}.{extension}"
directory_fmt = ("{category}", "generated")
- pattern = f"{BASE_PATTERN}/generate"
+ pattern = rf"{BASE_PATTERN}/generate"
example = "https://civitai.com/generate"
def items(self):
self._require_auth()
for gen in self.api.orchestrator_queryGeneratedImages():
- gen["date"] = text.parse_datetime(
- gen["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
- yield Message.Directory, gen
+ gen["date"] = self.parse_datetime_iso(gen["createdAt"])
+ yield Message.Directory, "", gen
for step in gen.pop("steps", ()):
for image in step.pop("images", ()):
data = {"file": image, **step, **gen}
@@ -719,8 +723,8 @@ class CivitaiTrpcAPI():
if defaults:
params = self._merge_params(params, {
"useIndex" : True,
- "period" : "AllTime",
- "sort" : "Newest",
+ "period" : self._param_period(),
+ "sort" : self._param_sort(),
"withMeta" : False, # Metadata Only
"fromPlatform" : False, # Made On-Site
"browsingLevel": self.nsfw,
@@ -733,8 +737,8 @@ class CivitaiTrpcAPI():
def images_gallery(self, model, version, user):
endpoint = "image.getImagesAsPostsInfinite"
params = {
- "period" : "AllTime",
- "sort" : "Newest",
+ "period" : self._param_period(),
+ "sort" : self._param_sort(),
"modelVersionId": version["id"],
"modelId" : model["id"],
"hidden" : False,
@@ -768,9 +772,9 @@ class CivitaiTrpcAPI():
if defaults:
params = self._merge_params(params, {
- "period" : "AllTime",
+ "period" : self._param_period(),
"periodMode" : "published",
- "sort" : "Newest",
+ "sort" : self._param_sort(),
"pending" : False,
"hidden" : False,
"followed" : False,
@@ -797,9 +801,9 @@ class CivitaiTrpcAPI():
if defaults:
params = self._merge_params(params, {
"browsingLevel": self.nsfw,
- "period" : "AllTime",
+ "period" : self._param_period(),
"periodMode" : "published",
- "sort" : "Newest",
+ "sort" : self._param_sort(),
"followed" : False,
"draftOnly" : False,
"pending" : True,
@@ -821,12 +825,17 @@ class CivitaiTrpcAPI():
if defaults:
params = self._merge_params(params, {
"browsingLevel": self.nsfw,
- "sort" : "Newest",
+ "sort" : self._param_sort(),
})
params = self._type_params(params)
return self._pagination(endpoint, params)
+ def tag_getvotabletags(self, image_id):
+ endpoint = "tag.getVotableTags"
+ params = {"id": int(image_id), "type": "image"}
+ return self._call(endpoint, params)
+
def user(self, username):
endpoint = "user.getCreator"
params = {"username": username}
@@ -835,7 +844,7 @@ class CivitaiTrpcAPI():
def orchestrator_queryGeneratedImages(self):
endpoint = "orchestrator.queryGeneratedImages"
params = {
- "ascending": False,
+ "ascending": True if self._param_sort() == "Oldest" else False,
"tags" : ("gen",),
"authed" : True,
}
@@ -908,6 +917,21 @@ class CivitaiTrpcAPI():
params[name] = [type(item) for item in value]
return params
+ def _param_period(self):
+ if period := self.extractor.config("period"):
+ return period
+ return "AllTime"
+
+ def _param_sort(self):
+ if sort := self.extractor.config("sort"):
+ s = sort[0].lower()
+ if s in "drn":
+ return "Newest"
+ if s in "ao":
+ return "Oldest"
+ return sort
+ return "Newest"
+
def _bool(value):
return value == "true"
diff --git a/gallery_dl/extractor/comedywildlifephoto.py b/gallery_dl/extractor/comedywildlifephoto.py
new file mode 100644
index 0000000..a1c1ef4
--- /dev/null
+++ b/gallery_dl/extractor/comedywildlifephoto.py
@@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.comedywildlifephoto.com/"""
+
+from .common import GalleryExtractor
+from .. import text
+
+
+class ComedywildlifephotoGalleryExtractor(GalleryExtractor):
+ """Extractor for comedywildlifephoto galleries"""
+ category = "comedywildlifephoto"
+ root = "https://www.comedywildlifephoto.com"
+ directory_fmt = ("{category}", "{section}", "{title}")
+ filename_fmt = "{num:>03} {filename}.{extension}"
+ archive_fmt = "{section}/{title}/{num}"
+ pattern = (r"(?:https?://)?(?:www\.)?comedywildlifephoto\.com"
+ r"(/gallery/[^/?#]+/[^/?#]+\.php)")
+ example = "https://www.comedywildlifephoto.com/gallery/SECTION/TITLE.php"
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+
+ return {
+ "section": extr("<h1>", "<").strip(),
+ "title" : extr(">", "<"),
+ "description": text.unescape(extr(
+ 'class="c1 np">', "<div")),
+ }
+
+ def images(self, page):
+ results = []
+
+ for fig in text.extract_iter(page, "<figure", "</figure>"):
+ width, _, height = text.extr(
+ fig, 'data-size="', '"').partition("x")
+ results.append((
+ self.root + text.extr(fig, 'href="', '"'), {
+ "width" : text.parse_int(width),
+ "height" : text.parse_int(height),
+ "caption": text.unescape(text.extr(
+ fig, "<figcaption>", "<")),
+ }
+ ))
+
+ return results
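
As with other GalleryExtractor subclasses, images() returns a list of (url, metadata) pairs. A stdlib-only sketch of the per-<figure> parsing above, assuming well-formed attributes (the real text.extr() helpers are more forgiving with malformed markup):

    import re

    FIG = ('<figure data-size="1000x667"><a href="/gallery/2024/winner.jpg">'
           '</a><figcaption>Laughing snake</figcaption></figure>')

    def parse_figure(fig, root="https://www.comedywildlifephoto.com"):
        width, _, height = re.search(
            r'data-size="([^"]*)"', fig)[1].partition("x")
        return root + re.search(r'href="([^"]*)"', fig)[1], {
            "width":   int(width),
            "height":  int(height),
            "caption": re.search(r"<figcaption>([^<]*)", fig)[1],
        }

    print(parse_figure(FIG))
    # ('https://www.comedywildlifephoto.com/gallery/2024/winner.jpg',
    #  {'width': 1000, 'height': 667, 'caption': 'Laughing snake'})
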
diff --git a/gallery_dl/extractor/comick.py b/gallery_dl/extractor/comick.py
index c76694c..9816786 100644
--- a/gallery_dl/extractor/comick.py
+++ b/gallery_dl/extractor/comick.py
@@ -27,7 +27,7 @@ class ComickCoversExtractor(ComickBase, GalleryExtractor):
directory_fmt = ("{category}", "{manga}", "Covers")
filename_fmt = "{volume:>02}_{lang}.{extension}"
archive_fmt = "c_{id}"
- pattern = BASE_PATTERN + r"/comic/([\w-]+)/cover"
+ pattern = rf"{BASE_PATTERN}/comic/([\w-]+)/cover"
example = "https://comick.io/comic/MANGA/cover"
def metadata(self, page):
@@ -60,7 +60,7 @@ class ComickCoversExtractor(ComickBase, GalleryExtractor):
class ComickChapterExtractor(ComickBase, ChapterExtractor):
"""Extractor for comick.io manga chapters"""
archive_fmt = "{chapter_hid}_{page}"
- pattern = (BASE_PATTERN + r"/comic/([\w-]+)"
+ pattern = (rf"{BASE_PATTERN}/comic/([\w-]+)"
r"/(\w+(?:-(?:chapter|volume)-[^/?#]+)?)")
example = "https://comick.io/comic/MANGA/ID-chapter-123-en"
@@ -114,10 +114,8 @@ class ComickChapterExtractor(ComickBase, ChapterExtractor):
"chapter_hid" : ch["hid"],
"chapter_string": chstr,
"group" : ch["group_name"],
- "date" : text.parse_datetime(
- ch["created_at"][:19], "%Y-%m-%dT%H:%M:%S"),
- "date_updated" : text.parse_datetime(
- ch["updated_at"][:19], "%Y-%m-%dT%H:%M:%S"),
+ "date" : self.parse_datetime_iso(ch["created_at"][:19]),
+ "date_updated" : self.parse_datetime_iso(ch["updated_at"][:19]),
"lang" : ch["lang"],
}
@@ -142,7 +140,7 @@ class ComickChapterExtractor(ComickBase, ChapterExtractor):
class ComickMangaExtractor(ComickBase, MangaExtractor):
"""Extractor for comick.io manga"""
- pattern = BASE_PATTERN + r"/comic/([\w-]+)/?(?:\?([^#]+))?"
+ pattern = rf"{BASE_PATTERN}/comic/([\w-]+)/?(?:\?([^#]+))?"
example = "https://comick.io/comic/MANGA"
def items(self):
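
Note the [:19] slices kept from the old code: they trim each timestamp to second precision ("YYYY-MM-DDTHH:MM:SS" is exactly 19 characters), so fractional seconds or timezone suffixes never reach the parser:

    ts = "2024-05-01T12:34:56.789000Z"
    print(ts[:19])    # 2024-05-01T12:34:56
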
diff --git a/gallery_dl/extractor/comicvine.py b/gallery_dl/extractor/comicvine.py
index 39397b9..f579ef7 100644
--- a/gallery_dl/extractor/comicvine.py
+++ b/gallery_dl/extractor/comicvine.py
@@ -60,6 +60,6 @@ class ComicvineTagExtractor(BooruExtractor):
_file_url = operator.itemgetter("original")
def _prepare(self, post):
- post["date"] = text.parse_datetime(
+ post["date"] = self.parse_datetime(
post["dateCreated"], "%a, %b %d %Y")
post["tags"] = [tag["name"] for tag in post["tags"] if tag["name"]]
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 34e65c5..13c7bbe 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -19,11 +19,10 @@ import getpass
import logging
import requests
import threading
-from datetime import datetime
from xml.etree import ElementTree
from requests.adapters import HTTPAdapter
from .message import Message
-from .. import config, output, text, util, cache, exception
+from .. import config, output, text, util, dt, cache, exception
urllib3 = requests.packages.urllib3
@@ -32,7 +31,9 @@ class Extractor():
category = ""
subcategory = ""
basecategory = ""
+ basesubcategory = ""
categorytransfer = False
+ parent = False
directory_fmt = ("{category}",)
filename_fmt = "{filename}.{extension}"
archive_fmt = ""
@@ -64,6 +65,10 @@ class Extractor():
else:
self.category = CATEGORY_MAP[self.category]
+ self.parse_datetime = dt.parse
+ self.parse_datetime_iso = dt.parse_iso
+ self.parse_timestamp = dt.parse_ts
+
self._cfgpath = ("extractor", self.category, self.subcategory)
self._parentdir = ""
@@ -89,7 +94,8 @@ class Extractor():
pass
def items(self):
- yield Message.Version, 1
+ return
+        yield  # unreachable; makes items() an empty generator
def skip(self, num):
return 0
@@ -313,9 +319,9 @@ class Extractor():
seconds = float(seconds)
until = now + seconds
elif until:
- if isinstance(until, datetime):
+ if isinstance(until, dt.datetime):
# convert to UTC timestamp
- until = util.datetime_to_timestamp(until)
+ until = dt.to_ts(until)
else:
until = float(until)
seconds = until - now
@@ -327,7 +333,7 @@ class Extractor():
return
if reason:
- t = datetime.fromtimestamp(until).time()
+ t = dt.datetime.fromtimestamp(until).time()
isotime = f"{t.hour:02}:{t.minute:02}:{t.second:02}"
self.log.info("Waiting until %s (%s)", isotime, reason)
time.sleep(seconds)
@@ -652,7 +658,7 @@ class Extractor():
self.log.warning(
"cookies: %s/%s expired at %s",
cookie.domain.lstrip("."), cookie.name,
- datetime.fromtimestamp(cookie.expires))
+ dt.datetime.fromtimestamp(cookie.expires))
continue
elif diff <= 86400:
@@ -693,13 +699,16 @@ class Extractor():
def get(key, default):
ts = self.config(key, default)
if isinstance(ts, str):
- try:
- ts = int(datetime.strptime(ts, fmt).timestamp())
- except ValueError as exc:
- self.log.warning("Unable to parse '%s': %s", key, exc)
+ dt_obj = dt.parse_iso(ts) if fmt is None else dt.parse(ts, fmt)
+ if dt_obj is dt.NONE:
+ self.log.warning(
+ "Unable to parse '%s': Invalid %s string '%s'",
+ key, "isoformat" if fmt is None else "date", ts)
ts = default
+ else:
+ ts = int(dt.to_ts(dt_obj))
return ts
- fmt = self.config("date-format", "%Y-%m-%dT%H:%M:%S")
+ fmt = self.config("date-format")
return get("date-min", dmin), get("date-max", dmax)
@classmethod
@@ -793,7 +802,7 @@ class GalleryExtractor(Extractor):
enum = util.enumerate_reversed
images = enum(imgs, 1)
- yield Message.Directory, data
+ yield Message.Directory, "", data
enum_key = self.enum
if assets:
@@ -912,7 +921,7 @@ class Dispatch():
elif isinstance(include, str):
include = include.replace(" ", "").split(",")
- results = [(Message.Version, 1)]
+ results = []
for category in include:
try:
extr, url = extractors[category]
@@ -962,18 +971,16 @@ class BaseExtractor(Extractor):
def __init__(self, match):
if not self.category:
- self.groups = match.groups()
- self.match = match
- self._init_category()
+ self._init_category(match)
Extractor.__init__(self, match)
- def _init_category(self):
- for index, group in enumerate(self.groups):
+ def _init_category(self, match):
+ for index, group in enumerate(match.groups()):
if group is not None:
if index:
self.category, self.root, info = self.instances[index-1]
if not self.root:
- self.root = text.root_from_url(self.match[0])
+ self.root = text.root_from_url(match[0])
self.config_instance = info.get
else:
self.root = group
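
The new gallery_dl.dt module itself is not shown in this section. Judging purely by its call sites here — dt.parse, dt.parse_iso, dt.parse_ts, dt.to_ts, the dt.NONE failure sentinel, plus re-exported datetime/date/timedelta — a behaviour-compatible shim would look roughly like the following. This is an inference from usage, not the actual module:

    from datetime import date, datetime, timedelta, timezone  # noqa: F401

    NONE = datetime(1, 1, 1)  # failure sentinel; call sites compare with "is"

    def parse(value, fmt):
        try:
            return datetime.strptime(value, fmt)
        except (TypeError, ValueError):
            return NONE

    def parse_iso(value):
        try:  # accept a trailing "Z" on any Python version
            return datetime.fromisoformat(value.replace("Z", "+00:00"))
        except (AttributeError, TypeError, ValueError):
            return NONE

    def parse_ts(value):
        try:
            return datetime.fromtimestamp(int(value), timezone.utc)
        except (TypeError, ValueError):
            return NONE

    def to_ts(d):
        # treat naive datetimes as UTC, matching the old
        # util.datetime_to_timestamp() behaviour this diff replaces
        if d.tzinfo is None:
            d = d.replace(tzinfo=timezone.utc)
        return d.timestamp()
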
diff --git a/gallery_dl/extractor/cyberdrop.py b/gallery_dl/extractor/cyberdrop.py
index b3944f7..93d3953 100644
--- a/gallery_dl/extractor/cyberdrop.py
+++ b/gallery_dl/extractor/cyberdrop.py
@@ -4,27 +4,27 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://cyberdrop.me/"""
+"""Extractors for https://cyberdrop.cr/"""
from . import lolisafe
from .common import Message
from .. import text
-BASE_PATTERN = r"(?:https?://)?(?:www\.)?cyberdrop\.(?:me|to)"
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?cyberdrop\.(?:cr|me|to)"
class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor):
"""Extractor for cyberdrop albums"""
category = "cyberdrop"
- root = "https://cyberdrop.me"
- root_api = "https://api.cyberdrop.me"
- pattern = BASE_PATTERN + r"/a/([^/?#]+)"
- example = "https://cyberdrop.me/a/ID"
+ root = "https://cyberdrop.cr"
+ root_api = "https://api.cyberdrop.cr"
+ pattern = rf"{BASE_PATTERN}/a/([^/?#]+)"
+ example = "https://cyberdrop.cr/a/ID"
def items(self):
files, data = self.fetch_album(self.album_id)
- yield Message.Directory, data
+ yield Message.Directory, "", data
for data["num"], file in enumerate(files, 1):
file.update(data)
text.nameext_from_url(file["name"], file)
@@ -47,7 +47,7 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor):
"album_name" : text.unescape(extr('title="', '"')),
"album_size" : text.parse_bytes(extr(
'<p class="title">', "B")),
- "date" : text.parse_datetime(extr(
+ "date" : self.parse_datetime(extr(
'<p class="title">', '<'), "%d.%m.%Y"),
"description": text.unescape(text.unescape( # double
desc.rpartition(" [R")[0])),
@@ -76,8 +76,8 @@ class CyberdropMediaExtractor(CyberdropAlbumExtractor):
"""Extractor for cyberdrop media links"""
subcategory = "media"
directory_fmt = ("{category}",)
- pattern = BASE_PATTERN + r"/f/([^/?#]+)"
- example = "https://cyberdrop.me/f/ID"
+ pattern = rf"{BASE_PATTERN}/f/([^/?#]+)"
+ example = "https://cyberdrop.cr/f/ID"
def fetch_album(self, album_id):
return self._extract_files((album_id,)), {
diff --git a/gallery_dl/extractor/cyberfile.py b/gallery_dl/extractor/cyberfile.py
index 2ea81d6..e8c0061 100644
--- a/gallery_dl/extractor/cyberfile.py
+++ b/gallery_dl/extractor/cyberfile.py
@@ -56,7 +56,9 @@ class CyberfileFolderExtractor(CyberfileExtractor):
url = f"{self.root}/folder/{folder_hash}"
folder_num = text.extr(self.request(url).text, "ages('folder', '", "'")
- extract_urls = text.re(r'dtfullurl="([^"]+)').findall
+ extract_folders = text.re(r'sharing-url="([^"]+)').findall
+ extract_files = text.re(r'dtfullurl="([^"]+)').findall
+ recursive = self.config("recursive", True)
perpage = 600
data = {
@@ -67,25 +69,63 @@ class CyberfileFolderExtractor(CyberfileExtractor):
"filterOrderBy": "",
}
resp = self.request_api("/account/ajax/load_files", data)
+ html = resp["html"]
folder = {
- "_extractor" : CyberfileFileExtractor,
"folder_hash": folder_hash,
"folder_num" : text.parse_int(folder_num),
"folder" : resp["page_title"],
}
while True:
- urls = extract_urls(resp["html"])
- for url in urls:
- yield Message.Queue, url, folder
-
- if len(urls) < perpage:
+ folders = extract_folders(html)
+ if recursive and folders:
+ folder["_extractor"] = CyberfileFolderExtractor
+ for url in folders:
+ yield Message.Queue, url, folder
+
+ if files := extract_files(html):
+ folder["_extractor"] = CyberfileFileExtractor
+ for url in files:
+ yield Message.Queue, url, folder
+
+ if len(folders) + len(files) < perpage:
return
data["pageStart"] += 1
resp = self.request_api("/account/ajax/load_files", data)
+class CyberfileSharedExtractor(CyberfileExtractor):
+ subcategory = "shared"
+ pattern = rf"{BASE_PATTERN}/shared/([a-zA-Z0-9]+)"
+ example = "https://cyberfile.me/shared/AbCdEfGhIjK"
+
+ def items(self):
+ # get 'filehosting' cookie
+ url = f"{self.root}/shared/{self.groups[0]}"
+ self.request(url, method="HEAD")
+
+ data = {
+ "pageType" : "nonaccountshared",
+ "nodeId" : "",
+ "pageStart": "1",
+ "perPage" : "500",
+ "filterOrderBy": "",
+ }
+ resp = self.request_api("/account/ajax/load_files", data)
+
+ html = resp["html"]
+        pos = html.find("<!-- /.navbar-collapse -->") + 26  # 26 == len(marker)
+
+ data = {"_extractor": CyberfileFolderExtractor}
+ for url in text.extract_iter(html, 'sharing-url="', '"', pos):
+ yield Message.Queue, url, data
+
+ data = {"_extractor": CyberfileFileExtractor}
+ for url in text.extract_iter(html, 'dtfullurl="', '"', pos):
+ yield Message.Queue, url, data
+
+
class CyberfileFileExtractor(CyberfileExtractor):
subcategory = "file"
directory_fmt = ("{category}", "{uploader}", "{folder}")
@@ -113,7 +153,7 @@ class CyberfileFileExtractor(CyberfileExtractor):
"Filesize:", "</tr>"))[:-1]),
"tags" : text.split_html(extr(
"Keywords:", "</tr>")),
- "date" : text.parse_datetime(text.remove_html(extr(
+ "date" : self.parse_datetime(text.remove_html(extr(
"Uploaded:", "</tr>")), "%d/%m/%Y %H:%M:%S"),
"permissions": text.remove_html(extr(
"Permissions:", "</tr>")).split(" &amp; "),
@@ -121,5 +161,5 @@ class CyberfileFileExtractor(CyberfileExtractor):
file["file_url"] = url = extr("openUrl('", "'")
text.nameext_from_url(file["name"] or url, file)
- yield Message.Directory, file
+ yield Message.Directory, "", file
yield Message.Url, url, file
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 29c7763..5ea33c4 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -9,8 +9,7 @@
"""Extractors for https://danbooru.donmai.us/ and other Danbooru instances"""
from .common import BaseExtractor, Message
-from .. import text, util
-import datetime
+from .. import text, util, dt
class DanbooruExtractor(BaseExtractor):
@@ -64,13 +63,12 @@ class DanbooruExtractor(BaseExtractor):
except KeyError:
if self.external and post["source"]:
post.update(data)
- yield Message.Directory, post
+ yield Message.Directory, "", post
yield Message.Queue, post["source"], post
continue
text.nameext_from_url(url, post)
- post["date"] = text.parse_datetime(
- post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ post["date"] = dt.parse_iso(post["created_at"])
post["tags"] = (
post["tag_string"].split(" ")
@@ -108,7 +106,7 @@ class DanbooruExtractor(BaseExtractor):
url = self.root + url
post.update(data)
- yield Message.Directory, post
+ yield Message.Directory, "", post
yield Message.Url, url, post
def items_artists(self):
@@ -253,7 +251,7 @@ class DanbooruTagExtractor(DanbooruExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
- pattern = BASE_PATTERN + r"/posts\?(?:[^&#]*&)*tags=([^&#]*)"
+ pattern = rf"{BASE_PATTERN}/posts\?(?:[^&#]*&)*tags=([^&#]*)"
example = "https://danbooru.donmai.us/posts?tags=TAG"
def metadata(self):
@@ -281,7 +279,7 @@ class DanbooruTagExtractor(DanbooruExtractor):
class DanbooruRandomExtractor(DanbooruTagExtractor):
"""Extractor for a random danbooru post"""
subcategory = "random"
- pattern = BASE_PATTERN + r"/posts/random(?:\?(?:[^&#]*&)*tags=([^&#]*))?"
+ pattern = rf"{BASE_PATTERN}/posts/random(?:\?(?:[^&#]*&)*tags=([^&#]*))?"
example = "https://danbooru.donmai.us/posts/random?tags=TAG"
def metadata(self):
@@ -301,7 +299,7 @@ class DanbooruPoolExtractor(DanbooruExtractor):
directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name]}")
filename_fmt = "{num:>04}_{id}_{filename}.{extension}"
archive_fmt = "p_{pool[id]}_{id}"
- pattern = BASE_PATTERN + r"/pool(?:s|/show)/(\d+)"
+ pattern = rf"{BASE_PATTERN}/pool(?:s|/show)/(\d+)"
example = "https://danbooru.donmai.us/pools/12345"
def metadata(self):
@@ -319,7 +317,7 @@ class DanbooruFavgroupExtractor(DanbooruExtractor):
"{favgroup[id]} {favgroup[name]}")
filename_fmt = "{num:>04}_{id}_{filename}.{extension}"
archive_fmt = "fg_{favgroup[id]}_{id}"
- pattern = BASE_PATTERN + r"/favorite_group(?:s|/show)/(\d+)"
+ pattern = rf"{BASE_PATTERN}/favorite_group(?:s|/show)/(\d+)"
example = "https://danbooru.donmai.us/favorite_groups/12345"
def metadata(self):
@@ -334,7 +332,7 @@ class DanbooruPostExtractor(DanbooruExtractor):
"""Extractor for single danbooru posts"""
subcategory = "post"
archive_fmt = "{id}"
- pattern = BASE_PATTERN + r"/post(?:s|/show)/(\d+)"
+ pattern = rf"{BASE_PATTERN}/post(?:s|/show)/(\d+)"
example = "https://danbooru.donmai.us/posts/12345"
def posts(self):
@@ -346,22 +344,48 @@ class DanbooruPostExtractor(DanbooruExtractor):
return (post,)
+class DanbooruMediaassetExtractor(DanbooruExtractor):
+ """Extractor for a danbooru media asset"""
+ subcategory = "media-asset"
+ filename_fmt = "{category}_ma{id}_{filename}.{extension}"
+ archive_fmt = "m{id}"
+ pattern = rf"{BASE_PATTERN}/media_assets/(\d+)"
+ example = "https://danbooru.donmai.us/media_assets/12345"
+
+ def posts(self):
+ url = f"{self.root}/media_assets/{self.groups[-1]}.json"
+ asset = self.request_json(url)
+
+ asset["file_url"] = asset["variants"][-1]["url"]
+ asset["tag_string"] = \
+ asset["tag_string_artist"] = \
+ asset["tag_string_character"] = \
+ asset["tag_string_copyright"] = \
+ asset["tag_string_general"] = \
+ asset["tag_string_meta"] = ""
+
+ if self.includes:
+ params = {"only": self.includes}
+ asset.update(self.request_json(url, params=params))
+ return (asset,)
+
+
class DanbooruPopularExtractor(DanbooruExtractor):
"""Extractor for popular images from danbooru"""
subcategory = "popular"
directory_fmt = ("{category}", "popular", "{scale}", "{date}")
archive_fmt = "P_{scale[0]}_{date}_{id}"
- pattern = BASE_PATTERN + r"/(?:explore/posts/)?popular(?:\?([^#]*))?"
+ pattern = rf"{BASE_PATTERN}/(?:explore/posts/)?popular(?:\?([^#]*))?"
example = "https://danbooru.donmai.us/explore/posts/popular"
def metadata(self):
self.params = params = text.parse_query(self.groups[-1])
scale = params.get("scale", "day")
- date = params.get("date") or datetime.date.today().isoformat()
+ date = params.get("date") or dt.date.today().isoformat()
if scale == "week":
- date = datetime.date.fromisoformat(date)
- date = (date - datetime.timedelta(days=date.weekday())).isoformat()
+ date = dt.date.fromisoformat(date)
+ date = (date - dt.timedelta(days=date.weekday())).isoformat()
elif scale == "month":
date = date[:-3]
@@ -374,7 +398,7 @@ class DanbooruPopularExtractor(DanbooruExtractor):
class DanbooruArtistExtractor(DanbooruExtractor):
"""Extractor for danbooru artists"""
subcategory = "artist"
- pattern = BASE_PATTERN + r"/artists/(\d+)"
+ pattern = rf"{BASE_PATTERN}/artists/(\d+)"
example = "https://danbooru.donmai.us/artists/12345"
items = DanbooruExtractor.items_artists
@@ -387,7 +411,7 @@ class DanbooruArtistExtractor(DanbooruExtractor):
class DanbooruArtistSearchExtractor(DanbooruExtractor):
"""Extractor for danbooru artist searches"""
subcategory = "artist-search"
- pattern = BASE_PATTERN + r"/artists/?\?([^#]+)"
+ pattern = rf"{BASE_PATTERN}/artists/?\?([^#]+)"
example = "https://danbooru.donmai.us/artists?QUERY"
items = DanbooruExtractor.items_artists
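
The new media-asset extractor builds on Danbooru's /media_assets/{id}.json endpoint. A minimal illustration of what posts() does with the response, assuming (as the code above does) that "variants" is ordered smallest to largest so the last entry is the full-size file:

    import requests  # already a gallery-dl dependency

    asset = requests.get(
        "https://danbooru.donmai.us/media_assets/12345.json").json()
    print(asset["variants"][-1]["url"])   # used as the post's "file_url"
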
diff --git a/gallery_dl/extractor/dankefuerslesen.py b/gallery_dl/extractor/dankefuerslesen.py
index 1c4b7d8..ed7e40b 100644
--- a/gallery_dl/extractor/dankefuerslesen.py
+++ b/gallery_dl/extractor/dankefuerslesen.py
@@ -28,7 +28,7 @@ class DankefuerslesenBase():
class DankefuerslesenChapterExtractor(DankefuerslesenBase, ChapterExtractor):
"""Extractor for Danke fürs Lesen manga chapters"""
- pattern = BASE_PATTERN + r"/read/manga/([\w-]+)/([\w-]+)"
+ pattern = rf"{BASE_PATTERN}/read/manga/([\w-]+)/([\w-]+)"
example = "https://danke.moe/read/manga/TITLE/123/1/"
def _init(self):
@@ -68,7 +68,7 @@ class DankefuerslesenChapterExtractor(DankefuerslesenBase, ChapterExtractor):
"chapter_minor": minor,
"group" : manga["groups"][group_id].split(" & "),
"group_id" : text.parse_int(group_id),
- "date" : text.parse_timestamp(data["release_date"][group_id]),
+ "date" : self.parse_timestamp(data["release_date"][group_id]),
"lang" : util.NONE,
"language" : util.NONE,
}
@@ -95,7 +95,7 @@ class DankefuerslesenMangaExtractor(DankefuerslesenBase, MangaExtractor):
"""Extractor for Danke fürs Lesen manga"""
chapterclass = DankefuerslesenChapterExtractor
reverse = False
- pattern = BASE_PATTERN + r"/read/manga/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/read/manga/([^/?#]+)"
example = "https://danke.moe/read/manga/TITLE/"
def chapters(self, page):
diff --git a/gallery_dl/extractor/desktopography.py b/gallery_dl/extractor/desktopography.py
index 364d88f..be25053 100644
--- a/gallery_dl/extractor/desktopography.py
+++ b/gallery_dl/extractor/desktopography.py
@@ -22,7 +22,7 @@ class DesktopographyExtractor(Extractor):
class DesktopographySiteExtractor(DesktopographyExtractor):
"""Extractor for all desktopography exhibitions """
subcategory = "site"
- pattern = BASE_PATTERN + r"/$"
+ pattern = rf"{BASE_PATTERN}/$"
example = "https://desktopography.net/"
def items(self):
@@ -41,7 +41,7 @@ class DesktopographySiteExtractor(DesktopographyExtractor):
class DesktopographyExhibitionExtractor(DesktopographyExtractor):
"""Extractor for a yearly desktopography exhibition"""
subcategory = "exhibition"
- pattern = BASE_PATTERN + r"/exhibition-([^/?#]+)/"
+ pattern = rf"{BASE_PATTERN}/exhibition-([^/?#]+)/"
example = "https://desktopography.net/exhibition-2020/"
def __init__(self, match):
@@ -70,7 +70,7 @@ class DesktopographyExhibitionExtractor(DesktopographyExtractor):
class DesktopographyEntryExtractor(DesktopographyExtractor):
"""Extractor for all resolutions of a desktopography wallpaper"""
subcategory = "entry"
- pattern = BASE_PATTERN + r"/portfolios/([\w-]+)"
+ pattern = rf"{BASE_PATTERN}/portfolios/([\w-]+)"
example = "https://desktopography.net/portfolios/NAME/"
def __init__(self, match):
@@ -82,7 +82,7 @@ class DesktopographyEntryExtractor(DesktopographyExtractor):
page = self.request(url).text
entry_data = {"entry": self.entry}
- yield Message.Directory, entry_data
+ yield Message.Directory, "", entry_data
for image_data in text.extract_iter(
page,
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 39690da..5bd43d4 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -9,7 +9,7 @@
"""Extractors for https://www.deviantart.com/"""
from .common import Extractor, Message, Dispatch
-from .. import text, util, exception
+from .. import text, util, dt, exception
from ..cache import cache, memcache
import collections
import mimetypes
@@ -64,13 +64,13 @@ class DeviantartExtractor(Extractor):
if self.quality:
if self.quality == "png":
self.quality = "-fullview.png?"
- self.quality_sub = util.re(r"-fullview\.[a-z0-9]+\?").sub
+ self.quality_sub = text.re(r"-fullview\.[a-z0-9]+\?").sub
else:
self.quality = f",q_{self.quality}"
- self.quality_sub = util.re(r",q_\d+").sub
+ self.quality_sub = text.re(r",q_\d+").sub
if self.intermediary:
- self.intermediary_subn = util.re(r"(/f/[^/]+/[^/]+)/v\d+/.*").subn
+ self.intermediary_subn = text.re(r"(/f/[^/]+/[^/]+)/v\d+/.*").subn
if isinstance(self.original, str) and \
self.original.lower().startswith("image"):
@@ -154,7 +154,7 @@ class DeviantartExtractor(Extractor):
deviation.update(data)
self.prepare(deviation)
- yield Message.Directory, deviation
+ yield Message.Directory, "", deviation
if "content" in deviation:
content = self._extract_content(deviation)
@@ -259,7 +259,7 @@ class DeviantartExtractor(Extractor):
deviation["published_time"] = text.parse_int(
deviation["published_time"])
- deviation["date"] = text.parse_timestamp(
+ deviation["date"] = self.parse_timestamp(
deviation["published_time"])
if self.comments:
@@ -269,7 +269,7 @@ class DeviantartExtractor(Extractor):
)
# filename metadata
- sub = util.re(r"\W").sub
+ sub = text.re(r"\W").sub
deviation["filename"] = "".join((
sub("_", deviation["title"].lower()), "_by_",
sub("_", deviation["author"]["username"].lower()), "-d",
@@ -404,7 +404,7 @@ class DeviantartExtractor(Extractor):
try:
return self._tiptap_to_html(markup)
except Exception as exc:
- self.log.debug("", exc_info=exc)
+ self.log.traceback(exc)
self.log.error("%s: '%s: %s'", deviation["index"],
exc.__class__.__name__, exc)
@@ -675,7 +675,7 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\
def _find_folder(self, folders, name, uuid):
if uuid.isdecimal():
- match = util.re(
+ match = text.re(
"(?i)" + name.replace("-", "[^a-z0-9]+") + "$").match
for folder in folders:
if match(folder["name"]):
@@ -864,7 +864,7 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\
class DeviantartUserExtractor(Dispatch, DeviantartExtractor):
"""Extractor for an artist's user profile"""
- pattern = BASE_PATTERN + r"/?$"
+ pattern = rf"{BASE_PATTERN}/?$"
example = "https://www.deviantart.com/USER"
def items(self):
@@ -887,8 +887,8 @@ class DeviantartGalleryExtractor(DeviantartExtractor):
"""Extractor for all deviations from an artist's gallery"""
subcategory = "gallery"
archive_fmt = "g_{_username}_{index}.{extension}"
- pattern = (BASE_PATTERN + r"/gallery"
- r"(?:/all|/recommended-for-you|/?\?catpath=)?/?$")
+ pattern = (rf"{BASE_PATTERN}/gallery"
+ r"(?:/all|/recommended-for-you)?/?(\?(?!q=).*)?$")
example = "https://www.deviantart.com/USER/gallery/"
def deviations(self):
@@ -902,7 +902,7 @@ class DeviantartAvatarExtractor(DeviantartExtractor):
"""Extractor for an artist's avatar"""
subcategory = "avatar"
archive_fmt = "a_{_username}_{index}"
- pattern = BASE_PATTERN + r"/avatar"
+ pattern = rf"{BASE_PATTERN}/avatar"
example = "https://www.deviantart.com/USER/avatar/"
def deviations(self):
@@ -956,7 +956,7 @@ class DeviantartBackgroundExtractor(DeviantartExtractor):
"""Extractor for an artist's banner"""
subcategory = "background"
archive_fmt = "b_{index}"
- pattern = BASE_PATTERN + r"/ba(?:nner|ckground)"
+ pattern = rf"{BASE_PATTERN}/ba(?:nner|ckground)"
example = "https://www.deviantart.com/USER/banner/"
def deviations(self):
@@ -972,7 +972,7 @@ class DeviantartFolderExtractor(DeviantartExtractor):
subcategory = "folder"
directory_fmt = ("{category}", "{username}", "{folder[title]}")
archive_fmt = "F_{folder[uuid]}_{index}.{extension}"
- pattern = BASE_PATTERN + r"/gallery/([^/?#]+)/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/gallery/([^/?#]+)/([^/?#]+)"
example = "https://www.deviantart.com/USER/gallery/12345/TITLE"
def __init__(self, match):
@@ -1088,7 +1088,7 @@ class DeviantartFavoriteExtractor(DeviantartExtractor):
subcategory = "favorite"
directory_fmt = ("{category}", "{username}", "Favourites")
archive_fmt = "f_{_username}_{index}.{extension}"
- pattern = BASE_PATTERN + r"/favourites(?:/all|/?\?catpath=)?/?$"
+ pattern = rf"{BASE_PATTERN}/favourites(?:/all|/?\?catpath=)?/?$"
example = "https://www.deviantart.com/USER/favourites/"
def deviations(self):
@@ -1105,7 +1105,7 @@ class DeviantartCollectionExtractor(DeviantartExtractor):
directory_fmt = ("{category}", "{username}", "Favourites",
"{collection[title]}")
archive_fmt = "C_{collection[uuid]}_{index}.{extension}"
- pattern = BASE_PATTERN + r"/favourites/([^/?#]+)/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/favourites/([^/?#]+)/([^/?#]+)"
example = "https://www.deviantart.com/USER/favourites/12345/TITLE"
def __init__(self, match):
@@ -1136,7 +1136,7 @@ class DeviantartJournalExtractor(DeviantartExtractor):
subcategory = "journal"
directory_fmt = ("{category}", "{username}", "Journal")
archive_fmt = "j_{_username}_{index}.{extension}"
- pattern = BASE_PATTERN + r"/(?:posts(?:/journals)?|journal)/?(?:\?.*)?$"
+ pattern = rf"{BASE_PATTERN}/(?:posts(?:/journals)?|journal)/?(?:\?.*)?$"
example = "https://www.deviantart.com/USER/posts/journals/"
def deviations(self):
@@ -1149,7 +1149,7 @@ class DeviantartStatusExtractor(DeviantartExtractor):
directory_fmt = ("{category}", "{username}", "Status")
filename_fmt = "{category}_{index}_{title}_{date}.{extension}"
archive_fmt = "S_{_username}_{index}.{extension}"
- pattern = BASE_PATTERN + r"/posts/statuses"
+ pattern = rf"{BASE_PATTERN}/posts/statuses"
example = "https://www.deviantart.com/USER/posts/statuses/"
def deviations(self):
@@ -1187,8 +1187,8 @@ class DeviantartStatusExtractor(DeviantartExtractor):
deviation["username"] = deviation["author"]["username"]
deviation["_username"] = deviation["username"].lower()
- deviation["date"] = dt = text.parse_datetime(deviation["ts"])
- deviation["published_time"] = int(util.datetime_to_timestamp(dt))
+ deviation["date"] = d = self.parse_datetime_iso(deviation["ts"])
+ deviation["published_time"] = int(dt.to_ts(d))
deviation["da_category"] = "Status"
deviation["category_path"] = "status"
@@ -1253,7 +1253,7 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
"""Extractor for single deviations"""
subcategory = "deviation"
archive_fmt = "g_{_username}_{index}.{extension}"
- pattern = (BASE_PATTERN + r"/(art|journal)/(?:[^/?#]+-)?(\d+)"
+ pattern = (rf"{BASE_PATTERN}/(art|journal)/(?:[^/?#]+-)?(\d+)"
r"|(?:https?://)?(?:www\.)?(?:fx)?deviantart\.com/"
r"(?:view/|deviation/|view(?:-full)?\.php/*\?(?:[^#]+&)?id=)"
r"(\d+)" # bare deviation ID without slug
@@ -1315,7 +1315,7 @@ class DeviantartScrapsExtractor(DeviantartExtractor):
subcategory = "scraps"
directory_fmt = ("{category}", "{username}", "Scraps")
archive_fmt = "s_{_username}_{index}.{extension}"
- pattern = BASE_PATTERN + r"/gallery/(?:\?catpath=)?scraps\b"
+ pattern = rf"{BASE_PATTERN}/gallery/(?:\?catpath=)?scraps\b"
example = "https://www.deviantart.com/USER/gallery/scraps"
def deviations(self):
@@ -1382,7 +1382,7 @@ class DeviantartGallerySearchExtractor(DeviantartExtractor):
"""Extractor for deviantart gallery searches"""
subcategory = "gallery-search"
archive_fmt = "g_{_username}_{index}.{extension}"
- pattern = BASE_PATTERN + r"/gallery/?\?(q=[^#]+)"
+ pattern = rf"{BASE_PATTERN}/gallery/?\?(q=[^#]+)"
example = "https://www.deviantart.com/USER/gallery?q=QUERY"
def __init__(self, match):
@@ -1412,7 +1412,7 @@ class DeviantartGallerySearchExtractor(DeviantartExtractor):
class DeviantartFollowingExtractor(DeviantartExtractor):
"""Extractor for user's watched users"""
subcategory = "following"
- pattern = BASE_PATTERN + "/(?:about#)?watching"
+ pattern = rf"{BASE_PATTERN}/(?:about#)?watching"
example = "https://www.deviantart.com/USER/about#watching"
def items(self):
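
All util.re(...) factory calls in this file migrate to text.re(...); both are only ever used to obtain compiled pattern objects (.sub, .subn, .findall, .match). The helper's actual definition lies outside this diff; a plausible minimal shape, offered purely as an assumption:

    import functools
    import re as stdlib_re

    @functools.lru_cache(maxsize=None)
    def re(pattern):
        # compile once per distinct pattern, return the cached object
        return stdlib_re.compile(pattern)

    quality_sub = re(r",q_\d+").sub     # same call shape as the diff above
    print(quality_sub(",q_100", "/v1/fill/w_1280,q_80/img.jpg"))
    # /v1/fill/w_1280,q_100/img.jpg
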
diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py
index 85358ba..bbc1ef0 100644
--- a/gallery_dl/extractor/directlink.py
+++ b/gallery_dl/extractor/directlink.py
@@ -40,5 +40,5 @@ class DirectlinkExtractor(Extractor):
data["_http_headers"] = {
"Referer": self.url.encode("latin-1", "ignore")}
- yield Message.Directory, data
+ yield Message.Directory, "", data
yield Message.Url, self.url, data
diff --git a/gallery_dl/extractor/discord.py b/gallery_dl/extractor/discord.py
index 216e486..0e7f309 100644
--- a/gallery_dl/extractor/discord.py
+++ b/gallery_dl/extractor/discord.py
@@ -19,7 +19,7 @@ class DiscordExtractor(Extractor):
root = "https://discord.com"
directory_fmt = ("{category}", "{server_id}_{server}",
"{channel_id}_{channel}")
- filename_fmt = "{message_id}_{num:>02}_{filename}.{extension}"
+ filename_fmt = "{message_id}_{num:>02}_{filename[:220]}.{extension}"
archive_fmt = "{message_id}_{num}"
server_metadata = {}
@@ -72,9 +72,7 @@ class DiscordExtractor(Extractor):
"author_files": [],
"message": self.extract_message_text(message),
"message_id": message["id"],
- "date": text.parse_datetime(
- message["timestamp"], "%Y-%m-%dT%H:%M:%S.%f%z"
- ),
+ "date": self.parse_datetime_iso(message["timestamp"]),
"files": []
})
@@ -122,7 +120,7 @@ class DiscordExtractor(Extractor):
text.nameext_from_url(file["url"], file)
file["num"] = num
- yield Message.Directory, message_metadata
+ yield Message.Directory, "", message_metadata
for file in message_metadata["files"]:
message_metadata_file = message_metadata.copy()
@@ -240,7 +238,7 @@ class DiscordExtractor(Extractor):
class DiscordChannelExtractor(DiscordExtractor):
subcategory = "channel"
- pattern = BASE_PATTERN + r"/channels/(\d+)/(?:\d+/threads/)?(\d+)/?$"
+ pattern = rf"{BASE_PATTERN}/channels/(\d+)/(?:\d+/threads/)?(\d+)/?$"
example = "https://discord.com/channels/1234567890/9876543210"
def items(self):
@@ -253,7 +251,7 @@ class DiscordChannelExtractor(DiscordExtractor):
class DiscordMessageExtractor(DiscordExtractor):
subcategory = "message"
- pattern = BASE_PATTERN + r"/channels/(\d+)/(\d+)/(\d+)/?$"
+ pattern = rf"{BASE_PATTERN}/channels/(\d+)/(\d+)/(\d+)/?$"
example = "https://discord.com/channels/1234567890/9876543210/2468013579"
def items(self):
@@ -270,7 +268,7 @@ class DiscordMessageExtractor(DiscordExtractor):
class DiscordServerExtractor(DiscordExtractor):
subcategory = "server"
- pattern = BASE_PATTERN + r"/channels/(\d+)/?$"
+ pattern = rf"{BASE_PATTERN}/channels/(\d+)/?$"
example = "https://discord.com/channels/1234567890"
def items(self):
@@ -288,7 +286,7 @@ class DiscordDirectMessagesExtractor(DiscordExtractor):
subcategory = "direct-messages"
directory_fmt = ("{category}", "Direct Messages",
"{channel_id}_{recipients:J,}")
- pattern = BASE_PATTERN + r"/channels/@me/(\d+)/?$"
+ pattern = rf"{BASE_PATTERN}/channels/@me/(\d+)/?$"
example = "https://discord.com/channels/@me/1234567890"
def items(self):
@@ -299,7 +297,7 @@ class DiscordDirectMessageExtractor(DiscordExtractor):
subcategory = "direct-message"
directory_fmt = ("{category}", "Direct Messages",
"{channel_id}_{recipients:J,}")
- pattern = BASE_PATTERN + r"/channels/@me/(\d+)/(\d+)/?$"
+ pattern = rf"{BASE_PATTERN}/channels/@me/(\d+)/(\d+)/?$"
example = "https://discord.com/channels/@me/1234567890/9876543210"
def items(self):
diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py
index 3e0424d..36423db 100644
--- a/gallery_dl/extractor/dynastyscans.py
+++ b/gallery_dl/extractor/dynastyscans.py
@@ -41,12 +41,12 @@ class DynastyscansBase():
class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
"""Extractor for manga-chapters from dynasty-scans.com"""
- pattern = BASE_PATTERN + r"(/chapters/[^/?#]+)"
+ pattern = rf"{BASE_PATTERN}(/chapters/[^/?#]+)"
example = "https://dynasty-scans.com/chapters/NAME"
def metadata(self, page):
extr = text.extract_from(page)
- match = util.re(
+ match = text.re(
r"(?:<a[^>]*>)?([^<]+)(?:</a>)?" # manga name
r"(?: ch(\d+)([^:<]*))?" # chapter info
r"(?:: (.+))?" # title
@@ -62,7 +62,7 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
"author" : text.remove_html(author),
"group" : (text.remove_html(group) or
text.extr(group, ' alt="', '"')),
- "date" : text.parse_datetime(extr(
+ "date" : self.parse_datetime(extr(
'"icon-calendar"></i> ', '<'), "%b %d, %Y"),
"tags" : text.split_html(extr(
"class='tags'>", "<div id='chapter-actions'")),
@@ -81,7 +81,7 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
class DynastyscansMangaExtractor(DynastyscansBase, MangaExtractor):
chapterclass = DynastyscansChapterExtractor
reverse = False
- pattern = BASE_PATTERN + r"(/series/[^/?#]+)"
+ pattern = rf"{BASE_PATTERN}(/series/[^/?#]+)"
example = "https://dynasty-scans.com/series/NAME"
def chapters(self, page):
@@ -97,7 +97,7 @@ class DynastyscansSearchExtractor(DynastyscansBase, Extractor):
directory_fmt = ("{category}", "Images")
filename_fmt = "{image_id}.{extension}"
archive_fmt = "i_{image_id}"
- pattern = BASE_PATTERN + r"/images/?(?:\?([^#]+))?$"
+ pattern = rf"{BASE_PATTERN}/images/?(?:\?([^#]+))?$"
example = "https://dynasty-scans.com/images?QUERY"
def __init__(self, match):
@@ -105,7 +105,7 @@ class DynastyscansSearchExtractor(DynastyscansBase, Extractor):
self.query = match[1] or ""
def items(self):
- yield Message.Directory, {}
+ yield Message.Directory, "", {}
for image_id in self.images():
image = self._parse_image_page(image_id)
url = image["url"]
@@ -126,7 +126,7 @@ class DynastyscansSearchExtractor(DynastyscansBase, Extractor):
class DynastyscansImageExtractor(DynastyscansSearchExtractor):
"""Extractor for individual images on dynasty-scans.com"""
subcategory = "image"
- pattern = BASE_PATTERN + r"/images/(\d+)"
+ pattern = rf"{BASE_PATTERN}/images/(\d+)"
example = "https://dynasty-scans.com/images/12345"
def images(self):
@@ -136,7 +136,7 @@ class DynastyscansImageExtractor(DynastyscansSearchExtractor):
class DynastyscansAnthologyExtractor(DynastyscansSearchExtractor):
"""Extractor for dynasty-scans anthologies"""
subcategory = "anthology"
- pattern = BASE_PATTERN + r"/anthologies/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/anthologies/([^/?#]+)"
example = "https://dynasty-scans.com/anthologies/TITLE"
def items(self):
@@ -166,8 +166,6 @@ class DynastyscansAnthologyExtractor(DynastyscansSearchExtractor):
data["scanlator"] = content[1].text[11:]
data["tags"] = content[2].text[6:].lower().split(", ")
data["title"] = element[5].text
- data["date"] = text.parse_datetime(
- element[1].text, "%Y-%m-%dT%H:%M:%S%z")
- data["date_updated"] = text.parse_datetime(
- element[2].text, "%Y-%m-%dT%H:%M:%S%z")
+ data["date"] = self.parse_datetime_iso(element[1].text)
+ data["date_updated"] = self.parse_datetime_iso(element[2].text)
yield Message.Queue, element[4].text, data
diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py
index 71c3b30..cc6708d 100644
--- a/gallery_dl/extractor/e621.py
+++ b/gallery_dl/extractor/e621.py
@@ -51,13 +51,18 @@ class E621Extractor(danbooru.DanbooruExtractor):
post["filename"] = file["md5"]
post["extension"] = file["ext"]
- post["date"] = text.parse_datetime(
- post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ post["date"] = self.parse_datetime_iso(post["created_at"])
post.update(data)
- yield Message.Directory, post
+ yield Message.Directory, "", post
yield Message.Url, file["url"], post
+ def items_artists(self):
+ for artist in self.artists():
+ artist["_extractor"] = E621TagExtractor
+ url = f"{self.root}/posts?tags={text.quote(artist['name'])}"
+ yield Message.Queue, url, artist
+
def _get_notes(self, id):
return self.request_json(
f"{self.root}/notes.json?search[post_id]={id}")
@@ -89,13 +94,13 @@ BASE_PATTERN = E621Extractor.update({
class E621TagExtractor(E621Extractor, danbooru.DanbooruTagExtractor):
"""Extractor for e621 posts from tag searches"""
- pattern = BASE_PATTERN + r"/posts?(?:\?[^#]*?tags=|/index/\d+/)([^&#]*)"
+ pattern = rf"{BASE_PATTERN}/posts?(?:\?[^#]*?tags=|/index/\d+/)([^&#]*)"
example = "https://e621.net/posts?tags=TAG"
class E621PoolExtractor(E621Extractor, danbooru.DanbooruPoolExtractor):
"""Extractor for e621 pools"""
- pattern = BASE_PATTERN + r"/pool(?:s|/show)/(\d+)"
+ pattern = rf"{BASE_PATTERN}/pool(?:s|/show)/(\d+)"
example = "https://e621.net/pools/12345"
def posts(self):
@@ -120,7 +125,7 @@ class E621PoolExtractor(E621Extractor, danbooru.DanbooruPoolExtractor):
class E621PostExtractor(E621Extractor, danbooru.DanbooruPostExtractor):
"""Extractor for single e621 posts"""
- pattern = BASE_PATTERN + r"/post(?:s|/show)/(\d+)"
+ pattern = rf"{BASE_PATTERN}/post(?:s|/show)/(\d+)"
example = "https://e621.net/posts/12345"
def posts(self):
@@ -130,19 +135,38 @@ class E621PostExtractor(E621Extractor, danbooru.DanbooruPostExtractor):
class E621PopularExtractor(E621Extractor, danbooru.DanbooruPopularExtractor):
"""Extractor for popular images from e621"""
- pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?([^#]*))?"
+ pattern = rf"{BASE_PATTERN}/explore/posts/popular(?:\?([^#]*))?"
example = "https://e621.net/explore/posts/popular"
def posts(self):
return self._pagination("/popular.json", self.params)
+class E621ArtistExtractor(E621Extractor, danbooru.DanbooruArtistExtractor):
+ """Extractor for e621 artists"""
+ subcategory = "artist"
+ pattern = rf"{BASE_PATTERN}/artists/(\d+)"
+ example = "https://e621.net/artists/12345"
+
+ items = E621Extractor.items_artists
+
+
+class E621ArtistSearchExtractor(E621Extractor,
+ danbooru.DanbooruArtistSearchExtractor):
+ """Extractor for e621 artist searches"""
+ subcategory = "artist-search"
+ pattern = rf"{BASE_PATTERN}/artists/?\?([^#]+)"
+ example = "https://e621.net/artists?QUERY"
+
+ items = E621Extractor.items_artists
+
+
class E621FavoriteExtractor(E621Extractor):
"""Extractor for e621 favorites"""
subcategory = "favorite"
directory_fmt = ("{category}", "Favorites", "{user_id}")
archive_fmt = "f_{user_id}_{id}"
- pattern = BASE_PATTERN + r"/favorites(?:\?([^#]*))?"
+ pattern = rf"{BASE_PATTERN}/favorites(?:\?([^#]*))?"
example = "https://e621.net/favorites"
def metadata(self):
diff --git a/gallery_dl/extractor/eporner.py b/gallery_dl/extractor/eporner.py
new file mode 100644
index 0000000..307f14b
--- /dev/null
+++ b/gallery_dl/extractor/eporner.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.eporner.com/"""
+
+from .common import GalleryExtractor
+from .. import text
+
+
+class EpornerGalleryExtractor(GalleryExtractor):
+ """Extractor for image galleries from eporner.com"""
+ category = "eporner"
+ root = "https://eporner.com"
+ pattern = (r"(?:https?://)?(?:www\.)?eporner\.com"
+ r"/gallery/(\w+)(?:/([\w-]+))?")
+ example = "https://www.eporner.com/gallery/GID/SLUG/"
+
+ def __init__(self, match):
+ url = f"{self.root}/gallery/{match[1]}/{match[2]}/"
+ GalleryExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ title = text.extr(page, "<title>", " - EPORNER</title>")
+ if title.endswith(" Photo Gallery"):
+ title = title[:-14]
+
+ return {
+ "gallery_id": self.groups[0],
+ "title" : text.unescape(title),
+ "slug" : text.extr(
+ page, "/gallery/", '/"').rpartition("/")[2],
+ "description": text.unescape(text.extr(
+ page, 'name="description" content="', '"')),
+ "tags": text.extr(
+ page, 'EP.ads.keywords = "', '"').split(","),
+ }
+
+ def images(self, page):
+ album = text.extr(
+ page, 'class="photosgrid gallerygrid"', "id='gallerySlideBox'")
+
+ results = []
+ for url in text.extract_iter(album, ' src="', '"'):
+ url, _, ext = url.rpartition(".")
+ # Preview images have a resolution suffix.
+ # E.g. "11208293-image-3_296x1000.jpg".
+ # The same name, but without the suffix, leads to the full image.
+ url = url[:url.rfind("_")]
+ name = url[url.rfind("/")+1:]
+ results.append((f"{url}.{ext}", {"id": name[:name.find("-")]}))
+ return results
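
A worked example of the suffix stripping in images(), using the filename from the comment above (the host and path are made up; only the name format matters):

    url = "https://cdn.example/galleries/11208293-image-3_296x1000.jpg"
    url, _, ext = url.rpartition(".")   # split off "jpg"
    url = url[:url.rfind("_")]          # drop the "_296x1000" preview suffix
    name = url[url.rfind("/")+1:]       # "11208293-image-3"
    print(f"{url}.{ext}")   # https://cdn.example/galleries/11208293-image-3.jpg
    print(name[:name.find("-")])        # 11208293  (used as the "id" field)
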
diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py
index 68cfdbc..2c9ab47 100644
--- a/gallery_dl/extractor/erome.py
+++ b/gallery_dl/extractor/erome.py
@@ -64,7 +64,7 @@ class EromeExtractor(Extractor):
class EromeAlbumExtractor(EromeExtractor):
"""Extractor for albums on erome.com"""
subcategory = "album"
- pattern = BASE_PATTERN + r"/a/(\w+)"
+ pattern = rf"{BASE_PATTERN}/a/(\w+)"
example = "https://www.erome.com/a/ID"
def items(self):
@@ -74,8 +74,12 @@ class EromeAlbumExtractor(EromeExtractor):
try:
page = self.request(url).text
except exception.HttpError as exc:
+ if exc.status == 410:
+ msg = text.extr(exc.response.text, "<h1>", "<")
+ else:
+ msg = "Unable to fetch album page"
raise exception.AbortExtraction(
- f"{album_id}: Unable to fetch album page ({exc})")
+ f"{album_id}: {msg} ({exc})")
title, pos = text.extract(
page, 'property="og:title" content="', '"')
@@ -96,7 +100,7 @@ class EromeAlbumExtractor(EromeExtractor):
if not date:
ts = text.extr(group, '?v=', '"')
if len(ts) > 1:
- date = text.parse_timestamp(ts)
+ date = self.parse_timestamp(ts)
data = {
"album_id": album_id,
@@ -110,14 +114,14 @@ class EromeAlbumExtractor(EromeExtractor):
"_http_headers": {"Referer": url},
}
- yield Message.Directory, data
+ yield Message.Directory, "", data
for data["num"], url in enumerate(urls, 1):
yield Message.Url, url, text.nameext_from_url(url, data)
class EromeUserExtractor(EromeExtractor):
subcategory = "user"
- pattern = BASE_PATTERN + r"/(?!a/|search\?)([^/?#]+)(?:/?\?([^#]+))?"
+ pattern = rf"{BASE_PATTERN}/(?!a/|search\?)([^/?#]+)(?:/?\?([^#]+))?"
example = "https://www.erome.com/USER"
def albums(self):
@@ -133,7 +137,7 @@ class EromeUserExtractor(EromeExtractor):
class EromeSearchExtractor(EromeExtractor):
subcategory = "search"
- pattern = BASE_PATTERN + r"/search/?\?(q=[^#]+)"
+ pattern = rf"{BASE_PATTERN}/search/?\?(q=[^#]+)"
example = "https://www.erome.com/search?q=QUERY"
def albums(self):
diff --git a/gallery_dl/extractor/everia.py b/gallery_dl/extractor/everia.py
index 91672bb..ce29800 100644
--- a/gallery_dl/extractor/everia.py
+++ b/gallery_dl/extractor/everia.py
@@ -7,7 +7,7 @@
"""Extractors for https://everia.club"""
from .common import Extractor, Message
-from .. import text, util
+from .. import text
BASE_PATTERN = r"(?:https?://)?everia\.club"
@@ -25,7 +25,7 @@ class EveriaExtractor(Extractor):
return self._pagination(self.groups[0])
def _pagination(self, path, params=None, pnum=1):
- find_posts = util.re(r'thumbnail">\s*<a href="([^"]+)').findall
+ find_posts = text.re(r'thumbnail">\s*<a href="([^"]+)').findall
while True:
if pnum == 1:
@@ -45,14 +45,14 @@ class EveriaPostExtractor(EveriaExtractor):
subcategory = "post"
directory_fmt = ("{category}", "{title}")
archive_fmt = "{post_url}_{num}"
- pattern = BASE_PATTERN + r"(/\d{4}/\d{2}/\d{2}/[^/?#]+)"
+ pattern = rf"{BASE_PATTERN}(/\d{{4}}/\d{{2}}/\d{{2}}/[^/?#]+)"
example = "https://everia.club/0000/00/00/TITLE"
def items(self):
url = self.root + self.groups[0] + "/"
page = self.request(url).text
content = text.extr(page, 'itemprop="text">', "<h3")
- urls = util.re(r'img.*?lazy-src="([^"]+)').findall(content)
+ urls = text.re(r'img.*?lazy-src="([^"]+)').findall(content)
data = {
"title": text.unescape(
@@ -64,7 +64,7 @@ class EveriaPostExtractor(EveriaExtractor):
"count": len(urls),
}
- yield Message.Directory, data
+ yield Message.Directory, "", data
for data["num"], url in enumerate(urls, 1):
url = text.unquote(url)
yield Message.Url, url, text.nameext_from_url(url, data)
@@ -72,26 +72,26 @@ class EveriaPostExtractor(EveriaExtractor):
class EveriaTagExtractor(EveriaExtractor):
subcategory = "tag"
- pattern = BASE_PATTERN + r"(/tag/[^/?#]+)"
+ pattern = rf"{BASE_PATTERN}(/tag/[^/?#]+)"
example = "https://everia.club/tag/TAG"
class EveriaCategoryExtractor(EveriaExtractor):
subcategory = "category"
- pattern = BASE_PATTERN + r"(/category/[^/?#]+)"
+ pattern = rf"{BASE_PATTERN}(/category/[^/?#]+)"
example = "https://everia.club/category/CATEGORY"
class EveriaDateExtractor(EveriaExtractor):
subcategory = "date"
- pattern = (BASE_PATTERN +
- r"(/\d{4}(?:/\d{2})?(?:/\d{2})?)(?:/page/\d+)?/?$")
+ pattern = (rf"{BASE_PATTERN}"
+ rf"(/\d{{4}}(?:/\d{{2}})?(?:/\d{{2}})?)(?:/page/\d+)?/?$")
example = "https://everia.club/0000/00/00"
class EveriaSearchExtractor(EveriaExtractor):
subcategory = "search"
- pattern = BASE_PATTERN + r"/(?:page/\d+/)?\?s=([^&#]+)"
+ pattern = rf"{BASE_PATTERN}/(?:page/\d+/)?\?s=([^&#]+)"
example = "https://everia.club/?s=SEARCH"
def posts(self):
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index f147959..9dab923 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -115,9 +115,9 @@ class ExhentaiExtractor(Extractor):
class ExhentaiGalleryExtractor(ExhentaiExtractor):
"""Extractor for image galleries from exhentai.org"""
subcategory = "gallery"
- pattern = (BASE_PATTERN +
- r"(?:/g/(\d+)/([\da-f]{10})"
- r"|/s/([\da-f]{10})/(\d+)-(\d+))")
+ pattern = (rf"{BASE_PATTERN}/(?:"
+ rf"g/(\d+)/([\da-f]{{10}})|"
+ rf"s/([\da-f]{{10}})/(\d+)-(\d+))")
example = "https://e-hentai.org/g/12345/67890abcde/"
def __init__(self, match):
@@ -193,7 +193,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
self.data = data = self.get_metadata(gpage)
self.count = text.parse_int(data["filecount"])
- yield Message.Directory, data
+ yield Message.Directory, "", data
images = itertools.chain(
(self.image_from_page(ipage),), self.images_from_api())
@@ -216,7 +216,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
def _items_hitomi(self):
if self.config("metadata", False):
data = self.metadata_from_api()
- data["date"] = text.parse_timestamp(data["posted"])
+ data["date"] = self.parse_timestamp(data["posted"])
else:
data = {}
@@ -226,14 +226,14 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
yield Message.Queue, url, data
def _items_metadata(self):
- yield Message.Directory, self.metadata_from_api()
+ yield Message.Directory, "", self.metadata_from_api()
def get_metadata(self, page):
"""Extract gallery metadata"""
data = self.metadata_from_page(page)
if self.config("metadata", False):
data.update(self.metadata_from_api())
- data["date"] = text.parse_timestamp(data["posted"])
+ data["date"] = self.parse_timestamp(data["posted"])
if self.config("tags", False):
tags = collections.defaultdict(list)
for tag in data["tags"]:
@@ -258,8 +258,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
"_" : extr('<div id="gdc"><div class="cs ct', '"'),
"eh_category" : extr('>', '<'),
"uploader" : extr('<div id="gdn">', '</div>'),
- "date" : text.parse_datetime(extr(
- '>Posted:</td><td class="gdt2">', '</td>'), "%Y-%m-%d %H:%M"),
+ "date" : self.parse_datetime_iso(extr(
+ '>Posted:</td><td class="gdt2">', '</td>')),
"parent" : extr(
'>Parent:</td><td class="gdt2"><a href="', '"'),
"expunged" : "Yes" != extr(
@@ -563,7 +563,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
class ExhentaiSearchExtractor(ExhentaiExtractor):
"""Extractor for exhentai search results"""
subcategory = "search"
- pattern = BASE_PATTERN + r"/(?:\?([^#]*)|tag/([^/?#]+))"
+ pattern = rf"{BASE_PATTERN}/(?:\?([^#]*)|tag/([^/?#]+))"
example = "https://e-hentai.org/?f_search=QUERY"
def __init__(self, match):
@@ -620,7 +620,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor):
"""Extractor for favorited exhentai galleries"""
subcategory = "favorite"
- pattern = BASE_PATTERN + r"/favorites\.php(?:\?([^#]*)())?"
+ pattern = rf"{BASE_PATTERN}/favorites\.php(?:\?([^#]*)())?"
example = "https://e-hentai.org/favorites.php"
def _init(self):
diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py
index 6061737..5d56a5f 100644
--- a/gallery_dl/extractor/facebook.py
+++ b/gallery_dl/extractor/facebook.py
@@ -11,9 +11,9 @@ from .. import text, util, exception
from ..cache import memcache
BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?facebook\.com"
-USER_PATTERN = (BASE_PATTERN +
- r"/(?!media/|photo/|photo.php|watch/)"
- r"(?:profile\.php\?id=|people/[^/?#]+/)?([^/?&#]+)")
+USER_PATTERN = (rf"{BASE_PATTERN}/"
+ rf"(?!media/|photo/|photo.php|watch/|permalink.php)"
+ rf"(?:profile\.php\?id=|people/[^/?#]+/)?([^/?&#]+)")
class FacebookExtractor(Extractor):
@@ -108,7 +108,7 @@ class FacebookExtractor(Extractor):
'"message":{"delight_ranges"',
'"},"message_preferred_body"'
).rsplit('],"text":"', 1)[-1]),
- "date": text.parse_timestamp(
+ "date": self.parse_timestamp(
text.extr(photo_page, '\\"publish_time\\":', ',') or
text.extr(photo_page, '"created_time":', ',')
),
@@ -172,7 +172,7 @@ class FacebookExtractor(Extractor):
"user_id": text.extr(
video_page, '"owner":{"__typename":"User","id":"', '"'
),
- "date": text.parse_timestamp(text.extr(
+ "date": self.parse_timestamp(text.extr(
video_page, '\\"publish_time\\":', ','
)),
"type": "video"
@@ -292,7 +292,7 @@ class FacebookExtractor(Extractor):
else:
retries = 0
photo.update(set_data)
- yield Message.Directory, photo
+ yield Message.Directory, "", photo
yield Message.Url, photo["url"], photo
if not photo["next_photo_id"]:
@@ -389,9 +389,9 @@ class FacebookExtractor(Extractor):
class FacebookPhotoExtractor(FacebookExtractor):
"""Base class for Facebook Photo extractors"""
subcategory = "photo"
- pattern = (BASE_PATTERN +
- r"/(?:[^/?#]+/photos/[^/?#]+/|photo(?:.php)?/?\?"
- r"(?:[^&#]+&)*fbid=)([^/?&#]+)[^/?#]*(?<!&setextract)$")
+ pattern = (rf"{BASE_PATTERN}/"
+ rf"(?:[^/?#]+/photos/[^/?#]+/|photo(?:.php)?/?\?"
+ rf"(?:[^&#]+&)*fbid=)([^/?&#]+)[^/?#]*(?<!&setextract)$")
example = "https://www.facebook.com/photo/?fbid=PHOTO_ID"
def items(self):
@@ -408,7 +408,7 @@ class FacebookPhotoExtractor(FacebookExtractor):
directory = self.parse_set_page(set_page)
- yield Message.Directory, directory
+ yield Message.Directory, "", directory
yield Message.Url, photo["url"], photo
if self.author_followups:
@@ -427,12 +427,11 @@ class FacebookSetExtractor(FacebookExtractor):
"""Base class for Facebook Set extractors"""
subcategory = "set"
pattern = (
- BASE_PATTERN +
- r"/(?:(?:media/set|photo)/?\?(?:[^&#]+&)*set=([^&#]+)"
- r"[^/?#]*(?<!&setextract)$"
- r"|([^/?#]+/posts/[^/?#]+)"
- r"|photo/\?(?:[^&#]+&)*fbid=([^/?&#]+)&set=([^/?&#]+)&setextract)"
- )
+ rf"{BASE_PATTERN}/"
+ rf"(?:(?:media/set|photo)/?\?(?:[^&#]+&)*set=([^&#]+)"
+ rf"[^/?#]*(?<!&setextract)$"
+ rf"|([^/?#]+/posts/[^/?#]+)"
+ rf"|photo/\?(?:[^&#]+&)*fbid=([^/?&#]+)&set=([^/?&#]+)&setextract)")
example = "https://www.facebook.com/media/set/?set=SET_ID"
def items(self):
@@ -455,7 +454,7 @@ class FacebookVideoExtractor(FacebookExtractor):
"""Base class for Facebook Video extractors"""
subcategory = "video"
directory_fmt = ("{category}", "{username}", "{subcategory}")
- pattern = BASE_PATTERN + r"/(?:[^/?#]+/videos/|watch/?\?v=)([^/?&#]+)"
+ pattern = rf"{BASE_PATTERN}/(?:[^/?#]+/videos/|watch/?\?v=)([^/?&#]+)"
example = "https://www.facebook.com/watch/?v=VIDEO_ID"
def items(self):
@@ -468,7 +467,7 @@ class FacebookVideoExtractor(FacebookExtractor):
if "url" not in video:
return
- yield Message.Directory, video
+ yield Message.Directory, "", video
if self.videos == "ytdl":
yield Message.Url, "ytdl:" + video_url, video
@@ -482,18 +481,18 @@ class FacebookInfoExtractor(FacebookExtractor):
"""Extractor for Facebook Profile data"""
subcategory = "info"
directory_fmt = ("{category}", "{username}")
- pattern = USER_PATTERN + r"/info"
+ pattern = rf"{USER_PATTERN}/info"
example = "https://www.facebook.com/USERNAME/info"
def items(self):
user = self._extract_profile(self.groups[0])
- return iter(((Message.Directory, user),))
+ return iter(((Message.Directory, "", user),))
class FacebookAlbumsExtractor(FacebookExtractor):
"""Extractor for Facebook Profile albums"""
subcategory = "albums"
- pattern = USER_PATTERN + r"/photos_albums(?:/([^/?#]+))?"
+ pattern = rf"{USER_PATTERN}/photos_albums(?:/([^/?#]+))?"
example = "https://www.facebook.com/USERNAME/photos_albums"
def items(self):
@@ -526,7 +525,7 @@ class FacebookAlbumsExtractor(FacebookExtractor):
class FacebookPhotosExtractor(FacebookExtractor):
"""Extractor for Facebook Profile Photos"""
subcategory = "photos"
- pattern = USER_PATTERN + r"/photos(?:_by)?"
+ pattern = rf"{USER_PATTERN}/photos(?:_by)?"
example = "https://www.facebook.com/USERNAME/photos"
def items(self):
@@ -543,7 +542,7 @@ class FacebookPhotosExtractor(FacebookExtractor):
class FacebookAvatarExtractor(FacebookExtractor):
"""Extractor for Facebook Profile Avatars"""
subcategory = "avatar"
- pattern = USER_PATTERN + r"/avatar"
+ pattern = rf"{USER_PATTERN}/avatar"
example = "https://www.facebook.com/USERNAME/avatar"
def items(self):
@@ -559,13 +558,13 @@ class FacebookAvatarExtractor(FacebookExtractor):
set_page = self.request(set_url).text
directory = self.parse_set_page(set_page)
- yield Message.Directory, directory
+ yield Message.Directory, "", directory
yield Message.Url, avatar["url"], avatar
class FacebookUserExtractor(Dispatch, FacebookExtractor):
"""Extractor for Facebook Profiles"""
- pattern = USER_PATTERN + r"/?(?:$|\?|#)"
+ pattern = rf"{USER_PATTERN}/?(?:$|\?|#)"
example = "https://www.facebook.com/USERNAME"
def items(self):
diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py
index 70b06e7..036b388 100644
--- a/gallery_dl/extractor/fanbox.py
+++ b/gallery_dl/extractor/fanbox.py
@@ -66,18 +66,17 @@ class FanboxExtractor(Extractor):
if fee_max is not None and fee_max < item["feeRequired"]:
self.log.warning("Skipping post %s (feeRequired of %s > %s)",
item["id"], item["feeRequired"], fee_max)
- continue
-
- try:
- url = "https://api.fanbox.cc/post.info?postId=" + item["id"]
- body = self.request_json(url, headers=self.headers)["body"]
- content_body, post = self._extract_post(body)
- except Exception as exc:
- self.log.warning("Skipping post %s (%s: %s)",
- item["id"], exc.__class__.__name__, exc)
- continue
-
- yield Message.Directory, post
+ else:
+ try:
+ url = ("https://api.fanbox.cc/post.info?postId=" +
+ item["id"])
+ item = self.request_json(url, headers=self.headers)["body"]
+ except Exception as exc:
+ self.log.warning("Skipping post %s (%s: %s)",
+ item["id"], exc.__class__.__name__, exc)
+
+ content_body, post = self._extract_post(item)
+ yield Message.Directory, "", post
yield from self._get_urls_from_post(content_body, post)
def posts(self):
@@ -128,15 +127,19 @@ class FanboxExtractor(Extractor):
if file.get("extension", "").lower() in exts
]
- post["date"] = text.parse_datetime(post["publishedDatetime"])
+ try:
+ post["date"] = self.parse_datetime_iso(post["publishedDatetime"])
+ except Exception:
+ post["date"] = None
post["text"] = content_body.get("text") if content_body else None
post["isCoverImage"] = False
- if self._meta_user:
- post["user"] = self._get_user_data(post["creatorId"])
- if self._meta_plan:
+ cid = post.get("creatorId")
+ if self._meta_user and cid is not None:
+ post["user"] = self._get_user_data(cid)
+ if self._meta_plan and cid is not None:
plans = self._get_plan_data(post["creatorId"])
- fee = post["feeRequired"]
+ fee = post.get("feeRequired") or 0
try:
post["plan"] = plans[fee]
except KeyError:
@@ -147,7 +150,7 @@ class FanboxExtractor(Extractor):
plan["fee"] = fee
post["plan"] = plans[fee] = plan
if self._meta_comments:
- if post["commentCount"]:
+ if post.get("commentCount"):
post["comments"] = list(self._get_comment_data(post["id"]))
else:
post["commentd"] = ()
@@ -216,7 +219,7 @@ class FanboxExtractor(Extractor):
def _get_urls_from_post(self, content_body, post):
num = 0
if cover_image := post.get("coverImageUrl"):
- cover_image = util.re("/c/[0-9a-z_]+").sub("", cover_image)
+ cover_image = text.re("/c/[0-9a-z_]+").sub("", cover_image)
final_post = post.copy()
final_post["isCoverImage"] = True
final_post["fileUrl"] = cover_image
@@ -352,7 +355,7 @@ class FanboxExtractor(Extractor):
class FanboxCreatorExtractor(FanboxExtractor):
"""Extractor for a Fanbox creator's works"""
subcategory = "creator"
- pattern = USER_PATTERN + r"(?:/posts)?/?$"
+ pattern = rf"{USER_PATTERN}(?:/posts)?/?$"
example = "https://USER.fanbox.cc/"
def posts(self):
@@ -362,15 +365,26 @@ class FanboxCreatorExtractor(FanboxExtractor):
def _pagination_creator(self, url):
urls = self.request_json(url, headers=self.headers)["body"]
+ if offset := self.config("offset"):
+ quotient, remainder = divmod(offset, 10)
+ if quotient:
+ urls = urls[quotient:]
+ else:
+ remainder = None
+
for url in urls:
url = text.ensure_http_scheme(url)
- yield from self.request_json(url, headers=self.headers)["body"]
+ posts = self.request_json(url, headers=self.headers)["body"]
+ if remainder:
+ posts = posts[remainder:]
+ remainder = None
+ yield from posts
class FanboxPostExtractor(FanboxExtractor):
"""Extractor for media from a single Fanbox post"""
subcategory = "post"
- pattern = USER_PATTERN + r"/posts/(\d+)"
+ pattern = rf"{USER_PATTERN}/posts/(\d+)"
example = "https://USER.fanbox.cc/posts/12345"
def posts(self):
@@ -380,7 +394,7 @@ class FanboxPostExtractor(FanboxExtractor):
class FanboxHomeExtractor(FanboxExtractor):
"""Extractor for your Fanbox home feed"""
subcategory = "home"
- pattern = BASE_PATTERN + r"/?$"
+ pattern = rf"{BASE_PATTERN}/?$"
example = "https://fanbox.cc/"
def posts(self):
@@ -391,7 +405,7 @@ class FanboxHomeExtractor(FanboxExtractor):
class FanboxSupportingExtractor(FanboxExtractor):
"""Extractor for your supported Fanbox users feed"""
subcategory = "supporting"
- pattern = BASE_PATTERN + r"/home/supporting"
+ pattern = rf"{BASE_PATTERN}/home/supporting"
example = "https://fanbox.cc/home/supporting"
def posts(self):
@@ -403,6 +417,7 @@ class FanboxRedirectExtractor(Extractor):
"""Extractor for pixiv redirects to fanbox.cc"""
category = "fanbox"
subcategory = "redirect"
+ cookies_domain = None
pattern = r"(?:https?://)?(?:www\.)?pixiv\.net/fanbox/creator/(\d+)"
example = "https://www.pixiv.net/fanbox/creator/12345"
diff --git a/gallery_dl/extractor/fansly.py b/gallery_dl/extractor/fansly.py
index 7138599..ba60b15 100644
--- a/gallery_dl/extractor/fansly.py
+++ b/gallery_dl/extractor/fansly.py
@@ -35,9 +35,9 @@ class FanslyExtractor(Extractor):
for post in self.posts():
files = self._extract_files(post)
post["count"] = len(files)
- post["date"] = text.parse_timestamp(post["createdAt"])
+ post["date"] = self.parse_timestamp(post["createdAt"])
- yield Message.Directory, post
+ yield Message.Directory, "", post
for post["num"], file in enumerate(files, 1):
post.update(file)
url = file["url"]
@@ -61,7 +61,8 @@ class FanslyExtractor(Extractor):
yield from self.posts_wall(account, wall)
def _extract_files(self, post):
- files = []
+ if "attachments" not in post:
+ return ()
if "_extra" in post:
extra = post.pop("_extra", ())
@@ -75,11 +76,12 @@ class FanslyExtractor(Extractor):
if mid in media
)
+ files = []
for attachment in post.pop("attachments"):
try:
self._extract_attachment(files, post, attachment)
except Exception as exc:
- self.log.debug("", exc_info=exc)
+ self.log.traceback(exc)
self.log.error(
"%s/%s, Failed to extract media (%s: %s)",
post["id"], attachment.get("id"),
@@ -117,8 +119,8 @@ class FanslyExtractor(Extractor):
file = {
**variant,
"format": variant["type"],
- "date": text.parse_timestamp(media["createdAt"]),
- "date_updated": text.parse_timestamp(media["updatedAt"]),
+ "date": self.parse_timestamp(media["createdAt"]),
+ "date_updated": self.parse_timestamp(media["updatedAt"]),
}
if "metadata" in location:
@@ -331,12 +333,20 @@ class FanslyAPI():
posts = response["posts"]
for post in posts:
- post["account"] = accounts[post.pop("accountId")]
+ try:
+ post["account"] = accounts[post.pop("accountId")]
+ except KeyError:
+ pass
extra = None
attachments = []
for attachment in post["attachments"]:
- cid = attachment["contentId"]
+ try:
+ cid = attachment["contentId"]
+ except KeyError:
+ attachments.append(attachment)
+ continue
+
if cid in media:
attachments.append(media[cid])
elif cid in bundles:
diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py
index e32a86b..d13ec13 100644
--- a/gallery_dl/extractor/fantia.py
+++ b/gallery_dl/extractor/fantia.py
@@ -48,7 +48,7 @@ class FantiaExtractor(Extractor):
for content in contents:
files = self._process_content(post, content)
- yield Message.Directory, post
+ yield Message.Directory, "", post
if content["visible_status"] != "visible":
self.log.warning(
@@ -101,7 +101,7 @@ class FantiaExtractor(Extractor):
"comment": resp["comment"],
"rating": resp["rating"],
"posted_at": resp["posted_at"],
- "date": text.parse_datetime(
+ "date": self.parse_datetime(
resp["posted_at"], "%a, %d %b %Y %H:%M:%S %z"),
"fanclub_id": resp["fanclub"]["id"],
"fanclub_user_id": resp["fanclub"]["user"]["id"],
diff --git a/gallery_dl/extractor/fapachi.py b/gallery_dl/extractor/fapachi.py
index 7ff71b0..a18ce31 100644
--- a/gallery_dl/extractor/fapachi.py
+++ b/gallery_dl/extractor/fapachi.py
@@ -34,7 +34,7 @@ class FapachiPostExtractor(Extractor):
page = self.request(f"{self.root}/{self.user}/media/{self.id}").text
url = self.root + text.extract(
page, 'data-src="', '"', page.index('class="media-img'))[0]
- yield Message.Directory, data
+ yield Message.Directory, "", data
yield Message.Url, url, text.nameext_from_url(url, data)
diff --git a/gallery_dl/extractor/fapello.py b/gallery_dl/extractor/fapello.py
index b961cbe..afef942 100644
--- a/gallery_dl/extractor/fapello.py
+++ b/gallery_dl/extractor/fapello.py
@@ -20,7 +20,7 @@ class FapelloPostExtractor(Extractor):
directory_fmt = ("{category}", "{model}")
filename_fmt = "{model}_{id}.{extension}"
archive_fmt = "{type}_{model}_{id}"
- pattern = BASE_PATTERN + r"/(?!search/|popular_videos/)([^/?#]+)/(\d+)"
+ pattern = rf"{BASE_PATTERN}/(?!search/|popular_videos/)([^/?#]+)/(\d+)"
example = "https://fapello.com/MODEL/12345/"
def __init__(self, match):
@@ -44,7 +44,7 @@ class FapelloPostExtractor(Extractor):
}
url = text.extr(page, 'src="', '"').replace(
".md", "").replace(".th", "")
- yield Message.Directory, data
+ yield Message.Directory, "", data
yield Message.Url, url, text.nameext_from_url(url, data)
@@ -52,9 +52,9 @@ class FapelloModelExtractor(Extractor):
"""Extractor for all posts from a fapello model"""
category = "fapello"
subcategory = "model"
- pattern = (BASE_PATTERN + r"/(?!top-(?:likes|followers)|popular_videos"
- r"|videos|trending|search/?$)"
- r"([^/?#]+)/?$")
+ pattern = (rf"{BASE_PATTERN}/(?!top-(?:likes|followers)|popular_videos"
+ rf"|videos|trending|search/?$)"
+ rf"([^/?#]+)/?$")
example = "https://fapello.com/model/"
def __init__(self, match):
@@ -85,9 +85,9 @@ class FapelloPathExtractor(Extractor):
"""Extractor for models and posts from fapello.com paths"""
category = "fapello"
subcategory = "path"
- pattern = (BASE_PATTERN +
- r"/(?!search/?$)(top-(?:likes|followers)|videos|trending"
- r"|popular_videos/[^/?#]+)/?$")
+ pattern = (rf"{BASE_PATTERN}/(?!search/?$)"
+ rf"(top-(?:likes|followers)|videos|trending"
+ rf"|popular_videos/[^/?#]+)/?$")
example = "https://fapello.com/trending/"
def __init__(self, match):
diff --git a/gallery_dl/extractor/fikfap.py b/gallery_dl/extractor/fikfap.py
new file mode 100644
index 0000000..75071c5
--- /dev/null
+++ b/gallery_dl/extractor/fikfap.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://fikfap.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?fikfap\.com"
+
+
+class FikfapExtractor(Extractor):
+ """Base class for fikfap extractors"""
+ category = "fikfap"
+ root = "https://fikfap.com"
+ root_api = "https://api.fikfap.com"
+ directory_fmt = ("{category}", "{author[username]}")
+ filename_fmt = "{postId} {label[:240]}.{extension}"
+ archive_fmt = "{postId}"
+
+ def items(self):
+ headers = {
+ "Referer" : self.root + "/",
+ "Origin" : self.root,
+ "Sec-Fetch-Dest": "empty",
+ "Sec-Fetch-Mode": "cors",
+ "Sec-Fetch-Site": "cross-site",
+ }
+
+ for post in self.posts():
+ if url := post.get("videoFileOriginalUrl"):
+ post["extension"] = text.ext_from_url(url)
+ elif url := post.get("videoStreamUrl"):
+ url = "ytdl:" + url
+ post["extension"] = "mp4"
+ post["_ytdl_manifest"] = "hls"
+ post["_ytdl_manifest_headers"] = headers
+ else:
+ self.log.warning("%s: No video available", post["postId"])
+ continue
+
+ post["date"] = self.parse_datetime_iso(post["createdAt"])
+ post["date_updated"] = self.parse_datetime_iso(post["updatedAt"])
+ post["tags"] = [t["label"] for t in post["hashtags"]]
+ post["filename"] = post["label"]
+
+ yield Message.Directory, "", post
+ yield Message.Url, url, post
+
+ def request_api(self, url, params):
+ return self.request_json(url, params=params, headers={
+ "Referer" : self.root + "/",
+ "Authorization-Anonymous": "2527cc30-c3c5-41be-b8bb-104b6ea7a206",
+ "IsLoggedIn" : "false",
+ "IsPWA" : "false",
+ "Origin" : self.root,
+ "Sec-Fetch-Dest": "empty",
+ "Sec-Fetch-Mode": "cors",
+ "Sec-Fetch-Site": "same-site",
+ })
+
+
+class FikfapPostExtractor(FikfapExtractor):
+ subcategory = "post"
+ pattern = rf"{BASE_PATTERN}/user/(\w+)/post/(\d+)"
+ example = "https://fikfap.com/user/USER/post/12345"
+
+ def posts(self):
+ user, pid = self.groups
+
+ url = f"{self.root_api}/profile/username/{user}/posts"
+ params = {"amount" : "1", "startId": pid}
+ posts = self.request_api(url, params)
+
+ pid = int(pid)
+ for post in posts:
+ if post["postId"] == pid:
+ return (post,)
+ raise exception.NotFoundError("post")
+
+
+class FikfapUserExtractor(FikfapExtractor):
+ subcategory = "user"
+ pattern = rf"{BASE_PATTERN}/user/(\w+)"
+ example = "https://fikfap.com/user/USER"
+
+ def posts(self):
+ user = self.groups[0]
+
+ url = f"{self.root_api}/profile/username/{user}/posts"
+ params = {"amount": "21"}
+
+ while True:
+ data = self.request_api(url, params)
+
+ yield from data
+
+ if len(data) < 21:
+ return
+ params["afterId"] = data[-1]["postId"]
diff --git a/gallery_dl/extractor/fitnakedgirls.py b/gallery_dl/extractor/fitnakedgirls.py
new file mode 100644
index 0000000..d252ec4
--- /dev/null
+++ b/gallery_dl/extractor/fitnakedgirls.py
@@ -0,0 +1,208 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://fitnakedgirls.com/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?fitnakedgirls\.com"
+
+
+class FitnakedgirlsExtractor(Extractor):
+ """Base class for fitnakedgirls extractors"""
+ category = "fitnakedgirls"
+ root = "https://fitnakedgirls.com"
+
+ def items(self):
+ data = {"_extractor": FitnakedgirlsGalleryExtractor}
+ for url in self.galleries():
+ yield Message.Queue, url, data
+
+ def _pagination(self, base):
+ url = base
+ pnum = 1
+
+ while True:
+ page = self.request(url).text
+
+ for post in text.extract_iter(
+ page, 'class="entry-body', "</a>"):
+ yield text.extr(post, 'href="', '"')
+
+ pnum += 1
+ url = f"{base}page/{pnum}/"
+ if f'href="{url}"' not in page:
+ return
+
+ def _extract_title(self, extr, sep=" - "):
+ title = text.unescape(extr("<title>", "<"))
+ if sep in title:
+ title = title.rpartition(sep)[0]
+ return title.strip()
+
+
+class FitnakedgirlsGalleryExtractor(GalleryExtractor, FitnakedgirlsExtractor):
+ """Extractor for fitnakedgirls galleries"""
+ directory_fmt = ("{category}", "{title}")
+ filename_fmt = "{filename}.{extension}"
+ archive_fmt = "{gallery_id}_{filename}"
+ pattern = rf"{BASE_PATTERN}/photos/gallery/([\w-]+)/?$"
+ example = "https://fitnakedgirls.com/photos/gallery/MODEL-nude/"
+
+ def __init__(self, match):
+ url = f"{self.root}/photos/gallery/{match[1]}/"
+ GalleryExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+ title = self._extract_title(extr)
+
+ # Strip common patterns to get a cleaner model name
+ for pattern in (" Nudes", " Nude", " nudes", " nude"):
+ if pattern in title:
+ title = title.partition(pattern)[0]
+ break
+
+ return {
+ "gallery_id" : text.parse_int(extr('data-post-id="', '"')),
+ "gallery_slug": self.groups[0],
+ "model": title,
+ "title": title,
+ "date" : self.parse_datetime_iso(extr(
+ 'article:published_time" content="', '"')),
+ }
+
+ def images(self, page):
+ results = []
+
+ content = text.extr(
+ page, 'itemprop="articleBody"', '<!-- .entry-content -->') or page
+
+ # Extract videos from wp-block-video figures
+ for figure in text.extract_iter(
+ content, '<figure class="wp-block-video">', '</figure>'):
+ if src := text.extr(figure, 'src="', '"'):
+ if "/wp-content/uploads/" in src:
+ results.append((src, None))
+
+ # Extract images from wp-block-image figures (newer template)
+ for figure in text.extract_iter(
+ content, '<figure class="wp-block-image', '</figure>'):
+ if src := text.extr(figure, 'data-src="', '"'):
+ if "/wp-content/uploads/" in src:
+ results.append((src, None))
+
+ # Fallback: Extract images with size-large class (older template)
+ if not results:
+ for img in text.extract_iter(content, "<img ", ">"):
+ if "size-large" in img:
+ if src := text.extr(img, 'data-src="', '"'):
+ if "/wp-content/uploads/" in src:
+ results.append((src, None))
+
+ return results
+
+
+class FitnakedgirlsCategoryExtractor(FitnakedgirlsExtractor):
+ """Extractor for fitnakedgirls category pages"""
+ subcategory = "category"
+ pattern = rf"{BASE_PATTERN}/photos/gallery/category/([\w-]+)"
+ example = "https://fitnakedgirls.com/photos/gallery/category/CATEGORY/"
+
+ def galleries(self):
+ base = f"{self.root}/photos/gallery/category/{self.groups[0]}/"
+ return self._pagination(base)
+
+
+class FitnakedgirlsTagExtractor(FitnakedgirlsExtractor):
+ """Extractor for fitnakedgirls tag pages"""
+ subcategory = "tag"
+ pattern = rf"{BASE_PATTERN}/photos/gallery/tag/([\w-]+)"
+ example = "https://fitnakedgirls.com/photos/gallery/tag/TAG/"
+
+ def galleries(self):
+ base = f"{self.root}/photos/gallery/tag/{self.groups[0]}/"
+ return self._pagination(base)
+
+
+class FitnakedgirlsVideoExtractor(FitnakedgirlsExtractor):
+ """Extractor for fitnakedgirls video posts"""
+ subcategory = "video"
+ directory_fmt = ("{category}", "{title}")
+ filename_fmt = "{filename}.{extension}"
+ archive_fmt = "{video_id}_{filename}"
+ pattern = rf"{BASE_PATTERN}/videos/(\d+)/(\d+)/([\w-]+)"
+ example = "https://fitnakedgirls.com/videos/2025/08/VIDEO-TITLE/"
+
+ def items(self):
+ year, month, slug = self.groups
+ url = f"{self.root}/videos/{year}/{month}/{slug}/"
+ page = self.request(url).text
+
+ extr = text.extract_from(page)
+ data = {
+ "slug" : slug,
+ "title" : self._extract_title(extr, " | "),
+ "video_id": text.parse_int(extr('data-post-id="', '"')),
+ "date" : self.parse_datetime_iso(
+ extr('article:published_time" content="', '"')),
+ }
+
+ yield Message.Directory, "", data
+
+ content = text.extr(
+ page, 'itemprop="articleBody"', '<!-- .entry-content -->') or page
+ for video in text.extract_iter(content, "<video ", "</video>"):
+ if src := text.extr(video, 'src="', '"'):
+ if "/wp-content/uploads/" in src:
+ yield Message.Url, src, text.nameext_from_url(src, data)
+
+
+class FitnakedgirlsBlogExtractor(FitnakedgirlsExtractor):
+ """Extractor for fitnakedgirls blog posts"""
+ subcategory = "blog"
+ directory_fmt = ("{category}", "{title}")
+ filename_fmt = "{filename}.{extension}"
+ archive_fmt = "{post_id}_{filename}"
+ pattern = rf"{BASE_PATTERN}/fitblog/([\w-]+)"
+ example = "https://fitnakedgirls.com/fitblog/MODEL-NAME/"
+
+ def items(self):
+ slug = self.groups[0]
+ url = f"{self.root}/fitblog/{slug}/"
+ page = self.request(url).text
+
+ extr = text.extract_from(page)
+ data = {
+ "slug" : slug,
+ "title" : self._extract_title(extr),
+ "post_id": text.parse_int(extr('data-post-id="', '"')),
+ "date" : self.parse_datetime_iso(
+ extr('article:published_time" content="', '"')),
+ }
+
+ yield Message.Directory, "", data
+
+ # Extract images from wp-block-image figures
+ content = text.extr(
+ page, 'itemprop="articleBody"', '<!-- .entry-content -->') or page
+ for figure in text.extract_iter(
+ content, '<figure class="wp-block-image', '</figure>'):
+ # Try srcset first for highest resolution
+ if srcset := text.extr(figure, 'srcset="', '"'):
+ # Get the last (largest) image from srcset
+ urls = srcset.split(", ")
+ if urls:
+ src = urls[-1].partition(" ")[0]
+ if "/wp-content/uploads/" in src:
+ yield Message.Url, src, text.nameext_from_url(
+ src, data)
+ continue
+ # Fallback to src
+ if src := text.extr(figure, 'src="', '"'):
+ if "/wp-content/uploads/" in src:
+ yield Message.Url, src, text.nameext_from_url(src, data)
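The blog extractor prefers srcset and takes the last comma-separated candidate as the largest, assuming the site lists candidates in ascending size. Isolated:

    def largest_from_srcset(srcset):
        # candidates are "URL descriptor" pairs separated by commas
        candidates = [c.strip() for c in srcset.split(",") if c.strip()]
        return candidates[-1].partition(" ")[0] if candidates else None

    largest_from_srcset("https://x/a.jpg 300w, https://x/a-big.jpg 1024w")
    # -> 'https://x/a-big.jpg'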
diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py
index 35263a3..1446eb8 100644
--- a/gallery_dl/extractor/flickr.py
+++ b/gallery_dl/extractor/flickr.py
@@ -10,6 +10,7 @@
from .common import Extractor, Message
from .. import text, oauth, util, exception
+from ..cache import memcache
BASE_PATTERN = r"(?:https?://)?(?:www\.|secure\.|m\.)?flickr\.com"
@@ -17,6 +18,7 @@ BASE_PATTERN = r"(?:https?://)?(?:www\.|secure\.|m\.)?flickr\.com"
class FlickrExtractor(Extractor):
"""Base class for flickr extractors"""
category = "flickr"
+ root = "https://www.flickr.com"
filename_fmt = "{category}_{id}.{extension}"
directory_fmt = ("{category}", "{user[username]}")
archive_fmt = "{id}"
@@ -24,11 +26,12 @@ class FlickrExtractor(Extractor):
request_interval_min = 0.5
def _init(self):
- self.api = FlickrAPI(self)
self.user = None
self.item_id = self.groups[0]
def items(self):
+ self.api = FlickrAPI(self)
+
data = self.metadata()
extract = self.api._extract_format
for photo in self.photos():
@@ -38,11 +41,11 @@ class FlickrExtractor(Extractor):
self.log.warning(
"Skipping photo %s (%s: %s)",
photo["id"], exc.__class__.__name__, exc)
- self.log.debug("", exc_info=exc)
+ self.log.traceback(exc)
else:
photo.update(data)
url = self._file_url(photo)
- yield Message.Directory, photo
+ yield Message.Directory, "", photo
yield Message.Url, url, text.nameext_from_url(url, photo)
def metadata(self):
@@ -75,6 +78,8 @@ class FlickrImageExtractor(FlickrExtractor):
example = "https://www.flickr.com/photos/USER/12345"
def items(self):
+ self.api = FlickrAPI(self)
+
item_id, enc_id = self.groups
if enc_id is not None:
alphabet = ("123456789abcdefghijkmnopqrstu"
@@ -98,7 +103,7 @@ class FlickrImageExtractor(FlickrExtractor):
photo["comments"] = text.parse_int(photo["comments"]["_content"])
photo["description"] = photo["description"]["_content"]
photo["tags"] = [t["raw"] for t in photo["tags"]["tag"]]
- photo["date"] = text.parse_timestamp(photo["dateuploaded"])
+ photo["date"] = self.parse_timestamp(photo["dateuploaded"])
photo["views"] = text.parse_int(photo["views"])
photo["id"] = text.parse_int(photo["id"])
@@ -109,7 +114,7 @@ class FlickrImageExtractor(FlickrExtractor):
location[key] = value["_content"]
url = self._file_url(photo)
- yield Message.Directory, photo
+ yield Message.Directory, "", photo
yield Message.Url, url, text.nameext_from_url(url, photo)
@@ -119,7 +124,7 @@ class FlickrAlbumExtractor(FlickrExtractor):
directory_fmt = ("{category}", "{user[username]}",
"Albums", "{album[id]} {album[title]}")
archive_fmt = "a_{album[id]}_{id}"
- pattern = BASE_PATTERN + r"/photos/([^/?#]+)/(?:album|set)s(?:/(\d+))?"
+ pattern = rf"{BASE_PATTERN}/photos/([^/?#]+)/(?:album|set)s(?:/(\d+))?"
example = "https://www.flickr.com/photos/USER/albums/12345"
def items(self):
@@ -129,6 +134,8 @@ class FlickrAlbumExtractor(FlickrExtractor):
return self._album_items()
def _album_items(self):
+ self.api = FlickrAPI(self)
+
data = FlickrExtractor.metadata(self)
data["_extractor"] = FlickrAlbumExtractor
@@ -159,7 +166,7 @@ class FlickrGalleryExtractor(FlickrExtractor):
directory_fmt = ("{category}", "{user[username]}",
"Galleries", "{gallery[gallery_id]} {gallery[title]}")
archive_fmt = "g_{gallery[id]}_{id}"
- pattern = BASE_PATTERN + r"/photos/([^/?#]+)/galleries/(\d+)"
+ pattern = rf"{BASE_PATTERN}/photos/([^/?#]+)/galleries/(\d+)"
example = "https://www.flickr.com/photos/USER/galleries/12345/"
def metadata(self):
@@ -177,7 +184,7 @@ class FlickrGroupExtractor(FlickrExtractor):
subcategory = "group"
directory_fmt = ("{category}", "Groups", "{group[groupname]}")
archive_fmt = "G_{group[nsid]}_{id}"
- pattern = BASE_PATTERN + r"/groups/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/groups/([^/?#]+)"
example = "https://www.flickr.com/groups/NAME/"
def metadata(self):
@@ -192,7 +199,7 @@ class FlickrUserExtractor(FlickrExtractor):
"""Extractor for the photostream of a flickr user"""
subcategory = "user"
archive_fmt = "u_{user[nsid]}_{id}"
- pattern = BASE_PATTERN + r"/photos/([^/?#]+)/?$"
+ pattern = rf"{BASE_PATTERN}/photos/([^/?#]+)/?$"
example = "https://www.flickr.com/photos/USER/"
def photos(self):
@@ -204,7 +211,7 @@ class FlickrFavoriteExtractor(FlickrExtractor):
subcategory = "favorite"
directory_fmt = ("{category}", "{user[username]}", "Favorites")
archive_fmt = "f_{user[nsid]}_{id}"
- pattern = BASE_PATTERN + r"/photos/([^/?#]+)/favorites"
+ pattern = rf"{BASE_PATTERN}/photos/([^/?#]+)/favorites"
example = "https://www.flickr.com/photos/USER/favorites"
def photos(self):
@@ -216,7 +223,7 @@ class FlickrSearchExtractor(FlickrExtractor):
subcategory = "search"
directory_fmt = ("{category}", "Search", "{search[text]}")
archive_fmt = "s_{search}_{id}"
- pattern = BASE_PATTERN + r"/search/?\?([^#]+)"
+ pattern = rf"{BASE_PATTERN}/search/?\?([^#]+)"
example = "https://flickr.com/search/?text=QUERY"
def metadata(self):
@@ -236,8 +243,8 @@ class FlickrAPI(oauth.OAuth1API):
"""
API_URL = "https://api.flickr.com/services/rest/"
- API_KEY = "90c368449018a0cb880ea4889cbb8681"
- API_SECRET = "e4b83e319c11e9e1"
+ # API_KEY = ""
+ API_SECRET = ""
FORMATS = [
("o" , "Original" , None),
("6k", "X-Large 6K" , 6144),
@@ -282,6 +289,14 @@ class FlickrAPI(oauth.OAuth1API):
"10": "Public Domain Mark",
}
+ @property
+ @memcache(maxage=3600)
+ def API_KEY(self):
+ extr = self.extractor
+ extr.log.info("Retrieving public API key")
+ page = extr.request(extr.root + "/prints").text
+ return text.extr(page, '.flickr.api.site_key = "', '"')
+
def __init__(self, extractor):
oauth.OAuth1API.__init__(self, extractor)
@@ -489,7 +504,7 @@ class FlickrAPI(oauth.OAuth1API):
def _extract_format(self, photo):
photo["description"] = photo["description"]["_content"].strip()
photo["views"] = text.parse_int(photo["views"])
- photo["date"] = text.parse_timestamp(photo["dateupload"])
+ photo["date"] = self.extractor.parse_timestamp(photo["dateupload"])
photo["tags"] = photo["tags"].split()
self._extract_metadata(photo)
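The hardcoded Flickr API key is replaced by an API_KEY property that scrapes the public site key and caches it for an hour via gallery_dl's memcache. A rough stdlib-only approximation (lru_cache memoizes but, unlike memcache, never expires):

    import functools, re, requests

    @functools.lru_cache(maxsize=1)
    def flickr_api_key():
        # fetch a public page that embeds the site key
        page = requests.get("https://www.flickr.com/prints", timeout=30).text
        m = re.search(r'\.flickr\.api\.site_key = "([^"]+)"', page)
        return m.group(1) if m else None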
diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py
index dc23488..3c69489 100644
--- a/gallery_dl/extractor/foolfuuka.py
+++ b/gallery_dl/extractor/foolfuuka.py
@@ -31,7 +31,7 @@ class FoolfuukaExtractor(BaseExtractor):
self.fixup_redirect = False
def items(self):
- yield Message.Directory, self.metadata()
+ yield Message.Directory, "", self.metadata()
for post in self.posts():
media = post["media"]
if not media:
@@ -147,7 +147,7 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor):
subcategory = "thread"
directory_fmt = ("{category}", "{board[shortname]}",
"{thread_num} {title|comment[:50]}")
- pattern = BASE_PATTERN + r"/([^/?#]+)/thread/(\d+)"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/thread/(\d+)"
example = "https://archived.moe/a/thread/12345/"
def __init__(self, match):
@@ -174,7 +174,7 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor):
class FoolfuukaBoardExtractor(FoolfuukaExtractor):
"""Base extractor for FoolFuuka based boards/archives"""
subcategory = "board"
- pattern = BASE_PATTERN + r"/([^/?#]+)(?:/(?:page/)?(\d*))?$"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)(?:/(?:page/)?(\d*))?$"
example = "https://archived.moe/a/"
def __init__(self, match):
@@ -210,7 +210,7 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor):
"""Base extractor for search results on FoolFuuka based boards/archives"""
subcategory = "search"
directory_fmt = ("{category}", "search", "{search}")
- pattern = BASE_PATTERN + r"/([^/?#]+)/search((?:/[^/?#]+/[^/?#]+)+)"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/search((?:/[^/?#]+/[^/?#]+)+)"
example = "https://archived.moe/_/search/text/QUERY/"
request_interval = (0.5, 1.5)
@@ -265,7 +265,7 @@ class FoolfuukaGalleryExtractor(FoolfuukaExtractor):
"""Base extractor for FoolFuuka galleries"""
subcategory = "gallery"
directory_fmt = ("{category}", "{board}", "gallery")
- pattern = BASE_PATTERN + r"/([^/?#]+)/gallery(?:/(\d+))?"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/gallery(?:/(\d+))?"
example = "https://archived.moe/a/gallery"
def metadata(self):
diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py
index 7c59f72..d932174 100644
--- a/gallery_dl/extractor/foolslide.py
+++ b/gallery_dl/extractor/foolslide.py
@@ -47,7 +47,7 @@ class FoolslideChapterExtractor(FoolslideExtractor):
filename_fmt = (
"{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}")
archive_fmt = "{id}"
- pattern = BASE_PATTERN + r"(/read/[^/?#]+/[a-z-]+/\d+/\d+(?:/\d+)?)"
+ pattern = rf"{BASE_PATTERN}(/read/[^/?#]+/[a-z-]+/\d+/\d+(?:/\d+)?)"
example = "https://read.powermanga.org/read/MANGA/en/0/123/"
def items(self):
@@ -58,7 +58,7 @@ class FoolslideChapterExtractor(FoolslideExtractor):
data["count"] = len(imgs)
data["chapter_id"] = text.parse_int(imgs[0]["chapter_id"])
- yield Message.Directory, data
+ yield Message.Directory, "", data
enum = util.enumerate_reversed if self.config(
"page-reverse") else enumerate
for data["page"], image in enum(imgs, 1):
@@ -91,7 +91,7 @@ class FoolslideMangaExtractor(FoolslideExtractor):
"""Base class for manga extractors for FoOlSlide based sites"""
subcategory = "manga"
categorytransfer = True
- pattern = BASE_PATTERN + r"(/series/[^/?#]+)"
+ pattern = rf"{BASE_PATTERN}(/series/[^/?#]+)"
example = "https://read.powermanga.org/series/MANGA/"
def items(self):
diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py
index 0d24f83..ad57a6b 100644
--- a/gallery_dl/extractor/furaffinity.py
+++ b/gallery_dl/extractor/furaffinity.py
@@ -54,7 +54,7 @@ class FuraffinityExtractor(Extractor):
if post := self._parse_post(post_id):
if metadata:
post.update(metadata)
- yield Message.Directory, post
+ yield Message.Directory, "", post
yield Message.Url, post["url"], post
if self.external:
@@ -95,7 +95,7 @@ class FuraffinityExtractor(Extractor):
if self._new_layout:
data["tags"] = text.split_html(extr(
- 'class="tags-row">', '</section>'))
+ "<h3>Keywords</h3>", "</section>"))
data["scraps"] = (extr(' submissions">', "<") == "Scraps")
data["title"] = text.unescape(extr("<h2><p>", "</p></h2>"))
data["artist_url"] = extr('title="', '"').strip()
@@ -143,7 +143,7 @@ class FuraffinityExtractor(Extractor):
data["folders"] = () # folders not present in old layout
data["user"] = self.user or data["artist_url"]
- data["date"] = text.parse_timestamp(data["filename"].partition(".")[0])
+ data["date"] = self.parse_timestamp(data["filename"].partition(".")[0])
data["description"] = self._process_description(data["_description"])
data["thumbnail"] = (f"https://t.furaffinity.net/{post_id}@600-"
f"{path.rsplit('/', 2)[1]}.jpg")
@@ -231,7 +231,7 @@ class FuraffinityExtractor(Extractor):
class FuraffinityGalleryExtractor(FuraffinityExtractor):
"""Extractor for a furaffinity user's gallery"""
subcategory = "gallery"
- pattern = BASE_PATTERN + r"/gallery/([^/?#]+)(?:$|/(?!folder/))"
+ pattern = rf"{BASE_PATTERN}/gallery/([^/?#]+)(?:$|/(?!folder/))"
example = "https://www.furaffinity.net/gallery/USER/"
def posts(self):
@@ -243,7 +243,7 @@ class FuraffinityFolderExtractor(FuraffinityExtractor):
subcategory = "folder"
directory_fmt = ("{category}", "{user!l}",
"Folders", "{folder_id}{folder_name:? //}")
- pattern = BASE_PATTERN + r"/gallery/([^/?#]+)/folder/(\d+)(?:/([^/?#]+))?"
+ pattern = rf"{BASE_PATTERN}/gallery/([^/?#]+)/folder/(\d+)(?:/([^/?#]+))?"
example = "https://www.furaffinity.net/gallery/USER/folder/12345/FOLDER"
def metadata(self):
@@ -260,7 +260,7 @@ class FuraffinityScrapsExtractor(FuraffinityExtractor):
"""Extractor for a furaffinity user's scraps"""
subcategory = "scraps"
directory_fmt = ("{category}", "{user!l}", "Scraps")
- pattern = BASE_PATTERN + r"/scraps/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/scraps/([^/?#]+)"
example = "https://www.furaffinity.net/scraps/USER/"
def posts(self):
@@ -271,7 +271,7 @@ class FuraffinityFavoriteExtractor(FuraffinityExtractor):
"""Extractor for a furaffinity user's favorites"""
subcategory = "favorite"
directory_fmt = ("{category}", "{user!l}", "Favorites")
- pattern = BASE_PATTERN + r"/favorites/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/favorites/([^/?#]+)"
example = "https://www.furaffinity.net/favorites/USER/"
def posts(self):
@@ -287,7 +287,7 @@ class FuraffinitySearchExtractor(FuraffinityExtractor):
"""Extractor for furaffinity search results"""
subcategory = "search"
directory_fmt = ("{category}", "Search", "{search}")
- pattern = BASE_PATTERN + r"/search(?:/([^/?#]+))?/?[?&]([^#]+)"
+ pattern = rf"{BASE_PATTERN}/search(?:/([^/?#]+))?/?[?&]([^#]+)"
example = "https://www.furaffinity.net/search/?q=QUERY"
def __init__(self, match):
@@ -306,7 +306,7 @@ class FuraffinitySearchExtractor(FuraffinityExtractor):
class FuraffinityPostExtractor(FuraffinityExtractor):
"""Extractor for individual posts on furaffinity"""
subcategory = "post"
- pattern = BASE_PATTERN + r"/(?:view|full)/(\d+)"
+ pattern = rf"{BASE_PATTERN}/(?:view|full)/(\d+)"
example = "https://www.furaffinity.net/view/12345/"
def posts(self):
@@ -317,7 +317,7 @@ class FuraffinityPostExtractor(FuraffinityExtractor):
class FuraffinityUserExtractor(Dispatch, FuraffinityExtractor):
"""Extractor for furaffinity user profiles"""
- pattern = BASE_PATTERN + r"/user/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/user/([^/?#]+)"
example = "https://www.furaffinity.net/user/USER/"
def items(self):
@@ -333,7 +333,7 @@ class FuraffinityUserExtractor(Dispatch, FuraffinityExtractor):
class FuraffinityFollowingExtractor(FuraffinityExtractor):
"""Extractor for a furaffinity user's watched users"""
subcategory = "following"
- pattern = BASE_PATTERN + "/watchlist/by/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/watchlist/by/([^/?#]+)"
example = "https://www.furaffinity.net/watchlist/by/USER/"
def items(self):
@@ -355,7 +355,7 @@ class FuraffinityFollowingExtractor(FuraffinityExtractor):
class FuraffinitySubmissionsExtractor(FuraffinityExtractor):
"""Extractor for new furaffinity submissions"""
subcategory = "submissions"
- pattern = BASE_PATTERN + r"(/msg/submissions(?:/[^/?#]+)?)"
+ pattern = rf"{BASE_PATTERN}(/msg/submissions(?:/[^/?#]+)?)"
example = "https://www.furaffinity.net/msg/submissions"
def posts(self):
diff --git a/gallery_dl/extractor/furry34.py b/gallery_dl/extractor/furry34.py
index a93ec75..95b98db 100644
--- a/gallery_dl/extractor/furry34.py
+++ b/gallery_dl/extractor/furry34.py
@@ -55,8 +55,7 @@ class Furry34Extractor(BooruExtractor):
def _prepare(self, post):
post.pop("files", None)
- post["date"] = text.parse_datetime(
- post["created"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ post["date"] = self.parse_datetime_iso(post["created"])
post["filename"], _, post["format"] = post["filename"].rpartition(".")
if "tags" in post:
post["tags"] = [t["value"] for t in post["tags"]]
@@ -98,7 +97,7 @@ class Furry34Extractor(BooruExtractor):
class Furry34PostExtractor(Furry34Extractor):
subcategory = "post"
archive_fmt = "{id}"
- pattern = BASE_PATTERN + r"/post/(\d+)"
+ pattern = rf"{BASE_PATTERN}/post/(\d+)"
example = "https://furry34.com/post/12345"
def posts(self):
@@ -109,7 +108,7 @@ class Furry34PlaylistExtractor(Furry34Extractor):
subcategory = "playlist"
directory_fmt = ("{category}", "{playlist_id}")
archive_fmt = "p_{playlist_id}_{id}"
- pattern = BASE_PATTERN + r"/playlists/view/(\d+)"
+ pattern = rf"{BASE_PATTERN}/playlists/view/(\d+)"
example = "https://furry34.com/playlists/view/12345"
def metadata(self):
@@ -124,7 +123,7 @@ class Furry34TagExtractor(Furry34Extractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
- pattern = BASE_PATTERN + r"/(?:([^/?#]+))?(?:/?\?([^#]+))?(?:$|#)"
+ pattern = rf"{BASE_PATTERN}/(?:([^/?#]+))?(?:/?\?([^#]+))?(?:$|#)"
example = "https://furry34.com/TAG"
def _init(self):
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index f32059e..0571fcd 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -148,7 +148,7 @@ class GelbooruBase():
class GelbooruTagExtractor(GelbooruBase,
gelbooru_v02.GelbooruV02TagExtractor):
"""Extractor for images from gelbooru.com based on search-tags"""
- pattern = BASE_PATTERN + r"page=post&s=list&tags=([^&#]*)"
+ pattern = rf"{BASE_PATTERN}page=post&s=list&tags=([^&#]*)"
example = "https://gelbooru.com/index.php?page=post&s=list&tags=TAG"
@@ -156,7 +156,7 @@ class GelbooruPoolExtractor(GelbooruBase,
gelbooru_v02.GelbooruV02PoolExtractor):
"""Extractor for gelbooru pools"""
per_page = 45
- pattern = BASE_PATTERN + r"page=pool&s=show&id=(\d+)"
+ pattern = rf"{BASE_PATTERN}page=pool&s=show&id=(\d+)"
example = "https://gelbooru.com/index.php?page=pool&s=show&id=12345"
skip = GelbooruBase._skip_offset
@@ -187,7 +187,7 @@ class GelbooruFavoriteExtractor(GelbooruBase,
gelbooru_v02.GelbooruV02FavoriteExtractor):
"""Extractor for gelbooru favorites"""
per_page = 100
- pattern = BASE_PATTERN + r"page=favorites&s=view&id=(\d+)"
+ pattern = rf"{BASE_PATTERN}page=favorites&s=view&id=(\d+)"
example = "https://gelbooru.com/index.php?page=favorites&s=view&id=12345"
skip = GelbooruBase._skip_offset
@@ -246,7 +246,7 @@ class GelbooruFavoriteExtractor(GelbooruBase,
for fav in favs:
for post in self._api_request({"id": fav["favorite"]}):
- post["date_favorited"] = text.parse_timestamp(fav["added"])
+ post["date_favorited"] = self.parse_timestamp(fav["added"])
yield post
params["pid"] += 1
@@ -273,7 +273,7 @@ class GelbooruFavoriteExtractor(GelbooruBase,
for fav in favs:
for post in self._api_request({"id": fav["favorite"]}):
- post["date_favorited"] = text.parse_timestamp(fav["added"])
+ post["date_favorited"] = self.parse_timestamp(fav["added"])
yield post
params["pid"] -= 1
@@ -284,10 +284,10 @@ class GelbooruFavoriteExtractor(GelbooruBase,
class GelbooruPostExtractor(GelbooruBase,
gelbooru_v02.GelbooruV02PostExtractor):
"""Extractor for single images from gelbooru.com"""
- pattern = (BASE_PATTERN +
- r"(?=(?:[^#]+&)?page=post(?:&|#|$))"
- r"(?=(?:[^#]+&)?s=view(?:&|#|$))"
- r"(?:[^#]+&)?id=(\d+)")
+ pattern = (rf"{BASE_PATTERN}"
+ rf"(?=(?:[^#]+&)?page=post(?:&|#|$))"
+ rf"(?=(?:[^#]+&)?s=view(?:&|#|$))"
+ rf"(?:[^#]+&)?id=(\d+)")
example = "https://gelbooru.com/index.php?page=post&s=view&id=12345"
diff --git a/gallery_dl/extractor/gelbooru_v01.py b/gallery_dl/extractor/gelbooru_v01.py
index 61d0545..7b9c732 100644
--- a/gallery_dl/extractor/gelbooru_v01.py
+++ b/gallery_dl/extractor/gelbooru_v01.py
@@ -35,8 +35,7 @@ class GelbooruV01Extractor(booru.BooruExtractor):
}
post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0]
- post["date"] = text.parse_datetime(
- post["created_at"], "%Y-%m-%d %H:%M:%S")
+ post["date"] = self.parse_datetime_iso(post["created_at"])
return post
@@ -88,7 +87,7 @@ class GelbooruV01TagExtractor(GelbooruV01Extractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
- pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)"
+ pattern = rf"{BASE_PATTERN}/index\.php\?page=post&s=list&tags=([^&#]+)"
example = "https://allgirl.booru.org/index.php?page=post&s=list&tags=TAG"
def metadata(self):
@@ -105,7 +104,7 @@ class GelbooruV01FavoriteExtractor(GelbooruV01Extractor):
directory_fmt = ("{category}", "favorites", "{favorite_id}")
archive_fmt = "f_{favorite_id}_{id}"
per_page = 50
- pattern = BASE_PATTERN + r"/index\.php\?page=favorites&s=view&id=(\d+)"
+ pattern = rf"{BASE_PATTERN}/index\.php\?page=favorites&s=view&id=(\d+)"
example = "https://allgirl.booru.org/index.php?page=favorites&s=view&id=1"
def metadata(self):
@@ -121,7 +120,7 @@ class GelbooruV01FavoriteExtractor(GelbooruV01Extractor):
class GelbooruV01PostExtractor(GelbooruV01Extractor):
subcategory = "post"
archive_fmt = "{id}"
- pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)"
+ pattern = rf"{BASE_PATTERN}/index\.php\?page=post&s=view&id=(\d+)"
example = "https://allgirl.booru.org/index.php?page=post&s=view&id=12345"
def posts(self):
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index 33db4e4..122f5a9 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -96,7 +96,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
params["pid"] = self.page_start * self.per_page
data = {}
- find_ids = util.re(r"\sid=\"p(\d+)").findall
+ find_ids = text.re(r"\sid=\"p(\d+)").findall
while True:
page = self.request(url, params=params).text
@@ -122,7 +122,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
def _prepare(self, post):
post["tags"] = post["tags"].strip()
- post["date"] = text.parse_datetime(
+ post["date"] = self.parse_datetime(
post["created_at"], "%a %b %d %H:%M:%S %z %Y")
def _html(self, post):
@@ -136,7 +136,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
return
tags = collections.defaultdict(list)
- pattern = util.re(r"(?s)tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)")
+ pattern = text.re(r"(?s)tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)")
for tag_type, tag_name in pattern.findall(tag_container):
tags[tag_type].append(text.unescape(text.unquote(tag_name)))
for key, value in tags.items():
@@ -190,7 +190,7 @@ class GelbooruV02TagExtractor(GelbooruV02Extractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
- pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]*)"
+ pattern = rf"{BASE_PATTERN}/index\.php\?page=post&s=list&tags=([^&#]*)"
example = "https://safebooru.org/index.php?page=post&s=list&tags=TAG"
def posts(self):
@@ -206,7 +206,7 @@ class GelbooruV02PoolExtractor(GelbooruV02Extractor):
subcategory = "pool"
directory_fmt = ("{category}", "pool", "{pool}")
archive_fmt = "p_{pool}_{id}"
- pattern = BASE_PATTERN + r"/index\.php\?page=pool&s=show&id=(\d+)"
+ pattern = rf"{BASE_PATTERN}/index\.php\?page=pool&s=show&id=(\d+)"
example = "https://safebooru.org/index.php?page=pool&s=show&id=12345"
def __init__(self, match):
@@ -257,7 +257,7 @@ class GelbooruV02FavoriteExtractor(GelbooruV02Extractor):
directory_fmt = ("{category}", "favorites", "{favorite_id}")
archive_fmt = "f_{favorite_id}_{id}"
per_page = 50
- pattern = BASE_PATTERN + r"/index\.php\?page=favorites&s=view&id=(\d+)"
+ pattern = rf"{BASE_PATTERN}/index\.php\?page=favorites&s=view&id=(\d+)"
example = "https://safebooru.org/index.php?page=favorites&s=view&id=12345"
def metadata(self):
@@ -275,7 +275,7 @@ class GelbooruV02FavoriteExtractor(GelbooruV02Extractor):
class GelbooruV02PostExtractor(GelbooruV02Extractor):
subcategory = "post"
archive_fmt = "{id}"
- pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)"
+ pattern = rf"{BASE_PATTERN}/index\.php\?page=post&s=view&id=(\d+)"
example = "https://safebooru.org/index.php?page=post&s=view&id=12345"
def posts(self):
diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py
index 407e478..99e6ea7 100644
--- a/gallery_dl/extractor/generic.py
+++ b/gallery_dl/extractor/generic.py
@@ -7,7 +7,7 @@
"""Generic information extractor"""
from .common import Extractor, Message
-from .. import config, text, util
+from .. import config, text
import os.path
@@ -75,7 +75,7 @@ class GenericExtractor(Extractor):
pass
images = enumerate(imgs, 1)
- yield Message.Directory, data
+ yield Message.Directory, "", data
for data["num"], (url, imgdata) in images:
if imgdata:
@@ -171,8 +171,8 @@ class GenericExtractor(Extractor):
r"(?:[^\"'<>\s]*)?" # optional query and fragment
)
- imageurls_src = util.re(imageurl_pattern_src).findall(page)
- imageurls_ext = util.re(imageurl_pattern_ext).findall(page)
+ imageurls_src = text.re(imageurl_pattern_src).findall(page)
+ imageurls_ext = text.re(imageurl_pattern_ext).findall(page)
imageurls = imageurls_src + imageurls_ext
# Resolve relative urls
@@ -181,7 +181,7 @@ class GenericExtractor(Extractor):
# by prepending a suitable base url.
#
# If the page contains a <base> element, use it as base url
- basematch = util.re(
+ basematch = text.re(
r"(?i)(?:<base\s.*?href=[\"']?)(?P<url>[^\"' >]+)").search(page)
if basematch:
self.baseurl = basematch['url'].rstrip('/')
diff --git a/gallery_dl/extractor/girlsreleased.py b/gallery_dl/extractor/girlsreleased.py
index 5e68a63..0fbdeff 100644
--- a/gallery_dl/extractor/girlsreleased.py
+++ b/gallery_dl/extractor/girlsreleased.py
@@ -41,7 +41,7 @@ class GirlsreleasedExtractor(Extractor):
class GirlsreleasedSetExtractor(GirlsreleasedExtractor):
"""Extractor for girlsreleased galleries"""
subcategory = "set"
- pattern = BASE_PATTERN + r"/set/(\d+)"
+ pattern = rf"{BASE_PATTERN}/set/(\d+)"
example = "https://girlsreleased.com/set/12345"
def items(self):
@@ -52,11 +52,11 @@ class GirlsreleasedSetExtractor(GirlsreleasedExtractor):
"id": json["id"],
"site": json["site"],
"model": [model for _, model in json["models"]],
- "date": text.parse_timestamp(json["date"]),
+ "date": self.parse_timestamp(json["date"]),
"count": len(json["images"]),
"url": "https://girlsreleased.com/set/" + json["id"],
}
- yield Message.Directory, data
+ yield Message.Directory, "", data
for data["num"], image in enumerate(json["images"], 1):
text.nameext_from_url(image[5], data)
yield Message.Queue, image[3], data
@@ -65,12 +65,12 @@ class GirlsreleasedSetExtractor(GirlsreleasedExtractor):
class GirlsreleasedModelExtractor(GirlsreleasedExtractor):
"""Extractor for girlsreleased models"""
subcategory = _path = "model"
- pattern = BASE_PATTERN + r"/model/(\d+(?:/.+)?)"
+ pattern = rf"{BASE_PATTERN}/model/(\d+(?:/.+)?)"
example = "https://girlsreleased.com/model/12345/MODEL"
class GirlsreleasedSiteExtractor(GirlsreleasedExtractor):
"""Extractor for girlsreleased sites"""
subcategory = _path = "site"
- pattern = BASE_PATTERN + r"/site/([^/?#]+(?:/model/\d+/?.*)?)"
+ pattern = rf"{BASE_PATTERN}/site/([^/?#]+(?:/model/\d+/?.*)?)"
example = "https://girlsreleased.com/site/SITE"
diff --git a/gallery_dl/extractor/girlswithmuscle.py b/gallery_dl/extractor/girlswithmuscle.py
index 51b979f..e61e472 100644
--- a/gallery_dl/extractor/girlswithmuscle.py
+++ b/gallery_dl/extractor/girlswithmuscle.py
@@ -5,7 +5,7 @@
# published by the Free Software Foundation.
from .common import Extractor, Message
-from .. import text, util, exception
+from .. import text, exception
from ..cache import cache
BASE_PATTERN = r"(?:https?://)?(?:www\.)?girlswithmuscle\.com"
@@ -60,7 +60,7 @@ class GirlswithmuscleExtractor(Extractor):
class GirlswithmusclePostExtractor(GirlswithmuscleExtractor):
"""Extractor for individual posts on girlswithmuscle.com"""
subcategory = "post"
- pattern = BASE_PATTERN + r"/(\d+)"
+ pattern = rf"{BASE_PATTERN}/(\d+)"
example = "https://www.girlswithmuscle.com/12345/"
def items(self):
@@ -80,7 +80,7 @@ class GirlswithmusclePostExtractor(GirlswithmuscleExtractor):
metadata["type"] = "video"
text.nameext_from_url(url, metadata)
- yield Message.Directory, metadata
+ yield Message.Directory, "", metadata
yield Message.Url, url, metadata
def metadata(self, page):
@@ -101,9 +101,8 @@ class GirlswithmusclePostExtractor(GirlswithmuscleExtractor):
"model": model,
"model_list": self._parse_model_list(model),
"tags": text.split_html(tags)[1::2],
- "date": text.parse_datetime(
- text.extr(page, 'class="hover-time" title="', '"')[:19],
- "%Y-%m-%d %H:%M:%S"),
+ "date": self.parse_datetime_iso(text.extr(
+ page, 'class="hover-time" title="', '"')[:19]),
"is_favorite": self._parse_is_favorite(page),
"source_filename": source,
"uploader": uploader,
@@ -144,7 +143,7 @@ class GirlswithmusclePostExtractor(GirlswithmuscleExtractor):
class GirlswithmuscleSearchExtractor(GirlswithmuscleExtractor):
"""Extractor for search results on girlswithmuscle.com"""
subcategory = "search"
- pattern = BASE_PATTERN + r"/images/(.*)"
+ pattern = rf"{BASE_PATTERN}/images/(.*)"
example = "https://www.girlswithmuscle.com/images/?name=MODEL"
def pages(self):
@@ -156,7 +155,7 @@ class GirlswithmuscleSearchExtractor(GirlswithmuscleExtractor):
raise exception.AuthorizationError(msg)
page = response.text
- match = util.re(r"Page (\d+) of (\d+)").search(page)
+ match = text.re(r"Page (\d+) of (\d+)").search(page)
current, total = match.groups()
current, total = text.parse_int(current), text.parse_int(total)
diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py
index 0a6c9b9..7c9755a 100644
--- a/gallery_dl/extractor/gofile.py
+++ b/gallery_dl/extractor/gofile.py
@@ -39,7 +39,7 @@ class GofileFolderExtractor(Extractor):
self._get_website_token())
folder = self._get_content(self.content_id, password)
- yield Message.Directory, folder
+ yield Message.Directory, "", folder
try:
contents = folder.pop("children")
@@ -75,14 +75,16 @@ class GofileFolderExtractor(Extractor):
@cache(maxage=86400)
def _get_website_token(self):
self.log.debug("Fetching website token")
- page = self.request(self.root + "/dist/js/global.js").text
+ page = self.request(self.root + "/dist/js/config.js").text
return text.extr(page, '.wt = "', '"')
def _get_content(self, content_id, password=None):
- headers = {"Authorization": "Bearer " + self.api_token}
- params = {"wt": self.website_token}
- if password is not None:
- params["password"] = hashlib.sha256(password.encode()).hexdigest()
+ headers = {
+ "Authorization" : "Bearer " + self.api_token,
+ "X-Website-Token": self.website_token,
+ }
+ params = None if password is None else {"password": hashlib.sha256(
+ password.encode()).hexdigest()}
return self._api_request("contents/" + content_id, params, headers)
def _api_request(self, endpoint, params=None, headers=None, method="GET"):
diff --git a/gallery_dl/extractor/hatenablog.py b/gallery_dl/extractor/hatenablog.py
index 8e350d6..7065d7b 100644
--- a/gallery_dl/extractor/hatenablog.py
+++ b/gallery_dl/extractor/hatenablog.py
@@ -7,7 +7,7 @@
"""Extractors for https://hatenablog.com"""
from .common import Extractor, Message
-from .. import text, util
+from .. import text
BASE_PATTERN = (
@@ -30,11 +30,11 @@ class HatenablogExtractor(Extractor):
self.domain = match[1] or match[2]
def _init(self):
- self._find_img = util.re(r'<img +([^>]+)').finditer
+ self._find_img = text.re(r'<img +([^>]+)').finditer
def _handle_article(self, article: str):
extr = text.extract_from(article)
- date = text.parse_datetime(extr('<time datetime="', '"'))
+ date = self.parse_datetime_iso(extr('<time datetime="', '"'))
entry_link = text.unescape(extr('<a href="', '"'))
entry = entry_link.partition("/entry/")[2]
title = text.unescape(extr('>', '<'))
@@ -56,7 +56,7 @@ class HatenablogExtractor(Extractor):
"title": title,
"count": len(images),
}
- yield Message.Directory, data
+ yield Message.Directory, "", data
for data["num"], url in enumerate(images, 1):
yield Message.Url, url, text.nameext_from_url(url, data)
@@ -73,7 +73,7 @@ class HatenablogEntriesExtractor(HatenablogExtractor):
def _init(self):
HatenablogExtractor._init(self)
- self._find_pager_url = util.re(
+ self._find_pager_url = text.re(
r' class="pager-next">\s*<a href="([^"]+)').search
def items(self):
@@ -123,7 +123,7 @@ class HatenablogEntriesExtractor(HatenablogExtractor):
class HatenablogEntryExtractor(HatenablogExtractor):
"""Extractor for a single entry URL"""
subcategory = "entry"
- pattern = BASE_PATTERN + r"/entry/([^?#]+)" + QUERY_RE
+ pattern = rf"{BASE_PATTERN}/entry/([^?#]+){QUERY_RE}"
example = "https://BLOG.hatenablog.com/entry/PATH"
def __init__(self, match):
@@ -146,21 +146,21 @@ class HatenablogEntryExtractor(HatenablogExtractor):
class HatenablogHomeExtractor(HatenablogEntriesExtractor):
"""Extractor for a blog's home page"""
subcategory = "home"
- pattern = BASE_PATTERN + r"(/?)" + QUERY_RE
+ pattern = rf"{BASE_PATTERN}(/?){QUERY_RE}"
example = "https://BLOG.hatenablog.com"
class HatenablogArchiveExtractor(HatenablogEntriesExtractor):
"""Extractor for a blog's archive page"""
subcategory = "archive"
- pattern = (BASE_PATTERN + r"(/archive(?:/\d+(?:/\d+(?:/\d+)?)?"
- r"|/category/[^?#]+)?)" + QUERY_RE)
+ pattern = (rf"{BASE_PATTERN}(/archive(?:/\d+(?:/\d+(?:/\d+)?)?"
+ rf"|/category/[^?#]+)?){QUERY_RE}")
example = "https://BLOG.hatenablog.com/archive/2024"
class HatenablogSearchExtractor(HatenablogEntriesExtractor):
"""Extractor for a blog's search results"""
subcategory = "search"
- pattern = BASE_PATTERN + r"(/search)" + QUERY_RE
+ pattern = rf"{BASE_PATTERN}(/search){QUERY_RE}"
example = "https://BLOG.hatenablog.com/search?q=QUERY"
allowed_parameters = ("q",)
diff --git a/gallery_dl/extractor/hentai2read.py b/gallery_dl/extractor/hentai2read.py
index ac4cd02..0e4c040 100644
--- a/gallery_dl/extractor/hentai2read.py
+++ b/gallery_dl/extractor/hentai2read.py
@@ -30,7 +30,7 @@ class Hentai2readChapterExtractor(Hentai2readBase, ChapterExtractor):
chapter_id, pos = text.extract(page, 'data-cid="', '"', pos)
chapter, sep, minor = self.groups[1].partition(".")
- match = util.re(
+ match = text.re(
r"Reading (.+) \(([^)]+)\) Hentai(?: by (.*))? - "
r"([^:]+): (.+) . Page 1 ").match(title)
if match:
diff --git a/gallery_dl/extractor/hentaicosplays.py b/gallery_dl/extractor/hentaicosplays.py
index 5c2628f..b5f3d0e 100644
--- a/gallery_dl/extractor/hentaicosplays.py
+++ b/gallery_dl/extractor/hentaicosplays.py
@@ -38,7 +38,7 @@ class HentaicosplaysGalleryExtractor(
directory_fmt = ("{site}", "{title}")
filename_fmt = "{filename}.{extension}"
archive_fmt = "{title}_{filename}"
- pattern = BASE_PATTERN + r"/(?:image|story)/([\w-]+)"
+ pattern = rf"{BASE_PATTERN}/(?:image|story)/([\w-]+)"
example = "https://hentai-cosplay-xxx.com/image/TITLE/"
def __init__(self, match):
diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py
index a08f7bb..882183b 100644
--- a/gallery_dl/extractor/hentaifoundry.py
+++ b/gallery_dl/extractor/hentaifoundry.py
@@ -43,7 +43,7 @@ class HentaifoundryExtractor(Extractor):
for post_url in util.advance(self.posts(), self.start_post):
image = self._parse_post(post_url)
image.update(data)
- yield Message.Directory, image
+ yield Message.Directory, "", image
yield Message.Url, image["src"], image
def skip(self, num):
@@ -86,7 +86,8 @@ class HentaifoundryExtractor(Extractor):
.replace("\r\n", "\n")),
"ratings" : [text.unescape(r) for r in text.extract_iter(extr(
"class='ratings_box'", "</div>"), "title='", "'")],
- "date" : text.parse_datetime(extr("datetime='", "'")),
+ "categories" : self._extract_categories(extr),
+ "date" : self.parse_datetime_iso(extr("datetime='", "'")),
"views" : text.parse_int(extr(">Views</span>", "<")),
"score" : text.parse_int(extr(">Vote Score</span>", "<")),
"media" : text.unescape(extr(">Media</span>", "<").strip()),
@@ -126,7 +127,7 @@ class HentaifoundryExtractor(Extractor):
"title" : text.unescape(extr(
"<div class='titlebar'>", "</a>").rpartition(">")[2]),
"author" : text.unescape(extr('alt="', '"')),
- "date" : text.parse_datetime(extr(
+ "date" : self.parse_datetime(extr(
">Updated<", "</span>").rpartition(">")[2], "%B %d, %Y"),
"status" : extr("class='indent'>", "<"),
}
@@ -141,11 +142,17 @@ class HentaifoundryExtractor(Extractor):
path = extr('class="pdfLink" href="', '"')
data["src"] = self.root + path
data["index"] = text.parse_int(path.rsplit("/", 2)[1])
+ data["categories"] = self._extract_categories(extr)
data["ratings"] = [text.unescape(r) for r in text.extract_iter(extr(
"class='ratings_box'", "</div>"), "title='", "'")]
return text.nameext_from_url(data["src"], data)
+ def _extract_categories(self, extr):
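+ # pull category names out of the breadcrumb markup above a post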
+ return [text.unescape(text.extr(c, ">", "<"))
+ for c in extr('class="categoryBreadcrumbs">', "</span>")
+ .split("&raquo;")]
+
def _request_check(self, url, **kwargs):
self.request = self._request_original
@@ -207,7 +214,7 @@ class HentaifoundryExtractor(Extractor):
class HentaifoundryUserExtractor(Dispatch, HentaifoundryExtractor):
"""Extractor for a hentaifoundry user profile"""
- pattern = BASE_PATTERN + r"/user/([^/?#]+)/profile"
+ pattern = rf"{BASE_PATTERN}/user/([^/?#]+)/profile"
example = "https://www.hentai-foundry.com/user/USER/profile"
def items(self):
@@ -228,7 +235,7 @@ class HentaifoundryUserExtractor(Dispatch, HentaifoundryExtractor):
class HentaifoundryPicturesExtractor(HentaifoundryExtractor):
"""Extractor for all pictures of a hentaifoundry user"""
subcategory = "pictures"
- pattern = BASE_PATTERN + r"/pictures/user/([^/?#]+)(?:/page/(\d+))?/?$"
+ pattern = rf"{BASE_PATTERN}/pictures/user/([^/?#]+)(?:/page/(\d+))?/?$"
example = "https://www.hentai-foundry.com/pictures/user/USER"
def __init__(self, match):
@@ -240,7 +247,7 @@ class HentaifoundryScrapsExtractor(HentaifoundryExtractor):
"""Extractor for scraps of a hentaifoundry user"""
subcategory = "scraps"
directory_fmt = ("{category}", "{user}", "Scraps")
- pattern = BASE_PATTERN + r"/pictures/user/([^/?#]+)/scraps"
+ pattern = rf"{BASE_PATTERN}/pictures/user/([^/?#]+)/scraps"
example = "https://www.hentai-foundry.com/pictures/user/USER/scraps"
def __init__(self, match):
@@ -253,7 +260,7 @@ class HentaifoundryFavoriteExtractor(HentaifoundryExtractor):
subcategory = "favorite"
directory_fmt = ("{category}", "{user}", "Favorites")
archive_fmt = "f_{user}_{index}"
- pattern = BASE_PATTERN + r"/user/([^/?#]+)/faves/pictures"
+ pattern = rf"{BASE_PATTERN}/user/([^/?#]+)/faves/pictures"
example = "https://www.hentai-foundry.com/user/USER/faves/pictures"
def __init__(self, match):
@@ -266,7 +273,7 @@ class HentaifoundryTagExtractor(HentaifoundryExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{index}"
- pattern = BASE_PATTERN + r"/pictures/tagged/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/pictures/tagged/([^/?#]+)"
example = "https://www.hentai-foundry.com/pictures/tagged/TAG"
def __init__(self, match):
@@ -282,7 +289,7 @@ class HentaifoundryRecentExtractor(HentaifoundryExtractor):
subcategory = "recent"
directory_fmt = ("{category}", "Recent Pictures", "{date}")
archive_fmt = "r_{index}"
- pattern = BASE_PATTERN + r"/pictures/recent/(\d\d\d\d-\d\d-\d\d)"
+ pattern = rf"{BASE_PATTERN}/pictures/recent/(\d\d\d\d-\d\d-\d\d)"
example = "https://www.hentai-foundry.com/pictures/recent/1970-01-01"
def __init__(self, match):
@@ -298,7 +305,7 @@ class HentaifoundryPopularExtractor(HentaifoundryExtractor):
subcategory = "popular"
directory_fmt = ("{category}", "Popular Pictures")
archive_fmt = "p_{index}"
- pattern = BASE_PATTERN + r"/pictures/popular()"
+ pattern = rf"{BASE_PATTERN}/pictures/popular()"
example = "https://www.hentai-foundry.com/pictures/popular"
def __init__(self, match):
@@ -324,7 +331,7 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor):
f"/{self.index}/?enterAgree=1")
image = self._parse_post(post_url)
image["user"] = self.user
- yield Message.Directory, image
+ yield Message.Directory, "", image
yield Message.Url, image["src"], image
@@ -332,14 +339,14 @@ class HentaifoundryStoriesExtractor(HentaifoundryExtractor):
"""Extractor for stories of a hentaifoundry user"""
subcategory = "stories"
archive_fmt = "s_{index}"
- pattern = BASE_PATTERN + r"/stories/user/([^/?#]+)(?:/page/(\d+))?/?$"
+ pattern = rf"{BASE_PATTERN}/stories/user/([^/?#]+)(?:/page/(\d+))?/?$"
example = "https://www.hentai-foundry.com/stories/user/USER"
def items(self):
self._init_site_filters()
for story_html in util.advance(self.stories(), self.start_post):
story = self._parse_story(story_html)
- yield Message.Directory, story
+ yield Message.Directory, "", story
yield Message.Url, story["src"], story
def stories(self):
@@ -351,7 +358,7 @@ class HentaifoundryStoryExtractor(HentaifoundryExtractor):
"""Extractor for a hentaifoundry story"""
subcategory = "story"
archive_fmt = "s_{index}"
- pattern = BASE_PATTERN + r"/stories/user/([^/?#]+)/(\d+)"
+ pattern = rf"{BASE_PATTERN}/stories/user/([^/?#]+)/(\d+)"
example = "https://www.hentai-foundry.com/stories/user/USER/12345/TITLE"
skip = Extractor.skip
@@ -364,5 +371,5 @@ class HentaifoundryStoryExtractor(HentaifoundryExtractor):
story_url = (f"{self.root}/stories/user/{self.user}"
f"/{self.index}/x?enterAgree=1")
story = self._parse_story(self.request(story_url).text)
- yield Message.Directory, story
+ yield Message.Directory, "", story
yield Message.Url, story["src"], story
diff --git a/gallery_dl/extractor/hentaihand.py b/gallery_dl/extractor/hentaihand.py
index f4f9d86..2ca462e 100644
--- a/gallery_dl/extractor/hentaihand.py
+++ b/gallery_dl/extractor/hentaihand.py
@@ -35,8 +35,7 @@ class HentaihandGalleryExtractor(GalleryExtractor):
"language" : info["language"]["name"],
"lang" : util.language_to_code(info["language"]["name"]),
"tags" : [t["slug"] for t in info["tags"]],
- "date" : text.parse_datetime(
- info["uploaded_at"], "%Y-%m-%d"),
+ "date" : self.parse_datetime_iso(info["uploaded_at"]),
}
for key in ("artists", "authors", "groups", "characters",
"relationships", "parodies"):
diff --git a/gallery_dl/extractor/hentaihere.py b/gallery_dl/extractor/hentaihere.py
index b894d77..ef72366 100644
--- a/gallery_dl/extractor/hentaihere.py
+++ b/gallery_dl/extractor/hentaihere.py
@@ -33,7 +33,7 @@ class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor):
title = text.extr(page, "<title>", "</title>")
chapter_id = text.extr(page, 'report/C', '"')
chapter, sep, minor = self.chapter.partition(".")
- match = util.re(
+ match = text.re(
r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by "
r"(.+) at ").match(title)
return {
diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py
index a75eee0..0eaf798 100644
--- a/gallery_dl/extractor/hiperdex.py
+++ b/gallery_dl/extractor/hiperdex.py
@@ -9,7 +9,7 @@
"""Extractors for https://hiperdex.com/"""
from .common import ChapterExtractor, MangaExtractor
-from .. import text, util
+from .. import text
from ..cache import memcache
BASE_PATTERN = (r"((?:https?://)?(?:www\.)?"
@@ -67,7 +67,7 @@ class HiperdexBase():
class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
"""Extractor for hiperdex manga chapters"""
- pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+)/([^/?#]+))"
+ pattern = rf"{BASE_PATTERN}(/mangas?/([^/?#]+)/([^/?#]+))"
example = "https://hiperdex.com/manga/MANGA/CHAPTER/"
def __init__(self, match):
@@ -79,7 +79,7 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
return self.chapter_data(self.chapter)
def images(self, page):
- pattern = util.re(r'id="image-\d+"\s+(?:data-)?src="([^"]+)')
+ pattern = text.re(r'id="image-\d+"\s+(?:data-)?src="([^"]+)')
return [
(url.strip(), None)
for url in pattern.findall(page)
@@ -89,7 +89,7 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
class HiperdexMangaExtractor(HiperdexBase, MangaExtractor):
"""Extractor for hiperdex manga"""
chapterclass = HiperdexChapterExtractor
- pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+))/?$"
+ pattern = rf"{BASE_PATTERN}(/mangas?/([^/?#]+))/?$"
example = "https://hiperdex.com/manga/MANGA/"
def __init__(self, match):
@@ -125,7 +125,7 @@ class HiperdexArtistExtractor(HiperdexBase, MangaExtractor):
categorytransfer = False
chapterclass = HiperdexMangaExtractor
reverse = False
- pattern = BASE_PATTERN + r"(/manga-a(?:rtist|uthor)/(?:[^/?#]+))"
+ pattern = rf"{BASE_PATTERN}(/manga-a(?:rtist|uthor)/(?:[^/?#]+))"
example = "https://hiperdex.com/manga-artist/NAME/"
def __init__(self, match):
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index 82bed80..b05a9a7 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -84,7 +84,7 @@ class HitomiGalleryExtractor(HitomiExtractor, GalleryExtractor):
"type" : info["type"].capitalize(),
"language" : language,
"lang" : util.language_to_code(language),
- "date" : text.parse_datetime(date, "%Y-%m-%d %H:%M:%S%z"),
+ "date" : self.parse_datetime_iso(date),
"tags" : tags,
"artist" : [o["artist"] for o in iget("artists") or ()],
"group" : [o["group"] for o in iget("groups") or ()],
diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py
index 587d88c..953cf4e 100644
--- a/gallery_dl/extractor/hotleak.py
+++ b/gallery_dl/extractor/hotleak.py
@@ -30,7 +30,7 @@ class HotleakExtractor(Extractor):
.replace("_thumb.", ".")
)
post["_http_expected_status"] = (404,)
- yield Message.Directory, post
+ yield Message.Directory, "", post
yield Message.Url, post["url"], post
def posts(self):
@@ -61,7 +61,7 @@ def decode_video_url(url):
class HotleakPostExtractor(HotleakExtractor):
"""Extractor for individual posts on hotleak"""
subcategory = "post"
- pattern = (BASE_PATTERN + r"/(?!(?:hot|creators|videos|photos)(?:$|/))"
+ pattern = (rf"{BASE_PATTERN}/(?!(?:hot|creators|videos|photos)(?:$|/))"
r"([^/]+)/(photo|video)/(\d+)")
example = "https://hotleak.vip/MODEL/photo/12345"
@@ -96,7 +96,7 @@ class HotleakPostExtractor(HotleakExtractor):
class HotleakCreatorExtractor(HotleakExtractor):
"""Extractor for all posts from a hotleak creator"""
subcategory = "creator"
- pattern = (BASE_PATTERN + r"/(?!(?:hot|creators|videos|photos)(?:$|/))"
+ pattern = (rf"{BASE_PATTERN}/(?!(?:hot|creators|videos|photos)(?:$|/))"
r"([^/?#]+)/?$")
example = "https://hotleak.vip/MODEL"
@@ -150,7 +150,7 @@ class HotleakCreatorExtractor(HotleakExtractor):
class HotleakCategoryExtractor(HotleakExtractor):
"""Extractor for hotleak categories"""
subcategory = "category"
- pattern = BASE_PATTERN + r"/(hot|creators|videos|photos)(?:/?\?([^#]+))?"
+ pattern = rf"{BASE_PATTERN}/(hot|creators|videos|photos)(?:/?\?([^#]+))?"
example = "https://hotleak.vip/photos"
def __init__(self, match):
@@ -172,7 +172,7 @@ class HotleakCategoryExtractor(HotleakExtractor):
class HotleakSearchExtractor(HotleakExtractor):
"""Extractor for hotleak search results"""
subcategory = "search"
- pattern = BASE_PATTERN + r"/search(?:/?\?([^#]+))"
+ pattern = rf"{BASE_PATTERN}/search(?:/?\?([^#]+))"
example = "https://hotleak.vip/search?search=QUERY"
def __init__(self, match):
diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py
index 26fd595..a8f1298 100644
--- a/gallery_dl/extractor/idolcomplex.py
+++ b/gallery_dl/extractor/idolcomplex.py
@@ -11,7 +11,8 @@
from . import sankaku
BASE_PATTERN = (r"(?:https?://)?(?:www\.)?"
- r"idol(?:\.sankaku)?complex\.com(?:/[a-z]{2})?")
+ r"idol(?:\.sankaku)?complex\.com"
+ r"(?:/[a-z]{2}(?:[-_][A-Z]{2})?)?")
class IdolcomplexBase():
@@ -28,17 +29,17 @@ class IdolcomplexBase():
class IdolcomplexTagExtractor(IdolcomplexBase, sankaku.SankakuTagExtractor):
"""Extractor for idolcomplex tag searches"""
- pattern = BASE_PATTERN + r"(?:/posts)?/?\?([^#]*)"
+ pattern = rf"{BASE_PATTERN}(?:/posts)?/?\?([^#]*)"
example = "https://www.idolcomplex.com/en/posts?tags=TAGS"
class IdolcomplexPoolExtractor(IdolcomplexBase, sankaku.SankakuPoolExtractor):
"""Extractor for idolcomplex pools"""
- pattern = BASE_PATTERN + r"/pools?/(?:show/)?(\w+)"
+ pattern = rf"{BASE_PATTERN}/pools?/(?:show/)?(\w+)"
example = "https://www.idolcomplex.com/en/pools/0123456789abcdef"
class IdolcomplexPostExtractor(IdolcomplexBase, sankaku.SankakuPostExtractor):
"""Extractor for individual idolcomplex posts"""
- pattern = BASE_PATTERN + r"/posts?(?:/show)?/(\w+)"
+ pattern = rf"{BASE_PATTERN}/posts?(?:/show)?/(\w+)"
example = "https://www.idolcomplex.com/en/posts/0123456789abcdef"
diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py
index abba9df..66fbdd6 100644
--- a/gallery_dl/extractor/imagebam.py
+++ b/gallery_dl/extractor/imagebam.py
@@ -9,7 +9,7 @@
"""Extractors for https://www.imagebam.com/"""
from .common import Extractor, Message
-from .. import text, util
+from .. import text
class ImagebamExtractor(Extractor):
@@ -30,12 +30,10 @@ class ImagebamExtractor(Extractor):
url, pos = text.extract(page, '<img src="https://images', '"')
filename = text.unescape(text.extract(page, 'alt="', '"', pos)[0])
- data = {
+ return text.nameext_from_name(filename, {
"url" : "https://images" + url,
"image_key": path.rpartition("/")[2],
- }
- data["filename"], _, data["extension"] = filename.rpartition(".")
- return data
+ })
class ImagebamGalleryExtractor(ImagebamExtractor):
@@ -58,7 +56,7 @@ class ImagebamGalleryExtractor(ImagebamExtractor):
data["count"] = len(images)
data["gallery_key"] = self.path.rpartition("/")[2]
- yield Message.Directory, data
+ yield Message.Directory, "", data
for data["num"], path in enumerate(images, 1):
image = self._parse_image_page(path)
image.update(data)
@@ -69,7 +67,7 @@ class ImagebamGalleryExtractor(ImagebamExtractor):
page, 'id="gallery-name">', '<').strip())}
def images(self, page):
- findall = util.re(r'<a href="https://www\.imagebam\.com'
+ findall = text.re(r'<a href="https://www\.imagebam\.com'
r'(/(?:image/|view/M)[a-zA-Z0-9]+)').findall
paths = []
while True:
@@ -96,5 +94,5 @@ class ImagebamImageExtractor(ImagebamExtractor):
path = ("/view/" if path[10] == "M" else "/image/") + path[10:]
image = self._parse_image_page(path)
- yield Message.Directory, image
+ yield Message.Directory, "", image
yield Message.Url, image["url"], image
diff --git a/gallery_dl/extractor/imagechest.py b/gallery_dl/extractor/imagechest.py
index 464e489..074b4ae 100644
--- a/gallery_dl/extractor/imagechest.py
+++ b/gallery_dl/extractor/imagechest.py
@@ -19,7 +19,7 @@ class ImagechestGalleryExtractor(GalleryExtractor):
"""Extractor for image galleries from imgchest.com"""
category = "imagechest"
root = "https://imgchest.com"
- pattern = BASE_PATTERN + r"/p/([A-Za-z0-9]{11})"
+ pattern = rf"{BASE_PATTERN}/p/([A-Za-z0-9]{{11}})"
example = "https://imgchest.com/p/abcdefghijk"
def __init__(self, match):
@@ -53,11 +53,9 @@ class ImagechestGalleryExtractor(GalleryExtractor):
def _metadata_api(self, page):
post = self.api.post(self.gallery_id)
- post["date"] = text.parse_datetime(
- post["created"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ post["date"] = self.parse_datetime_iso(post["created"])
for img in post["images"]:
- img["date"] = text.parse_datetime(
- img["created"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ img["date"] = self.parse_datetime_iso(img["created"])
post["gallery_id"] = self.gallery_id
post.pop("image_count", None)
@@ -80,7 +78,7 @@ class ImagechestUserExtractor(Extractor):
category = "imagechest"
subcategory = "user"
root = "https://imgchest.com"
- pattern = BASE_PATTERN + r"/u/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/u/([^/?#]+)"
example = "https://imgchest.com/u/USER"
def items(self):
diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py
index 993af7c..f727969 100644
--- a/gallery_dl/extractor/imagefap.py
+++ b/gallery_dl/extractor/imagefap.py
@@ -39,7 +39,7 @@ class ImagefapExtractor(Extractor):
class ImagefapGalleryExtractor(ImagefapExtractor):
"""Extractor for image galleries from imagefap.com"""
subcategory = "gallery"
- pattern = BASE_PATTERN + r"/(?:gallery\.php\?gid=|gallery/|pictures/)(\d+)"
+ pattern = rf"{BASE_PATTERN}/(?:gallery\.php\?gid=|gallery/|pictures/)(\d+)"
example = "https://www.imagefap.com/gallery/12345"
def __init__(self, match):
@@ -51,7 +51,7 @@ class ImagefapGalleryExtractor(ImagefapExtractor):
url = f"{self.root}/gallery/{self.gid}"
page = self.request(url).text
data = self.get_job_metadata(page)
- yield Message.Directory, data
+ yield Message.Directory, "", data
for url, image in self.get_images():
data.update(image)
yield Message.Url, url, data
@@ -110,7 +110,7 @@ class ImagefapGalleryExtractor(ImagefapExtractor):
class ImagefapImageExtractor(ImagefapExtractor):
"""Extractor for single images from imagefap.com"""
subcategory = "image"
- pattern = BASE_PATTERN + r"/photo/(\d+)"
+ pattern = rf"{BASE_PATTERN}/photo/(\d+)"
example = "https://www.imagefap.com/photo/12345"
def __init__(self, match):
@@ -119,7 +119,7 @@ class ImagefapImageExtractor(ImagefapExtractor):
def items(self):
url, data = self.get_image()
- yield Message.Directory, data
+ yield Message.Directory, "", data
yield Message.Url, url, data
def get_image(self):
@@ -148,9 +148,9 @@ class ImagefapImageExtractor(ImagefapExtractor):
class ImagefapFolderExtractor(ImagefapExtractor):
"""Extractor for imagefap user folders"""
subcategory = "folder"
- pattern = (BASE_PATTERN + r"/(?:organizer/|"
- r"(?:usergallery\.php\?user(id)?=([^&#]+)&"
- r"|profile/([^/?#]+)/galleries\?)folderid=)(\d+|-1)")
+ pattern = (rf"{BASE_PATTERN}/(?:organizer/|"
+ rf"(?:usergallery\.php\?user(id)?=([^&#]+)&"
+ rf"|profile/([^/?#]+)/galleries\?)folderid=)(\d+|-1)")
example = "https://www.imagefap.com/organizer/12345"
def __init__(self, match):
@@ -206,9 +206,9 @@ class ImagefapFolderExtractor(ImagefapExtractor):
class ImagefapUserExtractor(ImagefapExtractor):
"""Extractor for an imagefap user profile"""
subcategory = "user"
- pattern = (BASE_PATTERN +
- r"/(?:profile(?:\.php\?user=|/)([^/?#]+)(?:/galleries)?"
- r"|usergallery\.php\?userid=(\d+))(?:$|#)")
+ pattern = (rf"{BASE_PATTERN}/(?:"
+ rf"profile(?:\.php\?user=|/)([^/?#]+)(?:/galleries)?|"
+ rf"usergallery\.php\?userid=(\d+))(?:$|#)")
example = "https://www.imagefap.com/profile/USER"
def __init__(self, match):
diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py
index 817d2c4..21e6cf8 100644
--- a/gallery_dl/extractor/imagehosts.py
+++ b/gallery_dl/extractor/imagehosts.py
@@ -19,6 +19,7 @@ class ImagehostImageExtractor(Extractor):
basecategory = "imagehost"
subcategory = "image"
archive_fmt = "{token}"
+ parent = True
_https = True
_params = None
_cookies = None
@@ -27,7 +28,10 @@ class ImagehostImageExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.page_url = f"http{'s' if self._https else ''}://{match[1]}"
+ if self.root:
+ self.page_url = f"{self.root}{match[1]}"
+ else:
+ self.page_url = f"http{'s' if self._https else ''}://{match[1]}"
self.token = match[2]
if self._params == "simple":
@@ -53,14 +57,25 @@ class ImagehostImageExtractor(Extractor):
).text
url, filename = self.get_info(page)
- data = text.nameext_from_url(filename, {"token": self.token})
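+ # bail out when no file URL was found; take the filename from the page
+ # when available and fall back to the URL otherwise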
+ if not url:
+ return
+
+ if filename:
+ data = text.nameext_from_name(filename)
+ if not data["extension"]:
+ data["extension"] = text.ext_from_url(url)
+ else:
+ data = text.nameext_from_url(url)
+ data["token"] = self.token
+ data["post_url"] = self.page_url
data.update(self.metadata(page))
+
if self._https and url.startswith("http:"):
url = "https:" + url[5:]
if self._validate is not None:
data["_http_validate"] = self._validate
- yield Message.Directory, data
+ yield Message.Directory, "", data
yield Message.Url, url, data
def get_info(self, page):
@@ -70,6 +85,9 @@ class ImagehostImageExtractor(Extractor):
"""Return additional metadata"""
return ()
+ def not_found(self, resource=None):
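+ # shared helper so subclasses raise a consistent NotFoundError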
+ raise exception.NotFoundError(resource or self.__class__.subcategory)
+
class ImxtoImageExtractor(ImagehostImageExtractor):
"""Extractor for single images from imx.to"""
@@ -92,7 +110,7 @@ class ImxtoImageExtractor(ImagehostImageExtractor):
url, pos = text.extract(
page, '<div style="text-align:center;"><a href="', '"')
if not url:
- raise exception.NotFoundError("image")
+ self.not_found()
filename, pos = text.extract(page, ' title="', '"', pos)
if self.url_ext and filename:
filename += splitext(url)[1]
@@ -152,7 +170,7 @@ class AcidimgImageExtractor(ImagehostImageExtractor):
if not url:
url, pos = text.extract(page, '<img class="centred" src="', '"')
if not url:
- raise exception.NotFoundError("image")
+ self.not_found()
filename, pos = text.extract(page, "alt='", "'", pos)
if not filename:
@@ -169,7 +187,11 @@ class ImagevenueImageExtractor(ImagehostImageExtractor):
example = "https://www.imagevenue.com/ME123456789"
def get_info(self, page):
- pos = page.index('class="card-body')
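+ # pages without a card-body element are treated as deleted images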
+ try:
+ pos = page.index('class="card-body')
+ except ValueError:
+ self.not_found()
+
url, pos = text.extract(page, '<img src="', '"', pos)
if url.endswith("/loader.svg"):
url, pos = text.extract(page, '<img src="', '"', pos)
@@ -199,6 +221,8 @@ class ImagetwistImageExtractor(ImagehostImageExtractor):
def get_info(self, page):
url , pos = text.extract(page, '<img src="', '"')
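+ # a src path under /imgs/ apparently serves as a placeholder for
+ # removed images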
+ if url and url.startswith("/imgs/"):
+ self.not_found()
filename, pos = text.extract(page, ' alt="', '"', pos)
return url, filename
@@ -249,7 +273,7 @@ class ImgspiceImageExtractor(ImagehostImageExtractor):
def get_info(self, page):
pos = page.find('id="imgpreview"')
if pos < 0:
- raise exception.NotFoundError("image")
+ self.not_found()
url , pos = text.extract(page, 'src="', '"', pos)
name, pos = text.extract(page, 'alt="', '"', pos)
return url, text.unescape(name)
@@ -258,23 +282,26 @@ class ImgspiceImageExtractor(ImagehostImageExtractor):
class PixhostImageExtractor(ImagehostImageExtractor):
"""Extractor for single images from pixhost.to"""
category = "pixhost"
- pattern = (r"(?:https?://)?((?:www\.)?pixhost\.(?:to|org)"
- r"/show/\d+/(\d+)_[^/?#]+)")
+ root = "https://pixhost.to"
+ pattern = (r"(?:https?://)?(?:www\.)?pixhost\.(?:to|org)"
+ r"(/show/\d+/(\d+)_[^/?#]+)")
example = "https://pixhost.to/show/123/12345_NAME.EXT"
_cookies = {"pixhostads": "1", "pixhosttest": "1"}
def get_info(self, page):
- url , pos = text.extract(page, "class=\"image-img\" src=\"", "\"")
- filename, pos = text.extract(page, "alt=\"", "\"", pos)
- return url, filename
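+ # expose the numeric show-page component as 'directory' metadata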
+ self.kwdict["directory"] = self.page_url.rsplit("/")[-2]
+ url , pos = text.extract(page, "class=\"image-img\" src=\"", "\"")
+ name, pos = text.extract(page, "alt=\"", "\"", pos)
+ return url, text.unescape(name) if name else None
class PixhostGalleryExtractor(ImagehostImageExtractor):
"""Extractor for image galleries from pixhost.to"""
category = "pixhost"
subcategory = "gallery"
- pattern = (r"(?:https?://)?((?:www\.)?pixhost\.(?:to|org)"
- r"/gallery/([^/?#]+))")
+ root = "https://pixhost.to"
+ pattern = (r"(?:https?://)?(?:www\.)?pixhost\.(?:to|org)"
+ r"(/gallery/([^/?#]+))")
example = "https://pixhost.to/gallery/ID"
def items(self):
@@ -288,29 +315,39 @@ class PixhostGalleryExtractor(ImagehostImageExtractor):
class PostimgImageExtractor(ImagehostImageExtractor):
"""Extractor for single images from postimages.org"""
category = "postimg"
- pattern = (r"(?:https?://)?((?:www\.)?(?:postim(?:ages|g)|pixxxels)"
- r"\.(?:cc|org)/(?!gallery/)(?:image/)?([^/?#]+)/?)")
- example = "https://postimages.org/ID"
+ root = "https://postimg.cc"
+ pattern = (r"(?:https?://)?(?:www\.)?(?:postim(?:ages|g)|pixxxels)"
+ r"\.(?:cc|org)(/(?!gallery/)(?:image/)?([^/?#]+)/?)")
+ example = "https://postimg.cc/ID"
def get_info(self, page):
pos = page.index(' id="download"')
url , pos = text.rextract(page, ' href="', '"', pos)
- filename, pos = text.extract(page, 'class="imagename">', '<', pos)
- return url, text.unescape(filename)
+ filename, pos = text.extract(page, ' class="my-4">', '<', pos)
+ return url, text.unescape(filename) if filename else None
class PostimgGalleryExtractor(ImagehostImageExtractor):
"""Extractor for images galleries from postimages.org"""
category = "postimg"
subcategory = "gallery"
- pattern = (r"(?:https?://)?((?:www\.)?(?:postim(?:ages|g)|pixxxels)"
- r"\.(?:cc|org)/gallery/([^/?#]+))")
- example = "https://postimages.org/gallery/ID"
+ root = "https://postimg.cc"
+ pattern = (r"(?:https?://)?(?:www\.)?(?:postim(?:ages|g)|pixxxels)"
+ r"\.(?:cc|org)(/gallery/([^/?#]+))")
+ example = "https://postimg.cc/gallery/ID"
def items(self):
page = self.request(self.page_url).text
- data = {"_extractor": PostimgImageExtractor}
- for url in text.extract_iter(page, ' class="thumb"><a href="', '"'):
+ title = text.extr(
+ page, 'property="og:title" content="', ' — Postimages"')
+
+ data = {
+ "_extractor" : PostimgImageExtractor,
+ "gallery_title": text.unescape(title),
+ }
+
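+ # gallery pages now list image tokens in data-image attributes
+ # instead of direct thumbnail links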
+ for token in text.extract_iter(page, 'data-image="', '"'):
+ url = f"{self.root}/{token}"
yield Message.Queue, url, data
@@ -323,7 +360,7 @@ class TurboimagehostImageExtractor(ImagehostImageExtractor):
def get_info(self, page):
url = text.extract(page, 'src="', '"', page.index("<img "))[0]
- return url, url
+ return url, None
class TurboimagehostGalleryExtractor(ImagehostImageExtractor):
@@ -343,7 +380,7 @@ class TurboimagehostGalleryExtractor(ImagehostImageExtractor):
if params["p"] == 1 and \
"Requested gallery don`t exist on our website." in page:
- raise exception.NotFoundError("gallery")
+ self.not_found()
thumb_url = None
for thumb_url in text.extract_iter(page, '"><a href="', '"'):
@@ -362,7 +399,7 @@ class ViprImageExtractor(ImagehostImageExtractor):
def get_info(self, page):
url = text.extr(page, '<img src="', '"')
- return url, url
+ return url, None
class ImgclickImageExtractor(ImagehostImageExtractor):
@@ -439,14 +476,16 @@ class ImgdriveImageExtractor(ImagehostImageExtractor):
class SilverpicImageExtractor(ImagehostImageExtractor):
"""Extractor for single images from silverpic.com"""
category = "silverpic"
- pattern = (r"(?:https?://)?((?:www\.)?silverpic\.com"
- r"/([a-z0-9]{10,})/[\S]+\.html)")
- example = "https://silverpic.com/a1b2c3d4f5g6/NAME.EXT.html"
+ root = "https://silverpic.net"
+ _params = "complex"
+ pattern = (r"(?:https?://)?(?:www\.)?silverpic\.(?:net|com)"
+ r"(/([a-z0-9]{10,})/[\S]+\.html)")
+ example = "https://silverpic.net/a1b2c3d4f5g6/NAME.EXT.html"
def get_info(self, page):
url, pos = text.extract(page, '<img src="/img/', '"')
alt, pos = text.extract(page, 'alt="', '"', pos)
- return f"https://silverpic.com/img/{url}", alt
+ return f"{self.root}/img/{url}", alt
def metadata(self, page):
pos = page.find('<img src="/img/')
diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py
index d9a63c7..d957328 100644
--- a/gallery_dl/extractor/imgbb.py
+++ b/gallery_dl/extractor/imgbb.py
@@ -30,7 +30,7 @@ class ImgbbExtractor(Extractor):
for image in self.posts():
url = image["url"]
text.nameext_from_url(url, image)
- yield Message.Directory, image
+ yield Message.Directory, "", image
yield Message.Url, url, image
def login(self):
@@ -159,8 +159,7 @@ class ImgbbImageExtractor(ImgbbExtractor):
"width" : text.parse_int(extr('"og:image:width" content="', '"')),
"height": text.parse_int(extr('"og:image:height" content="', '"')),
"album" : extr("Added to <a", "</a>"),
- "date" : text.parse_datetime(extr(
- '<span title="', '"'), "%Y-%m-%d %H:%M:%S"),
+ "date" : self.parse_datetime_iso(extr('<span title="', '"')),
"user" : util.json_loads(extr(
"CHV.obj.resource=", "};") + "}").get("user"),
}
diff --git a/gallery_dl/extractor/imgbox.py b/gallery_dl/extractor/imgbox.py
index 5def88d..8ac66f4 100644
--- a/gallery_dl/extractor/imgbox.py
+++ b/gallery_dl/extractor/imgbox.py
@@ -9,7 +9,7 @@
"""Extractors for https://imgbox.com/"""
from .common import Extractor, Message, AsynchronousMixin
-from .. import text, util, exception
+from .. import text, exception
class ImgboxExtractor(Extractor):
@@ -19,7 +19,7 @@ class ImgboxExtractor(Extractor):
def items(self):
data = self.get_job_metadata()
- yield Message.Directory, data
+ yield Message.Directory, "", data
for image_key in self.get_image_keys():
imgpage = self.request(self.root + "/" + image_key).text
@@ -69,7 +69,7 @@ class ImgboxGalleryExtractor(AsynchronousMixin, ImgboxExtractor):
page = self.request(self.root + "/g/" + self.gallery_key).text
if "The specified gallery could not be found." in page:
raise exception.NotFoundError("gallery")
- self.image_keys = util.re(
+ self.image_keys = text.re(
r'<a href="/([^"]+)"><img alt="').findall(page)
title = text.extr(page, "<h1>", "</h1>")
@@ -88,7 +88,10 @@ class ImgboxImageExtractor(ImgboxExtractor):
"""Extractor for single images from imgbox.com"""
subcategory = "image"
archive_fmt = "{image_key}"
- pattern = r"(?:https?://)?(?:www\.)?imgbox\.com/([A-Za-z0-9]{8})"
+ pattern = (r"(?:https?://)?(?:"
+ r"(?:www\.|i\.)?imgbox\.com|"
+ r"images\d+\.imgbox\.com/[0-9a-f]{2}/[0-9a-f]{2}"
+ r")/([A-Za-z0-9]{8})")
example = "https://imgbox.com/1234abcd"
def __init__(self, match):
diff --git a/gallery_dl/extractor/imgpile.py b/gallery_dl/extractor/imgpile.py
index 9fc3a9c..f634203 100644
--- a/gallery_dl/extractor/imgpile.py
+++ b/gallery_dl/extractor/imgpile.py
@@ -54,7 +54,7 @@ class ImgpilePostExtractor(ImgpileExtractor):
data = {"post": post}
data["count"] = post["count"] = len(files)
- yield Message.Directory, data
+ yield Message.Directory, "", data
for data["num"], file in enumerate(files, 1):
data.update(file)
url = file["url"]
diff --git a/gallery_dl/extractor/imgth.py b/gallery_dl/extractor/imgth.py
index 7e5e6cf..4a81e53 100644
--- a/gallery_dl/extractor/imgth.py
+++ b/gallery_dl/extractor/imgth.py
@@ -31,7 +31,7 @@ class ImgthGalleryExtractor(GalleryExtractor):
"title": text.unescape(extr("<h1>", "</h1>")),
"count": text.parse_int(extr(
"total of images in this gallery: ", " ")),
- "date" : text.parse_datetime(
+ "date" : self.parse_datetime(
extr("created on ", " by <")
.replace("th, ", " ", 1).replace("nd, ", " ", 1)
.replace("st, ", " ", 1), "%B %d %Y at %H:%M"),
diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py
index 1ac76e0..4755388 100644
--- a/gallery_dl/extractor/imgur.py
+++ b/gallery_dl/extractor/imgur.py
@@ -38,7 +38,7 @@ class ImgurExtractor(Extractor):
image["url"] = url = \
f"https://i.imgur.com/{image['id']}.{image['ext']}"
- image["date"] = text.parse_datetime(image["created_at"])
+ image["date"] = self.parse_datetime_iso(image["created_at"])
image["_http_validate"] = self._validate
text.nameext_from_url(url, image)
@@ -67,7 +67,7 @@ class ImgurImageExtractor(ImgurExtractor):
subcategory = "image"
filename_fmt = "{category}_{id}{title:?_//}.{extension}"
archive_fmt = "{id}"
- pattern = (BASE_PATTERN + r"/(?!gallery|search)"
+ pattern = (rf"{BASE_PATTERN}/(?!gallery|search)"
r"(?:r/\w+/)?(?:[^/?#]+-)?(\w{7}|\w{5})[sbtmlh]?")
example = "https://imgur.com/abcdefg"
@@ -83,7 +83,7 @@ class ImgurImageExtractor(ImgurExtractor):
image.update(image["media"][0])
del image["media"]
url = self._prepare(image)
- yield Message.Directory, image
+ yield Message.Directory, "", image
yield Message.Url, url, image
@@ -93,7 +93,7 @@ class ImgurAlbumExtractor(ImgurExtractor):
directory_fmt = ("{category}", "{album[id]}{album[title]:? - //}")
filename_fmt = "{category}_{album[id]}_{num:>03}_{id}.{extension}"
archive_fmt = "{album[id]}_{id}"
- pattern = BASE_PATTERN + r"/a/(?:[^/?#]+-)?(\w{7}|\w{5})"
+ pattern = rf"{BASE_PATTERN}/a/(?:[^/?#]+-)?(\w{{7}}|\w{{5}})"
example = "https://imgur.com/a/abcde"
def items(self):
@@ -106,7 +106,7 @@ class ImgurAlbumExtractor(ImgurExtractor):
del album["media"]
count = len(images)
- album["date"] = text.parse_datetime(album["created_at"])
+ album["date"] = self.parse_datetime_iso(album["created_at"])
try:
del album["ad_url"]
@@ -119,14 +119,15 @@ class ImgurAlbumExtractor(ImgurExtractor):
image["num"] = num
image["count"] = count
image["album"] = album
- yield Message.Directory, image
+ yield Message.Directory, "", image
yield Message.Url, url, image
class ImgurGalleryExtractor(ImgurExtractor):
"""Extractor for imgur galleries"""
subcategory = "gallery"
- pattern = BASE_PATTERN + r"/(?:gallery|t/\w+)/(?:[^/?#]+-)?(\w{7}|\w{5})"
+ pattern = (rf"{BASE_PATTERN}/"
+ rf"(?:gallery|t/\w+)/(?:[^/?#]+-)?(\w{{7}}|\w{{5}})")
example = "https://imgur.com/gallery/abcde"
def items(self):
@@ -142,7 +143,7 @@ class ImgurGalleryExtractor(ImgurExtractor):
class ImgurUserExtractor(ImgurExtractor):
"""Extractor for all images posted by a user"""
subcategory = "user"
- pattern = (BASE_PATTERN + r"/user/(?!me(?:/|$|\?|#))"
+ pattern = (rf"{BASE_PATTERN}/user/(?!me(?:/|$|\?|#))"
r"([^/?#]+)(?:/posts|/submitted)?/?$")
example = "https://imgur.com/user/USER"
@@ -153,7 +154,7 @@ class ImgurUserExtractor(ImgurExtractor):
class ImgurFavoriteExtractor(ImgurExtractor):
"""Extractor for a user's favorites"""
subcategory = "favorite"
- pattern = BASE_PATTERN + r"/user/([^/?#]+)/favorites/?$"
+ pattern = rf"{BASE_PATTERN}/user/([^/?#]+)/favorites/?$"
example = "https://imgur.com/user/USER/favorites"
def items(self):
@@ -163,7 +164,7 @@ class ImgurFavoriteExtractor(ImgurExtractor):
class ImgurFavoriteFolderExtractor(ImgurExtractor):
"""Extractor for a user's favorites folder"""
subcategory = "favorite-folder"
- pattern = BASE_PATTERN + r"/user/([^/?#]+)/favorites/folder/(\d+)"
+ pattern = rf"{BASE_PATTERN}/user/([^/?#]+)/favorites/folder/(\d+)"
example = "https://imgur.com/user/USER/favorites/folder/12345/TITLE"
def __init__(self, match):
@@ -178,7 +179,7 @@ class ImgurFavoriteFolderExtractor(ImgurExtractor):
class ImgurMeExtractor(ImgurExtractor):
"""Extractor for your personal uploads"""
subcategory = "me"
- pattern = BASE_PATTERN + r"/user/me(?:/posts)?(/hidden)?"
+ pattern = rf"{BASE_PATTERN}/user/me(?:/posts)?(/hidden)?"
example = "https://imgur.com/user/me"
def items(self):
@@ -195,7 +196,7 @@ class ImgurMeExtractor(ImgurExtractor):
class ImgurSubredditExtractor(ImgurExtractor):
"""Extractor for a subreddits's imgur links"""
subcategory = "subreddit"
- pattern = BASE_PATTERN + r"/r/([^/?#]+)/?$"
+ pattern = rf"{BASE_PATTERN}/r/([^/?#]+)/?$"
example = "https://imgur.com/r/SUBREDDIT"
def items(self):
@@ -205,7 +206,7 @@ class ImgurSubredditExtractor(ImgurExtractor):
class ImgurTagExtractor(ImgurExtractor):
"""Extractor for imgur tag searches"""
subcategory = "tag"
- pattern = BASE_PATTERN + r"/t/([^/?#]+)$"
+ pattern = rf"{BASE_PATTERN}/t/([^/?#]+)$"
example = "https://imgur.com/t/TAG"
def items(self):
@@ -215,7 +216,7 @@ class ImgurTagExtractor(ImgurExtractor):
class ImgurSearchExtractor(ImgurExtractor):
"""Extractor for imgur search results"""
subcategory = "search"
- pattern = BASE_PATTERN + r"/search(?:/[^?#]+)?/?\?q=([^&#]+)"
+ pattern = rf"{BASE_PATTERN}/search(?:/[^?#]+)?/?\?q=([^&#]+)"
example = "https://imgur.com/search?q=UERY"
def items(self):
diff --git a/gallery_dl/extractor/imhentai.py b/gallery_dl/extractor/imhentai.py
index 5ad1c30..d83dcc8 100644
--- a/gallery_dl/extractor/imhentai.py
+++ b/gallery_dl/extractor/imhentai.py
@@ -79,7 +79,7 @@ BASE_PATTERN = ImhentaiExtractor.update({
class ImhentaiGalleryExtractor(ImhentaiExtractor, GalleryExtractor):
"""Extractor for imhentai galleries"""
- pattern = BASE_PATTERN + r"/(?:gallery|view)/(\d+)"
+ pattern = rf"{BASE_PATTERN}/(?:gallery|view)/(\d+)"
example = "https://imhentai.xxx/gallery/12345/"
def __init__(self, match):
@@ -141,7 +141,7 @@ class ImhentaiGalleryExtractor(ImhentaiExtractor, GalleryExtractor):
class ImhentaiTagExtractor(ImhentaiExtractor):
"""Extractor for imhentai tag searches"""
subcategory = "tag"
- pattern = (BASE_PATTERN + r"(/(?:"
+ pattern = (rf"{BASE_PATTERN}(/(?:"
r"artist|category|character|group|language|parody|tag"
r")/([^/?#]+))")
example = "https://imhentai.xxx/tag/TAG/"
@@ -154,9 +154,8 @@ class ImhentaiTagExtractor(ImhentaiExtractor):
class ImhentaiSearchExtractor(ImhentaiExtractor):
"""Extractor for imhentai search results"""
subcategory = "search"
- pattern = BASE_PATTERN + r"/search(/?\?[^#]+|/[^/?#]+/?)"
+ pattern = rf"{BASE_PATTERN}(/(?:advanced-)?search/?\?[^#]+|/[^/?#]+/?)"
example = "https://imhentai.xxx/search/?key=QUERY"
def items(self):
- url = self.root + "/search" + self.groups[-1]
- return self._pagination(url)
+ return self._pagination(self.root + self.groups[-1])
diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py
index 45ae52e..547d4ee 100644
--- a/gallery_dl/extractor/inkbunny.py
+++ b/gallery_dl/extractor/inkbunny.py
@@ -35,8 +35,8 @@ class InkbunnyExtractor(Extractor):
for post in self.posts():
post.update(metadata)
- post["date"] = text.parse_datetime(
- post["create_datetime"] + "00", "%Y-%m-%d %H:%M:%S.%f%z")
+ post["date"] = self.parse_datetime_iso(
+ post["create_datetime"][:19])
post["tags"] = [kw["keyword_name"] for kw in post["keywords"]]
post["ratings"] = [r["name"] for r in post["ratings"]]
files = post["files"]
@@ -48,12 +48,12 @@ class InkbunnyExtractor(Extractor):
del post["keywords"]
del post["files"]
- yield Message.Directory, post
+ yield Message.Directory, "", post
for post["num"], file in enumerate(files, 1):
post.update(file)
post["deleted"] = (file["deleted"] == "t")
- post["date"] = text.parse_datetime(
- file["create_datetime"] + "00", "%Y-%m-%d %H:%M:%S.%f%z")
+ post["date"] = self.parse_datetime_iso(
+ file["create_datetime"][:19])
text.nameext_from_url(file["file_name"], post)
url = file["file_url_full"]
@@ -71,7 +71,7 @@ class InkbunnyExtractor(Extractor):
class InkbunnyUserExtractor(InkbunnyExtractor):
"""Extractor for inkbunny user profiles"""
subcategory = "user"
- pattern = BASE_PATTERN + r"/(?!s/)(gallery/|scraps/)?(\w+)(?:$|[/?#])"
+ pattern = rf"{BASE_PATTERN}/(?!s/)(gallery/|scraps/)?(\w+)(?:$|[/?#])"
example = "https://inkbunny.net/USER"
def __init__(self, match):
@@ -101,7 +101,7 @@ class InkbunnyUserExtractor(InkbunnyExtractor):
class InkbunnyPoolExtractor(InkbunnyExtractor):
"""Extractor for inkbunny pools"""
subcategory = "pool"
- pattern = (BASE_PATTERN + r"/(?:"
+ pattern = (rf"{BASE_PATTERN}/(?:"
r"poolview_process\.php\?pool_id=(\d+)|"
r"submissionsviewall\.php"
r"\?((?:[^#]+&)?mode=pool(?:&[^#]+)?))")
@@ -132,7 +132,7 @@ class InkbunnyFavoriteExtractor(InkbunnyExtractor):
"""Extractor for inkbunny user favorites"""
subcategory = "favorite"
directory_fmt = ("{category}", "{favs_username!l}", "Favorites")
- pattern = (BASE_PATTERN + r"/(?:"
+ pattern = (rf"{BASE_PATTERN}/(?:"
r"userfavorites_process\.php\?favs_user_id=(\d+)|"
r"submissionsviewall\.php"
r"\?((?:[^#]+&)?mode=userfavs(?:&[^#]+)?))")
@@ -175,7 +175,7 @@ class InkbunnyFavoriteExtractor(InkbunnyExtractor):
class InkbunnyUnreadExtractor(InkbunnyExtractor):
"""Extractor for unread inkbunny submissions"""
subcategory = "unread"
- pattern = (BASE_PATTERN + r"/submissionsviewall\.php"
+ pattern = (rf"{BASE_PATTERN}/submissionsviewall\.php"
r"\?((?:[^#]+&)?mode=unreadsubs(?:&[^#]+)?)")
example = ("https://inkbunny.net/submissionsviewall.php"
"?text=&mode=unreadsubs&type=")
@@ -195,7 +195,7 @@ class InkbunnyUnreadExtractor(InkbunnyExtractor):
class InkbunnySearchExtractor(InkbunnyExtractor):
"""Extractor for inkbunny search results"""
subcategory = "search"
- pattern = (BASE_PATTERN + r"/submissionsviewall\.php"
+ pattern = (rf"{BASE_PATTERN}/submissionsviewall\.php"
r"\?((?:[^#]+&)?mode=search(?:&[^#]+)?)")
example = ("https://inkbunny.net/submissionsviewall.php"
"?text=TAG&mode=search&type=")
@@ -229,7 +229,7 @@ class InkbunnySearchExtractor(InkbunnyExtractor):
class InkbunnyFollowingExtractor(InkbunnyExtractor):
"""Extractor for inkbunny user watches"""
subcategory = "following"
- pattern = (BASE_PATTERN + r"/(?:"
+ pattern = (rf"{BASE_PATTERN}/(?:"
r"watchlist_process\.php\?mode=watching&user_id=(\d+)|"
r"usersviewall\.php"
r"\?((?:[^#]+&)?mode=watching(?:&[^#]+)?))")
@@ -268,7 +268,7 @@ class InkbunnyFollowingExtractor(InkbunnyExtractor):
class InkbunnyPostExtractor(InkbunnyExtractor):
"""Extractor for individual Inkbunny posts"""
subcategory = "post"
- pattern = BASE_PATTERN + r"/s/(\d+)"
+ pattern = rf"{BASE_PATTERN}/s/(\d+)"
example = "https://inkbunny.net/s/12345"
def __init__(self, match):
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 71964e9..b89369f 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -16,7 +16,7 @@ import itertools
import binascii
BASE_PATTERN = r"(?:https?://)?(?:www\.)?instagram\.com"
-USER_PATTERN = BASE_PATTERN + r"/(?!(?:p|tv|reel|explore|stories)/)([^/?#]+)"
+USER_PATTERN = rf"{BASE_PATTERN}/(?!(?:p|tv|reel|explore|stories)/)([^/?#]+)"
class InstagramExtractor(Extractor):
@@ -38,7 +38,7 @@ class InstagramExtractor(Extractor):
def _init(self):
self.www_claim = "0"
self.csrf_token = util.generate_token()
- self._find_tags = util.re(r"#\w+").findall
+ self._find_tags = text.re(r"#\w+").findall
self._logged_in = True
self._cursor = None
self._user = None
@@ -62,8 +62,10 @@ class InstagramExtractor(Extractor):
data = self.metadata()
if videos := self.config("videos", True):
- videos_dash = (videos != "merged")
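+ # store the DASH preference on the extractor for later use when
+ # selecting media dimensions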
+ self.videos_dash = videos_dash = (videos != "merged")
videos_headers = {"User-Agent": "Mozilla/5.0"}
+ else:
+ self.videos_dash = False
previews = self.config("previews", False)
max_posts = self.config("max-posts")
@@ -86,7 +88,7 @@ class InstagramExtractor(Extractor):
files = post.pop("_files")
post["count"] = len(files)
- yield Message.Directory, post
+ yield Message.Directory, "", post
if "date" in post:
del post["date"]
@@ -173,7 +175,7 @@ class InstagramExtractor(Extractor):
post_url = f"{self.root}/stories/highlights/{reel_id}/"
data = {
"user" : post.get("user"),
- "expires": text.parse_timestamp(expires),
+ "expires": self.parse_timestamp(expires),
"post_id": reel_id,
"post_shortcode": shortcode_from_id(reel_id),
"post_url": post_url,
@@ -224,7 +226,7 @@ class InstagramExtractor(Extractor):
data["owner_id"] = owner["pk"]
data["username"] = owner.get("username")
data["fullname"] = owner.get("full_name")
- data["post_date"] = data["date"] = text.parse_timestamp(
+ data["post_date"] = data["date"] = self.parse_timestamp(
post.get("taken_at") or post.get("created_at") or post.get("seen"))
data["_files"] = files = []
for num, item in enumerate(items, 1):
@@ -236,13 +238,23 @@ class InstagramExtractor(Extractor):
data["post_shortcode"])
continue
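+ # remember the original dimensions: DASH-capable videos report them,
+ # while plain video and image variants report their own size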
+ width_orig = item.get("original_width", 0)
+ height_orig = item.get("original_height", 0)
+
if video_versions := item.get("video_versions"):
video = max(
video_versions,
key=lambda x: (x["width"], x["height"], x["type"]),
)
- manifest = item.get("video_dash_manifest")
+
media = video
+ if (manifest := item.get("video_dash_manifest")) and \
+ self.videos_dash:
+ width = width_orig
+ height = height_orig
+ else:
+ width = video["width"]
+ height = video["height"]
if self._warn_video:
self._warn_video = False
@@ -254,22 +266,21 @@ class InstagramExtractor(Extractor):
else:
video = manifest = None
media = image
+ width = image["width"]
+ height = image["height"]
- if self._warn_image < (
- (image["width"] < item.get("original_width", 0)) +
- (image["height"] < item.get("original_height", 0))):
+ if self._warn_image < ((width < width_orig) +
+ (height < height_orig)):
self.log.warning(
"%s: Available image resolutions lower than the "
"original (%sx%s < %sx%s). "
"Consider refreshing your cookies.",
data["post_shortcode"],
- image["width"], image["height"],
- item.get("original_width", 0),
- item.get("original_height", 0))
+ width, height, width_orig, height_orig)
media = {
"num" : num,
- "date" : text.parse_timestamp(item.get("taken_at") or
+ "date" : self.parse_timestamp(item.get("taken_at") or
media.get("taken_at") or
post.get("taken_at")),
"media_id" : item["pk"],
@@ -277,8 +288,10 @@ class InstagramExtractor(Extractor):
shortcode_from_id(item["pk"])),
"display_url": image["url"],
"video_url" : video["url"] if video else None,
- "width" : media["width"],
- "height" : media["height"],
+ "width" : width,
+ "width_original" : width_orig,
+ "height" : height,
+ "height_original": height_orig,
}
if manifest is not None:
@@ -288,7 +301,9 @@ class InstagramExtractor(Extractor):
if "reshared_story_media_author" in item:
media["author"] = item["reshared_story_media_author"]
if "expiring_at" in item:
- media["expires"] = text.parse_timestamp(post["expiring_at"])
+ media["expires"] = self.parse_timestamp(post["expiring_at"])
+ if "subscription_media_visibility" in item:
+ media["subscription"] = item["subscription_media_visibility"]
self._extract_tagged_users(item, media)
files.append(media)
@@ -331,7 +346,7 @@ class InstagramExtractor(Extractor):
"post_id" : post["id"],
"post_shortcode": post["shortcode"],
"post_url" : f"{self.root}/p/{post['shortcode']}/",
- "post_date" : text.parse_timestamp(post["taken_at_timestamp"]),
+ "post_date" : self.parse_timestamp(post["taken_at_timestamp"]),
"description": text.parse_unicode_escapes("\n".join(
edge["node"]["text"]
for edge in post["edge_media_to_caption"]["edges"]
@@ -490,7 +505,7 @@ class InstagramPostExtractor(InstagramExtractor):
class InstagramUserExtractor(Dispatch, InstagramExtractor):
"""Extractor for an Instagram user profile"""
- pattern = USER_PATTERN + r"/?(?:$|[?#])"
+ pattern = rf"{USER_PATTERN}/?(?:$|[?#])"
example = "https://www.instagram.com/USER/"
def items(self):
@@ -510,7 +525,7 @@ class InstagramUserExtractor(Dispatch, InstagramExtractor):
class InstagramPostsExtractor(InstagramExtractor):
"""Extractor for an Instagram user's posts"""
subcategory = "posts"
- pattern = USER_PATTERN + r"/posts"
+ pattern = rf"{USER_PATTERN}/posts"
example = "https://www.instagram.com/USER/posts/"
def posts(self):
@@ -527,7 +542,7 @@ class InstagramPostsExtractor(InstagramExtractor):
class InstagramReelsExtractor(InstagramExtractor):
"""Extractor for an Instagram user's reels"""
subcategory = "reels"
- pattern = USER_PATTERN + r"/reels"
+ pattern = rf"{USER_PATTERN}/reels"
example = "https://www.instagram.com/USER/reels/"
def posts(self):
@@ -544,7 +559,7 @@ class InstagramReelsExtractor(InstagramExtractor):
class InstagramTaggedExtractor(InstagramExtractor):
"""Extractor for an Instagram user's tagged posts"""
subcategory = "tagged"
- pattern = USER_PATTERN + r"/tagged"
+ pattern = rf"{USER_PATTERN}/tagged"
example = "https://www.instagram.com/USER/tagged/"
def metadata(self):
@@ -570,7 +585,7 @@ class InstagramTaggedExtractor(InstagramExtractor):
class InstagramGuideExtractor(InstagramExtractor):
"""Extractor for an Instagram guide"""
subcategory = "guide"
- pattern = USER_PATTERN + r"/guide/[^/?#]+/(\d+)"
+ pattern = rf"{USER_PATTERN}/guide/[^/?#]+/(\d+)"
example = "https://www.instagram.com/USER/guide/NAME/12345"
def __init__(self, match):
@@ -587,7 +602,7 @@ class InstagramGuideExtractor(InstagramExtractor):
class InstagramSavedExtractor(InstagramExtractor):
"""Extractor for an Instagram user's saved media"""
subcategory = "saved"
- pattern = USER_PATTERN + r"/saved(?:/all-posts)?/?$"
+ pattern = rf"{USER_PATTERN}/saved(?:/all-posts)?/?$"
example = "https://www.instagram.com/USER/saved/"
def posts(self):
@@ -597,7 +612,7 @@ class InstagramSavedExtractor(InstagramExtractor):
class InstagramCollectionExtractor(InstagramExtractor):
"""Extractor for Instagram collection"""
subcategory = "collection"
- pattern = USER_PATTERN + r"/saved/([^/?#]+)/([^/?#]+)"
+ pattern = rf"{USER_PATTERN}/saved/([^/?#]+)/([^/?#]+)"
example = "https://www.instagram.com/USER/saved/COLLECTION/12345"
def __init__(self, match):
@@ -623,7 +638,7 @@ class InstagramStoriesTrayExtractor(InstagramExtractor):
def items(self):
base = f"{self.root}/stories/id:"
for story in self.api.reels_tray():
- story["date"] = text.parse_timestamp(story["latest_reel_media"])
+ story["date"] = self.parse_timestamp(story["latest_reel_media"])
story["_extractor"] = InstagramStoriesExtractor
yield Message.Queue, f"{base}{story['id']}/", story
@@ -681,7 +696,7 @@ class InstagramStoriesExtractor(InstagramExtractor):
class InstagramHighlightsExtractor(InstagramExtractor):
"""Extractor for an Instagram user's story highlights"""
subcategory = "highlights"
- pattern = USER_PATTERN + r"/highlights"
+ pattern = rf"{USER_PATTERN}/highlights"
example = "https://www.instagram.com/USER/highlights/"
def posts(self):
@@ -692,7 +707,7 @@ class InstagramHighlightsExtractor(InstagramExtractor):
class InstagramFollowersExtractor(InstagramExtractor):
"""Extractor for an Instagram user's followers"""
subcategory = "followers"
- pattern = USER_PATTERN + r"/followers"
+ pattern = rf"{USER_PATTERN}/followers"
example = "https://www.instagram.com/USER/followers/"
def items(self):
@@ -706,7 +721,7 @@ class InstagramFollowersExtractor(InstagramExtractor):
class InstagramFollowingExtractor(InstagramExtractor):
"""Extractor for an Instagram user's followed users"""
subcategory = "following"
- pattern = USER_PATTERN + r"/following"
+ pattern = rf"{USER_PATTERN}/following"
example = "https://www.instagram.com/USER/following/"
def items(self):
@@ -721,7 +736,7 @@ class InstagramTagExtractor(InstagramExtractor):
"""Extractor for Instagram tags"""
subcategory = "tag"
directory_fmt = ("{category}", "{subcategory}", "{tag}")
- pattern = BASE_PATTERN + r"/explore/tags/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/explore/tags/([^/?#]+)"
example = "https://www.instagram.com/explore/tags/TAG/"
def metadata(self):
@@ -734,7 +749,7 @@ class InstagramTagExtractor(InstagramExtractor):
class InstagramInfoExtractor(InstagramExtractor):
"""Extractor for an Instagram user's profile data"""
subcategory = "info"
- pattern = USER_PATTERN + r"/info"
+ pattern = rf"{USER_PATTERN}/info"
example = "https://www.instagram.com/USER/info/"
def items(self):
@@ -744,13 +759,13 @@ class InstagramInfoExtractor(InstagramExtractor):
else:
user = self.api.user_by_name(screen_name)
- return iter(((Message.Directory, user),))
+ return iter(((Message.Directory, "", user),))
class InstagramAvatarExtractor(InstagramExtractor):
"""Extractor for an Instagram user's avatar"""
subcategory = "avatar"
- pattern = USER_PATTERN + r"/avatar"
+ pattern = rf"{USER_PATTERN}/avatar"
example = "https://www.instagram.com/USER/avatar/"
def posts(self):
@@ -858,8 +873,11 @@ class InstagramRestAPI():
def user_by_name(self, screen_name):
endpoint = "/v1/users/web_profile_info/"
params = {"username": screen_name}
- return self._call(
- endpoint, params=params, notfound="user")["data"]["user"]
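+ # a response lacking user data indicates a nonexistent or unavailable
+ # profile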
+ try:
+ return self._call(
+ endpoint, params=params, notfound="user")["data"]["user"]
+ except KeyError:
+ raise exception.NotFoundError("user")
@memcache(keyarg=1)
def user_by_id(self, user_id):
diff --git a/gallery_dl/extractor/issuu.py b/gallery_dl/extractor/issuu.py
index 06c5caa..c3fbf8d 100644
--- a/gallery_dl/extractor/issuu.py
+++ b/gallery_dl/extractor/issuu.py
@@ -36,8 +36,8 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
'{"":' + data.replace('\\"', '"')))
doc = data["initialDocumentData"]["document"]
- doc["date"] = text.parse_datetime(
- doc["originalPublishDateInISOString"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ doc["date"] = self.parse_datetime_iso(
+ doc["originalPublishDateInISOString"])
self.count = text.parse_int(doc["pageCount"])
self.base = (f"https://image.isu.pub/{doc['revisionId']}-"
@@ -68,7 +68,7 @@ class IssuuUserExtractor(IssuuBase, Extractor):
data = text.extr(html, '\\"docs\\":', '}]\\n"]')
docs = util.json_loads(data.replace('\\"', '"'))
except Exception as exc:
- self.log.debug("", exc_info=exc)
+ self.log.traceback(exc)
return
for publication in docs:
diff --git a/gallery_dl/extractor/itaku.py b/gallery_dl/extractor/itaku.py
index 19ffc50..566ee8b 100644
--- a/gallery_dl/extractor/itaku.py
+++ b/gallery_dl/extractor/itaku.py
@@ -13,7 +13,7 @@ from ..cache import memcache
from .. import text, util
BASE_PATTERN = r"(?:https?://)?itaku\.ee"
-USER_PATTERN = BASE_PATTERN + r"/profile/([^/?#]+)"
+USER_PATTERN = rf"{BASE_PATTERN}/profile/([^/?#]+)"
class ItakuExtractor(Extractor):
@@ -32,8 +32,7 @@ class ItakuExtractor(Extractor):
def items(self):
if images := self.images():
for image in images:
- image["date"] = text.parse_datetime(
- image["date_added"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ image["date"] = self.parse_datetime_iso(image["date_added"])
for category, tags in image.pop("categorized_tags").items():
image[f"tags_{category.lower()}"] = [
t["name"] for t in tags]
@@ -52,7 +51,7 @@ class ItakuExtractor(Extractor):
else:
url = image["image"]
- yield Message.Directory, image
+ yield Message.Directory, "", image
yield Message.Url, url, text.nameext_from_url(url, image)
return
@@ -60,15 +59,14 @@ class ItakuExtractor(Extractor):
for post in posts:
images = post.pop("gallery_images") or ()
post["count"] = len(images)
- post["date"] = text.parse_datetime(
- post["date_added"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ post["date"] = self.parse_datetime_iso(post["date_added"])
post["tags"] = [t["name"] for t in post["tags"]]
- yield Message.Directory, post
+ yield Message.Directory, "", post
for post["num"], image in enumerate(images, 1):
post["file"] = image
- image["date"] = text.parse_datetime(
- image["date_added"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ image["date"] = self.parse_datetime_iso(
+ image["date_added"])
url = image["image"]
yield Message.Url, url, text.nameext_from_url(url, post)
@@ -88,7 +86,7 @@ class ItakuExtractor(Extractor):
class ItakuGalleryExtractor(ItakuExtractor):
"""Extractor for an itaku user's gallery"""
subcategory = "gallery"
- pattern = USER_PATTERN + r"/gallery(?:/(\d+))?"
+ pattern = rf"{USER_PATTERN}/gallery(?:/(\d+))?"
example = "https://itaku.ee/profile/USER/gallery"
def images(self):
@@ -106,7 +104,7 @@ class ItakuPostsExtractor(ItakuExtractor):
"{id}{title:? //}")
filename_fmt = "{file[id]}{file[title]:? //}.{extension}"
archive_fmt = "{id}_{file[id]}"
- pattern = USER_PATTERN + r"/posts(?:/(\d+))?"
+ pattern = rf"{USER_PATTERN}/posts(?:/(\d+))?"
example = "https://itaku.ee/profile/USER/posts"
def posts(self):
@@ -120,7 +118,7 @@ class ItakuPostsExtractor(ItakuExtractor):
class ItakuStarsExtractor(ItakuExtractor):
"""Extractor for an itaku user's starred images"""
subcategory = "stars"
- pattern = USER_PATTERN + r"/stars(?:/(\d+))?"
+ pattern = rf"{USER_PATTERN}/stars(?:/(\d+))?"
example = "https://itaku.ee/profile/USER/stars"
def images(self):
@@ -134,7 +132,7 @@ class ItakuStarsExtractor(ItakuExtractor):
class ItakuFollowingExtractor(ItakuExtractor):
subcategory = "following"
- pattern = USER_PATTERN + r"/following"
+ pattern = rf"{USER_PATTERN}/following"
example = "https://itaku.ee/profile/USER/following"
def users(self):
@@ -145,7 +143,7 @@ class ItakuFollowingExtractor(ItakuExtractor):
class ItakuFollowersExtractor(ItakuExtractor):
subcategory = "followers"
- pattern = USER_PATTERN + r"/followers"
+ pattern = rf"{USER_PATTERN}/followers"
example = "https://itaku.ee/profile/USER/followers"
def users(self):
@@ -157,7 +155,7 @@ class ItakuFollowersExtractor(ItakuExtractor):
class ItakuBookmarksExtractor(ItakuExtractor):
"""Extractor for an itaku bookmarks folder"""
subcategory = "bookmarks"
- pattern = USER_PATTERN + r"/bookmarks/(image|user)/(\d+)"
+ pattern = rf"{USER_PATTERN}/bookmarks/(image|user)/(\d+)"
example = "https://itaku.ee/profile/USER/bookmarks/image/12345"
def _init(self):
@@ -178,23 +176,23 @@ class ItakuBookmarksExtractor(ItakuExtractor):
class ItakuUserExtractor(Dispatch, ItakuExtractor):
"""Extractor for itaku user profiles"""
- pattern = USER_PATTERN + r"/?(?:$|\?|#)"
+ pattern = rf"{USER_PATTERN}/?(?:$|\?|#)"
example = "https://itaku.ee/profile/USER"
def items(self):
base = f"{self.root}/profile/{self.groups[0]}/"
return self._dispatch_extractors((
- (ItakuGalleryExtractor , base + "gallery"),
- (ItakuPostsExtractor , base + "posts"),
- (ItakuFollowersExtractor, base + "followers"),
- (ItakuFollowingExtractor, base + "following"),
- (ItakuStarsExtractor , base + "stars"),
+ (ItakuGalleryExtractor , f"{base}gallery"),
+ (ItakuPostsExtractor , f"{base}posts"),
+ (ItakuFollowersExtractor, f"{base}followers"),
+ (ItakuFollowingExtractor, f"{base}following"),
+ (ItakuStarsExtractor , f"{base}stars"),
), ("gallery",))
class ItakuImageExtractor(ItakuExtractor):
subcategory = "image"
- pattern = BASE_PATTERN + r"/images/(\d+)"
+ pattern = rf"{BASE_PATTERN}/images/(\d+)"
example = "https://itaku.ee/images/12345"
def images(self):
@@ -207,7 +205,7 @@ class ItakuPostExtractor(ItakuExtractor):
"{id}{title:? //}")
filename_fmt = "{file[id]}{file[title]:? //}.{extension}"
archive_fmt = "{id}_{file[id]}"
- pattern = BASE_PATTERN + r"/posts/(\d+)"
+ pattern = rf"{BASE_PATTERN}/posts/(\d+)"
example = "https://itaku.ee/posts/12345"
def posts(self):
@@ -216,7 +214,7 @@ class ItakuPostExtractor(ItakuExtractor):
class ItakuSearchExtractor(ItakuExtractor):
subcategory = "search"
- pattern = BASE_PATTERN + r"/home/images/?\?([^#]+)"
+ pattern = rf"{BASE_PATTERN}/home/images/?\?([^#]+)"
example = "https://itaku.ee/home/images?tags=SEARCH"
def images(self):
@@ -248,7 +246,7 @@ class ItakuAPI():
def __init__(self, extractor):
self.extractor = extractor
- self.root = extractor.root + "/api"
+ self.root = f"{extractor.root}/api"
self.headers = {
"Accept": "application/json, text/plain, */*",
}
@@ -259,7 +257,7 @@ class ItakuAPI():
"cursor" : None,
"date_range": "",
"maturity_rating": ("SFW", "Questionable", "NSFW"),
- "ordering" : "-date_added",
+ "ordering" : self._order(),
"page" : "1",
"page_size" : "30",
"visibility": ("PUBLIC", "PROFILE_ONLY"),
@@ -273,7 +271,7 @@ class ItakuAPI():
"cursor" : None,
"date_range": "",
"maturity_rating": ("SFW", "Questionable", "NSFW"),
- "ordering" : "-date_added",
+ "ordering" : self._order(),
"page" : "1",
"page_size" : "30",
**params,
@@ -284,7 +282,7 @@ class ItakuAPI():
endpoint = "/user_profiles/"
params = {
"cursor" : None,
- "ordering" : "-date_added",
+ "ordering" : self._order(),
"page" : "1",
"page_size": "50",
"sfw_only" : "false",
@@ -311,7 +309,7 @@ class ItakuAPI():
def _call(self, endpoint, params=None):
if not endpoint.startswith("http"):
- endpoint = self.root + endpoint
+ endpoint = f"{self.root}{endpoint}"
return self.extractor.request_json(
endpoint, params=params, headers=self.headers)
@@ -330,3 +328,11 @@ class ItakuAPI():
return
data = self._call(url_next)
+
+ def _order(self):
+ if order := self.extractor.config("order"):
+ if order in {"a", "asc", "r", "reverse"}:
+ return "date_added"
+ if order not in {"d", "desc"}:
+ return order
+ return "-date_added"
diff --git a/gallery_dl/extractor/itchio.py b/gallery_dl/extractor/itchio.py
index 6312e58..6fefe33 100644
--- a/gallery_dl/extractor/itchio.py
+++ b/gallery_dl/extractor/itchio.py
@@ -57,5 +57,5 @@ class ItchioGameExtractor(Extractor):
game = {"game": game, "user": user, "id": upload_id}
url = info["url"]
- yield Message.Directory, game
+ yield Message.Directory, "", game
yield Message.Url, url, text.nameext_from_url(url, game)
diff --git a/gallery_dl/extractor/iwara.py b/gallery_dl/extractor/iwara.py
index 8af2f42..d9d1cf0 100644
--- a/gallery_dl/extractor/iwara.py
+++ b/gallery_dl/extractor/iwara.py
@@ -47,7 +47,7 @@ class IwaraExtractor(Extractor):
group_info["type"] = "image"
group_info["count"] = len(files)
- yield Message.Directory, group_info
+ yield Message.Directory, "", group_info
for num, file in enumerate(files, 1):
file_info = self.extract_media_info(file, None)
file_id = file_info["file_id"]
@@ -78,7 +78,7 @@ class IwaraExtractor(Extractor):
video["id"], exc.__class__.__name__, exc)
continue
- yield Message.Directory, info
+ yield Message.Directory, "", info
yield Message.Url, f"https:{download_url}", info
def items_user(self, users, key=None):
@@ -122,10 +122,10 @@ class IwaraExtractor(Extractor):
info["file_id"] = file_info.get("id")
info["filename"] = filename
info["extension"] = extension
- info["date"] = text.parse_datetime(
- file_info.get("createdAt"), "%Y-%m-%dT%H:%M:%S.%fZ")
- info["date_updated"] = text.parse_datetime(
- file_info.get("updatedAt"), "%Y-%m-%dT%H:%M:%S.%fZ")
+ info["date"] = self.parse_datetime_iso(
+ file_info.get("createdAt"))
+ info["date_updated"] = self.parse_datetime_iso(
+ file_info.get("updatedAt"))
info["mime"] = file_info.get("mime")
info["size"] = file_info.get("size")
info["width"] = file_info.get("width")
@@ -144,8 +144,7 @@ class IwaraExtractor(Extractor):
"status" : user.get("status"),
"role" : user.get("role"),
"premium": user.get("premium"),
- "date" : text.parse_datetime(
- user.get("createdAt"), "%Y-%m-%dT%H:%M:%S.000Z"),
+ "date" : self.parse_datetime_iso(user.get("createdAt")),
"description": profile.get("body"),
}
diff --git a/gallery_dl/extractor/jschan.py b/gallery_dl/extractor/jschan.py
index 5f3e75a..5dacf70 100644
--- a/gallery_dl/extractor/jschan.py
+++ b/gallery_dl/extractor/jschan.py
@@ -30,7 +30,7 @@ class JschanThreadExtractor(JschanExtractor):
"{threadId} {subject|nomarkup[:50]}")
filename_fmt = "{postId}{num:?-//} {filename}.{extension}"
archive_fmt = "{board}_{postId}_{num}"
- pattern = BASE_PATTERN + r"/([^/?#]+)/thread/(\d+)\.html"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/thread/(\d+)\.html"
example = "https://94chan.org/a/thread/12345.html"
def items(self):
@@ -39,7 +39,7 @@ class JschanThreadExtractor(JschanExtractor):
thread["threadId"] = thread["postId"]
posts = thread.pop("replies", ())
- yield Message.Directory, thread
+ yield Message.Directory, "", thread
for post in itertools.chain((thread,), posts):
if files := post.pop("files", ()):
thread.update(post)
@@ -56,7 +56,7 @@ class JschanThreadExtractor(JschanExtractor):
class JschanBoardExtractor(JschanExtractor):
"""Extractor for jschan boards"""
subcategory = "board"
- pattern = (BASE_PATTERN + r"/([^/?#]+)"
+ pattern = (rf"{BASE_PATTERN}/([^/?#]+)"
r"(?:/index\.html|/catalog\.html|/\d+\.html|/?$)")
example = "https://94chan.org/a/"
diff --git a/gallery_dl/extractor/kabeuchi.py b/gallery_dl/extractor/kabeuchi.py
index c259c47..88f2e32 100644
--- a/gallery_dl/extractor/kabeuchi.py
+++ b/gallery_dl/extractor/kabeuchi.py
@@ -32,9 +32,8 @@ class KabeuchiUserExtractor(Extractor):
if post.get("is_ad") or not post["image1"]:
continue
- post["date"] = text.parse_datetime(
- post["created_at"], "%Y-%m-%d %H:%M:%S")
- yield Message.Directory, post
+ post["date"] = self.parse_datetime_iso(post["created_at"])
+ yield Message.Directory, "", post
for key in keys:
name = post[key]
diff --git a/gallery_dl/extractor/keenspot.py b/gallery_dl/extractor/keenspot.py
index f55a930..3c1eb24 100644
--- a/gallery_dl/extractor/keenspot.py
+++ b/gallery_dl/extractor/keenspot.py
@@ -34,7 +34,7 @@ class KeenspotComicExtractor(Extractor):
def items(self):
data = {"comic": self.comic}
- yield Message.Directory, data
+ yield Message.Directory, "", data
with self.request(self.root + "/") as response:
if response.history:
diff --git a/gallery_dl/extractor/kemono.py b/gallery_dl/extractor/kemono.py
index b4a8abc..bf35670 100644
--- a/gallery_dl/extractor/kemono.py
+++ b/gallery_dl/extractor/kemono.py
@@ -16,7 +16,7 @@ import json
BASE_PATTERN = (r"(?:https?://)?(?:www\.|beta\.)?"
r"(kemono|coomer)\.(cr|s[tu]|party)")
-USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)"
+USER_PATTERN = rf"{BASE_PATTERN}/([^/?#]+)/user/([^/?#]+)"
HASH_PATTERN = r"/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})"
@@ -44,7 +44,7 @@ class KemonoExtractor(Extractor):
order = self.config("order-revisions")
self.revisions_reverse = order[0] in ("r", "a") if order else False
- self._find_inline = util.re(
+ self._find_inline = text.re(
r'src="(?:https?://(?:kemono\.cr|coomer\.st))?(/inline/[^"]+'
r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
self._json_dumps = json.JSONEncoder(
@@ -52,7 +52,7 @@ class KemonoExtractor(Extractor):
sort_keys=True, separators=(",", ":")).encode
def items(self):
- find_hash = util.re(HASH_PATTERN).match
+ find_hash = text.re(HASH_PATTERN).match
generators = self._build_file_generators(self.config("files"))
announcements = True if self.config("announcements") else None
archives = True if self.config("archives") else False
@@ -145,18 +145,24 @@ class KemonoExtractor(Extractor):
file["hash"] = hash = ""
if url[0] == "/":
- url = self.root + "/data" + url
+ url = f"{self.root}/data{url}"
elif url.startswith(self.root):
- url = self.root + "/data" + url[20:]
+ url = f"{self.root}/data{url[20:]}"
file["url"] = url
- text.nameext_from_url(file.get("name", url), file)
- ext = text.ext_from_url(url)
- if not file["extension"]:
- file["extension"] = ext
- elif ext == "txt" and file["extension"] != "txt":
- file["_http_validate"] = _validate
- elif ext in exts_archive or \
+ if name := file.get("name"):
+ text.nameext_from_name(name, file)
+ ext = text.ext_from_url(url)
+
+ if not file["extension"]:
+ file["extension"] = ext
+ elif ext == "txt" and file["extension"] != "txt":
+ file["_http_validate"] = _validate
+ else:
+ text.nameext_from_url(url, file)
+ ext = file["extension"]
+
+ if ext in exts_archive or \
ext == "bin" and file["extension"] in exts_archive:
file["type"] = "archive"
if archives:
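Review note: a condensed sketch of the reworked filename handling above. The API-supplied name now takes precedence over the URL, and a ".txt" URL extension on a non-txt name presumably indicates a text error body from the CDN, hence the validation hook (text.* and _validate are this module's helpers):

    def resolve_extension(file, url):
        if name := file.get("name"):
            text.nameext_from_name(name, file)      # derive from the API name
            ext = text.ext_from_url(url)
            if not file["extension"]:
                file["extension"] = ext             # name carried no extension
            elif ext == "txt" and file["extension"] != "txt":
                file["_http_validate"] = _validate  # catch text error responses
        else:
            text.nameext_from_url(url, file)        # fall back to the URL
            ext = file["extension"]
        return ext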
@@ -176,7 +182,7 @@ class KemonoExtractor(Extractor):
files.append(file)
post["count"] = len(files)
- yield Message.Directory, post
+ yield Message.Directory, "", post
for post["num"], file in enumerate(files, 1):
if "id" in file:
del file["id"]
@@ -194,13 +200,13 @@ class KemonoExtractor(Extractor):
username = username[0]
self.log.info("Logging in as %s", username)
- url = self.root + "/api/v1/authentication/login"
+ url = f"{self.root}/api/v1/authentication/login"
data = {"username": username, "password": password}
response = self.request(url, method="POST", json=data, fatal=False)
if response.status_code >= 400:
try:
- msg = '"' + response.json()["error"] + '"'
+ msg = f'"{response.json()["error"]}"'
except Exception:
msg = '"Username or password is incorrect"'
raise exception.AuthenticationError(msg)
@@ -238,7 +244,7 @@ class KemonoExtractor(Extractor):
def _parse_datetime(self, date_string):
if len(date_string) > 19:
date_string = date_string[:19]
- return text.parse_datetime(date_string, "%Y-%m-%dT%H:%M:%S")
+ return self.parse_datetime_iso(date_string)
def _revisions(self, posts):
return itertools.chain.from_iterable(
@@ -316,7 +322,7 @@ def _validate(response):
class KemonoUserExtractor(KemonoExtractor):
"""Extractor for all posts from a kemono.cr user listing"""
subcategory = "user"
- pattern = USER_PATTERN + r"/?(?:\?([^#]+))?(?:$|\?|#)"
+ pattern = rf"{USER_PATTERN}/?(?:\?([^#]+))?(?:$|\?|#)"
example = "https://kemono.cr/SERVICE/user/12345"
def __init__(self, match):
@@ -339,7 +345,7 @@ class KemonoUserExtractor(KemonoExtractor):
class KemonoPostsExtractor(KemonoExtractor):
"""Extractor for kemono.cr post listings"""
subcategory = "posts"
- pattern = BASE_PATTERN + r"/posts()()(?:/?\?([^#]+))?"
+ pattern = rf"{BASE_PATTERN}/posts()()(?:/?\?([^#]+))?"
example = "https://kemono.cr/posts"
def posts(self):
@@ -351,7 +357,7 @@ class KemonoPostsExtractor(KemonoExtractor):
class KemonoPostExtractor(KemonoExtractor):
"""Extractor for a single kemono.cr post"""
subcategory = "post"
- pattern = USER_PATTERN + r"/post/([^/?#]+)(/revisions?(?:/(\d*))?)?"
+ pattern = rf"{USER_PATTERN}/post/([^/?#]+)(/revisions?(?:/(\d*))?)?"
example = "https://kemono.cr/SERVICE/user/12345/post/12345"
def __init__(self, match):
@@ -384,7 +390,7 @@ class KemonoDiscordExtractor(KemonoExtractor):
"{server_id} {server}", "{channel_id} {channel}")
filename_fmt = "{id}_{num:>02}_{filename}.{extension}"
archive_fmt = "discord_{server_id}_{id}_{num}"
- pattern = BASE_PATTERN + r"/discord/server/(\d+)[/#](?:channel/)?(\d+)"
+ pattern = rf"{BASE_PATTERN}/discord/server/(\d+)[/#](?:channel/)?(\d+)"
example = "https://kemono.cr/discord/server/12345/12345"
def items(self):
@@ -407,10 +413,10 @@ class KemonoDiscordExtractor(KemonoExtractor):
"parent_id" : channel["parent_channel_id"],
}
- find_inline = util.re(
+ find_inline = text.re(
r"https?://(?:cdn\.discordapp.com|media\.discordapp\.net)"
r"(/[A-Za-z0-9-._~:/?#\[\]@!$&'()*+,;%=]+)").findall
- find_hash = util.re(HASH_PATTERN).match
+ find_hash = text.re(HASH_PATTERN).match
if (order := self.config("order-posts")) and order[0] in ("r", "d"):
posts = self.api.discord_channel(channel_id, channel["post_count"])
@@ -428,13 +434,13 @@ class KemonoDiscordExtractor(KemonoExtractor):
attachment["type"] = "attachment"
files.append(attachment)
for path in find_inline(post["content"] or ""):
- files.append({"path": "https://cdn.discordapp.com" + path,
+ files.append({"path": f"https://cdn.discordapp.com{path}",
"name": path, "type": "inline", "hash": ""})
post.update(data)
post["date"] = self._parse_datetime(post["published"])
post["count"] = len(files)
- yield Message.Directory, post
+ yield Message.Directory, "", post
for post["num"], file in enumerate(files, 1):
post["hash"] = file["hash"]
@@ -446,15 +452,15 @@ class KemonoDiscordExtractor(KemonoExtractor):
post["extension"] = text.ext_from_url(url)
if url[0] == "/":
- url = self.root + "/data" + url
+ url = f"{self.root}/data{url}"
elif url.startswith(self.root):
- url = self.root + "/data" + url[20:]
+ url = f"{self.root}/data{url[20:]}"
yield Message.Url, url, post
class KemonoDiscordServerExtractor(KemonoExtractor):
subcategory = "discord-server"
- pattern = BASE_PATTERN + r"/discord/server/(\d+)$"
+ pattern = rf"{BASE_PATTERN}/discord/server/(\d+)$"
example = "https://kemono.cr/discord/server/12345"
def items(self):
@@ -482,7 +488,7 @@ def discord_server_info(extr, server_id):
class KemonoFavoriteExtractor(KemonoExtractor):
"""Extractor for kemono.cr favorites"""
subcategory = "favorite"
- pattern = BASE_PATTERN + r"/(?:account/)?favorites()()(?:/?\?([^#]+))?"
+ pattern = rf"{BASE_PATTERN}/(?:account/)?favorites()()(?:/?\?([^#]+))?"
example = "https://kemono.cr/account/favorites/artists"
def items(self):
@@ -530,7 +536,7 @@ class KemonoFavoriteExtractor(KemonoExtractor):
class KemonoArtistsExtractor(KemonoExtractor):
"""Extractor for kemono artists"""
subcategory = "artists"
- pattern = BASE_PATTERN + r"/artists(?:\?([^#]+))?"
+ pattern = rf"{BASE_PATTERN}/artists(?:\?([^#]+))?"
example = "https://kemono.cr/artists"
def items(self):
@@ -564,32 +570,32 @@ class KemonoArtistsExtractor(KemonoExtractor):
class KemonoAPI():
- """Interface for the Kemono API v1.1.0
+ """Interface for the Kemono API v1.3.0
https://kemono.cr/documentation/api
"""
def __init__(self, extractor):
self.extractor = extractor
- self.root = extractor.root + "/api/v1"
+ self.root = f"{extractor.root}/api"
self.headers = {"Accept": "text/css"}
def posts(self, offset=0, query=None, tags=None):
- endpoint = "/posts"
+ endpoint = "/v1/posts"
params = {"q": query, "o": offset, "tag": tags}
return self._pagination(endpoint, params, 50, "posts")
def file(self, file_hash):
- endpoint = "/file/" + file_hash
+ endpoint = f"/v1/file/{file_hash}"
return self._call(endpoint)
def creators(self):
- endpoint = "/creators"
+ endpoint = "/v1/creators"
return self._call(endpoint)
def creator_posts(self, service, creator_id,
offset=0, query=None, tags=None):
- endpoint = f"/{service}/user/{creator_id}/posts"
+ endpoint = f"/v1/{service}/user/{creator_id}/posts"
params = {"o": offset, "tag": tags, "q": query}
return self._pagination(endpoint, params, 50)
@@ -601,58 +607,58 @@ class KemonoAPI():
service, creator_id, post["id"])["post"]
def creator_announcements(self, service, creator_id):
- endpoint = f"/{service}/user/{creator_id}/announcements"
+ endpoint = f"/v1/{service}/user/{creator_id}/announcements"
return self._call(endpoint)
def creator_dms(self, service, creator_id):
- endpoint = f"/{service}/user/{creator_id}/dms"
+ endpoint = f"/v1/{service}/user/{creator_id}/dms"
return self._call(endpoint)
def creator_fancards(self, service, creator_id):
- endpoint = f"/{service}/user/{creator_id}/fancards"
+ endpoint = f"/v1/{service}/user/{creator_id}/fancards"
return self._call(endpoint)
def creator_post(self, service, creator_id, post_id):
- endpoint = f"/{service}/user/{creator_id}/post/{post_id}"
+ endpoint = f"/v1/{service}/user/{creator_id}/post/{post_id}"
return self._call(endpoint)
def creator_post_comments(self, service, creator_id, post_id):
- endpoint = f"/{service}/user/{creator_id}/post/{post_id}/comments"
+ endpoint = f"/v1/{service}/user/{creator_id}/post/{post_id}/comments"
return self._call(endpoint, fatal=False)
def creator_post_revisions(self, service, creator_id, post_id):
- endpoint = f"/{service}/user/{creator_id}/post/{post_id}/revisions"
+ endpoint = f"/v1/{service}/user/{creator_id}/post/{post_id}/revisions"
return self._call(endpoint, fatal=False)
def creator_profile(self, service, creator_id):
- endpoint = f"/{service}/user/{creator_id}/profile"
+ endpoint = f"/v1/{service}/user/{creator_id}/profile"
return self._call(endpoint)
def creator_links(self, service, creator_id):
- endpoint = f"/{service}/user/{creator_id}/links"
+ endpoint = f"/v1/{service}/user/{creator_id}/links"
return self._call(endpoint)
def creator_tags(self, service, creator_id):
- endpoint = f"/{service}/user/{creator_id}/tags"
+ endpoint = f"/v1/{service}/user/{creator_id}/tags"
return self._call(endpoint)
def discord_channel(self, channel_id, post_count=None):
- endpoint = f"/discord/channel/{channel_id}"
+ endpoint = f"/v1/discord/channel/{channel_id}"
if post_count is None:
return self._pagination(endpoint, {}, 150)
else:
return self._pagination_reverse(endpoint, {}, 150, post_count)
def discord_channel_lookup(self, server_id):
- endpoint = f"/discord/channel/lookup/{server_id}"
+ endpoint = f"/v1/discord/channel/lookup/{server_id}"
return self._call(endpoint)
def discord_server(self, server_id):
- endpoint = f"/discord/server/{server_id}"
+ endpoint = f"/v1/discord/server/{server_id}"
return self._call(endpoint)
def account_favorites(self, type):
- endpoint = "/account/favorites"
+ endpoint = "/v1/account/favorites"
params = {"type": type}
return self._call(endpoint, params)
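Review note: with the version segment moved out of self.root and into each endpoint, URL assembly is unchanged while each endpoint now names its own API version; a minimal sketch:

    root = "https://kemono.cr/api"       # extractor.root + "/api"
    endpoint = "/v1/posts"               # version named per endpoint
    url = f"{root}{endpoint}"            # -> https://kemono.cr/api/v1/posts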
diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py
index f22d54e..8d1497d 100644
--- a/gallery_dl/extractor/khinsider.py
+++ b/gallery_dl/extractor/khinsider.py
@@ -35,7 +35,7 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
raise exception.NotFoundError("soundtrack")
data = self.metadata(page)
- yield Message.Directory, data
+ yield Message.Directory, "", data
if self.config("covers", False):
for num, url in enumerate(self._extract_covers(page), 1):
diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py
index 816bc3d..e2f00e1 100644
--- a/gallery_dl/extractor/komikcast.py
+++ b/gallery_dl/extractor/komikcast.py
@@ -9,7 +9,7 @@
"""Extractors for https://komikcast.li/"""
from .common import ChapterExtractor, MangaExtractor
-from .. import text, util
+from .. import text
BASE_PATTERN = (r"(?:https?://)?(?:www\.)?"
r"komikcast\d*\.(?:l(?:i|a|ol)|com|cz|site|mo?e)")
@@ -25,7 +25,7 @@ class KomikcastBase():
if data is None:
data = {}
- pattern = util.re(r"(?:(.*) Chapter )?0*(\d+)([^ ]*)(?: (?:- )?(.+))?")
+ pattern = text.re(r"(?:(.*) Chapter )?0*(\d+)([^ ]*)(?: (?:- )?(.+))?")
match = pattern.match(text.unescape(chapter_string))
manga, chapter, data["chapter_minor"], title = match.groups()
@@ -44,7 +44,7 @@ class KomikcastBase():
class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
"""Extractor for komikcast manga chapters"""
- pattern = BASE_PATTERN + r"(/chapter/[^/?#]+/)"
+ pattern = rf"{BASE_PATTERN}(/chapter/[^/?#]+/)"
example = "https://komikcast.li/chapter/TITLE/"
def metadata(self, page):
@@ -54,7 +54,7 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
def images(self, page):
readerarea = text.extr(
page, '<div class="main-reading-area', '</div')
- pattern = util.re(r"<img[^>]* src=[\"']([^\"']+)")
+ pattern = text.re(r"<img[^>]* src=[\"']([^\"']+)")
return [
(text.unescape(url), None)
for url in pattern.findall(readerarea)
@@ -64,7 +64,7 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
"""Extractor for komikcast manga"""
chapterclass = KomikcastChapterExtractor
- pattern = BASE_PATTERN + r"(/(?:komik/)?[^/?#]+/?)$"
+ pattern = rf"{BASE_PATTERN}(/(?:komik/)?[^/?#]+/?)$"
example = "https://komikcast.li/komik/TITLE"
def chapters(self, page):
diff --git a/gallery_dl/extractor/koofr.py b/gallery_dl/extractor/koofr.py
new file mode 100644
index 0000000..9ebc133
--- /dev/null
+++ b/gallery_dl/extractor/koofr.py
@@ -0,0 +1,55 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://koofr.net/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class KoofrSharedExtractor(Extractor):
+ """Base class for koofr extractors"""
+ category = "koofr"
+ subcategory = "shared"
+ root = "https://app.koofr.net"
+ pattern = (r"(?:https?://)?(?:"
+ r"(?:app\.)?koofr\.(?:net|eu)/links/([\w-]+)|"
+ r"k00\.fr/(\w+))")
+ example = "https://app.koofr.net/links/UUID"
+
+ def items(self):
+ uuid, code = self.groups
+ if code is not None:
+ uuid = self.request_location(
+ "https://k00.fr/" + code, method="GET").rpartition("/")[2]
+
+ url = f"{self.root}/api/v2/public/links/{uuid}"
+ referer = f"{self.root}/links/{uuid}"
+ password = self.config("password")
+ params = {"password": password or ""}
+ headers = {
+ "Referer" : referer,
+ "X-Client" : "newfrontend",
+ "X-Koofr-Version": "2.1",
+ "Sec-Fetch-Dest" : "empty",
+ "Sec-Fetch-Mode" : "cors",
+ "Sec-Fetch-Site" : "same-origin",
+ }
+ data = self.request_json(url, params=params, headers=headers)
+
+ name = data["name"]
+ file = text.nameext_from_name(name, data["file"])
+ file["_http_headers"] = {"Referer": referer}
+
+ root = data.get("publicUrlBase") or self.root
+ url = f"{root}/content/links/{uuid}/files/get/{name}?path=/&force="
+ if password:
+ url = f"{url}&password={password}"
+
+ yield Message.Directory, "", file
+ yield Message.Url, url, file
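Review note: for reference, a sketch of the content URL the new extractor assembles for a password-protected link (UUID, filename, and password are placeholders):

    uuid, name, password = "UUID", "photo.jpg", "hunter2"   # placeholders
    url = (f"https://app.koofr.net/content/links/{uuid}"
           f"/files/get/{name}?path=/&force=&password={password}")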
diff --git a/gallery_dl/extractor/leakgallery.py b/gallery_dl/extractor/leakgallery.py
index c609891..2939304 100644
--- a/gallery_dl/extractor/leakgallery.py
+++ b/gallery_dl/extractor/leakgallery.py
@@ -37,7 +37,7 @@ class LeakgalleryExtractor(Extractor):
media["url"] = url = f"https://cdn.leakgallery.com/{path}"
text.nameext_from_url(url, media)
- yield Message.Directory, media
+ yield Message.Directory, "", media
yield Message.Url, url, media
def _pagination(self, type, base, params=None, creator=None, pnum=1):
@@ -81,7 +81,7 @@ class LeakgalleryUserExtractor(LeakgalleryExtractor):
class LeakgalleryTrendingExtractor(LeakgalleryExtractor):
"""Extractor for trending posts on leakgallery.com"""
subcategory = "trending"
- pattern = BASE_PATTERN + r"/trending-medias(?:/([\w-]+))?"
+ pattern = rf"{BASE_PATTERN}/trending-medias(?:/([\w-]+))?"
example = "https://leakgallery.com/trending-medias/Week"
def items(self):
@@ -93,7 +93,7 @@ class LeakgalleryTrendingExtractor(LeakgalleryExtractor):
class LeakgalleryMostlikedExtractor(LeakgalleryExtractor):
"""Extractor for most liked posts on leakgallery.com"""
subcategory = "mostliked"
- pattern = BASE_PATTERN + r"/most-liked"
+ pattern = rf"{BASE_PATTERN}/most-liked"
example = "https://leakgallery.com/most-liked"
def items(self):
@@ -104,7 +104,7 @@ class LeakgalleryMostlikedExtractor(LeakgalleryExtractor):
class LeakgalleryPostExtractor(LeakgalleryExtractor):
"""Extractor for individual posts on leakgallery.com"""
subcategory = "post"
- pattern = BASE_PATTERN + r"/([^/?#]+)/(\d+)"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/(\d+)"
example = "https://leakgallery.com/CREATOR/12345"
def items(self):
@@ -134,7 +134,7 @@ class LeakgalleryPostExtractor(LeakgalleryExtractor):
"url": url,
}
text.nameext_from_url(url, data)
- yield Message.Directory, data
+ yield Message.Directory, "", data
yield Message.Url, url, data
except Exception as exc:
self.log.error("Failed to extract post page %s/%s: %s",
diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py
index b0198d5..a7b1318 100644
--- a/gallery_dl/extractor/lensdump.py
+++ b/gallery_dl/extractor/lensdump.py
@@ -31,7 +31,7 @@ class LensdumpBase():
class LensdumpAlbumExtractor(LensdumpBase, GalleryExtractor):
subcategory = "album"
- pattern = BASE_PATTERN + r"/a/(\w+)(?:/?\?([^#]+))?"
+ pattern = rf"{BASE_PATTERN}/a/(\w+)(?:/?\?([^#]+))?"
example = "https://lensdump.com/a/ID"
def __init__(self, match):
@@ -76,7 +76,7 @@ class LensdumpAlbumExtractor(LensdumpBase, GalleryExtractor):
class LensdumpAlbumsExtractor(LensdumpBase, Extractor):
"""Extractor for album list from lensdump.com"""
subcategory = "albums"
- pattern = BASE_PATTERN + r"/(?![ai]/)([^/?#]+)(?:/?\?([^#]+))?"
+ pattern = rf"{BASE_PATTERN}/(?![ai]/)([^/?#]+)(?:/?\?([^#]+))?"
example = "https://lensdump.com/USER"
def items(self):
@@ -119,10 +119,9 @@ class LensdumpImageExtractor(LensdumpBase, Extractor):
'property="image:width" content="', '"')),
"height": text.parse_int(extr(
'property="image:height" content="', '"')),
- "date" : text.parse_datetime(extr(
- '<span title="', '"'), "%Y-%m-%d %H:%M:%S"),
+ "date" : self.parse_datetime_iso(extr('<span title="', '"')),
}
text.nameext_from_url(data["url"], data)
- yield Message.Directory, data
+ yield Message.Directory, "", data
yield Message.Url, data["url"], data
diff --git a/gallery_dl/extractor/lexica.py b/gallery_dl/extractor/lexica.py
index 6e54847..fc44f51 100644
--- a/gallery_dl/extractor/lexica.py
+++ b/gallery_dl/extractor/lexica.py
@@ -36,7 +36,7 @@ class LexicaSearchExtractor(Extractor):
image["filename"] = image["id"]
image["extension"] = "jpg"
image["search_tags"] = tags
- yield Message.Directory, image
+ yield Message.Directory, "", image
yield Message.Url, base + image["id"], image
def posts(self):
diff --git a/gallery_dl/extractor/lightroom.py b/gallery_dl/extractor/lightroom.py
index b557149..27aa15a 100644
--- a/gallery_dl/extractor/lightroom.py
+++ b/gallery_dl/extractor/lightroom.py
@@ -35,7 +35,7 @@ class LightroomGalleryExtractor(Extractor):
images = self.images(album)
for img in images:
url = img["url"]
- yield Message.Directory, img
+ yield Message.Directory, "", img
yield Message.Url, url, text.nameext_from_url(url, img)
def metadata(self, album):
diff --git a/gallery_dl/extractor/livedoor.py b/gallery_dl/extractor/livedoor.py
index ab3be69..706194a 100644
--- a/gallery_dl/extractor/livedoor.py
+++ b/gallery_dl/extractor/livedoor.py
@@ -27,7 +27,7 @@ class LivedoorExtractor(Extractor):
def items(self):
for post in self.posts():
if images := self._images(post):
- yield Message.Directory, {"post": post}
+ yield Message.Directory, "", {"post": post}
for image in images:
yield Message.Url, image["url"], image
@@ -45,7 +45,7 @@ class LivedoorExtractor(Extractor):
"title" : text.unescape(extr('dc:title="', '"')),
"categories" : extr('dc:subject="', '"').partition(",")[::2],
"description": extr('dc:description="', '"'),
- "date" : text.parse_datetime(extr('dc:date="', '"')),
+ "date" : self.parse_datetime_iso(extr('dc:date="', '"')),
"tags" : text.split_html(tags)[1:] if tags else [],
"user" : self.user,
"body" : body,
diff --git a/gallery_dl/extractor/lofter.py b/gallery_dl/extractor/lofter.py
index c20d983..b1f58ac 100644
--- a/gallery_dl/extractor/lofter.py
+++ b/gallery_dl/extractor/lofter.py
@@ -29,7 +29,7 @@ class LofterExtractor(Extractor):
post = post["post"]
post["blog_name"] = post["blogInfo"]["blogName"]
- post["date"] = text.parse_timestamp(post["publishTime"] // 1000)
+ post["date"] = self.parse_timestamp(post["publishTime"] // 1000)
post_type = post["type"]
# Article
@@ -63,7 +63,7 @@ class LofterExtractor(Extractor):
post["id"], post_type)
post["count"] = len(image_urls)
- yield Message.Directory, post
+ yield Message.Directory, "", post
for post["num"], url in enumerate(image_urls, 1):
yield Message.Url, url, text.nameext_from_url(url, post)
diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py
index 5233033..d17549d 100644
--- a/gallery_dl/extractor/lolisafe.py
+++ b/gallery_dl/extractor/lolisafe.py
@@ -25,7 +25,7 @@ BASE_PATTERN = LolisafeExtractor.update({
class LolisafeAlbumExtractor(LolisafeExtractor):
subcategory = "album"
- pattern = BASE_PATTERN + "/a/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/a/([^/?#]+)"
example = "https://xbunkr.com/a/ID"
def __init__(self, match):
@@ -42,7 +42,7 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
def items(self):
files, data = self.fetch_album(self.album_id)
- yield Message.Directory, data
+ yield Message.Directory, "", data
for data["num"], file in enumerate(files, 1):
url = file["file"]
file.update(data)
diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py
index 0cbc523..2abd1c8 100644
--- a/gallery_dl/extractor/luscious.py
+++ b/gallery_dl/extractor/luscious.py
@@ -58,7 +58,7 @@ class LusciousAlbumExtractor(LusciousExtractor):
def items(self):
album = self.metadata()
- yield Message.Directory, {"album": album}
+ yield Message.Directory, "", {"album": album}
for num, image in enumerate(self.images(), 1):
image["num"] = num
image["album"] = album
@@ -69,7 +69,7 @@ class LusciousAlbumExtractor(LusciousExtractor):
image["thumbnail"] = ""
image["tags"] = [item["text"] for item in image["tags"]]
- image["date"] = text.parse_timestamp(image["created"])
+ image["date"] = self.parse_timestamp(image["created"])
image["id"] = text.parse_int(image["id"])
url = (image["url_to_original"] or image["url_to_video"]
@@ -188,7 +188,7 @@ fragment AlbumStandard on Album {
album["created_by"] = album["created_by"]["display_name"]
album["id"] = text.parse_int(album["id"])
- album["date"] = text.parse_timestamp(album["created"])
+ album["date"] = self.parse_timestamp(album["created"])
return album
diff --git a/gallery_dl/extractor/lynxchan.py b/gallery_dl/extractor/lynxchan.py
index fde2df5..7cf1282 100644
--- a/gallery_dl/extractor/lynxchan.py
+++ b/gallery_dl/extractor/lynxchan.py
@@ -39,7 +39,7 @@ class LynxchanThreadExtractor(LynxchanExtractor):
"{threadId} {subject|message[:50]}")
filename_fmt = "{postId}{num:?-//} {filename}.{extension}"
archive_fmt = "{boardUri}_{postId}_{num}"
- pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/res/(\d+)"
example = "https://endchan.org/a/res/12345.html"
def items(self):
@@ -48,7 +48,7 @@ class LynxchanThreadExtractor(LynxchanExtractor):
thread["postId"] = thread["threadId"]
posts = thread.pop("posts", ())
- yield Message.Directory, thread
+ yield Message.Directory, "", thread
for post in itertools.chain((thread,), posts):
if files := post.pop("files", ()):
thread.update(post)
@@ -63,7 +63,7 @@ class LynxchanThreadExtractor(LynxchanExtractor):
class LynxchanBoardExtractor(LynxchanExtractor):
"""Extractor for LynxChan boards"""
subcategory = "board"
- pattern = BASE_PATTERN + r"/([^/?#]+)(?:/index|/catalog|/\d+|/?$)"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)(?:/index|/catalog|/\d+|/?$)"
example = "https://endchan.org/a/"
def items(self):
diff --git a/gallery_dl/extractor/madokami.py b/gallery_dl/extractor/madokami.py
index 1db5126..e15b90d 100644
--- a/gallery_dl/extractor/madokami.py
+++ b/gallery_dl/extractor/madokami.py
@@ -47,8 +47,7 @@ class MadokamiMangaExtractor(MadokamiExtractor):
"path": text.unescape(extr('href="', '"')),
"chapter_string": text.unescape(extr(">", "<")),
"size": text.parse_bytes(extr("<td>", "</td>")),
- "date": text.parse_datetime(
- extr("<td>", "</td>").strip(), "%Y-%m-%d %H:%M"),
+ "date": self.parse_datetime_iso(extr("<td>", "</td>").strip()),
})
if self.config("chapter-reverse"):
@@ -89,5 +88,5 @@ class MadokamiMangaExtractor(MadokamiExtractor):
url = f"{self.root}{ch['path']}"
text.nameext_from_url(url, ch)
- yield Message.Directory, ch
+ yield Message.Directory, "", ch
yield Message.Url, url, ch
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index 16eb650..0a1709d 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -68,7 +68,7 @@ class MangadexExtractor(Extractor):
"chapter" : text.parse_int(chnum),
"chapter_minor": f"{sep}{minor}",
"chapter_id": chapter["id"],
- "date" : text.parse_datetime(cattributes["publishAt"]),
+ "date" : self.parse_datetime_iso(cattributes["publishAt"]),
"group" : [group["attributes"]["name"]
for group in relationships["scanlation_group"]],
"lang" : lang,
@@ -95,7 +95,7 @@ class MangadexCoversExtractor(MangadexExtractor):
name = data["cover"]
text.nameext_from_url(name, data)
data["cover_id"] = data["filename"]
- yield Message.Directory, data
+ yield Message.Directory, "", data
yield Message.Url, f"{base}{name}", data
def _transform_cover(self, cover):
@@ -109,8 +109,8 @@ class MangadexCoversExtractor(MangadexExtractor):
"cover" : cattributes["fileName"],
"lang" : cattributes.get("locale"),
"volume" : text.parse_int(cattributes["volume"]),
- "date" : text.parse_datetime(cattributes["createdAt"]),
- "date_updated": text.parse_datetime(cattributes["updatedAt"]),
+ "date" : self.parse_datetime_iso(cattributes["createdAt"]),
+ "date_updated": self.parse_datetime_iso(cattributes["updatedAt"]),
}
@@ -134,15 +134,21 @@ class MangadexChapterExtractor(MangadexExtractor):
f"available on MangaDex and can instead be read on the "
f"official publisher's website at {data['_external_url']}.")
- yield Message.Directory, data
+ yield Message.Directory, "", data
+
+ if self.config("data-saver", False):
+ path = "data-saver"
+ key = "dataSaver"
+ else:
+ path = key = "data"
server = self.api.athome_server(self.uuid)
chapter = server["chapter"]
- base = f"{server['baseUrl']}/data/{chapter['hash']}/"
+ base = f"{server['baseUrl']}/{path}/{chapter['hash']}/"
enum = util.enumerate_reversed if self.config(
"page-reverse") else enumerate
- for data["page"], page in enum(chapter["data"], 1):
+ for data["page"], page in enum(chapter[key], 1):
text.nameext_from_url(page, data)
yield Message.Url, f"{base}{page}", data
@@ -454,7 +460,7 @@ def _manga_info(self, uuid):
"manga_id": manga["id"],
"manga_titles": [t.popitem()[1]
for t in mattr.get("altTitles") or ()],
- "manga_date" : text.parse_datetime(mattr.get("createdAt")),
+ "manga_date" : self.parse_datetime_iso(mattr.get("createdAt")),
"description" : (mattr["description"].get("en") or
next(iter(mattr["description"].values()), "")),
"demographic": mattr.get("publicationDemographic"),
diff --git a/gallery_dl/extractor/mangafox.py b/gallery_dl/extractor/mangafox.py
index 76f4b7e..8fa645b 100644
--- a/gallery_dl/extractor/mangafox.py
+++ b/gallery_dl/extractor/mangafox.py
@@ -18,8 +18,8 @@ class MangafoxChapterExtractor(ChapterExtractor):
"""Extractor for manga chapters from fanfox.net"""
category = "mangafox"
root = "https://m.fanfox.net"
- pattern = BASE_PATTERN + \
- r"(/manga/[^/?#]+/((?:v([^/?#]+)/)?c(\d+)([^/?#]*)))"
+ pattern = (rf"{BASE_PATTERN}"
+ rf"(/manga/[^/?#]+/((?:v([^/?#]+)/)?c(\d+)([^/?#]*)))")
example = "https://fanfox.net/manga/TITLE/v01/c001/1.html"
def __init__(self, match):
@@ -62,7 +62,7 @@ class MangafoxMangaExtractor(MangaExtractor):
category = "mangafox"
root = "https://m.fanfox.net"
chapterclass = MangafoxChapterExtractor
- pattern = BASE_PATTERN + r"(/manga/[^/?#]+)/?$"
+ pattern = rf"{BASE_PATTERN}(/manga/[^/?#]+)/?$"
example = "https://fanfox.net/manga/TITLE"
def chapters(self, page):
@@ -99,7 +99,7 @@ class MangafoxMangaExtractor(MangaExtractor):
"chapter" : text.parse_int(chapter),
"chapter_minor" : minor or "",
"chapter_string": cstr,
- "date" : text.parse_datetime(
+ "date" : self.parse_datetime(
extr('right">', '</span>'), "%b %d, %Y"),
}
chapter.update(data)
diff --git a/gallery_dl/extractor/mangahere.py b/gallery_dl/extractor/mangahere.py
index 151e809..9b3a3a1 100644
--- a/gallery_dl/extractor/mangahere.py
+++ b/gallery_dl/extractor/mangahere.py
@@ -9,7 +9,7 @@
"""Extractors for https://www.mangahere.cc/"""
from .common import ChapterExtractor, MangaExtractor
-from .. import text, util
+from .. import text
class MangahereBase():
@@ -102,7 +102,7 @@ class MangahereMangaExtractor(MangahereBase, MangaExtractor):
info, pos = text.extract(page, 'class="title3">', '<', pos)
date, pos = text.extract(page, 'class="title2">', '<', pos)
- match = util.re(
+ match = text.re(
r"(?:Vol\.0*(\d+) )?Ch\.0*(\d+)(\S*)(?: - (.*))?").match(info)
if match:
volume, chapter, minor, title = match.groups()
diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py
index a6948e3..3ecf934 100644
--- a/gallery_dl/extractor/manganelo.py
+++ b/gallery_dl/extractor/manganelo.py
@@ -39,7 +39,7 @@ BASE_PATTERN = ManganeloExtractor.update({
class ManganeloChapterExtractor(ManganeloExtractor, ChapterExtractor):
"""Extractor for manganelo manga chapters"""
- pattern = BASE_PATTERN + r"(/manga/[^/?#]+/chapter-[^/?#]+)"
+ pattern = rf"{BASE_PATTERN}(/manga/[^/?#]+/chapter-[^/?#]+)"
example = "https://www.mangakakalot.gg/manga/MANGA_NAME/chapter-123"
def __init__(self, match):
@@ -50,10 +50,10 @@ class ManganeloChapterExtractor(ManganeloExtractor, ChapterExtractor):
extr = text.extract_from(page)
data = {
- "date" : text.parse_datetime(extr(
- '"datePublished": "', '"')[:19], "%Y-%m-%dT%H:%M:%S"),
- "date_updated": text.parse_datetime(extr(
- '"dateModified": "', '"')[:19], "%Y-%m-%dT%H:%M:%S"),
+ "date" : self.parse_datetime_iso(extr(
+ '"datePublished": "', '"')[:19]),
+ "date_updated": self.parse_datetime_iso(extr(
+ '"dateModified": "', '"')[:19]),
"manga_id" : text.parse_int(extr("comic_id =", ";")),
"chapter_id" : text.parse_int(extr("chapter_id =", ";")),
"manga" : extr("comic_name =", ";").strip('" '),
@@ -86,7 +86,7 @@ class ManganeloChapterExtractor(ManganeloExtractor, ChapterExtractor):
class ManganeloMangaExtractor(ManganeloExtractor, MangaExtractor):
"""Extractor for manganelo manga"""
chapterclass = ManganeloChapterExtractor
- pattern = BASE_PATTERN + r"(/manga/[^/?#]+)$"
+ pattern = rf"{BASE_PATTERN}(/manga/[^/?#]+)$"
example = "https://www.mangakakalot.gg/manga/MANGA_NAME"
def __init__(self, match):
@@ -99,7 +99,7 @@ class ManganeloMangaExtractor(ManganeloExtractor, MangaExtractor):
manga = text.unescape(extr("<h1>", "<"))
author = text.remove_html(extr("<li>Author(s) :", "</a>"))
status = extr("<li>Status :", "<").strip()
- update = text.parse_datetime(extr(
+ update = self.parse_datetime(extr(
"<li>Last updated :", "<").strip(), "%b-%d-%Y %I:%M:%S %p")
tags = text.split_html(extr(">Genres :", "</li>"))[::2]
@@ -121,7 +121,7 @@ class ManganeloMangaExtractor(ManganeloExtractor, MangaExtractor):
"chapter" : text.parse_int(chapter),
"chapter_minor": (sep and ".") + minor,
"title" : title.partition(": ")[2],
- "date" : text.parse_datetime(date, "%b-%d-%Y %H:%M"),
+ "date" : self.parse_datetime(date, "%b-%d-%Y %H:%M"),
"lang" : "en",
"language": "English",
}))
diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py
index 19aee33..e2f9166 100644
--- a/gallery_dl/extractor/mangapark.py
+++ b/gallery_dl/extractor/mangapark.py
@@ -23,7 +23,7 @@ class MangaparkBase():
category = "mangapark"
def _parse_chapter_title(self, title):
- match = util.re(
+ match = text.re(
r"(?i)"
r"(?:vol(?:\.|ume)?\s*(\d+)\s*)?"
r"ch(?:\.|apter)?\s*(\d+)([^\s:]*)"
@@ -70,8 +70,8 @@ class MangaparkBase():
class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor):
"""Extractor for manga-chapters from mangapark.net"""
- pattern = (BASE_PATTERN +
- r"/(?:title/[^/?#]+/|comic/\d+/[^/?#]+/[^/?#]+-i)(\d+)")
+ pattern = (rf"{BASE_PATTERN}/"
+ rf"(?:title/[^/?#]+/|comic/\d+/[^/?#]+/[^/?#]+-i)(\d+)")
example = "https://mangapark.net/title/MANGA/12345-en-ch.01"
def __init__(self, match):
@@ -101,7 +101,7 @@ class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor):
"language" : util.code_to_language(lang),
"source" : chapter["srcTitle"],
"source_id" : chapter["sourceId"],
- "date" : text.parse_timestamp(chapter["dateCreate"] // 1000),
+ "date" : self.parse_timestamp(chapter["dateCreate"] // 1000),
}
def images(self, _):
@@ -111,7 +111,7 @@ class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor):
class MangaparkMangaExtractor(MangaparkBase, Extractor):
"""Extractor for manga from mangapark.net"""
subcategory = "manga"
- pattern = BASE_PATTERN + r"/(?:title|comic)/(\d+)(?:[/-][^/?#]*)?/?$"
+ pattern = rf"{BASE_PATTERN}/(?:title|comic)/(\d+)(?:[/-][^/?#]*)?/?$"
example = "https://mangapark.net/title/12345-MANGA"
def __init__(self, match):
@@ -138,7 +138,7 @@ class MangaparkMangaExtractor(MangaparkBase, Extractor):
"language" : util.code_to_language(lang),
"source" : chapter["srcTitle"],
"source_id" : chapter["sourceId"],
- "date" : text.parse_timestamp(
+ "date" : self.parse_timestamp(
chapter["dateCreate"] // 1000),
"_extractor": MangaparkChapterExtractor,
}
diff --git a/gallery_dl/extractor/mangaread.py b/gallery_dl/extractor/mangaread.py
index a3bdf39..82fddde 100644
--- a/gallery_dl/extractor/mangaread.py
+++ b/gallery_dl/extractor/mangaread.py
@@ -7,7 +7,7 @@
"""Extractors for https://mangaread.org/"""
from .common import ChapterExtractor, MangaExtractor
-from .. import text, util, exception
+from .. import text, exception
class MangareadBase():
@@ -16,7 +16,7 @@ class MangareadBase():
root = "https://www.mangaread.org"
def parse_chapter_string(self, chapter_string, data):
- match = util.re(
+ match = text.re(
r"(?:(.+)\s*-\s*)?[Cc]hapter\s*(\d+)(\.\d+)?(?:\s*-\s*(.+))?"
).match(text.unescape(chapter_string).strip())
manga, chapter, minor, title = match.groups()
diff --git a/gallery_dl/extractor/mangataro.py b/gallery_dl/extractor/mangataro.py
index f4cc058..029bc2e 100644
--- a/gallery_dl/extractor/mangataro.py
+++ b/gallery_dl/extractor/mangataro.py
@@ -40,10 +40,8 @@ class MangataroChapterExtractor(MangataroBase, ChapterExtractor):
"chapter_minor": str(round(minor, 5))[1:] if minor else "",
"chapter_id" : text.parse_int(chapter_id),
"chapter_url" : comic["url"],
- "date" : text.parse_datetime(
- comic["datePublished"], "%Y-%m-%dT%H:%M:%S%z"),
- "date_updated" : text.parse_datetime(
- comic["dateModified"], "%Y-%m-%dT%H:%M:%S%z"),
+ "date" : self.parse_datetime_iso(comic["datePublished"]),
+ "date_updated" : self.parse_datetime_iso(comic["dateModified"]),
}
def images(self, page):
diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py
index beb13ce..60f0de9 100644
--- a/gallery_dl/extractor/mangoxo.py
+++ b/gallery_dl/extractor/mangoxo.py
@@ -91,7 +91,7 @@ class MangoxoAlbumExtractor(MangoxoExtractor):
data = self.metadata(page)
imgs = self.images(url, page)
- yield Message.Directory, data
+ yield Message.Directory, "", data
data["extension"] = None
for data["num"], path in enumerate(imgs, 1):
@@ -119,7 +119,7 @@ class MangoxoAlbumExtractor(MangoxoExtractor):
"album": {
"id": self.album_id,
"name": text.unescape(title),
- "date": text.parse_datetime(date.strip(), "%Y.%m.%d %H:%M"),
+ "date": self.parse_datetime(date.strip(), "%Y.%m.%d %H:%M"),
"description": text.unescape(descr),
},
"count": text.parse_int(count),
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
index 1bab63a..165f8b8 100644
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@@ -64,10 +64,9 @@ class MastodonExtractor(BaseExtractor):
status["count"] = len(attachments)
status["tags"] = [tag["name"] for tag in status["tags"]]
- status["date"] = text.parse_datetime(
- status["created_at"][:19], "%Y-%m-%dT%H:%M:%S")
+ status["date"] = self.parse_datetime_iso(status["created_at"][:19])
- yield Message.Directory, status
+ yield Message.Directory, "", status
for status["num"], media in enumerate(attachments, 1):
status["media"] = media
url = media["url"]
@@ -119,7 +118,7 @@ BASE_PATTERN = MastodonExtractor.update({
class MastodonUserExtractor(MastodonExtractor):
"""Extractor for all images of an account/user"""
subcategory = "user"
- pattern = BASE_PATTERN + r"/(?:@|users/)([^/?#]+)(?:/media)?/?$"
+ pattern = rf"{BASE_PATTERN}/(?:@|users/)([^/?#]+)(?:/media)?/?$"
example = "https://mastodon.social/@USER"
def statuses(self):
@@ -139,7 +138,7 @@ class MastodonUserExtractor(MastodonExtractor):
class MastodonBookmarkExtractor(MastodonExtractor):
"""Extractor for mastodon bookmarks"""
subcategory = "bookmark"
- pattern = BASE_PATTERN + r"/bookmarks"
+ pattern = rf"{BASE_PATTERN}/bookmarks"
example = "https://mastodon.social/bookmarks"
def statuses(self):
@@ -149,7 +148,7 @@ class MastodonBookmarkExtractor(MastodonExtractor):
class MastodonFavoriteExtractor(MastodonExtractor):
"""Extractor for mastodon favorites"""
subcategory = "favorite"
- pattern = BASE_PATTERN + r"/favourites"
+ pattern = rf"{BASE_PATTERN}/favourites"
example = "https://mastodon.social/favourites"
def statuses(self):
@@ -159,7 +158,7 @@ class MastodonFavoriteExtractor(MastodonExtractor):
class MastodonListExtractor(MastodonExtractor):
"""Extractor for mastodon lists"""
subcategory = "list"
- pattern = BASE_PATTERN + r"/lists/(\w+)"
+ pattern = rf"{BASE_PATTERN}/lists/(\w+)"
example = "https://mastodon.social/lists/12345"
def statuses(self):
@@ -169,7 +168,7 @@ class MastodonListExtractor(MastodonExtractor):
class MastodonHashtagExtractor(MastodonExtractor):
"""Extractor for mastodon hashtags"""
subcategory = "hashtag"
- pattern = BASE_PATTERN + r"/tags/(\w+)"
+ pattern = rf"{BASE_PATTERN}/tags/(\w+)"
example = "https://mastodon.social/tags/NAME"
def statuses(self):
@@ -179,7 +178,7 @@ class MastodonHashtagExtractor(MastodonExtractor):
class MastodonFollowingExtractor(MastodonExtractor):
"""Extractor for followed mastodon users"""
subcategory = "following"
- pattern = BASE_PATTERN + r"/(?:@|users/)([^/?#]+)/following"
+ pattern = rf"{BASE_PATTERN}/(?:@|users/)([^/?#]+)/following"
example = "https://mastodon.social/@USER/following"
def items(self):
@@ -194,7 +193,7 @@ class MastodonFollowingExtractor(MastodonExtractor):
class MastodonStatusExtractor(MastodonExtractor):
"""Extractor for images from a status"""
subcategory = "status"
- pattern = (BASE_PATTERN + r"/(?:@[^/?#]+|(?:users/[^/?#]+/)?"
+ pattern = (rf"{BASE_PATTERN}/(?:@[^/?#]+|(?:users/[^/?#]+/)?"
r"(?:statuses|notice|objects()))/(?!following)([^/?#]+)")
example = "https://mastodon.social/@USER/12345"
@@ -319,10 +318,8 @@ class MastodonAPI():
if code == 404:
raise exception.NotFoundError()
if code == 429:
- self.extractor.wait(until=text.parse_datetime(
- response.headers["x-ratelimit-reset"],
- "%Y-%m-%dT%H:%M:%S.%fZ",
- ))
+ self.extractor.wait(until=self.extractor.parse_datetime_iso(
+ response.headers["x-ratelimit-reset"]))
continue
raise exception.AbortExtraction(response.json().get("error"))
diff --git a/gallery_dl/extractor/message.py b/gallery_dl/extractor/message.py
index d5c2554..6eda213 100644
--- a/gallery_dl/extractor/message.py
+++ b/gallery_dl/extractor/message.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2021 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -14,13 +14,14 @@ class Message():
is one of the following identifiers. This message-identifier determines
the type and meaning of the other elements in such a tuple.
- - Message.Version:
+ - Message.Version: # obsolete
- Message protocol version (currently always '1')
- 2nd element specifies the version of all following messages as integer
- Message.Directory:
- Sets the target directory for all following images
- - 2nd element is a dictionary containing general metadata
+ - 2nd element is unused
+ - 3rd element is a dictionary containing general metadata
- Message.Url:
- Image URL and its metadata
@@ -45,7 +46,7 @@ class Message():
- The additional URLs serve as a fallback if the primary one fails
"""
- Version = 1
+ # Version = 1
Directory = 2
Url = 3
# Headers = 4
diff --git a/gallery_dl/extractor/misskey.py b/gallery_dl/extractor/misskey.py
index 42eaeef..ca3ae18 100644
--- a/gallery_dl/extractor/misskey.py
+++ b/gallery_dl/extractor/misskey.py
@@ -7,7 +7,7 @@
"""Extractors for Misskey instances"""
from .common import BaseExtractor, Message, Dispatch
-from .. import text, exception
+from .. import text, dt, exception
from ..cache import memcache
@@ -18,10 +18,6 @@ class MisskeyExtractor(BaseExtractor):
filename_fmt = "{category}_{id}_{file[id]}.{extension}"
archive_fmt = "{id}_{file[id]}"
- def __init__(self, match):
- BaseExtractor.__init__(self, match)
- self.item = self.groups[-1]
-
def _init(self):
self.api = MisskeyAPI(self)
self.instance = self.root.rpartition("://")[2]
@@ -48,13 +44,11 @@ class MisskeyExtractor(BaseExtractor):
note["instance"] = self.instance
note["instance_remote"] = note["user"]["host"]
note["count"] = len(files)
- note["date"] = text.parse_datetime(
- note["createdAt"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ note["date"] = self.parse_datetime_iso(note["createdAt"])
- yield Message.Directory, note
+ yield Message.Directory, "", note
for note["num"], file in enumerate(files, 1):
- file["date"] = text.parse_datetime(
- file["createdAt"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ file["date"] = self.parse_datetime_iso(file["createdAt"])
note["file"] = file
url = file["url"]
yield Message.Url, url, text.nameext_from_url(url, note)
@@ -108,11 +102,11 @@ BASE_PATTERN = MisskeyExtractor.update({
class MisskeyUserExtractor(Dispatch, MisskeyExtractor):
"""Extractor for all images of a Misskey user"""
subcategory = "user"
- pattern = BASE_PATTERN + r"/@([^/?#]+)/?$"
+ pattern = rf"{BASE_PATTERN}/@([^/?#]+)/?$"
example = "https://misskey.io/@USER"
def items(self):
- base = f"{self.root}/@{self.item}/"
+ base = f"{self.root}/@{self.groups[-1]}/"
return self._dispatch_extractors((
(MisskeyInfoExtractor , base + "info"),
(MisskeyAvatarExtractor , base + "avatar"),
@@ -124,32 +118,33 @@ class MisskeyUserExtractor(Dispatch, MisskeyExtractor):
class MisskeyNotesExtractor(MisskeyExtractor):
"""Extractor for a Misskey user's notes"""
subcategory = "notes"
- pattern = BASE_PATTERN + r"/@([^/?#]+)/notes"
+ pattern = rf"{BASE_PATTERN}/@([^/?#]+)/notes"
example = "https://misskey.io/@USER/notes"
def notes(self):
- return self.api.users_notes(self.api.user_id_by_username(self.item))
+ return self.api.users_notes(self.api.user_id_by_username(
+ self.groups[-1]))
class MisskeyInfoExtractor(MisskeyExtractor):
"""Extractor for a Misskey user's profile data"""
subcategory = "info"
- pattern = BASE_PATTERN + r"/@([^/?#]+)/info"
+ pattern = rf"{BASE_PATTERN}/@([^/?#]+)/info"
example = "https://misskey.io/@USER/info"
def items(self):
- user = self.api.users_show(self.item)
- return iter(((Message.Directory, user),))
+ user = self.api.users_show(self.groups[-1])
+ return iter(((Message.Directory, "", user),))
class MisskeyAvatarExtractor(MisskeyExtractor):
"""Extractor for a Misskey user's avatar"""
subcategory = "avatar"
- pattern = BASE_PATTERN + r"/@([^/?#]+)/avatar"
+ pattern = rf"{BASE_PATTERN}/@([^/?#]+)/avatar"
example = "https://misskey.io/@USER/avatar"
def notes(self):
- user = self.api.users_show(self.item)
+ user = self.api.users_show(self.groups[-1])
url = user.get("avatarUrl")
return (self._make_note("avatar", user, url),) if url else ()
@@ -157,11 +152,11 @@ class MisskeyAvatarExtractor(MisskeyExtractor):
class MisskeyBackgroundExtractor(MisskeyExtractor):
"""Extractor for a Misskey user's banner image"""
subcategory = "background"
- pattern = BASE_PATTERN + r"/@([^/?#]+)/ba(?:nner|ckground)"
+ pattern = rf"{BASE_PATTERN}/@([^/?#]+)/ba(?:nner|ckground)"
example = "https://misskey.io/@USER/banner"
def notes(self):
- user = self.api.users_show(self.item)
+ user = self.api.users_show(self.groups[-1])
url = user.get("bannerUrl")
return (self._make_note("background", user, url),) if url else ()
@@ -169,11 +164,11 @@ class MisskeyBackgroundExtractor(MisskeyExtractor):
class MisskeyFollowingExtractor(MisskeyExtractor):
"""Extractor for followed Misskey users"""
subcategory = "following"
- pattern = BASE_PATTERN + r"/@([^/?#]+)/following"
+ pattern = rf"{BASE_PATTERN}/@([^/?#]+)/following"
example = "https://misskey.io/@USER/following"
def items(self):
- user_id = self.api.user_id_by_username(self.item)
+ user_id = self.api.user_id_by_username(self.groups[-1])
for user in self.api.users_following(user_id):
user = user["followee"]
url = f"{self.root}/@{user['username']}"
@@ -186,17 +181,17 @@ class MisskeyFollowingExtractor(MisskeyExtractor):
class MisskeyNoteExtractor(MisskeyExtractor):
"""Extractor for images from a Note"""
subcategory = "note"
- pattern = BASE_PATTERN + r"/notes/(\w+)"
+ pattern = rf"{BASE_PATTERN}/notes/(\w+)"
example = "https://misskey.io/notes/98765"
def notes(self):
- return (self.api.notes_show(self.item),)
+ return (self.api.notes_show(self.groups[-1]),)
class MisskeyFavoriteExtractor(MisskeyExtractor):
"""Extractor for favorited notes"""
subcategory = "favorite"
- pattern = BASE_PATTERN + r"/(?:my|api/i)/favorites"
+ pattern = rf"{BASE_PATTERN}/(?:my|api/i)/favorites"
example = "https://misskey.io/my/favorites"
def notes(self):
@@ -253,12 +248,39 @@ class MisskeyAPI():
return self.extractor.request_json(url, method="POST", json=data)
def _pagination(self, endpoint, data):
+ extr = self.extractor
data["limit"] = 100
- data["withRenotes"] = self.extractor.renotes
+ data["withRenotes"] = extr.renotes
+ data["withFiles"] = False if extr.config("text-posts") else True
+
+ date_min, date_max = extr._get_date_min_max()
+ if (order := extr.config("order-posts")) and \
+ order[0] in ("a", "r"):
+ key = "sinceId"
+ data["sinceDate"] = 1 if date_min is None else date_min * 1000
+ date_stop = None if date_max is None else date_max
+ else:
+ key = "untilId"
+ date_stop = None
+ if date_min is not None:
+ data["sinceDate"] = date_min * 1000
+ if date_max is None:
+ # ensure notes are returned in descending order
+ data["untilDate"] = (int(dt.time.time()) + 1000) * 1000
+ if date_max is not None:
+ data["untilDate"] = date_max * 1000
while True:
notes = self._call(endpoint, data)
if not notes:
return
- yield from notes
- data["untilId"] = notes[-1]["id"]
+ elif date_stop is not None and dt.to_ts(dt.parse_iso(
+ notes[-1]["createdAt"])) > date_stop:
+ for idx, note in enumerate(notes):
+ if dt.to_ts(dt.parse_iso(note["createdAt"])) > date_stop:
+ yield from notes[:idx]
+ return
+ else:
+ yield from notes
+
+ data[key] = notes[-1]["id"]
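The rewritten MisskeyAPI._pagination() above combines three mechanisms: the "order-posts" option selects ascending ("sinceId") or descending ("untilId") pagination, the extractor's date bounds are converted to millisecond timestamps for "sinceDate"/"untilDate", and in ascending mode iteration stops early once a note's "createdAt" passes date_stop. A minimal sketch of that early-stop loop, with a stubbed fetch() standing in for _call() and plain datetime in place of the dt helpers:

    from datetime import datetime

    def to_ts(iso):
        # epoch seconds from an ISO-8601 string such as Misskey's "createdAt"
        return datetime.fromisoformat(iso.replace("Z", "+00:00")).timestamp()

    def paginate(fetch, data, date_stop=None):
        # 'fetch' stands in for MisskeyAPI._call(); ascending order is
        # assumed here, so the pagination key is hard-coded to "sinceId"
        while True:
            notes = fetch(data)
            if not notes:
                return
            if (date_stop is not None
                    and to_ts(notes[-1]["createdAt"]) > date_stop):
                # the bound falls inside this page: yield up to it, then stop
                for idx, note in enumerate(notes):
                    if to_ts(note["createdAt"]) > date_stop:
                        yield from notes[:idx]
                        return
            yield from notes
            data["sinceId"] = notes[-1]["id"]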
diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py
index ba27994..23f8fd9 100644
--- a/gallery_dl/extractor/moebooru.py
+++ b/gallery_dl/extractor/moebooru.py
@@ -9,9 +9,8 @@
"""Extractors for Moebooru based sites"""
from .booru import BooruExtractor
-from .. import text, util
+from .. import text, dt
import collections
-import datetime
class MoebooruExtractor(BooruExtractor):
@@ -21,7 +20,7 @@ class MoebooruExtractor(BooruExtractor):
page_start = 1
def _prepare(self, post):
- post["date"] = text.parse_timestamp(post["created_at"])
+ post["date"] = dt.parse_ts(post["created_at"])
def _html(self, post):
url = f"{self.root}/post/show/{post['id']}"
@@ -33,7 +32,7 @@ class MoebooruExtractor(BooruExtractor):
return
tags = collections.defaultdict(list)
- pattern = util.re(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'+]+)")
+ pattern = text.re(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'+]+)")
for tag_type, tag_name in pattern.findall(tag_container):
tags[tag_type].append(text.unquote(tag_name))
for key, value in tags.items():
@@ -93,7 +92,7 @@ class MoebooruTagExtractor(MoebooruExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
- pattern = BASE_PATTERN + r"/post\?(?:[^&#]*&)*tags=([^&#]*)"
+ pattern = rf"{BASE_PATTERN}/post\?(?:[^&#]*&)*tags=([^&#]*)"
example = "https://yande.re/post?tags=TAG"
def __init__(self, match):
@@ -112,7 +111,7 @@ class MoebooruPoolExtractor(MoebooruExtractor):
subcategory = "pool"
directory_fmt = ("{category}", "pool", "{pool}")
archive_fmt = "p_{pool}_{id}"
- pattern = BASE_PATTERN + r"/pool/show/(\d+)"
+ pattern = rf"{BASE_PATTERN}/pool/show/(\d+)"
example = "https://yande.re/pool/show/12345"
def __init__(self, match):
@@ -136,7 +135,7 @@ class MoebooruPoolExtractor(MoebooruExtractor):
class MoebooruPostExtractor(MoebooruExtractor):
subcategory = "post"
archive_fmt = "{id}"
- pattern = BASE_PATTERN + r"/post/show/(\d+)"
+ pattern = rf"{BASE_PATTERN}/post/show/(\d+)"
example = "https://yande.re/post/show/12345"
def posts(self):
@@ -148,8 +147,8 @@ class MoebooruPopularExtractor(MoebooruExtractor):
subcategory = "popular"
directory_fmt = ("{category}", "popular", "{scale}", "{date}")
archive_fmt = "P_{scale[0]}_{date}_{id}"
- pattern = BASE_PATTERN + \
- r"/post/popular_(by_(?:day|week|month)|recent)(?:\?([^#]*))?"
+ pattern = (rf"{BASE_PATTERN}"
+ rf"/post/popular_(by_(?:day|week|month)|recent)(?:\?([^#]*))?")
example = "https://yande.re/post/popular_by_month?year=YYYY&month=MM"
def __init__(self, match):
@@ -164,14 +163,14 @@ class MoebooruPopularExtractor(MoebooruExtractor):
date = (f"{params['year']:>04}-{params.get('month', '01'):>02}-"
f"{params.get('day', '01'):>02}")
else:
- date = datetime.date.today().isoformat()
+ date = dt.date.today().isoformat()
scale = self.scale
if scale.startswith("by_"):
scale = scale[3:]
if scale == "week":
- date = datetime.date.fromisoformat(date)
- date = (date - datetime.timedelta(days=date.weekday())).isoformat()
+ date = dt.date.fromisoformat(date)
+ date = (date - dt.timedelta(days=date.weekday())).isoformat()
elif scale == "month":
date = date[:-3]
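Several files in this diff (moebooru, motherless, naverblog, newgrounds, nijie, nozomi, patreon) migrate from text.parse_datetime()/util/datetime to the new gallery_dl/dt.py module added in this release. Its source is not part of this hunk; judging purely from the call sites, it appears to re-export date/datetime/timedelta and wrap a few parsing helpers, roughly along these lines (an approximation for orientation, not the actual module):

    from datetime import date, datetime, timedelta, timezone  # re-exported

    NONE = datetime(1, 1, 1)  # assumed sentinel for unparsable dates

    def now():
        # naive UTC "now", matching the former util.datetime_utcnow()
        return datetime.now(timezone.utc).replace(tzinfo=None)

    def parse(dt_string, fmt):
        return datetime.strptime(dt_string, fmt)

    def parse_iso(dt_string):
        return datetime.fromisoformat(dt_string.replace("Z", "+00:00"))

    def parse_ts(ts):
        # from a Unix timestamp (int or numeric string)
        return datetime.fromtimestamp(
            int(ts), timezone.utc).replace(tzinfo=None)

    from_ts = parse_ts

    def to_ts(dt_obj):
        # back to epoch seconds
        return dt_obj.replace(tzinfo=timezone.utc).timestamp()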
diff --git a/gallery_dl/extractor/motherless.py b/gallery_dl/extractor/motherless.py
index 48137ce..c20f138 100644
--- a/gallery_dl/extractor/motherless.py
+++ b/gallery_dl/extractor/motherless.py
@@ -9,9 +9,8 @@
"""Extractors for https://motherless.com/"""
from .common import Extractor, Message
-from .. import text, util, exception
+from .. import text, dt, exception
from ..cache import memcache
-from datetime import timedelta
BASE_PATTERN = r"(?:https?://)?motherless\.com"
@@ -42,6 +41,8 @@ class MotherlessExtractor(Extractor):
path, _, media_id = path.rpartition("/")
data = {
"id" : media_id,
+ "title": text.unescape(
+ (t := extr("<title>", "<")) and t[:t.rfind(" | ")]),
"type" : extr("__mediatype = '", "'"),
"group": extr("__group = '", "'"),
"url" : extr("__fileurl = '", "'"),
@@ -50,7 +51,6 @@ class MotherlessExtractor(Extractor):
for tag in text.extract_iter(
extr('class="media-meta-tags">', "</div>"), ">#", "<")
],
- "title": text.unescape(extr("<h1>", "<")),
"views": text.parse_int(extr(
'class="count">', " ").replace(",", "")),
"favorites": text.parse_int(extr(
@@ -115,14 +115,14 @@ class MotherlessExtractor(Extractor):
return data
- def _parse_datetime(self, dt):
- if " ago" not in dt:
- return text.parse_datetime(dt, "%d %b %Y")
+ def _parse_datetime(self, dt_string):
+ if " ago" not in dt_string:
+ return dt.parse(dt_string, "%d %b %Y")
- value = text.parse_int(dt[:-5])
- delta = timedelta(0, value*3600) if dt[-5] == "h" else timedelta(value)
- return (util.datetime_utcnow() - delta).replace(
- hour=0, minute=0, second=0)
+ value = text.parse_int(dt_string[:-5])
+ delta = (dt.timedelta(0, value*3600) if dt_string[-5] == "h" else
+ dt.timedelta(value))
+ return (dt.now() - delta).replace(hour=0, minute=0, second=0)
@memcache(keyarg=2)
def _extract_gallery_title(self, page, gallery_id):
@@ -132,10 +132,9 @@ class MotherlessExtractor(Extractor):
if title:
return text.unescape(title.strip())
- pos = page.find(f' href="/G{gallery_id}"')
- if pos >= 0:
- return text.unescape(text.extract(
- page, ' title="', '"', pos)[0])
+ if f' href="/G{gallery_id}"' in page:
+ return text.unescape(
+ (t := text.extr(page, "<title>", "<")) and t[:t.rfind(" | ")])
return ""
@@ -153,15 +152,15 @@ class MotherlessExtractor(Extractor):
class MotherlessMediaExtractor(MotherlessExtractor):
"""Extractor for a single image/video from motherless.com"""
subcategory = "media"
- pattern = (BASE_PATTERN +
- r"/((?:g/[^/?#]+/|G[IV]?[A-Z0-9]+/)?"
- r"(?!G)[A-Z0-9]+)")
+ pattern = (rf"{BASE_PATTERN}/("
+ rf"(?:g/[^/?#]+/|G[IV]?[A-Z0-9]+/)?"
+ rf"(?!G)[A-Z0-9]+)")
example = "https://motherless.com/ABC123"
def items(self):
file = self._extract_media(self.groups[0])
url = file["url"]
- yield Message.Directory, file
+ yield Message.Directory, "", file
yield Message.Url, url, text.nameext_from_url(url, file)
@@ -171,7 +170,7 @@ class MotherlessGalleryExtractor(MotherlessExtractor):
directory_fmt = ("{category}", "{uploader}",
"{gallery_id} {gallery_title}")
archive_fmt = "{gallery_id}_{id}"
- pattern = BASE_PATTERN + "/G([IVG])?([A-Z0-9]+)/?$"
+ pattern = rf"{BASE_PATTERN}/G([IVG])?([A-Z0-9]+)/?$"
example = "https://motherless.com/GABC123"
def items(self):
@@ -198,7 +197,7 @@ class MotherlessGalleryExtractor(MotherlessExtractor):
file["num"] = num
file["thumbnail"] = thumbnail
url = file["url"]
- yield Message.Directory, file
+ yield Message.Directory, "", file
yield Message.Url, url, text.nameext_from_url(url, file)
@@ -207,7 +206,7 @@ class MotherlessGroupExtractor(MotherlessExtractor):
directory_fmt = ("{category}", "{uploader}",
"{group_id} {group_title}")
archive_fmt = "{group_id}_{id}"
- pattern = BASE_PATTERN + "/g([iv]?)/?([a-z0-9_]+)/?$"
+ pattern = rf"{BASE_PATTERN}/g([iv]?)/?([a-z0-9_]+)/?$"
example = "https://motherless.com/g/abc123"
def items(self):
@@ -236,5 +235,5 @@ class MotherlessGroupExtractor(MotherlessExtractor):
file["uploader"] = uploader
file["group"] = file["group_id"]
url = file["url"]
- yield Message.Directory, file
+ yield Message.Directory, "", file
yield Message.Url, url, text.nameext_from_url(url, file)
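Motherless shows recent upload dates as relative strings such as "12h ago" or "3d ago". The reworked _parse_datetime() above strips the five-character unit-plus-" ago" tail, interprets "h" as hours and anything else as days, and floors the result to midnight. The same logic in isolation, with an injectable reference time for testing:

    from datetime import datetime, timedelta

    def parse_relative(dt_string, now=None):
        # "12h ago" -> 12 hours back; "3d ago" -> 3 days back
        now = now or datetime.utcnow()
        value = int(dt_string[:-5])  # drop the trailing "h ago" / "d ago"
        delta = (timedelta(hours=value) if dt_string[-5] == "h" else
                 timedelta(days=value))
        return (now - delta).replace(hour=0, minute=0, second=0)

    parse_relative("12h ago", datetime(2025, 12, 20, 8, 0))
    # -> datetime(2025, 12, 19, 0, 0)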
diff --git a/gallery_dl/extractor/myhentaigallery.py b/gallery_dl/extractor/myhentaigallery.py
index 2a39dc9..936f857 100644
--- a/gallery_dl/extractor/myhentaigallery.py
+++ b/gallery_dl/extractor/myhentaigallery.py
@@ -6,17 +6,21 @@
"""Extractors for https://myhentaigallery.com/"""
-from .common import GalleryExtractor
+from .common import Extractor, GalleryExtractor, Message
from .. import text, exception
+BASE_PATTERN = r"(?:https?://)?myhentaigallery\.com"
-class MyhentaigalleryGalleryExtractor(GalleryExtractor):
- """Extractor for image galleries from myhentaigallery.com"""
+
+class MyhentaigalleryBase():
category = "myhentaigallery"
root = "https://myhentaigallery.com"
+
+
+class MyhentaigalleryGalleryExtractor(MyhentaigalleryBase, GalleryExtractor):
+ """Extractor for image galleries from myhentaigallery.com"""
directory_fmt = ("{category}", "{gallery_id} {artist:?[/] /J, }{title}")
- pattern = (r"(?:https?://)?myhentaigallery\.com"
- r"/g(?:allery/(?:thumbnails|show))?/(\d+)")
+ pattern = rf"{BASE_PATTERN}/g(?:allery/(?:thumbnails|show))?/(\d+)"
example = "https://myhentaigallery.com/g/12345"
def __init__(self, match):
@@ -53,3 +57,32 @@ class MyhentaigalleryGalleryExtractor(GalleryExtractor):
"/thumbnail/", "/original/"), None)
for url in text.extract_iter(page, 'class="comic-thumb"', '</div>')
]
+
+
+class MyhentaigalleryTagExtractor(MyhentaigalleryBase, Extractor):
+ """Extractor for myhentaigallery tag searches"""
+ subcategory = "tag"
+ pattern = rf"{BASE_PATTERN}(/g/(artist|category|group|parody)/(\d+).*)"
+ example = "https://myhentaigallery.com/g/category/123"
+
+ def items(self):
+ data = {"_extractor": MyhentaigalleryGalleryExtractor}
+ for url in self.galleries():
+ yield Message.Queue, url, data
+
+ def galleries(self):
+ root = self.root
+ url = root + self.groups[0]
+
+ while True:
+ page = self.request(url).text
+
+ for inner in text.extract_iter(
+ page, '<div class="comic-inner">', "<div"):
+ yield root + text.extr(inner, 'href="', '"')
+
+ try:
+ pos = page.index(">Next<")
+ except ValueError:
+ return
+ url = root + text.rextr(page, 'href="', '"', pos)
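The new tag extractor downloads nothing itself; it emits Message.Queue items whose "_extractor" entry points the job directly at MyhentaigalleryGalleryExtractor, skipping URL pattern matching for each queued gallery. A simplified sketch of how such queue messages are consumed (the real logic lives in gallery_dl/job.py and differs in detail):

    from gallery_dl.extractor.common import Message

    def handle_queue(extractor, process):
        # 'process' is a hypothetical callback that runs a child extractor
        for msg, url, kwdict in extractor:
            if msg is Message.Queue:
                if cls := kwdict.get("_extractor"):
                    # build the child directly from the given class
                    process(cls.from_url(url))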
diff --git a/gallery_dl/extractor/myportfolio.py b/gallery_dl/extractor/myportfolio.py
index 0223d0b..3a21122 100644
--- a/gallery_dl/extractor/myportfolio.py
+++ b/gallery_dl/extractor/myportfolio.py
@@ -49,7 +49,7 @@ class MyportfolioGalleryExtractor(Extractor):
data = self.metadata(page)
imgs = self.images(page)
data["count"] = len(imgs)
- yield Message.Directory, data
+ yield Message.Directory, "", data
for data["num"], url in enumerate(imgs, 1):
yield Message.Url, url, text.nameext_from_url(url, data)
diff --git a/gallery_dl/extractor/naverblog.py b/gallery_dl/extractor/naverblog.py
index b55e001..cc96e09 100644
--- a/gallery_dl/extractor/naverblog.py
+++ b/gallery_dl/extractor/naverblog.py
@@ -9,8 +9,7 @@
"""Extractors for https://blog.naver.com/"""
from .common import GalleryExtractor, Extractor, Message
-from .. import text, util
-import datetime
+from .. import text, util, dt
import time
@@ -67,11 +66,11 @@ class NaverBlogPostExtractor(NaverBlogBase, GalleryExtractor):
return data
- def _parse_datetime(self, date_string):
- if "ì „" in date_string:
+ def _parse_datetime(self, dt_string):
+ if "ì „" in dt_string:
ts = time.gmtime()
- return datetime.datetime(ts.tm_year, ts.tm_mon, ts.tm_mday)
- return text.parse_datetime(date_string, "%Y. %m. %d. %H:%M")
+ return dt.datetime(ts.tm_year, ts.tm_mon, ts.tm_mday)
+ return dt.parse(dt_string, "%Y. %m. %d. %H:%M")
def images(self, page):
files = []
diff --git a/gallery_dl/extractor/naverchzzk.py b/gallery_dl/extractor/naverchzzk.py
index de4ee7a..5b56710 100644
--- a/gallery_dl/extractor/naverchzzk.py
+++ b/gallery_dl/extractor/naverchzzk.py
@@ -31,17 +31,17 @@ class NaverChzzkExtractor(Extractor):
data["uid"] = data["objectId"]
data["user"] = comment["user"]
data["count"] = len(files)
- data["date"] = text.parse_datetime(
+ data["date"] = self.parse_datetime(
data["createdDate"], "%Y%m%d%H%M%S")
- yield Message.Directory, data
+ yield Message.Directory, "", data
for data["num"], file in enumerate(files, 1):
if extra := file.get("extraJson"):
file.update(util.json_loads(extra))
- file["date"] = text.parse_datetime(
- file["createdDate"], "%Y-%m-%dT%H:%M:%S.%f%z")
- file["date_updated"] = text.parse_datetime(
- file["updatedDate"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ file["date"] = self.parse_datetime_iso(
+ file["createdDate"])
+ file["date_updated"] = self.parse_datetime_iso(
+ file["updatedDate"])
data["file"] = file
url = file["attachValue"]
yield Message.Url, url, text.nameext_from_url(url, data)
diff --git a/gallery_dl/extractor/naverwebtoon.py b/gallery_dl/extractor/naverwebtoon.py
index 3211941..72089d0 100644
--- a/gallery_dl/extractor/naverwebtoon.py
+++ b/gallery_dl/extractor/naverwebtoon.py
@@ -27,7 +27,7 @@ class NaverWebtoonEpisodeExtractor(NaverWebtoonBase, GalleryExtractor):
directory_fmt = ("{category}", "{comic}")
filename_fmt = "{episode:>03}-{num:>02}.{extension}"
archive_fmt = "{title_id}_{episode}_{num}"
- pattern = BASE_PATTERN + r"/detail(?:\.nhn)?\?([^#]+)"
+ pattern = rf"{BASE_PATTERN}/detail(?:\.nhn)?\?([^#]+)"
example = "https://comic.naver.com/webtoon/detail?titleId=12345&no=1"
def __init__(self, match):
@@ -66,7 +66,7 @@ class NaverWebtoonEpisodeExtractor(NaverWebtoonBase, GalleryExtractor):
class NaverWebtoonComicExtractor(NaverWebtoonBase, Extractor):
subcategory = "comic"
categorytransfer = True
- pattern = BASE_PATTERN + r"/list(?:\.nhn)?\?([^#]+)"
+ pattern = rf"{BASE_PATTERN}/list(?:\.nhn)?\?([^#]+)"
example = "https://comic.naver.com/webtoon/list?titleId=12345"
def __init__(self, match):
diff --git a/gallery_dl/extractor/nekohouse.py b/gallery_dl/extractor/nekohouse.py
index e6b0461..728912b 100644
--- a/gallery_dl/extractor/nekohouse.py
+++ b/gallery_dl/extractor/nekohouse.py
@@ -12,7 +12,7 @@ from .common import Extractor, Message
from .. import text
BASE_PATTERN = r"(?:https?://)?nekohouse\.su"
-USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)"
+USER_PATTERN = rf"{BASE_PATTERN}/([^/?#]+)/user/([^/?#]+)"
class NekohouseExtractor(Extractor):
@@ -27,7 +27,7 @@ class NekohousePostExtractor(NekohouseExtractor):
"{post_id} {date} {title[b:230]}")
filename_fmt = "{num:>02} {id|filename}.{extension}"
archive_fmt = "{service}_{user_id}_{post_id}_{hash}"
- pattern = USER_PATTERN + r"/post/([^/?#]+)"
+ pattern = rf"{USER_PATTERN}/post/([^/?#]+)"
example = "https://nekohouse.su/SERVICE/user/12345/post/12345"
def items(self):
@@ -42,7 +42,7 @@ class NekohousePostExtractor(NekohouseExtractor):
post["post_id"] = post_id
post["count"] = len(files)
- yield Message.Directory, post
+ yield Message.Directory, "", post
for post["num"], file in enumerate(files, 1):
url = file["url"]
text.nameext_from_url(url, file)
@@ -59,8 +59,8 @@ class NekohousePostExtractor(NekohouseExtractor):
'class="scrape__user-name', '</').rpartition(">")[2].strip()),
"title" : text.unescape(extr(
'class="scrape__title', '</').rpartition(">")[2]),
- "date" : text.parse_datetime(extr(
- 'datetime="', '"')[:19], "%Y-%m-%d %H:%M:%S"),
+ "date" : self.parse_datetime_iso(extr(
+ 'datetime="', '"')[:19]),
"content": text.unescape(extr(
'class="scrape__content">', "</div>").strip()),
}
@@ -98,7 +98,7 @@ class NekohousePostExtractor(NekohouseExtractor):
class NekohouseUserExtractor(NekohouseExtractor):
subcategory = "user"
- pattern = USER_PATTERN + r"/?(?:\?([^#]+))?(?:$|\?|#)"
+ pattern = rf"{USER_PATTERN}/?(?:\?([^#]+))?(?:$|\?|#)"
example = "https://nekohouse.su/SERVICE/user/12345"
def items(self):
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index ffb4cad..f980f4b 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -9,7 +9,7 @@
"""Extractors for https://www.newgrounds.com/"""
from .common import Extractor, Message, Dispatch
-from .. import text, util, exception
+from .. import text, util, dt, exception
from ..cache import cache
import itertools
@@ -34,7 +34,7 @@ class NewgroundsExtractor(Extractor):
self.user_root = f"https://{self.user}.newgrounds.com"
def _init(self):
- self._extract_comment_urls = util.re(
+ self._extract_comment_urls = text.re(
r'(?:<img |data-smartload-)src="([^"]+)').findall
self.flash = self.config("flash", True)
@@ -58,13 +58,13 @@ class NewgroundsExtractor(Extractor):
post = self.extract_post(post_url)
url = post.get("url")
except Exception as exc:
- self.log.debug("", exc_info=exc)
+ self.log.traceback(exc)
url = None
if url:
if metadata:
post.update(metadata)
- yield Message.Directory, post
+ yield Message.Directory, "", post
post["num"] = 0
yield Message.Url, url, text.nameext_from_url(url, post)
@@ -88,6 +88,7 @@ class NewgroundsExtractor(Extractor):
text.nameext_from_url(url, post)
yield Message.Url, url, post
else:
+ self.status |= 1
self.log.warning(
"Unable to get download URL for '%s'", post_url)
@@ -218,7 +219,7 @@ class NewgroundsExtractor(Extractor):
"description": text.unescape(extr(':description" content="', '"')),
"type" : "art",
"_type" : "i",
- "date" : text.parse_datetime(extr(
+ "date" : dt.parse_iso(extr(
'itemprop="datePublished" content="', '"')),
"rating" : extr('class="rated-', '"'),
"url" : full('src="', '"'),
@@ -268,7 +269,7 @@ class NewgroundsExtractor(Extractor):
"description": text.unescape(extr(':description" content="', '"')),
"type" : "audio",
"_type" : "a",
- "date" : text.parse_datetime(extr(
+ "date" : dt.parse_iso(extr(
'itemprop="datePublished" content="', '"')),
"url" : extr('{"url":"', '"').replace("\\/", "/"),
"index" : text.parse_int(index),
@@ -287,7 +288,7 @@ class NewgroundsExtractor(Extractor):
src = src.replace("\\/", "/")
formats = ()
type = extr(',"description":"', '"')
- date = text.parse_datetime(extr(
+ date = dt.parse_iso(extr(
'itemprop="datePublished" content="', '"'))
if type:
type = type.rpartition(" ")[2].lower()
@@ -302,7 +303,7 @@ class NewgroundsExtractor(Extractor):
sources = self.request_json(url, headers=headers)["sources"]
formats = self._video_formats(sources)
src = next(formats, "")
- date = text.parse_timestamp(src.rpartition("?")[2])
+ date = self.parse_timestamp(src.rpartition("?")[2])
type = "movie"
return {
@@ -321,7 +322,7 @@ class NewgroundsExtractor(Extractor):
def _video_formats(self, sources):
src = sources["360p"][0]["src"]
- sub = util.re(r"\.360p\.\w+").sub
+ sub = text.re(r"\.360p\.\w+").sub
for fmt in self.format:
try:
@@ -411,7 +412,7 @@ class NewgroundsImageExtractor(NewgroundsExtractor):
class NewgroundsMediaExtractor(NewgroundsExtractor):
"""Extractor for a media file from newgrounds.com"""
subcategory = "media"
- pattern = BASE_PATTERN + r"(/(?:portal/view|audio/listen)/\d+)"
+ pattern = rf"{BASE_PATTERN}(/(?:portal/view|audio/listen)/\d+)"
example = "https://www.newgrounds.com/portal/view/12345"
def __init__(self, match):
@@ -426,34 +427,34 @@ class NewgroundsMediaExtractor(NewgroundsExtractor):
class NewgroundsArtExtractor(NewgroundsExtractor):
"""Extractor for all images of a newgrounds user"""
subcategory = _path = "art"
- pattern = USER_PATTERN + r"/art(?:(?:/page/|/?\?page=)(\d+))?/?$"
+ pattern = rf"{USER_PATTERN}/art(?:(?:/page/|/?\?page=)(\d+))?/?$"
example = "https://USER.newgrounds.com/art"
class NewgroundsAudioExtractor(NewgroundsExtractor):
"""Extractor for all audio submissions of a newgrounds user"""
subcategory = _path = "audio"
- pattern = USER_PATTERN + r"/audio(?:(?:/page/|/?\?page=)(\d+))?/?$"
+ pattern = rf"{USER_PATTERN}/audio(?:(?:/page/|/?\?page=)(\d+))?/?$"
example = "https://USER.newgrounds.com/audio"
class NewgroundsMoviesExtractor(NewgroundsExtractor):
"""Extractor for all movies of a newgrounds user"""
subcategory = _path = "movies"
- pattern = USER_PATTERN + r"/movies(?:(?:/page/|/?\?page=)(\d+))?/?$"
+ pattern = rf"{USER_PATTERN}/movies(?:(?:/page/|/?\?page=)(\d+))?/?$"
example = "https://USER.newgrounds.com/movies"
class NewgroundsGamesExtractor(NewgroundsExtractor):
"""Extractor for a newgrounds user's games"""
subcategory = _path = "games"
- pattern = USER_PATTERN + r"/games(?:(?:/page/|/?\?page=)(\d+))?/?$"
+ pattern = rf"{USER_PATTERN}/games(?:(?:/page/|/?\?page=)(\d+))?/?$"
example = "https://USER.newgrounds.com/games"
class NewgroundsUserExtractor(Dispatch, NewgroundsExtractor):
"""Extractor for a newgrounds user profile"""
- pattern = USER_PATTERN + r"/?$"
+ pattern = rf"{USER_PATTERN}/?$"
example = "https://USER.newgrounds.com"
def items(self):
@@ -470,7 +471,7 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor):
"""Extractor for posts favorited by a newgrounds user"""
subcategory = "favorite"
directory_fmt = ("{category}", "{user}", "Favorites")
- pattern = (USER_PATTERN + r"/favorites(?!/following)(?:/(art|audio|movies)"
+ pattern = (rf"{USER_PATTERN}/favorites(?!/following)(?:/(art|audio|movies)"
r"(?:(?:/page/|/?\?page=)(\d+))?)?")
example = "https://USER.newgrounds.com/favorites"
@@ -516,7 +517,7 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor):
class NewgroundsFollowingExtractor(NewgroundsFavoriteExtractor):
"""Extractor for a newgrounds user's favorited users"""
subcategory = "following"
- pattern = (USER_PATTERN + r"/favorites/(following)"
+ pattern = (rf"{USER_PATTERN}/favorites/(following)"
r"(?:(?:/page/|/?\?page=)(\d+))?")
example = "https://USER.newgrounds.com/favorites/following"
@@ -538,7 +539,7 @@ class NewgroundsSearchExtractor(NewgroundsExtractor):
"""Extractor for newgrounds.com search reesults"""
subcategory = "search"
directory_fmt = ("{category}", "search", "{search_tags}")
- pattern = BASE_PATTERN + r"/search/conduct/([^/?#]+)/?\?([^#]+)"
+ pattern = rf"{BASE_PATTERN}/search/conduct/([^/?#]+)/?\?([^#]+)"
example = "https://www.newgrounds.com/search/conduct/art?terms=QUERY"
def __init__(self, match):
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
index c6df835..a6b01c2 100644
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -9,7 +9,7 @@
"""Extractors for nijie instances"""
from .common import BaseExtractor, Message, Dispatch, AsynchronousMixin
-from .. import text, exception
+from .. import text, dt, exception
from ..cache import cache
@@ -59,7 +59,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
urls = self._extract_images(image_id, page)
data["count"] = len(urls)
- yield Message.Directory, data
+ yield Message.Directory, "", data
for num, url in enumerate(urls):
image = text.nameext_from_url(url, {
"num": num,
@@ -82,8 +82,9 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
"title" : keywords[0].strip(),
"description": text.unescape(extr(
'"description": "', '"').replace("&amp;", "&")),
- "date" : text.parse_datetime(extr(
- '"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y", 9),
+ "date" : dt.parse(extr(
+ '"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y"
+ ) - dt.timedelta(hours=9),
"artist_id" : text.parse_int(extr('/members.php?id=', '"')),
"artist_name": keywords[1],
"tags" : keywords[2:-1],
@@ -101,9 +102,9 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
"artist_id" : text.parse_int(extr('members.php?id=', '"')),
"artist_name": keywords[1],
"tags" : keywords[2:-1],
- "date" : text.parse_datetime(extr(
- "itemprop='datePublished' content=", "<").rpartition(">")[2],
- "%Y-%m-%d %H:%M:%S", 9),
+ "date" : dt.parse_iso(extr(
+ "itemprop='datePublished' content=", "<").rpartition(">")[2]
+ ) - dt.timedelta(hours=9),
}
def _extract_images(self, image_id, page):
@@ -177,7 +178,7 @@ BASE_PATTERN = NijieExtractor.update({
class NijieUserExtractor(Dispatch, NijieExtractor):
"""Extractor for nijie user profiles"""
- pattern = BASE_PATTERN + r"/members\.php\?id=(\d+)"
+ pattern = rf"{BASE_PATTERN}/members\.php\?id=(\d+)"
example = "https://nijie.info/members.php?id=12345"
def items(self):
@@ -193,7 +194,7 @@ class NijieUserExtractor(Dispatch, NijieExtractor):
class NijieIllustrationExtractor(NijieExtractor):
"""Extractor for all illustrations of a nijie-user"""
subcategory = "illustration"
- pattern = BASE_PATTERN + r"/members_illust\.php\?id=(\d+)"
+ pattern = rf"{BASE_PATTERN}/members_illust\.php\?id=(\d+)"
example = "https://nijie.info/members_illust.php?id=12345"
def image_ids(self):
@@ -203,7 +204,7 @@ class NijieIllustrationExtractor(NijieExtractor):
class NijieDoujinExtractor(NijieExtractor):
"""Extractor for doujin entries of a nijie user"""
subcategory = "doujin"
- pattern = BASE_PATTERN + r"/members_dojin\.php\?id=(\d+)"
+ pattern = rf"{BASE_PATTERN}/members_dojin\.php\?id=(\d+)"
example = "https://nijie.info/members_dojin.php?id=12345"
def image_ids(self):
@@ -215,7 +216,7 @@ class NijieFavoriteExtractor(NijieExtractor):
subcategory = "favorite"
directory_fmt = ("{category}", "bookmarks", "{user_id}")
archive_fmt = "f_{user_id}_{image_id}_{num}"
- pattern = BASE_PATTERN + r"/user_like_illust_view\.php\?id=(\d+)"
+ pattern = rf"{BASE_PATTERN}/user_like_illust_view\.php\?id=(\d+)"
example = "https://nijie.info/user_like_illust_view.php?id=12345"
def image_ids(self):
@@ -233,7 +234,7 @@ class NijieNuitaExtractor(NijieExtractor):
subcategory = "nuita"
directory_fmt = ("{category}", "nuita", "{user_id}")
archive_fmt = "n_{user_id}_{image_id}_{num}"
- pattern = BASE_PATTERN + r"/history_nuita\.php\?id=(\d+)"
+ pattern = rf"{BASE_PATTERN}/history_nuita\.php\?id=(\d+)"
example = "https://nijie.info/history_nuita.php?id=12345"
def image_ids(self):
@@ -252,7 +253,7 @@ class NijieNuitaExtractor(NijieExtractor):
class NijieFeedExtractor(NijieExtractor):
"""Extractor for nijie liked user feed"""
subcategory = "feed"
- pattern = BASE_PATTERN + r"/like_user_view\.php"
+ pattern = rf"{BASE_PATTERN}/like_user_view\.php"
example = "https://nijie.info/like_user_view.php"
def image_ids(self):
@@ -265,7 +266,7 @@ class NijieFeedExtractor(NijieExtractor):
class NijieFollowedExtractor(NijieExtractor):
"""Extractor for followed nijie users"""
subcategory = "followed"
- pattern = BASE_PATTERN + r"/like_my\.php"
+ pattern = rf"{BASE_PATTERN}/like_my\.php"
example = "https://nijie.info/like_my.php"
def items(self):
@@ -291,7 +292,7 @@ class NijieFollowedExtractor(NijieExtractor):
class NijieImageExtractor(NijieExtractor):
"""Extractor for a nijie work/image"""
subcategory = "image"
- pattern = BASE_PATTERN + r"/view(?:_popup)?\.php\?id=(\d+)"
+ pattern = rf"{BASE_PATTERN}/view(?:_popup)?\.php\?id=(\d+)"
example = "https://nijie.info/view.php?id=12345"
def image_ids(self):
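Nijie timestamps are given in JST (UTC+9). The old code passed 9 as a UTC-offset argument to text.parse_datetime(); the new code parses first and then subtracts a timedelta, which is equivalent:

    from datetime import datetime, timedelta

    jst = datetime.strptime(
        "Fri Mar 01 12:00:00 2024", "%a %b %d %H:%M:%S %Y")
    utc = jst - timedelta(hours=9)
    print(utc)  # 2024-03-01 03:00:00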
diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py
index 69d8299..321883c 100644
--- a/gallery_dl/extractor/nitter.py
+++ b/gallery_dl/extractor/nitter.py
@@ -97,7 +97,7 @@ class NitterExtractor(BaseExtractor):
files = ()
tweet["count"] = len(files)
- yield Message.Directory, tweet
+ yield Message.Directory, "", tweet
for tweet["num"], file in enumerate(files, 1):
url = file["url"]
file.update(tweet)
@@ -114,7 +114,7 @@ class NitterExtractor(BaseExtractor):
return {
"author" : author,
"user" : self.user_obj or author,
- "date" : text.parse_datetime(
+ "date" : self.parse_datetime(
extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"),
"tweet_id": link.rpartition("/")[2].partition("#")[0],
"content": extr('class="tweet-content', "</div").partition(">")[2],
@@ -142,7 +142,7 @@ class NitterExtractor(BaseExtractor):
return {
"author" : author,
"user" : self.user_obj or author,
- "date" : text.parse_datetime(
+ "date" : self.parse_datetime(
extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"),
"tweet_id": link.rpartition("/")[2].partition("#")[0],
"content" : extr('class="quote-text', "</div").partition(">")[2],
@@ -173,7 +173,7 @@ class NitterExtractor(BaseExtractor):
"nick" : extr('title="', '"'),
"name" : extr('title="@', '"'),
"description" : extr('<p dir="auto">', '<'),
- "date" : text.parse_datetime(
+ "date" : self.parse_datetime(
extr('class="profile-joindate"><span title="', '"'),
"%I:%M %p - %d %b %Y"),
"statuses_count" : text.parse_int(extr(
@@ -229,12 +229,12 @@ class NitterExtractor(BaseExtractor):
BASE_PATTERN = NitterExtractor.update({
})
-USER_PATTERN = BASE_PATTERN + r"/(i(?:/user/|d:)(\d+)|[^/?#]+)"
+USER_PATTERN = rf"{BASE_PATTERN}/(i(?:/user/|d:)(\d+)|[^/?#]+)"
class NitterTweetsExtractor(NitterExtractor):
subcategory = "tweets"
- pattern = USER_PATTERN + r"(?:/tweets)?(?:$|\?|#)"
+ pattern = rf"{USER_PATTERN}(?:/tweets)?(?:$|\?|#)"
example = "https://nitter.net/USER"
def tweets(self):
@@ -243,7 +243,7 @@ class NitterTweetsExtractor(NitterExtractor):
class NitterRepliesExtractor(NitterExtractor):
subcategory = "replies"
- pattern = USER_PATTERN + r"/with_replies"
+ pattern = rf"{USER_PATTERN}/with_replies"
example = "https://nitter.net/USER/with_replies"
def tweets(self):
@@ -252,7 +252,7 @@ class NitterRepliesExtractor(NitterExtractor):
class NitterMediaExtractor(NitterExtractor):
subcategory = "media"
- pattern = USER_PATTERN + r"/media"
+ pattern = rf"{USER_PATTERN}/media"
example = "https://nitter.net/USER/media"
def tweets(self):
@@ -261,7 +261,7 @@ class NitterMediaExtractor(NitterExtractor):
class NitterSearchExtractor(NitterExtractor):
subcategory = "search"
- pattern = USER_PATTERN + r"/search"
+ pattern = rf"{USER_PATTERN}/search"
example = "https://nitter.net/USER/search"
def tweets(self):
@@ -274,7 +274,7 @@ class NitterTweetExtractor(NitterExtractor):
directory_fmt = ("{category}", "{user[name]}")
filename_fmt = "{tweet_id}_{num}.{extension}"
archive_fmt = "{tweet_id}_{num}"
- pattern = BASE_PATTERN + r"/(i/web|[^/?#]+)/status/(\d+())"
+ pattern = rf"{BASE_PATTERN}/(i/web|[^/?#]+)/status/(\d+())"
example = "https://nitter.net/USER/status/12345"
def tweets(self):
diff --git a/gallery_dl/extractor/noop.py b/gallery_dl/extractor/noop.py
index df2316c..fe88e63 100644
--- a/gallery_dl/extractor/noop.py
+++ b/gallery_dl/extractor/noop.py
@@ -8,7 +8,7 @@
"""noop extractor"""
-from .common import Extractor, Message
+from .common import Extractor
class NoopExtractor(Extractor):
@@ -17,11 +17,9 @@ class NoopExtractor(Extractor):
example = "noop"
def items(self):
- # yield *something* to prevent a 'No results' message
- yield Message.Version, 1
-
# Save cookies manually, since it happens automatically only after
# extended extractor initialization, i.e. Message.Directory, which
# itself might cause some unintended effects.
if self.cookies:
self.cookies_store()
+ return iter(((-1, "", None),))
diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py
index 528aff2..fdd3594 100644
--- a/gallery_dl/extractor/nozomi.py
+++ b/gallery_dl/extractor/nozomi.py
@@ -9,7 +9,7 @@
"""Extractors for https://nozomi.la/"""
from .common import Extractor, Message
-from .. import text
+from .. import text, dt
def decode_nozomi(n):
@@ -49,10 +49,9 @@ class NozomiExtractor(Extractor):
post["character"] = self._list(post.get("character"))
try:
- post["date"] = text.parse_datetime(
- post["date"] + ":00", "%Y-%m-%d %H:%M:%S%z")
+ post["date"] = dt.parse_iso(post["date"] + ":00")
except Exception:
- post["date"] = None
+ post["date"] = dt.NONE
post.update(data)
@@ -61,7 +60,7 @@ class NozomiExtractor(Extractor):
if key in post:
del post[key]
- yield Message.Directory, post
+ yield Message.Directory, "", post
for post["num"], image in enumerate(images, 1):
post["filename"] = post["dataid"] = did = image["dataid"]
post["is_video"] = video = \
diff --git a/gallery_dl/extractor/nudostar.py b/gallery_dl/extractor/nudostar.py
index 467d36a..2eb4340 100644
--- a/gallery_dl/extractor/nudostar.py
+++ b/gallery_dl/extractor/nudostar.py
@@ -21,7 +21,7 @@ class NudostarExtractor(GalleryExtractor):
class NudostarModelExtractor(NudostarExtractor):
"""Extractor for NudoStar models"""
subcategory = "model"
- pattern = BASE_PATTERN + r"(/models/([^/?#]+)/?)$"
+ pattern = rf"{BASE_PATTERN}(/models/([^/?#]+)/?)$"
example = "https://nudostar.tv/models/MODEL/"
def metadata(self, page):
@@ -53,7 +53,7 @@ class NudostarModelExtractor(NudostarExtractor):
class NudostarImageExtractor(NudostarExtractor):
"""Extractor for NudoStar images"""
subcategory = "image"
- pattern = BASE_PATTERN + r"(/models/([^/?#]+)/(\d+)/)"
+ pattern = rf"{BASE_PATTERN}(/models/([^/?#]+)/(\d+)/)"
example = "https://nudostar.tv/models/MODEL/123/"
def items(self):
@@ -67,5 +67,5 @@ class NudostarImageExtractor(NudostarExtractor):
data["num"] = text.parse_int(self.groups[2])
data["url"] = img_url
- yield Message.Directory, data
+ yield Message.Directory, "", data
yield Message.Url, img_url, data
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index ff192c2..a0e3c9f 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -8,16 +8,14 @@
"""Utility classes to setup OAuth and link accounts to gallery-dl"""
-from .common import Extractor, Message
+from .common import Extractor
from .. import text, oauth, util, config, exception
from ..output import stdout_write
from ..cache import cache, memcache
-import urllib.parse
-import binascii
-import hashlib
REDIRECT_URI_LOCALHOST = "http://localhost:6414/"
REDIRECT_URI_HTTPS = "https://mikf.github.io/gallery-dl/oauth-redirect.html"
+NOOP = ((-1, "", None),)
class OAuthBase(Extractor):
@@ -86,7 +84,7 @@ class OAuthBase(Extractor):
def open(self, url, params, recv=None):
"""Open 'url' in browser amd return response parameters"""
- url += "?" + urllib.parse.urlencode(params)
+ url = f"{url}?{text.build_query(params)}"
if browser := self.config("browser", True):
try:
@@ -257,16 +255,18 @@ class OAuthFlickr(OAuthBase):
redirect_uri = REDIRECT_URI_HTTPS
def items(self):
- yield Message.Version, 1
- from . import flickr
+ # from . import flickr
self._oauth1_authorization_flow(
- flickr.FlickrAPI.API_KEY,
- flickr.FlickrAPI.API_SECRET,
+ # flickr.FlickrAPI.API_KEY,
+ # flickr.FlickrAPI.API_SECRET,
+ "",
+ "",
"https://www.flickr.com/services/oauth/request_token",
"https://www.flickr.com/services/oauth/authorize",
"https://www.flickr.com/services/oauth/access_token",
)
+ return iter(NOOP)
class OAuthSmugmug(OAuthBase):
@@ -275,7 +275,6 @@ class OAuthSmugmug(OAuthBase):
example = "oauth:smugmug"
def items(self):
- yield Message.Version, 1
from . import smugmug
self._oauth1_authorization_flow(
@@ -285,6 +284,7 @@ class OAuthSmugmug(OAuthBase):
"https://api.smugmug.com/services/oauth/1.0a/authorize",
"https://api.smugmug.com/services/oauth/1.0a/getAccessToken",
)
+ return iter(NOOP)
class OAuthTumblr(OAuthBase):
@@ -293,7 +293,6 @@ class OAuthTumblr(OAuthBase):
example = "oauth:tumblr"
def items(self):
- yield Message.Version, 1
from . import tumblr
self._oauth1_authorization_flow(
@@ -303,6 +302,7 @@ class OAuthTumblr(OAuthBase):
"https://www.tumblr.com/oauth/authorize",
"https://www.tumblr.com/oauth/access_token",
)
+ return iter(NOOP)
# --------------------------------------------------------------------
@@ -315,7 +315,6 @@ class OAuthDeviantart(OAuthBase):
redirect_uri = REDIRECT_URI_HTTPS
def items(self):
- yield Message.Version, 1
from . import deviantart
self._oauth2_authorization_code_grant(
@@ -328,6 +327,7 @@ class OAuthDeviantart(OAuthBase):
scope="browse user.manage",
cache=deviantart._refresh_token_cache,
)
+ return iter(NOOP)
class OAuthReddit(OAuthBase):
@@ -336,7 +336,6 @@ class OAuthReddit(OAuthBase):
example = "oauth:reddit"
def items(self):
- yield Message.Version, 1
from . import reddit
self.session.headers["User-Agent"] = reddit.RedditAPI.USER_AGENT
@@ -350,6 +349,7 @@ class OAuthReddit(OAuthBase):
scope="read history",
cache=reddit._refresh_token_cache,
)
+ return iter(NOOP)
class OAuthMastodon(OAuthBase):
@@ -362,7 +362,6 @@ class OAuthMastodon(OAuthBase):
self.instance = match[1]
def items(self):
- yield Message.Version, 1
from . import mastodon
for _, root, application in mastodon.MastodonExtractor.instances:
@@ -382,6 +381,7 @@ class OAuthMastodon(OAuthBase):
key="access_token",
cache=mastodon._access_token_cache,
)
+ return iter(NOOP)
@cache(maxage=36500*86400, keyarg=1)
def _register(self, instance):
@@ -416,8 +416,9 @@ class OAuthPixiv(OAuthBase):
example = "oauth:pixiv"
def items(self):
- yield Message.Version, 1
from . import pixiv
+ import binascii
+ import hashlib
code_verifier = util.generate_token(32)
digest = hashlib.sha256(code_verifier.encode()).digest()
@@ -464,6 +465,7 @@ class OAuthPixiv(OAuthBase):
self.log.info("Writing 'refresh-token' to cache")
stdout_write(self._generate_message(("refresh-token",), (token,)))
+ return iter(NOOP)
def _input_code(self):
stdout_write("""\
diff --git a/gallery_dl/extractor/okporn.py b/gallery_dl/extractor/okporn.py
new file mode 100644
index 0000000..e03f7cb
--- /dev/null
+++ b/gallery_dl/extractor/okporn.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://ok.porn/"""
+
+from .common import GalleryExtractor
+from .. import text
+
+
+class OkpornGalleryExtractor(GalleryExtractor):
+ """Extractor for image galleries from ok.porn"""
+ category = "okporn"
+ root = "https://ok.porn"
+ pattern = r"(?:https?://)?(?:www\.)?ok\.porn/albums/(\d+)"
+ example = "https://ok.porn/albums/12345/"
+
+ def __init__(self, match):
+ url = f"{self.root}/albums/{match[1]}/"
+ GalleryExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ return {
+ "gallery_id" : text.parse_int(self.groups[0]),
+ "title" : text.unescape(text.extr(
+ page, "h1 class=title>", "</h1>")),
+ "description": text.unescape(text.extr(
+ page, 'name="description" content="', '"')),
+ "tags": text.extr(
+ page, 'name="keywords" content="', '"').split(", "),
+ }
+
+ def images(self, page):
+ return [
+ (url, None)
+ for url in text.extract_iter(page, 'data-original="', '"')
+ ]
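OkpornGalleryExtractor implements only metadata() and images(); the download loop comes from GalleryExtractor. Simplified, that inherited items() presumably behaves like the MyportfolioGalleryExtractor hunk earlier in this diff (a sketch, not the code in extractor/common.py):

    def items(self):
        page = self.request(self.gallery_url).text
        data = self.metadata(page)
        imgs = self.images(page)  # list of (url, info_or_None) pairs
        data["count"] = len(imgs)
        yield Message.Directory, "", data
        for data["num"], (url, info) in enumerate(imgs, 1):
            if info:
                data.update(info)
            yield Message.Url, url, text.nameext_from_url(url, data)

Usage: gallery-dl "https://ok.porn/albums/12345/"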
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index 490243a..d56331f 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -31,7 +31,7 @@ class PahealExtractor(Extractor):
post["width"] = text.parse_int(post["width"])
post["height"] = text.parse_int(post["height"])
post.update(data)
- yield Message.Directory, post
+ yield Message.Directory, "", post
yield Message.Url, post["file_url"], post
def get_metadata(self):
@@ -53,8 +53,7 @@ class PahealExtractor(Extractor):
extr("<source src='", "'")),
"uploader": text.unquote(extr(
"class='username' href='/user/", "'")),
- "date" : text.parse_datetime(
- extr("datetime='", "'"), "%Y-%m-%dT%H:%M:%S%z"),
+ "date" : self.parse_datetime_iso(extr("datetime='", "'")),
"source" : text.unescape(text.extr(
extr(">Source Link<", "</td>"), "href='", "'")),
}
@@ -133,7 +132,7 @@ class PahealTagExtractor(PahealExtractor):
"duration" : text.parse_float(duration[:-1]),
"tags" : text.unescape(tags),
"size" : text.parse_bytes(size[:-1]),
- "date" : text.parse_datetime(date, "%B %d, %Y; %H:%M"),
+ "date" : self.parse_datetime(date, "%B %d, %Y; %H:%M"),
"filename" : f"{pid} - {tags}",
"extension": ext,
}
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index cf1a6d6..12dfd48 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -9,7 +9,7 @@
"""Extractors for https://www.patreon.com/"""
from .common import Extractor, Message
-from .. import text, util, exception
+from .. import text, util, dt, exception
from ..cache import memcache
import collections
import itertools
@@ -46,20 +46,21 @@ class PatreonExtractor(Extractor):
for post in self.posts():
- yield Message.Directory, post
+ yield Message.Directory, "", post
if not post.get("current_user_can_view", True):
self.log.warning("Not allowed to view post %s", post["id"])
continue
post["num"] = 0
hashes = set()
- for kind, url, name in itertools.chain.from_iterable(
+ for kind, file, url, name in itertools.chain.from_iterable(
g(post) for g in generators):
fhash = self._filehash(url)
if fhash not in hashes or not fhash:
hashes.add(fhash)
post["hash"] = fhash
post["type"] = kind
+ post["file"] = file
post["num"] += 1
text.nameext_from_url(name, post)
if text.ext_from_url(url) == "m3u8":
@@ -86,7 +87,7 @@ class PatreonExtractor(Extractor):
name = url
else:
name = self._filename(url) or url
- return (("postfile", url, name),)
+ return (("postfile", postfile, url, name),)
return ()
def _images(self, post):
@@ -94,7 +95,7 @@ class PatreonExtractor(Extractor):
for image in images:
if url := self._images_url(image):
name = image.get("file_name") or self._filename(url) or url
- yield "image", url, name
+ yield "image", image, url, name
def _images_url(self, image):
return image.get("download_url")
@@ -109,24 +110,24 @@ class PatreonExtractor(Extractor):
if image := post.get("image"):
if url := image.get("large_url"):
name = image.get("file_name") or self._filename(url) or url
- return (("image_large", url, name),)
+ return (("image_large", image, url, name),)
return ()
def _attachments(self, post):
for attachment in post.get("attachments") or ():
if url := self.request_location(attachment["url"], fatal=False):
- yield "attachment", url, attachment["name"]
+ yield "attachment", attachment, url, attachment["name"]
for attachment in post.get("attachments_media") or ():
if url := attachment.get("download_url"):
- yield "attachment", url, attachment["file_name"]
+ yield "attachment", attachment, url, attachment["file_name"]
def _content(self, post):
if content := post.get("content"):
for img in text.extract_iter(
content, '<img data-media-id="', '>'):
if url := text.extr(img, 'src="', '"'):
- yield "content", url, self._filename(url) or url
+ yield "content", None, url, self._filename(url) or url
def posts(self):
"""Return all relevant post objects"""
@@ -177,8 +178,7 @@ class PatreonExtractor(Extractor):
post, included, "attachments")
attr["attachments_media"] = self._files(
post, included, "attachments_media")
- attr["date"] = text.parse_datetime(
- attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ attr["date"] = self.parse_datetime_iso(attr["published_at"])
try:
attr["campaign"] = (included["campaign"][
@@ -226,8 +226,7 @@ class PatreonExtractor(Extractor):
user = response.json()["data"]
attr = user["attributes"]
attr["id"] = user["id"]
- attr["date"] = text.parse_datetime(
- attr["created"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ attr["date"] = self.parse_datetime_iso(attr["created"])
return attr
def _collection(self, collection_id):
@@ -236,8 +235,7 @@ class PatreonExtractor(Extractor):
coll = data["data"]
attr = coll["attributes"]
attr["id"] = coll["id"]
- attr["date"] = text.parse_datetime(
- attr["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ attr["date"] = self.parse_datetime_iso(attr["created_at"])
return attr
def _filename(self, url):
@@ -256,7 +254,7 @@ class PatreonExtractor(Extractor):
return part
return ""
- def _build_url(self, endpoint, query):
+ def _build_url(self, endpoint, sort, query):
return (
f"https://www.patreon.com/api/{endpoint}"
@@ -291,11 +289,20 @@ class PatreonExtractor(Extractor):
"preview_views,video_duration"
f"&page[cursor]={self._init_cursor()}"
- f"{query}"
+ f"{query}{self._order(sort)}"
"&json-api-version=1.0"
)
+ def _order(self, sort):
+ if order := self.config("order-posts"):
+ if order in {"d", "desc"}:
+ order = "-published_at"
+ elif order in {"a", "asc", "r", "reverse"}:
+ order = "published_at"
+ return f"&sort={order}"
+ return f"&sort={sort}" if sort else ""
+
def _build_file_generators(self, filetypes):
if filetypes is None:
return (self._images, self._image_large,
@@ -358,17 +365,26 @@ class PatreonCollectionExtractor(PatreonExtractor):
campaign_id = text.extr(
collection["thumbnail"]["url"], "/campaign/", "/")
- url = self._build_url("posts", (
+ url = self._build_url("posts", "collection_order", (
# patreon returns '400 Bad Request' without campaign_id filter
f"&filter[campaign_id]={campaign_id}"
"&filter[contains_exclusive_posts]=true"
"&filter[is_draft]=false"
f"&filter[collection_id]={collection_id}"
"&filter[include_drops]=true"
- "&sort=collection_order"
))
return self._pagination(url)
+ def _order(self, sort):
+ if order := self.config("order-posts"):
+ if order in {"a", "asc"}:
+ order = "collection_order"
+ elif order in {"d", "desc", "r", "reverse"}:
+ # "-collection_order" results in a '400 Bad Request' error
+ order = "-published_at"
+ return f"&sort={order}"
+ return f"&sort={sort}" if sort else ""
+
class PatreonCreatorExtractor(PatreonExtractor):
"""Extractor for a creator's works"""
@@ -387,12 +403,11 @@ class PatreonCreatorExtractor(PatreonExtractor):
campaign_id = self._get_campaign_id(creator, params)
self.log.debug("campaign_id: %s", campaign_id)
- url = self._build_url("posts", (
+ url = self._build_url("posts", params.get("sort", "-published_at"), (
f"&filter[campaign_id]={campaign_id}"
"&filter[contains_exclusive_posts]=true"
"&filter[is_draft]=false"
f"{self._get_filters(params)}"
- f"&sort={params.get('sort', '-published_at')}"
))
return self._pagination(url)
@@ -445,11 +460,10 @@ class PatreonUserExtractor(PatreonExtractor):
def posts(self):
if date_max := self._get_date_min_max(None, None)[1]:
- self._cursor = cursor = \
- util.datetime_from_timestamp(date_max).isoformat()
+ self._cursor = cursor = dt.from_ts(date_max).isoformat()
self._init_cursor = lambda: cursor
- url = self._build_url("stream", (
+ url = self._build_url("stream", None, (
"&filter[is_following]=true"
"&json-api-use-default-includes=false"
))
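Two Patreon changes are easy to miss here: the file generators now yield (kind, file, url, name) 4-tuples so the raw file object is exposed to format strings as post["file"], and _build_url() takes the default sort order as an argument so the new _order() hook can override it via the "order-posts" option. The mapping in isolation (an illustrative helper mirroring _order(), not part of the module):

    def order_param(config_value, default):
        if config_value in {"d", "desc"}:
            return "&sort=-published_at"
        if config_value in {"a", "asc", "r", "reverse"}:
            return "&sort=published_at"
        if config_value:  # pass any other value through verbatim
            return f"&sort={config_value}"
        return f"&sort={default}" if default else ""

    order_param(None, "-published_at")   # "&sort=-published_at" (default)
    order_param("asc", "-published_at")  # "&sort=published_at"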
diff --git a/gallery_dl/extractor/pexels.py b/gallery_dl/extractor/pexels.py
index f95d409..9e2f40c 100644
--- a/gallery_dl/extractor/pexels.py
+++ b/gallery_dl/extractor/pexels.py
@@ -35,8 +35,7 @@ class PexelsExtractor(Extractor):
post["type"] = attr["type"]
post.update(metadata)
- post["date"] = text.parse_datetime(
- post["created_at"][:-5], "%Y-%m-%dT%H:%M:%S")
+ post["date"] = self.parse_datetime_iso(post["created_at"][:-5])
if "image" in post:
url, _, query = post["image"]["download_link"].partition("?")
@@ -49,7 +48,7 @@ class PexelsExtractor(Extractor):
self.log.warning("%s: Unsupported post type", post.get("id"))
continue
- yield Message.Directory, post
+ yield Message.Directory, "", post
yield Message.Url, url, text.nameext_from_url(name, post)
def posts(self):
@@ -63,7 +62,7 @@ class PexelsCollectionExtractor(PexelsExtractor):
"""Extractor for a pexels.com collection"""
subcategory = "collection"
directory_fmt = ("{category}", "Collections", "{collection}")
- pattern = BASE_PATTERN + r"/collections/((?:[^/?#]*-)?(\w+))"
+ pattern = rf"{BASE_PATTERN}/collections/((?:[^/?#]*-)?(\w+))"
example = "https://www.pexels.com/collections/SLUG-a1b2c3/"
def metadata(self):
@@ -78,7 +77,7 @@ class PexelsSearchExtractor(PexelsExtractor):
"""Extractor for pexels.com search results"""
subcategory = "search"
directory_fmt = ("{category}", "Searches", "{search_tags}")
- pattern = BASE_PATTERN + r"/search/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/search/([^/?#]+)"
example = "https://www.pexels.com/search/QUERY/"
def metadata(self):
@@ -92,7 +91,7 @@ class PexelsUserExtractor(PexelsExtractor):
"""Extractor for pexels.com user galleries"""
subcategory = "user"
directory_fmt = ("{category}", "@{user[slug]}")
- pattern = BASE_PATTERN + r"/(@(?:(?:[^/?#]*-)?(\d+)|[^/?#]+))"
+ pattern = rf"{BASE_PATTERN}/(@(?:(?:[^/?#]*-)?(\d+)|[^/?#]+))"
example = "https://www.pexels.com/@USER-12345/"
def posts(self):
@@ -101,7 +100,7 @@ class PexelsUserExtractor(PexelsExtractor):
class PexelsImageExtractor(PexelsExtractor):
subcategory = "image"
- pattern = BASE_PATTERN + r"/photo/((?:[^/?#]*-)?\d+)"
+ pattern = rf"{BASE_PATTERN}/photo/((?:[^/?#]*-)?\d+)"
example = "https://www.pexels.com/photo/SLUG-12345/"
def posts(self):
diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py
index 8891dc0..3634c66 100644
--- a/gallery_dl/extractor/philomena.py
+++ b/gallery_dl/extractor/philomena.py
@@ -36,8 +36,7 @@ class PhilomenaExtractor(BooruExtractor):
return url
def _prepare(self, post):
- post["date"] = text.parse_datetime(
- post["created_at"][:19], "%Y-%m-%dT%H:%M:%S")
+ post["date"] = self.parse_datetime_iso(post["created_at"][:19])
BASE_PATTERN = PhilomenaExtractor.update({
@@ -62,7 +61,7 @@ BASE_PATTERN = PhilomenaExtractor.update({
class PhilomenaPostExtractor(PhilomenaExtractor):
"""Extractor for single posts on a Philomena booru"""
subcategory = "post"
- pattern = BASE_PATTERN + r"/(?:images/)?(\d+)"
+ pattern = rf"{BASE_PATTERN}/(?:images/)?(\d+)"
example = "https://derpibooru.org/images/12345"
def posts(self):
@@ -73,7 +72,7 @@ class PhilomenaSearchExtractor(PhilomenaExtractor):
"""Extractor for Philomena search results"""
subcategory = "search"
directory_fmt = ("{category}", "{search_tags}")
- pattern = BASE_PATTERN + r"/(?:search/?\?([^#]+)|tags/([^/?#]+))"
+ pattern = rf"{BASE_PATTERN}/(?:search/?\?([^#]+)|tags/([^/?#]+))"
example = "https://derpibooru.org/search?q=QUERY"
def __init__(self, match):
@@ -107,7 +106,7 @@ class PhilomenaGalleryExtractor(PhilomenaExtractor):
subcategory = "gallery"
directory_fmt = ("{category}", "galleries",
"{gallery[id]} {gallery[title]}")
- pattern = BASE_PATTERN + r"/galleries/(\d+)"
+ pattern = rf"{BASE_PATTERN}/galleries/(\d+)"
example = "https://derpibooru.org/galleries/12345"
def metadata(self):
diff --git a/gallery_dl/extractor/photovogue.py b/gallery_dl/extractor/photovogue.py
index e604304..cb16b23 100644
--- a/gallery_dl/extractor/photovogue.py
+++ b/gallery_dl/extractor/photovogue.py
@@ -18,7 +18,7 @@ class PhotovogueUserExtractor(Extractor):
directory_fmt = ("{category}", "{photographer[id]} {photographer[name]}")
filename_fmt = "{id} {title}.{extension}"
archive_fmt = "{id}"
- pattern = BASE_PATTERN + r"/photographers/(\d+)"
+ pattern = rf"{BASE_PATTERN}/photographers/(\d+)"
example = "https://www.vogue.com/photovogue/photographers/12345"
def __init__(self, match):
@@ -29,10 +29,9 @@ class PhotovogueUserExtractor(Extractor):
for photo in self.photos():
url = photo["gallery_image"]
photo["title"] = photo["title"].strip()
- photo["date"] = text.parse_datetime(
- photo["date"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ photo["date"] = self.parse_datetime_iso(photo["date"])
- yield Message.Directory, photo
+ yield Message.Directory, "", photo
yield Message.Url, url, text.nameext_from_url(url, photo)
def photos(self):
diff --git a/gallery_dl/extractor/picarto.py b/gallery_dl/extractor/picarto.py
index 62ac38a..b0fa079 100644
--- a/gallery_dl/extractor/picarto.py
+++ b/gallery_dl/extractor/picarto.py
@@ -29,10 +29,9 @@ class PicartoGalleryExtractor(Extractor):
def items(self):
for post in self.posts():
- post["date"] = text.parse_datetime(
- post["created_at"], "%Y-%m-%d %H:%M:%S")
+ post["date"] = self.parse_datetime_iso(post["created_at"])
variations = post.pop("variations", ())
- yield Message.Directory, post
+ yield Message.Directory, "", post
image = post["default_image"]
if not image:
diff --git a/gallery_dl/extractor/picazor.py b/gallery_dl/extractor/picazor.py
new file mode 100644
index 0000000..df1f436
--- /dev/null
+++ b/gallery_dl/extractor/picazor.py
@@ -0,0 +1,59 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://picazor.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class PicazorUserExtractor(Extractor):
+ """Extractor for picazor users"""
+ category = "picazor"
+ subcategory = "user"
+ root = "https://picazor.com"
+ browser = "firefox"
+ directory_fmt = ("{category}", "{user}")
+ filename_fmt = "{id}_{num:>03}.{extension}"
+ archive_fmt = "{id}_{num}"
+ pattern = r"(?:https?://)?(?:www\.)?picazor\.com/[a-z]{2}/([^/?#]+)"
+ example = "https://picazor.com/en/USERNAME"
+
+ def items(self):
+ user = self.groups[0]
+ first = True
+
+ url = f"{self.root}/api/files/{user}/sfiles"
+ params = {"page": 1}
+ headers = {"Referer": f"{self.root}/en/{user}"}
+
+ while True:
+ data = self.request_json(url, params=params, headers=headers)
+ if not data:
+ break
+
+ for item in data:
+ path = item.get("path")
+ if not path:
+ continue
+
+ if first:
+ first = False
+ self.kwdict["user"] = user
+ self.kwdict["count"] = item.get("order")
+ yield Message.Directory, "", {
+ "subject": item.get("subject"),
+ "user" : user,
+ }
+
+ item.pop("blurDataURL", None)
+ item["num"] = item["order"]
+
+ file_url = self.root + path
+ text.nameext_from_url(file_url, item)
+ yield Message.Url, file_url, item
+
+ params["page"] += 1
diff --git a/gallery_dl/extractor/pictoa.py b/gallery_dl/extractor/pictoa.py
index da252f3..0dfe304 100644
--- a/gallery_dl/extractor/pictoa.py
+++ b/gallery_dl/extractor/pictoa.py
@@ -24,7 +24,7 @@ class PictoaExtractor(Extractor):
class PictoaImageExtractor(PictoaExtractor):
"""Extractor for single images from pictoa.com"""
subcategory = "image"
- pattern = BASE_PATTERN + r"/albums/(?:[\w-]+-)?(\d+)/(\d+)"
+ pattern = rf"{BASE_PATTERN}/albums/(?:[\w-]+-)?(\d+)/(\d+)"
example = "https://www.pictoa.com/albums/NAME-12345/12345.html"
def items(self):
@@ -43,14 +43,14 @@ class PictoaImageExtractor(PictoaExtractor):
}
text.nameext_from_url(image_url, data)
- yield Message.Directory, data
+ yield Message.Directory, "", data
yield Message.Url, image_url, data
class PictoaAlbumExtractor(PictoaExtractor):
"""Extractor for image albums from pictoa.com"""
subcategory = "album"
- pattern = BASE_PATTERN + r"/albums/(?:[\w-]+-)?(\d+).html"
+ pattern = rf"{BASE_PATTERN}/albums/(?:[\w-]+-)?(\d+).html"
example = "https://www.pictoa.com/albums/NAME-12345.html"
def items(self):
diff --git a/gallery_dl/extractor/piczel.py b/gallery_dl/extractor/piczel.py
index 968776b..6661e7d 100644
--- a/gallery_dl/extractor/piczel.py
+++ b/gallery_dl/extractor/piczel.py
@@ -26,14 +26,13 @@ class PiczelExtractor(Extractor):
def items(self):
for post in self.posts():
post["tags"] = [t["title"] for t in post["tags"] if t["title"]]
- post["date"] = text.parse_datetime(
- post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ post["date"] = self.parse_datetime_iso(post["created_at"])
if post["multi"]:
images = post["images"]
del post["images"]
post["count"] = len(images)
- yield Message.Directory, post
+ yield Message.Directory, "", post
for post["num"], image in enumerate(images):
if "id" in image:
del image["id"]
@@ -43,7 +42,7 @@ class PiczelExtractor(Extractor):
else:
post["count"] = 1
- yield Message.Directory, post
+ yield Message.Directory, "", post
post["num"] = 0
url = post["image"]["url"]
yield Message.Url, url, text.nameext_from_url(url, post)
@@ -67,7 +66,7 @@ class PiczelExtractor(Extractor):
class PiczelUserExtractor(PiczelExtractor):
"""Extractor for all images from a user's gallery"""
subcategory = "user"
- pattern = BASE_PATTERN + r"/gallery/([^/?#]+)/?$"
+ pattern = rf"{BASE_PATTERN}/gallery/([^/?#]+)/?$"
example = "https://piczel.tv/gallery/USER"
def posts(self):
@@ -80,7 +79,7 @@ class PiczelFolderExtractor(PiczelExtractor):
subcategory = "folder"
directory_fmt = ("{category}", "{user[username]}", "{folder[name]}")
archive_fmt = "f{folder[id]}_{id}_{num}"
- pattern = BASE_PATTERN + r"/gallery/(?!image/)[^/?#]+/(\d+)"
+ pattern = rf"{BASE_PATTERN}/gallery/(?!image/)[^/?#]+/(\d+)"
example = "https://piczel.tv/gallery/USER/12345"
def posts(self):
@@ -91,7 +90,7 @@ class PiczelFolderExtractor(PiczelExtractor):
class PiczelImageExtractor(PiczelExtractor):
"""Extractor for individual images"""
subcategory = "image"
- pattern = BASE_PATTERN + r"/gallery/image/(\d+)"
+ pattern = rf"{BASE_PATTERN}/gallery/image/(\d+)"
example = "https://piczel.tv/gallery/image/12345"
def posts(self):
diff --git a/gallery_dl/extractor/pillowfort.py b/gallery_dl/extractor/pillowfort.py
index 05bc8e7..0b750fe 100644
--- a/gallery_dl/extractor/pillowfort.py
+++ b/gallery_dl/extractor/pillowfort.py
@@ -10,7 +10,7 @@
from .common import Extractor, Message
from ..cache import cache
-from .. import text, util, exception
+from .. import text, exception
BASE_PATTERN = r"(?:https?://)?www\.pillowfort\.social"
@@ -36,7 +36,7 @@ class PillowfortExtractor(Extractor):
external = self.config("external", False)
if inline:
- inline = util.re(r'src="(https://img\d+\.pillowfort\.social'
+ inline = text.re(r'src="(https://img\d+\.pillowfort\.social'
r'/posts/[^"]+)').findall
for post in self.posts():
@@ -48,11 +48,10 @@ class PillowfortExtractor(Extractor):
for url in inline(post["content"]):
files.append({"url": url})
- post["date"] = text.parse_datetime(
- post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ post["date"] = self.parse_datetime_iso(post["created_at"])
post["post_id"] = post.pop("id")
post["count"] = len(files)
- yield Message.Directory, post
+ yield Message.Directory, "", post
post["num"] = 0
for file in files:
@@ -76,8 +75,7 @@ class PillowfortExtractor(Extractor):
if "id" not in file:
post["id"] = post["hash"]
if "created_at" in file:
- post["date"] = text.parse_datetime(
- file["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ post["date"] = self.parse_datetime_iso(file["created_at"])
yield msgtype, url, post
@@ -121,7 +119,7 @@ class PillowfortExtractor(Extractor):
class PillowfortPostExtractor(PillowfortExtractor):
"""Extractor for a single pillowfort post"""
subcategory = "post"
- pattern = BASE_PATTERN + r"/posts/(\d+)"
+ pattern = rf"{BASE_PATTERN}/posts/(\d+)"
example = "https://www.pillowfort.social/posts/12345"
def posts(self):
@@ -132,7 +130,7 @@ class PillowfortPostExtractor(PillowfortExtractor):
class PillowfortUserExtractor(PillowfortExtractor):
"""Extractor for all posts of a pillowfort user"""
subcategory = "user"
- pattern = BASE_PATTERN + r"/(?!posts/)([^/?#]+(?:/tagged/[^/?#]+)?)"
+ pattern = rf"{BASE_PATTERN}/(?!posts/)([^/?#]+(?:/tagged/[^/?#]+)?)"
example = "https://www.pillowfort.social/USER"
def posts(self):
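
Several hunks here replace explicit strptime format strings ("%Y-%m-%dT%H:%M:%S.%f%z") with a shared parse_datetime_iso helper. Its definition is not part of this section; a plausible stdlib equivalent, assuming it normalizes the trailing "Z" that datetime.fromisoformat only accepts natively on Python 3.11+:

    from datetime import datetime

    def parse_datetime_iso(date_string):
        # assumed behavior: ISO-8601 string -> aware datetime, "Z" tolerated
        if date_string.endswith("Z"):
            date_string = date_string[:-1] + "+00:00"
        return datetime.fromisoformat(date_string)

    print(parse_datetime_iso("2025-12-20T05:49:04.000Z"))
    print(parse_datetime_iso("2025-12-20T05:49:04.000+09:00"))
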
diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py
index ff771fb..7aa32ec 100644
--- a/gallery_dl/extractor/pinterest.py
+++ b/gallery_dl/extractor/pinterest.py
@@ -46,7 +46,7 @@ class PinterestExtractor(Extractor):
try:
files = self._extract_files(pin)
except Exception as exc:
- self.log.debug("", exc_info=exc)
+ self.log.traceback(exc)
self.log.warning(
"%s: Error when extracting download URLs (%s: %s)",
pin.get("id"), exc.__class__.__name__, exc)
@@ -63,7 +63,7 @@ class PinterestExtractor(Extractor):
if value := pin.get(key):
pin[key] = value.strip()
- yield Message.Directory, pin
+ yield Message.Directory, "", pin
for pin["num"], file in enumerate(files, 1):
url = file["url"]
text.nameext_from_url(url, pin)
@@ -207,7 +207,7 @@ class PinterestExtractor(Extractor):
class PinterestUserExtractor(PinterestExtractor):
"""Extractor for a user's boards"""
subcategory = "user"
- pattern = BASE_PATTERN + r"/(?!pin/)([^/?#]+)(?:/_saved)?/?$"
+ pattern = rf"{BASE_PATTERN}/(?!pin/)([^/?#]+)(?:/_saved)?/?$"
example = "https://www.pinterest.com/USER/"
def __init__(self, match):
@@ -225,7 +225,7 @@ class PinterestAllpinsExtractor(PinterestExtractor):
"""Extractor for a user's 'All Pins' feed"""
subcategory = "allpins"
directory_fmt = ("{category}", "{user}")
- pattern = BASE_PATTERN + r"/(?!pin/)([^/?#]+)/pins/?$"
+ pattern = rf"{BASE_PATTERN}/(?!pin/)([^/?#]+)/pins/?$"
example = "https://www.pinterest.com/USER/pins/"
def __init__(self, match):
@@ -243,7 +243,7 @@ class PinterestCreatedExtractor(PinterestExtractor):
"""Extractor for a user's created pins"""
subcategory = "created"
directory_fmt = ("{category}", "{user}")
- pattern = BASE_PATTERN + r"/(?!pin/)([^/?#]+)/_created/?$"
+ pattern = rf"{BASE_PATTERN}/(?!pin/)([^/?#]+)/_created/?$"
example = "https://www.pinterest.com/USER/_created/"
def __init__(self, match):
@@ -263,7 +263,7 @@ class PinterestSectionExtractor(PinterestExtractor):
directory_fmt = ("{category}", "{board[owner][username]}",
"{board[name]}", "{section[title]}")
archive_fmt = "{board[id]}_{id}"
- pattern = BASE_PATTERN + r"/(?!pin/)([^/?#]+)/([^/?#]+)/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/(?!pin/)([^/?#]+)/([^/?#]+)/([^/?#]+)"
example = "https://www.pinterest.com/USER/BOARD/SECTION"
def __init__(self, match):
@@ -291,7 +291,7 @@ class PinterestSearchExtractor(PinterestExtractor):
"""Extractor for Pinterest search results"""
subcategory = "search"
directory_fmt = ("{category}", "Search", "{search}")
- pattern = BASE_PATTERN + r"/search/pins/?\?q=([^&#]+)"
+ pattern = rf"{BASE_PATTERN}/search/pins/?\?q=([^&#]+)"
example = "https://www.pinterest.com/search/pins/?q=QUERY"
def __init__(self, match):
@@ -308,7 +308,7 @@ class PinterestSearchExtractor(PinterestExtractor):
class PinterestPinExtractor(PinterestExtractor):
"""Extractor for images from a single pin from pinterest.com"""
subcategory = "pin"
- pattern = BASE_PATTERN + r"/pin/([^/?#]+)(?!.*#related$)"
+ pattern = rf"{BASE_PATTERN}/pin/([^/?#]+)(?!.*#related$)"
example = "https://www.pinterest.com/pin/12345/"
def __init__(self, match):
@@ -329,7 +329,7 @@ class PinterestBoardExtractor(PinterestExtractor):
subcategory = "board"
directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}")
archive_fmt = "{board[id]}_{id}"
- pattern = (BASE_PATTERN + r"/(?!pin/)([^/?#]+)"
+ pattern = (rf"{BASE_PATTERN}/(?!pin/)([^/?#]+)"
r"/([^/?#]+)/?(?!.*#related$)")
example = "https://www.pinterest.com/USER/BOARD/"
@@ -361,7 +361,7 @@ class PinterestRelatedPinExtractor(PinterestPinExtractor):
"""Extractor for related pins of another pin from pinterest.com"""
subcategory = "related-pin"
directory_fmt = ("{category}", "related {original_pin[id]}")
- pattern = BASE_PATTERN + r"/pin/([^/?#]+).*#related$"
+ pattern = rf"{BASE_PATTERN}/pin/([^/?#]+).*#related$"
example = "https://www.pinterest.com/pin/12345/#related"
def metadata(self):
@@ -376,7 +376,7 @@ class PinterestRelatedBoardExtractor(PinterestBoardExtractor):
subcategory = "related-board"
directory_fmt = ("{category}", "{board[owner][username]}",
"{board[name]}", "related")
- pattern = BASE_PATTERN + r"/(?!pin/)([^/?#]+)/([^/?#]+)/?#related$"
+ pattern = rf"{BASE_PATTERN}/(?!pin/)([^/?#]+)/([^/?#]+)/?#related$"
example = "https://www.pinterest.com/USER/BOARD/#related"
def pins(self):
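
The logging change in this file and several others, self.log.debug("", exc_info=exc) becoming self.log.traceback(exc), points to a new convenience method on gallery-dl's loggers. Its implementation is not shown in this diff; one way such a method could be provided, as a guess:

    import logging

    class TracebackLogger(logging.Logger):
        def traceback(self, exc):
            # log the full stack trace of exc at DEBUG level
            self.debug("", exc_info=exc)

    logging.setLoggerClass(TracebackLogger)
    logging.basicConfig(level=logging.DEBUG)
    log = logging.getLogger("demo")

    try:
        raise ValueError("example")
    except ValueError as exc:
        log.traceback(exc)
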
diff --git a/gallery_dl/extractor/pixeldrain.py b/gallery_dl/extractor/pixeldrain.py
index 73f4b1f..1486976 100644
--- a/gallery_dl/extractor/pixeldrain.py
+++ b/gallery_dl/extractor/pixeldrain.py
@@ -24,16 +24,12 @@ class PixeldrainExtractor(Extractor):
if api_key := self.config("api-key"):
self.session.auth = util.HTTPBasicAuth("", api_key)
- def parse_datetime(self, date_string):
- return text.parse_datetime(
- date_string, "%Y-%m-%dT%H:%M:%S.%fZ")
-
class PixeldrainFileExtractor(PixeldrainExtractor):
"""Extractor for pixeldrain files"""
subcategory = "file"
filename_fmt = "{filename[:230]} ({id}).{extension}"
- pattern = BASE_PATTERN + r"/(?:u|api/file)/(\w+)"
+ pattern = rf"{BASE_PATTERN}/(?:u|api/file)/(\w+)"
example = "https://pixeldrain.com/u/abcdefgh"
def __init__(self, match):
@@ -45,10 +41,10 @@ class PixeldrainFileExtractor(PixeldrainExtractor):
file = self.request_json(url + "/info")
file["url"] = url + "?download"
- file["date"] = self.parse_datetime(file["date_upload"])
+ file["date"] = self.parse_datetime_iso(file["date_upload"])
text.nameext_from_url(file["name"], file)
- yield Message.Directory, file
+ yield Message.Directory, "", file
yield Message.Url, file["url"], file
@@ -58,7 +54,7 @@ class PixeldrainAlbumExtractor(PixeldrainExtractor):
directory_fmt = ("{category}",
"{album[date]:%Y-%m-%d} {album[title]} ({album[id]})")
filename_fmt = "{num:>03} {filename[:230]} ({id}).{extension}"
- pattern = BASE_PATTERN + r"/(?:l|api/list)/(\w+)(?:#item=(\d+))?"
+ pattern = rf"{BASE_PATTERN}/(?:l|api/list)/(\w+)(?:#item=(\d+))?"
example = "https://pixeldrain.com/l/abcdefgh"
def __init__(self, match):
@@ -72,7 +68,7 @@ class PixeldrainAlbumExtractor(PixeldrainExtractor):
files = album["files"]
album["count"] = album["file_count"]
- album["date"] = self.parse_datetime(album["date_created"])
+ album["date"] = self.parse_datetime_iso(album["date_created"])
if self.file_index:
idx = text.parse_int(self.file_index)
@@ -86,12 +82,12 @@ class PixeldrainAlbumExtractor(PixeldrainExtractor):
del album["files"]
del album["file_count"]
- yield Message.Directory, {"album": album}
+ yield Message.Directory, "", {"album": album}
for num, file in enumerate(files, idx+1):
file["album"] = album
file["num"] = num
file["url"] = url = f"{self.root}/api/file/{file['id']}?download"
- file["date"] = self.parse_datetime(file["date_upload"])
+ file["date"] = self.parse_datetime_iso(file["date_upload"])
text.nameext_from_url(file["name"], file)
yield Message.Url, url, file
@@ -101,7 +97,7 @@ class PixeldrainFolderExtractor(PixeldrainExtractor):
subcategory = "folder"
filename_fmt = "{filename[:230]}.{extension}"
archive_fmt = "{path}_{num}"
- pattern = BASE_PATTERN + r"/(?:d|api/filesystem)/([^?]+)"
+ pattern = rf"{BASE_PATTERN}/(?:d|api/filesystem)/([^?]+)"
example = "https://pixeldrain.com/d/abcdefgh"
def metadata(self, data):
@@ -112,7 +108,7 @@ class PixeldrainFolderExtractor(PixeldrainExtractor):
"mime_type" : data["file_type"],
"size" : data["file_size"],
"hash_sha256": data["sha256_sum"],
- "date" : self.parse_datetime(data["created"]),
+ "date" : self.parse_datetime_iso(data["created"]),
}
def items(self):
@@ -135,7 +131,7 @@ class PixeldrainFolderExtractor(PixeldrainExtractor):
folder = self.metadata(path)
folder["id"] = paths[0]["id"]
- yield Message.Directory, folder
+ yield Message.Directory, "", folder
num = 0
for child in children:
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index 6276a2a..eb1a7f2 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -9,14 +9,13 @@
"""Extractors for https://www.pixiv.net/"""
from .common import Extractor, Message, Dispatch
-from .. import text, util, exception
+from .. import text, util, dt, exception
from ..cache import cache, memcache
-from datetime import datetime, timedelta
import itertools
import hashlib
BASE_PATTERN = r"(?:https?://)?(?:www\.|touch\.)?ph?ixiv\.net"
-USER_PATTERN = BASE_PATTERN + r"/(?:en/)?users/(\d+)"
+USER_PATTERN = rf"{BASE_PATTERN}/(?:en/)?users/(\d+)"
class PixivExtractor(Extractor):
@@ -44,7 +43,7 @@ class PixivExtractor(Extractor):
self.meta_captions = self.config("captions")
if self.sanity_workaround or self.meta_captions:
- self.meta_captions_sub = util.re(
+ self.meta_captions_sub = text.re(
r'<a href="/jump\.php\?([^"]+)').sub
def items(self):
@@ -96,12 +95,12 @@ class PixivExtractor(Extractor):
if transform_tags:
transform_tags(work)
work["num"] = 0
- work["date"] = text.parse_datetime(work["create_date"])
+ work["date"] = dt.parse_iso(work["create_date"])
work["rating"] = ratings.get(work["x_restrict"])
work["suffix"] = ""
work.update(metadata)
- yield Message.Directory, work
+ yield Message.Directory, "", work
for work["num"], file in enumerate(files):
url = file["url"]
work.update(file)
@@ -149,7 +148,7 @@ class PixivExtractor(Extractor):
self._extract_ajax(work, body)
return self._extract_ugoira(work, url)
except Exception as exc:
- self.log.debug("", exc_info=exc)
+ self.log.traceback(exc)
self.log.warning(
"%s: Unable to extract Ugoira URL. Provide "
"logged-in cookies to access it", work["id"])
@@ -238,10 +237,13 @@ class PixivExtractor(Extractor):
return data["body"]
self.log.debug("Server response: %s", util.json_dumps(data))
- return self.log.error(
- "'%s'", data.get("message") or "General Error")
+ if (msg := data.get("message")) == "An unknown error occurred":
+ msg = "Invalid 'PHPSESSID' cookie"
+ else:
+ msg = f"'{msg or 'General Error'}'"
+ self.log.error("%s", msg)
except Exception:
- return None
+ pass
def _extract_ajax(self, work, body):
work["_ajax"] = True
@@ -274,6 +276,9 @@ class PixivExtractor(Extractor):
"profile_image_urls": {},
}
+ if "is_bookmarked" not in work:
+ work["is_bookmarked"] = True if body.get("bookmarkData") else False
+
work["tags"] = tags = []
for tag in body["tags"]["tags"]:
name = tag["tag"]
@@ -350,10 +355,10 @@ class PixivExtractor(Extractor):
if fmt in urls:
yield urls[fmt]
- def _date_from_url(self, url, offset=timedelta(hours=9)):
+ def _date_from_url(self, url, offset=dt.timedelta(hours=9)):
try:
_, _, _, _, _, y, m, d, H, M, S, _ = url.split("/")
- return datetime(
+ return dt.datetime(
int(y), int(m), int(d), int(H), int(M), int(S)) - offset
except Exception:
return None
@@ -388,7 +393,7 @@ class PixivExtractor(Extractor):
class PixivUserExtractor(Dispatch, PixivExtractor):
"""Extractor for a pixiv user profile"""
- pattern = (BASE_PATTERN + r"/(?:"
+ pattern = (rf"{BASE_PATTERN}/(?:"
r"(?:en/)?u(?:sers)?/|member\.php\?id=|(?:mypage\.php)?#id="
r")(\d+)(?:$|[?#])")
example = "https://www.pixiv.net/en/users/12345"
@@ -411,7 +416,7 @@ class PixivUserExtractor(Dispatch, PixivExtractor):
class PixivArtworksExtractor(PixivExtractor):
"""Extractor for artworks of a pixiv user"""
subcategory = "artworks"
- pattern = (BASE_PATTERN + r"/(?:"
+ pattern = (rf"{BASE_PATTERN}/(?:"
r"(?:en/)?users/(\d+)/(?:artworks|illustrations|manga)"
r"(?:/([^/?#]+))?/?(?:$|[?#])"
r"|member_illust\.php\?id=(\d+)(?:&([^#]+))?)")
@@ -450,7 +455,7 @@ class PixivArtworksExtractor(PixivExtractor):
ajax_ids.extend(map(int, body["manga"]))
ajax_ids.sort()
except Exception as exc:
- self.log.debug("", exc_info=exc)
+ self.log.traceback(exc)
self.log.warning("u%s: Failed to collect artwork IDs "
"using AJAX API", self.user_id)
else:
@@ -500,7 +505,7 @@ class PixivAvatarExtractor(PixivExtractor):
subcategory = "avatar"
filename_fmt = "avatar{date:?_//%Y-%m-%d}.{extension}"
archive_fmt = "avatar_{user[id]}_{date}"
- pattern = USER_PATTERN + r"/avatar"
+ pattern = rf"{USER_PATTERN}/avatar"
example = "https://www.pixiv.net/en/users/12345/avatar"
def _init(self):
@@ -518,7 +523,7 @@ class PixivBackgroundExtractor(PixivExtractor):
subcategory = "background"
filename_fmt = "background{date:?_//%Y-%m-%d}.{extension}"
archive_fmt = "background_{user[id]}_{date}"
- pattern = USER_PATTERN + "/background"
+ pattern = rf"{USER_PATTERN}/background"
example = "https://www.pixiv.net/en/users/12345/background"
def _init(self):
@@ -580,7 +585,7 @@ class PixivWorkExtractor(PixivExtractor):
class PixivUnlistedExtractor(PixivExtractor):
"""Extractor for a unlisted pixiv illustrations"""
subcategory = "unlisted"
- pattern = BASE_PATTERN + r"/(?:en/)?artworks/unlisted/(\w+)"
+ pattern = rf"{BASE_PATTERN}/(?:en/)?artworks/unlisted/(\w+)"
example = "https://www.pixiv.net/en/artworks/unlisted/a1b2c3d4e5f6g7h8i9j0"
def _extract_files(self, work):
@@ -599,7 +604,7 @@ class PixivFavoriteExtractor(PixivExtractor):
directory_fmt = ("{category}", "bookmarks",
"{user_bookmark[id]} {user_bookmark[account]}")
archive_fmt = "f_{user_bookmark[id]}_{id}{num}.{extension}"
- pattern = (BASE_PATTERN + r"/(?:(?:en/)?"
+ pattern = (rf"{BASE_PATTERN}/(?:(?:en/)?"
r"users/(\d+)/(bookmarks/artworks|following)(?:/([^/?#]+))?"
r"|bookmark\.php)(?:\?([^#]*))?")
example = "https://www.pixiv.net/en/users/12345/bookmarks/artworks"
@@ -662,7 +667,7 @@ class PixivRankingExtractor(PixivExtractor):
archive_fmt = "r_{ranking[mode]}_{ranking[date]}_{id}{num}.{extension}"
directory_fmt = ("{category}", "rankings",
"{ranking[mode]}", "{ranking[date]}")
- pattern = BASE_PATTERN + r"/ranking\.php(?:\?([^#]*))?"
+ pattern = rf"{BASE_PATTERN}/ranking\.php(?:\?([^#]*))?"
example = "https://www.pixiv.net/ranking.php"
def __init__(self, match):
@@ -712,8 +717,7 @@ class PixivRankingExtractor(PixivExtractor):
self.log.warning("invalid date '%s'", date)
date = None
if not date:
- now = util.datetime_utcnow()
- date = (now - timedelta(days=1)).strftime("%Y-%m-%d")
+ date = (dt.now() - dt.timedelta(days=1)).strftime("%Y-%m-%d")
self.date = date
self.type = type = query.get("content")
@@ -732,7 +736,7 @@ class PixivSearchExtractor(PixivExtractor):
subcategory = "search"
archive_fmt = "s_{search[word]}_{id}{num}.{extension}"
directory_fmt = ("{category}", "search", "{search[word]}")
- pattern = (BASE_PATTERN + r"/(?:(?:en/)?tags/([^/?#]+)(?:/[^/?#]+)?/?"
+ pattern = (rf"{BASE_PATTERN}/(?:(?:en/)?tags/([^/?#]+)(?:/[^/?#]+)?/?"
r"|search\.php)(?:\?([^#]+))?")
example = "https://www.pixiv.net/en/tags/TAG"
@@ -798,7 +802,7 @@ class PixivFollowExtractor(PixivExtractor):
subcategory = "follow"
archive_fmt = "F_{user_follow[id]}_{id}{num}.{extension}"
directory_fmt = ("{category}", "following")
- pattern = BASE_PATTERN + r"/bookmark_new_illust\.php"
+ pattern = rf"{BASE_PATTERN}/bookmark_new_illust\.php"
example = "https://www.pixiv.net/bookmark_new_illust.php"
def works(self):
@@ -847,7 +851,7 @@ class PixivSeriesExtractor(PixivExtractor):
directory_fmt = ("{category}", "{user[id]} {user[account]}",
"{series[id]} {series[title]}")
filename_fmt = "{num_series:>03}_{id}_p{num}.{extension}"
- pattern = BASE_PATTERN + r"/user/(\d+)/series/(\d+)"
+ pattern = rf"{BASE_PATTERN}/user/(\d+)/series/(\d+)"
example = "https://www.pixiv.net/user/12345/series/12345"
def __init__(self, match):
@@ -888,11 +892,10 @@ class PixivSketchExtractor(Extractor):
for post in self.posts():
media = post["media"]
post["post_id"] = post["id"]
- post["date"] = text.parse_datetime(
- post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ post["date"] = dt.parse_iso(post["created_at"])
util.delete_items(post, ("id", "media", "_links"))
- yield Message.Directory, post
+ yield Message.Directory, "", post
post["_http_headers"] = headers
for photo in media:
@@ -969,11 +972,11 @@ class PixivNovelExtractor(PixivExtractor):
if transform_tags:
transform_tags(novel)
novel["num"] = 0
- novel["date"] = text.parse_datetime(novel["create_date"])
+ novel["date"] = dt.parse_iso(novel["create_date"])
novel["rating"] = ratings.get(novel["x_restrict"])
novel["suffix"] = ""
- yield Message.Directory, novel
+ yield Message.Directory, "", novel
try:
content = self.api.novel_webview(novel["id"])["text"]
@@ -1039,7 +1042,7 @@ class PixivNovelExtractor(PixivExtractor):
class PixivNovelNovelExtractor(PixivNovelExtractor):
"""Extractor for pixiv novels"""
subcategory = "novel"
- pattern = BASE_PATTERN + r"/n(?:ovel/show\.php\?id=|/)(\d+)"
+ pattern = rf"{BASE_PATTERN}/n(?:ovel/show\.php\?id=|/)(\d+)"
example = "https://www.pixiv.net/novel/show.php?id=12345"
def novels(self):
@@ -1053,7 +1056,7 @@ class PixivNovelNovelExtractor(PixivNovelExtractor):
class PixivNovelUserExtractor(PixivNovelExtractor):
"""Extractor for pixiv users' novels"""
subcategory = "user"
- pattern = USER_PATTERN + r"/novels"
+ pattern = rf"{USER_PATTERN}/novels"
example = "https://www.pixiv.net/en/users/12345/novels"
def novels(self):
@@ -1063,7 +1066,7 @@ class PixivNovelUserExtractor(PixivNovelExtractor):
class PixivNovelSeriesExtractor(PixivNovelExtractor):
"""Extractor for pixiv novel series"""
subcategory = "series"
- pattern = BASE_PATTERN + r"/novel/series/(\d+)"
+ pattern = rf"{BASE_PATTERN}/novel/series/(\d+)"
example = "https://www.pixiv.net/novel/series/12345"
def novels(self):
@@ -1073,7 +1076,7 @@ class PixivNovelSeriesExtractor(PixivNovelExtractor):
class PixivNovelBookmarkExtractor(PixivNovelExtractor):
"""Extractor for bookmarked pixiv novels"""
subcategory = "bookmark"
- pattern = (USER_PATTERN + r"/bookmarks/novels"
+ pattern = (rf"{USER_PATTERN}/bookmarks/novels"
r"(?:/([^/?#]+))?(?:/?\?([^#]+))?")
example = "https://www.pixiv.net/en/users/12345/bookmarks/novels"
@@ -1151,7 +1154,7 @@ class PixivAppAPI():
"get_secure_url": "1",
}
- time = util.datetime_utcnow().strftime("%Y-%m-%dT%H:%M:%S+00:00")
+ time = dt.now().strftime("%Y-%m-%dT%H:%M:%S+00:00")
headers = {
"X-Client-Time": time,
"X-Client-Hash": hashlib.md5(
@@ -1326,11 +1329,11 @@ class PixivAppAPI():
sort = params["sort"]
if sort == "date_desc":
date_key = "end_date"
- date_off = timedelta(days=1)
+ date_off = dt.timedelta(days=1)
date_cmp = lambda lhs, rhs: lhs >= rhs # noqa E731
elif sort == "date_asc":
date_key = "start_date"
- date_off = timedelta(days=-1)
+ date_off = dt.timedelta(days=-1)
date_cmp = lambda lhs, rhs: lhs <= rhs # noqa E731
else:
date_key = None
@@ -1357,8 +1360,8 @@ class PixivAppAPI():
if date_key and text.parse_int(params.get("offset")) >= 5000:
date_last = data["illusts"][-1]["create_date"]
- date_val = (text.parse_datetime(
- date_last) + date_off).strftime("%Y-%m-%d")
+ date_val = (dt.parse_iso(date_last) + date_off).strftime(
+ "%Y-%m-%d")
self.log.info("Reached 'offset' >= 5000; "
"Updating '%s' to '%s'", date_key, date_val)
params[date_key] = date_val
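
The final pixiv hunk encodes a workaround for the API's hard cap on search offsets: once 'offset' reaches 5000, the date window (end_date when sorting descending, start_date when ascending) is shifted just past the last item seen and pagination restarts at offset 0. A toy model of the descending case; it assumes one post per day and moves the boundary one day inward, whereas the real code moves it one day outward and relies on the date comparison to skip duplicates:

    from datetime import date, timedelta

    posts = [date(2024, 1, 1) + timedelta(days=i) for i in range(12000)]
    posts.reverse()                        # newest first (date_desc)

    def search(end_date, offset, limit=100):
        window = [d for d in posts if d <= end_date]
        return window[offset:offset + limit]

    end_date, offset, seen = date.max, 0, 0
    while batch := search(end_date, offset):
        seen += len(batch)
        offset += len(batch)
        if offset >= 5000:                 # the API refuses higher offsets
            end_date = batch[-1] - timedelta(days=1)
            offset = 0                     # restart inside the new window

    print(seen)                            # 12000: every post is reached
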
diff --git a/gallery_dl/extractor/pixnet.py b/gallery_dl/extractor/pixnet.py
index 75c06bb..2feab95 100644
--- a/gallery_dl/extractor/pixnet.py
+++ b/gallery_dl/extractor/pixnet.py
@@ -65,7 +65,7 @@ class PixnetImageExtractor(PixnetExtractor):
subcategory = "image"
filename_fmt = "{id}.{extension}"
directory_fmt = ("{category}", "{blog}")
- pattern = BASE_PATTERN + r"/album/photo/(\d+)"
+ pattern = rf"{BASE_PATTERN}/album/photo/(\d+)"
example = "https://USER.pixnet.net/album/photo/12345"
def items(self):
@@ -83,7 +83,7 @@ class PixnetImageExtractor(PixnetExtractor):
data["blog"] = self.blog
data["user"] = data.pop("author_name")
- yield Message.Directory, data
+ yield Message.Directory, "", data
yield Message.Url, data["url"], data
@@ -92,7 +92,7 @@ class PixnetSetExtractor(PixnetExtractor):
subcategory = "set"
directory_fmt = ("{category}", "{blog}",
"{folder_id} {folder_title}", "{set_id} {set_title}")
- pattern = BASE_PATTERN + r"/album/set/(\d+)"
+ pattern = rf"{BASE_PATTERN}/album/set/(\d+)"
example = "https://USER.pixnet.net/album/set/12345"
def items(self):
@@ -100,7 +100,7 @@ class PixnetSetExtractor(PixnetExtractor):
page = self.request(url, encoding="utf-8").text
data = self.metadata(page)
- yield Message.Directory, data
+ yield Message.Directory, "", data
for num, info in enumerate(self._pagination(page), 1):
url, pos = text.extract(info, ' href="', '"')
src, pos = text.extract(info, ' src="', '"', pos)
@@ -137,7 +137,7 @@ class PixnetFolderExtractor(PixnetExtractor):
"""Extractor for all sets in a pixnet folder"""
subcategory = "folder"
url_fmt = "{}/album/folder/{}"
- pattern = BASE_PATTERN + r"/album/folder/(\d+)"
+ pattern = rf"{BASE_PATTERN}/album/folder/(\d+)"
example = "https://USER.pixnet.net/album/folder/12345"
@@ -145,5 +145,5 @@ class PixnetUserExtractor(PixnetExtractor):
"""Extractor for all sets and folders of a pixnet user"""
subcategory = "user"
url_fmt = "{}{}/album/list"
- pattern = BASE_PATTERN + r"()(?:/blog|/album(?:/list)?)?/?(?:$|[?#])"
+ pattern = rf"{BASE_PATTERN}()(?:/blog|/album(?:/list)?)?/?(?:$|[?#])"
example = "https://USER.pixnet.net/"
diff --git a/gallery_dl/extractor/plurk.py b/gallery_dl/extractor/plurk.py
index 37b9b10..76ca59f 100644
--- a/gallery_dl/extractor/plurk.py
+++ b/gallery_dl/extractor/plurk.py
@@ -9,8 +9,7 @@
"""Extractors for https://www.plurk.com/"""
from .common import Extractor, Message
-from .. import text, util, exception
-import datetime
+from .. import text, util, dt, exception
class PlurkExtractor(Extractor):
@@ -62,7 +61,7 @@ class PlurkExtractor(Extractor):
if not data:
raise exception.NotFoundError("user")
return util.json_loads(
- util.re(r"new Date\(([^)]+)\)").sub(r"\1", data))
+ text.re(r"new Date\(([^)]+)\)").sub(r"\1", data))
class PlurkTimelineExtractor(PlurkExtractor):
@@ -88,12 +87,10 @@ class PlurkTimelineExtractor(PlurkExtractor):
while plurks:
yield from plurks
- offset = datetime.datetime.strptime(
- plurks[-1]["posted"], "%a, %d %b %Y %H:%M:%S %Z")
+ offset = dt.parse(plurks[-1]["posted"], "%a, %d %b %Y %H:%M:%S %Z")
data["offset"] = offset.strftime("%Y-%m-%dT%H:%M:%S.000Z")
- response = self.request(
- url, method="POST", headers=headers, data=data)
- plurks = response.json()["plurks"]
+ plurks = self.request_json(
+ url, method="POST", headers=headers, data=data)["plurks"]
class PlurkPostExtractor(PlurkExtractor):
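
The plurk timeline pagination above round-trips timestamps between two representations: the API reports 'posted' in an RFC-1123-like form, while the next request's offset parameter expects ISO-8601 with milliseconds. The conversion in isolation:

    from datetime import datetime

    posted = "Sat, 20 Dec 2025 05:49:04 GMT"
    offset = datetime.strptime(posted, "%a, %d %b %Y %H:%M:%S %Z")
    print(offset.strftime("%Y-%m-%dT%H:%M:%S.000Z"))
    # -> 2025-12-20T05:49:04.000Z
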
diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py
index 32ca528..c3aaaba 100644
--- a/gallery_dl/extractor/poipiku.py
+++ b/gallery_dl/extractor/poipiku.py
@@ -81,7 +81,7 @@ class PoipikuExtractor(Extractor):
"PasswordIcon", ">"):
post["password"] = True
- yield Message.Directory, post
+ yield Message.Directory, "", post
for post["num"], url in enumerate(extract_files(
post, thumb, extr), 1):
yield Message.Url, url, text.nameext_from_url(url, post)
diff --git a/gallery_dl/extractor/poringa.py b/gallery_dl/extractor/poringa.py
index da17eae..832bedf 100644
--- a/gallery_dl/extractor/poringa.py
+++ b/gallery_dl/extractor/poringa.py
@@ -68,7 +68,7 @@ class PoringaExtractor(Extractor):
main_post, '<img class="imagen" border="0" src="', '"'))
data["count"] = len(urls)
- yield Message.Directory, data
+ yield Message.Directory, "", data
for data["num"], url in enumerate(urls, 1):
yield Message.Url, url, text.nameext_from_url(url, data)
@@ -104,7 +104,7 @@ class PoringaExtractor(Extractor):
class PoringaPostExtractor(PoringaExtractor):
"""Extractor for posts on poringa.net"""
subcategory = "post"
- pattern = BASE_PATTERN + r"/posts/imagenes/(\d+)"
+ pattern = rf"{BASE_PATTERN}/posts/imagenes/(\d+)"
example = "http://www.poringa.net/posts/imagenes/12345/TITLE.html"
def posts(self):
@@ -113,7 +113,7 @@ class PoringaPostExtractor(PoringaExtractor):
class PoringaUserExtractor(PoringaExtractor):
subcategory = "user"
- pattern = BASE_PATTERN + r"/(\w+)$"
+ pattern = rf"{BASE_PATTERN}/(\w+)$"
example = "http://www.poringa.net/USER"
def posts(self):
@@ -124,7 +124,7 @@ class PoringaUserExtractor(PoringaExtractor):
class PoringaSearchExtractor(PoringaExtractor):
subcategory = "search"
- pattern = BASE_PATTERN + r"/buscar/\?&?q=([^&#]+)"
+ pattern = rf"{BASE_PATTERN}/buscar/\?&?q=([^&#]+)"
example = "http://www.poringa.net/buscar/?q=QUERY"
def posts(self):
diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py
index 1211397..5ced315 100644
--- a/gallery_dl/extractor/pornhub.py
+++ b/gallery_dl/extractor/pornhub.py
@@ -54,7 +54,7 @@ class PornhubGalleryExtractor(PornhubExtractor):
directory_fmt = ("{category}", "{user}", "{gallery[id]} {gallery[title]}")
filename_fmt = "{num:>03}_{id}.{extension}"
archive_fmt = "{id}"
- pattern = BASE_PATTERN + r"/album/(\d+)"
+ pattern = rf"{BASE_PATTERN}/album/(\d+)"
example = "https://www.pornhub.com/album/12345"
def __init__(self, match):
@@ -64,7 +64,7 @@ class PornhubGalleryExtractor(PornhubExtractor):
def items(self):
data = self.metadata()
- yield Message.Directory, data
+ yield Message.Directory, "", data
for num, img in enumerate(self.images(), 1):
image = {
@@ -134,7 +134,7 @@ class PornhubGifExtractor(PornhubExtractor):
directory_fmt = ("{category}", "{user}", "gifs")
filename_fmt = "{id} {title}.{extension}"
archive_fmt = "{id}"
- pattern = BASE_PATTERN + r"/gif/(\d+)"
+ pattern = rf"{BASE_PATTERN}/gif/(\d+)"
example = "https://www.pornhub.com/gif/12345"
def __init__(self, match):
@@ -150,21 +150,20 @@ class PornhubGifExtractor(PornhubExtractor):
"tags" : extr("data-context-tag='", "'").split(","),
"title": extr('"name": "', '"'),
"url" : extr('"contentUrl": "', '"'),
- "date" : text.parse_datetime(
- extr('"uploadDate": "', '"'), "%Y-%m-%d"),
+ "date" : self.parse_datetime_iso(extr('"uploadDate": "', '"')),
"viewkey" : extr('From this video: '
'<a href="/view_video.php?viewkey=', '"'),
"timestamp": extr('lass="directLink tstamp" rel="nofollow">', '<'),
"user" : text.remove_html(extr("Created by:", "</div>")),
}
- yield Message.Directory, gif
+ yield Message.Directory, "", gif
yield Message.Url, gif["url"], text.nameext_from_url(gif["url"], gif)
class PornhubUserExtractor(Dispatch, PornhubExtractor):
"""Extractor for a pornhub user"""
- pattern = BASE_PATTERN + r"/((?:users|model|pornstar)/[^/?#]+)/?$"
+ pattern = rf"{BASE_PATTERN}/((?:users|model|pornstar)/[^/?#]+)/?$"
example = "https://www.pornhub.com/model/USER"
def items(self):
@@ -178,7 +177,7 @@ class PornhubUserExtractor(Dispatch, PornhubExtractor):
class PornhubPhotosExtractor(PornhubExtractor):
"""Extractor for all galleries of a pornhub user"""
subcategory = "photos"
- pattern = (BASE_PATTERN + r"/((?:users|model|pornstar)/[^/?#]+)"
+ pattern = (rf"{BASE_PATTERN}/((?:users|model|pornstar)/[^/?#]+)"
"/(photos(?:/[^/?#]+)?)")
example = "https://www.pornhub.com/model/USER/photos"
@@ -199,7 +198,7 @@ class PornhubPhotosExtractor(PornhubExtractor):
class PornhubGifsExtractor(PornhubExtractor):
"""Extractor for a pornhub user's gifs"""
subcategory = "gifs"
- pattern = (BASE_PATTERN + r"/((?:users|model|pornstar)/[^/?#]+)"
+ pattern = (rf"{BASE_PATTERN}/((?:users|model|pornstar)/[^/?#]+)"
"/(gifs(?:/[^/?#]+)?)")
example = "https://www.pornhub.com/model/USER/gifs"
diff --git a/gallery_dl/extractor/pornpics.py b/gallery_dl/extractor/pornpics.py
index 34a0111..9c926e8 100644
--- a/gallery_dl/extractor/pornpics.py
+++ b/gallery_dl/extractor/pornpics.py
@@ -58,7 +58,7 @@ class PornpicsExtractor(Extractor):
class PornpicsGalleryExtractor(PornpicsExtractor, GalleryExtractor):
"""Extractor for pornpics galleries"""
- pattern = BASE_PATTERN + r"/galleries/((?:[^/?#]+-)?(\d+))"
+ pattern = rf"{BASE_PATTERN}/galleries/((?:[^/?#]+-)?(\d+))"
example = "https://www.pornpics.com/galleries/TITLE-12345/"
def __init__(self, match):
@@ -94,7 +94,7 @@ class PornpicsGalleryExtractor(PornpicsExtractor, GalleryExtractor):
class PornpicsTagExtractor(PornpicsExtractor):
"""Extractor for galleries from pornpics tag searches"""
subcategory = "tag"
- pattern = BASE_PATTERN + r"/tags/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/tags/([^/?#]+)"
example = "https://www.pornpics.com/tags/TAGS/"
def galleries(self):
@@ -105,7 +105,7 @@ class PornpicsTagExtractor(PornpicsExtractor):
class PornpicsSearchExtractor(PornpicsExtractor):
"""Extractor for galleries from pornpics search results"""
subcategory = "search"
- pattern = BASE_PATTERN + r"/(?:\?q=|pornstars/|channels/)([^/&#]+)"
+ pattern = rf"{BASE_PATTERN}/(?:\?q=|pornstars/|channels/)([^/&#]+)"
example = "https://www.pornpics.com/?q=QUERY"
def galleries(self):
@@ -116,3 +116,35 @@ class PornpicsSearchExtractor(PornpicsExtractor):
"offset": 0,
}
return self._pagination(url, params)
+
+
+class PornpicsListingExtractor(PornpicsExtractor):
+ """Extractor for galleries from pornpics listing pages
+
+ These pages (popular, recent, etc.) don't support JSON pagination
+ and use single quotes in HTML, unlike category pages.
+ """
+ subcategory = "listing"
+ pattern = (rf"{BASE_PATTERN}"
+ rf"/(popular|recent|rating|likes|views|comments)/?$")
+ example = "https://www.pornpics.com/popular/"
+
+ def galleries(self):
+ url = f"{self.root}/{self.groups[0]}/"
+ page = self.request(url).text
+ return [
+ {"g_url": href}
+ for href in text.extract_iter(
+ page, "class='rel-link' href='", "'")
+ ]
+
+
+class PornpicsCategoryExtractor(PornpicsExtractor):
+ """Extractor for galleries from pornpics categories"""
+ subcategory = "category"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/?$"
+ example = "https://www.pornpics.com/ass/"
+
+ def galleries(self):
+ url = f"{self.root}/{self.groups[0]}/"
+ return self._pagination(url)
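
The new listing extractor bypasses the JSON pagination the other pornpics extractors use and scrapes gallery links directly out of single-quoted HTML. text.extract_iter() is gallery-dl's between-markers scanner; a self-contained stand-in to show the idea:

    def extract_iter(page, begin, end):
        # yield every substring of page found between begin and end
        pos = 0
        while (start := page.find(begin, pos)) >= 0:
            start += len(begin)
            stop = page.find(end, start)
            if stop < 0:
                return
            yield page[start:stop]
            pos = stop + len(end)

    page = ("<a class='rel-link' href='/galleries/title-a-1/'>A</a>"
            "<a class='rel-link' href='/galleries/title-b-2/'>B</a>")
    print(list(extract_iter(page, "class='rel-link' href='", "'")))
    # -> ['/galleries/title-a-1/', '/galleries/title-b-2/']
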
diff --git a/gallery_dl/extractor/pornstarstube.py b/gallery_dl/extractor/pornstarstube.py
new file mode 100644
index 0000000..82519a0
--- /dev/null
+++ b/gallery_dl/extractor/pornstarstube.py
@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://pornstars.tube/"""
+
+from .common import GalleryExtractor
+from .. import text
+
+
+class PornstarstubeGalleryExtractor(GalleryExtractor):
+ """Extractor for image galleries from pornstars.tube"""
+ category = "pornstarstube"
+ root = "https://pornstars.tube"
+ pattern = (r"(?:https?://)?(?:www\.)?pornstars\.tube"
+ r"/albums/(\d+)(?:/([\w-]+))?")
+ example = "https://pornstars.tube/albums/12345/SLUG/"
+
+ def __init__(self, match):
+ url = f"{self.root}/albums/{match[1]}/{match[2] or 'a'}/"
+ GalleryExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ gid, slug = self.groups
+ return {
+ "gallery_id": text.parse_int(gid),
+ "slug" : slug or "",
+ "title" : text.unescape(text.extr(
+ page, "<title>", " - PORNSTARS.TUBE</title>")),
+ "description": text.unescape(text.extr(
+ page, 'name="description" content="', '"')),
+ "tags": text.extr(
+ page, 'name="keywords" content="', '"').split(", "),
+ }
+
+ def images(self, page):
+ album = text.extr(page, 'class="block-album"', "\n</div>")
+ return [
+ (url, None)
+ for url in text.extract_iter(album, ' href="', '"')
+ ]
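
For context on the new file above: in a GalleryExtractor subclass, metadata() returns the gallery-level kwdict and images() returns (url, metadata) pairs, where None means "no per-file metadata"; this is the same shape the other gallery extractors in this diff use. A quick offline check of the metadata() parsing against a fabricated page snippet (markup inferred from the code above, extr() standing in for text.extr):

    def extr(page, begin, end):
        # minimal stand-in for text.extr()
        start = page.find(begin) + len(begin)
        return page[start:page.find(end, start)]

    page = ('<title>DEMO ALBUM - PORNSTARS.TUBE</title>'
            '<meta name="description" content="demo description">'
            '<meta name="keywords" content="tag one, tag two">')

    print(extr(page, "<title>", " - PORNSTARS.TUBE</title>"))
    # -> DEMO ALBUM
    print(extr(page, 'name="keywords" content="', '"').split(", "))
    # -> ['tag one', 'tag two']
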
diff --git a/gallery_dl/extractor/postmill.py b/gallery_dl/extractor/postmill.py
index af971ab..e71246a 100644
--- a/gallery_dl/extractor/postmill.py
+++ b/gallery_dl/extractor/postmill.py
@@ -7,7 +7,7 @@
"""Extractors for Postmill instances"""
from .common import BaseExtractor, Message
-from .. import text, util, exception
+from .. import text, exception
class PostmillExtractor(BaseExtractor):
@@ -20,8 +20,8 @@ class PostmillExtractor(BaseExtractor):
def _init(self):
self.instance = self.root.partition("://")[2]
self.save_link_post_body = self.config("save-link-post-body", False)
- self._search_canonical_url = util.re(r"/f/([\w\d_]+)/(\d+)/").search
- self._search_image_tag = util.re(
+ self._search_canonical_url = text.re(r"/f/([\w\d_]+)/(\d+)/").search
+ self._search_image_tag = text.re(
r'<a href="[^"]+"\n +class="submission__image-link"').search
def items(self):
@@ -31,7 +31,7 @@ class PostmillExtractor(BaseExtractor):
title = text.unescape(extr(
'<meta property="og:title" content="', '">'))
- date = text.parse_datetime(extr(
+ date = self.parse_datetime_iso(extr(
'<meta property="og:article:published_time" content="', '">'))
username = extr(
'<meta property="og:article:author" content="', '">')
@@ -72,7 +72,7 @@ class PostmillExtractor(BaseExtractor):
urls.append((Message.Queue, url))
data["count"] = len(urls)
- yield Message.Directory, data
+ yield Message.Directory, "", data
for data["num"], (msg, url) in enumerate(urls, 1):
if url.startswith("text:"):
data["filename"], data["extension"] = "", "htm"
@@ -130,14 +130,14 @@ BASE_PATTERN = PostmillExtractor.update({
}
})
QUERY_RE = r"(?:\?([^#]+))?$"
-SORTING_RE = r"(/(?:hot|new|active|top|controversial|most_commented))?" + \
- QUERY_RE
+SORTING_RE = (rf"(/(?:hot|new|active|top|controversial|most_commented))?"
+ rf"{QUERY_RE}")
class PostmillPostExtractor(PostmillExtractor):
"""Extractor for a single submission URL"""
subcategory = "post"
- pattern = BASE_PATTERN + r"/f/(\w+)/(\d+)"
+ pattern = rf"{BASE_PATTERN}/f/(\w+)/(\d+)"
example = "https://raddle.me/f/FORUM/123/TITLE"
def __init__(self, match):
@@ -152,7 +152,7 @@ class PostmillPostExtractor(PostmillExtractor):
class PostmillShortURLExtractor(PostmillExtractor):
"""Extractor for short submission URLs"""
subcategory = "shorturl"
- pattern = BASE_PATTERN + r"(/\d+)$"
+ pattern = rf"{BASE_PATTERN}(/\d+)$"
example = "https://raddle.me/123"
def items(self):
@@ -165,34 +165,34 @@ class PostmillShortURLExtractor(PostmillExtractor):
class PostmillHomeExtractor(PostmillSubmissionsExtractor):
"""Extractor for the home page"""
subcategory = "home"
- pattern = BASE_PATTERN + r"(/(?:featured|subscribed|all)?)" + SORTING_RE
+ pattern = rf"{BASE_PATTERN}(/(?:featured|subscribed|all)?){SORTING_RE}"
example = "https://raddle.me/"
class PostmillForumExtractor(PostmillSubmissionsExtractor):
"""Extractor for submissions on a forum"""
subcategory = "forum"
- pattern = BASE_PATTERN + r"(/f/\w+)" + SORTING_RE
+ pattern = rf"{BASE_PATTERN}(/f/\w+){SORTING_RE}"
example = "https://raddle.me/f/FORUM"
class PostmillUserSubmissionsExtractor(PostmillSubmissionsExtractor):
"""Extractor for submissions made by a user"""
subcategory = "usersubmissions"
- pattern = BASE_PATTERN + r"(/user/\w+/submissions)()" + QUERY_RE
+ pattern = rf"{BASE_PATTERN}(/user/\w+/submissions)(){QUERY_RE}"
example = "https://raddle.me/user/USER/submissions"
class PostmillTagExtractor(PostmillSubmissionsExtractor):
"""Extractor for submissions on a forum with a specific tag"""
subcategory = "tag"
- pattern = BASE_PATTERN + r"(/tag/\w+)" + SORTING_RE
+ pattern = rf"{BASE_PATTERN}(/tag/\w+){SORTING_RE}"
example = "https://raddle.me/tag/TAG"
class PostmillSearchExtractor(PostmillSubmissionsExtractor):
"""Extractor for search results"""
subcategory = "search"
- pattern = BASE_PATTERN + r"(/search)()\?(q=[^#]+)$"
+ pattern = rf"{BASE_PATTERN}(/search)()\?(q=[^#]+)$"
example = "https://raddle.me/search?q=QUERY"
whitelisted_parameters = ("q",)
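
util.re(...) becoming text.re(...) across these files reads as a module move of the same helper: a factory returning compiled patterns, whose bound methods (.search, .findall, .sub) the call sites store directly. A plausible cached implementation, inferred from the call sites only:

    import re
    from functools import lru_cache

    @lru_cache(maxsize=None)
    def re_compile(pattern):
        # hypothetical equivalent of text.re(): compile once, cache forever
        return re.compile(pattern)

    search = re_compile(r"/f/(\w+)/(\d+)/").search
    print(search("https://raddle.me/f/FORUM/123/title").groups())
    # -> ('FORUM', '123')
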
diff --git a/gallery_dl/extractor/rawkuma.py b/gallery_dl/extractor/rawkuma.py
index 242486d..a4a0c9b 100644
--- a/gallery_dl/extractor/rawkuma.py
+++ b/gallery_dl/extractor/rawkuma.py
@@ -7,7 +7,7 @@
"""Extractors for https://rawkuma.net/"""
from .common import MangaExtractor, ChapterExtractor
-from .. import text, util
+from .. import text
BASE_PATTERN = r"(?:https?://)?rawkuma\.(?:net|com)"
@@ -21,43 +21,40 @@ class RawkumaBase():
class RawkumaChapterExtractor(RawkumaBase, ChapterExtractor):
"""Extractor for manga chapters from rawkuma.net"""
archive_fmt = "{chapter_id}_{page}"
- pattern = BASE_PATTERN + r"/([^/?#]+-chapter-\d+(?:-\d+)?)"
- example = "https://rawkuma.net/TITLE-chapter-123/"
+ pattern = rf"{BASE_PATTERN}(/manga/[^/?#]+/chapter-\d+(?:.\d+)?\.(\d+))"
+ example = "https://rawkuma.net/manga/7TITLE/chapter-123.321"
def __init__(self, match):
url = f"{self.root}/{match[1]}/"
ChapterExtractor.__init__(self, match, url)
def metadata(self, page):
- item = util.json_loads(text.extr(page, ',"item":', "}};"))
- title = text.rextr(
- page, '<h1 class="entry-title', "</h1>").partition(" &#8211; ")[2]
- date = text.extr(page, 'datetime="', '"')
- chapter, sep, minor = item["c"].partition(".")
+ manga, _, chapter = text.extr(
+ page, '<title>', "<").rpartition(" Chapter ")
+ chapter, sep, minor = chapter.partition(" &#8211; ")[0].partition(".")
return {
- "manga" : item["s"],
- "manga_id" : text.parse_int(item["mid"]),
+ "manga" : text.unescape(manga),
+ "manga_id" : text.parse_int(text.extr(page, "manga_id=", "&")),
"chapter" : text.parse_int(chapter),
"chapter_minor": sep + minor,
- "chapter_id" : text.parse_int(item["cid"]),
- "title" : text.unescape(title),
- "date" : text.parse_datetime(
- date, "%Y-%m-%dWIB%H:%M:%S%z"),
- "thumbnail" : item.get("t"),
+ "chapter_id" : text.parse_int(self.groups[-1]),
+ # "title" : text.unescape(title),
+ "date" : self.parse_datetime_iso(text.extr(
+ page, 'datetime="', '"')),
"lang" : "ja",
"language" : "Japanese",
}
def images(self, page):
- images = util.json_loads(text.extr(page, '","images":', '}'))
- return [(url, None) for url in images]
+ return [(url, None) for url in text.extract_iter(
+ page, "<img src='", "'")]
class RawkumaMangaExtractor(RawkumaBase, MangaExtractor):
"""Extractor for manga from rawkuma.net"""
chapterclass = RawkumaChapterExtractor
- pattern = BASE_PATTERN + r"/manga/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/manga/([^/?#]+)"
example = "https://rawkuma.net/manga/TITLE/"
def __init__(self, match):
@@ -66,18 +63,36 @@ class RawkumaMangaExtractor(RawkumaBase, MangaExtractor):
def chapters(self, page):
manga = text.unescape(text.extr(page, "<title>", " &#8211; "))
+ manga_id = text.parse_int(text.extr(page, "manga_id=", "&"))
+
+ url = f"{self.root}/wp-admin/admin-ajax.php"
+ params = {
+ "manga_id": manga_id,
+ "page" : "1",
+ "action" : "chapter_list",
+ }
+ headers = {
+ "HX-Request" : "true",
+ "HX-Trigger" : "chapter-list",
+ "HX-Target" : "chapter-list",
+ "HX-Current-URL": self.page_url,
+ "Sec-Fetch-Dest": "empty",
+ "Sec-Fetch-Mode": "cors",
+ "Sec-Fetch-Site": "same-origin",
+ }
+ html = self.request(url, params=params, headers=headers).text
results = []
- for chbox in text.extract_iter(
- page, '<li data-num="', "</a>"):
- info = text.extr(chbox, '', '"')
- chapter, _, title = info.partition(" - ")
+ for url in text.extract_iter(html, '<a href="', '"'):
+ info = url[url.rfind("-")+1:-1]
+ chapter, _, chapter_id = info.rpartition(".")
chapter, sep, minor = chapter.partition(".")
- results.append((text.extr(chbox, 'href="', '"'), {
+ results.append((url, {
"manga" : manga,
+ "manga_id" : manga_id,
"chapter" : text.parse_int(chapter),
"chapter-minor": sep + minor,
- "title" : title,
+ "chapter_id" : text.parse_int(chapter_id),
}))
return results
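
The rewritten chapters() above stops parsing an embedded JSON blob and instead asks WordPress's admin-ajax.php for the chapter list the same way the site's htmx frontend does, HX-* headers included. Reduced to a standalone request, with requests standing in for the extractor's self.request and error handling omitted:

    import requests

    url = "https://rawkuma.net/wp-admin/admin-ajax.php"
    params = {"manga_id": 12345, "page": "1", "action": "chapter_list"}
    headers = {
        "HX-Request": "true",              # mimic the htmx-driven frontend
        "HX-Trigger": "chapter-list",
        "HX-Target" : "chapter-list",
    }
    html = requests.get(url, params=params, headers=headers, timeout=30).text
    # chapter links are then pulled from href="..." attributes in html
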
diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py
index 483a5ba..8e974d2 100644
--- a/gallery_dl/extractor/reactor.py
+++ b/gallery_dl/extractor/reactor.py
@@ -38,7 +38,7 @@ class ReactorExtractor(BaseExtractor):
def items(self):
data = self.metadata()
- yield Message.Directory, data
+ yield Message.Directory, "", data
for post in self.posts():
for image in self._parse_post(post):
url = image["url"]
@@ -97,7 +97,7 @@ class ReactorExtractor(BaseExtractor):
return
num = 0
- date = text.parse_datetime(data["datePublished"])
+ date = self.parse_datetime_iso(data["datePublished"])
user = data["author"]["name"]
description = text.unescape(data["description"])
title, _, tags = text.unescape(data["headline"]).partition(" / ")
@@ -171,7 +171,7 @@ class ReactorTagExtractor(ReactorExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "{search_tags}_{post_id}_{num}"
- pattern = BASE_PATTERN + r"/tag/([^/?#]+)(?:/[^/?#]+)?"
+ pattern = rf"{BASE_PATTERN}/tag/([^/?#]+)(?:/[^/?#]+)?"
example = "http://reactor.cc/tag/TAG"
def __init__(self, match):
@@ -187,7 +187,7 @@ class ReactorSearchExtractor(ReactorExtractor):
subcategory = "search"
directory_fmt = ("{category}", "search", "{search_tags}")
archive_fmt = "s_{search_tags}_{post_id}_{num}"
- pattern = BASE_PATTERN + r"/search(?:/|\?q=)([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/search(?:/|\?q=)([^/?#]+)"
example = "http://reactor.cc/search?q=QUERY"
def __init__(self, match):
@@ -202,7 +202,7 @@ class ReactorUserExtractor(ReactorExtractor):
"""Extractor for all posts of a user on *reactor.cc sites"""
subcategory = "user"
directory_fmt = ("{category}", "user", "{user}")
- pattern = BASE_PATTERN + r"/user/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/user/([^/?#]+)"
example = "http://reactor.cc/user/USER"
def __init__(self, match):
@@ -216,7 +216,7 @@ class ReactorUserExtractor(ReactorExtractor):
class ReactorPostExtractor(ReactorExtractor):
"""Extractor for single posts on *reactor.cc sites"""
subcategory = "post"
- pattern = BASE_PATTERN + r"/post/(\d+)"
+ pattern = rf"{BASE_PATTERN}/post/(\d+)"
example = "http://reactor.cc/post/12345"
def __init__(self, match):
@@ -228,6 +228,6 @@ class ReactorPostExtractor(ReactorExtractor):
pos = post.find('class="uhead">')
for image in self._parse_post(post[pos:]):
if image["num"] == 1:
- yield Message.Directory, image
+ yield Message.Directory, "", image
url = image["url"]
yield Message.Url, url, text.nameext_from_url(url, image)
diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py
index 24a0171..dccf91d 100644
--- a/gallery_dl/extractor/readcomiconline.py
+++ b/gallery_dl/extractor/readcomiconline.py
@@ -44,7 +44,7 @@ class ReadcomiconlineBase():
class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
"""Extractor for comic-issues from readcomiconline.li"""
subcategory = "issue"
- pattern = BASE_PATTERN + r"(/Comic/[^/?#]+/[^/?#]+\?)([^#]+)"
+ pattern = rf"{BASE_PATTERN}(/Comic/[^/?#]+/[^/?#]+\?)([^#]+)"
example = "https://readcomiconline.li/Comic/TITLE/Issue-123?id=12345"
def _init(self):
@@ -98,7 +98,7 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor):
"""Extractor for comics from readcomiconline.li"""
chapterclass = ReadcomiconlineIssueExtractor
subcategory = "comic"
- pattern = BASE_PATTERN + r"(/Comic/[^/?#]+/?)$"
+ pattern = rf"{BASE_PATTERN}(/Comic/[^/?#]+/?)$"
example = "https://readcomiconline.li/Comic/TITLE"
def chapters(self, page):
diff --git a/gallery_dl/extractor/realbooru.py b/gallery_dl/extractor/realbooru.py
index cf45578..7f731f8 100644
--- a/gallery_dl/extractor/realbooru.py
+++ b/gallery_dl/extractor/realbooru.py
@@ -28,18 +28,31 @@ class RealbooruExtractor(booru.BooruExtractor):
extr('class="container"', '>')
post = {
- "_html" : page,
"id" : post_id,
"rating" : "e" if rating == "adult" else (rating or "?")[0],
- "tags" : text.unescape(extr(' alt="', '"')),
- "file_url" : extr('src="', '"'),
+ "file_url" : (s := extr('src="', '"')),
+ "_fallback" : (extr('src="', '"'),) if s.endswith(".mp4") else (),
"created_at": extr(">Posted at ", " by "),
"uploader" : extr(">", "<"),
"score" : extr('">', "<"),
+ "tags" : extr('<br />', "</div>"),
"title" : extr('id="title" style="width: 100%;" value="', '"'),
"source" : extr('d="source" style="width: 100%;" value="', '"'),
}
+ tags_container = post["tags"]
+ tags = []
+ tags_categories = collections.defaultdict(list)
+ pattern = text.re(r'<a class="(?:tag-type-)?([^"]+).*?;tags=([^"&]+)')
+ for tag_type, tag_name in pattern.findall(tags_container):
+ tag = text.unescape(text.unquote(tag_name))
+ tags.append(tag)
+ tags_categories[tag_type].append(tag)
+ for key, value in tags_categories.items():
+ post[f"tags_{key}"] = ", ".join(value)
+ tags.sort()
+
+ post["tags"] = ", ".join(tags)
post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0]
return post
@@ -48,7 +61,7 @@ class RealbooruExtractor(booru.BooruExtractor):
return num
def _prepare(self, post):
- post["date"] = text.parse_datetime(post["created_at"], "%b, %d %Y")
+ post["date"] = self.parse_datetime(post["created_at"], "%b, %d %Y")
def _pagination(self, params, begin, end):
url = self.root + "/index.php"
@@ -66,23 +79,13 @@ class RealbooruExtractor(booru.BooruExtractor):
return
params["pid"] += self.per_page
- def _tags(self, post, _):
- page = post["_html"]
- tag_container = text.extr(page, 'id="tagLink"', '</div>')
- tags = collections.defaultdict(list)
- pattern = util.re(r'<a class="(?:tag-type-)?([^"]+).*?;tags=([^"&]+)')
- for tag_type, tag_name in pattern.findall(tag_container):
- tags[tag_type].append(text.unescape(text.unquote(tag_name)))
- for key, value in tags.items():
- post["tags_" + key] = " ".join(value)
-
class RealbooruTagExtractor(RealbooruExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
per_page = 42
- pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]*)"
+ pattern = rf"{BASE_PATTERN}/index\.php\?page=post&s=list&tags=([^&#]*)"
example = "https://realbooru.com/index.php?page=post&s=list&tags=TAG"
def metadata(self):
@@ -102,7 +105,7 @@ class RealbooruFavoriteExtractor(RealbooruExtractor):
directory_fmt = ("{category}", "favorites", "{favorite_id}")
archive_fmt = "f_{favorite_id}_{id}"
per_page = 50
- pattern = BASE_PATTERN + r"/index\.php\?page=favorites&s=view&id=(\d+)"
+ pattern = rf"{BASE_PATTERN}/index\.php\?page=favorites&s=view&id=(\d+)"
example = "https://realbooru.com/index.php?page=favorites&s=view&id=12345"
def metadata(self):
@@ -120,7 +123,7 @@ class RealbooruPoolExtractor(RealbooruExtractor):
subcategory = "pool"
directory_fmt = ("{category}", "pool", "{pool} {pool_name}")
archive_fmt = "p_{pool}_{id}"
- pattern = BASE_PATTERN + r"/index\.php\?page=pool&s=show&id=(\d+)"
+ pattern = rf"{BASE_PATTERN}/index\.php\?page=pool&s=show&id=(\d+)"
example = "https://realbooru.com/index.php?page=pool&s=show&id=12345"
def metadata(self):
@@ -147,7 +150,7 @@ class RealbooruPoolExtractor(RealbooruExtractor):
class RealbooruPostExtractor(RealbooruExtractor):
subcategory = "post"
archive_fmt = "{id}"
- pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)"
+ pattern = rf"{BASE_PATTERN}/index\.php\?page=post&s=view&id=(\d+)"
example = "https://realbooru.com/index.php?page=post&s=view&id=12345"
def posts(self):
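
The realbooru change folds the old _tags() hook into _parse_post(): one regex pass collects (type, name) pairs, a defaultdict groups them per type, and the post ends up with a sorted flat "tags" string plus per-category "tags_<type>" strings. The grouping step in isolation (the unquote/unescape steps are omitted here):

    import collections
    import re

    html = ('<a class="tag-type-general" href="?page=post&amp;tags=tag_a">'
            '<a class="tag-type-model" href="?page=post&amp;tags=tag_b">')
    pairs = re.findall(
        r'<a class="(?:tag-type-)?([^"]+).*?;tags=([^"&]+)', html)

    tags, by_type = [], collections.defaultdict(list)
    for tag_type, tag_name in pairs:
        tags.append(tag_name)
        by_type[tag_type].append(tag_name)

    post = {f"tags_{key}": ", ".join(val) for key, val in by_type.items()}
    post["tags"] = ", ".join(sorted(tags))
    print(post)
    # {'tags_general': 'tag_a', 'tags_model': 'tag_b', 'tags': 'tag_a, tag_b'}
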
diff --git a/gallery_dl/extractor/recursive.py b/gallery_dl/extractor/recursive.py
index c553fec..0bf17d3 100644
--- a/gallery_dl/extractor/recursive.py
+++ b/gallery_dl/extractor/recursive.py
@@ -9,7 +9,7 @@
"""Recursive extractor"""
from .common import Extractor, Message
-from .. import text, util
+from .. import text
class RecursiveExtractor(Extractor):
@@ -27,5 +27,5 @@ class RecursiveExtractor(Extractor):
else:
page = self.request(text.ensure_http_scheme(url)).text
- for match in util.re(r"https?://[^\s\"']+").finditer(page):
+ for match in text.re(r"https?://[^\s\"']+").finditer(page):
yield Message.Queue, match[0], {}
diff --git a/gallery_dl/extractor/redbust.py b/gallery_dl/extractor/redbust.py
deleted file mode 100644
index d00ed52..0000000
--- a/gallery_dl/extractor/redbust.py
+++ /dev/null
@@ -1,186 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://redbust.com/"""
-
-from .common import GalleryExtractor, Extractor, Message
-from .. import text
-
-BASE_PATTERN = r"(?:https?://)?redbust\.com"
-
-
-class RedbustExtractor(Extractor):
- """Base class for RedBust extractors"""
- category = "redbust"
- root = "https://redbust.com"
- filename_fmt = "{filename}.{extension}"
-
- def items(self):
- data = {"_extractor": RedbustGalleryExtractor}
- for url in self.galleries():
- yield Message.Queue, url, data
-
- def _pagination(self, path, page=None):
- if page is None:
- url = f"{self.root}{path}/"
- base = url + "page/"
- page = self.request(url).text
- else:
- base = f"{self.root}{path}/page/"
-
- pnum = 1
- while True:
- for post in text.extract_iter(
- page, '<h2 class="post-title">', "rel="):
- yield text.extr(post, 'href="', '"')
-
- pnum += 1
- url = f"{base}{pnum}/"
- if url not in page:
- return
- page = self.request(url).text
-
-
-class RedbustGalleryExtractor(GalleryExtractor, RedbustExtractor):
- """Extractor for RedBust galleries"""
- pattern = BASE_PATTERN + r"/([\w-]+)/?$"
- example = "https://redbust.com/TITLE/"
-
- def items(self):
- url = f"{self.root}/{self.groups[0]}/"
- self.page = page = self.request(url).text
-
- self.gallery_id = gid = text.extr(
- page, "<link rel='shortlink' href='https://redbust.com/?p=", "'")
-
- if gid:
- self.page_url = False
- return GalleryExtractor.items(self)
- else:
- self.subcategory = "category"
- return self._items_category(page)
-
- def _items_category(self, _):
- page = self.page
- data = {"_extractor": RedbustGalleryExtractor}
- base = f"{self.root}/{self.groups[0]}/page/"
- pnum = 1
-
- while True:
- for post in text.extract_iter(
- page, '<h2 class="post-title">', "rel="):
- url = text.extr(post, 'href="', '"')
- yield Message.Queue, url, data
-
- pnum += 1
- url = f"{base}{pnum}/"
- if url not in page:
- return
- page = self.request(url).text
-
- def metadata(self, _):
- extr = text.extract_from(self.page)
-
- return {
- "gallery_id" : self.gallery_id,
- "gallery_slug": self.groups[0],
- "categories" : text.split_html(extr(
- '<li class="category">', "</li>"))[::2],
- "title" : text.unescape(extr('class="post-title">', "<")),
- "date" : text.parse_datetime(
- extr('class="post-byline">', "<").strip(), "%B %d, %Y"),
- "views" : text.parse_int(extr("</b>", "v").replace(",", "")),
- "tags" : text.split_html(extr(
- 'class="post-tags">', "</p"))[1:],
- }
-
- def images(self, _):
- results = []
-
- for img in text.extract_iter(self.page, "'><img ", ">"):
- if src := text.extr(img, 'src="', '"'):
- path, _, end = src.rpartition("-")
- if "x" in end:
- url = f"{path}.{end.rpartition('.')[2]}"
- data = None if src == url else {"_fallback": (src,)}
- else:
- url = src
- data = None
- results.append((url, data))
-
- if not results:
- # fallback for older galleries
- for path in text.extract_iter(
- self.page, '<img src="/wp-content/uploads/', '"'):
- results.append(
- (f"{self.root}/wp-content/uploads/{path}", None))
-
- return results
-
-
-class RedbustTagExtractor(RedbustExtractor):
- """Extractor for RedBust tag searches"""
- subcategory = "tag"
- pattern = BASE_PATTERN + r"/tag/([\w-]+)"
- example = "https://redbust.com/tag/TAG/"
-
- def galleries(self):
- return self._pagination("/tag/" + self.groups[0])
-
-
-class RedbustArchiveExtractor(RedbustExtractor):
- """Extractor for RedBust monthly archive collections"""
- subcategory = "archive"
- pattern = BASE_PATTERN + r"(/\d{4}/\d{2})"
- example = "https://redbust.com/2010/01/"
-
- def galleries(self):
- return self._pagination(self.groups[0])
-
-
-class RedbustImageExtractor(RedbustExtractor):
- """Extractor for RedBust images"""
- subcategory = "image"
- directory_fmt = ("{category}", "{title}")
- pattern = BASE_PATTERN + r"/(?!tag/|\d{4}/)([\w-]+)/([\w-]+)/?$"
- example = "https://redbust.com/TITLE/SLUG/"
-
- def items(self):
- gallery_slug, image_slug = self.groups
- url = f"{self.root}/{gallery_slug}/{image_slug}/"
- page = self.request(url).text
-
- img_url = None
-
- # Look for the largest image in srcset first
- if srcset := text.extr(page, 'srcset="', '"'):
- # Extract the largest image from srcset (typically last one)
- urls = srcset.split(", ")
- img_url = urls[-1].partition(" ")[0] if urls else None
-
- # Fallback to original extraction method
- if not img_url:
- if entry := text.extr(page, "entry-inner ", "alt="):
- img_url = text.extr(entry, "img src=", " ").strip("\"'")
-
- if not img_url:
- return
-
- end = img_url.rpartition("-")[2]
- data = text.nameext_from_url(img_url, {
- "title" : text.unescape(text.extr(
- page, 'title="Return to ', '"')),
- "image_id" : text.extr(
- page, "rel='shortlink' href='https://redbust.com/?p=", "'"),
- "gallery_slug": gallery_slug,
- "image_slug" : image_slug,
- "num" : text.parse_int(end.partition(".")[0]),
- "count" : 1,
- "url" : img_url,
- })
-
- yield Message.Directory, data
- yield Message.Url, img_url, data
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index e20d80e..cc73e47 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -33,11 +33,11 @@ class RedditExtractor(Extractor):
previews = self.config("previews", True)
embeds = self.config("embeds", True)
- if videos := self.config("videos", True):
- if videos == "ytdl":
- self._extract_video = self._extract_video_ytdl
- elif videos == "dash":
+ if videos := self.config("videos", "dash"):
+ if videos == "dash":
self._extract_video = self._extract_video_dash
+ elif videos == "ytdl":
+ self._extract_video = self._extract_video_ytdl
videos = True
selftext = self.config("selftext")
@@ -57,9 +57,9 @@ class RedditExtractor(Extractor):
if submission:
submission["comment"] = None
- submission["date"] = text.parse_timestamp(
+ submission["date"] = self.parse_timestamp(
submission["created_utc"])
- yield Message.Directory, submission
+ yield Message.Directory, "", submission
visited.add(submission["id"])
submission["num"] = 0
@@ -86,7 +86,7 @@ class RedditExtractor(Extractor):
yield Message.Url, url, submission
elif embeds and "media_metadata" in media:
- for embed in self._extract_embed(submission):
+ for embed in self._extract_embed(submission, media):
submission["num"] += 1
text.nameext_from_url(embed, submission)
yield Message.Url, embed, submission
@@ -94,6 +94,8 @@ class RedditExtractor(Extractor):
elif media["is_video"]:
if videos:
text.nameext_from_url(url, submission)
+ if not submission["extension"]:
+ submission["extension"] = "mp4"
url = "ytdl:" + self._extract_video(media)
yield Message.Url, url, submission
@@ -105,14 +107,14 @@ class RedditExtractor(Extractor):
urls.append((url, submission))
elif parentdir:
- yield Message.Directory, comments[0]
+ yield Message.Directory, "", comments[0]
if self.api.comments:
if comments and not submission:
submission = comments[0]
submission.setdefault("num", 0)
if not parentdir:
- yield Message.Directory, submission
+ yield Message.Directory, "", submission
for comment in comments:
media = (embeds and "media_metadata" in comment)
@@ -124,11 +126,11 @@ class RedditExtractor(Extractor):
data = submission.copy()
data["comment"] = comment
- comment["date"] = text.parse_timestamp(
+ comment["date"] = self.parse_timestamp(
comment["created_utc"])
if media:
- for url in self._extract_embed(comment):
+ for url in self._extract_embed(data, comment):
data["num"] += 1
text.nameext_from_url(url, data)
yield Message.Url, url, data
@@ -199,8 +201,8 @@ class RedditExtractor(Extractor):
submission["id"], item["media_id"])
self.log.debug(src)
- def _extract_embed(self, submission):
- meta = submission["media_metadata"]
+ def _extract_embed(self, submission, media):
+ meta = media["media_metadata"]
if not meta:
return
@@ -317,8 +319,8 @@ class RedditSubmissionExtractor(RedditExtractor):
"""Extractor for URLs from a submission on reddit.com"""
subcategory = "submission"
pattern = (r"(?:https?://)?(?:"
- r"(?:\w+\.)?reddit\.com/(?:(?:r|u|user)/[^/?#]+"
- r"/comments|gallery)|redd\.it)/([a-z0-9]+)")
+ r"(?:\w+\.)?reddit\.com/(?:(?:(?:r|u|user)/[^/?#]+/)?"
+ r"comments|gallery)|redd\.it)/([a-z0-9]+)")
example = "https://www.reddit.com/r/SUBREDDIT/comments/id/"
def __init__(self, match):
@@ -352,7 +354,7 @@ class RedditImageExtractor(Extractor):
def items(self):
url = f"https://{self.domain}/{self.path}{self.query}"
data = text.nameext_from_url(url)
- yield Message.Directory, data
+ yield Message.Directory, "", data
yield Message.Url, url, data
@@ -394,7 +396,7 @@ class RedditAPI():
self.morecomments = config("morecomments", False)
self._warn_429 = False
- if config("api") == "rest":
+ if config("api") != "oauth":
self.root = "https://www.reddit.com"
self.headers = None
self.authenticate = util.noop
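
Two release-wide changes are visible in the reddit.py hunks above: Message.Directory events now carry three elements instead of two (the purpose of the new empty-string slot is not stated in this diff; it looks like a directory/format hint), and any 'api' value other than "oauth" now selects the public REST endpoint. A minimal consumer sketch that tolerates both message shapes, using the real Message constants; handle_directory() and handle_url() are hypothetical callbacks:

from gallery_dl.extractor.message import Message

def process(messages):
    for msg in messages:
        if msg[0] == Message.Directory:
            data = msg[-1]           # metadata dict is last in both the old
            handle_directory(data)   # (Directory, data) and new (Directory, "", data) forms
        elif msg[0] == Message.Url:
            handle_url(msg[1], msg[2])  # (Url, url, data)
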
diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py
index 4098c54..164fdf4 100644
--- a/gallery_dl/extractor/redgifs.py
+++ b/gallery_dl/extractor/redgifs.py
@@ -51,8 +51,8 @@ class RedgifsExtractor(Extractor):
gif.update(metadata)
gif["count"] = cnt
- gif["date"] = text.parse_timestamp(gif.get("createDate"))
- yield Message.Directory, gif
+ gif["date"] = self.parse_timestamp(gif.get("createDate"))
+ yield Message.Directory, "", gif
for num, gif in enumerate(gifs, enum):
gif["_fallback"] = formats = self._formats(gif)
diff --git a/gallery_dl/extractor/rule34us.py b/gallery_dl/extractor/rule34us.py
index a43ea4c..d48539e 100644
--- a/gallery_dl/extractor/rule34us.py
+++ b/gallery_dl/extractor/rule34us.py
@@ -9,7 +9,7 @@
"""Extractors for https://rule34.us/"""
from .booru import BooruExtractor
-from .. import text, util
+from .. import text
import collections
@@ -19,7 +19,7 @@ class Rule34usExtractor(BooruExtractor):
per_page = 42
def _init(self):
- self._find_tags = util.re(
+ self._find_tags = text.re(
r'<li class="([^-"]+)-tag"[^>]*><a href="[^;"]+;q=([^"]+)').findall
def _parse_post(self, post_id):
@@ -57,7 +57,7 @@ class Rule34usTagExtractor(Rule34usExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
- pattern = r"(?:https?://)?rule34\.us/index\.php\?r=posts/index&q=([^&#]+)"
+ pattern = r"(?:https?://)?rule34\.us/index\.php\?r=posts/index&q=([^&#]*)"
example = "https://rule34.us/index.php?r=posts/index&q=TAG"
def __init__(self, match):
diff --git a/gallery_dl/extractor/rule34vault.py b/gallery_dl/extractor/rule34vault.py
index 14d5aef..9f75f64 100644
--- a/gallery_dl/extractor/rule34vault.py
+++ b/gallery_dl/extractor/rule34vault.py
@@ -36,8 +36,7 @@ class Rule34vaultExtractor(BooruExtractor):
def _prepare(self, post):
post.pop("files", None)
- post["date"] = text.parse_datetime(
- post["created"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ post["date"] = self.parse_datetime_iso(post["created"])
if "tags" in post:
post["tags"] = [t["value"] for t in post["tags"]]
@@ -80,7 +79,7 @@ class Rule34vaultExtractor(BooruExtractor):
class Rule34vaultPostExtractor(Rule34vaultExtractor):
subcategory = "post"
archive_fmt = "{id}"
- pattern = BASE_PATTERN + r"/post/(\d+)"
+ pattern = rf"{BASE_PATTERN}/post/(\d+)"
example = "https://rule34vault.com/post/12345"
def posts(self):
@@ -91,7 +90,7 @@ class Rule34vaultPlaylistExtractor(Rule34vaultExtractor):
subcategory = "playlist"
directory_fmt = ("{category}", "{playlist_id}")
archive_fmt = "p_{playlist_id}_{id}"
- pattern = BASE_PATTERN + r"/playlists/view/(\d+)"
+ pattern = rf"{BASE_PATTERN}/playlists/view/(\d+)"
example = "https://rule34vault.com/playlists/view/12345"
def metadata(self):
@@ -106,7 +105,7 @@ class Rule34vaultTagExtractor(Rule34vaultExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
- pattern = BASE_PATTERN + r"/(?!p(?:ost|laylists)/)([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/(?!p(?:ost|laylists)/)([^/?#]+)"
example = "https://rule34vault.com/TAG"
def metadata(self):
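
The parse_datetime_iso() helper adopted above (and throughout this release) replaces hand-written strptime formats such as "%Y-%m-%dT%H:%M:%S.%fZ". A sketch of the presumable behavior — an illustration, not the upstream implementation:

from datetime import datetime

def parse_datetime_iso(value):
    try:
        # map the trailing "Z" to an explicit offset so that
        # fromisoformat() also accepts it on Python < 3.11
        return datetime.fromisoformat(value.replace("Z", "+00:00"))
    except (TypeError, ValueError):
        return value  # gallery-dl's date parsers return their input on failure

parse_datetime_iso("2024-01-02T03:04:05.678Z")  # -> 2024-01-02 03:04:05.678000+00:00
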
diff --git a/gallery_dl/extractor/rule34xyz.py b/gallery_dl/extractor/rule34xyz.py
index 05915ba..ddd656f 100644
--- a/gallery_dl/extractor/rule34xyz.py
+++ b/gallery_dl/extractor/rule34xyz.py
@@ -68,8 +68,7 @@ class Rule34xyzExtractor(BooruExtractor):
def _prepare(self, post):
post.pop("files", None)
- post["date"] = text.parse_datetime(
- post["created"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ post["date"] = self.parse_datetime_iso(post["created"])
post["filename"], _, post["format"] = post["filename"].rpartition(".")
if "tags" in post:
post["tags"] = [t["value"] for t in post["tags"]]
@@ -135,7 +134,7 @@ class Rule34xyzExtractor(BooruExtractor):
class Rule34xyzPostExtractor(Rule34xyzExtractor):
subcategory = "post"
archive_fmt = "{id}"
- pattern = BASE_PATTERN + r"/post/(\d+)"
+ pattern = rf"{BASE_PATTERN}/post/(\d+)"
example = "https://rule34.xyz/post/12345"
def posts(self):
@@ -146,7 +145,7 @@ class Rule34xyzPlaylistExtractor(Rule34xyzExtractor):
subcategory = "playlist"
directory_fmt = ("{category}", "{playlist_id}")
archive_fmt = "p_{playlist_id}_{id}"
- pattern = BASE_PATTERN + r"/playlists/view/(\d+)"
+ pattern = rf"{BASE_PATTERN}/playlists/view/(\d+)"
example = "https://rule34.xyz/playlists/view/12345"
def metadata(self):
@@ -161,7 +160,7 @@ class Rule34xyzTagExtractor(Rule34xyzExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
- pattern = BASE_PATTERN + r"/([^/?#]+)$"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)$"
example = "https://rule34.xyz/TAG"
def metadata(self):
diff --git a/gallery_dl/extractor/s3ndpics.py b/gallery_dl/extractor/s3ndpics.py
index 215f160..9201a3f 100644
--- a/gallery_dl/extractor/s3ndpics.py
+++ b/gallery_dl/extractor/s3ndpics.py
@@ -30,15 +30,13 @@ class S3ndpicsExtractor(Extractor):
for post in self.posts():
post["id"] = post.pop("_id", None)
post["user"] = post.pop("userId", None)
- post["date"] = text.parse_datetime(
- post["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
- post["date_updated"] = text.parse_datetime(
- post["updatedAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ post["date"] = self.parse_datetime_iso(post["createdAt"])
+ post["date_updated"] = self.parse_datetime_iso(post["updatedAt"])
files = post.pop("files", ())
post["count"] = len(files)
- yield Message.Directory, post
+ yield Message.Directory, "", post
for post["num"], file in enumerate(files, 1):
post["type"] = file["type"]
path = file["url"]
diff --git a/gallery_dl/extractor/saint.py b/gallery_dl/extractor/saint.py
index 07d490a..e15c628 100644
--- a/gallery_dl/extractor/saint.py
+++ b/gallery_dl/extractor/saint.py
@@ -18,7 +18,7 @@ class SaintAlbumExtractor(LolisafeAlbumExtractor):
"""Extractor for saint albums"""
category = "saint"
root = "https://saint2.su"
- pattern = BASE_PATTERN + r"/a/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/a/([^/?#]+)"
example = "https://saint2.su/a/ID"
def fetch_album(self, album_id):
@@ -36,7 +36,7 @@ class SaintAlbumExtractor(LolisafeAlbumExtractor):
break
files.append({
"id2" : id2,
- "date" : text.parse_timestamp(extr("", ".")),
+ "date" : self.parse_timestamp(extr("", ".")),
"id" : extr("/embed/", '"'),
"size" : text.parse_int(extr('data="', '"')),
"file" : text.unescape(extr(
@@ -58,7 +58,7 @@ class SaintMediaExtractor(SaintAlbumExtractor):
"""Extractor for saint media links"""
subcategory = "media"
directory_fmt = ("{category}",)
- pattern = BASE_PATTERN + r"(/(embe)?d/([^/?#]+))"
+ pattern = rf"{BASE_PATTERN}(/(embe)?d/([^/?#]+))"
example = "https://saint2.su/embed/ID"
def fetch_album(self, album_id):
@@ -73,7 +73,7 @@ class SaintMediaExtractor(SaintAlbumExtractor):
file = {
"id" : album_id,
"id2" : extr("/thumbs/", "-"),
- "date" : text.parse_timestamp(extr("", ".")),
+ "date" : self.parse_timestamp(extr("", ".")),
"file" : text.unescape(extr('<source src="', '"')),
"id_dl": extr("/d/", "'"),
}
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index 5caad4b..690b515 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -16,7 +16,7 @@ import collections
BASE_PATTERN = r"(?:https?://)?" \
r"(?:(?:chan|www|beta|black|white)\.sankakucomplex\.com|sankaku\.app)" \
- r"(?:/[a-z]{2})?"
+ r"(?:/[a-z]{2}(?:[-_][A-Z]{2})?)?"
class SankakuExtractor(BooruExtractor):
@@ -47,7 +47,7 @@ class SankakuExtractor(BooruExtractor):
self.api = SankakuAPI(self)
if self.config("tags") == "extended":
self._tags = self._tags_extended
- self._tags_findall = util.re(
+ self._tags_findall = text.re(
r"tag-type-([^\"' ]+).*?\?tags=([^\"'&]+)").findall
def _file_url(self, post):
@@ -61,13 +61,13 @@ class SankakuExtractor(BooruExtractor):
self.log.warning(
"Login required to download 'contentious_content' posts")
SankakuExtractor._warning = False
- elif url[8] == "v":
- url = "https://s.sankakucomplex.com" + url[url.index("/", 8):]
+ elif url[4] != "s":
+ url = "https" + url[4:]
return url
def _prepare(self, post):
post["created_at"] = post["created_at"]["s"]
- post["date"] = text.parse_timestamp(post["created_at"])
+ post["date"] = self.parse_timestamp(post["created_at"])
post["tags"] = post.pop("tag_names", ())
post["tag_string"] = " ".join(post["tags"])
post["_http_validate"] = self._check_expired
@@ -119,7 +119,7 @@ class SankakuTagExtractor(SankakuExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
- pattern = BASE_PATTERN + r"(?:/posts)?/?\?([^#]*)"
+ pattern = rf"{BASE_PATTERN}(?:/posts)?/?\?([^#]*)"
example = "https://sankaku.app/?tags=TAG"
def __init__(self, match):
@@ -129,10 +129,10 @@ class SankakuTagExtractor(SankakuExtractor):
if "date:" in self.tags:
# rewrite 'date:' tags (#1790)
- self.tags = util.re(
+ self.tags = text.re(
r"date:(\d\d)[.-](\d\d)[.-](\d\d\d\d)(?!T)").sub(
r"date:\3-\2-\1T00:00", self.tags)
- self.tags = util.re(
+ self.tags = text.re(
r"date:(\d\d\d\d)[.-](\d\d)[.-](\d\d)(?!T)").sub(
r"date:\1-\2-\3T00:00", self.tags)
@@ -149,7 +149,7 @@ class SankakuPoolExtractor(SankakuExtractor):
subcategory = "pool"
directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name_en]}")
archive_fmt = "p_{pool}_{id}"
- pattern = BASE_PATTERN + r"/(?:books|pools?/show)/(\w+)"
+ pattern = rf"{BASE_PATTERN}/(?:books|pools?/show)/(\w+)"
example = "https://sankaku.app/books/12345"
def metadata(self):
@@ -171,7 +171,7 @@ class SankakuPostExtractor(SankakuExtractor):
"""Extractor for single posts from sankaku.app"""
subcategory = "post"
archive_fmt = "{id}"
- pattern = BASE_PATTERN + r"/posts?(?:/show)?/(\w+)"
+ pattern = rf"{BASE_PATTERN}/posts?(?:/show)?/(\w+)"
example = "https://sankaku.app/post/show/12345"
def posts(self):
@@ -181,7 +181,7 @@ class SankakuPostExtractor(SankakuExtractor):
class SankakuBooksExtractor(SankakuExtractor):
"""Extractor for books by tag search on sankaku.app"""
subcategory = "books"
- pattern = BASE_PATTERN + r"/books/?\?([^#]*)"
+ pattern = rf"{BASE_PATTERN}/books/?\?([^#]*)"
example = "https://sankaku.app/books?tags=TAG"
def __init__(self, match):
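
The widened Sankaku BASE_PATTERN above now also accepts region-qualified language prefixes. A quick self-contained check, with the pattern copied exactly from this diff:

import re

BASE_PATTERN = (r"(?:https?://)?"
                r"(?:(?:chan|www|beta|black|white)\.sankakucomplex\.com"
                r"|sankaku\.app)"
                r"(?:/[a-z]{2}(?:[-_][A-Z]{2})?)?")
POST_PATTERN = BASE_PATTERN + r"/posts?(?:/show)?/(\w+)"

for url in ("https://sankaku.app/post/show/12345",
            "https://chan.sankakucomplex.com/en/post/show/12345",
            "https://chan.sankakucomplex.com/zh-CN/post/show/12345",   # newly matched
            "https://chan.sankakucomplex.com/pt_BR/post/show/12345"):  # newly matched
    print(bool(re.match(POST_PATTERN, url)))  # True for all four
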
diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py
index 405e07e..cf5af81 100644
--- a/gallery_dl/extractor/sankakucomplex.py
+++ b/gallery_dl/extractor/sankakucomplex.py
@@ -40,7 +40,7 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
extr('property="og:title" content="', '"')),
"description": text.unescape(
extr('property="og:description" content="', '"')),
- "date" : text.parse_datetime(
+ "date" : self.parse_datetime_iso(
extr('property="article:published_time" content="', '"')),
}
content = extr('<div class="entry-content">', '</article>')
@@ -53,7 +53,7 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
files += self._extract_embeds(content)
data["count"] = len(files)
- yield Message.Directory, data
+ yield Message.Directory, "", data
for num, url in enumerate(files, 1):
file = text.nameext_from_url(url)
if url[0] == "/":
@@ -64,19 +64,19 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
yield Message.Url, url, file
def _extract_images(self, content):
- orig_sub = util.re(r"-\d+x\d+\.").sub
+ orig_sub = text.re(r"-\d+x\d+\.").sub
return [
orig_sub(".", url) for url in
util.unique(text.extract_iter(content, 'data-lazy-src="', '"'))
]
def _extract_videos(self, content):
- return util.re(r"<source [^>]*src=[\"']([^\"']+)").findall(content)
+ return text.re(r"<source [^>]*src=[\"']([^\"']+)").findall(content)
def _extract_embeds(self, content):
return [
"ytdl:" + url for url in
- util.re(r"<iframe [^>]*src=[\"']([^\"']+)").findall(content)
+ text.re(r"<iframe [^>]*src=[\"']([^\"']+)").findall(content)
]
diff --git a/gallery_dl/extractor/schalenetwork.py b/gallery_dl/extractor/schalenetwork.py
index a4ef3b0..bbbb9da 100644
--- a/gallery_dl/extractor/schalenetwork.py
+++ b/gallery_dl/extractor/schalenetwork.py
@@ -126,7 +126,7 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
data = self.request_json(url, headers=headers)
try:
- data["date"] = text.parse_timestamp(data["created_at"] // 1000)
+ data["date"] = self.parse_timestamp(data["created_at"] // 1000)
data["count"] = len(data["thumbnails"]["entries"])
del data["thumbnails"]
except Exception:
@@ -138,14 +138,13 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
name = tag["name"]
namespace = tag.get("namespace", 0)
tags.append(types[namespace] + ":" + name)
- data["tags"] = tags
-
if self.config("tags", False):
- tags = collections.defaultdict(list)
+ categories = collections.defaultdict(list)
for tag in data["tags"]:
- tags[tag.get("namespace", 0)].append(tag["name"])
- for type, values in tags.items():
+ categories[tag.get("namespace", 0)].append(tag["name"])
+ for type, values in categories.items():
data["tags_" + types[type]] = values
+ data["tags"] = tags
url = f"{self.root_api}/books/detail/{gid}/{gkey}?crt={self._crt()}"
if token := self._token(False):
@@ -169,6 +168,20 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
url = (f"{self.root_api}/books/data/{gid}/{gkey}"
f"/{fmt['id']}/{fmt['key']}/{fmt['w']}?crt={self._crt()}")
headers = self.headers
+
+ if self.config("cbz", False):
+ headers["Authorization"] = self._token()
+ dl = self.request_json(
+ f"{url}&action=dl", method="POST", headers=headers)
+ # 'crt' parameter here is necessary for 'hdoujin' downloads
+ url = f"{dl['base']}?crt={self._crt()}"
+ info = text.nameext_from_url(url)
+ if "fallback" in dl:
+ info["_fallback"] = (dl["fallback"],)
+ if not info["extension"]:
+ info["extension"] = "cbz"
+ return ((url, info),)
+
data = self.request_json(url, headers=headers)
base = data["base"]
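
Two details in the schalenetwork hunks above: renaming the inner defaultdict to 'categories' stops it from shadowing the flat 'tags' list (so the deferred data["tags"] assignment stores the formatted strings while the loop still sees the raw API dicts), and the new 'cbz' option replaces the per-page image list with a single archive download. A condensed sketch of that branch with hypothetical data; the "base"/"fallback" shape is inferred from this diff:

def cbz_files(dl, crt):
    # 'dl' is the JSON returned by the extra "&action=dl" POST
    info = {"extension": "cbz"}
    if "fallback" in dl:
        info["_fallback"] = (dl["fallback"],)
    return ((f"{dl['base']}?crt={crt}", info),)

print(cbz_files({"base": "https://example.org/books/1"}, "TOKEN"))
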
diff --git a/gallery_dl/extractor/scrolller.py b/gallery_dl/extractor/scrolller.py
index ff191db..b853f53 100644
--- a/gallery_dl/extractor/scrolller.py
+++ b/gallery_dl/extractor/scrolller.py
@@ -34,7 +34,7 @@ class ScrolllerExtractor(Extractor):
files = self._extract_files(post)
post["count"] = len(files)
- yield Message.Directory, post
+ yield Message.Directory, "", post
for file in files:
url = file["url"]
post.update(file)
@@ -136,7 +136,7 @@ class ScrolllerExtractor(Extractor):
class ScrolllerSubredditExtractor(ScrolllerExtractor):
"""Extractor for media from a scrolller subreddit"""
subcategory = "subreddit"
- pattern = BASE_PATTERN + r"(/r/[^/?#]+)(?:/?\?([^#]+))?"
+ pattern = rf"{BASE_PATTERN}(/r/[^/?#]+)(?:/?\?([^#]+))?"
example = "https://scrolller.com/r/SUBREDDIT"
def posts(self):
@@ -173,7 +173,7 @@ class ScrolllerSubredditExtractor(ScrolllerExtractor):
class ScrolllerFollowingExtractor(ScrolllerExtractor):
"""Extractor for followed scrolller subreddits"""
subcategory = "following"
- pattern = BASE_PATTERN + r"/following"
+ pattern = rf"{BASE_PATTERN}/following"
example = "https://scrolller.com/following"
def items(self):
@@ -199,7 +199,7 @@ class ScrolllerFollowingExtractor(ScrolllerExtractor):
class ScrolllerPostExtractor(ScrolllerExtractor):
"""Extractor for media from a single scrolller post"""
subcategory = "post"
- pattern = BASE_PATTERN + r"/(?!r/|following$)([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/(?!r/|following$)([^/?#]+)"
example = "https://scrolller.com/TITLE-SLUG-a1b2c3d4f5"
def posts(self):
diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py
index 7319731..705227d 100644
--- a/gallery_dl/extractor/seiga.py
+++ b/gallery_dl/extractor/seiga.py
@@ -31,7 +31,7 @@ class SeigaExtractor(Extractor):
images = iter(self.get_images())
data = next(images)
- yield Message.Directory, data
+ yield Message.Directory, "", data
for image in util.advance(images, self.start_image):
data.update(image)
data["extension"] = None
@@ -213,7 +213,7 @@ class SeigaImageExtractor(SeigaExtractor):
data["description"] = text.remove_html(data["description"])
data["image_id"] = text.parse_int(self.image_id)
- data["date"] = text.parse_datetime(
+ data["date"] = self.parse_datetime(
data["date"] + ":00+0900", "%Y年%m月%d日 %H:%M:%S%z")
return (data, data)
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py
index 2feb64e..b599f70 100644
--- a/gallery_dl/extractor/sexcom.py
+++ b/gallery_dl/extractor/sexcom.py
@@ -9,8 +9,7 @@
"""Extractors for https://www.sex.com/"""
from .common import Extractor, Message
-from .. import text
-from datetime import datetime
+from .. import text, dt
BASE_PATTERN = r"(?:https?://)?(?:www\.)?sex\.com(?:/[a-z]{2})?"
@@ -26,7 +25,7 @@ class SexcomExtractor(Extractor):
def items(self):
self.gifs = self.config("gifs", True)
- yield Message.Directory, self.metadata()
+ yield Message.Directory, "", self.metadata()
for pin in map(self._parse_pin, self.pins()):
if not pin:
continue
@@ -34,10 +33,10 @@ class SexcomExtractor(Extractor):
url = pin["url"]
parts = url.rsplit("/", 4)
try:
- pin["date_url"] = dt = datetime(
+ pin["date_url"] = d = dt.datetime(
int(parts[1]), int(parts[2]), int(parts[3]))
if "date" not in pin:
- pin["date"] = dt
+ pin["date"] = d
except Exception:
pass
pin["tags"] = [t[1:] for t in pin["tags"]]
@@ -136,7 +135,7 @@ class SexcomExtractor(Extractor):
text.nameext_from_url(data["url"], data)
data["uploader"] = extr('itemprop="author">', '<')
- data["date"] = text.parse_datetime(extr('datetime="', '"'))
+ data["date"] = dt.parse_iso(extr('datetime="', '"'))
data["tags"] = text.split_html(extr('class="tags"> Tags', '</div>'))
data["comments"] = text.parse_int(extr('Comments (', ')'))
@@ -195,8 +194,8 @@ class SexcomPinExtractor(SexcomExtractor):
"""Extractor for a pinned image or video on www.sex.com"""
subcategory = "pin"
directory_fmt = ("{category}",)
- pattern = (BASE_PATTERN +
- r"(/(?:\w\w/(?:pic|gif|video)s|pin)/\d+/?)(?!.*#related$)")
+ pattern = (rf"{BASE_PATTERN}"
+ rf"(/(?:\w\w/(?:pic|gif|video)s|pin)/\d+/?)(?!.*#related$)")
example = "https://www.sex.com/pin/12345-TITLE/"
def pins(self):
@@ -207,7 +206,7 @@ class SexcomRelatedPinExtractor(SexcomPinExtractor):
"""Extractor for related pins on www.sex.com"""
subcategory = "related-pin"
directory_fmt = ("{category}", "related {original_pin[pin_id]}")
- pattern = BASE_PATTERN + r"(/pin/(\d+)/?).*#related$"
+ pattern = rf"{BASE_PATTERN}(/pin/(\d+)/?).*#related$"
example = "https://www.sex.com/pin/12345#related"
def metadata(self):
@@ -224,7 +223,7 @@ class SexcomPinsExtractor(SexcomExtractor):
"""Extractor for a user's pins on www.sex.com"""
subcategory = "pins"
directory_fmt = ("{category}", "{user}")
- pattern = BASE_PATTERN + r"/user/([^/?#]+)/pins/"
+ pattern = rf"{BASE_PATTERN}/user/([^/?#]+)/pins/"
example = "https://www.sex.com/user/USER/pins/"
def metadata(self):
@@ -239,7 +238,7 @@ class SexcomLikesExtractor(SexcomExtractor):
"""Extractor for a user's liked pins on www.sex.com"""
subcategory = "likes"
directory_fmt = ("{category}", "{user}", "Likes")
- pattern = BASE_PATTERN + r"/user/([^/?#]+)/likes/"
+ pattern = rf"{BASE_PATTERN}/user/([^/?#]+)/likes/"
example = "https://www.sex.com/user/USER/likes/"
def metadata(self):
@@ -254,8 +253,8 @@ class SexcomBoardExtractor(SexcomExtractor):
"""Extractor for pins from a board on www.sex.com"""
subcategory = "board"
directory_fmt = ("{category}", "{user}", "{board}")
- pattern = (BASE_PATTERN + r"/user"
- r"/([^/?#]+)/(?!(?:following|pins|repins|likes)/)([^/?#]+)")
+ pattern = (rf"{BASE_PATTERN}/user"
+ rf"/([^/?#]+)/(?!(?:following|pins|repins|likes)/)([^/?#]+)")
example = "https://www.sex.com/user/USER/BOARD/"
def metadata(self):
@@ -270,14 +269,31 @@ class SexcomBoardExtractor(SexcomExtractor):
return self._pagination(url)
+class SexcomFeedExtractor(SexcomExtractor):
+ """Extractor for pins from your account's main feed on www.sex.com"""
+ subcategory = "feed"
+ directory_fmt = ("{category}", "feed")
+ pattern = rf"{BASE_PATTERN}/feed"
+ example = "https://www.sex.com/feed/"
+
+ def metadata(self):
+ return {"feed": True}
+
+ def pins(self):
+ if not self.cookies_check(("sess_sex",)):
+ self.log.warning("no 'sess_sex' cookie set")
+ url = f"{self.root}/feed/"
+ return self._pagination(url)
+
+
class SexcomSearchExtractor(SexcomExtractor):
"""Extractor for search results on www.sex.com"""
subcategory = "search"
directory_fmt = ("{category}", "search", "{search[search]}")
- pattern = (BASE_PATTERN + r"/(?:"
- r"(pic|gif|video)s(?:\?(search=[^#]+)$|/([^/?#]*))"
- r"|search/(pic|gif|video)s"
- r")/?(?:\?([^#]+))?")
+ pattern = (rf"{BASE_PATTERN}/(?:"
+ rf"(pic|gif|video)s(?:\?(search=[^#]+)$|/([^/?#]*))"
+ rf"|search/(pic|gif|video)s"
+ rf")/?(?:\?([^#]+))?")
example = "https://www.sex.com/search/pics?query=QUERY"
def _init(self):
@@ -314,7 +330,7 @@ class SexcomSearchExtractor(SexcomExtractor):
parts = path.rsplit("/", 4)
try:
- pin["date_url"] = pin["date"] = datetime(
+ pin["date_url"] = pin["date"] = dt.datetime(
int(parts[1]), int(parts[2]), int(parts[3]))
except Exception:
pass
@@ -329,7 +345,7 @@ class SexcomSearchExtractor(SexcomExtractor):
path = f"{path[:-4]}gif"
pin["url"] = f"{root}{path}"
- yield Message.Directory, pin
+ yield Message.Directory, "", pin
yield Message.Url, pin["url"], pin
if params["page"] >= data["paging"]["numberOfPages"]:
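
The date_url logic above relies on sex.com media paths embedding the upload date, now parsed via the package's own dt module instead of the stdlib import. Worked example with a hypothetical URL:

from datetime import datetime  # stand-in for the dt.datetime used above

url = "https://cdn.sex.com/images/2023/07/19/photo.jpg"  # hypothetical
parts = url.rsplit("/", 4)  # ['.../images', '2023', '07', '19', 'photo.jpg']
print(datetime(int(parts[1]), int(parts[2]), int(parts[3])))  # 2023-07-19 00:00:00
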
diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py
index 36b083b..5572b4d 100644
--- a/gallery_dl/extractor/shimmie2.py
+++ b/gallery_dl/extractor/shimmie2.py
@@ -25,6 +25,8 @@ class Shimmie2Extractor(BaseExtractor):
if file_url := self.config_instance("file_url"):
self.file_url_fmt = file_url
+ if quote := self.config_instance("quote"):
+ self._quote_type = lambda _: quote
def items(self):
data = self.metadata()
@@ -44,7 +46,7 @@ class Shimmie2Extractor(BaseExtractor):
else:
text.nameext_from_url(url, post)
- yield Message.Directory, post
+ yield Message.Directory, "", post
yield Message.Url, url, post
def metadata(self):
@@ -85,6 +87,11 @@ BASE_PATTERN = Shimmie2Extractor.update({
"root": "https://co.llection.pics",
"pattern": r"co\.llection\.pics",
},
+ "soybooru": {
+ "root": "https://soybooru.com",
+ "pattern": r"soybooru\.com",
+ "quote": "'",
+ },
}) + r"/(?:index\.php\?q=/?)?"
@@ -93,7 +100,7 @@ class Shimmie2TagExtractor(Shimmie2Extractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
file_url_fmt = "{}/_images/{}/{}%20-%20{}.{}"
- pattern = BASE_PATTERN + r"post/list/([^/?#]+)(?:/(\d+))?"
+ pattern = rf"{BASE_PATTERN}post/list/([^/?#]+)(?:/(\d+))?"
example = "https://vidya.pics/post/list/TAG/1"
def metadata(self):
@@ -150,15 +157,14 @@ class Shimmie2TagExtractor(Shimmie2Extractor):
}
pnum += 1
- if not extr(">Next<", ">"):
- if not extr(f"/{pnum}'>{pnum}<", ">"):
- return
+ if not extr(f"/{pnum}{quote}>Next</", ">"):
+ return
class Shimmie2PostExtractor(Shimmie2Extractor):
"""Extractor for single shimmie2 posts"""
subcategory = "post"
- pattern = BASE_PATTERN + r"post/view/(\d+)"
+ pattern = rf"{BASE_PATTERN}post/view/(\d+)"
example = "https://vidya.pics/post/view/12345"
def posts(self):
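
The new per-instance "quote" setting above exists because some Shimmie2 sites (here, soybooru) emit single-quoted HTML attributes, and the rewritten pagination check builds its needle around that quote character. Self-contained illustration with made-up page snippets:

def has_next_page(page, pnum, quote):
    return f"/{pnum}{quote}>Next</" in page

double = '<a href="/post/list/TAG/2">Next</a>'   # default quoting
single = "<a href='/post/list/TAG/2'>Next</a>"   # soybooru-style quoting
print(has_next_page(double, 2, '"'), has_next_page(single, 2, "'"))  # True True
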
diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py
index 84c9a84..ad38562 100644
--- a/gallery_dl/extractor/shopify.py
+++ b/gallery_dl/extractor/shopify.py
@@ -20,7 +20,7 @@ class ShopifyExtractor(BaseExtractor):
def items(self):
data = self.metadata()
- yield Message.Directory, data
+ yield Message.Directory, "", data
for product in self.products():
for num, image in enumerate(product.pop("images"), 1):
@@ -90,7 +90,7 @@ class ShopifyCollectionExtractor(ShopifyExtractor):
"""Base class for collection extractors for Shopify based sites"""
subcategory = "collection"
directory_fmt = ("{category}", "{collection[title]}")
- pattern = BASE_PATTERN + r"(/collections/[\w-]+)/?(?:$|[?#])"
+ pattern = rf"{BASE_PATTERN}(/collections/[\w-]+)/?(?:$|[?#])"
example = "https://www.fashionnova.com/collections/TITLE"
def metadata(self):
@@ -113,7 +113,7 @@ class ShopifyProductExtractor(ShopifyExtractor):
"""Base class for product extractors for Shopify based sites"""
subcategory = "product"
directory_fmt = ("{category}", "Products")
- pattern = BASE_PATTERN + r"((?:/collections/[\w-]+)?/products/[\w-]+)"
+ pattern = rf"{BASE_PATTERN}((?:/collections/[\w-]+)?/products/[\w-]+)"
example = "https://www.fashionnova.com/collections/TITLE/products/NAME"
def products(self):
diff --git a/gallery_dl/extractor/simpcity.py b/gallery_dl/extractor/simpcity.py
deleted file mode 100644
index d8227fa..0000000
--- a/gallery_dl/extractor/simpcity.py
+++ /dev/null
@@ -1,186 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2025 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://simpcity.cr/"""
-
-from .common import Extractor, Message
-from .. import text, exception
-
-BASE_PATTERN = r"(?:https?://)?(?:www\.)?simpcity\.(?:cr|su)"
-
-
-class SimpcityExtractor(Extractor):
- """Base class for simpcity extractors"""
- category = "simpcity"
- root = "https://simpcity.cr"
-
- def items(self):
- extract_urls = text.re(
- r'<(?:a [^>]*?href|iframe [^>]*?src)="([^"]+)').findall
-
- for post in self.posts():
- urls = extract_urls(post["content"])
- data = {"post": post}
- post["count"] = data["count"] = len(urls)
- yield Message.Directory, data
- for data["num"], url in enumerate(urls, 1):
- yield Message.Queue, url, data
-
- def request_page(self, url):
- try:
- return self.request(url)
- except exception.HttpError as exc:
- if exc.status == 403 and b">Log in<" in exc.response.content:
- msg = text.extr(exc.response.text, "blockMessage--error", "</")
- raise exception.AuthRequired(
- "'authenticated cookies'", None,
- msg.rpartition(">")[2].strip())
- raise
-
- def _pagination(self, base, pnum=None):
- base = f"{self.root}{base}"
-
- if pnum is None:
- url = f"{base}/"
- pnum = 1
- else:
- url = f"{base}/page-{pnum}"
- pnum = None
-
- while True:
- page = self.request_page(url).text
-
- yield page
-
- if pnum is None or "pageNav-jump--next" not in page:
- return
- pnum += 1
- url = f"{base}/page-{pnum}"
-
- def _pagination_reverse(self, base, pnum=None):
- base = f"{self.root}{base}"
-
- url = f"{base}/page-9999" # force redirect to last page
- with self.request_page(url) as response:
- url = response.url
- if url[-1] == "/":
- pnum = 1
- else:
- pnum = text.parse_int(url[url.rfind("-")+1:], 1)
- page = response.text
-
- while True:
- yield page
-
- pnum -= 1
- if pnum > 1:
- url = f"{base}/page-{pnum}"
- elif pnum == 1:
- url = f"{base}/"
- else:
- return
-
- page = self.request_page(url).text
-
- def _parse_thread(self, page):
- schema = self._extract_jsonld(page)["mainEntity"]
- author = schema["author"]
- stats = schema["interactionStatistic"]
- url_t = schema["url"]
- url_a = author.get("url") or ""
-
- thread = {
- "id" : url_t[url_t.rfind(".")+1:-1],
- "url" : url_t,
- "title": schema["headline"],
- "date" : text.parse_datetime(schema["datePublished"]),
- "views": stats[0]["userInteractionCount"],
- "posts": stats[1]["userInteractionCount"],
- "tags" : (schema["keywords"].split(", ")
- if "keywords" in schema else ()),
- "section" : schema["articleSection"],
- "author" : author.get("name") or "",
- "author_id" : (url_a[url_a.rfind(".")+1:-1] if url_a else
- (author.get("name") or "")[15:]),
- "author_url": url_a,
- }
-
- return thread
-
- def _parse_post(self, html):
- extr = text.extract_from(html)
-
- post = {
- "author": extr('data-author="', '"'),
- "id": extr('data-content="post-', '"'),
- "author_url": extr('itemprop="url" content="', '"'),
- "date": text.parse_datetime(extr('datetime="', '"')),
- "content": extr('<div itemprop="text">',
- '<div class="js-selectToQuote').strip(),
- }
-
- url_a = post["author_url"]
- post["author_id"] = url_a[url_a.rfind(".")+1:-1]
-
- return post
-
-
-class SimpcityPostExtractor(SimpcityExtractor):
- subcategory = "post"
- pattern = rf"{BASE_PATTERN}/(?:threads/[^/?#]+/post-|posts/)(\d+)"
- example = "https://simpcity.cr/threads/TITLE.12345/post-54321"
-
- def posts(self):
- post_id = self.groups[0]
- url = f"{self.root}/posts/{post_id}/"
- page = self.request_page(url).text
-
- pos = page.find(f'data-content="post-{post_id}"')
- if pos < 0:
- raise exception.NotFoundError("post")
- html = text.extract(page, "<article ", "</article>", pos-200)[0]
-
- self.kwdict["thread"] = self._parse_thread(page)
- return (self._parse_post(html),)
-
-
-class SimpcityThreadExtractor(SimpcityExtractor):
- subcategory = "thread"
- pattern = rf"{BASE_PATTERN}(/threads/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?"
- example = "https://simpcity.cr/threads/TITLE.12345/"
-
- def posts(self):
- if (order := self.config("order-posts")) and \
- order[0] not in ("d", "r"):
- pages = self._pagination(*self.groups)
- reverse = False
- else:
- pages = self._pagination_reverse(*self.groups)
- reverse = True
-
- for page in pages:
- if "thread" not in self.kwdict:
- self.kwdict["thread"] = self._parse_thread(page)
- posts = text.extract_iter(page, "<article ", "</article>")
- if reverse:
- posts = list(posts)
- posts.reverse()
- for html in posts:
- yield self._parse_post(html)
-
-
-class SimpcityForumExtractor(SimpcityExtractor):
- subcategory = "forum"
- pattern = rf"{BASE_PATTERN}(/forums/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?"
- example = "https://simpcity.cr/forums/TITLE.123/"
-
- def items(self):
- data = {"_extractor": SimpcityThreadExtractor}
- for page in self._pagination(*self.groups):
- for path in text.extract_iter(page, ' uix-href="', '"'):
- yield Message.Queue, f"{self.root}{text.unquote(path)}", data
diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py
index d6541b2..78d3daf 100644
--- a/gallery_dl/extractor/simplyhentai.py
+++ b/gallery_dl/extractor/simplyhentai.py
@@ -48,7 +48,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
"characters": split(extr('box-title">Characters</div>', '</div>')),
"tags" : split(extr('box-title">Tags</div>', '</div>')),
"artist" : split(extr('box-title">Artists</div>', '</div>')),
- "date" : text.parse_datetime(text.remove_html(
+ "date" : self.parse_datetime(text.remove_html(
extr('Uploaded', '</div>')), "%d.%m.%Y"),
}
data["lang"] = util.language_to_code(data["language"])
@@ -106,7 +106,7 @@ class SimplyhentaiImageExtractor(Extractor):
})
data["token"] = data["filename"].rpartition("_")[2]
- yield Message.Directory, data
+ yield Message.Directory, "", data
yield Message.Url, url, data
@@ -152,9 +152,9 @@ class SimplyhentaiVideoExtractor(Extractor):
"episode": text.parse_int(episode),
"tags": text.split_html(tags)[::2],
"type": "video",
- "date": text.parse_datetime(text.remove_html(
+ "date": self.parse_datetime(text.remove_html(
date), "%B %d, %Y %H:%M"),
})
- yield Message.Directory, data
+ yield Message.Directory, "", data
yield Message.Url, video_url, data
diff --git a/gallery_dl/extractor/sizebooru.py b/gallery_dl/extractor/sizebooru.py
index cad4b23..00002b8 100644
--- a/gallery_dl/extractor/sizebooru.py
+++ b/gallery_dl/extractor/sizebooru.py
@@ -45,9 +45,9 @@ class SizebooruExtractor(BooruExtractor):
post.update({
"id" : text.parse_int(post_id),
- "date" : text.parse_datetime(
+ "date" : self.parse_datetime(
extr("<b>Posted Date:</b> ", "<"), "%m/%d/%Y"),
- "date_approved": text.parse_datetime(
+ "date_approved": self.parse_datetime(
extr("<b>Approved Date:</b> ", "<"), "%m/%d/%Y"),
"approver" : text.remove_html(extr("<b>Approved By:</b>", "</")),
"uploader" : text.remove_html(extr("<b>Posted By:</b>", "</")),
diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py
index 3c7205a..43e518e 100644
--- a/gallery_dl/extractor/skeb.py
+++ b/gallery_dl/extractor/skeb.py
@@ -10,7 +10,7 @@ from .common import Extractor, Message, Dispatch
from .. import text
BASE_PATTERN = r"(?:https?://)?skeb\.jp"
-USER_PATTERN = BASE_PATTERN + r"/@([^/?#]+)"
+USER_PATTERN = rf"{BASE_PATTERN}/@([^/?#]+)"
class SkebExtractor(Extractor):
@@ -57,7 +57,7 @@ class SkebExtractor(Extractor):
files = self._get_files_from_post(response)
post["count"] = len(files)
- yield Message.Directory, post
+ yield Message.Directory, "", post
for post["num"], file in enumerate(files, 1):
post.update(file)
url = file["file_url"]
@@ -194,7 +194,7 @@ class SkebExtractor(Extractor):
class SkebPostExtractor(SkebExtractor):
"""Extractor for a single skeb post"""
subcategory = "post"
- pattern = USER_PATTERN + r"/works/(\d+)"
+ pattern = rf"{USER_PATTERN}/works/(\d+)"
example = "https://skeb.jp/@USER/works/123"
def posts(self):
@@ -204,7 +204,7 @@ class SkebPostExtractor(SkebExtractor):
class SkebWorksExtractor(SkebExtractor):
"""Extractor for a skeb user's works"""
subcategory = "works"
- pattern = USER_PATTERN + r"/works"
+ pattern = rf"{USER_PATTERN}/works"
example = "https://skeb.jp/@USER/works"
def posts(self):
@@ -216,7 +216,7 @@ class SkebWorksExtractor(SkebExtractor):
class SkebSentrequestsExtractor(SkebExtractor):
"""Extractor for a skeb user's sent requests"""
subcategory = "sentrequests"
- pattern = USER_PATTERN + r"/sent[ _-]?requests"
+ pattern = rf"{USER_PATTERN}/sent[ _-]?requests"
example = "https://skeb.jp/@USER/sentrequests"
def posts(self):
@@ -227,7 +227,7 @@ class SkebSentrequestsExtractor(SkebExtractor):
class SkebUserExtractor(Dispatch, SkebExtractor):
"""Extractor for a skeb user profile"""
- pattern = USER_PATTERN + r"/?$"
+ pattern = rf"{USER_PATTERN}/?$"
example = "https://skeb.jp/@USER"
def items(self):
@@ -246,7 +246,7 @@ class SkebUserExtractor(Dispatch, SkebExtractor):
class SkebSearchExtractor(SkebExtractor):
"""Extractor for skeb search results"""
subcategory = "search"
- pattern = BASE_PATTERN + r"/search\?q=([^&#]+)"
+ pattern = rf"{BASE_PATTERN}/search\?q=([^&#]+)"
example = "https://skeb.jp/search?q=QUERY"
def metadata(self):
@@ -298,7 +298,7 @@ class SkebSearchExtractor(SkebExtractor):
class SkebFollowingExtractor(SkebExtractor):
"""Extractor for all creators followed by a skeb user"""
subcategory = "following"
- pattern = USER_PATTERN + r"/following_creators"
+ pattern = rf"{USER_PATTERN}/following_creators"
example = "https://skeb.jp/@USER/following_creators"
items = SkebExtractor.items_users
@@ -312,7 +312,7 @@ class SkebFollowingExtractor(SkebExtractor):
class SkebFollowingUsersExtractor(SkebExtractor):
"""Extractor for your followed users"""
subcategory = "following-users"
- pattern = BASE_PATTERN + r"/following_users"
+ pattern = rf"{BASE_PATTERN}/following_users"
example = "https://skeb.jp/following_users"
items = SkebExtractor.items_users
diff --git a/gallery_dl/extractor/slickpic.py b/gallery_dl/extractor/slickpic.py
index ee877f2..6f723c8 100644
--- a/gallery_dl/extractor/slickpic.py
+++ b/gallery_dl/extractor/slickpic.py
@@ -32,7 +32,7 @@ class SlickpicAlbumExtractor(SlickpicExtractor):
"{album[id]} {album[title]}")
filename_fmt = "{num:>03}_{id}{title:?_//}.{extension}"
archive_fmt = "{id}"
- pattern = BASE_PATTERN + r"/albums/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/albums/([^/?#]+)"
example = "https://USER.slickpic.com/albums/TITLE/"
def __init__(self, match):
@@ -56,7 +56,7 @@ class SlickpicAlbumExtractor(SlickpicExtractor):
"count": len(imgs),
}
- yield Message.Directory, data
+ yield Message.Directory, "", data
for num, img in enumerate(imgs, 1):
url = img["url_rsz"] + "/o/" + img["fname"]
img = text.nameext_from_url(img["fname"], {
@@ -110,7 +110,7 @@ class SlickpicAlbumExtractor(SlickpicExtractor):
class SlickpicUserExtractor(SlickpicExtractor):
subcategory = "user"
- pattern = BASE_PATTERN + r"(?:/gallery)?/?(?:$|[?#])"
+ pattern = rf"{BASE_PATTERN}(?:/gallery)?/?(?:$|[?#])"
example = "https://USER.slickpic.com/"
def items(self):
diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py
index c0f0e36..1bb70ed 100644
--- a/gallery_dl/extractor/slideshare.py
+++ b/gallery_dl/extractor/slideshare.py
@@ -10,7 +10,6 @@
"""Extractors for https://www.slideshare.net/"""
from .common import GalleryExtractor
-from .. import text
class SlidesharePresentationExtractor(GalleryExtractor):
@@ -40,8 +39,8 @@ class SlidesharePresentationExtractor(GalleryExtractor):
"description" : slideshow["description"].strip(),
"views" : slideshow["views"],
"likes" : slideshow["likes"],
- "date" : text.parse_datetime(
- slideshow["createdAt"], "%Y-%m-%d %H:%M:%S %Z"),
+ "date" : self.parse_datetime_iso(
+ slideshow["createdAt"][:19]),
}
def images(self, page):
diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py
index e9c89a1..902044c 100644
--- a/gallery_dl/extractor/smugmug.py
+++ b/gallery_dl/extractor/smugmug.py
@@ -81,7 +81,7 @@ class SmugmugAlbumExtractor(SmugmugExtractor):
del album["Uris"]
data = {"Album": album, "User": user}
- yield Message.Directory, data
+ yield Message.Directory, "", data
for image in self.api.album_images(self.album_id, "ImageSizeDetails"):
url = self._select_format(image)
@@ -93,7 +93,7 @@ class SmugmugImageExtractor(SmugmugExtractor):
"""Extractor for individual smugmug images"""
subcategory = "image"
archive_fmt = "{Image[ImageKey]}"
- pattern = BASE_PATTERN + r"(?:/[^/?#]+)+/i-([^/?#-]+)"
+ pattern = rf"{BASE_PATTERN}(?:/[^/?#]+)+/i-([^/?#-]+)"
example = "https://USER.smugmug.com/PATH/i-ID"
def __init__(self, match):
@@ -107,14 +107,14 @@ class SmugmugImageExtractor(SmugmugExtractor):
data = {"Image": image}
text.nameext_from_url(url, data)
- yield Message.Directory, data
+ yield Message.Directory, "", data
yield Message.Url, url, data
class SmugmugPathExtractor(SmugmugExtractor):
"""Extractor for smugmug albums from URL paths and users"""
subcategory = "path"
- pattern = BASE_PATTERN + r"((?:/[^/?#a-fh-mo-z][^/?#]*)*)/?$"
+ pattern = rf"{BASE_PATTERN}((?:/[^/?#a-fh-mo-z][^/?#]*)*)/?$"
example = "https://USER.smugmug.com/PATH"
def __init__(self, match):
diff --git a/gallery_dl/extractor/soundgasm.py b/gallery_dl/extractor/soundgasm.py
index 79ab74d..a4617dd 100644
--- a/gallery_dl/extractor/soundgasm.py
+++ b/gallery_dl/extractor/soundgasm.py
@@ -26,7 +26,7 @@ class SoundgasmExtractor(Extractor):
def items(self):
for sound in map(self._extract_sound, self.sounds()):
url = sound["url"]
- yield Message.Directory, sound
+ yield Message.Directory, "", sound
yield Message.Url, url, text.nameext_from_url(url, sound)
def _extract_sound(self, url):
@@ -50,7 +50,7 @@ class SoundgasmExtractor(Extractor):
class SoundgasmAudioExtractor(SoundgasmExtractor):
"""Extractor for audio clips from soundgasm.net"""
subcategory = "audio"
- pattern = BASE_PATTERN + r"/([^/?#]+)/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/([^/?#]+)"
example = "https://soundgasm.net/u/USER/TITLE"
def __init__(self, match):
@@ -64,7 +64,7 @@ class SoundgasmAudioExtractor(SoundgasmExtractor):
class SoundgasmUserExtractor(SoundgasmExtractor):
"""Extractor for all sounds from a soundgasm user"""
subcategory = "user"
- pattern = BASE_PATTERN + r"/([^/?#]+)/?$"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/?$"
example = "https://soundgasm.net/u/USER"
def __init__(self, match):
diff --git a/gallery_dl/extractor/speakerdeck.py b/gallery_dl/extractor/speakerdeck.py
index b809b7f..412b3b7 100644
--- a/gallery_dl/extractor/speakerdeck.py
+++ b/gallery_dl/extractor/speakerdeck.py
@@ -9,7 +9,7 @@
"""Extractors for https://speakerdeck.com/"""
from .common import GalleryExtractor
-from .. import text, util
+from .. import text
class SpeakerdeckPresentationExtractor(GalleryExtractor):
@@ -46,7 +46,7 @@ class SpeakerdeckPresentationExtractor(GalleryExtractor):
def images(self, _):
url = f"{self.root}/player/{self.presentation_id}"
page = self.request(url).text
- page = util.re(r"\s+").sub(" ", page)
+ page = text.re(r"\s+").sub(" ", page)
return [
(url, None)
for url in text.extract_iter(page, 'js-sd-slide" data-url="', '"')
diff --git a/gallery_dl/extractor/steamgriddb.py b/gallery_dl/extractor/steamgriddb.py
index e17b9fd..c3af7fd 100644
--- a/gallery_dl/extractor/steamgriddb.py
+++ b/gallery_dl/extractor/steamgriddb.py
@@ -59,7 +59,7 @@ class SteamgriddbExtractor(Extractor):
fake_png = download_fake_png and asset.get("fake_png")
asset["count"] = 2 if fake_png else 1
- yield Message.Directory, asset
+ yield Message.Directory, "", asset
asset["num"] = 1
url = asset["url"]
@@ -157,7 +157,7 @@ class SteamgriddbAssetsExtractor(SteamgriddbExtractor):
class SteamgriddbAssetExtractor(SteamgriddbExtractor):
"""Extractor for a single asset"""
subcategory = "asset"
- pattern = BASE_PATTERN + r"/(grid|hero|logo|icon)/(\d+)"
+ pattern = rf"{BASE_PATTERN}/(grid|hero|logo|icon)/(\d+)"
example = "https://www.steamgriddb.com/grid/1234"
def __init__(self, match):
@@ -177,7 +177,7 @@ class SteamgriddbAssetExtractor(SteamgriddbExtractor):
class SteamgriddbGridsExtractor(SteamgriddbAssetsExtractor):
subcategory = "grids"
asset_type = "grid"
- pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/grids(?:/(\d+))?"
+ pattern = rf"{BASE_PATTERN}/(game|collection)/(\d+)/grids(?:/(\d+))?"
example = "https://www.steamgriddb.com/game/1234/grids"
valid_dimensions = ("460x215", "920x430", "600x900", "342x482", "660x930",
"512x512", "1024x1024")
@@ -189,7 +189,7 @@ class SteamgriddbGridsExtractor(SteamgriddbAssetsExtractor):
class SteamgriddbHeroesExtractor(SteamgriddbAssetsExtractor):
subcategory = "heroes"
asset_type = "hero"
- pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/heroes(?:/(\d+))?"
+ pattern = rf"{BASE_PATTERN}/(game|collection)/(\d+)/heroes(?:/(\d+))?"
example = "https://www.steamgriddb.com/game/1234/heroes"
valid_dimensions = ("1920x620", "3840x1240", "1600x650")
valid_styles = ("alternate", "blurred", "material")
@@ -199,7 +199,7 @@ class SteamgriddbHeroesExtractor(SteamgriddbAssetsExtractor):
class SteamgriddbLogosExtractor(SteamgriddbAssetsExtractor):
subcategory = "logos"
asset_type = "logo"
- pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/logos(?:/(\d+))?"
+ pattern = rf"{BASE_PATTERN}/(game|collection)/(\d+)/logos(?:/(\d+))?"
example = "https://www.steamgriddb.com/game/1234/logos"
valid_dimensions = None
valid_styles = ("official", "white", "black", "custom")
@@ -209,7 +209,7 @@ class SteamgriddbLogosExtractor(SteamgriddbAssetsExtractor):
class SteamgriddbIconsExtractor(SteamgriddbAssetsExtractor):
subcategory = "icons"
asset_type = "icon"
- pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/icons(?:/(\d+))?"
+ pattern = rf"{BASE_PATTERN}/(game|collection)/(\d+)/icons(?:/(\d+))?"
example = "https://www.steamgriddb.com/game/1234/icons"
valid_dimensions = [f"{i}x{i}" for i in (8, 10, 14, 16, 20, 24,
28, 32, 35, 40, 48, 54, 56, 57, 60, 64, 72, 76, 80, 90,
diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py
index 989e6cc..280c8d7 100644
--- a/gallery_dl/extractor/subscribestar.py
+++ b/gallery_dl/extractor/subscribestar.py
@@ -46,14 +46,20 @@ class SubscribestarExtractor(Extractor):
content, "<body>", "</body>")
data["title"] = text.unescape(text.rextr(content, "<h1>", "</h1>"))
- yield Message.Directory, data
+ yield Message.Directory, "", data
for num, item in enumerate(media, 1):
item.update(data)
item["num"] = num
- text.nameext_from_url(item.get("name") or item["url"], item)
- if item["url"][0] == "/":
- item["url"] = self.root + item["url"]
- yield Message.Url, item["url"], item
+
+ url = item["url"]
+ if name := (item.get("name") or item.get("original_filename")):
+ text.nameext_from_name(name, item)
+ else:
+ text.nameext_from_url(url, item)
+
+ if url[0] == "/":
+ url = f"{self.root}{url}"
+ yield Message.Url, url, item
def posts(self):
"""Yield HTML content of all relevant posts"""
@@ -155,7 +161,7 @@ class SubscribestarExtractor(Extractor):
attachments = text.extr(
html, 'class="uploads-docs"', 'class="post-edit_form"')
if attachments:
- for att in util.re(r'class="doc_preview[" ]').split(
+ for att in text.re(r'class="doc_preview[" ]').split(
attachments)[1:]:
media.append({
"id" : text.parse_int(text.extr(
@@ -169,7 +175,7 @@ class SubscribestarExtractor(Extractor):
audios = text.extr(
html, 'class="uploads-audios"', 'class="post-edit_form"')
if audios:
- for audio in util.re(r'class="audio_preview-data[" ]').split(
+ for audio in text.re(r'class="audio_preview-data[" ]').split(
audios)[1:]:
media.append({
"id" : text.parse_int(text.extr(
@@ -202,9 +208,9 @@ class SubscribestarExtractor(Extractor):
def _parse_datetime(self, dt):
if dt.startswith("Updated on "):
dt = dt[11:]
- date = text.parse_datetime(dt, "%b %d, %Y %I:%M %p")
+ date = self.parse_datetime(dt, "%b %d, %Y %I:%M %p")
if date is dt:
- date = text.parse_datetime(dt, "%B %d, %Y %I:%M %p")
+ date = self.parse_datetime(dt, "%B %d, %Y %I:%M %p")
return date
def _warn_preview(self):
@@ -215,7 +221,7 @@ class SubscribestarExtractor(Extractor):
class SubscribestarUserExtractor(SubscribestarExtractor):
"""Extractor for media from a subscribestar user"""
subcategory = "user"
- pattern = BASE_PATTERN + r"/(?!posts/)([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/(?!posts/)([^/?#]+)"
example = "https://www.subscribestar.com/USER"
def posts(self):
@@ -237,7 +243,7 @@ class SubscribestarUserExtractor(SubscribestarExtractor):
class SubscribestarPostExtractor(SubscribestarExtractor):
"""Extractor for media from a single subscribestar post"""
subcategory = "post"
- pattern = BASE_PATTERN + r"/posts/(\d+)"
+ pattern = rf"{BASE_PATTERN}/posts/(\d+)"
example = "https://www.subscribestar.com/posts/12345"
def posts(self):
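
The subscribestar hunk above now prefers an attachment's advertised name ('name' or 'original_filename') over its URL when deriving filename and extension, since upload URLs are often opaque hashes. A simplified stand-in for text.nameext_from_name(), for illustration only — the real helper presumably handles edge cases such as dotless names:

def nameext_from_name(name, item):
    item["filename"], _, item["extension"] = name.rpartition(".")
    return item

print(nameext_from_name("chapter-01.pdf", {}))
# {'filename': 'chapter-01', 'extension': 'pdf'}
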
diff --git a/gallery_dl/extractor/sxypix.py b/gallery_dl/extractor/sxypix.py
new file mode 100644
index 0000000..c9a1701
--- /dev/null
+++ b/gallery_dl/extractor/sxypix.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://sxypix.com/"""
+
+from .common import GalleryExtractor
+from .. import text
+
+
+class SxypixGalleryExtractor(GalleryExtractor):
+ """Extractor for image galleries from sxypix.com"""
+ category = "sxypix"
+ root = "https://sxypix.com"
+ pattern = r"(?:https?://)?(?:www\.)?sxypix\.com(/w/(\w+))"
+ example = "https://sxypix.com/w/2bbaf1b24a5863d0e73436619bbaa7ee"
+
+ def metadata(self, page):
+ return {
+ "gallery_id": self.groups[1],
+ "title": text.unescape(text.extr(
+ page, '<meta name="keywords" content="', '"')),
+ }
+
+ def images(self, page):
+ data = {
+ "aid" : text.extr(page, "data-aid='", "'"),
+ "ghash": text.extr(page, "data-ghash='", "'"),
+ }
+ gallery = self.request_json(
+ "https://sxypix.com/php/gall.php", method="POST", data=data)
+
+ base = "https://x."
+ return [
+ (base + text.extr(entry, "data-src='//.", "'"), None)
+ for entry in gallery["r"]
+ ]
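
The string surgery in images() above suggests the gall.php response stores protocol-relative URLs whose host is missing its leading subdomain label, which the extractor fills in with "x". Illustration with a hypothetical entry:

entry = "<img data-src='//.sxypix.com/a/1/photo.jpg'>"  # hypothetical shape
tail = entry.split("data-src='//.", 1)[1].split("'", 1)[0]
print("https://x." + tail)  # https://x.sxypix.com/a/1/photo.jpg
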
diff --git a/gallery_dl/extractor/szurubooru.py b/gallery_dl/extractor/szurubooru.py
index 190ccbf..59477cc 100644
--- a/gallery_dl/extractor/szurubooru.py
+++ b/gallery_dl/extractor/szurubooru.py
@@ -57,8 +57,7 @@ class SzurubooruExtractor(booru.BooruExtractor):
return url
def _prepare(self, post):
- post["date"] = text.parse_datetime(
- post["creationTime"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ post["date"] = self.parse_datetime_iso(post["creationTime"])
tags = []
tags_categories = collections.defaultdict(list)
@@ -94,7 +93,7 @@ class SzurubooruTagExtractor(SzurubooruExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}_{version}"
- pattern = BASE_PATTERN + r"/posts(?:/query=([^/?#]*))?"
+ pattern = rf"{BASE_PATTERN}/posts(?:/query=([^/?#]*))?"
example = "https://booru.bcbnsfw.space/posts/query=TAG"
def __init__(self, match):
@@ -117,7 +116,7 @@ class SzurubooruTagExtractor(SzurubooruExtractor):
class SzurubooruPostExtractor(SzurubooruExtractor):
subcategory = "post"
archive_fmt = "{id}_{version}"
- pattern = BASE_PATTERN + r"/post/(\d+)"
+ pattern = rf"{BASE_PATTERN}/post/(\d+)"
example = "https://booru.bcbnsfw.space/post/12345"
def posts(self):
diff --git a/gallery_dl/extractor/tapas.py b/gallery_dl/extractor/tapas.py
index d823f6a..5f8cb67 100644
--- a/gallery_dl/extractor/tapas.py
+++ b/gallery_dl/extractor/tapas.py
@@ -72,7 +72,7 @@ class TapasExtractor(Extractor):
class TapasEpisodeExtractor(TapasExtractor):
subcategory = "episode"
- pattern = BASE_PATTERN + r"/episode/(\d+)"
+ pattern = rf"{BASE_PATTERN}/episode/(\d+)"
example = "https://tapas.io/episode/12345"
def items(self):
@@ -89,8 +89,8 @@ class TapasEpisodeExtractor(TapasExtractor):
html = data["html"]
episode["series"] = self._extract_series(html)
- episode["date"] = text.parse_datetime(episode["publish_date"])
- yield Message.Directory, episode
+ episode["date"] = self.parse_datetime_iso(episode["publish_date"])
+ yield Message.Directory, "", episode
if episode["book"]:
content = text.extr(
@@ -116,7 +116,7 @@ class TapasEpisodeExtractor(TapasExtractor):
class TapasSeriesExtractor(TapasExtractor):
subcategory = "series"
- pattern = BASE_PATTERN + r"/series/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/series/([^/?#]+)"
example = "https://tapas.io/series/TITLE"
def items(self):
@@ -150,7 +150,7 @@ class TapasSeriesExtractor(TapasExtractor):
class TapasCreatorExtractor(TapasExtractor):
subcategory = "creator"
- pattern = BASE_PATTERN + r"/(?!series|episode)([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/(?!series|episode)([^/?#]+)"
example = "https://tapas.io/CREATOR"
def items(self):
diff --git a/gallery_dl/extractor/tcbscans.py b/gallery_dl/extractor/tcbscans.py
index 6dcb153..e27ef0d 100644
--- a/gallery_dl/extractor/tcbscans.py
+++ b/gallery_dl/extractor/tcbscans.py
@@ -15,7 +15,7 @@ BASE_PATTERN = (r"(?:https?://)?(?:tcb(?:-backup\.bihar-mirchi|scans)"
class TcbscansChapterExtractor(ChapterExtractor):
category = "tcbscans"
- pattern = BASE_PATTERN + r"(/chapters/\d+/[^/?#]+)"
+ pattern = rf"{BASE_PATTERN}(/chapters/\d+/[^/?#]+)"
example = "https://tcbscans.me/chapters/12345/MANGA-chapter-123"
def __init__(self, match):
@@ -44,7 +44,7 @@ class TcbscansChapterExtractor(ChapterExtractor):
class TcbscansMangaExtractor(MangaExtractor):
category = "tcbscans"
chapterclass = TcbscansChapterExtractor
- pattern = BASE_PATTERN + r"(/mangas/\d+/[^/?#]+)"
+ pattern = rf"{BASE_PATTERN}(/mangas/\d+/[^/?#]+)"
example = "https://tcbscans.me/mangas/123/MANGA"
def __init__(self, match):
diff --git a/gallery_dl/extractor/telegraph.py b/gallery_dl/extractor/telegraph.py
index 2713621..ab77b31 100644
--- a/gallery_dl/extractor/telegraph.py
+++ b/gallery_dl/extractor/telegraph.py
@@ -27,9 +27,8 @@ class TelegraphGalleryExtractor(GalleryExtractor):
'property="og:title" content="', '"')),
"description": text.unescape(extr(
'property="og:description" content="', '"')),
- "date": text.parse_datetime(extr(
- 'property="article:published_time" content="', '"'),
- "%Y-%m-%dT%H:%M:%S%z"),
+ "date": self.parse_datetime_iso(extr(
+ 'property="article:published_time" content="', '"')),
"author": text.unescape(extr(
'property="article:author" content="', '"')),
"post_url": text.unescape(extr(
diff --git a/gallery_dl/extractor/tenor.py b/gallery_dl/extractor/tenor.py
index 7e1f802..3e4bab0 100644
--- a/gallery_dl/extractor/tenor.py
+++ b/gallery_dl/extractor/tenor.py
@@ -40,16 +40,17 @@ class TenorExtractor(Extractor):
continue
url = fmt["url"]
+ title = gif.pop("h1_title", "")
+ gif["title"] = title[:-4] if title.endswith(" GIF") else title
+ gif["width"], gif["height"] = fmt.pop("dims") or (0, 0)
+ gif["description"] = gif.pop("content_description", "")
gif["id_format"] = url.rsplit("/", 2)[1]
gif["format"] = fmt["name"]
- gif["width"], gif["height"] = fmt["dims"]
gif["duration"] = fmt["duration"]
gif["size"] = fmt["size"]
- gif["title"] = gif["h1_title"][:-4]
- gif["description"] = gif.pop("content_description", "")
- gif["date"] = text.parse_timestamp(gif["created"])
+ gif["date"] = self.parse_timestamp(gif["created"])
- yield Message.Directory, gif
+ yield Message.Directory, "", gif
yield Message.Url, url, text.nameext_from_url(url, gif)
def _extract_format(self, gif):
@@ -110,7 +111,7 @@ class TenorExtractor(Extractor):
class TenorImageExtractor(TenorExtractor):
subcategory = "image"
- pattern = BASE_PATTERN + r"view/(?:[^/?#]*-)?(\d+)"
+ pattern = rf"{BASE_PATTERN}view/(?:[^/?#]*-)?(\d+)"
example = "https://tenor.com/view/SLUG-1234567890"
def gifs(self):
@@ -124,7 +125,7 @@ class TenorImageExtractor(TenorExtractor):
class TenorSearchExtractor(TenorExtractor):
subcategory = "search"
directory_fmt = ("{category}", "{search_tags}")
- pattern = BASE_PATTERN + r"search/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}search/([^/?#]+)"
example = "https://tenor.com/search/QUERY"
def gifs(self):
@@ -140,7 +141,7 @@ class TenorSearchExtractor(TenorExtractor):
class TenorUserExtractor(TenorExtractor):
subcategory = "user"
directory_fmt = ("{category}", "@{user[username]}")
- pattern = BASE_PATTERN + r"(?:users|official)/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}(?:users|official)/([^/?#]+)"
example = "https://tenor.com/users/USER"
def gifs(self):
diff --git a/gallery_dl/extractor/thehentaiworld.py b/gallery_dl/extractor/thehentaiworld.py
index 9a30654..773f300 100644
--- a/gallery_dl/extractor/thehentaiworld.py
+++ b/gallery_dl/extractor/thehentaiworld.py
@@ -36,12 +36,12 @@ class ThehentaiworldExtractor(Extractor):
if "file_urls" in post:
urls = post["file_urls"]
post["count"] = len(urls)
- yield Message.Directory, post
+ yield Message.Directory, "", post
for post["num"], url in enumerate(urls, 1):
text.nameext_from_url(url, post)
yield Message.Url, url, post
else:
- yield Message.Directory, post
+ yield Message.Directory, "", post
url = post["file_url"]
text.nameext_from_url(url, post)
yield Message.Url, url, post
@@ -56,8 +56,7 @@ class ThehentaiworldExtractor(Extractor):
"id" : text.parse_int(extr(" postid-", " ")),
"slug" : extr(" post-", '"'),
"tags" : extr('id="tagsHead">', "</ul>"),
- "date" : text.parse_datetime(extr(
- "<li>Posted: ", "<"), "%Y-%m-%d"),
+ "date" : self.parse_datetime_iso(extr("<li>Posted: ", "<")),
}
if (c := url[27]) == "v":
diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py
index f450806..a4c7171 100644
--- a/gallery_dl/extractor/tiktok.py
+++ b/gallery_dl/extractor/tiktok.py
@@ -25,6 +25,7 @@ class TiktokExtractor(Extractor):
def _init(self):
self.audio = self.config("audio", True)
self.video = self.config("videos", True)
+ self.cover = self.config("covers", False)
def items(self):
for tiktok_url in self.urls():
@@ -43,10 +44,10 @@ class TiktokExtractor(Extractor):
post = video_detail["itemInfo"]["itemStruct"]
post["user"] = (a := post.get("author")) and a["uniqueId"] or ""
- post["date"] = text.parse_timestamp(post["createTime"])
+ post["date"] = self.parse_timestamp(post["createTime"])
original_title = title = post["desc"]
- yield Message.Directory, post
+ yield Message.Directory, "", post
ytdl_media = False
if "imagePost" in post:
@@ -70,12 +71,14 @@ class TiktokExtractor(Extractor):
if self.audio and "music" in post:
if self.audio == "ytdl":
ytdl_media = "audio"
- else:
- url = self._extract_audio(post)
+ elif url := self._extract_audio(post):
yield Message.Url, url, post
- elif self.video and "video" in post:
- ytdl_media = "video"
+ elif "video" in post:
+ if self.video:
+ ytdl_media = "video"
+ if self.cover and (url := self._extract_cover(post, "video")):
+ yield Message.Url, url, post
else:
self.log.info("%s: Skipping post", tiktok_url)
@@ -144,6 +147,30 @@ class TiktokExtractor(Extractor):
post["extension"] = "mp3"
return url
+ def _extract_cover(self, post, type):
+ media = post[type]
+
+ for cover_id in ("thumbnail", "cover", "originCover", "dynamicCover"):
+ if url := media.get(cover_id):
+ break
+ else:
+ return
+
+ text.nameext_from_url(url, post)
+ post.update({
+ "type" : "cover",
+ "extension": "jpg",
+ "image" : url,
+ "title" : post["desc"] or f"TikTok {type} cover #{post['id']}",
+ "duration" : media.get("duration"),
+ "num" : 0,
+ "img_id" : "",
+ "cover_id" : cover_id,
+ "width" : 0,
+ "height" : 0,
+ })
+ return url
+
def _check_status_code(self, detail, url):
status = detail.get("statusCode")
if not status:
@@ -166,7 +193,7 @@ class TiktokExtractor(Extractor):
class TiktokPostExtractor(TiktokExtractor):
"""Extract a single video or photo TikTok link"""
subcategory = "post"
- pattern = BASE_PATTERN + r"/(?:@([\w_.-]*)|share)/(?:phot|vide)o/(\d+)"
+ pattern = rf"{BASE_PATTERN}/(?:@([\w_.-]*)|share)/(?:phot|vide)o/(\d+)"
example = "https://www.tiktok.com/@USER/photo/1234567890"
def urls(self):
@@ -199,7 +226,7 @@ class TiktokVmpostExtractor(TiktokExtractor):
class TiktokUserExtractor(TiktokExtractor):
"""Extract a TikTok user's profile"""
subcategory = "user"
- pattern = BASE_PATTERN + r"/@([\w_.-]+)/?(?:$|\?|#)"
+ pattern = rf"{BASE_PATTERN}/@([\w_.-]+)/?(?:$|\?|#)"
example = "https://www.tiktok.com/@USER"
def _init(self):
@@ -214,7 +241,7 @@ class TiktokUserExtractor(TiktokExtractor):
except (ImportError, SyntaxError) as exc:
self.log.error("Cannot import module '%s'",
getattr(exc, "name", ""))
- self.log.debug("", exc_info=exc)
+ self.log.traceback(exc)
raise exception.ExtractionError("yt-dlp or youtube-dl is required "
"for this feature!")
@@ -254,7 +281,7 @@ class TiktokUserExtractor(TiktokExtractor):
self.log.warning("Unable to extract 'avatar' URL (%s: %s)",
exc.__class__.__name__, exc)
else:
- yield Message.Directory, avatar
+ yield Message.Directory, "", avatar
yield Message.Url, avatar_url, avatar
with ytdl_instance as ydl:
diff --git a/gallery_dl/extractor/tmohentai.py b/gallery_dl/extractor/tmohentai.py
index ef441d3..873cce8 100644
--- a/gallery_dl/extractor/tmohentai.py
+++ b/gallery_dl/extractor/tmohentai.py
@@ -16,7 +16,7 @@ class TmohentaiGalleryExtractor(GalleryExtractor):
category = "tmohentai"
root = "http://tmohentai.com"
directory_fmt = ("{category}", "{title} ({gallery_id})")
- pattern = BASE_PATTERN + r"/(?:contents|reader)/(\w+)"
+ pattern = rf"{BASE_PATTERN}/(?:contents|reader)/(\w+)"
example = "https://tmohentai.com/contents/12345a67b89c0"
def __init__(self, match):
diff --git a/gallery_dl/extractor/toyhouse.py b/gallery_dl/extractor/toyhouse.py
index 7add79a..cc29b11 100644
--- a/gallery_dl/extractor/toyhouse.py
+++ b/gallery_dl/extractor/toyhouse.py
@@ -34,7 +34,7 @@ class ToyhouseExtractor(Extractor):
post.update(metadata)
text.nameext_from_url(post["url"], post)
post["id"], _, post["hash"] = post["filename"].partition("_")
- yield Message.Directory, post
+ yield Message.Directory, "", post
yield Message.Url, post["url"], post
def posts(self):
@@ -51,7 +51,7 @@ class ToyhouseExtractor(Extractor):
extr = text.extract_from(post)
return {
"url": extr(needle, '"'),
- "date": text.parse_datetime(extr(
+ "date": self.parse_datetime(extr(
'</h2>\n <div class="mb-1">', '<'),
"%d %b %Y, %I:%M:%S %p"),
"artists": [
@@ -104,7 +104,7 @@ class ToyhouseExtractor(Extractor):
class ToyhouseArtExtractor(ToyhouseExtractor):
"""Extractor for artworks of a toyhouse user"""
subcategory = "art"
- pattern = BASE_PATTERN + r"/([^/?#]+)/art"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/art"
example = "https://www.toyhou.se/USER/art"
def posts(self):
diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py
index 8732c60..1ccdafb 100644
--- a/gallery_dl/extractor/tsumino.py
+++ b/gallery_dl/extractor/tsumino.py
@@ -65,7 +65,7 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
"title_jp" : title_jp,
"thumbnail" : extr('"og:image" content="', '"'),
"uploader" : text.remove_html(extr('id="Uploader">', '</div>')),
- "date" : text.parse_datetime(
+ "date" : self.parse_datetime(
extr('id="Uploaded">', '</div>').strip(), "%Y %B %d"),
"rating" : text.parse_float(extr(
'id="Rating">', '</div>').partition(" ")[0]),
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 92fc831..5bb5a40 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -9,8 +9,7 @@
"""Extractors for https://www.tumblr.com/"""
from .common import Extractor, Message
-from .. import text, util, oauth, exception
-from datetime import datetime, date, timedelta
+from .. import text, util, dt, oauth, exception
BASE_PATTERN = (
@@ -61,16 +60,16 @@ class TumblrExtractor(Extractor):
blog = None
# pre-compile regular expressions
- self._sub_video = util.re(
+ self._sub_video = text.re(
r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)").sub
if self.inline:
- self._sub_image = util.re(
+ self._sub_image = text.re(
r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)").sub
- self._subn_orig_image = util.re(r"/s\d+x\d+/").subn
- _findall_image = util.re('<img src="([^"]+)"').findall
- _findall_video = util.re('<source src="([^"]+)"').findall
+ self._subn_orig_image = text.re(r"/s\d+x\d+/").subn
+ _findall_image = text.re('<img src="([^"]+)"').findall
+ _findall_video = text.re('<source src="([^"]+)"').findall
for post in self.posts():
if self.date_min > post["timestamp"]:
@@ -88,7 +87,7 @@ class TumblrExtractor(Extractor):
if self.avatar:
url = self.api.avatar(self.blog)
- yield Message.Directory, {"blog": blog}
+ yield Message.Directory, "", {"blog": blog}
yield self._prepare_avatar(url, post.copy(), blog)
post["blog"] = blog
@@ -100,7 +99,7 @@ class TumblrExtractor(Extractor):
if "trail" in post:
del post["trail"]
- post["date"] = text.parse_timestamp(post["timestamp"])
+ post["date"] = self.parse_timestamp(post["timestamp"])
posts = []
if "photos" in post: # type "photo" or "link"
@@ -161,7 +160,7 @@ class TumblrExtractor(Extractor):
del post["extension"]
post["count"] = len(posts)
- yield Message.Directory, post
+ yield Message.Directory, "", post
for num, (msg, url, post) in enumerate(posts, 1):
post["num"] = num
@@ -271,7 +270,7 @@ class TumblrExtractor(Extractor):
class TumblrUserExtractor(TumblrExtractor):
"""Extractor for a Tumblr user's posts"""
subcategory = "user"
- pattern = BASE_PATTERN + r"(?:/page/\d+|/archive)?/?$"
+ pattern = rf"{BASE_PATTERN}(?:/page/\d+|/archive)?/?$"
example = "https://www.tumblr.com/BLOG"
def posts(self):
@@ -281,7 +280,7 @@ class TumblrUserExtractor(TumblrExtractor):
class TumblrPostExtractor(TumblrExtractor):
"""Extractor for a single Tumblr post"""
subcategory = "post"
- pattern = BASE_PATTERN + r"/(?:post/|image/)?(\d+)"
+ pattern = rf"{BASE_PATTERN}/(?:post/|image/)?(\d+)"
example = "https://www.tumblr.com/BLOG/12345"
def posts(self):
@@ -296,7 +295,7 @@ class TumblrPostExtractor(TumblrExtractor):
class TumblrTagExtractor(TumblrExtractor):
"""Extractor for Tumblr user's posts by tag"""
subcategory = "tag"
- pattern = BASE_PATTERN + r"(?:/archive)?/tagged/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}(?:/archive)?/tagged/([^/?#]+)"
example = "https://www.tumblr.com/BLOG/tagged/TAG"
def posts(self):
@@ -308,12 +307,12 @@ class TumblrTagExtractor(TumblrExtractor):
class TumblrDayExtractor(TumblrExtractor):
"""Extractor for Tumblr user's posts by day"""
subcategory = "day"
- pattern = BASE_PATTERN + r"/day/(\d\d\d\d/\d\d/\d\d)"
+ pattern = rf"{BASE_PATTERN}/day/(\d\d\d\d/\d\d/\d\d)"
example = "https://www.tumblr.com/BLOG/day/1970/01/01"
def posts(self):
year, month, day = self.groups[3].split("/")
- ordinal = date(int(year), int(month), int(day)).toordinal()
+ ordinal = dt.date(int(year), int(month), int(day)).toordinal()
# 719163 == date(1970, 1, 1).toordinal()
self.date_min = (ordinal - 719163) * 86400
@@ -326,7 +325,7 @@ class TumblrLikesExtractor(TumblrExtractor):
subcategory = "likes"
directory_fmt = ("{category}", "{blog_name}", "likes")
archive_fmt = "f_{blog[name]}_{id}_{num}"
- pattern = BASE_PATTERN + r"/likes"
+ pattern = rf"{BASE_PATTERN}/likes"
example = "https://www.tumblr.com/BLOG/likes"
def posts(self):
@@ -336,7 +335,7 @@ class TumblrLikesExtractor(TumblrExtractor):
class TumblrFollowingExtractor(TumblrExtractor):
"""Extractor for a Tumblr user's followed blogs"""
subcategory = "following"
- pattern = BASE_PATTERN + r"/following"
+ pattern = rf"{BASE_PATTERN}/following"
example = "https://www.tumblr.com/BLOG/following"
items = TumblrExtractor.items_blogs
@@ -348,7 +347,7 @@ class TumblrFollowingExtractor(TumblrExtractor):
class TumblrFollowersExtractor(TumblrExtractor):
"""Extractor for a Tumblr user's followers"""
subcategory = "followers"
- pattern = BASE_PATTERN + r"/followers"
+ pattern = rf"{BASE_PATTERN}/followers"
example = "https://www.tumblr.com/BLOG/followers"
items = TumblrExtractor.items_blogs
@@ -514,7 +513,7 @@ class TumblrAPI(oauth.OAuth1API):
self.extractor.wait(seconds=reset)
continue
- t = (datetime.now() + timedelta(0, float(reset))).time()
+ t = (dt.now() + dt.timedelta(0, float(reset))).time()
raise exception.AbortExtraction(
f"Aborting - Rate limit will reset at "
f"{t.hour:02}:{t.minute:02}:{t.second:02}")
diff --git a/gallery_dl/extractor/tumblrgallery.py b/gallery_dl/extractor/tumblrgallery.py
index 26868ec..68c9ec7 100644
--- a/gallery_dl/extractor/tumblrgallery.py
+++ b/gallery_dl/extractor/tumblrgallery.py
@@ -36,7 +36,7 @@ class TumblrgalleryExtractor(GalleryExtractor):
class TumblrgalleryTumblrblogExtractor(TumblrgalleryExtractor):
"""Extractor for Tumblrblog on tumblrgallery.xyz"""
subcategory = "tumblrblog"
- pattern = BASE_PATTERN + r"(/tumblrblog/gallery/(\d+)\.html)"
+ pattern = rf"{BASE_PATTERN}(/tumblrblog/gallery/(\d+)\.html)"
example = "https://tumblrgallery.xyz/tumblrblog/gallery/12345.html"
def __init__(self, match):
@@ -68,7 +68,7 @@ class TumblrgalleryTumblrblogExtractor(TumblrgalleryExtractor):
class TumblrgalleryPostExtractor(TumblrgalleryExtractor):
"""Extractor for Posts on tumblrgallery.xyz"""
subcategory = "post"
- pattern = BASE_PATTERN + r"(/post/(\d+)\.html)"
+ pattern = rf"{BASE_PATTERN}(/post/(\d+)\.html)"
example = "https://tumblrgallery.xyz/post/12345.html"
def __init__(self, match):
@@ -93,7 +93,7 @@ class TumblrgallerySearchExtractor(TumblrgalleryExtractor):
subcategory = "search"
filename_fmt = "{category}_{num:>03}_{gallery_id}_{id}_{title}.{extension}"
directory_fmt = ("{category}", "{search_term}")
- pattern = BASE_PATTERN + r"(/s\.php\?q=([^&#]+))"
+ pattern = rf"{BASE_PATTERN}(/s\.php\?q=([^&#]+))"
example = "https://tumblrgallery.xyz/s.php?q=QUERY"
def __init__(self, match):
diff --git a/gallery_dl/extractor/tungsten.py b/gallery_dl/extractor/tungsten.py
index 45836a9..67c0b50 100644
--- a/gallery_dl/extractor/tungsten.py
+++ b/gallery_dl/extractor/tungsten.py
@@ -23,10 +23,10 @@ class TungstenExtractor(Extractor):
def items(self):
for post in self.posts():
url = post["original_url"]
- post["date"] = text.parse_datetime(post["created_at"])
+ post["date"] = self.parse_datetime_iso(post["created_at"])
post["filename"] = url[url.rfind("/")+1:]
post["extension"] = "webp"
- yield Message.Directory, post
+ yield Message.Directory, "", post
yield Message.Url, url, post
def _pagination(self, url, params):
diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py
index 4f9fe84..e21ef2a 100644
--- a/gallery_dl/extractor/twibooru.py
+++ b/gallery_dl/extractor/twibooru.py
@@ -37,8 +37,7 @@ class TwibooruExtractor(BooruExtractor):
return post["view_url"]
def _prepare(self, post):
- post["date"] = text.parse_datetime(
- post["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ post["date"] = self.parse_datetime_iso(post["created_at"])
if "name" in post:
name, sep, rest = post["name"].rpartition(".")
@@ -49,7 +48,7 @@ class TwibooruPostExtractor(TwibooruExtractor):
"""Extractor for single twibooru posts"""
subcategory = "post"
request_interval = (0.5, 1.5)
- pattern = BASE_PATTERN + r"/(\d+)"
+ pattern = rf"{BASE_PATTERN}/(\d+)"
example = "https://twibooru.org/12345"
def __init__(self, match):
@@ -64,7 +63,7 @@ class TwibooruSearchExtractor(TwibooruExtractor):
"""Extractor for twibooru search results"""
subcategory = "search"
directory_fmt = ("{category}", "{search_tags}")
- pattern = BASE_PATTERN + r"/(?:search/?\?([^#]+)|tags/([^/?#]+))"
+ pattern = rf"{BASE_PATTERN}/(?:search/?\?([^#]+)|tags/([^/?#]+))"
example = "https://twibooru.org/search?q=TAG"
def __init__(self, match):
@@ -98,7 +97,7 @@ class TwibooruGalleryExtractor(TwibooruExtractor):
subcategory = "gallery"
directory_fmt = ("{category}", "galleries",
"{gallery[id]} {gallery[title]}")
- pattern = BASE_PATTERN + r"/galleries/(\d+)"
+ pattern = rf"{BASE_PATTERN}/galleries/(\d+)"
example = "https://twibooru.org/galleries/12345"
def __init__(self, match):
@@ -146,8 +145,8 @@ class TwibooruAPI():
return response.json()
if response.status_code == 429:
- until = text.parse_datetime(
- response.headers["X-RL-Reset"], "%Y-%m-%d %H:%M:%S %Z")
+ until = self.extractor.parse_datetime_iso(
+ response.headers["X-RL-Reset"][:19])
# wait an extra minute, just to be safe
self.extractor.wait(until=until, adjust=60.0)
continue
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index bf125a6..546e8e1 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -76,7 +76,7 @@ class TwitterExtractor(Extractor):
seen_tweets = set() if self.config("unique", True) else None
if self.twitpic:
- self._find_twitpic = util.re(
+ self._find_twitpic = text.re(
r"https?(://twitpic\.com/(?!photos/)\w+)").findall
tweets = self.tweets()
@@ -124,12 +124,11 @@ class TwitterExtractor(Extractor):
tdata = self._transform_tweet(tweet)
tdata.update(metadata)
tdata["count"] = len(files)
- yield Message.Directory, tdata
+ yield Message.Directory, "", tdata
- del tdata["source_id"]
- del tdata["sensitive_flags"]
- if "source_user" in tdata:
- del tdata["source_user"]
+ tdata.pop("source_id", None)
+ tdata.pop("source_user", None)
+ tdata.pop("sensitive_flags", None)
for tdata["num"], file in enumerate(files, 1):
file.update(tdata)
@@ -146,7 +145,7 @@ class TwitterExtractor(Extractor):
self._extract_media(
data, data["extended_entities"]["media"], files)
except Exception as exc:
- self.log.debug("", exc_info=exc)
+ self.log.traceback(exc)
self.log.warning(
"%s: Error while extracting media files (%s: %s)",
data["id_str"], exc.__class__.__name__, exc)
@@ -155,7 +154,7 @@ class TwitterExtractor(Extractor):
try:
self._extract_card(tweet, files)
except Exception as exc:
- self.log.debug("", exc_info=exc)
+ self.log.traceback(exc)
self.log.warning(
"%s: Error while extracting Card files (%s: %s)",
data["id_str"], exc.__class__.__name__, exc)
@@ -164,7 +163,7 @@ class TwitterExtractor(Extractor):
try:
self._extract_twitpic(data, files)
except Exception as exc:
- self.log.debug("", exc_info=exc)
+ self.log.traceback(exc)
self.log.warning(
"%s: Error while extracting TwitPic files (%s: %s)",
data["id_str"], exc.__class__.__name__, exc)
@@ -347,32 +346,36 @@ class TwitterExtractor(Extractor):
files.append({"url": url})
def _transform_tweet(self, tweet):
+ if "legacy" in tweet:
+ legacy = tweet["legacy"]
+ else:
+ legacy = tweet
+ tweet_id = int(legacy["id_str"])
+
if "author" in tweet:
author = tweet["author"]
elif "core" in tweet:
- author = tweet["core"]["user_results"]["result"]
+ try:
+ author = tweet["core"]["user_results"]["result"]
+ except KeyError:
+ self.log.warning("%s: Missing 'author' data", tweet_id)
+ author = util.NONE
else:
author = tweet["user"]
author = self._transform_user(author)
- if "legacy" in tweet:
- legacy = tweet["legacy"]
- else:
- legacy = tweet
- tget = legacy.get
-
- tweet_id = int(legacy["id_str"])
if tweet_id >= 300000000000000:
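+ # Snowflake IDs carry a millisecond timestamp in their
+ # upper bits, relative to the Twitter epoch (1288834974657)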
- date = text.parse_timestamp(
+ date = self.parse_timestamp(
((tweet_id >> 22) + 1288834974657) // 1000)
else:
try:
- date = text.parse_datetime(
+ date = self.parse_datetime(
legacy["created_at"], "%a %b %d %H:%M:%S %z %Y")
except Exception:
date = util.NONE
source = tweet.get("source")
+ tget = legacy.get
tdata = {
"tweet_id" : tweet_id,
"retweet_id" : text.parse_int(
@@ -439,6 +442,8 @@ class TwitterExtractor(Extractor):
txt, _, tco = content.rpartition(" ")
tdata["content"] = txt if tco.startswith("https://t.co/") else content
+ if "pinned" in tweet:
+ tdata["pinned"] = True
if "birdwatch_pivot" in tweet:
try:
tdata["birdwatch"] = \
@@ -455,7 +460,7 @@ class TwitterExtractor(Extractor):
tdata, legacy["extended_entities"]["media"][0])
if tdata["retweet_id"]:
tdata["content"] = f"RT @{author['name']}: {tdata['content']}"
- tdata["date_original"] = text.parse_timestamp(
+ tdata["date_original"] = self.parse_timestamp(
((tdata["retweet_id"] >> 22) + 1288834974657) // 1000)
return tdata
@@ -492,7 +497,7 @@ class TwitterExtractor(Extractor):
"id": text.parse_int(cid),
"name": com.get("name"),
"description": com.get("description"),
- "date": text.parse_timestamp(com.get("created_at", 0) // 1000),
+ "date": self.parse_timestamp(com.get("created_at", 0) // 1000),
"nsfw": com.get("is_nsfw"),
"role": com.get("role"),
"member_count": com.get("member_count"),
@@ -528,13 +533,13 @@ class TwitterExtractor(Extractor):
"id" : text.parse_int(uid),
"name" : core.get("screen_name"),
"nick" : core.get("name"),
- "location" : user["location"]["location"],
- "date" : text.parse_datetime(
+ "location" : user["location"].get("location"),
+ "date" : self.parse_datetime(
core["created_at"], "%a %b %d %H:%M:%S %z %Y"),
"verified" : user["verification"]["verified"],
"protected" : user["privacy"]["protected"],
"profile_banner" : lget("profile_banner_url", ""),
- "profile_image" : user["avatar"]["image_url"].replace(
+ "profile_image" : user["avatar"].get("image_url", "").replace(
"_normal.", "."),
"favourites_count": lget("favourites_count"),
"followers_count" : lget("followers_count"),
@@ -591,9 +596,12 @@ class TwitterExtractor(Extractor):
obj = tweet["legacy"] if "legacy" in tweet else tweet
cid = obj.get("conversation_id_str")
if not cid:
- tid = obj["id_str"]
- self.log.warning(
- "Unable to expand %s (no 'conversation_id')", tid)
+ if cid is False:
+ yield tweet
+ else:
+ tid = obj["id_str"]
+ self.log.warning(
+ "Unable to expand %s (no 'conversation_id')", tid)
continue
if cid in seen:
self.log.debug(
@@ -608,6 +616,7 @@ class TwitterExtractor(Extractor):
def _make_tweet(self, user, url, id_str):
return {
"id_str": id_str,
+ "conversation_id_str": False,
"lang": None,
"user": user,
"source": "><",
@@ -658,8 +667,8 @@ class TwitterExtractor(Extractor):
class TwitterHomeExtractor(TwitterExtractor):
"""Extractor for Twitter home timelines"""
subcategory = "home"
- pattern = (BASE_PATTERN +
- r"/(?:home(?:/fo(?:llowing|r[-_ ]?you()))?|i/timeline)/?$")
+ pattern = (rf"{BASE_PATTERN}/"
+ rf"(?:home(?:/fo(?:llowing|r[-_ ]?you()))?|i/timeline)/?$")
example = "https://x.com/home"
def tweets(self):
@@ -671,7 +680,7 @@ class TwitterHomeExtractor(TwitterExtractor):
class TwitterSearchExtractor(TwitterExtractor):
"""Extractor for Twitter search results"""
subcategory = "search"
- pattern = BASE_PATTERN + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)"
+ pattern = rf"{BASE_PATTERN}/search/?\?(?:[^&#]+&)*q=([^&#]+)"
example = "https://x.com/search?q=QUERY"
def metadata(self):
@@ -702,7 +711,7 @@ class TwitterSearchExtractor(TwitterExtractor):
class TwitterHashtagExtractor(TwitterExtractor):
"""Extractor for Twitter hashtags"""
subcategory = "hashtag"
- pattern = BASE_PATTERN + r"/hashtag/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/hashtag/([^/?#]+)"
example = "https://x.com/hashtag/NAME"
def items(self):
@@ -713,7 +722,7 @@ class TwitterHashtagExtractor(TwitterExtractor):
class TwitterUserExtractor(Dispatch, TwitterExtractor):
"""Extractor for a Twitter user"""
- pattern = (BASE_PATTERN + r"/(?:"
+ pattern = (rf"{BASE_PATTERN}/(?:"
r"([^/?#]+)/?(?:$|\?|#)"
r"|i(?:/user/|ntent/user\?user_id=)(\d+))")
example = "https://x.com/USER"
@@ -890,7 +899,7 @@ class TwitterLikesExtractor(TwitterExtractor):
class TwitterBookmarkExtractor(TwitterExtractor):
"""Extractor for bookmarked tweets"""
subcategory = "bookmark"
- pattern = BASE_PATTERN + r"/i/bookmarks()"
+ pattern = rf"{BASE_PATTERN}/i/bookmarks()"
example = "https://x.com/i/bookmarks"
def tweets(self):
@@ -898,7 +907,7 @@ class TwitterBookmarkExtractor(TwitterExtractor):
def _transform_tweet(self, tweet):
tdata = TwitterExtractor._transform_tweet(self, tweet)
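+ # the upper bits of 'sortIndex' encode the time a Tweet was
+ # bookmarked as a millisecond timestamp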
- tdata["date_bookmarked"] = text.parse_timestamp(
+ tdata["date_bookmarked"] = self.parse_timestamp(
(int(tweet["sortIndex"] or 0) >> 20) // 1000)
return tdata
@@ -906,7 +915,7 @@ class TwitterBookmarkExtractor(TwitterExtractor):
class TwitterListExtractor(TwitterExtractor):
"""Extractor for Twitter lists"""
subcategory = "list"
- pattern = BASE_PATTERN + r"/i/lists/(\d+)/?$"
+ pattern = rf"{BASE_PATTERN}/i/lists/(\d+)/?$"
example = "https://x.com/i/lists/12345"
def tweets(self):
@@ -916,7 +925,7 @@ class TwitterListExtractor(TwitterExtractor):
class TwitterListMembersExtractor(TwitterExtractor):
"""Extractor for members of a Twitter list"""
subcategory = "list-members"
- pattern = BASE_PATTERN + r"/i/lists/(\d+)/members"
+ pattern = rf"{BASE_PATTERN}/i/lists/(\d+)/members"
example = "https://x.com/i/lists/12345/members"
def items(self):
@@ -952,7 +961,7 @@ class TwitterCommunityExtractor(TwitterExtractor):
directory_fmt = ("{category}", "Communities",
"{community[name]} ({community[id]})")
archive_fmt = "C_{community[id]}_{tweet_id}_{num}"
- pattern = BASE_PATTERN + r"/i/communities/(\d+)"
+ pattern = rf"{BASE_PATTERN}/i/communities/(\d+)"
example = "https://x.com/i/communities/12345"
def tweets(self):
@@ -966,7 +975,7 @@ class TwitterCommunitiesExtractor(TwitterExtractor):
subcategory = "communities"
directory_fmt = TwitterCommunityExtractor.directory_fmt
archive_fmt = TwitterCommunityExtractor.archive_fmt
- pattern = BASE_PATTERN + r"/([^/?#]+)/communities/?$"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/communities/?$"
example = "https://x.com/i/communities"
def tweets(self):
@@ -978,7 +987,7 @@ class TwitterEventExtractor(TwitterExtractor):
subcategory = "event"
directory_fmt = ("{category}", "Events",
"{event[id]} {event[short_title]}")
- pattern = BASE_PATTERN + r"/i/events/(\d+)"
+ pattern = rf"{BASE_PATTERN}/i/events/(\d+)"
example = "https://x.com/i/events/12345"
def metadata(self):
@@ -991,7 +1000,7 @@ class TwitterEventExtractor(TwitterExtractor):
class TwitterTweetExtractor(TwitterExtractor):
"""Extractor for individual tweets"""
subcategory = "tweet"
- pattern = (BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)"
+ pattern = (rf"{BASE_PATTERN}/([^/?#]+|i/web)/status/(\d+)"
r"/?(?:$|\?|#|photo/|video/)")
example = "https://x.com/USER/status/12345"
@@ -1072,7 +1081,7 @@ class TwitterTweetExtractor(TwitterExtractor):
class TwitterQuotesExtractor(TwitterExtractor):
"""Extractor for quotes of a Tweet"""
subcategory = "quotes"
- pattern = BASE_PATTERN + r"/(?:[^/?#]+|i/web)/status/(\d+)/quotes"
+ pattern = rf"{BASE_PATTERN}/(?:[^/?#]+|i/web)/status/(\d+)/quotes"
example = "https://x.com/USER/status/12345/quotes"
def items(self):
@@ -1096,7 +1105,7 @@ class TwitterInfoExtractor(TwitterExtractor):
else:
user = api.user_by_screen_name(screen_name)
- return iter(((Message.Directory, self._transform_user(user)),))
+ return iter(((Message.Directory, "", self._transform_user(user)),))
class TwitterAvatarExtractor(TwitterExtractor):
@@ -1162,7 +1171,7 @@ class TwitterImageExtractor(Extractor):
"_fallback": TwitterExtractor._image_fallback(self, base),
}
- yield Message.Directory, data
+ yield Message.Directory, "", data
yield Message.Url, base + self._size_image, data
@@ -1369,7 +1378,7 @@ class TwitterAPI():
endpoint = "/graphql/E8Wq-_jFSaU7hxVcuOPR9g/UserTweets"
variables = {
"userId": self._user_id_by_screen_name(screen_name),
- "count": 100,
+ "count": self.extractor.config("limit", 50),
"includePromotedContent": False,
"withQuickPromoteEligibilityTweetFields": False,
"withVoice": True,
@@ -1384,7 +1393,7 @@ class TwitterAPI():
endpoint = "/graphql/-O3QOHrVn1aOm_cF5wyTCQ/UserTweetsAndReplies"
variables = {
"userId": self._user_id_by_screen_name(screen_name),
- "count": 100,
+ "count": self.extractor.config("limit", 50),
"includePromotedContent": False,
"withCommunity": True,
"withVoice": True,
@@ -1399,7 +1408,7 @@ class TwitterAPI():
endpoint = "/graphql/gmHw9geMTncZ7jeLLUUNOw/UserHighlightsTweets"
variables = {
"userId": self._user_id_by_screen_name(screen_name),
- "count": 100,
+ "count": self.extractor.config("limit", 50),
"includePromotedContent": False,
"withVoice": True,
}
@@ -1413,7 +1422,7 @@ class TwitterAPI():
endpoint = "/graphql/jCRhbOzdgOHp6u9H4g2tEg/UserMedia"
variables = {
"userId": self._user_id_by_screen_name(screen_name),
- "count": 100,
+ "count": self.extractor.config("limit", 50),
"includePromotedContent": False,
"withClientEventToken": False,
"withBirdwatchNotes": False,
@@ -1429,7 +1438,7 @@ class TwitterAPI():
endpoint = "/graphql/TGEKkJG_meudeaFcqaxM-Q/Likes"
variables = {
"userId": self._user_id_by_screen_name(screen_name),
- "count": 100,
+ "count": self.extractor.config("limit", 50),
"includePromotedContent": False,
"withClientEventToken": False,
"withBirdwatchNotes": False,
@@ -1444,32 +1453,45 @@ class TwitterAPI():
def user_bookmarks(self):
endpoint = "/graphql/pLtjrO4ubNh996M_Cubwsg/Bookmarks"
variables = {
- "count": 100,
+ "count": self.extractor.config("limit", 50),
"includePromotedContent": False,
}
return self._pagination_tweets(
endpoint, variables, ("bookmark_timeline_v2", "timeline"),
stop_tweets=128)
- def search_timeline(self, query, product="Latest"):
+ def search_timeline(self, query, product=None):
+ cfg = self.extractor.config
+
+ if product is None:
+ if product := cfg("search-results"):
+ product = {
+ "top" : "Top",
+ "live" : "Latest",
+ "user" : "People",
+ "media": "Media",
+ "list" : "Lists",
+ }.get(product.lower(), product).capitalize()
+ else:
+ product = "Latest"
+
endpoint = "/graphql/4fpceYZ6-YQCx_JSl_Cn_A/SearchTimeline"
variables = {
"rawQuery": query,
- "count": self.extractor.config("search-limit", 20),
+ "count": cfg("search-limit", 20),
"querySource": "typed_query",
"product": product,
"withGrokTranslatedBio": False,
}
- if self.extractor.config("search-pagination") in (
- "max_id", "maxid", "id"):
+ if cfg("search-pagination") in ("max_id", "maxid", "id"):
update_variables = self._update_variables_search
else:
update_variables = None
- stop_tweets = self.extractor.config("search-stop")
+ stop_tweets = cfg("search-stop")
if stop_tweets is None or stop_tweets == "auto":
- stop_tweets = 3 if update_variables is None else 0
+ stop_tweets = 3
return self._pagination_tweets(
endpoint, variables,
@@ -1494,7 +1516,7 @@ class TwitterAPI():
endpoint = "/graphql/Nyt-88UX4-pPCImZNUl9RQ/CommunityTweetsTimeline"
variables = {
"communityId": community_id,
- "count": 100,
+ "count": self.extractor.config("limit", 50),
"displayLocation": "Community",
"rankingMode": "Recency",
"withCommunity": True,
@@ -1508,7 +1530,7 @@ class TwitterAPI():
endpoint = "/graphql/ZniZ7AAK_VVu1xtSx1V-gQ/CommunityMediaTimeline"
variables = {
"communityId": community_id,
- "count": 100,
+ "count": self.extractor.config("limit", 50),
"withCommunity": True,
}
return self._pagination_tweets(
@@ -1520,7 +1542,7 @@ class TwitterAPI():
endpoint = ("/graphql/p048a9n3hTPppQyK7FQTFw"
"/CommunitiesMainPageTimeline")
variables = {
- "count": 100,
+ "count": self.extractor.config("limit", 50),
"withCommunity": True,
}
return self._pagination_tweets(
@@ -1530,7 +1552,7 @@ class TwitterAPI():
def home_timeline(self):
endpoint = "/graphql/DXmgQYmIft1oLP6vMkJixw/HomeTimeline"
variables = {
- "count": 100,
+ "count": self.extractor.config("limit", 50),
"includePromotedContent": False,
"latestControlAvailable": True,
"withCommunity": True,
@@ -1541,7 +1563,7 @@ class TwitterAPI():
def home_latest_timeline(self):
endpoint = "/graphql/SFxmNKWfN9ySJcXG_tjX8g/HomeLatestTimeline"
variables = {
- "count": 100,
+ "count": self.extractor.config("limit", 50),
"includePromotedContent": False,
"latestControlAvailable": True,
}
@@ -1568,7 +1590,7 @@ class TwitterAPI():
endpoint = "/graphql/06JtmwM8k_1cthpFZITVVA/ListLatestTweetsTimeline"
variables = {
"listId": list_id,
- "count": 100,
+ "count": self.extractor.config("limit", 50),
}
return self._pagination_tweets(
endpoint, variables, ("list", "tweets_timeline", "timeline"))
@@ -1654,10 +1676,8 @@ class TwitterAPI():
self.extractor._assign_user(user)
return user["rest_id"]
except KeyError:
- if "unavailable_message" in user:
- raise exception.NotFoundError(
- f"{user['unavailable_message'].get('text')} "
- f"({user.get('reason')})", False)
+ if user and user.get("__typename") == "UserUnavailable":
+ raise exception.NotFoundError(user["message"], False)
else:
raise exception.NotFoundError("user")
@@ -1700,7 +1720,7 @@ class TwitterAPI():
self.client_transaction.generate_transaction_id(method, path)
def _call(self, endpoint, params, method="GET", auth=True, root=None):
- url = (root or self.root) + endpoint
+ url = (self.root if root is None else root) + endpoint
while True:
if auth:
@@ -1877,8 +1897,17 @@ class TwitterAPI():
features=None, field_toggles=None):
extr = self.extractor
original_retweets = (extr.retweets == "original")
- pinned_tweet = extr.pinned
+ pinned_tweet = True if extr.pinned else None
stop_tweets_max = stop_tweets
+ api_retries = None
+
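+ # a 'count' list provides fallback page sizes: reverse it and
+ # consume it via pop(), advancing when a page yields no Tweets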
+ if isinstance(count := variables.get("count"), list):
+ count = count.copy()
+ count.reverse()
+ self.log.debug("Using 'count: %s'", count[-1])
+ variables["count"] = count.pop()
+ else:
+ count = False
params = {"variables": None}
if cursor := extr._init_cursor():
@@ -1892,14 +1921,14 @@ class TwitterAPI():
while True:
params["variables"] = self._json_dumps(variables)
- data = self._call(endpoint, params)["data"]
+ data = self._call(endpoint, params)
try:
if path is None:
- instructions = (data["user"]["result"]["timeline"]
+ instructions = (data["data"]["user"]["result"]["timeline"]
["timeline"]["instructions"])
else:
- instructions = data
+ instructions = data["data"]
for key in path:
instructions = instructions[key]
instructions = instructions["instructions"]
@@ -1916,7 +1945,7 @@ class TwitterAPI():
elif instr_type == "TimelineAddToModule":
entries = instr["moduleItems"]
elif instr_type == "TimelinePinEntry":
- if pinned_tweet:
+ if pinned_tweet is not None:
pinned_tweet = instr["entry"]
elif instr_type == "TimelineReplaceEntry":
entry = instr["entry"]
@@ -1930,6 +1959,26 @@ class TwitterAPI():
except LookupError:
extr.log.debug(data)
+ if errors := data.get("errors"):
+ if api_retries is None:
+ api_tries = 1
+ api_retries = extr.config("retries-api", 9)
+ if api_retries < 0:
+ api_retries = float("inf")
+
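+ # collect all error messages; retry only when at least one
+ # error was reported as coming from the server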
+ err = []
+ srv = False
+ for e in errors:
+ err.append(f"- '{e.get('message') or e.get('name')}'")
+ if e.get("source") == "Server":
+ srv = True
+
+ self.log.warning("API errors (%s/%s):\n%s",
+ api_tries, api_retries+1, "\n".join(err))
+ if srv and api_tries <= api_retries:
+ api_tries += 1
+ continue
+
if user := extr._user_obj:
user = user["legacy"]
if user.get("blocked_by"):
@@ -1950,14 +1999,13 @@ class TwitterAPI():
"Unable to retrieve Tweets from this timeline")
tweets = []
- tweet = None
+ tweet = last_tweet = retry = None
+ api_tries = 1
- if pinned_tweet:
- if isinstance(pinned_tweet, dict):
- tweets.append(pinned_tweet)
- elif instructions[-1]["type"] == "TimelinePinEntry":
- tweets.append(instructions[-1]["entry"])
- pinned_tweet = False
+ if pinned_tweet is not None and isinstance(pinned_tweet, dict):
+ pinned_tweet["pinned"] = True
+ tweets.append(pinned_tweet)
+ pinned_tweet = None
for entry in entries:
esw = entry["entryId"].startswith
@@ -1965,6 +2013,7 @@ class TwitterAPI():
if esw("tweet-"):
tweets.append(entry)
elif esw(("profile-grid-",
+ "search-grid-",
"communities-grid-")):
if "content" in entry:
tweets.extend(entry["content"]["items"])
@@ -1988,6 +2037,28 @@ class TwitterAPI():
tweet = True
cursor = cursor.get("value")
+ if pinned_tweet is not None:
+ if extr._user_obj is None:
+ pinned = None
+ elif pinned := extr._user_obj["legacy"].get(
+ "pinned_tweet_ids_str"):
+ pinned = f"-tweet-{pinned[0]}"
+ for idx, entry in enumerate(tweets):
+ if entry["entryId"].endswith(pinned):
+ # mark as pinned / set 'pinned = True'
+ pinned_tweet = (
+ (entry.get("content") or entry["item"])
+ ["itemContent"]["tweet_results"]["result"])
+ if "tweet" in pinned_tweet:
+ pinned_tweet = pinned_tweet["tweet"]
+ pinned_tweet["pinned"] = True
+ # move to front of 'tweets'
+ del tweets[idx]
+ tweets.insert(0, entry)
+ break
+ del pinned
+ pinned_tweet = None
+
for entry in tweets:
try:
item = ((entry.get("content") or entry["item"])
@@ -2015,6 +2086,16 @@ class TwitterAPI():
(entry.get("entryId") or "").rpartition("-")[2])
continue
+ if retry is None:
+ try:
+ tweet["core"]["user_results"]["result"]
+ retry = False
+ except KeyError:
+ self.log.warning("Received Tweet results without "
+ "'core' data ... Retrying")
+ retry = True
+ break
+
if "retweeted_status_result" in legacy:
try:
retweet = legacy["retweeted_status_result"]["result"]
@@ -2071,18 +2152,25 @@ class TwitterAPI():
tweet.get("rest_id"))
continue
- if tweet:
+ if retry:
+ continue
+ elif tweet:
stop_tweets = stop_tweets_max
last_tweet = tweet
- else:
- if stop_tweets <= 0:
+ elif stop_tweets <= 0:
+ if not count:
return extr._update_cursor(None)
+ self.log.debug("Switching to 'count: %s'", count[-1])
+ variables["count"] = count.pop()
+ continue
+ else:
self.log.debug(
"No Tweet results (%s/%s)",
stop_tweets_max - stop_tweets + 1, stop_tweets_max)
stop_tweets -= 1
if not cursor or cursor == variables.get("cursor"):
+ self.log.debug("No continuation cursor")
return extr._update_cursor(None)
if update_variables is None:
@@ -2169,7 +2257,7 @@ class TwitterAPI():
else:
variables["rawQuery"] = f"{query} {max_id}"
- if prefix := self.extractor._cursor_prefix:
+ if prefix := getattr(self.extractor, "_cursor_prefix", None):
self.extractor._cursor_prefix = \
f"{prefix.partition('_')[0]}_{tweet_id}/"
variables["cursor"] = None
diff --git a/gallery_dl/extractor/unsplash.py b/gallery_dl/extractor/unsplash.py
index cf6631f..b77be95 100644
--- a/gallery_dl/extractor/unsplash.py
+++ b/gallery_dl/extractor/unsplash.py
@@ -41,11 +41,11 @@ class UnsplashExtractor(Extractor):
if metadata:
photo.update(metadata)
photo["extension"] = "jpg"
- photo["date"] = text.parse_datetime(photo["created_at"])
+ photo["date"] = self.parse_datetime_iso(photo["created_at"])
if "tags" in photo:
photo["tags"] = [t["title"] for t in photo["tags"]]
- yield Message.Directory, photo
+ yield Message.Directory, "", photo
yield Message.Url, url, photo
def metadata(self):
@@ -74,7 +74,7 @@ class UnsplashExtractor(Extractor):
class UnsplashImageExtractor(UnsplashExtractor):
"""Extractor for a single unsplash photo"""
subcategory = "image"
- pattern = BASE_PATTERN + r"/photos/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/photos/([^/?#]+)"
example = "https://unsplash.com/photos/ID"
def photos(self):
@@ -85,7 +85,7 @@ class UnsplashImageExtractor(UnsplashExtractor):
class UnsplashUserExtractor(UnsplashExtractor):
"""Extractor for all photos of an unsplash user"""
subcategory = "user"
- pattern = BASE_PATTERN + r"/@(\w+)/?$"
+ pattern = rf"{BASE_PATTERN}/@(\w+)/?$"
example = "https://unsplash.com/@USER"
def photos(self):
@@ -97,7 +97,7 @@ class UnsplashUserExtractor(UnsplashExtractor):
class UnsplashFavoriteExtractor(UnsplashExtractor):
"""Extractor for all likes of an unsplash user"""
subcategory = "favorite"
- pattern = BASE_PATTERN + r"/@(\w+)/likes"
+ pattern = rf"{BASE_PATTERN}/@(\w+)/likes"
example = "https://unsplash.com/@USER/likes"
def photos(self):
@@ -109,7 +109,7 @@ class UnsplashFavoriteExtractor(UnsplashExtractor):
class UnsplashCollectionExtractor(UnsplashExtractor):
"""Extractor for an unsplash collection"""
subcategory = "collection"
- pattern = BASE_PATTERN + r"/collections/([^/?#]+)(?:/([^/?#]+))?"
+ pattern = rf"{BASE_PATTERN}/collections/([^/?#]+)(?:/([^/?#]+))?"
example = "https://unsplash.com/collections/12345/TITLE"
def __init__(self, match):
@@ -128,7 +128,7 @@ class UnsplashCollectionExtractor(UnsplashExtractor):
class UnsplashSearchExtractor(UnsplashExtractor):
"""Extractor for unsplash search results"""
subcategory = "search"
- pattern = BASE_PATTERN + r"/s/photos/([^/?#]+)(?:\?([^#]+))?"
+ pattern = rf"{BASE_PATTERN}/s/photos/([^/?#]+)(?:\?([^#]+))?"
example = "https://unsplash.com/s/photos/QUERY"
def __init__(self, match):
diff --git a/gallery_dl/extractor/uploadir.py b/gallery_dl/extractor/uploadir.py
index d06c2ad..d80abba 100644
--- a/gallery_dl/extractor/uploadir.py
+++ b/gallery_dl/extractor/uploadir.py
@@ -53,5 +53,5 @@ class UploadirFileExtractor(Extractor):
data = text.nameext_from_url(name)
data["id"] = self.file_id
- yield Message.Directory, data
+ yield Message.Directory, "", data
yield Message.Url, url, data
diff --git a/gallery_dl/extractor/urlgalleries.py b/gallery_dl/extractor/urlgalleries.py
index 4369ac6..0d8b3d3 100644
--- a/gallery_dl/extractor/urlgalleries.py
+++ b/gallery_dl/extractor/urlgalleries.py
@@ -38,7 +38,7 @@ class UrlgalleriesGalleryExtractor(GalleryExtractor):
data["count"] = len(imgs)
root = self.root
- yield Message.Directory, data
+ yield Message.Directory, "", data
for data["num"], img in enumerate(imgs, 1):
page = self.request(root + img).text
url = text.extr(page, "window.location.href = '", "'")
@@ -52,7 +52,7 @@ class UrlgalleriesGalleryExtractor(GalleryExtractor):
"blog" : text.unescape(extr(' title="', '"')),
"_rprt": extr(' title="', '"'), # report button
"title": text.unescape(extr(' title="', '"').strip()),
- "date" : text.parse_datetime(
+ "date" : self.parse_datetime(
extr(" images in gallery | ", "<"), "%B %d, %Y"),
}
diff --git a/gallery_dl/extractor/urlshortener.py b/gallery_dl/extractor/urlshortener.py
index 7a9269d..0056d1b 100644
--- a/gallery_dl/extractor/urlshortener.py
+++ b/gallery_dl/extractor/urlshortener.py
@@ -32,7 +32,7 @@ BASE_PATTERN = UrlshortenerExtractor.update({
class UrlshortenerLinkExtractor(UrlshortenerExtractor):
"""Extractor for general-purpose URL shorteners"""
subcategory = "link"
- pattern = BASE_PATTERN + r"(/[^/?#]+)"
+ pattern = rf"{BASE_PATTERN}(/[^/?#]+)"
example = "https://bit.ly/abcde"
def items(self):
diff --git a/gallery_dl/extractor/vanillarock.py b/gallery_dl/extractor/vanillarock.py
index e0107f3..63fc7fa 100644
--- a/gallery_dl/extractor/vanillarock.py
+++ b/gallery_dl/extractor/vanillarock.py
@@ -47,13 +47,13 @@ class VanillarockPostExtractor(VanillarockExtractor):
"count": len(imgs),
"title": text.unescape(name),
"path" : self.path.strip("/"),
- "date" : text.parse_datetime(extr(
- '<div class="date">', '</div>'), "%Y-%m-%d %H:%M"),
+ "date" : self.parse_datetime_iso(extr(
+ '<div class="date">', '</div>')),
"tags" : text.split_html(extr(
'<div class="cat-tag">', '</div>'))[::2],
}
- yield Message.Directory, data
+ yield Message.Directory, "", data
for data["num"], url in enumerate(imgs, 1):
yield Message.Url, url, text.nameext_from_url(url, data)
diff --git a/gallery_dl/extractor/vichan.py b/gallery_dl/extractor/vichan.py
index cbb44ee..86758f3 100644
--- a/gallery_dl/extractor/vichan.py
+++ b/gallery_dl/extractor/vichan.py
@@ -39,7 +39,7 @@ class VichanThreadExtractor(VichanExtractor):
directory_fmt = ("{category}", "{board}", "{thread} {title}")
filename_fmt = "{time}{num:?-//} {filename}.{extension}"
archive_fmt = "{board}_{thread}_{tim}"
- pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/res/(\d+)"
example = "https://8kun.top/a/res/12345.html"
def items(self):
@@ -58,7 +58,7 @@ class VichanThreadExtractor(VichanExtractor):
"num" : 0,
}
- yield Message.Directory, data
+ yield Message.Directory, "", data
for post in posts:
if "filename" in post:
yield process(post, data)
@@ -93,7 +93,7 @@ class VichanThreadExtractor(VichanExtractor):
class VichanBoardExtractor(VichanExtractor):
"""Extractor for vichan boards"""
subcategory = "board"
- pattern = BASE_PATTERN + r"/([^/?#]+)(?:/index|/catalog|/\d+|/?$)"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)(?:/index|/catalog|/\d+|/?$)"
example = "https://8kun.top/a/"
def items(self):
diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py
index 294fc57..8f6368b 100644
--- a/gallery_dl/extractor/vipergirls.py
+++ b/gallery_dl/extractor/vipergirls.py
@@ -75,7 +75,7 @@ class VipergirlsExtractor(Extractor):
data["count"] = len(images)
del data["imagecount"]
- yield Message.Directory, data
+ yield Message.Directory, "", data
if images:
for data["num"], image in enumerate(images, 1):
yield Message.Queue, image.attrib["main_url"], data
@@ -124,8 +124,8 @@ class VipergirlsExtractor(Extractor):
class VipergirlsThreadExtractor(VipergirlsExtractor):
"""Extractor for vipergirls threads"""
subcategory = "thread"
- pattern = (BASE_PATTERN +
- r"/threads/(\d+)(?:-[^/?#]+)?(/page\d+)?(?:$|#|\?(?!p=))")
+ pattern = (rf"{BASE_PATTERN}"
+ rf"/threads/(\d+)(?:-[^/?#]+)?(/page\d+)?(?:$|#|\?(?!p=))")
example = "https://vipergirls.to/threads/12345-TITLE"
def __init__(self, match):
@@ -140,8 +140,8 @@ class VipergirlsThreadExtractor(VipergirlsExtractor):
class VipergirlsPostExtractor(VipergirlsExtractor):
"""Extractor for vipergirls posts"""
subcategory = "post"
- pattern = (BASE_PATTERN +
- r"/threads/(\d+)(?:-[^/?#]+)?\?p=\d+[^#]*#post(\d+)")
+ pattern = (rf"{BASE_PATTERN}"
+ rf"/threads/(\d+)(?:-[^/?#]+)?\?p=\d+[^#]*#post(\d+)")
example = "https://vipergirls.to/threads/12345-TITLE?p=23456#post23456"
def __init__(self, match):
diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py
index 22d4b9a..e7453fc 100644
--- a/gallery_dl/extractor/vk.py
+++ b/gallery_dl/extractor/vk.py
@@ -9,7 +9,7 @@
"""Extractors for https://vk.com/"""
from .common import Extractor, Message
-from .. import text, util, exception
+from .. import text, exception
BASE_PATTERN = r"(?:https://)?(?:www\.|m\.)?vk\.com"
@@ -36,11 +36,11 @@ class VkExtractor(Extractor):
return num
def items(self):
- subn = util.re(r"/imp[fg]/").subn
+ subn = text.re(r"/imp[fg]/").subn
sizes = "wzyxrqpo"
data = self.metadata()
- yield Message.Directory, data
+ yield Message.Directory, "", data
for photo in self.photos():
@@ -72,7 +72,7 @@ class VkExtractor(Extractor):
photo["width"] = photo["height"] = 0
photo["id"] = photo["id"].rpartition("_")[2]
- photo["date"] = text.parse_timestamp(text.extr(
+ photo["date"] = self.parse_timestamp(text.extr(
photo["date"], 'data-date="', '"'))
photo["description"] = text.unescape(text.extr(
photo.get("desc", ""), ">", "<"))
@@ -134,7 +134,7 @@ class VkExtractor(Extractor):
class VkPhotosExtractor(VkExtractor):
"""Extractor for photos from a vk user"""
subcategory = "photos"
- pattern = (BASE_PATTERN + r"/(?:"
+ pattern = (rf"{BASE_PATTERN}/(?:"
r"(?:albums|photos|id)(-?\d+)"
r"|(?!(?:album|tag|wall)-?\d+_?)([^/?#]+))")
example = "https://vk.com/id12345"
@@ -184,7 +184,7 @@ class VkAlbumExtractor(VkExtractor):
"""Extractor for a vk album"""
subcategory = "album"
directory_fmt = ("{category}", "{user[id]}", "{album[id]}")
- pattern = BASE_PATTERN + r"/album(-?\d+)_(\d+)$"
+ pattern = rf"{BASE_PATTERN}/album(-?\d+)_(\d+)$"
example = "https://vk.com/album12345_00"
def photos(self):
@@ -228,7 +228,7 @@ class VkTaggedExtractor(VkExtractor):
"""Extractor for a vk tagged photos"""
subcategory = "tagged"
directory_fmt = ("{category}", "{user[id]}", "tags")
- pattern = BASE_PATTERN + r"/tag(-?\d+)$"
+ pattern = rf"{BASE_PATTERN}/tag(-?\d+)$"
example = "https://vk.com/tag12345"
def __init__(self, match):
@@ -247,7 +247,7 @@ class VkWallPostExtractor(VkExtractor):
subcategory = "wall-post"
directory_fmt = ("{category}", "{user[id]}", "wall")
filename_fmt = "{wall[id]}_{num}.{extension}"
- pattern = BASE_PATTERN + r"/wall(-?\d+)_(\d+)"
+ pattern = rf"{BASE_PATTERN}/wall(-?\d+)_(\d+)"
example = "https://vk.com/wall12345_123"
def photos(self):
diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py
index df09fce..b8da813 100644
--- a/gallery_dl/extractor/vsco.py
+++ b/gallery_dl/extractor/vsco.py
@@ -12,7 +12,7 @@ from .common import Extractor, Message, Dispatch
from .. import text, util
BASE_PATTERN = r"(?:https?://)?(?:www\.)?vsco\.co"
-USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)"
+USER_PATTERN = rf"{BASE_PATTERN}/([^/?#]+)"
class VscoExtractor(Extractor):
@@ -22,7 +22,7 @@ class VscoExtractor(Extractor):
directory_fmt = ("{category}", "{user}")
filename_fmt = "{id}.{extension}"
archive_fmt = "{id}"
- tls12 = False
+ browser = "firefox"
def __init__(self, match):
Extractor.__init__(self, match)
@@ -30,7 +30,7 @@ class VscoExtractor(Extractor):
def items(self):
videos = self.config("videos", True)
- yield Message.Directory, {"user": self.user}
+ yield Message.Directory, "", {"user": self.user}
for img in self.images():
if not img:
@@ -62,7 +62,7 @@ class VscoExtractor(Extractor):
"grid" : img["grid_name"],
"meta" : img.get("image_meta") or {},
"tags" : [tag["text"] for tag in img.get("tags") or ()],
- "date" : text.parse_timestamp(img["upload_date"] // 1000),
+ "date" : self.parse_timestamp(img["upload_date"] // 1000),
"video" : img["is_video"],
"width" : img["width"],
"height": img["height"],
@@ -133,7 +133,7 @@ class VscoExtractor(Extractor):
class VscoUserExtractor(Dispatch, VscoExtractor):
"""Extractor for a vsco user profile"""
- pattern = USER_PATTERN + r"/?$"
+ pattern = rf"{USER_PATTERN}/?$"
example = "https://vsco.co/USER"
def items(self):
@@ -149,7 +149,7 @@ class VscoUserExtractor(Dispatch, VscoExtractor):
class VscoGalleryExtractor(VscoExtractor):
"""Extractor for a vsco user's gallery"""
subcategory = "gallery"
- pattern = USER_PATTERN + r"/(?:gallery|images)"
+ pattern = rf"{USER_PATTERN}/(?:gallery|images)"
example = "https://vsco.co/USER/gallery"
def images(self):
@@ -173,7 +173,7 @@ class VscoCollectionExtractor(VscoExtractor):
subcategory = "collection"
directory_fmt = ("{category}", "{user}", "collection")
archive_fmt = "c_{user}_{id}"
- pattern = USER_PATTERN + r"/collection"
+ pattern = rf"{USER_PATTERN}/collection"
example = "https://vsco.co/USER/collection/1"
def images(self):
@@ -198,7 +198,7 @@ class VscoSpaceExtractor(VscoExtractor):
subcategory = "space"
directory_fmt = ("{category}", "space", "{user}")
archive_fmt = "s_{user}_{id}"
- pattern = BASE_PATTERN + r"/spaces/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/spaces/([^/?#]+)"
example = "https://vsco.co/spaces/a1b2c3d4e5f"
def images(self):
@@ -245,7 +245,7 @@ class VscoSpaceExtractor(VscoExtractor):
class VscoSpacesExtractor(VscoExtractor):
"""Extractor for a vsco.co user's spaces"""
subcategory = "spaces"
- pattern = USER_PATTERN + r"/spaces"
+ pattern = rf"{USER_PATTERN}/spaces"
example = "https://vsco.co/USER/spaces"
def items(self):
@@ -275,7 +275,7 @@ class VscoSpacesExtractor(VscoExtractor):
class VscoAvatarExtractor(VscoExtractor):
"""Extractor for vsco.co user avatars"""
subcategory = "avatar"
- pattern = USER_PATTERN + r"/avatar"
+ pattern = rf"{USER_PATTERN}/avatar"
example = "https://vsco.co/USER/avatar"
def images(self):
@@ -303,7 +303,7 @@ class VscoAvatarExtractor(VscoExtractor):
class VscoImageExtractor(VscoExtractor):
"""Extractor for individual images on vsco.co"""
subcategory = "image"
- pattern = USER_PATTERN + r"/media/([0-9a-fA-F]+)"
+ pattern = rf"{USER_PATTERN}/media/([0-9a-fA-F]+)"
example = "https://vsco.co/USER/media/0123456789abcdef"
def images(self):
@@ -316,7 +316,7 @@ class VscoImageExtractor(VscoExtractor):
class VscoVideoExtractor(VscoExtractor):
"""Extractor for vsco.co videos links"""
subcategory = "video"
- pattern = USER_PATTERN + r"/video/([^/?#]+)"
+ pattern = rf"{USER_PATTERN}/video/([^/?#]+)"
example = "https://vsco.co/USER/video/012345678-9abc-def0"
def images(self):
diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py
index f0f27e0..9ea3c36 100644
--- a/gallery_dl/extractor/wallhaven.py
+++ b/gallery_dl/extractor/wallhaven.py
@@ -29,7 +29,7 @@ class WallhavenExtractor(Extractor):
self._transform(wp)
wp.update(metadata)
url = wp["url"]
- yield Message.Directory, wp
+ yield Message.Directory, "", wp
yield Message.Url, url, text.nameext_from_url(url, wp)
def wallpapers(self):
@@ -43,8 +43,7 @@ class WallhavenExtractor(Extractor):
wp["url"] = wp.pop("path")
if "tags" in wp:
wp["tags"] = [t["name"] for t in wp["tags"]]
- wp["date"] = text.parse_datetime(
- wp.pop("created_at"), "%Y-%m-%d %H:%M:%S")
+ wp["date"] = self.parse_datetime_iso(wp.pop("created_at"))
wp["width"] = wp.pop("dimension_x")
wp["height"] = wp.pop("dimension_y")
wp["wh_category"] = wp["category"]
diff --git a/gallery_dl/extractor/wallpapercave.py b/gallery_dl/extractor/wallpapercave.py
index 65fca24..1392164 100644
--- a/gallery_dl/extractor/wallpapercave.py
+++ b/gallery_dl/extractor/wallpapercave.py
@@ -27,7 +27,7 @@ class WallpapercaveImageExtractor(Extractor):
path = None
for path in text.extract_iter(page, 'class="download" href="', '"'):
image = text.nameext_from_url(path)
- yield Message.Directory, image
+ yield Message.Directory, "", image
yield Message.Url, self.root + path, image
if path is None:
@@ -38,7 +38,7 @@ class WallpapercaveImageExtractor(Extractor):
pass
else:
image = text.nameext_from_url(path)
- yield Message.Directory, image
+ yield Message.Directory, "", image
yield Message.Url, self.root + path, image
if path is None:
@@ -46,5 +46,5 @@ class WallpapercaveImageExtractor(Extractor):
page, 'class="wallpaper" id="wp', '</picture>'):
if path := text.rextr(wp, ' src="', '"'):
image = text.nameext_from_url(path)
- yield Message.Directory, image
+ yield Message.Directory, "", image
yield Message.Url, self.root + path, image
diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py
index 8ae2a49..b66ba8d 100644
--- a/gallery_dl/extractor/warosu.py
+++ b/gallery_dl/extractor/warosu.py
@@ -37,12 +37,12 @@ class WarosuThreadExtractor(Extractor):
data["title"] = text.unescape(text.remove_html(
posts[0]["com"]))[:50]
- yield Message.Directory, data
+ yield Message.Directory, "", data
for post in posts:
if "image" in post:
for key in ("w", "h", "no", "time", "tim"):
post[key] = text.parse_int(post[key])
- dt = text.parse_timestamp(post["time"])
+ dt = self.parse_timestamp(post["time"])
# avoid zero-padding 'day' with %d
post["now"] = dt.strftime(f"%a, %b {dt.day}, %Y %H:%M:%S")
post.update(data)
diff --git a/gallery_dl/extractor/weasyl.py b/gallery_dl/extractor/weasyl.py
index a69f3a8..e718e51 100644
--- a/gallery_dl/extractor/weasyl.py
+++ b/gallery_dl/extractor/weasyl.py
@@ -24,8 +24,7 @@ class WeasylExtractor(Extractor):
# Some submissions don't have content and can be skipped
if "submission" in data["media"]:
data["url"] = data["media"]["submission"][0]["url"]
- data["date"] = text.parse_datetime(
- data["posted_at"][:19], "%Y-%m-%dT%H:%M:%S")
+ data["date"] = self.parse_datetime_iso(data["posted_at"][:19])
text.nameext_from_url(data["url"], data)
return True
return False
@@ -42,7 +41,7 @@ class WeasylExtractor(Extractor):
f"{self.root}/api/journals/{journalid}/view")
data["extension"] = "html"
data["html"] = "text:" + data["content"]
- data["date"] = text.parse_datetime(data["posted_at"])
+ data["date"] = self.parse_datetime_iso(data["posted_at"])
return data
def submissions(self, owner_login, folderid=None):
@@ -71,7 +70,7 @@ class WeasylExtractor(Extractor):
class WeasylSubmissionExtractor(WeasylExtractor):
subcategory = "submission"
- pattern = BASE_PATTERN + r"(?:~[\w~-]+/submissions|submission|view)/(\d+)"
+ pattern = rf"{BASE_PATTERN}(?:~[\w~-]+/submissions|submission|view)/(\d+)"
example = "https://www.weasyl.com/~USER/submissions/12345/TITLE"
def __init__(self, match):
@@ -81,13 +80,13 @@ class WeasylSubmissionExtractor(WeasylExtractor):
def items(self):
data = self.request_submission(self.submitid)
if self.populate_submission(data):
- yield Message.Directory, data
+ yield Message.Directory, "", data
yield Message.Url, data["url"], data
class WeasylSubmissionsExtractor(WeasylExtractor):
subcategory = "submissions"
- pattern = BASE_PATTERN + r"(?:~|submissions/)([\w~-]+)/?$"
+ pattern = rf"{BASE_PATTERN}(?:~|submissions/)([\w~-]+)/?$"
example = "https://www.weasyl.com/submissions/USER"
def __init__(self, match):
@@ -95,14 +94,14 @@ class WeasylSubmissionsExtractor(WeasylExtractor):
self.owner_login = match[1]
def items(self):
- yield Message.Directory, {"owner_login": self.owner_login}
+ yield Message.Directory, "", {"owner_login": self.owner_login}
yield from self.submissions(self.owner_login)
class WeasylFolderExtractor(WeasylExtractor):
subcategory = "folder"
directory_fmt = ("{category}", "{owner_login}", "{folder_name}")
- pattern = BASE_PATTERN + r"submissions/([\w~-]+)\?folderid=(\d+)"
+ pattern = rf"{BASE_PATTERN}submissions/([\w~-]+)\?folderid=(\d+)"
example = "https://www.weasyl.com/submissions/USER?folderid=12345"
def __init__(self, match):
@@ -114,7 +113,7 @@ class WeasylFolderExtractor(WeasylExtractor):
# Folder names are only on single submission api calls
msg, url, data = next(iter)
details = self.request_submission(data["submitid"])
- yield Message.Directory, details
+ yield Message.Directory, "", details
yield msg, url, data
yield from iter
@@ -123,7 +122,7 @@ class WeasylJournalExtractor(WeasylExtractor):
subcategory = "journal"
filename_fmt = "{journalid} {title}.{extension}"
archive_fmt = "{journalid}"
- pattern = BASE_PATTERN + r"journal/(\d+)"
+ pattern = rf"{BASE_PATTERN}journal/(\d+)"
example = "https://www.weasyl.com/journal/12345"
def __init__(self, match):
@@ -132,7 +131,7 @@ class WeasylJournalExtractor(WeasylExtractor):
def items(self):
data = self.retrieve_journal(self.journalid)
- yield Message.Directory, data
+ yield Message.Directory, "", data
yield Message.Url, data["html"], data
@@ -140,7 +139,7 @@ class WeasylJournalsExtractor(WeasylExtractor):
subcategory = "journals"
filename_fmt = "{journalid} {title}.{extension}"
archive_fmt = "{journalid}"
- pattern = BASE_PATTERN + r"journals/([\w~-]+)"
+ pattern = rf"{BASE_PATTERN}journals/([\w~-]+)"
example = "https://www.weasyl.com/journals/USER"
def __init__(self, match):
@@ -148,7 +147,7 @@ class WeasylJournalsExtractor(WeasylExtractor):
self.owner_login = match[1]
def items(self):
- yield Message.Directory, {"owner_login": self.owner_login}
+ yield Message.Directory, "", {"owner_login": self.owner_login}
url = f"{self.root}/journals/{self.owner_login}"
page = self.request(url).text
@@ -160,7 +159,7 @@ class WeasylJournalsExtractor(WeasylExtractor):
class WeasylFavoriteExtractor(WeasylExtractor):
subcategory = "favorite"
directory_fmt = ("{category}", "{user}", "Favorites")
- pattern = BASE_PATTERN + r"favorites(?:\?userid=(\d+)|/([^/?#]+))"
+ pattern = rf"{BASE_PATTERN}favorites(?:\?userid=(\d+)|/([^/?#]+))"
example = "https://www.weasyl.com/favorites?userid=12345"
def items(self):
@@ -192,7 +191,7 @@ class WeasylFavoriteExtractor(WeasylExtractor):
submission = self.request_submission(submitid)
if self.populate_submission(submission):
submission["user"] = owner_login
- yield Message.Directory, submission
+ yield Message.Directory, "", submission
yield Message.Url, submission["url"], submission
try:
diff --git a/gallery_dl/extractor/webmshare.py b/gallery_dl/extractor/webmshare.py
index cc41b03..2cb41bb 100644
--- a/gallery_dl/extractor/webmshare.py
+++ b/gallery_dl/extractor/webmshare.py
@@ -40,7 +40,7 @@ class WebmshareVideoExtractor(Extractor):
'property="og:video:width" content="', '"')),
"height": text.parse_int(extr(
'property="og:video:height" content="', '"')),
- "date" : text.parse_datetime(extr(
+ "date" : self.parse_datetime(extr(
"<small>Added ", "<"), "%B %d, %Y"),
"views": text.parse_int(extr('glyphicon-eye-open"></span>', '<')),
"id" : self.video_id,
@@ -51,5 +51,5 @@ class WebmshareVideoExtractor(Extractor):
if data["title"] == "webmshare":
data["title"] = ""
- yield Message.Directory, data
+ yield Message.Directory, "", data
yield Message.Url, data["url"], data
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
index 79120c1..bed251b 100644
--- a/gallery_dl/extractor/webtoons.py
+++ b/gallery_dl/extractor/webtoons.py
@@ -48,7 +48,7 @@ class WebtoonsBase():
class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
"""Extractor for an episode on webtoons.com"""
subcategory = "episode"
- pattern = (LANG_PATTERN + r"/([^/?#]+)/([^/?#]+)/[^/?#]+)"
+ pattern = (rf"{LANG_PATTERN}/([^/?#]+)/([^/?#]+)/[^/?#]+)"
r"/viewer\?([^#'\"]+)")
example = ("https://www.webtoons.com/en/GENRE/TITLE/NAME/viewer"
"?title_no=123&episode_no=12345")
@@ -131,7 +131,7 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
def assets(self, page):
if self.config("thumbnails", False):
- active = text.extr(page, 'class="on ', '</a>')
+ active = text.extr(page, 'class="on', '</a>')
url = _url(text.extr(active, 'data-url="', '"'))
return ({"url": url, "type": "thumbnail"},)
@@ -142,7 +142,7 @@ class WebtoonsComicExtractor(WebtoonsBase, Extractor):
categorytransfer = True
filename_fmt = "{type}.{extension}"
archive_fmt = "{title_no}_{type}"
- pattern = LANG_PATTERN + r"/([^/?#]+)/([^/?#]+))/list\?([^#]+)"
+ pattern = rf"{LANG_PATTERN}/([^/?#]+)/([^/?#]+))/list\?([^#]+)"
example = "https://www.webtoons.com/en/GENRE/TITLE/list?title_no=123"
def items(self):
@@ -160,7 +160,7 @@ class WebtoonsComicExtractor(WebtoonsBase, Extractor):
page = response.text
if self.config("banners") and (asset := self._asset_banner(page)):
- yield Message.Directory, asset
+ yield Message.Directory, "", asset
yield Message.Url, asset["url"], asset
data = {"_extractor": WebtoonsEpisodeExtractor}
@@ -197,7 +197,7 @@ class WebtoonsComicExtractor(WebtoonsBase, Extractor):
class WebtoonsArtistExtractor(WebtoonsBase, Extractor):
"""Extractor for webtoons.com artists"""
subcategory = "artist"
- pattern = BASE_PATTERN + r"/p/community/([^/?#]+)/u/([^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/p/community/([^/?#]+)/u/([^/?#]+)"
example = "https://www.webtoons.com/p/community/LANG/u/ARTIST"
def items(self):
diff --git a/gallery_dl/extractor/weebcentral.py b/gallery_dl/extractor/weebcentral.py
index 03cbf29..31cdaac 100644
--- a/gallery_dl/extractor/weebcentral.py
+++ b/gallery_dl/extractor/weebcentral.py
@@ -44,7 +44,7 @@ class WeebcentralBase():
class WeebcentralChapterExtractor(WeebcentralBase, ChapterExtractor):
"""Extractor for manga chapters from weebcentral.com"""
- pattern = BASE_PATTERN + r"(/chapters/(\w+))"
+ pattern = rf"{BASE_PATTERN}(/chapters/(\w+))"
example = "https://weebcentral.com/chapters/01JHABCDEFGHIJKLMNOPQRSTUV"
def metadata(self, page):
@@ -95,7 +95,7 @@ class WeebcentralChapterExtractor(WeebcentralBase, ChapterExtractor):
class WeebcentralMangaExtractor(WeebcentralBase, MangaExtractor):
"""Extractor for manga from weebcentral.com"""
chapterclass = WeebcentralChapterExtractor
- pattern = BASE_PATTERN + r"/series/(\w+)"
+ pattern = rf"{BASE_PATTERN}/series/(\w+)"
example = "https://weebcentral.com/series/01J7ABCDEFGHIJKLMNOPQRSTUV/TITLE"
def chapters(self, _):
@@ -127,8 +127,8 @@ class WeebcentralMangaExtractor(WeebcentralBase, MangaExtractor):
"chapter" : text.parse_int(chapter),
"chapter_minor": sep + minor,
"chapter_type" : type,
- "date" : text.parse_datetime(
- extr(' datetime="', '"')[:-5], "%Y-%m-%dT%H:%M:%S"),
+ "date" : self.parse_datetime_iso(extr(
+ ' datetime="', '"')[:-5]),
}
chapter.update(data)
results.append((base + chapter_id, chapter))
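A recurring change throughout this release: explicit strptime calls such as text.parse_datetime(value, "%Y-%m-%dT%H:%M:%S") are replaced by a parse_datetime_iso() helper, backed by the new gallery_dl/dt.py module. The helper itself is not shown in this diff; the following is only a minimal sketch of what such an ISO-8601 parser plausibly does, with the name and edge-case handling assumed:

from datetime import datetime, timezone

def parse_datetime_iso(value):
    # Sketch only -- the shipped implementation lives in gallery_dl/dt.py.
    # Accept "2025-01-02T03:04:05" plus "Z"/offset variants and return a
    # naive UTC datetime (gallery-dl's usual 'date' convention), or None.
    try:
        d = datetime.fromisoformat(value.replace("Z", "+00:00"))
    except (TypeError, ValueError):
        return None
    if d.tzinfo is not None:
        d = d.astimezone(timezone.utc).replace(tzinfo=None)
    return d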
diff --git a/gallery_dl/extractor/weebdex.py b/gallery_dl/extractor/weebdex.py
new file mode 100644
index 0000000..78fbda1
--- /dev/null
+++ b/gallery_dl/extractor/weebdex.py
@@ -0,0 +1,132 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://weebdex.org/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text
+from ..cache import memcache
+
+BASE_PATTERN = r"(?:https?://)?weebdex\.org"
+
+
+class WeebdexBase():
+ """Base class for weebdex extractors"""
+ category = "weebdex"
+ root = "https://weebdex.org"
+ root_api = "https://api.weebdex.org"
+ request_interval = 0.2 # 5 requests per second
+
+ def _init(self):
+ self.headers_api = {
+ "Referer": self.root + "/",
+ "Origin" : self.root,
+ "Sec-Fetch-Dest": "empty",
+ "Sec-Fetch-Mode": "cors",
+ "Sec-Fetch-Site": "same-site",
+ }
+
+
+class WeebdexChapterExtractor(WeebdexBase, ChapterExtractor):
+ """Extractor for weebdex manga chapters"""
+ archive_fmt = "{chapter_id}_{version}_{page}"
+ pattern = BASE_PATTERN + r"/chapter/(\w+)"
+ example = "https://weebdex.org/chapter/ID/PAGE"
+
+ def metadata(self, _):
+ cid = self.groups[0]
+ url = f"{self.root_api}/chapter/{cid}"
+ self.data = data = self.request_json(url, headers=self.headers_api)
+
+ rel = data.pop("relationships")
+ chapter, sep, minor = data["chapter"].partition(".")
+
+ return {
+ **_manga_info(self, rel["manga"]["id"]),
+ "title" : data.get("title", ""),
+ "version" : data["version"],
+ "volume" : text.parse_int(data["volume"]),
+ "chapter" : text.parse_int(chapter),
+ "chapter_minor": sep + minor,
+ "chapter_id" : cid,
+ "date" : self.parse_datetime_iso(data["created_at"]),
+ "date_updated" : self.parse_datetime_iso(data["updated_at"]),
+ "lang" : data["language"],
+ "uploader": rel["uploader"]["name"] if "uploader" in rel else "",
+ "group" : [g["name"] for g in rel.get("groups") or ()],
+ }
+
+ def images(self, _):
+ data = self.data
+ base = f"{data['node']}/data/{data['id']}/"
+
+ return [
+ (base + page["name"], {
+ "width" : page["dimensions"][0],
+ "height": page["dimensions"][1],
+ })
+ for page in data["data"]
+ ]
+
+
+class WeebdexMangaExtractor(WeebdexBase, MangaExtractor):
+ """Extractor for weebdex manga"""
+ chapterclass = WeebdexChapterExtractor
+ pattern = BASE_PATTERN + r"/title/(\w+)"
+ example = "https://weebdex.org/title/ID/SLUG"
+
+ def chapters(self, page):
+ mid = self.groups[0]
+ url = f"{self.root_api}/manga/{mid}/chapters"
+ params = {
+ "limit": 100,
+ "order": "asc" if self.config("chapter-reverse") else "desc",
+ }
+
+ base = self.root + "/chapter/"
+ manga = _manga_info(self, mid)
+ results = []
+
+ while True:
+ data = self.request_json(
+ url, params=params, headers=self.headers_api)
+
+ for ch in data["data"]:
+ chapter, sep, minor = ch["chapter"].partition(".")
+ ch["volume"] = text.parse_int(ch["volume"])
+ ch["chapter"] = text.parse_int(chapter)
+ ch["chapter_minor"] = sep + minor
+ ch.update(manga)
+ results.append((base + ch["id"], ch))
+
+ if data["total"] <= data["page"] * params["limit"]:
+ break
+ params["page"] = data["page"] + 1
+
+ return results
+
+
+@memcache(keyarg=1)
+def _manga_info(self, mid):
+ url = f"{self.root_api}/manga/{mid}"
+ manga = self.request_json(url, headers=self.headers_api)
+ rel = manga["relationships"]
+
+ return {
+ "manga" : manga["title"],
+ "manga_id": manga["id"],
+ "manga_date": self.parse_datetime_iso(manga["created_at"]),
+ "year" : manga["year"],
+ "status" : manga["status"],
+ "origin" : manga["language"],
+ "description": manga["description"],
+ "demographic": manga["demographic"],
+ "tags" : [f"{t['group']}:{t['name']}" for t in rel["tags"]],
+ "author" : [a["name"] for a in rel["authors"]],
+ "artist" : [a["name"] for a in rel["artists"]],
+ }
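Note the @memcache(keyarg=1) decorator on _manga_info(): both the chapter and the manga extractor route through it, so manga metadata is requested once per manga id and reused for the rest of the run. A minimal sketch of what keyarg-based memoization means here (the real decorator in gallery_dl/cache.py is more general):

import functools

def memcache(keyarg=0):
    def decorator(func):
        cache = {}

        @functools.wraps(func)
        def wrapper(*args):
            key = args[keyarg]  # keyarg=1 -> the manga id, after 'self'
            if key not in cache:
                cache[key] = func(*args)
            return cache[key]
        return wrapper
    return decorator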
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index 3c0f077..abec0f7 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -34,6 +34,7 @@ class WeiboExtractor(Extractor):
def _init(self):
self.livephoto = self.config("livephoto", True)
self.retweets = self.config("retweets", False)
+ self.longtext = self.config("text", False)
self.videos = self.config("videos", True)
self.movies = self.config("movies", False)
self.gifs = self.config("gifs", True)
@@ -98,10 +99,14 @@ class WeiboExtractor(Extractor):
files = []
self._extract_status(status, files)
- status["date"] = text.parse_datetime(
+ if self.longtext and status.get("isLongText") and \
+ status["text"].endswith('class="expand">展开</span>'):
+ status = self._status_by_id(status["id"])
+
+ status["date"] = self.parse_datetime(
status["created_at"], "%a %b %d %H:%M:%S %z %Y")
status["count"] = len(files)
- yield Message.Directory, status
+ yield Message.Directory, "", status
num = 0
for file in files:
@@ -190,7 +195,8 @@ class WeiboExtractor(Extractor):
return video
def _status_by_id(self, status_id):
- url = f"{self.root}/ajax/statuses/show?id={status_id}"
+ url = (f"{self.root}/ajax/statuses/show"
+ f"?id={status_id}&isGetLongText=true")
return self.request_json(url)
def _user_id(self):
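The new "text" option drives the long-post handling above: when a status reports isLongText and its text still ends with the truncated 展开 ("expand") marker, the full status is refetched through ajax/statuses/show with isGetLongText=true. A hedged usage sketch, assuming the standard option path for this extractor:

from gallery_dl import config

# opt into full-text refetches for truncated Weibo statuses
config.set(("extractor", "weibo"), "text", True)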
diff --git a/gallery_dl/extractor/wikiart.py b/gallery_dl/extractor/wikiart.py
index 830d880..8f3a1c9 100644
--- a/gallery_dl/extractor/wikiart.py
+++ b/gallery_dl/extractor/wikiart.py
@@ -27,7 +27,7 @@ class WikiartExtractor(Extractor):
def items(self):
data = self.metadata()
- yield Message.Directory, data
+ yield Message.Directory, "", data
for painting in self.paintings():
url = painting["image"]
painting.update(data)
@@ -68,7 +68,7 @@ class WikiartArtistExtractor(WikiartExtractor):
"""Extractor for an artist's paintings on wikiart.org"""
subcategory = "artist"
directory_fmt = ("{category}", "{artist[artistName]}")
- pattern = BASE_PATTERN + r"/(?!\w+-by-)([\w-]+)/?$"
+ pattern = rf"{BASE_PATTERN}/(?!\w+-by-)([\w-]+)/?$"
example = "https://www.wikiart.org/en/ARTIST"
def __init__(self, match):
@@ -89,7 +89,7 @@ class WikiartArtistExtractor(WikiartExtractor):
class WikiartImageExtractor(WikiartArtistExtractor):
"""Extractor for individual paintings on wikiart.org"""
subcategory = "image"
- pattern = BASE_PATTERN + r"/(?!(?:paintings|artists)-by-)([\w-]+)/([\w-]+)"
+ pattern = rf"{BASE_PATTERN}/(?!(?:paintings|artists)-by-)([\w-]+)/([\w-]+)"
example = "https://www.wikiart.org/en/ARTIST/TITLE"
def __init__(self, match):
@@ -109,7 +109,7 @@ class WikiartArtworksExtractor(WikiartExtractor):
"""Extractor for artwork collections on wikiart.org"""
subcategory = "artworks"
directory_fmt = ("{category}", "Artworks by {group!c}", "{type}")
- pattern = BASE_PATTERN + r"/paintings-by-([\w-]+)/([\w-]+)"
+ pattern = rf"{BASE_PATTERN}/paintings-by-([\w-]+)/([\w-]+)"
example = "https://www.wikiart.org/en/paintings-by-GROUP/TYPE"
def __init__(self, match):
@@ -128,7 +128,7 @@ class WikiartArtworksExtractor(WikiartExtractor):
class WikiartArtistsExtractor(WikiartExtractor):
"""Extractor for artist collections on wikiart.org"""
subcategory = "artists"
- pattern = (BASE_PATTERN + r"/artists-by-([\w-]+)/([\w-]+)")
+ pattern = (rf"{BASE_PATTERN}/artists-by-([\w-]+)/([\w-]+)")
example = "https://www.wikiart.org/en/artists-by-GROUP/TYPE"
def __init__(self, match):
diff --git a/gallery_dl/extractor/wikifeet.py b/gallery_dl/extractor/wikifeet.py
index 31dc9cd..a07fd84 100644
--- a/gallery_dl/extractor/wikifeet.py
+++ b/gallery_dl/extractor/wikifeet.py
@@ -34,8 +34,8 @@ class WikifeetGalleryExtractor(GalleryExtractor):
"celeb" : self.celeb,
"type" : self.type,
"birthplace": text.unescape(extr('"bplace":"', '"')),
- "birthday" : text.parse_datetime(text.unescape(
- extr('"bdate":"', '"'))[:10], "%Y-%m-%d"),
+ "birthday" : self.parse_datetime_iso(text.unescape(extr(
+ '"bdate":"', '"'))[:10]),
"shoesize" : text.unescape(extr('"ssize":', ',')),
"rating" : text.parse_float(extr('"score":', ',')),
"celebrity" : text.unescape(extr('"cname":"', '"')),
diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py
index ba020d5..70e42c6 100644
--- a/gallery_dl/extractor/wikimedia.py
+++ b/gallery_dl/extractor/wikimedia.py
@@ -22,25 +22,32 @@ class WikimediaExtractor(BaseExtractor):
request_interval = (1.0, 2.0)
def __init__(self, match):
- BaseExtractor.__init__(self, match)
+ self._init_category(match)
+ self.format = False
if self.category == "wikimedia":
- self.category = self.root.split(".")[-2]
+ labels = self.root.split(".")
+ self.lang = labels[-3][-2:]
+ self.category = labels[-2]
elif self.category in ("fandom", "wikigg"):
+ self.lang = "en"
+ self.format = "original"
+ self.basesubcategory = self.category
self.category = (
f"{self.category}-"
f"{self.root.partition('.')[0].rpartition('/')[2]}")
-
- self.per_page = self.config("limit", 50)
- self.subcategories = False
+ else:
+ self.lang = ""
if useragent := self.config_instance("useragent"):
self.useragent = useragent
+ BaseExtractor.__init__(self, match)
+
def _init(self):
if api_path := self.config_instance("api-path"):
if api_path[0] == "/":
- self.api_url = self.root + api_path
+ self.api_url = f"{self.root}{api_path}"
else:
self.api_url = api_path
else:
@@ -51,12 +58,15 @@ class WikimediaExtractor(BaseExtractor):
# https://www.mediawiki.org/wiki/API:Revisions
# https://www.mediawiki.org/wiki/API:Imageinfo
self.image_revisions = self.config("image-revisions", 1)
+ self.format = self.config("format", self.format)
+ self.per_page = self.config("limit", 50)
+ self.subcategories = False
@cache(maxage=36500*86400, keyarg=1)
def _search_api_path(self, root):
self.log.debug("Probing possible API endpoints")
for path in ("/api.php", "/w/api.php", "/wiki/api.php"):
- url = root + path
+ url = f"{root}{path}"
response = self.request(url, method="HEAD", fatal=None)
if response.status_code < 400:
return url
@@ -74,12 +84,19 @@ class WikimediaExtractor(BaseExtractor):
m["name"]: m["value"]
for m in image["commonmetadata"] or ()}
- text.nameext_from_url(image["canonicaltitle"].partition(":")[2], image)
- image["date"] = text.parse_datetime(
- image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
+ text.nameext_from_name(
+ image["canonicaltitle"].partition(":")[2], image)
+ image["date"] = self.parse_datetime_iso(image["timestamp"])
+
+ if self.format:
+ url = image["url"]
+ image["url"] = (f"{url}{'&' if '?' in url else '?'}"
+ f"format={self.format}")
def items(self):
- for info in self._pagination(self.params):
+ params = self.params()
+
+ for info in self._pagination(params):
try:
images = info.pop("imageinfo")
except KeyError:
@@ -88,7 +105,7 @@ class WikimediaExtractor(BaseExtractor):
info["count"] = len(images)
self.prepare_info(info)
- yield Message.Directory, info
+ yield Message.Directory, "", info
num = 0
for image in images:
@@ -105,10 +122,10 @@ class WikimediaExtractor(BaseExtractor):
yield Message.Url, image["url"], image
if self.subcategories:
- base = self.root + "/wiki/"
- self.params["gcmtype"] = "subcat"
- for subcat in self._pagination(self.params):
- url = base + subcat["title"].replace(" ", "_")
+ base = f"{self.root}/wiki/"
+ params["gcmtype"] = "subcat"
+ for subcat in self._pagination(params):
+ url = f"{base}{subcat['title'].replace(' ', '_')}"
subcat["_extractor"] = WikimediaArticleExtractor
yield Message.Queue, url, subcat
@@ -219,7 +236,7 @@ class WikimediaArticleExtractor(WikimediaExtractor):
"""Extractor for wikimedia articles"""
subcategory = "article"
directory_fmt = ("{category}", "{page}")
- pattern = BASE_PATTERN + r"/(?!static/)([^?#]+)"
+ pattern = rf"{BASE_PATTERN}/(?!static/)([^?#]+)"
example = "https://en.wikipedia.org/wiki/TITLE"
def __init__(self, match):
@@ -227,53 +244,54 @@ class WikimediaArticleExtractor(WikimediaExtractor):
path = self.groups[-1]
if path[2] == "/":
- self.root = self.root + "/" + path[:2]
+ self.lang = lang = path[:2]
+ self.root = f"{self.root}/{lang}"
path = path[3:]
if path.startswith("wiki/"):
path = path[5:]
+ self.path = text.unquote(path)
pre, sep, _ = path.partition(":")
- prefix = pre.lower() if sep else None
-
- self.title = path = text.unquote(path)
- if prefix:
+ self.prefix = prefix = pre.lower() if sep else None
+ if prefix is not None:
self.subcategory = prefix
- if prefix == "category":
+ def params(self):
+ if self.prefix == "category":
if self.config("subcategories", True):
self.subcategories = True
- self.params = {
+ return {
"generator": "categorymembers",
- "gcmtitle" : path,
+ "gcmtitle" : self.path,
"gcmtype" : "file",
"gcmlimit" : self.per_page,
}
- elif prefix == "file":
- self.params = {
- "titles" : path,
- }
- else:
- self.params = {
- "generator": "images",
- "gimlimit" : self.per_page,
- "titles" : path,
+
+ if self.prefix == "file":
+ return {
+ "titles": self.path,
}
+ return {
+ "generator": "images",
+ "gimlimit" : self.per_page,
+ "titles" : self.path,
+ }
+
def prepare_info(self, info):
- info["page"] = self.title
+ info["page"] = self.path
+ info["lang"] = self.lang
class WikimediaWikiExtractor(WikimediaExtractor):
"""Extractor for all files on a MediaWiki instance"""
subcategory = "wiki"
- pattern = BASE_PATTERN + r"/?$"
+ pattern = rf"{BASE_PATTERN}/?$"
example = "https://en.wikipedia.org/"
- def __init__(self, match):
- WikimediaExtractor.__init__(self, match)
-
+ def params(self):
# ref: https://www.mediawiki.org/wiki/API:Allpages
- self.params = {
+ return {
"generator" : "allpages",
"gapnamespace": 6, # "File" namespace
"gaplimit" : self.per_page,
diff --git a/gallery_dl/extractor/xasiat.py b/gallery_dl/extractor/xasiat.py
index 6aa3168..d4dbea1 100644
--- a/gallery_dl/extractor/xasiat.py
+++ b/gallery_dl/extractor/xasiat.py
@@ -7,7 +7,7 @@
"""Extractors for https://www.xasiat.com"""
from .common import Extractor, Message
-from .. import text, util
+from .. import text
import time
BASE_PATTERN = r"(?:https?://)?(?:www\.)?xasiat\.com((?:/fr|/ja)?/albums"
@@ -29,7 +29,7 @@ class XasiatExtractor(Extractor):
def _pagination(self, path, pnum=1):
url = f"{self.root}{path}/"
- find_posts = util.re(r'class="item ">\s*<a href="([^"]+)').findall
+ find_posts = text.re(r'class="item ">\s*<a href="([^"]+)').findall
while True:
params = {
@@ -38,7 +38,7 @@ class XasiatExtractor(Extractor):
"block_id": "list_albums_common_albums_list",
"sort_by": "post_date",
"from": pnum,
- "_": int(time.time() * 1000)
+ "_": int(time.time() * 1000),
}
page = self.request(url, params=params).text
@@ -52,7 +52,7 @@ class XasiatExtractor(Extractor):
class XasiatAlbumExtractor(XasiatExtractor):
subcategory = "album"
- pattern = BASE_PATTERN + r"/(\d+)/[^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/(\d+)/[^/?#]+)"
example = "https://www.xasiat.com/albums/12345/TITLE/"
def items(self):
@@ -66,38 +66,37 @@ class XasiatAlbumExtractor(XasiatExtractor):
images = extr('class="images"', "</div>")
urls = list(text.extract_iter(images, 'href="', '"'))
-
+ categories = text.re(r'categories/[^"]+\">\s*(.+)\s*</a').findall(info)
data = {
"title": text.unescape(title),
- "model": util.re(
+ "model": text.re(
r'top_models1"></i>\s*(.+)\s*</span').findall(info),
- "tags": util.re(
+ "tags": text.re(
r'tags/[^"]+\">\s*(.+)\s*</a').findall(info),
- "album_category": util.re(
- r'categories/[^"]+\">\s*(.+)\s*</a').findall(info)[0],
+ "album_category": categories[0] if categories else "",
"album_url": response.url,
"album_id": text.parse_int(album_id),
"count": len(urls),
}
- yield Message.Directory, data
+ yield Message.Directory, "", data
for data["num"], url in enumerate(urls, 1):
yield Message.Url, url, text.nameext_from_url(url[:-1], data)
class XasiatTagExtractor(XasiatExtractor):
subcategory = "tag"
- pattern = BASE_PATTERN + r"/tags/[^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/tags/[^/?#]+)"
example = "https://www.xasiat.com/albums/tags/TAG/"
class XasiatCategoryExtractor(XasiatExtractor):
subcategory = "category"
- pattern = BASE_PATTERN + r"/categories/[^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/categories/[^/?#]+)"
example = "https://www.xasiat.com/albums/categories/CATEGORY/"
class XasiatModelExtractor(XasiatExtractor):
subcategory = "model"
- pattern = BASE_PATTERN + r"/models/[^/?#]+)"
+ pattern = rf"{BASE_PATTERN}/models/[^/?#]+)"
example = "https://www.xasiat.com/albums/models/MODEL/"
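The album_category change above is a crash fix: findall() returns an empty list for albums without a category link, so indexing [0] unconditionally raised IndexError. Reduced to its idiom:

matches = []                            # findall() with no hits
first = matches[0] if matches else ""   # "" instead of IndexError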
diff --git a/gallery_dl/extractor/xenforo.py b/gallery_dl/extractor/xenforo.py
new file mode 100644
index 0000000..d8536b0
--- /dev/null
+++ b/gallery_dl/extractor/xenforo.py
@@ -0,0 +1,348 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for XenForo forums"""
+
+from .common import BaseExtractor, Message
+from .. import text, exception
+from ..cache import cache
+
+
+class XenforoExtractor(BaseExtractor):
+ """Base class for xenforo extractors"""
+ basecategory = "xenforo"
+ directory_fmt = ("{category}", "{thread[section]}",
+ "{thread[title]} ({thread[id]})")
+ filename_fmt = "{post[id]}_{num:>02}_{id}_{filename}.{extension}"
+ archive_fmt = "{post[id]}/{type[0]}{id}_{filename}"
+
+ def __init__(self, match):
+ BaseExtractor.__init__(self, match)
+ self.cookies_domain = "." + self.root.split("/")[2]
+ self.cookies_names = self.config_instance("cookies")
+
+ def items(self):
+ self.login()
+
+ extract_urls = text.re(
+ r'(?s)(?:'
+ r'<video (.*?\ssrc="[^"]+".*?)</video>'
+ r'|<a [^>]*?href="[^"]*?'
+ r'(/(?:index\.php\?)?attachments/[^"]+".*?)</a>'
+ r'|<div [^>]*?data-src="[^"]*?'
+            r'(/(?:index\.php\?)?attachments/[^"]+".*?)/>'
+ r'|(?:<a [^>]*?href="|<iframe [^>]*?src="|'
+ r'''onclick="loadMedia\(this, ')([^"']+)'''
+ r')'
+ ).findall
+
+ for post in self.posts():
+ urls = extract_urls(post["content"])
+ if post["attachments"]:
+ urls.extend(extract_urls(post["attachments"]))
+
+ data = {"post": post}
+ post["count"] = data["count"] = len(urls)
+ yield Message.Directory, "", data
+
+ id_last = None
+ data["_http_expected_status"] = (403,)
+ data["_http_validate"] = self._validate
+ data["num"] = data["num_internal"] = data["num_external"] = 0
+ for video, inl1, inl2, ext in urls:
+ if ext:
+ data["num"] += 1
+ data["num_external"] += 1
+ data["type"] = "external"
+ if ext[0] == "/":
+ if ext[1] == "/":
+ ext = "https:" + ext
+ else:
+ continue
+ yield Message.Queue, ext, data
+
+ elif video:
+ data["num"] += 1
+ data["num_internal"] += 1
+ data["type"] = "video"
+ url = text.extr(video, 'src="', '"')
+ text.nameext_from_url(url, data)
+ data["id"] = text.parse_int(
+ data["filename"].partition("-")[0])
+ yield Message.Url, url, data
+
+ elif (inline := inl1 or inl2):
+ path = inline[:inline.find('"')]
+ name, _, id = path[path.rfind("/", 0, -1):].strip(
+ "/").rpartition(".")
+ if id == id_last:
+ id_last = None
+ continue
+ else:
+ id_last = id
+ data["id"] = text.parse_int(id)
+ if alt := text.extr(inline, 'alt="', '"'):
+ text.nameext_from_name(alt, data)
+ if not data["extension"]:
+ data["extension"] = name.rpartition("-")[2]
+ else:
+ data["filename"], _, data["extension"] = \
+ name.rpartition("-")
+ data["num"] += 1
+ data["num_internal"] += 1
+ data["type"] = "inline"
+ yield Message.Url, self.root + path, data
+
+ def request_page(self, url):
+ try:
+ return self.request(url)
+ except exception.HttpError as exc:
+ if exc.status == 403 and b">Log in<" in exc.response.content:
+ self._require_auth(exc.response)
+ raise
+
+ def login(self):
+ if self.cookies_check(self.cookies_names):
+ return
+
+ username, password = self._get_auth_info()
+ if username:
+ self.cookies_update(self._login_impl(username, password))
+
+ @cache(maxage=365*86400, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+
+ url = f"{self.root}/login/login"
+ page = self.request(url).text
+ data = {
+ "_xfToken": text.extr(page, 'name="_xfToken" value="', '"'),
+ "login" : username,
+ "password": password,
+ "remember": "1",
+ "_xfRedirect": "",
+ }
+ response = self.request(url, method="POST", data=data)
+
+ if not response.history:
+ err = self._extract_error(response.text)
+ raise exception.AuthenticationError(f'"{err}"')
+
+ return {
+ cookie.name: cookie.value
+ for cookie in self.cookies
+ if cookie.domain.endswith(self.cookies_domain)
+ }
+
+ def _pagination(self, base, pnum=None):
+ base = f"{self.root}{base}"
+
+ if pnum is None:
+ url = f"{base}/"
+ pnum = 1
+ else:
+ url = f"{base}/page-{pnum}"
+ pnum = None
+
+ while True:
+ page = self.request_page(url).text
+
+ yield page
+
+ if pnum is None or "pageNav-jump--next" not in page:
+ return
+ pnum += 1
+ url = f"{base}/page-{pnum}"
+
+ def _pagination_reverse(self, base, pnum=None):
+ base = f"{self.root}{base}"
+
+ url = f"{base}/page-{'9999' if pnum is None else pnum}"
+ with self.request_page(url) as response:
+ if pnum is None and not response.history:
+ self._require_auth()
+ url = response.url
+ if url[-1] == "/":
+ pnum = 1
+ else:
+ pnum = text.parse_int(url[url.rfind("-")+1:], 1)
+ page = response.text
+
+ while True:
+ yield page
+
+ pnum -= 1
+ if pnum > 1:
+ url = f"{base}/page-{pnum}"
+ elif pnum == 1:
+ url = f"{base}/"
+ else:
+ return
+
+ page = self.request_page(url).text
+
+ def _extract_error(self, html):
+ return text.unescape(text.extr(
+ html, "blockMessage--error", "</").rpartition(">")[2].strip())
+
+ def _parse_thread(self, page):
+ try:
+ data = self._extract_jsonld(page)
+ except ValueError:
+ return {}
+
+ schema = data.get("mainEntity", data)
+ author = schema["author"]
+ stats = schema["interactionStatistic"]
+ url_t = schema.get("url") or schema.get("@id") or ""
+ url_a = author.get("url") or ""
+
+ thread = {
+ "id" : url_t[url_t.rfind(".")+1:-1],
+ "url" : url_t,
+ "title": schema["headline"],
+ "date" : self.parse_datetime_iso(schema["datePublished"]),
+ "tags" : (schema["keywords"].split(", ")
+ if "keywords" in schema else ()),
+ "section" : schema["articleSection"],
+ "author" : author.get("name") or "",
+ "author_id" : (url_a[url_a.rfind(".")+1:-1] if url_a else
+ (author.get("name") or "")[15:]),
+ "author_url": url_a,
+ }
+
+ if isinstance(stats, list):
+ thread["views"] = stats[0]["userInteractionCount"]
+ thread["posts"] = stats[1]["userInteractionCount"]
+ else:
+ thread["views"] = -1
+ thread["posts"] = stats["userInteractionCount"]
+
+ return thread
+
+ def _parse_post(self, html):
+ extr = text.extract_from(html)
+
+ post = {
+ "author": extr('data-author="', '"'),
+ "id": extr('data-content="post-', '"'),
+ "author_url": (extr('itemprop="url" content="', '"') or
+ extr('<a href="', '"')),
+ "date": self.parse_datetime_iso(extr('datetime="', '"')),
+ "content": extr('class="message-body',
+ '<div class="js-selectToQuote'),
+ "attachments": extr('<section class="message-attachments">',
+ '</section>'),
+ }
+
+ url_a = post["author_url"]
+ post["author_id"] = url_a[url_a.rfind(".")+1:-1]
+
+ con = post["content"]
+ if (pos := con.find('<div class="bbWrapper')) >= 0:
+ con = con[pos:]
+ post["content"] = con.strip()
+
+ return post
+
+ def _require_auth(self, response=None):
+ raise exception.AuthRequired(
+ ("username & password", "authenticated cookies"), None,
+ None if response is None else self._extract_error(response.text))
+
+ def _validate(self, response):
+ if response.status_code == 403 and b">Log in<" in response.content:
+ self._require_auth(response)
+ return True
+
+
+BASE_PATTERN = XenforoExtractor.update({
+ "simpcity": {
+ "root": "https://simpcity.cr",
+ "pattern": r"(?:www\.)?simpcity\.(?:cr|su)",
+ "cookies": ("ogaddgmetaprof_user",),
+ },
+ "nudostarforum": {
+ "root": "https://nudostar.com/forum",
+ "pattern": r"(?:www\.)?nudostar\.com/forum",
+ "cookies": ("xf_user",),
+ },
+ "atfforum": {
+ "root": "https://allthefallen.moe/forum",
+ "pattern": r"(?:www\.)?allthefallen\.moe/forum",
+ "cookies": ("xf_user",),
+ },
+})
+
+
+class XenforoPostExtractor(XenforoExtractor):
+ subcategory = "post"
+ pattern = (rf"{BASE_PATTERN}(/(?:index\.php\?)?threads"
+ rf"/[^/?#]+/post-|/posts/)(\d+)")
+ example = "https://simpcity.cr/threads/TITLE.12345/post-54321"
+
+ def posts(self):
+ path = self.groups[-2]
+ post_id = self.groups[-1]
+ url = f"{self.root}{path}{post_id}/"
+ page = self.request_page(url).text
+
+ pos = page.find(f'data-content="post-{post_id}"')
+ if pos < 0:
+ raise exception.NotFoundError("post")
+ html = text.extract(page, "<article ", "<footer", pos-200)[0]
+
+ self.kwdict["thread"] = self._parse_thread(page)
+ return (self._parse_post(html),)
+
+
+class XenforoThreadExtractor(XenforoExtractor):
+ subcategory = "thread"
+ pattern = (rf"{BASE_PATTERN}(/(?:index\.php\?)?threads"
+ rf"/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?")
+ example = "https://simpcity.cr/threads/TITLE.12345/"
+
+ def posts(self):
+ path = self.groups[-2]
+ pnum = self.groups[-1]
+
+ if (order := self.config("order-posts")) and \
+ order[0] not in ("d", "r"):
+ pages = self._pagination(path, pnum)
+ reverse = False
+ else:
+ pages = self._pagination_reverse(path, pnum)
+ reverse = True
+
+ for page in pages:
+ if "thread" not in self.kwdict:
+ self.kwdict["thread"] = self._parse_thread(page)
+ posts = text.extract_iter(page, "<article ", "<footer")
+ if reverse:
+ posts = list(posts)
+ posts.reverse()
+ for html in posts:
+ yield self._parse_post(html)
+
+
+class XenforoForumExtractor(XenforoExtractor):
+ subcategory = "forum"
+ pattern = (rf"{BASE_PATTERN}(/(?:index\.php\?)?forums"
+ rf"/(?:[^/?#]+\.)?[^/?#]+)(?:/page-(\d+))?")
+ example = "https://simpcity.cr/forums/TITLE.123/"
+
+ def items(self):
+ extract_threads = text.re(
+ r'(/(?:index\.php\?)?threads/[^"]+)"[^>]+data-xf-init=').findall
+
+ data = {"_extractor": XenforoThreadExtractor}
+ path = self.groups[-2]
+ pnum = self.groups[-1]
+ for page in self._pagination(path, pnum):
+ for path in extract_threads(page):
+ yield Message.Queue, f"{self.root}{text.unquote(path)}", data
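One non-obvious detail in XenforoExtractor.items(): the URL-extraction regex has exactly four capture groups (video, two inline-attachment variants, external link), so findall() yields 4-tuples with one non-empty slot per match, hence the `for video, inl1, inl2, ext in urls` unpacking. A tiny illustration of that re behavior:

import re

pat = re.compile(r'(?:<video>(\w+)</video>|<a>(\w+)</a>|<div>(\w+)</div>|@(\w+))')
pat.findall("<video>v</video> <a>a</a> @ext")
# -> [('v', '', '', ''), ('', 'a', '', ''), ('', '', '', 'ext')]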
diff --git a/gallery_dl/extractor/xfolio.py b/gallery_dl/extractor/xfolio.py
index 12f437a..8caff85 100644
--- a/gallery_dl/extractor/xfolio.py
+++ b/gallery_dl/extractor/xfolio.py
@@ -45,7 +45,7 @@ class XfolioExtractor(Extractor):
class XfolioWorkExtractor(XfolioExtractor):
subcategory = "work"
- pattern = BASE_PATTERN + r"/portfolio/([^/?#]+)/works/(\d+)"
+ pattern = rf"{BASE_PATTERN}/portfolio/([^/?#]+)/works/(\d+)"
example = "https://xfolio.jp/portfolio/USER/works/12345"
def items(self):
@@ -57,7 +57,7 @@ class XfolioWorkExtractor(XfolioExtractor):
files = self._extract_files(html, work)
work["count"] = len(files)
- yield Message.Directory, work
+ yield Message.Directory, "", work
for work["num"], file in enumerate(files, 1):
file.update(work)
yield Message.Url, file["url"], file
@@ -107,7 +107,7 @@ class XfolioWorkExtractor(XfolioExtractor):
class XfolioUserExtractor(XfolioExtractor):
subcategory = "user"
- pattern = BASE_PATTERN + r"/portfolio/([^/?#]+)(?:/works)?/?(?:$|\?|#)"
+ pattern = rf"{BASE_PATTERN}/portfolio/([^/?#]+)(?:/works)?/?(?:$|\?|#)"
example = "https://xfolio.jp/portfolio/USER"
def works(self):
@@ -129,7 +129,7 @@ class XfolioUserExtractor(XfolioExtractor):
class XfolioSeriesExtractor(XfolioExtractor):
subcategory = "series"
- pattern = BASE_PATTERN + r"/portfolio/([^/?#]+)/series/(\d+)"
+ pattern = rf"{BASE_PATTERN}/portfolio/([^/?#]+)/series/(\d+)"
example = "https://xfolio.jp/portfolio/USER/series/12345"
def works(self):
diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py
index 6c97175..64113d3 100644
--- a/gallery_dl/extractor/xhamster.py
+++ b/gallery_dl/extractor/xhamster.py
@@ -31,12 +31,12 @@ class XhamsterGalleryExtractor(XhamsterExtractor):
"{gallery[id]} {gallery[title]}")
filename_fmt = "{num:>03}_{id}.{extension}"
archive_fmt = "{id}"
- pattern = BASE_PATTERN + r"(/photos/gallery/[^/?#]+)"
+ pattern = rf"{BASE_PATTERN}(/photos/gallery/[^/?#]+)"
example = "https://xhamster.com/photos/gallery/12345"
def items(self):
data = self.metadata()
- yield Message.Directory, data
+ yield Message.Directory, "", data
for num, image in enumerate(self.images(), 1):
url = image["imageURL"]
image.update(data)
@@ -67,7 +67,7 @@ class XhamsterGalleryExtractor(XhamsterExtractor):
{
"id" : text.parse_int(gallery["id"]),
"tags" : [t["label"] for t in info["categoriesTags"]],
- "date" : text.parse_timestamp(model["created"]),
+ "date" : self.parse_timestamp(model["created"]),
"views" : text.parse_int(model["views"]),
"likes" : text.parse_int(model["rating"]["likes"]),
"dislikes" : text.parse_int(model["rating"]["dislikes"]),
@@ -102,7 +102,7 @@ class XhamsterGalleryExtractor(XhamsterExtractor):
class XhamsterUserExtractor(XhamsterExtractor):
"""Extractor for all galleries of an xhamster user"""
subcategory = "user"
- pattern = BASE_PATTERN + r"/users/([^/?#]+)(?:/photos)?/?(?:$|[?#])"
+ pattern = rf"{BASE_PATTERN}/users/([^/?#]+)(?:/photos)?/?(?:$|[?#])"
example = "https://xhamster.com/users/USER/photos"
def items(self):
diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py
index 6c016ec..1f33eac 100644
--- a/gallery_dl/extractor/xvideos.py
+++ b/gallery_dl/extractor/xvideos.py
@@ -28,7 +28,7 @@ class XvideosGalleryExtractor(XvideosBase, GalleryExtractor):
"{gallery[id]} {gallery[title]}")
filename_fmt = "{category}_{gallery[id]}_{num:>03}.{extension}"
archive_fmt = "{gallery[id]}_{num}"
- pattern = BASE_PATTERN + r"/([^/?#]+)/photos/(\d+)"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/photos/(\d+)"
example = "https://www.xvideos.com/profiles/USER/photos/12345"
def __init__(self, match):
@@ -86,7 +86,7 @@ class XvideosUserExtractor(XvideosBase, Extractor):
"""Extractor for user profiles on xvideos.com"""
subcategory = "user"
categorytransfer = True
- pattern = BASE_PATTERN + r"/([^/?#]+)/?(?:#.*)?$"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/?(?:#.*)?$"
example = "https://www.xvideos.com/profiles/USER"
def __init__(self, match):
diff --git a/gallery_dl/extractor/yiffverse.py b/gallery_dl/extractor/yiffverse.py
index 1595b4d..65289e2 100644
--- a/gallery_dl/extractor/yiffverse.py
+++ b/gallery_dl/extractor/yiffverse.py
@@ -55,8 +55,7 @@ class YiffverseExtractor(BooruExtractor):
def _prepare(self, post):
post.pop("files", None)
- post["date"] = text.parse_datetime(
- post["created"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ post["date"] = self.parse_datetime_iso(post["created"])
post["filename"], _, post["format"] = post["filename"].rpartition(".")
if "tags" in post:
post["tags"] = [t["value"] for t in post["tags"]]
@@ -99,7 +98,7 @@ class YiffverseExtractor(BooruExtractor):
class YiffversePostExtractor(YiffverseExtractor):
subcategory = "post"
archive_fmt = "{id}"
- pattern = BASE_PATTERN + r"/post/(\d+)"
+ pattern = rf"{BASE_PATTERN}/post/(\d+)"
example = "https://yiffverse.com/post/12345"
def posts(self):
@@ -110,7 +109,7 @@ class YiffversePlaylistExtractor(YiffverseExtractor):
subcategory = "playlist"
directory_fmt = ("{category}", "{playlist_id}")
archive_fmt = "p_{playlist_id}_{id}"
- pattern = BASE_PATTERN + r"/playlist/(\d+)"
+ pattern = rf"{BASE_PATTERN}/playlist/(\d+)"
example = "https://yiffverse.com/playlist/12345"
def metadata(self):
@@ -125,7 +124,7 @@ class YiffverseTagExtractor(YiffverseExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
- pattern = BASE_PATTERN + r"/(?:tag/([^/?#]+))?(?:/?\?([^#]+))?(?:$|#)"
+ pattern = rf"{BASE_PATTERN}/(?:tag/([^/?#]+))?(?:/?\?([^#]+))?(?:$|#)"
example = "https://yiffverse.com/tag/TAG"
def _init(self):
diff --git a/gallery_dl/extractor/ytdl.py b/gallery_dl/extractor/ytdl.py
index eb33b65..ea3b615 100644
--- a/gallery_dl/extractor/ytdl.py
+++ b/gallery_dl/extractor/ytdl.py
@@ -114,7 +114,7 @@ class YoutubeDLExtractor(Extractor):
info_dict.get("webpage_url") or
self.ytdl_url)
- yield Message.Directory, info_dict
+ yield Message.Directory, "", info_dict
yield Message.Url, url, info_dict
def _process_entries(self, ytdl_module, ytdl_instance, entries):
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index 7bff83b..b4bbd5a 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -76,7 +76,7 @@ class ZerochanExtractor(BooruExtractor):
data = {
"id" : text.parse_int(entry_id),
"file_url": jsonld["contentUrl"],
- "date" : text.parse_datetime(jsonld["datePublished"]),
+ "date" : self.parse_datetime_iso(jsonld["datePublished"]),
"width" : text.parse_int(jsonld["width"][:-3]),
"height" : text.parse_int(jsonld["height"][:-3]),
"size" : text.parse_bytes(jsonld["contentSize"][:-1]),
@@ -128,7 +128,7 @@ class ZerochanExtractor(BooruExtractor):
return data
def _parse_json(self, txt):
- txt = util.re(r"[\x00-\x1f\x7f]").sub("", txt)
+ txt = text.re(r"[\x00-\x1f\x7f]").sub("", txt)
main, _, tags = txt.partition('tags": [')
item = {}
@@ -160,7 +160,7 @@ class ZerochanExtractor(BooruExtractor):
class ZerochanTagExtractor(ZerochanExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
- pattern = BASE_PATTERN + r"/(?!\d+$)([^/?#]+)/?(?:\?([^#]+))?"
+ pattern = rf"{BASE_PATTERN}/(?!\d+$)([^/?#]+)/?(?:\?([^#]+))?"
example = "https://www.zerochan.net/TAG"
def __init__(self, match):
@@ -286,7 +286,7 @@ class ZerochanTagExtractor(ZerochanExtractor):
class ZerochanImageExtractor(ZerochanExtractor):
subcategory = "image"
- pattern = BASE_PATTERN + r"/(\d+)"
+ pattern = rf"{BASE_PATTERN}/(\d+)"
example = "https://www.zerochan.net/12345"
def posts(self):
diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py
index 5246f66..0787464 100644
--- a/gallery_dl/formatter.py
+++ b/gallery_dl/formatter.py
@@ -13,9 +13,8 @@ import sys
import time
import string
import _string
-import datetime
import operator
-from . import text, util
+from . import text, util, dt
NONE = util.NONE
@@ -68,8 +67,8 @@ class StringFormatter():
- "g": calls text.slugify()
- "j": calls json.dumps
- "t": calls str.strip
- - "T": calls util.datetime_to_timestamp_string()
- - "d": calls text.parse_timestamp
+ - "T": calls dt.to_ts_string()
+ - "d": calls dt.parse_ts()
- "s": calls str()
- "S": calls util.to_string()
- "U": calls urllib.parse.unescape
@@ -331,10 +330,10 @@ def _slice(indices):
)
-def _bytesgetter(slice, encoding=sys.getfilesystemencoding()):
+def _bytesgetter(slice):
def apply_slice_bytes(obj):
- return obj.encode(encoding)[slice].decode(encoding, "ignore")
+ return obj.encode(_ENCODING)[slice].decode(_ENCODING, "ignore")
return apply_slice_bytes
@@ -414,15 +413,27 @@ def _parse_conversion(format_spec, default):
def _parse_maxlen(format_spec, default):
maxlen, replacement, format_spec = format_spec.split(_SEPARATOR, 2)
- maxlen = text.parse_int(maxlen[1:])
fmt = _build_format_func(format_spec, default)
- def mlen(obj):
- obj = fmt(obj)
- return obj if len(obj) <= maxlen else replacement
+ if maxlen[1] == "b":
+ maxlen = text.parse_int(maxlen[2:])
+
+ def mlen(obj):
+ obj = fmt(obj)
+ return obj if len(obj.encode(_ENCODING)) <= maxlen else replacement
+ else:
+ maxlen = text.parse_int(maxlen[1:])
+
+ def mlen(obj):
+ obj = fmt(obj)
+ return obj if len(obj) <= maxlen else replacement
return mlen
+def _parse_identity(format_spec, default):
+ return util.identity
+
+
def _parse_join(format_spec, default):
separator, _, format_spec = format_spec.partition(_SEPARATOR)
join = separator[1:].join
@@ -471,9 +482,9 @@ def _parse_datetime(format_spec, default):
dt_format = dt_format[1:]
fmt = _build_format_func(format_spec, default)
- def dt(obj):
- return fmt(text.parse_datetime(obj, dt_format))
- return dt
+ def dt_parse(obj):
+ return fmt(dt.parse(obj, dt_format))
+ return dt_parse
def _parse_offset(format_spec, default):
@@ -482,15 +493,15 @@ def _parse_offset(format_spec, default):
fmt = _build_format_func(format_spec, default)
if not offset or offset == "local":
- def off(dt):
- local = time.localtime(util.datetime_to_timestamp(dt))
- return fmt(dt + datetime.timedelta(0, local.tm_gmtoff))
+ def off(dt_utc):
+ local = time.localtime(dt.to_ts(dt_utc))
+ return fmt(dt_utc + dt.timedelta(0, local.tm_gmtoff))
else:
hours, _, minutes = offset.partition(":")
offset = 3600 * int(hours)
if minutes:
offset += 60 * (int(minutes) if offset > 0 else -int(minutes))
- offset = datetime.timedelta(0, offset)
+ offset = dt.timedelta(0, offset)
def off(obj):
return fmt(obj + offset)
@@ -502,25 +513,36 @@ def _parse_sort(format_spec, default):
fmt = _build_format_func(format_spec, default)
if "d" in args or "r" in args:
- def sort_desc(obj):
+ def sort(obj):
return fmt(sorted(obj, reverse=True))
- return sort_desc
else:
- def sort_asc(obj):
+ def sort(obj):
return fmt(sorted(obj))
- return sort_asc
+ return sort
def _parse_limit(format_spec, default):
limit, hint, format_spec = format_spec.split(_SEPARATOR, 2)
- limit = int(limit[1:])
- limit_hint = limit - len(hint)
fmt = _build_format_func(format_spec, default)
- def apply_limit(obj):
- if len(obj) > limit:
- obj = obj[:limit_hint] + hint
- return fmt(obj)
+ if limit[1] == "b":
+ hint = hint.encode(_ENCODING)
+ limit = int(limit[2:])
+ limit_hint = limit - len(hint)
+
+ def apply_limit(obj):
+ objb = obj.encode(_ENCODING)
+ if len(objb) > limit:
+ obj = (objb[:limit_hint] + hint).decode(_ENCODING, "ignore")
+ return fmt(obj)
+ else:
+ limit = int(limit[1:])
+ limit_hint = limit - len(hint)
+
+ def apply_limit(obj):
+ if len(obj) > limit:
+ obj = obj[:limit_hint] + hint
+ return fmt(obj)
return apply_limit
@@ -541,6 +563,7 @@ class Literal():
_literal = Literal()
_CACHE = {}
+_ENCODING = sys.getfilesystemencoding()
_SEPARATOR = "/"
_FORMATTERS = {
"E" : ExpressionFormatter,
@@ -557,7 +580,7 @@ _FORMATTERS = {
_GLOBALS = {
"_env": lambda: os.environ,
"_lit": lambda: _literal,
- "_now": datetime.datetime.now,
+ "_now": dt.datetime.now,
"_nul": lambda: util.NONE,
}
_CONVERSIONS = {
@@ -569,9 +592,9 @@ _CONVERSIONS = {
"t": str.strip,
"n": len,
"L": util.code_to_language,
- "T": util.datetime_to_timestamp_string,
- "d": text.parse_timestamp,
- "D": util.to_datetime,
+ "T": dt.to_ts_string,
+ "d": dt.parse_ts,
+ "D": dt.convert,
"U": text.unescape,
"H": lambda s: text.unescape(text.remove_html(s)),
"g": text.slugify,
@@ -590,6 +613,7 @@ _FORMAT_SPECIFIERS = {
"A": _parse_arithmetic,
"C": _parse_conversion,
"D": _parse_datetime,
+ "I": _parse_identity,
"J": _parse_join,
"L": _parse_maxlen,
"M": _parse_map,
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index 9369e5d..7a52bd6 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -87,17 +87,22 @@ class Job():
"current_git_head": util.git_head()
}
# user-supplied metadata
- if kwdict := extr.config("keywords"):
- if extr.config("keywords-eval"):
- self.kwdict_eval = []
- for key, value in kwdict.items():
- if isinstance(value, str):
- fmt = formatter.parse(value, None, util.identity)
- self.kwdict_eval.append((key, fmt.format_map))
- else:
- self.kwdict[key] = value
- else:
- self.kwdict.update(kwdict)
+ kwdict = extr.config("keywords")
+ if kwdict_global := extr.config("keywords-global"):
+ kwdict = {**kwdict_global, **kwdict} if kwdict else kwdict_global
+ elif not kwdict:
+ return
+
+ if extr.config("keywords-eval"):
+ self.kwdict_eval = []
+ for key, value in kwdict.items():
+ if isinstance(value, str):
+ fmt = formatter.parse(value, None, util.identity)
+ self.kwdict_eval.append((key, fmt.format_map))
+ else:
+ self.kwdict[key] = value
+ else:
+ self.kwdict.update(kwdict)
def _build_config_path(self, parent):
extr = self.extractor
@@ -130,6 +135,8 @@ class Job():
if extr.basecategory:
if not cfgpath:
cfgpath.append((extr.category, extr.subcategory))
+ if extr.basesubcategory:
+ cfgpath.append((extr.basesubcategory, extr.subcategory))
cfgpath.append((extr.basecategory, extr.subcategory))
return cfgpath
@@ -138,37 +145,35 @@ class Job():
"""Execute or run the job"""
extractor = self.extractor
log = extractor.log
- msg = None
self._init()
# sleep before extractor start
sleep = util.build_duration_func(
extractor.config("sleep-extractor"))
- if sleep:
+ if sleep is not None:
extractor.sleep(sleep(), "extractor")
try:
- for msg in extractor:
- self.dispatch(msg)
+ msg = self.dispatch(extractor)
except exception.StopExtraction as exc:
if exc.depth > 1 and exc.target != extractor.__class__.subcategory:
exc.depth -= 1
raise
pass
except exception.AbortExtraction as exc:
+ log.traceback(exc)
log.error(exc.message)
self.status |= exc.code
except (exception.TerminateExtraction, exception.RestartExtraction):
raise
except exception.GalleryDLException as exc:
log.error("%s: %s", exc.__class__.__name__, exc)
- log.debug("", exc_info=exc)
+ log.traceback(exc)
self.status |= exc.code
except OSError as exc:
- log.debug("", exc_info=exc)
- name = exc.__class__.__name__
- if name == "JSONDecodeError":
+ log.traceback(exc)
+ if (name := exc.__class__.__name__) == "JSONDecodeError":
log.error("Failed to parse JSON data: %s: %s", name, exc)
self.status |= 1
else: # regular OSError
@@ -180,7 +185,7 @@ class Job():
"copy its output and report this issue on "
"https://github.com/mikf/gallery-dl/issues ."),
exc.__class__.__name__, exc)
- log.debug("", exc_info=exc)
+ log.traceback(exc)
self.status |= 1
except BaseException:
self.status |= 1
@@ -196,31 +201,47 @@ class Job():
self.status |= s
return self.status
- def dispatch(self, msg):
+ def dispatch(self, messages):
"""Call the appropriate message handler"""
- if msg[0] == Message.Url:
- _, url, kwdict = msg
- if self.metadata_url:
- kwdict[self.metadata_url] = url
- if self.pred_url(url, kwdict):
- self.update_kwdict(kwdict)
- self.handle_url(url, kwdict)
- if FLAGS.FILE is not None:
- FLAGS.process("FILE")
-
- elif msg[0] == Message.Directory:
- self.update_kwdict(msg[1])
- self.handle_directory(msg[1])
-
- elif msg[0] == Message.Queue:
- _, url, kwdict = msg
- if self.metadata_url:
- kwdict[self.metadata_url] = url
- if self.pred_queue(url, kwdict):
- self.update_kwdict(kwdict)
- self.handle_queue(url, kwdict)
- if FLAGS.CHILD is not None:
- FLAGS.process("CHILD")
+ msg = None
+ process = True
+
+ for msg, url, kwdict in messages:
+
+ if msg == Message.Directory:
+ if self.pred_post(url, kwdict):
+ process = True
+ self.update_kwdict(kwdict)
+ self.handle_directory(kwdict)
+ else:
+ process = None
+ if FLAGS.POST is not None:
+ FLAGS.process("POST")
+
+ elif process is None:
+ continue
+
+ elif msg == Message.Url:
+ if self.metadata_url:
+ kwdict[self.metadata_url] = url
+ if self.pred_url(url, kwdict):
+ self.update_kwdict(kwdict)
+ self.handle_url(url, kwdict)
+ if FLAGS.FILE is not None:
+ FLAGS.process("FILE")
+
+ elif msg == Message.Queue:
+ if process is None:
+ continue
+ if self.metadata_url:
+ kwdict[self.metadata_url] = url
+ if self.pred_queue(url, kwdict):
+ self.update_kwdict(kwdict)
+ self.handle_queue(url, kwdict)
+ if FLAGS.CHILD is not None:
+ FLAGS.process("CHILD")
+
+ return msg
def handle_url(self, url, kwdict):
"""Handle Message.Url"""
@@ -252,15 +273,16 @@ class Job():
def _init(self):
self.extractor.initialize()
self.pred_url = self._prepare_predicates("image", True)
+ self.pred_post = self._prepare_predicates("post", False)
self.pred_queue = self._prepare_predicates("chapter", False)
def _prepare_predicates(self, target, skip=True):
predicates = []
- if self.extractor.config(target + "-unique"):
+ if self.extractor.config(f"{target}-unique"):
predicates.append(util.UniquePredicate())
- if pfilter := self.extractor.config(target + "-filter"):
+ if pfilter := self.extractor.config(f"{target}-filter"):
try:
pred = util.FilterPredicate(pfilter, target)
except (SyntaxError, ValueError, TypeError) as exc:
@@ -268,7 +290,7 @@ class Job():
else:
predicates.append(pred)
- if prange := self.extractor.config(target + "-range"):
+ if prange := self.extractor.config(f"{target}-range"):
try:
pred = util.RangePredicate(prange)
except ValueError as exc:
@@ -288,7 +310,7 @@ class Job():
return self._logger_adapter(logger, self)
def _write_unsupported(self, url):
- if self.ulog:
+ if self.ulog is not None:
self.ulog.info(url)
@@ -321,7 +343,7 @@ class DownloadJob(Job):
for callback in hooks["prepare"]:
callback(pathfmt)
- if archive and archive.check(kwdict):
+ if archive is not None and archive.check(kwdict):
pathfmt.fix_extension()
self.handle_skip()
return
@@ -330,7 +352,7 @@ class DownloadJob(Job):
pathfmt.build_path()
if pathfmt.exists():
- if archive and self._archive_write_skip:
+ if archive is not None and self._archive_write_skip:
archive.add(kwdict)
self.handle_skip()
return
@@ -340,12 +362,12 @@ class DownloadJob(Job):
callback(pathfmt)
if kwdict.pop("_file_recheck", False) and pathfmt.exists():
- if archive and self._archive_write_skip:
+ if archive is not None and self._archive_write_skip:
archive.add(kwdict)
self.handle_skip()
return
- if self.sleep:
+ if self.sleep is not None:
self.extractor.sleep(self.sleep(), "download")
# download from URL
@@ -369,7 +391,7 @@ class DownloadJob(Job):
return
if not pathfmt.temppath:
- if archive and self._archive_write_skip:
+ if archive is not None and self._archive_write_skip:
archive.add(kwdict)
self.handle_skip()
return
@@ -383,15 +405,17 @@ class DownloadJob(Job):
pathfmt.finalize()
self.out.success(pathfmt.path)
self._skipcnt = 0
- if archive and self._archive_write_file:
+ if archive is not None and self._archive_write_file:
archive.add(kwdict)
if "after" in hooks:
for callback in hooks["after"]:
callback(pathfmt)
+ if archive is not None and self._archive_write_after:
+ archive.add(kwdict)
def handle_directory(self, kwdict):
"""Set and create the target directory for downloads"""
- if not self.pathfmt:
+ if self.pathfmt is None:
self.initialize(kwdict)
else:
if "post-after" in self.hooks:
@@ -428,7 +452,8 @@ class DownloadJob(Job):
else:
extr._parentdir = pextr._parentdir
- if pmeta := pextr.config2("parent-metadata", "metadata-parent"):
+ if pmeta := pextr.config2(
+ "parent-metadata", "metadata-parent", pextr.parent):
if isinstance(pmeta, str):
data = self.kwdict.copy()
if kwdict:
@@ -509,7 +534,7 @@ class DownloadJob(Job):
self.out.skip(pathfmt.path)
if self._skipexc:
- if not self._skipftr or self._skipftr(pathfmt.kwdict):
+ if self._skipftr is None or self._skipftr(pathfmt.kwdict):
self._skipcnt += 1
if self._skipcnt >= self._skipmax:
raise self._skipexc
@@ -553,7 +578,7 @@ class DownloadJob(Job):
cfg = extr.config
pathfmt = self.pathfmt = path.PathFormat(extr)
- if kwdict:
+ if kwdict is not None:
pathfmt.set_directory(kwdict)
self.sleep = util.build_duration_func(cfg("sleep"))
@@ -593,11 +618,13 @@ class DownloadJob(Job):
if events is None:
self._archive_write_file = True
self._archive_write_skip = False
+ self._archive_write_after = False
else:
if isinstance(events, str):
events = events.split(",")
self._archive_write_file = ("file" in events)
self._archive_write_skip = ("skip" in events)
+ self._archive_write_after = ("after" in events)
if skip := cfg("skip", True):
self._skipexc = None
@@ -621,7 +648,7 @@ class DownloadJob(Job):
else:
# monkey-patch methods to always return False
pathfmt.exists = lambda x=None: False
- if self.archive:
+ if self.archive is not None:
self.archive.check = pathfmt.exists
if not cfg("postprocess", True):
@@ -681,15 +708,15 @@ class DownloadJob(Job):
pp_dict["__init__"] = None
pp_cls = postprocessor.find(name)
- if not pp_cls:
+ if pp_cls is None:
pp_log.warning("module '%s' not found", name)
continue
try:
pp_obj = pp_cls(self, pp_dict)
except Exception as exc:
+ pp_log.traceback(exc)
pp_log.error("'%s' initialization failed: %s: %s",
name, exc.__class__.__name__, exc)
- pp_log.debug("", exc_info=exc)
else:
pp_list.append(pp_obj)
@@ -706,15 +733,11 @@ class DownloadJob(Job):
condition = util.compile_filter(expr)
for hook, callback in hooks.items():
self.hooks[hook].append(functools.partial(
- self._call_hook, callback, condition))
+ _call_hook_condition, callback, condition))
else:
for hook, callback in hooks.items():
self.hooks[hook].append(callback)
- def _call_hook(self, callback, condition, pathfmt):
- if condition(pathfmt.kwdict):
- callback(pathfmt)
-
def _build_extractor_filter(self):
clist = self.extractor.config("whitelist")
if clist is not None:
@@ -730,20 +753,25 @@ class DownloadJob(Job):
return util.build_extractor_filter(clist, negate, special)
+def _call_hook_condition(callback, condition, pathfmt):
+ if condition(pathfmt.kwdict):
+ callback(pathfmt)
+
+
class SimulationJob(DownloadJob):
"""Simulate the extraction process without downloading anything"""
def handle_url(self, url, kwdict):
ext = kwdict["extension"] or "jpg"
kwdict["extension"] = self.pathfmt.extension_map(ext, ext)
- if self.sleep:
+ if self.sleep is not None:
self.extractor.sleep(self.sleep(), "download")
- if self.archive and self._archive_write_skip:
+ if self.archive is not None and self._archive_write_skip:
self.archive.add(kwdict)
self.out.skip(self.pathfmt.build_filename(kwdict))
def handle_directory(self, kwdict):
- if not self.pathfmt:
+ if self.pathfmt is None:
self.initialize()
@@ -931,13 +959,12 @@ class DataJob(Job):
extractor = self.extractor
sleep = util.build_duration_func(
extractor.config("sleep-extractor"))
- if sleep:
+ if sleep is not None:
extractor.sleep(sleep(), "extractor")
# collect data
try:
- for msg in extractor:
- self.dispatch(msg)
+ self.dispatch(extractor)
except exception.StopExtraction:
pass
except Exception as exc:
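The dispatch() rewrite formalizes a protocol change visible in every extractor hunk above: messages are uniform (type, url, kwdict) 3-tuples, with Message.Directory gaining a (currently empty) url slot, and dispatch() now consumes the extractor generator directly. It also wires in the new post-level predicates: a Directory message rejected by post-filter/post-range flips `process` to None, and all Url and Queue messages until the next Directory are dropped. Sketch of the stream an extractor now produces:

from gallery_dl.extractor.common import Message

def items(self):  # an extractor's generator under the new protocol
    post = {"id": 1, "title": "example"}
    yield Message.Directory, "", post                   # opens a "post"
    yield Message.Url, "https://e.example/1.jpg", post  # skipped entirely
    yield Message.Url, "https://e.example/2.jpg", post  # if pred_post fails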
diff --git a/gallery_dl/option.py b/gallery_dl/option.py
index 05cc9d3..a47d8cd 100644
--- a/gallery_dl/option.py
+++ b/gallery_dl/option.py
@@ -698,10 +698,15 @@ def build_parser():
"(e.g. '5', '8-20', or '1:24:3')"),
)
selection.add_argument(
+ "--post-range",
+ dest="post-range", metavar="RANGE", action=ConfigAction,
+ help=("Like '--range', but for posts"),
+ )
+ selection.add_argument(
"--chapter-range",
dest="chapter-range", metavar="RANGE", action=ConfigAction,
- help=("Like '--range', but applies to manga chapters "
- "and other delegated URLs"),
+ help=("Like '--range', but for child extractors handling "
+ "manga chapters, external URLs, etc."),
)
selection.add_argument(
"--filter",
@@ -713,10 +718,15 @@ def build_parser():
"rating in ('s', 'q')\""),
)
selection.add_argument(
+ "--post-filter",
+ dest="post-filter", metavar="EXPR", action=ConfigAction,
+ help=("Like '--filter', but for posts"),
+ )
+ selection.add_argument(
"--chapter-filter",
dest="chapter-filter", metavar="EXPR", action=ConfigAction,
- help=("Like '--filter', but applies to manga chapters "
- "and other delegated URLs"),
+ help=("Like '--filter', but for child extractors handling "
+ "manga chapters, external URLs, etc."),
)
infojson = {
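These mirror the existing chapter-level selection at post granularity. Illustrative invocations (the available metadata fields depend on the extractor): gallery-dl --post-range 1-5 URL stops after the first five posts of a thread or feed, and gallery-dl --post-filter "date >= datetime(2025, 1, 1)" URL keeps only posts whose metadata passes the expression.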
diff --git a/gallery_dl/output.py b/gallery_dl/output.py
index 9e0888b..fe7235e 100644
--- a/gallery_dl/output.py
+++ b/gallery_dl/output.py
@@ -89,6 +89,11 @@ class LoggerAdapter():
self.logger = logger
self.extra = job._logger_extra
+ def traceback(self, exc):
+ if self.logger.isEnabledFor(logging.DEBUG):
+ self.logger._log(
+ logging.DEBUG, "", None, exc_info=exc, extra=self.extra)
+
def debug(self, msg, *args, **kwargs):
if self.logger.isEnabledFor(logging.DEBUG):
kwargs["extra"] = self.extra
@@ -171,6 +176,48 @@ class Formatter(logging.Formatter):
return msg
+class FileHandler(logging.StreamHandler):
+ def __init__(self, path, mode, encoding, delay=True):
+ self.path = path
+ self.mode = mode
+ self.errors = None
+ self.encoding = encoding
+
+ if delay:
+ logging.Handler.__init__(self)
+ self.stream = None
+ self.emit = self.emit_delayed
+ else:
+ logging.StreamHandler.__init__(self, self._open())
+
+ def close(self):
+ with self.lock:
+ try:
+ if self.stream:
+ try:
+ self.flush()
+ self.stream.close()
+ finally:
+ self.stream = None
+ finally:
+ logging.StreamHandler.close(self)
+
+ def _open(self):
+ try:
+ return open(self.path, self.mode,
+ encoding=self.encoding, errors=self.errors)
+ except FileNotFoundError:
+ os.makedirs(os.path.dirname(self.path))
+ return open(self.path, self.mode,
+ encoding=self.encoding, errors=self.errors)
+
+ def emit_delayed(self, record):
+ if self.mode != "w" or not self._closed:
+ self.stream = self._open()
+ self.emit = logging.StreamHandler.emit.__get__(self)
+ self.emit(record)
+
+
def initialize_logging(loglevel):
"""Setup basic logging functionality before configfiles have been loaded"""
# convert levelnames to lowercase
@@ -242,7 +289,8 @@ def configure_logging(loglevel):
root.setLevel(minlevel)
-def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL, mode="w"):
+def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL, mode="w",
+ defer=False):
"""Setup a new logging handler"""
opts = config.interpolate(("output",), key)
if not opts:
@@ -253,12 +301,10 @@ def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL, mode="w"):
path = opts.get("path")
mode = opts.get("mode", mode)
encoding = opts.get("encoding", "utf-8")
+ delay = opts.get("defer", defer)
try:
path = util.expand_path(path)
- handler = logging.FileHandler(path, mode, encoding)
- except FileNotFoundError:
- os.makedirs(os.path.dirname(path))
- handler = logging.FileHandler(path, mode, encoding)
+ handler = FileHandler(path, mode, encoding, delay)
except (OSError, ValueError) as exc:
logging.getLogger("gallery-dl").warning(
"%s: %s", key, exc)
diff --git a/gallery_dl/path.py b/gallery_dl/path.py
index 763fb55..be2dcc9 100644
--- a/gallery_dl/path.py
+++ b/gallery_dl/path.py
@@ -31,6 +31,8 @@ class PathFormat():
if kwdefault is None:
kwdefault = util.NONE
+ self.filename_conditions = self.directory_conditions = None
+
filename_fmt = config("filename")
try:
if filename_fmt is None:
@@ -41,7 +43,6 @@ class PathFormat():
formatter.parse(fmt, kwdefault).format_map)
for expr, fmt in filename_fmt.items() if expr
]
- self.build_filename = self.build_filename_conditional
filename_fmt = filename_fmt.get("", extractor.filename_fmt)
self.filename_formatter = formatter.parse(
@@ -50,7 +51,6 @@ class PathFormat():
raise exception.FilenameFormatError(exc)
directory_fmt = config("directory")
- self.directory_conditions = ()
try:
if directory_fmt is None:
directory_fmt = extractor.directory_fmt
@@ -62,7 +62,6 @@ class PathFormat():
])
for expr, fmts in directory_fmt.items() if expr
]
- self.build_directory = self.build_directory_conditional
directory_fmt = directory_fmt.get("", extractor.directory_fmt)
self.directory_formatters = [
@@ -160,8 +159,12 @@ class PathFormat():
def exists(self):
"""Return True if the file exists on disk"""
- if self.extension and os.path.exists(self.realpath):
- return self.check_file()
+ if self.extension:
+ try:
+ os.lstat(self.realpath) # raises OSError if file doesn't exist
+ return self.check_file()
+ except OSError:
+ pass
return False
def check_file(self):
@@ -174,7 +177,7 @@ class PathFormat():
prefix = format(num) + "."
self.kwdict["extension"] = prefix + self.extension
self.build_path()
- os.stat(self.realpath) # raises OSError if file doesn't exist
+ os.lstat(self.realpath) # raises OSError if file doesn't exist
num += 1
except OSError:
pass
@@ -252,55 +255,47 @@ class PathFormat():
def build_filename(self, kwdict):
"""Apply 'kwdict' to filename format string"""
try:
- return self.clean_path(self.clean_segment(
- self.filename_formatter(kwdict)))
- except Exception as exc:
- raise exception.FilenameFormatError(exc)
-
- def build_filename_conditional(self, kwdict):
- try:
- for condition, fmt in self.filename_conditions:
- if condition(kwdict):
- break
- else:
+ if self.filename_conditions is None:
fmt = self.filename_formatter
+ else:
+ for condition, fmt in self.filename_conditions:
+ if condition(kwdict):
+ break
+ else:
+ fmt = self.filename_formatter
return self.clean_path(self.clean_segment(fmt(kwdict)))
except Exception as exc:
raise exception.FilenameFormatError(exc)
def build_directory(self, kwdict):
"""Apply 'kwdict' to directory format strings"""
- segments = []
- strip = self.strip
-
try:
- for fmt in self.directory_formatters:
- segment = fmt(kwdict).strip()
- if strip and segment not in {".", ".."}:
- # remove trailing dots and spaces (#647)
- segment = segment.rstrip(strip)
- if segment:
- segments.append(self.clean_segment(segment))
- return segments
- except Exception as exc:
- raise exception.DirectoryFormatError(exc)
-
- def build_directory_conditional(self, kwdict):
- segments = []
- strip = self.strip
-
- try:
- for condition, formatters in self.directory_conditions:
- if condition(kwdict):
- break
- else:
+ if self.directory_conditions is None:
formatters = self.directory_formatters
+ else:
+ for condition, formatters in self.directory_conditions:
+ if condition(kwdict):
+ break
+ else:
+ formatters = self.directory_formatters
+
+ segments = []
+ strip = self.strip
for fmt in formatters:
- segment = fmt(kwdict).strip()
- if strip and segment != "..":
- segment = segment.rstrip(strip)
- if segment:
- segments.append(self.clean_segment(segment))
+ segment = fmt(kwdict)
+ if segment.__class__ is str:
+ segment = segment.strip()
+ if strip and segment not in {".", ".."}:
+ segment = segment.rstrip(strip)
+ if segment:
+ segments.append(self.clean_segment(segment))
+ else: # assume list
+ for segment in segment:
+ segment = segment.strip()
+ if strip and segment not in {".", ".."}:
+ segment = segment.rstrip(strip)
+ if segment:
+ segments.append(self.clean_segment(segment))
return segments
except Exception as exc:
raise exception.DirectoryFormatError(exc)
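
build_filename() and build_directory() previously had separate *_conditional variants that were swapped in during __init__; both are now merged into single methods that branch on whether the conditions attribute is None. In addition, a directory formatter may now return a list, with each element becoming its own cleaned path segment. A sketch of the merged dispatch, with names mirroring the patch:

    def pick_formatter(conditions, default, kwdict):
        # conditions is None -> always the default formatter;
        # otherwise the first matching condition wins, with the
        # default as fall-through
        if conditions is None:
            return default
        for condition, fmt in conditions:
            if condition(kwdict):
                return fmt
        return default

    conds = [(lambda kw: kw.get("extension") == "mp4", "video-fmt")]
    print(pick_formatter(conds, "default-fmt", {"extension": "jpg"}))
    # -> default-fmt
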
@@ -321,7 +316,15 @@ class PathFormat():
self.kwdict["extension"] = self.prefix + self.extension_map(
"part", "part")
self.build_path()
- if part_directory:
+
+ if part_directory is not None:
+ if isinstance(part_directory, list):
+ for condition, part_directory in part_directory:
+ if condition(self.kwdict):
+ break
+ else:
+ return
+
self.temppath = os.path.join(
part_directory,
os.path.basename(self.temppath),
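
"part-directory" values can now be conditional as well: after processing, the option is a list of (condition, directory) pairs, the first true condition selects the directory, and no match means no part directory is used at all. The selection logic from the hunk above, in isolation:

    def select_part_directory(part_directory, kwdict):
        if isinstance(part_directory, list):
            for condition, directory in part_directory:
                if condition(kwdict):
                    return directory
            return None  # no match -> keep the default temppath
        return part_directory  # plain string, as before

    conds = [(lambda kw: kw.get("extension") == "mp4", "/tmp/parts")]
    print(select_part_directory(conds, {"extension": "mp4"}))  # /tmp/parts
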
diff --git a/gallery_dl/postprocessor/__init__.py b/gallery_dl/postprocessor/__init__.py
index 1a4ce56..6da0330 100644
--- a/gallery_dl/postprocessor/__init__.py
+++ b/gallery_dl/postprocessor/__init__.py
@@ -33,7 +33,7 @@ def find(name):
cls = None
if name in modules: # prevent unwanted imports
try:
- module = __import__(name, globals(), None, (), 1)
+ module = __import__(name, globals(), None, None, 1)
except ImportError:
pass
else:
diff --git a/gallery_dl/postprocessor/exec.py b/gallery_dl/postprocessor/exec.py
index 3b0ab22..9e2e4df 100644
--- a/gallery_dl/postprocessor/exec.py
+++ b/gallery_dl/postprocessor/exec.py
@@ -22,6 +22,10 @@ else:
from shlex import quote
+def trim(args):
+ return (args.partition(" ") if isinstance(args, str) else args)[0]
+
+
class ExecPP(PostProcessor):
def __init__(self, job, options):
@@ -35,6 +39,7 @@ class ExecPP(PostProcessor):
if options.get("async", False):
self._exec = self._popen
+ self.verbose = options.get("verbose", True)
self.session = False
self.creationflags = 0
if options.get("session"):
@@ -115,11 +120,11 @@ class ExecPP(PostProcessor):
def _exec(self, args, shell):
if retcode := self._popen(args, shell).wait():
self.log.warning("'%s' returned with non-zero exit status (%d)",
- args, retcode)
+ args if self.verbose else trim(args), retcode)
return retcode
def _popen(self, args, shell):
- self.log.debug("Running '%s'", args)
+ self.log.debug("Running '%s'", args if self.verbose else trim(args))
return util.Popen(
args,
shell=shell,
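
The new "verbose" option (default true) controls whether the full command line appears in debug and warning messages; with it disabled, trim() reduces the logged value to the executable name only - everything before the first space for a string command, or the first element of an argument list - presumably to keep file paths embedded in the command out of the logs. trim() as taken from the patch:

    def trim(args):
        # str  -> text before the first space (partition -> 3-tuple)
        # list -> first element, i.e. the executable
        return (args.partition(" ") if isinstance(args, str) else args)[0]

    print(trim("magick {} out.png"))          # magick
    print(trim(["magick", "{}", "out.png"]))  # magick
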
diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py
index 90e6e3d..0017b5b 100644
--- a/gallery_dl/postprocessor/metadata.py
+++ b/gallery_dl/postprocessor/metadata.py
@@ -117,9 +117,15 @@ class MetadataPP(PostProcessor):
self.mtime = options.get("mtime")
self.omode = options.get("open", omode)
self.encoding = options.get("encoding", "utf-8")
+ self.newline = options.get("newline")
self.skip = options.get("skip", False)
self.meta_path = options.get("metadata-path")
+ def open(self, path):
+ return open(path, self.omode,
+ encoding=self.encoding,
+ newline=self.newline)
+
def run(self, pathfmt):
archive = self.archive
if archive and archive.check(pathfmt.kwdict):
@@ -138,11 +144,11 @@ class MetadataPP(PostProcessor):
return
try:
- with open(path, self.omode, encoding=self.encoding) as fp:
+ with self.open(path) as fp:
self.write(fp, pathfmt.kwdict)
except FileNotFoundError:
os.makedirs(directory, exist_ok=True)
- with open(path, self.omode, encoding=self.encoding) as fp:
+ with self.open(path) as fp:
self.write(fp, pathfmt.kwdict)
if archive:
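
The new "newline" option is passed straight through to the newline parameter of Python's built-in open(), now wrapped in a small open() helper. Its main use is line-ending control: with the default (None), "\n" written in text mode is translated to the platform convention (CRLF on Windows), while "\n" or "" suppresses that translation. A minimal illustration:

    # newline="\n" keeps LF line endings even on Windows, where the
    # text-mode default would translate "\n" into "\r\n"
    with open("meta.json", "w", encoding="utf-8", newline="\n") as fp:
        fp.write('{"id": 1}\n')
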
diff --git a/gallery_dl/postprocessor/mtime.py b/gallery_dl/postprocessor/mtime.py
index b1269dd..7d4796e 100644
--- a/gallery_dl/postprocessor/mtime.py
+++ b/gallery_dl/postprocessor/mtime.py
@@ -9,8 +9,7 @@
"""Use metadata as file modification time"""
from .common import PostProcessor
-from .. import text, util, formatter
-from datetime import datetime
+from .. import text, util, dt, formatter
class MtimePP(PostProcessor):
@@ -36,8 +35,8 @@ class MtimePP(PostProcessor):
return
pathfmt.kwdict["_mtime_meta"] = (
- util.datetime_to_timestamp(mtime)
- if isinstance(mtime, datetime) else
+ dt.to_ts(mtime)
+ if isinstance(mtime, dt.datetime) else
text.parse_int(mtime)
)
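
The datetime helpers move from util into the new gallery_dl/dt.py module added in this release (see the diffstat). Assuming dt.to_ts() mirrors the util.datetime_to_timestamp() it replaces here, it converts a naive UTC datetime into a Unix timestamp:

    from datetime import datetime, timedelta

    EPOCH = datetime(1970, 1, 1)
    SECOND = timedelta(seconds=1)

    # presumed equivalent of dt.to_ts(), based on the removed
    # util.datetime_to_timestamp() shown further below in this diff
    def to_ts(d):
        return (d - EPOCH) / SECOND

    print(to_ts(datetime(2025, 1, 1)))  # 1735689600.0
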
diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py
index 1a55e22..3813fae 100644
--- a/gallery_dl/postprocessor/ugoira.py
+++ b/gallery_dl/postprocessor/ugoira.py
@@ -151,7 +151,7 @@ class UgoiraPP(PostProcessor):
"%s: Unable to extract frames from %s (%s: %s)",
pathfmt.kwdict.get("id"), pathfmt.filename,
exc.__class__.__name__, exc)
- return self.log.debug("", exc_info=exc)
+ return self.log.traceback(exc)
if self.convert(pathfmt, tempdir):
if self.delete:
@@ -227,12 +227,12 @@ class UgoiraPP(PostProcessor):
output.stderr_write("\n")
self.log.error("Unable to invoke FFmpeg (%s: %s)",
exc.__class__.__name__, exc)
- self.log.debug("", exc_info=exc)
+ self.log.traceback(exc)
pathfmt.realpath = pathfmt.temppath
except Exception as exc:
output.stderr_write("\n")
self.log.error("%s: %s", exc.__class__.__name__, exc)
- self.log.debug("", exc_info=exc)
+ self.log.traceback(exc)
pathfmt.realpath = pathfmt.temppath
else:
if self.mtime:
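
The repeated self.log.debug("", exc_info=exc) idiom is replaced by a log.traceback() helper. Its implementation is not part of this section of the diff; under the assumption that it simply wraps the old idiom, a hypothetical equivalent would be:

    import logging

    class Logger(logging.Logger):
        def traceback(self, exc):
            # log the full traceback of 'exc' at DEBUG level,
            # matching the call sites it replaces above
            return self.debug("", exc_info=exc)

    logging.setLoggerClass(Logger)
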
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
index 98bba48..5b074d9 100644
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -8,10 +8,7 @@
"""Collection of functions that work on strings/text"""
-import sys
import html
-import time
-import datetime
import urllib.parse
import re as re_module
@@ -113,9 +110,27 @@ def nameext_from_url(url, data=None):
filename = unquote(filename_from_url(url))
name, _, ext = filename.rpartition(".")
if name and len(ext) <= 16:
- data["filename"], data["extension"] = name, ext.lower()
+ data["filename"] = name
+ data["extension"] = ext.lower()
else:
- data["filename"], data["extension"] = filename, ""
+ data["filename"] = filename
+ data["extension"] = ""
+
+ return data
+
+
+def nameext_from_name(filename, data=None):
+ """Extract the last part of an URL and fill 'data' accordingly"""
+ if data is None:
+ data = {}
+
+ name, _, ext = filename.rpartition(".")
+ if name and len(ext) <= 16:
+ data["filename"] = name
+ data["extension"] = ext.lower()
+ else:
+ data["filename"] = filename
+ data["extension"] = ""
return data
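
nameext_from_name() applies the same splitting rule as nameext_from_url(), but to a plain filename, skipping URL parsing and percent-decoding. Expected behavior, as implied by the code above:

    from gallery_dl import text

    text.nameext_from_name("photo.JPG")
    # -> {'filename': 'photo', 'extension': 'jpg'}
    text.nameext_from_name("archive.tar.gz")
    # -> {'filename': 'archive.tar', 'extension': 'gz'}
    text.nameext_from_name("README")  # no dot, no extension
    # -> {'filename': 'README', 'extension': ''}
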
@@ -322,46 +337,6 @@ def build_query(params):
])
-if sys.hexversion < 0x30c0000:
- # Python <= 3.11
- def parse_timestamp(ts, default=None):
- """Create a datetime object from a Unix timestamp"""
- try:
- return datetime.datetime.utcfromtimestamp(int(ts))
- except Exception:
- return default
-else:
- # Python >= 3.12
- def parse_timestamp(ts, default=None):
- """Create a datetime object from a Unix timestamp"""
- try:
- Y, m, d, H, M, S, _, _, _ = time.gmtime(int(ts))
- return datetime.datetime(Y, m, d, H, M, S)
- except Exception:
- return default
-
-
-def parse_datetime(date_string, format="%Y-%m-%dT%H:%M:%S%z", utcoffset=0):
- """Create a datetime object by parsing 'date_string'"""
- try:
- d = datetime.datetime.strptime(date_string, format)
- o = d.utcoffset()
- if o is not None:
- # convert to naive UTC
- d = d.replace(tzinfo=None, microsecond=0) - o
- else:
- if d.microsecond:
- d = d.replace(microsecond=0)
- if utcoffset:
- # apply manual UTC offset
- d += datetime.timedelta(0, utcoffset * -3600)
- return d
- except (TypeError, IndexError, KeyError):
- return None
- except (ValueError, OverflowError):
- return date_string
-
-
urljoin = urllib.parse.urljoin
quote = urllib.parse.quote
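
parse_timestamp() and parse_datetime() are removed from text.py; given the new gallery_dl/dt.py module in the diffstat and the dt.* calls introduced elsewhere in this diff, they presumably live there now. The version split being dropped here existed because datetime.datetime.utcfromtimestamp() is deprecated as of Python 3.12; the removed 3.12+ branch built the naive UTC datetime from time.gmtime() instead:

    import time
    from datetime import datetime

    def parse_timestamp(ts, default=None):
        """Create a naive UTC datetime from a Unix timestamp"""
        try:
            Y, m, d, H, M, S, _, _, _ = time.gmtime(int(ts))
            return datetime(Y, m, d, H, M, S)
        except Exception:
            return default

    print(parse_timestamp(0))  # 1970-01-01 00:00:00
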
diff --git a/gallery_dl/update.py b/gallery_dl/update.py
index 273ca18..e51a4b3 100644
--- a/gallery_dl/update.py
+++ b/gallery_dl/update.py
@@ -212,5 +212,5 @@ class UpdateExtractor(Extractor):
url = (f"{self.root}/{path_repo}/releases/download"
f"/{data['tag_name']}/{binary_name}")
- yield Message.Directory, data
+ yield Message.Directory, "", data
yield Message.Url, url, data
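
The added empty string gives Directory messages the same three-element shape as Url messages: (message type, URL slot, metadata dict). A hypothetical extractor sketch under that assumption:

    from gallery_dl.extractor.message import Message

    def items(self):
        # method sketch; 'data' and the URL are illustrative values
        data = {"tag_name": "v1.31.1"}
        yield Message.Directory, "", data  # URL slot unused here
        yield Message.Url, "https://example.org/file.bin", data
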
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 49c1ba8..7d54d4c 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -16,7 +16,6 @@ import random
import getpass
import hashlib
import binascii
-import datetime
import functools
import itertools
import subprocess
@@ -24,7 +23,7 @@ import collections
import urllib.parse
from http.cookiejar import Cookie
from email.utils import mktime_tz, parsedate_tz
-from . import text, version, exception
+from . import text, dt, version, exception
def bencode(num, alphabet="0123456789"):
@@ -228,63 +227,6 @@ def to_string(value):
return str(value)
-def to_datetime(value):
- """Convert 'value' to a datetime object"""
- if not value:
- return EPOCH
-
- if isinstance(value, datetime.datetime):
- return value
-
- if isinstance(value, str):
- try:
- if value[-1] == "Z":
- # compat for Python < 3.11
- value = value[:-1]
- dt = datetime.datetime.fromisoformat(value)
- if dt.tzinfo is None:
- if dt.microsecond:
- dt = dt.replace(microsecond=0)
- else:
- # convert to naive UTC
- dt = dt.astimezone(datetime.timezone.utc).replace(
- microsecond=0, tzinfo=None)
- return dt
- except Exception:
- pass
-
- return text.parse_timestamp(value, EPOCH)
-
-
-def datetime_to_timestamp(dt):
- """Convert naive UTC datetime to Unix timestamp"""
- return (dt - EPOCH) / SECOND
-
-
-def datetime_to_timestamp_string(dt):
- """Convert naive UTC datetime to Unix timestamp string"""
- try:
- return str((dt - EPOCH) // SECOND)
- except Exception:
- return ""
-
-
-if sys.hexversion < 0x30c0000:
- # Python <= 3.11
- datetime_utcfromtimestamp = datetime.datetime.utcfromtimestamp
- datetime_utcnow = datetime.datetime.utcnow
- datetime_from_timestamp = datetime_utcfromtimestamp
-else:
- # Python >= 3.12
- def datetime_from_timestamp(ts=None):
- """Convert Unix timestamp to naive UTC datetime"""
- Y, m, d, H, M, S, _, _, _ = time.gmtime(ts)
- return datetime.datetime(Y, m, d, H, M, S)
-
- datetime_utcfromtimestamp = datetime_from_timestamp
- datetime_utcnow = datetime_from_timestamp
-
-
def json_default(obj):
if isinstance(obj, CustomNone):
return None
@@ -379,7 +321,7 @@ def extract_headers(response):
text.nameext_from_url(name, data)
if hlm := headers.get("last-modified"):
- data["date"] = datetime.datetime(*parsedate_tz(hlm)[:6])
+ data["date"] = dt.datetime(*parsedate_tz(hlm)[:6])
return data
@@ -751,11 +693,11 @@ class Flags():
# 735506 == 739342 - 137 * 28
# v135.0 release of Chrome on 2025-04-01 has ordinal 739342
# 735562 == 739342 - 135 * 28
-# _ord_today = datetime.date.today().toordinal()
+# _ord_today = dt.date.today().toordinal()
# _ff_ver = (_ord_today - 735506) // 28
# _ch_ver = (_ord_today - 735562) // 28
-_ff_ver = (datetime.date.today().toordinal() - 735506) // 28
+_ff_ver = (dt.date.today().toordinal() - 735506) // 28
# _ch_ver = _ff_ver - 2
re = text.re
@@ -763,8 +705,6 @@ re_compile = text.re_compile
NONE = CustomNone()
FLAGS = Flags()
-EPOCH = datetime.datetime(1970, 1, 1)
-SECOND = datetime.timedelta(0, 1)
WINDOWS = (os.name == "nt")
SENTINEL = object()
EXECUTABLE = getattr(sys, "frozen", False)
@@ -786,8 +726,8 @@ GLOBALS = {
"contains" : contains,
"parse_int": text.parse_int,
"urlsplit" : urllib.parse.urlsplit,
- "datetime" : datetime.datetime,
- "timedelta": datetime.timedelta,
+ "datetime" : dt.datetime,
+ "timedelta": dt.timedelta,
"abort" : raises(exception.StopExtraction),
"error" : raises(exception.AbortExtraction),
"terminate": raises(exception.TerminateExtraction),
@@ -1071,6 +1011,8 @@ class RangePredicate():
if isinstance(rangespec, str):
rangespec = rangespec.split(",")
+ elif isinstance(rangespec, int):
+ rangespec = (str(rangespec),)
for group in rangespec:
if not group:
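
RangePredicate now also accepts a bare integer rangespec, normalizing it to a one-element tuple of its string form - presumably so config values such as "image-range": 3 work without quoting. The normalization in isolation:

    rangespec = 3
    if isinstance(rangespec, str):
        rangespec = rangespec.split(",")
    elif isinstance(rangespec, int):
        rangespec = (str(rangespec),)
    print(rangespec)  # ('3',)
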
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index bc70f74..0dcb01a 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,5 +6,5 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.30.10"
+__version__ = "1.31.1"
__variant__ = None
diff --git a/gallery_dl/ytdl.py b/gallery_dl/ytdl.py
index b7ee1ca..a4d8097 100644
--- a/gallery_dl/ytdl.py
+++ b/gallery_dl/ytdl.py
@@ -55,6 +55,8 @@ def construct_YoutubeDL(module, obj, user_opts, system_opts=None):
opts["min_filesize"] = text.parse_bytes(config("filesize-min"), None)
if opts.get("max_filesize") is None:
opts["max_filesize"] = text.parse_bytes(config("filesize-max"), None)
+ if opts.get("overwrites") is None and not config("skip", True):
+ opts["overwrites"] = True
if opts.get("ratelimit") is None:
if rate := config("rate"):
func = util.build_selection_func(rate, 0, text.parse_bytes)
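
When the user disables gallery-dl's "skip" option and has not set yt-dlp's "overwrites" option explicitly, overwriting is now enabled, mirroring skip=false semantics in the embedded downloader. The mapping in isolation, with a dict standing in for gallery-dl's real config() lookup:

    opts = {}
    config = {"skip": False}.get  # stand-in for the real config() lookup

    if opts.get("overwrites") is None and not config("skip", True):
        opts["overwrites"] = True
    print(opts)  # {'overwrites': True}
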
@@ -262,7 +264,7 @@ def parse_command_line(module, argv):
else module.match_filter_func(opts.match_filter))
if cookiesfrombrowser := getattr(opts, "cookiesfrombrowser", None):
- pattern = util.re(r"""(?x)
+ pattern = text.re(r"""(?x)
(?P<name>[^+:]+)
(?:\s*\+\s*(?P<keyring>[^:]+))?
(?:\s*:\s*(?!:)(?P<profile>.+?))?
@@ -528,7 +530,7 @@ def legacy_postprocessors(opts, module, ytdlp, compat_opts):
if len(dur) == 2 and all(t is not None for t in dur):
remove_ranges.append(tuple(dur))
continue
- remove_chapters_patterns.append(util.re(regex))
+ remove_chapters_patterns.append(text.re(regex))
if opts.remove_chapters or sponsorblock_query:
postprocessors.append({
"key": "ModifyChapters",