-rw-r--r--  CHANGELOG.md                          |  31
-rw-r--r--  PKG-INFO                              |   6
-rw-r--r--  README.rst                            |   4
-rw-r--r--  data/man/gallery-dl.1                 |   2
-rw-r--r--  data/man/gallery-dl.conf.5            |  57
-rw-r--r--  docs/gallery-dl.conf                  |  10
-rw-r--r--  gallery_dl.egg-info/PKG-INFO          |   6
-rw-r--r--  gallery_dl.egg-info/SOURCES.txt       |   3
-rw-r--r--  gallery_dl/__init__.py                |  80
-rw-r--r--  gallery_dl/extractor/2chen.py         |  99
-rw-r--r--  gallery_dl/extractor/8chan.py         | 172
-rw-r--r--  gallery_dl/extractor/__init__.py      |   3
-rw-r--r--  gallery_dl/extractor/artstation.py    |  16
-rw-r--r--  gallery_dl/extractor/common.py        |  13
-rw-r--r--  gallery_dl/extractor/danbooru.py      |  17
-rw-r--r--  gallery_dl/extractor/deviantart.py    |  10
-rw-r--r--  gallery_dl/extractor/directlink.py    |  10
-rw-r--r--  gallery_dl/extractor/fanbox.py        |  23
-rw-r--r--  gallery_dl/extractor/generic.py       |   6
-rw-r--r--  gallery_dl/extractor/hitomi.py        |   8
-rw-r--r--  gallery_dl/extractor/imagefap.py      | 109
-rw-r--r--  gallery_dl/extractor/instagram.py     | 105
-rw-r--r--  gallery_dl/extractor/nana.py          | 115
-rw-r--r--  gallery_dl/extractor/nijie.py         |  49
-rw-r--r--  gallery_dl/extractor/nozomi.py        |  14
-rw-r--r--  gallery_dl/extractor/redgifs.py       |  26
-rw-r--r--  gallery_dl/extractor/tumblr.py        |  21
-rw-r--r--  gallery_dl/extractor/unsplash.py      |   2
-rw-r--r--  gallery_dl/extractor/vk.py            |  33
-rw-r--r--  gallery_dl/extractor/wallhaven.py     |   2
-rw-r--r--  gallery_dl/job.py                     |  21
-rw-r--r--  gallery_dl/path.py                    |   5
-rw-r--r--  gallery_dl/postprocessor/metadata.py  |  18
-rw-r--r--  gallery_dl/util.py                    |  76
-rw-r--r--  gallery_dl/version.py                 |   2
-rw-r--r--  test/test_postprocessor.py            |   1
-rw-r--r--  test/test_results.py                  |   4

37 files changed, 946 insertions, 233 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c83ab91..5901e37 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,36 @@
# Changelog
+## 1.23.3 - 2022-10-15
+### Additions
+- [2chen] add `2chen.moe` extractor ([#2707](https://github.com/mikf/gallery-dl/issues/2707))
+- [8chan] add `thread` and `board` extractors ([#2938](https://github.com/mikf/gallery-dl/issues/2938))
+- [deviantart] add `group` option ([#3018](https://github.com/mikf/gallery-dl/issues/3018))
+- [fanbox] add `content` metadata field ([#3020](https://github.com/mikf/gallery-dl/issues/3020))
+- [instagram] restore `cursor` functionality ([#2991](https://github.com/mikf/gallery-dl/issues/2991))
+- [instagram] restore warnings for private profiles ([#3004](https://github.com/mikf/gallery-dl/issues/3004), [#3045](https://github.com/mikf/gallery-dl/issues/3045))
+- [nana] add `nana` extractors ([#2967](https://github.com/mikf/gallery-dl/issues/2967))
+- [nijie] add `feed` and `followed` extractors ([#3048](https://github.com/mikf/gallery-dl/issues/3048))
+- [tumblr] support `https://www.tumblr.com/BLOGNAME` URLs ([#3034](https://github.com/mikf/gallery-dl/issues/3034))
+- [tumblr] add `offset` option
+- [vk] add `tagged` extractor ([#2997](https://github.com/mikf/gallery-dl/issues/2997))
+- add `path-extended` option ([#3021](https://github.com/mikf/gallery-dl/issues/3021))
+- emit debug logging messages before calling time.sleep() ([#2982](https://github.com/mikf/gallery-dl/issues/2982))
+### Changes
+- [postprocessor:metadata] assume `"mode": "custom"` when `format` is given
+### Fixes
+- [artstation] skip missing projects ([#3016](https://github.com/mikf/gallery-dl/issues/3016))
+- [danbooru] fix ugoira metadata extraction ([#3056](https://github.com/mikf/gallery-dl/issues/3056))
+- [deviantart] fix `deviation` extraction ([#2981](https://github.com/mikf/gallery-dl/issues/2981))
+- [hitomi] fall back to `webp` when selected format is not available ([#3030](https://github.com/mikf/gallery-dl/issues/3030))
+- [imagefap] fix and improve folder extraction and gallery pagination ([#3013](https://github.com/mikf/gallery-dl/issues/3013))
+- [instagram] fix login ([#3011](https://github.com/mikf/gallery-dl/issues/3011), [#3015](https://github.com/mikf/gallery-dl/issues/3015))
+- [nozomi] fix extraction ([#3051](https://github.com/mikf/gallery-dl/issues/3051))
+- [redgifs] fix extraction ([#3037](https://github.com/mikf/gallery-dl/issues/3037))
+- [tumblr] sleep between fallback retries ([#2957](https://github.com/mikf/gallery-dl/issues/2957))
+- [vk] unescape error messages
+- fix duplicated metadata bug with `-j` ([#3033](https://github.com/mikf/gallery-dl/issues/3033))
+- fix bug when processing input file comments ([#2808](https://github.com/mikf/gallery-dl/issues/2808))
+
## 1.23.2 - 2022-10-01
### Additions
- [artstation] support search filters ([#2970](https://github.com/mikf/gallery-dl/issues/2970))
diff --git a/PKG-INFO b/PKG-INFO
index aea8d49..2ecb797 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gallery_dl
-Version: 1.23.2
+Version: 1.23.3
Summary: Command-line program to download image galleries and collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -99,8 +99,8 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.2/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.2/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.3/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.3/gallery-dl.bin>`__
| Executables built from the latest commit can be found at
| https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
diff --git a/README.rst b/README.rst
index 5676a0e..1457efc 100644
--- a/README.rst
+++ b/README.rst
@@ -66,8 +66,8 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.2/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.2/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.3/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.3/gallery-dl.bin>`__
| Executables built from the latest commit can be found at
| https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index c7051c2..cca3dee 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2022-10-01" "1.23.2" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2022-10-15" "1.23.3" "gallery-dl Manual"
.\" disable hyphenation
.nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index 14db723..1c484b6 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2022-10-01" "1.23.2" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2022-10-15" "1.23.3" "gallery-dl Manual"
.\" disable hyphenation
.nh
.\" disable justification (adjust text to left margin only)
@@ -310,6 +310,18 @@ depending on the local operating system
* \f[I]"windows"\f[]: \f[I]". "\f[]
+.SS extractor.*.path-extended
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+On Windows, use \f[I]extended-length paths\f[]
+prefixed with \f[I]\\\\?\\\f[] to work around the 260-character path length limit.
+
+
.SS extractor.*.extension-map
.IP "Type:" 6
\f[I]object\f[]
@@ -1211,6 +1223,18 @@ Note: Gathering this information requires a lot of API calls.
Use with caution.
+.SS extractor.deviantart.group
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+Check whether the profile name in a given URL
+belongs to a group or a regular user.
+
+
.SS extractor.deviantart.include
.IP "Type:" 6
\f[I]string\f[] or \f[I]list\f[] of \f[I]strings\f[]
@@ -1974,6 +1998,18 @@ Fetch media from replies to other posts.
Also emit metadata for text-only posts without media content.
+.SS extractor.nana.favkey
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]null\f[]
+
+.IP "Description:" 4
+Your \f[I]Nana Favorite Key\f[],
+used to access your favorite archives.
+
+
.SS extractor.newgrounds.flash
.IP "Type:" 6
\f[I]bool\f[]
@@ -2610,6 +2646,19 @@ images from them.
Search posts for inline images and videos.
+.SS extractor.tumblr.offset
+.IP "Type:" 6
+\f[I]integer\f[]
+
+.IP "Default:" 9
+\f[I]0\f[]
+
+.IP "Description:" 4
+Custom \f[I]offset\f[] starting value when paginating over blog posts.
+
+Allows skipping over posts without having to waste API calls.
+
+
.SS extractor.tumblr.original
.IP "Type:" 6
\f[I]bool\f[]
@@ -3474,9 +3523,13 @@ Disable the use of a proxy by explicitly setting this option to \f[I]null\f[].
\f[I]true\f[]
.IP "Description:" 4
-Check the file headers of \f[I]jpg\f[], \f[I]png\f[], and \f[I]gif\f[] files
+Check file headers of downloaded files
and adjust their filename extensions if they do not match.
+For example, this will change the filename extension (\f[I]{extension}\f[])
+of a file called \f[I]example.png\f[] from \f[I]png\f[] to \f[I]jpg\f[] when said file
+contains JPEG/JFIF data.
+
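As a rough illustration of the kind of check this option describes (a minimal sketch, not gallery-dl's actual downloader code; the helper name and signature table are invented):

```python
# Hypothetical magic-byte sniffing as described for 'adjust-extensions';
# gallery-dl's real implementation lives in its http downloader.
SIGNATURES = {
    b"\xff\xd8\xff": "jpg",               # JPEG/JFIF
    b"\x89PNG\r\n\x1a\n": "png",          # PNG
    b"GIF87a": "gif",                     # GIF (both variants)
    b"GIF89a": "gif",
}

def detect_extension(path):
    """Return the extension implied by a file's header, or None."""
    with open(path, "rb") as fp:
        header = fp.read(8)
    for signature, ext in SIGNATURES.items():
        if header.startswith(signature):
            return ext
    return None
```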
.SS downloader.http.headers
.IP "Type:" 6
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index 1c565ec..e507eb0 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -24,6 +24,8 @@
"path-replace": "_",
"path-remove": "\\u0000-\\u001f\\u007f",
"path-strip": "auto",
+ "path-extended": true,
+
"extension-map": {
"jpeg": "jpg",
"jpe" : "jpg",
@@ -71,10 +73,13 @@
{
"client-id": null,
"client-secret": null,
+ "auto-watch": false,
+ "auto-unwatch": false,
"comments": false,
"extra": false,
"flat": true,
"folders": false,
+ "group": true,
"include": "gallery",
"journals": "html",
"mature": true,
@@ -189,6 +194,10 @@
"format": "original",
"include": "art"
},
+ "nana":
+ {
+ "favkey": null
+ },
"nijie":
{
"username": null,
@@ -288,6 +297,7 @@
"external": false,
"inline": true,
"posts": "all",
+ "offset": 0,
"original": true,
"reblogs": true
},
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index 016840e..c1bfabf 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gallery-dl
-Version: 1.23.2
+Version: 1.23.3
Summary: Command-line program to download image galleries and collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -99,8 +99,8 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.2/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.2/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.3/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.3/gallery-dl.bin>`__
| Executables built from the latest commit can be found at
| https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index 73cc80b..b768d5b 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -40,11 +40,13 @@ gallery_dl/downloader/http.py
gallery_dl/downloader/text.py
gallery_dl/downloader/ytdl.py
gallery_dl/extractor/2chan.py
+gallery_dl/extractor/2chen.py
gallery_dl/extractor/35photo.py
gallery_dl/extractor/3dbooru.py
gallery_dl/extractor/420chan.py
gallery_dl/extractor/4chan.py
gallery_dl/extractor/500px.py
+gallery_dl/extractor/8chan.py
gallery_dl/extractor/8kun.py
gallery_dl/extractor/8muses.py
gallery_dl/extractor/__init__.py
@@ -132,6 +134,7 @@ gallery_dl/extractor/message.py
gallery_dl/extractor/moebooru.py
gallery_dl/extractor/myhentaigallery.py
gallery_dl/extractor/myportfolio.py
+gallery_dl/extractor/nana.py
gallery_dl/extractor/naver.py
gallery_dl/extractor/naverwebtoon.py
gallery_dl/extractor/newgrounds.py
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py
index 7504fa4..b64fa2f 100644
--- a/gallery_dl/__init__.py
+++ b/gallery_dl/__init__.py
@@ -7,7 +7,6 @@
# published by the Free Software Foundation.
import sys
-import json
import logging
from . import version, config, option, output, extractor, job, util, exception
@@ -32,81 +31,6 @@ def progress(urls, pformat):
yield pinfo["url"]
-def parse_inputfile(file, log):
- """Filter and process strings from an input file.
-
- Lines starting with '#' and empty lines will be ignored.
- Lines starting with '-' will be interpreted as a key-value pair separated
- by an '='. where 'key' is a dot-separated option name and 'value' is a
- JSON-parsable value. These configuration options will be applied while
- processing the next URL.
- Lines starting with '-G' are the same as above, except these options will
- be applied for *all* following URLs, i.e. they are Global.
- Everything else will be used as a potential URL.
-
- Example input file:
-
- # settings global options
- -G base-directory = "/tmp/"
- -G skip = false
-
- # setting local options for the next URL
- -filename="spaces_are_optional.jpg"
- -skip = true
-
- https://example.org/
-
- # next URL uses default filename and 'skip' is false.
- https://example.com/index.htm # comment1
- https://example.com/404.htm # comment2
- """
- gconf = []
- lconf = []
-
- for line in file:
- line = line.strip()
-
- if not line or line[0] == "#":
- # empty line or comment
- continue
-
- elif line[0] == "-":
- # config spec
- if len(line) >= 2 and line[1] == "G":
- conf = gconf
- line = line[2:]
- else:
- conf = lconf
- line = line[1:]
-
- key, sep, value = line.partition("=")
- if not sep:
- log.warning("input file: invalid <key>=<value> pair: %s", line)
- continue
-
- try:
- value = json.loads(value.strip())
- except ValueError as exc:
- log.warning("input file: unable to parse '%s': %s", value, exc)
- continue
-
- key = key.strip().split(".")
- conf.append((key[:-1], key[-1], value))
-
- else:
- # url
- if " #" in line:
- line = line.partition(" #")[0].rstrip()
- elif "\t#" in line:
- line = line.partition("\t#")[0].rstrip()
- if gconf or lconf:
- yield util.ExtendedUrl(line, gconf, lconf)
- gconf = []
- lconf = []
- else:
- yield line
-
-
def main():
try:
if sys.stdout and sys.stdout.encoding.lower() != "utf-8":
@@ -275,12 +199,12 @@ def main():
try:
if inputfile == "-":
if sys.stdin:
- urls += parse_inputfile(sys.stdin, log)
+ urls += util.parse_inputfile(sys.stdin, log)
else:
log.warning("input file: stdin is not readable")
else:
with open(inputfile, encoding="utf-8") as file:
- urls += parse_inputfile(file, log)
+ urls += util.parse_inputfile(file, log)
except OSError as exc:
log.warning("input file: %s", exc)
diff --git a/gallery_dl/extractor/2chen.py b/gallery_dl/extractor/2chen.py
new file mode 100644
index 0000000..8fffeb0
--- /dev/null
+++ b/gallery_dl/extractor/2chen.py
@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://2chen.moe/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class _2chenThreadExtractor(Extractor):
+ """Extractor for 2chen threads"""
+ category = "2chen"
+ subcategory = "thread"
+ directory_fmt = ("{category}", "{board}", "{thread} {title}")
+ filename_fmt = "{time} {filename}.{extension}"
+ archive_fmt = "{board}_{thread}_{hash}"
+ root = "https://2chen.moe"
+ pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)/(\d+)"
+ test = (
+ ("https://2chen.moe/jp/303786", {
+ "count": ">= 10",
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.board, self.thread = match.groups()
+
+ def items(self):
+ url = "{}/{}/{}".format(self.root, self.board, self.thread)
+ page = self.request(url, encoding="utf-8").text
+ data = self.metadata(page)
+ yield Message.Directory, data
+ for post in self.posts(page):
+ if not post["url"]:
+ continue
+ post.update(data)
+ post["url"] = self.root + post["url"]
+ post["time"] = text.parse_int(post["date"].timestamp())
+ yield Message.Url, post["url"], text.nameext_from_url(
+ post["filename"], post)
+
+ def metadata(self, page):
+ board, pos = text.extract(page, 'class="board">/', '/<')
+ title = text.extract(page, "<h3>", "</h3>", pos)[0]
+ return {
+ "board" : board,
+ "thread": self.thread,
+ "title" : text.unescape(title),
+ }
+
+ def posts(self, page):
+ """Return iterable with relevant posts"""
+ return map(self.parse, text.extract_iter(
+ page, 'class="glass media', '</article>'))
+
+ def parse(self, post):
+ extr = text.extract_from(post)
+ return {
+ "name" : text.unescape(extr("<span>", "</span>")),
+ "date" : text.parse_datetime(
+ extr("<time", "<").partition(">")[2],
+ "%d %b %Y (%a) %H:%M:%S"
+ ),
+ "no" : extr('href="#p', '"'),
+ "url" : extr('</span><a href="', '"'),
+ "filename": text.unescape(extr('download="', '"')),
+ "hash" : extr('data-hash="', '"'),
+ }
+
+
+class _2chenBoardExtractor(Extractor):
+ """Extractor for 2chen boards"""
+ category = "2chen"
+ subcategory = "board"
+ root = "https://2chen.moe"
+ pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)(?:/catalog)?/?$"
+ test = (
+ ("https://2chen.moe/co/", {
+ "pattern": _2chenThreadExtractor.pattern
+ }),
+ ("https://2chen.moe/co"),
+ ("https://2chen.moe/co/catalog")
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.board = match.group(1)
+
+ def items(self):
+ url = "{}/{}/catalog".format(self.root, self.board)
+ page = self.request(url).text
+ data = {"_extractor": _2chenThreadExtractor}
+ for thread in text.extract_iter(
+ page, '<figure><a href="', '"'):
+ yield Message.Queue, self.root + thread, data
diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py
new file mode 100644
index 0000000..1e020c2
--- /dev/null
+++ b/gallery_dl/extractor/8chan.py
@@ -0,0 +1,172 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://8chan.moe/"""
+
+from .common import Extractor, Message
+from .. import text
+from ..cache import memcache
+from datetime import datetime, timedelta
+import itertools
+
+BASE_PATTERN = r"(?:https?://)?8chan\.(moe|se|cc)"
+
+
+class _8chanExtractor(Extractor):
+ """Base class for 8chan extractors"""
+ category = "8chan"
+ root = "https://8chan.moe"
+
+ def __init__(self, match):
+ self.root = "https://8chan." + match.group(1)
+ Extractor.__init__(self, match)
+
+ @memcache()
+ def _prepare_cookies(self):
+ # fetch captcha cookies
+ # (necessary to download without getting interrupted)
+ now = datetime.utcnow()
+ url = self.root + "/captcha.js"
+ params = {"d": now.strftime("%a %b %d %Y %H:%M:%S GMT+0000 (UTC)")}
+ self.request(url, params=params).content
+
+ # adjust cookies
+ # - remove 'expires' timestamp
+ # - move 'captchaexpiration' value forward by 1 month
+ domain = self.root.rpartition("/")[2]
+ for cookie in self.session.cookies:
+ if cookie.domain.endswith(domain):
+ cookie.expires = None
+ if cookie.name == "captchaexpiration":
+ cookie.value = (now + timedelta(30, 300)).strftime(
+ "%a, %d %b %Y %H:%M:%S GMT")
+
+ return self.session.cookies
+
+
+class _8chanThreadExtractor(_8chanExtractor):
+ """Extractor for 8chan threads"""
+ subcategory = "thread"
+ directory_fmt = ("{category}", "{boardUri}",
+ "{threadId} {subject[:50]}")
+ filename_fmt = "{postId}{num:?-//} {filename[:200]}.{extension}"
+ archive_fmt = "{boardUri}_{postId}_{num}"
+ pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)"
+ test = (
+ ("https://8chan.moe/vhs/res/4.html", {
+ "pattern": r"https://8chan\.moe/\.media/[0-9a-f]{64}\.\w+$",
+ "count": 14,
+ "keyword": {
+ "archived": False,
+ "autoSage": False,
+ "boardDescription": "Film and Cinema",
+ "boardMarkdown": None,
+ "boardName": "Movies",
+ "boardUri": "vhs",
+ "creation": r"re:\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}.\d{3}Z",
+ "cyclic": False,
+ "email": None,
+ "id": "re:^[0-9a-f]{6}$",
+ "locked": False,
+ "markdown": str,
+ "maxFileCount": 5,
+ "maxFileSize": "32.00 MB",
+ "maxMessageLength": 8001,
+ "message": str,
+ "mime": str,
+ "name": "Anonymous",
+ "num": int,
+ "originalName": str,
+ "path": r"re:/.media/[0-9a-f]{64}\.\w+$",
+ "pinned": False,
+ "postId": int,
+ "signedRole": None,
+ "size": int,
+ "threadId": 4,
+ "thumb": r"re:/.media/t_[0-9a-f]{64}$",
+ "uniquePosters": 9,
+ "usesCustomCss": True,
+ "usesCustomJs": False,
+ "wsPort": 8880,
+ "wssPort": 2087,
+ },
+ }),
+ ("https://8chan.se/vhs/res/4.html"),
+ ("https://8chan.cc/vhs/res/4.html"),
+ )
+
+ def __init__(self, match):
+ _8chanExtractor.__init__(self, match)
+ _, self.board, self.thread = match.groups()
+
+ def items(self):
+ # fetch thread data
+ url = "{}/{}/res/{}.".format(self.root, self.board, self.thread)
+ self.session.headers["Referer"] = url + "html"
+ thread = self.request(url + "json").json()
+ thread["postId"] = thread["threadId"]
+ thread["_http_headers"] = {"Referer": url + "html"}
+
+ try:
+ self.session.cookies = self._prepare_cookies()
+ except Exception as exc:
+ self.log.debug("Failed to fetch captcha cookies: %s: %s",
+ exc.__class__.__name__, exc, exc_info=True)
+
+ # download files
+ posts = thread.pop("posts", ())
+ yield Message.Directory, thread
+ for post in itertools.chain((thread,), posts):
+ files = post.pop("files", ())
+ if not files:
+ continue
+ thread.update(post)
+ for num, file in enumerate(files):
+ file.update(thread)
+ file["num"] = num
+ text.nameext_from_url(file["originalName"], file)
+ yield Message.Url, self.root + file["path"], file
+
+
+class _8chanBoardExtractor(_8chanExtractor):
+ """Extractor for 8chan boards"""
+ subcategory = "board"
+ pattern = BASE_PATTERN + r"/([^/?#]+)/(?:(\d+)\.html)?$"
+ test = (
+ ("https://8chan.moe/vhs/"),
+ ("https://8chan.moe/vhs/2.html", {
+ "pattern": _8chanThreadExtractor.pattern,
+ "count": 23,
+ }),
+ ("https://8chan.se/vhs/"),
+ ("https://8chan.cc/vhs/"),
+ )
+
+ def __init__(self, match):
+ _8chanExtractor.__init__(self, match)
+ _, self.board, self.page = match.groups()
+ self.session.headers["Referer"] = self.root + "/"
+
+ def items(self):
+ page = text.parse_int(self.page, 1)
+ url = "{}/{}/{}.json".format(self.root, self.board, page)
+ board = self.request(url).json()
+ threads = board["threads"]
+
+ while True:
+ for thread in threads:
+ thread["_extractor"] = _8chanThreadExtractor
+ url = "{}/{}/res/{}.html".format(
+ self.root, self.board, thread["threadId"])
+ yield Message.Queue, url, thread
+
+ page += 1
+ if page > board["pageCount"]:
+ return
+ url = "{}/{}/{}.json".format(self.root, self.board, page)
+ threads = self.request(url).json()["threads"]
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index fed6998..851f660 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -10,11 +10,13 @@ import re
modules = [
"2chan",
+ "2chen",
"35photo",
"3dbooru",
"420chan",
"4chan",
"500px",
+ "8chan",
"8kun",
"8muses",
"adultempire",
@@ -90,6 +92,7 @@ modules = [
"mememuseum",
"myhentaigallery",
"myportfolio",
+ "nana",
"naver",
"naverwebtoon",
"newgrounds",
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
index 62626a1..14d1e6b 100644
--- a/gallery_dl/extractor/artstation.py
+++ b/gallery_dl/extractor/artstation.py
@@ -76,7 +76,12 @@ class ArtstationExtractor(Extractor):
def get_project_assets(self, project_id):
"""Return all assets associated with 'project_id'"""
url = "{}/projects/{}.json".format(self.root, project_id)
- data = self.request(url).json()
+
+ try:
+ data = self.request(url).json()
+ except exception.HttpError as exc:
+ self.log.warning(exc)
+ return
data["title"] = text.unescape(data["title"])
data["description"] = text.unescape(text.remove_html(
@@ -406,6 +411,10 @@ class ArtstationImageExtractor(ArtstationExtractor):
"options": (("external", True),),
"pattern": "ytdl:https://www.youtube.com/embed/JNFfJtwwrU0",
}),
+ # 404 (#3016)
+ ("https://www.artstation.com/artwork/3q3mXB", {
+ "count": 0,
+ }),
# alternate URL patterns
("https://sungchoi.artstation.com/projects/LQVJr"),
("https://artstn.co/p/LQVJr"),
@@ -419,7 +428,10 @@ class ArtstationImageExtractor(ArtstationExtractor):
def metadata(self):
self.assets = list(ArtstationExtractor.get_project_assets(
self, self.project_id))
- self.user = self.assets[0]["user"]["username"]
+ try:
+ self.user = self.assets[0]["user"]["username"]
+ except IndexError:
+ self.user = ""
return ArtstationExtractor.metadata(self)
def projects(self):
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index f7ee51f..e304717 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -122,8 +122,7 @@ class Extractor():
seconds = (self._interval() -
(time.time() - Extractor.request_timestamp))
if seconds > 0.0:
- self.log.debug("Sleeping for %.5s seconds", seconds)
- time.sleep(seconds)
+ self.sleep(seconds, "request")
while True:
try:
@@ -169,8 +168,9 @@ class Extractor():
self.log.debug("%s (%s/%s)", msg, tries, retries+1)
if tries > retries:
break
- time.sleep(
- max(tries, self._interval()) if self._interval else tries)
+ self.sleep(
+ max(tries, self._interval()) if self._interval else tries,
+ "retry")
tries += 1
raise exception.HttpError(msg, response)
@@ -202,6 +202,11 @@ class Extractor():
self.log.info("Waiting until %s for %s.", isotime, reason)
time.sleep(seconds)
+ def sleep(self, seconds, reason):
+ self.log.debug("Sleeping %.2f seconds (%s)",
+ seconds, reason)
+ time.sleep(seconds)
+
def _get_auth_info(self):
"""Return authentication information as (username, password) tuple"""
username = self.config("username")
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 8c2ed53..c455ce1 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -88,10 +88,7 @@ class DanbooruExtractor(BaseExtractor):
if post["extension"] == "zip":
if self.ugoira:
- post["frames"] = self.request(
- "{}/posts/{}.json?only=pixiv_ugoira_frame_data".format(
- self.root, post["id"])
- ).json()["pixiv_ugoira_frame_data"]["data"]
+ post["frames"] = self._ugoira_frames(post)
post["_http_adjust_extension"] = False
else:
url = post["large_file_url"]
@@ -139,6 +136,18 @@ class DanbooruExtractor(BaseExtractor):
else:
return
+ def _ugoira_frames(self, post):
+ data = self.request("{}/posts/{}.json?only=media_metadata".format(
+ self.root, post["id"])
+ ).json()["media_metadata"]["metadata"]
+
+ ext = data["ZIP:ZipFileName"].rpartition(".")[2]
+ fmt = ("{:>06}." + ext).format
+ delays = data["Ugoira:FrameDelays"]
+ return [{"file": fmt(index), "delay": delay}
+ for index, delay in enumerate(delays)]
+
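To show the shape of the result, here is a hypothetical run of the helper above (the `ZIP:ZipFileName` and `Ugoira:FrameDelays` values are invented, not taken from the API docs):

```python
# Hypothetical media_metadata values:
data = {
    "ZIP:ZipFileName": "000000.jpg",
    "Ugoira:FrameDelays": [100, 100, 150],
}
ext = data["ZIP:ZipFileName"].rpartition(".")[2]   # "jpg"
fmt = ("{:>06}." + ext).format
frames = [{"file": fmt(i), "delay": d}
          for i, d in enumerate(data["Ugoira:FrameDelays"])]
# -> [{'file': '000000.jpg', 'delay': 100},
#     {'file': '000001.jpg', 'delay': 100},
#     {'file': '000002.jpg', 'delay': 150}]
```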
INSTANCES = {
"danbooru": {
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 6897476..cb2aa24 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -72,7 +72,7 @@ class DeviantartExtractor(Extractor):
def items(self):
self.api = DeviantartOAuthAPI(self)
- if self.user:
+ if self.user and self.config("group", True):
profile = self.api.user_profile(self.user)
self.group = not profile
if self.group:
@@ -938,11 +938,11 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
def deviations(self):
url = "{}/{}/{}/{}".format(
self.root, self.user, self.type, self.deviation_id)
- appurl = text.extract(self._limited_request(url).text,
- 'property="da:appurl" content="', '"')[0]
- if not appurl:
+ uuid = text.extract(self._limited_request(url).text,
+ '"deviationUuid\\":\\"', '\\')[0]
+ if not uuid:
raise exception.NotFoundError("deviation")
- return (self.api.deviation(appurl.rpartition("/")[2]),)
+ return (self.api.deviation(uuid),)
class DeviantartScrapsExtractor(DeviantartExtractor):
diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py
index 6ddf2ec..8b90250 100644
--- a/gallery_dl/extractor/directlink.py
+++ b/gallery_dl/extractor/directlink.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2021 Mike Fährmann
+# Copyright 2017-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -19,7 +19,7 @@ class DirectlinkExtractor(Extractor):
archive_fmt = filename_fmt
pattern = (r"(?i)https?://(?P<domain>[^/?#]+)/(?P<path>[^?#]+\."
r"(?:jpe?g|jpe|png|gif|web[mp]|mp4|mkv|og[gmv]|opus))"
- r"(?:\?(?P<query>[^/?#]*))?(?:#(?P<fragment>.*))?$")
+ r"(?:\?(?P<query>[^#]*))?(?:#(?P<fragment>.*))?$")
test = (
(("https://en.wikipedia.org/static/images/project-logos/enwiki.png"), {
"url": "18c5d00077332e98e53be9fed2ee4be66154b88d",
@@ -31,9 +31,9 @@ class DirectlinkExtractor(Extractor):
"keyword": "29dad729c40fb09349f83edafa498dba1297464a",
}),
# more complex example
- ("https://example.org/path/to/file.webm?que=1&ry=2#fragment", {
- "url": "114b8f1415cc224b0f26488ccd4c2e7ce9136622",
- "keyword": "06014abd503e3b2b58aa286f9bdcefdd2ae336c0",
+ ("https://example.org/path/to/file.webm?que=1?&ry=2/#fragment", {
+ "url": "6fb1061390f8aada3db01cb24b51797c7ee42b31",
+ "keyword": "3d7abc31d45ba324e59bc599c3b4862452d5f29c",
}),
# percent-encoded characters
("https://example.org/%27%3C%23/%23%3E%27.jpg?key=%3C%26%3E", {
diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py
index 8481248..f692a90 100644
--- a/gallery_dl/extractor/fanbox.py
+++ b/gallery_dl/extractor/fanbox.py
@@ -68,6 +68,16 @@ class FanboxExtractor(Extractor):
post["html"] = content_body["html"]
if post["type"] == "article":
post["articleBody"] = content_body.copy()
+ if "blocks" in content_body:
+ content = []
+ append = content.append
+ for block in content_body["blocks"]:
+ if "text" in block:
+ append(block["text"])
+ if "links" in block:
+ for link in block["links"]:
+ append(link["url"])
+ post["content"] = "\n".join(content)
post["date"] = text.parse_datetime(post["publishedDatetime"])
post["text"] = content_body.get("text") if content_body else None
@@ -271,6 +281,19 @@ class FanboxPostExtractor(FanboxExtractor):
"hasAdultContent": True
},
}),
+ # 'content' metadata (#3020)
+ ("https://www.fanbox.cc/@official-en/posts/4326303", {
+ "keyword": {
+ "content": r"re:(?s)^Greetings from FANBOX.\n \nAs of Monday, "
+ r"September 5th, 2022, we are happy to announce "
+ r"the start of the FANBOX hashtag event "
+ r"#MySetupTour ! \nAbout the event\nTo join this "
+ r"event .+ \nPlease check this page for further "
+ r"details regarding the Privacy & Terms.\n"
+ r"https://fanbox.pixiv.help/.+/10184952456601\n\n\n"
+ r"Thank you for your continued support of FANBOX.$",
+ },
+ }),
)
def __init__(self, match):
diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py
index bece905..69c07d0 100644
--- a/gallery_dl/extractor/generic.py
+++ b/gallery_dl/extractor/generic.py
@@ -27,9 +27,9 @@ class GenericExtractor(Extractor):
pattern += r"""
(?P<scheme>https?://)? # optional http(s) scheme
(?P<domain>[-\w\.]+) # required domain
- (?P<path>/[^?&#]*)? # optional path
- (?:\?(?P<query>[^/?#]*))? # optional query
- (?:\#(?P<fragment>.*))?$ # optional fragment
+ (?P<path>/[^?#]*)? # optional path
+ (?:\?(?P<query>[^#]*))? # optional query
+ (?:\#(?P<fragment>.*))? # optional fragment
"""
def __init__(self, match):
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index f8b0c3b..cc110aa 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -115,12 +115,16 @@ class HitomiGalleryExtractor(GalleryExtractor):
fmt = self.config("format") or "webp"
if fmt == "original":
- subdomain, fmt, ext = "b", "images", None
+ subdomain, fmt, ext, check = "b", "images", None, False
else:
- subdomain, ext = "a", fmt
+ subdomain, ext, check = "a", fmt, True
result = []
for image in self.info["files"]:
+ if check:
+ if not image.get("has" + fmt):
+ fmt = ext = "webp"
+ check = False
ihash = image["hash"]
idata = text.nameext_from_url(image["name"])
if ext:
diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py
index b1c0e9e..2c899eb 100644
--- a/gallery_dl/extractor/imagefap.py
+++ b/gallery_dl/extractor/imagefap.py
@@ -44,7 +44,9 @@ class ImagefapGalleryExtractor(ImagefapExtractor):
("https://www.imagefap.com/gallery/5486966", {
"pattern": r"https://cdnh?\.imagefap\.com"
r"/images/full/\d+/\d+/\d+\.jpg",
- "keyword": "3e24eace5b09639b881ebd393165862feb46adde",
+ "keyword": "8d2e562df7a0bc9e8eecb9d1bb68d32b4086bf98",
+ "archive": False,
+ "count": 62,
}),
("https://www.imagefap.com/gallery.php?gid=7102714"),
("https://beta.imagefap.com/gallery.php?gid=7102714"),
@@ -73,32 +75,42 @@ class ImagefapGalleryExtractor(ImagefapExtractor):
title, _, descr = descr.partition(" porn picture gallery by ")
uploader, _, tags = descr.partition(" to see hottest ")
+ self._count = text.parse_int(count)
return {
"gallery_id": text.parse_int(self.gid),
"title": text.unescape(title),
"uploader": uploader,
"tags": tags[:-11].split(", "),
- "count": text.parse_int(count),
+ "count": self._count,
}
def get_images(self):
"""Collect image-urls and -metadata"""
- num = 0
url = "{}/photo/{}/".format(self.root, self.image_id)
params = {"gid": self.gid, "idx": 0, "partial": "true"}
+ headers = {
+ "Content-Type": "application/x-www-form-urlencoded",
+ "X-Requested-With": "XMLHttpRequest",
+ "Referer": "{}?pgid=&gid={}&page=0".format(url, self.image_id)
+ }
+
+ num = 0
+ total = self._count
while True:
- pos = 0
- page = self.request(url, params=params).text
- for _ in range(24):
- imgurl, pos = text.extract(page, '<a href="', '"', pos)
- if not imgurl:
- return
+ page = self.request(url, params=params, headers=headers).text
+
+ cnt = 0
+ for image_url in text.extract_iter(page, '<a href="', '"'):
num += 1
- data = text.nameext_from_url(imgurl)
+ cnt += 1
+ data = text.nameext_from_url(image_url)
data["num"] = num
data["image_id"] = text.parse_int(data["filename"])
- yield imgurl, data
- params["idx"] += 24
+ yield image_url, data
+
+ if cnt < 24 and num >= total:
+ return
+ params["idx"] += cnt
class ImagefapImageExtractor(ImagefapExtractor):
@@ -170,40 +182,49 @@ class ImagefapUserExtractor(ImagefapExtractor):
self.user, self.user_id = match.groups()
def items(self):
- for gid, name in self.get_gallery_data():
- url = "{}/gallery/{}".format(self.root, gid)
- data = {
- "gallery_id": text.parse_int(gid),
- "title": text.unescape(name),
- "_extractor": ImagefapGalleryExtractor,
- }
- yield Message.Queue, url, data
-
- def get_gallery_data(self):
- """Yield all gallery_ids of a specific user"""
- folders = self.get_gallery_folders()
- url = "{}/ajax_usergallery_folder.php".format(self.root)
- params = {"userid": self.user_id}
- for folder_id in folders:
- params["id"] = folder_id
- page = self.request(url, params=params).text
-
- pos = 0
- while True:
- gid, pos = text.extract(page, '<a href="/gallery/', '"', pos)
- if not gid:
- break
- name, pos = text.extract(page, "<b>", "<", pos)
- yield gid, name
-
- def get_gallery_folders(self):
- """Create a list of all folder_ids of a specific user"""
+ for folder_id in self.folders():
+ for gallery_id, name in self.galleries(folder_id):
+ url = "{}/gallery/{}".format(self.root, gallery_id)
+ data = {
+ "gallery_id": text.parse_int(gallery_id),
+ "title" : text.unescape(name),
+ "_extractor": ImagefapGalleryExtractor,
+ }
+ yield Message.Queue, url, data
+
+ def folders(self):
+ """Return a list of folder_ids of a specific user"""
if self.user:
url = "{}/profile/{}/galleries".format(self.root, self.user)
else:
url = "{}/usergallery.php?userid={}".format(
self.root, self.user_id)
- page = self.request(url).text
- self.user_id, pos = text.extract(page, '?userid=', '"')
- folders, pos = text.extract(page, ' id="tgl_all" value="', '"', pos)
- return folders.split("|")[:-1]
+
+ response = self.request(url)
+ self.user = response.url.split("/")[-2]
+ folders = text.extract(response.text, ' id="tgl_all" value="', '"')[0]
+ return folders.rstrip("|").split("|")
+
+ def galleries(self, folder_id):
+ """Yield gallery_ids of a folder"""
+ if folder_id == "-1":
+ url = "{}/profile/{}/galleries?folderid=-1".format(
+ self.root, self.user)
+ else:
+ url = "{}/organizer/{}/".format(self.root, folder_id)
+ params = {"page": 0}
+
+ while True:
+ extr = text.extract_from(self.request(url, params=params).text)
+ cnt = 0
+
+ while True:
+ gid = extr('<a href="/gallery/', '"')
+ if not gid:
+ break
+ yield gid, extr("<b>", "<")
+ cnt += 1
+
+ if cnt < 25:
+ break
+ params["page"] += 1
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 425d541..4775613 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -338,6 +338,14 @@ class InstagramExtractor(Extractor):
"username" : user["username"],
"full_name": user["full_name"]})
+ def _init_cursor(self):
+ return self.config("cursor") or None
+
+ def _update_cursor(self, cursor):
+ self.log.debug("Cursor: %s", cursor)
+ self._cursor = cursor
+ return cursor
+
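These two hooks make interrupted runs resumable: `_init_cursor()` seeds pagination from the configured value (e.g. passed via `-o cursor=...`) and `_update_cursor()` logs each new position so it can be fed back in later. A minimal sketch of the control flow, with `api_call` as a hypothetical stand-in for the REST pagination below:

```python
# Sketch of cursor-driven pagination; names mirror the methods above,
# 'api_call' is a hypothetical stand-in for an API request.
def paginate(extr, api_call):
    cursor = extr._init_cursor()            # configured cursor or None
    while True:
        data = api_call(max_id=cursor)
        yield from data["items"]
        if not data.get("more_available"):
            return
        cursor = extr._update_cursor(data["next_max_id"])
```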
class InstagramUserExtractor(InstagramExtractor):
"""Extractor for an Instagram user profile"""
@@ -409,8 +417,8 @@ class InstagramTaggedExtractor(InstagramExtractor):
self.user_id = self.item[3:]
return {"tagged_owner_id": self.user_id}
+ self.user_id = self.api.user_id(self.item)
user = self.api.user(self.item)
- self.user_id = user["id"]
return {
"tagged_owner_id" : user["id"],
@@ -693,7 +701,15 @@ class InstagramRestAPI():
def user_id(self, screen_name):
if screen_name.startswith("id:"):
return screen_name[3:]
- return self.user(screen_name)["id"]
+ user = self.user(screen_name)
+ if user is None:
+ raise exception.AuthorizationError(
+ "Login required to access this profile")
+ if user["is_private"] and not user["followed_by_viewer"]:
+ name = user["username"]
+ s = "" if name.endswith("s") else "s"
+ raise exception.StopExtraction("%s'%s posts are private", name, s)
+ return user["id"]
def user_clips(self, user_id):
endpoint = "/v1/clips/user/"
@@ -741,6 +757,9 @@ class InstagramRestAPI():
def _pagination(self, endpoint, params=None, media=False):
if params is None:
params = {}
+ extr = self.extractor
+ params["max_id"] = extr._init_cursor()
+
while True:
data = self._call(endpoint, params=params)
@@ -752,9 +771,12 @@ class InstagramRestAPI():
if not data.get("more_available"):
return
- params["max_id"] = data["next_max_id"]
+ params["max_id"] = extr._update_cursor(data["next_max_id"])
def _pagination_post(self, endpoint, params):
+ extr = self.extractor
+ params["max_id"] = extr._init_cursor()
+
while True:
data = self._call(endpoint, method="POST", data=params)
@@ -764,9 +786,12 @@ class InstagramRestAPI():
info = data["paging_info"]
if not info.get("more_available"):
return
- params["max_id"] = info["max_id"]
+ params["max_id"] = extr._update_cursor(info["max_id"])
def _pagination_sections(self, endpoint, params):
+ extr = self.extractor
+ params["max_id"] = extr._init_cursor()
+
while True:
info = self._call(endpoint, method="POST", data=params)
@@ -774,19 +799,22 @@ class InstagramRestAPI():
if not info.get("more_available"):
return
- params["max_id"] = info["next_max_id"]
params["page"] = info["next_page"]
+ params["max_id"] = extr._update_cursor(info["next_max_id"])
class InstagramGraphqlAPI():
def __init__(self, extractor):
self.extractor = extractor
- self.user = InstagramRestAPI(extractor).user
self.user_collection = self.user_saved = self.reels_media = \
self.highlights_media = self._login_required
self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode
+ api = InstagramRestAPI(extractor)
+ self.user = api.user
+ self.user_id = api.user_id
+
@staticmethod
def _login_required(_=None):
raise exception.AuthorizationError("Login required")
@@ -824,11 +852,6 @@ class InstagramGraphqlAPI():
return self._pagination(query_hash, variables,
"hashtag", "edge_hashtag_to_media")
- def user_id(self, screen_name):
- if screen_name.startswith("id:"):
- return screen_name[3:]
- return self.user(screen_name)["id"]
-
def user_clips(self, user_id):
query_hash = "bc78b344a68ed16dd5d7f264681c4c76"
variables = {"id": user_id, "first": 50}
@@ -871,9 +894,8 @@ class InstagramGraphqlAPI():
def _pagination(self, query_hash, variables,
key_data="user", key_edge=None):
- cursor = self.extractor.config("cursor")
- if cursor:
- variables["after"] = cursor
+ extr = self.extractor
+ variables["after"] = extr._init_cursor()
while True:
data = self._call(query_hash, variables)[key_data]
@@ -890,35 +912,55 @@ class InstagramGraphqlAPI():
raise exception.StopExtraction(
"%s'%s posts are private", self.item, s)
- variables["after"] = self._cursor = info["end_cursor"]
- self.extractor.log.debug("Cursor: %s", self._cursor)
+ variables["after"] = extr._update_cursor(info["end_cursor"])
-@cache(maxage=360*24*3600, keyarg=1)
+@cache(maxage=90*24*3600, keyarg=1)
def _login_impl(extr, username, password):
extr.log.info("Logging in as %s", username)
+ user_agent = ("Mozilla/5.0 (Linux; Android 13) AppleWebKit/537.36 "
+ "(KHTML, like Gecko) Chrome/106.0.5249.79 Mobile "
+ "Safari/537.36 Instagram 255.1.0.17.102")
+
+ headers = {
+ "User-Agent" : user_agent,
+ "Sec-Fetch-Dest": "document",
+ "Sec-Fetch-Mode": "navigate",
+ "Sec-Fetch-Site": "none",
+ "Sec-Fetch-User": "?1",
+ }
url = extr.root + "/accounts/login/"
- page = extr.request(url).text
+ response = extr.request(url, headers=headers)
+
+ extract = text.extract_from(response.text)
+ csrf_token = extract('"csrf_token":"', '"')
+ device_id = extract('"device_id":"', '"')
+ rollout_hash = extract('"rollout_hash":"', '"')
+
+ cset = extr.session.cookies.set
+ cset("csrftoken", csrf_token, domain=extr.cookiedomain)
+ cset("ig_did", device_id, domain=extr.cookiedomain)
headers = {
- "X-Web-Device-Id" : text.extract(page, '"device_id":"', '"')[0],
+ "User-Agent" : user_agent,
+ "Accept" : "*/*",
+ "X-CSRFToken" : csrf_token,
+ "X-Instagram-AJAX": rollout_hash,
"X-IG-App-ID" : "936619743392459",
- "X-ASBD-ID" : "437806",
+ "X-ASBD-ID" : "198387",
"X-IG-WWW-Claim" : "0",
"X-Requested-With": "XMLHttpRequest",
+ "Origin" : extr.root,
"Referer" : url,
+ "Sec-Fetch-Dest" : "empty",
+ "Sec-Fetch-Mode" : "cors",
+ "Sec-Fetch-Site" : "same-origin",
}
- url = extr.root + "/data/shared_data/"
- data = extr.request(url, headers=headers).json()
-
- headers["X-CSRFToken"] = data["config"]["csrf_token"]
- headers["X-Instagram-AJAX"] = data["rollout_hash"]
- headers["Origin"] = extr.root
data = {
- "username" : username,
- "enc_password" : "#PWD_INSTAGRAM_BROWSER:0:{}:{}".format(
+ "enc_password" : "#PWD_INSTAGRAM_BROWSER:0:{}:{}".format(
int(time.time()), password),
+ "username" : username,
"queryParams" : "{}",
"optIntoOneTap" : "false",
"stopDeletionNonce" : "",
@@ -930,11 +972,8 @@ def _login_impl(extr, username, password):
if not response.json().get("authenticated"):
raise exception.AuthenticationError()
- cget = extr.session.cookies.get
- return {
- name: cget(name)
- for name in ("sessionid", "mid", "ig_did")
- }
+ return {cookie.name: cookie.value
+ for cookie in extr.session.cookies}
def id_from_shortcode(shortcode):
diff --git a/gallery_dl/extractor/nana.py b/gallery_dl/extractor/nana.py
new file mode 100644
index 0000000..6062418
--- /dev/null
+++ b/gallery_dl/extractor/nana.py
@@ -0,0 +1,115 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://nana.my.id/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text, exception
+import json
+
+
+class NanaGalleryExtractor(GalleryExtractor):
+ """Extractor for image galleries from nana.my.id"""
+ category = "nana"
+ directory_fmt = ("{category}", "{title}")
+ pattern = r"(?:https?://)?nana\.my\.id/reader/([^/?#]+)"
+ test = (
+ (("https://nana.my.id/reader/"
+ "059f7de55a4297413bfbd432ce7d6e724dd42bae"), {
+ "pattern": r"https://nana\.my\.id/reader/"
+ r"\w+/image/page\?path=.*\.\w+",
+ "title" : "Everybody Loves Shion",
+ "artist" : "fuzui",
+ "tags" : list,
+ "count" : 29,
+ }),
+ (("https://nana.my.id/reader/"
+ "77c8712b67013e427923573379f5bafcc0c72e46"), {
+ "pattern": r"https://nana\.my\.id/reader/"
+ r"\w+/image/page\?path=.*\.\w+",
+ "title" : "Lovey-Dovey With an Otaku-Friendly Gyaru",
+ "artist" : "Sueyuu",
+ "tags" : ["Sueyuu"],
+ "count" : 58,
+ }),
+ )
+
+ def __init__(self, match):
+ self.gallery_id = match.group(1)
+ url = "https://nana.my.id/reader/" + self.gallery_id
+ GalleryExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ title = text.unescape(
+ text.extract(page, '</a>&nbsp; ', '</div>')[0])
+ artist = text.unescape(text.extract(
+ page, '<title>', '</title>')[0])[len(title):-10]
+ tags = text.extract(page, 'Reader.tags = "', '"')[0]
+
+ return {
+ "gallery_id": self.gallery_id,
+ "title" : title,
+ "artist" : artist[4:] if artist.startswith(" by ") else "",
+ "tags" : tags.split(", ") if tags else (),
+ "lang" : "en",
+ "language" : "English",
+ }
+
+ def images(self, page):
+ data = json.loads(text.extract(page, "Reader.pages = ", ".pages")[0])
+ return [
+ ("https://nana.my.id" + image, None)
+ for image in data["pages"]
+ ]
+
+
+class NanaSearchExtractor(Extractor):
+ """Extractor for nana search results"""
+ category = "nana"
+ subcategory = "search"
+ pattern = r"(?:https?://)?nana\.my\.id(?:/?\?([^#]+))"
+ test = (
+ ('https://nana.my.id/?q=+"elf"&sort=desc', {
+ "pattern": NanaGalleryExtractor.pattern,
+ "range": "1-100",
+ "count": 100,
+ }),
+ ("https://nana.my.id/?q=favorites%3A", {
+ "pattern": NanaGalleryExtractor.pattern,
+ "count": ">= 2",
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.params = text.parse_query(match.group(1))
+ self.params["p"] = text.parse_int(self.params.get("p"), 1)
+ self.params["q"] = self.params.get("q") or ""
+
+ def items(self):
+ if "favorites:" in self.params["q"]:
+ favkey = self.config("favkey")
+ if not favkey:
+ raise exception.AuthenticationError(
+ "'Favorite key' not provided. "
+ "Please see 'https://nana.my.id/tutorial'")
+ self.session.cookies.set("favkey", favkey, domain="nana.my.id")
+
+ data = {"_extractor": NanaGalleryExtractor}
+ while True:
+ try:
+ page = self.request(
+ "https://nana.my.id", params=self.params).text
+ except exception.HttpError:
+ return
+
+ for gallery in text.extract_iter(
+ page, '<div class="id3">', '</div>'):
+ url = "https://nana.my.id" + text.extract(
+ gallery, '<a href="', '"')[0]
+ yield Message.Queue, url, data
+
+ self.params["p"] += 1
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
index 2c8e72c..73911b2 100644
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -326,6 +326,55 @@ class NijieNuitaExtractor(NijieExtractor):
page, "<title>", "さんの抜いた")[0] or "")
+class NijieFeedExtractor(NijieExtractor):
+ """Extractor for nijie liked user feed"""
+ subcategory = "feed"
+ pattern = BASE_PATTERN + r"/like_user_view\.php"
+ test = (
+ ("https://nijie.info/like_user_view.php", {
+ "range": "1-10",
+ "count": 10,
+ }),
+ ("https://horne.red/like_user_view.php"),
+ )
+
+ def image_ids(self):
+ return self._pagination("like_user_view")
+
+ @staticmethod
+ def _extract_user_name(page):
+ return ""
+
+
+class NijiefollowedExtractor(NijieExtractor):
+ """Extractor for followed nijie users"""
+ subcategory = "followed"
+ pattern = BASE_PATTERN + r"/like_my\.php"
+ test = (
+ ("https://nijie.info/like_my.php"),
+ ("https://horne.red/like_my.php"),
+ )
+
+ def items(self):
+ self.login()
+
+ url = self.root + "/like_my.php"
+ params = {"p": 1}
+ data = {"_extractor": NijieUserExtractor}
+
+ while True:
+ page = self.request(url, params=params).text
+
+ for user_id in text.extract_iter(
+ page, '"><a href="/members.php?id=', '"'):
+ user_url = "{}/members.php?id={}".format(self.root, user_id)
+ yield Message.Queue, user_url, data
+
+ if '<a rel="next"' not in page:
+ return
+ params["p"] += 1
+
+
class NijieImageExtractor(NijieExtractor):
"""Extractor for a nijie work/image"""
subcategory = "image"
diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py
index 713330d..f381f12 100644
--- a/gallery_dl/extractor/nozomi.py
+++ b/gallery_dl/extractor/nozomi.py
@@ -62,10 +62,11 @@ class NozomiExtractor(Extractor):
yield Message.Directory, post
for post["num"], image in enumerate(images, 1):
- post["url"] = url = text.urljoin(self.root, image["imageurl"])
- text.nameext_from_url(url, post)
- post["is_video"] = bool(image.get("is_video"))
- post["dataid"] = post["filename"]
+ post["filename"] = post["dataid"] = did = image["dataid"]
+ post["extension"] = ext = image["type"]
+ post["is_video"] = video = bool(image.get("is_video"))
+ post["url"] = url = "https://{}.nozomi.la/{}/{}/{}.{}".format(
+ "v" if video else "i", did[-1], did[-3:-1], did, ext)
yield Message.Url, url, post
def posts(self):
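The rewritten loop builds media URLs directly from `dataid` instead of relying on the `imageurl` field. A worked example of the URL layout, using an invented dataid:

```python
# Invented values, to show the URL scheme only:
did, ext, video = "0123456789abcdef", "jpg", False
url = "https://{}.nozomi.la/{}/{}/{}.{}".format(
    "v" if video else "i", did[-1], did[-3:-1], did, ext)
# -> "https://i.nozomi.la/f/de/0123456789abcdef.jpg"
```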
@@ -109,7 +110,6 @@ class NozomiPostExtractor(NozomiExtractor):
"height" : 768,
"is_video" : False,
"postid" : 3649262,
- "source" : "danbooru",
"tags" : list,
"type" : "jpg",
"url" : str,
@@ -119,7 +119,7 @@ class NozomiPostExtractor(NozomiExtractor):
# multiple images per post
("https://nozomi.la/post/25588032.html", {
"url": "6aa3b7db385abcc9d374bdffd19187bccbf8f228",
- "keyword": "f60e048df36308b6b25dfaac419b586895d360bc",
+ "keyword": "2a2998af93c6438863c4077bd386b613b8bc2957",
"count": 7,
}),
# empty 'date' (#1163)
@@ -160,7 +160,7 @@ class NozomiTagExtractor(NozomiExtractor):
archive_fmt = "t_{search_tags}_{dataid}"
pattern = r"(?:https?://)?nozomi\.la/tag/([^/?#]+)-(\d+)\."
test = ("https://nozomi.la/tag/3:1_aspect_ratio-1.html", {
- "pattern": r"^https://i.nozomi.la/\w/\w\w/\w+\.\w+$",
+ "pattern": r"^https://[iv]\.nozomi\.la/\w/\w\w/\w+\.\w+$",
"count": ">= 25",
"range": "1-25",
})
diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py
index 3a4fb0e..1111c3a 100644
--- a/gallery_dl/extractor/redgifs.py
+++ b/gallery_dl/extractor/redgifs.py
@@ -10,6 +10,7 @@
from .common import Extractor, Message
from .. import text
+from ..cache import cache
class RedgifsExtractor(Extractor):
@@ -88,7 +89,7 @@ class RedgifsSearchExtractor(RedgifsExtractor):
pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/browse/?\?([^#]+)"
test = (
("https://www.redgifs.com/browse?tags=JAV", {
- "pattern": r"https://\w+\.redgifs\.com/[A-Za-z]+\.mp4",
+ "pattern": r"https://\w+\.redgifs\.com/[A-Za-z-]+\.mp4",
"range": "1-10",
"count": 10,
}),
@@ -131,6 +132,12 @@ class RedgifsAPI():
def __init__(self, extractor):
self.extractor = extractor
+ self.headers = {
+ "Referer" : extractor.root + "/",
+ "authorization": "Bearer " + self._fetch_bearer_token(extractor),
+ "content-type" : "application/json",
+ "Origin" : extractor.root,
+ }
def gif(self, gif_id):
endpoint = "/v2/gifs/" + gif_id.lower()
@@ -149,7 +156,8 @@ class RedgifsAPI():
def _call(self, endpoint, params=None):
url = self.API_ROOT + endpoint
- return self.extractor.request(url, params=params).json()
+ return self.extractor.request(
+ url, params=params, headers=self.headers).json()
def _pagination(self, endpoint, params):
params["page"] = 1
@@ -161,3 +169,17 @@ class RedgifsAPI():
if params["page"] >= data["pages"]:
return
params["page"] += 1
+
+ @cache(maxage=3600)
+ def _fetch_bearer_token(self, extr):
+ extr.log.debug("Retrieving Bearer token")
+
+ page = extr.request(extr.root + "/").text
+ index = text.extract(page, "/assets/js/index", ".js")[0]
+
+ url = extr.root + "/assets/js/index" + index + ".js"
+ page = extr.request(url, encoding="utf-8").text
+ token = "ey" + text.extract(page, '="ey', '"')[0]
+
+ extr.log.debug("Token: '%s'", token)
+ return token
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 447ce00..324a3c6 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -17,7 +17,7 @@ import re
BASE_PATTERN = (
r"(?:tumblr:(?:https?://)?([^/]+)|"
r"(?:https?://)?"
- r"(?:www\.tumblr\.com/blog/(?:view/)?([\w-]+)|"
+ r"(?:www\.tumblr\.com/(?:blog/(?:view/)?)?([\w-]+)|"
r"([\w-]+\.tumblr\.com)))"
)
@@ -250,9 +250,9 @@ class TumblrExtractor(Extractor):
return updated, (resized == updated)
def _original_image_fallback(self, url, post_id):
- yield self._update_image_token(url)[0]
- yield self._update_image_token(url)[0]
- yield self._update_image_token(url)[0]
+ for _ in range(3):
+ self.sleep(120, "image token")
+ yield self._update_image_token(url)[0]
self.log.warning("Unable to fetch higher-resolution "
"version of %s (%s)", url, post_id)
@@ -298,6 +298,7 @@ class TumblrUserExtractor(TumblrExtractor):
("tumblr:www.b-authentique.com"),
("https://www.tumblr.com/blog/view/smarties-art"),
("https://www.tumblr.com/blog/smarties-art"),
+ ("https://www.tumblr.com/smarties-art"),
)
def posts(self):
@@ -354,6 +355,8 @@ class TumblrPostExtractor(TumblrExtractor):
}),
("http://demo.tumblr.com/image/459265350"),
("https://www.tumblr.com/blog/view/smarties-art/686047436641353728"),
+ ("https://www.tumblr.com/blog/smarties-art/686047436641353728"),
+ ("https://www.tumblr.com/smarties-art/686047436641353728"),
)
def __init__(self, match):
@@ -381,6 +384,8 @@ class TumblrTagExtractor(TumblrExtractor):
"count": 1,
}),
("https://www.tumblr.com/blog/view/smarties-art/tagged/undertale"),
+ ("https://www.tumblr.com/blog/smarties-art/tagged/undertale"),
+ ("https://www.tumblr.com/smarties-art/tagged/undertale"),
)
def __init__(self, match):
@@ -402,6 +407,8 @@ class TumblrLikesExtractor(TumblrExtractor):
"count": 1,
}),
("https://www.tumblr.com/blog/view/mikf123/likes"),
+ ("https://www.tumblr.com/blog/mikf123/likes"),
+ ("https://www.tumblr.com/mikf123/likes"),
)
def posts(self):
@@ -435,11 +442,15 @@ class TumblrAPI(oauth.OAuth1API):
def posts(self, blog, params):
"""Retrieve published posts"""
- params.update({"offset": 0, "limit": 50, "reblog_info": "true"})
+ params["offset"] = self.extractor.config("offset") or 0
+ params["limit"] = 50
+ params["reblog_info"] = "true"
+
if self.posts_type:
params["type"] = self.posts_type
if self.before:
params["before"] = self.before
+
while True:
data = self._call(blog, "posts", params)
self.BLOG_CACHE[blog] = data["blog"]
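Since the page size stays fixed at 50, a configured offset translates directly into skipped API pages. For instance (hypothetical config value, not a recommendation):

```python
# With "offset": 200 in the tumblr config section, pagination starts
# at post 200, so the first four 50-post API pages are never requested.
params = {"offset": 200, "limit": 50, "reblog_info": "true"}
```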
diff --git a/gallery_dl/extractor/unsplash.py b/gallery_dl/extractor/unsplash.py
index 623ed94..8bea18c 100644
--- a/gallery_dl/extractor/unsplash.py
+++ b/gallery_dl/extractor/unsplash.py
@@ -210,7 +210,7 @@ class UnsplashCollectionExtractor(UnsplashExtractor):
class UnsplashSearchExtractor(UnsplashExtractor):
"""Extractor for unsplash search results"""
subcategory = "search"
- pattern = BASE_PATTERN + r"/s/photos/([^/?#]+)(?:\?([^/?#]+))?"
+ pattern = BASE_PATTERN + r"/s/photos/([^/?#]+)(?:\?([^#]+))?"
test = ("https://unsplash.com/s/photos/hair-style", {
"pattern": r"https://images\.unsplash\.com/((flagged/)?photo-\d+-\w+"
r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$",
diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py
index 25b00fe..9b6831b 100644
--- a/gallery_dl/extractor/vk.py
+++ b/gallery_dl/extractor/vk.py
@@ -79,7 +79,8 @@ class VkExtractor(Extractor):
if len(payload) < 4:
self.log.debug(payload)
- raise exception.AuthorizationError(payload[0])
+ raise exception.AuthorizationError(
+ text.unescape(payload[0]) if payload[0] else None)
total = payload[1]
photos = payload[3]
@@ -103,7 +104,7 @@ class VkPhotosExtractor(VkExtractor):
subcategory = "photos"
pattern = (BASE_PATTERN + r"/(?:"
r"(?:albums|photos|id)(-?\d+)"
- r"|(?!album-?\d+_)([^/?#]+))")
+ r"|(?!(?:album|tag)-?\d+_?)([^/?#]+))")
test = (
("https://vk.com/id398982326", {
"pattern": r"https://sun\d+-\d+\.userapi\.com/s/v1/if1"
@@ -182,9 +183,6 @@ class VkAlbumExtractor(VkExtractor):
directory_fmt = ("{category}", "{user[id]}", "{album[id]}")
pattern = BASE_PATTERN + r"/album(-?\d+)_(\d+)$"
test = (
- ("https://vk.com/album232175027_00", {
- "count": 8,
- }),
("https://vk.com/album-165740836_281339889", {
"count": 12,
}),
@@ -192,6 +190,9 @@ class VkAlbumExtractor(VkExtractor):
("https://vk.com/album-53775183_00", {
"exception": exception.AuthorizationError,
}),
+ ("https://vk.com/album232175027_00", {
+ "exception": exception.AuthorizationError,
+ }),
)
def __init__(self, match):
@@ -207,3 +208,25 @@ class VkAlbumExtractor(VkExtractor):
"user": {"id": self.user_id},
"album": {"id": self.album_id},
}
+
+
+class VkTaggedExtractor(VkExtractor):
+ """Extractor for a vk tagged photos"""
+ subcategory = "tagged"
+ directory_fmt = ("{category}", "{user[id]}", "tags")
+ pattern = BASE_PATTERN + r"/tag(-?\d+)$"
+ test = (
+ ("https://vk.com/tag304303884", {
+ "count": 44,
+ }),
+ )
+
+ def __init__(self, match):
+ VkExtractor.__init__(self, match)
+ self.user_id = match.group(1)
+
+ def photos(self):
+ return self._pagination("tag{}".format(self.user_id))
+
+ def metadata(self):
+ return {"user": {"id": self.user_id}}
diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py
index 0ad8523..47451bd 100644
--- a/gallery_dl/extractor/wallhaven.py
+++ b/gallery_dl/extractor/wallhaven.py
@@ -52,7 +52,7 @@ class WallhavenSearchExtractor(WallhavenExtractor):
subcategory = "search"
directory_fmt = ("{category}", "{search[q]}")
archive_fmt = "s_{search[q]}_{id}"
- pattern = r"(?:https?://)?wallhaven\.cc/search(?:/?\?([^/?#]+))?"
+ pattern = r"(?:https?://)?wallhaven\.cc/search(?:/?\?([^#]+))?"
test = (
("https://wallhaven.cc/search?q=touhou"),
(("https://wallhaven.cc/search?q=id%3A87"
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index 7b22b1d..2f48ffd 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -8,7 +8,6 @@
import sys
import json
-import time
import errno
import logging
import functools
@@ -74,9 +73,10 @@ class Job():
log = extractor.log
msg = None
- sleep = util.build_duration_func(extractor.config("sleep-extractor"))
+ sleep = util.build_duration_func(
+ extractor.config("sleep-extractor"))
if sleep:
- time.sleep(sleep())
+ extractor.sleep(sleep(), "extractor")
try:
for msg in extractor:
@@ -238,7 +238,7 @@ class DownloadJob(Job):
return
if self.sleep:
- time.sleep(self.sleep())
+ self.extractor.sleep(self.sleep(), "download")
# download from URL
if not self.download(url):
@@ -527,11 +527,11 @@ class SimulationJob(DownloadJob):
if not kwdict["extension"]:
kwdict["extension"] = "jpg"
self.pathfmt.set_filename(kwdict)
- self.out.skip(self.pathfmt.path)
if self.sleep:
- time.sleep(self.sleep())
+ self.extractor.sleep(self.sleep(), "download")
if self.archive:
self.archive.add(kwdict)
+ self.out.skip(self.pathfmt.path)
def handle_directory(self, kwdict):
if not self.pathfmt:
@@ -697,17 +697,18 @@ class DataJob(Job):
self.ascii = config.get(("output",), "ascii", ensure_ascii)
private = config.get(("output",), "private")
- self.filter = util.identity if private else util.filter_dict
+ self.filter = dict.copy if private else util.filter_dict
def run(self):
+ extractor = self.extractor
sleep = util.build_duration_func(
- self.extractor.config("sleep-extractor"))
+ extractor.config("sleep-extractor"))
if sleep:
- time.sleep(sleep())
+ extractor.sleep(sleep(), "extractor")
# collect data
try:
- for msg in self.extractor:
+ for msg in extractor:
self.dispatch(msg)
except exception.StopExtraction:
pass
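Editor's note: the job.py changes replace bare time.sleep() calls with a new Extractor.sleep(seconds, reason) helper, so a debug message is emitted before every sleep. The helper itself is added in gallery_dl/extractor/common.py, earlier in this commit; the sketch below shows what it is assumed to do, not the exact upstream code:

    import logging
    import time

    class Extractor:
        """Minimal stand-in for the real class in extractor/common.py"""
        log = logging.getLogger("gallery-dl")

        def sleep(self, seconds, reason):
            # log first, then block, so sleeps show up in --verbose output
            self.log.debug("Sleeping %s seconds (%s)", seconds, reason)
            time.sleep(seconds)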
diff --git a/gallery_dl/path.py b/gallery_dl/path.py
index 84ee7af..28c07c3 100644
--- a/gallery_dl/path.py
+++ b/gallery_dl/path.py
@@ -105,6 +105,9 @@ class PathFormat():
strip = ". "
self.strip = strip
+ if WINDOWS:
+ self.extended = config("path-extended", True)
+
basedir = extractor._parentdir
if not basedir:
basedir = config("base-directory")
@@ -178,7 +181,7 @@ class PathFormat():
else:
self.directory = directory = self.basedirectory
- if WINDOWS:
+ if WINDOWS and self.extended:
# Enable longer-than-260-character paths
directory = os.path.abspath(directory)
if directory.startswith("\\\\"):
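Editor's note: the new `path-extended` option (enabled by default) controls whether the \\?\ extended-length prefix is applied to directory paths on Windows. A sketch of disabling it, assuming the option is looked up through the extractor config chain as the config("path-extended", True) call above suggests:

    from gallery_dl import config

    # opt out of \\?\ extended-length paths on Windows
    # (("extractor",) is the assumed key path, applying to all extractors)
    config.set(("extractor",), "path-extended", False)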
diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py
index d9baed3..b21e483 100644
--- a/gallery_dl/postprocessor/metadata.py
+++ b/gallery_dl/postprocessor/metadata.py
@@ -19,15 +19,9 @@ class MetadataPP(PostProcessor):
def __init__(self, job, options):
PostProcessor.__init__(self, job)
- mode = options.get("mode", "json")
- if mode == "custom":
- self.write = self._write_custom
- cfmt = options.get("content-format") or options.get("format")
- if isinstance(cfmt, list):
- cfmt = "\n".join(cfmt) + "\n"
- self._content_fmt = formatter.parse(cfmt).format_map
- ext = "txt"
- elif mode == "tags":
+ mode = options.get("mode")
+ cfmt = options.get("content-format") or options.get("format")
+ if mode == "tags":
self.write = self._write_tags
ext = "txt"
elif mode == "modify":
@@ -41,6 +35,12 @@ class MetadataPP(PostProcessor):
self.run = self._run_delete
self.fields = options.get("fields")
ext = None
+ elif mode == "custom" or not mode and cfmt:
+ self.write = self._write_custom
+ if isinstance(cfmt, list):
+ cfmt = "\n".join(cfmt) + "\n"
+ self._content_fmt = formatter.parse(cfmt).format_map
+ ext = "txt"
else:
self.write = self._write_json
self.indent = options.get("indent", 4)
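Editor's note: after this reordering, "mode": "custom" no longer needs to be spelled out whenever a format is supplied: an absent mode combined with a content-format/format option now selects the custom writer (an explicit mode such as "json" still wins). The two postprocessor configs below are therefore expected to behave identically; the "{title}" format string is purely illustrative:

    explicit = {"name": "metadata", "mode": "custom", "format": "{title}\n"}
    implicit = {"name": "metadata", "format": "{title}\n"}
    # both now select _write_custom with a .txt extension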
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 4ba1cba..1650b0a 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -418,6 +418,82 @@ CODES = {
}
+def parse_inputfile(file, log):
+ """Filter and process strings from an input file.
+
+ Lines starting with '#' and empty lines will be ignored.
+ Lines starting with '-' will be interpreted as a key-value pair separated
+ by an '=', where 'key' is a dot-separated option name and 'value' is a
+ JSON-parsable value. These configuration options will be applied while
+ processing the next URL.
+ Lines starting with '-G' are the same as above, except these options will
+ be applied for *all* following URLs, i.e. they are Global.
+ Everything else will be used as a potential URL.
+
+ Example input file:
+
+ # setting global options
+ -G base-directory = "/tmp/"
+ -G skip = false
+
+ # setting local options for the next URL
+ -filename="spaces_are_optional.jpg"
+ -skip = true
+
+ https://example.org/
+
+ # the next URL uses the default filename and 'skip' is false again.
+ https://example.com/index.htm # comment1
+ https://example.com/404.htm # comment2
+ """
+ gconf = []
+ lconf = []
+ strip_comment = None
+
+ for line in file:
+ line = line.strip()
+
+ if not line or line[0] == "#":
+ # empty line or comment
+ continue
+
+ elif line[0] == "-":
+ # config spec
+ if len(line) >= 2 and line[1] == "G":
+ conf = gconf
+ line = line[2:]
+ else:
+ conf = lconf
+ line = line[1:]
+
+ key, sep, value = line.partition("=")
+ if not sep:
+ log.warning("input file: invalid <key>=<value> pair: %s", line)
+ continue
+
+ try:
+ value = json.loads(value.strip())
+ except ValueError as exc:
+ log.warning("input file: unable to parse '%s': %s", value, exc)
+ continue
+
+ key = key.strip().split(".")
+ conf.append((key[:-1], key[-1], value))
+
+ else:
+ # url
+ if " #" in line or "\t#" in line:
+ if strip_comment is None:
+ strip_comment = re.compile(r"\s+#.*").sub
+ line = strip_comment("", line)
+ if gconf or lconf:
+ yield ExtendedUrl(line, gconf, lconf)
+ gconf = []
+ lconf = []
+ else:
+ yield line
+
+
class UniversalNone():
"""None-style object that supports more operations than None itself"""
__slots__ = ()
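Editor's note: parse_inputfile() yields plain URL strings, or ExtendedUrl objects (an existing util class, see the yield above) when config lines preceded the URL; it presumably backs the --input-file handling in gallery_dl/__init__.py, which is also touched in this commit. A minimal usage sketch, assuming a local urls.txt in the documented format:

    import logging
    from gallery_dl import util

    log = logging.getLogger("inputfile")
    with open("urls.txt") as fp:
        for url in util.parse_inputfile(fp, log):
            # plain str, or util.ExtendedUrl when options were attached
            print(url)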
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 13cb9a0..f758857 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.23.2"
+__version__ = "1.23.3"
diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py
index af8b0af..ba37ee0 100644
--- a/test/test_postprocessor.py
+++ b/test/test_postprocessor.py
@@ -267,6 +267,7 @@ class MetadataTest(BasePostprocessorTest):
test({"mode": "custom", "content-format": "{foo}\n{missing}\n"})
test({"mode": "custom", "content-format": ["{foo}", "{missing}"]})
test({"mode": "custom", "format": "{foo}\n{missing}\n"})
+ test({"format": "{foo}\n{missing}\n"})
def test_metadata_extfmt(self):
pp = self._create({
diff --git a/test/test_results.py b/test/test_results.py
index d3debc6..e594933 100644
--- a/test/test_results.py
+++ b/test/test_results.py
@@ -330,6 +330,10 @@ def setup_test_config():
config.set(("extractor", "mastodon.social"), "access-token",
"Blf9gVqG7GytDTfVMiyYQjwVMQaNACgf3Ds3IxxVDUQ")
+ config.set(("extractor", "nana"), "favkey",
+ "9237ddb82019558ea7d179e805100805"
+ "ea6aa1c53ca6885cd4c179f9fb22ead2")
+
config.set(("extractor", "deviantart"), "client-id", "7777")
config.set(("extractor", "deviantart"), "client-secret",
"ff14994c744d9208e5caeec7aab4a026")