diff options
author | Unit 193 <unit193@ubuntu.com> | 2019-12-25 19:40:29 -0500 |
---|---|---|
committer | Unit 193 <unit193@ubuntu.com> | 2019-12-25 19:40:29 -0500 |
commit | d9cde490ce957f56cd2bb9e1628c2c0ef1a8733b (patch) | |
tree | 13f58a5c602ae402a6b1fc598ce227e9f36f3aa2 | |
parent | f9b17d9842e84709e2a41e92eb1dff0654c430c5 (diff) | |
parent | f9a1a9dcb7df977eeac9544786df9c0b93795815 (diff) | |
download | gallery-dl-d9cde490ce957f56cd2bb9e1628c2c0ef1a8733b.tar.bz2 gallery-dl-d9cde490ce957f56cd2bb9e1628c2c0ef1a8733b.tar.xz gallery-dl-d9cde490ce957f56cd2bb9e1628c2c0ef1a8733b.tar.zst |
Update upstream source from tag 'upstream/1.12.1'
Update to upstream version '1.12.1'
with Debian dir 6bd6e3209b21195bb26b892f2d85b1324de1f0d9
74 files changed, 1670 insertions, 904 deletions
@@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.11.1 +Version: 1.12.1 Summary: Command-line program to download image-galleries and -collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -95,8 +95,8 @@ Description: ========== put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__, and run it inside a command prompt (like ``cmd.exe``). - - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.10.6/gallery-dl.exe>`__ - - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.10.6/gallery-dl.bin>`__ + - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.12.1/gallery-dl.exe>`__ + - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.12.1/gallery-dl.bin>`__ These executables include a Python 3.7 interpreter and all required Python packages. @@ -132,14 +132,14 @@ Description: ========== .. code:: bash - $ gallery-dl http://danbooru.donmai.us/posts?tags=bonocho + $ gallery-dl "https://danbooru.donmai.us/posts?tags=bonocho" Get the direct URL of an image from a site that requires authentication: .. code:: bash - $ gallery-dl -g -u <username> -p <password> http://seiga.nicovideo.jp/seiga/im3211703 + $ gallery-dl -g -u "<username>" -p "<password>" "https://seiga.nicovideo.jp/seiga/im3211703" | Search a remote resource for URLs and download images from them: @@ -147,7 +147,7 @@ Description: ========== .. code:: bash - $ gallery-dl r:https://pastebin.com/raw/FLwrCYsT + $ gallery-dl "r:https://pastebin.com/raw/FLwrCYsT" Configuration @@ -241,8 +241,8 @@ Description: ========== .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst - .. _stable: https://github.com/mikf/gallery-dl/archive/v1.10.6.zip - .. _dev: https://github.com/mikf/gallery-dl/archive/master.zip + .. _stable: https://github.com/mikf/gallery-dl/archive/v1.12.1.tar.gz + .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz .. _Python: https://www.python.org/downloads/ .. _PyPI: https://pypi.org/ @@ -84,8 +84,8 @@ Download a standalone executable file, put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__, and run it inside a command prompt (like ``cmd.exe``). -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.10.6/gallery-dl.exe>`__ -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.10.6/gallery-dl.bin>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.12.1/gallery-dl.exe>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.12.1/gallery-dl.bin>`__ These executables include a Python 3.7 interpreter and all required Python packages. @@ -121,14 +121,14 @@ Download images; in this case from danbooru via tag search for 'bonocho': .. code:: bash - $ gallery-dl http://danbooru.donmai.us/posts?tags=bonocho + $ gallery-dl "https://danbooru.donmai.us/posts?tags=bonocho" Get the direct URL of an image from a site that requires authentication: .. code:: bash - $ gallery-dl -g -u <username> -p <password> http://seiga.nicovideo.jp/seiga/im3211703 + $ gallery-dl -g -u "<username>" -p "<password>" "https://seiga.nicovideo.jp/seiga/im3211703" | Search a remote resource for URLs and download images from them: @@ -136,7 +136,7 @@ Get the direct URL of an image from a site that requires authentication: .. code:: bash - $ gallery-dl r:https://pastebin.com/raw/FLwrCYsT + $ gallery-dl "r:https://pastebin.com/raw/FLwrCYsT" Configuration @@ -230,8 +230,8 @@ access to *gallery-dl*. Authorize it and you will be shown one or more .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst -.. _stable: https://github.com/mikf/gallery-dl/archive/v1.10.6.zip -.. _dev: https://github.com/mikf/gallery-dl/archive/master.zip +.. _stable: https://github.com/mikf/gallery-dl/archive/v1.12.1.tar.gz +.. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz .. _Python: https://www.python.org/downloads/ .. _PyPI: https://pypi.org/ diff --git a/data/completion/gallery-dl b/data/completion/gallery-dl index 415bf5c..11a796a 100644 --- a/data/completion/gallery-dl +++ b/data/completion/gallery-dl @@ -10,7 +10,7 @@ _gallery_dl() elif [[ "${prev}" =~ ^(-d|--dest)$ ]]; then COMPREPLY=( $(compgen -d -- "${cur}") ) else - COMPREPLY=( $(compgen -W "--help --version --dest --input-file --cookies --proxy --clear-cache --quiet --verbose --get-urls --dump-json --simulate --list-keywords --list-modules --list-extractors --write-log --write-unsupported --limit-rate --retries --abort --http-timeout --sleep --no-part --no-mtime --no-download --no-check-certificate --abort-on-skip --config --config-yaml --option --ignore-config --username --password --netrc --download-archive --range --chapter-range --filter --chapter-filter --zip --exec --ugoira-conv --ugoira-conv-lossless --write-metadata --write-tags --mtime-from-date" -- "${cur}") ) + COMPREPLY=( $(compgen -W "--help --version --dest --input-file --cookies --proxy --clear-cache --quiet --verbose --get-urls --dump-json --simulate --list-keywords --list-modules --list-extractors --write-log --write-unsupported --limit-rate --retries --abort --http-timeout --sleep --no-part --no-mtime --no-download --no-check-certificate --config --config-yaml --option --ignore-config --username --password --netrc --download-archive --range --chapter-range --filter --chapter-filter --zip --ugoira-conv --ugoira-conv-lossless --write-metadata --write-tags --mtime-from-date --exec --exec-after" -- "${cur}") ) fi } diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index a775e76..a530760 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2019-11-09" "1.11.1" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2019-12-22" "1.12.1" "gallery-dl Manual" .\" disable hyphenation .nh @@ -131,9 +131,6 @@ Like '--filter', but applies to manga-chapters and other delegated URLs .B "\-\-zip" Store downloaded files in a ZIP archive .TP -.B "\-\-exec" \f[I]CMD\f[] -Execute CMD for each downloaded file. Example: --exec 'magick convert {} {}.png && rm {}' -.TP .B "\-\-ugoira\-conv" Convert Pixiv Ugoira to WebM (requires FFmpeg) .TP @@ -148,6 +145,12 @@ Write image tags to separate text files .TP .B "\-\-mtime\-from\-date" Set file modification times according to 'date' metadata +.TP +.B "\-\-exec" \f[I]CMD\f[] +Execute CMD for each downloaded file. Example: --exec 'convert {} {}.png && rm {}' +.TP +.B "\-\-exec\-after" \f[I]CMD\f[] +Execute CMD after all files were downloaded successfully. Example: --exec-after 'cd {} && convert * ../doc.pdf' .SH EXAMPLES .TP diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index 8902f51..07f1b88 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2019-11-09" "1.11.1" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2019-12-22" "1.12.1" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -171,7 +171,7 @@ escaped with backslashes, e.g. \f[I]"\\\\[\\\\]"\f[] \f[I]string\f[] .IP "Default:" 9 -\f[I]"\\\\u0000-\\\\u001f\\\\u007f"\f[] (ASCII control characters) +\f[I]"\\u0000-\\u001f\\u007f"\f[] (ASCII control characters) .IP "Description:" 4 Set of characters to remove from generated path names. @@ -191,8 +191,6 @@ Controls the behavior when downloading files that have been downloaded before, i.e. a file with the same filename already exists or its ID is in a \f[I]download archive\f[]. -__ \f[I]extractor.*.archive\f[] - * \f[I]true\f[]: Skip downloads @@ -211,8 +209,8 @@ after \f[I]N\f[] consecutive skips after \f[I]N\f[] consecutive skips -* \f[I]"enumerate"\f[]: Append a numeric suffix to the end of the -original filename (\f[I]file.ext.1\f[], \f[I]file.ext.2\f[], etc) +* \f[I]"enumerate"\f[]: Add an enumeration index to the beginning of the +filename extension (\f[I]file.1.ext\f[], \f[I]file.2.ext\f[], etc.) .SS extractor.*.sleep .IP "Type:" 6 @@ -280,7 +278,7 @@ be \f[I]strings\f[], will be used as cookie-names and -values. \f[I]bool\f[] .IP "Default:" 9 -\f[I]false\f[] +\f[I]true\f[] .IP "Description:" 4 If \f[I]extractor.*.cookies\f[] specifies a cookies.txt file, update its @@ -623,20 +621,22 @@ Use with caution. .SS extractor.deviantart.include .IP "Type:" 6 -\f[I]list\f[] of \f[I]strings\f[] or \f[I]string\f[] +\f[I]string\f[] or \f[I]list\f[] of \f[I]strings\f[] .IP "Default:" 9 -\f[I]["gallery"]\f[] +\f[I]"gallery"\f[] + +.IP "Example:" 4 +"favorite,journal,scraps" or ["favorite", "journal", "scraps"] .IP "Description:" 4 -Selects the subcategories to include when processing a user profile. +A (comma-separated) list of subcategories to include +when processing a user profile. -Possible values are \f[I]"gallery"\f[], \f[I]"scraps"\f[], \f[I]"journal"\f[], -\f[I]"favorite"\f[]. +Possible values are +\f[I]"gallery"\f[], \f[I]"scraps"\f[], \f[I]"journal"\f[], \f[I]"favorite"\f[]. -It is also possible to use a string with the initial character of -each subcategory, i.e. \f[I]"gsj"\f[] for -\f[I]["gallery", "scraps", "journal"]\f[] +You can use \f[I]"all"\f[] instead of listing all values separately. .SS extractor.deviantart.journals .IP "Type:" 6 @@ -869,6 +869,16 @@ Controls whether to choose the GIF or MP4 version of an animation. Include *Story Highlights* when downloading a user profile. (requires authentication) +.SS extractor.instagram.videos +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Download video files. + .SS extractor.kissmanga.captcha .IP "Type:" 6 \f[I]string\f[] @@ -884,6 +894,25 @@ Controls how to handle redirects to CAPTCHA pages. * \f[I]"wait\f[]: Ask the user to solve the CAPTCHA and wait. +.SS extractor.newgrounds.include +.IP "Type:" 6 +\f[I]string\f[] or \f[I]list\f[] of \f[I]strings\f[] + +.IP "Default:" 9 +\f[I]"art"\f[] + +.IP "Example:" 4 +"movies,audio" or ["movies", "audio"] + +.IP "Description:" 4 +A (comma-separated) list of subcategories to include +when processing a user profile. + +Possible values are +\f[I]"art"\f[], \f[I]"audio"\f[], \f[I]"movies"\f[]. + +You can use \f[I]"all"\f[] instead of listing all values separately. + .SS extractor.oauth.browser .IP "Type:" 6 \f[I]bool\f[] @@ -975,7 +1004,7 @@ the \f[I]recursive\f[] extractor. \f[I]integer\f[] .IP "Default:" 9 -\f[I]500\f[] +\f[I]0\f[] .IP "Description:" 4 The value of the \f[I]limit\f[] parameter when loading @@ -1197,6 +1226,16 @@ video extraction and download * \f[I]false\f[]: Skip video Tweets +.SS extractor.vsco.videos +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Download video files. + .SS extractor.wallhaven.api-key .IP "Type:" 6 \f[I]string\f[] @@ -1299,8 +1338,8 @@ alongside the actual output files. .IP "Default:" 9 \f[I]null\f[] -.IP "Examples:" 4 -\f[I]"32000"\f[], \f[I]"500k"\f[], \f[I]"2.5M"\f[] +.IP "Example:" 4 +"32000", "500k", "2.5M" .IP "Description:" 4 Maximum download rate in bytes per second. @@ -1571,10 +1610,10 @@ or to let it run asynchronously. \f[I]string\f[] or \f[I]list\f[] of \f[I]strings\f[] .IP "Example:" 4 - -* "convert {} {}.png && rm {}" - -* ["echo", "{user[account]}", "{id}"] +"convert {} {}.png && rm {}", +.br +["echo", "{user[account]}", "{id}"] +.br .IP "Description:" 4 The command to run. @@ -1620,8 +1659,8 @@ Select how to write metadata. * \f[I]"tags"\f[]: \f[I]tags\f[] separated by newlines -* \f[I]"custom"\f[]: result of applying \f[I]metadata.format\f[] to a file's -metadata dictionary +* \f[I]"custom"\f[]: result of applying \f[I]metadata.content-format\f[] +to a file's metadata dictionary .SS metadata.extension .IP "Type:" 6 @@ -1631,9 +1670,26 @@ metadata dictionary \f[I]"json"\f[] or \f[I]"txt"\f[] .IP "Description:" 4 -Filename extension for metadata files. +Filename extension for metadata files that will be appended to the +original file names. -.SS metadata.format +.SS metadata.extension-format +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Example:" 4 +"{extension}.json", +.br +"json" +.br + +.IP "Description:" 4 +Custom format string to build filename extensions for metadata +files with, which will replace the original filename extensions. + +Note: \f[I]metadata.extension\f[] is ignored if this option is set. + +.SS metadata.content-format .IP "Type:" 6 \f[I]string\f[] @@ -1641,7 +1697,7 @@ Filename extension for metadata files. "tags:\\n\\n{tags:J\\n}\\n" .IP "Description:" 4 -Custom format string to build content of metadata files. +Custom format string to build the content of metadata files with. Note: Only applies for \f[I]"mode": "custom"\f[]. @@ -1845,6 +1901,17 @@ this cache. (See \f[I]SSLContext.set_ciphers() <https://docs.python.org/3/library/ssl.html#ssl.SSLContext.set_ciphers>\f[] for details) +.SS pyopenssl +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Use \f[I]pyOpenSSL <https://www.pyopenssl.org/en/stable/>\f[]-backed +SSL-support. + .SH API TOKENS & IDS .SS extractor.deviantart.client-id & .client-secret .IP "Type:" 6 @@ -1965,19 +2032,19 @@ and put them in your configuration file .IP "Type:" 6 \f[I]string\f[] or \f[I]integer\f[] -.IP "Examples:" 4 - -* \f[I]"2019-01-01T00:00:00"\f[] - -* \f[I]"2019"\f[] with \f[I]"%Y"\f[] as \f[I]date-format\f[] - -* \f[I]1546297200\f[] +.IP "Example:" 4 +"2019-01-01T00:00:00", +.br +"2019" with "%Y" as \f[I]date-format\f[], +.br +1546297200 +.br .IP "Description:" 4 A \f[I]Date\f[] value represents a specific point in time. -* If given as \f[I]string\f[], it is parsed according to date-format_. +* If given as \f[I]string\f[], it is parsed according to \f[I]date-format\f[]. * If given as \f[I]integer\f[], it is interpreted as UTC timestamp. @@ -1985,15 +2052,15 @@ A \f[I]Date\f[] value represents a specific point in time. .IP "Type:" 6 \f[I]string\f[] or \f[I]list\f[] of \f[I]strings\f[] -.IP "Examples:" 4 - -* \f[I]"file.ext"\f[] - -* \f[I]"~/path/to/file.ext"\f[] - -* \f[I]"$HOME/path/to/file.ext"\f[] - -* \f[I]["$HOME", "path", "to", "file.ext"]\f[] +.IP "Example:" 4 +"file.ext", +.br +"~/path/to/file.ext", +.br +"$HOME/path/to/file.ext", +.br +["$HOME", "path", "to", "file.ext"] +.br .IP "Description:" 4 A \f[I]Path\f[] is a \f[I]string\f[] representing the location of a file @@ -2015,7 +2082,7 @@ The path \f[I]C:\\path\\to\\file.ext\f[] has therefore to be written as \f[I]object\f[] -.IP "Examples:" 4 +.IP "Example:" 4 .. code:: { diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index ab46b5c..a8700a6 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery-dl -Version: 1.11.1 +Version: 1.12.1 Summary: Command-line program to download image-galleries and -collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -95,8 +95,8 @@ Description: ========== put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__, and run it inside a command prompt (like ``cmd.exe``). - - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.10.6/gallery-dl.exe>`__ - - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.10.6/gallery-dl.bin>`__ + - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.12.1/gallery-dl.exe>`__ + - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.12.1/gallery-dl.bin>`__ These executables include a Python 3.7 interpreter and all required Python packages. @@ -132,14 +132,14 @@ Description: ========== .. code:: bash - $ gallery-dl http://danbooru.donmai.us/posts?tags=bonocho + $ gallery-dl "https://danbooru.donmai.us/posts?tags=bonocho" Get the direct URL of an image from a site that requires authentication: .. code:: bash - $ gallery-dl -g -u <username> -p <password> http://seiga.nicovideo.jp/seiga/im3211703 + $ gallery-dl -g -u "<username>" -p "<password>" "https://seiga.nicovideo.jp/seiga/im3211703" | Search a remote resource for URLs and download images from them: @@ -147,7 +147,7 @@ Description: ========== .. code:: bash - $ gallery-dl r:https://pastebin.com/raw/FLwrCYsT + $ gallery-dl "r:https://pastebin.com/raw/FLwrCYsT" Configuration @@ -241,8 +241,8 @@ Description: ========== .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst - .. _stable: https://github.com/mikf/gallery-dl/archive/v1.10.6.zip - .. _dev: https://github.com/mikf/gallery-dl/archive/master.zip + .. _stable: https://github.com/mikf/gallery-dl/archive/v1.12.1.tar.gz + .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz .. _Python: https://www.python.org/downloads/ .. _PyPI: https://pypi.org/ diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index 16db33a..513b6c7 100644 --- a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -115,6 +115,7 @@ gallery_dl/extractor/pornhub.py gallery_dl/extractor/pururin.py gallery_dl/extractor/reactor.py gallery_dl/extractor/readcomiconline.py +gallery_dl/extractor/realbooru.py gallery_dl/extractor/recursive.py gallery_dl/extractor/reddit.py gallery_dl/extractor/rule34.py diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 9665823..ffaed3d 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -93,7 +93,8 @@ def parse_inputfile(file, log): log.warning("input file: unable to parse '%s': %s", value, exc) continue - conf.append((key.strip().split("."), value)) + key = key.strip().split(".") + conf.append((key[:-1], key[-1], value)) else: # url @@ -122,11 +123,11 @@ def main(): if args.yamlfiles: config.load(args.yamlfiles, strict=True, fmt="yaml") if args.postprocessors: - config.set(("postprocessors",), args.postprocessors) + config.set((), "postprocessors", args.postprocessors) if args.abort: - config.set(("skip",), "abort:" + str(args.abort)) - for key, value in args.options: - config.set(key, value) + config.set((), "skip", "abort:" + str(args.abort)) + for opts in args.options: + config.set(*opts) # stream logging handler output.configure_logging_handler( @@ -140,7 +141,7 @@ def main(): # loglevels if args.loglevel >= logging.ERROR: - config.set(("output", "mode"), "null") + config.set(("output",), "mode", "null") elif args.loglevel <= logging.DEBUG: import platform import subprocess @@ -230,7 +231,7 @@ def main(): ulog.propagate = False job.Job.ulog = ulog - pformat = config.get(("output", "progress"), True) + pformat = config.get(("output",), "progress", True) if pformat and len(urls) > 1 and args.loglevel < logging.ERROR: urls = progress(urls, pformat) @@ -239,8 +240,8 @@ def main(): try: log.debug("Starting %s for '%s'", jobtype.__name__, url) if isinstance(url, util.ExtendedUrl): - for key, value in url.gconfig: - config.set(key, value) + for opts in url.gconfig: + config.set(*opts) with config.apply(url.lconfig): retval |= jobtype(url.value).run() else: diff --git a/gallery_dl/cache.py b/gallery_dl/cache.py index 3ceef75..1824195 100644 --- a/gallery_dl/cache.py +++ b/gallery_dl/cache.py @@ -188,7 +188,7 @@ def clear(): def _path(): - path = config.get(("cache", "file"), -1) + path = config.get(("cache",), "file", -1) if path != -1: return util.expand_path(path) diff --git a/gallery_dl/cloudflare.py b/gallery_dl/cloudflare.py index b9bf32d..6e23c83 100644 --- a/gallery_dl/cloudflare.py +++ b/gallery_dl/cloudflare.py @@ -35,10 +35,11 @@ def solve_challenge(session, response, kwargs): cf_kwargs = {} headers = cf_kwargs["headers"] = collections.OrderedDict() - params = cf_kwargs["params"] = collections.OrderedDict() + params = cf_kwargs["data"] = collections.OrderedDict() page = response.text - params["s"] = text.extract(page, 'name="s" value="', '"')[0] + url = root + text.extract(page, 'action="', '"')[0] + params["r"] = text.extract(page, 'name="r" value="', '"')[0] params["jschl_vc"] = text.extract(page, 'name="jschl_vc" value="', '"')[0] params["pass"] = text.extract(page, 'name="pass" value="', '"')[0] params["jschl_answer"] = solve_js_challenge(page, parsed.netloc) @@ -46,12 +47,14 @@ def solve_challenge(session, response, kwargs): time.sleep(4) - url = root + "/cdn-cgi/l/chk_jschl" cf_kwargs["allow_redirects"] = False - cf_response = session.request("GET", url, **cf_kwargs) + cf_response = session.request("POST", url, **cf_kwargs) - location = cf_response.headers.get("Location") - if not location: + cookies = { + cookie.name: cookie.value + for cookie in cf_response.cookies + } + if not cookies: import logging log = logging.getLogger("cloudflare") rtype = "CAPTCHA" if is_captcha(cf_response) else "Unexpected" @@ -60,18 +63,9 @@ def solve_challenge(session, response, kwargs): log.debug("Content:\n%s", cf_response.text) raise exception.StopExtraction() - if location[0] == "/": - location = root + location - else: - location = re.sub(r"(https?):/(?!/)", r"\1://", location) - - for cookie in cf_response.cookies: - if cookie.name == "cf_clearance": - return location, cookie.domain, { - cookie.name: cookie.value, - "__cfduid" : response.cookies.get("__cfduid", ""), - } - return location, "", {} + domain = next(iter(cf_response.cookies)).domain + cookies["__cfduid"] = response.cookies.get("__cfduid", "") + return cf_response, domain, cookies def solve_js_challenge(page, netloc): @@ -110,8 +104,7 @@ def solve_js_challenge(page, netloc): solution += len(netloc) if ".toFixed(" in expr: # trim solution to 10 decimal places - # and strip trailing zeros - solution = "{:.10f}".format(solution).rstrip("0") + solution = "{:.10f}".format(solution) return solution diff --git a/gallery_dl/config.py b/gallery_dl/config.py index da52f1e..785ffc3 100644 --- a/gallery_dl/config.py +++ b/gallery_dl/config.py @@ -57,7 +57,7 @@ def load(files=None, strict=False, fmt="json"): confdict = parsefunc(file) except OSError as exc: if strict: - log.error("%s", exc) + log.error(exc) sys.exit(1) except Exception as exc: log.warning("Could not parse '%s': %s", path, exc) @@ -75,62 +75,57 @@ def clear(): _config.clear() -def get(keys, default=None, conf=_config): +def get(path, key, default=None, *, conf=_config): """Get the value of property 'key' or a default value""" try: - for k in keys: - conf = conf[k] - return conf - except (KeyError, AttributeError): + for p in path: + conf = conf[p] + return conf[key] + except Exception: return default -def interpolate(keys, default=None, conf=_config): +def interpolate(path, key, default=None, *, conf=_config): """Interpolate the value of 'key'""" + if key in conf: + return conf[key] try: - lkey = keys[-1] - if lkey in conf: - return conf[lkey] - for k in keys: - if lkey in conf: - default = conf[lkey] - conf = conf[k] - return conf - except (KeyError, AttributeError): - return default + for p in path: + conf = conf[p] + if key in conf: + default = conf[key] + except Exception: + pass + return default -def set(keys, value, conf=_config): +def set(path, key, value, *, conf=_config): """Set the value of property 'key' for this session""" - for k in keys[:-1]: + for p in path: try: - conf = conf[k] + conf = conf[p] except KeyError: - temp = {} - conf[k] = temp - conf = temp - conf[keys[-1]] = value + conf[p] = conf = {} + conf[key] = value -def setdefault(keys, value, conf=_config): +def setdefault(path, key, value, *, conf=_config): """Set the value of property 'key' if it doesn't exist""" - for k in keys[:-1]: + for p in path: try: - conf = conf[k] + conf = conf[p] except KeyError: - temp = {} - conf[k] = temp - conf = temp - return conf.setdefault(keys[-1], value) + conf[p] = conf = {} + return conf.setdefault(key, value) -def unset(keys, conf=_config): +def unset(path, key, *, conf=_config): """Unset the value of property 'key'""" try: - for k in keys[:-1]: - conf = conf[k] - del conf[keys[-1]] - except (KeyError, AttributeError): + for p in path: + conf = conf[p] + del conf[key] + except Exception: pass @@ -143,13 +138,13 @@ class apply(): self.kvlist = kvlist def __enter__(self): - for key, value in self.kvlist: - self.original.append((key, get(key, self._sentinel))) - set(key, value) + for path, key, value in self.kvlist: + self.original.append((path, key, get(path, key, self._sentinel))) + set(path, key, value) def __exit__(self, etype, value, traceback): - for key, value in self.original: + for path, key, value in self.original: if value is self._sentinel: - unset(key) + unset(path, key) else: - set(key, value) + set(path, key, value) diff --git a/gallery_dl/downloader/common.py b/gallery_dl/downloader/common.py index 6e5cd4c..596c956 100644 --- a/gallery_dl/downloader/common.py +++ b/gallery_dl/downloader/common.py @@ -30,7 +30,7 @@ class DownloaderBase(): def config(self, key, default=None): """Interpolate downloader config value for 'key'""" - return config.interpolate(("downloader", self.scheme, key), default) + return config.interpolate(("downloader", self.scheme), key, default) def download(self, url, pathfmt): """Write data from 'url' into the file specified by 'pathfmt'""" diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 1c78cfb..fab96ba 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -15,10 +15,11 @@ from requests.exceptions import RequestException, ConnectionError, Timeout from .common import DownloaderBase from .. import text +from ssl import SSLError try: - from OpenSSL.SSL import Error as SSLError + from OpenSSL.SSL import Error as OpenSSLError except ImportError: - from ssl import SSLError + OpenSSLError = SSLError class HttpDownloader(DownloaderBase): @@ -39,11 +40,13 @@ class HttpDownloader(DownloaderBase): self.retries = float("inf") if self.rate: rate = text.parse_bytes(self.rate) - if not rate: + if rate: + if rate < self.chunk_size: + self.chunk_size = rate + self.rate = rate + self.receive = self._receive_rate + else: self.log.warning("Invalid rate limit (%r)", self.rate) - elif rate < self.chunk_size: - self.chunk_size = rate - self.rate = rate def download(self, url, pathfmt): try: @@ -77,12 +80,15 @@ class HttpDownloader(DownloaderBase): time.sleep(min(2 ** (tries-1), 1800)) tries += 1 + headers = {} # check for .part file filesize = pathfmt.part_size() if filesize: - headers = {"Range": "bytes={}-".format(filesize)} - else: - headers = None + headers["Range"] = "bytes={}-".format(filesize) + # file-specific headers + extra = pathfmt.kwdict.get("_http_headers") + if extra: + headers.update(extra) # connect to (remote) source try: @@ -93,7 +99,7 @@ class HttpDownloader(DownloaderBase): msg = str(exc) continue except Exception as exc: - self.log.warning("%s", exc) + self.log.warning(exc) return False # check response @@ -110,7 +116,7 @@ class HttpDownloader(DownloaderBase): msg = "'{} {}' for '{}'".format(code, response.reason, url) if code == 429 or 500 <= code < 600: # Server Error continue - self.log.warning("%s", msg) + self.log.warning(msg) return False size = text.parse_int(size) @@ -140,7 +146,7 @@ class HttpDownloader(DownloaderBase): # download content try: self.receive(response, file) - except (RequestException, SSLError) as exc: + except (RequestException, SSLError, OpenSSLError) as exc: msg = str(exc) print() continue @@ -166,20 +172,26 @@ class HttpDownloader(DownloaderBase): return True def receive(self, response, file): - if self.rate: - total = 0 # total amount of bytes received - start = time.time() # start time + for data in response.iter_content(self.chunk_size): + file.write(data) + + def _receive_rate(self, response, file): + t1 = time.time() + rt = self.rate for data in response.iter_content(self.chunk_size): file.write(data) - if self.rate: - total += len(data) - expected = total / self.rate # expected elapsed time - delta = time.time() - start # actual elapsed time since start - if delta < expected: - # sleep if less time passed than expected - time.sleep(expected - delta) + t2 = time.time() # current time + actual = t2 - t1 # actual elapsed time + expected = len(data) / rt # expected elapsed time + + if actual < expected: + # sleep if less time elapsed than expected + time.sleep(expected - actual) + t1 = time.time() + else: + t1 = t2 def get_extension(self, response): mtype = response.headers.get("Content-Type", "image/jpeg") diff --git a/gallery_dl/extractor/2chan.py b/gallery_dl/extractor/2chan.py index 33e7929..c34cfec 100644 --- a/gallery_dl/extractor/2chan.py +++ b/gallery_dl/extractor/2chan.py @@ -68,6 +68,8 @@ class _2chanThreadExtractor(Extractor): def parse(self, post): """Build post-object by extracting data from an HTML post""" data = self._extract_post(post) + if data["name"]: + data["name"] = data["name"].strip() if '<a href="/' in post: self._extract_image(post, data) data["tim"], _, data["extension"] = data["filename"].partition(".") @@ -78,10 +80,10 @@ class _2chanThreadExtractor(Extractor): @staticmethod def _extract_post(post): return text.extract_all(post, ( - ("no" , 'name="', '"'), - ("post", '<b>', '</b>'), - ("name", '<b>', ' </b>'), - ("now" , '</font> ', ' '), + ("post", 'class="csb">' , '<'), + ("name", 'class="cnm">' , '<'), + ("now" , 'class="cnw">' , '<'), + ("no" , 'class="cno">No.', '<'), (None , '<blockquote', ''), ("com" , '>', '</blockquote>'), ))[0] diff --git a/gallery_dl/extractor/3dbooru.py b/gallery_dl/extractor/3dbooru.py index febbb51..ac96211 100644 --- a/gallery_dl/extractor/3dbooru.py +++ b/gallery_dl/extractor/3dbooru.py @@ -67,7 +67,7 @@ class _3dbooruPopularExtractor(booru.MoebooruPopularMixin, _3dbooruExtractor): r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)" r"(?:\?(?P<query>[^#]*))?") test = ("http://behoimi.org/post/popular_by_month?month=2&year=2013", { - "url": "f5a26c624da9a3d1dbc610e4a614bc57df6251c5", + "url": "8b1a5c5b7a10f8f5d3d6124d1aabfee0277078cb", "count": 20, }) diff --git a/gallery_dl/extractor/4chan.py b/gallery_dl/extractor/4chan.py index 36a0573..980dc20 100644 --- a/gallery_dl/extractor/4chan.py +++ b/gallery_dl/extractor/4chan.py @@ -59,3 +59,30 @@ class _4chanThreadExtractor(Extractor): url = "https://i.4cdn.org/{}/{}{}".format( post["board"], post["tim"], post["ext"]) yield Message.Url, url, post + + +class _4chanBoardExtractor(Extractor): + """Extractor for 4chan boards""" + category = "4chan" + subcategory = "board" + pattern = r"(?:https?://)?boards\.4chan(?:nel)?\.org/([^/?&#]+)/\d*$" + test = ("https://boards.4channel.org/po/", { + "pattern": _4chanThreadExtractor.pattern, + "count": ">= 100", + }) + + def __init__(self, match): + Extractor.__init__(self, match) + self.board = match.group(1) + + def items(self): + url = "https://a.4cdn.org/{}/threads.json".format(self.board) + threads = self.request(url).json() + + for page in threads: + for thread in page["threads"]: + url = "https://boards.4chan.org/{}/thread/{}/".format( + self.board, thread["no"]) + thread["page"] = page["page"] + thread["_extractor"] = _4chanThreadExtractor + yield Message.Queue, url, thread diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index b8f74d1..9ff3746 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -86,6 +86,7 @@ modules = [ "pururin", "reactor", "readcomiconline", + "realbooru", "reddit", "rule34", "safebooru", diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py index c701927..1126615 100644 --- a/gallery_dl/extractor/behance.py +++ b/gallery_dl/extractor/behance.py @@ -72,6 +72,11 @@ class BehanceGalleryExtractor(BehanceExtractor): "url": "0258fe194fe7d828d6f2c7f6086a9a0a4140db1d", "keyword": {"owners": ["Alex Strohl"]}, }), + # 'media_collection' modules + ("https://www.behance.net/gallery/88276087/Audi-R8-RWD", { + "count": 20, + "url": "6bebff0d37f85349f9ad28bd8b76fd66627c1e2f", + }), ) def __init__(self, match): @@ -112,20 +117,28 @@ class BehanceGalleryExtractor(BehanceExtractor): @staticmethod def get_images(data): """Extract image results from an API response""" - results = [] + result = [] + append = result.append for module in data["modules"]: + mtype = module["type"] - if module["type"] == "image": + if mtype == "image": url = module["sizes"]["original"] - results.append((url, module)) + append((url, module)) + + elif mtype == "media_collection": + for component in module["components"]: + url = component["sizes"]["source"] + append((url, module)) - elif module["type"] == "embed": + elif mtype == "embed": embed = module.get("original_embed") or module.get("embed") - url = "ytdl:" + text.extract(embed, 'src="', '"')[0] - results.append((url, module)) + if embed: + url = "ytdl:" + text.extract(embed, 'src="', '"')[0] + append((url, module)) - return results + return result class BehanceUserExtractor(BehanceExtractor): diff --git a/gallery_dl/extractor/bobx.py b/gallery_dl/extractor/bobx.py index dba5fe7..94a2840 100644 --- a/gallery_dl/extractor/bobx.py +++ b/gallery_dl/extractor/bobx.py @@ -10,18 +10,38 @@ from .common import Extractor, Message from .. import text +from ..cache import memcache +import random +import time class BobxExtractor(Extractor): """Base class for bobx extractors""" category = "bobx" root = "http://www.bobx.com" + cookiedomain = ".bobx.com" per_page = 80 def __init__(self, match): Extractor.__init__(self, match) self.path = match.group(1) + def login(self): + if not self._check_cookies(("BobXUser",)): + self._update_cookies(self._login_impl()) + + @memcache() + def _login_impl(self): + """Generate a randomized 'BobXUser' cookie""" + rand = random.randrange + tnow = time.time() - rand(60, 3600) + + return {"BobXUser": "{}.{}.{}.{}.{}.{}".format( + int(tnow), + rand(128, 192), rand(0, 256), rand(0, 256), rand(0, 256), + tnow + 622080000, # timestamp in 7200 days + )} + class BobxGalleryExtractor(BobxExtractor): """Extractor for individual image galleries on bobx.com""" @@ -46,6 +66,8 @@ class BobxGalleryExtractor(BobxExtractor): ) def items(self): + self.login() + num = 0 while True: url = "{}/{}-{}-10-8.html".format(self.root, self.path, num) @@ -99,6 +121,7 @@ class BobxIdolExtractor(BobxExtractor): }) def items(self): + self.login() url = "{}/{}/".format(self.root, self.path) data = {"_extractor": BobxGalleryExtractor} page = self.request(url).text diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 0d258eb..a1a4890 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -69,7 +69,7 @@ class Extractor(): def config(self, key, default=None): return config.interpolate( - ("extractor", self.category, self.subcategory, key), default) + ("extractor", self.category, self.subcategory), key, default) def request(self, url, *, method="GET", session=None, retries=None, encoding=None, fatal=True, notfound=None, **kwargs): @@ -101,18 +101,14 @@ class Extractor(): raise exception.NotFoundError(notfound) if cloudflare.is_challenge(response): self.log.info("Solving Cloudflare challenge") - url, domain, cookies = cloudflare.solve_challenge( + response, domain, cookies = cloudflare.solve_challenge( session, response, kwargs) + if response.status_code >= 400: + continue cloudflare.cookies.update(self.category, (domain, cookies)) - continue + return response if cloudflare.is_captcha(response): - try: - import OpenSSL # noqa - except ImportError: - msg = " - Install 'pyOpenSSL' and try again" - else: - msg = "" - self.log.warning("Cloudflare CAPTCHA" + msg) + self.log.warning("Cloudflare CAPTCHA") msg = "'{} {}' for '{}'".format(code, response.reason, url) if code < 500 and code != 429 and code != 430: @@ -200,7 +196,7 @@ class Extractor(): def _store_cookies(self): """Store the session's cookiejar in a cookies.txt file""" - if self._cookiefile and self.config("cookies-update", False): + if self._cookiefile and self.config("cookies-update", True): cookiejar = http.cookiejar.MozillaCookieJar() for cookie in self._cookiejar: cookiejar.set_cookie(cookie) @@ -233,12 +229,14 @@ class Extractor(): """Check if all 'cookienames' are in the session's cookiejar""" if domain is None: domain = self.cookiedomain - try: - for name in cookienames: - self._cookiejar._find(name, domain) - except KeyError: - return False - return True + + names = set(cookienames) + for cookie in self._cookiejar: + if cookie.domain == domain: + names.discard(cookie.name) + if not names: + return True + return False def _get_date_min_max(self, dmin=None, dmax=None): """Retrieve and parse 'date-min' and 'date-max' config values""" @@ -254,6 +252,26 @@ class Extractor(): fmt = self.config("date-format", "%Y-%m-%dT%H:%M:%S") return get("date-min", dmin), get("date-max", dmax) + def _dispatch_extractors(self, extractor_data, default=()): + """ """ + extractors = { + data[0].subcategory: data + for data in extractor_data + } + + include = self.config("include", default) or () + if include == "all": + include = extractors + elif isinstance(include, str): + include = include.split(",") + + result = [(Message.Version, 1)] + for category in include: + if category in extractors: + extr, url = extractors[category] + result.append((Message.Queue, url, {"_extractor": extr})) + return iter(result) + @classmethod def _get_tests(cls): """Yield an extractor's test cases as (URL, RESULTS) tuples""" @@ -284,7 +302,7 @@ class GalleryExtractor(Extractor): def items(self): self.login() - page = self.request(self.gallery_url).text + page = self.request(self.gallery_url, notfound=self.subcategory).text data = self.metadata(page) imgs = self.images(page) @@ -402,16 +420,13 @@ class SharedConfigMixin(): def config(self, key, default=None, *, sentinel=object()): value = Extractor.config(self, key, sentinel) - if value is sentinel: - cat, self.category = self.category, self.basecategory - value = Extractor.config(self, key, default) - self.category = cat - return value + return value if value is not sentinel else config.interpolate( + ("extractor", self.basecategory, self.subcategory), key, default) def generate_extractors(extractor_data, symtable, classes): """Dynamically generate Extractor classes""" - extractors = config.get(("extractor", classes[0].basecategory)) + extractors = config.get(("extractor",), classes[0].basecategory) ckey = extractor_data.get("_ckey") prev = None @@ -456,10 +471,21 @@ def generate_extractors(extractor_data, symtable, classes): http.cookiejar.MozillaCookieJar.magic_re = re.compile( "#( Netscape)? HTTP Cookie File", re.IGNORECASE) -# Replace default cipher list of urllib3 to avoid Cloudflare CAPTCHAs -ciphers = config.get(("ciphers",), True) + +# Undo automatic pyOpenSSL injection by requests +pyopenssl = config.get((), "pyopenssl", False) +if not pyopenssl: + try: + from requests.packages.urllib3.contrib import pyopenssl # noqa + pyopenssl.extract_from_urllib3() + except ImportError: + pass +del pyopenssl + + +# Replace urllib3's default cipher list to avoid Cloudflare CAPTCHAs +ciphers = config.get((), "ciphers", True) if ciphers: - logging.getLogger("gallery-dl").debug("Updating urllib3 ciphers") if ciphers is True: ciphers = ( @@ -489,3 +515,4 @@ if ciphers: from requests.packages.urllib3.util import ssl_ # noqa ssl_.DEFAULT_CIPHERS = ciphers del ssl_ +del ciphers diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index eeee74a..604966f 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -29,7 +29,7 @@ BASE_PATTERN = ( class DeviantartExtractor(Extractor): """Base class for deviantart extractors using the OAuth API""" category = "deviantart" - directory_fmt = ("{category}", "{author[username]!l}") + directory_fmt = ("{category}", "{username}") filename_fmt = "{category}_{index}_{title}.{extension}" root = "https://www.deviantart.com" @@ -47,6 +47,12 @@ class DeviantartExtractor(Extractor): if self.quality: self.quality = "q_{}".format(self.quality) + if self.original != "image": + self._update_content = self._update_content_default + else: + self._update_content = self._update_content_image + self.original = True + self.commit_journal = { "html": self._commit_journal_html, "text": self._commit_journal_text, @@ -62,6 +68,7 @@ class DeviantartExtractor(Extractor): self.group = not profile if self.group: self.subcategory = "group-" + self.subcategory + self.user = self.user.lower() else: self.user = profile["user"]["username"] @@ -95,8 +102,7 @@ class DeviantartExtractor(Extractor): yield self.commit(deviation, content) elif deviation["is_downloadable"]: - content = {} - self._update_content(deviation, content) + content = self.api.deviation_download(deviation["deviationid"]) yield self.commit(deviation, content) if "videos" in deviation: @@ -127,8 +133,14 @@ class DeviantartExtractor(Extractor): deviation["url"].rpartition("-")[2]) except KeyError: deviation["index"] = 0 + if self.user: deviation["username"] = self.user + deviation["_username"] = self.user.lower() + else: + deviation["username"] = deviation["author"]["username"] + deviation["_username"] = deviation["username"].lower() + deviation["da_category"] = deviation["category"] deviation["published_time"] = text.parse_int( deviation["published_time"]) @@ -238,81 +250,51 @@ class DeviantartExtractor(Extractor): url = "{}/{}/{}/0/".format(self.root, self.user, category) return [(url + folder["name"], folder) for folder in folders] - def _update_content(self, deviation, content): - try: - data = self.api.deviation_extended_fetch( - deviation["index"], - deviation["author"]["username"], - "journal" if "excerpt" in deviation else "art", - ) - download = data["deviation"]["extended"]["download"] - download["src"] = download["url"] - except Exception as e: - self.log.warning( - "Unable to fetch original download URL for ID %s ('%s: %s')", - deviation["index"], e.__class__.__name__, e, - ) - self.log.debug("Server response: %s", data) - else: - if self.original == "image": - url = data["src"].partition("?")[0] - mtype = mimetypes.guess_type(url, False)[0] - if not mtype or not mtype.startswith("image/"): - return - del download["url"] - content.update(download) + def _update_content_default(self, deviation, content): + content.update(self.api.deviation_download(deviation["deviationid"])) + def _update_content_image(self, deviation, content): + data = self.api.deviation_download(deviation["deviationid"]) + url = data["src"].partition("?")[0] + mtype = mimetypes.guess_type(url, False)[0] + if mtype and mtype.startswith("image/"): + content.update(data) -class DeviantartUserExtractor(Extractor): + +class DeviantartUserExtractor(DeviantartExtractor): """Extractor for an artist's user profile""" - category = "deviantart" subcategory = "user" pattern = BASE_PATTERN + r"/?$" test = ( ("https://www.deviantart.com/shimoda7", { - "options": (("include", "gsjf"),), - "pattern": r"/shimoda7/(gallery(/scraps)?|posts|favourites)", + "pattern": r"/shimoda7/gallery$", + }), + ("https://www.deviantart.com/shimoda7", { + "options": (("include", "all"),), + "pattern": r"/shimoda7/(gallery(/scraps)?|posts|favourites)$", "count": 4, }), ("https://shimoda7.deviantart.com/"), ) - def __init__(self, match): - Extractor.__init__(self, match) - self.user = match.group(1) or match.group(2) - - incl = self.config("include") or "g" - if isinstance(incl, list): - incl = "".join(item[0] for item in incl if item) - self.include = incl.lower() - def items(self): - base = "https://www.deviantart.com/{}/".format(self.user) - incl = self.include - data = {} - - if "g" in incl: - data["_extractor"] = DeviantartGalleryExtractor - yield Message.Queue, base + "gallery", data - if "s" in incl: - data["_extractor"] = DeviantartScrapsExtractor - yield Message.Queue, base + "gallery/scraps", data - if "j" in incl: - data["_extractor"] = DeviantartJournalExtractor - yield Message.Queue, base + "posts", data - if "f" in incl: - data["_extractor"] = DeviantartFavoriteExtractor - yield Message.Queue, base + "favourites", data + base = "{}/{}/".format(self.root, self.user) + return self._dispatch_extractors(( + (DeviantartGalleryExtractor , base + "gallery"), + (DeviantartScrapsExtractor , base + "gallery/scraps"), + (DeviantartJournalExtractor , base + "posts"), + (DeviantartFavoriteExtractor, base + "favourites"), + ), ("gallery",)) class DeviantartGalleryExtractor(DeviantartExtractor): """Extractor for all deviations from an artist's gallery""" subcategory = "gallery" - archive_fmt = "g_{username}_{index}.{extension}" + archive_fmt = "g_{_username}_{index}.{extension}" pattern = BASE_PATTERN + r"/gallery(?:/all|/?\?catpath=)?/?$" test = ( ("https://www.deviantart.com/shimoda7/gallery/", { - "pattern": r"https://(www.deviantart.com/download/\d+/" + "pattern": r"https://(api-da\.wixmp\.com/_api/download/file" r"|images-wixmp-[^.]+.wixmp.com/f/.+/.+.jpg\?token=.+)", "count": ">= 30", "keyword": { @@ -398,7 +380,7 @@ class DeviantartGalleryExtractor(DeviantartExtractor): class DeviantartFolderExtractor(DeviantartExtractor): """Extractor for deviations inside an artist's gallery folder""" subcategory = "folder" - directory_fmt = ("{category}", "{folder[owner]}", "{folder[title]}") + directory_fmt = ("{category}", "{username}", "{folder[title]}") archive_fmt = "F_{folder[uuid]}_{index}.{extension}" pattern = BASE_PATTERN + r"/gallery/(\d+)/([^/?&#]+)" test = ( @@ -418,14 +400,19 @@ class DeviantartFolderExtractor(DeviantartExtractor): def __init__(self, match): DeviantartExtractor.__init__(self, match) - self.fname = match.group(4) - self.folder = {"owner": self.user, "index": match.group(3)} + self.folder = None + self.folder_id = match.group(3) + self.folder_name = match.group(4) def deviations(self): folders = self.api.gallery_folders(self.user) - folder = self._find_folder(folders, self.fname) - self.folder["title"] = folder["name"] - self.folder["uuid"] = folder["folderid"] + folder = self._find_folder(folders, self.folder_name) + self.folder = { + "title": folder["name"], + "uuid" : folder["folderid"], + "index": self.folder_id, + "owner": self.user, + } return self.api.gallery(self.user, folder["folderid"], self.offset) def prepare(self, deviation): @@ -440,7 +427,8 @@ class DeviantartStashExtractor(DeviantartExtractor): pattern = r"(?:https?://)?sta\.sh/([a-z0-9]+)" test = ( ("https://sta.sh/022c83odnaxc", { - "pattern": r"https://sta.sh/download/7549925030122512/.+\?token=", + "pattern": r"https://api-da\.wixmp\.com/_api/download/file", + "content": "057eb2f2861f6c8a96876b13cca1a4b7a408c11f", "count": 1, }), # multiple stash items @@ -450,7 +438,7 @@ class DeviantartStashExtractor(DeviantartExtractor): }), # downloadable, but no "content" field (#307) ("https://sta.sh/024t4coz16mi", { - "pattern": r"https://sta.sh/download/7800709982190282/.+\?token=", + "pattern": r"https://api-da\.wixmp\.com/_api/download/file", "count": 1, }), ("https://sta.sh/abcdefghijkl", { @@ -468,41 +456,25 @@ class DeviantartStashExtractor(DeviantartExtractor): def deviations(self): url = "https://sta.sh/" + self.stash_id page = self.request(url).text - deviation_id, pos = text.extract(page, '//deviation/', '"') + deviation_id = text.extract(page, '//deviation/', '"')[0] if deviation_id: - deviation = self.api.deviation(deviation_id) - pos = page.find("dev-page-download", pos) - if pos >= 0: - deviation["_download"] = { - "width" : text.parse_int(text.extract( - page, 'data-download_width="' , '"', pos)[0]), - "height": text.parse_int(text.extract( - page, 'data-download_height="', '"', pos)[0]), - "src" : text.unescape(text.extract( - page, 'data-download_url="' , '"', pos)[0]), - } - return (deviation,) + return (self.api.deviation(deviation_id),) + else: data = {"_extractor": DeviantartStashExtractor} - page = text.extract( - page, 'id="stash-body"', 'class="footer"', pos)[0] + page = text.extract(page, 'id="stash-body"', 'class="footer"')[0] return [ (url, data) for url in text.extract_iter(page, '<a href="', '"') ] - def _update_content(self, deviation, content): - if "_download" in deviation: - content.update(deviation["_download"]) - del deviation["_download"] - class DeviantartFavoriteExtractor(DeviantartExtractor): """Extractor for an artist's favorites""" subcategory = "favorite" directory_fmt = ("{category}", "{username}", "Favourites") - archive_fmt = "f_{username}_{index}.{extension}" + archive_fmt = "f_{_username}_{index}.{extension}" pattern = BASE_PATTERN + r"/favourites/?(?:\?catpath=/)?$" test = ( ("https://www.deviantart.com/h3813067/favourites/", { @@ -530,8 +502,8 @@ class DeviantartFavoriteExtractor(DeviantartExtractor): class DeviantartCollectionExtractor(DeviantartExtractor): """Extractor for a single favorite collection""" subcategory = "collection" - directory_fmt = ("{category}", "{collection[owner]}", - "Favourites", "{collection[title]}") + directory_fmt = ("{category}", "{username}", "Favourites", + "{collection[title]}") archive_fmt = "C_{collection[uuid]}_{index}.{extension}" pattern = BASE_PATTERN + r"/favourites/(\d+)/([^/?&#]+)" test = ( @@ -546,14 +518,19 @@ class DeviantartCollectionExtractor(DeviantartExtractor): def __init__(self, match): DeviantartExtractor.__init__(self, match) - _, _, cid, self.cname = match.groups() - self.collection = {"owner": self.user, "index": cid} + self.collection = None + self.collection_id = match.group(3) + self.collection_name = match.group(4) def deviations(self): folders = self.api.collections_folders(self.user) - folder = self._find_folder(folders, self.cname) - self.collection["title"] = folder["name"] - self.collection["uuid"] = folder["folderid"] + folder = self._find_folder(folders, self.collection_name) + self.collection = { + "title": folder["name"], + "uuid" : folder["folderid"], + "index": self.collection_id, + "owner": self.user, + } return self.api.collections(self.user, folder["folderid"], self.offset) def prepare(self, deviation): @@ -565,7 +542,7 @@ class DeviantartJournalExtractor(DeviantartExtractor): """Extractor for an artist's journals""" subcategory = "journal" directory_fmt = ("{category}", "{username}", "Journal") - archive_fmt = "j_{username}_{index}.{extension}" + archive_fmt = "j_{_username}_{index}.{extension}" pattern = BASE_PATTERN + r"/(?:posts(?:/journals)?|journal)/?(?:\?.*)?$" test = ( ("https://www.deviantart.com/angrywhitewanker/posts/journals/", { @@ -635,8 +612,18 @@ class DeviantartPopularExtractor(DeviantartExtractor): class DeviantartExtractorV2(DeviantartExtractor): """Base class for deviantart extractors using the NAPI""" + cookiedomain = ".deviantart.com" + cookienames = ("auth", "auth_secure", "userinfo") + _warning = True def items(self): + if self.original and not self._check_cookies(self.cookienames): + self.original = False + if self._warning: + DeviantartExtractorV2._warning = False + self.log.warning("No session cookies set: " + "Disabling original file downloads.") + yield Message.Version, 1 for deviation in self.deviations(): data = self.api.deviation_extended_fetch( @@ -646,10 +633,14 @@ class DeviantartExtractorV2(DeviantartExtractor): ) if "deviation" not in data: - self.log.warning("Skipping ID %s", deviation["deviationId"]) + self.log.warning("Unable to fetch deviation ID %s", + deviation["deviationId"]) self.log.debug("Server response: %s", data) continue + deviation = self._extract(data) + if not deviation: + continue yield Message.Directory, deviation yield Message.Url, deviation["target"]["src"], deviation @@ -662,13 +653,14 @@ class DeviantartExtractorV2(DeviantartExtractor): def _extract(self, data): deviation = data["deviation"] extended = deviation["extended"] - files = deviation["files"] + media = deviation["media"] del deviation["extended"] - del deviation["files"] + del deviation["media"] # prepare deviation metadata deviation["description"] = extended.get("description", "") - deviation["username"] = self.user.lower() + deviation["username"] = deviation["author"]["username"] + deviation["_username"] = deviation["username"].lower() deviation["stats"] = extended["stats"] deviation["stats"]["comments"] = data["comments"]["total"] deviation["index"] = deviation["deviationId"] @@ -682,53 +674,69 @@ class DeviantartExtractorV2(DeviantartExtractor): ) # extract download target - target = files[-1] + target = media["types"][-1] + src = token = None - if "textContent" in deviation and self.commit_journal: + if "textContent" in deviation: + if not self.commit_journal: + return None journal = deviation["textContent"] journal["html"] = journal["html"]["markup"] - target["src"] = self.commit_journal(deviation, journal)[1] - elif target["type"] == "gif": - pass - elif target["type"] == "video": - # select largest video - target = max( - files, key=lambda x: text.parse_int(x.get("quality", "")[:-1])) - elif target["type"] == "flash": - if target["src"].startswith("https://sandbox.deviantart.com"): - # extract SWF file from "sandbox" - target["src"] = text.extract( - self.request(target["src"]).text, - 'id="sandboxembed" src="', '"', - )[0] - elif "download" in extended: + src = self.commit_journal(deviation, journal)[1] + + elif target["t"] == "gif": + src = target["b"] + token = media["token"][0] + + elif "download" in extended and self.original: target = extended["download"] - target["src"] = target["url"] + src = target["url"] del target["url"] - elif target["src"].startswith("https://images-wixmp-"): - if deviation["index"] <= 790677560: - # https://github.com/r888888888/danbooru/issues/4069 - target["src"] = re.sub( - r"(/f/[^/]+/[^/]+)/v\d+/.*", - r"/intermediary\1", target["src"]) - if self.quality: - target["src"] = re.sub( - r"q_\d+", self.quality, target["src"]) + + elif target["t"] == "video": + # select largest video + target = max(media["types"], + key=lambda x: text.parse_int(x.get("q", "")[:-1])) + src = target["s"] + + elif target["t"] == "flash": + src = target["s"] + if src.startswith("https://sandbox.deviantart.com"): + # extract SWF file from "sandbox" + src = text.extract( + self.request(src).text, 'id="sandboxembed" src="', '"')[0] + + else: + src = media["baseUri"] + if "token" in media: + token = media["token"][0] + + if "c" in target: + src += "/" + target["c"].replace( + "<prettyName>", media["prettyName"]) + if src.startswith("https://images-wixmp-"): + if deviation["index"] <= 790677560: + # https://github.com/r888888888/danbooru/issues/4069 + src = re.sub( + r"(/f/[^/]+/[^/]+)/v\d+/.*", r"/intermediary\1", src) + if self.quality: + src = re.sub(r"q_\d+", self.quality, src) # filename and extension metadata alphabet = "0123456789abcdefghijklmnopqrstuvwxyz" sub = re.compile(r"\W").sub - deviation["filename"] = target["filename"] = "".join(( + deviation["filename"] = "".join(( sub("_", deviation["title"].lower()), "_by_", sub("_", deviation["author"]["username"].lower()), "-d", util.bencode(deviation["index"], alphabet), )) if "extension" not in deviation: - deviation["extension"] = target["extension"] = ( - text.ext_from_url(target["src"]) - ) - deviation["target"] = target + deviation["extension"] = text.ext_from_url(src) + if token: + src = src + "?token=" + token + target["src"] = src + deviation["target"] = target return deviation @@ -740,19 +748,21 @@ class DeviantartDeviationExtractor(DeviantartExtractorV2): test = ( (("https://www.deviantart.com/shimoda7/art/For-the-sake-10073852"), { "options": (("original", 0),), - "content": "6a7c74dc823ebbd457bdd9b3c2838a6ee728091e", + # "content": "6a7c74dc823ebbd457bdd9b3c2838a6ee728091e", }), ("https://www.deviantart.com/zzz/art/zzz-1234567890", { "count": 0, }), (("https://www.deviantart.com/myria-moon/art/Aime-Moi-261986576"), { - "pattern": (r"https://www.deviantart.com/download/261986576" - r"/[\w-]+\.jpg\?token=\w+&ts=\d+"), + # "pattern": (r"https://www.deviantart.com/download/261986576" + # r"/[\w-]+\.jpg\?token=\w+&ts=\d+"), + "pattern": (r"https://images-wixmp-\w+\.wixmp\.com" + r"/intermediary/f/[^/]+/[^.]+\.jpg") }), # wixmp URL rewrite (("https://www.deviantart.com/citizenfresh/art/Hverarond-789295466"), { "pattern": (r"https://images-wixmp-\w+\.wixmp\.com" - r"/intermediary/f/[^/]+/[^.]+\.jpg$") + r"/intermediary/f/[^/]+/[^.]+\.jpg") }), # wixmp URL rewrite v2 (#369) (("https://www.deviantart.com/josephbiwald/art/Destiny-2-804940104"), { @@ -774,20 +784,21 @@ class DeviantartDeviationExtractor(DeviantartExtractorV2): ("https://www.deviantart.com/chi-u/art/-VIDEO-Brushes-330774593", { "url": "3b6e6e761d2d393fa61a4dc3ed6e7db51b14d07b", "keyword": { + "filename": r"re:_video____brushes_\w+_by_chi_u-d5gxnb5", + "extension": "mp4", "target": { - "duration": 306, - "extension": "mp4", - "filename": r"re:_video____brushes_\w+_by_chi_u-d5gxnb5", - "filesize": 9963639, - "quality": "1080p", + "d": 306, + "f": 9963639, + "q": "1080p", + "t": "video", "src": str, - "type": "video", }, } }), # archive ("https://www.deviantart.com/itsvenue/art/-brush-pngs-14-763300948", { - "pattern": r"https://.+deviantart.com/download/763300948/.*\.rar", + # "pattern": r"https://.+deviantart.com/download/763300948/.*rar", + "pattern": r"https://images-wixmp-\w+\.wixmp\.com/i/.*\.png" }), # swf ("https://www.deviantart.com/ikatxfruti/art/Bang-Bang-528130222", { @@ -830,7 +841,7 @@ class DeviantartScrapsExtractor(DeviantartExtractorV2): """Extractor for an artist's scraps""" subcategory = "scraps" directory_fmt = ("{category}", "{username}", "Scraps") - archive_fmt = "s_{username}_{index}.{extension}" + archive_fmt = "s_{_username}_{index}.{extension}" pattern = BASE_PATTERN + r"/gallery/(?:\?catpath=)?scraps\b" test = ( ("https://www.deviantart.com/shimoda7/gallery/scraps", { @@ -841,14 +852,6 @@ class DeviantartScrapsExtractor(DeviantartExtractorV2): ) def deviations(self): - # copy self.session - session = self.session.__class__() - for attr in session.__attrs__: - setattr(session, attr, getattr(self.session, attr, None)) - - # reset cookies in the original session object - self.session.cookies = session.cookies.__class__() - url = self.root + "/_napi/da-user-profile/api/gallery/contents" params = { "username" : self.user, @@ -861,8 +864,7 @@ class DeviantartScrapsExtractor(DeviantartExtractorV2): } while True: - data = self.request( - url, session=session, params=params, headers=headers).json() + data = self.request(url, params=params, headers=headers).json() for obj in data["results"]: yield obj["deviation"] diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py index 77a19f6..80db096 100644 --- a/gallery_dl/extractor/directlink.py +++ b/gallery_dl/extractor/directlink.py @@ -15,25 +15,30 @@ from .. import text class DirectlinkExtractor(Extractor): """Extractor for direct links to images and other media files""" category = "directlink" - filename_fmt = "{domain}/{path}" - archive_fmt = "{domain}/{path}" + filename_fmt = "{domain}/{path}/{filename}.{extension}" + archive_fmt = filename_fmt pattern = (r"(?i)https?://(?P<domain>[^/?&#]+)/(?P<path>[^?&#]+\." r"(?:jpe?g|jpe|png|gif|web[mp]|mp4|mkv|og[gmv]|opus))" r"(?:\?(?P<query>[^/?#]*))?(?:#(?P<fragment>.*))?$") test = ( (("https://en.wikipedia.org/static/images/project-logos/enwiki.png"), { "url": "18c5d00077332e98e53be9fed2ee4be66154b88d", - "keyword": "e81b9fe3022e971365dd859f38e4ef717a6c69ed", + "keyword": "105770a3f4393618ab7b811b731b22663b5d3794", + }), + # empty path + (("https://example.org/file.webm"), { + "url": "2d807ed7059d1b532f1bb71dc24b510b80ff943f", + "keyword": "29dad729c40fb09349f83edafa498dba1297464a", }), # more complex example - ("https://example.org/path/file.webm?que=1&ry=2#fragment", { - "url": "fd4aec8a32842343394e6078a06c3e6b647bf671", - "keyword": "ff75764b1ae66615b723a6357b8193fa2de84678", + ("https://example.org/path/to/file.webm?que=1&ry=2#fragment", { + "url": "114b8f1415cc224b0f26488ccd4c2e7ce9136622", + "keyword": "06014abd503e3b2b58aa286f9bdcefdd2ae336c0", }), # percent-encoded characters ("https://example.org/%27%3C%23/%23%3E%27.jpg?key=%3C%26%3E", { "url": "2627e8140727fdf743f86fe18f69f99a052c9718", - "keyword": "4d19dc12e41ffcb4cbec2013e335cf482377c35e", + "keyword": "831790fddda081bdddd14f96985ab02dc5b5341f", }), # upper case file extension (#296) ("https://post-phinf.pstatic.net/MjAxOTA1MjlfMTQ4/MDAxNTU5MTI2NjcyNTkw" @@ -46,11 +51,14 @@ class DirectlinkExtractor(Extractor): self.data = match.groupdict() def items(self): - text.nameext_from_url(self.url, self.data) - for key, value in self.data.items(): + data = self.data + for key, value in data.items(): if value: - self.data[key] = text.unquote(value) + data[key] = text.unquote(value) + data["path"], _, name = data["path"].rpartition("/") + data["filename"], _, ext = name.rpartition(".") + data["extension"] = ext.lower() yield Message.Version, 1 - yield Message.Directory, self.data - yield Message.Url, self.url, self.data + yield Message.Directory, data + yield Message.Url, self.url, data diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index b71fc4d..bd34bdb 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -280,18 +280,22 @@ class FlickrAPI(oauth.OAuth1API): API_KEY = "ac4fd7aa98585b9eee1ba761c209de68" API_SECRET = "3adb0f568dc68393" FORMATS = [ - ("o", "Original" , None), - ("k", "Large 2048" , 2048), - ("h", "Large 1600" , 1600), - ("l", "Large" , 1024), - ("c", "Medium 800" , 800), - ("z", "Medium 640" , 640), - ("m", "Medium" , 500), - ("n", "Small 320" , 320), - ("s", "Small" , 240), - ("q", "Large Square", 150), - ("t", "Thumbnail" , 100), - ("s", "Square" , 75), + ("o" , "Original" , None), + ("6k", "X-Large 6K" , 6144), + ("5k", "X-Large 5K" , 5120), + ("4k", "X-Large 4K" , 4096), + ("3k", "X-Large 3K" , 3072), + ("k" , "Large 2048" , 2048), + ("h" , "Large 1600" , 1600), + ("l" , "Large" , 1024), + ("c" , "Medium 800" , 800), + ("z" , "Medium 640" , 640), + ("m" , "Medium" , 500), + ("n" , "Small 320" , 320), + ("s" , "Small" , 240), + ("q" , "Large Square", 150), + ("t" , "Thumbnail" , 100), + ("s" , "Square" , 75), ] VIDEO_FORMATS = { "orig" : 9, @@ -325,7 +329,7 @@ class FlickrAPI(oauth.OAuth1API): if not fmt[2] or fmt[2] <= self.maxsize] else: self.formats = self.FORMATS - self.formats = self.formats[:4] + self.formats = self.formats[:8] def favorites_getList(self, user_id): """Returns a list of the user's favorite photos.""" diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 645b53a..428f3c3 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -20,7 +20,6 @@ class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor): subcategory = "thread" directory_fmt = ("{category}", "{board[shortname]}", "{thread_num}{title:? - //}") - filename_fmt = "{media[media]}" archive_fmt = "{board[shortname]}_{num}_{timestamp}" pattern_fmt = r"/([^/]+)/thread/(\d+)" external = "default" @@ -50,7 +49,8 @@ class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor): if url.startswith("/"): url = self.root + url - post["extension"] = url.rpartition(".")[2] + post["filename"], _, post["extension"] = \ + media["media"].rpartition(".") yield Message.Url, url, post def posts(self): diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index fc7dbf9..1f8c567 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -82,13 +82,18 @@ class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor): }) def images(self, page): + data = None + if self.decode == "base64": - base64_data = text.extract(page, 'atob("', '"')[0].encode() - data = base64.b64decode(base64_data).decode() + base64_data = text.extract(page, 'atob("', '"')[0] + if base64_data: + data = base64.b64decode(base64_data.encode()).decode() elif self.decode == "double": pos = page.find("[{") - data = text.extract(page, " = ", ";", pos)[0] - else: + if pos >= 0: + data = text.extract(page, " = ", ";", pos)[0] + + if not data: data = text.extract(page, "var pages = ", ";")[0] return json.loads(data) @@ -138,8 +143,8 @@ EXTRACTORS = { ("https://jaiminisbox.com/reader/read/uratarou/en/0/1/", { "keyword": "6009af77cc9c05528ab1fdda47b1ad9d4811c673", }), - ("https://jaiminisbox.com/reader/read/dr-stone/en/0/16/", { - "keyword": "8607375c24b1d0db7f52d059ef5baff793aa458e", + ("https://jaiminisbox.com/reader/read/red-storm/en/0/336/", { + "keyword": "53c6dddf3e5a61b6002a886ccd7e3354e973299a", }), ), "test-manga": diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index 4ec7f00..0c05a97 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -52,7 +52,7 @@ class GelbooruExtractor(booru.XmlParserMixin, page = self.request(self.post_url.format(post_id)).text data = text.extract_all(page, ( (None , '<meta name="keywords"', ''), - ("tags" , ' imageboard, ', '"'), + ("tags" , ' imageboard- ', '"'), ("id" , '<li>Id: ', '<'), ("created_at", '<li>Posted: ', '<'), ("width" , '<li>Size: ', 'x'), diff --git a/gallery_dl/extractor/hbrowse.py b/gallery_dl/extractor/hbrowse.py index 43479c6..181db9a 100644 --- a/gallery_dl/extractor/hbrowse.py +++ b/gallery_dl/extractor/hbrowse.py @@ -50,7 +50,7 @@ class HbrowseChapterExtractor(HbrowseBase, ChapterExtractor): pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/(\d+)/c(\d+))" test = ("https://www.hbrowse.com/10363/c00000", { "url": "6feefbc9f4b98e20d8425ddffa9dd111791dc3e6", - "keyword": "274996f6c809e5250b6ff3abbc5147e29f89d9a5", + "keyword": "6c1136522a25de013a6579ffa34dadc1eb0d4d1b", "content": "44578ebbe176c2c27434966aef22945787e2781e", }) @@ -78,7 +78,7 @@ class HbrowseMangaExtractor(HbrowseBase, MangaExtractor): pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/\d+)/?$" test = ("https://www.hbrowse.com/10363", { "url": "b89682bfb86c11d2af0dc47463804ec3ac4aadd6", - "keyword": "4b15fda1858a69de1fbf5afddfe47dd893397312", + "keyword": "08f5935a4411d2c19ac1786bd4ca552c3785fcae", }) def chapters(self, page): diff --git a/gallery_dl/extractor/hentaifox.py b/gallery_dl/extractor/hentaifox.py index 7e0b63c..84ad3af 100644 --- a/gallery_dl/extractor/hentaifox.py +++ b/gallery_dl/extractor/hentaifox.py @@ -23,8 +23,8 @@ class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor): pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))" test = ("https://hentaifox.com/gallery/56622/", { "pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg", + "keyword": "b7ff141331d0c7fc711ab28d45dfbb013a83d8e9", "count": 24, - "keyword": "903ebe227d85e484460382fc6cbab42be7a244d5", }) def __init__(self, match): @@ -37,19 +37,43 @@ class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor): return { "gallery_id": text.parse_int(self.gallery_id), "title" : text.unescape(extr("<h1>", "</h1>")), - "parody" : split(extr(">Parodies:" , "</a></span>"))[::2], - "characters": split(extr(">Characters:", "</a></span>"))[::2], - "tags" : split(extr(">Tags:" , "</a></span>"))[::2], - "artist" : split(extr(">Artists:" , "</a></span>"))[::2], - "group" : split(extr(">Groups:" , "</a></span>"))[::2], - "type" : text.remove_html(extr(">Category:", "</a></span>")), + "parody" : split(extr(">Parodies:" , "</ul>"))[::2], + "characters": split(extr(">Characters:", "</ul>"))[::2], + "tags" : split(extr(">Tags:" , "</ul>"))[::2], + "artist" : split(extr(">Artists:" , "</ul>"))[::2], + "group" : split(extr(">Groups:" , "</ul>"))[::2], + "type" : text.remove_html(extr(">Category:", "<span")), "language" : "English", "lang" : "en", } def images(self, page): + pos = page.find('id="load_all"') + if pos >= 0: + extr = text.extract + load_id = extr(page, 'id="load_id" value="', '"', pos)[0] + load_dir = extr(page, 'id="load_dir" value="', '"', pos)[0] + load_pages = extr(page, 'id="load_pages" value="', '"', pos)[0] + + url = self.root + "/includes/thumbs_loader.php" + data = { + "u_id" : self.gallery_id, + "g_id" : load_id, + "img_dir" : load_dir, + "visible_pages": "0", + "total_pages" : load_pages, + "type" : "2", + } + headers = { + "Origin": self.root, + "Referer": self.gallery_url, + "X-Requested-With": "XMLHttpRequest", + } + page = self.request( + url, method="POST", headers=headers, data=data).text + return [ - (text.urljoin(self.root, url.replace("t.", ".")), None) + (url.replace("t.", "."), None) for url in text.extract_iter(page, 'data-src="', '"') ] @@ -64,15 +88,13 @@ class HentaifoxSearchExtractor(HentaifoxBase, Extractor): ("https://hentaifox.com/character/reimu-hakurei/"), ("https://hentaifox.com/artist/distance/"), ("https://hentaifox.com/search/touhou/"), - ("https://hentaifox.com/tag/full-colour/", { + ("https://hentaifox.com/tag/heterochromia/", { "pattern": HentaifoxGalleryExtractor.pattern, - "count": ">= 40", + "count": ">= 60", "keyword": { - "url": str, + "url" : str, "gallery_id": int, - "thumbnail": r"re:https://i\d*.hentaifox.com/\d+/\d+/thumb\.", - "title": str, - "tags": list, + "title" : str, }, }), ) @@ -87,31 +109,26 @@ class HentaifoxSearchExtractor(HentaifoxBase, Extractor): yield Message.Queue, gallery["url"], gallery def galleries(self): - url = "{}/{}/".format(self.root, self.path) + num = 1 while True: + url = "{}{}/pag/{}/".format(self.root, self.path, num) page = self.request(url).text - info, gpos = text.extract( - page, 'class="galleries_overview">', 'class="clear">') - for ginfo in text.extract_iter(info, '<div class="item', '</a>'): - tags , pos = text.extract(ginfo, '', '"') - url , pos = text.extract(ginfo, 'href="', '"', pos) - title, pos = text.extract(ginfo, 'alt="', '"', pos) - thumb, pos = text.extract(ginfo, 'src="', '"', pos) + for info in text.extract_iter( + page, 'class="g_title"><a href="', '</a>'): + url, _, title = info.partition('">') yield { - "url": text.urljoin(self.root, url), + "url" : text.urljoin(self.root, url), "gallery_id": text.parse_int( url.strip("/").rpartition("/")[2]), - "thumbnail": text.urljoin(self.root, thumb), - "title": text.unescape(title), - "tags": tags.split(), + "title" : text.unescape(title), "_extractor": HentaifoxGalleryExtractor, } - pos = page.find('class="current"', gpos) - url = text.extract(page, 'href="', '"', pos)[0] + pos = page.find(">Next<") + url = text.rextract(page, "href=", ">", pos)[0] if pos == -1 or "/pag" not in url: return - url = text.urljoin(self.root, url) + num += 1 diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py index 9e2ee9f..193cadf 100644 --- a/gallery_dl/extractor/hentainexus.py +++ b/gallery_dl/extractor/hentainexus.py @@ -22,7 +22,7 @@ class HentainexusGalleryExtractor(GalleryExtractor): test = ( ("https://hentainexus.com/view/5688", { "url": "746d0043e20030f1171aae5ea113176607302517", - "keyword": "9512cf5f258130e5f75de9954d7a13217c2405e7", + "keyword": "c1b7091e2bc2f733f6401711e072ad11cf93dd69", }), ("https://hentainexus.com/read/5688"), ) diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py index 152b631..d6eea7f 100644 --- a/gallery_dl/extractor/imagefap.py +++ b/gallery_dl/extractor/imagefap.py @@ -33,13 +33,13 @@ class ImagefapGalleryExtractor(ImagefapExtractor): r"(?:gallery\.php\?gid=|gallery/|pictures/)(\d+)") test = ( ("https://www.imagefap.com/pictures/7102714", { - "url": "268995eac5d01ddecd0fe58cfa9828390dc85a84", - "keyword": "b5bd65ab2ff574ed1639db9a43c7b1b8583c85ef", + "pattern": r"https://cdn.imagefap.com/images/full/\d+/\d+/\d+.jpg", + "keyword": "2ba96e84c2952c4750e9fa94a3f2b1f965cec2f3", "content": "694a0a57385980a6f90fbc296cadcd6c11ba2dab", }), ("https://www.imagefap.com/gallery/5486966", { - "url": "14906b4f0b8053d1d69bc730a325acb793cbc898", - "keyword": "ab90972f3527a2011478fabc621a2c99a541f752", + "pattern": r"https://cdn.imagefap.com/images/full/\d+/\d+/\d+.jpg", + "keyword": "3e24eace5b09639b881ebd393165862feb46adde", }), ("https://www.imagefap.com/gallery.php?gid=7102714"), ) @@ -89,9 +89,10 @@ class ImagefapGalleryExtractor(ImagefapExtractor): if not imgurl: return num += 1 - _, imgid, name = imgurl.rsplit("/", 2) - data = {"image_id": text.parse_int(imgid), "num": num} - yield imgurl, text.nameext_from_url(name, data) + data = text.nameext_from_url(imgurl) + data["num"] = num + data["image_id"] = text.parse_int(data["filename"]) + yield imgurl, data params["idx"] += 24 @@ -100,8 +101,8 @@ class ImagefapImageExtractor(ImagefapExtractor): subcategory = "image" pattern = r"(?:https?://)?(?:www\.)?imagefap\.com/photo/(\d+)" test = ("https://www.imagefap.com/photo/1369341772/", { - "url": "b31ee405b61ff0450020a1bf11c0581ca9adb471", - "keyword": "eadaa8f8012298384996efd21cf1f9e9e0dddb9b", + "pattern": r"https://cdn.imagefap.com/images/full/\d+/\d+/\d+.jpg", + "keyword": "8894e45f7262020d8d66ce59917315def1fc475b", }) def __init__(self, match): @@ -109,27 +110,32 @@ class ImagefapImageExtractor(ImagefapExtractor): self.image_id = match.group(1) def items(self): - data = self.get_job_metadata() + url, data = self.get_image() yield Message.Version, 1 yield Message.Directory, data - yield Message.Url, data["url"], data + yield Message.Url, url, data - def get_job_metadata(self): - """Collect metadata for extractor-job""" + def get_image(self): url = "{}/photo/{}/".format(self.root, self.image_id) page = self.request(url).text - info = json.loads(text.extract( - page, '<script type="application/ld+json">', '</script>')[0]) - parts = info["contentUrl"].rsplit("/", 3) - return text.nameext_from_url(parts[3], { - "url": info["contentUrl"], + + info, pos = text.extract( + page, '<script type="application/ld+json">', '</script>') + image_id, pos = text.extract( + page, 'id="imageid_input" value="', '"', pos) + gallery_id, pos = text.extract( + page, 'id="galleryid_input" value="', '"', pos) + info = json.loads(info) + url = info["contentUrl"] + + return url, text.nameext_from_url(url, { "title": text.unescape(info["name"]), "uploader": info["author"], "date": info["datePublished"], "width": text.parse_int(info["width"]), "height": text.parse_int(info["height"]), - "gallery_id": text.parse_int(parts[1]), - "image_id": text.parse_int(parts[2]), + "gallery_id": text.parse_int(gallery_id), + "image_id": text.parse_int(image_id), }) diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index 954c1f0..4015bfd 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -154,10 +154,10 @@ class ImagetwistImageExtractor(ImagehostImageExtractor): """Extractor for single images from imagetwist.com""" category = "imagetwist" pattern = r"(?:https?://)?((?:www\.)?imagetwist\.com/([a-z0-9]{12}))" - test = ("https://imagetwist.com/4e46hv31tu0q/test.jpg", { - "url": "c999dc1a5dec0525ac9eb8c092f173dfe6dba0b0", - "keyword": "a9f2e01757ec96d4ee4752cbd8446ede80f7935e", - "content": "96b1fd099b06faad5879fce23a7e4eb8290d8810", + test = ("https://imagetwist.com/f1i2s4vhvbrq/test.png", { + "url": "8d5e168c0bee30211f821c6f3b2116e419d42671", + "keyword": "d1060a4c2e3b73b83044e20681712c0ffdd6cfef", + "content": "0c8768055e4e20e7c7259608b67799171b691140", }) https = True params = None @@ -199,9 +199,9 @@ class PixhostImageExtractor(ImagehostImageExtractor): category = "pixhost" pattern = (r"(?:https?://)?((?:www\.)?pixhost\.(?:to|org)" r"/show/\d+/(\d+)_[^/?&#]+)") - test = ("https://pixhost.to/show/224/96246707_test-.png", { - "url": "8f3d41fdd2dbec4c844e5ee45bf49961fbd79c67", - "keyword": "ecefe2d5814286f9d1dff3d88d9bdc78dd456c5d", + test = ("http://pixhost.to/show/190/130327671_test-.png", { + "url": "4e5470dcf6513944773044d40d883221bbc46cff", + "keyword": "3bad6d59db42a5ebbd7842c2307e1c3ebd35e6b0", "content": "0c8768055e4e20e7c7259608b67799171b691140", }) https = True diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py index fb321d0..15152b7 100644 --- a/gallery_dl/extractor/imgbb.py +++ b/gallery_dl/extractor/imgbb.py @@ -39,7 +39,7 @@ class ImgbbExtractor(Extractor): for img in self.images(page): image = { "id" : img["url_viewer"].rpartition("/")[2], - "user" : img["user"]["username"], + "user" : img["user"]["username"] if "user" in img else "", "title" : text.unescape(img["title"]), "url" : img["image"]["url"], "extension": img["image"]["extension"], @@ -79,8 +79,15 @@ class ImgbbExtractor(Extractor): return self.session.cookies def _pagination(self, page, endpoint, params): - params["page"] = 2 data = None + seek, pos = text.extract(page, 'data-seek="', '"') + tokn, pos = text.extract(page, 'PF.obj.config.auth_token="', '"', pos) + params["action"] = "list" + params["list"] = "images" + params["sort"] = self.sort + params["seek"] = seek + params["page"] = 2 + params["auth_token"] = tokn while True: for img in text.extract_iter(page, "data-object='", "'"): @@ -90,6 +97,8 @@ class ImgbbExtractor(Extractor): return params["seek"] = data["seekEnd"] params["page"] += 1 + elif not seek or 'class="pagination-next"' not in page: + return data = self.request(endpoint, method="POST", data=params).json() page = data["html"] @@ -110,6 +119,11 @@ class ImgbbAlbumExtractor(ImgbbExtractor): "url": "e2e387b8fdb3690bd75d804d0af2833112e385cd", "keyword": "a307fc9d2085bdc0eb7c538c8d866c59198d460c", }), + # no user data (#471) + ("https://ibb.co/album/kYKpwF", { + "url": "ac0abcfcb89f4df6adc2f7e4ff872f3b03ef1bc7", + "keyword": {"user": ""}, + }), # deleted ("https://ibb.co/album/fDArrF", { "exception": exception.NotFoundError, @@ -133,21 +147,13 @@ class ImgbbAlbumExtractor(ImgbbExtractor): return { "album_id" : self.album_id, "album_name": text.unescape(album), - "user" : user.lower(), + "user" : user.lower() if user else "", } def images(self, page): - seek, pos = text.extract(page, 'data-seek="', '"') - tokn, pos = text.extract(page, 'PF.obj.config.auth_token="', '"', pos) - return self._pagination(page, "https://ibb.co/json", { - "action" : "list", - "list" : "images", "from" : "album", - "sort" : self.sort, "albumid" : self.album_id, - "seek" : seek, - "auth_token": tokn, "params_hidden[list]" : "images", "params_hidden[from]" : "album", "params_hidden[albumid]": self.album_id, @@ -173,18 +179,10 @@ class ImgbbUserExtractor(ImgbbExtractor): return {"user": self.user} def images(self, page): - seek, pos = text.extract(page, 'data-seek="', '"') - tokn, pos = text.extract(page, 'PF.obj.config.auth_token="', '"', pos) - user, pos = text.extract(page, '.obj.resource={"id":"', '"', pos) - + user = text.extract(page, '.obj.resource={"id":"', '"')[0] return self._pagination(page, self.page_url + "json", { - "action" : "list", - "list" : "images", "from" : "user", - "sort" : self.sort, - "seek" : seek, "userid" : user, - "auth_token": tokn, "params_hidden[userid]": user, "params_hidden[from]" : "user", }) diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index b1be995..ce3e1ce 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -116,8 +116,8 @@ class ImgurImageExtractor(ImgurExtractor): image = self.api.image(self.key) if not image["title"]: page = self.request(self.root + "/" + self.key, fatal=False).text - title = text.extract(page, "<title>", "<")[0] - image["title"] = (title or "").rpartition(" - ")[0].strip() + title = text.extract(page, "<title>", "<")[0] or "" + image["title"] = text.unescape(title.rpartition(" - ")[0].strip()) url = self._prepare(image) yield Message.Version, 1 yield Message.Directory, image @@ -280,6 +280,20 @@ class ImgurFavoriteExtractor(ImgurExtractor): return self._items_queue(self.api.account_favorites(self.key)) +class ImgurSubredditExtractor(ImgurExtractor): + """Extractor for a subreddits's imgur links""" + subcategory = "subreddit" + pattern = BASE_PATTERN + r"/r/([^/?&#]+)" + test = ("https://imgur.com/r/pics", { + "range": "1-100", + "count": 100, + "pattern": r"https?://(i.imgur.com|imgur.com/a)/[\w.]+", + }) + + def items(self): + return self._items_queue(self.api.gallery_subreddit(self.key)) + + class ImgurAPI(): def __init__(self, extractor): @@ -297,6 +311,10 @@ class ImgurAPI(): endpoint = "account/{}/submissions".format(account) return self._pagination(endpoint) + def gallery_subreddit(self, subreddit): + endpoint = "gallery/r/{}".format(subreddit) + return self._pagination(endpoint) + def album(self, album_hash): return self._call("album/" + album_hash) diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index a14225f..05adac1 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -31,6 +31,7 @@ class InstagramExtractor(Extractor): self.login() yield Message.Version, 1 + videos = self.config("videos", True) metadata = self.get_metadata() for data in self.instagrams(): data.update(metadata) @@ -41,7 +42,11 @@ class InstagramExtractor(Extractor): data['_extractor'] = InstagramStoriesExtractor yield Message.Queue, url, data else: - url = data['video_url'] or data['display_url'] + url = data.get('video_url') + if not url: + url = data['display_url'] + elif not videos: + continue yield Message.Url, url, text.nameext_from_url(url, data) def login(self): @@ -109,8 +114,14 @@ class InstagramExtractor(Extractor): return data def _extract_postpage(self, url): - data = self.request(url + "?__a=1").json() - media = data['graphql']['shortcode_media'] + try: + with self.request(url + '?__a=1', fatal=False) as response: + media = response.json()['graphql']['shortcode_media'] + except (KeyError, ValueError) as exc: + self.log.warning("Unable to fetch data from '%s': %s: %s", + url, exc.__class__.__name__, exc) + self.log.debug("Server response: %s", response.text) + return () common = { 'date': text.parse_timestamp(media['taken_at_timestamp']), @@ -199,10 +210,10 @@ class InstagramExtractor(Extractor): 'expires': text.parse_timestamp(media['expiring_at_timestamp']), 'media_id': media['id'], 'typename': media['__typename'], + 'display_url': media['display_url'], } if media['__typename'] == 'GraphStoryImage': media_data.update({ - 'display_url': media['display_url'], 'height': text.parse_int(media['dimensions']['height']), 'width': text.parse_int(media['dimensions']['width']), }) @@ -210,7 +221,7 @@ class InstagramExtractor(Extractor): vr = media['video_resources'][0] media_data.update({ 'duration': text.parse_float(media['video_duration']), - 'display_url': vr['src'], + 'video_url': vr['src'], 'height': text.parse_int(vr['config_height']), 'width': text.parse_int(vr['config_width']), }) @@ -292,7 +303,7 @@ class InstagramImageExtractor(InstagramExtractor): # GraphImage ("https://www.instagram.com/p/BqvsDleB3lV/", { "pattern": r"https://[^/]+\.(cdninstagram\.com|fbcdn\.net)" - r"/vp/[0-9a-f]+/[0-9A-F]+/t51.2885-15/e35" + r"/v(p/[0-9a-f]+/[0-9A-F]+)?/t51.2885-15/e35" r"/44877605_725955034447492_3123079845831750529_n.jpg", "keyword": { "date": "type:datetime", diff --git a/gallery_dl/extractor/kissmanga.py b/gallery_dl/extractor/kissmanga.py index bb89f93..7151de0 100644 --- a/gallery_dl/extractor/kissmanga.py +++ b/gallery_dl/extractor/kissmanga.py @@ -19,9 +19,9 @@ import re class RedirectMixin(): """Detect and handle redirects to CAPTCHA pages""" - def request(self, url): + def request(self, url, **kwargs): while True: - response = Extractor.request(self, url) + response = Extractor.request(self, url, **kwargs) if not response.history or "/AreYouHuman" not in response.url: return response if self.config("captcha", "stop") == "wait": diff --git a/gallery_dl/extractor/livedoor.py b/gallery_dl/extractor/livedoor.py index e922f61..e47b7db 100644 --- a/gallery_dl/extractor/livedoor.py +++ b/gallery_dl/extractor/livedoor.py @@ -64,7 +64,7 @@ class LivedoorExtractor(Extractor): if not src: continue if "://livedoor.blogimg.jp/" in src: - url = src.replace("-s.", ".") + url = src.replace("http:", "https:", 1).replace("-s.", ".") else: url = text.urljoin(self.root, src) name, _, ext = url.rpartition("/")[2].rpartition(".") @@ -131,16 +131,16 @@ class LivedoorPostExtractor(LivedoorExtractor): pattern = r"(?:https?://)?blog\.livedoor\.jp/(\w+)/archives/(\d+)" test = ( ("http://blog.livedoor.jp/zatsu_ke/archives/51493859.html", { - "url": "8826fe623f19dc868e7538e8519bf8491e92a0a2", - "keyword": "83993111d5d0c08d021196802dd36b73f04c7057", + "url": "9ca3bbba62722c8155be79ad7fc47be409e4a7a2", + "keyword": "1f5b558492e0734f638b760f70bfc0b65c5a97b9", }), ("http://blog.livedoor.jp/amaumauma/archives/7835811.html", { - "url": "fc1d6a9557245b5a27d3a10bf0fa9922ef377215", - "keyword": "fd700760c98897c3125328e157972f905fd34aaa", + "url": "204bbd6a9db4969c50e0923855aeede04f2e4a62", + "keyword": "05821c7141360e6057ef2d382b046f28326a799d", }), ("http://blog.livedoor.jp/uotapo/archives/1050616939.html", { - "url": "3f3581807ec4776e6a67ed7985a22494d4bc4904", - "keyword": "9e319413a42e08d32f0dcbe8aa3b452ad41aa906", + "url": "4b5ab144b7309eb870d9c08f8853d1abee9946d2", + "keyword": "84fbf6e4eef16675013d6333039a7cfcb22c2d50", }), ) diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py index 4ad8da2..114a48e 100644 --- a/gallery_dl/extractor/mangoxo.py +++ b/gallery_dl/extractor/mangoxo.py @@ -72,8 +72,8 @@ class MangoxoAlbumExtractor(MangoxoExtractor): "url": "ad921fe62663b06e7d73997f7d00646cab7bdd0d", "keyword": { "channel": { - "id": "QeYKRkO0", - "name": "美女图社", + "id": "Jpw9ywQ4", + "name": "绘画艺术赏析", "cover": str, }, "album": { diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index 28a2c2d..a325264 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -31,8 +31,8 @@ class MastodonExtractor(Extractor): if value is not sentinel: return value return config.interpolate( - ("extractor", "mastodon", self.instance, self.subcategory, key), - default, + ("extractor", "mastodon", self.instance, self.subcategory), + key, default, ) def items(self): @@ -145,10 +145,10 @@ def generate_extractors(): """Dynamically generate Extractor classes for Mastodon instances""" symtable = globals() - extractors = config.get(("extractor", "mastodon")) + extractors = config.get(("extractor",), "mastodon") if extractors: EXTRACTORS.update(extractors) - config.set(("extractor", "mastodon"), EXTRACTORS) + config.set(("extractor",), "mastodon", EXTRACTORS) for instance, info in EXTRACTORS.items(): diff --git a/gallery_dl/extractor/message.py b/gallery_dl/extractor/message.py index 1831620..088fdd6 100644 --- a/gallery_dl/extractor/message.py +++ b/gallery_dl/extractor/message.py @@ -52,3 +52,4 @@ class Message(): # Cookies = 5 Queue = 6 Urllist = 7 + Metadata = 8 diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 1ca1073..5454e52 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -9,43 +9,107 @@ """Extractors for https://www.newgrounds.com/""" from .common import Extractor, Message -from .. import text +from .. import text, exception +from ..cache import cache import json class NewgroundsExtractor(Extractor): """Base class for newgrounds extractors""" category = "newgrounds" - directory_fmt = ("{category}", "{user}") + directory_fmt = ("{category}", "{artist[:10]:J, }") filename_fmt = "{category}_{index}_{title}.{extension}" archive_fmt = "{index}" + root = "https://www.newgrounds.com" + cookiedomain = ".newgrounds.com" + cookienames = ("NG_GG_username", "vmk1du5I8m") def __init__(self, match): Extractor.__init__(self, match) self.user = match.group(1) - self.root = "https://{}.newgrounds.com".format(self.user) + self.user_root = "https://{}.newgrounds.com".format(self.user) def items(self): - data = self.get_metadata() + self.login() yield Message.Version, 1 - yield Message.Directory, data - for page_url in self.get_page_urls(): - image = self.parse_page_data(page_url) - image.update(data) - url = image["url"] - yield Message.Url, url, text.nameext_from_url(url, image) + for post_url in self.posts(): + try: + file = self.extract_post(post_url) + url = file["url"] + # except Exception: + except OSError: + url = None + if not url: + self.log.warning("Unable to get download URL for %s", post_url) + continue + yield Message.Directory, file + yield Message.Url, url, text.nameext_from_url(url, file) - def get_metadata(self): - """Collect metadata for extractor-job""" - return {"user": self.user} - - def get_page_urls(self): + def posts(self): """Return urls of all relevant image pages""" + return self._pagination(self.subcategory) + + def login(self): + username, password = self._get_auth_info() + if username: + self._update_cookies(self._login_impl(username, password)) + + @cache(maxage=360*24*3600, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + + url = self.root + "/passport/" + page = self.request(url).text + headers = {"Origin": self.root, "Referer": url} + + url = text.urljoin(self.root, text.extract(page, 'action="', '"')[0]) + data = { + "username": username, + "password": password, + "remember": "1", + "login" : "1", + } + + response = self.request(url, method="POST", headers=headers, data=data) + if not response.history: + raise exception.AuthenticationError() + + return { + cookie.name: cookie.value + for cookie in response.history[0].cookies + if cookie.expires and cookie.domain == self.cookiedomain + } + + def extract_post(self, post_url): + page = self.request(post_url).text + extr = text.extract_from(page) + + if "/art/view/" in post_url: + data = self._extract_image_data(extr, post_url) + elif "/audio/listen/" in post_url: + data = self._extract_audio_data(extr, post_url) + else: + data = self._extract_media_data(extr, post_url) + + data["comment"] = text.unescape(text.remove_html(extr( + 'id="author_comments">', '</div>'), "", "")) + data["favorites"] = text.parse_int(extr( + 'id="faves_load">', '<').replace(",", "")) + data["score"] = text.parse_float(extr('id="score_number">', '<')) + data["tags"] = text.split_html(extr( + '<dd class="tags momag">', '</dd>')) + data["artist"] = [ + text.extract(user, '//', '.')[0] + for user in text.extract_iter(page, '<div class="item-user">', '>') + ] + + data["tags"].sort() + data["user"] = self.user or data["artist"][0] + return data - def parse_page_data(self, page_url): - """Collect url and metadata from an image page""" - extr = text.extract_from(self.request(page_url).text) + @staticmethod + def _extract_image_data(extr, url): full = text.extract_from(json.loads(extr('"full_image_text":', '});'))) data = { "title" : text.unescape(extr('"og:title" content="', '"')), @@ -53,53 +117,68 @@ class NewgroundsExtractor(Extractor): "date" : text.parse_datetime(extr( 'itemprop="datePublished" content="', '"')), "rating" : extr('class="rated-', '"'), - "favorites" : text.parse_int(extr('id="faves_load">', '<')), - "score" : text.parse_float(extr('id="score_number">', '<')), - "tags" : text.split_html(extr( - '<dd class="tags momag">', '</dd>')), "url" : full('src="', '"'), "width" : text.parse_int(full('width="', '"')), "height" : text.parse_int(full('height="', '"')), } - data["tags"].sort() data["index"] = text.parse_int( data["url"].rpartition("/")[2].partition("_")[0]) return data - def _pagination(self, url): + @staticmethod + def _extract_audio_data(extr, url): + return { + "title" : text.unescape(extr('"og:title" content="', '"')), + "description": text.unescape(extr(':description" content="', '"')), + "date" : text.parse_datetime(extr( + 'itemprop="datePublished" content="', '"')), + "url" : extr('{"url":"', '"').replace("\\/", "/"), + "index" : text.parse_int(url.split("/")[5]), + "rating" : "", + } + + @staticmethod + def _extract_media_data(extr, url): + return { + "title" : text.unescape(extr('"og:title" content="', '"')), + "url" : extr('{"url":"', '"').replace("\\/", "/"), + "date" : text.parse_datetime(extr( + 'itemprop="datePublished" content="', '"')), + "description": text.unescape(extr( + 'itemprop="description" content="', '"')), + "rating" : extr('class="rated-', '"'), + "index" : text.parse_int(url.split("/")[5]), + } + + def _pagination(self, kind): + root = self.user_root headers = { - "Referer": self.root, - "X-Requested-With": "XMLHttpRequest", "Accept": "application/json, text/javascript, */*; q=0.01", + "X-Requested-With": "XMLHttpRequest", + "Referer": root, } + url = "{}/{}/page/1".format(root, kind) while True: - data = self.request(url, headers=headers).json() + with self.request(url, headers=headers, fatal=False) as response: + try: + data = response.json() + except ValueError: + return + if not data: + return + if "errors" in data: + msg = ", ".join(text.unescape(e) for e in data["errors"]) + raise exception.StopExtraction(msg) for year in data["sequence"]: for item in data["years"][str(year)]["items"]: page_url = text.extract(item, 'href="', '"')[0] - yield text.urljoin(self.root, page_url) + yield text.urljoin(root, page_url) if not data["more"]: return - url = text.urljoin(self.root, data["more"]) - - -class NewgroundsUserExtractor(NewgroundsExtractor): - """Extractor for all images of a newgrounds user""" - subcategory = "user" - pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com(?:/art)?/?$" - test = ( - ("https://blitzwuff.newgrounds.com/art", { - "url": "24b19c4a135a09889fac7b46a74e427e4308d02b", - "keyword": "62981f7bdd66e1f1c72ab1d9b932423c156bc9a1", - }), - ("https://blitzwuff.newgrounds.com/"), - ) - - def get_page_urls(self): - return self._pagination(self.root + "/art/page/1") + url = text.urljoin(root, data["more"]) class NewgroundsImageExtractor(NewgroundsExtractor): @@ -109,14 +188,28 @@ class NewgroundsImageExtractor(NewgroundsExtractor): r"(?:www\.)?newgrounds\.com/art/view/([^/?&#]+)/[^/?&#]+" r"|art\.ngfiles\.com/images/\d+/\d+_([^_]+)_([^.]+))") test = ( - ("https://www.newgrounds.com/art/view/blitzwuff/ffx", { - "url": "e7778c4597a2fb74b46e5f04bb7fa1d80ca02818", - "keyword": "cbe90f8f32da4341938f59b08d70f76137028a7e", - "content": "cb067d6593598710292cdd340d350d14a26fe075", + ("https://www.newgrounds.com/art/view/tomfulp/ryu-is-hawt", { + "url": "57f182bcbbf2612690c3a54f16ffa1da5105245e", + "content": "8f395e08333eb2457ba8d8b715238f8910221365", + "keyword": { + "artist" : ["tomfulp"], + "comment" : "re:Consider this the bottom threshold for ", + "date" : "type:datetime", + "description": "re:Consider this the bottom threshold for ", + "favorites" : int, + "filename" : "94_tomfulp_ryu-is-hawt", + "height" : 476, + "index" : 94, + "rating" : "e", + "score" : float, + "tags" : ["ryu", "streetfighter"], + "title" : "Ryu is Hawt", + "user" : "tomfulp", + "width" : 447, + }, }), - ("https://art.ngfiles.com/images/587000/587551_blitzwuff_ffx.png", { - "url": "e7778c4597a2fb74b46e5f04bb7fa1d80ca02818", - "keyword": "cbe90f8f32da4341938f59b08d70f76137028a7e", + ("https://art.ngfiles.com/images/0/94_tomfulp_ryu-is-hawt.gif", { + "url": "57f182bcbbf2612690c3a54f16ffa1da5105245e", }), ) @@ -124,30 +217,120 @@ class NewgroundsImageExtractor(NewgroundsExtractor): NewgroundsExtractor.__init__(self, match) if match.group(2): self.user = match.group(2) - self.page_url = "https://www.newgrounds.com/art/view/{}/{}".format( + self.post_url = "https://www.newgrounds.com/art/view/{}/{}".format( self.user, match.group(3)) else: - self.page_url = match.group(0) + url = match.group(0) + if not url.startswith("http"): + url = "https://" + url + self.post_url = url + + def posts(self): + return (self.post_url,) + + +class NewgroundsMediaExtractor(NewgroundsExtractor): + """Extractor for a media file from newgrounds.com""" + subcategory = "media" + pattern = (r"(?:https?://)?(?:www\.)?newgrounds\.com" + r"(/(?:portal/view|audio/listen)/\d+)") + test = ( + ("https://www.newgrounds.com/portal/view/589549", { + "url": "48d916d819c99139e6a3acbbf659a78a867d363e", + "content": "ceb865426727ec887177d99e0d20bb021e8606ae", + "keyword": { + "artist" : ["psychogoldfish", "tomfulp"], + "comment" : "re:People have been asking me how I like the ", + "date" : "type:datetime", + "description": "re:People have been asking how I like the ", + "favorites" : int, + "filename" : "527818_alternate_1896", + "index" : 589549, + "rating" : "t", + "score" : float, + "tags" : ["newgrounds", "psychogoldfish", + "rage", "redesign-2012"], + "title" : "Redesign Rage", + "user" : "psychogoldfish", + }, + }), + ("https://www.newgrounds.com/audio/listen/609768", { + "url": "f4c5490ae559a3b05e46821bb7ee834f93a43c95", + "keyword": { + "artist" : ["zj", "tomfulp"], + "comment" : "re:RECORDED 12-09-2014\n\nFrom The ZJ \"Late ", + "date" : "type:datetime", + "description": "From The ZJ Report Show!", + "favorites" : int, + "index" : 609768, + "rating" : "", + "score" : float, + "tags" : ["fulp", "interview", "tom", "zj"], + "title" : "ZJ Interviews Tom Fulp!", + "user" : "zj", + }, + }), + ) + + def __init__(self, match): + NewgroundsExtractor.__init__(self, match) + self.user = "" + self.post_url = self.root + match.group(1) - def get_page_urls(self): - return (self.page_url,) + def posts(self): + return (self.post_url,) -class NewgroundsVideoExtractor(NewgroundsExtractor): - """Extractor for all videos of a newgrounds user""" - subcategory = "video" - filename_fmt = "{category}_{index}.{extension}" +class NewgroundsArtExtractor(NewgroundsExtractor): + """Extractor for all images of a newgrounds user""" + subcategory = "art" + pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com/art/?$" + test = ("https://tomfulp.newgrounds.com/art", { + "pattern": NewgroundsImageExtractor.pattern, + "count": ">= 3", + }) + + +class NewgroundsAudioExtractor(NewgroundsExtractor): + """Extractor for all audio submissions of a newgrounds user""" + subcategory = "audio" + pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com/audio/?$" + test = ("https://tomfulp.newgrounds.com/audio", { + "pattern": r"https://audio.ngfiles.com/\d+/\d+_.+\.mp3", + "count": ">= 4", + }) + + +class NewgroundsMoviesExtractor(NewgroundsExtractor): + """Extractor for all movies of a newgrounds user""" + subcategory = "movies" pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com/movies/?$" test = ("https://tomfulp.newgrounds.com/movies", { - "pattern": r"ytdl:https?://www\.newgrounds\.com/portal/view/\d+", - "count": ">= 32", + "pattern": r"https://uploads.ungrounded.net(/alternate)?/\d+/\d+_.+", + "range": "1-10", + "count": 10, }) - def get_page_urls(self): - return self._pagination(self.root + "/movies/page/1") - def parse_page_data(self, page_url): - return { - "url" : "ytdl:" + page_url, - "index": text.parse_int(page_url.rpartition("/")[2]), - } +class NewgroundsUserExtractor(NewgroundsExtractor): + """Extractor for a newgrounds user profile""" + subcategory = "user" + pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com/?$" + test = ( + ("https://tomfulp.newgrounds.com", { + "pattern": "https://tomfulp.newgrounds.com/art$", + }), + ("https://tomfulp.newgrounds.com", { + "options": (("include", "all"),), + "pattern": "https://tomfulp.newgrounds.com/(art|audio|movies)$", + "count": 3, + }), + ) + + def items(self): + base = self.user_root + "/" + return self._dispatch_extractors(( + (NewgroundsArtExtractor , base + "art"), + (NewgroundsAudioExtractor , base + "audio"), + (NewgroundsMoviesExtractor, base + "movies"), + ), ("art",)) diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index 0bd858f..aae17a3 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -69,8 +69,8 @@ class NijieExtractor(AsynchronousMixin, Extractor): "description": text.unescape(extr( '"description": "', '"').replace("&", "&")), "date" : text.parse_datetime(extr( - '"datePublished": "', '"')[:-4] + "+0900", - "%a %d %b %Y %I:%M:%S %p%z"), + '"datePublished": "', '"') + "+0900", + "%a %b %d %H:%M:%S %Y%z"), "artist_id" : text.parse_int(extr( '"sameAs": "https://nijie.info/members.php?id=', '"')), "artist_name": keywords[1], diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index 912447b..74835bf 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -15,11 +15,14 @@ from ..cache import cache import os import urllib.parse +REDIRECT_URI_LOCALHOST = "http://localhost:6414/" +REDIRECT_URI_HTTPS = "https://mikf.github.io/gallery-dl/oauth-redirect.html" + class OAuthBase(Extractor): """Base class for OAuth Helpers""" category = "oauth" - redirect_uri = "http://localhost:6414/" + redirect_uri = REDIRECT_URI_LOCALHOST def __init__(self, match): Extractor.__init__(self, match) @@ -27,7 +30,7 @@ class OAuthBase(Extractor): def oauth_config(self, key, default=None): return config.interpolate( - ("extractor", self.subcategory, key), default) + ("extractor", self.subcategory), key, default) def recv(self): """Open local HTTP server and recv callback parameters""" @@ -163,7 +166,7 @@ class OAuthBase(Extractor): class OAuthDeviantart(OAuthBase): subcategory = "deviantart" pattern = "oauth:deviantart$" - redirect_uri = "https://mikf.github.io/gallery-dl/oauth-redirect.html" + redirect_uri = REDIRECT_URI_HTTPS def items(self): yield Message.Version, 1 @@ -182,6 +185,7 @@ class OAuthDeviantart(OAuthBase): class OAuthFlickr(OAuthBase): subcategory = "flickr" pattern = "oauth:flickr$" + redirect_uri = REDIRECT_URI_HTTPS def __init__(self, match): OAuthBase.__init__(self, match) diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 9b13391..1e52559 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -9,7 +9,7 @@ """Extractors for https://www.patreon.com/""" from .common import Extractor, Message -from .. import text +from .. import text, exception from ..cache import memcache import collections import json @@ -33,13 +33,15 @@ class PatreonExtractor(Extractor): PatreonExtractor._warning = False for post in self.posts(): - yield Message.Directory, post - ids = set() post["num"] = 0 content = post.get("content") postfile = post.get("post_file") + yield Message.Directory, post + yield Message.Metadata, text.nameext_from_url( + post["creator"].get("image_url", ""), post) + for image in post["images"]: url = image.get("download_url") if not url: @@ -97,8 +99,10 @@ class PatreonExtractor(Extractor): attr["attachments"] = self._files(post, included, "attachments") attr["date"] = text.parse_datetime( attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z") - attr["creator"] = self._user( - post["relationships"]["user"]["links"]["related"]) + user = post["relationships"]["user"] + attr["creator"] = ( + self._user(user["links"]["related"]) or + included["user"][user["data"]["id"]]) return attr @staticmethod @@ -123,7 +127,10 @@ class PatreonExtractor(Extractor): @memcache(keyarg=1) def _user(self, url): """Fetch user information""" - user = self.request(url).json()["data"] + response = self.request(url, fatal=False) + if response.status_code >= 400: + return None + user = response.json()["data"] attr = user["attributes"] attr["id"] = user["id"] attr["date"] = text.parse_datetime( @@ -168,23 +175,28 @@ class PatreonCreatorExtractor(PatreonExtractor): pattern = (r"(?:https?://)?(?:www\.)?patreon\.com" r"/(?!(?:home|join|posts|login|signup)(?:$|[/?&#]))" r"([^/?&#]+)/?") - test = ("https://www.patreon.com/koveliana", { - "range": "1-25", - "count": ">= 25", - "keyword": { - "attachments": list, - "comment_count": int, - "content": str, - "creator": dict, - "date": "type:datetime", - "id": int, - "images": list, - "like_count": int, - "post_type": str, - "published_at": str, - "title": str, - }, - }) + test = ( + ("https://www.patreon.com/koveliana", { + "range": "1-25", + "count": ">= 25", + "keyword": { + "attachments" : list, + "comment_count": int, + "content" : str, + "creator" : dict, + "date" : "type:datetime", + "id" : int, + "images" : list, + "like_count" : int, + "post_type" : str, + "published_at" : str, + "title" : str, + }, + }), + ("https://www.patreon.com/kovelianot", { + "exception": exception.NotFoundError, + }), + ) def __init__(self, match): PatreonExtractor.__init__(self, match) @@ -192,9 +204,12 @@ class PatreonCreatorExtractor(PatreonExtractor): def posts(self): url = "{}/{}".format(self.root, self.creator) - page = self.request(url).text + page = self.request(url, notfound="creator").text campaign_id = text.extract(page, "/campaign/", "/")[0] + if not campaign_id: + raise exception.NotFoundError("creator") + url = self._build_url("posts", ( "&sort=-published_at" "&filter[is_draft]=false" @@ -221,19 +236,26 @@ class PatreonUserExtractor(PatreonExtractor): class PatreonPostExtractor(PatreonExtractor): """Extractor for media from a single post""" subcategory = "post" - pattern = (r"(?:https?://)?(?:www\.)?patreon\.com" - r"/posts/[^/?&#]*?(\d+)") - test = ("https://www.patreon.com/posts/precious-metal-23563293", { - "count": 4, - }) + pattern = r"(?:https?://)?(?:www\.)?patreon\.com/posts/([^/?&#]+)" + test = ( + ("https://www.patreon.com/posts/precious-metal-23563293", { + "count": 4, + }), + ("https://www.patreon.com/posts/er1-28201153", { + "count": 1, + }), + ("https://www.patreon.com/posts/not-found-123", { + "exception": exception.NotFoundError, + }), + ) def __init__(self, match): PatreonExtractor.__init__(self, match) - self.post_id = match.group(1) + self.slug = match.group(1) def posts(self): - url = "{}/posts/{}".format(self.root, self.post_id) - page = self.request(url).text + url = "{}/posts/{}".format(self.root, self.slug) + page = self.request(url, notfound="post").text data = text.extract(page, "window.patreon.bootstrap,", "\n});")[0] post = json.loads(data + "}")["post"] diff --git a/gallery_dl/extractor/photobucket.py b/gallery_dl/extractor/photobucket.py index 8456f97..a6456da 100644 --- a/gallery_dl/extractor/photobucket.py +++ b/gallery_dl/extractor/photobucket.py @@ -117,7 +117,7 @@ class PhotobucketImageExtractor(Extractor): (("https://s271.photobucket.com/user/lakerfanryan" "/media/Untitled-3-1.jpg.html"), { "url": "3b647deeaffc184cc48c89945f67574559c9051f", - "keyword": "a2de4e60d584912537b8025c01bdd1d20bdea735", + "keyword": "69732741b2b351db7ecaa77ace2fdb39f08ca5a3", }), (("https://s271.photobucket.com/user/lakerfanryan" "/media/IsotopeswBros.jpg.html?sort=3&o=2"), { diff --git a/gallery_dl/extractor/piczel.py b/gallery_dl/extractor/piczel.py index 2775dac..5f50245 100644 --- a/gallery_dl/extractor/piczel.py +++ b/gallery_dl/extractor/piczel.py @@ -65,7 +65,7 @@ class PiczelUserExtractor(PiczelExtractor): subcategory = "user" pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/([^/?&#]+)/?$" test = ("https://piczel.tv/gallery/Maximumwarp", { - "count": ">= 50", + "count": ">= 45", }) def __init__(self, match): diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index e36a82b..bcdd082 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -119,8 +119,9 @@ class PinterestRelatedPinExtractor(PinterestPinExtractor): directory_fmt = ("{category}", "related {original_pin[id]}") pattern = BASE_PATTERN + r"/pin/([^/?#&]+).*#related$" test = ("https://www.pinterest.com/pin/858146903966145189/#related", { - "range": "31-50", - "count": 20, + "range": "31-70", + "count": 40, + "archive": False, }) def metadata(self): @@ -138,8 +139,9 @@ class PinterestRelatedBoardExtractor(PinterestBoardExtractor): "{board[name]}", "related") pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/([^/?#&]+).*#related$" test = ("https://www.pinterest.com/g1952849/test-/#related", { - "range": "31-50", - "count": 20, + "range": "31-70", + "count": 40, + "archive": False, }) def pins(self): @@ -241,7 +243,7 @@ class PinterestAPI(): if response.status_code == 404 or response.history: resource = self.extractor.subcategory.rpartition("-")[2] raise exception.NotFoundError(resource) - self.extractor.log.debug("%s", response.text) + self.extractor.log.debug("Server response: %s", response.text) raise exception.StopExtraction("API request failed") def _pagination(self, resource, options): diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index d32f245..7901149 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -318,16 +318,22 @@ class PixivSearchExtractor(PixivExtractor): archive_fmt = "s_{search[word]}_{id}{num}.{extension}" directory_fmt = ("{category}", "search", "{search[word]}") pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" - r"/search\.php\?([^#]+)") + r"/(?:(?:en/)?tags/([^/?&#]+)(?:/[^/?&#]+)?/?" + r"|search\.php)(?:\?([^#]+))?") test = ( + ("https://www.pixiv.net/en/tags/Original", { + "range": "1-10", + "count": 10, + }), + ("https://www.pixiv.net/en/tags/foo/artworks?order=date&s_mode=s_tag"), ("https://www.pixiv.net/search.php?s_mode=s_tag&word=Original"), ("https://touch.pixiv.net/search.php?word=Original"), ) def __init__(self, match): PixivExtractor.__init__(self, match) - self.query = match.group(1) - self.word = self.sort = self.target = None + self.word, self.query = match.groups() + self.sort = self.target = None def works(self): return self.api.search_illust(self.word, self.sort, self.target) @@ -335,9 +341,12 @@ class PixivSearchExtractor(PixivExtractor): def get_metadata(self, user=None): query = text.parse_query(self.query) - if "word" not in query: - raise exception.StopExtraction("Missing search term") - self.word = query["word"] + if self.word: + self.word = text.unquote(self.word) + else: + if "word" not in query: + raise exception.StopExtraction("Missing search term") + self.word = query["word"] sort = query.get("order", "date_d") sort_map = { diff --git a/gallery_dl/extractor/plurk.py b/gallery_dl/extractor/plurk.py index 2bb66ac..6862559 100644 --- a/gallery_dl/extractor/plurk.py +++ b/gallery_dl/extractor/plurk.py @@ -11,6 +11,7 @@ from .common import Extractor, Message from .. import text, extractor, exception import datetime +import time import json import re @@ -47,13 +48,22 @@ class PlurkExtractor(Extractor): """Return an iterable with a 'plurk's comments""" url = "https://www.plurk.com/Responses/get" data = {"plurk_id": plurk["id"], "count": "200"} + headers = { + "Origin": self.root, + "Referer": self.root, + "X-Requested-With": "XMLHttpRequest", + } while True: - info = self.request(url, method="POST", data=data).json() + info = self.request( + url, method="POST", headers=headers, data=data).json() yield from info["responses"] if not info["has_newer"]: return - data["from_response_id"] = info["responses"][-1]["id"] + elif info["has_newer"] < 200: + del data["count"] + time.sleep(1) + data["from_response_id"] = info["responses"][-1]["id"] + 1 @staticmethod def _load(data): @@ -81,9 +91,9 @@ class PlurkTimelineExtractor(PlurkExtractor): user_id, pos = text.extract(page, '"user_id":', ',') plurks = self._load(text.extract(page, "_PLURKS = ", ";\n", pos)[0]) - url = "https://www.plurk.com/TimeLine/getPlurks" - data = {"user_id": user_id.strip()} headers = {"Referer": url, "X-Requested-With": "XMLHttpRequest"} + data = {"user_id": user_id.strip()} + url = "https://www.plurk.com/TimeLine/getPlurks" while plurks: yield from plurks diff --git a/gallery_dl/extractor/realbooru.py b/gallery_dl/extractor/realbooru.py new file mode 100644 index 0000000..6d89151 --- /dev/null +++ b/gallery_dl/extractor/realbooru.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://realbooru.com/""" + +from . import booru + + +class RealbooruExtractor(booru.XmlParserMixin, + booru.GelbooruPageMixin, + booru.BooruExtractor): + """Base class for realbooru extractors""" + category = "realbooru" + api_url = "https://realbooru.com/index.php" + post_url = "https://realbooru.com/index.php?page=post&s=view&id={}" + pool_url = "https://realbooru.com/index.php?page=pool&s=show&id={}" + + def __init__(self, match): + super().__init__(match) + self.params.update({"page": "dapi", "s": "post", "q": "index"}) + + +class RealbooruTagExtractor(booru.TagMixin, RealbooruExtractor): + """Extractor for images from realbooru.com based on search-tags""" + pattern = (r"(?:https?://)?(?:www\.)?realbooru\.com/(?:index\.php)?" + r"\?page=post&s=list&tags=(?P<tags>[^&#]+)") + test = ("https://realbooru.com/index.php?page=post&s=list&tags=wine", { + "count": 64, + }) + + +class RealbooruPoolExtractor(booru.GelbooruPoolMixin, RealbooruExtractor): + """Extractor for image-pools from realbooru.com""" + pattern = (r"(?:https?://)?(?:www\.)?realbooru\.com/(?:index\.php)?" + r"\?page=pool&s=show&id=(?P<pool>\d+)") + test = ("https://realbooru.com/index.php?page=pool&s=show&id=1", { + "count": 3, + }) + + +class RealbooruPostExtractor(booru.PostMixin, RealbooruExtractor): + """Extractor for single images from realbooru.com""" + pattern = (r"(?:https?://)?(?:www\.)?realbooru\.com/(?:index\.php)?" + r"\?page=post&s=view&id=(?P<post>\d+)") + test = ("https://realbooru.com/index.php?page=post&s=view&id=668483", { + "url": "2421b5b0e15d5e20f9067090a8b0fd4114d3e7d9", + "content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da", + "options": (("tags", True),), + "keyword": { + "tags_general" : str, + "tags_metadata": "tagme", + "tags_model" : "jennifer_lawrence", + }, + }) diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index ecce003..656148e 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -60,14 +60,16 @@ class RedditExtractor(Extractor): def _urls(self, submissions): for submission, comments in submissions: - self._visited.add(submission["id"]) - if not submission["is_self"]: - yield submission["url"], submission + if submission: + self._visited.add(submission["id"]) - for url in text.extract_iter( - submission["selftext_html"] or "", ' href="', '"'): - yield url, submission + if not submission["is_self"]: + yield submission["url"], submission + + for url in text.extract_iter( + submission["selftext_html"] or "", ' href="', '"'): + yield url, submission if comments: for comment in comments: @@ -130,15 +132,14 @@ class RedditSubmissionExtractor(RedditExtractor): r")/([a-z0-9]+)") test = ( ("https://www.reddit.com/r/lavaporn/comments/8cqhub/", { - "pattern": r"https://", - "count": 3, - }), - # ignore submission comments (#429) - ("https://www.reddit.com/r/lavaporn/comments/8cqhub/", { - "options": (("comments", 0),), "pattern": r"https://c2.staticflickr.com/8/7272/\w+_k.jpg", "count": 1, }), + ("https://www.reddit.com/r/lavaporn/comments/8cqhub/", { + "options": (("comments", 500),), + "pattern": r"https://", + "count": 3, + }), ("https://old.reddit.com/r/lavaporn/comments/2a00np/"), ("https://np.reddit.com/r/lavaporn/comments/2a00np/"), ("https://m.reddit.com/r/lavaporn/comments/2a00np/"), @@ -186,7 +187,7 @@ class RedditAPI(): def __init__(self, extractor): self.extractor = extractor - self.comments = text.parse_int(extractor.config("comments", 500)) + self.comments = text.parse_int(extractor.config("comments", 0)) self.morecomments = extractor.config("morecomments", False) self.refresh_token = extractor.config("refresh-token") self.log = extractor.log @@ -298,17 +299,24 @@ class RedditAPI(): while True: data = self._call(endpoint, params)["data"] - for submission in data["children"]: - submission = submission["data"] - if (date_min <= submission["created_utc"] <= date_max and - id_min <= self._decode(submission["id"]) <= id_max): - if submission["num_comments"] and self.comments: - try: - yield self.submission(submission["id"]) - except exception.AuthorizationError: - pass - else: - yield submission, None + for child in data["children"]: + kind = child["kind"] + post = child["data"] + + if (date_min <= post["created_utc"] <= date_max and + id_min <= self._decode(post["id"]) <= id_max): + + if kind == "t3": + if post["num_comments"] and self.comments: + try: + yield self.submission(post["id"]) + except exception.AuthorizationError: + pass + else: + yield post, None + + elif kind == "t1" and self.comments: + yield None, (post,) if not data["after"]: return diff --git a/gallery_dl/extractor/senmanga.py b/gallery_dl/extractor/senmanga.py index 736173f..bde0b5d 100644 --- a/gallery_dl/extractor/senmanga.py +++ b/gallery_dl/extractor/senmanga.py @@ -24,7 +24,7 @@ class SenmangaChapterExtractor(Extractor): ("http://raw.senmanga.com/Bokura-wa-Minna-Kawaisou/37A/1", { "url": "5f95140ff511d8497e2ec08fa7267c6bb231faec", "keyword": "705d941a150765edb33cd2707074bd703a93788c", - "content": "0e37b1995708ffc175f2e175d91a518e6948c379", + "content": "556a16d5ca3441d7a5807b6b5ac06ec458a3e4ba", }), ("http://raw.senmanga.com/Love-Lab/2016-03/1", { "url": "8347b9f00c14b864dd3c19a1f5ae52adb2ef00de", diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py index c4597af..2c9746e 100644 --- a/gallery_dl/extractor/sexcom.py +++ b/gallery_dl/extractor/sexcom.py @@ -21,7 +21,6 @@ class SexcomExtractor(Extractor): root = "https://www.sex.com" def items(self): - self.session.headers["Referer"] = self.root yield Message.Version, 1 yield Message.Directory, self.metadata() for pin in map(self._parse_pin, self.pins()): @@ -59,6 +58,7 @@ class SexcomExtractor(Extractor): extr = text.extract_from(response.text) data = {} + data["_http_headers"] = {"Referer": url} data["thumbnail"] = extr('itemprop="thumbnail" content="', '"') data["type"] = extr('<h1>' , '<').rstrip(" -").strip().lower() data["title"] = text.unescape(extr('itemprop="name">' , '<')) @@ -123,10 +123,12 @@ class SexcomPinExtractor(SexcomExtractor): # gif ("https://www.sex.com/pin/11465040-big-titted-hentai-gif/", { "url": "98a82c5ae7a65c8228e1405ac740f80d4d556de1", + "content": "a54b37eb39d565094c54ad7d21244fe8f978fb14", }), # video - ("https://www.sex.com/pin/55748381/", { - "pattern": "https://www.sex.com/video/stream/776238/hd", + ("https://www.sex.com/pin/55748341/", { + "pattern": "https://www.sex.com/video/stream/776229/hd", + "content": "e1a5834869163e2c4d1ca2677f5b7b367cf8cfff", }), # pornhub embed ("https://www.sex.com/pin/55847384-very-nicely-animated/", { diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py index be29dcf..0c13825 100644 --- a/gallery_dl/extractor/smugmug.py +++ b/gallery_dl/extractor/smugmug.py @@ -154,13 +154,14 @@ class SmugmugPathExtractor(SmugmugExtractor): "pattern": "smugmug:album:6VRT8G$", }), # custom domain - ("smugmug:www.creativedogportraits.com/PortfolioGallery/", { - "pattern": "smugmug:album:txWXzs$", + ("smugmug:www.sitkapics.com/TREES-and-TRAILS/", { + "pattern": "smugmug:album:ct8Nds$", }), - ("smugmug:www.creativedogportraits.com/", { - "pattern": "smugmug:album:txWXzs$", + ("smugmug:www.sitkapics.com/", { + "pattern": r"smugmug:album:\w{6}$", + "count": ">= 14", }), - ("smugmug:https://www.creativedogportraits.com/"), + ("smugmug:https://www.sitkapics.com/"), ) def __init__(self, match): diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 998eed4..1d37419 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -156,8 +156,8 @@ class TumblrExtractor(Extractor): invalid = types - POST_TYPES if invalid: types = types & POST_TYPES - self.log.warning('invalid post types: "%s"', - '", "'.join(sorted(invalid))) + self.log.warning("Invalid post types: '%s'", + "', '".join(sorted(invalid))) return types @staticmethod diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index dfafc1f..8ef966f 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -11,13 +11,14 @@ from .common import Extractor, Message from .. import text, exception from ..cache import cache, memcache +import json import re class TwitterExtractor(Extractor): """Base class for twitter extractors""" category = "twitter" - directory_fmt = ("{category}", "{user}") + directory_fmt = ("{category}", "{user[name]}") filename_fmt = "{tweet_id}_{num}.{extension}" archive_fmt = "{tweet_id}_{retweet_id}_{num}" root = "https://twitter.com" @@ -26,6 +27,7 @@ class TwitterExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) self.user = match.group(1) + self._user_dict = None self.logged_in = False self.retweets = self.config("retweets", True) self.content = self.config("content", False) @@ -37,23 +39,18 @@ class TwitterExtractor(Extractor): def items(self): self.login() + metadata = self.metadata() yield Message.Version, 1 - yield Message.Directory, self.metadata() for tweet in self.tweets(): data = self._data_from_tweet(tweet) - - if not self.retweets and data["retweet_id"]: + if not data or not self.retweets and data["retweet_id"]: continue - - images = text.extract_iter( - tweet, 'data-image-url="', '"') - for data["num"], url in enumerate(images, 1): - text.nameext_from_url(url, data) - urls = [url + size for size in self.sizes] - yield Message.Urllist, urls, data + data.update(metadata) if self.videos and "-videoContainer" in tweet: + yield Message.Directory, data + if self.videos == "ytdl": data["extension"] = None url = "ytdl:{}/{}/status/{}".format( @@ -70,9 +67,19 @@ class TwitterExtractor(Extractor): data["num"] = 1 yield Message.Url, url, data + elif "data-image-url=" in tweet: + yield Message.Directory, data + + images = text.extract_iter( + tweet, 'data-image-url="', '"') + for data["num"], url in enumerate(images, 1): + text.nameext_from_url(url, data) + urls = [url + size for size in self.sizes] + yield Message.Urllist, urls, data + def metadata(self): """Return general metadata""" - return {"user": self.user} + return {} def tweets(self): """Yield HTML content of all relevant tweets""" @@ -113,11 +120,33 @@ class TwitterExtractor(Extractor): "tweet_id" : text.parse_int(extr('data-tweet-id="' , '"')), "retweet_id": text.parse_int(extr('data-retweet-id="', '"')), "retweeter" : extr('data-retweeter="' , '"'), - "user" : extr('data-screen-name="', '"'), - "username" : extr('data-name="' , '"'), - "user_id" : text.parse_int(extr('data-user-id="' , '"')), - "date" : text.parse_timestamp(extr('data-time="', '"')), + "author" : { + "name" : extr('data-screen-name="', '"'), + "nick" : text.unescape(extr('data-name="' , '"')), + "id" : text.parse_int(extr('data-user-id="' , '"')), + }, } + + if not self._user_dict: + if data["retweet_id"]: + for user in json.loads(text.unescape(extr( + 'data-reply-to-users-json="', '"'))): + if user["screen_name"] == data["retweeter"]: + break + else: + self.log.warning("Unable to extract user info") + return None + self._user_dict = { + "name": user["screen_name"], + "nick": text.unescape(user["name"]), + "id" : text.parse_int(user["id_str"]), + } + else: + self._user_dict = data["author"] + + data["user"] = self._user_dict + data["date"] = text.parse_timestamp(extr('data-time="', '"')) + if self.content: content = extr('<div class="js-tweet-text-container">', '\n</div>') if '<img class="Emoji ' in content: @@ -125,6 +154,7 @@ class TwitterExtractor(Extractor): content = text.unescape(text.remove_html(content, "", "")) cl, _, cr = content.rpartition("pic.twitter.com/") data["content"] = cl if cl and len(cr) < 16 else content + return data def _video_from_tweet(self, tweet_id): @@ -185,7 +215,7 @@ class TwitterExtractor(Extractor): if "min_position" in data: position = data["min_position"] - if position == max_position: + if position == max_position or position is None: return else: position = text.parse_int(text.extract( @@ -204,7 +234,7 @@ class TwitterTimelineExtractor(TwitterExtractor): ("https://twitter.com/supernaturepics", { "range": "1-40", "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4", - "keyword": "7210d679606240405e0cf62cbc67596e81a7a250", + "keyword": "37f4d35affd733d458d3b235b4a55f619a86f794", }), ("https://mobile.twitter.com/supernaturepics?p=i"), ) @@ -262,13 +292,13 @@ class TwitterTweetExtractor(TwitterExtractor): test = ( ("https://twitter.com/supernaturepics/status/604341487988576256", { "url": "0e801d2f98142dd87c3630ded9e4be4a4d63b580", - "keyword": "1b8afb93cc04a9f44d89173f8facc61c3a6caf91", + "keyword": "3fa3623e8d9a204597238e2f1f6433da19c63b4a", "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab", }), # 4 images ("https://twitter.com/perrypumas/status/894001459754180609", { "url": "c8a262a9698cb733fb27870f5a8f75faf77d79f6", - "keyword": "43d98ab448193f0d4f30aa571a4b6bda9b6a5692", + "keyword": "49165725116ac52193a3861e8f5534e47a706b62", }), # video ("https://twitter.com/perrypumas/status/1065692031626829824", { @@ -278,7 +308,7 @@ class TwitterTweetExtractor(TwitterExtractor): # content with emoji, newlines, hashtags (#338) ("https://twitter.com/yumi_san0112/status/1151144618936823808", { "options": (("content", True),), - "keyword": "b133464b73aec33871521ab021a3166204194285", + "keyword": "0b7a3d05607b480c1412dfd85f8606478313e7bf", }), # Reply to another tweet (#403) ("https://twitter.com/tyson_hesse/status/1103767554424598528", { @@ -295,9 +325,6 @@ class TwitterTweetExtractor(TwitterExtractor): TwitterExtractor.__init__(self, match) self.tweet_id = match.group(2) - def metadata(self): - return {"user": self.user, "tweet_id": self.tweet_id} - def tweets(self): url = "{}/i/web/status/{}".format(self.root, self.tweet_id) cookies = {"app_shell_visited": "1"} diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py index 6cc5911..a24d3fe 100644 --- a/gallery_dl/extractor/vsco.py +++ b/gallery_dl/extractor/vsco.py @@ -29,10 +29,25 @@ class VscoExtractor(Extractor): self.user = match.group(1).lower() def items(self): + videos = self.config("videos", True) yield Message.Version, 1 yield Message.Directory, {"user": self.user} for img in self.images(): - url = "https://" + (img.get("video_url") or img["responsive_url"]) + + if img["is_video"]: + if not videos: + continue + url = "https://" + img["video_url"] + else: + base = img["responsive_url"].partition("/")[2] + cdn, _, path = base.partition("/") + if cdn.startswith("aws"): + url = "https://image-{}.vsco.co/{}".format(cdn, path) + elif cdn.isdecimal(): + url = "https://image.vsco.co/" + base + else: + url = "https://" + img["responsive_url"] + data = text.nameext_from_url(url, { "id" : img["_id"], "user" : self.user, @@ -66,10 +81,20 @@ class VscoExtractor(Extractor): while True: data = self.request(url, params=params, headers=headers).json() - if not data.get(key): + medias = data.get(key) + if not medias: return - yield from data[key] - params["page"] += 1 + + if "cursor" in params: + for media in medias: + yield media[media["type"]] + cursor = data.get("next_cursor") + if not cursor: + return + params["cursor"] = cursor + else: + yield from medias + params["page"] += 1 @staticmethod def _transform_media(media): @@ -89,9 +114,9 @@ class VscoUserExtractor(VscoExtractor): pattern = BASE_PATTERN + r"(?:/images(?:/\d+)?)?/?(?:$|[?#])" test = ( ("https://vsco.co/missuri/images/1", { + "pattern": r"https://image(-aws.+)?\.vsco\.co/[0-9a-f/]+/vsco\w+", "range": "1-80", "count": 80, - "pattern": r"https://im\.vsco\.co/[^/]+/[0-9a-f/]+/vsco\w+\.\w+", }), ("https://vsco.co/missuri"), ) @@ -102,12 +127,19 @@ class VscoUserExtractor(VscoExtractor): tkn = data["users"]["currentUser"]["tkn"] sid = str(data["sites"]["siteByUsername"][self.user]["site"]["id"]) + site = data["medias"]["bySiteId"][sid] + + url = "{}/api/3.0/medias/profile".format(self.root) + params = { + "site_id" : sid, + "limit" : "14", + "show_only": "0", + "cursor" : site["nextCursor"], + } - url = "{}/api/2.0/medias".format(self.root) - params = {"page": 2, "size": "30", "site_id": sid} return self._pagination(url, params, tkn, "media", ( - data["medias"]["byId"][mid]["media"] - for mid in data["medias"]["bySiteId"][sid]["medias"]["1"] + data["medias"]["byId"][media[media["type"]]]["media"] + for media in site["medias"] )) @@ -118,9 +150,9 @@ class VscoCollectionExtractor(VscoExtractor): archive_fmt = "c_{user}_{id}" pattern = BASE_PATTERN + r"/collection/" test = ("https://vsco.co/vsco/collection/1", { + "pattern": r"https://image(-aws.+)?\.vsco\.co/[0-9a-f/]+/vsco\w+\.\w+", "range": "1-80", "count": 80, - "pattern": r"https://im\.vsco\.co/[^/]+/[0-9a-f/]+/vsco\w+\.\w+", }) def images(self): @@ -136,7 +168,7 @@ class VscoCollectionExtractor(VscoExtractor): return self._pagination(url, params, tkn, "medias", ( data["medias"]["byId"][mid]["media"] for mid in data - ["collections"]["byCollectionId"][cid]["collection"]["1"] + ["collections"]["byCollectionId"][cid]["byPage"]["1"]["collection"] )) @@ -146,7 +178,7 @@ class VscoImageExtractor(VscoExtractor): pattern = BASE_PATTERN + r"/media/([0-9a-fA-F]+)" test = ( ("https://vsco.co/erenyildiz/media/5d34b93ef632433030707ce2", { - "url": "faa214d10f859f374ad91da3f7547d2439f5af08", + "url": "a45f9712325b42742324b330c348b72477996031", "content": "1394d070828d82078035f19a92f404557b56b83f", "keyword": { "id" : "5d34b93ef632433030707ce2", diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py index 09a166c..737c253 100644 --- a/gallery_dl/extractor/wallhaven.py +++ b/gallery_dl/extractor/wallhaven.py @@ -33,8 +33,8 @@ class WallhavenSearchExtractor(WallhavenExtractor): ("https://wallhaven.cc/search?q=touhou"), (("https://wallhaven.cc/search?q=id%3A87" "&categories=111&purity=100&sorting=date_added&order=asc&page=3"), { - "count": 5, - "url": "d477b68a534c3416d506ae1f159b25debab64678", + "pattern": r"https://w.wallhaven.cc/full/\w\w/wallhaven-\w+\.\w+", + "count": "<= 10", }), ) diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index 0f4ebd2..49fa082 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -9,7 +9,7 @@ """Extractors for https://www.weibo.com/""" from .common import Extractor, Message -from .. import text +from .. import text, exception import json @@ -124,7 +124,7 @@ class WeiboStatusExtractor(WeiboExtractor): }), # unavailable video (#427) ("https://m.weibo.cn/status/4268682979207023", { - "count": 0, + "exception": exception.NotFoundError, }), ("https://m.weibo.cn/status/4339748116375525"), ("https://m.weibo.cn/5746766133/4339748116375525"), @@ -136,7 +136,8 @@ class WeiboStatusExtractor(WeiboExtractor): def statuses(self): url = "{}/detail/{}".format(self.root, self.status_id) - page = self.request(url).text - data = json.loads(text.extract( - page, " var $render_data = [", "][0] || {};")[0]) - return (data["status"],) + page = self.request(url, notfound="status").text + data = text.extract(page, "var $render_data = [", "][0] || {};")[0] + if not data: + raise exception.NotFoundError("status") + return (json.loads(data)["status"],) diff --git a/gallery_dl/extractor/wikiart.py b/gallery_dl/extractor/wikiart.py index 463733f..ac289df 100644 --- a/gallery_dl/extractor/wikiart.py +++ b/gallery_dl/extractor/wikiart.py @@ -93,7 +93,7 @@ class WikiartArtworksExtractor(WikiartExtractor): directory_fmt = ("{category}", "Artworks by {group!c}", "{type}") pattern = BASE_PATTERN + r"/paintings-by-([\w-]+)/([\w-]+)" test = ("https://www.wikiart.org/en/paintings-by-media/grisaille", { - "url": "f92d55669fa949491c26a5437527adb14b35b8cc", + "url": "228426a9d32b5bba9d659944c6b0ba73883af33f", }) def __init__(self, match): diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py index e253b7f..80a3614 100644 --- a/gallery_dl/extractor/xvideos.py +++ b/gallery_dl/extractor/xvideos.py @@ -6,86 +6,91 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://www.xvideos.com/""" +"""Extractors for https://www.xvideos.com/""" -from .common import Extractor, Message -from .. import text, exception +from .common import GalleryExtractor, Extractor, Message +from .. import text import json -class XvideosExtractor(Extractor): +class XvideosBase(): """Base class for xvideos extractors""" category = "xvideos" root = "https://www.xvideos.com" -class XvideosGalleryExtractor(XvideosExtractor): - """Extractor for user profile galleries from xvideos.com""" +class XvideosGalleryExtractor(XvideosBase, GalleryExtractor): + """Extractor for user profile galleries on xvideos.com""" subcategory = "gallery" - directory_fmt = ("{category}", "{user[name]}", "{title}") - filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}" - archive_fmt = "{gallery_id}_{num}" + directory_fmt = ("{category}", "{user[name]}", + "{gallery[id]} {gallery[title]}") + filename_fmt = "{category}_{gallery[id]}_{num:>03}.{extension}" + archive_fmt = "{gallery[id]}_{num}" pattern = (r"(?:https?://)?(?:www\.)?xvideos\.com" - r"/profiles/([^/?&#]+)/photos/(\d+)") + r"/(?:profiles|amateur-channels|model-channels)" + r"/([^/?&#]+)/photos/(\d+)") test = ( - (("https://www.xvideos.com/profiles" - "/pervertedcouple/photos/751031/random_stuff"), { + ("https://www.xvideos.com/profiles/pervertedcouple/photos/751031", { "url": "4f0d992e5dc39def2c3ac8e099d17bf09e76e3c7", - "keyword": "65979d63a69576cf692b41d5fbbd995cc40a51b9", - }), - ("https://www.xvideos.com/profiles/pervertedcouple/photos/751032/", { - "exception": exception.NotFoundError, + "keyword": { + "gallery": { + "id" : 751031, + "title": "Random Stuff", + "tags" : list, + }, + "user": { + "id" : 20245371, + "name" : "pervertedcouple", + "display" : "Pervertedcouple", + "sex" : "Woman", + "description": str, + }, + }, }), + ("https://www.xvideos.com/amateur-channels/pervertedcouple/photos/12"), + ("https://www.xvideos.com/model-channels/pervertedcouple/photos/12"), ) def __init__(self, match): - XvideosExtractor.__init__(self, match) - self.user, self.gid = match.groups() - - def items(self): - url = "{}/profiles/{}/photos/{}".format(self.root, self.user, self.gid) - page = self.request(url, notfound=self.subcategory).text - data = self.get_metadata(page) - imgs = self.get_images(page) - data["count"] = len(imgs) - yield Message.Version, 1 - yield Message.Directory, data - for url in imgs: - data["num"] = text.parse_int(url.rsplit("_", 2)[1]) - data["extension"] = url.rpartition(".")[2] - yield Message.Url, url, data - - def get_metadata(self, page): - """Collect metadata for extractor-job""" - data = text.extract_all(page, ( - ("userid" , '"id_user":', ','), - ("display", '"display":"', '"'), - ("title" , '"title":"', '"'), - ("descr" , '<small class="mobile-hide">', '</small>'), - ("tags" , '<em>Tagged:</em>', '<'), - ))[0] + self.user, self.gallery_id = match.groups() + url = "{}/profiles/{}/photos/{}".format( + self.root, self.user, self.gallery_id) + GalleryExtractor.__init__(self, match, url) + + def metadata(self, page): + extr = text.extract_from(page) + user = { + "id" : text.parse_int(extr('"id_user":', ',')), + "display": extr('"display":"', '"'), + "sex" : extr('"sex":"', '"'), + "name" : self.user, + } + title = extr('"title":"', '"') + user["description"] = extr( + '<small class="mobile-hide">', '</small>').strip() + tags = extr('<em>Tagged:</em>', '<').strip() return { - "user": { - "id": text.parse_int(data["userid"]), - "name": self.user, - "display": data["display"], - "description": data["descr"].strip(), + "user": user, + "gallery": { + "id" : text.parse_int(self.gallery_id), + "title": text.unescape(title), + "tags" : text.unescape(tags).split(", ") if tags else [], }, - "tags": text.unescape(data["tags"] or "").strip().split(", "), - "title": text.unescape(data["title"]), - "gallery_id": text.parse_int(self.gid), } @staticmethod - def get_images(page): + def images(page): """Return a list of all image urls for this gallery""" - return list(text.extract_iter( - page, '<a class="embed-responsive-item" href="', '"')) + return [ + (url, None) + for url in text.extract_iter( + page, '<a class="embed-responsive-item" href="', '"') + ] -class XvideosUserExtractor(XvideosExtractor): - """Extractor for user profiles from xvideos.com""" +class XvideosUserExtractor(XvideosBase, Extractor): + """Extractor for user profiles on xvideos.com""" subcategory = "user" categorytransfer = True pattern = (r"(?:https?://)?(?:www\.)?xvideos\.com" @@ -93,16 +98,13 @@ class XvideosUserExtractor(XvideosExtractor): test = ( ("https://www.xvideos.com/profiles/pervertedcouple", { "url": "a413f3e60d6d3a2de79bd44fa3b7a9c03db4336e", - "keyword": "a796760d34732adc7ec52a8feb057515209a2ca6", - }), - ("https://www.xvideos.com/profiles/niwehrwhernvh", { - "exception": exception.NotFoundError, + "keyword": "335a3304941ff2e666c0201e9122819b61b34adb", }), ("https://www.xvideos.com/profiles/pervertedcouple#_tabPhotos"), ) def __init__(self, match): - XvideosExtractor.__init__(self, match) + Extractor.__init__(self, match) self.user = match.group(1) def items(self): @@ -118,17 +120,17 @@ class XvideosUserExtractor(XvideosExtractor): galleries = [ { - "gallery_id": text.parse_int(gid), + "id" : text.parse_int(gid), "title": text.unescape(gdata["title"]), "count": gdata["nb_pics"], "_extractor": XvideosGalleryExtractor, } for gid, gdata in data["galleries"].items() ] - galleries.sort(key=lambda x: x["gallery_id"]) + galleries.sort(key=lambda x: x["id"]) yield Message.Version, 1 for gallery in galleries: url = "https://www.xvideos.com/profiles/{}/photos/{}".format( - self.user, gallery["gallery_id"]) + self.user, gallery["id"]) yield Message.Queue, url, gallery diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 9c76336..88b6a55 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -8,6 +8,7 @@ import sys import time +import errno import logging from . import extractor, downloader, postprocessor from . import config, text, util, output, exception @@ -97,6 +98,10 @@ class Job(): self.update_kwdict(kwds) self.handle_urllist(urls, kwds) + elif msg[0] == Message.Metadata: + self.update_kwdict(msg[1]) + self.handle_metadata(msg[1]) + elif msg[0] == Message.Version: if msg[1] != 1: raise "unsupported message-version ({}, {})".format( @@ -114,6 +119,9 @@ class Job(): def handle_directory(self, kwdict): """Handle Message.Directory""" + def handle_metadata(self, kwdict): + """Handle Message.Metadata""" + def handle_queue(self, url, kwdict): """Handle Message.Queue""" @@ -161,11 +169,6 @@ class Job(): if self.ulog: self.ulog.info(url) - @staticmethod - def _filter(kwdict): - """Return a copy of 'kwdict' without "private" entries""" - return {k: v for k, v in kwdict.items() if k[0] != "_"} - class DownloadJob(Job): """Download images into appropriate directory/filename locations""" @@ -247,6 +250,16 @@ class DownloadJob(Job): else: self.pathfmt.set_directory(kwdict) + def handle_metadata(self, kwdict): + """Run postprocessors with metadata from 'kwdict'""" + postprocessors = self.postprocessors + + if postprocessors: + pathfmt = self.pathfmt + pathfmt.set_filename(kwdict) + for pp in postprocessors: + pp.run_metadata(pathfmt) + def handle_queue(self, url, kwdict): if "_extractor" in kwdict: extr = kwdict["_extractor"].from_url(url) @@ -280,7 +293,13 @@ class DownloadJob(Job): scheme = url.partition(":")[0] downloader = self.get_downloader(scheme) if downloader: - return downloader.download(url, self.pathfmt) + try: + return downloader.download(url, self.pathfmt) + except OSError as exc: + if exc.errno == errno.ENOSPC: + raise + self.log.warning("%s: %s", exc.__class__.__name__, exc) + return False self._write_unsupported(url) return False @@ -291,14 +310,14 @@ class DownloadJob(Job): except KeyError: pass - klass = downloader.find(scheme) - if klass and config.get(("downloader", klass.scheme, "enabled"), True): - instance = klass(self.extractor, self.out) + cls = downloader.find(scheme) + if cls and config.get(("downloader", cls.scheme), "enabled", True): + instance = cls(self.extractor, self.out) else: instance = None self.log.error("'%s:' URLs are not supported/enabled", scheme) - if klass and klass.scheme == "http": + if cls and cls.scheme == "http": self.downloaders["http"] = self.downloaders["https"] = instance else: self.downloaders[scheme] = instance @@ -477,7 +496,10 @@ class DataJob(Job): Job.__init__(self, url, parent) self.file = file self.data = [] - self.ascii = config.get(("output", "ascii"), ensure_ascii) + self.ascii = config.get(("output",), "ascii", ensure_ascii) + + private = config.get(("output",), "private") + self.filter = (lambda x: x) if private else util.filter_dict def run(self): # collect data @@ -492,7 +514,7 @@ class DataJob(Job): pass # convert numbers to string - if config.get(("output", "num-to-str"), False): + if config.get(("output",), "num-to-str", False): for msg in self.data: util.transform_dict(msg[-1], util.number_to_string) @@ -501,16 +523,19 @@ class DataJob(Job): return 0 def handle_url(self, url, kwdict): - self.data.append((Message.Url, url, self._filter(kwdict))) + self.data.append((Message.Url, url, self.filter(kwdict))) def handle_urllist(self, urls, kwdict): - self.data.append((Message.Urllist, list(urls), self._filter(kwdict))) + self.data.append((Message.Urllist, list(urls), self.filter(kwdict))) def handle_directory(self, kwdict): - self.data.append((Message.Directory, self._filter(kwdict))) + self.data.append((Message.Directory, self.filter(kwdict))) + + def handle_metadata(self, kwdict): + self.data.append((Message.Metadata, self.filter(kwdict))) def handle_queue(self, url, kwdict): - self.data.append((Message.Queue, url, self._filter(kwdict))) + self.data.append((Message.Queue, url, self.filter(kwdict))) def handle_finalize(self): self.file.close() diff --git a/gallery_dl/oauth.py b/gallery_dl/oauth.py index 3093a72..9ceefbf 100644 --- a/gallery_dl/oauth.py +++ b/gallery_dl/oauth.py @@ -115,14 +115,15 @@ class OAuth1API(): api_secret = extractor.config("api-secret", self.API_SECRET) token = extractor.config("access-token") token_secret = extractor.config("access-token-secret") + key_type = "default" if api_key == self.API_KEY else "custom" if api_key and api_secret and token and token_secret: - self.log.debug("Using OAuth1.0 authentication") + self.log.debug("Using %s OAuth1.0 authentication", key_type) self.session = OAuth1Session( api_key, api_secret, token, token_secret) self.api_key = None else: - self.log.debug("Using api_key authentication") + self.log.debug("Using %s api_key authentication", key_type) self.session = extractor.session self.api_key = api_key diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 3118b83..34222a2 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -18,13 +18,13 @@ from . import job, version class ConfigAction(argparse.Action): """Set argparse results as config values""" def __call__(self, parser, namespace, values, option_string=None): - namespace.options.append(((self.dest,), values)) + namespace.options.append(((), self.dest, values)) class ConfigConstAction(argparse.Action): """Set argparse const values as config values""" def __call__(self, parser, namespace, values, option_string=None): - namespace.options.append(((self.dest,), self.const)) + namespace.options.append(((), self.dest, self.const)) class AppendCommandAction(argparse.Action): @@ -41,7 +41,7 @@ class DeprecatedConfigConstAction(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): print("warning: {} is deprecated. Use {} instead.".format( "/".join(self.option_strings), self.choices), file=sys.stderr) - namespace.options.append(((self.dest,), self.const)) + namespace.options.append(((), self.dest, self.const)) class ParseAction(argparse.Action): @@ -52,8 +52,8 @@ class ParseAction(argparse.Action): value = json.loads(value) except ValueError: pass - key = key.split(".") - namespace.options.append((key, value)) + key = key.split(".") # splitting an empty string becomes [""] + namespace.options.append((key[:-1], key[-1], value)) class Formatter(argparse.HelpFormatter): @@ -224,12 +224,6 @@ def build_parser(): dest="verify", nargs=0, action=ConfigConstAction, const=False, help="Disable HTTPS certificate validation", ) - downloader.add_argument( - "--abort-on-skip", - action=DeprecatedConfigConstAction, - dest="skip", nargs=0, const="abort", choices="-A/--abort", - help=argparse.SUPPRESS, - ) configuration = parser.add_argument_group("Configuration Options") configuration.add_argument( @@ -313,13 +307,6 @@ def build_parser(): help="Store downloaded files in a ZIP archive", ) postprocessor.add_argument( - "--exec", - dest="postprocessors", metavar="CMD", - action=AppendCommandAction, const={"name": "exec"}, - help=("Execute CMD for each downloaded file. " - "Example: --exec 'magick convert {} {}.png && rm {}'"), - ) - postprocessor.add_argument( "--ugoira-conv", dest="postprocessors", action="append_const", const={ "name" : "ugoira", @@ -358,6 +345,20 @@ def build_parser(): action="append_const", const={"name": "mtime"}, help="Set file modification times according to 'date' metadata", ) + postprocessor.add_argument( + "--exec", + dest="postprocessors", metavar="CMD", + action=AppendCommandAction, const={"name": "exec"}, + help=("Execute CMD for each downloaded file. " + "Example: --exec 'convert {} {}.png && rm {}'"), + ) + postprocessor.add_argument( + "--exec-after", + dest="postprocessors", metavar="CMD", + action=AppendCommandAction, const={"name": "exec", "final": True}, + help=("Execute CMD after all files were downloaded successfully. " + "Example: --exec-after 'cd {} && convert * ../doc.pdf'"), + ) parser.add_argument( "urls", diff --git a/gallery_dl/output.py b/gallery_dl/output.py index 87c5006..38e2f60 100644 --- a/gallery_dl/output.py +++ b/gallery_dl/output.py @@ -83,7 +83,7 @@ def initialize_logging(loglevel): def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL): """Setup a new logging handler""" - opts = config.interpolate(("output", key)) + opts = config.interpolate(("output",), key) if not opts: return None if not isinstance(opts, dict): @@ -114,7 +114,7 @@ def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL): def configure_logging_handler(key, handler): """Configure a logging handler""" - opts = config.interpolate(("output", key)) + opts = config.interpolate(("output",), key) if not opts: return if isinstance(opts, str): @@ -156,7 +156,7 @@ def select(): "color": ColorOutput, "null": NullOutput, } - omode = config.get(("output", "mode"), "auto").lower() + omode = config.get(("output",), "mode", "auto").lower() if omode in pdict: return pdict[omode]() elif omode == "auto": @@ -192,7 +192,7 @@ class PipeOutput(NullOutput): class TerminalOutput(NullOutput): def __init__(self): - self.short = config.get(("output", "shorten"), True) + self.short = config.get(("output",), "shorten", True) if self.short: self.width = shutil.get_terminal_size().columns - OFFSET diff --git a/gallery_dl/postprocessor/common.py b/gallery_dl/postprocessor/common.py index 83b42eb..70b0dfb 100644 --- a/gallery_dl/postprocessor/common.py +++ b/gallery_dl/postprocessor/common.py @@ -27,6 +27,10 @@ class PostProcessor(): """Execute the postprocessor for a file""" @staticmethod + def run_metadata(pathfmt): + """Execute the postprocessor for a file""" + + @staticmethod def run_after(pathfmt): """Execute postprocessor after moving a file to its target location""" diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py index 467ef11..bc26484 100644 --- a/gallery_dl/postprocessor/metadata.py +++ b/gallery_dl/postprocessor/metadata.py @@ -18,29 +18,49 @@ class MetadataPP(PostProcessor): PostProcessor.__init__(self) mode = options.get("mode", "json") - ext = "txt" - if mode == "custom": self.write = self._write_custom - self.formatter = util.Formatter(options.get("format")) + cfmt = options.get("content-format") or options.get("format") + self.contentfmt = util.Formatter(cfmt).format_map + ext = "txt" elif mode == "tags": self.write = self._write_tags + ext = "txt" else: self.write = self._write_json self.indent = options.get("indent", 4) self.ascii = options.get("ascii", False) ext = "json" - self.extension = options.get("extension", ext) + extfmt = options.get("extension-format") + if extfmt: + self.path = self._path_format + self.extfmt = util.Formatter(extfmt).format_map + else: + self.path = self._path_append + self.extension = options.get("extension", ext) + + if options.get("bypost"): + self.run_metadata, self.run = self.run, self.run_metadata def run(self, pathfmt): - path = "{}.{}".format(pathfmt.realpath, self.extension) - with open(path, "w", encoding="utf-8") as file: + with open(self.path(pathfmt), "w", encoding="utf-8") as file: self.write(file, pathfmt.kwdict) + def _path_append(self, pathfmt): + return "{}.{}".format(pathfmt.realpath, self.extension) + + def _path_format(self, pathfmt): + kwdict = pathfmt.kwdict + ext = kwdict["extension"] + kwdict["extension"] = pathfmt.extension + kwdict["extension"] = pathfmt.prefix + self.extfmt(kwdict) + path = pathfmt.realdirectory + pathfmt.build_filename() + kwdict["extension"] = ext + return path + def _write_custom(self, file, kwdict): - output = self.formatter.format_map(kwdict) - file.write(output) + file.write(self.contentfmt(kwdict)) def _write_tags(self, file, kwdict): tags = kwdict.get("tags") or kwdict.get("tag_string") @@ -58,7 +78,7 @@ class MetadataPP(PostProcessor): file.write("\n") def _write_json(self, file, kwdict): - util.dump_json(kwdict, file, self.ascii, self.indent) + util.dump_json(util.filter_dict(kwdict), file, self.ascii, self.indent) __postprocessor__ = MetadataPP diff --git a/gallery_dl/util.py b/gallery_dl/util.py index fb51edf..48ae0be 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -78,6 +78,11 @@ def transform_dict(a, func): a[key] = func(value) +def filter_dict(a): + """Return a copy of 'a' without "private" entries""" + return {k: v for k, v in a.items() if k[0] != "_"} + + def number_to_string(value, numbers=(int, float)): """Convert numbers (int, float) to string; Return everything else as is.""" return str(value) if value.__class__ in numbers else value @@ -665,17 +670,17 @@ class PathFormat(): self.temppath = self.realpath = self.realpath[:-1] return True - def build_path(self): - """Use filename metadata and directory to build a full path""" - - # Apply 'kwdict' to filename format string + def build_filename(self): + """Apply 'kwdict' to filename format string""" try: - self.filename = filename = self.clean_path(self.clean_segment( + return self.clean_path(self.clean_segment( self.filename_formatter(self.kwdict))) except Exception as exc: raise exception.FilenameFormatError(exc) - # Combine directory and filename to full paths + def build_path(self): + """Combine directory and filename to full paths""" + self.filename = filename = self.build_filename() self.path = self.directory + filename self.realpath = self.realdirectory + filename if not self.temppath: @@ -743,13 +748,13 @@ class DownloadArchive(): def __contains__(self, kwdict): """Return True if the item described by 'kwdict' exists in archive""" - key = self.keygen(kwdict) + key = kwdict["_archive_key"] = self.keygen(kwdict) self.cursor.execute( "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,)) return self.cursor.fetchone() def add(self, kwdict): """Add item described by 'kwdict' to archive""" - key = self.keygen(kwdict) + key = kwdict.get("_archive_key") or self.keygen(kwdict) self.cursor.execute( "INSERT OR IGNORE INTO archive VALUES (?)", (key,)) diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 4d73139..2ac7ceb 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.11.1" +__version__ = "1.12.1" diff --git a/test/test_config.py b/test/test_config.py index 8cdb3da..a9d3f54 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2015-2017 Mike Fährmann +# Copyright 2015-2019 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -16,65 +16,125 @@ import tempfile class TestConfig(unittest.TestCase): def setUp(self): - fd, self._configfile = tempfile.mkstemp() - with os.fdopen(fd, "w") as file: - file.write('{"a": "1", "b": {"a": 2, "c": "text"}}') - config.load((self._configfile,)) + config.set(() , "a", 1) + config.set(("b",) , "a", 2) + config.set(("b", "b"), "a", 3) + config.set(("b",) , "c", "text") + config.set(("b", "b"), "c", [8, 9]) def tearDown(self): config.clear() - os.remove(self._configfile) def test_get(self): - self.assertEqual(config.get(["a"]), "1") - self.assertEqual(config.get(["b", "c"]), "text") - self.assertEqual(config.get(["d"]), None) - self.assertEqual(config.get(["e", "f", "g"], 123), 123) + self.assertEqual(config.get(() , "a") , 1) + self.assertEqual(config.get(("b",) , "a") , 2) + self.assertEqual(config.get(("b", "b"), "a") , 3) + + self.assertEqual(config.get(() , "c") , None) + self.assertEqual(config.get(("b",) , "c") , "text") + self.assertEqual(config.get(("b", "b"), "c") , [8, 9]) + + self.assertEqual(config.get(("a",) , "g") , None) + self.assertEqual(config.get(("a", "a"), "g") , None) + self.assertEqual(config.get(("e", "f"), "g") , None) + self.assertEqual(config.get(("e", "f"), "g", 4), 4) def test_interpolate(self): - self.assertEqual(config.interpolate(["a"]), "1") - self.assertEqual(config.interpolate(["b", "a"]), "1") - self.assertEqual(config.interpolate(["b", "c"], "2"), "text") - self.assertEqual(config.interpolate(["b", "d"], "2"), "2") - config.set(["d"], 123) - self.assertEqual(config.interpolate(["b", "d"], "2"), 123) - self.assertEqual(config.interpolate(["d", "d"], "2"), 123) + self.assertEqual(config.interpolate(() , "a"), 1) + self.assertEqual(config.interpolate(("b",) , "a"), 1) + self.assertEqual(config.interpolate(("b", "b"), "a"), 1) + + self.assertEqual(config.interpolate(() , "c"), None) + self.assertEqual(config.interpolate(("b",) , "c"), "text") + self.assertEqual(config.interpolate(("b", "b"), "c"), [8, 9]) + + self.assertEqual(config.interpolate(("a",) , "g") , None) + self.assertEqual(config.interpolate(("a", "a"), "g") , None) + self.assertEqual(config.interpolate(("e", "f"), "g") , None) + self.assertEqual(config.interpolate(("e", "f"), "g", 4), 4) + + self.assertEqual(config.interpolate(("b",), "d", 1) , 1) + self.assertEqual(config.interpolate(("d",), "d", 1) , 1) + config.set(() , "d", 2) + self.assertEqual(config.interpolate(("b",), "d", 1) , 2) + self.assertEqual(config.interpolate(("d",), "d", 1) , 2) + config.set(("b",), "d", 3) + self.assertEqual(config.interpolate(("b",), "d", 1) , 2) + self.assertEqual(config.interpolate(("d",), "d", 1) , 2) def test_set(self): - config.set(["b", "c"], [1, 2, 3]) - config.set(["e", "f", "g"], value=234) - self.assertEqual(config.get(["b", "c"]), [1, 2, 3]) - self.assertEqual(config.get(["e", "f", "g"]), 234) + config.set(() , "c", [1, 2, 3]) + config.set(("b",) , "c", [1, 2, 3]) + config.set(("e", "f"), "g", value=234) + self.assertEqual(config.get(() , "c"), [1, 2, 3]) + self.assertEqual(config.get(("b",) , "c"), [1, 2, 3]) + self.assertEqual(config.get(("e", "f"), "g"), 234) def test_setdefault(self): - config.setdefault(["b", "c"], [1, 2, 3]) - config.setdefault(["e", "f", "g"], value=234) - self.assertEqual(config.get(["b", "c"]), "text") - self.assertEqual(config.get(["e", "f", "g"]), 234) + config.setdefault(() , "c", [1, 2, 3]) + config.setdefault(("b",) , "c", [1, 2, 3]) + config.setdefault(("e", "f"), "g", value=234) + self.assertEqual(config.get(() , "c"), [1, 2, 3]) + self.assertEqual(config.get(("b",) , "c"), "text") + self.assertEqual(config.get(("e", "f"), "g"), 234) def test_unset(self): - config.unset(["a"]) - config.unset(["b", "c"]) - config.unset(["c", "d"]) - self.assertEqual(config.get(["a"]), None) - self.assertEqual(config.get(["b", "a"]), 2) - self.assertEqual(config.get(["b", "c"]), None) + config.unset(() , "a") + config.unset(("b",), "c") + config.unset(("a",), "d") + config.unset(("b",), "d") + config.unset(("c",), "d") + self.assertEqual(config.get(() , "a"), None) + self.assertEqual(config.get(("b",), "a"), 2) + self.assertEqual(config.get(("b",), "c"), None) + self.assertEqual(config.get(("a",), "d"), None) + self.assertEqual(config.get(("b",), "d"), None) + self.assertEqual(config.get(("c",), "d"), None) def test_apply(self): options = ( - (["b", "c"], [1, 2, 3]), - (["e", "f", "g"], 234), + (("b",) , "c", [1, 2, 3]), + (("e", "f"), "g", 234), ) - self.assertEqual(config.get(["b", "c"]), "text") - self.assertEqual(config.get(["e", "f", "g"]), None) + self.assertEqual(config.get(("b",) , "c"), "text") + self.assertEqual(config.get(("e", "f"), "g"), None) with config.apply(options): - self.assertEqual(config.get(["b", "c"]), [1, 2, 3]) - self.assertEqual(config.get(["e", "f", "g"]), 234) - - self.assertEqual(config.get(["b", "c"]), "text") - self.assertEqual(config.get(["e", "f", "g"]), None) + self.assertEqual(config.get(("b",) , "c"), [1, 2, 3]) + self.assertEqual(config.get(("e", "f"), "g"), 234) + + self.assertEqual(config.get(("b",) , "c"), "text") + self.assertEqual(config.get(("e", "f"), "g"), None) + + def test_load(self): + with tempfile.TemporaryDirectory() as base: + path1 = os.path.join(base, "cfg1") + with open(path1, "w") as file: + file.write('{"a": 1, "b": {"a": 2, "c": "text"}}') + + path2 = os.path.join(base, "cfg2") + with open(path2, "w") as file: + file.write('{"a": 7, "b": {"a": 8, "e": "foo"}}') + + config.clear() + config.load((path1,)) + self.assertEqual(config.get(() , "a"), 1) + self.assertEqual(config.get(("b",), "a"), 2) + self.assertEqual(config.get(("b",), "c"), "text") + + config.load((path2,)) + self.assertEqual(config.get(() , "a"), 7) + self.assertEqual(config.get(("b",), "a"), 8) + self.assertEqual(config.get(("b",), "c"), "text") + self.assertEqual(config.get(("b",), "e"), "foo") + + config.clear() + config.load((path1, path2)) + self.assertEqual(config.get(() , "a"), 7) + self.assertEqual(config.get(("b",), "a"), 8) + self.assertEqual(config.get(("b",), "c"), "text") + self.assertEqual(config.get(("b",), "e"), "foo") if __name__ == '__main__': diff --git a/test/test_cookies.py b/test/test_cookies.py index a786df6..4f294bf 100644 --- a/test/test_cookies.py +++ b/test/test_cookies.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2017 Mike Fährmann +# Copyright 2017-2019 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -18,8 +18,6 @@ from os.path import join import gallery_dl.config as config import gallery_dl.extractor as extractor -CKEY = ("cookies",) - class TestCookiejar(unittest.TestCase): @@ -45,7 +43,7 @@ class TestCookiejar(unittest.TestCase): config.clear() def test_cookiefile(self): - config.set(CKEY, self.cookiefile) + config.set((), "cookies", self.cookiefile) cookies = extractor.find("test:").session.cookies self.assertEqual(len(cookies), 1) @@ -63,7 +61,7 @@ class TestCookiejar(unittest.TestCase): self._test_warning(join(self.path.name, "nothing"), FileNotFoundError) def _test_warning(self, filename, exc): - config.set(CKEY, filename) + config.set((), "cookies", filename) log = logging.getLogger("test") with mock.patch.object(log, "warning") as mock_warning: cookies = extractor.find("test:").session.cookies @@ -77,7 +75,7 @@ class TestCookiedict(unittest.TestCase): def setUp(self): self.cdict = {"NAME1": "VALUE1", "NAME2": "VALUE2"} - config.set(CKEY, self.cdict) + config.set((), "cookies", self.cdict) def tearDown(self): config.clear() @@ -112,7 +110,7 @@ class TestCookieLogin(unittest.TestCase): } for category, cookienames in extr_cookies.items(): cookies = {name: "value" for name in cookienames} - config.set(CKEY, cookies) + config.set((), "cookies", cookies) extr = _get_extractor(category) with mock.patch.object(extr, "_login_impl") as mock_login: extr.login() diff --git a/test/test_downloader.py b/test/test_downloader.py index 0f58d4e..a7c4ce6 100644 --- a/test/test_downloader.py +++ b/test/test_downloader.py @@ -100,7 +100,7 @@ class TestDownloaderBase(unittest.TestCase): cls.extractor = extractor.find("test:") cls.dir = tempfile.TemporaryDirectory() cls.fnum = 0 - config.set(("base-directory",), cls.dir.name) + config.set((), "base-directory", cls.dir.name) @classmethod def tearDownClass(cls): diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py index 0ab89db..17f82c9 100644 --- a/test/test_postprocessor.py +++ b/test/test_postprocessor.py @@ -58,7 +58,7 @@ class BasePostprocessorTest(unittest.TestCase): def setUpClass(cls): cls.extractor = extractor.find("test:") cls.dir = tempfile.TemporaryDirectory() - config.set(("base-directory",), cls.dir.name) + config.set((), "base-directory", cls.dir.name) @classmethod def tearDownClass(cls): @@ -151,8 +151,12 @@ class MetadataTest(BasePostprocessorTest): "ascii" : True, "indent" : 2, "extension": "JSON", + }, { + "public" : "hello", + "_private" : "world", }) + self.assertEqual(pp.path , pp._path_append) self.assertEqual(pp.write , pp._write_json) self.assertEqual(pp.ascii , True) self.assertEqual(pp.indent , 2) @@ -167,7 +171,8 @@ class MetadataTest(BasePostprocessorTest): self.assertEqual(self._output(m), """{ "category": "test", "extension": "ext", - "filename": "file" + "filename": "file", + "public": "hello" } """) @@ -224,13 +229,41 @@ class MetadataTest(BasePostprocessorTest): ) self.assertEqual(pp.write, pp._write_custom) self.assertEqual(pp.extension, "txt") - self.assertTrue(pp.formatter) + self.assertTrue(pp.contentfmt) with patch("builtins.open", mock_open()) as m: pp.prepare(self.pathfmt) pp.run(self.pathfmt) self.assertEqual(self._output(m), "bar\nNone\n") + def test_metadata_extfmt(self): + pp = self._create({ + "extension" : "ignored", + "extension-format": "json", + }) + + self.assertEqual(pp.path, pp._path_format) + + with patch("builtins.open", mock_open()) as m: + pp.prepare(self.pathfmt) + pp.run(self.pathfmt) + + path = self.pathfmt.realdirectory + "file.json" + m.assert_called_once_with(path, "w", encoding="utf-8") + + def test_metadata_extfmt_2(self): + pp = self._create({ + "extension-format": "{extension!u}-data:{category:Res/ES/}", + }) + + self.pathfmt.prefix = "2." + with patch("builtins.open", mock_open()) as m: + pp.prepare(self.pathfmt) + pp.run(self.pathfmt) + + path = self.pathfmt.realdirectory + "file.2.EXT-data:tESt" + m.assert_called_once_with(path, "w", encoding="utf-8") + @staticmethod def _output(mock): return "".join( diff --git a/test/test_results.py b/test/test_results.py index 6d628c3..869ff83 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -26,10 +26,9 @@ TRAVIS_SKIP = { # temporary issues, etc. BROKEN = { - "hentaifox", - "livedoor", + "erolord", "mangapark", - "yaplog", + "photobucket", } @@ -56,10 +55,11 @@ class TestExtractorResults(unittest.TestCase): if result: if "options" in result: for key, value in result["options"]: - config.set(key.split("."), value) + key = key.split(".") + config.set(key[:-1], key[-1], value) if "range" in result: - config.set(("image-range",), result["range"]) - config.set(("chapter-range",), result["range"]) + config.set((), "image-range" , result["range"]) + config.set((), "chapter-range", result["range"]) content = "content" in result else: content = False @@ -79,14 +79,18 @@ class TestExtractorResults(unittest.TestCase): pass except exception.HttpError as exc: exc = str(exc) - if re.match(r"5\d\d: ", exc) or \ + if re.match(r"'5\d\d ", exc) or \ re.search(r"\bRead timed out\b", exc): self._skipped.append((url, exc)) self.skipTest(exc) raise - # test archive-id uniqueness - self.assertEqual(len(set(tjob.archive_list)), len(tjob.archive_list)) + if result.get("archive", True): + self.assertEqual( + len(set(tjob.archive_list)), + len(tjob.archive_list), + "archive-id uniqueness", + ) if tjob.queue: # test '_extractor' entries @@ -185,7 +189,7 @@ class ResultJob(job.DownloadJob): self._update_url(url) self._update_kwdict(kwdict) self._update_archive(kwdict) - self._update_content(url) + self._update_content(url, kwdict) self.format_filename(kwdict) def handle_directory(self, kwdict): @@ -204,7 +208,7 @@ class ResultJob(job.DownloadJob): def _update_kwdict(self, kwdict, to_list=True): if to_list: self.kwdict_list.append(kwdict.copy()) - kwdict = self._filter(kwdict) + kwdict = util.filter_dict(kwdict) self.kwdict_hash.update( json.dumps(kwdict, sort_keys=True, default=str).encode()) @@ -213,9 +217,10 @@ class ResultJob(job.DownloadJob): self.archive_list.append(archive_id) self.archive_hash.update(archive_id.encode()) - def _update_content(self, url): + def _update_content(self, url, kwdict): if self.content: scheme = url.partition(":")[0] + self.fileobj.kwdict = kwdict self.get_downloader(scheme).download(url, self.fileobj) @@ -281,34 +286,36 @@ def setup_test_config(): email = "gallerydl@openaliasbox.org" config.clear() - config.set(("cache", "file"), ":memory:") - config.set(("downloader", "part"), False) - config.set(("downloader", "adjust-extensions"), False) - config.set(("extractor", "timeout"), 60) - config.set(("extractor", "username"), name) - config.set(("extractor", "password"), name) - config.set(("extractor", "nijie" , "username"), email) - config.set(("extractor", "seiga" , "username"), email) - - config.set(("extractor", "danbooru" , "username"), None) - config.set(("extractor", "instagram", "username"), None) - config.set(("extractor", "imgur" , "username"), None) - config.set(("extractor", "twitter" , "username"), None) - - config.set(("extractor", "mangoxo" , "username"), "LiQiang3") - config.set(("extractor", "mangoxo" , "password"), "5zbQF10_5u25259Ma") - - config.set(("extractor", "deviantart", "client-id"), "7777") - config.set(("extractor", "deviantart", "client-secret"), + config.set(("cache",), "file", None) + config.set(("downloader",), "part", False) + config.set(("downloader",), "adjust-extensions", False) + config.set(("extractor" ,), "timeout" , 60) + config.set(("extractor" ,), "username", name) + config.set(("extractor" ,), "password", name) + + config.set(("extractor", "nijie") , "username", email) + config.set(("extractor", "seiga") , "username", email) + config.set(("extractor", "danbooru") , "username", None) + config.set(("extractor", "instagram") , "username", None) + config.set(("extractor", "twitter") , "username", None) + + config.set(("extractor", "newgrounds"), "username", "d1618111") + config.set(("extractor", "newgrounds"), "password", "d1618111") + + config.set(("extractor", "mangoxo") , "username", "LiQiang3") + config.set(("extractor", "mangoxo") , "password", "5zbQF10_5u25259Ma") + + config.set(("extractor", "deviantart"), "client-id", "7777") + config.set(("extractor", "deviantart"), "client-secret", "ff14994c744d9208e5caeec7aab4a026") - config.set(("extractor", "tumblr", "api-key"), + config.set(("extractor", "tumblr"), "api-key", "0cXoHfIqVzMQcc3HESZSNsVlulGxEXGDTTZCDrRrjaa0jmuTc6") - config.set(("extractor", "tumblr", "api-secret"), + config.set(("extractor", "tumblr"), "api-secret", "6wxAK2HwrXdedn7VIoZWxGqVhZ8JdYKDLjiQjL46MLqGuEtyVj") - config.set(("extractor", "tumblr", "access-token"), + config.set(("extractor", "tumblr"), "access-token", "N613fPV6tOZQnyn0ERTuoEZn0mEqG8m2K8M3ClSJdEHZJuqFdG") - config.set(("extractor", "tumblr", "access-token-secret"), + config.set(("extractor", "tumblr"), "access-token-secret", "sgOA7ZTT4FBXdOGGVV331sSp0jHYp4yMDRslbhaQf7CaS71i4O") diff --git a/test/test_util.py b/test/test_util.py index 9b252a3..5a103cf 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -358,6 +358,21 @@ class TestOther(unittest.TestCase): self.assertEqual( d, {1: 123, 2: 123, 3: 0, 4: {11: 321, 12: 321, 13: 0}}) + def test_filter_dict(self): + d = {} + r = util.filter_dict(d) + self.assertEqual(r, d) + self.assertIsNot(r, d) + + d = {"foo": 123, "bar": [], "baz": None} + r = util.filter_dict(d) + self.assertEqual(r, d) + self.assertIsNot(r, d) + + d = {"foo": 123, "_bar": [], "__baz__": None} + r = util.filter_dict(d) + self.assertEqual(r, {"foo": 123}) + def test_number_to_string(self, f=util.number_to_string): self.assertEqual(f(1) , "1") self.assertEqual(f(1.0) , "1.0") |