Diffstat (limited to 'gallery_dl')
-rw-r--r--  gallery_dl/__init__.py | 255
-rw-r--r--  gallery_dl/__main__.py | 20
-rw-r--r--  gallery_dl/aes.py | 337
-rw-r--r--  gallery_dl/cache.py | 204
-rw-r--r--  gallery_dl/cloudflare.py | 176
-rw-r--r--  gallery_dl/config.py | 155
-rw-r--r--  gallery_dl/downloader/__init__.py | 39
-rw-r--r--  gallery_dl/downloader/common.py | 170
-rw-r--r--  gallery_dl/downloader/http.py | 128
-rw-r--r--  gallery_dl/downloader/text.py | 37
-rw-r--r--  gallery_dl/downloader/ytdl.py | 81
-rw-r--r--  gallery_dl/exception.py | 79
-rw-r--r--  gallery_dl/extractor/2chan.py | 95
-rw-r--r--  gallery_dl/extractor/35photo.py | 205
-rw-r--r--  gallery_dl/extractor/3dbooru.py | 81
-rw-r--r--  gallery_dl/extractor/4chan.py | 36
-rw-r--r--  gallery_dl/extractor/500px.py | 238
-rw-r--r--  gallery_dl/extractor/8chan.py | 29
-rw-r--r--  gallery_dl/extractor/8muses.py | 129
-rw-r--r--  gallery_dl/extractor/__init__.py | 189
-rw-r--r--  gallery_dl/extractor/artstation.py | 369
-rw-r--r--  gallery_dl/extractor/behance.py | 179
-rw-r--r--  gallery_dl/extractor/bobx.py | 112
-rw-r--r--  gallery_dl/extractor/booru.py | 265
-rw-r--r--  gallery_dl/extractor/chan.py | 61
-rw-r--r--  gallery_dl/extractor/common.py | 432
-rw-r--r--  gallery_dl/extractor/danbooru.py | 86
-rw-r--r--  gallery_dl/extractor/deviantart.py | 992
-rw-r--r--  gallery_dl/extractor/directlink.py | 56
-rw-r--r--  gallery_dl/extractor/dynastyscans.py | 145
-rw-r--r--  gallery_dl/extractor/e621.py | 71
-rw-r--r--  gallery_dl/extractor/exhentai.py | 382
-rw-r--r--  gallery_dl/extractor/fallenangels.py | 105
-rw-r--r--  gallery_dl/extractor/flickr.py | 503
-rw-r--r--  gallery_dl/extractor/foolfuuka.py | 157
-rw-r--r--  gallery_dl/extractor/foolslide.py | 240
-rw-r--r--  gallery_dl/extractor/gelbooru.py | 130
-rw-r--r--  gallery_dl/extractor/gfycat.py | 83
-rw-r--r--  gallery_dl/extractor/hbrowse.py | 101
-rw-r--r--  gallery_dl/extractor/hentai2read.py | 101
-rw-r--r--  gallery_dl/extractor/hentaicafe.py | 88
-rw-r--r--  gallery_dl/extractor/hentaifoundry.py | 264
-rw-r--r--  gallery_dl/extractor/hentaifox.py | 117
-rw-r--r--  gallery_dl/extractor/hentaihere.py | 101
-rw-r--r--  gallery_dl/extractor/hentainexus.py | 96
-rw-r--r--  gallery_dl/extractor/hitomi.py | 103
-rw-r--r--  gallery_dl/extractor/hypnohub.py | 68
-rw-r--r--  gallery_dl/extractor/idolcomplex.py | 59
-rw-r--r--  gallery_dl/extractor/imagebam.py | 128
-rw-r--r--  gallery_dl/extractor/imagefap.py | 195
-rw-r--r--  gallery_dl/extractor/imagehosts.py | 251
-rw-r--r--  gallery_dl/extractor/imgbox.py | 134
-rw-r--r--  gallery_dl/extractor/imgth.py | 61
-rw-r--r--  gallery_dl/extractor/imgur.py | 183
-rw-r--r--  gallery_dl/extractor/instagram.py | 277
-rw-r--r--  gallery_dl/extractor/keenspot.py | 157
-rw-r--r--  gallery_dl/extractor/khinsider.py | 69
-rw-r--r--  gallery_dl/extractor/kissmanga.py | 223
-rw-r--r--  gallery_dl/extractor/komikcast.py | 117
-rw-r--r--  gallery_dl/extractor/konachan.py | 85
-rw-r--r--  gallery_dl/extractor/livedoor.py | 156
-rw-r--r--  gallery_dl/extractor/luscious.py | 208
-rw-r--r--  gallery_dl/extractor/mangadex.py | 180
-rw-r--r--  gallery_dl/extractor/mangafox.py | 61
-rw-r--r--  gallery_dl/extractor/mangahere.py | 138
-rw-r--r--  gallery_dl/extractor/mangapanda.py | 36
-rw-r--r--  gallery_dl/extractor/mangapark.py | 140
-rw-r--r--  gallery_dl/extractor/mangareader.py | 119
-rw-r--r--  gallery_dl/extractor/mangastream.py | 54
-rw-r--r--  gallery_dl/extractor/mangoxo.py | 176
-rw-r--r--  gallery_dl/extractor/mastodon.py | 203
-rw-r--r--  gallery_dl/extractor/message.py | 54
-rw-r--r--  gallery_dl/extractor/myportfolio.py | 95
-rw-r--r--  gallery_dl/extractor/newgrounds.py | 155
-rw-r--r--  gallery_dl/extractor/ngomik.py | 51
-rw-r--r--  gallery_dl/extractor/nhentai.py | 135
-rw-r--r--  gallery_dl/extractor/nijie.py | 205
-rw-r--r--  gallery_dl/extractor/nsfwalbum.py | 62
-rw-r--r--  gallery_dl/extractor/oauth.py | 375
-rw-r--r--  gallery_dl/extractor/paheal.py | 120
-rw-r--r--  gallery_dl/extractor/patreon.py | 183
-rw-r--r--  gallery_dl/extractor/photobucket.py | 178
-rw-r--r--  gallery_dl/extractor/piczel.py | 118
-rw-r--r--  gallery_dl/extractor/pinterest.py | 260
-rw-r--r--  gallery_dl/extractor/pixiv.py | 517
-rw-r--r--  gallery_dl/extractor/pixnet.py | 179
-rw-r--r--  gallery_dl/extractor/plurk.py | 125
-rw-r--r--  gallery_dl/extractor/pornhub.py | 157
-rw-r--r--  gallery_dl/extractor/pururin.py | 102
-rw-r--r--  gallery_dl/extractor/reactor.py | 338
-rw-r--r--  gallery_dl/extractor/readcomiconline.py | 97
-rw-r--r--  gallery_dl/extractor/recursive.py | 55
-rw-r--r--  gallery_dl/extractor/reddit.py | 313
-rw-r--r--  gallery_dl/extractor/rule34.py | 63
-rw-r--r--  gallery_dl/extractor/safebooru.py | 61
-rw-r--r--  gallery_dl/extractor/sankaku.py | 299
-rw-r--r--  gallery_dl/extractor/sankakucomplex.py | 120
-rw-r--r--  gallery_dl/extractor/seiga.py | 198
-rw-r--r--  gallery_dl/extractor/senmanga.py | 65
-rw-r--r--  gallery_dl/extractor/sexcom.py | 194
-rw-r--r--  gallery_dl/extractor/shopify.py | 136
-rw-r--r--  gallery_dl/extractor/simplyhentai.py | 187
-rw-r--r--  gallery_dl/extractor/slickpic.py | 140
-rw-r--r--  gallery_dl/extractor/slideshare.py | 86
-rw-r--r--  gallery_dl/extractor/smugmug.py | 316
-rw-r--r--  gallery_dl/extractor/test.py | 86
-rw-r--r--  gallery_dl/extractor/tsumino.py | 343
-rw-r--r--  gallery_dl/extractor/tumblr.py | 425
-rw-r--r--  gallery_dl/extractor/twitter.py | 202
-rw-r--r--  gallery_dl/extractor/vanillarock.py | 95
-rw-r--r--  gallery_dl/extractor/wallhaven.py | 148
-rw-r--r--  gallery_dl/extractor/warosu.py | 108
-rw-r--r--  gallery_dl/extractor/weibo.py | 137
-rw-r--r--  gallery_dl/extractor/wikiart.py | 134
-rw-r--r--  gallery_dl/extractor/xhamster.py | 171
-rw-r--r--  gallery_dl/extractor/xvideos.py | 140
-rw-r--r--  gallery_dl/extractor/yandere.py | 68
-rw-r--r--  gallery_dl/extractor/yaplog.py | 109
-rw-r--r--  gallery_dl/extractor/yuki.py | 125
-rw-r--r--  gallery_dl/job.py | 492
-rw-r--r--  gallery_dl/oauth.py | 132
-rw-r--r--  gallery_dl/option.py | 304
-rw-r--r--  gallery_dl/output.py | 221
-rw-r--r--  gallery_dl/postprocessor/__init__.py | 44
-rw-r--r--  gallery_dl/postprocessor/classify.py | 49
-rw-r--r--  gallery_dl/postprocessor/common.py | 25
-rw-r--r--  gallery_dl/postprocessor/exec.py | 43
-rw-r--r--  gallery_dl/postprocessor/metadata.py | 65
-rw-r--r--  gallery_dl/postprocessor/ugoira.py | 132
-rw-r--r--  gallery_dl/postprocessor/zip.py | 65
-rw-r--r--  gallery_dl/text.py | 278
-rw-r--r--  gallery_dl/util.py | 673
-rw-r--r--  gallery_dl/version.py | 9
133 files changed, 22062 insertions(+), 0 deletions(-)
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py
new file mode 100644
index 0000000..3643a5c
--- /dev/null
+++ b/gallery_dl/__init__.py
@@ -0,0 +1,255 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from __future__ import unicode_literals, print_function
+
+__author__ = "Mike Fährmann"
+__copyright__ = "Copyright 2014-2018 Mike Fährmann"
+__license__ = "GPLv2"
+__maintainer__ = "Mike Fährmann"
+__email__ = "mike_faehrmann@web.de"
+
+import sys
+
+if sys.hexversion < 0x3040000:
+ sys.exit("Python 3.4+ required")
+
+import json
+import logging
+from . import version, config, option, output, extractor, job, util, exception
+
+__version__ = version.__version__
+
+
+def progress(urls, pformat):
+ """Wrapper around urls to output a simple progress indicator"""
+ if pformat is True:
+ pformat = "[{current}/{total}] {url}"
+ pinfo = {"total": len(urls)}
+ for pinfo["current"], pinfo["url"] in enumerate(urls, 1):
+ print(pformat.format_map(pinfo), file=sys.stderr)
+ yield pinfo["url"]
+
+
+def parse_inputfile(file, log):
+ """Filter and process strings from an input file.
+
+ Lines starting with '#' and empty lines will be ignored.
+ Lines starting with '-' will be interpreted as a key-value pair separated
+ by an '=', where 'key' is a dot-separated option name and 'value' is a
+ JSON-parsable value for it. These config options will be applied while
+ processing the next URL.
+ Lines starting with '-G' are the same as above, except these options will
+ be valid for all following URLs, i.e. they are Global.
+ Everything else will be used as a potential URL.
+
+ Example input file:
+
+ # setting global options
+ -G base-directory = "/tmp/"
+ -G skip = false
+
+ # setting local options for the next URL
+ -filename="spaces_are_optional.jpg"
+ -skip = true
+
+ https://example.org/
+
+ # next URL uses default filename and 'skip' is false.
+ https://example.com/index.htm
+ """
+ gconf = []
+ lconf = []
+
+ for line in file:
+ line = line.strip()
+
+ if not line or line[0] == "#":
+ # empty line or comment
+ continue
+
+ elif line[0] == "-":
+ # config spec
+ if len(line) >= 2 and line[1] == "G":
+ conf = gconf
+ line = line[2:]
+ else:
+ conf = lconf
+ line = line[1:]
+
+ key, sep, value = line.partition("=")
+ if not sep:
+ log.warning("input file: invalid <key>=<value> pair: %s", line)
+ continue
+
+ try:
+ value = json.loads(value.strip())
+ except ValueError as exc:
+ log.warning("input file: unable to parse '%s': %s", value, exc)
+ continue
+
+ conf.append((key.strip().split("."), value))
+
+ else:
+ # url
+ if gconf or lconf:
+ yield util.ExtendedUrl(line, gconf, lconf)
+ gconf = []
+ lconf = []
+ else:
+ yield line
+
+
+def main():
+ try:
+ if sys.stdout.encoding.lower() != "utf-8":
+ output.replace_std_streams()
+
+ parser = option.build_parser()
+ args = parser.parse_args()
+ log = output.initialize_logging(args.loglevel)
+
+ # configuration
+ if args.load_config:
+ config.load()
+ if args.cfgfiles:
+ config.load(args.cfgfiles, strict=True)
+ if args.yamlfiles:
+ config.load(args.yamlfiles, strict=True, fmt="yaml")
+ if args.postprocessors:
+ config.set(("postprocessors", ), args.postprocessors)
+ for key, value in args.options:
+ config.set(key, value)
+
+ # stream logging handler
+ output.configure_logging_handler(
+ "log", logging.getLogger().handlers[0])
+
+ # file logging handler
+ handler = output.setup_logging_handler(
+ "logfile", lvl=args.loglevel)
+ if handler:
+ logging.getLogger().addHandler(handler)
+
+ # loglevels
+ if args.loglevel >= logging.ERROR:
+ config.set(("output", "mode"), "null")
+ elif args.loglevel <= logging.DEBUG:
+ import platform
+ import subprocess
+ import os.path
+ import requests
+
+ head = ""
+ try:
+ out, err = subprocess.Popen(
+ ("git", "rev-parse", "--short", "HEAD"),
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ cwd=os.path.dirname(os.path.abspath(__file__)),
+ ).communicate()
+ if out and not err:
+ head = " - Git HEAD: " + out.decode().rstrip()
+ except (OSError, subprocess.SubprocessError):
+ pass
+
+ log.debug("Version %s%s", __version__, head)
+ log.debug("Python %s - %s",
+ platform.python_version(), platform.platform())
+ try:
+ log.debug("requests %s - urllib3 %s",
+ requests.__version__,
+ requests.packages.urllib3.__version__)
+ except AttributeError:
+ pass
+
+ if args.list_modules:
+ for module_name in extractor.modules:
+ print(module_name)
+ elif args.list_extractors:
+ for extr in extractor.extractors():
+ if not extr.__doc__:
+ continue
+ print(extr.__name__)
+ print(extr.__doc__)
+ print("Category:", extr.category,
+ "- Subcategory:", extr.subcategory)
+ test = next(extr._get_tests(), None)
+ if test:
+ print("Example :", test[0])
+ print()
+ elif args.clear_cache:
+ from . import cache
+ log = logging.getLogger("cache")
+ cnt = cache.clear()
+
+ if cnt is None:
+ log.error("Database file not available")
+ else:
+ log.info(
+ "Deleted %d %s from '%s'",
+ cnt, "entry" if cnt == 1 else "entries", cache._path(),
+ )
+ else:
+ if not args.urls and not args.inputfile:
+ parser.error(
+ "The following arguments are required: URL\n"
+ "Use 'gallery-dl --help' to get a list of all options.")
+
+ if args.list_urls:
+ jobtype = job.UrlJob
+ jobtype.maxdepth = args.list_urls
+ else:
+ jobtype = args.jobtype or job.DownloadJob
+
+ urls = args.urls
+ if args.inputfile:
+ try:
+ if args.inputfile == "-":
+ file = sys.stdin
+ else:
+ file = open(args.inputfile, encoding="utf-8")
+ urls += parse_inputfile(file, log)
+ file.close()
+ except OSError as exc:
+ log.warning("input file: %s", exc)
+
+ # unsupported file logging handler
+ handler = output.setup_logging_handler(
+ "unsupportedfile", fmt="{message}")
+ if handler:
+ ulog = logging.getLogger("unsupported")
+ ulog.addHandler(handler)
+ ulog.propagate = False
+ job.Job.ulog = ulog
+
+ pformat = config.get(("output", "progress"), True)
+ if pformat and len(urls) > 1 and args.loglevel < logging.ERROR:
+ urls = progress(urls, pformat)
+
+ for url in urls:
+ try:
+ log.debug("Starting %s for '%s'", jobtype.__name__, url)
+ if isinstance(url, util.ExtendedUrl):
+ for key, value in url.gconfig:
+ config.set(key, value)
+ with config.apply(url.lconfig):
+ jobtype(url.value).run()
+ else:
+ jobtype(url).run()
+ except exception.NoExtractorError:
+ log.error("No suitable extractor found for '%s'", url)
+
+ except KeyboardInterrupt:
+ print("\nKeyboardInterrupt", file=sys.stderr)
+ except BrokenPipeError:
+ pass
+ except IOError as exc:
+ import errno
+ if exc.errno != errno.EPIPE:
+ raise
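
A minimal sketch of how parse_inputfile() above could be driven on its own; the input data and logger name are illustrative and not part of this commit:

    import io
    import logging

    from gallery_dl import parse_inputfile, util

    log = logging.getLogger("gallery-dl")
    lines = io.StringIO(
        '-G base-directory = "/tmp/"\n'
        '-filename="example.jpg"\n'
        'https://example.org/\n'
    )
    for url in parse_inputfile(lines, log):
        # plain strings for plain URLs; util.ExtendedUrl objects
        # when '-' or '-G' option lines preceded the URL
        print(url.value if isinstance(url, util.ExtendedUrl) else url)
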
diff --git a/gallery_dl/__main__.py b/gallery_dl/__main__.py
new file mode 100644
index 0000000..04ea9fe
--- /dev/null
+++ b/gallery_dl/__main__.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2017 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import sys
+
+if __package__ is None and not hasattr(sys, "frozen"):
+ import os.path
+ path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+ sys.path.insert(0, os.path.realpath(path))
+
+import gallery_dl
+
+if __name__ == "__main__":
+ gallery_dl.main()
diff --git a/gallery_dl/aes.py b/gallery_dl/aes.py
new file mode 100644
index 0000000..a45f50e
--- /dev/null
+++ b/gallery_dl/aes.py
@@ -0,0 +1,337 @@
+# -*- coding: utf-8 -*-
+
+# This is a stripped down version of youtube-dl's aes module.
+# All credit for this code goes to the authors of the youtube-dl project.
+# https://ytdl-org.github.io/youtube-dl/
+# https://github.com/ytdl-org/youtube-dl/
+
+import base64
+from math import ceil
+
+BLOCK_SIZE_BYTES = 16
+
+
+def aes_cbc_decrypt(data, key, iv):
+ """
+ Decrypt with aes in CBC mode
+
+ @param {int[]} data cipher
+ @param {int[]} key 16/24/32-Byte cipher key
+ @param {int[]} iv 16-Byte IV
+ @returns {int[]} decrypted data
+ """
+ expanded_key = key_expansion(key)
+ block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES))
+
+ decrypted_data = []
+ previous_cipher_block = iv
+ for i in range(block_count):
+ block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]
+ block += [0] * (BLOCK_SIZE_BYTES - len(block))
+
+ decrypted_block = aes_decrypt(block, expanded_key)
+ decrypted_data += xor(decrypted_block, previous_cipher_block)
+ previous_cipher_block = block
+ decrypted_data = decrypted_data[:len(data)]
+
+ return decrypted_data
+
+
+def aes_cbc_decrypt_text(data, key, iv):
+ """
+ Decrypt with aes in CBC mode
+
+ @param {string} data base64 encoded cipher
+ @param {int[]} key 16/24/32-Byte cipher key
+ @param {int[]} iv 16-Byte IV
+ @returns {string} decrypted data as utf8 encoded string
+ """
+ data = base64.standard_b64decode(bytes(data, "ascii"))
+ charcodes = aes_cbc_decrypt(list(data), key, iv)
+ last = charcodes[-1]
+ if last <= 16:
+ charcodes = charcodes[:-last]
+ return bytes(charcodes).decode()
+
+
+def key_expansion(data):
+ """
+ Generate key schedule
+
+ @param {int[]} data 16/24/32-Byte cipher key
+ @returns {int[]} 176/208/240-Byte expanded key
+ """
+ data = data[:] # copy
+ rcon_iteration = 1
+ key_size_bytes = len(data)
+ expanded_key_size_bytes = (key_size_bytes // 4 + 7) * BLOCK_SIZE_BYTES
+
+ while len(data) < expanded_key_size_bytes:
+ temp = data[-4:]
+ temp = key_schedule_core(temp, rcon_iteration)
+ rcon_iteration += 1
+ data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes])
+
+ for _ in range(3):
+ temp = data[-4:]
+ data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes])
+
+ if key_size_bytes == 32:
+ temp = data[-4:]
+ temp = sub_bytes(temp)
+ data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes])
+
+ if key_size_bytes == 32:
+ rounds = 3
+ elif key_size_bytes == 24:
+ rounds = 2
+ else:
+ rounds = 0
+ for _ in range(rounds):
+ temp = data[-4:]
+ data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes])
+ data = data[:expanded_key_size_bytes]
+
+ return data
+
+
+def aes_decrypt(data, expanded_key):
+ """
+ Decrypt one block with aes
+
+ @param {int[]} data 16-Byte cipher
+ @param {int[]} expanded_key 176/208/240-Byte expanded key
+ @returns {int[]} 16-Byte state
+ """
+ rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1
+
+ for i in range(rounds, 0, -1):
+ data = xor(
+ data,
+ expanded_key[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]
+ )
+ if i != rounds:
+ data = mix_columns_inv(data)
+ data = shift_rows_inv(data)
+ data = sub_bytes_inv(data)
+ data = xor(data, expanded_key[:BLOCK_SIZE_BYTES])
+
+ return data
+
+
+RCON = (
+ 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36,
+)
+SBOX = (
+ 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5,
+ 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
+ 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0,
+ 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
+ 0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC,
+ 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
+ 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A,
+ 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
+ 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0,
+ 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
+ 0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B,
+ 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
+ 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85,
+ 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
+ 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5,
+ 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
+ 0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17,
+ 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
+ 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88,
+ 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
+ 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C,
+ 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
+ 0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9,
+ 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
+ 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6,
+ 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
+ 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E,
+ 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
+ 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94,
+ 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
+ 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68,
+ 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16,
+)
+SBOX_INV = (
+ 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
+ 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
+ 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
+ 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
+ 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d,
+ 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
+ 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2,
+ 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
+ 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
+ 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
+ 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda,
+ 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
+ 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
+ 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
+ 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
+ 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
+ 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea,
+ 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
+ 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85,
+ 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
+ 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
+ 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
+ 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20,
+ 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
+ 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31,
+ 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
+ 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
+ 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
+ 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0,
+ 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
+ 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26,
+ 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d,
+)
+MIX_COLUMN_MATRIX = (
+ (0x2, 0x3, 0x1, 0x1),
+ (0x1, 0x2, 0x3, 0x1),
+ (0x1, 0x1, 0x2, 0x3),
+ (0x3, 0x1, 0x1, 0x2),
+)
+MIX_COLUMN_MATRIX_INV = (
+ (0xE, 0xB, 0xD, 0x9),
+ (0x9, 0xE, 0xB, 0xD),
+ (0xD, 0x9, 0xE, 0xB),
+ (0xB, 0xD, 0x9, 0xE),
+)
+RIJNDAEL_EXP_TABLE = (
+ 0x01, 0x03, 0x05, 0x0F, 0x11, 0x33, 0x55, 0xFF,
+ 0x1A, 0x2E, 0x72, 0x96, 0xA1, 0xF8, 0x13, 0x35,
+ 0x5F, 0xE1, 0x38, 0x48, 0xD8, 0x73, 0x95, 0xA4,
+ 0xF7, 0x02, 0x06, 0x0A, 0x1E, 0x22, 0x66, 0xAA,
+ 0xE5, 0x34, 0x5C, 0xE4, 0x37, 0x59, 0xEB, 0x26,
+ 0x6A, 0xBE, 0xD9, 0x70, 0x90, 0xAB, 0xE6, 0x31,
+ 0x53, 0xF5, 0x04, 0x0C, 0x14, 0x3C, 0x44, 0xCC,
+ 0x4F, 0xD1, 0x68, 0xB8, 0xD3, 0x6E, 0xB2, 0xCD,
+ 0x4C, 0xD4, 0x67, 0xA9, 0xE0, 0x3B, 0x4D, 0xD7,
+ 0x62, 0xA6, 0xF1, 0x08, 0x18, 0x28, 0x78, 0x88,
+ 0x83, 0x9E, 0xB9, 0xD0, 0x6B, 0xBD, 0xDC, 0x7F,
+ 0x81, 0x98, 0xB3, 0xCE, 0x49, 0xDB, 0x76, 0x9A,
+ 0xB5, 0xC4, 0x57, 0xF9, 0x10, 0x30, 0x50, 0xF0,
+ 0x0B, 0x1D, 0x27, 0x69, 0xBB, 0xD6, 0x61, 0xA3,
+ 0xFE, 0x19, 0x2B, 0x7D, 0x87, 0x92, 0xAD, 0xEC,
+ 0x2F, 0x71, 0x93, 0xAE, 0xE9, 0x20, 0x60, 0xA0,
+ 0xFB, 0x16, 0x3A, 0x4E, 0xD2, 0x6D, 0xB7, 0xC2,
+ 0x5D, 0xE7, 0x32, 0x56, 0xFA, 0x15, 0x3F, 0x41,
+ 0xC3, 0x5E, 0xE2, 0x3D, 0x47, 0xC9, 0x40, 0xC0,
+ 0x5B, 0xED, 0x2C, 0x74, 0x9C, 0xBF, 0xDA, 0x75,
+ 0x9F, 0xBA, 0xD5, 0x64, 0xAC, 0xEF, 0x2A, 0x7E,
+ 0x82, 0x9D, 0xBC, 0xDF, 0x7A, 0x8E, 0x89, 0x80,
+ 0x9B, 0xB6, 0xC1, 0x58, 0xE8, 0x23, 0x65, 0xAF,
+ 0xEA, 0x25, 0x6F, 0xB1, 0xC8, 0x43, 0xC5, 0x54,
+ 0xFC, 0x1F, 0x21, 0x63, 0xA5, 0xF4, 0x07, 0x09,
+ 0x1B, 0x2D, 0x77, 0x99, 0xB0, 0xCB, 0x46, 0xCA,
+ 0x45, 0xCF, 0x4A, 0xDE, 0x79, 0x8B, 0x86, 0x91,
+ 0xA8, 0xE3, 0x3E, 0x42, 0xC6, 0x51, 0xF3, 0x0E,
+ 0x12, 0x36, 0x5A, 0xEE, 0x29, 0x7B, 0x8D, 0x8C,
+ 0x8F, 0x8A, 0x85, 0x94, 0xA7, 0xF2, 0x0D, 0x17,
+ 0x39, 0x4B, 0xDD, 0x7C, 0x84, 0x97, 0xA2, 0xFD,
+ 0x1C, 0x24, 0x6C, 0xB4, 0xC7, 0x52, 0xF6, 0x01,
+)
+RIJNDAEL_LOG_TABLE = (
+ 0x00, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6,
+ 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
+ 0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef,
+ 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
+ 0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a,
+ 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
+ 0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24,
+ 0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
+ 0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94,
+ 0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
+ 0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62,
+ 0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
+ 0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42,
+ 0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
+ 0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca,
+ 0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
+ 0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74,
+ 0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
+ 0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5,
+ 0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
+ 0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec,
+ 0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
+ 0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86,
+ 0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
+ 0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc,
+ 0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
+ 0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47,
+ 0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
+ 0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89,
+ 0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
+ 0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18,
+ 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07,
+)
+
+
+def sub_bytes(data):
+ return [SBOX[x] for x in data]
+
+
+def sub_bytes_inv(data):
+ return [SBOX_INV[x] for x in data]
+
+
+def rotate(data):
+ return data[1:] + [data[0]]
+
+
+def key_schedule_core(data, rcon_iteration):
+ data = rotate(data)
+ data = sub_bytes(data)
+ data[0] = data[0] ^ RCON[rcon_iteration]
+ return data
+
+
+def xor(data1, data2):
+ return [x ^ y for x, y in zip(data1, data2)]
+
+
+def rijndael_mul(a, b):
+ if a == 0 or b == 0:
+ return 0
+ return RIJNDAEL_EXP_TABLE[
+ (RIJNDAEL_LOG_TABLE[a] + RIJNDAEL_LOG_TABLE[b]) % 0xFF
+ ]
+
+
+def mix_column(data, matrix):
+ data_mixed = []
+ for row in range(4):
+ mixed = 0
+ for column in range(4):
+ # xor is (+) and (-)
+ mixed ^= rijndael_mul(data[column], matrix[row][column])
+ data_mixed.append(mixed)
+ return data_mixed
+
+
+def mix_columns(data, matrix=MIX_COLUMN_MATRIX):
+ data_mixed = []
+ for i in range(4):
+ column = data[i * 4: (i + 1) * 4]
+ data_mixed += mix_column(column, matrix)
+ return data_mixed
+
+
+def mix_columns_inv(data):
+ return mix_columns(data, MIX_COLUMN_MATRIX_INV)
+
+
+def shift_rows_inv(data):
+ data_shifted = []
+ for column in range(4):
+ for row in range(4):
+ data_shifted.append(data[((column - row) & 0b11) * 4 + row])
+ return data_shifted
+
+
+__all__ = ['key_expansion', 'aes_cbc_decrypt', 'aes_cbc_decrypt_text']
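
A small sanity-check sketch for the helpers above; the key and block values are arbitrary and only chosen to show the expected lengths:

    from gallery_dl import aes

    key = list(range(16))                     # 16-byte AES-128 key as a list of ints
    schedule = aes.key_expansion(key)         # (16 // 4 + 7) * 16 = 176 bytes
    assert len(schedule) == 176

    block = list(range(16))                   # one 16-byte cipher block
    state = aes.aes_decrypt(block, schedule)  # 16 ints: the decrypted block
    assert len(state) == 16
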
diff --git a/gallery_dl/cache.py b/gallery_dl/cache.py
new file mode 100644
index 0000000..e6ba61a
--- /dev/null
+++ b/gallery_dl/cache.py
@@ -0,0 +1,204 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Decorators to keep function results in an in-memory and database cache"""
+
+import sqlite3
+import pickle
+import time
+import functools
+from . import config, util
+
+
+class CacheDecorator():
+ """Simplified in-memory cache"""
+ def __init__(self, func, keyarg):
+ self.func = func
+ self.cache = {}
+ self.keyarg = keyarg
+
+ def __get__(self, instance, cls):
+ return functools.partial(self.__call__, instance)
+
+ def __call__(self, *args, **kwargs):
+ key = "" if self.keyarg is None else args[self.keyarg]
+ try:
+ value = self.cache[key]
+ except KeyError:
+ value = self.cache[key] = self.func(*args, **kwargs)
+ return value
+
+ def update(self, key, value):
+ self.cache[key] = value
+
+ def invalidate(self, key):
+ try:
+ del self.cache[key]
+ except KeyError:
+ pass
+
+
+class MemoryCacheDecorator(CacheDecorator):
+ """In-memory cache"""
+ def __init__(self, func, keyarg, maxage):
+ CacheDecorator.__init__(self, func, keyarg)
+ self.maxage = maxage
+
+ def __call__(self, *args, **kwargs):
+ key = "" if self.keyarg is None else args[self.keyarg]
+ timestamp = int(time.time())
+ try:
+ value, expires = self.cache[key]
+ except KeyError:
+ expires = 0
+ if expires < timestamp:
+ value = self.func(*args, **kwargs)
+ expires = timestamp + self.maxage
+ self.cache[key] = value, expires
+ return value
+
+ def update(self, key, value):
+ self.cache[key] = value, int(time.time()) + self.maxage
+
+
+class DatabaseCacheDecorator():
+ """Database cache"""
+ db = None
+ _init = True
+
+ def __init__(self, func, keyarg, maxage):
+ self.key = "%s.%s" % (func.__module__, func.__name__)
+ self.func = func
+ self.cache = {}
+ self.keyarg = keyarg
+ self.maxage = maxage
+
+ def __get__(self, obj, objtype):
+ return functools.partial(self.__call__, obj)
+
+ def __call__(self, *args, **kwargs):
+ key = "" if self.keyarg is None else args[self.keyarg]
+ timestamp = int(time.time())
+
+ # in-memory cache lookup
+ try:
+ value, expires = self.cache[key]
+ if expires > timestamp:
+ return value
+ except KeyError:
+ pass
+
+ # database lookup
+ fullkey = "%s-%s" % (self.key, key)
+ cursor = self.cursor()
+ try:
+ cursor.execute("BEGIN EXCLUSIVE")
+ except sqlite3.OperationalError:
+ pass # Silently swallow exception - workaround for Python 3.6
+ try:
+ cursor.execute(
+ "SELECT value, expires FROM data WHERE key=? LIMIT 1",
+ (fullkey,),
+ )
+ result = cursor.fetchone()
+
+ if result and result[1] > timestamp:
+ value, expires = result
+ value = pickle.loads(value)
+ else:
+ value = self.func(*args, **kwargs)
+ expires = timestamp + self.maxage
+ cursor.execute(
+ "INSERT OR REPLACE INTO data VALUES (?,?,?)",
+ (fullkey, pickle.dumps(value), expires),
+ )
+ finally:
+ self.db.commit()
+ self.cache[key] = value, expires
+ return value
+
+ def update(self, key, value):
+ expires = int(time.time()) + self.maxage
+ self.cache[key] = value, expires
+ self.cursor().execute(
+ "INSERT OR REPLACE INTO data VALUES (?,?,?)",
+ ("%s-%s" % (self.key, key), pickle.dumps(value), expires),
+ )
+
+ def invalidate(self, key):
+ try:
+ del self.cache[key]
+ except KeyError:
+ pass
+ self.cursor().execute(
+ "DELETE FROM data WHERE key=? LIMIT 1",
+ ("%s-%s" % (self.key, key),),
+ )
+
+ def cursor(self):
+ if self._init:
+ self.db.execute(
+ "CREATE TABLE IF NOT EXISTS data "
+ "(key TEXT PRIMARY KEY, value TEXT, expires INTEGER)"
+ )
+ DatabaseCacheDecorator._init = False
+ return self.db.cursor()
+
+
+def memcache(maxage=None, keyarg=None):
+ if maxage:
+ def wrap(func):
+ return MemoryCacheDecorator(func, keyarg, maxage)
+ else:
+ def wrap(func):
+ return CacheDecorator(func, keyarg)
+ return wrap
+
+
+def cache(maxage=3600, keyarg=None):
+ def wrap(func):
+ return DatabaseCacheDecorator(func, keyarg, maxage)
+ return wrap
+
+
+def clear():
+ """Delete all database entries"""
+ db = DatabaseCacheDecorator.db
+
+ if db:
+ rowcount = 0
+ cursor = db.cursor()
+ try:
+ cursor.execute("DELETE FROM data")
+ except sqlite3.OperationalError:
+ pass # database is not initialized, can't be modified, etc.
+ else:
+ rowcount = cursor.rowcount
+ db.commit()
+ cursor.execute("VACUUM")
+ return rowcount
+
+ return None
+
+
+def _path():
+ path = config.get(("cache", "file"), -1)
+
+ if path == -1:
+ import tempfile
+ import os.path
+ return os.path.join(tempfile.gettempdir(), ".gallery-dl.cache")
+
+ return util.expand_path(path)
+
+
+try:
+ DatabaseCacheDecorator.db = sqlite3.connect(
+ _path(), timeout=30, check_same_thread=False)
+except (TypeError, sqlite3.OperationalError):
+ cache = memcache # noqa: F811
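
A usage sketch for the two decorator factories above; the decorated functions are hypothetical stand-ins for the kind of lookup and login helpers an extractor would presumably wrap:

    from gallery_dl.cache import cache, memcache

    @memcache(keyarg=0)
    def user_profile(name):
        ...  # cached in memory per 'name' for the lifetime of the process

    @cache(maxage=24*3600, keyarg=1)
    def _login_impl(extr, username, password):
        ...  # result pickled into the SQLite cache and reused for 24 hours

    # manual cache maintenance through the decorator instance
    _login_impl.update("someuser", {"session": "..."})
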
diff --git a/gallery_dl/cloudflare.py b/gallery_dl/cloudflare.py
new file mode 100644
index 0000000..b9bf32d
--- /dev/null
+++ b/gallery_dl/cloudflare.py
@@ -0,0 +1,176 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Methods to access sites behind Cloudflare protection"""
+
+import re
+import time
+import operator
+import collections
+import urllib.parse
+from . import text, exception
+from .cache import memcache
+
+
+def is_challenge(response):
+ return (response.status_code == 503 and
+ response.headers.get("Server", "").startswith("cloudflare") and
+ b"jschl-answer" in response.content)
+
+
+def is_captcha(response):
+ return (response.status_code == 403 and
+ b'name="captcha-bypass"' in response.content)
+
+
+def solve_challenge(session, response, kwargs):
+ """Solve Cloudflare challenge and get cfclearance cookie"""
+ parsed = urllib.parse.urlsplit(response.url)
+ root = parsed.scheme + "://" + parsed.netloc
+
+ cf_kwargs = {}
+ headers = cf_kwargs["headers"] = collections.OrderedDict()
+ params = cf_kwargs["params"] = collections.OrderedDict()
+
+ page = response.text
+ params["s"] = text.extract(page, 'name="s" value="', '"')[0]
+ params["jschl_vc"] = text.extract(page, 'name="jschl_vc" value="', '"')[0]
+ params["pass"] = text.extract(page, 'name="pass" value="', '"')[0]
+ params["jschl_answer"] = solve_js_challenge(page, parsed.netloc)
+ headers["Referer"] = response.url
+
+ time.sleep(4)
+
+ url = root + "/cdn-cgi/l/chk_jschl"
+ cf_kwargs["allow_redirects"] = False
+ cf_response = session.request("GET", url, **cf_kwargs)
+
+ location = cf_response.headers.get("Location")
+ if not location:
+ import logging
+ log = logging.getLogger("cloudflare")
+ rtype = "CAPTCHA" if is_captcha(cf_response) else "Unexpected"
+ log.error("%s response", rtype)
+ log.debug("Headers:\n%s", cf_response.headers)
+ log.debug("Content:\n%s", cf_response.text)
+ raise exception.StopExtraction()
+
+ if location[0] == "/":
+ location = root + location
+ else:
+ location = re.sub(r"(https?):/(?!/)", r"\1://", location)
+
+ for cookie in cf_response.cookies:
+ if cookie.name == "cf_clearance":
+ return location, cookie.domain, {
+ cookie.name: cookie.value,
+ "__cfduid" : response.cookies.get("__cfduid", ""),
+ }
+ return location, "", {}
+
+
+def solve_js_challenge(page, netloc):
+ """Evaluate JS challenge in 'page' to get 'jschl_answer' value"""
+
+ # build variable name
+ # e.g. '...f, wqnVscP={"DERKbJk":+(...' --> wqnVscP.DERKbJk
+ data, pos = text.extract_all(page, (
+ ('var' , ',f, ', '='),
+ ('key' , '"' , '"'),
+ ('expr', ':' , '}'),
+ ))
+ variable = "{}.{}".format(data["var"], data["key"])
+ vlength = len(variable)
+
+ # evaluate the initial expression
+ solution = evaluate_expression(data["expr"], page, netloc)
+
+ # iterate over all remaining expressions
+ # and combine their values in 'solution'
+ expressions = text.extract(
+ page, "'challenge-form');", "f.submit();", pos)[0]
+ for expr in expressions.split(";")[1:]:
+
+ if expr.startswith(variable):
+ # select arithmetic function based on operator (+/-/*)
+ func = OPERATORS[expr[vlength]]
+ # evaluate the rest of the expression
+ value = evaluate_expression(expr[vlength+2:], page, netloc)
+ # combine expression value with our current solution
+ solution = func(solution, value)
+
+ elif expr.startswith("a.value"):
+ if "t.length)" in expr:
+ # add length of hostname
+ solution += len(netloc)
+ if ".toFixed(" in expr:
+ # trim solution to 10 decimal places
+ # and strip trailing zeros
+ solution = "{:.10f}".format(solution).rstrip("0")
+ return solution
+
+
+def evaluate_expression(expr, page, netloc, *,
+ split_re=re.compile(r"[(+]+([^)]*)\)")):
+ """Evaluate a single Javascript expression for the challenge"""
+
+ if expr.startswith("function(p)"):
+ # get HTML element with ID k and evaluate the expression inside
+ # 'eval(eval("document.getElementById(k).innerHTML"))'
+ k, pos = text.extract(page, "k = '", "'")
+ e, pos = text.extract(page, 'id="'+k+'"', '<')
+ return evaluate_expression(e.partition(">")[2], page, netloc)
+
+ if "/" in expr:
+ # split the expression in numerator and denominator subexpressions,
+ # evaluate them separately,
+ # and return their fraction-result
+ num, _, denom = expr.partition("/")
+ num = evaluate_expression(num, page, netloc)
+ denom = evaluate_expression(denom, page, netloc)
+ return num / denom
+
+ if "function(p)" in expr:
+ # split initial expression and function code
+ initial, _, func = expr.partition("function(p)")
+ # evaluate said expression
+ initial = evaluate_expression(initial, page, netloc)
+ # get function argument and use it as index into 'netloc'
+ index = evaluate_expression(func[func.index("}")+1:], page, netloc)
+ return initial + ord(netloc[int(index)])
+
+ # iterate over all subexpressions,
+ # evaluate them,
+ # and accumulate their values in 'result'
+ result = ""
+ for subexpr in split_re.findall(expr) or (expr,):
+ result += str(sum(
+ VALUES[part]
+ for part in subexpr.split("[]")
+ ))
+ return int(result)
+
+
+OPERATORS = {
+ "+": operator.add,
+ "-": operator.sub,
+ "*": operator.mul,
+}
+
+VALUES = {
+ "": 0,
+ "+": 0,
+ "!+": 1,
+ "!!": 1,
+ "+!!": 1,
+}
+
+
+@memcache(keyarg=0)
+def cookies(category):
+ return None
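
A rough sketch of how a caller could wire these helpers together; the real integration point is not part of this hunk, so the flow below is only an approximation:

    import requests
    from gallery_dl import cloudflare

    session = requests.Session()
    response = session.get("https://example.org/")
    if cloudflare.is_challenge(response):
        # returns the follow-up URL, the cookie domain, and the clearance cookies
        url, domain, cookies = cloudflare.solve_challenge(session, response, {})
        for name, value in cookies.items():
            session.cookies.set(name, value, domain=domain)
        response = session.get(url)
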
diff --git a/gallery_dl/config.py b/gallery_dl/config.py
new file mode 100644
index 0000000..da52f1e
--- /dev/null
+++ b/gallery_dl/config.py
@@ -0,0 +1,155 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Global configuration module"""
+
+import sys
+import json
+import os.path
+import logging
+from . import util
+
+log = logging.getLogger("config")
+
+
+# --------------------------------------------------------------------
+# internals
+
+_config = {}
+
+if os.name == "nt":
+ _default_configs = [
+ r"%USERPROFILE%\gallery-dl\config.json",
+ r"%USERPROFILE%\gallery-dl.conf",
+ ]
+else:
+ _default_configs = [
+ "/etc/gallery-dl.conf",
+ "${HOME}/.config/gallery-dl/config.json",
+ "${HOME}/.gallery-dl.conf",
+ ]
+
+
+# --------------------------------------------------------------------
+# public interface
+
+def load(files=None, strict=False, fmt="json"):
+ """Load JSON configuration files"""
+ if fmt == "yaml":
+ try:
+ import yaml
+ parsefunc = yaml.safe_load
+ except ImportError:
+ log.error("Could not import 'yaml' module")
+ return
+ else:
+ parsefunc = json.load
+
+ for path in files or _default_configs:
+ path = util.expand_path(path)
+ try:
+ with open(path, encoding="utf-8") as file:
+ confdict = parsefunc(file)
+ except OSError as exc:
+ if strict:
+ log.error("%s", exc)
+ sys.exit(1)
+ except Exception as exc:
+ log.warning("Could not parse '%s': %s", path, exc)
+ if strict:
+ sys.exit(2)
+ else:
+ if not _config:
+ _config.update(confdict)
+ else:
+ util.combine_dict(_config, confdict)
+
+
+def clear():
+ """Reset configuration to an empty state"""
+ _config.clear()
+
+
+def get(keys, default=None, conf=_config):
+ """Get the value of property 'key' or a default value"""
+ try:
+ for k in keys:
+ conf = conf[k]
+ return conf
+ except (KeyError, AttributeError):
+ return default
+
+
+def interpolate(keys, default=None, conf=_config):
+ """Interpolate the value of 'key'"""
+ try:
+ lkey = keys[-1]
+ if lkey in conf:
+ return conf[lkey]
+ for k in keys:
+ if lkey in conf:
+ default = conf[lkey]
+ conf = conf[k]
+ return conf
+ except (KeyError, AttributeError):
+ return default
+
+
+def set(keys, value, conf=_config):
+ """Set the value of property 'key' for this session"""
+ for k in keys[:-1]:
+ try:
+ conf = conf[k]
+ except KeyError:
+ temp = {}
+ conf[k] = temp
+ conf = temp
+ conf[keys[-1]] = value
+
+
+def setdefault(keys, value, conf=_config):
+ """Set the value of property 'key' if it doesn't exist"""
+ for k in keys[:-1]:
+ try:
+ conf = conf[k]
+ except KeyError:
+ temp = {}
+ conf[k] = temp
+ conf = temp
+ return conf.setdefault(keys[-1], value)
+
+
+def unset(keys, conf=_config):
+ """Unset the value of property 'key'"""
+ try:
+ for k in keys[:-1]:
+ conf = conf[k]
+ del conf[keys[-1]]
+ except (KeyError, AttributeError):
+ pass
+
+
+class apply():
+ """Context Manager: apply a collection of key-value pairs"""
+ _sentinel = object()
+
+ def __init__(self, kvlist):
+ self.original = []
+ self.kvlist = kvlist
+
+ def __enter__(self):
+ for key, value in self.kvlist:
+ self.original.append((key, get(key, self._sentinel)))
+ set(key, value)
+
+ def __exit__(self, etype, value, traceback):
+ for key, value in self.original:
+ if value is self._sentinel:
+ unset(key)
+ else:
+ set(key, value)
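
A short sketch of the public interface above; the option names are only examples:

    from gallery_dl import config

    config.set(("extractor", "pixiv", "username"), "someuser")
    config.get(("extractor", "pixiv", "username"))      # -> "someuser"
    config.get(("extractor", "pixiv", "timeout"), 30)   # -> 30 (default)

    # temporarily override an option, restoring the previous state on exit
    with config.apply([(("skip",), False)]):
        ...
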
diff --git a/gallery_dl/downloader/__init__.py b/gallery_dl/downloader/__init__.py
new file mode 100644
index 0000000..97972cd
--- /dev/null
+++ b/gallery_dl/downloader/__init__.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Downloader modules"""
+
+import importlib
+
+modules = [
+ "http",
+ "text",
+ "ytdl",
+]
+
+
+def find(scheme):
+ """Return downloader class suitable for handling the given scheme"""
+ try:
+ return _cache[scheme]
+ except KeyError:
+ klass = None
+ try:
+ if scheme in modules: # prevent unwanted imports
+ module = importlib.import_module("." + scheme, __package__)
+ klass = module.__downloader__
+ except (ImportError, AttributeError, TypeError):
+ pass
+ _cache[scheme] = klass
+ return klass
+
+
+# --------------------------------------------------------------------
+# internals
+
+_cache = {}
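
A sketch of how find() is meant to be used; the extractor, output, url, and pathfmt objects are assumed to exist:

    from gallery_dl import downloader

    cls = downloader.find("http")        # HttpDownloader class, cached for later lookups
    if cls is not None:
        dl = cls(extractor, output)      # instantiate for a specific extractor
        dl.download(url, pathfmt)

    assert downloader.find("ftp") is None   # unknown schemes yield None
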
diff --git a/gallery_dl/downloader/common.py b/gallery_dl/downloader/common.py
new file mode 100644
index 0000000..4803c85
--- /dev/null
+++ b/gallery_dl/downloader/common.py
@@ -0,0 +1,170 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Common classes and constants used by downloader modules."""
+
+import os
+import time
+import logging
+from .. import config, util, exception
+from requests.exceptions import RequestException
+from ssl import SSLError
+
+
+class DownloaderBase():
+ """Base class for downloaders"""
+ scheme = ""
+ retries = 1
+
+ def __init__(self, extractor, output):
+ self.session = extractor.session
+ self.out = output
+ self.log = logging.getLogger("downloader." + self.scheme)
+ self.downloading = False
+ self.part = self.config("part", True)
+ self.partdir = self.config("part-directory")
+
+ if self.partdir:
+ self.partdir = util.expand_path(self.partdir)
+ os.makedirs(self.partdir, exist_ok=True)
+
+ def config(self, key, default=None):
+ """Interpolate config value for 'key'"""
+ return config.interpolate(("downloader", self.scheme, key), default)
+
+ def download(self, url, pathfmt):
+ """Download the resource at 'url' and write it to a file-like object"""
+ try:
+ return self.download_impl(url, pathfmt)
+ except Exception:
+ print()
+ raise
+ finally:
+ # remove file from incomplete downloads
+ if self.downloading and not self.part:
+ try:
+ os.remove(pathfmt.temppath)
+ except (OSError, AttributeError):
+ pass
+
+ def download_impl(self, url, pathfmt):
+ """Actual implementaion of the download process"""
+ adj_ext = None
+ tries = 0
+ msg = ""
+
+ if self.part:
+ pathfmt.part_enable(self.partdir)
+
+ while True:
+ self.reset()
+ if tries:
+ self.log.warning("%s (%d/%d)", msg, tries, self.retries)
+ if tries >= self.retries:
+ return False
+ time.sleep(tries)
+ tries += 1
+
+ # check for .part file
+ filesize = pathfmt.part_size()
+
+ # connect to (remote) source
+ try:
+ offset, size = self.connect(url, filesize)
+ except exception.DownloadRetry as exc:
+ msg = exc
+ continue
+ except exception.DownloadComplete:
+ break
+ except Exception as exc:
+ self.log.warning(exc)
+ return False
+
+ # check response
+ if not offset:
+ mode = "w+b"
+ if filesize:
+ self.log.info("Unable to resume partial download")
+ else:
+ mode = "r+b"
+ self.log.info("Resuming download at byte %d", offset)
+
+ # set missing filename extension
+ if not pathfmt.has_extension:
+ pathfmt.set_extension(self.get_extension())
+ if pathfmt.exists():
+ pathfmt.temppath = ""
+ return True
+
+ self.out.start(pathfmt.path)
+ self.downloading = True
+ with pathfmt.open(mode) as file:
+ if offset:
+ file.seek(offset)
+
+ # download content
+ try:
+ self.receive(file)
+ except (RequestException, SSLError) as exc:
+ msg = exc
+ print()
+ continue
+
+ # check filesize
+ if size and file.tell() < size:
+ msg = "filesize mismatch ({} < {})".format(
+ file.tell(), size)
+ continue
+
+ # check filename extension
+ adj_ext = self._check_extension(file, pathfmt)
+
+ break
+
+ self.downloading = False
+ if adj_ext:
+ pathfmt.set_extension(adj_ext)
+ return True
+
+ def connect(self, url, offset):
+ """Connect to 'url' while respecting 'offset' if possible
+
+ Returns a 2-tuple containing the actual offset and expected filesize.
+ If the returned offset-value is greater than zero, all received data
+ will be appended to the existing .part file.
+ Return '0' as second tuple-field to indicate an unknown filesize.
+ """
+
+ def receive(self, file):
+ """Write data to 'file'"""
+
+ def reset(self):
+ """Reset internal state / cleanup"""
+
+ def get_extension(self):
+ """Return a filename extension appropriate for the current request"""
+
+ @staticmethod
+ def _check_extension(file, pathfmt):
+ """Check filename extension against fileheader"""
+ extension = pathfmt.keywords["extension"]
+ if extension in FILETYPE_CHECK:
+ file.seek(0)
+ header = file.read(8)
+ if len(header) >= 8 and not FILETYPE_CHECK[extension](header):
+ for ext, check in FILETYPE_CHECK.items():
+ if ext != extension and check(header):
+ return ext
+ return None
+
+
+FILETYPE_CHECK = {
+ "jpg": lambda h: h[0:2] == b"\xff\xd8",
+ "png": lambda h: h[0:8] == b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a",
+ "gif": lambda h: h[0:4] == b"GIF8" and h[5] == 97,
+}
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
new file mode 100644
index 0000000..961c1a2
--- /dev/null
+++ b/gallery_dl/downloader/http.py
@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Downloader module for http:// and https:// URLs"""
+
+import time
+import mimetypes
+from requests.exceptions import ConnectionError, Timeout
+from .common import DownloaderBase
+from .. import text, exception
+
+
+class HttpDownloader(DownloaderBase):
+ scheme = "http"
+
+ def __init__(self, extractor, output):
+ DownloaderBase.__init__(self, extractor, output)
+ self.response = None
+ self.retries = self.config("retries", extractor._retries)
+ self.timeout = self.config("timeout", extractor._timeout)
+ self.verify = self.config("verify", extractor._verify)
+ self.rate = self.config("rate")
+ self.chunk_size = 16384
+
+ if self.rate:
+ self.rate = text.parse_bytes(self.rate)
+ if not self.rate:
+ self.log.warning("Invalid rate limit specified")
+ elif self.rate < self.chunk_size:
+ self.chunk_size = self.rate
+
+ def connect(self, url, offset):
+ headers = {}
+ if offset:
+ headers["Range"] = "bytes={}-".format(offset)
+
+ try:
+ self.response = self.session.request(
+ "GET", url, stream=True, headers=headers, allow_redirects=True,
+ timeout=self.timeout, verify=self.verify)
+ except (ConnectionError, Timeout) as exc:
+ raise exception.DownloadRetry(exc)
+
+ code = self.response.status_code
+ if code == 200: # OK
+ offset = 0
+ size = self.response.headers.get("Content-Length")
+ elif code == 206: # Partial Content
+ size = self.response.headers["Content-Range"].rpartition("/")[2]
+ elif code == 416: # Requested Range Not Satisfiable
+ raise exception.DownloadComplete()
+ elif code == 429 or 500 <= code < 600: # Server Error
+ raise exception.DownloadRetry(
+ "{} Server Error: {} for url: {}".format(
+ code, self.response.reason, url))
+ else:
+ self.response.raise_for_status()
+
+ return offset, text.parse_int(size)
+
+ def receive(self, file):
+ if self.rate:
+ total = 0 # total amount of bytes received
+ start = time.time() # start time
+
+ for data in self.response.iter_content(self.chunk_size):
+ file.write(data)
+
+ if self.rate:
+ total += len(data)
+ expected = total / self.rate # expected elapsed time
+ delta = time.time() - start # actual elapsed time since start
+ if delta < expected:
+ # sleep if less time passed than expected
+ time.sleep(expected - delta)
+
+ def reset(self):
+ if self.response:
+ self.response.close()
+ self.response = None
+
+ def get_extension(self):
+ mtype = self.response.headers.get("Content-Type", "image/jpeg")
+ mtype = mtype.partition(";")[0]
+
+ if mtype in MIMETYPE_MAP:
+ return MIMETYPE_MAP[mtype]
+
+ exts = mimetypes.guess_all_extensions(mtype, strict=False)
+ if exts:
+ exts.sort()
+ return exts[-1][1:]
+
+ self.log.warning(
+ "No filename extension found for MIME type '%s'", mtype)
+ return "txt"
+
+
+MIMETYPE_MAP = {
+ "image/jpeg": "jpg",
+ "image/jpg": "jpg",
+ "image/png": "png",
+ "image/gif": "gif",
+ "image/bmp": "bmp",
+ "image/webp": "webp",
+ "image/svg+xml": "svg",
+
+ "video/webm": "webm",
+ "video/ogg": "ogg",
+ "video/mp4": "mp4",
+
+ "audio/wav": "wav",
+ "audio/x-wav": "wav",
+ "audio/webm": "webm",
+ "audio/ogg": "ogg",
+ "audio/mpeg": "mp3",
+
+ "application/ogg": "ogg",
+ "application/octet-stream": "bin",
+}
+
+
+__downloader__ = HttpDownloader
diff --git a/gallery_dl/downloader/text.py b/gallery_dl/downloader/text.py
new file mode 100644
index 0000000..ca33863
--- /dev/null
+++ b/gallery_dl/downloader/text.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Downloader module for text: URLs"""
+
+from .common import DownloaderBase
+
+
+class TextDownloader(DownloaderBase):
+ scheme = "text"
+
+ def __init__(self, extractor, output):
+ DownloaderBase.__init__(self, extractor, output)
+ self.content = b""
+
+ def connect(self, url, offset):
+ data = url.encode()
+ self.content = data[offset + 5:]
+ return offset, len(data) - 5
+
+ def receive(self, file):
+ file.write(self.content)
+
+ def reset(self):
+ self.content = b""
+
+ @staticmethod
+ def get_extension():
+ return "txt"
+
+
+__downloader__ = TextDownloader
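
The "text:" scheme simply stores the URL's payload as file content; a tiny illustration (an extractor with a 'session' attribute and an output object, as required by DownloaderBase, are assumed):

    dl = TextDownloader(extractor, output)
    offset, size = dl.connect("text:hello world", 0)
    # size == 11; receive() would then write b"hello world" to the target file
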
diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py
new file mode 100644
index 0000000..57a84d0
--- /dev/null
+++ b/gallery_dl/downloader/ytdl.py
@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Downloader module for URLs requiring youtube-dl support"""
+
+from youtube_dl import YoutubeDL
+from .common import DownloaderBase
+from .. import text
+import os
+
+
+class YoutubeDLDownloader(DownloaderBase):
+ scheme = "ytdl"
+
+ def __init__(self, extractor, output):
+ DownloaderBase.__init__(self, extractor, output)
+
+ options = {
+ "format": self.config("format") or None,
+ "ratelimit": text.parse_bytes(self.config("rate"), None),
+ "retries": self.config("retries", extractor._retries),
+ "socket_timeout": self.config("timeout", extractor._timeout),
+ "nocheckcertificate": not self.config("verify", extractor._verify),
+ "nopart": not self.part,
+ }
+ options.update(self.config("raw-options") or {})
+
+ if self.config("logging", True):
+ options["logger"] = self.log
+
+ self.ytdl = YoutubeDL(options)
+
+ def download(self, url, pathfmt):
+ try:
+ info_dict = self.ytdl.extract_info(url[5:], download=False)
+ except Exception:
+ return False
+
+ if "entries" in info_dict:
+ index = pathfmt.keywords.get("_ytdl_index")
+ if index is None:
+ return self._download_playlist(pathfmt, info_dict)
+ else:
+ info_dict = info_dict["entries"][index]
+ return self._download_video(pathfmt, info_dict)
+
+ def _download_video(self, pathfmt, info_dict):
+ if "url" in info_dict:
+ text.nameext_from_url(info_dict["url"], pathfmt.keywords)
+ pathfmt.set_extension(info_dict["ext"])
+ if pathfmt.exists():
+ pathfmt.temppath = ""
+ return True
+ if self.part and self.partdir:
+ pathfmt.temppath = os.path.join(
+ self.partdir, pathfmt.filename)
+ self.ytdl.params["outtmpl"] = pathfmt.temppath.replace("%", "%%")
+
+ self.out.start(pathfmt.path)
+ try:
+ self.ytdl.process_info(info_dict)
+ except Exception:
+ self.log.debug("Traceback", exc_info=True)
+ return False
+ return True
+
+ def _download_playlist(self, pathfmt, info_dict):
+ pathfmt.set_extension("%(playlist_index)s.%(ext)s")
+ self.ytdl.params["outtmpl"] = pathfmt.realpath
+
+ for entry in info_dict["entries"]:
+ self.ytdl.process_info(entry)
+ return True
+
+
+__downloader__ = YoutubeDLDownloader
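
The options read above correspond to the "downloader.ytdl" config section; a sketch of setting them programmatically (the values are illustrative):

    from gallery_dl import config

    config.set(("downloader", "ytdl", "format"), "bestvideo+bestaudio/best")
    config.set(("downloader", "ytdl", "rate"), "1M")
    config.set(("downloader", "ytdl", "raw-options"), {"quiet": True})
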
diff --git a/gallery_dl/exception.py b/gallery_dl/exception.py
new file mode 100644
index 0000000..3e86177
--- /dev/null
+++ b/gallery_dl/exception.py
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Exception classes used by gallery-dl
+
+Class Hierarchy:
+
+Exception
+ +-- GalleryDLException
+ +-- ExtractionError
+ | +-- AuthenticationError
+ | +-- AuthorizationError
+ | +-- NotFoundError
+ | +-- HttpError
+ +-- DownloadError
+ | +-- DownloadComplete
+ | +-- DownloadRetry
+ +-- NoExtractorError
+ +-- FormatError
+ +-- FilterError
+ +-- StopExtraction
+"""
+
+
+class GalleryDLException(Exception):
+ """Base class for GalleryDL exceptions"""
+
+
+class ExtractionError(GalleryDLException):
+ """Base class for exceptions during information extraction"""
+
+
+class AuthenticationError(ExtractionError):
+ """Invalid or missing login information"""
+
+
+class AuthorizationError(ExtractionError):
+ """Insufficient privileges to access a resource"""
+
+
+class NotFoundError(ExtractionError):
+ """Requested resource (gallery/image) does not exist"""
+
+
+class HttpError(ExtractionError):
+ """HTTP request during extraction failed"""
+
+
+class DownloadError(GalleryDLException):
+ """Base class for exceptions during file downloads"""
+
+
+class DownloadRetry(DownloadError):
+ """Download attempt failed and should be retried"""
+
+
+class DownloadComplete(DownloadError):
+ """Output file of attempted download is already complete"""
+
+
+class NoExtractorError(GalleryDLException):
+ """No extractor can handle the given URL"""
+
+
+class FormatError(GalleryDLException):
+ """Error while building output path"""
+
+
+class FilterError(GalleryDLException):
+ """Error while evaluating a filter expression"""
+
+
+class StopExtraction(GalleryDLException):
+ """Extraction should stop"""
diff --git a/gallery_dl/extractor/2chan.py b/gallery_dl/extractor/2chan.py
new file mode 100644
index 0000000..8df8645
--- /dev/null
+++ b/gallery_dl/extractor/2chan.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.2chan.net/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class FutabaThreadExtractor(Extractor):
+ """Extractor for images from threads on www.2chan.net"""
+ category = "2chan"
+ subcategory = "thread"
+ directory_fmt = ("{category}", "{board_name}", "{thread}")
+ filename_fmt = "{tim}.{extension}"
+ archive_fmt = "{board}_{thread}_{tim}"
+ url_fmt = "https://{server}.2chan.net/{board}/src/{filename}"
+ pattern = r"(?:https?://)?([^.]+)\.2chan\.net/([^/]+)/res/(\d+)"
+ test = ("http://dec.2chan.net/70/res/947.htm", {
+ "url": "c5c12b80b290e224b6758507b3bb952044f4595b",
+ "keyword": "4bd22e7a9c3636faecd6ea7082509e8655e10dd0",
+ })
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.server, self.board, self.thread = match.groups()
+
+ def items(self):
+ url = "https://{}.2chan.net/{}/res/{}.htm".format(
+ self.server, self.board, self.thread)
+ page = self.request(url).text
+ data = self.metadata(page)
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for post in self.posts(page):
+ if "filename" not in post:
+ continue
+ post.update(data)
+ url = self.url_fmt.format_map(post)
+ yield Message.Url, url, post
+
+ def metadata(self, page):
+ """Collect metadata for extractor-job"""
+ title = text.extract(page, "<title>", "</title>")[0]
+ title, _, boardname = title.rpartition(" - ")
+ return {
+ "server": self.server,
+ "title": title,
+ "board": self.board,
+ "board_name": boardname[:-4],
+ "thread": self.thread,
+ }
+
+ def posts(self, page):
+ """Build a list of all post-objects"""
+ page = text.extract(
+ page, '<div class="thre"', '<div style="clear:left"></div>')[0]
+ return [
+ self.parse(post)
+ for post in page.split('<table border=0>')
+ ]
+
+ def parse(self, post):
+ """Build post-object by extracting data from an HTML post"""
+ data = self._extract_post(post)
+ if '<a href="/' in post:
+ self._extract_image(post, data)
+ data["tim"], _, data["extension"] = data["filename"].partition(".")
+ data["time"] = data["tim"][:-3]
+ data["ext"] = "." + data["extension"]
+ return data
+
+ @staticmethod
+ def _extract_post(post):
+ return text.extract_all(post, (
+ ("no" , 'name="', '"'),
+ ("post", '<b>', '</b>'),
+ ("name", '<b>', ' </b>'),
+ ("now" , '</font> ', ' '),
+ (None , '<blockquote', ''),
+ ("com" , '>', '</blockquote>'),
+ ))[0]
+
+ @staticmethod
+ def _extract_image(post, data):
+ text.extract_all(post, (
+ (None , '_blank', ''),
+ ("filename", '>', '<'),
+ ("fsize" , '(', ' '),
+ ), 0, data)
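
For context, a sketch of how an extractor like this could be exercised directly; normally a job from job.py consumes these messages, and items() performs network requests:

    import re

    match = re.match(FutabaThreadExtractor.pattern,
                     "https://dec.2chan.net/70/res/947.htm")
    extr = FutabaThreadExtractor(match)
    for msg in extr.items():
        # (Message.Version, 1), then (Message.Directory, data),
        # then one (Message.Url, url, metadata) tuple per image
        print(msg[0])
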
diff --git a/gallery_dl/extractor/35photo.py b/gallery_dl/extractor/35photo.py
new file mode 100644
index 0000000..50dbfe8
--- /dev/null
+++ b/gallery_dl/extractor/35photo.py
@@ -0,0 +1,205 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://35photo.pro/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class _35photoExtractor(Extractor):
+ category = "35photo"
+ directory_fmt = ("{category}", "{user}")
+ filename_fmt = "{id}{title:?_//}_{num:>02}.{extension}"
+ archive_fmt = "{id}_{num}"
+ root = "https://35photo.pro"
+
+ def items(self):
+ first = True
+ data = self.metadata()
+ yield Message.Version, 1
+
+ for photo_id in self.photos():
+ for photo in self._photo_data(photo_id):
+ photo.update(data)
+ url = photo["url"]
+ if first:
+ first = False
+ yield Message.Directory, photo
+ yield Message.Url, url, text.nameext_from_url(url, photo)
+
+ def metadata(self):
+ """Returns general metadata"""
+ return {}
+
+ def photos(self):
+ """Returns an iterable containing all relevant photo IDs"""
+
+ def _pagination(self, params, extra_ids=None):
+ url = "https://35photo.pro/show_block.php"
+ headers = {"Referer": self.root, "X-Requested-With": "XMLHttpRequest"}
+ params["type"] = "getNextPageData"
+
+ if "lastId" not in params:
+ params["lastId"] = "999999999"
+ if extra_ids:
+ yield from extra_ids
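+ # fetch pages until the response no longer contains a 'lastId' value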
+ while params["lastId"]:
+ data = self.request(url, headers=headers, params=params).json()
+ yield from self._photo_ids(data["data"])
+ params["lastId"] = data["lastId"]
+
+ def _photo_data(self, photo_id):
+ params = {"method": "photo.getData", "photoId": photo_id}
+ data = self.request(
+ "https://api.35photo.pro/", params=params).json()["data"][photo_id]
+ info = {
+ "url" : data["src"],
+ "id" : data["photo_id"],
+ "title" : data["photo_name"],
+ "description": data["photo_desc"],
+ "tags" : data["tags"] or [],
+ "views" : data["photo_see"],
+ "favorites" : data["photo_fav"],
+ "score" : data["photo_rating"],
+ "type" : data["photo_type"],
+ "date" : data["timeAdd"],
+ "user" : data["user_login"],
+ "user_id" : data["user_id"],
+ "user_name" : data["user_name"],
+ "other" : data["otherData"],
+ }
+
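+ # posts with multiple images list them in a 'series' array; emit one entry per image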
+ if "series" in data:
+ for info["num"], photo in enumerate(data["series"], 1):
+ info["url"] = photo["src"]
+ info["id_series"] = text.parse_int(photo["id"])
+ info["title_series"] = photo["title"] or ""
+ yield info.copy()
+ else:
+ info["num"] = 1
+ yield info
+
+ @staticmethod
+ def _photo_ids(page):
+ """Extract unique photo IDs and return them as sorted list"""
+ # searching for photo-id="..." doesn't always work (see unit tests)
+ return sorted(
+ set(text.extract_iter(page, "/photo_", "/")),
+ key=text.parse_int,
+ reverse=True,
+ )
+
+
+class _35photoUserExtractor(_35photoExtractor):
+ """Extractor for all images of a user on 35photo.pro"""
+ subcategory = "user"
+ pattern = (r"(?:https?://)?(?:[a-z]+\.)?35photo\.pro"
+ r"/(?!photo_|genre_)([^/?&#]+)")
+ test = (
+ ("https://35photo.pro/liya", {
+ "pattern": r"https://m\d+.35photo.pro/photos_(main|series)/.*.jpg",
+ "count": 9,
+ }),
+ ("https://35photo.pro/suhoveev", {
+ # last photo ID (1267028) isn't given as 'photo-id="<id>"'
+ # there are only 23 photos without the last one
+ "count": ">= 33",
+ }),
+ ("https://en.35photo.pro/liya"),
+ ("https://ru.35photo.pro/liya"),
+ )
+
+ def __init__(self, match):
+ _35photoExtractor.__init__(self, match)
+ self.user = match.group(1)
+ self.user_id = 0
+
+ def metadata(self):
+ url = "{}/{}/".format(self.root, self.user)
+ page = self.request(url).text
+ self.user_id = text.parse_int(text.extract(page, "/user_", ".xml")[0])
+ return {
+ "user": self.user,
+ "user_id": self.user_id,
+ }
+
+ def photos(self):
+ return self._pagination({
+ "page": "photoUser",
+ "user_id": self.user_id,
+ })
+
+
+class _35photoGenreExtractor(_35photoExtractor):
+ """Extractor for images of a specific genre on 35photo.pro"""
+ subcategory = "genre"
+ directory_fmt = ("{category}", "Genre", "{genre}")
+ archive_fmt = "g{genre_id}_{id}_{num}"
+ pattern = r"(?:https?://)?(?:[a-z]+\.)?35photo\.pro/genre_(\d+)(/new/)?"
+ test = (
+ ("https://35photo.pro/genre_109/", {
+ "range": "1-30",
+ }),
+ ("https://35photo.pro/genre_109/new/"),
+ )
+
+ def __init__(self, match):
+ _35photoExtractor.__init__(self, match)
+ self.genre_id, self.new = match.groups()
+ self.photo_ids = None
+
+ def metadata(self):
+ url = "{}/genre_{}{}".format(self.root, self.genre_id, self.new or "/")
+ page = self.request(url).text
+ self.photo_ids = self._photo_ids(text.extract(
+ page, ' class="photo', '\n')[0])
+ return {
+ "genre": text.extract(page, " genre - ", ". ")[0],
+ "genre_id": text.parse_int(self.genre_id),
+ }
+
+ def photos(self):
+ return self._pagination({
+ "page": "genre",
+ "community_id": self.genre_id,
+ "photo_rating": "0" if self.new else "50",
+ "lastId": self.photo_ids[-1],
+ }, self.photo_ids)
+
+
+class _35photoImageExtractor(_35photoExtractor):
+ """Extractor for individual images from 35photo.pro"""
+ subcategory = "image"
+ pattern = r"(?:https?://)?(?:[a-z]+\.)?35photo\.pro/photo_(\d+)"
+ test = ("https://35photo.pro/photo_753340/", {
+ "count": 1,
+ "keyword": {
+ "url" : r"re:https://m\d+.35photo.pro/photos_main/.*.jpg",
+ "id" : 753340,
+ "title" : "Winter walk",
+ "description": str,
+ "tags" : list,
+ "views" : int,
+ "favorites" : int,
+ "score" : int,
+ "type" : 0,
+ "date" : "15 авг, 2014",
+ "user" : "liya",
+ "user_id" : 20415,
+ "user_name" : "Liya Mirzaeva",
+ "other" : str,
+ },
+ })
+
+ def __init__(self, match):
+ _35photoExtractor.__init__(self, match)
+ self.photo_id = match.group(1)
+
+ def photos(self):
+ return (self.photo_id,)
diff --git a/gallery_dl/extractor/3dbooru.py b/gallery_dl/extractor/3dbooru.py
new file mode 100644
index 0000000..d0e59ad
--- /dev/null
+++ b/gallery_dl/extractor/3dbooru.py
@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from http://behoimi.org/"""
+
+from . import booru
+
+
+class ThreedeebooruExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
+ """Base class for 3dbooru extractors"""
+ category = "3dbooru"
+ api_url = "http://behoimi.org/post/index.json"
+ post_url = "http://behoimi.org/post/show/{}"
+ page_limit = 1000
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.session.headers.update({
+ "Referer": "http://behoimi.org/post/show/",
+ "Accept-Encoding": "identity",
+ })
+
+
+class ThreedeebooruTagExtractor(booru.TagMixin,
+ ThreedeebooruExtractor):
+ """Extractor for images from behoimi.org based on search-tags"""
+ pattern = (r"(?:https?://)?(?:www\.)?behoimi\.org/post"
+ r"(?:/(?:index)?)?\?tags=(?P<tags>[^&#]+)")
+ test = ("http://behoimi.org/post?tags=himekawa_azuru+dress", {
+ "url": "ecb30c6aaaf8a6ff8f55255737a9840832a483c1",
+ "content": "11cbda40c287e026c1ce4ca430810f761f2d0b2a",
+ })
+
+
+class ThreedeebooruPoolExtractor(booru.PoolMixin,
+ ThreedeebooruExtractor):
+ """Extractor for image-pools from behoimi.org"""
+ pattern = r"(?:https?://)?(?:www\.)?behoimi\.org/pool/show/(?P<pool>\d+)"
+ test = ("http://behoimi.org/pool/show/27", {
+ "url": "da75d2d1475449d5ef0c266cb612683b110a30f2",
+ "content": "fd5b37c5c6c2de4b4d6f1facffdefa1e28176554",
+ })
+
+
+class ThreedeebooruPostExtractor(booru.PostMixin,
+ ThreedeebooruExtractor):
+ """Extractor for single images from behoimi.org"""
+ pattern = r"(?:https?://)?(?:www\.)?behoimi\.org/post/show/(?P<post>\d+)"
+ test = ("http://behoimi.org/post/show/140852", {
+ "url": "ce874ea26f01d6c94795f3cc3aaaaa9bc325f2f6",
+ "content": "26549d55b82aa9a6c1686b96af8bfcfa50805cd4",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_character": "furude_rika",
+ "tags_copyright": "higurashi_no_naku_koro_ni",
+ "tags_model": "himekawa_azuru",
+ "tags_general": str,
+ },
+ })
+
+
+class ThreedeebooruPopularExtractor(booru.MoebooruPopularMixin,
+ ThreedeebooruExtractor):
+ """Extractor for popular images from behoimi.org"""
+ pattern = (r"(?:https?://)?(?:www\.)?behoimi\.org"
+ r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)"
+ r"(?:\?(?P<query>[^#]*))?")
+ test = ("http://behoimi.org/post/popular_by_month?month=2&year=2013", {
+ "url": "c70268dce441a9ccc3383c244ec15edb059f494f",
+ "count": 20,
+ })
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.api_url = "http://behoimi.org/post/popular_{scale}.json".format(
+ scale=self.scale)
diff --git a/gallery_dl/extractor/4chan.py b/gallery_dl/extractor/4chan.py
new file mode 100644
index 0000000..e387b33
--- /dev/null
+++ b/gallery_dl/extractor/4chan.py
@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images and videos from https://www.4chan.org/"""
+
+from . import chan
+from .. import text
+
+
+class FourchanThreadExtractor(chan.ChanThreadExtractor):
+ """Extractor for images from threads from 4chan.org"""
+ category = "4chan"
+ pattern = (r"(?:https?://)?boards\.4chan(?:nel)?\.org"
+ r"/([^/]+)/thread/(\d+)")
+ test = (
+ ("https://boards.4chan.org/tg/thread/15396072/", {
+ "url": "39082ad166161966d7ba8e37f2173a824eb540f0",
+ "keyword": "7ae2f4049adf0d2f835eb91b6b26b7f4ec882e0a",
+ "content": "20b7b51afa51c9c31a0020a0737b889532c8d7ec",
+ }),
+ ("https://boards.4channel.org/tg/thread/15396072/", {
+ "url": "39082ad166161966d7ba8e37f2173a824eb540f0",
+ "keyword": "7ae2f4049adf0d2f835eb91b6b26b7f4ec882e0a",
+ }),
+ )
+ api_url = "https://a.4cdn.org/{board}/thread/{thread}.json"
+ file_url = "https://i.4cdn.org/{board}/{tim}{ext}"
+
+ def update(self, post, data=None):
+ chan.ChanThreadExtractor.update(self, post, data)
+ post["filename"] = text.unescape(post["filename"])
diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py
new file mode 100644
index 0000000..00b8ab5
--- /dev/null
+++ b/gallery_dl/extractor/500px.py
@@ -0,0 +1,238 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://500px.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class _500pxExtractor(Extractor):
+ """Base class for 500px extractors"""
+ category = "500px"
+ directory_fmt = ("{category}", "{user[username]}")
+ filename_fmt = "{id}_{name}.{extension}"
+ archive_fmt = "{id}"
+ root = "https://500px.com"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.session.headers["Referer"] = self.root + "/"
+
+ def items(self):
+ first = True
+ data = self.metadata()
+ yield Message.Version, 1
+
+ for photo in self.photos():
+ url = photo["images"][-1]["url"]
+ fmt = photo["image_format"]
+ photo["extension"] = "jpg" if fmt == "jpeg" else fmt
+ if data:
+ photo.update(data)
+ if first:
+ first = False
+ yield Message.Directory, photo
+ yield Message.Url, url, photo
+
+ def metadata(self):
+ """Returns general metadata"""
+
+ def photos(self):
+ """Returns an iterable containing all relevant photo IDs"""
+
+ def _extend(self, photos):
+ """Extend photos with additional metadata and higher resolution URLs"""
+ url = "https://api.500px.com/v1/photos"
+ params = {
+ "expanded_user_info" : "true",
+ "include_tags" : "true",
+ "include_geo" : "true",
+ "include_equipment_info": "true",
+ "vendor_photos" : "true",
+ "include_licensing" : "true",
+ "include_releases" : "true",
+ "liked_by" : "1",
+ "following_sample" : "100",
+ "image_size" : "32768",
+ "ids" : ",".join(str(p["id"]) for p in photos),
+ }
+
+ data = self._api_call(url, params)["photos"]
+ for photo in photos:
+ pid = str(photo["id"])
+ photo.update(data[pid])
+ return photos
+
+ def _api_call(self, url, params, csrf_token=None):
+ headers = {"Origin": self.root, "X-CSRF-Token": csrf_token}
+ return self.request(url, headers=headers, params=params).json()
+
+ def _pagination(self, url, params, csrf):
+ params["page"] = 1
+ while True:
+ data = self._api_call(url, params, csrf)
+ yield from self._extend(data["photos"])
+
+ if params["page"] >= data["total_pages"]:
+ return
+ params["page"] += 1
+
+
+class _500pxUserExtractor(_500pxExtractor):
+ """Extractor for photos from a user's photostream on 500px.com"""
+ subcategory = "user"
+ pattern = (r"(?:https?://)?500px\.com"
+ r"/(?!photo/)([^/?&#]+)/?(?:$|\?|#)")
+ test = ("https://500px.com/light_expression_photography", {
+ "pattern": r"https?://drscdn.500px.org/photo/\d+/m%3D5000/v2",
+ "range": "1-99",
+ "count": 99,
+ })
+
+ def __init__(self, match):
+ _500pxExtractor.__init__(self, match)
+ self.user = match.group(1)
+
+ def photos(self):
+ # get csrf token and user id from webpage
+ url = "{}/{}".format(self.root, self.user)
+ page = self.request(url).text
+ csrf_token, pos = text.extract(page, 'csrf-token" content="', '"')
+ user_id , pos = text.extract(page, '/user/', '"', pos)
+
+ # get user photos
+ url = "https://api.500px.com/v1/photos"
+ params = {
+ "feature" : "user",
+ "stream" : "photos",
+ "rpp" : "50",
+ "user_id" : user_id,
+ }
+ return self._pagination(url, params, csrf_token)
+
+
+class _500pxGalleryExtractor(_500pxExtractor):
+ """Extractor for photo galleries on 500px.com"""
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "{user[username]}", "{gallery[name]}")
+ pattern = (r"(?:https?://)?500px\.com"
+ r"/(?!photo/)([^/?&#]+)/galleries/([^/?&#]+)")
+ test = ("https://500px.com/fashvamp/galleries/lera", {
+ "url": "8a520272ece83278166b4f8556f9c9da43c43c45",
+ "count": 3,
+ "keyword": {
+ "gallery": dict,
+ "user": dict,
+ },
+ })
+
+ def __init__(self, match):
+ _500pxExtractor.__init__(self, match)
+ self.user_name, self.gallery_name = match.groups()
+ self.user_id = self.gallery_id = self.csrf_token = None
+
+ def metadata(self):
+ # get csrf token and user id from webpage
+ url = "{}/{}/galleries/{}".format(
+ self.root, self.user_name, self.gallery_name)
+ page = self.request(url).text
+ self.csrf_token, pos = text.extract(page, 'csrf-token" content="', '"')
+ self.user_id , pos = text.extract(page, 'App.CuratorId =', '\n', pos)
+ self.user_id = self.user_id.strip()
+
+ # get gallery metadata; transform gallery name into id
+ url = "https://api.500px.com/v1/users/{}/galleries/{}".format(
+ self.user_id, self.gallery_name)
+ params = {
+ # "include_user": "true",
+ "include_cover": "1",
+ "cover_size": "2048",
+ }
+ data = self._api_call(url, params, self.csrf_token)
+ self.gallery_id = data["gallery"]["id"]
+ return data
+
+ def photos(self):
+ url = "https://api.500px.com/v1/users/{}/galleries/{}/items".format(
+ self.user_id, self.gallery_id)
+ params = {
+ "sort" : "position",
+ "sort_direction" : "asc",
+ "rpp" : "50",
+ }
+ return self._pagination(url, params, self.csrf_token)
+
+
+class _500pxImageExtractor(_500pxExtractor):
+ """Extractor for individual images from 500px.com"""
+ subcategory = "image"
+ pattern = r"(?:https?://)?500px\.com/photo/(\d+)"
+ test = ("https://500px.com/photo/222049255/queen-of-coasts", {
+ "url": "d1eda7afeaa589f71f05b9bb5c0694e3ffb357cd",
+ "count": 1,
+ "keyword": {
+ "camera": "Canon EOS 600D",
+ "camera_info": dict,
+ "collections_count": int,
+ "comments": list,
+ "comments_count": int,
+ "converted": False,
+ "converted_bits": int,
+ "created_at": "2017-08-01T04:40:05-04:00",
+ "crop_version": 0,
+ "description": str,
+ "editored_by": dict,
+ "editors_choice": False,
+ "extension": "jpg",
+ "favorites_count": int,
+ "feature": "popular",
+ "feature_date": "2017-08-01T09:58:28+00:00",
+ "focal_length": "208",
+ "height": 3111,
+ "id": 222049255,
+ "image_format": "jpeg",
+ "image_url": str,
+ "images": list,
+ "iso": "100",
+ "lens": "EF-S55-250mm f/4-5.6 IS II",
+ "lens_info": dict,
+ "license_type": 0,
+ "licensed_at": None,
+ "liked": False,
+ "location": None,
+ "location_details": dict,
+ "name": "Queen Of Coasts",
+ "nsfw": False,
+ "privacy": False,
+ "profile": True,
+ "rating": float,
+ "sales_count": int,
+ "status": 1,
+ "store_download": False,
+ "store_height": 3111,
+ "store_width": 4637,
+ "tags": list,
+ "taken_at": "2017-05-04T13:36:51-04:00",
+ "times_viewed": int,
+ "url": "/photo/222049255/queen-of-coasts-by-olesya-nabieva",
+ "user": dict,
+ "user_id": 12847235,
+ "votes_count": int,
+ "watermark": True,
+ "width": 4637,
+ },
+ })
+
+ def __init__(self, match):
+ _500pxExtractor.__init__(self, match)
+ self.photo_id = match.group(1)
+
+ def photos(self):
+ photos = ({"id": self.photo_id},)
+ return self._extend(photos)
diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py
new file mode 100644
index 0000000..e526da3
--- /dev/null
+++ b/gallery_dl/extractor/8chan.py
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images and videos from https://8ch.net/"""
+
+from . import chan
+
+
+class InfinitychanThreadExtractor(chan.ChanThreadExtractor):
+ """Extractor for images from threads from 8ch.net"""
+ category = "8chan"
+ filename_fmt = "{time}-{filename}{ext}"
+ pattern = r"(?:https?://)?(?:www\.)?8ch\.net/([^/]+)/res/(\d+)"
+ test = ("https://8ch.net/builders/res/3.html", {
+ "url": "5d85c0509f907f217aea379f862b41bf3d01f645",
+ "keyword": "0c497190c0c0f826925fde09815351d01869c783",
+ })
+ api_url = "https://8ch.net/{board}/res/{thread}.json"
+ file_url = "https://media.8ch.net/{board}/src/{tim}{ext}"
+ file_url_v2 = "https://media.8ch.net/file_store/{tim}{ext}"
+
+ def build_url(self, post):
+ fmt = self.file_url if len(post["tim"]) < 64 else self.file_url_v2
+ return fmt.format_map(post)
diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py
new file mode 100644
index 0000000..6fbf6b5
--- /dev/null
+++ b/gallery_dl/extractor/8muses.py
@@ -0,0 +1,129 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.8muses.com/"""
+
+from .common import Extractor, Message
+from .. import text
+import json
+
+
+class _8musesAlbumExtractor(Extractor):
+ """Extractor for image albums on www.8muses.com"""
+ category = "8muses"
+ subcategory = "album"
+ directory_fmt = ("{category}", "{album[path]}")
+ filename_fmt = "{page:>03}.{extension}"
+ archive_fmt = "{hash}"
+ root = "https://www.8muses.com"
+ pattern = (r"(?:https?://)?(?:www\.)?8muses\.com"
+ r"(/comics/album/[^?&#]+)(\?[^#]+)?")
+ test = (
+ ("https://www.8muses.com/comics/album/Fakku-Comics/santa/Im-Sorry", {
+ "url": "82449d6a26a29204695cba5d52c3ec60170bc159",
+ "keyword": {
+ "url" : str,
+ "hash" : str,
+ "page" : int,
+ "count": 16,
+ "album": {
+ "id" : 10457,
+ "title" : "Im Sorry",
+ "path" : "Fakku Comics/santa/Im Sorry",
+ "private": False,
+ "url" : str,
+ "parent" : 10454,
+ "views" : int,
+ "likes" : int,
+ "date" : "type:datetime",
+ },
+ },
+ }),
+ ("https://www.8muses.com/comics/album/Fakku-Comics/santa", {
+ "count": ">= 3",
+ "pattern": pattern,
+ "keyword": {
+ "url" : str,
+ "name" : str,
+ "private": False,
+ },
+ }),
+ ("https://www.8muses.com/comics/album/Fakku-Comics/6?sort=az", {
+ "count": ">= 70",
+ "keyword": {"name": r"re:^[S-Zs-z]"},
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.path = match.group(1)
+ self.params = match.group(2) or ""
+
+ def items(self):
+ url = self.root + self.path + self.params
+
+ while True:
+ data = self._unobfuscate(text.extract(
+ self.request(url).text,
+ 'id="ractive-public" type="text/plain">', '</script>')[0])
+
+ images = data.get("pictures")
+ if images:
+ count = len(images)
+ album = self._make_album(data["album"])
+ yield Message.Directory, {"album": album, "count": count}
+ for num, image in enumerate(images, 1):
+ url = self.root + "/image/fl/" + image["publicUri"]
+ img = {
+ "url" : url,
+ "page" : num,
+ "hash" : image["publicUri"],
+ "count" : count,
+ "album" : album,
+ "extension": "jpg",
+ }
+ yield Message.Url, url, img
+
+ albums = data.get("albums")
+ if albums:
+ for album in albums:
+ url = self.root + "/comics/album/" + album["permalink"]
+ album = {
+ "url" : url,
+ "name" : album["name"],
+ "private": album["isPrivate"],
+ }
+ yield Message.Queue, url, album
+
+ if data["page"] >= data["pages"]:
+ return
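+ # strip a trailing page number from the current path before appending the next one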
+ path, _, num = self.path.rstrip("/").rpartition("/")
+ path = path if num.isdecimal() else self.path
+ url = "{}{}/{}{}".format(
+ self.root, path, data["page"] + 1, self.params)
+
+ def _make_album(self, album):
+ return {
+ "id" : album["id"],
+ "path" : album["path"],
+ "title" : album["name"],
+ "private": album["isPrivate"],
+ "url" : self.root + album["permalink"],
+ "parent" : text.parse_int(album["parentId"]),
+ "views" : text.parse_int(album["numberViews"]),
+ "likes" : text.parse_int(album["numberLikes"]),
+ "date" : text.parse_datetime(
+ album["updatedAt"], "%Y-%m-%dT%H:%M:%S.%fZ"),
+ }
+
+ @staticmethod
+ def _unobfuscate(data):
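+ # the embedded JSON is ROT47-obfuscated: rotate every non-space character, then parse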
+ return json.loads("".join([
+ chr(33 + (ord(c) + 14) % 94) if c != " " else c
+ for c in text.unescape(data.strip("\t\n\r !"))
+ ]))
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
new file mode 100644
index 0000000..81d480e
--- /dev/null
+++ b/gallery_dl/extractor/__init__.py
@@ -0,0 +1,189 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import re
+import importlib
+
+modules = [
+ "2chan",
+ "35photo",
+ "3dbooru",
+ "4chan",
+ "500px",
+ "8chan",
+ "8muses",
+ "artstation",
+ "behance",
+ "bobx",
+ "danbooru",
+ "deviantart",
+ "dynastyscans",
+ "e621",
+ "exhentai",
+ "fallenangels",
+ "flickr",
+ "gelbooru",
+ "gfycat",
+ "hbrowse",
+ "hentai2read",
+ "hentaicafe",
+ "hentaifoundry",
+ "hentaifox",
+ "hentaihere",
+ "hentainexus",
+ "hitomi",
+ "hypnohub",
+ "idolcomplex",
+ "imagebam",
+ "imagefap",
+ "imgbox",
+ "imgth",
+ "imgur",
+ "instagram",
+ "keenspot",
+ "khinsider",
+ "kissmanga",
+ "komikcast",
+ "konachan",
+ "livedoor",
+ "luscious",
+ "mangadex",
+ "mangafox",
+ "mangahere",
+ "mangapanda",
+ "mangapark",
+ "mangareader",
+ "mangastream",
+ "mangoxo",
+ "myportfolio",
+ "newgrounds",
+ "ngomik",
+ "nhentai",
+ "nijie",
+ "nsfwalbum",
+ "paheal",
+ "patreon",
+ "photobucket",
+ "piczel",
+ "pinterest",
+ "pixiv",
+ "pixnet",
+ "plurk",
+ "pornhub",
+ "pururin",
+ "reactor",
+ "readcomiconline",
+ "reddit",
+ "rule34",
+ "safebooru",
+ "sankaku",
+ "sankakucomplex",
+ "seiga",
+ "senmanga",
+ "sexcom",
+ "simplyhentai",
+ "slickpic",
+ "slideshare",
+ "smugmug",
+ "tsumino",
+ "tumblr",
+ "twitter",
+ "vanillarock",
+ "wallhaven",
+ "warosu",
+ "weibo",
+ "wikiart",
+ "xhamster",
+ "xvideos",
+ "yandere",
+ "yaplog",
+ "yuki",
+ "foolfuuka",
+ "foolslide",
+ "mastodon",
+ "shopify",
+ "imagehosts",
+ "directlink",
+ "recursive",
+ "oauth",
+ "test",
+]
+
+
+def find(url):
+ """Find a suitable extractor for the given URL"""
+ for cls in _list_classes():
+ match = cls.pattern.match(url)
+ if match and cls not in _blacklist:
+ return cls(match)
+ return None
+
+
+def add(cls):
+ """Add 'cls' to the list of available extractors"""
+ cls.pattern = re.compile(cls.pattern)
+ _cache.append(cls)
+ return cls
+
+
+def add_module(module):
+ """Add all extractors in 'module' to the list of available extractors"""
+ classes = _get_classes(module)
+ for cls in classes:
+ cls.pattern = re.compile(cls.pattern)
+ _cache.extend(classes)
+ return classes
+
+
+def extractors():
+ """Yield all available extractor classes"""
+ return sorted(
+ _list_classes(),
+ key=lambda x: x.__name__
+ )
+
+
+class blacklist():
+ """Context Manager to blacklist extractor modules"""
+ def __init__(self, categories, extractors=None):
+ self.extractors = extractors or []
+ for cls in _list_classes():
+ if cls.category in categories:
+ self.extractors.append(cls)
+
+ def __enter__(self):
+ _blacklist.update(self.extractors)
+
+ def __exit__(self, etype, value, traceback):
+ _blacklist.clear()
+
+
+# --------------------------------------------------------------------
+# internals
+
+_cache = []
+_blacklist = set()
+_module_iter = iter(modules)
+
+
+def _list_classes():
+ """Yield all available extractor classes"""
+ yield from _cache
+
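+ # import extractor modules lazily and cache their classes for subsequent lookups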
+ for module_name in _module_iter:
+ module = importlib.import_module("."+module_name, __package__)
+ yield from add_module(module)
+
+
+def _get_classes(module):
+ """Return a list of all extractor classes in a module"""
+ return [
+ cls for cls in module.__dict__.values() if (
+ hasattr(cls, "pattern") and cls.__module__ == module.__name__
+ )
+ ]
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
new file mode 100644
index 0000000..24197ad
--- /dev/null
+++ b/gallery_dl/extractor/artstation.py
@@ -0,0 +1,369 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.artstation.com/"""
+
+from .common import Extractor, Message
+from .. import text, util, exception
+import random
+import string
+
+
+class ArtstationExtractor(Extractor):
+ """Base class for artstation extractors"""
+ category = "artstation"
+ filename_fmt = "{category}_{id}_{asset[id]}_{title}.{extension}"
+ directory_fmt = ("{category}", "{userinfo[username]}")
+ archive_fmt = "{asset[id]}"
+ root = "https://www.artstation.com"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user = match.group(1) or match.group(2)
+ self.external = self.config("external", False)
+
+ def items(self):
+ data = self.metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ for project in self.projects():
+ for asset in self.get_project_assets(project["hash_id"]):
+ asset.update(data)
+ adict = asset["asset"]
+
+ if adict["has_embedded_player"] and self.external:
+ player = adict["player_embedded"]
+ url = text.extract(player, 'src="', '"')[0]
+ if not url.startswith(self.root):
+ yield Message.Url, "ytdl:" + url, asset
+ continue
+
+ if adict["has_image"]:
+ url = adict["image_url"]
+ text.nameext_from_url(url, asset)
+ yield Message.Url, self._no_cache(url), asset
+
+ def metadata(self):
+ """Return general metadata"""
+ return {"userinfo": self.get_user_info(self.user)}
+
+ def projects(self):
+ """Return an iterable containing all relevant project IDs"""
+
+ def get_project_assets(self, project_id):
+ """Return all assets associated with 'project_id'"""
+ url = "{}/projects/{}.json".format(self.root, project_id)
+ data = self.request(url).json()
+
+ data["title"] = text.unescape(data["title"])
+ data["description"] = text.unescape(text.remove_html(
+ data["description"]))
+
+ assets = data["assets"]
+ del data["assets"]
+
+ if len(assets) == 1:
+ data["asset"] = assets[0]
+ yield data
+ else:
+ for asset in assets:
+ data["asset"] = asset
+ yield data.copy()
+
+ def get_user_info(self, username):
+ """Return metadata for a specific user"""
+ url = "{}/users/{}/quick.json".format(self.root, username.lower())
+ response = self.request(url, expect=(404,))
+ if response.status_code == 404:
+ raise exception.NotFoundError("user")
+ return response.json()
+
+ def _pagination(self, url, params=None):
+ if not params:
+ params = {}
+ params["page"] = 1
+ total = 0
+
+ while True:
+ data = self.request(url, params=params).json()
+ yield from data["data"]
+
+ total += len(data["data"])
+ if total >= data["total_count"]:
+ return
+
+ params["page"] += 1
+
+ @staticmethod
+ def _no_cache(url, alphabet=(string.digits + string.ascii_letters)):
+ """Cause a cache miss to prevent Cloudflare 'optimizations'
+
+ Cloudflare's 'Polish' optimization strips image metadata and may even
+ recompress an image as lossy JPEG. Adding a random dummy query
+ parameter to each image request causes a cache miss and prevents
+ this from happening.
+
+ Ref:
+ https://github.com/r888888888/danbooru/issues/3528
+ https://danbooru.donmai.us/forum_topics/14952
+ """
+ param = "gallerydl_no_cache=" + util.bencode(
+ random.getrandbits(64), alphabet)
+ sep = "&" if "?" in url else "?"
+ return url + sep + param
+
+
+class ArtstationUserExtractor(ArtstationExtractor):
+ """Extractor for all projects of an artstation user"""
+ subcategory = "user"
+ pattern = (r"(?:https?://)?(?:(?:www\.)?artstation\.com"
+ r"/(?!artwork|projects|search)([^/?&#]+)(?:/albums/all)?"
+ r"|((?!www)\w+)\.artstation\.com(?:/projects)?)/?$")
+ test = (
+ ("https://www.artstation.com/gaerikim/", {
+ "pattern": r"https://\w+\.artstation\.com/p/assets"
+ r"/images/images/\d+/\d+/\d+/large/[^/]+",
+ "count": ">= 6",
+ }),
+ ("https://www.artstation.com/gaerikim/albums/all/"),
+ ("https://gaerikim.artstation.com/"),
+ ("https://gaerikim.artstation.com/projects/"),
+ )
+
+ def projects(self):
+ url = "{}/users/{}/projects.json".format(self.root, self.user)
+ return self._pagination(url)
+
+
+class ArtstationAlbumExtractor(ArtstationExtractor):
+ """Extractor for all projects in an artstation album"""
+ subcategory = "album"
+ directory_fmt = ("{category}", "{userinfo[username]}", "Albums",
+ "{album[id]} - {album[title]}")
+ archive_fmt = "a_{album[id]}_{asset[id]}"
+ pattern = (r"(?:https?://)?(?:(?:www\.)?artstation\.com"
+ r"/(?!artwork|projects|search)([^/?&#]+)"
+ r"|((?!www)\w+)\.artstation\.com)/albums/(\d+)")
+ test = (
+ ("https://www.artstation.com/huimeiye/albums/770899", {
+ "count": 2,
+ }),
+ ("https://www.artstation.com/huimeiye/albums/770898", {
+ "exception": exception.NotFoundError,
+ }),
+ ("https://huimeiye.artstation.com/albums/770899"),
+ )
+
+ def __init__(self, match):
+ ArtstationExtractor.__init__(self, match)
+ self.album_id = text.parse_int(match.group(3))
+
+ def metadata(self):
+ userinfo = self.get_user_info(self.user)
+ album = None
+
+ for album in userinfo["albums_with_community_projects"]:
+ if album["id"] == self.album_id:
+ break
+ else:
+ raise exception.NotFoundError("album")
+
+ return {
+ "userinfo": userinfo,
+ "album": album
+ }
+
+ def projects(self):
+ url = "{}/users/{}/projects.json".format(self.root, self.user)
+ params = {"album_id": self.album_id}
+ return self._pagination(url, params)
+
+
+class ArtstationLikesExtractor(ArtstationExtractor):
+ """Extractor for liked projects of an artstation user"""
+ subcategory = "likes"
+ directory_fmt = ("{category}", "{userinfo[username]}", "Likes")
+ archive_fmt = "f_{userinfo[id]}_{asset[id]}"
+ pattern = (r"(?:https?://)?(?:www\.)?artstation\.com"
+ r"/(?!artwork|projects|search)([^/?&#]+)/likes/?")
+ test = (
+ ("https://www.artstation.com/mikf/likes", {
+ "pattern": r"https://\w+\.artstation\.com/p/assets"
+ r"/images/images/\d+/\d+/\d+/large/[^/]+",
+ "count": 6,
+ }),
+ # no likes
+ ("https://www.artstation.com/sungchoi/likes", {
+ "count": 0,
+ }),
+ )
+
+ def projects(self):
+ url = "{}/users/{}/likes.json".format(self.root, self.user)
+ return self._pagination(url)
+
+
+class ArtstationChallengeExtractor(ArtstationExtractor):
+ """Extractor for submissions of artstation challenges"""
+ subcategory = "challenge"
+ filename_fmt = "{submission_id}_{asset_id}_{filename}.{extension}"
+ directory_fmt = ("{category}", "Challenges",
+ "{challenge[id]} - {challenge[title]}")
+ archive_fmt = "c_{challenge[id]}_{asset_id}"
+ pattern = (r"(?:https?://)?(?:www\.)?artstation\.com"
+ r"/contests/[^/?&#]+/challenges/(\d+)"
+ r"/?(?:\?sorting=([a-z]+))?")
+ test = (
+ ("https://www.artstation.com/contests/thu-2017/challenges/20"),
+ (("https://www.artstation.com/contests/beyond-human"
+ "/challenges/23?sorting=winners"), {
+ "range": "1-30",
+ "count": 30,
+ }),
+ )
+
+ def __init__(self, match):
+ ArtstationExtractor.__init__(self, match)
+ self.challenge_id = match.group(1)
+ self.sorting = match.group(2) or "popular"
+
+ def items(self):
+ challenge_url = "{}/contests/_/challenges/{}.json".format(
+ self.root, self.challenge_id)
+ submission_url = "{}/contests/_/challenges/{}/submissions.json".format(
+ self.root, self.challenge_id)
+ update_url = "{}/contests/submission_updates.json".format(
+ self.root)
+
+ challenge = self.request(challenge_url).json()
+ yield Message.Version, 1
+ yield Message.Directory, {"challenge": challenge}
+
+ params = {"sorting": self.sorting}
+ for submission in self._pagination(submission_url, params):
+
+ params = {"submission_id": submission["id"]}
+ for update in self._pagination(update_url, params=params):
+
+ del update["replies"]
+ update["challenge"] = challenge
+ for url in text.extract_iter(
+ update["body_presentation_html"], ' href="', '"'):
+ update["asset_id"] = self._id_from_url(url)
+ text.nameext_from_url(url, update)
+ yield Message.Url, self._no_cache(url), update
+
+ @staticmethod
+ def _id_from_url(url):
+ """Get an image's submission ID from its URL"""
+ parts = url.split("/")
+ return text.parse_int("".join(parts[7:10]))
+
+
+class ArtstationSearchExtractor(ArtstationExtractor):
+ """Extractor for artstation search results"""
+ subcategory = "search"
+ directory_fmt = ("{category}", "Searches", "{search[searchterm]}")
+ archive_fmt = "s_{search[searchterm]}_{asset[id]}"
+ pattern = (r"(?:https?://)?(?:\w+\.)?artstation\.com"
+ r"/search/?\?([^#]+)")
+ test = ("https://www.artstation.com/search?sorting=recent&q=ancient",)
+
+ def __init__(self, match):
+ ArtstationExtractor.__init__(self, match)
+ query = text.parse_query(match.group(1))
+ self.searchterm = query.get("q", "")
+ self.order = query.get("sorting", "recent").lower()
+
+ def metadata(self):
+ return {"search": {
+ "searchterm": self.searchterm,
+ "order": self.order,
+ }}
+
+ def projects(self):
+ order = "likes_count" if self.order == "likes" else "published_at"
+ url = "{}/search/projects.json".format(self.root)
+ params = {
+ "direction": "desc",
+ "order": order,
+ "q": self.searchterm,
+ # "show_pro_first": "true",
+ }
+ return self._pagination(url, params)
+
+
+class ArtstationArtworkExtractor(ArtstationExtractor):
+ """Extractor for projects on artstation's artwork page"""
+ subcategory = "artwork"
+ directory_fmt = ("{category}", "Artworks", "{artwork[sorting]!c}")
+ archive_fmt = "A_{asset[id]}"
+ pattern = (r"(?:https?://)?(?:\w+\.)?artstation\.com"
+ r"/artwork/?\?([^#]+)")
+ test = ("https://www.artstation.com/artwork?sorting=latest",)
+
+ def __init__(self, match):
+ ArtstationExtractor.__init__(self, match)
+ self.query = text.parse_query(match.group(1))
+
+ def metadata(self):
+ return {"artwork": self.query}
+
+ def projects(self):
+ url = "{}/projects.json".format(self.root)
+ params = self.query.copy()
+ params["page"] = 1
+ return self._pagination(url, params)
+
+
+class ArtstationImageExtractor(ArtstationExtractor):
+ """Extractor for images from a single artstation project"""
+ subcategory = "image"
+ pattern = (r"(?:https?://)?(?:"
+ r"(?:\w+\.)?artstation\.com/(?:artwork|projects|search)"
+ r"|artstn\.co/p)/(\w+)")
+ test = (
+ ("https://www.artstation.com/artwork/LQVJr", {
+ "pattern": r"https?://\w+\.artstation\.com/p/assets"
+ r"/images/images/008/760/279/large/.+",
+ "content": "1f645ce7634e44675ebde8f6b634d36db0617d3c",
+ # SHA1 hash without _no_cache()
+ # "content": "2e8aaf6400aeff2345274f45e90b6ed3f2a0d946",
+ }),
+ # multiple images per project
+ ("https://www.artstation.com/artwork/Db3dy", {
+ "count": 4,
+ }),
+ # embedded youtube video
+ ("https://www.artstation.com/artwork/g4WPK", {
+ "range": "2",
+ "options": (("external", True),),
+ "pattern": "ytdl:https://www.youtube.com/embed/JNFfJtwwrU0",
+ }),
+ # alternate URL patterns
+ ("https://sungchoi.artstation.com/projects/LQVJr"),
+ ("https://artstn.co/p/LQVJr"),
+ )
+
+ def __init__(self, match):
+ ArtstationExtractor.__init__(self, match)
+ self.project_id = match.group(1)
+ self.assets = None
+
+ def metadata(self):
+ self.assets = list(ArtstationExtractor.get_project_assets(
+ self, self.project_id))
+ self.user = self.assets[0]["user"]["username"]
+ return ArtstationExtractor.metadata(self)
+
+ def projects(self):
+ return ({"hash_id": self.project_id},)
+
+ def get_project_assets(self, project_id):
+ return self.assets
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py
new file mode 100644
index 0000000..111d560
--- /dev/null
+++ b/gallery_dl/extractor/behance.py
@@ -0,0 +1,179 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.behance.net/"""
+
+from .common import Extractor, Message
+from .. import text
+import json
+
+
+class BehanceExtractor(Extractor):
+ """Base class for behance extractors"""
+ category = "behance"
+ root = "https://www.behance.net"
+
+ def items(self):
+ yield Message.Version, 1
+ for gallery in self.galleries():
+ gallery["_extractor"] = BehanceGalleryExtractor
+ yield Message.Queue, gallery["url"], self._update(gallery)
+
+ def galleries(self):
+ """Return all relevant gallery URLs"""
+
+ @staticmethod
+ def _update(data):
+ # compress data to simple lists
+ data["fields"] = [field["name"] for field in data["fields"]]
+ data["owners"] = [owner["display_name"] for owner in data["owners"]]
+ if "tags" in data:
+ data["tags"] = [tag["title"] for tag in data["tags"]]
+
+ # backwards compatibility
+ data["gallery_id"] = data["id"]
+ data["title"] = data["name"]
+ data["user"] = ", ".join(data["owners"])
+
+ return data
+
+
+class BehanceGalleryExtractor(BehanceExtractor):
+ """Extractor for image galleries from www.behance.net"""
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "{owners:J, }", "{id} {name}")
+ filename_fmt = "{category}_{id}_{num:>02}.{extension}"
+ archive_fmt = "{id}_{num}"
+ pattern = r"(?:https?://)?(?:www\.)?behance\.net/gallery/(\d+)"
+ test = (
+ ("https://www.behance.net/gallery/17386197/A-Short-Story", {
+ "count": 2,
+ "url": "ab79bd3bef8d3ae48e6ac74fd995c1dfaec1b7d2",
+ "keyword": {
+ "id": 17386197,
+ "name": 're:"Hi". A short story about the important things ',
+ "owners": ["Place Studio", "Julio César Velazquez"],
+ "fields": ["Animation", "Character Design", "Directing"],
+ "tags": list,
+ "module": dict,
+ },
+ }),
+ ("https://www.behance.net/gallery/21324767/Nevada-City", {
+ "count": 6,
+ "url": "0258fe194fe7d828d6f2c7f6086a9a0a4140db1d",
+ "keyword": {"owners": ["Alex Strohl"]},
+ }),
+ )
+
+ def __init__(self, match):
+ BehanceExtractor.__init__(self, match)
+ self.gallery_id = match.group(1)
+
+ def items(self):
+ data = self.get_gallery_data()
+ imgs = self.get_images(data)
+ data["count"] = len(imgs)
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for data["num"], (url, module) in enumerate(imgs, 1):
+ data["module"] = module
+ data["extension"] = text.ext_from_url(url)
+ yield Message.Url, url, data
+
+ def get_gallery_data(self):
+ """Collect gallery info dict"""
+ url = "{}/gallery/{}/a".format(self.root, self.gallery_id)
+ cookies = {
+ "_evidon_consent_cookie":
+ '{"consent_date":"2019-01-31T09:41:15.132Z"}',
+ "bcp": "815b5eee-8bdf-4898-ac79-33c2bcc0ed19",
+ "gk_suid": "66981391",
+ "gki": '{"feature_project_view":false,'
+ '"feature_discover_login_prompt":false,'
+ '"feature_project_login_prompt":false}',
+ "ilo0": "true",
+ }
+ page = self.request(url, cookies=cookies).text
+
+ data = json.loads(text.extract(
+ page, 'id="beconfig-store_state">', '</script>')[0])
+ return self._update(data["project"]["project"])
+
+ @staticmethod
+ def get_images(data):
+ """Extract image results from an API response"""
+ results = []
+
+ for module in data["modules"]:
+
+ if module["type"] == "image":
+ url = module["sizes"]["original"]
+ results.append((url, module))
+
+ elif module["type"] == "embed":
+ embed = module.get("original_embed") or module.get("embed")
+ url = "ytdl:" + text.extract(embed, 'src="', '"')[0]
+ results.append((url, module))
+
+ return results
+
+
+class BehanceUserExtractor(BehanceExtractor):
+ """Extractor for a user's galleries from www.behance.net"""
+ subcategory = "user"
+ categorytransfer = True
+ pattern = r"(?:https?://)?(?:www\.)?behance\.net/([^/?&#]+)/?$"
+ test = ("https://www.behance.net/alexstrohl", {
+ "count": ">= 8",
+ "pattern": BehanceGalleryExtractor.pattern,
+ })
+
+ def __init__(self, match):
+ BehanceExtractor.__init__(self, match)
+ self.user = match.group(1)
+
+ def galleries(self):
+ url = "{}/{}/projects".format(self.root, self.user)
+ headers = {"X-Requested-With": "XMLHttpRequest"}
+ params = {"offset": 0}
+
+ while True:
+ data = self.request(url, headers=headers, params=params).json()
+ work = data["profile"]["activeSection"]["work"]
+ yield from work["projects"]
+ if not work["hasMore"]:
+ return
+ params["offset"] += len(work["projects"])
+
+
+class BehanceCollectionExtractor(BehanceExtractor):
+ """Extractor for a collection's galleries from www.behance.net"""
+ subcategory = "collection"
+ categorytransfer = True
+ pattern = r"(?:https?://)?(?:www\.)?behance\.net/collection/(\d+)"
+ test = ("https://www.behance.net/collection/170615607/Sky", {
+ "count": ">= 13",
+ "pattern": BehanceGalleryExtractor.pattern,
+ })
+
+ def __init__(self, match):
+ BehanceExtractor.__init__(self, match)
+ self.collection_id = match.group(1)
+
+ def galleries(self):
+ url = "{}/collection/{}/a".format(self.root, self.collection_id)
+ headers = {"X-Requested-With": "XMLHttpRequest"}
+ params = {}
+
+ while True:
+ data = self.request(url, headers=headers, params=params).json()
+ yield from data["output"]
+ if not data.get("offset"):
+ return
+ params["offset"] = data["offset"]
diff --git a/gallery_dl/extractor/bobx.py b/gallery_dl/extractor/bobx.py
new file mode 100644
index 0000000..67427a7
--- /dev/null
+++ b/gallery_dl/extractor/bobx.py
@@ -0,0 +1,112 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from http://www.bobx.com/dark/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class BobxExtractor(Extractor):
+ """Base class for bobx extractors"""
+ category = "bobx"
+ root = "http://www.bobx.com"
+ per_page = 80
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.path = match.group(1)
+
+
+class BobxGalleryExtractor(BobxExtractor):
+ """Extractor for individual image galleries on bobx.com"""
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "{model}", "{title}")
+ filename_fmt = "{model}_{image_id}_{num:>03}.{extension}"
+ archive_fmt = "{image_id}"
+ pattern = (r"(?:https?://)?(?:www\.)?bobx\.com"
+ r"/([^/]+/[^/]+/photoset/[\w-]+)-\d+-\d+-\d+\.html")
+ test = (
+ (("http://www.bobx.com/idol/mikoto-hibi"
+ "/photoset/wpb-2018-_11-0-2-8.html"), {
+ "url": "93972d6a661f6627e963d62c9d15531e6b36a389",
+ "keyword": "6c620862db494ed05e69356ba30e604b167b0670",
+ "content": "3f176b7fe752524cec21a763aa55567e41181e07",
+ }),
+ (("http://www.bobx.com/idol/nashiko-momotsuki"
+ "/photoset/wpb-net-_221---2018-08---magic-of-summer-0-10-10.html"), {
+ "url": "f5d6c0cd0881ae6f504c21a90d86e3464dc54e8e",
+ "keyword": "f4819c75f494044348889ecd27771508464c0f5f",
+ }),
+ )
+
+ def items(self):
+ num = 0
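+ # gallery pages are addressed by the index of their first image (80 images per page)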
+ while True:
+ url = "{}/{}-{}-10-8.html".format(self.root, self.path, num)
+ page = self.request(url, encoding="utf-8").text
+
+ if num == 0:
+ data = self.metadata(page)
+ yield Message.Version, 1
+ yield Message.Directory, data
+ data["num"] = 0
+
+ for url in self.images(page):
+ url = text.urljoin(self.root, url.replace("-preview-", "-"))
+ data = text.nameext_from_url(url, data)
+ data["image_id"] = text.parse_int(
+ data["filename"].rpartition("-")[2])
+ data["num"] += 1
+ yield Message.Url, url, data
+
+ num += self.per_page
+ if num >= data["count"]:
+ return
+
+ @staticmethod
+ def metadata(page):
+ """Collect metadata for extractor-job"""
+ info = text.extract(page, "<title>", "</title>")[0]
+ model, _, info = info.partition(" in ")
+ info, _, count = info.rpartition(" of ")
+ title = info.rpartition(" - @")[0]
+ return {
+ "title": text.unquote(title),
+ "model": text.unquote(model),
+ "count": text.parse_int(count),
+ }
+
+ @staticmethod
+ def images(page):
+ """Extract all image-urls"""
+ page = text.extract(page, "<table CELLPADDING=", "<script ")[0]
+ return text.extract_iter(page, '<img src="/thumbnail', '"')
+
+
+class BobxIdolExtractor(BobxExtractor):
+ """Extractor for an idol's image galleries on bobx.com"""
+ subcategory = "idol"
+ pattern = r"(?:https?://)?(?:www\.)?bobx\.com/([^/]+/[^/?&#]+)/?$"
+ test = ("http://www.bobx.com/idol/rin-okabe/", {
+ "url": "74d80bfcd53b738b31909bb42e5cc97c41b475b8",
+ })
+
+ def items(self):
+ url = "{}/{}/".format(self.root, self.path)
+ data = {"_extractor": BobxGalleryExtractor}
+ page = self.request(url).text
+ skip = True
+
+ yield Message.Version, 1
+ for part in text.extract_iter(page, '="photoset/', '"'):
+ # skip every other entry
+ skip = not skip
+ if skip:
+ continue
+ yield Message.Queue, "{}photoset/{}".format(url, part), data
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
new file mode 100644
index 0000000..c63085a
--- /dev/null
+++ b/gallery_dl/extractor/booru.py
@@ -0,0 +1,265 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Base classes for extractors for danbooru and co"""
+
+from .common import Extractor, Message, SharedConfigMixin
+from .. import text, exception
+from xml.etree import ElementTree
+import collections
+import datetime
+import operator
+import re
+
+
+class BooruExtractor(SharedConfigMixin, Extractor):
+ """Base class for all booru extractors"""
+ basecategory = "booru"
+ filename_fmt = "{category}_{id}_{md5}.{extension}"
+ api_url = ""
+ post_url = ""
+ per_page = 50
+ page_start = 1
+ page_limit = None
+ sort = False
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.params = {}
+ self.extags = self.post_url and self.config("tags", False)
+
+ def skip(self, num):
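+ # skip whole pages by advancing 'page_start', clamped to the site's page limit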
+ pages = num // self.per_page
+ if self.page_limit and pages + self.page_start > self.page_limit:
+ pages = self.page_limit - self.page_start
+ self.page_start += pages
+ return pages * self.per_page
+
+ def items(self):
+ data = self.get_metadata()
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ self.reset_page()
+ while True:
+ images = self.parse_response(
+ self.request(self.api_url, params=self.params))
+
+ for image in images:
+ try:
+ url = image["file_url"]
+ except KeyError:
+ continue
+ if url.startswith("/"):
+ url = text.urljoin(self.api_url, url)
+ image.update(data)
+ if self.extags:
+ self.extended_tags(image)
+ yield Message.Url, url, text.nameext_from_url(url, image)
+
+ if len(images) < self.per_page:
+ return
+ self.update_page(image)
+
+ def reset_page(self):
+ """Initialize params to point to the first page"""
+ self.params["page"] = self.page_start
+
+ def update_page(self, data):
+ """Update params to point to the next page"""
+
+ def parse_response(self, response):
+ """Parse JSON API response"""
+ images = response.json()
+ if self.sort:
+ images.sort(key=operator.itemgetter("score", "id"),
+ reverse=True)
+ return images
+
+ def get_metadata(self):
+ """Collect metadata for extractor-job"""
+ return {}
+
+ def extended_tags(self, image, page=None):
+ """Retrieve extended tag information"""
+ if not page:
+ url = self.post_url.format(image["id"])
+ page = self.request(url).text
+ tags = collections.defaultdict(list)
+ tags_html = text.extract(page, '<ul id="tag-', '</ul>')[0]
+ pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"']+)", re.S)
+ for tag_type, tag_name in pattern.findall(tags_html or ""):
+ tags[tag_type].append(text.unquote(tag_name))
+ for key, value in tags.items():
+ image["tags_" + key] = " ".join(value)
+
+
+class XmlParserMixin():
+ """Mixin for XML based API responses"""
+ def parse_response(self, response):
+ root = ElementTree.fromstring(response.text)
+ return [post.attrib for post in root]
+
+
+class DanbooruPageMixin():
+ """Pagination for Danbooru v2"""
+ def update_page(self, data):
+ self.params["page"] = "b{}".format(data["id"])
+
+
+class MoebooruPageMixin():
+ """Pagination for Moebooru and Danbooru v1"""
+ def update_page(self, data):
+ if self.page_limit:
+ self.params["page"] = None
+ self.params["before_id"] = data["id"]
+ else:
+ self.params["page"] += 1
+
+
+class GelbooruPageMixin():
+ """Pagination for Gelbooru-like sites"""
+ page_start = 0
+
+ def reset_page(self):
+ self.params["pid"] = self.page_start
+
+ def update_page(self, data):
+ self.params["pid"] += 1
+
+
+class TagMixin():
+ """Extraction of images based on search-tags"""
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "t_{search_tags}_{id}"
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.tags = text.unquote(match.group("tags").replace("+", " "))
+ self.params["tags"] = self.tags
+ self.params["limit"] = self.per_page
+
+ def get_metadata(self):
+ return {"search_tags": self.tags}
+
+
+class PoolMixin():
+ """Extraction of image-pools"""
+ subcategory = "pool"
+ directory_fmt = ("{category}", "pool", "{pool}")
+ archive_fmt = "p_{pool}_{id}"
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.pool = match.group("pool")
+ self.params["tags"] = "pool:" + self.pool
+ self.params["limit"] = self.per_page
+
+ def get_metadata(self):
+ return {"pool": text.parse_int(self.pool)}
+
+
+class GelbooruPoolMixin(PoolMixin):
+ """Image-pool extraction for Gelbooru-like sites"""
+ per_page = 1
+
+ def get_metadata(self):
+ page = self.request(self.pool_url.format(self.pool)).text
+ name, pos = text.extract(page, "<h3>Now Viewing: ", "</h3>")
+ if not name:
+ name, pos = text.extract(page, "<h4>Pool: ", "</h4>")
+ if not name:
+ raise exception.NotFoundError("pool")
+ self.posts = list(text.extract_iter(page, 'id="p', '"', pos))
+
+ return {
+ "pool": text.parse_int(self.pool),
+ "pool_name": text.unescape(name),
+ "count": len(self.posts),
+ }
+
+ def reset_page(self):
+ self.index = self.page_start
+ self.update_page(None)
+
+ def update_page(self, data):
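+ # request pool posts one at a time by ID; the "0" fallback matches nothing and ends extraction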
+ try:
+ post = self.posts[self.index]
+ self.index += 1
+ except IndexError:
+ post = "0"
+ self.params["tags"] = "id:" + post
+
+
+class PostMixin():
+ """Extraction of a single image-post"""
+ subcategory = "post"
+ archive_fmt = "{id}"
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.post = match.group("post")
+ self.params["tags"] = "id:" + self.post
+
+
+class PopularMixin():
+ """Extraction and metadata handling for Danbooru v2"""
+ subcategory = "popular"
+ directory_fmt = ("{category}", "popular", "{scale}", "{date}")
+ archive_fmt = "P_{scale[0]}_{date}_{id}"
+ page_start = None
+ sort = True
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.params.update(text.parse_query(match.group("query")))
+
+ def get_metadata(self, fmt="%Y-%m-%d"):
+ date = self.get_date() or datetime.datetime.utcnow().strftime(fmt)
+ scale = self.get_scale() or "day"
+
+ if scale == "week":
+ dt = datetime.datetime.strptime(date, fmt)
+ dt -= datetime.timedelta(days=dt.weekday())
+ date = dt.strftime(fmt)
+ elif scale == "month":
+ date = date[:-3]
+
+ return {"date": date, "scale": scale}
+
+ def get_scale(self):
+ if "scale" in self.params:
+ return self.params["scale"]
+ return None
+
+ def get_date(self):
+ if "date" in self.params:
+ return self.params["date"][:10]
+ return None
+
+
+class MoebooruPopularMixin(PopularMixin):
+ """Extraction and metadata handling for Moebooru and Danbooru v1"""
+ def __init__(self, match):
+ super().__init__(match)
+ self.scale = match.group("scale")
+
+ def get_date(self):
+ if "year" in self.params:
+ return "{:>04}-{:>02}-{:>02}".format(
+ self.params["year"],
+ self.params.get("month", "01"),
+ self.params.get("day", "01"))
+ return None
+
+ def get_scale(self):
+ if self.scale and self.scale.startswith("by_"):
+ return self.scale[3:]
+ return self.scale
diff --git a/gallery_dl/extractor/chan.py b/gallery_dl/extractor/chan.py
new file mode 100644
index 0000000..5e44fd9
--- /dev/null
+++ b/gallery_dl/extractor/chan.py
@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Base classes for extractors for different Futaba Channel-like boards"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class ChanThreadExtractor(Extractor):
+ """Base class for extractors for Futaba Channel-like boards"""
+ category = "chan"
+ subcategory = "thread"
+ directory_fmt = ("{category}", "{board}", "{thread} - {title}")
+ filename_fmt = "{tim}-{filename}.{extension}"
+ archive_fmt = "{board}_{thread}_{tim}"
+ api_url = ""
+ file_url = ""
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.metadata = {
+ "board": match.group(1),
+ "thread": match.group(2),
+ }
+
+ def items(self):
+ yield Message.Version, 1
+ url = self.api_url.format_map(self.metadata)
+ posts = self.request(url).json()["posts"]
+ self.metadata["title"] = self.get_thread_title(posts[0])
+ yield Message.Directory, self.metadata
+ for post in posts:
+ if "filename" not in post:
+ continue
+ self.update(post)
+ yield Message.Url, self.build_url(post), post
+ if "extra_files" in post:
+ for file in post["extra_files"]:
+ self.update(post, file)
+ yield Message.Url, self.build_url(post), post
+
+ def update(self, post, data=None):
+ """Update keyword dictionary"""
+ post.update(data or self.metadata)
+ post["extension"] = post["ext"][1:]
+
+ def build_url(self, post):
+ """Construct an image url out of a post object"""
+ return self.file_url.format_map(post)
+
+ @staticmethod
+ def get_thread_title(post):
+ """Return thread title from first post"""
+ title = post["sub"] if "sub" in post else text.remove_html(post["com"])
+ return text.unescape(title)[:50]
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
new file mode 100644
index 0000000..175af63
--- /dev/null
+++ b/gallery_dl/extractor/common.py
@@ -0,0 +1,432 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Common classes and constants used by extractor modules."""
+
+import re
+import time
+import netrc
+import queue
+import logging
+import requests
+import threading
+import http.cookiejar
+from .message import Message
+from .. import config, text, exception, cloudflare
+
+
+class Extractor():
+
+ category = ""
+ subcategory = ""
+ categorytransfer = False
+ directory_fmt = ("{category}",)
+ filename_fmt = "{filename}.{extension}"
+ archive_fmt = ""
+ cookiedomain = ""
+ root = ""
+ test = None
+
+ def __init__(self, match):
+ self.session = requests.Session()
+ self.log = logging.getLogger(self.category)
+ self.url = match.string
+ self._init_headers()
+ self._init_cookies()
+ self._init_proxies()
+ self._retries = self.config("retries", 5)
+ self._timeout = self.config("timeout", 30)
+ self._verify = self.config("verify", True)
+
+ @classmethod
+ def from_url(cls, url):
+ if isinstance(cls.pattern, str):
+ cls.pattern = re.compile(cls.pattern)
+ match = cls.pattern.match(url)
+ return cls(match) if match else None
+
+ def __iter__(self):
+ return self.items()
+
+ def items(self):
+ yield Message.Version, 1
+
+ def skip(self, num):
+ return 0
+
+ def config(self, key, default=None):
+ return config.interpolate(
+ ("extractor", self.category, self.subcategory, key), default)
+
+ def request(self, url, method="GET", *, session=None,
+ encoding=None, expect=(), retries=None, **kwargs):
+ tries = 0
+ retries = retries or self._retries
+ session = session or self.session
+ kwargs.setdefault("timeout", self._timeout)
+ kwargs.setdefault("verify", self._verify)
+
+ while True:
+ try:
+ response = session.request(method, url, **kwargs)
+ except (requests.exceptions.ConnectionError,
+ requests.exceptions.Timeout,
+ requests.exceptions.ChunkedEncodingError,
+ requests.exceptions.ContentDecodingError) as exc:
+ msg = exc
+ except (requests.exceptions.RequestException) as exc:
+ raise exception.HttpError(exc)
+ else:
+ code = response.status_code
+ if 200 <= code < 400 or code in expect:
+ if encoding:
+ response.encoding = encoding
+ return response
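+ # solve Cloudflare's JavaScript challenge and retry the request with the new cookies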
+ if cloudflare.is_challenge(response):
+ self.log.info("Solving Cloudflare challenge")
+ url, domain, cookies = cloudflare.solve_challenge(
+ session, response, kwargs)
+ cloudflare.cookies.update(self.category, (domain, cookies))
+ continue
+
+ msg = "{}: {} for url: {}".format(code, response.reason, url)
+ if code < 500 and code != 429:
+ break
+
+ tries += 1
+ self.log.debug("%s (%d/%d)", msg, tries, retries)
+ if tries >= retries:
+ break
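+ # exponential backoff: 2, 4, 8, ... seconds between attempts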
+ time.sleep(2 ** tries)
+
+ raise exception.HttpError(msg)
+
+ def _get_auth_info(self):
+ """Return authentication information as (username, password) tuple"""
+ username = self.config("username")
+ password = None
+
+ if username:
+ password = self.config("password")
+ elif self.config("netrc", False):
+ try:
+ info = netrc.netrc().authenticators(self.category)
+ username, _, password = info
+ except (OSError, netrc.NetrcParseError) as exc:
+ self.log.error("netrc: %s", exc)
+ except TypeError:
+ self.log.warning("netrc: No authentication info")
+
+ return username, password
+
+ def _init_headers(self):
+ """Set additional headers for the 'session' object"""
+ headers = self.session.headers
+ headers.clear()
+
+ headers["User-Agent"] = self.config(
+ "user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:62.0) "
+ "Gecko/20100101 Firefox/62.0"))
+ headers["Accept"] = "*/*"
+ headers["Accept-Language"] = "en-US,en;q=0.5"
+ headers["Accept-Encoding"] = "gzip, deflate"
+ headers["Connection"] = "keep-alive"
+ headers["Upgrade-Insecure-Requests"] = "1"
+
+ def _init_proxies(self):
+ """Update the session's proxy map"""
+ proxies = self.config("proxy")
+ if proxies:
+ if isinstance(proxies, str):
+ proxies = {"http": proxies, "https": proxies}
+ if isinstance(proxies, dict):
+ for scheme, proxy in proxies.items():
+ if "://" not in proxy:
+ proxies[scheme] = "http://" + proxy.lstrip("/")
+ self.session.proxies = proxies
+ else:
+ self.log.warning("invalid proxy specifier: %s", proxies)
+
+ def _init_cookies(self):
+ """Populate the session's cookiejar"""
+ cookies = self.config("cookies")
+ if cookies:
+ if isinstance(cookies, dict):
+ self._update_cookies_dict(cookies, self.cookiedomain)
+ else:
+ cookiejar = http.cookiejar.MozillaCookieJar()
+ try:
+ cookiejar.load(cookies)
+ except OSError as exc:
+ self.log.warning("cookies: %s", exc)
+ else:
+ self.session.cookies.update(cookiejar)
+
+ cookies = cloudflare.cookies(self.category)
+ if cookies:
+ domain, cookies = cookies
+ self._update_cookies_dict(cookies, domain)
+
+ def _update_cookies(self, cookies, *, domain=""):
+ """Update the session's cookiejar with 'cookies'"""
+ if isinstance(cookies, dict):
+ self._update_cookies_dict(cookies, domain or self.cookiedomain)
+ else:
+ setcookie = self.session.cookies.set_cookie
+ try:
+ cookies = iter(cookies)
+ except TypeError:
+ setcookie(cookies)
+ else:
+ for cookie in cookies:
+ setcookie(cookie)
+
+ def _update_cookies_dict(self, cookiedict, domain):
+ """Update cookiejar with name-value pairs from a dict"""
+ setcookie = self.session.cookies.set
+ for name, value in cookiedict.items():
+ setcookie(name, value, domain=domain)
+
+ def _check_cookies(self, cookienames, *, domain=""):
+ """Check if all 'cookienames' are in the session's cookiejar"""
+ if not domain:
+ domain = self.cookiedomain
+ try:
+ for name in cookienames:
+ self.session.cookies._find(name, domain)
+ except KeyError:
+ return False
+ return True
+
+ @classmethod
+ def _get_tests(cls):
+ """Yield an extractor's test cases as (URL, RESULTS) tuples"""
+ tests = cls.test
+ if not tests:
+ return
+
+ if len(tests) == 2 and (not tests[1] or isinstance(tests[1], dict)):
+ tests = (tests,)
+
+ for test in tests:
+ if isinstance(test, str):
+ test = (test, None)
+ yield test
+
+
+class ChapterExtractor(Extractor):
+
+ subcategory = "chapter"
+ directory_fmt = (
+ "{category}", "{manga}",
+ "{volume:?v/ />02}c{chapter:>03}{chapter_minor:?//}{title:?: //}")
+ filename_fmt = (
+ "{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}")
+ archive_fmt = (
+ "{manga}_{chapter}{chapter_minor}_{page}")
+
+ def __init__(self, match, url=None):
+ Extractor.__init__(self, match)
+ self.chapter_url = url or self.root + match.group(1)
+
+ def items(self):
+ self.login()
+ page = self.request(self.chapter_url).text
+ data = self.metadata(page)
+ imgs = self.images(page)
+
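+ # pair each image with its page number, using a known page count when available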
+ if "count" in data:
+ images = zip(
+ range(1, data["count"]+1),
+ imgs,
+ )
+ else:
+ try:
+ data["count"] = len(imgs)
+ except TypeError:
+ pass
+ images = enumerate(imgs, 1)
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for data["page"], (url, imgdata) in images:
+ if imgdata:
+ data.update(imgdata)
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+ def login(self):
+ """Login and set necessary cookies"""
+
+ def metadata(self, page):
+ """Return a dict with general metadata"""
+
+ def images(self, page):
+ """Return a list of all (image-url, metadata)-tuples"""
+
+
+class MangaExtractor(Extractor):
+
+ subcategory = "manga"
+ categorytransfer = True
+ chapterclass = None
+ reverse = True
+
+ def __init__(self, match, url=None):
+ Extractor.__init__(self, match)
+ self.manga_url = url or self.root + match.group(1)
+
+ if self.config("chapter-reverse", False):
+ self.reverse = not self.reverse
+
+ def items(self):
+ self.login()
+ page = self.request(self.manga_url).text
+
+ chapters = self.chapters(page)
+ if self.reverse:
+ chapters.reverse()
+
+ yield Message.Version, 1
+ for chapter, data in chapters:
+ data["_extractor"] = self.chapterclass
+ yield Message.Queue, chapter, data
+
+ def login(self):
+ """Login and set necessary cookies"""
+
+ def chapters(self, page):
+ """Return a list of all (chapter-url, metadata)-tuples"""
+
+
+class GalleryExtractor(ChapterExtractor):
+
+ subcategory = "gallery"
+ filename_fmt = "{category}_{gallery_id}_{page:>03}.{extension}"
+ directory_fmt = ("{category}", "{gallery_id} {title}")
+ archive_fmt = "{gallery_id}_{page}"
+
+
+class AsynchronousMixin():
+ """Run info extraction in a separate thread"""
+
+ def __iter__(self):
+ messages = queue.Queue(5)
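+ # the producer thread puts an Exception object on error and None when it is done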
+ thread = threading.Thread(
+ target=self.async_items,
+ args=(messages,),
+ daemon=True,
+ )
+
+ thread.start()
+ while True:
+ msg = messages.get()
+ if msg is None:
+ thread.join()
+ return
+ if isinstance(msg, Exception):
+ thread.join()
+ raise msg
+ yield msg
+ messages.task_done()
+
+ def async_items(self, messages):
+ try:
+ for msg in self.items():
+ messages.put(msg)
+ except Exception as exc:
+ messages.put(exc)
+ messages.put(None)
+
+
+class SharedConfigMixin():
+ """Enable sharing of config settings based on 'basecategory'"""
+ basecategory = ""
+
+ def config(self, key, default=None, *, sentinel=object()):
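+ # query the extractor's own category first; the sentinel marks a missing value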
+ value = Extractor.config(self, key, sentinel)
+ if value is sentinel:
+ cat, self.category = self.category, self.basecategory
+ value = Extractor.config(self, key, default)
+ self.category = cat
+ return value
+
+
+def generate_extractors(extractor_data, symtable, classes):
+ """Dynamically generate Extractor classes"""
+ extractors = config.get(("extractor", classes[0].basecategory))
+ ckey = extractor_data.get("_ckey")
+ prev = None
+
+ if extractors:
+ extractor_data.update(extractors)
+
+ for category, info in extractor_data.items():
+
+ if not isinstance(info, dict):
+ continue
+
+ root = info["root"]
+ domain = root[root.index(":") + 3:]
+ pattern = info.get("pattern") or re.escape(domain)
+ name = (info.get("name") or category).capitalize()
+
+ for cls in classes:
+
+ class Extr(cls):
+ pass
+ Extr.__module__ = cls.__module__
+ Extr.__name__ = Extr.__qualname__ = \
+ name + cls.subcategory.capitalize() + "Extractor"
+ Extr.__doc__ = \
+ "Extractor for " + cls.subcategory + "s from " + domain
+ Extr.category = category
+ Extr.pattern = r"(?:https?://)?" + pattern + cls.pattern_fmt
+ Extr.test = info.get("test-" + cls.subcategory)
+ Extr.root = root
+
+ if "extra" in info:
+ for key, value in info["extra"].items():
+ setattr(Extr, key, value)
+ if prev and ckey:
+ setattr(Extr, ckey, prev)
+
+ symtable[Extr.__name__] = prev = Extr
+
+
+# Reduce strictness of the expected magic string in cookiejar files.
+# (This allows the use of Wget-generated cookiejars without modification)
+http.cookiejar.MozillaCookieJar.magic_re = re.compile(
+ "#( Netscape)? HTTP Cookie File", re.IGNORECASE)
+
+# Update default cipher list of urllib3
+# to fix issues with Cloudflare and, by extension, Artstation (#227)
+from requests.packages.urllib3.util import ssl_ # noqa
+logging.getLogger("gallery-dl").debug("updating default urllib3 ciphers")
+
+# cipher list taken from urllib3 1.25
+# https://github.com/urllib3/urllib3/blob/1.25/src/urllib3/util/ssl_.py
+# with additions from
+# https://github.com/Anorov/cloudflare-scrape/pull/242
+ssl_.DEFAULT_CIPHERS = (
+ "ECDHE+AESGCM:"
+ "ECDHE+CHACHA20:"
+ "DHE+AESGCM:"
+ "DHE+CHACHA20:"
+ "ECDH+AESGCM:"
+ "DH+AESGCM:"
+ "ECDH+AES:"
+ "DH+AES:"
+ "RSA+AESGCM:"
+ "RSA+AES:"
+ "!ECDHE+SHA:"
+ "!AES128-SHA:"
+ "!aNULL:"
+ "!eNULL:"
+ "!MD5:"
+ "!DSS"
+)
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
new file mode 100644
index 0000000..211c340
--- /dev/null
+++ b/gallery_dl/extractor/danbooru.py
@@ -0,0 +1,86 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://danbooru.donmai.us/"""
+
+from . import booru
+
+
+BASE_PATTERN = (
+ r"(?:https?://)?"
+ r"(?P<subdomain>danbooru|hijiribe|sonohara|safebooru)"
+ r"\.donmai\.us")
+
+
+class DanbooruExtractor(booru.DanbooruPageMixin, booru.BooruExtractor):
+ """Base class for danbooru extractors"""
+ category = "danbooru"
+ page_limit = 1000
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.subdomain = match.group("subdomain")
+ self.scheme = "https" if self.subdomain == "danbooru" else "http"
+ self.api_url = "{scheme}://{subdomain}.donmai.us/posts.json".format(
+ scheme=self.scheme, subdomain=self.subdomain)
+
+ username, api_key = self._get_auth_info()
+ if username:
+ self.log.debug("Using HTTP Basic Auth for user '%s'", username)
+ self.session.auth = (username, api_key)
+
+
+class DanbooruTagExtractor(booru.TagMixin, DanbooruExtractor):
+ """Extractor for images from danbooru based on search-tags"""
+ pattern = BASE_PATTERN + r"/posts\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)"
+ test = (
+ ("https://danbooru.donmai.us/posts?tags=bonocho", {
+ "content": "b196fb9f1668109d7774a0a82efea3ffdda07746",
+ }),
+ # test page transitions
+ ("https://danbooru.donmai.us/posts?tags=canvas_%28cocktail_soft%29", {
+ "count": ">= 50",
+ }),
+ ("https://hijiribe.donmai.us/posts?tags=bonocho"),
+ ("https://sonohara.donmai.us/posts?tags=bonocho"),
+ ("https://safebooru.donmai.us/posts?tags=bonocho"),
+ )
+
+
+class DanbooruPoolExtractor(booru.PoolMixin, DanbooruExtractor):
+ """Extractor for image-pools from danbooru"""
+ pattern = BASE_PATTERN + r"/pools/(?P<pool>\d+)"
+ test = ("https://danbooru.donmai.us/pools/7659", {
+ "content": "b16bab12bea5f7ea9e0a836bf8045f280e113d99",
+ })
+
+
+class DanbooruPostExtractor(booru.PostMixin, DanbooruExtractor):
+ """Extractor for single images from danbooru"""
+ pattern = BASE_PATTERN + r"/posts/(?P<post>\d+)"
+ test = ("https://danbooru.donmai.us/posts/294929", {
+ "content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
+ })
+
+
+class DanbooruPopularExtractor(booru.PopularMixin, DanbooruExtractor):
+ """Extractor for popular images from danbooru"""
+ pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?(?P<query>[^#]*))?"
+ test = (
+ ("https://danbooru.donmai.us/explore/posts/popular"),
+ (("https://danbooru.donmai.us/explore/posts/popular"
+ "?date=2013-06-06+03%3A34%3A22+-0400&scale=week"), {
+ "count": ">= 1",
+ }),
+ )
+
+ def __init__(self, match):
+ super().__init__(match)
+ urlfmt = "{scheme}://{subdomain}.donmai.us/explore/posts/popular.json"
+ self.api_url = urlfmt.format(
+ scheme=self.scheme, subdomain=self.subdomain)
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
new file mode 100644
index 0000000..ebab040
--- /dev/null
+++ b/gallery_dl/extractor/deviantart.py
@@ -0,0 +1,992 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.deviantart.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+from ..cache import cache, memcache
+import collections
+import itertools
+import mimetypes
+import math
+import time
+import re
+
+
+BASE_PATTERN = (
+ r"(?:https?://)?(?:"
+ r"(?:www\.)?deviantart\.com/([\w-]+)|"
+ r"(?!www\.)([\w-]+)\.deviantart\.com)"
+)
+
+
+class DeviantartExtractor(Extractor):
+ """Base class for deviantart extractors"""
+ category = "deviantart"
+ directory_fmt = ("{category}", "{author[username]!l}")
+ filename_fmt = "{category}_{index}_{title}.{extension}"
+ root = "https://www.deviantart.com"
+
+ def __init__(self, match=None):
+ Extractor.__init__(self, match)
+ self.offset = 0
+ self.flat = self.config("flat", True)
+ self.extra = self.config("extra", False)
+ self.original = self.config("original", True)
+ self.user = match.group(1) or match.group(2)
+ self.group = False
+ self.api = DeviantartAPI(self)
+
+ if self.original != "image":
+ self._update_content = self._update_content_default
+ else:
+ self._update_content = self._update_content_image
+ self.original = True
+
+ self.commit_journal = {
+ "html": self._commit_journal_html,
+ "text": self._commit_journal_text,
+ }.get(self.config("journals", "html"))
+
+ def skip(self, num):
+ self.offset += num
+ return num
+
+ def items(self):
+ if self.user:
+ self.group = not self.api.user_profile(self.user)
+ if self.group:
+ self.subcategory = "group-" + self.subcategory
+
+ yield Message.Version, 1
+ for deviation in self.deviations():
+ if isinstance(deviation, tuple):
+ url, data = deviation
+ yield Message.Queue, url, data
+ continue
+
+ self.prepare(deviation)
+ yield Message.Directory, deviation
+
+ if "content" in deviation:
+ content = deviation["content"]
+
+ if self.original and deviation["is_downloadable"] and \
+ text.ext_from_url(content["src"]) != "gif":
+ self._update_content(deviation, content)
+
+ if deviation["index"] <= 790677560 and \
+ content["src"].startswith("https://images-wixmp-"):
+ # https://github.com/r888888888/danbooru/issues/4069
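+ # rewrite the URL to its /intermediary/ form to bypass the resized /v1/ variant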
+ content["src"] = re.sub(
+ r"(/f/[^/]+/[^/]+)/v\d+/.*",
+ r"/intermediary\1", content["src"])
+
+ yield self.commit(deviation, content)
+
+ elif deviation["is_downloadable"]:
+ content = self.api.deviation_download(deviation["deviationid"])
+ yield self.commit(deviation, content)
+
+ if "videos" in deviation:
+ video = max(deviation["videos"],
+ key=lambda x: text.parse_int(x["quality"][:-1]))
+ yield self.commit(deviation, video)
+
+ if "flash" in deviation:
+ yield self.commit(deviation, deviation["flash"])
+
+ if "excerpt" in deviation and self.commit_journal:
+ journal = self.api.deviation_content(deviation["deviationid"])
+ yield self.commit_journal(deviation, journal)
+
+ if self.extra:
+ for match in DeviantartStashExtractor.pattern.finditer(
+ deviation.get("description", "")):
+ deviation["_extractor"] = DeviantartStashExtractor
+ yield Message.Queue, match.group(0), deviation
+
+ def deviations(self):
+ """Return an iterable containing all relevant Deviation-objects"""
+
+ def prepare(self, deviation):
+ """Adjust the contents of a Deviation-object"""
+ try:
+ deviation["index"] = text.parse_int(
+ deviation["url"].rpartition("-")[2])
+ except KeyError:
+ deviation["index"] = 0
+ if self.user:
+ deviation["username"] = self.user
+ deviation["da_category"] = deviation["category"]
+ deviation["published_time"] = text.parse_int(
+ deviation["published_time"])
+ deviation["date"] = text.parse_timestamp(
+ deviation["published_time"])
+
+ @staticmethod
+ def commit(deviation, target):
+ url = target["src"]
+ deviation["target"] = text.nameext_from_url(url, target.copy())
+ deviation["extension"] = deviation["target"]["extension"]
+ return Message.Url, url, deviation
+
+ def _commit_journal_html(self, deviation, journal):
+ title = text.escape(deviation["title"])
+ url = deviation["url"]
+ thumbs = deviation["thumbs"]
+ html = journal["html"]
+ shadow = SHADOW_TEMPLATE.format_map(thumbs[0]) if thumbs else ""
+
+ if "css" in journal:
+ css, cls = journal["css"], "withskin"
+ else:
+ css, cls = "", "journal-green"
+
+ if html.find('<div class="boxtop journaltop">', 0, 250) != -1:
+ needle = '<div class="boxtop journaltop">'
+ header = HEADER_CUSTOM_TEMPLATE.format(
+ title=title, url=url, date=deviation["date"],
+ )
+ else:
+ needle = '<div usr class="gr">'
+ catlist = deviation["category_path"].split("/")
+ categories = " / ".join(
+ ('<span class="crumb"><a href="{}/{}/"><span>{}</span></a>'
+ '</span>').format(self.root, cpath, cat.capitalize())
+ for cat, cpath in zip(
+ catlist,
+ itertools.accumulate(catlist, lambda t, c: t + "/" + c)
+ )
+ )
+ username = deviation["author"]["username"]
+ urlname = deviation.get("username") or username.lower()
+ header = HEADER_TEMPLATE.format(
+ title=title,
+ url=url,
+ userurl="{}/{}/".format(self.root, urlname),
+ username=username,
+ date=deviation["date"],
+ categories=categories,
+ )
+
+ html = JOURNAL_TEMPLATE_HTML.format(
+ title=title,
+ html=html.replace(needle, header, 1),
+ shadow=shadow,
+ css=css,
+ cls=cls,
+ )
+
+ deviation["extension"] = "htm"
+ return Message.Url, html, deviation
+
+ @staticmethod
+ def _commit_journal_text(deviation, journal):
+ content = "\n".join(
+ text.unescape(text.remove_html(txt))
+ for txt in journal["html"].rpartition("<script")[0].split("<br />")
+ )
+ txt = JOURNAL_TEMPLATE_TEXT.format(
+ title=deviation["title"],
+ username=deviation["author"]["username"],
+ date=deviation["date"],
+ content=content,
+ )
+
+ deviation["extension"] = "txt"
+ return Message.Url, txt, deviation
+
+ @staticmethod
+ def _find_folder(folders, name):
+ pattern = re.compile(
+ r"[^\w]*" + name.replace("-", r"[^\w]+") + r"[^\w]*$")
+ for folder in folders:
+ if pattern.match(folder["name"]):
+ return folder
+ raise exception.NotFoundError("folder")
+
+ def _folder_urls(self, folders, category):
+ url = "{}/{}/{}/0/".format(self.root, self.user, category)
+ return [(url + folder["name"], folder) for folder in folders]
+
+ def _update_content_default(self, deviation, content):
+ content.update(self.api.deviation_download(deviation["deviationid"]))
+
+ def _update_content_image(self, deviation, content):
+ data = self.api.deviation_download(deviation["deviationid"])
+ url = data["src"].partition("?")[0]
+ mtype = mimetypes.guess_type(url, False)[0]
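+ # only adopt the download URL if it actually points to an image file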
+ if mtype and mtype.startswith("image/"):
+ content.update(data)
+
+ def _html_request(self, url, **kwargs):
+ cookies = {"userinfo": (
+ '__167217c8e6aac1a3331f;{"username":"","uniqueid":"ab2e8b184471bf0'
+ 'e3f8ed3ee7a3220aa","vd":"Bc7vEx,BdC7Fy,A,J,A,,B,A,B,BdC7Fy,BdC7XU'
+ ',J,J,A,BdC7XU,13,A,B,A,,A,A,B,A,A,,A","attr":56}'
+ )}
+ return self.request(url, cookies=cookies, **kwargs)
+
+
+class DeviantartGalleryExtractor(DeviantartExtractor):
+ """Extractor for all deviations from an artist's gallery"""
+ subcategory = "gallery"
+ archive_fmt = "g_{username}_{index}.{extension}"
+ pattern = BASE_PATTERN + r"(?:/(?:gallery/?(?:\?catpath=/)?)?)?$"
+ test = (
+ ("https://www.deviantart.com/shimoda7/gallery/", {
+ "pattern": r"https://(s3.amazonaws.com/origin-(img|orig)"
+ r".deviantart.net/|images-wixmp-\w+.wixmp.com/)",
+ "count": ">= 30",
+ "keyword": {
+ "allows_comments": bool,
+ "author": {
+ "type": "regular",
+ "usericon": str,
+ "userid": "9AE51FC7-0278-806C-3FFF-F4961ABF9E2B",
+ "username": "shimoda7",
+ },
+ "category_path": str,
+ "content": {
+ "filesize": int,
+ "height": int,
+ "src": str,
+ "transparency": bool,
+ "width": int,
+ },
+ "da_category": str,
+ "date": "type:datetime",
+ "deviationid": str,
+ "?download_filesize": int,
+ "extension": str,
+ "index": int,
+ "is_deleted": bool,
+ "is_downloadable": bool,
+ "is_favourited": bool,
+ "is_mature": bool,
+ "preview": {
+ "height": int,
+ "src": str,
+ "transparency": bool,
+ "width": int,
+ },
+ "published_time": int,
+ "stats": {
+ "comments": int,
+ "favourites": int,
+ },
+ "target": dict,
+ "thumbs": list,
+ "title": str,
+ "url": r"re:https://www.deviantart.com/shimoda7/art/[^/]+-\d+",
+ "username": "shimoda7",
+ },
+ }),
+ # group
+ ("https://www.deviantart.com/yakuzafc", {
+ "pattern": r"https://www.deviantart.com/yakuzafc/gallery/0/",
+ "count": ">= 15",
+ }),
+ # 'folders' option (#276)
+ ("https://www.deviantart.com/justatest235723", {
+ "count": 2,
+ "options": (("metadata", 1), ("folders", 1), ("original", 0)),
+ "keyword": {
+ "description": str,
+ "folders": list,
+ "is_watching": bool,
+ "license": str,
+ "tags": list,
+ },
+ }),
+ ("https://www.deviantart.com/shimoda8/gallery/", {
+ "exception": exception.NotFoundError,
+ }),
+ # old-style URLs
+ ("https://www.deviantart.com/shimoda7/gallery/?catpath=/"),
+ ("https://shimoda7.deviantart.com/gallery/"),
+ ("https://yakuzafc.deviantart.com/"),
+ ("https://shimoda7.deviantart.com/gallery/?catpath=/"),
+ )
+
+ def deviations(self):
+ if self.flat and not self.group:
+ return self.api.gallery_all(self.user, self.offset)
+ folders = self.api.gallery_folders(self.user)
+ return self._folder_urls(folders, "gallery")
+
+
+class DeviantartFolderExtractor(DeviantartExtractor):
+ """Extractor for deviations inside an artist's gallery folder"""
+ subcategory = "folder"
+ directory_fmt = ("{category}", "{folder[owner]}", "{folder[title]}")
+ archive_fmt = "F_{folder[uuid]}_{index}.{extension}"
+ pattern = BASE_PATTERN + r"/gallery/(\d+)/([^/?&#]+)"
+ test = (
+ # user
+ ("https://www.deviantart.com/shimoda7/gallery/722019/Miscellaneous", {
+ "count": 5,
+ "options": (("original", False),),
+ }),
+ # group
+ ("https://www.deviantart.com/yakuzafc/gallery/37412168/Crafts", {
+ "count": ">= 4",
+ "options": (("original", False),),
+ }),
+ ("https://shimoda7.deviantart.com/gallery/722019/Miscellaneous"),
+ ("https://yakuzafc.deviantart.com/gallery/37412168/Crafts"),
+ )
+
+ def __init__(self, match):
+ DeviantartExtractor.__init__(self, match)
+ self.fname = match.group(4)
+ self.folder = {"owner": self.user, "index": match.group(3)}
+
+ def deviations(self):
+ folders = self.api.gallery_folders(self.user)
+ folder = self._find_folder(folders, self.fname)
+ self.folder["title"] = folder["name"]
+ self.folder["uuid"] = folder["folderid"]
+ return self.api.gallery(self.user, folder["folderid"], self.offset)
+
+ def prepare(self, deviation):
+ DeviantartExtractor.prepare(self, deviation)
+ deviation["folder"] = self.folder
+
+
+class DeviantartDeviationExtractor(DeviantartExtractor):
+ """Extractor for single deviations"""
+ subcategory = "deviation"
+ archive_fmt = "{index}.{extension}"
+ pattern = BASE_PATTERN + r"/((?:art|journal)/[^/?&#]+-\d+)"
+ test = (
+ (("https://www.deviantart.com/shimoda7/art/"
+ "For-the-sake-of-a-memory-10073852"), {
+ "options": (("original", 0),),
+ "content": "6a7c74dc823ebbd457bdd9b3c2838a6ee728091e",
+ }),
+ ("https://www.deviantart.com/zzz/art/zzz-1234567890", {
+ "exception": exception.NotFoundError,
+ }),
+ (("https://www.deviantart.com/myria-moon/art/"
+ "Aime-Moi-part-en-vadrouille-261986576"), {
+ "pattern": (r"https?://s3\.amazonaws\.com/origin-orig\."
+ r"deviantart\.net/a383/f/2013/135/e/7/[^.]+\.jpg\?"),
+ }),
+ # wixmp URL rewrite
+ (("https://www.deviantart.com/citizenfresh/art/"
+ "Hverarond-14-the-beauty-of-the-earth-789295466"), {
+ "pattern": (r"https://images-wixmp-\w+\.wixmp\.com"
+ r"/intermediary/f/[^/]+/[^.]+\.jpg$")
+ }),
+ # non-download URL for GIFs (#242)
+ (("https://www.deviantart.com/skatergators/art/"
+ "COM-Monique-Model-781571783"), {
+ "pattern": (r"https://images-wixmp-\w+\.wixmp\.com"
+ r"/f/[^/]+/[^.]+\.gif\?token="),
+ }),
+ # external URLs from description (#302)
+ (("https://www.deviantart.com/uotapo/art/"
+ "INANAKI-Memorial-Humane7-590297498"), {
+ "options": (("extra", 1), ("original", 0)),
+ "pattern": r"https?://sta\.sh/\w+$",
+ "range": "2-",
+ "count": 4,
+ }),
+ # old-style URLs
+ ("https://shimoda7.deviantart.com"
+ "/art/For-the-sake-of-a-memory-10073852"),
+ ("https://myria-moon.deviantart.com"
+ "/art/Aime-Moi-part-en-vadrouille-261986576"),
+ ("https://zzz.deviantart.com/art/zzz-1234567890"),
+ )
+
+ skip = Extractor.skip
+
+ def __init__(self, match):
+ DeviantartExtractor.__init__(self, match)
+ self.path = match.group(3)
+
+ def deviations(self):
+ url = "{}/{}/{}".format(self.root, self.user, self.path)
+ response = self._html_request(url, expect=range(400, 500))
+ deviation_id = text.extract(response.text, '//deviation/', '"')[0]
+ if response.status_code >= 400 or not deviation_id:
+ raise exception.NotFoundError("image")
+ return (self.api.deviation(deviation_id),)
+
+
+class DeviantartStashExtractor(DeviantartExtractor):
+ """Extractor for sta.sh-ed deviations"""
+ subcategory = "stash"
+ archive_fmt = "{index}.{extension}"
+ pattern = r"(?:https?://)?sta\.sh/([a-z0-9]+)"
+ test = (
+ ("https://sta.sh/022c83odnaxc", {
+ "pattern": r"https://s3.amazonaws.com/origin-orig.deviantart.net",
+ "count": 1,
+ }),
+ # multiple stash items
+ ("https://sta.sh/21jf51j7pzl2", {
+ "pattern": pattern,
+ "count": 4,
+ }),
+ # downloadable, but no "content" field (#307)
+ ("https://sta.sh/024t4coz16mi", {
+ "count": 1,
+ }),
+ ("https://sta.sh/abcdefghijkl", {
+ "exception": exception.HttpError,
+ }),
+ )
+
+ skip = Extractor.skip
+
+ def __init__(self, match):
+ DeviantartExtractor.__init__(self, match)
+ self.user = None
+ self.stash_id = match.group(1)
+
+ def deviations(self):
+ url = "https://sta.sh/" + self.stash_id
+ page = self.request(url).text
+ deviation_id = text.extract(page, '//deviation/', '"')[0]
+
+ if deviation_id:
+ yield self.api.deviation(deviation_id)
+ else:
+ data = {"_extractor": DeviantartStashExtractor}
+ page = text.extract(
+ page, '<div id="stash-body"', '<div class="footer"')[0]
+ for url in text.extract_iter(page, '<a href="', '"'):
+ yield url, data
+
+
+class DeviantartFavoriteExtractor(DeviantartExtractor):
+ """Extractor for an artist's favorites"""
+ subcategory = "favorite"
+ directory_fmt = ("{category}", "{username}", "Favourites")
+ archive_fmt = "f_{username}_{index}.{extension}"
+ pattern = BASE_PATTERN + r"/favourites/?(?:\?catpath=/)?$"
+ test = (
+ ("https://www.deviantart.com/h3813067/favourites/", {
+ "options": (("metadata", True), ("flat", False)), # issue #271
+ "count": 1,
+ }),
+ ("https://www.deviantart.com/h3813067/favourites/", {
+ "content": "6a7c74dc823ebbd457bdd9b3c2838a6ee728091e",
+ }),
+ ("https://www.deviantart.com/h3813067/favourites/?catpath=/"),
+ ("https://h3813067.deviantart.com/favourites/"),
+ ("https://h3813067.deviantart.com/favourites/?catpath=/"),
+ )
+
+ def deviations(self):
+ folders = self.api.collections_folders(self.user)
+ if self.flat:
+ return itertools.chain.from_iterable(
+ self.api.collections(self.user, folder["folderid"])
+ for folder in folders
+ )
+ return self._folder_urls(folders, "favourites")
+
+
+class DeviantartCollectionExtractor(DeviantartExtractor):
+ """Extractor for a single favorite collection"""
+ subcategory = "collection"
+ directory_fmt = ("{category}", "{collection[owner]}",
+ "Favourites", "{collection[title]}")
+ archive_fmt = "C_{collection[uuid]}_{index}.{extension}"
+ pattern = BASE_PATTERN + r"/favourites/(\d+)/([^/?&#]+)"
+ test = (
+ (("https://www.deviantart.com/pencilshadings"
+ "/favourites/70595441/3D-Favorites"), {
+ "count": ">= 20",
+ "options": (("original", False),),
+ }),
+ ("https://pencilshadings.deviantart.com"
+ "/favourites/70595441/3D-Favorites"),
+ )
+
+ def __init__(self, match):
+ DeviantartExtractor.__init__(self, match)
+ _, _, cid, self.cname = match.groups()
+ self.collection = {"owner": self.user, "index": cid}
+
+ def deviations(self):
+ folders = self.api.collections_folders(self.user)
+ folder = self._find_folder(folders, self.cname)
+ self.collection["title"] = folder["name"]
+ self.collection["uuid"] = folder["folderid"]
+ return self.api.collections(self.user, folder["folderid"], self.offset)
+
+ def prepare(self, deviation):
+ DeviantartExtractor.prepare(self, deviation)
+ deviation["collection"] = self.collection
+
+
+class DeviantartJournalExtractor(DeviantartExtractor):
+ """Extractor for an artist's journals"""
+ subcategory = "journal"
+ directory_fmt = ("{category}", "{username}", "Journal")
+ archive_fmt = "j_{username}_{index}.{extension}"
+ pattern = BASE_PATTERN + r"/(?:journal|blog)/?(?:\?catpath=/)?$"
+ test = (
+ ("https://www.deviantart.com/angrywhitewanker/journal/", {
+ "url": "38db2a0d3a587a7e0f9dba7ff7d274610ebefe44",
+ }),
+ ("https://www.deviantart.com/angrywhitewanker/journal/", {
+ "url": "b2a8e74d275664b1a4acee0fca0a6fd33298571e",
+ "options": (("journals", "text"),),
+ }),
+ ("https://www.deviantart.com/angrywhitewanker/journal/", {
+ "count": 0,
+ "options": (("journals", "none"),),
+ }),
+ ("https://www.deviantart.com/shimoda7/journal/?catpath=/"),
+ ("https://shimoda7.deviantart.com/journal/"),
+ ("https://shimoda7.deviantart.com/journal/?catpath=/"),
+ )
+
+ def deviations(self):
+ return self.api.browse_user_journals(self.user, self.offset)
+
+
+class DeviantartScrapsExtractor(DeviantartExtractor):
+ """Extractor for an artist's scraps"""
+ subcategory = "scraps"
+ directory_fmt = ("{category}", "{username}", "Scraps")
+ archive_fmt = "s_{username}_{index}.{extension}"
+ pattern = BASE_PATTERN + r"/gallery/\?catpath=scraps\b"
+ test = (
+ ("https://www.deviantart.com/shimoda7/gallery/?catpath=scraps", {
+ "count": 12,
+ "options": (("original", False),),
+ }),
+ ("https://shimoda7.deviantart.com/gallery/?catpath=scraps"),
+ )
+
+ def deviations(self):
+ url = "{}/{}/gallery/?catpath=scraps".format(self.root, self.user)
+ page = self._html_request(url).text
+ csrf, pos = text.extract(page, '"csrf":"', '"')
+ iid , pos = text.extract(page, '"requestid":"', '"', pos)
+
+ url = "https://www.deviantart.com/dapi/v1/gallery/0"
+ data = {
+ "username": self.user,
+ "offset": self.offset,
+ "limit": "24",
+ "catpath": "scraps",
+ "_csrf": csrf,
+ "dapiIid": iid + "-jsok7403-1.1"
+ }
+
+ while True:
+ content = self.request(
+ url, method="POST", data=data).json()["content"]
+
+ for item in content["results"]:
+ if item["html"].startswith('<div class="ad-container'):
+ continue
+ deviation_url = text.extract(item["html"], 'href="', '"')[0]
+ page = self._html_request(deviation_url).text
+ deviation_id = text.extract(page, '//deviation/', '"')[0]
+ if deviation_id:
+ yield self.api.deviation(deviation_id)
+
+ if not content["has_more"]:
+ return
+ data["offset"] = content["next_offset"]
+
+
+class DeviantartPopularExtractor(DeviantartExtractor):
+ """Extractor for popular deviations"""
+ subcategory = "popular"
+ directory_fmt = ("{category}", "Popular",
+ "{popular[range]}", "{popular[search]}")
+ archive_fmt = "P_{popular[range]}_{popular[search]}_{index}.{extension}"
+ pattern = (r"(?:https?://)?www\.deviantart\.com"
+ r"((?:/\w+)*)/(?:popular-([^/?&#]+))/?(?:\?([^#]*))?")
+ test = (
+ ("https://www.deviantart.com/popular-24-hours/?q=tree+house", {
+ "options": (("original", False),),
+ }),
+ ("https://www.deviantart.com/artisan/popular-all-time/?q=tree"),
+ )
+
+ def __init__(self, match):
+ DeviantartExtractor.__init__(self, match)
+ self.search_term = self.time_range = self.category_path = None
+ self.user = ""
+
+ path, trange, query = match.groups()
+ if path:
+ self.category_path = path.lstrip("/")
+ if trange:
+ self.time_range = trange.replace("-", "").replace("hours", "hr")
+ if query:
+ self.search_term = text.parse_query(query).get("q")
+
+ self.popular = {
+ "search": self.search_term or "",
+ "range": trange or "24-hours",
+ "path": self.category_path,
+ }
+
+ def deviations(self):
+ return self.api.browse_popular(
+ self.search_term, self.time_range, self.category_path, self.offset)
+
+ def prepare(self, deviation):
+ DeviantartExtractor.prepare(self, deviation)
+ deviation["popular"] = self.popular
+
+
+class DeviantartAPI():
+ """Minimal interface for the DeviantArt API
+
+ Ref: https://www.deviantart.com/developers/http/v1/20160316
+ """
+ CLIENT_ID = "5388"
+ CLIENT_SECRET = "76b08c69cfb27f26d6161f9ab6d061a1"
+
+ def __init__(self, extractor):
+ self.extractor = extractor
+ self.log = extractor.log
+ self.headers = {}
+
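+ # "wait-min" is stored as an exponent for 2**delay second sleeps; -1 disables waiting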
+ delay = extractor.config("wait-min", 0)
+ self.delay = math.ceil(math.log2(delay)) if delay >= 1 else -1
+ self.delay_min = max(2, self.delay)
+
+ self.mature = extractor.config("mature", "true")
+ if not isinstance(self.mature, str):
+ self.mature = "true" if self.mature else "false"
+
+ self.folders = extractor.config("folders", False)
+ self.metadata = extractor.extra or extractor.config("metadata", False)
+
+ self.refresh_token = extractor.config("refresh-token")
+ self.client_id = extractor.config("client-id", self.CLIENT_ID)
+ self.client_secret = extractor.config(
+ "client-secret", self.CLIENT_SECRET)
+
+ def browse_popular(self, query=None, timerange=None,
+ category_path=None, offset=0):
+ """Yield popular deviations"""
+ endpoint = "browse/popular"
+ params = {"q": query, "offset": offset, "limit": 120,
+ "timerange": timerange, "category_path": category_path,
+ "mature_content": self.mature}
+ return self._pagination(endpoint, params)
+
+ def browse_user_journals(self, username, offset=0):
+ """Yield all journal entries of a specific user"""
+ endpoint = "browse/user/journals"
+ params = {"username": username, "offset": offset, "limit": 50,
+ "mature_content": self.mature, "featured": "false"}
+ return self._pagination(endpoint, params)
+
+ def collections(self, username, folder_id, offset=0):
+ """Yield all Deviation-objects contained in a collection folder"""
+ endpoint = "collections/" + folder_id
+ params = {"username": username, "offset": offset, "limit": 24,
+ "mature_content": self.mature}
+ return self._pagination(endpoint, params)
+
+ @memcache(keyarg=1)
+ def collections_folders(self, username, offset=0):
+ """Yield all collection folders of a specific user"""
+ endpoint = "collections/folders"
+ params = {"username": username, "offset": offset, "limit": 50,
+ "mature_content": self.mature}
+ return self._pagination_folders(endpoint, params)
+
+ def deviation(self, deviation_id):
+ """Query and return info about a single Deviation"""
+ endpoint = "deviation/" + deviation_id
+ deviation = self._call(endpoint)
+ if self.metadata:
+ self._metadata((deviation,))
+ if self.folders:
+ self._folders((deviation,))
+ return deviation
+
+ def deviation_content(self, deviation_id):
+ """Get extended content of a single Deviation"""
+ endpoint = "deviation/content"
+ params = {"deviationid": deviation_id}
+ return self._call(endpoint, params)
+
+ def deviation_download(self, deviation_id):
+ """Get the original file download (if allowed)"""
+ endpoint = "deviation/download/" + deviation_id
+ params = {"mature_content": self.mature}
+ return self._call(endpoint, params)
+
+ def deviation_metadata(self, deviations):
+ """ Fetch deviation metadata for a set of deviations"""
+ endpoint = "deviation/metadata?" + "&".join(
+ "deviationids[{}]={}".format(num, deviation["deviationid"])
+ for num, deviation in enumerate(deviations)
+ )
+ params = {"mature_content": self.mature}
+ return self._call(endpoint, params)["metadata"]
+
+ def gallery(self, username, folder_id="", offset=0, extend=True):
+ """Yield all Deviation-objects contained in a gallery folder"""
+ endpoint = "gallery/" + folder_id
+ params = {"username": username, "offset": offset, "limit": 24,
+ "mature_content": self.mature, "mode": "newest"}
+ return self._pagination(endpoint, params, extend)
+
+ def gallery_all(self, username, offset=0):
+ """Yield all Deviation-objects of a specific user"""
+ endpoint = "gallery/all"
+ params = {"username": username, "offset": offset, "limit": 24,
+ "mature_content": self.mature}
+ return self._pagination(endpoint, params)
+
+ @memcache(keyarg=1)
+ def gallery_folders(self, username, offset=0):
+ """Yield all gallery folders of a specific user"""
+ endpoint = "gallery/folders"
+ params = {"username": username, "offset": offset, "limit": 50,
+ "mature_content": self.mature}
+ return self._pagination_folders(endpoint, params)
+
+ @memcache(keyarg=1)
+ def user_profile(self, username):
+ """Get user profile information"""
+ endpoint = "user/profile/" + username
+ return self._call(endpoint, expect_error=True)
+
+ def authenticate(self, refresh_token):
+ """Authenticate the application by requesting an access token"""
+ self.headers["Authorization"] = self._authenticate_impl(refresh_token)
+
+ @cache(maxage=3600, keyarg=1)
+ def _authenticate_impl(self, refresh_token):
+ """Actual authenticate implementation"""
+ url = "https://www.deviantart.com/oauth2/token"
+ if refresh_token:
+ self.log.info("Refreshing private access token")
+ data = {"grant_type": "refresh_token",
+ "refresh_token": _refresh_token_cache(refresh_token)}
+ else:
+ self.log.info("Requesting public access token")
+ data = {"grant_type": "client_credentials"}
+
+ auth = (self.client_id, self.client_secret)
+ response = self.extractor.request(
+ url, method="POST", data=data, auth=auth)
+ data = response.json()
+
+ if response.status_code != 200:
+ raise exception.AuthenticationError('"{} ({})"'.format(
+ data.get("error_description"), data.get("error")))
+ if refresh_token:
+ _refresh_token_cache.update(refresh_token, data["refresh_token"])
+ return "Bearer " + data["access_token"]
+
+ def _call(self, endpoint, params=None, expect_error=False, public=True):
+ """Call an API endpoint"""
+ url = "https://www.deviantart.com/api/v1/oauth2/" + endpoint
+ while True:
+ if self.delay >= 0:
+ time.sleep(2 ** self.delay)
+
+ self.authenticate(None if public else self.refresh_token)
+ response = self.extractor.request(
+ url,
+ params=params,
+ headers=self.headers,
+ expect=range(400, 500),
+ )
+ data = response.json()
+ status = response.status_code
+
+ if 200 <= status < 400:
+ if self.delay > self.delay_min:
+ self.delay -= 1
+ return data
+ if expect_error:
+ return None
+ if data.get("error_description") == "User not found.":
+ raise exception.NotFoundError("user or group")
+
+ self.log.debug(response.text)
+ msg = "API responded with {} {}".format(
+ status, response.reason)
+ if status == 429:
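+ # rate limited: double the sleep before the next API call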
+ self.delay += 1
+ self.log.warning("%s. Using %ds delay.", msg, 2 ** self.delay)
+ else:
+ self.log.error(msg)
+ return data
+
+ def _pagination(self, endpoint, params, extend=True):
+ public = True
+ while True:
+ data = self._call(endpoint, params, public=public)
+ if "results" not in data:
+ self.log.error("Unexpected API response: %s", data)
+ return
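+ # a shorter-than-requested batch may indicate deviations hidden from the public token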
+ if (public and self.refresh_token and
+ len(data["results"]) < params["limit"]):
+ self.log.debug("Switching to private access token")
+ public = False
+ continue
+
+ if extend:
+ if self.metadata:
+ self._metadata(data["results"])
+ if self.folders:
+ self._folders(data["results"])
+ yield from data["results"]
+
+ if not data["has_more"]:
+ return
+ params["offset"] = data["next_offset"]
+
+ def _pagination_folders(self, endpoint, params):
+ result = []
+ result.extend(self._pagination(endpoint, params, False))
+ return result
+
+ def _metadata(self, deviations):
+ """Add extended metadata to each deviation object"""
+ for deviation, metadata in zip(
+ deviations, self.deviation_metadata(deviations)):
+ deviation.update(metadata)
+ deviation["tags"] = [t["tag_name"] for t in deviation["tags"]]
+ return deviations
+
+ def _folders(self, deviations):
+ """Add a list of all containing folders to each deviation object"""
+ for deviation in deviations:
+ deviation["folders"] = self._folders_map(
+ deviation["author"]["username"])[deviation["deviationid"]]
+
+ @memcache(keyarg=1)
+ def _folders_map(self, username):
+ """Generate a deviation_id -> folders mapping for 'username'"""
+ self.log.info("Collecting folder information for '%s'", username)
+ folders = self.gallery_folders(username)
+
+ # add parent names to folders, but ignore "Featured" as parent
+ fmap = {}
+ featured = folders[0]["folderid"]
+ for folder in folders:
+ if folder["parent"] and folder["parent"] != featured:
+ folder["name"] = fmap[folder["parent"]] + "/" + folder["name"]
+ fmap[folder["folderid"]] = folder["name"]
+
+ # map deviationids to folder names
+ dmap = collections.defaultdict(list)
+ for folder in folders:
+ for deviation in self.gallery(
+ username, folder["folderid"], 0, False):
+ dmap[deviation["deviationid"]].append(folder["name"])
+ return dmap
+
+
+@cache(maxage=10*365*24*3600, keyarg=0)
+def _refresh_token_cache(original_token, new_token=None):
+ return new_token or original_token
+
+
+SHADOW_TEMPLATE = """
+<span class="shadow">
+ <img src="{src}" class="smshadow" width="{width}" height="{height}">
+</span>
+<br><br>
+"""
+
+HEADER_TEMPLATE = """<div usr class="gr">
+<div class="metadata">
+ <h2><a href="{url}">{title}</a></h2>
+ <ul>
+ <li class="author">
+ by <span class="name"><span class="username-with-symbol u">
+ <a class="u regular username" href="{userurl}">{username}</a>\
+<span class="user-symbol regular"></span></span></span>,
+ <span>{date}</span>
+ </li>
+ <li class="category">
+ {categories}
+ </li>
+ </ul>
+</div>
+"""
+
+HEADER_CUSTOM_TEMPLATE = """<div class='boxtop journaltop'>
+<h2>
+ <img src="https://st.deviantart.net/minish/gruzecontrol/icons/journal.gif\
+?2" style="vertical-align:middle" alt=""/>
+ <a href="{url}">{title}</a>
+</h2>
+Journal Entry: <span>{date}</span>
+"""
+
+JOURNAL_TEMPLATE_HTML = """text:<!DOCTYPE html>
+<html>
+<head>
+ <meta charset="utf-8">
+ <title>{title}</title>
+ <link rel="stylesheet" href="https://st.deviantart.net/\
+css/deviantart-network_lc.css?3843780832">
+ <link rel="stylesheet" href="https://st.deviantart.net/\
+css/group_secrets_lc.css?3250492874">
+ <link rel="stylesheet" href="https://st.deviantart.net/\
+css/v6core_lc.css?4246581581">
+ <link rel="stylesheet" href="https://st.deviantart.net/\
+css/sidebar_lc.css?1490570941">
+ <link rel="stylesheet" href="https://st.deviantart.net/\
+css/writer_lc.css?3090682151">
+ <link rel="stylesheet" href="https://st.deviantart.net/\
+css/v6loggedin_lc.css?3001430805">
+ <style>{css}</style>
+ <link rel="stylesheet" href="https://st.deviantart.net/\
+roses/cssmin/core.css?1488405371919" >
+ <link rel="stylesheet" href="https://st.deviantart.net/\
+roses/cssmin/peeky.css?1487067424177" >
+ <link rel="stylesheet" href="https://st.deviantart.net/\
+roses/cssmin/desktop.css?1491362542749" >
+</head>
+<body id="deviantART-v7" class="bubble no-apps loggedout w960 deviantart">
+ <div id="output">
+ <div class="dev-page-container bubbleview">
+ <div class="dev-page-view view-mode-normal">
+ <div class="dev-view-main-content">
+ <div class="dev-view-deviation">
+ {shadow}
+ <div class="journal-wrapper tt-a">
+ <div class="journal-wrapper2">
+ <div class="journal {cls} journalcontrol">
+ {html}
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+</body>
+</html>
+"""
+
+JOURNAL_TEMPLATE_TEXT = """text:{title}
+by {username}, {date}
+
+{content}
+"""
diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py
new file mode 100644
index 0000000..77a19f6
--- /dev/null
+++ b/gallery_dl/extractor/directlink.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Direct link handling"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class DirectlinkExtractor(Extractor):
+ """Extractor for direct links to images and other media files"""
+ category = "directlink"
+ filename_fmt = "{domain}/{path}"
+ archive_fmt = "{domain}/{path}"
+ pattern = (r"(?i)https?://(?P<domain>[^/?&#]+)/(?P<path>[^?&#]+\."
+ r"(?:jpe?g|jpe|png|gif|web[mp]|mp4|mkv|og[gmv]|opus))"
+ r"(?:\?(?P<query>[^/?#]*))?(?:#(?P<fragment>.*))?$")
+ test = (
+ (("https://en.wikipedia.org/static/images/project-logos/enwiki.png"), {
+ "url": "18c5d00077332e98e53be9fed2ee4be66154b88d",
+ "keyword": "e81b9fe3022e971365dd859f38e4ef717a6c69ed",
+ }),
+ # more complex example
+ ("https://example.org/path/file.webm?que=1&ry=2#fragment", {
+ "url": "fd4aec8a32842343394e6078a06c3e6b647bf671",
+ "keyword": "ff75764b1ae66615b723a6357b8193fa2de84678",
+ }),
+ # percent-encoded characters
+ ("https://example.org/%27%3C%23/%23%3E%27.jpg?key=%3C%26%3E", {
+ "url": "2627e8140727fdf743f86fe18f69f99a052c9718",
+ "keyword": "4d19dc12e41ffcb4cbec2013e335cf482377c35e",
+ }),
+ # upper case file extension (#296)
+ ("https://post-phinf.pstatic.net/MjAxOTA1MjlfMTQ4/MDAxNTU5MTI2NjcyNTkw"
+ ".JUzkGb4V6dj9DXjLclrOoqR64uDxHFUO5KDriRdKpGwg.88mCtd4iT1NHlpVKSCaUpP"
+ "mZPiDgT8hmQdQ5K_gYyu0g.JPEG/2.JPG"),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.data = match.groupdict()
+
+ def items(self):
+ text.nameext_from_url(self.url, self.data)
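+ # percent-decode each captured URL component before using it in format strings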
+ for key, value in self.data.items():
+ if value:
+ self.data[key] = text.unquote(value)
+
+ yield Message.Version, 1
+ yield Message.Directory, self.data
+ yield Message.Url, self.url, self.data
diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py
new file mode 100644
index 0000000..b10bd35
--- /dev/null
+++ b/gallery_dl/extractor/dynastyscans.py
@@ -0,0 +1,145 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters from https://dynasty-scans.com/"""
+
+from .common import ChapterExtractor, Extractor, Message
+from .. import text
+import json
+import re
+
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?dynasty-scans\.com"
+
+
+class DynastyscansBase():
+ """Base class for dynastyscans extractors"""
+ category = "dynastyscans"
+ root = "https://dynasty-scans.com"
+
+ def _parse_image_page(self, image_id):
+ url = "{}/images/{}".format(self.root, image_id)
+ extr = text.extract_from(self.request(url).text)
+
+ date = extr("class='create_at'>", "</span>")
+ tags = extr("class='tags'>", "</span>")
+ src = extr("class='btn-group'>", "</div>")
+ url = extr(' src="', '"')
+
+ src = text.extract(src, 'href="', '"')[0] if "Source<" in src else ""
+
+ return {
+ "url" : self.root + url,
+ "image_id": text.parse_int(image_id),
+ "tags" : text.split_html(text.unescape(tags)),
+ "date" : text.remove_html(date),
+ "source" : text.unescape(src),
+ }
+
+
+class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
+ """Extractor for manga-chapters from dynasty-scans.com"""
+ pattern = BASE_PATTERN + r"(/chapters/[^/?&#]+)"
+ test = (
+ (("http://dynasty-scans.com/chapters/"
+ "hitoribocchi_no_oo_seikatsu_ch33"), {
+ "url": "dce64e8c504118f1ab4135c00245ea12413896cb",
+ "keyword": "1564965671ac69bb7fbc340538397f6bd0aa269b",
+ }),
+ (("http://dynasty-scans.com/chapters/"
+ "new_game_the_spinoff_special_13"), {
+ "url": "dbe5bbb74da2edcfb1832895a484e2a40bc8b538",
+ "keyword": "22b35029bc65d6d95db2e2c147b0a37f2d290f29",
+ }),
+ )
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+ match = re.match(
+ (r"(?:<a[^>]*>)?([^<]+)(?:</a>)?" # manga name
+ r"(?: ch(\d+)([^:<]*))?" # chapter info
+ r"(?:: (.+))?"), # title
+ extr("<h3 id='chapter-title'><b>", "</b>"),
+ )
+ author = extr(" by ", "</a>")
+ group = extr('"icon-print"></i> ', '</span>')
+
+ return {
+ "manga" : text.unescape(match.group(1)),
+ "chapter" : text.parse_int(match.group(2)),
+ "chapter_minor": match.group(3) or "",
+ "title" : text.unescape(match.group(4) or ""),
+ "author" : text.remove_html(author),
+ "group" : (text.remove_html(group) or
+ text.extract(group, ' alt="', '"')[0] or ""),
+ "date" : extr('"icon-calendar"></i> ', '<'),
+ "lang" : "en",
+ "language": "English",
+ }
+
+ def images(self, page):
+ data = text.extract(page, "var pages = ", ";\n")[0]
+ return [
+ (self.root + img["image"], None)
+ for img in json.loads(data)
+ ]
+
+
+class DynastyscansSearchExtractor(DynastyscansBase, Extractor):
+ """Extrator for image search results on dynasty-scans.com"""
+ subcategory = "search"
+ directory_fmt = ("{category}", "Images")
+ filename_fmt = "{image_id}.{extension}"
+ archive_fmt = "i_{image_id}"
+ pattern = BASE_PATTERN + r"/images/?(?:\?([^#]+))?$"
+ test = (
+ ("https://dynasty-scans.com/images?with[]=4930&with[]=5211", {
+ "url": "6b570eedd8a741c2cd34fb98b22a49d772f84191",
+ "keyword": "a1e2d05c1406a08b02f347389616a6babb1b50bf",
+ }),
+ ("https://dynasty-scans.com/images", {
+ "range": "1",
+ "count": 1,
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.query = match.group(1) or ""
+
+ def items(self):
+ yield Message.Version, 1
+ yield Message.Directory, {}
+ for image_id in self.images():
+ image = self._parse_image_page(image_id)
+ url = image["url"]
+ yield Message.Url, url, text.nameext_from_url(url, image)
+
+ def images(self):
+ url = self.root + "/images?" + self.query.replace("[]", "%5B%5D")
+ params = {"page": 1}
+
+ while True:
+ page = self.request(url, params=params).text
+ yield from text.extract_iter(page, '"/images/', '"')
+ if 'rel="next"' not in page:
+ return
+ params["page"] += 1
+
+
+class DynastyscansImageExtractor(DynastyscansSearchExtractor):
+ """Extractor for individual images on dynasty-scans.com"""
+ subcategory = "image"
+ pattern = BASE_PATTERN + r"/images/(\d+)"
+ test = ("https://dynasty-scans.com/images/1245", {
+ "url": "15e54bd94148a07ed037f387d046c27befa043b2",
+ "keyword": "3b630c6139e5ff06e141541d57960f8a2957efbb",
+ })
+
+ def images(self):
+ return (self.query,)
diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py
new file mode 100644
index 0000000..f245ddf
--- /dev/null
+++ b/gallery_dl/extractor/e621.py
@@ -0,0 +1,71 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://e621.net/"""
+
+from . import booru
+
+
+class E621Extractor(booru.MoebooruPageMixin, booru.BooruExtractor):
+ """Base class for e621 extractors"""
+ category = "e621"
+ api_url = "https://e621.net/post/index.json"
+ post_url = "https://e621.net/post/show/{}"
+ page_limit = 750
+
+
+class E621TagExtractor(booru.TagMixin, E621Extractor):
+ """Extractor for images from e621.net based on search-tags"""
+ pattern = (r"(?:https?://)?(?:www\.)?e621\.net/post"
+ r"(?:/index/\d+/|\?tags=)(?P<tags>[^/?&#]+)")
+ test = (
+ ("https://e621.net/post/index/1/anry", {
+ "url": "8021e5ea28d47c474c1ffc9bd44863c4d45700ba",
+ "content": "501d1e5d922da20ee8ff9806f5ed3ce3a684fd58",
+ }),
+ ("https://e621.net/post?tags=anry"),
+ )
+
+
+class E621PoolExtractor(booru.PoolMixin, E621Extractor):
+ """Extractor for image-pools from e621.net"""
+ pattern = r"(?:https?://)?(?:www\.)?e621\.net/pool/show/(?P<pool>\d+)"
+ test = ("https://e621.net/pool/show/73", {
+ "url": "842f2fb065c7c339486a9b1d689020b8569888ed",
+ "content": "c2c87b7a9150509496cddc75ccab08109922876a",
+ })
+
+
+class E621PostExtractor(booru.PostMixin, E621Extractor):
+ """Extractor for single images from e621.net"""
+ pattern = r"(?:https?://)?(?:www\.)?e621\.net/post/show/(?P<post>\d+)"
+ test = ("https://e621.net/post/show/535", {
+ "url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529",
+ "content": "66f46e96a893fba8e694c4e049b23c2acc9af462",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "anry",
+ "tags_general": str,
+ "tags_species": str,
+ },
+ })
+
+
+class E621PopularExtractor(booru.MoebooruPopularMixin, E621Extractor):
+ """Extractor for popular images from 621.net"""
+ pattern = (r"(?:https?://)?(?:www\.)?e621\.net"
+ r"/post/popular_by_(?P<scale>day|week|month)"
+ r"(?:\?(?P<query>[^#]*))?")
+ test = ("https://e621.net/post/popular_by_month?month=6&year=2013", {
+ "count": 32,
+ })
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.api_url = "https://e621.net/post/popular_by_{scale}.json".format(
+ scale=self.scale)
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
new file mode 100644
index 0000000..d67c58a
--- /dev/null
+++ b/gallery_dl/extractor/exhentai.py
@@ -0,0 +1,382 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from galleries at https://exhentai.org/"""
+
+from .common import Extractor, Message
+from .. import text, util, exception
+from ..cache import cache
+import itertools
+import random
+import time
+import math
+
+
+BASE_PATTERN = r"(?:https?://)?(e[x-]|g\.e-)hentai\.org"
+
+
+class ExhentaiExtractor(Extractor):
+ """Base class for exhentai extractors"""
+ category = "exhentai"
+ directory_fmt = ("{category}", "{gallery_id}")
+ filename_fmt = (
+ "{gallery_id}_{num:>04}_{image_token}_{filename}.{extension}")
+ archive_fmt = "{gallery_id}_{num}"
+ cookiedomain = ".exhentai.org"
+ cookienames = ("ipb_member_id", "ipb_pass_hash")
+ root = "https://exhentai.org"
+
+ def __init__(self, match):
+ if match.group(1) != "ex":
+ self.root = "https://e-hentai.org"
+ self.cookiedomain = ".e-hentai.org"
+ Extractor.__init__(self, match)
+ self.limits = self.config("limits", True)
+ self.original = self.config("original", True)
+ self.wait_min = self.config("wait-min", 3)
+ self.wait_max = self.config("wait-max", 6)
+
+ self._remaining = 0
+ if self.wait_max < self.wait_min:
+ self.wait_max = self.wait_min
+ self.session.headers["Referer"] = self.root + "/"
+
+ def request(self, *args, **kwargs):
+ response = Extractor.request(self, *args, **kwargs)
+ if self._is_sadpanda(response):
+ self.log.info("sadpanda.jpg")
+ raise exception.AuthorizationError()
+ return response
+
+ def wait(self, waittime=None):
+ """Wait for a randomly chosen amount of seconds"""
+ if not waittime:
+ waittime = random.uniform(self.wait_min, self.wait_max)
+ else:
+ waittime = random.uniform(waittime * 0.66, waittime * 1.33)
+ time.sleep(waittime)
+
+ def login(self):
+ """Login and set necessary cookies"""
+ if self._check_cookies(self.cookienames):
+ return
+ username, password = self._get_auth_info()
+ if username:
+ self._update_cookies(self._login_impl(username, password))
+ else:
+ self.log.info("no username given; using e-hentai.org")
+ self.root = "https://e-hentai.org"
+ self.original = False
+ self.limits = False
+ self.session.cookies["nw"] = "1"
+
+ @cache(maxage=90*24*3600, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+ url = "https://forums.e-hentai.org/index.php?act=Login&CODE=01"
+ headers = {
+ "Referer": "https://e-hentai.org/bounce_login.php?b=d&bt=1-1",
+ }
+ data = {
+ "CookieDate": "1",
+ "b": "d",
+ "bt": "1-1",
+ "UserName": username,
+ "PassWord": password,
+ "ipb_login_submit": "Login!",
+ }
+
+ response = self.request(url, method="POST", headers=headers, data=data)
+ if "You are now logged in as:" not in response.text:
+ raise exception.AuthenticationError()
+ return {c: response.cookies[c] for c in self.cookienames}
+
+ @staticmethod
+ def _is_sadpanda(response):
+ """Return True if the response object contains a sad panda"""
+ return (
+ response.headers.get("Content-Length") == "9615" and
+ "sadpanda.jpg" in response.headers.get("Content-Disposition", "")
+ )
+
+
+class ExhentaiGalleryExtractor(ExhentaiExtractor):
+ """Extractor for image galleries from exhentai.org"""
+ subcategory = "gallery"
+ pattern = (BASE_PATTERN +
+ r"(?:/g/(\d+)/([\da-f]{10})"
+ r"|/s/([\da-f]{10})/(\d+)-(\d+))")
+ test = (
+ ("https://exhentai.org/g/960460/4f0e369d82/", {
+ "keyword": "1532ca4d0e4e0738dc994ca725a228af04a4e480",
+ "content": "493d759de534355c9f55f8e365565b62411de146",
+ }),
+ ("https://exhentai.org/g/960461/4f0e369d82/", {
+ "exception": exception.NotFoundError,
+ }),
+ ("http://exhentai.org/g/962698/7f02358e00/", {
+ "exception": exception.AuthorizationError,
+ }),
+ ("https://exhentai.org/s/3957343c3b/960460-5", {
+ "count": 2,
+ }),
+ ("https://e-hentai.org/s/3957343c3b/960460-5", {
+ "count": 2,
+ }),
+ ("https://g.e-hentai.org/g/960460/4f0e369d82/"),
+ )
+
+ def __init__(self, match):
+ ExhentaiExtractor.__init__(self, match)
+ self.key = {}
+ self.count = 0
+ self.gallery_id = text.parse_int(match.group(2) or match.group(5))
+ self.gallery_token = match.group(3)
+ self.image_token = match.group(4)
+ self.image_num = text.parse_int(match.group(6), 1)
+
+ def items(self):
+ self.login()
+
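+        # fetch whichever token (gallery or image) the input URL did not provide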
+ if self.gallery_token:
+ gpage = self._gallery_page()
+ self.image_token = text.extract(gpage, 'hentai.org/s/', '"')[0]
+ self.wait()
+ ipage = self._image_page()
+ else:
+ ipage = self._image_page()
+ part = text.extract(ipage, 'hentai.org/g/', '"')[0]
+ self.gallery_token = part.split("/")[1]
+ self.wait()
+ gpage = self._gallery_page()
+
+ data = self.get_metadata(gpage)
+ self.count = data["count"]
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ images = itertools.chain(
+ (self.image_from_page(ipage),), self.images_from_api())
+ for url, image in images:
+ data.update(image)
+ if self.limits:
+ self._check_limits(data)
+ if "/fullimg.php" in url:
+ data["extension"] = ""
+ self.wait(1.5)
+ yield Message.Url, url, data
+
+ def get_metadata(self, page):
+ """Extract gallery metadata"""
+ extr = text.extract_from(page)
+ data = {
+ "gallery_id" : self.gallery_id,
+ "gallery_token": self.gallery_token,
+ "title" : text.unescape(extr('<h1 id="gn">', '</h1>')),
+ "title_jp" : text.unescape(extr('<h1 id="gj">', '</h1>')),
+ "date" : text.parse_datetime(extr(
+ '>Posted:</td><td class="gdt2">', '</td>'), "%Y-%m-%d %H:%M"),
+ "parent" : extr(
+ '>Parent:</td><td class="gdt2"><a href="', '"'),
+ "visible" : extr(
+ '>Visible:</td><td class="gdt2">', '<'),
+ "language" : extr(
+ '>Language:</td><td class="gdt2">', ' '),
+ "gallery_size" : text.parse_bytes(extr(
+ '>File Size:</td><td class="gdt2">', '<').rstrip("Bb")),
+ "count" : text.parse_int(extr(
+ '>Length:</td><td class="gdt2">', ' ')),
+ }
+
+ data["lang"] = util.language_to_code(data["language"])
+ data["tags"] = [
+ text.unquote(tag)
+ for tag in text.extract_iter(page, 'hentai.org/tag/', '"')
+ ]
+
+ return data
+
+ def image_from_page(self, page):
+ """Get image url and data from webpage"""
+ pos = page.index('<div id="i3"><a onclick="return load_image(') + 26
+ extr = text.extract_from(page, pos)
+
+ self.key["next"] = extr("'", "'")
+ iurl = extr('<img id="img" src="', '"')
+ orig = extr('hentai.org/fullimg.php', '"')
+
+ if self.original and orig:
+ url = self.root + "/fullimg.php" + text.unescape(orig)
+ data = self._parse_original_info(extr('ownload original', '<'))
+ else:
+ url = iurl
+ data = self._parse_image_info(url)
+
+ data["num"] = self.image_num
+ data["image_token"] = self.key["start"] = extr('var startkey="', '";')
+ self.key["show"] = extr('var showkey="', '";')
+
+ return url, text.nameext_from_url(iurl, data)
+
+ def images_from_api(self):
+ """Get image url and data from api calls"""
+ api_url = self.root + "/api.php"
+ nextkey = self.key["next"]
+ request = {
+ "method" : "showpage",
+ "gid" : self.gallery_id,
+ "imgkey" : nextkey,
+ "showkey": self.key["show"],
+ }
+ for request["page"] in range(self.image_num + 1, self.count + 1):
+ self.wait()
+ page = self.request(api_url, method="POST", json=request).json()
+ imgkey = nextkey
+ nextkey, pos = text.extract(page["i3"], "'", "'")
+ imgurl , pos = text.extract(page["i3"], 'id="img" src="', '"', pos)
+ origurl, pos = text.extract(page["i7"], '<a href="', '"')
+
+ if self.original and origurl:
+ url = text.unescape(origurl)
+ data = self._parse_original_info(
+ text.extract(page["i7"], "ownload original", "<", pos)[0])
+ else:
+ url = imgurl
+ data = self._parse_image_info(url)
+
+ data["num"] = request["page"]
+ data["image_token"] = imgkey
+ yield url, text.nameext_from_url(imgurl, data)
+
+ request["imgkey"] = nextkey
+
+ def _gallery_page(self):
+ url = "{}/g/{}/{}/".format(
+ self.root, self.gallery_id, self.gallery_token)
+ response = self.request(url, expect=range(400, 500))
+ page = response.text
+
+ if response.status_code == 404 and "Gallery Not Available" in page:
+ raise exception.AuthorizationError()
+ if page.startswith(("Key missing", "Gallery not found")):
+ raise exception.NotFoundError("gallery")
+ return page
+
+ def _image_page(self):
+ url = "{}/s/{}/{}-{}".format(
+ self.root, self.image_token, self.gallery_id, self.image_num)
+ page = self.request(url, expect=range(400, 500)).text
+
+ if page.startswith(("Invalid page", "Keep trying")):
+ raise exception.NotFoundError("image page")
+ return page
+
+ def _check_limits(self, data):
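+        # (re)query the current image limit every 20 images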
+ if not self._remaining or data["num"] % 20 == 0:
+ self._update_limits()
+ self._remaining -= data["cost"]
+
+ if self._remaining <= 0:
+ url = "{}/s/{}/{}-{}".format(
+ self.root, data["image_token"], self.gallery_id, data["num"])
+ self.log.error(
+ "Image limit reached! Reset it and continue with "
+ "'%s' as URL.", url)
+ raise exception.StopExtraction()
+
+ def _update_limits(self):
+ url = "https://e-hentai.org/home.php"
+ cookies = {
+ cookie.name: cookie.value
+ for cookie in self.session.cookies
+ if cookie.domain == self.cookiedomain and cookie.name != "igneous"
+ }
+
+ page = self.request(url, cookies=cookies).text
+ current, pos = text.extract(page, "<strong>", "</strong>")
+ maximum, pos = text.extract(page, "<strong>", "</strong>", pos)
+ self._remaining = text.parse_int(maximum) - text.parse_int(current)
+
+ @staticmethod
+ def _parse_image_info(url):
+ parts = url.split("/")[4].split("-")
+ return {
+ "width": text.parse_int(parts[2]),
+ "height": text.parse_int(parts[3]),
+ "size": text.parse_int(parts[1]),
+ "cost": 1,
+ }
+
+ @staticmethod
+ def _parse_original_info(info):
+ parts = info.lstrip().split(" ")
+ size = text.parse_bytes(parts[3] + parts[4][0])
+ return {
+ "width": text.parse_int(parts[0]),
+ "height": text.parse_int(parts[2]),
+ "size": size,
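+            # an original download costs 1 point plus ~5 points per MiB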
+ "cost": 1 + math.ceil(size * 5 / 1024 / 1024)
+ }
+
+
+class ExhentaiSearchExtractor(ExhentaiExtractor):
+ """Extractor for exhentai search results"""
+ subcategory = "search"
+ pattern = BASE_PATTERN + r"/?\?(.*)$"
+ test = (
+ ("https://exhentai.org/?f_search=touhou"),
+ (("https://exhentai.org/?f_doujinshi=0&f_manga=0&f_artistcg=0"
+ "&f_gamecg=0&f_western=0&f_non-h=1&f_imageset=0&f_cosplay=0"
+ "&f_asianporn=0&f_misc=0&f_search=touhou&f_apply=Apply+Filter"), {
+ "pattern": ExhentaiGalleryExtractor.pattern,
+ "range": "1-30",
+ "count": 30,
+ }),
+ )
+
+ def __init__(self, match):
+ ExhentaiExtractor.__init__(self, match)
+ self.params = text.parse_query(match.group(2))
+ self.params["page"] = text.parse_int(self.params.get("page"))
+ self.search_url = self.root
+
+ def items(self):
+ self.login()
+ yield Message.Version, 1
+
+ while True:
+ last = None
+ page = self.request(self.search_url, params=self.params).text
+
+ for gallery in ExhentaiGalleryExtractor.pattern.finditer(page):
+ url = gallery.group(0)
+ if url == last:
+ continue
+ last = url
+ yield Message.Queue, url, {}
+
+ if 'class="ptdd">&gt;<' in page or ">No hits found</p>" in page:
+ return
+ self.params["page"] += 1
+ self.wait()
+
+
+class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor):
+ """Extractor for favorited exhentai galleries"""
+ subcategory = "favorite"
+ pattern = BASE_PATTERN + r"/favorites\.php(?:\?(.*))?"
+ test = (
+ ("https://exhentai.org/favorites.php"),
+ ("https://exhentai.org/favorites.php?favcat=1&f_search=touhou"
+ "&f_apply=Search+Favorites"),
+ )
+
+ def __init__(self, match):
+ ExhentaiSearchExtractor.__init__(self, match)
+ self.search_url = self.root + "/favorites.php"
diff --git a/gallery_dl/extractor/fallenangels.py b/gallery_dl/extractor/fallenangels.py
new file mode 100644
index 0000000..a2d8c04
--- /dev/null
+++ b/gallery_dl/extractor/fallenangels.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters from https://www.fascans.com/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text, util
+import json
+
+
+class FallenangelsChapterExtractor(ChapterExtractor):
+ """Extractor for manga-chapters from fascans.com"""
+ category = "fallenangels"
+ pattern = (r"(?:https?://)?(manga|truyen)\.fascans\.com"
+ r"/manga/([^/]+)/(\d+)(\.[^/?&#]+)?")
+ test = (
+ ("https://manga.fascans.com/manga/chronos-ruler/20/1", {
+ "url": "4604a7914566cc2da0ff789aa178e2d1c8c241e3",
+ "keyword": "2dfcc50020e32cd207be88e2a8fac0933e36bdfb",
+ }),
+ ("http://truyen.fascans.com/manga/hungry-marie/8", {
+ "url": "1f923d9cb337d5e7bbf4323719881794a951c6ae",
+ "keyword": "2bdb7334c0e3eceb9946ffd3132df679b4a94f6a",
+ }),
+ ("http://manga.fascans.com/manga/rakudai-kishi-no-eiyuutan/19.5", {
+ "keyword": "9fcca4c1a90d11f00764f62477ebe10bd408021c",
+ }),
+ )
+
+ def __init__(self, match):
+ self.version, self.manga, self.chapter, self.minor = match.groups()
+ url = "https://{}.fascans.com/manga/{}/{}/1".format(
+ self.version, self.manga, self.chapter)
+ ChapterExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+ lang = "vi" if self.version == "truyen" else "en"
+ return {
+ "manga" : extr('name="description" content="', ' Chapter '),
+ "title" : extr(': ', ' - Page 1'),
+ "chapter" : self.chapter,
+ "chapter_minor": self.minor or "",
+ "lang" : lang,
+ "language": util.code_to_language(lang),
+ }
+
+ @staticmethod
+ def images(page):
+ return [
+ (img["page_image"], None)
+ for img in json.loads(
+ text.extract(page, "var pages = ", ";")[0]
+ )
+ ]
+
+
+class FallenangelsMangaExtractor(MangaExtractor):
+ """Extractor for manga from fascans.com"""
+ chapterclass = FallenangelsChapterExtractor
+ category = "fallenangels"
+ pattern = r"(?:https?://)?((manga|truyen)\.fascans\.com/manga/[^/]+)/?$"
+ test = (
+ ("http://manga.fascans.com/manga/trinity-seven", {
+ "url": "293057f264de6c438b979bd1c3de4719568db452",
+ "keyword": "50e0374dba60734230e4284b5ffdadef5104ae62",
+ }),
+ ("https://truyen.fascans.com/manga/rakudai-kishi-no-eiyuutan", {
+ "url": "51a731a6b82d5eb7a335fbae6b02d06aeb2ab07b",
+ "keyword": "2d2a2a5d9ea5925eb9a47bb13d848967f3af086c",
+ }),
+ )
+
+ def __init__(self, match):
+ url = "https://" + match.group(1)
+ self.lang = "vi" if match.group(2) == "truyen" else "en"
+ MangaExtractor.__init__(self, match, url)
+
+ def chapters(self, page):
+ extr = text.extract_from(page)
+ results = []
+ language = util.code_to_language(self.lang)
+ while extr('<li style="', '"'):
+ vol = extr('class="volume-', '"')
+ url = extr('href="', '"')
+ cha = extr('>', '<')
+ title = extr('<em>', '</em>')
+
+ manga, _, chapter = cha.rpartition(" ")
+ chapter, dot, minor = chapter.partition(".")
+ results.append((url, {
+ "manga" : manga,
+ "title" : text.unescape(title),
+ "volume" : text.parse_int(vol),
+ "chapter" : text.parse_int(chapter),
+ "chapter_minor": dot + minor,
+ "lang" : self.lang,
+ "language": language,
+ }))
+ return results
diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py
new file mode 100644
index 0000000..d941d76
--- /dev/null
+++ b/gallery_dl/extractor/flickr.py
@@ -0,0 +1,503 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.flickr.com/"""
+
+from .common import Extractor, Message
+from .. import text, oauth, util, exception
+
+
+class FlickrExtractor(Extractor):
+ """Base class for flickr extractors"""
+ category = "flickr"
+ filename_fmt = "{category}_{id}.{extension}"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.api = FlickrAPI(self)
+ self.item_id = match.group(1)
+ self.user = None
+
+ def items(self):
+ data = self.metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for photo in self.photos():
+ photo.update(data)
+ url = photo["url"]
+ yield Message.Url, url, text.nameext_from_url(url, photo)
+
+ def metadata(self):
+ """Return general metadata"""
+ self.user = self.api.urls_lookupUser(self.item_id)
+ return {"user": self.user}
+
+ def photos(self):
+ """Return an iterable with all relevant photo objects"""
+
+
+class FlickrImageExtractor(FlickrExtractor):
+ """Extractor for individual images from flickr.com"""
+ subcategory = "image"
+ archive_fmt = "{id}"
+ pattern = (r"(?:https?://)?(?:"
+ r"(?:(?:www\.|m\.)?flickr\.com/photos/[^/]+/"
+ r"|[^.]+\.static\.?flickr\.com/(?:\d+/)+)(\d+)"
+ r"|flic\.kr/p/([A-Za-z1-9]+))")
+ test = (
+ ("https://www.flickr.com/photos/departingyyz/16089302239", {
+ "pattern": pattern,
+ "content": "0821a28ee46386e85b02b67cf2720063440a228c",
+ "keyword": {
+ "comments": int,
+ "description": str,
+ "extension": "jpg",
+ "filename": "16089302239_de18cd8017_b",
+ "id": 16089302239,
+ "height": 683,
+ "label": "Large",
+ "media": "photo",
+ "url": str,
+ "views": int,
+ "width": 1024,
+ },
+ }),
+ ("https://www.flickr.com/photos/145617051@N08/46733161535", {
+ "count": 1,
+ "keyword": {"media": "video"},
+ }),
+ ("http://c2.staticflickr.com/2/1475/24531000464_9a7503ae68_b.jpg", {
+ "pattern": pattern}),
+ ("https://farm2.static.flickr.com/1035/1188352415_cb139831d0.jpg", {
+ "pattern": pattern}),
+ ("https://flic.kr/p/FPVo9U", {
+ "pattern": pattern}),
+ ("https://www.flickr.com/photos/zzz/16089302238", {
+ "exception": exception.NotFoundError}),
+ )
+
+ def __init__(self, match):
+ FlickrExtractor.__init__(self, match)
+ if not self.item_id:
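+            # flic.kr short URLs encode the photo ID in base58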
+ alphabet = ("123456789abcdefghijkmnopqrstu"
+ "vwxyzABCDEFGHJKLMNPQRSTUVWXYZ")
+ self.item_id = util.bdecode(match.group(2), alphabet)
+
+ def items(self):
+ photo = self.api.photos_getInfo(self.item_id)
+
+ if photo["media"] == "video" and self.api.videos:
+ self.api._extract_video(photo)
+ else:
+ self.api._extract_photo(photo)
+
+ photo["title"] = photo["title"]["_content"]
+ photo["comments"] = text.parse_int(photo["comments"]["_content"])
+ photo["description"] = photo["description"]["_content"]
+ photo["tags"] = [t["raw"] for t in photo["tags"]["tag"]]
+ photo["date"] = text.parse_timestamp(photo["dateuploaded"])
+ photo["views"] = text.parse_int(photo["views"])
+ photo["id"] = text.parse_int(photo["id"])
+
+ if "location" in photo:
+ location = photo["location"]
+ for key, value in location.items():
+ if isinstance(value, dict):
+ location[key] = value["_content"]
+
+ url = photo["url"]
+ yield Message.Version, 1
+ yield Message.Directory, photo
+ yield Message.Url, url, text.nameext_from_url(url, photo)
+
+
+class FlickrAlbumExtractor(FlickrExtractor):
+ """Extractor for photo albums from flickr.com"""
+ subcategory = "album"
+ directory_fmt = ("{category}", "{subcategory}s",
+ "{album[id]} - {album[title]}")
+ archive_fmt = "a_{album[id]}_{id}"
+ pattern = (r"(?:https?://)?(?:www\.)?flickr\.com/"
+ r"photos/([^/]+)/(?:album|set)s(?:/(\d+))?")
+ test = (
+ (("https://www.flickr.com/photos/shona_s/albums/72157633471741607"), {
+ "pattern": FlickrImageExtractor.pattern,
+ "count": 6,
+ }),
+ ("https://www.flickr.com/photos/shona_s/albums", {
+ "pattern": pattern,
+ "count": 2,
+ }),
+ )
+
+ def __init__(self, match):
+ FlickrExtractor.__init__(self, match)
+ self.album_id = match.group(2)
+
+ def items(self):
+ if self.album_id:
+ return FlickrExtractor.items(self)
+ return self._album_items()
+
+ def _album_items(self):
+ yield Message.Version, 1
+ data = FlickrExtractor.metadata(self)
+ data["_extractor"] = FlickrAlbumExtractor
+
+ for album in self.api.photosets_getList(self.user["nsid"]):
+ self.api._clean_info(album).update(data)
+ url = "https://www.flickr.com/photos/{}/albums/{}".format(
+ self.user["path_alias"], album["id"])
+ yield Message.Queue, url, album
+
+ def metadata(self):
+ data = FlickrExtractor.metadata(self)
+ data["album"] = self.api.photosets_getInfo(
+ self.album_id, self.user["nsid"])
+ return data
+
+ def photos(self):
+ return self.api.photosets_getPhotos(self.album_id)
+
+
+class FlickrGalleryExtractor(FlickrExtractor):
+ """Extractor for photo galleries from flickr.com"""
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "galleries",
+ "{user[username]} {gallery[id]}")
+ archive_fmt = "g_{gallery[id]}_{id}"
+ pattern = (r"(?:https?://)?(?:www\.)?flickr\.com/"
+ r"photos/([^/]+)/galleries/(\d+)")
+ test = (("https://www.flickr.com/photos/flickr/"
+ "galleries/72157681572514792/"), {
+ "pattern": FlickrImageExtractor.pattern,
+ "count": ">= 10",
+ })
+
+ def __init__(self, match):
+ FlickrExtractor.__init__(self, match)
+ self.gallery_id = match.group(2)
+
+ def metadata(self):
+ data = FlickrExtractor.metadata(self)
+ data["gallery"] = self.api.galleries_getInfo(self.gallery_id)
+ return data
+
+ def photos(self):
+ return self.api.galleries_getPhotos(self.gallery_id)
+
+
+class FlickrGroupExtractor(FlickrExtractor):
+ """Extractor for group pools from flickr.com"""
+ subcategory = "group"
+ directory_fmt = ("{category}", "{subcategory}s", "{group[groupname]}")
+ archive_fmt = "G_{group[nsid]}_{id}"
+ pattern = r"(?:https?://)?(?:www\.)?flickr\.com/groups/([^/]+)"
+ test = ("https://www.flickr.com/groups/bird_headshots/", {
+ "pattern": FlickrImageExtractor.pattern,
+ "count": "> 150",
+ })
+
+ def metadata(self):
+ self.group = self.api.urls_lookupGroup(self.item_id)
+ return {"group": self.group}
+
+ def photos(self):
+ return self.api.groups_pools_getPhotos(self.group["nsid"])
+
+
+class FlickrUserExtractor(FlickrExtractor):
+ """Extractor for the photostream of a flickr user"""
+ subcategory = "user"
+ directory_fmt = ("{category}", "{user[username]}")
+ archive_fmt = "u_{user[nsid]}_{id}"
+ pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/?$"
+ test = ("https://www.flickr.com/photos/shona_s/", {
+ "pattern": FlickrImageExtractor.pattern,
+ "count": 28,
+ })
+
+ def photos(self):
+ return self.api.people_getPhotos(self.user["nsid"])
+
+
+class FlickrFavoriteExtractor(FlickrExtractor):
+ """Extractor for favorite photos of a flickr user"""
+ subcategory = "favorite"
+ directory_fmt = ("{category}", "{subcategory}s", "{user[username]}")
+ archive_fmt = "f_{user[nsid]}_{id}"
+ pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/favorites"
+ test = ("https://www.flickr.com/photos/shona_s/favorites", {
+ "pattern": FlickrImageExtractor.pattern,
+ "count": 4,
+ })
+
+ def photos(self):
+ return self.api.favorites_getList(self.user["nsid"])
+
+
+class FlickrSearchExtractor(FlickrExtractor):
+ """Extractor for flickr photos based on search results"""
+ subcategory = "search"
+ directory_fmt = ("{category}", "{subcategory}", "{search[text]}")
+ archive_fmt = "s_{search}_{id}"
+ pattern = r"(?:https?://)?(?:www\.)?flickr\.com/search/?\?([^#]+)"
+ test = (
+ ("https://flickr.com/search/?text=mountain"),
+ ("https://flickr.com/search/?text=tree%20cloud%20house"
+ "&color_codes=4&styles=minimalism"),
+ )
+
+ def __init__(self, match):
+ FlickrExtractor.__init__(self, match)
+ self.search = text.parse_query(match.group(1))
+ if "text" not in self.search:
+ self.search["text"] = ""
+
+ def metadata(self):
+ return {"search": self.search}
+
+ def photos(self):
+ return self.api.photos_search(self.search)
+
+
+class FlickrAPI(oauth.OAuth1API):
+ """Minimal interface for the flickr API"""
+ API_URL = "https://api.flickr.com/services/rest/"
+ API_KEY = "ac4fd7aa98585b9eee1ba761c209de68"
+ API_SECRET = "3adb0f568dc68393"
+ FORMATS = [
+ ("o", "Original" , None),
+ ("k", "Large 2048" , 2048),
+ ("h", "Large 1600" , 1600),
+ ("l", "Large" , 1024),
+ ("c", "Medium 800" , 800),
+ ("z", "Medium 640" , 640),
+ ("m", "Medium" , 500),
+ ("n", "Small 320" , 320),
+ ("s", "Small" , 240),
+ ("q", "Large Square", 150),
+ ("t", "Thumbnail" , 100),
+        ("sq", "Square"     , 75),
+ ]
+ VIDEO_FORMATS = {
+ "orig" : 9,
+ "1080p" : 8,
+ "720p" : 7,
+ "360p" : 6,
+ "288p" : 5,
+ "700" : 4,
+ "300" : 3,
+ "100" : 2,
+ "appletv" : 1,
+ "iphone_wifi": 0,
+ }
+
+ def __init__(self, extractor):
+ oauth.OAuth1API.__init__(self, extractor)
+
+ self.videos = extractor.config("videos", True)
+ self.maxsize = extractor.config("size-max")
+ if isinstance(self.maxsize, str):
+ for fmt, fmtname, fmtwidth in self.FORMATS:
+ if self.maxsize == fmt or self.maxsize == fmtname:
+ self.maxsize = fmtwidth
+ break
+            else:
+                extractor.log.warning(
+                    "Could not match '%s' to any format", self.maxsize)
+                self.maxsize = None
+ if self.maxsize:
+ self.formats = [fmt for fmt in self.FORMATS
+ if not fmt[2] or fmt[2] <= self.maxsize]
+ else:
+ self.formats = self.FORMATS
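+        # request only the 4 largest of these formats from the API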
+ self.formats = self.formats[:4]
+
+ def favorites_getList(self, user_id):
+ """Returns a list of the user's favorite photos."""
+ params = {"user_id": user_id}
+ return self._pagination("favorites.getList", params)
+
+ def galleries_getInfo(self, gallery_id):
+ """Gets information about a gallery."""
+ params = {"gallery_id": gallery_id}
+ gallery = self._call("galleries.getInfo", params)["gallery"]
+ return self._clean_info(gallery)
+
+ def galleries_getPhotos(self, gallery_id):
+ """Return the list of photos for a gallery."""
+ params = {"gallery_id": gallery_id}
+ return self._pagination("galleries.getPhotos", params)
+
+ def groups_pools_getPhotos(self, group_id):
+ """Returns a list of pool photos for a given group."""
+ params = {"group_id": group_id}
+ return self._pagination("groups.pools.getPhotos", params)
+
+ def people_getPhotos(self, user_id):
+ """Return photos from the given user's photostream."""
+ params = {"user_id": user_id}
+ return self._pagination("people.getPhotos", params)
+
+ def photos_getInfo(self, photo_id):
+ """Get information about a photo."""
+ params = {"photo_id": photo_id}
+ return self._call("photos.getInfo", params)["photo"]
+
+ def photos_getSizes(self, photo_id):
+ """Returns the available sizes for a photo."""
+ params = {"photo_id": photo_id}
+ sizes = self._call("photos.getSizes", params)["sizes"]["size"]
+ if self.maxsize:
+ for index, size in enumerate(sizes):
+ if index > 0 and (int(size["width"]) > self.maxsize or
+ int(size["height"]) > self.maxsize):
+ del sizes[index:]
+ break
+ return sizes
+
+ def photos_search(self, params):
+ """Return a list of photos matching some criteria."""
+ return self._pagination("photos.search", params.copy())
+
+ def photosets_getInfo(self, photoset_id, user_id):
+ """Gets information about a photoset."""
+ params = {"photoset_id": photoset_id, "user_id": user_id}
+ photoset = self._call("photosets.getInfo", params)["photoset"]
+ return self._clean_info(photoset)
+
+ def photosets_getList(self, user_id):
+ """Returns the photosets belonging to the specified user."""
+ params = {"user_id": user_id}
+ return self._pagination_sets("photosets.getList", params)
+
+ def photosets_getPhotos(self, photoset_id):
+ """Get the list of photos in a set."""
+ params = {"photoset_id": photoset_id}
+ return self._pagination("photosets.getPhotos", params, "photoset")
+
+ def urls_lookupGroup(self, groupname):
+ """Returns a group NSID, given the url to a group's page."""
+ params = {"url": "https://www.flickr.com/groups/" + groupname}
+ group = self._call("urls.lookupGroup", params)["group"]
+ return {"nsid": group["id"],
+ "path_alias": groupname,
+ "groupname": group["groupname"]["_content"]}
+
+ def urls_lookupUser(self, username):
+ """Returns a user NSID, given the url to a user's photos or profile."""
+ params = {"url": "https://www.flickr.com/photos/" + username}
+ user = self._call("urls.lookupUser", params)["user"]
+ return {"nsid": user["id"],
+ "path_alias": username,
+ "username": user["username"]["_content"]}
+
+ def video_getStreamInfo(self, video_id, secret=None):
+ """Returns all available video streams"""
+ params = {"photo_id": video_id}
+ if not secret:
+ secret = self._call("photos.getInfo", params)["photo"]["secret"]
+ params["secret"] = secret
+ stream = self._call("video.getStreamInfo", params)["streams"]["stream"]
+ return max(stream, key=lambda s: self.VIDEO_FORMATS.get(s["type"], 0))
+
+ def _call(self, method, params):
+ params["method"] = "flickr." + method
+ params["format"] = "json"
+ params["nojsoncallback"] = "1"
+ if self.api_key:
+ params["api_key"] = self.api_key
+ data = self.request(self.API_URL, params=params).json()
+ if "code" in data:
+ if data["code"] == 1:
+ raise exception.NotFoundError(self.extractor.subcategory)
+ elif data["code"] == 98:
+ raise exception.AuthenticationError(data.get("message"))
+ elif data["code"] == 99:
+ raise exception.AuthorizationError()
+ self.log.error("API call failed: %s", data.get("message"))
+ raise exception.StopExtraction()
+ return data
+
+ def _pagination(self, method, params, key="photos"):
+ params["extras"] = "description,date_upload,tags,views,media,"
+ params["extras"] += ",".join("url_" + fmt[0] for fmt in self.formats)
+ params["page"] = 1
+
+ while True:
+ data = self._call(method, params)[key]
+ yield from map(self._extract_format, data["photo"])
+ if params["page"] >= data["pages"]:
+ return
+ params["page"] += 1
+
+ def _pagination_sets(self, method, params):
+ params["page"] = 1
+
+ while True:
+ data = self._call(method, params)["photosets"]
+ yield from data["photoset"]
+ if params["page"] >= data["pages"]:
+ return
+ params["page"] += 1
+
+ def _extract_format(self, photo):
+ photo["description"] = photo["description"]["_content"].strip()
+ photo["views"] = text.parse_int(photo["views"])
+ photo["date"] = text.parse_timestamp(photo["dateupload"])
+ photo["tags"] = photo["tags"].split()
+ photo["id"] = text.parse_int(photo["id"])
+
+ if photo["media"] == "video" and self.videos:
+ return self._extract_video(photo)
+
+ for fmt, fmtname, fmtwidth in self.formats:
+ key = "url_" + fmt
+ if key in photo:
+ photo["width"] = text.parse_int(photo["width_" + fmt])
+ photo["height"] = text.parse_int(photo["height_" + fmt])
+ if self.maxsize and (photo["width"] > self.maxsize or
+ photo["height"] > self.maxsize):
+ continue
+ photo["url"] = photo[key]
+ photo["label"] = fmtname
+
+ # remove excess data
+ keys = [
+ key for key in photo
+ if key.startswith(("url_", "width_", "height_"))
+ ]
+ for key in keys:
+ del photo[key]
+ break
+ else:
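+            # no matching 'url_<fmt>' entry; fall back to photos.getSizes()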
+ self._extract_photo(photo)
+
+ return photo
+
+ def _extract_photo(self, photo):
+ size = self.photos_getSizes(photo["id"])[-1]
+ photo["url"] = size["source"]
+ photo["label"] = size["label"]
+ photo["width"] = text.parse_int(size["width"])
+ photo["height"] = text.parse_int(size["height"])
+ return photo
+
+ def _extract_video(self, photo):
+ stream = self.video_getStreamInfo(photo["id"], photo.get("secret"))
+ photo["url"] = stream["_content"]
+ photo["label"] = stream["type"]
+ photo["width"] = photo["height"] = 0
+ return photo
+
+ @staticmethod
+ def _clean_info(info):
+ info["title"] = info["title"]["_content"]
+ info["description"] = info["description"]["_content"]
+ return info
diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py
new file mode 100644
index 0000000..5f4c5b8
--- /dev/null
+++ b/gallery_dl/extractor/foolfuuka.py
@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for 4chan archives based on FoolFuuka"""
+
+from .common import Extractor, Message, SharedConfigMixin, generate_extractors
+from .. import text
+import itertools
+import operator
+
+
+class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor):
+ """Base extractor for FoolFuuka based boards/archives"""
+ basecategory = "foolfuuka"
+ subcategory = "thread"
+ directory_fmt = ("{category}", "{board[shortname]}",
+ "{thread_num}{title:? - //}")
+ filename_fmt = "{media[media]}"
+ archive_fmt = "{board[shortname]}_{num}_{timestamp}"
+ pattern_fmt = r"/([^/]+)/thread/(\d+)"
+ external = "default"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.board, self.thread = match.groups()
+ self.session.headers["Referer"] = self.root
+ if self.external == "direct":
+ self.remote = self._remote_direct
+
+ def items(self):
+ op = True
+ yield Message.Version, 1
+ for post in self.posts():
+ if op:
+ yield Message.Directory, post
+ op = False
+ if not post["media"]:
+ continue
+
+ media = post["media"]
+ url = media["media_link"]
+
+ if not url and "remote_media_link" in media:
+ url = self.remote(media)
+ if url.startswith("/"):
+ url = self.root + url
+
+ post["extension"] = url.rpartition(".")[2]
+ yield Message.Url, url, post
+
+ def posts(self):
+ """Return an iterable with all posts in this thread"""
+ url = self.root + "/_/api/chan/thread/"
+ params = {"board": self.board, "num": self.thread}
+ data = self.request(url, params=params).json()[self.thread]
+
+ # sort post-objects by key
+ posts = sorted(data.get("posts", {}).items())
+ posts = map(operator.itemgetter(1), posts)
+
+ return itertools.chain((data["op"],), posts)
+
+ def remote(self, media):
+ """Resolve a remote media link"""
+ needle = '<meta http-equiv="Refresh" content="0; url='
+ page = self.request(media["remote_media_link"]).text
+ return text.extract(page, needle, '"')[0]
+
+ @staticmethod
+ def _remote_direct(media):
+ return media["remote_media_link"]
+
+
+EXTRACTORS = {
+ "4plebs": {
+ "name": "fourplebs",
+ "root": "https://archive.4plebs.org",
+ "pattern": r"(?:archive\.)?4plebs\.org",
+ "test-thread": ("https://archive.4plebs.org/tg/thread/54059290", {
+ "url": "07452944164b602502b02b24521f8cee5c484d2a",
+ }),
+ },
+ "archivedmoe": {
+ "root": "https://archived.moe",
+ "test-thread": (
+ ("https://archived.moe/gd/thread/309639/", {
+ "url": "fdd533840e2d535abd162c02d6dfadbc12e2dcd8",
+ "content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573",
+ }),
+ ("https://archived.moe/a/thread/159767162/", {
+ "url": "ffec05a1a1b906b5ca85992513671c9155ee9e87",
+ }),
+ ),
+ },
+ "archiveofsins": {
+ "root": "https://archiveofsins.com",
+ "pattern": r"(?:www\.)?archiveofsins\.com",
+ "test-thread": ("https://archiveofsins.com/h/thread/4668813/", {
+ "url": "f612d287087e10a228ef69517cf811539db9a102",
+ "content": "0dd92d0d8a7bf6e2f7d1f5ac8954c1bcf18c22a4",
+ }),
+ },
+ "b4k": {
+ "root": "https://arch.b4k.co",
+ "extra": {"external": "direct"},
+ "test-thread": ("https://arch.b4k.co/meta/thread/196/", {
+ "url": "9b0ae01292133268fe9178b71332da1ee25b7704",
+ }),
+ },
+ "desuarchive": {
+ "root": "https://desuarchive.org",
+ "test-thread": ("https://desuarchive.org/a/thread/159542679/", {
+ "url": "3ae1473f6916ac831efe5cc4d4e7d3298ce79406",
+ }),
+ },
+ "fireden": {
+ "root": "https://boards.fireden.net",
+ "test-thread": ("https://boards.fireden.net/a/thread/159803223/", {
+ "url": "01b7baacfb0656a68e566368290e3072b27f86c9",
+ }),
+ },
+ "nyafuu": {
+ "root": "https://archive.nyafuu.org",
+ "pattern": r"(?:archive\.)?nyafuu\.org",
+ "test-thread": ("https://archive.nyafuu.org/c/thread/2849220/", {
+ "url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f",
+ }),
+ },
+ "rbt": {
+ "root": "https://rbt.asia",
+ "pattern": r"(?:rbt\.asia|(?:archive\.)?rebeccablacktech\.com)",
+ "test-thread": (
+ ("https://rbt.asia/g/thread/61487650/", {
+ "url": "61896d9d9a2edb556b619000a308a984307b6d30",
+ }),
+ ("https://archive.rebeccablacktech.com/g/thread/61487650/", {
+ "url": "61896d9d9a2edb556b619000a308a984307b6d30",
+ }),
+ ),
+ },
+ "thebarchive": {
+ "root": "https://thebarchive.com",
+ "pattern": r"thebarchive\.com",
+ "test-thread": ("https://thebarchive.com/b/thread/739772332/", {
+ "url": "e8b18001307d130d67db31740ce57c8561b5d80c",
+ }),
+ },
+}
+
+generate_extractors(EXTRACTORS, globals(), (
+ FoolfuukaThreadExtractor,
+))
diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py
new file mode 100644
index 0000000..14baa36
--- /dev/null
+++ b/gallery_dl/extractor/foolslide.py
@@ -0,0 +1,240 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for FoOlSlide based sites"""
+
+from .common import (
+ Extractor,
+ ChapterExtractor,
+ MangaExtractor,
+ SharedConfigMixin,
+ Message,
+ generate_extractors,
+)
+from .. import text, util
+import base64
+import json
+
+
+class FoolslideBase(SharedConfigMixin):
+ """Base class for FoOlSlide extractors"""
+ basecategory = "foolslide"
+
+ def request(self, url):
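+        # send 'adult=true' to get past the adult-content prompt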
+ return Extractor.request(
+ self, url, encoding="utf-8", method="POST", data={"adult": "true"})
+
+ @staticmethod
+ def parse_chapter_url(url, data):
+ info = url.partition("/read/")[2].rstrip("/").split("/")
+ lang = info[1].partition("-")[0]
+ data["lang"] = lang
+ data["language"] = util.code_to_language(lang)
+ data["volume"] = text.parse_int(info[2])
+ data["chapter"] = text.parse_int(info[3])
+ data["chapter_minor"] = "." + info[4] if len(info) >= 5 else ""
+ data["title"] = data["chapter_string"].partition(":")[2].strip()
+ return data
+
+
+class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor):
+ """Base class for chapter extractors for FoOlSlide based sites"""
+ directory_fmt = (
+ "{category}", "{manga}", "{chapter_string}")
+ archive_fmt = "{id}"
+ pattern_fmt = r"(/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)"
+ decode = "default"
+
+ def items(self):
+ page = self.request(self.chapter_url).text
+ data = self.metadata(page)
+ imgs = self.images(page)
+
+ data["count"] = len(imgs)
+ data["chapter_id"] = text.parse_int(imgs[0]["chapter_id"])
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for data["page"], image in enumerate(imgs, 1):
+ try:
+ url = image["url"]
+ del image["url"]
+ del image["chapter_id"]
+ del image["thumb_url"]
+ except KeyError:
+ pass
+ for key in ("height", "id", "size", "width"):
+ image[key] = text.parse_int(image[key])
+ data.update(image)
+ text.nameext_from_url(data["filename"], data)
+ yield Message.Url, url, data
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+ extr('<h1 class="tbtitle dnone">', '')
+ return self.parse_chapter_url(self.chapter_url, {
+ "manga" : text.unescape(extr('title="', '"')).strip(),
+ "chapter_string": text.unescape(extr('title="', '"')),
+ })
+
+ def images(self, page):
+ if self.decode == "base64":
+ base64_data = text.extract(page, 'atob("', '"')[0].encode()
+ data = base64.b64decode(base64_data).decode()
+ elif self.decode == "double":
+ pos = page.find("[{")
+ data = text.extract(page, " = ", ";", pos)[0]
+ else:
+ data = text.extract(page, "var pages = ", ";")[0]
+ return json.loads(data)
+
+
+class FoolslideMangaExtractor(FoolslideBase, MangaExtractor):
+ """Base class for manga extractors for FoOlSlide based sites"""
+ pattern_fmt = r"(/series/[^/?&#]+)"
+
+ def chapters(self, page):
+ extr = text.extract_from(page)
+ manga = text.unescape(extr('<h1 class="title">', '</h1>')).strip()
+ author = extr('<b>Author</b>: ', '<br')
+ artist = extr('<b>Artist</b>: ', '<br')
+
+ results = []
+ while True:
+ url = extr('<div class="title"><a href="', '"')
+ if not url:
+ return results
+ results.append((url, self.parse_chapter_url(url, {
+ "manga": manga, "author": author, "artist": artist,
+ "chapter_string": extr('title="', '"'),
+ "group" : extr('title="', '"'),
+ })))
+
+
+EXTRACTORS = {
+ "dokireader": {
+ "root": "https://kobato.hologfx.com/reader",
+ "test-chapter":
+ (("https://kobato.hologfx.com/reader/read/"
+ "hitoribocchi_no_oo_seikatsu/en/3/34"), {
+ "keyword": "6e719ac86f0c6dab89390dd7e507e678459e0dbc",
+ }),
+ "test-manga":
+ (("https://kobato.hologfx.com/reader/series/"
+ "boku_ha_ohimesama_ni_narenai/"), {
+ "url": "1c1f5a7258ce4f631f5fc32be548d78a6a57990d",
+ "keyword": "614d89a6045b85c822cbd3e67578ea7577dfc995",
+ }),
+ },
+ "jaiminisbox": {
+ "root": "https://jaiminisbox.com/reader",
+ "pattern": r"(?:www\.)?jaiminisbox\.com/reader",
+ "extra": {"decode": "base64"},
+ "test-chapter": (
+ ("https://jaiminisbox.com/reader/read/uratarou/en/0/1/", {
+ "keyword": "6009af77cc9c05528ab1fdda47b1ad9d4811c673",
+ }),
+ ("https://jaiminisbox.com/reader/read/dr-stone/en/0/16/", {
+ "keyword": "8607375c24b1d0db7f52d059ef5baff793aa458e",
+ }),
+ ),
+ "test-manga":
+ ("https://jaiminisbox.com/reader/series/sora_no_kian/", {
+ "url": "66612be177dc3b3fa1d1f537ef02f4f701b163ea",
+ "keyword": "0908a4145bb03acc4210f5d01169988969f5acd1",
+ }),
+ },
+ "kireicake": {
+ "root": "https://reader.kireicake.com",
+ "test-chapter":
+ ("https://reader.kireicake.com/read/wonderland/en/1/1/", {
+ "url": "b2d36bc0bc67e4c461c3a4d6444a2fd339f5d07e",
+ "keyword": "9f80947920a325e33aea7f5cd69ea669171903b6",
+ }),
+ "test-manga":
+ ("https://reader.kireicake.com/series/wonderland/", {
+ "url": "d067b649af1cc88fa8c8b698fde04a10909fd169",
+ "keyword": "268f43772fb239888ca5c5f6a4f65f99ffb3eefb",
+ }),
+ },
+ "powermanga": {
+ "root": "https://read.powermanga.org",
+ "pattern": r"read(?:er)?\.powermanga\.org",
+ "test-chapter":
+ (("https://read.powermanga.org"
+ "/read/one_piece_digital_colour_comics/en/0/75/"), {
+ "url": "854c5817f8f767e1bccd05fa9d58ffb5a4b09384",
+ "keyword": "a60c42f2634b7387899299d411ff494ed0ad6dbe",
+ }),
+ "test-manga":
+ (("https://read.powermanga.org"
+ "/series/one_piece_digital_colour_comics/"), {
+ "count": ">= 1",
+ "keyword": {
+ "chapter": int,
+ "chapter_minor": str,
+ "chapter_string": str,
+ "group": "PowerManga",
+ "lang": "en",
+ "language": "English",
+ "manga": "One Piece Digital Colour Comics",
+ "title": str,
+ "volume": int,
+ },
+ }),
+ },
+ "sensescans": {
+ "root": "http://sensescans.com/reader",
+ "pattern": r"(?:(?:www\.)?sensescans\.com/reader"
+ r"|reader\.sensescans\.com)",
+ "test-chapter": (
+ (("http://sensescans.com/reader/read/"
+ "magi__labyrinth_of_magic/en/37/369/"), {
+ "url": "a399ef037cdfbc25b09d435cc2ea1e3e454a6812",
+ "keyword": "07acd84fb18a9f1fd6dff5befe711bcca0ff9988",
+ }),
+ (("http://reader.sensescans.com/read/"
+ "magi__labyrinth_of_magic/en/37/369/"), {
+ "url": "a399ef037cdfbc25b09d435cc2ea1e3e454a6812",
+ "keyword": "07acd84fb18a9f1fd6dff5befe711bcca0ff9988",
+ }),
+ ),
+ "test-manga":
+ ("http://sensescans.com/reader/series/hakkenden/", {
+ "url": "2360ccb0ead0ff2f5e27b7aef7eb17b9329de2f2",
+ "keyword": "4919f2bfed38e3a34dc984ec8d1dbd7a03044e23",
+ }),
+ },
+ "worldthree": {
+ "root": "http://www.slide.world-three.org",
+ "pattern": r"(?:www\.)?slide\.world-three\.org",
+ "test-chapter": (
+ (("http://www.slide.world-three.org"
+ "/read/black_bullet/en/2/7/page/1"), {
+ "url": "be2f04f6e2d311b35188094cfd3e768583271584",
+ "keyword": "967d536a65de4d52478d5b666a1760b181eddb6e",
+ }),
+ (("http://www.slide.world-three.org"
+ "/read/idolmster_cg_shuffle/en/0/4/2/"), {
+ "url": "6028ea5ca282744f925dfad92eeb98509f9cc78c",
+ "keyword": "f3cfe2ad3388991f1d045c85d0fa94795a7694dc",
+ }),
+ ),
+ "test-manga":
+ ("http://www.slide.world-three.org/series/black_bullet/", {
+ "url": "5743b93512d26e6b540d90a7a5d69208b6d4a738",
+ "keyword": "3a24f1088b4d7f3b798a96163f21ca251293a120",
+ }),
+ },
+ "_ckey": "chapterclass",
+}
+
+generate_extractors(EXTRACTORS, globals(), (
+ FoolslideChapterExtractor,
+ FoolslideMangaExtractor,
+))
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
new file mode 100644
index 0000000..15bd0a8
--- /dev/null
+++ b/gallery_dl/extractor/gelbooru.py
@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://gelbooru.com/"""
+
+from . import booru
+from .common import Message
+from .. import text, util
+
+
+class GelbooruExtractor(booru.XmlParserMixin,
+ booru.GelbooruPageMixin,
+ booru.BooruExtractor):
+ """Base class for gelbooru extractors"""
+ category = "gelbooru"
+ api_url = "https://gelbooru.com/index.php"
+ post_url = "https://gelbooru.com/index.php?page=post&s=view&id={}"
+ pool_url = "https://gelbooru.com/index.php?page=pool&s=show&id={}"
+
+ def __init__(self, match):
+ super().__init__(match)
+
+ self.use_api = self.config("api", True)
+ if self.use_api:
+ self.params.update({"page": "dapi", "s": "post", "q": "index"})
+ else:
+ self.items = self.items_noapi
+
+ def items_noapi(self):
+ data = self.get_metadata()
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ for post in self.get_posts():
+ post = self.get_post_data(post)
+ url = post["file_url"]
+ post.update(data)
+ yield Message.Url, url, text.nameext_from_url(url, post)
+
+ def get_posts(self):
+ """Return an iterable containing all relevant post objects"""
+
+ def get_post_data(self, post_id):
+ """Extract metadata of a single post"""
+ page = self.request(self.post_url.format(post_id)).text
+ data = text.extract_all(page, (
+ (None , '<meta name="keywords"', ''),
+ ("tags" , ' imageboard, ', '"'),
+ ("id" , '<li>Id: ', '<'),
+ ("created_at", '<li>Posted: ', '<'),
+ ("width" , '<li>Size: ', 'x'),
+ ("height" , '', '<'),
+ ("source" , '<li>Source: <a href="', '"'),
+ ("rating" , '<li>Rating: ', '<'),
+ (None , '<li>Score: ', ''),
+ ("score" , '>', '<'),
+ ("file_url" , '<li><a href="http', '"'),
+ ("change" , ' id="lupdated" value="', '"'),
+ ))[0]
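+        # restore the stripped 'http' prefix and collapse the doubled slash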
+ data["file_url"] = "http" + data["file_url"].replace("m//", "m/", 1)
+ data["md5"] = data["file_url"].rpartition("/")[2].partition(".")[0]
+ data["rating"] = (data["rating"] or "?")[0].lower()
+ data["tags"] = " ".join(
+ [tag.replace(" ", "_") for tag in data["tags"].split(", ")])
+ if self.extags:
+ self.extended_tags(data, page)
+ return data
+
+
+class GelbooruTagExtractor(booru.TagMixin, GelbooruExtractor):
+ """Extractor for images from gelbooru.com based on search-tags"""
+ pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
+ r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")
+ test = (
+ ("https://gelbooru.com/index.php?page=post&s=list&tags=bonocho", {
+ "count": 5,
+ }),
+ ("https://gelbooru.com/index.php?page=post&s=list&tags=bonocho", {
+ "options": (("api", False),),
+ "count": 5,
+ }),
+ )
+
+ def __init__(self, match):
+ super().__init__(match)
+ if not self.use_api:
+ self.per_page = 42
+
+ def get_posts(self):
+ url = "https://gelbooru.com/index.php?page=post&s=list"
+ params = {"tags": self.tags, "pid": self.page_start * self.per_page}
+
+ while True:
+ page = self.request(url, params=params).text
+ ids = list(text.extract_iter(page, '<a id="p', '"'))
+ yield from ids
+ if len(ids) < self.per_page:
+ return
+ params["pid"] += self.per_page
+
+
+class GelbooruPoolExtractor(booru.GelbooruPoolMixin, GelbooruExtractor):
+ """Extractor for image-pools from gelbooru.com"""
+ pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
+ r"\?page=pool&s=show&id=(?P<pool>\d+)")
+ test = ("https://gelbooru.com/index.php?page=pool&s=show&id=761", {
+ "count": 6,
+ })
+
+ def get_posts(self):
+ return util.advance(self.posts, self.page_start)
+
+
+class GelbooruPostExtractor(booru.PostMixin, GelbooruExtractor):
+ """Extractor for single images from gelbooru.com"""
+ pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
+ r"\?page=post&s=view&id=(?P<post>\d+)")
+ test = ("https://gelbooru.com/index.php?page=post&s=view&id=313638", {
+ "content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
+ "count": 1,
+ })
+
+ def get_posts(self):
+ return (self.post,)
diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py
new file mode 100644
index 0000000..1dcb3c8
--- /dev/null
+++ b/gallery_dl/extractor/gfycat.py
@@ -0,0 +1,83 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://gfycat.com/"""
+
+from .common import Extractor, Message
+
+
+class GfycatExtractor(Extractor):
+ """Base class for gfycat extractors"""
+ category = "gfycat"
+ filename_fmt = "{category}_{gfyName}.{extension}"
+ archive_fmt = "{gfyName}"
+ root = "https://gfycat.com"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.formats = (self.config("format", "mp4"), "mp4", "webm", "gif")
+
+ def _select_format(self, gfyitem):
+ for fmt in self.formats:
+ key = fmt + "Url"
+ if key in gfyitem:
+ url = gfyitem[key]
+ gfyitem["extension"] = url.rpartition(".")[2]
+ return url
+ return ""
+
+ def _get_info(self, gfycat_id):
+ url = "https://api.gfycat.com/v1/gfycats/" + gfycat_id
+ return self.request(url).json()["gfyItem"]
+
+
+class GfycatImageExtractor(GfycatExtractor):
+ """Extractor for individual images from gfycat.com"""
+ subcategory = "image"
+ pattern = (r"(?:https?://)?(?:\w+\.)?gfycat\.com"
+ r"/(?:gifs/detail/|\w+/)?([A-Za-z]+)")
+ test = (
+ ("https://gfycat.com/GrayGenerousCowrie", {
+ "url": "e0b5e1d7223108249b15c3c7898dd358dbfae045",
+ "content": "5786028e04b155baa20b87c5f4f77453cd5edc37",
+ "keyword": {
+ "gfyId": "graygenerouscowrie",
+ "gfyName": "GrayGenerousCowrie",
+ "gfyNumber": "755075459",
+ "title": "Bottom's up",
+ "userName": "jackson3oh3",
+ "createDate": 1495884169,
+ "md5": "a4796e05b0db9ba9ce5140145cd318aa",
+ "width": 400,
+ "height": 224,
+ "frameRate": 23,
+ "numFrames": 158,
+ "views": int,
+ },
+ }),
+ (("https://thumbs.gfycat.com/SillyLameIsabellinewheatear"
+ "-size_restricted.gif"), {
+ "url": "13b32e6cc169d086577d7dd3fd36ee6cdbc02726",
+ }),
+ ("https://gfycat.com/detail/UnequaledHastyAnkole?tagname=aww", {
+ "url": "e24c9f69897fd223343782425a429c5cab6a768e",
+ }),
+ ("https://gfycat.com/gifs/detail/UnequaledHastyAnkole"),
+ ("https://gfycat.com/ifr/UnequaledHastyAnkole"),
+ ("https://gfycat.com/ru/UnequaledHastyAnkole"),
+ )
+
+ def __init__(self, match):
+ GfycatExtractor.__init__(self, match)
+ self.gfycat_id = match.group(1)
+
+ def items(self):
+ gfyitem = self._get_info(self.gfycat_id)
+ yield Message.Version, 1
+ yield Message.Directory, gfyitem
+ yield Message.Url, self._select_format(gfyitem), gfyitem
diff --git a/gallery_dl/extractor/hbrowse.py b/gallery_dl/extractor/hbrowse.py
new file mode 100644
index 0000000..01793dc
--- /dev/null
+++ b/gallery_dl/extractor/hbrowse.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.hbrowse.com/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text, exception
+import json
+
+
+class HbrowseBase():
+ """Base class for hbrowse extractors"""
+ category = "hbrowse"
+ root = "https://www.hbrowse.com"
+
+ def parse_page(self, page, data):
+ """Parse metadata on 'page' and add it to 'data'"""
+ data, pos = text.extract_all(page, (
+ ('manga' , '<td class="listLong">', '</td>'),
+ ('artist', '<td class="listLong">', '</td>'),
+ ('total' , '<td class="listLong">', ' '),
+ ('origin', '<td class="listLong">', '</td>'),
+ ), values=data)
+
+ if not data["manga"] and "<b>Warning</b>" in page:
+ msg = page.rpartition(">")[2].strip()
+ self.log.error("Site is not accessible: '%s'", msg)
+ raise exception.StopExtraction()
+
+ tags = text.extract(page, 'class="listTable"', '</table>', pos)[0]
+
+ data["manga"] = text.unescape(data["manga"])
+ data["total"] = text.parse_int(data["total"])
+ data["artist"] = text.remove_html(data["artist"])
+ data["origin"] = text.remove_html(data["origin"])
+ data["tags"] = list(text.extract_iter(tags, 'href="/browse/', '"'))
+ return data
+
+
+class HbrowseChapterExtractor(HbrowseBase, ChapterExtractor):
+ """Extractor for manga-chapters from hbrowse.com"""
+ directory_fmt = ("{category}", "{manga_id} {manga}", "c{chapter:>05}")
+ filename_fmt = ("{category}_{manga_id}_{chapter:>05}_"
+ "{page:>03}.{extension}")
+ archive_fmt = "{manga_id}_{chapter}_{page}"
+ pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/(\d+)/c(\d+))"
+ test = ("https://www.hbrowse.com/10363/c00000", {
+ "url": "6feefbc9f4b98e20d8425ddffa9dd111791dc3e6",
+ "keyword": "274996f6c809e5250b6ff3abbc5147e29f89d9a5",
+ "content": "44578ebbe176c2c27434966aef22945787e2781e",
+ })
+
+ def __init__(self, match):
+ self.path, self.gid, self.chapter = match.groups()
+ self.path += "/"
+ ChapterExtractor.__init__(self, match)
+
+ def metadata(self, page):
+ return self.parse_page(page, {
+ "manga_id": text.parse_int(self.gid),
+ "chapter": text.parse_int(self.chapter)
+ })
+
+ def images(self, page):
+ base = self.root + "/data" + self.path
+ json_data = text.extract(page, ';list = ', ',"zzz"')[0] + "]"
+ return [(base + name, None) for name in json.loads(json_data)]
+
+
+class HbrowseMangaExtractor(HbrowseBase, MangaExtractor):
+ """Extractor for manga from hbrowse.com"""
+ chapterclass = HbrowseChapterExtractor
+ reverse = False
+ pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/\d+)/?$"
+ test = ("https://www.hbrowse.com/10363", {
+ "url": "b89682bfb86c11d2af0dc47463804ec3ac4aadd6",
+ "keyword": "4b15fda1858a69de1fbf5afddfe47dd893397312",
+ })
+
+ def chapters(self, page):
+ results = []
+ data = self.parse_page(page, {
+ "manga_id": text.parse_int(
+ self.manga_url.rstrip("/").rpartition("/")[2])
+ })
+
+ pos = 0
+ needle = '<td class="listMiddle">\n<a class="listLink" href="'
+ while True:
+ url, pos = text.extract(page, needle, '"', pos)
+ if not url:
+ return results
+ title, pos = text.extract(page, '>View ', '<', pos)
+ data["chapter"] = text.parse_int(url.rpartition("/")[2][1:])
+ data["title"] = title
+ results.append((text.urljoin(self.root, url), data.copy()))
diff --git a/gallery_dl/extractor/hentai2read.py b/gallery_dl/extractor/hentai2read.py
new file mode 100644
index 0000000..354acbf
--- /dev/null
+++ b/gallery_dl/extractor/hentai2read.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract hentai-manga from https://hentai2read.com/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text
+import json
+import re
+
+
+class Hentai2readBase():
+ """Base class for hentai2read extractors"""
+ category = "hentai2read"
+ root = "https://hentai2read.com"
+
+
+class Hentai2readChapterExtractor(Hentai2readBase, ChapterExtractor):
+ """Extractor for a single manga chapter from hentai2read.com"""
+ archive_fmt = "{chapter_id}_{page}"
+ pattern = r"(?:https?://)?(?:www\.)?hentai2read\.com(/[^/?&#]+/(\d+))"
+ test = ("https://hentai2read.com/amazon_elixir/1/", {
+ "url": "964b942cf492b3a129d2fe2608abfc475bc99e71",
+ "keyword": "ff84b8f751f0e4ee37717efc4332ff1db71951d9",
+ })
+
+ def __init__(self, match):
+ self.chapter = match.group(2)
+ ChapterExtractor.__init__(self, match)
+
+ def metadata(self, page):
+ title, pos = text.extract(page, "<title>", "</title>")
+ manga_id, pos = text.extract(page, 'data-mid="', '"', pos)
+ chapter_id, pos = text.extract(page, 'data-cid="', '"', pos)
+ match = re.match(r"Reading (.+) \(([^)]+)\) Hentai(?: by (.+))? - "
+ r"(\d+): (.+) . Page 1 ", title)
+ return {
+ "manga": match.group(1),
+ "manga_id": text.parse_int(manga_id),
+ "chapter": text.parse_int(self.chapter),
+ "chapter_id": text.parse_int(chapter_id),
+ "type": match.group(2),
+ "author": match.group(3),
+ "title": match.group(5),
+ "lang": "en",
+ "language": "English",
+ }
+
+ @staticmethod
+ def images(page):
+ images = text.extract(page, "'images' : ", ",\n")[0]
+ return [
+ ("https://hentaicdn.com/hentai" + part, None)
+ for part in json.loads(images)
+ ]
+
+
+class Hentai2readMangaExtractor(Hentai2readBase, MangaExtractor):
+ """Extractor for hmanga from hentai2read.com"""
+ chapterclass = Hentai2readChapterExtractor
+ pattern = r"(?:https?://)?(?:www\.)?hentai2read\.com(/[^/?&#]+)/?$"
+ test = (
+ ("https://hentai2read.com/amazon_elixir/", {
+ "url": "273073752d418ec887d7f7211e42b832e8c403ba",
+ "keyword": "13c1ce7e15cbb941f01c843b0e89adc993d939ac",
+ }),
+ ("https://hentai2read.com/oshikage_riot/", {
+ "url": "6595f920a3088a15c2819c502862d45f8eb6bea6",
+ "keyword": "675c7b7a4fa52cf569c283553bd16b4200a5cd36",
+ }),
+ )
+
+ def chapters(self, page):
+ results = []
+ manga, pos = text.extract(
+ page, '<span itemprop="name">', '</span>')
+ mtype, pos = text.extract(
+ page, '<small class="text-danger">[', ']</small>', pos)
+ manga_id = text.parse_int(text.extract(
+ page, 'data-mid="', '"', pos)[0])
+
+ while True:
+ chapter_id, pos = text.extract(page, ' data-cid="', '"', pos)
+ if not chapter_id:
+ return results
+ _ , pos = text.extract(page, ' href="', '"', pos)
+ url, pos = text.extract(page, ' href="', '"', pos)
+ chapter, pos = text.extract(page, '>', '<', pos)
+
+ chapter, _, title = text.unescape(chapter).strip().partition(" - ")
+ results.append((url, {
+ "manga_id": manga_id, "manga": manga, "type": mtype,
+ "chapter_id": text.parse_int(chapter_id),
+ "chapter": text.parse_int(chapter),
+ "title": title, "lang": "en", "language": "English",
+ }))
diff --git a/gallery_dl/extractor/hentaicafe.py b/gallery_dl/extractor/hentaicafe.py
new file mode 100644
index 0000000..e95467b
--- /dev/null
+++ b/gallery_dl/extractor/hentaicafe.py
@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hentai.cafe/"""
+
+from . import foolslide
+from .. import text
+from ..cache import memcache
+import re
+
+
+class HentaicafeChapterExtractor(foolslide.FoolslideChapterExtractor):
+ """Extractor for manga-chapters from hentai.cafe"""
+ category = "hentaicafe"
+ directory_fmt = ("{category}", "{manga}")
+ pattern = (r"(?:https?://)?(?:www\.)?hentai\.cafe"
+ r"(/manga/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)")
+ test = ("https://hentai.cafe/manga/read/saitom-box/en/0/1/", {
+ "url": "8c6a8c56875ba3ed7ab0a74a64f9960077767fc2",
+ "keyword": "6913608267d883c82b887303b9ced13821188329",
+ })
+ root = "https://hentai.cafe"
+
+ def metadata(self, page):
+ info = text.unescape(text.extract(page, '<title>', '</title>')[0])
+ manga, _, chapter_string = info.partition(" :: ")
+
+ data = self._data(self.chapter_url.split("/")[5])
+ data["manga"] = manga
+ data["chapter_string"] = chapter_string.rstrip(" :")
+ return self.parse_chapter_url(self.chapter_url, data)
+
+ @memcache(keyarg=1)
+ def _data(self, manga):
+ return {"artist": [], "tags": []}
+
+
+class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor):
+ """Extractor for manga from hentai.cafe"""
+ category = "hentaicafe"
+    pattern = (r"(?:https?://)?(?:www\.)?hentai\.cafe"
+               r"((?:/manga/series)?/[^/?&#]+)/?$")
+ test = (
+ # single chapter
+ ("https://hentai.cafe/hazuki-yuuto-summer-blues/", {
+ "url": "f8e24a07d6fbb7c6a6ec5ad8ad8faf2436f8751b",
+ "keyword": "eb9f98544098c961bd8cf5dbe69e6da51c4fb2f6",
+ }),
+ # multi-chapter
+ ("https://hentai.cafe/saitom-saitom-box/", {
+ "url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076",
+ "keyword": "28271062d7b4a2f99a0e1a894f69af8c5581a6bb",
+ }),
+ # foolslide URL
+ ("https://hentai.cafe/manga/series/saitom-box/", {
+ "url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076",
+ "keyword": "f0ece32d958f889d8229ed4052716d398a0a875c",
+ }),
+ )
+ root = "https://hentai.cafe"
+ reverse = False
+ chapterclass = HentaicafeChapterExtractor
+
+ def chapters(self, page):
+ if "/manga/series/" in self.manga_url:
+ chapters = foolslide.FoolslideMangaExtractor.chapters(self, page)
+ chapters.reverse()
+ return chapters
+
+ tags , pos = text.extract(page, "<p>Tags: ", "</br>")
+ artist, pos = text.extract(page, "\nArtists: ", "</br>", pos)
+ manga , pos = text.extract(page, "/manga/read/", "/", pos)
+ data = {
+ "tags" : text.split_html(tags)[::2],
+ "artist": text.split_html(artist),
+ }
+ HentaicafeChapterExtractor._data(manga).update(data)
+
+ return [
+ (url, data)
+ for url in re.findall(
+ r'<a +class="x-btn[^"]*" +href="([^"]+)"', page)
+ ]
diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py
new file mode 100644
index 0000000..d31f66f
--- /dev/null
+++ b/gallery_dl/extractor/hentaifoundry.py
@@ -0,0 +1,264 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.hentai-foundry.com/"""
+
+from .common import Extractor, Message
+from .. import text, util, exception
+
+
+class HentaifoundryExtractor(Extractor):
+ """Base class for hentaifoundry extractors"""
+ category = "hentaifoundry"
+ directory_fmt = ("{category}", "{user}")
+ filename_fmt = "{category}_{index}_{title}.{extension}"
+ archive_fmt = "{index}"
+ root = "https://www.hentai-foundry.com"
+ per_page = 25
+
+ def __init__(self, match, user="", page=1):
+ Extractor.__init__(self, match)
+ self.page_url = ""
+ self.user = user
+ self.start_post = 0
+ self.start_page = text.parse_int(page, 1)
+
+ def items(self):
+ data = self.get_job_metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ self.set_filters()
+ for page_url in util.advance(self.get_image_pages(), self.start_post):
+ url, image = self.get_image_metadata(page_url)
+ image.update(data)
+ yield Message.Url, url, image
+
+ def skip(self, num):
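+ # split the skip count into whole result pages and a post offset
+ # within the first page to be requested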
+ pages, posts = divmod(num, self.per_page)
+ self.start_page += pages
+ self.start_post += posts
+ return num
+
+ def get_job_metadata(self):
+ """Collect metadata for extractor-job"""
+ self.request(self.root + "/?enterAgree=1")
+ return {"user": self.user}
+
+ def get_image_pages(self):
+ """Yield urls of all relevant image pages"""
+ num = self.start_page
+
+ while True:
+ page = self.request("{}/page/{}".format(self.page_url, num)).text
+ yield from text.extract_iter(page, 'thumbTitle"><a href="', '"')
+
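+ # stop when there is no pager at all or the 'last' pager element
+ # is hidden, i.e. this is the final page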
+ if 'class="pager"' not in page or 'class="last hidden"' in page:
+ return
+ num += 1
+
+ def get_image_metadata(self, page_url):
+ """Collect url and metadata from an image page"""
+ page = self.request(text.urljoin(self.root, page_url)).text
+ index = page_url.rsplit("/", 2)[1]
+ title , pos = text.extract(page, '<title>', '</title>')
+ _ , pos = text.extract(page, 'id="picBox"', '', pos)
+ width , pos = text.extract(page, 'width="', '"', pos)
+ height, pos = text.extract(page, 'height="', '"', pos)
+ url , pos = text.extract(page, 'src="', '"', pos)
+
+ title, _, artist = title.rpartition(" - ")[0].rpartition(" by ")
+
+ data = text.nameext_from_url(url, {
+ "title": text.unescape(title),
+ "artist": text.unescape(artist),
+ "index": text.parse_int(index),
+ "width": text.parse_int(width),
+ "height": text.parse_int(height),
+ })
+ if not data["extension"]:
+ data["extension"] = "jpg"
+ return text.urljoin(self.root, url), data
+
+ def set_filters(self):
+ """Set site-internal filters to show all images"""
+ token = text.unquote(text.extract(
+ self.session.cookies["YII_CSRF_TOKEN"], "%22", "%22")[0])
+ data = {
+ "YII_CSRF_TOKEN": token,
+ "rating_nudity": 3,
+ "rating_violence": 3,
+ "rating_profanity": 3,
+ "rating_racism": 3,
+ "rating_sex": 3,
+ "rating_spoilers": 3,
+ "rating_yaoi": 1,
+ "rating_yuri": 1,
+ "rating_teen": 1,
+ "rating_guro": 1,
+ "rating_furry": 1,
+ "rating_beast": 1,
+ "rating_male": 1,
+ "rating_female": 1,
+ "rating_futa": 1,
+ "rating_other": 1,
+ "rating_scat": 1,
+ "rating_incest": 1,
+ "rating_rape": 1,
+ "filter_media": "A",
+ "filter_order": "date_new",
+ "filter_type": 0,
+ }
+ url = self.root + "/site/filters"
+ self.request(url, method="POST", data=data)
+
+
+class HentaifoundryUserExtractor(HentaifoundryExtractor):
+ """Extractor for all images of a hentai-foundry-user"""
+ subcategory = "user"
+ pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com"
+ r"/(?:pictures/user/([^/]+)(?:/page/(\d+))?/?$"
+ r"|user/([^/]+)/profile)")
+ test = (
+ ("https://www.hentai-foundry.com/pictures/user/Tenpura", {
+ "url": "ebbc981a85073745e3ca64a0f2ab31fab967fc28",
+ "keyword": "63ad576f87f82fa166ca4676761762f7f8496cf5",
+ }),
+ ("https://www.hentai-foundry.com/pictures/user/Tenpura/page/3"),
+ ("https://www.hentai-foundry.com/user/Tenpura/profile"),
+ )
+
+ def __init__(self, match):
+ HentaifoundryExtractor.__init__(
+ self, match, match.group(1) or match.group(3), match.group(2))
+ self.page_url = "{}/pictures/user/{}".format(self.root, self.user)
+
+ def get_job_metadata(self):
+ page = self.request(self.page_url + "?enterAgree=1").text
+ count = text.extract(page, ">Pictures (", ")")[0]
+ return {"user": self.user, "count": text.parse_int(count)}
+
+
+class HentaifoundryScrapsExtractor(HentaifoundryExtractor):
+ """Extractor for scrap images of a hentai-foundry-user"""
+ subcategory = "scraps"
+ directory_fmt = ("{category}", "{user}", "Scraps")
+ pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com"
+ r"/pictures/user/([^/]+)/scraps(?:/page/(\d+))?")
+ test = (
+ ("https://www.hentai-foundry.com/pictures/user/Evulchibi/scraps", {
+ "url": "00a11e30b73ff2b00a1fba0014f08d49da0a68ec",
+ "keyword": "410c6c900cfd23a8dd1e53dfcc97a79ea68c3359",
+ }),
+ ("https://www.hentai-foundry.com"
+ "/pictures/user/Evulchibi/scraps/page/3"),
+ )
+
+ def __init__(self, match):
+ HentaifoundryExtractor.__init__(
+ self, match, match.group(1), match.group(2))
+ self.page_url = "{}/pictures/user/{}/scraps".format(
+ self.root, self.user)
+
+ def get_job_metadata(self):
+ page = self.request(self.page_url + "?enterAgree=1").text
+ count = text.extract(page, ">Scraps (", ")")[0]
+ return {"user": self.user, "count": text.parse_int(count)}
+
+
+class HentaifoundryFavoriteExtractor(HentaifoundryExtractor):
+ """Extractor for favorite images of a hentai-foundry-user"""
+ subcategory = "favorite"
+ directory_fmt = ("{category}", "{user}", "Favorites")
+ archive_fmt = "f_{user}_{index}"
+ pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com"
+ r"/user/([^/]+)/faves/pictures(?:/page/(\d+))?")
+ test = (
+ ("https://www.hentai-foundry.com/user/Tenpura/faves/pictures", {
+ "url": "56f9ae2e89fe855e9fe1da9b81e5ec6212b0320b",
+ "keyword": "2b9478725e66d46ea043fa87476bbd28546958e7",
+ }),
+ ("https://www.hentai-foundry.com"
+ "/user/Tenpura/faves/pictures/page/3"),
+ )
+
+ def __init__(self, match):
+ HentaifoundryExtractor.__init__(
+ self, match, match.group(1), match.group(2))
+ self.page_url = "{}/user/{}/faves/pictures".format(
+ self.root, self.user)
+
+
+class HentaifoundryRecentExtractor(HentaifoundryExtractor):
+ """Extractor for 'Recent Pictures' on hentaifoundry.com"""
+ subcategory = "recent"
+ directory_fmt = ("{category}", "Recent Pictures", "{date}")
+ archive_fmt = "r_{index}"
+ pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com"
+ r"/pictures/recent/(\d+-\d+-\d+)(?:/page/(\d+))?")
+ test = ("http://www.hentai-foundry.com/pictures/recent/2018-09-20",)
+
+ def __init__(self, match):
+ HentaifoundryExtractor.__init__(self, match, "", match.group(2))
+ self.date = match.group(1)
+ self.page_url = "{}/pictures/recent/{}".format(self.root, self.date)
+
+ def get_job_metadata(self):
+ self.request(self.root + "/?enterAgree=1")
+ return {"date": self.date}
+
+
+class HentaifoundryPopularExtractor(HentaifoundryExtractor):
+ """Extractor for popular images on hentaifoundry.com"""
+ subcategory = "popular"
+ directory_fmt = ("{category}", "Popular Pictures")
+ archive_fmt = "p_{index}"
+ pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com"
+ r"/pictures/popular(?:/page/(\d+))?")
+ test = ("http://www.hentai-foundry.com/pictures/popular",)
+
+ def __init__(self, match):
+ HentaifoundryExtractor.__init__(self, match, "", match.group(1))
+ self.page_url = self.root + "/pictures/popular"
+
+
+class HentaifoundryImageExtractor(HentaifoundryExtractor):
+ """Extractor for a single image from hentaifoundry.com"""
+ subcategory = "image"
+ pattern = (r"(?:https?://)?(?:www\.|pictures\.)?hentai-foundry\.com"
+ r"/(?:pictures/user|[^/])/([^/]+)/(\d+)")
+ test = (
+ (("https://www.hentai-foundry.com"
+ "/pictures/user/Tenpura/407501/shimakaze"), {
+ "url": "fbf2fd74906738094e2575d2728e8dc3de18a8a3",
+ "keyword": "cbb9381e6c2acce58db4adf4efc0ad7d138bddc4",
+ "content": "91bf01497c39254b6dfb234a18e8f01629c77fd1",
+ }),
+ ("https://www.hentai-foundry.com/pictures/user/Tenpura/340853/", {
+ "exception": exception.HttpError,
+ }),
+ ("https://pictures.hentai-foundry.com"
+ "/t/Tenpura/407501/Tenpura-407501-shimakaze.png"),
+ )
+
+ def __init__(self, match):
+ HentaifoundryExtractor.__init__(self, match, match.group(1))
+ self.index = match.group(2)
+
+ def items(self):
+ post_url = "{}/pictures/user/{}/{}/?enterAgree=1".format(
+ self.root, self.user, self.index)
+ url, data = self.get_image_metadata(post_url)
+ data["user"] = self.user
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ yield Message.Url, url, data
+
+ def skip(self, _):
+ return 0
diff --git a/gallery_dl/extractor/hentaifox.py b/gallery_dl/extractor/hentaifox.py
new file mode 100644
index 0000000..cf4871f
--- /dev/null
+++ b/gallery_dl/extractor/hentaifox.py
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hentaifox.com/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text
+
+
+class HentaifoxBase():
+ """Base class for hentaifox extractors"""
+ category = "hentaifox"
+ root = "https://hentaifox.com"
+
+
+class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor):
+ """Extractor for image galleries on hentaifox.com"""
+ pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))"
+ test = ("https://hentaifox.com/gallery/56622/", {
+ "pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg",
+ "count": 24,
+ "keyword": "38f8517605feb6854d48833297da6b05c6541b69",
+ })
+
+ def __init__(self, match):
+ GalleryExtractor.__init__(self, match)
+ self.gallery_id = match.group(2)
+
+ def metadata(self, page, split=text.split_html):
+ extr = text.extract_from(page)
+
+ return {
+ "gallery_id": text.parse_int(self.gallery_id),
+ "title" : text.unescape(extr("<h1>", "</h1>")),
+ "parody" : split(extr(">Parodies:" , "</a></span>"))[::2],
+ "characters": split(extr(">Characters:", "</a></span>"))[::2],
+ "tags" : split(extr(">Tags:" , "</a></span>"))[::2],
+ "artist" : split(extr(">Artists:" , "</a></span>"))[::2],
+ "group" : split(extr(">Groups:" , "</a></span>"))[::2],
+ "type" : text.remove_html(extr(">Category:", "</a></span>")),
+ "language" : "English",
+ "lang" : "en",
+ }
+
+ def images(self, page):
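+ # 'data-src' holds thumbnail URLs ending in 't.jpg' etc.;
+ # dropping the 't' yields the full-size image URL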
+ return [
+ (text.urljoin(self.root, url.replace("t.", ".")), None)
+ for url in text.extract_iter(page, 'data-src="', '"')
+ ]
+
+
+class HentaifoxSearchExtractor(HentaifoxBase, Extractor):
+ """Extractor for search results and listings on hentaifox.com"""
+ subcategory = "search"
+ pattern = (r"(?:https?://)?(?:www\.)?hentaifox\.com"
+ r"(/(?:parody|tag|artist|character|search)/[^/?%#]+)")
+ test = (
+ ("https://hentaifox.com/parody/touhou-project/"),
+ ("https://hentaifox.com/character/reimu-hakurei/"),
+ ("https://hentaifox.com/artist/distance/"),
+ ("https://hentaifox.com/search/touhou/"),
+ ("https://hentaifox.com/tag/full-colour/", {
+ "pattern": HentaifoxGalleryExtractor.pattern,
+ "count": ">= 40",
+ "keyword": {
+ "url": str,
+ "gallery_id": int,
+ "thumbnail": r"re:https://i\d*.hentaifox.com/\d+/\d+/thumb\.",
+ "title": str,
+ "tags": list,
+ },
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.path = match.group(1)
+
+ def items(self):
+ yield Message.Version, 1
+ for gallery in self.galleries():
+ yield Message.Queue, gallery["url"], gallery
+
+ def galleries(self):
+ url = "{}/{}/".format(self.root, self.path)
+
+ while True:
+ page = self.request(url).text
+ info, gpos = text.extract(
+ page, 'class="galleries_overview">', 'class="clear">')
+
+ for ginfo in text.extract_iter(info, '<div class="item', '</a>'):
+ tags , pos = text.extract(ginfo, '', '"')
+ url , pos = text.extract(ginfo, 'href="', '"', pos)
+ title, pos = text.extract(ginfo, 'alt="', '"', pos)
+ thumb, pos = text.extract(ginfo, 'src="', '"', pos)
+
+ yield {
+ "url": text.urljoin(self.root, url),
+ "gallery_id": text.parse_int(
+ url.strip("/").rpartition("/")[2]),
+ "thumbnail": text.urljoin(self.root, thumb),
+ "title": text.unescape(title),
+ "tags": tags.split(),
+ "_extractor": HentaifoxGalleryExtractor,
+ }
+
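+ # the first link after the 'current' pagination marker points to
+ # the next page of results; stop when there is no marker or the
+ # link is not a pagination URL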
+ pos = page.find('class="current"', gpos)
+ url = text.extract(page, 'href="', '"', pos)[0]
+ if pos == -1 or "/pag" not in url:
+ return
+ url = text.urljoin(self.root, url)
diff --git a/gallery_dl/extractor/hentaihere.py b/gallery_dl/extractor/hentaihere.py
new file mode 100644
index 0000000..8083a9b
--- /dev/null
+++ b/gallery_dl/extractor/hentaihere.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract hentai-manga from https://hentaihere.com/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text
+import json
+import re
+
+
+class HentaihereBase():
+ """Base class for hentaihere extractors"""
+ category = "hentaihere"
+ root = "https://hentaihere.com"
+
+
+class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor):
+ """Extractor for a single manga chapter from hentaihere.com"""
+ archive_fmt = "{chapter_id}_{page}"
+ pattern = r"(?:https?://)?(?:www\.)?hentaihere\.com/m/S(\d+)/(\d+)"
+ test = ("https://hentaihere.com/m/S13812/1/1/", {
+ "url": "964b942cf492b3a129d2fe2608abfc475bc99e71",
+ "keyword": "cbcee0c0eb178c4b87f06a834085784f8dddad24",
+ })
+
+ def __init__(self, match):
+ self.manga_id, self.chapter = match.groups()
+ url = "{}/m/S{}/{}/1".format(self.root, self.manga_id, self.chapter)
+ ChapterExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ title = text.extract(page, "<title>", "</title>")[0]
+ chapter_id = text.extract(page, 'report/C', '"')[0]
+ pattern = r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by (.+) at "
+ match = re.match(pattern, title)
+ return {
+ "manga": match.group(1),
+ "manga_id": text.parse_int(self.manga_id),
+ "chapter": text.parse_int(self.chapter),
+ "chapter_id": text.parse_int(chapter_id),
+ "type": match.group(2),
+ "title": match.group(3),
+ "author": match.group(4),
+ "lang": "en",
+ "language": "English",
+ }
+
+ @staticmethod
+ def images(page):
+ images = text.extract(page, "var rff_imageList = ", ";")[0]
+ return [
+ ("https://hentaicdn.com/hentai" + part, None)
+ for part in json.loads(images)
+ ]
+
+
+class HentaihereMangaExtractor(HentaihereBase, MangaExtractor):
+ """Extractor for hmanga from hentaihere.com"""
+ chapterclass = HentaihereChapterExtractor
+ pattern = r"(?:https?://)?(?:www\.)?hentaihere\.com(/m/S\d+)/?$"
+ test = (
+ ("https://hentaihere.com/m/S13812", {
+ "url": "d1ba6e28bb2162e844f8559c2b2725ba0a093559",
+ "keyword": "13c1ce7e15cbb941f01c843b0e89adc993d939ac",
+ }),
+ ("https://hentaihere.com/m/S7608", {
+ "url": "6c5239758dc93f6b1b4175922836c10391b174f7",
+ "keyword": "675c7b7a4fa52cf569c283553bd16b4200a5cd36",
+ }),
+ )
+
+ def chapters(self, page):
+ results = []
+ manga_id = text.parse_int(
+ self.manga_url.rstrip("/").rpartition("/")[2][1:])
+ manga, pos = text.extract(
+ page, '<span itemprop="name">', '</span>')
+ mtype, pos = text.extract(
+ page, '<span class="mngType text-danger">[', ']</span>', pos)
+
+ while True:
+ marker, pos = text.extract(
+ page, '<li class="sub-chp clearfix">', '', pos)
+ if marker is None:
+ return results
+ url, pos = text.extract(page, '<a href="', '"', pos)
+ chapter, pos = text.extract(page, 'title="Tagged: -">\n', '<', pos)
+ chapter_id, pos = text.extract(page, '/C', '"', pos)
+ chapter, _, title = text.unescape(chapter).strip().partition(" - ")
+ results.append((url, {
+ "manga_id": manga_id, "manga": manga, "type": mtype,
+ "chapter_id": text.parse_int(chapter_id),
+ "chapter": text.parse_int(chapter),
+ "title": title, "lang": "en", "language": "English",
+ }))
diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py
new file mode 100644
index 0000000..d875817
--- /dev/null
+++ b/gallery_dl/extractor/hentainexus.py
@@ -0,0 +1,96 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hentainexus.com/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text, util
+import json
+
+
+class HentainexusGalleryExtractor(GalleryExtractor):
+ """Extractor for image galleries on hentainexus.com"""
+ category = "hentainexus"
+ root = "https://hentainexus.com"
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?hentainexus\.com"
+ r"/(?:view|read)/(\d+)")
+ test = (
+ ("https://hentainexus.com/view/5688", {
+ "url": "746d0043e20030f1171aae5ea113176607302517",
+ "keyword": "b05986369fbaf29cfa08b118960d92c49e59524b",
+ }),
+ ("https://hentainexus.com/read/5688"),
+ )
+
+ def __init__(self, match):
+ self.gallery_id = match.group(1)
+ url = "{}/view/{}".format(self.root, self.gallery_id)
+ GalleryExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ rmve = text.remove_html
+ extr = text.extract_from(page)
+ data = {
+ "gallery_id" : text.parse_int(self.gallery_id),
+ "tags" : extr('"og:description" content="', '"').split(", "),
+ "thumbnail" : extr('"og:image" content="', '"'),
+ "title" : extr('<h1 class="title">', '</h1>'),
+ "artist" : rmve(extr('viewcolumn">Artist</td>' , '</td>')),
+ "book" : rmve(extr('viewcolumn">Book</td>' , '</td>')),
+ "language" : rmve(extr('viewcolumn">Language</td>' , '</td>')),
+ "magazine" : rmve(extr('viewcolumn">Magazine</td>' , '</td>')),
+ "parody" : rmve(extr('viewcolumn">Parody</td>' , '</td>')),
+ "publisher" : rmve(extr('viewcolumn">Publisher</td>' , '</td>')),
+ "description": rmve(extr('viewcolumn">Description</td>', '</td>')),
+ }
+ data["lang"] = util.language_to_code(data["language"])
+ return data
+
+ def images(self, page):
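+ # the reader page embeds its image list as a JSON array passed to
+ # initReader(); restore the closing ']' used as the end delimiter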
+ url = "{}/read/{}".format(self.root, self.gallery_id)
+ extr = text.extract_from(self.request(url).text)
+ urls = extr("initReader(", "]") + "]"
+ return [(url, None) for url in json.loads(urls)]
+
+
+class HentainexusSearchExtractor(Extractor):
+ """Extractor for search results on hentainexus.com"""
+ category = "hentainexus"
+ subcategory = "search"
+ root = "https://hentainexus.com"
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?hentainexus\.com"
+ r"(?:/page/\d+)?/?(?:\?(q=[^/?#]+))?$")
+ test = (
+ ("https://hentainexus.com/?q=tag:%22heart+pupils%22%20tag:group", {
+ "pattern": HentainexusGalleryExtractor.pattern,
+ "count": ">= 50",
+ }),
+ ("https://hentainexus.com/page/3?q=tag:%22heart+pupils%22"),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.params = text.parse_query(match.group(1))
+
+ def items(self):
+ yield Message.Version, 1
+ params = self.params
+ path = "/"
+
+ while path:
+ page = self.request(self.root + path, params=params).text
+ extr = text.extract_from(page)
+ data = {"_extractor": HentainexusGalleryExtractor}
+
+ while True:
+ gallery_id = extr('<a href="/view/', '"')
+ if not gallery_id:
+ break
+ yield Message.Queue, self.root + "/view/" + gallery_id, data
+
+ path = extr('class="pagination-next" href="', '"')
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
new file mode 100644
index 0000000..c112465
--- /dev/null
+++ b/gallery_dl/extractor/hitomi.py
@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://hitomi.la/"""
+
+from .common import GalleryExtractor
+from .. import text, util
+import string
+
+
+class HitomiGalleryExtractor(GalleryExtractor):
+ """Extractor for image galleries from hitomi.la"""
+ category = "hitomi"
+ root = "https://hitomi.la"
+ pattern = r"(?:https?://)?hitomi\.la/(?:galleries|reader)/(\d+)"
+ test = (
+ ("https://hitomi.la/galleries/867789.html", {
+ "url": "cb759868d090fe0e2655c3e29ebf146054322b6d",
+ "keyword": "067b5d9b9c0f98530cd5dd2444e0f5a5b4b00d38",
+ }),
+ ("https://hitomi.la/galleries/1036181.html", {
+ # "aa" subdomain for gallery-id ending in 1 (#142)
+ "pattern": r"https://aa\.hitomi\.la/",
+ }),
+ ("https://hitomi.la/galleries/1401410.html", {
+ # download test
+ "range": "1",
+ "content": "b3ca8c6c8cc5826cf8b4ceb7252943abad7b8b4c",
+ }),
+ ("https://hitomi.la/galleries/733697.html", {
+ # Game CG with scenes (#321)
+ "url": "c2a84185f467450b8b9b72fbe40c0649029ce007",
+ "count": 210,
+ }),
+ ("https://hitomi.la/reader/867789.html"),
+ )
+
+ def __init__(self, match):
+ self.gallery_id = text.parse_int(match.group(1))
+ url = "{}/galleries/{}.html".format(self.root, self.gallery_id)
+ GalleryExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ extr = text.extract_from(page, page.index('<h1><a href="/reader/'))
+ data = {
+ "gallery_id": self.gallery_id,
+ "title" : text.unescape(extr('.html">', '<').strip()),
+ "artist" : self._prep(extr('<h2>', '</h2>')),
+ "group" : self._prep(extr('<td>Group</td><td>', '</td>')),
+ "type" : self._prep_1(extr('<td>Type</td><td>', '</td>')),
+ "language" : self._prep_1(extr('<td>Language</td><td>', '</td>')),
+ "parody" : self._prep(extr('<td>Series</td><td>', '</td>')),
+ "characters": self._prep(extr('<td>Characters</td><td>', '</td>')),
+ "tags" : self._prep(extr('<td>Tags</td><td>', '</td>')),
+ "date" : self._date(extr('<span class="date">', '</span>')),
+ }
+ if data["language"] == "N/a":
+ data["language"] = None
+ data["lang"] = util.language_to_code(data["language"])
+ return data
+
+ def images(self, page):
+ # see https://ltn.hitomi.la/common.js
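+ # images are served from the 'aa' or 'ba' subdomain depending on
+ # the gallery id; ids ending in 1 always use 'aa' (#142)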
+ offset = self.gallery_id % 2 if self.gallery_id % 10 != 1 else 0
+ subdomain = chr(97 + offset) + "a"
+ base = "https://" + subdomain + ".hitomi.la/galleries/"
+
+ # set Referer header before image downloads (#239)
+ self.session.headers["Referer"] = self.chapter_url
+
+ # handle Game CG galleries with scenes (#321)
+ scenes = text.extract(page, "var scene_indexes = [", "]")[0]
+ if scenes and scenes.strip():
+ url = "{}/reader/{}.html".format(self.root, self.gallery_id)
+ page = self.request(url).text
+ begin, end = ">//g.hitomi.la/galleries/", "</div>"
+ else:
+ begin, end = "'//tn.hitomi.la/smalltn/", ".jpg',"
+
+ return [
+ (base + urlpart, None)
+ for urlpart in text.extract_iter(page, begin, end)
+ ]
+
+ @staticmethod
+ def _prep(value):
+ return [
+ text.unescape(string.capwords(v))
+ for v in text.extract_iter(value or "", '.html">', '<')
+ ]
+
+ @staticmethod
+ def _prep_1(value):
+ return text.remove_html(value).capitalize()
+
+ @staticmethod
+ def _date(value):
+ return text.parse_datetime(value + ":00", "%Y-%m-%d %H:%M:%S%z")
diff --git a/gallery_dl/extractor/hypnohub.py b/gallery_dl/extractor/hypnohub.py
new file mode 100644
index 0000000..bf2db96
--- /dev/null
+++ b/gallery_dl/extractor/hypnohub.py
@@ -0,0 +1,68 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hypnohub.net/"""
+
+from . import booru
+
+
+class HypnohubExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
+ """Base class for hypnohub extractors"""
+ category = "hypnohub"
+ api_url = "https://hypnohub.net/post.json"
+ post_url = "https://hypnohub.net/post/show/{}"
+
+
+class HypnohubTagExtractor(booru.TagMixin, HypnohubExtractor):
+ """Extractor for images from hypnohub.net based on search-tags"""
+ pattern = (r"(?:https?://)?(?:www\.)?hypnohub\.net"
+ r"/post\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)")
+ test = ("https://hypnohub.net/post?tags=gonoike_biwa", {
+ "url": "6bebc4318489ee37e0c3b814352acd6783ba95d6",
+ })
+
+
+class HypnohubPoolExtractor(booru.PoolMixin, HypnohubExtractor):
+ """Extractor for image-pools from hypnohub.net"""
+ pattern = r"(?:https?://)?(?:www\.)?hypnohub\.net/pool/show/(?P<pool>\d+)"
+ test = ("https://hypnohub.net/pool/show/61", {
+ "url": "fd74991c8729e77acd3c35eb6ddc4128ff445adf",
+ })
+
+
+class HypnohubPostExtractor(booru.PostMixin, HypnohubExtractor):
+ """Extractor for single images from hypnohub.net"""
+ pattern = r"(?:https?://)?(?:www\.)?hypnohub\.net/post/show/(?P<post>\d+)"
+ test = ("https://hypnohub.net/post/show/73964", {
+ "content": "02d5f5a8396b621a6efc04c5f8ef1b7225dfc6ee",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "gonoike_biwa icontrol_(manipper)",
+ "tags_character": "komaru_naegi",
+ "tags_copyright": "dangan_ronpa dangan_ronpa_another_episode",
+ "tags_general": str,
+ },
+ })
+
+
+class HypnohubPopularExtractor(booru.MoebooruPopularMixin, HypnohubExtractor):
+ """Extractor for popular images from hypnohub.net"""
+ pattern = (r"(?:https?://)?(?:www\.)?hypnohub\.net"
+ r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)"
+ r"(?:\?(?P<query>[^#]*))?")
+ test = (
+ ("https://hypnohub.net/post/popular_by_month?month=6&year=2014", {
+ "count": 20,
+ }),
+ ("https://hypnohub.net/post/popular_recent"),
+ )
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.api_url = "https://hypnohub.net/post/popular_{scale}.json".format(
+ scale=self.scale)
diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py
new file mode 100644
index 0000000..dcb4a54
--- /dev/null
+++ b/gallery_dl/extractor/idolcomplex.py
@@ -0,0 +1,59 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://idol.sankakucomplex.com/"""
+
+from . import sankaku
+
+
+class IdolcomplexExtractor(sankaku.SankakuExtractor):
+ """Base class for idolcomplex extractors"""
+ category = "idolcomplex"
+ cookiedomain = "idol.sankakucomplex.com"
+ subdomain = "idol"
+
+
+class IdolcomplexTagExtractor(IdolcomplexExtractor,
+ sankaku.SankakuTagExtractor):
+ """Extractor for images from idol.sankakucomplex.com by search-tags"""
+ pattern = r"(?:https?://)?idol\.sankakucomplex\.com/\?([^#]*)"
+ test = (
+ ("https://idol.sankakucomplex.com/?tags=lyumos+wreath", {
+ "count": ">= 6",
+ "pattern": r"https://is\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}"
+ r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+",
+ }),
+ ("https://idol.sankakucomplex.com"
+ "/?tags=lyumos+wreath&page=3&next=694215"),
+ )
+
+
+class IdolcomplexPoolExtractor(IdolcomplexExtractor,
+ sankaku.SankakuPoolExtractor):
+ """Extractor for image-pools from idol.sankakucomplex.com"""
+ pattern = r"(?:https?://)?idol\.sankakucomplex\.com/pool/show/(\d+)"
+ test = ("https://idol.sankakucomplex.com/pool/show/145", {
+ "count": 3,
+ })
+
+
+class IdolcomplexPostExtractor(IdolcomplexExtractor,
+ sankaku.SankakuPostExtractor):
+ """Extractor for single images from idol.sankakucomplex.com"""
+ pattern = r"(?:https?://)?idol\.sankakucomplex\.com/post/show/(\d+)"
+ test = ("https://idol.sankakucomplex.com/post/show/694215", {
+ "content": "694ec2491240787d75bf5d0c75d0082b53a85afd",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_character": "shani_(the_witcher)",
+ "tags_copyright": "the_witcher",
+ "tags_idol": str,
+ "tags_medium": str,
+ "tags_general": str,
+ },
+ })
diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py
new file mode 100644
index 0000000..6980185
--- /dev/null
+++ b/gallery_dl/extractor/imagebam.py
@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from http://www.imagebam.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+
+class ImagebamExtractor(Extractor):
+ """Base class for imagebam extractors"""
+ category = "imagebam"
+ root = "http://www.imagebam.com"
+
+ def get_image_data(self, page_url, data):
+ """Fill 'data' and return image URL"""
+ page = self.request(page_url).text
+ image_url = text.extract(page, 'property="og:image" content="', '"')[0]
+ data["extension"] = image_url.rpartition(".")[2]
+ data["image_key"] = page_url.rpartition("/")[2]
+ data["image_id"] = data["image_key"][6:]
+ return image_url
+
+ def request_page(self, url):
+ """Retrive the main part of a gallery page"""
+ page = self.request(text.urljoin(self.root, url)).text
+ return text.extract(page, "<fieldset>", "</fieldset>")[0]
+
+
+class ImagebamGalleryExtractor(ImagebamExtractor):
+ """Extractor for image galleries from imagebam.com"""
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "{title} - {gallery_key}")
+ filename_fmt = "{num:>03}-{image_key}.{extension}"
+ archive_fmt = "{gallery_key}_{image_key}"
+ pattern = r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([0-9a-z]+)"
+ test = (
+ ("http://www.imagebam.com/gallery/adz2y0f9574bjpmonaismyrhtjgvey4o", {
+ "url": "fb01925129a1ff1941762eaa3a2783a66de6847f",
+ "keyword": "9e25b8827474ac93c54855e798d60aa3cbecbd7a",
+ "content": "596e6bfa157f2c7169805d50075c2986549973a8",
+ }),
+ ("http://www.imagebam.com/gallery/op9dwcklwdrrguibnkoe7jxgvig30o5p", {
+ # more than 100 images; see issue #219
+ "count": 107,
+ "url": "f92ce5b17676b6ea69288f0aef26f4cdbea7fd8d",
+ }),
+ ("http://www.imagebam.com/gallery/gsl8teckymt4vbvx1stjkyk37j70va2c", {
+ "exception": exception.NotFoundError,
+ }),
+ )
+
+ def __init__(self, match):
+ ImagebamExtractor.__init__(self, match)
+ self.gallery_key = match.group(1)
+
+ def items(self):
+ url = "{}/gallery/{}".format(self.root, self.gallery_key)
+ page = self.request_page(url)
+ if not page or ">Error<" in page:
+ raise exception.NotFoundError("gallery")
+
+ data = self.get_metadata(page)
+ imgs = self.get_image_pages(page)
+ data["count"] = len(imgs)
+ data["gallery_key"] = self.gallery_key
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for data["num"], page_url in enumerate(imgs, 1):
+ image_url = self.get_image_data(page_url, data)
+ yield Message.Url, image_url, data
+
+ @staticmethod
+ def get_metadata(page):
+ """Return gallery metadata"""
+ return text.extract_all(page, (
+ ("title" , "'> ", " <span "),
+ (None , "'>", "</span>"),
+ ("description", ":#FCFCFC;'>", "</div>"),
+ ))[0]
+
+ def get_image_pages(self, page):
+ """Return a list of all image pages"""
+ pages = []
+ while True:
+ pages.extend(text.extract_iter(page, "\n<a href='", "'"))
+ pos = page.find('"pagination_current"')
+ if pos > 0:
+ url = text.extract(page, "<a href='", "'", pos)[0]
+ if url:
+ page = self.request_page(url)
+ continue
+ return pages
+
+
+class ImagebamImageExtractor(ImagebamExtractor):
+ """Extractor for single images from imagebam.com"""
+ subcategory = "image"
+ filename_fmt = "{image_key}.{extension}"
+ archive_fmt = "{image_key}"
+ pattern = (r"(?:https?://)?(?:\w+\.)?imagebam\.com"
+ r"/(?:image/|(?:[0-9a-f]{2}/){3})([0-9a-f]+)")
+ test = (
+ ("http://www.imagebam.com/image/94d56c502511890", {
+ "url": "b384893c35a01a09c58018db71ddc4cf2480be95",
+ "keyword": "4263d4840007524129792b8587a562b5d20c2687",
+ "content": "0c8768055e4e20e7c7259608b67799171b691140",
+ }),
+ ("http://images3.imagebam.com/1d/8c/44/94d56c502511890.png"),
+ )
+
+ def __init__(self, match):
+ ImagebamExtractor.__init__(self, match)
+ self.image_key = match.group(1)
+
+ def items(self):
+ page_url = "{}/image/{}".format(self.root, self.image_key)
+ data = {}
+ image_url = self.get_image_data(page_url, data)
+ yield Message.Version, 1
+ yield Message.Directory, data
+ yield Message.Url, image_url, data
diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py
new file mode 100644
index 0000000..152b631
--- /dev/null
+++ b/gallery_dl/extractor/imagefap.py
@@ -0,0 +1,195 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://imagefap.com/"""
+
+from .common import Extractor, Message
+from .. import text
+import json
+
+
+class ImagefapExtractor(Extractor):
+ """Base class for imagefap extractors"""
+ category = "imagefap"
+ directory_fmt = ("{category}", "{gallery_id} {title}")
+ filename_fmt = "{category}_{gallery_id}_{filename}.{extension}"
+ archive_fmt = "{gallery_id}_{image_id}"
+ root = "https://www.imagefap.com"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.session.headers["Referer"] = self.root
+
+
+class ImagefapGalleryExtractor(ImagefapExtractor):
+ """Extractor for image galleries from imagefap.com"""
+ subcategory = "gallery"
+ pattern = (r"(?:https?://)?(?:www\.)?imagefap\.com/"
+ r"(?:gallery\.php\?gid=|gallery/|pictures/)(\d+)")
+ test = (
+ ("https://www.imagefap.com/pictures/7102714", {
+ "url": "268995eac5d01ddecd0fe58cfa9828390dc85a84",
+ "keyword": "b5bd65ab2ff574ed1639db9a43c7b1b8583c85ef",
+ "content": "694a0a57385980a6f90fbc296cadcd6c11ba2dab",
+ }),
+ ("https://www.imagefap.com/gallery/5486966", {
+ "url": "14906b4f0b8053d1d69bc730a325acb793cbc898",
+ "keyword": "ab90972f3527a2011478fabc621a2c99a541f752",
+ }),
+ ("https://www.imagefap.com/gallery.php?gid=7102714"),
+ )
+
+ def __init__(self, match):
+ ImagefapExtractor.__init__(self, match)
+ self.gid = match.group(1)
+ self.image_id = ""
+
+ def items(self):
+ url = "{}/pictures/{}/".format(self.root, self.gid)
+ page = self.request(url).text
+ data = self.get_job_metadata(page)
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for url, image in self.get_images():
+ data.update(image)
+ yield Message.Url, url, data
+
+ def get_job_metadata(self, page):
+ """Collect metadata for extractor-job"""
+ descr, pos = text.extract(
+ page, '<meta name="description" content="Browse ', '"')
+ count, pos = text.extract(page, ' 1 of ', ' pics"', pos)
+ self.image_id = text.extract(page, 'id="img_ed_', '"', pos)[0]
+
+ title, _, descr = descr.partition(" porn picture gallery by ")
+ uploader, _, tags = descr.partition(" to see hottest ")
+ return {
+ "gallery_id": text.parse_int(self.gid),
+ "title": text.unescape(title),
+ "uploader": uploader,
+ "tags": tags[:-11].split(", "),
+ "count": text.parse_int(count),
+ }
+
+ def get_images(self):
+ """Collect image-urls and -metadata"""
+ num = 0
+ url = "{}/photo/{}/".format(self.root, self.image_id)
+ params = {"gid": self.gid, "idx": 0, "partial": "true"}
+ while True:
+ pos = 0
+ page = self.request(url, params=params).text
+ for _ in range(24):
+ imgurl, pos = text.extract(page, '<a href="', '"', pos)
+ if not imgurl:
+ return
+ num += 1
+ _, imgid, name = imgurl.rsplit("/", 2)
+ data = {"image_id": text.parse_int(imgid), "num": num}
+ yield imgurl, text.nameext_from_url(name, data)
+ params["idx"] += 24
+
+
+class ImagefapImageExtractor(ImagefapExtractor):
+ """Extractor for single images from imagefap.com"""
+ subcategory = "image"
+ pattern = r"(?:https?://)?(?:www\.)?imagefap\.com/photo/(\d+)"
+ test = ("https://www.imagefap.com/photo/1369341772/", {
+ "url": "b31ee405b61ff0450020a1bf11c0581ca9adb471",
+ "keyword": "eadaa8f8012298384996efd21cf1f9e9e0dddb9b",
+ })
+
+ def __init__(self, match):
+ ImagefapExtractor.__init__(self, match)
+ self.image_id = match.group(1)
+
+ def items(self):
+ data = self.get_job_metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+ yield Message.Url, data["url"], data
+
+ def get_job_metadata(self):
+ """Collect metadata for extractor-job"""
+ url = "{}/photo/{}/".format(self.root, self.image_id)
+ page = self.request(url).text
+ info = json.loads(text.extract(
+ page, '<script type="application/ld+json">', '</script>')[0])
+ parts = info["contentUrl"].rsplit("/", 3)
+ return text.nameext_from_url(parts[3], {
+ "url": info["contentUrl"],
+ "title": text.unescape(info["name"]),
+ "uploader": info["author"],
+ "date": info["datePublished"],
+ "width": text.parse_int(info["width"]),
+ "height": text.parse_int(info["height"]),
+ "gallery_id": text.parse_int(parts[1]),
+ "image_id": text.parse_int(parts[2]),
+ })
+
+
+class ImagefapUserExtractor(ImagefapExtractor):
+ """Extractor for all galleries from a user at imagefap.com"""
+ subcategory = "user"
+ categorytransfer = True
+ pattern = (r"(?:https?://)?(?:www\.)?imagefap\.com/"
+ r"(?:profile(?:\.php\?user=|/)([^/?&#]+)"
+ r"|usergallery\.php\?userid=(\d+))")
+ test = (
+ ("https://www.imagefap.com/profile/LucyRae/galleries", {
+ "url": "d941aa906f56a75972a7a5283030eb9a8d27a4fd",
+ }),
+ ("https://www.imagefap.com/usergallery.php?userid=1862791", {
+ "url": "d941aa906f56a75972a7a5283030eb9a8d27a4fd",
+ }),
+ ("https://www.imagefap.com/profile.php?user=LucyRae"),
+ )
+
+ def __init__(self, match):
+ ImagefapExtractor.__init__(self, match)
+ self.user, self.user_id = match.groups()
+
+ def items(self):
+ yield Message.Version, 1
+ for gid, name in self.get_gallery_data():
+ url = "{}/gallery/{}".format(self.root, gid)
+ data = {
+ "gallery_id": text.parse_int(gid),
+ "title": text.unescape(name),
+ "_extractor": ImagefapGalleryExtractor,
+ }
+ yield Message.Queue, url, data
+
+ def get_gallery_data(self):
+ """Yield all gallery_ids of a specific user"""
+ folders = self.get_gallery_folders()
+ url = "{}/ajax_usergallery_folder.php".format(self.root)
+ params = {"userid": self.user_id}
+ for folder_id in folders:
+ params["id"] = folder_id
+ page = self.request(url, params=params).text
+
+ pos = 0
+ while True:
+ gid, pos = text.extract(page, '<a href="/gallery/', '"', pos)
+ if not gid:
+ break
+ name, pos = text.extract(page, "<b>", "<", pos)
+ yield gid, name
+
+ def get_gallery_folders(self):
+ """Create a list of all folder_ids of a specific user"""
+ if self.user:
+ url = "{}/profile/{}/galleries".format(self.root, self.user)
+ else:
+ url = "{}/usergallery.php?userid={}".format(
+ self.root, self.user_id)
+ page = self.request(url).text
+ self.user_id, pos = text.extract(page, '?userid=', '"')
+ folders, pos = text.extract(page, ' id="tgl_all" value="', '"', pos)
+ return folders.split("|")[:-1]
diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py
new file mode 100644
index 0000000..954c1f0
--- /dev/null
+++ b/gallery_dl/extractor/imagehosts.py
@@ -0,0 +1,251 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Collection of extractors for various imagehosts"""
+
+from .common import Extractor, Message, SharedConfigMixin
+from .. import text, exception
+from ..cache import memcache
+from os.path import splitext
+
+
+class ImagehostImageExtractor(SharedConfigMixin, Extractor):
+ """Base class for single-image extractors for various imagehosts"""
+ basecategory = "imagehost"
+ subcategory = "image"
+ archive_fmt = "{token}"
+ https = False
+ method = "post"
+ params = "simple"
+ cookies = None
+ encoding = None
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.page_url = "http{}://{}".format(
+ "s" if self.https else "", match.group(1))
+ self.token = match.group(2)
+ if self.params == "simple":
+ self.params = {
+ "imgContinue": "Continue+to+image+...+",
+ }
+ elif self.params == "complex":
+ self.params = {
+ "op": "view",
+ "id": self.token,
+ "pre": "1",
+ "adb": "1",
+ "next": "Continue+to+image+...+",
+ }
+ else:
+ self.params = {}
+ self.method = "get"
+
+ def items(self):
+ page = self.request(
+ self.page_url,
+ method=self.method,
+ data=self.params,
+ cookies=self.cookies,
+ encoding=self.encoding,
+ ).text
+
+ url, filename = self.get_info(page)
+ data = text.nameext_from_url(filename, {"token": self.token})
+ if self.https and url.startswith("http:"):
+ url = "https:" + url[5:]
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ yield Message.Url, url, data
+
+ def get_info(self, page):
+ """Find image-url and string to get filename from"""
+
+
+class ImxtoImageExtractor(ImagehostImageExtractor):
+ """Extractor for single images from imx.to"""
+ category = "imxto"
+ pattern = (r"(?:https?://)?(?:www\.)?((?:imx\.to|img\.yt)"
+ r"/(?:i/|img-)(\w+)(\.html)?)")
+ test = (
+ ("https://imx.to/i/1qdeva", { # new-style URL
+ "url": "ab2173088a6cdef631d7a47dec4a5da1c6a00130",
+ "keyword": "1153a986c939d7aed599905588f5c940048bc517",
+ "content": "0c8768055e4e20e7c7259608b67799171b691140",
+ }),
+ ("https://imx.to/img-57a2050547b97.html", { # old-style URL
+ "url": "a83fe6ef1909a318c4d49fcf2caf62f36c3f9204",
+ "keyword": "fd2240aee77a21b8252d5b829a1f7e542f927f09",
+ "content": "54592f2635674c25677c6872db3709d343cdf92f",
+ }),
+ ("https://img.yt/img-57a2050547b97.html", { # img.yt domain
+ "url": "a83fe6ef1909a318c4d49fcf2caf62f36c3f9204",
+ }),
+ ("https://imx.to/img-57a2050547b98.html", {
+ "exception": exception.NotFoundError,
+ }),
+ )
+ https = True
+ encoding = "utf-8"
+
+ def __init__(self, match):
+ ImagehostImageExtractor.__init__(self, match)
+ if "/img-" in self.page_url:
+ self.page_url = self.page_url.replace("img.yt", "imx.to")
+ self.url_ext = True
+ else:
+ self.url_ext = False
+
+ def get_info(self, page):
+ url, pos = text.extract(
+ page, '<div style="text-align:center;"><a href="', '"')
+ if not url:
+ raise exception.NotFoundError("image")
+ filename, pos = text.extract(page, ' title="', '"', pos)
+ if self.url_ext and filename:
+ filename += splitext(url)[1]
+ return url, filename or url
+
+
+class AcidimgImageExtractor(ImagehostImageExtractor):
+ """Extractor for single images from acidimg.cc"""
+ category = "acidimg"
+ pattern = r"(?:https?://)?((?:www\.)?acidimg\.cc/img-([a-z0-9]+)\.html)"
+ test = ("https://acidimg.cc/img-5acb6b9de4640.html", {
+ "url": "f132a630006e8d84f52d59555191ed82b3b64c04",
+ "keyword": "a8bb9ab8b2f6844071945d31f8c6e04724051f37",
+ "content": "0c8768055e4e20e7c7259608b67799171b691140",
+ })
+ https = True
+ encoding = "utf-8"
+
+ def get_info(self, page):
+ url, pos = text.extract(page, "<img class='centred' src='", "'")
+ if not url:
+ raise exception.NotFoundError("image")
+ filename, pos = text.extract(page, " alt='", "'", pos)
+ return url, (filename + splitext(url)[1]) if filename else url
+
+
+class ImagevenueImageExtractor(ImagehostImageExtractor):
+ """Extractor for single images from imagevenue.com"""
+ category = "imagevenue"
+ pattern = (r"(?:https?://)?(img\d+\.imagevenue\.com"
+ r"/img\.php\?image=(?:[a-z]+_)?(\d+)_[^&#]+)")
+ test = (("http://img28116.imagevenue.com/img.php"
+ "?image=th_52709_test_122_64lo.jpg"), {
+ "url": "46812995d557f2c6adf0ebd0e631e6e4e45facde",
+ "content": "59ec819cbd972dd9a71f25866fbfc416f2f215b3",
+ })
+ params = None
+
+ def get_info(self, page):
+ url = text.extract(page, "SRC='", "'")[0]
+ return text.urljoin(self.page_url, url), url
+
+
+class ImagetwistImageExtractor(ImagehostImageExtractor):
+ """Extractor for single images from imagetwist.com"""
+ category = "imagetwist"
+ pattern = r"(?:https?://)?((?:www\.)?imagetwist\.com/([a-z0-9]{12}))"
+ test = ("https://imagetwist.com/4e46hv31tu0q/test.jpg", {
+ "url": "c999dc1a5dec0525ac9eb8c092f173dfe6dba0b0",
+ "keyword": "a9f2e01757ec96d4ee4752cbd8446ede80f7935e",
+ "content": "96b1fd099b06faad5879fce23a7e4eb8290d8810",
+ })
+ https = True
+ params = None
+
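+ # request the image page once to obtain session cookies;
+ # the result is cached for 3 hours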
+ @property
+ @memcache(maxage=3*3600)
+ def cookies(self):
+ return self.request(self.page_url).cookies
+
+ def get_info(self, page):
+ url , pos = text.extract(page, 'center;"><img src="', '"')
+ filename, pos = text.extract(page, ' alt="', '"', pos)
+ return url, filename
+
+
+class ImgspiceImageExtractor(ImagehostImageExtractor):
+ """Extractor for single images from imgspice.com"""
+ category = "imgspice"
+ pattern = r"(?:https?://)?((?:www\.)?imgspice\.com/([^/?&#]+))"
+ test = ("https://imgspice.com/nwfwtpyog50y/test.png.html", {
+ "url": "b8c30a8f51ee1012959a4cfd46197fabf14de984",
+ "keyword": "100e310a19a2fa22d87e1bbc427ecb9f6501e0c0",
+ "content": "0c8768055e4e20e7c7259608b67799171b691140",
+ })
+ https = True
+ params = None
+
+ def get_info(self, page):
+ pos = page.find('id="imgpreview"')
+ if pos < 0:
+ raise exception.NotFoundError("image")
+ url , pos = text.extract(page, 'src="', '"', pos)
+ name, pos = text.extract(page, 'alt="', '"', pos)
+ return url, text.unescape(name)
+
+
+class PixhostImageExtractor(ImagehostImageExtractor):
+ """Extractor for single images from pixhost.to"""
+ category = "pixhost"
+ pattern = (r"(?:https?://)?((?:www\.)?pixhost\.(?:to|org)"
+ r"/show/\d+/(\d+)_[^/?&#]+)")
+ test = ("https://pixhost.to/show/224/96246707_test-.png", {
+ "url": "8f3d41fdd2dbec4c844e5ee45bf49961fbd79c67",
+ "keyword": "ecefe2d5814286f9d1dff3d88d9bdc78dd456c5d",
+ "content": "0c8768055e4e20e7c7259608b67799171b691140",
+ })
+ https = True
+ params = None
+ cookies = {"pixhostads": "1", "pixhosttest": "1"}
+
+ def get_info(self, page):
+ url , pos = text.extract(page, "class=\"image-img\" src=\"", "\"")
+ filename, pos = text.extract(page, "alt=\"", "\"", pos)
+ return url, filename
+
+
+class PostimgImageExtractor(ImagehostImageExtractor):
+ """Extractor for single images from postimages.org"""
+ category = "postimg"
+ pattern = (r"(?:https?://)?((?:www\.)?(?:postimg|pixxxels)\.(?:cc|org)"
+ r"/(?:image/)?([^/?&#]+)/?)")
+ test = ("https://postimg.cc/Wtn2b3hC", {
+ "url": "0794cfda9b8951a8ac3aa692472484200254ab86",
+ "keyword": "2d05808d04e4e83e33200db83521af06e3147a84",
+ "content": "cfaa8def53ed1a575e0c665c9d6d8cf2aac7a0ee",
+ })
+ https = True
+ params = None
+
+ def get_info(self, page):
+ url , pos = text.extract(page, 'id="main-image" src="', '"')
+ filename, pos = text.extract(page, 'class="imagename">', '<', pos)
+ return url, text.unescape(filename)
+
+
+class TurboimagehostImageExtractor(ImagehostImageExtractor):
+ """Extractor for single images from www.turboimagehost.com"""
+ category = "turboimagehost"
+ pattern = (r"(?:https?://)?((?:www\.)?turboimagehost\.com"
+ r"/p/(\d+)/[^/?&#]+\.html)")
+ test = ("https://www.turboimagehost.com/p/39078423/test--.png.html", {
+ "url": "b94de43612318771ced924cb5085976f13b3b90e",
+ "keyword": "704757ca8825f51cec516ec44c1e627c1f2058ca",
+ "content": "0c8768055e4e20e7c7259608b67799171b691140",
+ })
+ https = True
+ params = None
+
+ def get_info(self, page):
+ url = text.extract(page, 'src="', '"', page.index("<img "))[0]
+ return url, url
diff --git a/gallery_dl/extractor/imgbox.py b/gallery_dl/extractor/imgbox.py
new file mode 100644
index 0000000..516ef18
--- /dev/null
+++ b/gallery_dl/extractor/imgbox.py
@@ -0,0 +1,134 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from galleries at https://imgbox.com/"""
+
+from .common import Extractor, Message, AsynchronousMixin
+from .. import text, exception
+import re
+
+
+class ImgboxExtractor(Extractor):
+ """Base class for imgbox extractors"""
+ category = "imgbox"
+ root = "https://imgbox.com"
+
+ def items(self):
+ data = self.get_job_metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ for image_key in self.get_image_keys():
+ imgpage = self.request(self.root + "/" + image_key).text
+ imgdata = self.get_image_metadata(imgpage)
+ if imgdata["filename"]:
+ imgdata.update(data)
+ imgdata["image_key"] = image_key
+ text.nameext_from_url(imgdata["filename"], imgdata)
+ yield Message.Url, self.get_image_url(imgpage), imgdata
+
+ @staticmethod
+ def get_job_metadata():
+ """Collect metadata for extractor-job"""
+ return {}
+
+ @staticmethod
+ def get_image_keys():
+ """Return an iterable containing all image-keys"""
+ return []
+
+ @staticmethod
+ def get_image_metadata(page):
+ """Collect metadata for a downloadable file"""
+ return text.extract_all(page, (
+ ("num" , '</a> &nbsp; ', ' of '),
+ (None , 'class="image-container"', ''),
+ ("filename" , ' title="', '"'),
+ ))[0]
+
+ @staticmethod
+ def get_image_url(page):
+ """Extract download-url"""
+ pos = page.index(">Image</a>")
+ return text.extract(page, '<a href="', '"', pos)[0]
+
+
+class ImgboxGalleryExtractor(AsynchronousMixin, ImgboxExtractor):
+ """Extractor for image galleries from imgbox.com"""
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "{title} - {gallery_key}")
+ filename_fmt = "{num:>03}-{filename}.{extension}"
+ archive_fmt = "{gallery_key}_{image_key}"
+ pattern = r"(?:https?://)?(?:www\.)?imgbox\.com/g/([A-Za-z0-9]{10})"
+ test = (
+ ("https://imgbox.com/g/JaX5V5HX7g", {
+ "url": "678f0bca1251d810372326ea4f16582cafa800e4",
+ "keyword": "4b1e62820ac2c6205b7ad0b6322cc8e00dbe1b0c",
+ "content": "d20307dc8511ac24d688859c55abf2e2cc2dd3cc",
+ }),
+ ("https://imgbox.com/g/cUGEkRbdZZ", {
+ "url": "d839d47cbbbeb121f83c520072512f7e51f52107",
+ "keyword": "fb0427b87983197849fb2887905e758f3e50cb6e",
+ }),
+ ("https://imgbox.com/g/JaX5V5HX7h", {
+ "exception": exception.NotFoundError,
+ }),
+ )
+
+ def __init__(self, match):
+ ImgboxExtractor.__init__(self, match)
+ self.gallery_key = match.group(1)
+ self.image_keys = []
+
+ def get_job_metadata(self):
+ page = self.request(self.root + "/g/" + self.gallery_key).text
+ if "The specified gallery could not be found." in page:
+ raise exception.NotFoundError("gallery")
+ self.image_keys = re.findall(r'<a href="/([^"]+)"><img alt="', page)
+
+ title = text.extract(page, "<h1>", "</h1>")[0]
+ title, _, count = title.rpartition(" - ")
+ return {
+ "gallery_key": self.gallery_key,
+ "title": text.unescape(title),
+ "count": count[:-7],
+ }
+
+ def get_image_keys(self):
+ return self.image_keys
+
+
+class ImgboxImageExtractor(ImgboxExtractor):
+ """Extractor for single images from imgbox.com"""
+ subcategory = "image"
+ archive_fmt = "{image_key}"
+ pattern = r"(?:https?://)?(?:www\.)?imgbox\.com/([A-Za-z0-9]{8})"
+ test = (
+ ("https://imgbox.com/qHhw7lpG", {
+ "url": "d931f675a9b848fa7cb9077d6c2b14eb07bdb80f",
+ "keyword": "dfc72310026b45f3feb4f9cada20c79b2575e1af",
+ "content": "0c8768055e4e20e7c7259608b67799171b691140",
+ }),
+ ("https://imgbox.com/qHhw7lpH", {
+ "exception": exception.NotFoundError,
+ }),
+ )
+
+ def __init__(self, match):
+ ImgboxExtractor.__init__(self, match)
+ self.image_key = match.group(1)
+
+ def get_image_keys(self):
+ return (self.image_key,)
+
+ @staticmethod
+ def get_image_metadata(page):
+ data = ImgboxExtractor.get_image_metadata(page)
+ if not data["filename"]:
+ raise exception.NotFoundError("image")
+ return data
diff --git a/gallery_dl/extractor/imgth.py b/gallery_dl/extractor/imgth.py
new file mode 100644
index 0000000..a97f2e0
--- /dev/null
+++ b/gallery_dl/extractor/imgth.py
@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://imgth.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class ImgthGalleryExtractor(Extractor):
+ """Extractor for image galleries from imgth.com"""
+ category = "imgth"
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "{gallery_id} {title}")
+ filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
+ archive_fmt = "{gallery_id}_{num}"
+ pattern = r"(?:https?://)?imgth\.com/gallery/(\d+)"
+ test = ("http://imgth.com/gallery/37/wallpaper-anime", {
+ "url": "4ae1d281ca2b48952cf5cca57e9914402ad72748",
+ "keyword": "6f8c00d6849ea89d1a028764675ec1fe9dbd87e2",
+ })
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.gid = match.group(1)
+ self.url_base = "https://imgth.com/gallery/" + self.gid + "/g/page/"
+
+ def items(self):
+ page = self.request(self.url_base + "0").text
+ data = self.metadata(page)
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for data["num"], url in enumerate(self.images(page), 1):
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+ def images(self, page):
+ """Yield all image urls for this gallery"""
+ pnum = 0
+ while True:
+ thumbs = text.extract(page, '<ul class="thumbnails">', '</ul>')[0]
+ for url in text.extract_iter(thumbs, '<img src="', '"'):
+ yield "https://imgth.com/images/" + url[24:]
+ if '<li class="next">' not in page:
+ return
+ pnum += 1
+ page = self.request(self.url_base + str(pnum)).text
+
+ def metadata(self, page):
+ """Collect metadata for extractor-job"""
+ return text.extract_all(page, (
+ ("title", '<h1>', '</h1>'),
+ ("count", 'total of images in this gallery: ', ' '),
+ ("date" , 'created on ', ' by <'),
+ (None , 'href="/users/', ''),
+ ("user" , '>', '<'),
+ ), values={"gallery_id": self.gid})[0]
diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py
new file mode 100644
index 0000000..0468c0b
--- /dev/null
+++ b/gallery_dl/extractor/imgur.py
@@ -0,0 +1,183 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://imgur.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+import json
+
+
+class ImgurExtractor(Extractor):
+ """Base class for imgur extractors"""
+ category = "imgur"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.item_id = match.group(1)
+ self.mp4 = self.config("mp4", True)
+
+ def _get_data(self, urlpart):
+ response = self.request("https://imgur.com/" + urlpart, expect=(404,))
+ if response.status_code == 404:
+ raise exception.NotFoundError(self.subcategory)
+ data = text.extract(response.text, "image : ", ",\n")[0]
+ return self._clean(json.loads(data))
+
+ def _prepare(self, image):
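+ # strip any query string from the extension and, depending on the
+ # 'mp4' option, prefer the .mp4 version of GIFs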
+ image["ext"] = image["ext"].partition("?")[0]
+ if image["ext"] == ".gif" and (
+ (self.mp4 and image["prefer_video"]) or self.mp4 == "always"):
+ image["ext"] = ".mp4"
+ url = "https://i.imgur.com/" + image["hash"] + image["ext"]
+ image["extension"] = image["ext"][1:]
+ return url
+
+ @staticmethod
+ def _clean(data):
+ try:
+ del data["adConfig"]
+ del data["isAd"]
+ except KeyError:
+ pass
+ return data
+
+
+class ImgurImageExtractor(ImgurExtractor):
+ """Extractor for individual images from imgur.com"""
+ subcategory = "image"
+ filename_fmt = "{category}_{hash}{title:?_//}.{extension}"
+ archive_fmt = "{hash}"
+ pattern = (r"(?:https?://)?(?:www\.|[im]\.|)?imgur\.com"
+ r"/(?!gallery)(\w{7}|\w{5})[sbtmlh]?\.?")
+ test = (
+ ("https://imgur.com/21yMxCS", {
+ "url": "6f2dcfb86815bdd72808c313e5f715610bc7b9b2",
+ "content": "0c8768055e4e20e7c7259608b67799171b691140",
+ "keyword": {
+ "animated": False,
+ "datetime": "2016-11-10 14:24:35",
+ "description": str,
+ "ext": ".png",
+ "extension": "png",
+ "hash": "21yMxCS",
+ "height": "32",
+ "is_moderated": False,
+ "is_safe": False,
+ "is_viral": 0,
+ "looping": False,
+ "mimetype": "image/png",
+ "name": None,
+ "prefer_video": False,
+ "size": 182,
+ "source": "",
+ "title": "Test",
+ "video_host": None,
+ "video_source": None,
+ "width": "64",
+ },
+ }),
+ ("http://imgur.com/0gybAXR", { # gifv/mp4 video
+ "url": "a2220eb265a55b0c95e0d3d721ec7665460e3fd7",
+ "content": "a3c080e43f58f55243ab830569ba02309d59abfc",
+ }),
+ ("https://imgur.com/HjoXJAd", { # url ends with '.jpg?1'
+ "url": "73f361b50753ab25da64160aa50bc5d139480d45",
+ }),
+ ("https://imgur.com/zzzzzzz", { # not found
+ "exception": exception.NotFoundError,
+ }),
+ ("https://www.imgur.com/21yMxCS"), # www
+ ("https://m.imgur.com/21yMxCS"), # mobile
+ ("https://imgur.com/zxaY6"), # 5 character key
+ ("https://i.imgur.com/21yMxCS.png"), # direct link
+ ("https://i.imgur.com/21yMxCSh.png"), # direct link thumbnail
+ ("https://i.imgur.com/zxaY6.gif"), # direct link (short)
+ ("https://i.imgur.com/zxaY6s.gif"), # direct link (short; thumb)
+ )
+
+ def items(self):
+ image = self._get_data(self.item_id)
+ url = self._prepare(image)
+
+ yield Message.Version, 1
+ yield Message.Directory, image
+ yield Message.Url, url, image
+
+
+class ImgurAlbumExtractor(ImgurExtractor):
+ """Extractor for image albums from imgur.com"""
+ subcategory = "album"
+ directory_fmt = ("{category}", "{album[hash]}{album[title]:? - //}")
+ filename_fmt = "{category}_{album[hash]}_{num:>03}_{hash}.{extension}"
+ archive_fmt = "{album[hash]}_{hash}"
+ pattern = (r"(?:https?://)?(?:www\.|m\.)?imgur\.com"
+ r"/(?:a|gallery|t/unmuted)/(\w{7}|\w{5})")
+ test = (
+ ("https://imgur.com/a/TcBmP", {
+ "url": "ce3552f550a5b5316bd9c7ae02e21e39f30c0563",
+ "keyword": {
+ "album": {
+ "album_cover": "693j2Kr",
+ "album_description": None,
+ "cover": "693j2Kr",
+ "datetime": "2015-10-09 10:37:50",
+ "description": None,
+ "hash": "TcBmP",
+ "id": "TcBmP",
+ "is_album": True,
+ "num_images": "19",
+ "title": "138",
+ "title_clean": "TcBmP",
+ "views": str,
+ },
+ "animated": bool,
+ "datetime": str,
+ "extension": str,
+ "hash": str,
+ "height": int,
+ "num": int,
+ "prefer_video": bool,
+ "size": int,
+ "title": str,
+ "width": int,
+ },
+ }),
+ ("https://imgur.com/gallery/eD9CT", { # large album
+ "url": "4ee94de31ff26be416271bc0b1ea27b9349c9937",
+ }),
+ ("https://imgur.com/a/RhJXhVT/all", { # 7 character album hash
+ "url": "695ef0c950023362a0163ee5041796300db76674",
+ }),
+ ("https://imgur.com/t/unmuted/YMqBcua", { # unmuted URL
+ "url": "86b4747f8147cec7602f0214e267309af73a8655",
+ }),
+ ("https://imgur.com/a/TcBmQ", {
+ "exception": exception.NotFoundError,
+ }),
+ ("https://www.imgur.com/a/TcBmP"), # www
+ ("https://m.imgur.com/a/TcBmP"), # mobile
+ )
+
+ def items(self):
+ album = self._get_data("a/" + self.item_id + "/all")
+ images = album["album_images"]["images"]
+ del album["album_images"]
+
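+ # the embedded data only holds the first batch of images;
+ # fetch the complete list from the 'ajaxalbums' endpoint if needed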
+ if int(album["num_images"]) > len(images):
+ url = ("https://imgur.com/ajaxalbums/getimages/" +
+ self.item_id + "/hit.json")
+ images = self.request(url).json()["data"]["images"]
+
+ yield Message.Version, 1
+ yield Message.Directory, {"album": album, "count": len(images)}
+ for num, image in enumerate(images, 1):
+ url = self._prepare(image)
+ image["num"] = num
+ image["album"] = album
+ yield Message.Url, url, image
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
new file mode 100644
index 0000000..871236b
--- /dev/null
+++ b/gallery_dl/extractor/instagram.py
@@ -0,0 +1,277 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Leonardo Taccari, Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.instagram.com/"""
+
+import hashlib
+import json
+from .common import Extractor, Message
+from .. import text
+
+
+class InstagramExtractor(Extractor):
+ """Base class for instagram extractors"""
+ category = "instagram"
+ directory_fmt = ("{category}", "{username}")
+ filename_fmt = "{sidecar_media_id:?/_/}{media_id}.{extension}"
+ archive_fmt = "{media_id}"
+ root = "https://www.instagram.com"
+
+ def get_metadata(self):
+ return {}
+
+ def items(self):
+ yield Message.Version, 1
+
+ metadata = self.get_metadata()
+ for data in self.instagrams():
+ data.update(metadata)
+ yield Message.Directory, data
+
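+ # plain images are downloaded directly; videos are handed
+ # off to youtube-dl via an "ytdl:" URL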
+ if data['typename'] == 'GraphImage':
+ yield Message.Url, data['display_url'], \
+ text.nameext_from_url(data['display_url'], data)
+ elif data['typename'] == 'GraphVideo':
+ yield Message.Url, \
+ 'ytdl:{}/p/{}/'.format(self.root, data['shortcode']), data
+
+ def _extract_shared_data(self, page):
+ return json.loads(text.extract(page,
+ 'window._sharedData = ', ';</script>')[0])
+
+ def _extract_postpage(self, url):
+ page = self.request(url).text
+ shared_data = self._extract_shared_data(page)
+ media = shared_data['entry_data']['PostPage'][0]['graphql']['shortcode_media']
+
+ common = {
+ 'date': text.parse_timestamp(media['taken_at_timestamp']),
+ 'likes': text.parse_int(media['edge_media_preview_like']['count']),
+ 'owner_id': media['owner']['id'],
+ 'username': media['owner']['username'],
+ 'fullname': media['owner']['full_name'],
+ 'description': text.parse_unicode_escapes('\n'.join(
+ edge['node']['text']
+ for edge in media['edge_media_to_caption']['edges']
+ )),
+ }
+
+ medias = []
+ if media['__typename'] == 'GraphSidecar':
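+ # 'yi' counts embedded videos and becomes their
+ # '_ytdl_index' within the sidecar post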
+ yi = 0
+ for n in media['edge_sidecar_to_children']['edges']:
+ children = n['node']
+ media_data = {
+ 'media_id': children['id'],
+ 'shortcode': children['shortcode'],
+ 'typename': children['__typename'],
+ 'display_url': children['display_url'],
+ 'height': text.parse_int(children['dimensions']['height']),
+ 'width': text.parse_int(children['dimensions']['width']),
+ 'sidecar_media_id': media['id'],
+ 'sidecar_shortcode': media['shortcode'],
+ }
+ if children['__typename'] == 'GraphVideo':
+ media_data["_ytdl_index"] = yi
+ yi += 1
+ media_data.update(common)
+ medias.append(media_data)
+
+ else:
+ media_data = {
+ 'media_id': media['id'],
+ 'shortcode': media['shortcode'],
+ 'typename': media['__typename'],
+ 'display_url': media['display_url'],
+ 'height': text.parse_int(media['dimensions']['height']),
+ 'width': text.parse_int(media['dimensions']['width']),
+ }
+ media_data.update(common)
+ medias.append(media_data)
+
+ return medias
+
+ def _extract_page(self, url, page_type):
+ shared_data_fields = {
+ 'ProfilePage': {
+ 'node': 'user',
+ 'node_id': 'id',
+ 'edge_to_medias': 'edge_owner_to_timeline_media',
+ 'variables_id': 'id',
+ 'query_hash': '66eb9403e44cc12e5b5ecda48b667d41',
+ },
+ 'TagPage': {
+ 'node': 'hashtag',
+ 'node_id': 'name',
+ 'edge_to_medias': 'edge_hashtag_to_media',
+ 'variables_id': 'tag_name',
+ 'query_hash': 'f92f56d47dc7a55b606908374b43a314',
+ },
+ }
+
+ page = self.request(url).text
+ shared_data = self._extract_shared_data(page)
+ psdf = shared_data_fields[page_type]
+
+ while True:
+ # The first page carries its data in 'entry_data';
+ # subsequent GraphQL responses carry it in 'data'.
+ if 'entry_data' in shared_data:
+ base_shared_data = shared_data['entry_data'][page_type][0]['graphql']
+
+ # variables_id is available only in the first page
+ variables_id = base_shared_data[psdf['node']][psdf['node_id']]
+ else:
+ base_shared_data = shared_data['data']
+
+ medias = base_shared_data[psdf['node']][psdf['edge_to_medias']]
+ has_next_page = medias['page_info']['has_next_page']
+ shortcodes = [n['node']['shortcode'] for n in medias['edges']]
+
+ for s in shortcodes:
+ url = '{}/p/{}/'.format(self.root, s)
+ yield from self._extract_postpage(url)
+
+ if not has_next_page:
+ break
+
+ end_cursor = medias['page_info']['end_cursor']
+ variables = '{{"{}":"{}","first":12,"after":"{}"}}'.format(
+ psdf['variables_id'],
+ variables_id,
+ end_cursor,
+ )
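+ # sign the request: 'X-Instagram-GIS' is the MD5 hexdigest
+ # of the serialized query variables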
+ headers = {
+ "X-Requested-With": "XMLHttpRequest",
+ "X-Instagram-GIS": hashlib.md5(variables.encode()).hexdigest(),
+ }
+ url = '{}/graphql/query/?query_hash={}&variables={}'.format(
+ self.root,
+ psdf['query_hash'],
+ variables,
+ )
+ shared_data = self.request(url, headers=headers).json()
+
+ def _extract_profilepage(self, url):
+ yield from self._extract_page(url, 'ProfilePage')
+
+ def _extract_tagpage(self, url):
+ yield from self._extract_page(url, 'TagPage')
+
+
+class InstagramImageExtractor(InstagramExtractor):
+ """Extractor for PostPage"""
+ subcategory = "image"
+ pattern = r"(?:https?://)?(?:www\.)?instagram\.com/p/([^/?&#]+)"
+ test = (
+ # GraphImage
+ ("https://www.instagram.com/p/BqvsDleB3lV/", {
+ "pattern": r"https://[^/]+\.(cdninstagram\.com|fbcdn\.net)"
+ r"/vp/[0-9a-f]+/[0-9A-F]+/t51.2885-15/e35"
+ r"/44877605_725955034447492_3123079845831750529_n.jpg",
+ "keyword": {
+ "date": "type:datetime",
+ "description": str,
+ "height": int,
+ "likes": int,
+ "media_id": "1922949326347663701",
+ "shortcode": "BqvsDleB3lV",
+ "typename": "GraphImage",
+ "username": "instagram",
+ "width": int,
+ }
+ }),
+
+ # GraphSidecar
+ ("https://www.instagram.com/p/BoHk1haB5tM/", {
+ "count": 5,
+ "keyword": {
+ "sidecar_media_id": "1875629777499953996",
+ "sidecar_shortcode": "BoHk1haB5tM",
+ "likes": int,
+ "username": "instagram",
+ }
+ }),
+
+ # GraphVideo
+ ("https://www.instagram.com/p/Bqxp0VSBgJg/", {
+ "url": "8f38c1cf460c9804842f7306c487410f33f82e7e",
+ "keyword": {
+ "date": "type:datetime",
+ "description": str,
+ "height": int,
+ "likes": int,
+ "media_id": "1923502432034620000",
+ "shortcode": "Bqxp0VSBgJg",
+ "typename": "GraphVideo",
+ "username": "instagram",
+ "width": int,
+ }
+ }),
+
+ # GraphSidecar with 2 embedded GraphVideo objects
+ ("https://www.instagram.com/p/BtOvDOfhvRr/", {
+ "count": 2,
+ "url": "e290d4180a58ae50c910d51d3b04d5f5c4622cd7",
+ "keyword": {
+ "sidecar_media_id": "1967717017113261163",
+ "sidecar_shortcode": "BtOvDOfhvRr",
+ "_ytdl_index": int,
+ }
+ })
+ )
+
+ def __init__(self, match):
+ InstagramExtractor.__init__(self, match)
+ self.shortcode = match.group(1)
+
+ def instagrams(self):
+ url = '{}/p/{}/'.format(self.root, self.shortcode)
+ return self._extract_postpage(url)
+
+
+class InstagramUserExtractor(InstagramExtractor):
+ """Extractor for ProfilePage"""
+ subcategory = "user"
+ pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
+ r"/(?!p/|explore/|directory/|accounts/)([^/?&#]+)")
+ test = ("https://www.instagram.com/instagram/", {
+ "range": "1-12",
+ "count": ">= 12",
+ })
+
+ def __init__(self, match):
+ InstagramExtractor.__init__(self, match)
+ self.username = match.group(1)
+
+ def instagrams(self):
+ url = '{}/{}/'.format(self.root, self.username)
+ return self._extract_profilepage(url)
+
+
+class InstagramTagExtractor(InstagramExtractor):
+ """Extractor for TagPage"""
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{subcategory}", "{tag}")
+ pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
+ r"/explore/tags/([^/?&#]+)")
+ test = ("https://www.instagram.com/explore/tags/instagram/", {
+ "range": "1-12",
+ "count": ">= 12",
+ })
+
+ def __init__(self, match):
+ InstagramExtractor.__init__(self, match)
+ self.tag = match.group(1)
+
+ def get_metadata(self):
+ return {"tag": self.tag}
+
+ def instagrams(self):
+ url = '{}/explore/tags/{}/'.format(self.root, self.tag)
+ return self._extract_tagpage(url)
diff --git a/gallery_dl/extractor/keenspot.py b/gallery_dl/extractor/keenspot.py
new file mode 100644
index 0000000..5902333
--- /dev/null
+++ b/gallery_dl/extractor/keenspot.py
@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for http://www.keenspot.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class KeenspotComicExtractor(Extractor):
+ """Extractor for webcomics from keenspot.com"""
+ category = "keenspot"
+ subcategory = "comic"
+ directory_fmt = ("{category}", "{comic}")
+ filename_fmt = "{filename}.{extension}"
+ archive_fmt = "{comic}_{filename}"
+ pattern = r"(?:https?://)?(?!www\.|forums\.)([^.]+)\.keenspot\.com(/.+)?"
+ test = (
+ ("http://marksmen.keenspot.com/", { # link
+ "range": "1-3",
+ "url": "83bcf029103bf8bc865a1988afa4aaeb23709ba6",
+ }),
+ ("http://barkercomic.keenspot.com/", { # id
+ "range": "1-3",
+ "url": "c4080926db18d00bac641fdd708393b7d61379e6",
+ }),
+ ("http://crowscare.keenspot.com/", { # id v2
+ "range": "1-3",
+ "url": "a00e66a133dd39005777317da90cef921466fcaa"
+ }),
+ ("http://supernovas.keenspot.com/", { # ks
+ "range": "1-3",
+ "url": "de21b12887ef31ff82edccbc09d112e3885c3aab"
+ }),
+ ("http://twokinds.keenspot.com/comic/1066/", { # "random" access
+ "range": "1-3",
+ "url": "97e2a6ed8ba1709314f2449f84b6b1ce5db21c04",
+ })
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.comic = match.group(1).lower()
+ self.path = match.group(2)
+ self.root = "http://" + self.comic + ".keenspot.com"
+
+ self._needle = ""
+ self._image = 'class="ksc"'
+ self._next = self._next_needle
+
+ def items(self):
+ data = {"comic": self.comic}
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ url = self._first(self.request(self.root + "/").text)
+ if self.path:
+ url = self.root + self.path
+
+ prev = None
+ ilen = len(self._image)
+ while url and url != prev:
+ prev = url
+ page = self.request(text.urljoin(self.root, url)).text
+
+ pos = 0
+ while True:
+ pos = page.find(self._image, pos)
+ if pos < 0:
+ break
+ img, pos = text.extract(page, 'src="', '"', pos + ilen)
+ if img.endswith(".js"):
+ continue
+ if img[0] == "/":
+ img = self.root + img
+ elif "youtube.com/" in img:
+ img = "ytdl:" + img
+ yield Message.Url, img, text.nameext_from_url(img, data)
+
+ url = self._next(page)
+
+ def _first(self, page):
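+ # probe the front page for one of several known site layouts; each
+ # branch also selects the matching image marker and "next" strategy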
+ if self.comic == "brawlinthefamily":
+ self._next = self._next_brawl
+ self._image = '<div id="comic">'
+ return "http://brawlinthefamily.keenspot.com/comic/theshowdown/"
+
+ url = text.extract(page, '<link rel="first" href="', '"')[0]
+ if url:
+ if self.comic == "porcelain":
+ self._needle = 'id="porArchivetop_"'
+ else:
+ self._next = self._next_link
+ return url
+
+ pos = page.find('id="first_day1"')
+ if pos >= 0:
+ self._next = self._next_id
+ return text.rextract(page, 'href="', '"', pos)[0]
+
+ pos = page.find('>FIRST PAGE<')
+ if pos >= 0:
+ if self.comic == "lastblood":
+ self._next = self._next_lastblood
+ self._image = '<div id="comic">'
+ else:
+ self._next = self._next_id
+ return text.rextract(page, 'href="', '"', pos)[0]
+
+ pos = page.find('<div id="kscomicpart"')
+ if pos >= 0:
+ self._needle = '<a href="/archive.html'
+ return text.extract(page, 'href="', '"', pos)[0]
+
+ pos = page.find('>First Comic<') # twokinds
+ if pos >= 0:
+ self._image = '</header>'
+ self._needle = 'class="navarchive"'
+ return text.rextract(page, 'href="', '"', pos)[0]
+
+ pos = page.find('id="flip_FirstDay"') # flipside
+ if pos >= 0:
+ self._image = 'class="flip_Pages ksc"'
+ self._needle = 'id="flip_ArcButton"'
+ return text.rextract(page, 'href="', '"', pos)[0]
+
+ self.log.error("Unrecognized page layout")
+ return None
+
+ def _next_needle(self, page):
+ pos = page.index(self._needle) + len(self._needle)
+ return text.extract(page, 'href="', '"', pos)[0]
+
+ @staticmethod
+ def _next_link(page):
+ return text.extract(page, '<link rel="next" href="', '"')[0]
+
+ @staticmethod
+ def _next_id(page):
+ pos = page.find('id="next_')
+ return text.rextract(page, 'href="', '"', pos)[0] if pos >= 0 else None
+
+ @staticmethod
+ def _next_lastblood(page):
+ pos = page.index("link rel='next'")
+ return text.extract(page, "href='", "'", pos)[0]
+
+ @staticmethod
+ def _next_brawl(page):
+ pos = page.index("comic-nav-next")
+ url = text.rextract(page, 'href="', '"', pos)[0]
+ return None if "?random" in url else url
diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py
new file mode 100644
index 0000000..c9e6959
--- /dev/null
+++ b/gallery_dl/extractor/khinsider.py
@@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract soundtracks from https://downloads.khinsider.com/"""
+
+from .common import Extractor, Message, AsynchronousMixin
+from .. import text, exception
+
+
+class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
+ """Extractor for soundtracks from khinsider.com"""
+ category = "khinsider"
+ subcategory = "soundtrack"
+ directory_fmt = ("{category}", "{album}")
+ archive_fmt = "{album}_{filename}.{extension}"
+ pattern = (r"(?:https?://)?downloads\.khinsider\.com"
+ r"/game-soundtracks/album/([^/?&#]+)")
+ test = (("https://downloads.khinsider.com"
+ "/game-soundtracks/album/horizon-riders-wii"), {
+ "pattern": r"https?://\d+\.\d+\.\d+\.\d+/ost/horizon-riders-wii/[^/]+"
+ r"/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack\.mp3",
+ "count": 1,
+ "keyword": "b4f460c78dd23e1f1121f4ac784dd67ded7c2679",
+ })
+ root = "https://downloads.khinsider.com"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.album = match.group(1)
+
+ def items(self):
+ url = (self.root + "/game-soundtracks/album/" + self.album)
+ page = self.request(url, encoding="utf-8").text
+ data = self.get_job_metadata(page)
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for url, track in self.get_album_tracks(page):
+ track.update(data)
+ yield Message.Url, url, track
+
+ def get_job_metadata(self, page):
+ """Collect metadata for extractor-job"""
+ if "Download all songs at once:" not in page:
+ raise exception.NotFoundError("soundtrack")
+ data = text.extract_all(page, (
+ ("album", "Album name: <b>", "</b>"),
+ ("count", "Number of Files: <b>", "</b>"),
+ ("size" , "Total Filesize: <b>", "</b>"),
+ ("date" , "Date added: <b>", "</b>"),
+ ("type" , "Album type: <b>", "</b>"),
+ ))[0]
+ data["album"] = text.unescape(data["album"])
+ return data
+
+ def get_album_tracks(self, page):
+ """Collect url and metadata for all tracks of a soundtrack"""
+ page = text.extract(page, '<table id="songlist">', '</table>')[0]
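+ # each row links to a per-track page; the actual file URL
+ # has to be extracted from that second page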
+ for num, url in enumerate(text.extract_iter(
+ page, '<td class="clickable-row"><a href="', '"'), 1):
+ url = text.urljoin(self.root, url)
+ page = self.request(url, encoding="utf-8").text
+ url = text.extract(
+ page, '<p><a style="color: #21363f;" href="', '"')[0]
+ yield url, text.nameext_from_url(url, {"num": num})
diff --git a/gallery_dl/extractor/kissmanga.py b/gallery_dl/extractor/kissmanga.py
new file mode 100644
index 0000000..6314a94
--- /dev/null
+++ b/gallery_dl/extractor/kissmanga.py
@@ -0,0 +1,223 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters and entire manga from https://kissmanga.com/"""
+
+from .common import ChapterExtractor, MangaExtractor, Extractor
+from .. import text, aes, exception
+from ..cache import cache
+import hashlib
+import ast
+import re
+
+
+class RedirectMixin():
+ """Detect and handle redirects to CAPTCHA pages"""
+
+ def request(self, url, **kwargs):
+ while True:
+ response = Extractor.request(self, url, **kwargs)
+ if not response.history or "/AreYouHuman" not in response.url:
+ return response
+ if self.config("captcha", "stop") == "wait":
+ self.log.warning(
+ "Redirect to \n%s\nVisit this URL in your browser, solve "
+ "the CAPTCHA, and press ENTER to continue", response.url)
+ try:
+ input()
+ except (EOFError, OSError):
+ pass
+ else:
+ self.log.error(
+ "Redirect to \n%s\nVisit this URL in your browser and "
+ "solve the CAPTCHA to continue", response.url)
+ raise exception.StopExtraction()
+
+
+class KissmangaBase(RedirectMixin):
+ """Base class for kissmanga extractors"""
+ category = "kissmanga"
+ archive_fmt = "{chapter_id}_{page}"
+ root = "https://kissmanga.com"
+
+ @staticmethod
+ def parse_chapter_string(data):
+ """Parse 'chapter_string' value contained in 'data'"""
+ data["chapter_string"] = text.unescape(data["chapter_string"])
+
+ match = re.match((
+ r"(?:[Vv]ol\.0*(\d+) )?"
+ r"(?:[Cc]h\.)?0*(\d+)"
+ r"(?:[.:]0*(\d+))?"
+ r"(?: *[:-]? *(.+))?"
+ ), data["chapter_string"])
+
+ if not match:
+ match = re.match((
+ r".+?(?: -)? ()"
+ r"0*(\d+)(?:[Vv.]0*(\d+))?"
+ r"(?: *[:-]? *(.+))?"
+ ), data["chapter_string"])
+
+ if match:
+ volume, chapter, minor, title = match.groups()
+ else:
+ volume, chapter, minor, title = 0, 0, "", data["chapter_string"]
+
+ data["volume"] = text.parse_int(volume)
+ data["chapter"] = text.parse_int(chapter)
+ data["chapter_minor"] = "." + minor if minor else ""
+ data["title"] = title if title and title != "Read Online" else ""
+ return data
+
+
+class KissmangaChapterExtractor(KissmangaBase, ChapterExtractor):
+ """Extractor for manga-chapters from kissmanga.com"""
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com"
+ r"(/Manga/[^/?&#]+/[^/?&#]+\?id=(\d+))")
+ test = (
+ ("https://kissmanga.com/Manga/Dropout/Ch-000---Oneshot-?id=145847", {
+ "url": "46e63fd63e9e16f19bc1e6c7a45dc060815642fd",
+ "keyword": "1cd0b5214ac7ae4d53e2fd8fec40ceec84cd09bf",
+ }),
+ ("https://kissmanga.com/Manga/Urban-Tales/a?id=256717", {
+ "url": "c26be8bf9c2abacee2076979d021634092cf38f1",
+ "keyword": "e1d16780df8e04076ed2b5f0637c5b710ec2f2ea",
+ }),
+ ("https://kissmanga.com/Manga/Monster/Monster-79?id=7608", {
+ "count": 23,
+ "keyword": "f433a7a8fae840e17dace316a243fa27faab86de",
+ }),
+ ("https://kissmanga.com/Manga/Houseki-no-Kuni/Oneshot?id=404189", {
+ "count": 49,
+ "keyword": "d44d1b21d08e4dbf888b0c450a3f1bc919588b4f",
+ }),
+ ("https://kissmanga.com/mAnGa/mOnStEr/Monster-79?id=7608"),
+ )
+
+ def __init__(self, match):
+ ChapterExtractor.__init__(self, match)
+ self.chapter_id = match.group(2)
+ self.session.headers["Referer"] = self.root
+
+ def metadata(self, page):
+ title = text.extract(page, "<title>", "</title>")[0].strip()
+ manga, cinfo = title.split("\n")[1:3]
+ data = {
+ "manga": manga.strip(),
+ "chapter_string": cinfo.strip(),
+ "chapter_id": text.parse_int(self.chapter_id),
+ "lang": "en",
+ "language": "English",
+ }
+ return self.parse_chapter_string(data)
+
+ def images(self, page):
+ self.session.headers["Referer"] = None
+ try:
+ key = self.build_aes_key(page)
+ iv = (0xa5, 0xe8, 0xe2, 0xe9, 0xc2, 0x72, 0x1b, 0xe0,
+ 0xa8, 0x4a, 0xd6, 0x60, 0xc4, 0x72, 0xc1, 0xf3)
+ return [
+ (aes.aes_cbc_decrypt_text(
+ data, key, iv).partition("&")[0], None)
+ for data in text.extract_iter(
+ page, 'lstImages.push(wrapKA("', '"'
+ )
+ ]
+ except UnicodeDecodeError:
+ self.log.error("Failed to decrypt image URLs")
+ except (ValueError, IndexError):
+ self.log.error("Failed to get AES key")
+ return []
+
+ def build_aes_key(self, page):
+ chko = self._chko_from_external_script()
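+ # replay the obfuscated JavaScript statements that mutate 'chko';
+ # the AES key is the SHA-256 digest of the final string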
+
+ for script in self._scripts(page):
+ for stmt in [s.strip() for s in script.split(";")]:
+
+ if stmt.startswith("var _"):
+ name, _, value = stmt[4:].partition(" = ")
+ name += "[0]"
+ value = ast.literal_eval(value)[0]
+
+ elif stmt.startswith("chko = "):
+ stmt = stmt[7:]
+ if stmt == name:
+ chko = value
+ elif stmt == "chko + " + name:
+ chko = chko + value
+ elif stmt == name + " + chko":
+ chko = value + chko
+ else:
+ self.log.warning("unrecognized expression: '%s'", stmt)
+
+ elif stmt.startswith("key = "):
+ pass
+
+ else:
+ self.log.warning("unrecognized statement: '%s'", stmt)
+
+ return list(hashlib.sha256(chko.encode("ascii")).digest())
+
+ @staticmethod
+ def _scripts(page):
+ end = 0
+ while True:
+ pos = page.find("key = ", end)
+ if pos == -1:
+ return
+ beg = page.rindex('<script type="text/javascript">', 0, pos) + 31
+ end = page.index('</script>', pos)
+ yield page[beg:end]
+
+ @cache(maxage=3600)
+ def _chko_from_external_script(self):
+ script = self.request(self.root + "/Scripts/lo.js").text
+
+ pos = script.index("var chko")
+ var = text.extract(script, "=", "[", pos)[0].lstrip()
+ idx = text.extract(script, "[", "]", pos)[0]
+
+ pos = script.index(var)
+ lst = text.extract(script, "=", ";", pos)[0]
+ return ast.literal_eval(lst.strip())[int(idx)]
+
+
+class KissmangaMangaExtractor(KissmangaBase, MangaExtractor):
+ """Extractor for manga from kissmanga.com"""
+ chapterclass = KissmangaChapterExtractor
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com"
+ r"(/Manga/[^/?&#]+/?)$")
+ test = (
+ ("https://kissmanga.com/Manga/Dropout", {
+ "url": "9e3a6f715b229aa3fafa42a1d5da5d65614cb532",
+ "keyword": "32b09711c28b481845acc32e3bb6054cfc90224d",
+ }),
+ ("https://kissmanga.com/manga/feng-shen-ji"), # lowercase
+ )
+
+ def chapters(self, page):
+ results = []
+ manga, pos = text.extract(page, ' class="barTitle">', '\ninformation')
+ page , pos = text.extract(page, ' class="listing">', '</table>', pos)
+ manga = manga.strip()
+ needle = '" title="Read ' + manga + ' '
+ manga = text.unescape(manga)
+
+ for item in text.extract_iter(page, '<a href="', ' online">'):
+ url, _, chapter = item.partition(needle)
+ data = {
+ "manga": manga, "chapter_string": chapter,
+ "chapter_id": text.parse_int(url.rpartition("=")[2]),
+ "lang": "en", "language": "English",
+ }
+ self.parse_chapter_string(data)
+ results.append((self.root + url, data))
+ return results
diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py
new file mode 100644
index 0000000..8541e4f
--- /dev/null
+++ b/gallery_dl/extractor/komikcast.py
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters and entire manga from https://komikcast.com/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text
+import re
+
+
+class KomikcastBase():
+ """Base class for komikcast extractors"""
+ category = "komikcast"
+ root = "https://komikcast.com"
+
+ @staticmethod
+ def parse_chapter_string(chapter_string, data=None):
+ """Parse 'chapter_string' value and add its info to 'data'"""
+ if not data:
+ data = {}
+
+ match = re.match(
+ r"(?:(.*) Chapter )?0*(\d+)([^ ]*)(?: (?:- )?(.+))?",
+ text.unescape(chapter_string),
+ )
+ manga, chapter, data["chapter_minor"], title = match.groups()
+
+ if manga:
+ data["manga"] = manga.partition(" Chapter ")[0]
+ if title and title.lower() != "bahasa indonesia":
+ data["title"] = title.strip()
+ else:
+ data["title"] = ""
+ data["chapter"] = text.parse_int(chapter)
+ data["lang"] = "id"
+ data["language"] = "Indonesian"
+
+ return data
+
+
+class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
+ """Extractor for manga-chapters from komikcast.com"""
+ pattern = r"(?:https?://)?(?:www\.)?komikcast\.com(/chapter/[^/?&#]+/)"
+ test = (
+ (("https://komikcast.com/chapter/"
+ "apotheosis-chapter-02-2-bahasa-indonesia/"), {
+ "url": "f6b43fbc027697749b3ea1c14931c83f878d7936",
+ "keyword": "f3938e1aff9ad1f302f52447e9781b21f6da26d4",
+ }),
+ (("https://komikcast.com/chapter/"
+ "tonari-no-kashiwagi-san-chapter-18b/"), {
+ "url": "aff90dd21dbb945a726778b10bdef522af7c42fe",
+ "keyword": "19b5783864c4299913de436513b124b028b557c1",
+ }),
+ (("https://komikcast.com/chapter/090-eko-to-issho-chapter-1/"), {
+ "url": "cda104a32ea2b06b3d6b096726622f519ed1fa33",
+ }),
+ )
+
+ def metadata(self, page):
+ info = text.extract(page, '<b>', "</b>")[0]
+ return self.parse_chapter_string(info)
+
+ @staticmethod
+ def images(page):
+ readerarea = text.extract(
+ page, '<div id="readerarea">', '<div class="navig">')[0]
+ return [
+ (text.unescape(url), None)
+ for url in re.findall(r"<img[^>]* src=[\"']([^\"']+)", readerarea)
+ if "/Banner-" not in url
+ ]
+
+
+class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
+ """Extractor for manga from komikcast.com"""
+ chapterclass = KomikcastChapterExtractor
+ pattern = (r"(?:https?://)?(?:www\.)?komikcast\.com"
+ r"(/(?:komik/)?[^/?&#]+)/?$")
+ test = (
+ ("https://komikcast.com/komik/090-eko-to-issho/", {
+ "url": "dc798d107697d1f2309b14ca24ca9dba30c6600f",
+ "keyword": "837a7e96867344ff59d840771c04c20dc46c0ab1",
+ }),
+ ("https://komikcast.com/tonari-no-kashiwagi-san/"),
+ )
+
+ def chapters(self, page):
+ results = []
+ data = self.metadata(page)
+
+ for item in text.extract_iter(
+ page, '<span class="leftoff"><a href="', '</a>'):
+ url, _, chapter_string = item.rpartition('">Chapter ')
+ self.parse_chapter_string(chapter_string, data)
+ results.append((url, data.copy()))
+ return results
+
+ @staticmethod
+ def metadata(page):
+ """Return a dict with general metadata"""
+ manga , pos = text.extract(page, "<title>" , "</title>")
+ genres, pos = text.extract(page, ">Genres:", "</span>", pos)
+ author, pos = text.extract(page, ">Author:", "</span>", pos)
+ mtype , pos = text.extract(page, ">Type:" , "</span>", pos)
+
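+ # strip the trailing 12 characters (presumably " - Komikcast")
+ # from the page title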
+ return {
+ "manga": text.unescape(manga[:-12]),
+ "author": text.remove_html(author),
+ "genres": text.split_html(genres)[::2],
+ "type": text.remove_html(mtype),
+ }
diff --git a/gallery_dl/extractor/konachan.py b/gallery_dl/extractor/konachan.py
new file mode 100644
index 0000000..a9d8b3a
--- /dev/null
+++ b/gallery_dl/extractor/konachan.py
@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://konachan.com/"""
+
+from . import booru
+
+
+class KonachanExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
+ """Base class for konachan extractors"""
+ category = "konachan"
+
+ def __init__(self, match):
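+ # build API and post URLs for whichever TLD (.com or .net) matched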
+ root = "https://konachan." + match.group("tld")
+ self.api_url = root + "/post.json"
+ self.post_url = root + "/post/show/{}"
+ super().__init__(match)
+
+
+class KonachanTagExtractor(booru.TagMixin, KonachanExtractor):
+ """Extractor for images from konachan.com based on search-tags"""
+ pattern = (r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
+ r"/post\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)")
+ test = (
+ ("https://konachan.com/post?tags=patata", {
+ "content": "838cfb815e31f48160855435655ddf7bfc4ecb8d",
+ }),
+ ("https://konachan.net/post?tags=patata"),
+ )
+
+
+class KonachanPoolExtractor(booru.PoolMixin, KonachanExtractor):
+ """Extractor for image-pools from konachan.com"""
+ pattern = (r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
+ r"/pool/show/(?P<pool>\d+)")
+ test = (
+ ("https://konachan.com/pool/show/95", {
+ "content": "cf0546e38a93c2c510a478f8744e60687b7a8426",
+ }),
+ ("https://konachan.net/pool/show/95"),
+ )
+
+
+class KonachanPostExtractor(booru.PostMixin, KonachanExtractor):
+ """Extractor for single images from konachan.com"""
+ pattern = (r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
+ r"/post/show/(?P<post>\d+)")
+ test = (
+ ("https://konachan.com/post/show/205189", {
+ "content": "674e75a753df82f5ad80803f575818b8e46e4b65",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "patata",
+ "tags_character": "clownpiece",
+ "tags_copyright": "touhou",
+ "tags_general": str,
+ },
+ }),
+ ("https://konachan.net/post/show/205189"),
+ )
+
+
+class KonachanPopularExtractor(booru.MoebooruPopularMixin, KonachanExtractor):
+ """Extractor for popular images from konachan.com"""
+ pattern = (r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
+ r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)"
+ r"(?:\?(?P<query>[^#]*))?")
+ test = (
+ ("https://konachan.com/post/popular_by_month?month=11&year=2010", {
+ "count": 20,
+ }),
+ ("https://konachan.com/post/popular_recent"),
+ ("https://konachan.net/post/popular_recent"),
+ )
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.api_url = (
+ "https://konachan.{tld}/post/popular_{scale}.json".format(
+ tld=match.group("tld"), scale=self.scale))
diff --git a/gallery_dl/extractor/livedoor.py b/gallery_dl/extractor/livedoor.py
new file mode 100644
index 0000000..ed72f4c
--- /dev/null
+++ b/gallery_dl/extractor/livedoor.py
@@ -0,0 +1,156 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for http://blog.livedoor.jp/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class LivedoorExtractor(Extractor):
+ """Base class for livedoor extractors"""
+ category = "livedoor"
+ root = "http://blog.livedoor.jp"
+ filename_fmt = "{post[id]}_{post[title]}_{num:>02}.{extension}"
+ directory_fmt = ("{category}", "{post[user]}")
+ archive_fmt = "{post[id]}_{hash}"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user = match.group(1)
+
+ def items(self):
+ yield Message.Version, 1
+ for post in self.posts():
+ images = self._images(post)
+ if images:
+ yield Message.Directory, {"post": post}
+ for image in images:
+ yield Message.Url, image["url"], image
+
+ def posts(self):
+ """Return an iterable with post objects"""
+
+ def _load(self, data, body):
+ extr = text.extract_from(data)
+ tags = text.extract(body, '</dt><dd>', '</dl>')[0]
+
+ return {
+ "id" : text.parse_int(extr("id : '", "'")),
+ "title" : text.unescape(extr("title : '", "'")),
+ "categories": [extr("name:'", "'"), extr("name:'", "'")],
+ "date" : text.parse_datetime(
+ extr("date : '", "'"), "%Y-%m-%d %H:%M:%S"),
+ "tags" : text.split_html(tags),
+ "user" : self.user,
+ "body" : body,
+ }
+
+ def _images(self, post):
+ imgs = []
+ body = post.pop("body")
+
+ for num, img in enumerate(text.extract_iter(body, "<img ", ">"), 1):
+ src = text.extract(img, 'src="', '"')[0]
+ alt = text.extract(img, 'alt="', '"')[0]
+
+ if not src:
+ continue
+ if "://livedoor.blogimg.jp/" in src:
+ url = src.replace("-s.", ".")
+ else:
+ url = text.urljoin(self.root, src)
+ name, _, ext = url.rpartition("/")[2].rpartition(".")
+
+ imgs.append({
+ "url" : url,
+ "num" : num,
+ "hash" : name,
+ "filename" : alt or name,
+ "extension": ext,
+ "post" : post,
+ })
+
+ return imgs
+
+
+class LivedoorBlogExtractor(LivedoorExtractor):
+ """Extractor for a user's blog on blog.livedoor.jp"""
+ subcategory = "blog"
+ pattern = r"(?:https?://)?blog\.livedoor\.jp/(\w+)/?(?:$|[?&#])"
+ test = (
+ ("http://blog.livedoor.jp/zatsu_ke/", {
+ "range": "1-50",
+ "count": 50,
+ "pattern": r"https?://livedoor.blogimg.jp/\w+/imgs/\w/\w/\w+\.\w+",
+ "keyword": {
+ "post": {
+ "categories": list,
+ "date": "type:datetime",
+ "id": int,
+ "tags": list,
+ "title": str,
+ "user": "zatsu_ke"
+ },
+ "filename": str,
+ "hash": r"re:\w{4,}",
+ "num": int,
+ },
+ }),
+ ("http://blog.livedoor.jp/uotapo/", {
+ "range": "1-5",
+ "count": 5,
+ }),
+ )
+
+ def posts(self):
+ url = "{}/{}".format(self.root, self.user)
+
+ while url:
+ extr = text.extract_from(self.request(url).text)
+ while True:
+ data = extr('.articles.push(', ');')
+ if not data:
+ break
+ body = extr('class="article-body-inner">',
+ 'class="article-footer">')
+ yield self._load(data, body)
+ url = extr('<a rel="next" href="', '"')
+
+
+class LivedoorPostExtractor(LivedoorExtractor):
+ """Extractor for images from a blog post on blog.livedoor.jp"""
+ subcategory = "post"
+ pattern = r"(?:https?://)?blog\.livedoor\.jp/(\w+)/archives/(\d+)"
+ test = (
+ ("http://blog.livedoor.jp/zatsu_ke/archives/51493859.html", {
+ "url": "8826fe623f19dc868e7538e8519bf8491e92a0a2",
+ "keyword": "52fcba9253a000c339bcd658572d252e282626af",
+ }),
+ ("http://blog.livedoor.jp/amaumauma/archives/7835811.html", {
+ "url": "fc1d6a9557245b5a27d3a10bf0fa9922ef377215",
+ "keyword": "0229072abb5cd8a221df72e0ffdfc13336c0e9ce",
+ }),
+ ("http://blog.livedoor.jp/uotapo/archives/1050616939.html", {
+ "url": "3f3581807ec4776e6a67ed7985a22494d4bc4904",
+ "keyword": "2eb3e383c68e909c4dd3d563c16d0b6e2fe6627b",
+ }),
+ )
+
+ def __init__(self, match):
+ LivedoorExtractor.__init__(self, match)
+ self.post_id = match.group(2)
+
+ def posts(self):
+ url = "{}/{}/archives/{}.html".format(
+ self.root, self.user, self.post_id)
+ extr = text.extract_from(self.request(url).text)
+ data = extr('articles :', '</script>')
+ body = extr('class="article-body-inner">',
+ 'class="article-footer">')
+ return (self._load(data, body),)
diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py
new file mode 100644
index 0000000..65ae843
--- /dev/null
+++ b/gallery_dl/extractor/luscious.py
@@ -0,0 +1,208 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://luscious.net/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text, exception
+from ..cache import cache
+
+
+class LusciousBase(Extractor):
+ """Base class for luscious extractors"""
+ category = "luscious"
+ cookiedomain = ".luscious.net"
+ root = "https://members.luscious.net"
+
+ def login(self):
+ """Login and set necessary cookies"""
+ username, password = self._get_auth_info()
+ if username:
+ self._update_cookies(self._login_impl(username, password))
+
+ @cache(maxage=14*24*3600, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+ url = "https://members.luscious.net/accounts/login/"
+ headers = {"Referer": "https://members.luscious.net/login/"}
+ data = {
+ "login": username,
+ "password": password,
+ "remember": "on",
+ "next": "/",
+ }
+
+ response = self.request(url, method="POST", headers=headers, data=data)
+ if "/accounts/login/" in response.url or not response.history:
+ raise exception.AuthenticationError()
+ for cookie in response.history[0].cookies:
+ if cookie.name.startswith("sessionid_"):
+ return {cookie.name: cookie.value}
+ raise exception.AuthenticationError()
+
+ @staticmethod
+ def _parse_tags(tags):
+ return [
+ text.unescape(tag.replace(":_", ":"))
+ for tag in text.extract_iter(tags or "", "/tags/", "/")
+ ]
+
+
+class LusciousAlbumExtractor(LusciousBase, GalleryExtractor):
+ """Extractor for image albums from luscious.net"""
+ subcategory = "album"
+ archive_fmt = "{gallery_id}_{image_id}"
+ pattern = (r"(?:https?://)?(?:www\.|members\.)?luscious\.net"
+ r"/(?:albums|pictures/c/[^/?&#]+/album)/([^/?&#]+_(\d+))")
+ test = (
+ ("https://luscious.net/albums/okinami-no-koigokoro_277031/", {
+ "url": "7e4984a271a1072ac6483e4228a045895aff86f3",
+ "keyword": "c597c132834f4990f90bf5dee5de2a9d4ba263a4",
+ "content": "b3a747a6464509440bd0ff6d1267e6959f8d6ff3",
+ }),
+ ("https://luscious.net/albums/virgin-killer-sweater_282582/", {
+ "url": "21cc68a7548f4d71dfd67d8caf96349dde7e791c",
+ "keyword": "e1202078b504adeccd521aa932f456a5a85479a0",
+ }),
+ ("https://luscious.net/albums/not-found_277035/", {
+ "exception": exception.NotFoundError,
+ }),
+ ("https://members.luscious.net/albums/login-required_323871/", {
+ "options": (("username", None),),
+ "exception": exception.AuthorizationError,
+ }),
+ ("https://www.luscious.net/albums/okinami_277031/"),
+ ("https://members.luscious.net/albums/okinami_277031/"),
+ ("https://luscious.net/pictures/c/video_game_manga/album"
+ "/okinami-no-koigokoro_277031/sorted/position/id/16528978/@_1"),
+ )
+
+ def __init__(self, match):
+ path, self.gallery_id = match.groups()
+ url = "{}/albums/{}/".format(self.root, path)
+ GalleryExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ pos = page.find("<h1>404 Not Found</h1>")
+ if pos >= 0:
+ msg = text.extract(page, '<div class="content">', '</div>', pos)[0]
+ if msg and "content is not available" in msg:
+ raise exception.AuthorizationError()
+ raise exception.NotFoundError("album")
+
+ title, pos = text.extract(page, '"og:title" content="', '"')
+ info , pos = text.extract(page, '<li class="user_info">', "", pos)
+ if info is None:
+ count, pos = text.extract(page, '>Pages:', '<', pos)
+ else:
+ count, pos = text.extract(page, '<p>', ' ', pos)
+ genre, pos = text.extract(page, '<p>Genre:', '</p>', pos)
+ adnce, pos = text.extract(page, '<p>Audience:', '</p>', pos)
+ tags , pos = text.extract(page, '"tag_list static">', '</ol>', pos)
+
+ return {
+ "gallery_id": text.parse_int(self.gallery_id),
+ "title" : text.unescape(title or ""),
+ "count" : text.parse_int(count),
+ "genre" : text.remove_html(genre),
+ "audience" : text.remove_html(adnce),
+ "tags" : self._parse_tags(tags),
+ }
+
+ def images(self, page):
+ extr = text.extract
+
+ url = "{}/pictures/album/x_{}/sorted/old/page/1/".format(
+ self.root, self.gallery_id)
+ page = self.request(url).text
+ pos = page.find('<div id="picture_page_')
+ url = extr(page, '<a href="', '"', pos)[0]
+ iurl = None
+
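+ # follow each picture page's "next" link; the chain ends when
+ # it points at a "/more_like_this/" page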
+ while url and not url.endswith("/more_like_this/"):
+ page = self.request(self.root + url).text
+
+ if not iurl: # first loop iteration
+ current = extr(page, '"pj_current_page" value="', '"')[0]
+ if current and current != "1":
+ url = "{}/albums/{}/jump_to_page/1/".format(
+ self.root, self.gallery_id)
+ page = self.request(url, method="POST").text
+
+ iid , pos = extr(url , '/id/', '/')
+ url , pos = extr(page, '<link rel="next" href="', '"')
+ name, pos = extr(page, '<h1 id="picture_title">', '</h1>', pos)
+ _ , pos = extr(page, '<ul class="image_option_icons">', '', pos)
+ iurl, pos = extr(page, '<li><a href="', '"', pos+100)
+
+ if iurl[0] == "/":
+ iurl = text.urljoin(self.root, iurl)
+
+ yield iurl, {
+ "name": name,
+ "image_id": text.parse_int(iid),
+ }
+
+
+class LusciousSearchExtractor(LusciousBase, Extractor):
+ """Extractor for album searches on luscious.net"""
+ subcategory = "search"
+ pattern = (r"(?:https?://)?(?:www\.|members\.)?luscious\.net"
+ r"/(albums(?:/(?![^/?&#]+_\d+)[^/?&#]+)+|manga|pictures)/?$")
+ test = (
+ ("https://luscious.net/manga/"),
+ ("https://members.luscious.net/albums/sorted/updated/album_type/manga"
+ "/content_id/2/tagged/+full_color/page/1/", {
+ "pattern": LusciousAlbumExtractor.pattern,
+ "range": "20-40",
+ "count": 21,
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.path = match.group(1).partition("/page/")[0]
+ if not self.path.startswith("albums/"):
+ self.path = "albums/sorted/updated/album_type/" + self.path
+
+ def items(self):
+ self.login()
+ yield Message.Version, 1
+ for album in self.albums():
+ url, data = self.parse_album(album)
+ yield Message.Queue, url, data
+
+ def albums(self, pnum=1):
+ while True:
+ url = "{}/{}/page/{}/.json/".format(self.root, self.path, pnum)
+ data = self.request(url).json()
+
+ yield from text.extract_iter(
+ data["html"], "<figcaption>", "</figcaption>")
+
+ if data["paginator_complete"]:
+ return
+ pnum += 1
+
+ def parse_album(self, album):
+ url , pos = text.extract(album, 'href="', '"')
+ title, pos = text.extract(album, ">", "<", pos)
+ count, pos = text.extract(album, "# of pictures:", "<", pos)
+ date , pos = text.extract(album, "Updated:&nbsp;", "<", pos)
+ desc , pos = text.extract(album, "class='desc'>", "<", pos)
+ tags , pos = text.extract(album, "<ol ", "</ol>", pos)
+
+ return text.urljoin(self.root, url), {
+ "title": text.unescape(title or ""),
+ "description": text.unescape(desc or ""),
+ "gallery_id": text.parse_int(url.rpartition("_")[2].rstrip("/")),
+ "count": text.parse_int(count),
+ "date": date,
+ "tags": self._parse_tags(tags),
+ "_extractor": LusciousAlbumExtractor,
+ }
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
new file mode 100644
index 0000000..d0eb2a9
--- /dev/null
+++ b/gallery_dl/extractor/mangadex.py
@@ -0,0 +1,180 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters and entire manga from https://mangadex.org/"""
+
+from .common import Extractor, Message
+from .. import text, util
+from ..cache import memcache
+
+
+class MangadexExtractor(Extractor):
+ """Base class for mangadex extractors"""
+ category = "mangadex"
+ root = "https://mangadex.org"
+
+ # mangadex-to-iso639-1 codes
+ iso639_map = {
+ "br": "pt",
+ "ct": "ca",
+ "gb": "en",
+ "vn": "vi",
+ }
+
+ def chapter_data(self, chapter_id):
+ """Request API results for 'chapter_id'"""
+ url = "{}/api/chapter/{}".format(self.root, chapter_id)
+ return self.request(url).json()
+
+ @memcache(keyarg=1)
+ def manga_data(self, manga_id):
+ """Request API results for 'manga_id'"""
+ url = "{}/api/manga/{}".format(self.root, manga_id)
+ return self.request(url).json()
+
+
+class MangadexChapterExtractor(MangadexExtractor):
+ """Extractor for manga-chapters from mangadex.org"""
+ subcategory = "chapter"
+ directory_fmt = (
+ "{category}", "{manga}",
+ "{volume:?v/ />02}c{chapter:>03}{chapter_minor}{title:?: //}")
+ filename_fmt = (
+ "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
+ archive_fmt = "{chapter_id}_{page}"
+ pattern = r"(?:https?://)?(?:www\.)?mangadex\.(?:org|com)/chapter/(\d+)"
+ test = (
+ ("https://mangadex.org/chapter/122094", {
+ "keyword": "1c834dca33025f521e1874aee1f71c51e28ebf99",
+ "content": "7ab3bef5caccb62b881f8e6e70359d3c7be8137f",
+ }),
+ # oneshot
+ ("https://mangadex.org/chapter/138086", {
+ "count": 64,
+ "keyword": "178777bd0352fb19eb934cbee5630d16e3fb60ab",
+ }),
+ )
+
+ def __init__(self, match):
+ MangadexExtractor.__init__(self, match)
+ self.chapter_id = match.group(1)
+ self.data = None
+
+ def items(self):
+ data = self.metadata()
+ imgs = self.images()
+ data["count"] = len(imgs)
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for data["page"], url in enumerate(imgs, 1):
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+ def metadata(self):
+ """Return a dict with general metadata"""
+ cdata = self.chapter_data(self.chapter_id)
+ mdata = self.manga_data(cdata["manga_id"])
+ self.data = cdata
+
+ chapter, sep, minor = cdata["chapter"].partition(".")
+ return {
+ "manga": mdata["manga"]["title"],
+ "manga_id": cdata["manga_id"],
+ "artist": mdata["manga"]["artist"],
+ "author": mdata["manga"]["author"],
+ "title": text.unescape(cdata["title"]),
+ "volume": text.parse_int(cdata["volume"]),
+ "chapter": text.parse_int(chapter),
+ "chapter_minor": sep + minor,
+ "chapter_id": cdata["id"],
+ "group": mdata["chapter"][self.chapter_id]["group_name"],
+ "date": cdata["timestamp"],
+ "lang": util.language_to_code(cdata["lang_name"]),
+ "language": cdata["lang_name"],
+ }
+
+ def images(self):
+ """Return a list of all image URLs"""
+ base = self.data["server"] + self.data["hash"] + "/"
+ if base.startswith("/"):
+ base = text.urljoin(self.root, base)
+ return [base + page for page in self.data["page_array"]]
+
+
+class MangadexMangaExtractor(MangadexExtractor):
+ """Extractor for manga from mangadex.org"""
+ subcategory = "manga"
+ categorytransfer = True
+ pattern = (r"(?:https?://)?(?:www\.)?mangadex\.(?:org|com)"
+ r"/(?:title|manga)/(\d+)")
+ test = (
+ ("https://mangadex.org/manga/2946/souten-no-koumori", {
+ "pattern": r"https://mangadex.org/chapter/\d+",
+ "keywords": {
+ "manga": "Souten no Koumori",
+ "manga_id": 2946,
+ "title": "Oneshot",
+ "volume": 0,
+ "chapter": 0,
+ "chapter_minor": "",
+ "chapter_id": int,
+ "group": str,
+ "date": int,
+ "lang": str,
+ "language": str,
+ },
+ }),
+ ("https://mangadex.org/manga/13318/dagashi-kashi/chapters/2/", {
+ "count": ">= 100",
+ }),
+ ("https://mangadex.org/title/13004/yorumori-no-kuni-no-sora-ni", {
+ "count": 0,
+ }),
+ ("https://mangadex.org/title/2946/souten-no-koumori"),
+ )
+
+ def __init__(self, match):
+ MangadexExtractor.__init__(self, match)
+ self.manga_id = text.parse_int(match.group(1))
+
+ def items(self):
+ yield Message.Version, 1
+ for data in self.chapters():
+ url = "{}/chapter/{}".format(self.root, data["chapter_id"])
+ yield Message.Queue, url, data
+
+ def chapters(self):
+ """Return a sorted list of chapter-metadata dicts"""
+ data = self.manga_data(self.manga_id)
+ if "chapter" not in data:
+ return ()
+ manga = data["manga"]
+
+ results = []
+ for chid, info in data["chapter"].items():
+ chapter, sep, minor = info["chapter"].partition(".")
+ lang = self.iso639_map.get(info["lang_code"], info["lang_code"])
+ results.append({
+ "manga": manga["title"],
+ "manga_id": self.manga_id,
+ "artist": manga["artist"],
+ "author": manga["author"],
+ "title": text.unescape(info["title"]),
+ "volume": text.parse_int(info["volume"]),
+ "chapter": text.parse_int(chapter),
+ "chapter_minor": sep + minor,
+ "chapter_id": text.parse_int(chid),
+ "group": text.unescape(info["group_name"]),
+ "date": info["timestamp"],
+ "lang": lang,
+ "language": util.code_to_language(lang),
+ "_extractor": MangadexChapterExtractor,
+ })
+
+ results.sort(key=lambda x: (x["chapter"], x["chapter_minor"]))
+ return results
diff --git a/gallery_dl/extractor/mangafox.py b/gallery_dl/extractor/mangafox.py
new file mode 100644
index 0000000..1b8a4a6
--- /dev/null
+++ b/gallery_dl/extractor/mangafox.py
@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters and entire manga from https://fanfox.net/"""
+
+from .common import ChapterExtractor
+from .. import text
+
+
+class MangafoxChapterExtractor(ChapterExtractor):
+ """Extractor for manga-chapters from fanfox.net"""
+ category = "mangafox"
+ pattern = (r"(?:https?://)?(?:www\.|m\.)?(?:mangafox\.me|fanfox\.net)"
+ r"(/manga/[^/]+/((?:v(\d+)/)?c(\d+)([^/?&#]*)))")
+ test = (
+ ("http://fanfox.net/manga/kidou_keisatsu_patlabor/v05/c006.2/1.html", {
+ "keyword": "5661dab258d42d09d98f194f7172fb9851a49766",
+ "content": "5c50c252dcf12ffecf68801f4db8a2167265f66c",
+ }),
+ ("http://mangafox.me/manga/kidou_keisatsu_patlabor/v05/c006.2/"),
+ )
+ root = "https://m.fanfox.net"
+
+ def __init__(self, match):
+ base, self.cstr, self.volume, self.chapter, self.minor = match.groups()
+ self.urlbase = self.root + base
+ ChapterExtractor.__init__(self, match, self.urlbase + "/1.html")
+
+ def metadata(self, page):
+ manga, pos = text.extract(page, "<title>", "</title>")
+ count, pos = text.extract(
+ page, ">", "<", page.find("</select>", pos) - 20)
+ sid , pos = text.extract(page, "var series_id =", ";", pos)
+ cid , pos = text.extract(page, "var chapter_id =", ";", pos)
+
+ return {
+ "manga": text.unescape(manga),
+ "volume": text.parse_int(self.volume),
+ "chapter": text.parse_int(self.chapter),
+ "chapter_minor": self.minor or "",
+ "chapter_string": self.cstr,
+ "count": text.parse_int(count),
+ "sid": text.parse_int(sid),
+ "cid": text.parse_int(cid),
+ }
+
+ def images(self, page):
+ pnum = 1
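+ # each mobile reader page embeds two images; grab both,
+ # then advance the page number by two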
+ while True:
+ url, pos = text.extract(page, '<img src="', '"')
+ yield url, None
+ url, pos = text.extract(page, ' src="', '"', pos)
+ yield url, None
+
+ pnum += 2
+ page = self.request("{}/{}.html".format(self.urlbase, pnum)).text
diff --git a/gallery_dl/extractor/mangahere.py b/gallery_dl/extractor/mangahere.py
new file mode 100644
index 0000000..e15acbe
--- /dev/null
+++ b/gallery_dl/extractor/mangahere.py
@@ -0,0 +1,138 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters and entire manga from https://www.mangahere.cc/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text
+import re
+
+
+class MangahereBase():
+ """Base class for mangahere extractors"""
+ category = "mangahere"
+ root = "https://www.mangahere.cc"
+ mobile_root = "https://m.mangahere.cc"
+ url_fmt = mobile_root + "/manga/{}/{}.html"
+
+
+class MangahereChapterExtractor(MangahereBase, ChapterExtractor):
+ """Extractor for manga-chapters from mangahere.cc"""
+ pattern = (r"(?:https?://)?(?:www\.|m\.)?mangahere\.c[co]/manga/"
+ r"([^/]+(?:/v0*(\d+))?/c([^/?&#]+))")
+ test = (
+ ("https://www.mangahere.cc/manga/dongguo_xiaojie/c004.2/", {
+ "keyword": "7c98d7b50a47e6757b089aa875a53aa970cac66f",
+ "content": "708d475f06893b88549cbd30df1e3f9428f2c884",
+ }),
+ ("http://www.mangahere.co/manga/dongguo_xiaojie/c003.2/"),
+ ("http://m.mangahere.co/manga/dongguo_xiaojie/c003.2/"),
+ )
+
+ def __init__(self, match):
+ self.part, self.volume, self.chapter = match.groups()
+ url = self.url_fmt.format(self.part, 1)
+ ChapterExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ pos = page.index("</select>")
+ count , pos = text.extract(page, ">", "<", pos - 20)
+ manga_id , pos = text.extract(page, "series_id = ", ";", pos)
+ chapter_id, pos = text.extract(page, "chapter_id = ", ";", pos)
+ manga , pos = text.extract(page, '"name":"', '"', pos)
+ chapter, dot, minor = self.chapter.partition(".")
+
+ return {
+ "manga": text.unescape(manga),
+ "manga_id": text.parse_int(manga_id),
+ "title": self._get_title(),
+ "volume": text.parse_int(self.volume),
+ "chapter": text.parse_int(chapter),
+ "chapter_minor": dot + minor,
+ "chapter_id": text.parse_int(chapter_id),
+ "count": text.parse_int(count),
+ "lang": "en",
+ "language": "English",
+ }
+
+ def images(self, page):
+ pnum = 1
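+ # mobile pages embed two images each; step through them in pairs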
+
+ while True:
+ url, pos = text.extract(page, '<img src="', '"')
+ yield url, None
+ url, pos = text.extract(page, ' src="', '"', pos)
+ yield url, None
+ pnum += 2
+ page = self.request(self.url_fmt.format(self.part, pnum)).text
+
+ def _get_title(self):
+ url = "{}/manga/{}/".format(self.root, self.part)
+ page = self.request(url).text
+
+ try:
+ pos = page.index(self.part) + len(self.part)
+ pos = page.index(self.part, pos) + len(self.part)
+ return text.extract(page, ' title="', '"', pos)[0]
+ except ValueError:
+ return ""
+
+
+class MangahereMangaExtractor(MangahereBase, MangaExtractor):
+ """Extractor for manga from mangahere.cc"""
+ chapterclass = MangahereChapterExtractor
+ pattern = (r"(?:https?://)?(?:www\.|m\.)?mangahere\.c[co]"
+ r"(/manga/[^/]+)/?(?:#.*)?$")
+ test = (
+ ("https://www.mangahere.cc/manga/aria/", {
+ "url": "23ad9256f7392de5973b79a36f6875e9fdcb7563",
+ "keyword": "79e326641e7d5d2fed43a1eb9949471b8162a9e0",
+ }),
+ ("https://www.mangahere.cc/manga/hiyokoi/#50", {
+ "url": "654850570aa03825cd57e2ae2904af489602c523",
+ "keyword": "c8084d89a9ea6cf40353093669f9601a39bf5ca2",
+ }),
+ ("https://www.mangahere.co/manga/aria/"),
+ ("https://m.mangahere.co/manga/aria/"),
+ )
+
+ def chapters(self, page):
+ results = []
+ manga, pos = text.extract(page, '<meta name="og:title" content="', '"')
+ manga = text.unescape(manga)
+
+ page = text.extract(
+ page, 'id="chapterlist"', 'class="detail-main-list-more"', pos)[0]
+ pos = 0
+ while True:
+ url, pos = text.extract(page, ' href="', '"', pos)
+ if not url:
+ return results
+ info, pos = text.extract(page, 'class="title3">', '<', pos)
+ date, pos = text.extract(page, 'class="title2">', '<', pos)
+
+ match = re.match(
+ r"(?:Vol\.0*(\d+) )?Ch\.0*(\d+)(\S*)(?: - (.*))?", info)
+ if match:
+ volume, chapter, minor, title = match.groups()
+ else:
+ chapter, _, minor = url[:-1].rpartition("/c")[2].partition(".")
+ minor = "." + minor
+ volume = 0
+ title = ""
+
+ results.append((text.urljoin(self.root, url), {
+ "manga": manga,
+ "title": text.unescape(title) if title else "",
+ "volume": text.parse_int(volume),
+ "chapter": text.parse_int(chapter),
+ "chapter_minor": minor,
+ "date": date,
+ "lang": "en",
+ "language": "English",
+ }))
diff --git a/gallery_dl/extractor/mangapanda.py b/gallery_dl/extractor/mangapanda.py
new file mode 100644
index 0000000..18ef005
--- /dev/null
+++ b/gallery_dl/extractor/mangapanda.py
@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters and entire manga from https://www.mangapanda.com/"""
+
+from .mangareader import MangareaderMangaExtractor, MangareaderChapterExtractor
+
+
+class MangapandaBase():
+ """Base class for mangapanda extractors"""
+ category = "mangapanda"
+ root = "https://www.mangapanda.com"
+
+
+class MangapandaChapterExtractor(MangapandaBase, MangareaderChapterExtractor):
+ """Extractor for manga-chapters from mangapanda.com"""
+ pattern = r"(?:https?://)?(?:www\.)?mangapanda\.com((/[^/?&#]+)/(\d+))"
+ test = ("https://www.mangapanda.com/red-storm/2", {
+ "url": "1f633f776e950531ba9b1e81965316458e785261",
+ "keyword": "b24df4b9cc36383fb6a44e06d32a3884a4dcb5fb",
+ })
+
+
+class MangapandaMangaExtractor(MangapandaBase, MangareaderMangaExtractor):
+ """Extractor for manga from mangapanda.com"""
+ chapterclass = MangapandaChapterExtractor
+ pattern = r"(?:https?://)?(?:www\.)?mangapanda\.com(/[^/?&#]+)/?$"
+ test = ("https://www.mangapanda.com/mushishi", {
+ "url": "357f965732371cac1990fee8b480f62e29141a42",
+ "keyword": "031b3ea085921c552de017ecbb9b906e462229c9",
+ })
diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py
new file mode 100644
index 0000000..ee11231
--- /dev/null
+++ b/gallery_dl/extractor/mangapark.py
@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters and entire manga from https://mangapark.me/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text, exception
+import json
+
+
+class MangaparkBase():
+ """Base class for mangapark extractors"""
+ category = "mangapark"
+ root_fmt = "https://mangapark.{}"
+
+ @staticmethod
+ def parse_chapter_path(path, data):
+ """Get volume/chapter information from url-path of a chapter"""
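+        # Illustrative example (not from the source): a path such as
+        # "/title/s2/v03/c027.5" would set stream=2, volume=3,
+        # chapter=27, and chapter_minor=".5".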
+ data["volume"], data["chapter_minor"] = 0, ""
+ for part in path.split("/")[1:]:
+ key, value = part[0], part[1:]
+ if key == "c":
+ chapter, dot, minor = value.partition(".")
+ data["chapter"] = text.parse_int(chapter)
+ data["chapter_minor"] = dot + minor
+ elif key == "i":
+ data["chapter_id"] = text.parse_int(value)
+ elif key == "v":
+ data["volume"] = text.parse_int(value)
+ elif key == "s":
+ data["stream"] = text.parse_int(value)
+ elif key == "e":
+ data["chapter_minor"] = "v" + value
+
+
+class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor):
+ """Extractor for manga-chapters from mangapark.me"""
+ pattern = (r"(?:https?://)?(?:www\.)?mangapark\.(me|net|com)"
+ r"/manga/([^?&#]+/i\d+)")
+ test = (
+ ("https://mangapark.me/manga/gosu/i811615/c55/1", {
+ "count": 50,
+ "keyword": "373d678048d29492f9763743ccaa9b6d840f17cf",
+ }),
+ (("https://mangapark.me/manga"
+ "/ad-astra-per-aspera-hata-kenjirou/i662054/c001.2/1"), {
+ "count": 40,
+ "keyword": "8e9cce4ed0e25d12a45e02f840d6f32ef838e257",
+ }),
+ ("https://mangapark.me/manga/gekkan-shoujo-nozaki-kun/i655476/c70/1", {
+ "count": 15,
+ "keyword": "19f730617074d65f91c0781f429de324890925bf",
+ }),
+ ("https://mangapark.net/manga/gosu/i811615/c55/1"),
+ ("https://mangapark.com/manga/gosu/i811615/c55/1"),
+ )
+
+ def __init__(self, match):
+ tld, self.path = match.groups()
+ self.root = self.root_fmt.format(tld)
+ url = "{}/manga/{}?zoom=2".format(self.root, self.path)
+ ChapterExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ data = text.extract_all(page, (
+ ("manga_id" , "var _manga_id = '", "'"),
+ ("chapter_id", "var _book_id = '", "'"),
+ ("stream" , "var _stream = '", "'"),
+ ("path" , "var _book_link = '", "'"),
+ ("manga" , "<h2>", "</h2>"),
+ ("title" , "</a>", "<"),
+ ), values={"lang": "en", "language": "English"})[0]
+
+ if not data["path"]:
+ raise exception.NotFoundError("chapter")
+ self.parse_chapter_path(data["path"], data)
+
+ data["manga"], _, data["type"] = data["manga"].rpartition(" ")
+ data["manga"] = text.unescape(data["manga"])
+ data["title"] = data["title"].partition(": ")[2]
+ for key in ("manga_id", "chapter_id", "stream"):
+ data[key] = text.parse_int(data[key])
+
+ return data
+
+ def images(self, page):
+ data = json.loads(text.extract(
+ page, "var _load_pages =", ";")[0] or "[]")
+ return [
+ (text.urljoin(self.root, item["u"]), {
+ "width": text.parse_int(item["w"]),
+ "height": text.parse_int(item["h"]),
+ })
+ for item in data
+ ]
+
+
+class MangaparkMangaExtractor(MangaparkBase, MangaExtractor):
+ """Extractor for manga from mangapark.me"""
+ chapterclass = MangaparkChapterExtractor
+ pattern = (r"(?:https?://)?(?:www\.)?mangapark\.(me|net|com)"
+ r"(/manga/[^/?&#]+)/?$")
+ test = (
+ ("https://mangapark.me/manga/aria", {
+ "url": "a58be23ef3874fe9705b0b41dd462b67eaaafd9a",
+ "keyword": "b3b5a30aa2a326bc0ca8b74c65b5ecd4bf676ebf",
+ }),
+ ("https://mangapark.net/manga/aria"),
+ ("https://mangapark.com/manga/aria"),
+ )
+
+ def __init__(self, match):
+ self.root = self.root_fmt.format(match.group(1))
+ MangaExtractor.__init__(self, match, self.root + match.group(2))
+
+ def chapters(self, page):
+ results = []
+ data = {"lang": "en", "language": "English"}
+ data["manga"] = text.unescape(
+ text.extract(page, '<title>', ' Manga - ')[0])
+
+ for stream in page.split('<div id="stream_')[1:]:
+ data["stream"] = text.parse_int(text.extract(stream, '', '"')[0])
+
+ for chapter in text.extract_iter(stream, '<li ', '</li>'):
+ path , pos = text.extract(chapter, 'href="', '"')
+ title, pos = text.extract(chapter, '>: </span>', '<', pos)
+ count, pos = text.extract(chapter, ' of ', ' ', pos)
+
+ self.parse_chapter_path(path[8:], data)
+ data["title"] = title.strip() if title else ""
+ data["count"] = text.parse_int(count)
+ results.append((self.root + path, data.copy()))
+
+ return results
diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py
new file mode 100644
index 0000000..d24d452
--- /dev/null
+++ b/gallery_dl/extractor/mangareader.py
@@ -0,0 +1,119 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters and entire manga from https://www.mangareader.net/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text
+
+
+class MangareaderBase():
+ """Base class for mangareader extractors"""
+ category = "mangareader"
+ root = "https://www.mangareader.net"
+
+ @staticmethod
+ def parse_page(page, data):
+ """Parse metadata on 'page' and add it to 'data'"""
+ text.extract_all(page, (
+ ("manga" , '<h2 class="aname">', '</h2>'),
+ ("release", '>Year of Release:</td>\n<td>', '</td>'),
+ ('author' , '>Author:</td>\n<td>', '</td>'),
+ ('artist' , '>Artist:</td>\n<td>', '</td>'),
+ ), values=data)
+ data["manga"] = data["manga"].strip()
+ data["author"] = text.unescape(data["author"])
+ data["artist"] = text.unescape(data["artist"])
+ return data
+
+
+class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor):
+ """Extractor for manga-chapters from mangareader.net"""
+ archive_fmt = "{manga}_{chapter}_{page}"
+ pattern = r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?&#]+)/(\d+))"
+ test = (("https://www.mangareader.net"
+ "/karate-shoukoushi-kohinata-minoru/11"), {
+ "url": "061cc92a07edf17bb991ce0821fa4c77a147a860",
+ "keyword": "133e3e2f7c0529a35bbb16149e34c40546f8dfd6",
+ })
+
+ def __init__(self, match):
+ path, self.url_title, self.chapter = match.groups()
+ ChapterExtractor.__init__(self, match, self.root + path)
+
+ def metadata(self, chapter_page):
+ page = self.request(self.root + self.url_title).text
+ data = self.parse_page(page, {
+ "chapter": text.parse_int(self.chapter),
+ "lang": "en",
+ "language": "English",
+ })
+ text.extract_all(page, (
+ ('title', ' ' + self.chapter + '</a> : ', '</td>'),
+ ('date', '<td>', '</td>'),
+ ), page.index('<div id="chapterlist">'), data)
+ data["count"] = text.parse_int(text.extract(
+ chapter_page, '</select> of ', '<')[0]
+ )
+ return data
+
+ def images(self, page):
+ while True:
+ next_url, image_url, image_data = self.get_image_metadata(page)
+ yield image_url, image_data
+
+ if not next_url:
+ return
+ page = self.request(next_url).text
+
+ def get_image_metadata(self, page):
+ """Collect next url, image-url and metadata for one manga-page"""
+ extr = text.extract
+ width = None
+ test , pos = extr(page, "document['pu']", '')
+ if test is None:
+ return None, None, None
+ if page.find("document['imgwidth']", pos, pos+200) != -1:
+ width , pos = extr(page, "document['imgwidth'] = ", ";", pos)
+ height, pos = extr(page, "document['imgheight'] = ", ";", pos)
+ _ , pos = extr(page, '<div id="imgholder">', '')
+ url, pos = extr(page, ' href="', '"', pos)
+ if width is None:
+ width , pos = extr(page, '<img id="img" width="', '"', pos)
+ height, pos = extr(page, ' height="', '"', pos)
+ image, pos = extr(page, ' src="', '"', pos)
+ return self.root + url, image, {
+ "width": text.parse_int(width),
+ "height": text.parse_int(height),
+ }
+
+
+class MangareaderMangaExtractor(MangareaderBase, MangaExtractor):
+ """Extractor for manga from mangareader.net"""
+ chapterclass = MangareaderChapterExtractor
+ reverse = False
+ pattern = r"(?:https?://)?(?:www\.)?mangareader\.net(/[^/?&#]+)/?$"
+ test = ("https://www.mangareader.net/mushishi", {
+ "url": "bc203b858b4ad76e5d77e39118a7be0350e357da",
+ "keyword": "031b3ea085921c552de017ecbb9b906e462229c9",
+ })
+
+ def chapters(self, page):
+ results = []
+ data = self.parse_page(page, {"lang": "en", "language": "English"})
+
+ needle = '<div class="chico_manga"></div>\n<a href="'
+ pos = page.index('<div id="chapterlist">')
+ while True:
+ url, pos = text.extract(page, needle, '"', pos)
+ if not url:
+ return results
+ data["title"], pos = text.extract(page, '</a> : ', '</td>', pos)
+ data["date"] , pos = text.extract(page, '<td>', '</td>', pos)
+ data["chapter"] = text.parse_int(url.rpartition("/")[2])
+ results.append((self.root + url, data.copy()))
diff --git a/gallery_dl/extractor/mangastream.py b/gallery_dl/extractor/mangastream.py
new file mode 100644
index 0000000..7ff0239
--- /dev/null
+++ b/gallery_dl/extractor/mangastream.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters from https://readms.net/"""
+
+from .common import ChapterExtractor
+from .. import text
+
+
+class MangastreamChapterExtractor(ChapterExtractor):
+ """Extractor for manga-chapters from mangastream.com"""
+ category = "mangastream"
+ archive_fmt = "{chapter_id}_{page}"
+ pattern = (r"(?:https?://)?(?:www\.)?(?:readms\.net|mangastream\.com)"
+ r"/r(?:ead)?/([^/]*/([^/]+)/(\d+))")
+ test = (
+ ("https://readms.net/r/onepunch_man/087/4874/1"),
+ ("https://mangastream.com/r/onepunch_man/087/4874/1"),
+ )
+ root = "https://readms.net"
+
+ def __init__(self, match):
+ self.part, self.chapter, self.chapter_id = match.groups()
+ url = "{}/r/{}".format(self.root, self.part)
+ ChapterExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ manga, pos = text.extract(
+ page, '<span class="hidden-xs hidden-sm">', "<")
+ pos = page.find(self.part, pos)
+ title, pos = text.extract(page, ' - ', '<', pos)
+ count, pos = text.extract(page, 'Last Page (', ')', pos)
+ return {
+ "manga": manga,
+ "chapter": text.unquote(self.chapter),
+ "chapter_id": text.parse_int(self.chapter_id),
+ "title": title,
+ "count": text.parse_int(count, 1),
+ "lang": "en",
+ "language": "English",
+ }
+
+ def images(self, page):
+ while True:
+ pos = page.index(' class="page"')
+ next_url = text.extract(page, ' href="', '"', pos)[0]
+ image_url = text.extract(page, ' src="', '"', pos)[0]
+ yield text.urljoin(self.root, image_url), None
+ page = self.request(text.urljoin(self.root, next_url)).text
diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py
new file mode 100644
index 0000000..4ad8da2
--- /dev/null
+++ b/gallery_dl/extractor/mangoxo.py
@@ -0,0 +1,176 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.mangoxo.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+from ..cache import cache
+import hashlib
+
+
+class MangoxoExtractor(Extractor):
+ """Base class for mangoxo extractors"""
+ category = "mangoxo"
+ root = "https://www.mangoxo.com"
+ cookiedomain = "www.mangoxo.com"
+ cookienames = ("SESSION",)
+ _warning = True
+
+ def login(self):
+ username, password = self._get_auth_info()
+ if username:
+ self._update_cookies(self._login_impl(username, password))
+ elif MangoxoExtractor._warning:
+ MangoxoExtractor._warning = False
+ self.log.warning("Unauthenticated users cannot see "
+ "more than 5 images per album")
+
+ @cache(maxage=3*3600, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+
+ page = self.request(self.root + "/login/").text
+ token = text.extract(page, 'id="loginToken" value="', '"')[0]
+ if not token:
+ self.log.debug("failed to extract 'loginToken'")
+
+ url = self.root + "/login/loginxmm"
+ headers = {
+ "X-Requested-With": "XMLHttpRequest",
+ "Referer": self.root + "/login",
+ }
+ data = {
+ "name": username,
+ "password": hashlib.md5(password.encode()).hexdigest(),
+ "loginToken": token,
+ }
+ response = self.request(url, method="POST", headers=headers, data=data)
+
+ if response.json().get("result") != "1":
+ raise exception.AuthenticationError()
+ return {"SESSION": self.session.cookies.get("SESSION")}
+
+ @staticmethod
+ def _total_pages(page):
+ return text.parse_int(text.extract(page, "total :", ",")[0])
+
+
+class MangoxoAlbumExtractor(MangoxoExtractor):
+ """Extractor for albums on mangoxo.com"""
+ subcategory = "album"
+ filename_fmt = "{album[id]}_{num:>03}.{extension}"
+ directory_fmt = ("{category}", "{channel[name]}", "{album[name]}")
+ archive_fmt = "{album[id]}_{num}"
+ pattern = r"(?:https?://)?(?:www\.)?mangoxo\.com/album/(\w+)"
+ test = ("https://www.mangoxo.com/album/lzVOv1Q9", {
+ "url": "ad921fe62663b06e7d73997f7d00646cab7bdd0d",
+ "keyword": {
+ "channel": {
+ "id": "QeYKRkO0",
+ "name": "美女图社",
+ "cover": str,
+ },
+ "album": {
+ "id": "lzVOv1Q9",
+ "name": "池永康晟 Ikenaga Yasunari 透出古朴气息的日本美女人像画作",
+ "date": "2019.3.22 14:42",
+ "description": str,
+ },
+ "num": int,
+ "count": 65,
+ },
+ })
+
+ def __init__(self, match):
+ MangoxoExtractor.__init__(self, match)
+ self.album_id = match.group(1)
+
+ def items(self):
+ self.login()
+ url = "{}/album/{}/".format(self.root, self.album_id)
+ page = self.request(url).text
+ data = self.metadata(page)
+ imgs = self.images(url, page)
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for data["num"], image in enumerate(imgs, 1):
+ yield Message.Url, image, text.nameext_from_url(image, data)
+
+ def metadata(self, page):
+ """Return general metadata"""
+ title, pos = text.extract(page, '<title>', '</title>')
+ count, pos = text.extract(page, 'id="pic-count">', '<', pos)
+ cover, pos = text.extract(page, ' src="', '"', pos)
+ cid , pos = text.extract(page, '//www.mangoxo.com/channel/', '"', pos)
+ cname, pos = text.extract(page, '>', '<', pos)
+ date , pos = text.extract(page, '</i>', '<', pos)
+ descr, pos = text.extract(page, '<pre>', '</pre>', pos)
+
+ return {
+ "channel": {
+ "id": cid,
+ "name": text.unescape(cname),
+ "cover": cover,
+ },
+ "album": {
+ "id": self.album_id,
+ "name": text.unescape(title),
+ "date": date.strip(),
+ "description": text.unescape(descr),
+ },
+ "count": text.parse_int(count),
+ }
+
+ def images(self, url, page):
+        """Generator that yields all image URLs of the album"""
+ total = self._total_pages(page)
+ num = 1
+
+ while True:
+ yield from text.extract_iter(
+ page, 'class="lightgallery-item" href="', '"')
+ if num >= total:
+ return
+ num += 1
+ page = self.request(url + str(num)).text
+
+
+class MangoxoChannelExtractor(MangoxoExtractor):
+ """Extractor for all albums on a mangoxo channel"""
+ subcategory = "channel"
+ pattern = r"(?:https?://)?(?:www\.)?mangoxo\.com/channel/(\w+)"
+ test = ("https://www.mangoxo.com/channel/QeYKRkO0", {
+ "pattern": MangoxoAlbumExtractor.pattern,
+ "range": "1-30",
+ "count": "> 20",
+ })
+
+ def __init__(self, match):
+ MangoxoExtractor.__init__(self, match)
+ self.channel_id = match.group(1)
+
+ def items(self):
+ self.login()
+ num = total = 1
+ url = "{}/channel/{}/album/".format(self.root, self.channel_id)
+ yield Message.Version, 1
+
+ while True:
+ page = self.request(url + str(num)).text
+
+ for album in text.extract_iter(
+ page, '<a class="link black" href="', '"'):
+ yield Message.Queue, album, {}
+
+ if num == 1:
+ total = self._total_pages(page)
+ if num >= total:
+ return
+ num += 1
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
new file mode 100644
index 0000000..28a2c2d
--- /dev/null
+++ b/gallery_dl/extractor/mastodon.py
@@ -0,0 +1,203 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for mastodon instances"""
+
+from .common import Extractor, Message
+from .. import text, config, exception
+import re
+
+
+class MastodonExtractor(Extractor):
+ """Base class for mastodon extractors"""
+ basecategory = "mastodon"
+ directory_fmt = ("mastodon", "{instance}", "{account[username]}")
+ filename_fmt = "{category}_{id}_{media[id]}.{extension}"
+ archive_fmt = "{media[id]}"
+ instance = None
+ root = None
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.api = MastodonAPI(self)
+
+ def config(self, key, default=None, *, sentinel=object()):
+ value = Extractor.config(self, key, sentinel)
+ if value is not sentinel:
+ return value
+ return config.interpolate(
+ ("extractor", "mastodon", self.instance, self.subcategory, key),
+ default,
+ )
+
+ def items(self):
+ yield Message.Version, 1
+ for status in self.statuses():
+ attachments = self.prepare(status)
+ yield Message.Directory, status
+ for media in attachments:
+ status["media"] = media
+ url = media["url"]
+ yield Message.Url, url, text.nameext_from_url(url, status)
+
+ def statuses(self):
+ """Return an iterable containing all relevant Status-objects"""
+ return ()
+
+ def prepare(self, status):
+ """Prepare a status object"""
+ status["instance"] = self.instance
+ status["tags"] = [tag["name"] for tag in status["tags"]]
+ attachments = status["media_attachments"]
+ del status["media_attachments"]
+ return attachments
+
+
+class MastodonUserExtractor(MastodonExtractor):
+ """Extractor for all images of an account/user"""
+ subcategory = "user"
+
+ def __init__(self, match):
+ MastodonExtractor.__init__(self, match)
+ self.account_name = match.group(1)
+
+ def statuses(self):
+ results = self.api.account_search("@" + self.account_name, 1)
+ for account in results:
+ if account["username"] == self.account_name:
+ break
+ else:
+ raise exception.NotFoundError("account")
+ return self.api.account_statuses(account["id"])
+
+
+class MastodonStatusExtractor(MastodonExtractor):
+ """Extractor for images from a status"""
+ subcategory = "status"
+
+ def __init__(self, match):
+ MastodonExtractor.__init__(self, match)
+ self.status_id = match.group(1)
+
+ def statuses(self):
+ return (self.api.status(self.status_id),)
+
+
+class MastodonAPI():
+ """Minimal interface for the Mastodon API
+
+ https://github.com/tootsuite/mastodon
+ https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md
+ """
+
+ def __init__(self, extractor, access_token=None):
+ self.root = extractor.root
+ self.extractor = extractor
+
+ if not access_token:
+ access_token = extractor.config(
+ "access-token", extractor.access_token)
+ self.headers = {"Authorization": "Bearer {}".format(access_token)}
+
+ def account_search(self, query, limit=40):
+ """Search for content"""
+ params = {"q": query, "limit": limit}
+ return self._call("accounts/search", params)
+
+ def account_statuses(self, account_id):
+ """Get an account's statuses"""
+ endpoint = "accounts/{}/statuses".format(account_id)
+ params = {"only_media": "1"}
+ return self._pagination(endpoint, params)
+
+ def status(self, status_id):
+ """Fetch a Status"""
+ return self._call("statuses/" + status_id)
+
+ def _call(self, endpoint, params=None):
+ url = "{}/api/v1/{}".format(self.root, endpoint)
+ response = self.extractor.request(
+ url, params=params, headers=self.headers)
+ return self._parse(response)
+
+ def _pagination(self, endpoint, params):
+ url = "{}/api/v1/{}".format(self.root, endpoint)
+ while url:
+ response = self.extractor.request(
+ url, params=params, headers=self.headers)
+ yield from self._parse(response)
+ url = response.links.get("next", {}).get("url")
+
+ @staticmethod
+ def _parse(response):
+ """Parse an API response"""
+ if response.status_code == 404:
+ raise exception.NotFoundError()
+ return response.json()
+
+
+def generate_extractors():
+ """Dynamically generate Extractor classes for Mastodon instances"""
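+    # Additional instances can be supplied via the config file and are merged
+    # into EXTRACTORS below; an illustrative (hypothetical) entry:
+    #   {"extractor": {"mastodon": {"example.social": {"access-token": "..."}}}}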
+
+ symtable = globals()
+ extractors = config.get(("extractor", "mastodon"))
+ if extractors:
+ EXTRACTORS.update(extractors)
+ config.set(("extractor", "mastodon"), EXTRACTORS)
+
+ for instance, info in EXTRACTORS.items():
+
+ if not isinstance(info, dict):
+ continue
+
+ category = info.get("category") or instance.replace(".", "")
+ root = info.get("root") or "https://" + instance
+ name = (info.get("name") or category).capitalize()
+ token = info.get("access-token")
+ pattern = info.get("pattern") or re.escape(instance)
+
+ class Extr(MastodonUserExtractor):
+ pass
+
+ Extr.__name__ = Extr.__qualname__ = name + "UserExtractor"
+ Extr.__doc__ = "Extractor for all images of a user on " + instance
+ Extr.category = category
+ Extr.instance = instance
+ Extr.pattern = (r"(?:https?://)?" + pattern +
+ r"/@([^/?&#]+)(?:/media)?/?$")
+ Extr.root = root
+ Extr.access_token = token
+ symtable[Extr.__name__] = Extr
+
+ class Extr(MastodonStatusExtractor):
+ pass
+
+ Extr.__name__ = Extr.__qualname__ = name + "StatusExtractor"
+ Extr.__doc__ = "Extractor for images from a status on " + instance
+ Extr.category = category
+ Extr.instance = instance
+ Extr.pattern = r"(?:https?://)?" + pattern + r"/@[^/?&#]+/(\d+)"
+ Extr.root = root
+ Extr.access_token = token
+ symtable[Extr.__name__] = Extr
+
+
+EXTRACTORS = {
+ "pawoo.net": {
+ "category" : "pawoo",
+ "access-token" : "286462927198d0cf3e24683e91c8259a"
+ "ac4367233064e0570ca18df2ac65b226",
+ "client-id" : "97b142b6904abf97a1068d51a7bc2f2f"
+ "cf9323cef81f13cb505415716dba7dac",
+ "client-secret": "e45bef4bad45b38abf7d9ef88a646b73"
+ "75e7fb2532c31a026327a93549236481",
+ },
+}
+
+
+generate_extractors()
diff --git a/gallery_dl/extractor/message.py b/gallery_dl/extractor/message.py
new file mode 100644
index 0000000..1831620
--- /dev/null
+++ b/gallery_dl/extractor/message.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+
+class Message():
+ """Enum for message identifiers
+
+ Extractors yield their results as message-tuples, where the first element
+ is one of the following identifiers. This message-identifier determines
+ the type and meaning of the other elements in such a tuple.
+
+ - Message.Version:
+ - Message protocol version (currently always '1')
+ - 2nd element specifies the version of all following messages as integer
+
+ - Message.Directory:
+ - Sets the target directory for all following images
+ - 2nd element is a dictionary containing general metadata
+
+ - Message.Url:
+ - Image URL and its metadata
+ - 2nd element is the URL as a string
+ - 3rd element is a dictionary with image-specific metadata
+
+ - Message.Headers: # obsolete
+ - HTTP headers to use while downloading
+ - 2nd element is a dictionary with header-name and -value pairs
+
+ - Message.Cookies: # obsolete
+ - Cookies to use while downloading
+ - 2nd element is a dictionary with cookie-name and -value pairs
+
+ - Message.Queue:
+ - (External) URL that should be handled by another extractor
+ - 2nd element is the (external) URL as a string
+ - 3rd element is a dictionary containing URL-specific metadata
+
+ - Message.Urllist:
+ - Same as Message.Url, but its 2nd element is a list of multiple URLs
+ - The additional URLs serve as a fallback if the primary one fails
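+
+    A minimal sequence as yielded by an extractor might look like this
+    (illustrative values only):
+
+    - (Message.Version, 1)
+    - (Message.Directory, {"title": "example"})
+    - (Message.Url, "https://example.org/1.jpg", {"num": 1})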
+ """
+
+ Version = 1
+ Directory = 2
+ Url = 3
+ # Headers = 4
+ # Cookies = 5
+ Queue = 6
+ Urllist = 7
diff --git a/gallery_dl/extractor/myportfolio.py b/gallery_dl/extractor/myportfolio.py
new file mode 100644
index 0000000..1515f53
--- /dev/null
+++ b/gallery_dl/extractor/myportfolio.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.myportfolio.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class MyportfolioGalleryExtractor(Extractor):
+ """Extractor for an image gallery on www.myportfolio.com"""
+ category = "myportfolio"
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "{user}", "{title}")
+ filename_fmt = "{num:>02}.{extension}"
+ archive_fmt = "{user}_{filename}"
+ pattern = (r"(?:myportfolio:(?:https?://)?([^/]+)|"
+ r"(?:https?://)?([^.]+\.myportfolio\.com))"
+ r"(/[^/?&#]+)?")
+ test = (
+ ("https://hannahcosgrove.myportfolio.com/robyn", {
+ "url": "93b5430e765e53564b13e7d9c64c30c286011a6b",
+ "keyword": "25cb3dbdad6b011242a133f30ec598318b7512e8",
+ }),
+ ("https://hannahcosgrove.myportfolio.com/lfw", {
+ "pattern": r"https://hannahcosgrove\.myportfolio\.com/[^/?&#+]+$",
+ "count": ">= 8",
+ }),
+ ("myportfolio:https://tooco.com.ar/6-of-diamonds-paradise-bird", {
+ "count": 3,
+ }),
+ ("myportfolio:https://tooco.com.ar/", {
+ "count": ">= 40",
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ domain1, domain2, self.path = match.groups()
+ self.domain = domain1 or domain2
+ self.prefix = "myportfolio:" if domain1 else ""
+
+ def items(self):
+ yield Message.Version, 1
+ url = "https://" + self.domain + (self.path or "")
+ page = self.request(url).text
+
+ projects = text.extract(
+ page, '<section class="project-covers', '</section>')[0]
+
+ if projects:
+ data = {"_extractor": MyportfolioGalleryExtractor}
+ base = self.prefix + "https://" + self.domain
+ for path in text.extract_iter(projects, ' href="', '"'):
+ yield Message.Queue, base + path, data
+ else:
+ data = self.metadata(page)
+ imgs = self.images(page)
+ data["count"] = len(imgs)
+ yield Message.Directory, data
+ for data["num"], url in enumerate(imgs, 1):
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+ @staticmethod
+ def metadata(page):
+ """Collect general image metadata"""
+        # og:title is formatted as "<user> - <title>", but both <user> and
+        # <title> may themselves contain a "-", so the title is taken from
+        # the <h1> element instead and its length (plus the " - " separator)
+        # is cut off the end of the og:title content to obtain the user name.
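+        # Illustrative example (made up): og:title "Jane Doe - Robyn" with
+        # <h1>Robyn</h1> yields title "Robyn" and user "Jane Doe".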
+
+ user, pos = text.extract(
+ page, 'property=og:title content="', '"')
+ desc, pos = text.extract(
+ page, 'property=og:description content="', '"', pos)
+ title, pos = text.extract(
+ page, '<h1 ', '</h1>', pos)
+
+ title = title.partition(">")[2]
+ user = user[:-len(title)-3]
+
+ return {
+ "user": text.unescape(user),
+ "title": text.unescape(title),
+ "description": text.unescape(desc or ""),
+ }
+
+ @staticmethod
+ def images(page):
+ """Extract and return a list of all image-urls"""
+ return list(text.extract_iter(page, 'js-lightbox" data-src="', '"'))
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
new file mode 100644
index 0000000..9e0aaa3
--- /dev/null
+++ b/gallery_dl/extractor/newgrounds.py
@@ -0,0 +1,155 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.newgrounds.com/"""
+
+from .common import Extractor, Message
+from .. import text
+import json
+
+
+class NewgroundsExtractor(Extractor):
+ """Base class for newgrounds extractors"""
+ category = "newgrounds"
+ directory_fmt = ("{category}", "{user}")
+ filename_fmt = "{category}_{index}_{title}.{extension}"
+ archive_fmt = "{index}"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user = match.group(1)
+ self.root = "https://{}.newgrounds.com".format(self.user)
+
+ def items(self):
+ data = self.get_metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ for page_url in self.get_page_urls():
+ image = self.parse_page_data(page_url)
+ image.update(data)
+ url = image["url"]
+ yield Message.Url, url, text.nameext_from_url(url, image)
+
+ def get_metadata(self):
+ """Collect metadata for extractor-job"""
+ return {"user": self.user}
+
+ def get_page_urls(self):
+ """Return urls of all relevant image pages"""
+
+ def parse_page_data(self, page_url):
+ """Collect url and metadata from an image page"""
+ extr = text.extract_from(self.request(page_url).text)
+ full = text.extract_from(json.loads(extr('"full_image_text":', '});')))
+ data = {
+ "description": text.unescape(extr(':description" content="', '"')),
+ "date" : extr('itemprop="datePublished" content="', '"'),
+ "rating" : extr('class="rated-', '"'),
+ "favorites" : text.parse_int(extr('id="faves_load">', '<')),
+ "score" : text.parse_float(extr('id="score_number">', '<')),
+ "url" : full('src="', '"'),
+ "title" : text.unescape(full('alt="', '"')),
+ "width" : text.parse_int(full('width="', '"')),
+ "height" : text.parse_int(full('height="', '"')),
+ }
+
+ tags = text.split_html(extr('<dd class="tags momag">', '</dd>'))
+ tags.sort()
+ data["tags"] = tags
+
+ data["date"] = text.parse_datetime(data["date"])
+ data["index"] = text.parse_int(
+ data["url"].rpartition("/")[2].partition("_")[0])
+ return data
+
+ def _pagination(self, url):
+ headers = {
+ "Referer": self.root,
+ "X-Requested-With": "XMLHttpRequest",
+ "Accept": "application/json, text/javascript, */*; q=0.01",
+ }
+
+ while True:
+ data = self.request(url, headers=headers).json()
+
+ for year in data["sequence"]:
+ for item in data["years"][str(year)]["items"]:
+ page_url = text.extract(item, 'href="', '"')[0]
+ yield text.urljoin(self.root, page_url)
+
+ if not data["more"]:
+ return
+ url = text.urljoin(self.root, data["more"])
+
+
+class NewgroundsUserExtractor(NewgroundsExtractor):
+ """Extractor for all images of a newgrounds user"""
+ subcategory = "user"
+ pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com(?:/art)?/?$"
+ test = (
+ ("https://blitzwuff.newgrounds.com/art", {
+ "url": "24b19c4a135a09889fac7b46a74e427e4308d02b",
+ "keyword": "2aab0532a894ff3cf88dd01ce5c60f114011b268",
+ }),
+ ("https://blitzwuff.newgrounds.com/"),
+ )
+
+ def get_page_urls(self):
+ return self._pagination(self.root + "/art/page/1")
+
+
+class NewgroundsImageExtractor(NewgroundsExtractor):
+ """Extractor for a single image from newgrounds.com"""
+ subcategory = "image"
+ pattern = (r"(?:https?://)?(?:"
+ r"(?:www\.)?newgrounds\.com/art/view/([^/?&#]+)/[^/?&#]+"
+ r"|art\.ngfiles\.com/images/\d+/\d+_([^_]+)_([^.]+))")
+ test = (
+ ("https://www.newgrounds.com/art/view/blitzwuff/ffx", {
+ "url": "e7778c4597a2fb74b46e5f04bb7fa1d80ca02818",
+ "keyword": "cbe90f8f32da4341938f59b08d70f76137028a7e",
+ "content": "cb067d6593598710292cdd340d350d14a26fe075",
+ }),
+ ("https://art.ngfiles.com/images/587000/587551_blitzwuff_ffx.png", {
+ "url": "e7778c4597a2fb74b46e5f04bb7fa1d80ca02818",
+ "keyword": "cbe90f8f32da4341938f59b08d70f76137028a7e",
+ }),
+ )
+
+ def __init__(self, match):
+ NewgroundsExtractor.__init__(self, match)
+ if match.group(2):
+ self.user = match.group(2)
+ self.page_url = "https://www.newgrounds.com/art/view/{}/{}".format(
+ self.user, match.group(3))
+ else:
+ self.page_url = match.group(0)
+
+ def get_page_urls(self):
+ return (self.page_url,)
+
+
+class NewgroundsVideoExtractor(NewgroundsExtractor):
+ """Extractor for all videos of a newgrounds user"""
+ subcategory = "video"
+ filename_fmt = "{category}_{index}.{extension}"
+ pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com/movies/?$"
+ test = ("https://twistedgrim.newgrounds.com/movies", {
+ "pattern": r"ytdl:https?://www\.newgrounds\.com/portal/view/\d+",
+ "count": ">= 29",
+ })
+
+ def get_page_urls(self):
+ return self._pagination(self.root + "/movies/page/1")
+
+ def parse_page_data(self, page_url):
+ return {
+ "url" : "ytdl:" + page_url,
+ "index": text.parse_int(page_url.rpartition("/")[2]),
+ }
diff --git a/gallery_dl/extractor/ngomik.py b/gallery_dl/extractor/ngomik.py
new file mode 100644
index 0000000..8135a8a
--- /dev/null
+++ b/gallery_dl/extractor/ngomik.py
@@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters and entire manga from http://ngomik.in/"""
+
+from .common import ChapterExtractor
+from .. import text
+import re
+
+
+class NgomikChapterExtractor(ChapterExtractor):
+ """Extractor for manga-chapters from ngomik.in"""
+ category = "ngomik"
+ root = "http://ngomik.in"
+ pattern = (r"(?:https?://)?(?:www\.)?ngomik\.in"
+ r"(/[^/?&#]+-chapter-[^/?&#]+)")
+ test = (
+ ("https://www.ngomik.in/14-sai-no-koi-chapter-1-6/", {
+ "url": "8e67fdf751bbc79bc6f4dead7675008ddb8e32a4",
+ "keyword": "204d177f09d438fd50c9c28d98c73289194640d8",
+ }),
+ ("https://ngomik.in/break-blade-chapter-26/", {
+ "count": 34,
+ }),
+ )
+
+ def metadata(self, page):
+ info = text.extract(page, '<title>', "</title>")[0]
+ manga, _, chapter = info.partition(" Chapter ")
+ chapter, sep, minor = chapter.partition(" ")[0].partition(".")
+
+ return {
+ "manga": text.unescape(manga),
+ "chapter": text.parse_int(chapter),
+ "chapter_minor": sep + minor,
+ "lang": "id",
+ "language": "Indonesian",
+ }
+
+ @staticmethod
+ def images(page):
+ readerarea = text.extract(page, 'id=readerarea', 'class=chnav')[0]
+ return [
+ (text.unescape(url), None)
+ for url in re.findall(r"\ssrc=[\"']?([^\"' >]+)", readerarea)
+ ]
diff --git a/gallery_dl/extractor/nhentai.py b/gallery_dl/extractor/nhentai.py
new file mode 100644
index 0000000..746144a
--- /dev/null
+++ b/gallery_dl/extractor/nhentai.py
@@ -0,0 +1,135 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://nhentai.net/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text, util
+import collections
+import json
+
+
+class NhentaiBase():
+ """Base class for nhentai extractors"""
+ category = "nhentai"
+ root = "https://nhentai.net"
+ media_url = "https://i.nhentai.net"
+
+
+class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
+ """Extractor for image galleries from nhentai.net"""
+ pattern = r"(?:https?://)?nhentai\.net(/g/(\d+))"
+ test = ("https://nhentai.net/g/147850/", {
+ "url": "5179dbf0f96af44005a0ff705a0ad64ac26547d0",
+ "keyword": {
+ "title" : r"re:\[Morris\] Amazon no Hiyaku \| Amazon Elixir",
+ "title_en" : str,
+ "title_ja" : str,
+ "gallery_id": 147850,
+ "media_id" : 867789,
+ "count" : 16,
+ "date" : 1446050915,
+ "scanlator" : "",
+ "artist" : ["morris"],
+ "group" : list,
+ "parody" : list,
+ "characters": list,
+ "tags" : list,
+ "type" : "manga",
+ "lang" : "en",
+ "language" : "English",
+ "width" : int,
+ "height" : int,
+ },
+ })
+
+ def __init__(self, match):
+ GalleryExtractor.__init__(self, match)
+ self.gallery_id = match.group(2)
+ self.data = None
+
+ def metadata(self, page):
+ data = json.loads(text.extract(page, "N.gallery(", ");")[0])
+ self.data = data
+
+ title_en = data["title"].get("english", "")
+ title_ja = data["title"].get("japanese", "")
+
+ info = collections.defaultdict(list)
+ for tag in data["tags"]:
+ info[tag["type"]].append(tag["name"])
+
+ language = ""
+ for language in info["language"]:
+ if language != "translated":
+ language = language.capitalize()
+ break
+
+ return {
+ "title" : title_en or title_ja,
+ "title_en" : title_en,
+ "title_ja" : title_ja,
+ "gallery_id": data["id"],
+ "media_id" : text.parse_int(data["media_id"]),
+ "date" : data["upload_date"],
+ "scanlator" : data["scanlator"],
+ "artist" : info["artist"],
+ "group" : info["group"],
+ "parody" : info["parody"],
+ "characters": info["character"],
+ "tags" : info["tag"],
+ "type" : info["category"][0] if info["category"] else "",
+ "lang" : util.language_to_code(language),
+ "language" : language,
+ }
+
+ def images(self, _):
+ ufmt = "{}/galleries/{}/{{}}.{{}}".format(
+ self.media_url, self.data["media_id"])
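+        # map single-letter image type codes to file extensions
+        # ("j" -> "jpg", "p" -> "png", "g" -> "gif")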
+ extdict = {"j": "jpg", "p": "png", "g": "gif"}
+
+ return [
+ (ufmt.format(num, extdict.get(img["t"], "jpg")), {
+ "width": img["w"], "height": img["h"],
+ })
+ for num, img in enumerate(self.data["images"]["pages"], 1)
+ ]
+
+
+class NhentaiSearchExtractor(NhentaiBase, Extractor):
+ """Extractor for nhentai search results"""
+ category = "nhentai"
+ subcategory = "search"
+ pattern = r"(?:https?://)?nhentai\.net/search/?\?([^#]+)"
+ test = ("https://nhentai.net/search/?q=touhou", {
+ "pattern": NhentaiGalleryExtractor.pattern,
+ "count": 30,
+ "range": "1-30",
+ })
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.params = text.parse_query(match.group(1))
+
+ def items(self):
+ yield Message.Version, 1
+ data = {"_extractor": NhentaiGalleryExtractor}
+ for gallery_id in self._pagination(self.params):
+ url = "{}/g/{}/".format(self.root, gallery_id)
+ yield Message.Queue, url, data
+
+ def _pagination(self, params):
+ url = "{}/search/".format(self.root)
+ params["page"] = text.parse_int(params.get("page"), 1)
+
+ while True:
+ page = self.request(url, params=params).text
+ yield from text.extract_iter(page, 'href="/g/', '/')
+ if 'class="next"' not in page:
+ return
+ params["page"] += 1
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
new file mode 100644
index 0000000..abf1eaa
--- /dev/null
+++ b/gallery_dl/extractor/nijie.py
@@ -0,0 +1,205 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://nijie.info/"""
+
+from .common import Extractor, Message, AsynchronousMixin
+from .. import text, exception
+from ..cache import cache
+
+
+class NijieExtractor(AsynchronousMixin, Extractor):
+ """Base class for nijie extractors"""
+ category = "nijie"
+ directory_fmt = ("{category}", "{user_id}")
+ filename_fmt = "{category}_{artist_id}_{image_id}_p{index:>02}.{extension}"
+ archive_fmt = "{image_id}_{index}"
+ cookiedomain = "nijie.info"
+ cookienames = ("nemail", "nlogin")
+ root = "https://nijie.info"
+ view_url = "https://nijie.info/view.php?id="
+ popup_url = "https://nijie.info/view_popup.php?id="
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user_id = match.group(1)
+ self.session.headers["Referer"] = self.root + "/"
+
+ def items(self):
+ self.login()
+ data = self.get_job_metadata()
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ for image_id in self.get_image_ids():
+ for image_url, image_data in self.get_image_data(image_id):
+ image_data.update(data)
+ if not image_data["extension"]:
+ image_data["extension"] = "jpg"
+ yield Message.Url, image_url, image_data
+
+ def get_job_metadata(self):
+ """Collect metadata for extractor-job"""
+ return {"user_id": text.parse_int(self.user_id)}
+
+ def get_image_ids(self):
+ """Collect all relevant image-ids"""
+
+ def get_image_data(self, image_id):
+ """Get URL and metadata for images specified by 'image_id'"""
+ page = self.request(self.view_url + image_id).text
+ return self.extract_image_data(page, image_id)
+
+ def extract_image_data(self, page, image_id):
+ """Get URL and metadata for images from 'page'"""
+ title, pos = text.extract(
+ page, '<meta property="og:title" content="', '"')
+ description, pos = text.extract(
+ page, '<meta property="og:description" content="', '"', pos)
+ artist_id, pos = text.extract(
+ page, '"sameAs": "https://nijie.info/members.php?id=', '"', pos)
+ images = list(text.extract_iter(
+ page, '<a href="./view_popup.php', '</a>', pos))
+
+ title = title.rpartition("|")[0].strip()
+ image_id = text.parse_int(image_id)
+ artist_id = text.parse_int(artist_id)
+
+ for index, image in enumerate(images):
+ url = "https:" + text.extract(image, 'src="', '"')[0]
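+            # drop the "__rs_l120x120" segment (a 120x120 thumbnail variant)
+            # to get the full-size image URL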
+ url = url.replace("/__rs_l120x120/", "/", 1)
+
+ yield url, text.nameext_from_url(url, {
+ "index": index,
+ "count": len(images),
+ "title": title,
+ "description": description,
+ "image_id": image_id,
+ "artist_id": artist_id,
+ })
+
+ def login(self):
+ """Login and obtain session cookies"""
+ if not self._check_cookies(self.cookienames):
+ username, password = self._get_auth_info()
+ self._update_cookies(self._login_impl(username, password))
+
+ @cache(maxage=150*24*3600, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+ url = "{}/login_int.php".format(self.root)
+ data = {"email": username, "password": password, "save": "on"}
+
+ response = self.request(url, method="POST", data=data)
+ if "//nijie.info/login.php" in response.text:
+ raise exception.AuthenticationError()
+ return self.session.cookies
+
+ def _pagination(self, path):
+ url = "{}/{}.php".format(self.root, path)
+ params = {"id": self.user_id, "p": 1}
+
+ while True:
+ response = self.request(url, params=params, expect=(404,))
+ if response.status_code == 404:
+ raise exception.NotFoundError("artist")
+
+ page = response.text
+ ids = list(text.extract_iter(page, ' illust_id="', '"'))
+ yield from ids
+
+ if '<a rel="next"' not in page:
+ return
+ params["p"] += 1
+
+
+class NijieUserExtractor(NijieExtractor):
+ """Extractor for works of a nijie-user"""
+ subcategory = "user"
+ pattern = (r"(?:https?://)?(?:www\.)?nijie\.info"
+ r"/members(?:_illust)?\.php\?id=(\d+)")
+ test = (
+ ("https://nijie.info/members_illust.php?id=44", {
+ "url": "585d821df4716b1098660a0be426d01db4b65f2a",
+ "keyword": "d629c69e3172db1d7e026145e8eb640ac31ac16a",
+ }),
+ ("https://nijie.info/members_illust.php?id=43", {
+ "exception": exception.NotFoundError,
+ }),
+ ("https://nijie.info/members.php?id=44"),
+ )
+
+ def get_image_ids(self):
+ return self._pagination("members_illust")
+
+
+class NijieDoujinExtractor(NijieExtractor):
+ """Extractor for doujin entries of a nijie-user"""
+ subcategory = "doujin"
+ pattern = (r"(?:https?://)?(?:www\.)?nijie\.info/"
+ r"members_dojin\.php\?id=(\d+)")
+ test = ("https://nijie.info/members_dojin.php?id=6782", {
+ "count": ">= 18",
+ })
+
+ def get_image_ids(self):
+ return self._pagination("members_dojin")
+
+
+class NijieFavoriteExtractor(NijieExtractor):
+ """Extractor for all favorites/bookmarks of a nijie-user"""
+ subcategory = "favorite"
+ directory_fmt = ("{category}", "bookmarks", "{user_id}")
+ archive_fmt = "f_{user_id}_{image_id}_{index}"
+ pattern = (r"(?:https?://)?(?:www\.)?nijie\.info"
+ r"/user_like_illust_view\.php\?id=(\d+)")
+ test = ("https://nijie.info/user_like_illust_view.php?id=44", {
+ "count": ">= 16",
+ })
+
+ def get_image_ids(self):
+ return self._pagination("user_like_illust_view")
+
+
+class NijieImageExtractor(NijieExtractor):
+ """Extractor for a work/image from nijie.info"""
+ subcategory = "image"
+ pattern = (r"(?:https?://)?(?:www\.)?nijie\.info"
+ r"/view(?:_popup)?\.php\?id=(\d+)")
+ test = (
+ ("https://nijie.info/view.php?id=70720", {
+ "url": "a10d4995645b5f260821e32c60a35f73546c2699",
+ "keyword": "408393d010307c76d52cbd0a4368d6d357805aea",
+ "content": "d85e3ea896ed5e4da0bca2390ad310a4df716ca6",
+ }),
+ ("https://nijie.info/view.php?id=70724", {
+ "exception": exception.NotFoundError,
+ }),
+ ("https://nijie.info/view_popup.php?id=70720"),
+ )
+
+ def __init__(self, match):
+ NijieExtractor.__init__(self, match)
+ self.image_id = match.group(1)
+ self.page = ""
+
+ def get_job_metadata(self):
+ response = self.request(self.view_url + self.image_id, expect=(404,))
+ if response.status_code == 404:
+ raise exception.NotFoundError("image")
+ self.page = response.text
+ self.user_id = text.extract(
+ self.page, '"sameAs": "https://nijie.info/members.php?id=', '"')[0]
+ return NijieExtractor.get_job_metadata(self)
+
+ def get_image_ids(self):
+ return (self.image_id,)
+
+ def get_image_data(self, _):
+ return self.extract_image_data(self.page, self.image_id)
diff --git a/gallery_dl/extractor/nsfwalbum.py b/gallery_dl/extractor/nsfwalbum.py
new file mode 100644
index 0000000..c55f80a
--- /dev/null
+++ b/gallery_dl/extractor/nsfwalbum.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://nsfwalbum.com/"""
+
+from .common import GalleryExtractor
+from .. import text
+
+
+class NsfwalbumAlbumExtractor(GalleryExtractor):
+ """Extractor for image albums on nsfwalbum.com"""
+ category = "nsfwalbum"
+ subcategory = "album"
+ root = "https://nsfwalbum.com"
+ filename_fmt = "{album_id}_{page:>03}_{id}.{extension}"
+ directory_fmt = ("{category}", "{album_id} {title}")
+ archive_fmt = "{id}"
+ pattern = r"(?:https?://)?(?:www\.)?nsfwalbum\.com(/album/(\d+))"
+ test = ("https://nsfwalbum.com/album/295201", {
+ "range": "1-5",
+ "url": "e60eced1873215f5deee1ca7226d60cb4dcc051c",
+ "keyword": "e0573ecb1966611e96d10172a3ca1db1078a7984",
+ })
+
+ def __init__(self, match):
+ self.album_id = match.group(2)
+ GalleryExtractor.__init__(self, match)
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+ return {
+ "album_id": text.parse_int(self.album_id),
+ "title" : text.unescape(extr('<h6>', '</h6>')),
+ "models" : text.split_html(extr('"models"> Models:', '</div>')),
+ "studio" : text.remove_html(extr('"models"> Studio:', '</div>')),
+ }
+
+ def images(self, page):
+ iframe = self.root + "/iframe_image.php?id="
+ backend = self.root + "/backend.php"
+ for image_id in text.extract_iter(page, 'data-img-id="', '"'):
+ spirit = text.extract(self.request(
+ iframe + image_id).text, 'giraffe.annihilate("', '"')[0]
+ params = {"spirit": self._annihilate(spirit), "photo": image_id}
+ data = self.request(backend, params=params).json()
+ yield data[0], {
+ "id" : text.parse_int(image_id),
+ "width" : text.parse_int(data[1]),
+ "height": text.parse_int(data[2]),
+ }
+
+ @staticmethod
+ def _annihilate(value, base=6):
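+        # undo the site's simple obfuscation by XOR-ing every character
+        # with 'base'; e.g. (illustrative) chr(ord("a") ^ 6) == "g"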
+ return "".join(
+ chr(ord(char) ^ base)
+ for char in value
+ )
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
new file mode 100644
index 0000000..e26eae1
--- /dev/null
+++ b/gallery_dl/extractor/oauth.py
@@ -0,0 +1,375 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Utility classes to setup OAuth and link a users account to gallery-dl"""
+
+from .common import Extractor, Message
+from . import deviantart, flickr, reddit, smugmug, tumblr
+from .. import text, oauth, config, exception
+from ..cache import cache
+import os
+import urllib.parse
+
+
+class OAuthBase(Extractor):
+ """Base class for OAuth Helpers"""
+ category = "oauth"
+ redirect_uri = "http://localhost:6414/"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.client = None
+
+ def oauth_config(self, key, default=None):
+ return config.interpolate(
+ ("extractor", self.subcategory, key), default)
+
+ def recv(self):
+        """Open a local HTTP server and receive the callback parameters"""
+ import socket
+ print("Waiting for response. (Cancel with Ctrl+c)")
+ server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+ server.bind(("localhost", 6414))
+ server.listen(1)
+
+ # workaround for ctrl+c not working during server.accept on Windows
+ if os.name == "nt":
+ server.settimeout(1.0)
+ while True:
+ try:
+ self.client = server.accept()[0]
+ break
+ except socket.timeout:
+ pass
+ server.close()
+
+ data = self.client.recv(1024).decode()
+ path = data.split(" ", 2)[1]
+ return text.parse_query(path.partition("?")[2])
+
+ def send(self, msg):
+ """Send 'msg' to the socket opened in 'recv()'"""
+ print(msg)
+ self.client.send(b"HTTP/1.1 200 OK\r\n\r\n" + msg.encode())
+ self.client.close()
+
+ def open(self, url, params):
+        """Open 'url' in the browser and return the response parameters"""
+ import webbrowser
+ url += "?" + urllib.parse.urlencode(params)
+ if not self.config("browser", True) or not webbrowser.open(url):
+ print("Please open this URL in your browser:")
+ print(url, end="\n\n", flush=True)
+ return self.recv()
+
+ def _oauth1_authorization_flow(
+ self, request_token_url, authorize_url, access_token_url):
+ """Perform the OAuth 1.0a authorization flow"""
+ # get a request token
+ params = {"oauth_callback": self.redirect_uri}
+ data = self.session.get(request_token_url, params=params).text
+
+ data = text.parse_query(data)
+ self.session.auth.token_secret = data["oauth_token_secret"]
+
+ # get the user's authorization
+ params = {"oauth_token": data["oauth_token"], "perms": "read"}
+ data = self.open(authorize_url, params)
+
+ # exchange the request token for an access token
+ data = self.session.get(access_token_url, params=data).text
+
+ data = text.parse_query(data)
+ self.send(OAUTH1_MSG_TEMPLATE.format(
+ category=self.subcategory,
+ token=data["oauth_token"],
+ token_secret=data["oauth_token_secret"],
+ ))
+
+ def _oauth2_authorization_code_grant(
+ self, client_id, client_secret, auth_url, token_url,
+ scope="read", key="refresh_token", auth=True,
+ message_template=None):
+ """Perform an OAuth2 authorization code grant"""
+
+ state = "gallery-dl_{}_{}".format(
+ self.subcategory,
+ oauth.nonce(8),
+ )
+
+ auth_params = {
+ "client_id": client_id,
+ "response_type": "code",
+ "state": state,
+ "redirect_uri": self.redirect_uri,
+ "duration": "permanent",
+ "scope": scope,
+ }
+
+ # receive an authorization code
+ params = self.open(auth_url, auth_params)
+
+ # check authorization response
+ if state != params.get("state"):
+ self.send("'state' mismatch: expected {}, got {}.".format(
+ state, params.get("state")
+ ))
+ return
+ if "error" in params:
+ self.send(params["error"])
+ return
+
+ # exchange the authorization code for a token
+ data = {
+ "grant_type": "authorization_code",
+ "code": params["code"],
+ "redirect_uri": self.redirect_uri,
+ }
+
+ if auth:
+ auth = (client_id, client_secret)
+ else:
+ auth = None
+ data["client_id"] = client_id
+ data["client_secret"] = client_secret
+
+ data = self.session.post(token_url, data=data, auth=auth).json()
+
+ # check token response
+ if "error" in data:
+ self.send(data["error"])
+ return
+
+ # display token
+ part = key.partition("_")[0]
+ template = message_template or OAUTH2_MSG_TEMPLATE
+ self.send(template.format(
+ category=self.subcategory,
+ key=part,
+ Key=part.capitalize(),
+ token=data[key],
+ instance=getattr(self, "instance", ""),
+ client_id=client_id,
+ client_secret=client_secret,
+ ))
+
+
+class OAuthDeviantart(OAuthBase):
+ subcategory = "deviantart"
+ pattern = "oauth:deviantart$"
+ redirect_uri = "https://mikf.github.io/gallery-dl/oauth-redirect.html"
+
+ def items(self):
+ yield Message.Version, 1
+
+ self._oauth2_authorization_code_grant(
+ self.oauth_config(
+ "client-id", deviantart.DeviantartAPI.CLIENT_ID),
+ self.oauth_config(
+ "client-secret", deviantart.DeviantartAPI.CLIENT_SECRET),
+ "https://www.deviantart.com/oauth2/authorize",
+ "https://www.deviantart.com/oauth2/token",
+ scope="browse",
+ )
+
+
+class OAuthFlickr(OAuthBase):
+ subcategory = "flickr"
+ pattern = "oauth:flickr$"
+
+ def __init__(self, match):
+ OAuthBase.__init__(self, match)
+ self.session = oauth.OAuth1Session(
+ self.oauth_config("api-key", flickr.FlickrAPI.API_KEY),
+ self.oauth_config("api-secret", flickr.FlickrAPI.API_SECRET),
+ )
+
+ def items(self):
+ yield Message.Version, 1
+
+ self._oauth1_authorization_flow(
+ "https://www.flickr.com/services/oauth/request_token",
+ "https://www.flickr.com/services/oauth/authorize",
+ "https://www.flickr.com/services/oauth/access_token",
+ )
+
+
+class OAuthReddit(OAuthBase):
+ subcategory = "reddit"
+ pattern = "oauth:reddit$"
+
+ def items(self):
+ yield Message.Version, 1
+
+ self.session.headers["User-Agent"] = reddit.RedditAPI.USER_AGENT
+ self._oauth2_authorization_code_grant(
+ self.oauth_config("client-id", reddit.RedditAPI.CLIENT_ID),
+ "",
+ "https://www.reddit.com/api/v1/authorize",
+ "https://www.reddit.com/api/v1/access_token",
+ scope="read",
+ )
+
+
+class OAuthSmugmug(OAuthBase):
+ subcategory = "smugmug"
+ pattern = "oauth:smugmug$"
+
+ def __init__(self, match):
+ OAuthBase.__init__(self, match)
+ self.session = oauth.OAuth1Session(
+ self.oauth_config("api-key", smugmug.SmugmugAPI.API_KEY),
+ self.oauth_config("api-secret", smugmug.SmugmugAPI.API_SECRET),
+ )
+
+ def items(self):
+ yield Message.Version, 1
+
+ self._oauth1_authorization_flow(
+ "https://api.smugmug.com/services/oauth/1.0a/getRequestToken",
+ "https://api.smugmug.com/services/oauth/1.0a/authorize",
+ "https://api.smugmug.com/services/oauth/1.0a/getAccessToken",
+ )
+
+
+class OAuthTumblr(OAuthBase):
+ subcategory = "tumblr"
+ pattern = "oauth:tumblr$"
+
+ def __init__(self, match):
+ OAuthBase.__init__(self, match)
+ self.session = oauth.OAuth1Session(
+ self.oauth_config("api-key", tumblr.TumblrAPI.API_KEY),
+ self.oauth_config("api-secret", tumblr.TumblrAPI.API_SECRET),
+ )
+
+ def items(self):
+ yield Message.Version, 1
+
+ self._oauth1_authorization_flow(
+ "https://www.tumblr.com/oauth/request_token",
+ "https://www.tumblr.com/oauth/authorize",
+ "https://www.tumblr.com/oauth/access_token",
+ )
+
+
+class OAuthMastodon(OAuthBase):
+ subcategory = "mastodon"
+ pattern = "oauth:mastodon:(?:https?://)?([^/?&#]+)"
+
+ def __init__(self, match):
+ OAuthBase.__init__(self, match)
+ self.instance = match.group(1)
+
+ def items(self):
+ yield Message.Version, 1
+
+ application = self.oauth_config(self.instance)
+ if not application:
+ application = self._register(self.instance)
+
+ self._oauth2_authorization_code_grant(
+ application["client-id"],
+ application["client-secret"],
+ "https://{}/oauth/authorize".format(self.instance),
+ "https://{}/oauth/token".format(self.instance),
+ key="access_token",
+ message_template=MASTODON_MSG_TEMPLATE,
+ )
+
+ @cache(maxage=10*365*24*3600, keyarg=1)
+ def _register(self, instance):
+ self.log.info("Registering application for '%s'", instance)
+
+ url = "https://{}/api/v1/apps".format(instance)
+ data = {
+ "client_name": "gdl:" + oauth.nonce(8),
+ "redirect_uris": self.redirect_uri,
+ "scopes": "read",
+ }
+ data = self.session.post(url, data=data).json()
+
+ if "client_id" not in data or "client_secret" not in data:
+ self.log.error("Failed to register new application: '%s'", data)
+ raise exception.StopExtraction()
+
+ data["client-id"] = data.pop("client_id")
+ data["client-secret"] = data.pop("client_secret")
+
+ self.log.info("client-id:\n%s", data["client-id"])
+ self.log.info("client-secret:\n%s", data["client-secret"])
+
+ return data
+
+
+OAUTH1_MSG_TEMPLATE = """
+Your Access Token and Access Token Secret are
+
+{token}
+{token_secret}
+
+Put these values into your configuration file as
+'extractor.{category}.access-token' and
+'extractor.{category}.access-token-secret'.
+
+Example:
+{{
+ "extractor": {{
+ "{category}": {{
+ "access-token": "{token}",
+ "access-token-secret": "{token_secret}"
+ }}
+ }}
+}}
+"""
+
+
+OAUTH2_MSG_TEMPLATE = """
+Your {Key} Token is
+
+{token}
+
+Put this value into your configuration file as
+'extractor.{category}.{key}-token'.
+
+Example:
+{{
+ "extractor": {{
+ "{category}": {{
+ "{key}-token": "{token}"
+ }}
+ }}
+}}
+"""
+
+
+MASTODON_MSG_TEMPLATE = """
+Your {Key} Token is
+
+{token}
+
+Put this value into your configuration file as
+'extractor.mastodon.{instance}.{key}-token'.
+
+You can also add your 'client-id' and 'client-secret' values
+if you want to register another account in the future.
+
+Example:
+{{
+ "extractor": {{
+ "mastodon": {{
+ "{instance}": {{
+ "{key}-token": "{token}",
+ "client-id": "{client_id}",
+ "client-secret": "{client_secret}"
+ }}
+ }}
+ }}
+}}
+"""
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
new file mode 100644
index 0000000..a4731d0
--- /dev/null
+++ b/gallery_dl/extractor/paheal.py
@@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://rule34.paheal.net/"""
+
+from .common import Extractor, Message, SharedConfigMixin
+from .. import text
+
+
+class PahealExtractor(SharedConfigMixin, Extractor):
+ """Base class for paheal extractors"""
+ basecategory = "booru"
+ category = "paheal"
+ filename_fmt = "{category}_{id}_{md5}.{extension}"
+ archive_fmt = "{id}"
+ root = "https://rule34.paheal.net"
+
+ def items(self):
+ yield Message.Version, 1
+ yield Message.Directory, self.get_metadata()
+
+ for data in self.get_posts():
+ url = data["file_url"]
+ for key in ("id", "width", "height"):
+ data[key] = text.parse_int(data[key])
+ data["tags"] = text.unquote(data["tags"])
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+ def get_metadata(self):
+ """Return general metadata"""
+ return {}
+
+ def get_posts(self):
+ """Return an iterable containing data of all relevant posts"""
+
+
+class PahealTagExtractor(PahealExtractor):
+ """Extractor for images from rule34.paheal.net by search-tags"""
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net"
+ r"/post/list/([^/?&#]+)")
+ test = ("https://rule34.paheal.net/post/list/k-on/1", {
+ "pattern": r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20",
+ "count": ">= 15"
+ })
+ per_page = 70
+
+ def __init__(self, match):
+ PahealExtractor.__init__(self, match)
+ self.tags = text.unquote(match.group(1))
+
+ def get_metadata(self):
+ return {"search_tags": self.tags}
+
+ def get_posts(self):
+ pnum = 1
+ while True:
+ url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum)
+ page = self.request(url).text
+
+ for post in text.extract_iter(
+ page, '<img id="thumb_', '>Image Only<'):
+ yield self._extract_data(post)
+
+ if ">Next<" not in page:
+ return
+ pnum += 1
+
+ @staticmethod
+ def _extract_data(post):
+ pid , pos = text.extract(post, '', '"')
+ data, pos = text.extract(post, 'title="', '"', pos)
+ md5 , pos = text.extract(post, '/_thumbs/', '/', pos)
+ url , pos = text.extract(post, '<a href="', '"', pos)
+
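+        # the 'title' attribute packs post metadata as
+        # "<tags> // <width>x<height> // <filesize> // ..."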
+ tags, dimensions, size, _ = data.split(" // ")
+ width, _, height = dimensions.partition("x")
+
+ return {
+ "id": pid, "md5": md5, "tags": tags, "file_url": url,
+ "width": width, "height": height,
+ "size": text.parse_bytes(size[:-1]),
+ }
+
+
+class PahealPostExtractor(PahealExtractor):
+ """Extractor for single images from rule34.paheal.net"""
+ subcategory = "post"
+ pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net"
+ r"/post/view/(\d+)")
+ test = ("https://rule34.paheal.net/post/view/481609", {
+ "url": "1142779378f655ec0497d4c301836aa667f788b1",
+ "keyword": "34e9e93d4fa6fa06fac1a56e78c9a52e8cd7b271",
+ "content": "7b924bcf150b352ac75c9d281d061e174c851a11",
+ })
+
+ def __init__(self, match):
+ PahealExtractor.__init__(self, match)
+ self.post_id = match.group(1)
+
+ def get_posts(self):
+ url = "{}/post/view/{}".format(self.root, self.post_id)
+ page = self.request(url).text
+
+ tags , pos = text.extract(page, ": ", "<")
+ md5 , pos = text.extract(page, "/_thumbs/", "/", pos)
+ url , pos = text.extract(page, "id='main_image' src='", "'", pos)
+ width , pos = text.extract(page, "data-width='", "'", pos)
+ height, pos = text.extract(page, "data-height='", "'", pos)
+
+ return ({
+ "id": self.post_id, "md5": md5, "tags": tags, "file_url": url,
+ "width": width, "height": height, "size": 0,
+ },)
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
new file mode 100644
index 0000000..4884497
--- /dev/null
+++ b/gallery_dl/extractor/patreon.py
@@ -0,0 +1,183 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.patreon.com/"""
+
+from .common import Extractor, Message
+from .. import text
+from ..cache import memcache
+
+
+class PatreonExtractor(Extractor):
+ """Base class for patreon extractors"""
+ category = "patreon"
+ root = "https://www.patreon.com"
+ directory_fmt = ("{category}", "{creator[full_name]}")
+ filename_fmt = "{id}_{title}_{num:>02}.{extension}"
+ archive_fmt = "{id}_{num}"
+ _warning = True
+
+ def items(self):
+ yield Message.Version, 1
+
+ if self._warning:
+ if "session_id" not in self.session.cookies:
+ self.log.warning("no 'session_id' cookie set")
+ PatreonExtractor._warning = False
+
+ for post in self.posts():
+ yield Message.Directory, post
+
+ post["num"] = 0
+ content = post.get("content")
+ postfile = post.get("post_file")
+
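+            # images embedded inline in the post body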
+ for url in text.extract_iter(content or "", 'src="', '"'):
+ post["num"] += 1
+ yield Message.Url, url, text.nameext_from_url(url, post)
+
+ if postfile:
+ post["num"] += 1
+ text.nameext_from_url(postfile["name"], post)
+ yield Message.Url, postfile["url"], post
+
+ for attachment in post["attachments"]:
+ post["num"] += 1
+ text.nameext_from_url(attachment["name"], post)
+ yield Message.Url, attachment["url"], post
+
+ def posts(self):
+ """Return all relevant post objects"""
+
+ def _pagination(self, url):
+ headers = {"Referer": self.root}
+ empty = []
+
+ while url:
+ posts = self.request(url, headers=headers).json()
+
+ if "included" not in posts:
+ return
+
+ # collect attachments
+ attachments = {}
+ for inc in posts["included"]:
+ if inc["type"] == "attachment":
+ attachments[inc["id"]] = inc["attributes"]
+
+ # update posts
+ for post in posts["data"]:
+ attr = post["attributes"]
+ attr["id"] = text.parse_int(post["id"])
+ attr["date"] = text.parse_datetime(
+ attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ attr["creator"] = self._user(
+ post["relationships"]["user"]["links"]["related"])
+
+ # add attachments to post attributes
+ files = post["relationships"].get("attachments")
+ if files:
+ attr["attachments"] = [
+ attachments[f["id"]]
+ for f in files["data"]
+ ]
+ else:
+ attr["attachments"] = empty
+
+ yield attr
+
+ if "links" not in posts:
+ return
+ url = posts["links"].get("next")
+
+ @memcache(keyarg=1)
+ def _user(self, url):
+ user = self.request(url).json()["data"]
+ attr = user["attributes"]
+ attr["id"] = user["id"]
+ attr["date"] = text.parse_datetime(
+ attr["created"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ return attr
+
+ @staticmethod
+ def _build_url(endpoint, query):
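+        # build an api.patreon.com URL with the long 'include' and
+        # 'fields' parameter lists expected by the web client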
+ return (
+ "https://www.patreon.com/api/" + endpoint +
+
+ "?include=user,attachments,user_defined_tags,campaign,poll.choices"
+ ",poll.current_user_responses.user,poll.current_user_responses.cho"
+ "ice,poll.current_user_responses.poll,access_rules.tier.null"
+
+ "&fields[post]=change_visibility_at,comment_count,content,current_"
+ "user_can_delete,current_user_can_view,current_user_has_liked,embe"
+ "d,image,is_paid,like_count,min_cents_pledged_to_view,post_file,pu"
+ "blished_at,patron_count,patreon_url,post_type,pledge_url,thumbnai"
+ "l_url,teaser_text,title,upgrade_url,url,was_posted_by_campaign_ow"
+ "ner"
+ "&fields[user]=image_url,full_name,url"
+ "&fields[campaign]=avatar_photo_url,earnings_visibility,is_nsfw,is"
+ "_monthly,name,url"
+ "&fields[access_rule]=access_rule_type,amount_cents" + query +
+
+ "&json-api-use-default-includes=false"
+ "&json-api-version=1.0"
+ )
+
+
+class PatreonCreatorExtractor(PatreonExtractor):
+ """Extractor for a creator's works"""
+ subcategory = "creator"
+ pattern = (r"(?:https?://)?(?:www\.)?patreon\.com"
+ r"/(?!(?:home|join|login|signup)(?:$|[/?&#]))([^/?&#]+)/?")
+ test = ("https://www.patreon.com/koveliana", {
+ "range": "1-25",
+ "count": ">= 25",
+ "keyword": {
+ "attachments": list,
+ "comment_count": int,
+ "content": str,
+ "creator": dict,
+ "date": "type:datetime",
+ "id": int,
+ "like_count": int,
+ "post_type": str,
+ "published_at": str,
+ "title": str,
+ },
+ })
+
+ def __init__(self, match):
+ PatreonExtractor.__init__(self, match)
+ self.creator = match.group(1).lower()
+
+ def posts(self):
+ url = "{}/{}".format(self.root, self.creator)
+ page = self.request(url).text
+ campaign_id = text.extract(page, "/campaign/", "/")[0]
+
+ url = self._build_url("posts", (
+ "&sort=-published_at"
+ "&filter[is_draft]=false"
+ "&filter[contains_exclusive_posts]=true"
+ "&filter[campaign_id]=" + campaign_id
+ ))
+ return self._pagination(url)
+
+
+class PatreonUserExtractor(PatreonExtractor):
+    """Extractor for media from all creators you are supporting"""
+ subcategory = "user"
+ pattern = r"(?:https?://)?(?:www\.)?patreon\.com/home$"
+ test = ("https://www.patreon.com/home",)
+
+ def posts(self):
+ url = self._build_url("stream", (
+ "&page[cursor]=null"
+ "&filter[is_following]=true"
+ ))
+ return self._pagination(url)
diff --git a/gallery_dl/extractor/photobucket.py b/gallery_dl/extractor/photobucket.py
new file mode 100644
index 0000000..83f75a3
--- /dev/null
+++ b/gallery_dl/extractor/photobucket.py
@@ -0,0 +1,178 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://photobucket.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+import base64
+import json
+
+
+class PhotobucketAlbumExtractor(Extractor):
+ """Extractor for albums on photobucket.com"""
+ category = "photobucket"
+ subcategory = "album"
+ directory_fmt = ("{category}", "{username}", "{location}")
+ filename_fmt = "{offset:>03}{pictureId:?_//}_{titleOrFilename}.{extension}"
+ archive_fmt = "{id}"
+ pattern = (r"(?:https?://)?((?:[^.]+\.)?photobucket\.com)"
+ r"/user/[^/?&#]+/library/[^?&#]*")
+ test = (
+ ("https://s258.photobucket.com/user/focolandia/library/", {
+ "pattern": r"https?://[oi]+\d+.photobucket.com/albums/hh280/",
+ "count": ">= 39"
+ }),
+ # subalbums of main "directory"
+ ("https://s271.photobucket.com/user/lakerfanryan/library/", {
+ "options": (("image-filter", "False"),),
+ "pattern": pattern,
+ "count": 1,
+ }),
+ # subalbums of subalbum without images
+ ("https://s271.photobucket.com/user/lakerfanryan/library/Basketball", {
+ "pattern": pattern,
+ "count": ">= 9",
+ }),
+ # private (missing JSON data)
+ ("https://s1277.photobucket.com/user/sinisterkat44/library/", {
+ "count": 0,
+ }),
+ ("https://s1110.photobucket.com/user/chndrmhn100/library/"
+ "Chandu%20is%20the%20King?sort=3&page=1"),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.album_path = ""
+ self.root = "https://" + match.group(1)
+ self.session.headers["Referer"] = self.url
+
+ def items(self):
+ yield Message.Version, 1
+ for image in self.images():
+ image["titleOrFilename"] = text.unescape(image["titleOrFilename"])
+ image["title"] = text.unescape(image["title"])
+ image["extension"] = image["ext"]
+ yield Message.Directory, image
+ yield Message.Url, image["fullsizeUrl"], image
+
+ if self.config("subalbums", True):
+ for album in self.subalbums():
+ album["_extractor"] = PhotobucketAlbumExtractor
+ yield Message.Queue, album["url"], album
+
+ def images(self):
+ """Yield all images of the current album"""
+ url = self.url
+ params = {"sort": "3", "page": 1}
+
+ while True:
+ page = self.request(url, params=params).text
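+            # album data is embedded as a JavaScript 'collectionData' object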
+ json_data = text.extract(page, "collectionData:", ",\n")[0]
+ if not json_data:
+ msg = text.extract(page, 'libraryPrivacyBlock">', "</div>")[0]
+ msg = ' ("{}")'.format(text.remove_html(msg)) if msg else ""
+ self.log.error("Unable to get JSON data%s", msg)
+ return
+ data = json.loads(json_data)
+
+ yield from data["items"]["objects"]
+
+ if data["total"] <= data["offset"] + data["pageSize"]:
+ self.album_path = data["currentAlbumPath"]
+ return
+ params["page"] += 1
+
+ def subalbums(self):
+ """Return all subalbum objects"""
+ url = self.root + "/component/Albums-SubalbumList"
+ params = {
+ "albumPath": self.album_path,
+ "fetchSubAlbumsOnly": "true",
+ "deferCollapsed": "true",
+ "json": "1",
+ }
+
+ data = self.request(url, params=params).json()
+ return data["body"].get("subAlbums", ())
+
+
+class PhotobucketImageExtractor(Extractor):
+ """Extractor for individual images from photobucket.com"""
+ category = "photobucket"
+ subcategory = "image"
+ directory_fmt = ("{category}", "{username}")
+ filename_fmt = "{pictureId:?/_/}{titleOrFilename}.{extension}"
+ archive_fmt = "{username}_{id}"
+ pattern = (r"(?:https?://)?(?:[^.]+\.)?photobucket\.com"
+ r"(?:/gallery/user/([^/?&#]+)/media/([^/?&#]+)"
+ r"|/user/([^/?&#]+)/media/[^?&#]+\.html)")
+ test = (
+ (("https://s271.photobucket.com/user/lakerfanryan"
+ "/media/Untitled-3-1.jpg.html"), {
+ "url": "3b647deeaffc184cc48c89945f67574559c9051f",
+ "keyword": "a2de4e60d584912537b8025c01bdd1d20bdea735",
+ }),
+ (("https://s271.photobucket.com/user/lakerfanryan"
+ "/media/IsotopeswBros.jpg.html?sort=3&o=2"), {
+ "url": "12c1890c09c9cdb8a88fba7eec13f324796a8d7b",
+ "keyword": "61200a223df6c06f45ac3d30c88b3f5b048ce9a8",
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user = match.group(1) or match.group(3)
+ self.media_id = match.group(2)
+ self.session.headers["Referer"] = self.url
+
+ def items(self):
+ url = "https://photobucket.com/galleryd/search.php"
+ params = {"userName": self.user, "searchTerm": "", "ref": ""}
+
+ if self.media_id:
+ params["mediaId"] = self.media_id
+ else:
+ params["url"] = self.url
+
+ # retry API call up to 5 times, since it can randomly fail
+ tries = 0
+ while tries < 5:
+ data = self.request(url, method="POST", params=params).json()
+ image = data["mediaDocuments"]
+ if "message" not in image:
+ break # success
+ tries += 1
+ self.log.debug("'%s'", image["message"])
+ else:
+ self.log.error("%s", image["message"])
+ raise exception.StopExtraction()
+
+ # adjust metadata entries to be at least somewhat similar
+ # to what the 'album' extractor provides
+ if "media" in image:
+ image = image["media"][image["mediaIndex"]]
+ image["albumView"] = data["mediaDocuments"]["albumView"]
+ image["username"] = image["ownerId"]
+ else:
+ image["fileUrl"] = image.pop("imageUrl")
+
+ image.setdefault("title", "")
+ image.setdefault("description", "")
+ name, _, ext = image["fileUrl"].rpartition("/")[2].rpartition(".")
+ image["ext"] = image["extension"] = ext
+ image["titleOrFilename"] = image["title"] or name
+ image["tags"] = image.pop("clarifaiTagList", [])
+
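+        # 'id' is a base64-encoded string of the form "mediaId:<number>"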
+ mtype, _, mid = base64.b64decode(image["id"]).partition(b":")
+ image["pictureId"] = mid.decode() if mtype == b"mediaId" else ""
+
+ yield Message.Version, 1
+ yield Message.Directory, image
+ yield Message.Url, image["fileUrl"], image
diff --git a/gallery_dl/extractor/piczel.py b/gallery_dl/extractor/piczel.py
new file mode 100644
index 0000000..6a5c41c
--- /dev/null
+++ b/gallery_dl/extractor/piczel.py
@@ -0,0 +1,118 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://piczel.tv/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class PiczelExtractor(Extractor):
+ """Base class for piczel extractors"""
+ category = "piczel"
+ directory_fmt = ("{category}", "{user[username]}")
+ filename_fmt = "{category}_{id}_{title}_{num:>02}.{extension}"
+ archive_fmt = "{id}_{num}"
+ root = "https://piczel.tv"
+ api_root = "https://apollo.piczel.tv"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.item_id = match.group(1)
+
+ def items(self):
+ first = True
+ yield Message.Version, 1
+ for image in self.unpack(self.get_images()):
+ if first:
+ yield Message.Directory, image
+ first = False
+ path = image["image"]["image"]["url"]
+ url = "{}/static/{}".format(self.api_root, path)
+ yield Message.Url, url, text.nameext_from_url(url, image)
+
+ @staticmethod
+ def unpack(images):
+ """Unpack 'images' into individual image objects"""
+ for image in images:
+ if image["multi"]:
+ multi = image["images"]
+ del image["images"]
+ for image["num"], img in enumerate(multi):
+ image["image"] = img
+ yield image
+ else:
+ image["num"] = 0
+ yield image
+
+ def get_images(self):
+ """Return an iterable with all relevant image objects"""
+
+
+class PiczelUserExtractor(PiczelExtractor):
+ """Extractor for all images from a user's gallery"""
+ subcategory = "user"
+ pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/([^/?&#]+)/?$"
+ test = ("https://piczel.tv/gallery/Lulena", {
+ "count": ">= 13",
+ })
+
+ def get_images(self):
+ url = "{}/api/users/{}/gallery".format(self.api_root, self.item_id)
+ return self.request(url).json()
+
+
+class PiczelFolderExtractor(PiczelExtractor):
+ """Extractor for images inside a user's folder"""
+ subcategory = "folder"
+ directory_fmt = ("{category}", "{user[username]}", "{folder[name]}")
+ archive_fmt = "f{folder[id]}_{id}_{num}"
+ pattern = (r"(?:https?://)?(?:www\.)?piczel\.tv"
+ r"/gallery/(?!image)[^/?&#]+/(\d+)")
+ test = ("https://piczel.tv/gallery/Lulena/1114", {
+ "count": ">= 4",
+ })
+
+ def get_images(self):
+ url = "{}/api/gallery/folder/{}".format(self.api_root, self.item_id)
+ images = self.request(url).json()
+ images.reverse()
+ return images
+
+
+class PiczelImageExtractor(PiczelExtractor):
+ """Extractor for individual images"""
+ subcategory = "image"
+ pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/image/(\d+)"
+ test = ("https://piczel.tv/gallery/image/7807", {
+ "url": "9b9e416b6ab7e58676fab84453d5028f306ece34",
+ "content": "df9a053a24234474a19bce2b7e27e0dec23bff87",
+ "keyword": {
+ "created_at": "2018-07-22T05:13:58.000Z",
+ "description": None,
+ "extension": "png",
+ "favorites_count": int,
+ "folder": dict,
+ "folder_id": 1113,
+ "id": 7807,
+ "is_flash": False,
+ "is_video": False,
+ "multi": False,
+ "nsfw": False,
+ "num": 0,
+ "password_protected": False,
+ "tags": "fanart, commission, altair, recreators, ",
+ "title": "Altair",
+ "user": dict,
+ "views": int,
+ },
+ })
+
+ def get_images(self):
+ url = "{}/api/gallery/image/{}".format(self.api_root, self.item_id)
+ return (self.request(url).json(),)
diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py
new file mode 100644
index 0000000..fa8cd48
--- /dev/null
+++ b/gallery_dl/extractor/pinterest.py
@@ -0,0 +1,260 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.pinterest.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+import json
+
+
+BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.\w+"
+
+
+class PinterestExtractor(Extractor):
+ """Base class for pinterest extractors"""
+ category = "pinterest"
+ filename_fmt = "{category}_{id}.{extension}"
+ archive_fmt = "{id}"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.api = PinterestAPI(self)
+
+ def items(self):
+ data = self.metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ for pin in self.pins():
+ if "images" in pin:
+ url, pin_data = self.data_from_pin(pin)
+ pin_data.update(data)
+ yield Message.Url, url, pin_data
+
+ def metadata(self):
+ """Return general metadata"""
+
+ def pins(self):
+ """Return all relevant pin-objects"""
+
+ @staticmethod
+ def data_from_pin(pin):
+ """Get image url and metadata from a pin-object"""
+ img = pin["images"]["orig"]
+ url = img["url"]
+ pin["width"] = img["width"]
+ pin["height"] = img["height"]
+ return url, text.nameext_from_url(url, pin)
+
+
+class PinterestPinExtractor(PinterestExtractor):
+ """Extractor for images from a single pin from pinterest.com"""
+ subcategory = "pin"
+ pattern = BASE_PATTERN + r"/pin/([^/?#&]+)(?!.*#related$)"
+ test = (
+ ("https://www.pinterest.com/pin/858146903966145189/", {
+ "url": "afb3c26719e3a530bb0e871c480882a801a4e8a5",
+ # image version depends on CDN server used
+ # "content": "d3e24bc9f7af585e8c23b9136956bd45a4d9b947",
+ # "content": "4c435a66f6bb82bb681db2ecc888f76cf6c5f9ca",
+ }),
+ ("https://www.pinterest.com/pin/858146903966145188/", {
+ "exception": exception.NotFoundError,
+ }),
+ )
+
+ def __init__(self, match):
+ PinterestExtractor.__init__(self, match)
+ self.pin_id = match.group(1)
+ self.pin = None
+
+ def metadata(self):
+ self.pin = self.api.pin(self.pin_id)
+ return self.data_from_pin(self.pin)[1]
+
+ def pins(self):
+ return (self.pin,)
+
+
+class PinterestBoardExtractor(PinterestExtractor):
+ """Extractor for images from a board from pinterest.com"""
+ subcategory = "board"
+ directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}")
+ archive_fmt = "{board[id]}_{id}"
+ pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/([^/?#&]+)(?!.*#related$)"
+ test = (
+ ("https://www.pinterest.com/g1952849/test-/", {
+ "pattern": r"https://i\.pinimg\.com/originals/",
+ "count": 2,
+ }),
+ ("https://www.pinterest.com/g1952848/test/", {
+ "exception": exception.GalleryDLException,
+ }),
+ )
+
+ def __init__(self, match):
+ PinterestExtractor.__init__(self, match)
+ self.user = text.unquote(match.group(1))
+ self.board = text.unquote(match.group(2))
+ self.board_id = 0
+
+ def metadata(self):
+ board = self.api.board(self.user, self.board)
+ self.board_id = board["id"]
+ return {"board": board}
+
+ def pins(self):
+ return self.api.board_pins(self.board_id)
+
+
+class PinterestRelatedPinExtractor(PinterestPinExtractor):
+ """Extractor for related pins of another pin from pinterest.com"""
+ subcategory = "related-pin"
+ directory_fmt = ("{category}", "related {original_pin[id]}")
+ pattern = BASE_PATTERN + r"/pin/([^/?#&]+).*#related$"
+ test = ("https://www.pinterest.com/pin/858146903966145189/#related", {
+ "range": "31-50",
+ "count": 20,
+ })
+
+ def metadata(self):
+ pin = self.api.pin(self.pin_id)
+ return {"original_pin": self.data_from_pin(pin)[1]}
+
+ def pins(self):
+ return self.api.pin_related(self.pin_id)
+
+
+class PinterestRelatedBoardExtractor(PinterestBoardExtractor):
+ """Extractor for related pins of a board from pinterest.com"""
+ subcategory = "related-board"
+ directory_fmt = ("{category}", "{board[owner][username]}",
+ "{board[name]}", "related")
+ pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/([^/?#&]+).*#related$"
+ test = ("https://www.pinterest.com/g1952849/test-/#related", {
+ "range": "31-50",
+ "count": 20,
+ })
+
+ def pins(self):
+ return self.api.board_related(self.board_id)
+
+
+class PinterestPinitExtractor(PinterestExtractor):
+ """Extractor for images from a pin.it URL"""
+ subcategory = "pinit"
+ pattern = r"(?:https?://)?pin\.it/([^/?#&]+)"
+
+ test = (
+ ("https://pin.it/Hvt8hgT", {
+ "url": "8daad8558382c68f0868bdbd17d05205184632fa",
+ }),
+ ("https://pin.it/Hvt8hgS", {
+ "exception": exception.NotFoundError,
+ }),
+ )
+
+ def __init__(self, match):
+ PinterestExtractor.__init__(self, match)
+ self.shortened_id = match.group(1)
+
+ def items(self):
+ url = "https://api.pinterest.com/url_shortener/{}/redirect".format(
+ self.shortened_id)
+ response = self.request(url, method="HEAD", allow_redirects=False)
+ location = response.headers.get("Location")
+ if not location or location in ("https://api.pinterest.com/None",
+ "https://pin.it/None",
+ "https://www.pinterest.com"):
+ raise exception.NotFoundError("pin")
+ yield Message.Queue, location, {}
+
+
+class PinterestAPI():
+ """Minimal interface for the Pinterest Web API
+
+ For a better and more complete implementation in PHP, see
+ - https://github.com/seregazhuk/php-pinterest-bot
+ """
+
+ BASE_URL = "https://www.pinterest.com"
+ HEADERS = {
+ "Accept" : "application/json, text/javascript, "
+ "*/*, q=0.01",
+ "Accept-Language" : "en-US,en;q=0.5",
+ "X-Pinterest-AppState": "active",
+ "X-APP-VERSION" : "cb1c7f9",
+ "X-Requested-With" : "XMLHttpRequest",
+ "Origin" : BASE_URL + "/",
+ }
+
+ def __init__(self, extractor):
+ self.extractor = extractor
+
+ def pin(self, pin_id):
+ """Query information about a pin"""
+ options = {"id": pin_id, "field_set_key": "detailed"}
+ return self._call("Pin", options)["resource_response"]["data"]
+
+ def pin_related(self, pin_id):
+ """Yield related pins of another pin"""
+ options = {"pin": pin_id, "add_vase": True, "pins_only": True}
+ return self._pagination("RelatedPinFeed", options)
+
+ def board(self, user, board):
+ """Query information about a board"""
+ options = {"slug": board, "username": user,
+ "field_set_key": "detailed"}
+ return self._call("Board", options)["resource_response"]["data"]
+
+ def board_pins(self, board_id):
+ """Yield all pins of a specific board"""
+ options = {"board_id": board_id}
+ return self._pagination("BoardFeed", options)
+
+ def board_related(self, board_id):
+ """Yield related pins of a specific board"""
+ options = {"board_id": board_id, "add_vase": True}
+ return self._pagination("BoardRelatedPixieFeed", options)
+
+ def _call(self, resource, options):
+ url = "{}/resource/{}Resource/get/".format(self.BASE_URL, resource)
+ params = {"data": json.dumps({"options": options}), "source_url": ""}
+
+ response = self.extractor.request(
+ url, params=params, headers=self.HEADERS, expect=range(400, 500))
+
+ try:
+ data = response.json()
+ except ValueError:
+ data = {}
+
+ if 200 <= response.status_code < 400 and not response.history:
+ return data
+
+ if response.status_code == 404 or response.history:
+ resource = self.extractor.subcategory.rpartition("-")[2]
+ raise exception.NotFoundError(resource)
+ self.extractor.log.error("API request failed")
+ self.extractor.log.debug("%s", response.text)
+ raise exception.StopExtraction()
+
+ def _pagination(self, resource, options):
+ while True:
+ data = self._call(resource, options)
+ yield from data["resource_response"]["data"]
+
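+            # 'bookmarks' holds the continuation token for the next page;
+            # an empty list, "-end-", or a "Y2JOb25lO..." value marks the end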
+ try:
+ bookmarks = data["resource"]["options"]["bookmarks"]
+ if (not bookmarks or bookmarks[0] == "-end-" or
+ bookmarks[0].startswith("Y2JOb25lO")):
+ return
+ options["bookmarks"] = bookmarks
+ except KeyError:
+ return
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
new file mode 100644
index 0000000..af29c4b
--- /dev/null
+++ b/gallery_dl/extractor/pixiv.py
@@ -0,0 +1,517 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images and ugoira from https://www.pixiv.net/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+from ..cache import cache
+from datetime import datetime, timedelta
+
+
+class PixivExtractor(Extractor):
+ """Base class for pixiv extractors"""
+ category = "pixiv"
+ directory_fmt = ("{category}", "{user[id]} {user[account]}")
+ filename_fmt = "{category}_{user[id]}_{id}{num}.{extension}"
+ archive_fmt = "{id}{num}.{extension}"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.api = PixivAppAPI(self)
+ self.user_id = -1
+ self.load_ugoira = self.config("ugoira", True)
+
+ def items(self):
+ metadata = self.get_metadata()
+ yield Message.Version, 1
+
+ for work in self.works():
+ if not work["user"]["id"]:
+ continue
+
+ meta_single_page = work["meta_single_page"]
+ meta_pages = work["meta_pages"]
+ del work["meta_single_page"]
+ del work["image_urls"]
+ del work["meta_pages"]
+ work["num"] = ""
+ work["tags"] = [tag["name"] for tag in work["tags"]]
+ work["date"] = text.parse_datetime(work["create_date"])
+ work.update(metadata)
+
+ yield Message.Directory, work
+
+ if work["type"] == "ugoira":
+ if not self.load_ugoira:
+ continue
+ ugoira = self.api.ugoira_metadata(work["id"])
+
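+                # replace the medium-size zip URL with the 1920x1080 version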
+ url = ugoira["zip_urls"]["medium"].replace(
+ "_ugoira600x600", "_ugoira1920x1080")
+ work["frames"] = ugoira["frames"]
+ work["extension"] = "zip"
+ yield Message.Url, url, work
+
+ elif work["page_count"] == 1:
+ url = meta_single_page["original_image_url"]
+ work["extension"] = url.rpartition(".")[2]
+ yield Message.Url, url, work
+
+ else:
+ for num, img in enumerate(meta_pages):
+ url = img["image_urls"]["original"]
+ work["num"] = "_p{:02}".format(num)
+ work["extension"] = url.rpartition(".")[2]
+ yield Message.Url, url, work
+
+ def works(self):
+ """Return an iterable containing all relevant 'work'-objects"""
+
+ def get_metadata(self, user=None):
+ """Collect metadata for extractor-job"""
+ if not user:
+ user = self.api.user_detail(self.user_id)
+ return {"user": user}
+
+
+class PixivUserExtractor(PixivExtractor):
+ """Extractor for works of a pixiv-user"""
+ subcategory = "user"
+ pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/"
+ r"(?:member(?:_illust)?\.php\?id=(\d+)(?:&([^#]+))?"
+ r"|(?:u(?:ser)?/|(?:mypage\.php)?#id=)(\d+))")
+ test = (
+ ("http://www.pixiv.net/member_illust.php?id=173530", {
+ "url": "852c31ad83b6840bacbce824d85f2a997889efb7",
+ }),
+ # illusts with specific tag
+ (("https://www.pixiv.net/member_illust.php?id=173530"
+ "&tag=%E6%89%8B%E3%81%B6%E3%82%8D"), {
+ "url": "25b1cd81153a8ff82eec440dd9f20a4a22079658",
+ }),
+ ("http://www.pixiv.net/member_illust.php?id=173531", {
+ "exception": exception.NotFoundError,
+ }),
+ ("https://www.pixiv.net/u/173530"),
+ ("https://www.pixiv.net/user/173530"),
+ ("https://www.pixiv.net/mypage.php#id=173530"),
+ ("https://www.pixiv.net/#id=173530"),
+ ("https://touch.pixiv.net/member_illust.php?id=173530"),
+ )
+
+ def __init__(self, match):
+ PixivExtractor.__init__(self, match)
+ self.user_id = match.group(1) or match.group(3)
+ self.query = text.parse_query(match.group(2))
+
+ def works(self):
+ works = self.api.user_illusts(self.user_id)
+
+ if "tag" in self.query:
+ tag = text.unquote(self.query["tag"]).lower()
+ works = (
+ work for work in works
+ if tag in [t["name"].lower() for t in work["tags"]]
+ )
+
+ return works
+
+
+class PixivMeExtractor(PixivExtractor):
+ """Extractor for pixiv.me URLs"""
+ subcategory = "me"
+ pattern = r"(?:https?://)?pixiv\.me/([^/?&#]+)"
+ test = (
+ ("https://pixiv.me/del_shannon", {
+ "url": "0b1a18c3e3553c44ee6e0ccc36a7fd906c498e8f",
+ }),
+ ("https://pixiv.me/del_shanno", {
+ "exception": exception.NotFoundError,
+ }),
+ )
+
+ def __init__(self, match):
+ PixivExtractor.__init__(self, match)
+ self.account = match.group(1)
+
+ def items(self):
+ url = "https://pixiv.me/" + self.account
+ response = self.request(
+ url, method="HEAD", allow_redirects=False, expect=(404,))
+ if response.status_code == 404:
+ raise exception.NotFoundError("user")
+ yield Message.Version, 1
+ yield Message.Queue, response.headers["Location"], {}
+
+
+class PixivWorkExtractor(PixivExtractor):
+ """Extractor for a single pixiv work/illustration"""
+ subcategory = "work"
+ pattern = (r"(?:https?://)?(?:(?:www\.|touch\.)?pixiv\.net"
+ r"/member(?:_illust)?\.php\?(?:[^&]+&)*illust_id=(\d+)"
+ r"|(?:i(?:\d+\.pixiv|\.pximg)\.net"
+ r"/(?:(?:.*/)?img-[^/]+/img/\d{4}(?:/\d\d){5}|img\d+/img/[^/]+)"
+ r"|img\d*\.pixiv\.net/img/[^/]+|(?:www\.)?pixiv\.net/i)/(\d+))")
+ test = (
+ (("http://www.pixiv.net/member_illust.php"
+ "?mode=medium&illust_id=966412"), {
+ "url": "90c1715b07b0d1aad300bce256a0bc71f42540ba",
+ "content": "69a8edfb717400d1c2e146ab2b30d2c235440c5a",
+ }),
+ (("http://www.pixiv.net/member_illust.php"
+ "?mode=medium&illust_id=966411"), {
+ "exception": exception.NotFoundError,
+ }),
+ # ugoira
+ (("https://www.pixiv.net/member_illust.php"
+ "?mode=medium&illust_id=66806629"), {
+ "url": "7267695a985c4db8759bebcf8d21dbdd2d2317ef",
+            "keyword": {"frames": list},
+ }),
+ ("http://i1.pixiv.net/c/600x600/img-master"
+ "/img/2008/06/13/00/29/13/966412_p0_master1200.jpg"),
+ ("https://i.pximg.net/img-original"
+ "/img/2017/04/25/07/33/29/62568267_p0.png"),
+ ("https://www.pixiv.net/i/966412"),
+ ("http://img.pixiv.net/img/soundcross/42626136.jpg"),
+ ("http://i2.pixiv.net/img76/img/snailrin/42672235.jpg"),
+ )
+
+ def __init__(self, match):
+ PixivExtractor.__init__(self, match)
+ self.illust_id = match.group(1) or match.group(2)
+ self.load_ugoira = True
+ self.work = None
+
+ def works(self):
+ return (self.work,)
+
+ def get_metadata(self, user=None):
+ self.work = self.api.illust_detail(self.illust_id)
+ return PixivExtractor.get_metadata(self, self.work["user"])
+
+
+class PixivFavoriteExtractor(PixivExtractor):
+ """Extractor for all favorites/bookmarks of a pixiv-user"""
+ subcategory = "favorite"
+ directory_fmt = ("{category}", "bookmarks",
+ "{user_bookmark[id]} {user_bookmark[account]}")
+ archive_fmt = "f_{user_bookmark[id]}_{id}{num}.{extension}"
+ pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net"
+ r"/bookmark\.php(?:\?([^#]*))?")
+ test = (
+ ("https://www.pixiv.net/bookmark.php?id=173530", {
+ "url": "e717eb511500f2fa3497aaee796a468ecf685cc4",
+ }),
+ # bookmarks with specific tag
+ (("https://www.pixiv.net/bookmark.php?id=3137110"
+ "&tag=%E3%81%AF%E3%82%93%E3%82%82%E3%82%93&p=1"), {
+ "count": 2,
+ }),
+ # own bookmarks
+ ("https://www.pixiv.net/bookmark.php", {
+ "url": "90c1715b07b0d1aad300bce256a0bc71f42540ba",
+ }),
+ # touch URLs
+ ("https://touch.pixiv.net/bookmark.php?id=173530"),
+ ("https://touch.pixiv.net/bookmark.php"),
+ )
+
+ def __init__(self, match):
+ PixivExtractor.__init__(self, match)
+ self.query = text.parse_query(match.group(1))
+ if "id" not in self.query:
+ self.subcategory = "bookmark"
+
+ def works(self):
+ tag = None
+ restrict = "public"
+
+ if "tag" in self.query:
+ tag = text.unquote(self.query["tag"])
+ if "rest" in self.query and self.query["rest"] == "hide":
+ restrict = "private"
+
+ return self.api.user_bookmarks_illust(self.user_id, tag, restrict)
+
+ def get_metadata(self, user=None):
+ if "id" in self.query:
+ user = self.api.user_detail(self.query["id"])
+ else:
+ self.api.login()
+ user = self.api.user
+
+ self.user_id = user["id"]
+ return {"user_bookmark": user}
+
+
+class PixivRankingExtractor(PixivExtractor):
+ """Extractor for pixiv ranking pages"""
+ subcategory = "ranking"
+ archive_fmt = "r_{ranking[mode]}_{ranking[date]}_{id}{num}.{extension}"
+ directory_fmt = ("{category}", "rankings",
+ "{ranking[mode]}", "{ranking[date]}")
+ pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net"
+ r"/ranking\.php(?:\?([^#]*))?")
+ test = (
+ ("https://www.pixiv.net/ranking.php?mode=daily&date=20170818"),
+ ("https://www.pixiv.net/ranking.php"),
+ ("https://touch.pixiv.net/ranking.php"),
+ )
+
+ def __init__(self, match):
+ PixivExtractor.__init__(self, match)
+ self.query = match.group(1)
+ self.mode = self.date = None
+
+ def works(self):
+ return self.api.illust_ranking(self.mode, self.date)
+
+ def get_metadata(self, user=None):
+ query = text.parse_query(self.query)
+
+ mode = query.get("mode", "daily").lower()
+ mode_map = {
+ "daily": "day",
+ "daily_r18": "day_r18",
+ "weekly": "week",
+ "weekly_r18": "week_r18",
+ "monthly": "month",
+ "male": "day_male",
+ "male_r18": "day_male_r18",
+ "female": "day_female",
+ "female_r18": "day_female_r18",
+ "original": "week_original",
+ "rookie": "week_rookie",
+ "r18g": "week_r18g",
+ }
+ if mode not in mode_map:
+ self.log.warning("invalid mode '%s'", mode)
+ mode = "daily"
+ self.mode = mode_map[mode]
+
+ date = query.get("date")
+ if date:
+ if len(date) == 8 and date.isdecimal():
+ date = "{}-{}-{}".format(date[0:4], date[4:6], date[6:8])
+ else:
+ self.log.warning("invalid date '%s'", date)
+ date = None
+ if not date:
+ date = (datetime.utcnow() - timedelta(days=1)).strftime("%Y-%m-%d")
+ self.date = date
+
+ return {"ranking": {
+ "mode": mode,
+ "date": self.date,
+ }}
+
+
+class PixivSearchExtractor(PixivExtractor):
+ """Extractor for pixiv search results"""
+ subcategory = "search"
+ archive_fmt = "s_{search[word]}_{id}{num}.{extension}"
+ directory_fmt = ("{category}", "search", "{search[word]}")
+ pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net"
+ r"/search\.php\?([^#]+)")
+ test = (
+ ("https://www.pixiv.net/search.php?s_mode=s_tag&word=Original"),
+ ("https://touch.pixiv.net/search.php?word=Original"),
+ )
+
+ def __init__(self, match):
+ PixivExtractor.__init__(self, match)
+ self.query = match.group(1)
+ self.word = self.sort = self.target = None
+
+ def works(self):
+ return self.api.search_illust(self.word, self.sort, self.target)
+
+ def get_metadata(self, user=None):
+ query = text.parse_query(self.query)
+
+ if "word" in query:
+ self.word = text.unescape(query["word"])
+ else:
+ self.log.error("missing search term")
+ raise exception.StopExtraction()
+
+ sort = query.get("order", "date_d")
+ sort_map = {
+ "date": "date_asc",
+ "date_d": "date_desc",
+ }
+ if sort not in sort_map:
+ self.log.warning("invalid sort order '%s'", sort)
+ sort = "date_d"
+ self.sort = sort_map[sort]
+
+ target = query.get("s_mode", "s_tag")
+ target_map = {
+ "s_tag": "partial_match_for_tags",
+ "s_tag_full": "exact_match_for_tags",
+ "s_tc": "title_and_caption",
+ }
+ if target not in target_map:
+ self.log.warning("invalid search target '%s'", target)
+ target = "s_tag"
+ self.target = target_map[target]
+
+ return {"search": {
+ "word": self.word,
+ "sort": self.sort,
+ "target": self.target,
+ }}
+
+
+class PixivFollowExtractor(PixivExtractor):
+ """Extractor for new illustrations from your followed artists"""
+ subcategory = "follow"
+ archive_fmt = "F_{user_follow[id]}_{id}{num}.{extension}"
+ directory_fmt = ("{category}", "following")
+ pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net"
+ r"/bookmark_new_illust\.php")
+ test = (
+ ("https://www.pixiv.net/bookmark_new_illust.php"),
+ ("https://touch.pixiv.net/bookmark_new_illust.php"),
+ )
+
+ def works(self):
+ return self.api.illust_follow()
+
+ def get_metadata(self, user=None):
+ self.api.login()
+ return {"user_follow": self.api.user}
+
+
+class PixivAppAPI():
+ """Minimal interface for the Pixiv App API for mobile devices
+
+ For a more complete implementation or documentation, see
+ - https://github.com/upbit/pixivpy
+ - https://gist.github.com/ZipFile/3ba99b47162c23f8aea5d5942bb557b1
+ """
+ CLIENT_ID = "MOBrBDS8blbauoSck0ZfDbtuzpyT"
+ CLIENT_SECRET = "lsACyCD94FhDUtGTXi3QzcFE2uU1hqtDaKeqrdwj"
+
+ def __init__(self, extractor):
+ self.extractor = extractor
+ self.log = extractor.log
+ self.username, self.password = extractor._get_auth_info()
+ self.user = None
+
+ self.client_id = extractor.config(
+ "client-id", self.CLIENT_ID)
+ self.client_secret = extractor.config(
+ "client-secret", self.CLIENT_SECRET)
+
+ extractor.session.headers.update({
+ "App-OS": "ios",
+ "App-OS-Version": "10.3.1",
+ "App-Version": "6.7.1",
+ "User-Agent": "PixivIOSApp/6.7.1 (iOS 10.3.1; iPhone8,1)",
+ "Referer": "https://app-api.pixiv.net/",
+ })
+
+ def login(self):
+ """Login and gain an access token"""
+ self.user, auth = self._login_impl(self.username, self.password)
+ self.extractor.session.headers["Authorization"] = auth
+
+ @cache(maxage=3600, keyarg=1)
+ def _login_impl(self, username, password):
+ url = "https://oauth.secure.pixiv.net/auth/token"
+ data = {
+ "client_id": self.client_id,
+ "client_secret": self.client_secret,
+ "get_secure_url": 1,
+ }
+ refresh_token = _refresh_token_cache(username)
+
+ if refresh_token:
+ self.log.info("Refreshing access token")
+ data["grant_type"] = "refresh_token"
+ data["refresh_token"] = refresh_token
+ else:
+ self.log.info("Logging in as %s", username)
+ data["grant_type"] = "password"
+ data["username"] = username
+ data["password"] = password
+
+ response = self.extractor.request(
+ url, method="POST", data=data, expect=(400,))
+ if response.status_code >= 400:
+ raise exception.AuthenticationError()
+
+ data = response.json()["response"]
+ if not refresh_token:
+ _refresh_token_cache.update(username, data["refresh_token"])
+ return data["user"], "Bearer " + data["access_token"]
+
+ def illust_detail(self, illust_id):
+ params = {"illust_id": illust_id}
+ return self._call("v1/illust/detail", params)["illust"]
+
+ def illust_follow(self, restrict="all"):
+ params = {"restrict": restrict}
+ return self._pagination("v2/illust/follow", params)
+
+ def illust_ranking(self, mode="day", date=None):
+ params = {"mode": mode, "date": date}
+ return self._pagination("v1/illust/ranking", params)
+
+ def search_illust(self, word, sort=None, target=None, duration=None):
+ params = {"word": word, "search_target": target,
+ "sort": sort, "duration": duration}
+ return self._pagination("v1/search/illust", params)
+
+ def user_bookmarks_illust(self, user_id, tag=None, restrict="public"):
+ params = {"user_id": user_id, "tag": tag, "restrict": restrict}
+ return self._pagination("v1/user/bookmarks/illust", params)
+
+ def user_detail(self, user_id):
+ params = {"user_id": user_id}
+ return self._call("v1/user/detail", params)["user"]
+
+ def user_illusts(self, user_id):
+ params = {"user_id": user_id}
+ return self._pagination("v1/user/illusts", params)
+
+ def ugoira_metadata(self, illust_id):
+ params = {"illust_id": illust_id}
+ return self._call("v1/ugoira/metadata", params)["ugoira_metadata"]
+
+ def _call(self, endpoint, params=None):
+ url = "https://app-api.pixiv.net/" + endpoint
+
+ self.login()
+ response = self.extractor.request(
+ url, params=params, expect=range(400, 500))
+
+ if 200 <= response.status_code < 400:
+ return response.json()
+ if response.status_code == 404:
+ raise exception.NotFoundError()
+ self.log.error("API request failed: %s", response.text)
+ raise exception.StopExtraction()
+
+ def _pagination(self, endpoint, params):
+ while True:
+ data = self._call(endpoint, params)
+ yield from data["illusts"]
+
+ if not data["next_url"]:
+ return
+ query = data["next_url"].rpartition("?")[2]
+ params = text.parse_query(query)
+
+
+@cache(maxage=10*365*24*3600, keyarg=0)
+def _refresh_token_cache(username):
+ return None
diff --git a/gallery_dl/extractor/pixnet.py b/gallery_dl/extractor/pixnet.py
new file mode 100644
index 0000000..9cada6b
--- /dev/null
+++ b/gallery_dl/extractor/pixnet.py
@@ -0,0 +1,179 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.pixnet.net/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+BASE_PATTERN = r"(?:https?://)?(?!www\.)([^.]+)\.pixnet\.net"
+
+
+class PixnetExtractor(Extractor):
+ """Base class for pixnet extractors"""
+ category = "pixnet"
+ filename_fmt = "{num:>03}_{id}.{extension}"
+ archive_fmt = "{id}"
+ url_fmt = ""
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.blog, self.item_id = match.groups()
+ self.root = "https://{}.pixnet.net".format(self.blog)
+
+ def items(self):
+ url = self.url_fmt.format(self.root, self.item_id)
+ page = self.request(url, encoding="utf-8").text
+ user = text.extract(page, '<meta name="author" content="', '";')[0]
+ data = {
+ "blog": self.blog,
+ "user": user.rpartition(" (")[0],
+ }
+
+ for info in self._pagination(page):
+ url, pos = text.extract(info, ' href="', '"')
+ alt, pos = text.extract(info, ' alt="', '"', pos)
+ item = {
+ "id" : text.parse_int(url.rpartition("/")[2]),
+ "title" : text.unescape(alt),
+ "_extractor": (PixnetFolderExtractor if "/folder/" in url else
+ PixnetSetExtractor),
+ }
+ item.update(data)
+ yield Message.Queue, url, item
+
+ def _pagination(self, page):
+ while True:
+ yield from text.extract_iter(page, '<li id="', '</li>')
+
+ pnext = text.extract(page, 'class="nextBtn"', '>')[0]
+ if "href" not in pnext:
+ return
+ url = self.root + text.extract(pnext, 'href="', '"')[0]
+ page = self.request(url, encoding="utf-8").text
+
+
+class PixnetImageExtractor(PixnetExtractor):
+ """Extractor for a single photo from pixnet.net"""
+ subcategory = "image"
+ filename_fmt = "{id}.{extension}"
+ directory_fmt = ("{category}", "{blog}")
+ pattern = BASE_PATTERN + r"/album/photo/(\d+)"
+ test = ("https://albertayu773.pixnet.net/album/photo/159443828", {
+ "url": "156564c422138914c9fa5b42191677b45c414af4",
+ "keyword": "19971bcd056dfef5593f4328a723a9602be0f087",
+ "content": "0e097bdf49e76dd9b9d57a016b08b16fa6a33280",
+ })
+
+ def items(self):
+ url = "https://api.pixnet.cc/oembed"
+ params = {
+ "url": "https://{}.pixnet.net/album/photo/{}".format(
+ self.blog, self.item_id),
+ "format": "json",
+ }
+
+ data = self.request(url, params=params).json()
+ data["id"] = text.parse_int(
+ data["url"].rpartition("/")[2].partition("-")[0])
+ data["filename"], _, data["extension"] = data["title"].rpartition(".")
+ data["blog"] = self.blog
+ data["user"] = data.pop("author_name")
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ yield Message.Url, data["url"], data
+
+
+class PixnetSetExtractor(PixnetExtractor):
+ """Extractor for images from a pixnet set"""
+ subcategory = "set"
+ url_fmt = "{}/album/set/{}"
+ directory_fmt = ("{category}", "{blog}",
+ "{folder_id} {folder_title}", "{set_id} {set_title}")
+ pattern = BASE_PATTERN + r"/album/set/(\d+)"
+ test = (
+ ("https://albertayu773.pixnet.net/album/set/15078995", {
+ "url": "6535712801af47af51110542f4938a7cef44557f",
+ "keyword": "bf25d59e5b0959cb1f53e7fd2e2a25f2f67e5925",
+ }),
+ ("https://anrine910070.pixnet.net/album/set/5917493", {
+ "url": "b3eb6431aea0bcf5003432a4a0f3a3232084fc13",
+ "keyword": "bf7004faa1cea18cf9bd856f0955a69be51b1ec6",
+ }),
+ )
+
+ def items(self):
+ url = self.url_fmt.format(self.root, self.item_id)
+ page = self.request(url, encoding="utf-8").text
+ data = self.metadata(page)
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for num, info in enumerate(self._pagination(page), 1):
+ url, pos = text.extract(info, ' href="', '"')
+ src, pos = text.extract(info, ' src="', '"', pos)
+ alt, pos = text.extract(info, ' alt="', '"', pos)
+
+ photo = {
+ "id": text.parse_int(url.rpartition("/")[2].partition("#")[0]),
+ "url": src.replace("_s.", "."),
+ "num": num,
+ "filename": alt,
+ "extension": src.rpartition(".")[2],
+ }
+ photo.update(data)
+ yield Message.Url, photo["url"], photo
+
+ def metadata(self, page):
+ user , pos = text.extract(page, '<meta name="author" content="', '";')
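+        # folder and set names/IDs are taken from the breadcrumb links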
+ _ , pos = text.extract(page, 'id="breadcrumb"', '', pos)
+ fid , pos = text.extract(page, '/folder/', '"', pos)
+ fname, pos = text.extract(page, '>', '<', pos)
+ sid , pos = text.extract(page, '/set/', '"', pos)
+ sname, pos = text.extract(page, '>', '<', pos)
+ return {
+ "blog": self.blog,
+ "user": user.rpartition(" (")[0],
+ "folder_id" : text.parse_int(fid, ""),
+ "folder_title": text.unescape(fname).strip(),
+ "set_id" : text.parse_int(sid),
+ "set_title" : text.unescape(sname),
+ }
+
+
+class PixnetFolderExtractor(PixnetExtractor):
+ """Extractor for all sets in a pixnet folder"""
+ subcategory = "folder"
+ url_fmt = "{}/album/folder/{}"
+ pattern = BASE_PATTERN + r"/album/folder/(\d+)"
+ test = ("https://albertayu773.pixnet.net/album/folder/1405768", {
+ "pattern": PixnetSetExtractor.pattern,
+ "count": ">= 15",
+ })
+
+
+class PixnetUserExtractor(PixnetExtractor):
+ """Extractor for all sets and folders of a pixnet user"""
+ subcategory = "user"
+ url_fmt = "{}{}/album/list"
+ pattern = BASE_PATTERN + r"()(?:/blog|/album(?:/list)?)?/?(?:$|[?&#])"
+ test = (
+ ("https://albertayu773.pixnet.net/"),
+ ("https://albertayu773.pixnet.net/blog"),
+ ("https://albertayu773.pixnet.net/album"),
+ ("https://albertayu773.pixnet.net/album/list", {
+ "pattern": PixnetFolderExtractor.pattern,
+ "count": ">= 30",
+ }),
+ ("https://anrine910070.pixnet.net/album/list", {
+ "pattern": PixnetSetExtractor.pattern,
+ "count": ">= 14",
+ }),
+ )
diff --git a/gallery_dl/extractor/plurk.py b/gallery_dl/extractor/plurk.py
new file mode 100644
index 0000000..325c6a0
--- /dev/null
+++ b/gallery_dl/extractor/plurk.py
@@ -0,0 +1,125 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.plurk.com/"""
+
+from .common import Extractor, Message
+from .. import text, extractor, exception
+import datetime
+import json
+import re
+
+
+class PlurkExtractor(Extractor):
+ """Base class for plurk extractors"""
+ category = "plurk"
+ root = "https://www.plurk.com"
+
+ def items(self):
+ urls = self._urls_ex if self.config("comments", False) else self._urls
+
+ yield Message.Version, 1
+ with extractor.blacklist(("plurk",)):
+ for plurk in self.plurks():
+ for url in urls(plurk):
+ yield Message.Queue, url, plurk
+
+ def plurks(self):
+ """Return an iterable with all relevant 'plurk' objects"""
+
+ @staticmethod
+ def _urls(obj):
+ """Extract URLs from a 'plurk' object"""
+ return text.extract_iter(obj["content"], ' href="', '"')
+
+ def _urls_ex(self, plurk):
+ """Extract URLs from a 'plurk' and its comments"""
+ yield from self._urls(plurk)
+ for comment in self._comments(plurk):
+ yield from self._urls(comment)
+
+ def _comments(self, plurk):
+        """Return an iterable over the comments of a 'plurk'"""
+ url = "https://www.plurk.com/Responses/get"
+ data = {"plurk_id": plurk["id"], "count": "200"}
+
+ while True:
+ info = self.request(url, "POST", data=data).json()
+ yield from info["responses"]
+ if not info["has_newer"]:
+ return
+ data["from_response_id"] = info["responses"][-1]["id"]
+
+ @staticmethod
+ def _load(data):
+ if not data:
+ raise exception.NotFoundError("user")
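+        # strip 'new Date(...)' constructors so the embedded
+        # JavaScript object parses as plain JSON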
+ return json.loads(re.sub(r"new Date\(([^)]+)\)", r"\1", data))
+
+
+class PlurkTimelineExtractor(PlurkExtractor):
+ """Extractor for URLs from all posts in a Plurk timeline"""
+ subcategory = "timeline"
+ pattern = r"(?:https?://)?(?:www\.)?plurk\.com/(?!p/)(\w+)/?(?:$|[?&#])"
+ test = ("https://www.plurk.com/plurkapi", {
+ "pattern": r"https?://.+",
+ "count": ">= 23"
+ })
+
+ def __init__(self, match):
+ PlurkExtractor.__init__(self, match)
+ self.user = match.group(1)
+
+ def plurks(self):
+ url = "{}/{}".format(self.root, self.user)
+ page = self.request(url).text
+ user_id, pos = text.extract(page, '"user_id":', ',')
+ plurks = self._load(text.extract(page, "_PLURKS = ", ";\n", pos)[0])
+
+ url = "https://www.plurk.com/TimeLine/getPlurks"
+ data = {"user_id": user_id.strip()}
+ headers = {"Referer": url, "X-Requested-With": "XMLHttpRequest"}
+
+ while plurks:
+ yield from plurks
+
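+            # fetch the next batch by sending the timestamp of the
+            # oldest plurk seen so far as 'offset'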
+ offset = datetime.datetime.strptime(
+ plurks[-1]["posted"], "%a, %d %b %Y %H:%M:%S %Z")
+ data["offset"] = offset.strftime("%Y-%m-%dT%H:%M:%S.000Z")
+ response = self.request(url, "POST", headers=headers, data=data)
+ plurks = response.json()["plurks"]
+
+
+class PlurkPostExtractor(PlurkExtractor):
+ """Extractor for URLs from a Plurk post"""
+ subcategory = "post"
+ pattern = r"(?:https?://)?(?:www\.)?plurk\.com/p/(\w+)"
+ test = (
+ ("https://www.plurk.com/p/i701j1", {
+ "url": "2115f208564591b8748525c2807a84596aaaaa5f",
+ "count": 3,
+ }),
+ ("https://www.plurk.com/p/i701j1", {
+ "options": (("comments", True),),
+ "count": ">= 210",
+ }),
+ )
+
+ def __init__(self, match):
+ PlurkExtractor.__init__(self, match)
+ self.plurk_id = match.group(1)
+
+ def plurks(self):
+ url = "{}/p/{}".format(self.root, self.plurk_id)
+ page = self.request(url).text
+ user, pos = text.extract(page, " GLOBAL = ", "\n")
+ data, pos = text.extract(page, "plurk = ", ";\n", pos)
+
+ data = self._load(data)
+ data["user"] = self._load(user)["page_user"]
+ return (data,)
diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py
new file mode 100644
index 0000000..40816b3
--- /dev/null
+++ b/gallery_dl/extractor/pornhub.py
@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.pornhub.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+
+BASE_PATTERN = r"(?:https?://)?(?:[^.]+\.)?pornhub\.com"
+
+
+class PornhubExtractor(Extractor):
+ """Base class for pornhub extractors"""
+ category = "pornhub"
+ root = "https://www.pornhub.com"
+
+
+class PornhubGalleryExtractor(PornhubExtractor):
+ """Extractor for image galleries on pornhub.com"""
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "{user}", "{gallery[id]} {gallery[title]}")
+ filename_fmt = "{num:>03}_{id}.{extension}"
+ archive_fmt = "{id}"
+ pattern = BASE_PATTERN + r"/album/(\d+)"
+ test = (
+ ("https://www.pornhub.com/album/1708982", {
+ "pattern": r"https://\w+.phncdn.com/pics/albums/\d+/\d+/\d+/\d+/",
+ "count": 93,
+ "keyword": {
+ "id": int,
+ "num": int,
+ "score": int,
+ "views": int,
+ "caption": str,
+ "user": "Unknown",
+ "gallery": {
+ "id" : 1708982,
+ "score": int,
+ "views": int,
+ "tags" : list,
+ "title": "Random Hentai",
+ },
+ },
+ }),
+ ("https://www.pornhub.com/album/37180171", {
+ "exception": exception.AuthorizationError,
+ }),
+ )
+
+ def __init__(self, match):
+ PornhubExtractor.__init__(self, match)
+ self.gallery_id = match.group(1)
+ self._first = None
+
+ def items(self):
+ data = self.metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for num, image in enumerate(self.images(), 1):
+ url = image["url"]
+ image.update(data)
+ image["num"] = num
+ yield Message.Url, url, text.nameext_from_url(url, image)
+
+ def metadata(self):
+ url = "{}/album/{}".format(
+ self.root, self.gallery_id)
+ extr = text.extract_from(self.request(url).text)
+
+ title = extr("<title>", "</title>")
+ score = extr('<div id="albumGreenBar" style="width:', '"')
+ views = extr('<div id="viewsPhotAlbumCounter">', '<')
+ tags = extr('<div id="photoTagsBox"', '<script')
+ self._first = extr('<a href="/photo/', '"')
+ title, _, user = title.rpartition(" - ")
+
+ return {
+ "user" : text.unescape(user[:-14]),
+ "gallery": {
+ "id" : text.parse_int(self.gallery_id),
+ "title": text.unescape(title),
+ "score": text.parse_int(score.partition("%")[0]),
+ "views": text.parse_int(views.partition(" ")[0]),
+ "tags" : text.split_html(tags)[2:],
+ },
+ }
+
+ def images(self):
+ url = "{}/album/show_album_json?album={}".format(
+ self.root, self.gallery_id)
+ response = self.request(url)
+
+ if response.content == b"Permission denied":
+ raise exception.AuthorizationError()
+ images = response.json()
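+        # the JSON maps photo IDs to image objects forming a circular
+        # linked list; follow 'next' from the first ID until it wraps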
+ key = end = self._first
+
+ while True:
+ img = images[key]
+ yield {
+ "url" : img["img_large"],
+ "caption": img["caption"],
+ "id" : text.parse_int(img["id"]),
+ "views" : text.parse_int(img["times_viewed"]),
+ "score" : text.parse_int(img["vote_percent"]),
+ }
+ key = img["next"]
+ if key == end:
+ return
+
+
+class PornhubUserExtractor(PornhubExtractor):
+ """Extractor for all galleries of a pornhub user"""
+ subcategory = "user"
+ pattern = (BASE_PATTERN + r"/(users|model)/([^/?&#]+)"
+ "(?:/photos(?:/(public|private|favorites))?)?/?$")
+ test = (
+ ("https://www.pornhub.com/users/flyings0l0/photos/public", {
+ "pattern": PornhubGalleryExtractor.pattern,
+ "count": ">= 8",
+ }),
+ ("https://www.pornhub.com/users/flyings0l0/"),
+ ("https://www.pornhub.com/users/flyings0l0/photos/public"),
+ ("https://www.pornhub.com/users/flyings0l0/photos/private"),
+ ("https://www.pornhub.com/users/flyings0l0/photos/favorites"),
+ ("https://www.pornhub.com/model/bossgirl/photos"),
+ )
+
+ def __init__(self, match):
+ PornhubExtractor.__init__(self, match)
+ self.type, self.user, self.cat = match.groups()
+
+ def items(self):
+ url = "{}/{}/{}/photos/{}/ajax".format(
+ self.root, self.type, self.user, self.cat or "public")
+ params = {"page": 1}
+ headers = {
+ "Referer": url[:-5],
+ "X-Requested-With": "XMLHttpRequest",
+ }
+
+ data = {"_extractor": PornhubGalleryExtractor}
+ yield Message.Version, 1
+ while True:
+ page = self.request(
+ url, method="POST", headers=headers, params=params).text
+ if not page:
+ return
+ for gid in text.extract_iter(page, 'id="albumphoto', '"'):
+ yield Message.Queue, self.root + "/album/" + gid, data
+ params["page"] += 1
diff --git a/gallery_dl/extractor/pururin.py b/gallery_dl/extractor/pururin.py
new file mode 100644
index 0000000..fa4eb81
--- /dev/null
+++ b/gallery_dl/extractor/pururin.py
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://pururin.io/"""
+
+from .common import GalleryExtractor
+from .. import text, util
+import json
+
+
+class PururinGalleryExtractor(GalleryExtractor):
+ """Extractor for image galleries on pururin.io"""
+ category = "pururin"
+ pattern = r"(?:https?://)?(?:www\.)?pururin\.io/(?:gallery|read)/(\d+)"
+ test = (
+ ("https://pururin.io/gallery/38661/iowant-2", {
+ "pattern": r"https://cdn.pururin.io/\w+/images/data/\d+/\d+\.jpg",
+ "keyword": {
+ "title" : "Iowant 2!!",
+ "title_en" : "Iowant 2!!",
+ "title_jp" : "",
+ "gallery_id": 38661,
+ "count" : 19,
+ "artist" : ["Shoda Norihiro"],
+ "group" : ["Obsidian Order"],
+ "parody" : ["Kantai Collection"],
+ "characters": ["Iowa", "Teitoku"],
+ "tags" : list,
+ "type" : "Doujinshi",
+ "collection": "",
+ "convention": "C92",
+ "rating" : float,
+ "uploader" : "demo",
+ "scanlator" : "",
+ "lang" : "en",
+ "language" : "English",
+ }
+ }),
+ ("https://pururin.io/gallery/7661/unisis-team-vanilla", {
+ "count": 17,
+ }),
+ )
+ root = "https://pururin.io"
+
+ def __init__(self, match):
+ self.gallery_id = match.group(1)
+ url = "{}/gallery/{}/x".format(self.root, self.gallery_id)
+ GalleryExtractor.__init__(self, match, url)
+
+ self._ext = ""
+ self._cnt = 0
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+
+ def _lst(key, e=extr):
+ return [
+ text.unescape(item)
+ for item in text.extract_iter(e(key, "</td>"), 'title="', '"')
+ ]
+
+ def _str(key, e=extr):
+ return text.unescape(text.extract(
+ e(key, "</td>"), 'title="', '"')[0] or "")
+
+ url = "{}/read/{}/01/x".format(self.root, self.gallery_id)
+ page = self.request(url).text
+ info = json.loads(text.unescape(text.extract(
+ page, ':gallery="', '"')[0]))
+ self._ext = info["image_extension"]
+ self._cnt = info["total_pages"]
+
+ data = {
+ "gallery_id": text.parse_int(self.gallery_id),
+ "title" : info["title"] or info.get("j_title") or "",
+ "title_en" : info["title"],
+ "title_jp" : info.get("j_title") or "",
+ "artist" : _lst("<td>Artist</td>"),
+ "group" : _lst("<td>Circle</td>"),
+ "parody" : _lst("<td>Parody</td>"),
+ "tags" : _lst("<td>Contents</td>"),
+ "type" : _str("<td>Category</td>"),
+ "characters": _lst("<td>Character</td>"),
+ "collection": _str("<td>Collection</td>"),
+ "language" : _str("<td>Language</td>"),
+ "scanlator" : _str("<td>Scanlator</td>"),
+ "convention": _str("<td>Convention</td>"),
+ "uploader" : text.remove_html(extr("<td>Uploader</td>", "</td>")),
+ "rating" : text.parse_float(extr(" :rating='" , "'")),
+ }
+ data["lang"] = util.language_to_code(data["language"])
+ return data
+
+ def images(self, _):
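+ # image URLs follow a fixed scheme; page count and file extension
+ # were taken from the gallery JSON fetched in metadata()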
+ ufmt = "https://cdn.pururin.io/assets/images/data/{}/{{}}.{}".format(
+ self.gallery_id, self._ext)
+ return [(ufmt.format(num), None) for num in range(1, self._cnt + 1)]
diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py
new file mode 100644
index 0000000..59d502a
--- /dev/null
+++ b/gallery_dl/extractor/reactor.py
@@ -0,0 +1,338 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Generic extractors for *reactor sites"""
+
+from .common import Extractor, Message, SharedConfigMixin
+from .. import text
+import urllib.parse
+import random
+import time
+import json
+
+
+BASE_PATTERN = r"(?:https?://)?([^/.]+\.reactor\.cc)"
+
+
+class ReactorExtractor(SharedConfigMixin, Extractor):
+ """Base class for *reactor.cc extractors"""
+ basecategory = "reactor"
+ filename_fmt = "{post_id}_{num:>02}{title[:100]:?_//}.{extension}"
+ archive_fmt = "{post_id}_{num}"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.root = "http://" + match.group(1)
+ self.session.headers["Referer"] = self.root
+
+ self.wait_min = self.config("wait-min", 3)
+ self.wait_max = self.config("wait-max", 6)
+ if self.wait_max < self.wait_min:
+ self.wait_max = self.wait_min
+
+ if not self.category:
+ # set category based on domain name
+ netloc = urllib.parse.urlsplit(self.root).netloc
+ self.category = netloc.rpartition(".")[0]
+
+ def items(self):
+ data = self.metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for post in self.posts():
+ for image in self._parse_post(post):
+ url = image["url"]
+ image.update(data)
+ yield Message.Url, url, text.nameext_from_url(url, image)
+
+ def metadata(self):
+ """Collect metadata for extractor-job"""
+ return {}
+
+ def posts(self):
+ """Return all relevant post-objects"""
+ return self._pagination(self.url)
+
+ def _pagination(self, url):
+ while True:
+ time.sleep(random.uniform(self.wait_min, self.wait_max))
+
+ response = self.request(url)
+ if response.history:
+ # sometimes there is a redirect from
+ # the last page of a listing (.../tag/<tag>/1)
+ # to the first page (.../tag/<tag>)
+ # which could cause an endless loop
+ cnt_old = response.history[0].url.count("/")
+ cnt_new = response.url.count("/")
+ if cnt_old == 5 and cnt_new == 4:
+ return
+ page = response.text
+
+ yield from text.extract_iter(
+ page, '<div class="uhead">', '<div class="ufoot">')
+
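+ # the link right after the 'current' page marker (and before the
+ # 'next' marker) points to the following listing page; a missing
+ # marker means this was the last page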
+ try:
+ pos = page.index("class='next'")
+ pos = page.rindex("class='current'", 0, pos)
+ url = self.root + text.extract(page, "href='", "'", pos)[0]
+ except (ValueError, TypeError):
+ return
+
+ def _parse_post(self, post):
+ post, _, script = post.partition('<script type="application/ld+json">')
+ images = text.extract_iter(post, '<div class="image">', '</div>')
+ script = script[:script.index("</")].strip()
+
+ try:
+ data = json.loads(script)
+ except ValueError:
+ try:
+ # remove control characters and escape backslashes
+ mapping = dict.fromkeys(range(32))
+ script = script.translate(mapping).replace("\\", "\\\\")
+ data = json.loads(script)
+ except ValueError as exc:
+ self.log.warning("Unable to parse JSON data: %s", exc)
+ return
+
+ num = 0
+ date = text.parse_datetime(data["datePublished"])
+ user = data["author"]["name"]
+ description = text.unescape(data["description"])
+ title, _, tags = text.unescape(data["headline"]).partition(" / ")
+ post_id = text.parse_int(
+ data["mainEntityOfPage"]["@id"].rpartition("/")[2])
+
+ if not tags:
+ title, tags = tags, title
+ tags = tags.split(" :: ")
+
+ for image in images:
+ url = text.extract(image, ' src="', '"')[0]
+ if not url:
+ continue
+ width = text.extract(image, ' width="', '"')[0]
+ height = text.extract(image, ' height="', '"')[0]
+ image_id = url.rpartition("-")[2].partition(".")[0]
+ num += 1
+
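+ # embeds are delegated to youtube-dl; regular images are rewritten
+ # to their full-size '/post/full/' variants, while webm/mp4 files
+ # are used as-is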
+ if image.startswith("<iframe "): # embed
+ url = "ytdl:" + text.unescape(url)
+ elif "/post/webm/" not in url and "/post/mp4/" not in url:
+ url = url.replace("/post/", "/post/full/")
+
+ yield {
+ "url": url,
+ "post_id": post_id,
+ "image_id": text.parse_int(image_id),
+ "width": text.parse_int(width),
+ "height": text.parse_int(height),
+ "title": title,
+ "description": description,
+ "tags": tags,
+ "date": date,
+ "user": user,
+ "num": num,
+ }
+
+
+class ReactorTagExtractor(ReactorExtractor):
+ """Extractor for tag searches on *reactor.cc sites"""
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "{search_tags}_{post_id}_{num}"
+ pattern = BASE_PATTERN + r"/tag/([^/?&#]+)"
+ test = ("http://anime.reactor.cc/tag/Anime+Art",)
+
+ def __init__(self, match):
+ ReactorExtractor.__init__(self, match)
+ self.tag = match.group(2)
+
+ def metadata(self):
+ return {"search_tags": text.unescape(self.tag).replace("+", " ")}
+
+
+class ReactorSearchExtractor(ReactorTagExtractor):
+ """Extractor for search results on *reactor.cc sites"""
+ subcategory = "search"
+ directory_fmt = ("{category}", "search", "{search_tags}")
+ archive_fmt = "s_{search_tags}_{post_id}_{num}"
+ pattern = BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)"
+ test = ("http://anime.reactor.cc/search?q=Art",)
+
+
+class ReactorUserExtractor(ReactorExtractor):
+ """Extractor for all posts of a user on *reactor.cc sites"""
+ subcategory = "user"
+ directory_fmt = ("{category}", "user", "{user}")
+ pattern = BASE_PATTERN + r"/user/([^/?&#]+)"
+ test = ("http://anime.reactor.cc/user/Shuster",)
+
+ def __init__(self, match):
+ ReactorExtractor.__init__(self, match)
+ self.user = match.group(2)
+
+ def metadata(self):
+ return {"user": text.unescape(self.user).replace("+", " ")}
+
+
+class ReactorPostExtractor(ReactorExtractor):
+ """Extractor for single posts on *reactor.cc sites"""
+ subcategory = "post"
+ pattern = BASE_PATTERN + r"/post/(\d+)"
+ test = ("http://anime.reactor.cc/post/3576250",)
+
+ def __init__(self, match):
+ ReactorExtractor.__init__(self, match)
+ self.post_id = match.group(2)
+
+ def items(self):
+ yield Message.Version, 1
+ post = self.request(self.url).text
+ pos = post.find('class="uhead">')
+ for image in self._parse_post(post[pos:]):
+ if image["num"] == 1:
+ yield Message.Directory, image
+ url = image["url"]
+ yield Message.Url, url, text.nameext_from_url(url, image)
+
+
+# --------------------------------------------------------------------
+# JoyReactor
+
+JR_BASE_PATTERN = r"(?:https?://)?(?:www\.)?(joyreactor\.c(?:c|om))"
+
+
+class JoyreactorTagExtractor(ReactorTagExtractor):
+ """Extractor for tag searches on joyreactor.cc"""
+ category = "joyreactor"
+ pattern = JR_BASE_PATTERN + r"/tag/([^/?&#]+)"
+ test = (
+ ("http://joyreactor.cc/tag/Advent+Cirno", {
+ "count": ">= 17",
+ }),
+ ("http://joyreactor.com/tag/Cirno", {
+ "url": "de1e60c15bfb07a0e9603b00dc3d05f60edc7914",
+ }),
+ )
+
+
+class JoyreactorSearchExtractor(ReactorSearchExtractor):
+ """Extractor for search results on joyreactor.cc"""
+ category = "joyreactor"
+ pattern = JR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)"
+ test = (
+ ("http://joyreactor.cc/search/Cirno+Gifs", {
+ "range": "1-25",
+ "count": ">= 20",
+ }),
+ ("http://joyreactor.com/search?q=Cirno+Gifs", {
+ "count": 0, # no search results on joyreactor.com
+ }),
+ )
+
+
+class JoyreactorUserExtractor(ReactorUserExtractor):
+ """Extractor for all posts of a user on joyreactor.cc"""
+ category = "joyreactor"
+ pattern = JR_BASE_PATTERN + r"/user/([^/?&#]+)"
+ test = (
+ ("http://joyreactor.cc/user/hemantic"),
+ ("http://joyreactor.com/user/Tacoman123", {
+ "url": "452cd0fa23e2ad0e122c296ba75aa7f0b29329f6",
+ }),
+ )
+
+
+class JoyreactorPostExtractor(ReactorPostExtractor):
+ """Extractor for single posts on joyreactor.cc"""
+ category = "joyreactor"
+ pattern = JR_BASE_PATTERN + r"/post/(\d+)"
+ test = (
+ ("http://joyreactor.com/post/3721876", { # single image
+ "url": "6ce09f239d8b7fdf6dd1664c2afc39618cc87663",
+ "keyword": "966d2acd462732a9ed823a9db5ed19f95734fd10",
+ }),
+ ("http://joyreactor.com/post/3713804", { # 4 images
+ "url": "f08ac8493ca0619a3e3c6bedb8d8374af3eec304",
+ "keyword": "84e34d402342607045a65fab6d4d593d146c238a",
+ }),
+ ("http://joyreactor.com/post/3726210", { # gif / video
+ "url": "33a48e1eca6cb2d298fbbb6536b3283799d6515b",
+ "keyword": "dbe148d576f2fc9431020c557ddb78f449e48c47",
+ }),
+ ("http://joyreactor.com/post/3668724", { # youtube embed
+ "url": "be2589e2e8f3ffcaf41b34bc28bfad850ccea34a",
+ "keyword": "da61b9e2887db95759950df5fb89c9d32f8e7651",
+ }),
+ ("http://joyreactor.cc/post/1299", { # "malformed" JSON
+ "url": "ac900743ed7cf1baf3db3b531c3bc414bf1ffcde",
+ }),
+ )
+
+
+# --------------------------------------------------------------------
+# PornReactor
+
+PR_BASE_PATTERN = r"(?:https?://)?(?:www\.)?(pornreactor\.cc|fapreactor\.com)"
+
+
+class PornreactorTagExtractor(ReactorTagExtractor):
+ """Extractor for tag searches on pornreactor.cc"""
+ category = "pornreactor"
+ pattern = PR_BASE_PATTERN + r"/tag/([^/?&#]+)"
+ test = (
+ ("http://pornreactor.cc/tag/RiceGnat", {
+ "range": "1-25",
+ "count": ">= 25",
+ }),
+ ("http://fapreactor.com/tag/RiceGnat"),
+ )
+
+
+class PornreactorSearchExtractor(ReactorSearchExtractor):
+ """Extractor for search results on pornreactor.cc"""
+ category = "pornreactor"
+ pattern = PR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)"
+ test = (
+ ("http://pornreactor.cc/search?q=ecchi+hentai", {
+ "range": "1-25",
+ "count": ">= 25",
+ }),
+ ("http://fapreactor.com/search/ecchi+hentai"),
+ )
+
+
+class PornreactorUserExtractor(ReactorUserExtractor):
+ """Extractor for all posts of a user on pornreactor.cc"""
+ category = "pornreactor"
+ pattern = PR_BASE_PATTERN + r"/user/([^/?&#]+)"
+ test = (
+ ("http://pornreactor.cc/user/Disillusion", {
+ "range": "1-25",
+ "count": ">= 25",
+ }),
+ ("http://fapreactor.com/user/Disillusion"),
+ )
+
+
+class PornreactorPostExtractor(ReactorPostExtractor):
+ """Extractor for single posts on pornreactor.cc"""
+ category = "pornreactor"
+ subcategory = "post"
+ pattern = PR_BASE_PATTERN + r"/post/(\d+)"
+ test = (
+ ("http://pornreactor.cc/post/863166", {
+ "url": "680db1e33ca92ff70b2c0e1708c471cbe2201324",
+ "content": "ec6b0568bfb1803648744077da082d14de844340",
+ }),
+ ("http://fapreactor.com/post/863166", {
+ "url": "864ecd5785e4898301aa8d054dd653b1165be158",
+ }),
+ )
diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py
new file mode 100644
index 0000000..dda4809
--- /dev/null
+++ b/gallery_dl/extractor/readcomiconline.py
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract comic-issues and entire comics from https://readcomiconline.to/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .kissmanga import RedirectMixin
+from .. import text
+import re
+
+
+class ReadcomiconlineBase(RedirectMixin):
+ """Base class for readcomiconline extractors"""
+ category = "readcomiconline"
+ directory_fmt = ("{category}", "{comic}", "{issue:>03}")
+ filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}"
+ archive_fmt = "{issue_id}_{page}"
+ root = "https://readcomiconline.to"
+
+
+class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
+ """Extractor for comic-issues from readcomiconline.to"""
+ subcategory = "issue"
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to"
+ r"(/Comic/[^/?&#]+/[^/?&#]+\?id=(\d+))")
+ test = ("https://readcomiconline.to/Comic/W-i-t-c-h/Issue-130?id=22289", {
+ "url": "2bbab6ec4fbc05d269cca420a82a9b5acda28682",
+ "keyword": "30fe110273e871305001f33c18634516a0a51421",
+ })
+
+ def __init__(self, match):
+ ChapterExtractor.__init__(self, match)
+ self.issue_id = match.group(2)
+
+ def metadata(self, page):
+ comic, pos = text.extract(page, " - Read\r\n ", "\r\n")
+ iinfo, pos = text.extract(page, " ", "\r\n", pos)
+ match = re.match(r"(?:Issue )?#(\d+)|(.+)", iinfo)
+ return {
+ "comic": comic,
+ "issue": match.group(1) or match.group(2),
+ "issue_id": text.parse_int(self.issue_id),
+ "lang": "en",
+ "language": "English",
+ }
+
+ def images(self, page):
+ return [
+ (url, None)
+ for url in text.extract_iter(
+ page, 'lstImages.push("', '"'
+ )
+ ]
+
+
+class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor):
+ """Extractor for comics from readcomiconline.to"""
+ chapterclass = ReadcomiconlineIssueExtractor
+ subcategory = "comic"
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to"
+ r"(/Comic/[^/?&#]+/?)$")
+ test = (
+ ("https://readcomiconline.to/Comic/W-i-t-c-h", {
+ "url": "e231bc2a293edb465133c37a8e36a7e7d94cab14",
+ "keyword": "3986248e4458fa44a201ec073c3684917f48ee0c",
+ }),
+ ("https://readcomiconline.to/Comic/Bazooka-Jules", {
+ "url": "711674cb78ed10bd2557315f7a67552d01b33985",
+ "keyword": "f5ba5246cd787bb750924d9690cb1549199bd516",
+ }),
+ )
+
+ def chapters(self, page):
+ results = []
+ comic, pos = text.extract(page, ' class="barTitle">', '<')
+ page , pos = text.extract(page, ' class="listing">', '</table>', pos)
+
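+ # the bar title reads '<comic> information'; each chapter link
+ # carries a 'Read <comic> ... comic online' title attribute that is
+ # used to split the link into URL and issue name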
+ comic = comic.rpartition("information")[0].strip()
+ needle = ' title="Read {} '.format(comic)
+ comic = text.unescape(comic)
+
+ for item in text.extract_iter(page, ' href="', ' comic online '):
+ url, _, issue = item.partition(needle)
+ url = url.rpartition('"')[0]
+ if issue.startswith('Issue #'):
+ issue = issue[7:]
+ results.append((self.root + url, {
+ "comic": comic, "issue": issue,
+ "issue_id": text.parse_int(url.rpartition("=")[2]),
+ "lang": "en", "language": "English",
+ }))
+ return results
diff --git a/gallery_dl/extractor/recursive.py b/gallery_dl/extractor/recursive.py
new file mode 100644
index 0000000..1a793a0
--- /dev/null
+++ b/gallery_dl/extractor/recursive.py
@@ -0,0 +1,55 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Recursive extractor"""
+
+from .common import Extractor, Message
+from .. import extractor, util
+import requests
+import re
+
+
+class RecursiveExtractor(Extractor):
+ """Extractor that fetches URLs from a remote or local source"""
+ category = "recursive"
+ pattern = r"r(?:ecursive)?:"
+ test = ("recursive:https://pastebin.com/raw/FLwrCYsT", {
+ "url": "eee86d65c346361b818e8f4b2b307d9429f136a2",
+ })
+
+ def items(self):
+ blist = self.config(
+ "blacklist", {"directlink"} | util.SPECIAL_EXTRACTORS)
+
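+ # support 'recursive:file://...' URLs by serving local files
+ # through the requests session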
+ self.session.mount("file://", FileAdapter())
+ page = self.request(self.url.partition(":")[2]).text
+
+ yield Message.Version, 1
+ with extractor.blacklist(blist):
+ for match in re.finditer(r"https?://[^\s\"']+", page):
+ yield Message.Queue, match.group(0), {}
+
+
+class FileAdapter(requests.adapters.BaseAdapter):
+ """Requests adapter for local files"""
+
+ def send(self, request, **kwargs):
+ response = requests.Response()
+ try:
+ response.raw = open(request.url[7:], "rb")
+ except OSError:
+ import io
+ response.raw = io.BytesIO()
+ response.status_code = requests.codes.bad_request
+ else:
+ response.raw.release_conn = response.raw.close
+ response.status_code = requests.codes.ok
+ return response
+
+ def close(self):
+ pass
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
new file mode 100644
index 0000000..0c5a924
--- /dev/null
+++ b/gallery_dl/extractor/reddit.py
@@ -0,0 +1,313 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from subreddits at https://www.reddit.com/"""
+
+from .common import Extractor, Message
+from .. import text, util, extractor, exception
+from ..cache import cache
+import datetime
+import time
+
+
+class RedditExtractor(Extractor):
+ """Base class for reddit extractors"""
+ category = "reddit"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.api = RedditAPI(self)
+ self.max_depth = int(self.config("recursion", 0))
+ self._visited = set()
+
+ def items(self):
+ subre = RedditSubmissionExtractor.pattern
+ submissions = self.submissions()
+ depth = 0
+
+ yield Message.Version, 1
+ with extractor.blacklist(
+ util.SPECIAL_EXTRACTORS, [RedditSubredditExtractor]):
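+ # links to other reddit submissions found in posts and comments
+ # are collected and followed up to 'recursion' levels deep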
+ while True:
+ extra = []
+ for url, data in self._urls(submissions):
+ if url[0] == "#":
+ continue
+ if url[0] == "/":
+ url = "https://www.reddit.com" + url
+
+ match = subre.match(url)
+ if match:
+ extra.append(match.group(1))
+ else:
+ yield Message.Queue, text.unescape(url), data
+
+ if not extra or depth == self.max_depth:
+ return
+ depth += 1
+ submissions = (
+ self.api.submission(sid) for sid in extra
+ if sid not in self._visited
+ )
+
+ def submissions(self):
+ """Return an iterable containing all (submission, comments) tuples"""
+
+ def _urls(self, submissions):
+ for submission, comments in submissions:
+ self._visited.add(submission["id"])
+
+ if not submission["is_self"]:
+ yield submission["url"], submission
+
+ for url in text.extract_iter(
+ submission["selftext_html"] or "", ' href="', '"'):
+ yield url, submission
+
+ for comment in comments:
+ for url in text.extract_iter(
+ comment["body_html"] or "", ' href="', '"'):
+ yield url, comment
+
+
+class RedditSubredditExtractor(RedditExtractor):
+ """Extractor for images from subreddits on reddit.com"""
+ subcategory = "subreddit"
+ pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com/r/([^/?&#]+)"
+ r"(/[a-z]+)?/?"
+ r"(?:\?.*?(?:\bt=([a-z]+))?)?$")
+ test = (
+ ("https://www.reddit.com/r/lavaporn/"),
+ ("https://www.reddit.com/r/lavaporn/top/?sort=top&t=month"),
+ ("https://old.reddit.com/r/lavaporn/"),
+ ("https://np.reddit.com/r/lavaporn/"),
+ ("https://m.reddit.com/r/lavaporn/"),
+ )
+
+ def __init__(self, match):
+ RedditExtractor.__init__(self, match)
+ self.subreddit, self.order, self.timeframe = match.groups()
+
+ def submissions(self):
+ subreddit = self.subreddit + (self.order or "")
+ params = {"t": self.timeframe} if self.timeframe else {}
+ return self.api.submissions_subreddit(subreddit, params)
+
+
+class RedditSubmissionExtractor(RedditExtractor):
+ """Extractor for images from a submission on reddit.com"""
+ subcategory = "submission"
+ pattern = (r"(?:https?://)?(?:"
+ r"(?:\w+\.)?reddit\.com/r/[^/?&#]+/comments|"
+ r"redd\.it"
+ r")/([a-z0-9]+)")
+ test = (
+ ("https://www.reddit.com/r/lavaporn/comments/2a00np/", {
+ "pattern": r"https?://i\.imgur\.com/AaAUCgy\.jpg",
+ }),
+ ("https://old.reddit.com/r/lavaporn/comments/2a00np/"),
+ ("https://np.reddit.com/r/lavaporn/comments/2a00np/"),
+ ("https://m.reddit.com/r/lavaporn/comments/2a00np/"),
+ ("https://redd.it/2a00np/"),
+ )
+
+ def __init__(self, match):
+ RedditExtractor.__init__(self, match)
+ self.submission_id = match.group(1)
+
+ def submissions(self):
+ return (self.api.submission(self.submission_id),)
+
+
+class RedditImageExtractor(Extractor):
+ """Extractor for reddit-hosted images"""
+ category = "reddit"
+ subcategory = "image"
+ archive_fmt = "{filename}"
+ pattern = (r"(?:https?://)?i\.redd(?:\.it|ituploads\.com)"
+ r"/[^/?&#]+(?:\?[^#]*)?")
+ test = (
+ ("https://i.redd.it/upjtjcx2npzz.jpg", {
+ "url": "0de614900feef103e580b632190458c0b62b641a",
+ "content": "cc9a68cf286708d5ce23c68e79cd9cf7826db6a3",
+ }),
+ (("https://i.reddituploads.com/0f44f1b1fca2461f957c713d9592617d"
+ "?fit=max&h=1536&w=1536&s=e96ce7846b3c8e1f921d2ce2671fb5e2"), {
+ "url": "f24f25efcedaddeec802e46c60d77ef975dc52a5",
+ "content": "541dbcc3ad77aa01ee21ca49843c5e382371fae7",
+ }),
+ )
+
+ def items(self):
+ data = text.nameext_from_url(self.url)
+ yield Message.Version, 1
+ yield Message.Directory, data
+ yield Message.Url, self.url, data
+
+
+class RedditAPI():
+ """Minimal interface for the reddit API"""
+ CLIENT_ID = "6N9uN0krSDE-ig"
+ USER_AGENT = "Python:gallery-dl:0.8.4 (by /u/mikf1)"
+
+ def __init__(self, extractor):
+ self.extractor = extractor
+ self.comments = extractor.config("comments", 500)
+ self.morecomments = extractor.config("morecomments", False)
+ self.refresh_token = extractor.config("refresh-token")
+ self.log = extractor.log
+
+ client_id = extractor.config("client-id", self.CLIENT_ID)
+ user_agent = extractor.config("user-agent", self.USER_AGENT)
+
+ if (client_id == self.CLIENT_ID) ^ (user_agent == self.USER_AGENT):
+ self.client_id = None
+ self.log.warning(
+ "Conflicting values for 'client-id' and 'user-agent': "
+ "override either both or none of them.")
+ else:
+ self.client_id = client_id
+ extractor.session.headers["User-Agent"] = user_agent
+
+ def submission(self, submission_id):
+ """Fetch the (submission, comments)-tuple for a submission id"""
+ endpoint = "/comments/" + submission_id + "/.json"
+ link_id = "t3_" + submission_id if self.morecomments else None
+ submission, comments = self._call(endpoint, {"limit": self.comments})
+ return (submission["data"]["children"][0]["data"],
+ self._flatten(comments, link_id))
+
+ def submissions_subreddit(self, subreddit, params):
+ """Collect all (submission, comments)-tuples of a subreddit"""
+ endpoint = "/r/" + subreddit + "/.json"
+ params["limit"] = 100
+ return self._pagination(endpoint, params)
+
+ def morechildren(self, link_id, children):
+ """Load additional comments from a submission"""
+ endpoint = "/api/morechildren"
+ params = {"link_id": link_id, "api_type": "json"}
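+ # the endpoint accepts at most 100 comment IDs per call,
+ # so request the remaining children in batches of 100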
+ index, done = 0, False
+ while not done:
+ if len(children) - index < 100:
+ done = True
+ params["children"] = ",".join(children[index:index + 100])
+ index += 100
+
+ data = self._call(endpoint, params)["json"]
+ for thing in data["data"]["things"]:
+ if thing["kind"] == "more":
+ children.extend(thing["data"]["children"])
+ else:
+ yield thing["data"]
+
+ def authenticate(self):
+ """Authenticate the application by requesting an access token"""
+ access_token = self._authenticate_impl(self.refresh_token)
+ self.extractor.session.headers["Authorization"] = access_token
+
+ @cache(maxage=3600, keyarg=1)
+ def _authenticate_impl(self, refresh_token=None):
+ """Actual authentication implementation"""
+ url = "https://www.reddit.com/api/v1/access_token"
+ if refresh_token:
+ self.log.info("Refreshing private access token")
+ data = {"grant_type": "refresh_token",
+ "refresh_token": refresh_token}
+ else:
+ self.log.info("Requesting public access token")
+ data = {"grant_type": ("https://oauth.reddit.com/"
+ "grants/installed_client"),
+ "device_id": "DO_NOT_TRACK_THIS_DEVICE"}
+ response = self.extractor.request(
+ url, method="POST", data=data, auth=(self.client_id, ""))
+ if response.status_code != 200:
+ raise exception.AuthenticationError('"{} ({})"'.format(
+ response.json().get("message"), response.status_code))
+ return "Bearer " + response.json()["access_token"]
+
+ def _call(self, endpoint, params):
+ url = "https://oauth.reddit.com" + endpoint
+ params["raw_json"] = 1
+ self.authenticate()
+ response = self.extractor.request(
+ url, params=params, expect=range(400, 500))
+ remaining = response.headers.get("x-ratelimit-remaining")
+ if remaining and float(remaining) < 2:
+ wait = int(response.headers["x-ratelimit-reset"])
+ self.log.info("Waiting %d seconds for ratelimit reset", wait)
+ time.sleep(wait)
+ data = response.json()
+ if "error" in data:
+ if data["error"] == 403:
+ raise exception.AuthorizationError()
+ if data["error"] == 404:
+ raise exception.NotFoundError()
+ raise Exception(data["message"])
+ return data
+
+ def _pagination(self, endpoint, params, _empty=()):
+ date_fmt = self.extractor.config("date-format", "%Y-%m-%dT%H:%M:%S")
+ date_min = self._parse_datetime("date-min", 0, date_fmt)
+ date_max = self._parse_datetime("date-max", 253402210800, date_fmt)
+
+ id_min = self._parse_id("id-min", 0)
+ id_max = self._parse_id("id-max", 2147483647)
+
+ while True:
+ data = self._call(endpoint, params)["data"]
+
+ for submission in data["children"]:
+ submission = submission["data"]
+ if (date_min <= submission["created_utc"] <= date_max and
+ id_min <= self._decode(submission["id"]) <= id_max):
+ if submission["num_comments"] and self.comments:
+ try:
+ yield self.submission(submission["id"])
+ except exception.AuthorizationError:
+ pass
+ else:
+ yield submission, _empty
+
+ if not data["after"]:
+ return
+ params["after"] = data["after"]
+
+ def _flatten(self, comments, link_id=None):
+ extra = []
+ queue = comments["data"]["children"]
+ while queue:
+ comment = queue.pop(0)
+ if comment["kind"] == "more":
+ if link_id:
+ extra.extend(comment["data"]["children"])
+ continue
+ comment = comment["data"]
+ yield comment
+ if comment["replies"]:
+ queue += comment["replies"]["data"]["children"]
+ if link_id and extra:
+ yield from self.morechildren(link_id, extra)
+
+ def _parse_datetime(self, key, default, fmt):
+ ts = self.extractor.config(key, default)
+ if isinstance(ts, str):
+ try:
+ ts = int(datetime.datetime.strptime(ts, fmt).timestamp())
+ except ValueError as exc:
+ self.log.warning("Unable to parse '%s': %s", key, exc)
+ ts = default
+ return ts
+
+ def _parse_id(self, key, default):
+ sid = self.extractor.config(key)
+ return self._decode(sid.rpartition("_")[2].lower()) if sid else default
+
+ @staticmethod
+ def _decode(sid):
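+ # reddit IDs are base-36 strings, e.g. '2a00np'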
+ return util.bdecode(sid, "0123456789abcdefghijklmnopqrstuvwxyz")
diff --git a/gallery_dl/extractor/rule34.py b/gallery_dl/extractor/rule34.py
new file mode 100644
index 0000000..de7ef45
--- /dev/null
+++ b/gallery_dl/extractor/rule34.py
@@ -0,0 +1,63 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://rule34.xxx/"""
+
+from . import booru
+
+
+class Rule34Extractor(booru.XmlParserMixin,
+ booru.GelbooruPageMixin,
+ booru.BooruExtractor):
+ """Base class for rule34 extractors"""
+ category = "rule34"
+ api_url = "https://rule34.xxx/index.php"
+ post_url = "https://rule34.xxx/index.php?page=post&s=view&id={}"
+ pool_url = "https://rule34.xxx/index.php?page=pool&s=show&id={}"
+ page_limit = 4000
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.params.update({"page": "dapi", "s": "post", "q": "index"})
+
+
+class Rule34TagExtractor(booru.TagMixin, Rule34Extractor):
+ """Extractor for images from rule34.xxx based on search-tags"""
+ pattern = (r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
+ r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")
+ test = ("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", {
+ "content": "97e4bbf86c3860be18de384d02d544251afe1d45",
+ "pattern": r"https?://([^.]+\.)?rule34\.xxx/images/\d+/[0-9a-f]+\.jpg",
+ "count": 1,
+ })
+
+
+class Rule34PoolExtractor(booru.GelbooruPoolMixin, Rule34Extractor):
+ """Extractor for image-pools from rule34.xxx"""
+ pattern = (r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
+ r"\?page=pool&s=show&id=(?P<pool>\d+)")
+ test = ("https://rule34.xxx/index.php?page=pool&s=show&id=179", {
+ "count": 3,
+ })
+
+
+class Rule34PostExtractor(booru.PostMixin, Rule34Extractor):
+ """Extractor for single images from rule34.xxx"""
+ pattern = (r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
+ r"\?page=post&s=view&id=(?P<post>\d+)")
+ test = ("https://rule34.xxx/index.php?page=post&s=view&id=1995545", {
+ "content": "97e4bbf86c3860be18de384d02d544251afe1d45",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "danraku",
+ "tags_character": "kashima_(kantai_collection)",
+ "tags_copyright": "kantai_collection",
+ "tags_general": str,
+ "tags_metadata": str,
+ },
+ })
diff --git a/gallery_dl/extractor/safebooru.py b/gallery_dl/extractor/safebooru.py
new file mode 100644
index 0000000..f5f058c
--- /dev/null
+++ b/gallery_dl/extractor/safebooru.py
@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://safebooru.org/"""
+
+from . import booru
+
+
+class SafebooruExtractor(booru.XmlParserMixin,
+ booru.GelbooruPageMixin,
+ booru.BooruExtractor):
+ """Base class for safebooru extractors"""
+ category = "safebooru"
+ api_url = "https://safebooru.org/index.php"
+ post_url = "https://safebooru.org/index.php?page=post&s=view&id={}"
+ pool_url = "https://safebooru.org/index.php?page=pool&s=show&id={}"
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.params.update({"page": "dapi", "s": "post", "q": "index"})
+
+
+class SafebooruTagExtractor(booru.TagMixin, SafebooruExtractor):
+ """Extractor for images from safebooru.org based on search-tags"""
+ pattern = (r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
+ r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")
+ test = ("https://safebooru.org/index.php?page=post&s=list&tags=bonocho", {
+ "url": "17c61b386530cf4c30842c9f580d15ef1cd09586",
+ "content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb",
+ })
+
+
+class SafebooruPoolExtractor(booru.GelbooruPoolMixin, SafebooruExtractor):
+ """Extractor for image-pools from safebooru.org"""
+ pattern = (r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
+ r"\?page=pool&s=show&id=(?P<pool>\d+)")
+ test = ("https://safebooru.org/index.php?page=pool&s=show&id=11", {
+ "count": 5,
+ })
+
+
+class SafebooruPostExtractor(booru.PostMixin, SafebooruExtractor):
+ """Extractor for single images from safebooru.org"""
+ pattern = (r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
+ r"\?page=post&s=view&id=(?P<post>\d+)")
+ test = ("https://safebooru.org/index.php?page=post&s=view&id=1169132", {
+ "url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b",
+ "content": "93b293b27dabd198afafabbaf87c49863ac82f27",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "kawanakajima",
+ "tags_character": "heath_ledger ronald_mcdonald the_joker",
+ "tags_copyright": "dc_comics mcdonald's the_dark_knight",
+ "tags_general": str,
+ },
+ })
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
new file mode 100644
index 0000000..012cb8b
--- /dev/null
+++ b/gallery_dl/extractor/sankaku.py
@@ -0,0 +1,299 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://chan.sankakucomplex.com/"""
+
+from .common import Extractor, Message, SharedConfigMixin
+from .. import text, util, exception
+from ..cache import cache
+import collections
+import random
+import time
+import re
+
+
+class SankakuExtractor(SharedConfigMixin, Extractor):
+ """Base class for sankaku extractors"""
+ basecategory = "booru"
+ category = "sankaku"
+ filename_fmt = "{category}_{id}_{md5}.{extension}"
+ cookienames = ("login", "pass_hash")
+ cookiedomain = "chan.sankakucomplex.com"
+ subdomain = "chan"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.root = "https://" + self.cookiedomain
+ self.logged_in = True
+ self.start_page = 1
+ self.start_post = 0
+ self.extags = self.config("tags", False)
+ self.wait_min = self.config("wait-min", 3.0)
+ self.wait_max = self.config("wait-max", 6.0)
+ if self.wait_max < self.wait_min:
+ self.wait_max = self.wait_min
+
+ def items(self):
+ self.login()
+ data = self.get_metadata()
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ for post_id in util.advance(self.get_posts(), self.start_post):
+ self.wait()
+ post = self.get_post_data(post_id)
+ url = post["file_url"]
+ post.update(data)
+ yield Message.Url, url, text.nameext_from_url(url, post)
+
+ def skip(self, num):
+ self.start_post += num
+ return num
+
+ def get_metadata(self):
+ """Return general metadata"""
+ return {}
+
+ def get_posts(self):
+ """Return an iterable containing all relevant post ids"""
+
+ def get_post_data(self, post_id, extr=text.extract):
+ """Extract metadata of a single post"""
+ url = self.root + "/post/show/" + post_id
+ page = self.request(url, retries=10).text
+
+ tags , pos = extr(page, "<title>", " | ")
+ vavg , pos = extr(page, "itemprop=ratingValue>", "<", pos)
+ vcnt , pos = extr(page, "itemprop=reviewCount>", "<", pos)
+ _ , pos = extr(page, "Posted: <", "", pos)
+ created, pos = extr(page, ' title="', '"', pos)
+ rating = extr(page, "<li>Rating: ", "<", pos)[0]
+
+ file_url, pos = extr(page, '<li>Original: <a href="', '"', pos)
+ if file_url:
+ width , pos = extr(page, '>', 'x', pos)
+ height, pos = extr(page, '', ' ', pos)
+ else:
+ width , pos = extr(page, '<object width=', ' ', pos)
+ height, pos = extr(page, 'height=', '>', pos)
+ file_url = extr(page, '<embed src="', '"', pos)[0]
+
+ data = {
+ "id": text.parse_int(post_id),
+ "md5": file_url.rpartition("/")[2].partition(".")[0],
+ "tags": text.unescape(tags),
+ "vote_average": text.parse_float(vavg),
+ "vote_count": text.parse_int(vcnt),
+ "created_at": created,
+ "rating": (rating or "?")[0].lower(),
+ "file_url": "https:" + text.unescape(file_url),
+ "width": text.parse_int(width),
+ "height": text.parse_int(height),
+ }
+
+ if self.extags:
+ tags = collections.defaultdict(list)
+ tags_html = text.extract(page, '<ul id=tag-sidebar>', '</ul>')[0]
+ pattern = re.compile(r'tag-type-([^>]+)><a href="/\?tags=([^"]+)')
+ for tag_type, tag_name in pattern.findall(tags_html or ""):
+ tags[tag_type].append(text.unquote(tag_name))
+ for key, value in tags.items():
+ data["tags_" + key] = " ".join(value)
+
+ return data
+
+ def wait(self):
+ """Wait for a randomly chosen number of seconds"""
+ time.sleep(random.uniform(self.wait_min, self.wait_max))
+
+ def login(self):
+ """Login and set necessary cookies"""
+ if self._check_cookies(self.cookienames):
+ return
+ username, password = self._get_auth_info()
+ if username:
+ cookies = self._login_impl((username, self.subdomain), password)
+ self._update_cookies(cookies)
+ else:
+ self.logged_in = False
+
+ @cache(maxage=90*24*3600, keyarg=1)
+ def _login_impl(self, usertuple, password):
+ username = usertuple[0]
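+ # 'usertuple' also carries the subdomain, presumably so that cached
+ # login cookies are kept separate for each sankaku subdomain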
+ self.log.info("Logging in as %s", username)
+ url = self.root + "/user/authenticate"
+ data = {
+ "url": "",
+ "user[name]": username,
+ "user[password]": password,
+ "commit": "Login",
+ }
+ response = self.request(url, method="POST", data=data)
+
+ if not response.history or response.url != self.root + "/user/home":
+ raise exception.AuthenticationError()
+ cookies = response.history[0].cookies
+ return {c: cookies[c] for c in self.cookienames}
+
+
+class SankakuTagExtractor(SankakuExtractor):
+ """Extractor for images from chan.sankakucomplex.com by search-tags"""
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "t_{search_tags}_{id}"
+ pattern = r"(?:https?://)?chan\.sankakucomplex\.com/\?([^#]*)"
+ test = (
+ ("https://chan.sankakucomplex.com/?tags=bonocho", {
+ "count": 5,
+ "pattern": r"https://cs\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}"
+ r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+",
+ }),
+ # respect 'page' query parameter
+ ("https://chan.sankakucomplex.com/?tags=bonocho&page=2", {
+ "count": 0,
+ }),
+ # respect 'next' query parameter
+ ("https://chan.sankakucomplex.com/?tags=bonocho&next=182284", {
+ "count": 1,
+ }),
+ # error on five or more tags
+ ("https://chan.sankakucomplex.com/?tags=bonocho+a+b+c+d", {
+ "options": (("username", None),),
+ "exception": exception.StopExtraction,
+ }),
+ # match arbitrary query parameters
+ ("https://chan.sankakucomplex.com"
+ "/?tags=marie_rose&page=98&next=3874906&commit=Search"),
+ )
+ per_page = 20
+
+ def __init__(self, match):
+ SankakuExtractor.__init__(self, match)
+ query = text.parse_query(match.group(1))
+ self.tags = text.unquote(query.get("tags", "").replace("+", " "))
+ self.start_page = text.parse_int(query.get("page"), 1)
+ self.next = text.parse_int(query.get("next"), 0)
+
+ def skip(self, num):
+ if self.next:
+ self.start_post += num
+ else:
+ pages, posts = divmod(num, self.per_page)
+ self.start_page += pages
+ self.start_post += posts
+ return num
+
+ def get_metadata(self):
+ if not self.next:
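+ # listings are capped at page 50 (25 when not logged in);
+ # turn any pages beyond that limit into a post offset instead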
+ max_page = 50 if self.logged_in else 25
+ if self.start_page > max_page:
+ self.log.info("Traversing from page %d to page %d",
+ max_page, self.start_page)
+ self.start_post += self.per_page * (self.start_page - max_page)
+ self.start_page = max_page
+
+ tags = self.tags.split()
+ if not self.logged_in and len(tags) > 4:
+ self.log.error("Unauthenticated users cannot use "
+ "more than 4 tags at once.")
+ raise exception.StopExtraction()
+ return {"search_tags": " ".join(tags)}
+
+ def get_posts(self):
+ params = {"tags": self.tags}
+
+ if self.next:
+ params["next"] = self.next
+ else:
+ params["page"] = self.start_page
+
+ while True:
+ self.wait()
+ page = self.request(self.root, params=params, retries=10).text
+ pos = page.find("<div id=more-popular-posts-link>") + 1
+
+ ids = list(text.extract_iter(page, '" id=p', '>', pos))
+ if not ids:
+ return
+ yield from ids
+
+ next_qs = text.extract(page, 'next-page-url="/?', '"', pos)[0]
+ next_id = text.parse_query(next_qs).get("next")
+
+ # stop if the same "next" parameter occurs twice in a row (#265)
+ if "next" in params and params["next"] == next_id:
+ return
+
+ params["next"] = next_id or (text.parse_int(ids[-1]) - 1)
+ params["page"] = "2"
+
+
+class SankakuPoolExtractor(SankakuExtractor):
+ """Extractor for image-pools from chan.sankakucomplex.com"""
+ subcategory = "pool"
+ directory_fmt = ("{category}", "pool", "{pool}")
+ archive_fmt = "p_{pool}_{id}"
+ pattern = r"(?:https?://)?chan\.sankakucomplex\.com/pool/show/(\d+)"
+ test = ("https://chan.sankakucomplex.com/pool/show/90", {
+ "count": 5,
+ })
+ per_page = 24
+
+ def __init__(self, match):
+ SankakuExtractor.__init__(self, match)
+ self.pool_id = match.group(1)
+
+ def skip(self, num):
+ pages, posts = divmod(num, self.per_page)
+ self.start_page += pages
+ self.start_post += posts
+ return num
+
+ def get_metadata(self):
+ return {"pool": self.pool_id}
+
+ def get_posts(self):
+ url = self.root + "/pool/show/" + self.pool_id
+ params = {"page": self.start_page}
+
+ while True:
+ page = self.request(url, params=params, retries=10).text
+ ids = list(text.extract_iter(page, '" id=p', '>'))
+
+ yield from ids
+ if len(ids) < self.per_page:
+ return
+
+ params["page"] += 1
+
+
+class SankakuPostExtractor(SankakuExtractor):
+ """Extractor for single images from chan.sankakucomplex.com"""
+ subcategory = "post"
+ archive_fmt = "{id}"
+ pattern = r"(?:https?://)?chan\.sankakucomplex\.com/post/show/(\d+)"
+ test = ("https://chan.sankakucomplex.com/post/show/360451", {
+ "content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "bonocho",
+ "tags_copyright": "batman_(series) the_dark_knight",
+ "tags_medium": "sketch copyright_name",
+ "tags_studio": "dc_comics",
+ "tags_character": str,
+ "tags_general": str,
+ },
+ })
+
+ def __init__(self, match):
+ SankakuExtractor.__init__(self, match)
+ self.post_id = match.group(1)
+
+ def get_posts(self):
+ return (self.post_id,)
diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py
new file mode 100644
index 0000000..22b2b63
--- /dev/null
+++ b/gallery_dl/extractor/sankakucomplex.py
@@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.sankakucomplex.com/"""
+
+from .common import Extractor, Message
+from .. import text
+import re
+
+
+class SankakucomplexExtractor(Extractor):
+ """Base class for sankakucomplex extractors"""
+ category = "sankakucomplex"
+ root = "https://www.sankakucomplex.com"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.path = match.group(1)
+
+
+class SankakucomplexArticleExtractor(SankakucomplexExtractor):
+ """Extractor for articles on www.sankakucomplex.com"""
+ subcategory = "article"
+ directory_fmt = ("{category}", "{date:%Y-%m-%d} {title}")
+ filename_fmt = "{filename}.{extension}"
+ archive_fmt = "{date:%Y%m%d}_{filename}"
+ pattern = (r"(?:https?://)?www\.sankakucomplex\.com"
+ r"/(\d{4}/\d\d/\d\d/[^/?&#]+)")
+ test = (
+ ("https://www.sankakucomplex.com/2019/05/11/twitter-cosplayers", {
+ "url": "4a9ecc5ae917fbce469280da5b6a482510cae84d",
+ "keyword": "4b3b5766b277a5d0acbec90fa8f2343262b07efd",
+ }),
+ ("https://www.sankakucomplex.com/2009/12/01/sexy-goddesses-of-2ch", {
+ "url": "a1e249173fd6c899a8134fcfbd9c925588a63f7c",
+ "keyword": "f47a416d680717855bbc3e4f0cd44479f61d9aa4",
+ }),
+ )
+
+ def items(self):
+ url = "{}/{}/?pg=X".format(self.root, self.path)
+ extr = text.extract_from(self.request(url).text)
+ data = {
+ "title" : text.unescape(
+ extr('property="og:title" content="', '"')),
+ "description": text.unescape(
+ extr('property="og:description" content="', '"')),
+ "date" : text.parse_datetime(
+ extr('property="article:published_time" content="', '"')),
+ }
+ imgs = self.images(extr)
+ data["count"] = len(imgs)
+ data["tags"] = text.split_html(extr('="meta-tags">', '</div>'))[::2]
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for img in imgs:
+ img.update(data)
+ yield Message.Url, img["url"], img
+
+ def images(self, extr):
+ num = 0
+ imgs = []
+ urls = set()
+ orig = re.compile(r"-\d+x\d+\.")
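+ # matches the '-<width>x<height>' thumbnail suffix, which gets
+ # stripped below to obtain the original image URL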
+
+ extr('<div class="entry-content">', '')
+ while True:
+ url = extr('data-lazy-src="', '"')
+ if not url:
+ return imgs
+ if url in urls:
+ continue
+ if url[0] == "/":
+ url = text.urljoin(self.root, url)
+ url = orig.sub(".", url)
+ num += 1
+ imgs.append(text.nameext_from_url(url, {
+ "url" : url,
+ "num" : num,
+ }))
+ urls.add(url)
+
+
+class SankakucomplexTagExtractor(SankakucomplexExtractor):
+ """Extractor for sankakucomplex blog articles by tag or author"""
+ subcategory = "tag"
+ pattern = (r"(?:https?://)?www\.sankakucomplex\.com"
+ r"/((?:tag|category|author)/[^/&?#]+)")
+ test = (
+ ("https://www.sankakucomplex.com/tag/cosplay/", {
+ "range": "1-50",
+ "count": 50,
+ "pattern": SankakucomplexArticleExtractor.pattern,
+ }),
+ ("https://www.sankakucomplex.com/category/anime/"),
+ ("https://www.sankakucomplex.com/author/rift/page/5/"),
+ )
+
+ def items(self):
+ pnum = 1
+ last = None
+ data = {"_extractor": SankakucomplexArticleExtractor}
+
+ yield Message.Version, 1
+ while True:
+ url = "{}/{}/page/{}/".format(self.root, self.path, pnum)
+ response = self.request(url, expect=(404,))
+ if response.status_code == 404:
+ return
+ for url in text.extract_iter(response.text, 'data-direct="', '"'):
+ if url != last:
+ last = url
+ yield Message.Queue, url, data
+ pnum += 1
diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py
new file mode 100644
index 0000000..f63c999
--- /dev/null
+++ b/gallery_dl/extractor/seiga.py
@@ -0,0 +1,198 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://seiga.nicovideo.jp/"""
+
+from .common import Extractor, Message
+from .. import text, util, exception
+from ..cache import cache
+
+
+class SeigaExtractor(Extractor):
+ """Base class for seiga extractors"""
+ category = "seiga"
+ archive_fmt = "{image_id}"
+ cookiedomain = ".nicovideo.jp"
+ root = "https://seiga.nicovideo.jp"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.start_image = 0
+
+ def items(self):
+ self.login()
+ images = iter(self.get_images())
+ data = next(images)
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for image in util.advance(images, self.start_image):
+ data.update(image)
+ data["extension"] = None
+ yield Message.Url, self.get_image_url(data["image_id"]), data
+
+ def get_images(self):
+ """Return iterable containing metadata and images"""
+
+ def get_image_url(self, image_id):
+ """Get url for an image with id 'image_id'"""
+ url = "{}/image/source/{}".format(self.root, image_id)
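+ # the 'source' endpoint redirects to the actual image; rewriting
+ # '/o/' to '/priv/' in the redirect target appears to yield the
+ # directly downloadable full-size file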
+ response = self.request(
+ url, method="HEAD", allow_redirects=False, expect=(404,))
+ if response.status_code == 404:
+ raise exception.NotFoundError("image")
+ return response.headers["Location"].replace("/o/", "/priv/", 1)
+
+ def login(self):
+ """Login and set necessary cookies"""
+ if not self._check_cookies(("user_session",)):
+ username, password = self._get_auth_info()
+ self._update_cookies(self._login_impl(username, password))
+
+ @cache(maxage=7*24*3600, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+ url = "https://account.nicovideo.jp/api/v1/login"
+ data = {"mail_tel": username, "password": password}
+
+ self.request(url, method="POST", data=data)
+ if "user_session" not in self.session.cookies:
+ raise exception.AuthenticationError()
+ del self.session.cookies["nicosid"]
+ return self.session.cookies
+
+
+class SeigaUserExtractor(SeigaExtractor):
+ """Extractor for images of a user from seiga.nicovideo.jp"""
+ subcategory = "user"
+ directory_fmt = ("{category}", "{user[id]}")
+ filename_fmt = "{category}_{user[id]}_{image_id}.{extension}"
+ pattern = (r"(?:https?://)?(?:www\.|seiga\.)?nicovideo\.jp/"
+ r"user/illust/(\d+)(?:\?(?:[^&]+&)*sort=([^&#]+))?")
+ test = (
+ ("https://seiga.nicovideo.jp/user/illust/39537793", {
+ "pattern": r"https://lohas\.nicoseiga\.jp/priv/[0-9a-f]+/\d+/\d+",
+ "count": ">= 4",
+ "keyword": {
+ "user": {
+ "id": 39537793,
+ "message": str,
+ "name": str,
+ },
+ "clips": int,
+ "comments": int,
+ "count": int,
+ "extension": None,
+ "image_id": int,
+ "title": str,
+ "views": int,
+ },
+ }),
+ ("https://seiga.nicovideo.jp/user/illust/79433", {
+ "exception": exception.NotFoundError,
+ }),
+ ("https://seiga.nicovideo.jp/user/illust/39537793"
+ "?sort=image_view&target=illust_all"),
+ )
+
+ def __init__(self, match):
+ SeigaExtractor.__init__(self, match)
+ self.user_id, self.order = match.groups()
+ self.start_page = 1
+
+ def skip(self, num):
+ pages, images = divmod(num, 40)
+ self.start_page += pages
+ self.start_image += images
+ return num
+
+ def get_metadata(self, page):
+ """Collect metadata from 'page'"""
+ data = text.extract_all(page, (
+ ("name" , '<img alt="', '"'),
+ ("msg" , '<li class="user_message">', '</li>'),
+ (None , '<span class="target_name">すべて</span>', ''),
+ ("count", '<span class="count ">', '</span>'),
+ ))[0]
+
+ if not data["name"] and "ユーザー情報が取得出来ませんでした" in page:
+ raise exception.NotFoundError("user")
+
+ return {
+ "user": {
+ "id": text.parse_int(self.user_id),
+ "name": data["name"],
+ "message": (data["msg"] or "").strip(),
+ },
+ "count": text.parse_int(data["count"]),
+ }
+
+ def get_images(self):
+ url = "{}/user/illust/{}".format(self.root, self.user_id)
+ params = {"sort": self.order, "page": self.start_page,
+ "target": "illust_all"}
+
+ while True:
+ cnt = 0
+ page = self.request(url, params=params).text
+
+ if params["page"] == self.start_page:
+ yield self.get_metadata(page)
+
+ for info in text.extract_iter(
+ page, '<li class="list_item', '</a></li> '):
+ data = text.extract_all(info, (
+ ("image_id", '/seiga/im', '"'),
+ ("title" , '<li class="title">', '</li>'),
+ ("views" , '</span>', '</li>'),
+ ("comments", '</span>', '</li>'),
+ ("clips" , '</span>', '</li>'),
+ ))[0]
+ for key in ("image_id", "views", "comments", "clips"):
+ data[key] = text.parse_int(data[key])
+ yield data
+ cnt += 1
+
+ if cnt < 40:
+ return
+ params["page"] += 1
+
+
+class SeigaImageExtractor(SeigaExtractor):
+ """Extractor for single images from seiga.nicovideo.jp"""
+ subcategory = "image"
+ filename_fmt = "{category}_{image_id}.{extension}"
+ pattern = (r"(?:https?://)?(?:"
+ r"(?:seiga\.|www\.)?nicovideo\.jp/(?:seiga/im|image/source/)"
+ r"|lohas\.nicoseiga\.jp/(?:thumb|(?:priv|o)/[^/]+/\d+)/)(\d+)")
+ test = (
+ ("https://seiga.nicovideo.jp/seiga/im5977527", {
+ "keyword": "f66ba5de33d4ce2cb57f23bb37e1e847e0771c10",
+ "content": "d9202292012178374d57fb0126f6124387265297",
+ }),
+ ("https://seiga.nicovideo.jp/seiga/im123", {
+ "exception": exception.NotFoundError,
+ }),
+ ("https://seiga.nicovideo.jp/image/source/5977527"),
+ ("https://lohas.nicoseiga.jp/thumb/5977527i"),
+ ("https://lohas.nicoseiga.jp/priv"
+ "/759a4ef1c639106ba4d665ee6333832e647d0e4e/1549727594/5977527"),
+ ("https://lohas.nicoseiga.jp/o"
+ "/759a4ef1c639106ba4d665ee6333832e647d0e4e/1549727594/5977527"),
+ )
+
+ def __init__(self, match):
+ SeigaExtractor.__init__(self, match)
+ self.image_id = match.group(1)
+
+ def skip(self, num):
+ self.start_image += num
+ return num
+
+ def get_images(self):
+ return ({}, {"image_id": text.parse_int(self.image_id)})
diff --git a/gallery_dl/extractor/senmanga.py b/gallery_dl/extractor/senmanga.py
new file mode 100644
index 0000000..736173f
--- /dev/null
+++ b/gallery_dl/extractor/senmanga.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters from from https://raw.senmanga.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class SenmangaChapterExtractor(Extractor):
+ """Extractor for manga-chapters from raw.senmanga.com"""
+ category = "senmanga"
+ subcategory = "chapter"
+ directory_fmt = ("{category}", "{manga}", "{chapter_string}")
+ filename_fmt = "{manga}_{chapter_string}_{page:>03}.{extension}"
+ archive_fmt = "{manga}_{chapter_string}_{page}"
+ pattern = r"(?:https?://)?raw\.senmanga\.com/([^/]+/[^/]+)"
+ test = (
+ ("http://raw.senmanga.com/Bokura-wa-Minna-Kawaisou/37A/1", {
+ "url": "5f95140ff511d8497e2ec08fa7267c6bb231faec",
+ "keyword": "705d941a150765edb33cd2707074bd703a93788c",
+ "content": "0e37b1995708ffc175f2e175d91a518e6948c379",
+ }),
+ ("http://raw.senmanga.com/Love-Lab/2016-03/1", {
+ "url": "8347b9f00c14b864dd3c19a1f5ae52adb2ef00de",
+ "keyword": "8a8ab2529ba2edfc83a6b3a8bede1d6c580db7b4",
+ }),
+ )
+ root = "https://raw.senmanga.com"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ part = match.group(1)
+ self.chapter_url = "{}/{}/".format(self.root, part)
+ self.img_url = "{}/viewer/{}/".format(self.root, part)
+ self.session.headers["Referer"] = self.chapter_url
+
+ def items(self):
+ data = self.metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for data["page"] in range(1, data["count"]+1):
+ data["extension"] = None
+ yield Message.Url, self.img_url + str(data["page"]), data
+
+ def metadata(self):
+ """Collect metadata for extractor-job"""
+ page = self.request(self.chapter_url).text
+ self.session.cookies.clear()
+ title, pos = text.extract(page, '<title>', '</title>')
+ count, pos = text.extract(page, '</select> of ', '\n', pos)
+ manga, _, chapter = title.partition(" - Chapter ")
+
+ return {
+ "manga": text.unescape(manga).replace("-", " "),
+ "chapter_string": chapter.partition(" - Page ")[0],
+ "count": text.parse_int(count),
+ "lang": "jp",
+ "language": "Japanese",
+ }
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py
new file mode 100644
index 0000000..aa2b16b
--- /dev/null
+++ b/gallery_dl/extractor/sexcom.py
@@ -0,0 +1,194 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.sex.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class SexcomExtractor(Extractor):
+ """Base class for sexcom extractors"""
+ category = "sexcom"
+ directory_fmt = ("{category}",)
+ filename_fmt = "{pin_id}{title:? //}.{extension}"
+ archive_fmt = "{pin_id}"
+ root = "https://www.sex.com"
+
+ def items(self):
+ yield Message.Version, 1
+ yield Message.Directory, self.metadata()
+ for url in self.pins():
+ pin = self._parse_pin(url)
+ yield Message.Url, pin["url"], pin
+
+ def metadata(self):
+ return {}
+
+ def pins(self):
+ return ()
+
+ def _pagination(self, url):
+ while True:
+ extr = text.extract_from(self.request(url).text)
+
+ while True:
+ href = extr('<a class="image_wrapper" href="', '"')
+ if not href:
+ break
+ yield self.root + href
+
+ pager = extr('id="pagenum"', '</div>')
+ url = text.extract(pager, ' href="', '"')[0]
+ if not url:
+ return
+ url = text.urljoin(self.root, url)
+
+ def _parse_pin(self, pin_url):
+ extr = text.extract_from(self.request(pin_url).text)
+ data = {}
+
+ data["thumbnail"] = extr('itemprop="thumbnail" content="', '"')
+ data["type"] = extr('<h1>' , '<').rstrip(" -").strip().lower()
+ data["title"] = text.unescape(extr('itemprop="name">' , '<'))
+ data["repins"] = text.parse_int(text.extract(
+ extr('"btn-group"', '</div>'), '"btn btn-primary">' , '<')[0])
+ data["likes"] = text.parse_int(text.extract(
+ extr('"btn-group"', '</div>'), '"btn btn-default">' , '<')[0])
+ data["pin_id"] = text.parse_int(extr('data-id="', '"'))
+
+ if data["type"] == "video":
+ info = extr("player.updateSrc(", ");")
+
+ if info:
+ path = text.extract(info, "src: '", "'")[0]
+ data["filename"] = path.rpartition("/")[2]
+ data["extension"] = "mp4"
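+ # request the HD stream when the player offers one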
+ if "'HD'" in info:
+ path += "/hd"
+ data["url"] = self.root + path
+ else:
+ data["url"] = "ytdl:" + text.extract(
+ extr('<iframe', '>'), ' src="', '"')[0]
+ else:
+ data["url"] = extr(' src="', '"')
+ text.nameext_from_url(data["url"], data)
+
+ data["uploader"] = extr('itemprop="author">', '<')
+ data["date"] = extr('datetime="', '"')
+ data["tags"] = text.split_html(extr('class="tags"> Tags', '</div>'))
+ data["comments"] = text.parse_int(extr('Comments (', ')'))
+
+ return data
+
+
+class SexcomPinExtractor(SexcomExtractor):
+ """Extractor a pinned image or video on www.sex.com"""
+ subcategory = "pin"
+ directory_fmt = ("{category}",)
+ pattern = r"(?:https?://)?(?:www\.)?sex\.com/pin/(\d+)"
+ test = (
+ # picture
+ ("https://www.sex.com/pin/56714360/", {
+ "url": "599190d6e3d79f9f49dda194a0a58cb0ffa3ab86",
+ "keyword": {
+ "comments": int,
+ "date": "2018-10-02T21:18:17-04:00",
+ "extension": "jpg",
+ "filename": "20037816",
+ "likes": int,
+ "pin_id": 56714360,
+ "repins": int,
+ "tags": list,
+ "thumbnail": str,
+ "title": "Pin #56714360",
+ "type": "picture",
+ "uploader": "alguem",
+ "url": str,
+ },
+ }),
+ # gif
+ ("https://www.sex.com/pin/11465040-big-titted-hentai-gif/", {
+ "url": "98a82c5ae7a65c8228e1405ac740f80d4d556de1",
+ }),
+ # video
+ ("https://www.sex.com/pin/55748381/", {
+ "pattern": "https://www.sex.com/video/stream/776238/hd",
+ }),
+ # pornhub embed
+ ("https://www.sex.com/pin/55847384-very-nicely-animated/", {
+ "pattern": "ytdl:https://www.pornhub.com/embed/ph56ef24b6750f2",
+ }),
+ )
+
+ def __init__(self, match):
+ SexcomExtractor.__init__(self, match)
+ self.pin_id = match.group(1)
+
+ def pins(self):
+ return ("{}/pin/{}/".format(self.root, self.pin_id),)
+
+
+class SexcomBoardExtractor(SexcomExtractor):
+ """Extractor for pins from a board on www.sex.com"""
+ subcategory = "board"
+ directory_fmt = ("{category}", "{user}", "{board}")
+ pattern = (r"(?:https?://)?(?:www\.)?sex\.com/user"
+ r"/([^/?&#]+)/(?!(?:following|pins|repins|likes)/)([^/?&#]+)")
+ test = ("https://www.sex.com/user/ronin17/exciting-hentai/", {
+ "count": ">= 15",
+ })
+
+ def __init__(self, match):
+ SexcomExtractor.__init__(self, match)
+ self.user, self.board = match.groups()
+
+ def metadata(self):
+ return {
+ "user" : text.unquote(self.user),
+ "board": text.unquote(self.board),
+ }
+
+ def pins(self):
+ url = "{}/user/{}/{}/".format(self.root, self.user, self.board)
+ return self._pagination(url)
+
+
+class SexcomSearchExtractor(SexcomExtractor):
+ """Extractor for search results on www.sex.com"""
+ subcategory = "search"
+ directory_fmt = ("{category}", "search", "{search[query]}")
+ pattern = (r"(?:https?://)?(?:www\.)?sex\.com/((?:"
+ r"(pic|gif|video)s/([^/?&#]+)|search/(pic|gif|video)s"
+ r")/?(?:\?([^#]+))?)")
+ test = (
+ ("https://www.sex.com/search/pics?query=ecchi", {
+ "range": "1-10",
+ "count": 10,
+ }),
+ ("https://www.sex.com/videos/hentai/", {
+ "range": "1-10",
+ "count": 10,
+ }),
+ )
+
+ def __init__(self, match):
+ SexcomExtractor.__init__(self, match)
+ self.path = match.group(1)
+
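+ # '/pics/<query>' URLs fill groups 2-3, '/search/pics?query=...' URLs
+ # fill groups 4-5; merge both forms into a single search dict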
+ self.search = text.parse_query(match.group(5))
+ self.search["type"] = match.group(2) or match.group(4)
+ if "query" not in self.search:
+ self.search["query"] = match.group(3) or ""
+
+ def metadata(self):
+ return {"search": self.search}
+
+ def pins(self):
+ url = "{}/{}".format(self.root, self.path)
+ return self._pagination(url)
diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py
new file mode 100644
index 0000000..35895bb
--- /dev/null
+++ b/gallery_dl/extractor/shopify.py
@@ -0,0 +1,136 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for Shopify instances"""
+
+from .common import Extractor, Message, SharedConfigMixin, generate_extractors
+from .. import text
+import time
+import re
+
+
+class ShopifyExtractor(SharedConfigMixin, Extractor):
+ """Base class for Shopify extractors"""
+ basecategory = "shopify"
+ filename_fmt = "{product[title]}_{num:>02}_{id}.{extension}"
+ archive_fmt = "{id}"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.item_url = self.root + match.group(1)
+
+ def request(self, url, method="GET", expect=range(400, 500), **kwargs):
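+ # retry requests rejected with HTTP 429/430 (rate limiting),
+ # doubling the wait time after every failed attempt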
+ tries = 0
+ kwargs["expect"] = expect
+ while True:
+ response = Extractor.request(self, url, method, **kwargs)
+ if response.status_code not in (429, 430):
+ return response
+ tries += 1
+ waittime = 2 ** (tries + 2)
+ self.log.warning(
+ "HTTP status %s: %s - Waiting for %d seconds",
+ response.status_code, response.reason, waittime)
+ time.sleep(waittime)
+
+ def items(self):
+ data = self.metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ headers = {"X-Requested-With": "XMLHttpRequest"}
+ for url in self.products():
+ response = self.request(url + ".json", headers=headers)
+ if response.status_code >= 400:
+ self.log.warning('Skipping %s ("%d: %s")',
+ url, response.status_code, response.reason)
+ continue
+ product = response.json()["product"]
+ del product["image"]
+
+ for num, image in enumerate(product.pop("images"), 1):
+ text.nameext_from_url(image["src"], image)
+ image.update(data)
+ image["product"] = product
+ image["num"] = num
+ yield Message.Url, image["src"], image
+
+ def metadata(self):
+ """Return general metadata"""
+ return {}
+
+ def products(self):
+ """Return an iterable with all relevant product URLs"""
+
+
+class ShopifyCollectionExtractor(ShopifyExtractor):
+ """Base class for collection extractors for Shopify based sites"""
+ subcategory = "collection"
+ directory_fmt = ("{category}", "{collection[title]}")
+ pattern_fmt = r"(/collections/[\w-]+)/?(?:\?([^#]+))?(?:$|#)"
+
+ def __init__(self, match):
+ ShopifyExtractor.__init__(self, match)
+ self.params = match.group(2)
+
+ def metadata(self):
+ return self.request(self.item_url + ".json").json()
+
+ def products(self):
+ params = text.parse_query(self.params)
+ params["page"] = text.parse_int(params.get("page"), 1)
+ search_re = re.compile(r"/collections/[\w-]+/products/[\w-]+")
+
+ while True:
+ page = self.request(self.item_url, params=params).text
+ urls = search_re.findall(page)
+
+ if not urls:
+ return
+ for path in urls:
+ yield self.root + path
+ params["page"] += 1
+
+
+class ShopifyProductExtractor(ShopifyExtractor):
+ """Base class for product extractors for Shopify based sites"""
+ subcategory = "product"
+ directory_fmt = ("{category}", "Products")
+ pattern_fmt = r"((?:/collections/[\w-]+)?/products/[\w-]+)"
+
+ def products(self):
+ return (self.item_url,)
+
+
+EXTRACTORS = {
+ "fashionnova": {
+ "root": "https://www.fashionnova.com",
+ "pattern": r"(?:www\.)?fashionnova\.com",
+ "test-product": (
+ ("https://www.fashionnova.com/products/essential-slide-red", {
+ "pattern": r"https?://cdn\.shopify.com/",
+ "count": 3,
+ }),
+ ("https://www.fashionnova.com/collections/flats/products/name"),
+ ),
+ "test-collection": (
+ ("https://www.fashionnova.com/collections/mini-dresses", {
+ "range": "1-20",
+ "count": 20,
+ }),
+ ("https://www.fashionnova.com/collections/mini-dresses/?page=1"),
+ ("https://www.fashionnova.com/collections/mini-dresses#1"),
+ ),
+
+ },
+}
+
+generate_extractors(EXTRACTORS, globals(), (
+ ShopifyProductExtractor,
+ ShopifyCollectionExtractor,
+))
diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py
new file mode 100644
index 0000000..44dc6fe
--- /dev/null
+++ b/gallery_dl/extractor/simplyhentai.py
@@ -0,0 +1,187 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract hentai-manga from https://www.simply-hentai.com/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text, util, exception
+
+
+class SimplyhentaiGalleryExtractor(GalleryExtractor):
+ """Extractor for image galleries from simply-hentai.com"""
+ category = "simplyhentai"
+ archive_fmt = "{image_id}"
+ pattern = (r"(?:https?://)?(?!videos\.)([\w-]+\.simply-hentai\.com"
+ r"(?!/(?:album|gifs?|images?|series)(?:/|$))"
+ r"(?:/(?!(?:page|all-pages)(?:/|\.|$))[^/?&#]+)+)")
+ test = (
+ (("https://original-work.simply-hentai.com"
+ "/amazon-no-hiyaku-amazon-elixir"), {
+ "url": "258289249990502c3138719cb89e995a60861e49",
+ "keyword": "18ab9defca53dbb2aeb7965193e93e0ea125b76b",
+ }),
+ ("https://www.simply-hentai.com/notfound", {
+ "exception": exception.GalleryDLException,
+ }),
+ # custom subdomain
+ ("https://pokemon.simply-hentai.com/mao-friends-9bc39"),
+ # www subdomain, two path segments
+ ("https://www.simply-hentai.com/vocaloid/black-magnet"),
+ )
+
+ def __init__(self, match):
+ url = "https://" + match.group(1)
+ GalleryExtractor.__init__(self, match, url)
+ self.session.headers["Referer"] = url
+
+ def metadata(self, page):
+ extr = text.extract
+ title , pos = extr(page, '<meta property="og:title" content="', '"')
+ if not title:
+ raise exception.NotFoundError("gallery")
+ gid , pos = extr(page, '/Album/', '/', pos)
+ series, pos = extr(page, 'box-title">Series</div>', '</div>', pos)
+ lang , pos = extr(page, 'box-title">Language</div>', '</div>', pos)
+ chars , pos = extr(page, 'box-title">Characters</div>', '</div>', pos)
+ tags , pos = extr(page, 'box-title">Tags</div>', '</div>', pos)
+ artist, pos = extr(page, 'box-title">Artists</div>', '</div>', pos)
+ date , pos = extr(page, 'Uploaded', '</div>', pos)
+ lang = text.remove_html(lang) if lang else None
+
+ return {
+ "gallery_id": text.parse_int(gid),
+ "title" : text.unescape(title),
+ "artist" : text.split_html(artist),
+ "parody" : text.split_html(series),
+ "characters": text.split_html(chars),
+ "tags" : text.split_html(tags),
+ "lang" : util.language_to_code(lang),
+ "language" : lang,
+ "date" : text.remove_html(date),
+ }
+
+ def images(self, _):
+ url = self.chapter_url + "/all-pages"
+ headers = {"Accept": "application/json"}
+ images = self.request(url, headers=headers).json()
+ return [
+ (urls["full"], {"image_id": text.parse_int(image_id)})
+ for image_id, urls in sorted(images.items())
+ ]
+
+
+class SimplyhentaiImageExtractor(Extractor):
+ """Extractor for individual images from simply-hentai.com"""
+ category = "simplyhentai"
+ subcategory = "image"
+ directory_fmt = ("{category}", "{type}s")
+ filename_fmt = "{category}_{token}{title:?_//}.{extension}"
+ archive_fmt = "{token}"
+ pattern = (r"(?:https?://)?(?:www\.)?(simply-hentai\.com"
+ r"/(image|gif)/[^/?&#]+)")
+ test = (
+ (("https://www.simply-hentai.com/image"
+ "/pheromomania-vol-1-kanzenban-isao-3949d8b3-400c-4b6"), {
+ "url": "0338eb137830ab6f81e5f410d3936ef785d063d9",
+ "keyword": "e10e5588481cab68329ef6ec1e5325206b2079a2",
+ }),
+ ("https://www.simply-hentai.com/gif/8915dfcf-0b6a-47c", {
+ "url": "11c060d7ec4dfd0bd105300b6e1fd454674a5af1",
+ "keyword": "dd97a4bb449c397d6fec9f43a1303c0fb168ae65",
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.page_url = "https://www." + match.group(1)
+ self.type = match.group(2)
+
+ def items(self):
+ page = self.request(self.page_url).text
+ url_search = 'data-src="' if self.type == "image" else '<source src="'
+
+ title, pos = text.extract(page, '"og:title" content="', '"')
+ descr, pos = text.extract(page, '"og:description" content="', '"', pos)
+ url , pos = text.extract(page, url_search, '"', pos)
+
+ tags = text.extract(descr, " tagged with ", " online for free ")[0]
+ if tags:
+ tags = tags.split(", ")
+ tags[-1] = tags[-1].partition(" ")[2]
+ else:
+ tags = []
+
+ data = text.nameext_from_url(url, {
+ "title": text.unescape(title) if title else "",
+ "tags": tags,
+ "type": self.type,
+ })
+ data["token"] = data["filename"].rpartition("_")[2]
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ yield Message.Url, url, data
+
+
+class SimplyhentaiVideoExtractor(Extractor):
+ """Extractor for hentai videos from simply-hentai.com"""
+ category = "simplyhentai"
+ subcategory = "video"
+ directory_fmt = ("{category}", "{type}s")
+ filename_fmt = "{title}{episode:?_//>02}.{extension}"
+ archive_fmt = "{title}_{episode}"
+ pattern = r"(?:https?://)?(videos\.simply-hentai\.com/[^/?&#]+)"
+ test = (
+ ("https://videos.simply-hentai.com/creamy-pie-episode-02", {
+ "pattern": r"https://www\.googleapis\.com/drive/v3/files"
+ r"/0B1ecQ8ZVLm3JcHZzQzBnVy1ZUmc\?alt=media&key=[\w-]+",
+ "keyword": "29d63987fed33f0a9f4b3786d1d71b03d793250a",
+ "count": 1,
+ }),
+ (("https://videos.simply-hentai.com"
+ "/1715-tifa-in-hentai-gang-bang-3d-movie"), {
+ "url": "ad9a36ae06c601b6490e3c401834b4949d947eb0",
+ "keyword": "c561341aa3c6999f615abf1971d28fb2a83da2a7",
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.page_url = "https://" + match.group(1)
+
+ def items(self):
+ page = self.request(self.page_url).text
+
+ title, pos = text.extract(page, "<title>", "</title>")
+ tags , pos = text.extract(page, ">Tags</div>", "</div>", pos)
+ date , pos = text.extract(page, ">Upload Date</div>", "</div>", pos)
+ title = title.rpartition(" - ")[0]
+
+ if "<video" in page:
+ video_url = text.extract(page, '<source src="', '"', pos)[0]
+ episode = 0
+ else:
+ # video url from myhentai.tv embed
+ pos = page.index('<div class="video-frame-container">', pos)
+ embed_url = text.extract(page, 'src="', '"', pos)[0].replace(
+ "embedplayer.php?link=", "embed.php?name=")
+ embed_page = self.request(embed_url).text
+ video_url = text.extract(embed_page, '"file":"', '"')[0]
+ title, _, episode = title.rpartition(" Episode ")
+
+ data = text.nameext_from_url(video_url, {
+ "title": text.unescape(title),
+ "episode": text.parse_int(episode),
+ "tags": text.split_html(tags)[::2],
+ "date": text.remove_html(date),
+ "type": "video",
+ })
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ yield Message.Url, video_url, data
diff --git a/gallery_dl/extractor/slickpic.py b/gallery_dl/extractor/slickpic.py
new file mode 100644
index 0000000..127cce8
--- /dev/null
+++ b/gallery_dl/extractor/slickpic.py
@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.slickpic.com/"""
+
+from .common import Extractor, Message
+from .. import text
+import time
+
+
+BASE_PATTERN = r"(?:https?://)?([^.]+)\.slickpic\.com"
+
+
+class SlickpicExtractor(Extractor):
+ """Base class for slickpic extractors"""
+ category = "slickpic"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user = match.group(1)
+ self.root = "https://{}.slickpic.com".format(self.user)
+
+
+class SlickpicAlbumExtractor(SlickpicExtractor):
+ """Extractor for albums on slickpic.com"""
+ subcategory = "album"
+ directory_fmt = ("{category}", "{user[name]}",
+ "{album[id]} {album[title]}")
+ filename_fmt = "{num:>03}_{id}{title:?_//}.{extension}"
+ archive_fmt = "{id}"
+ pattern = BASE_PATTERN + r"/albums/([^/?&#]+)"
+ test = (
+ ("https://mattcrandall.slickpic.com/albums/LamborghiniMurcielago/", {
+ "url": "58bd94ebc80fd906e9879826970b408d54c6da07",
+ "keyword": "54a9d6f9e42ae43c644aa9316186fb9d9955fe53",
+ }),
+ ("https://mattcrandall.slickpic.com/albums/LamborghiniMurcielago/", {
+ "range": "34",
+ "content": "cec6630e659dc72db1ee1a9a6f3b525189261988",
+ }),
+ )
+
+ def __init__(self, match):
+ SlickpicExtractor.__init__(self, match)
+ self.album = match.group(2)
+
+ def items(self):
+ data = self.metadata()
+ imgs = self.images(data)
+
+ data = {
+ "album": {
+ "id" : text.parse_int(data["aid"]),
+ "title": text.unescape(data["title"]),
+ },
+ "user": {
+ "id" : text.parse_int(data["uid"]),
+ "name": text.unescape(data["user"]),
+ "nick": self.user
+ },
+ "count": len(imgs),
+ }
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for num, img in enumerate(imgs, 1):
+ url = img["url_rsz"] + "/o/" + img["fname"]
+ img = text.nameext_from_url(img["fname"], {
+ "url" : url,
+ "num" : num,
+ "id" : text.parse_int(img["id"]),
+ "width" : text.parse_int(img["width"]),
+ "height" : text.parse_int(img["height"]),
+ "title" : img["title"],
+ "description": img["descr"],
+ })
+ img.update(data)
+ yield Message.Url, url, img
+
+ def metadata(self):
+ url = "{}/albums/{}/?wallpaper".format(self.root, self.album)
+ extr = text.extract_from(self.request(url).text)
+
+ title = text.unescape(extr("<title>", "</title>"))
+ title, _, user = title.rpartition(" by ")
+
+ return {
+ "title": title,
+ "user" : user,
+ "tk" : extr('tk = "', '"'),
+ "shd" : extr('shd = "', '"'),
+ "aid" : extr('data-aid="', '"', ),
+ "uid" : extr('data-uid="', '"', ),
+ }
+
+ def images(self, data):
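+ # fetch the album's photo list from the site's XHR endpoint,
+ # using the tokens collected by metadata()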
+ url = self.root + "/xhr/photo/get/list"
+ data = {
+ "tm" : time.time(),
+ "tk" : data["tk"],
+ "shd" : data["shd"],
+ "aid" : data["aid"],
+ "uid" : data["uid"],
+ "col" : "0",
+ "sys" : self.album,
+ "vw" : "1280",
+ "vh" : "1024",
+ "skey" : "",
+ "viewer": "false",
+ "pub" : "1",
+ "sng" : "0",
+ "whq" : "1",
+ }
+ return self.request(url, method="POST", data=data).json()["list"]
+
+
+class SlickpicUserExtractor(SlickpicExtractor):
+ subcategory = "user"
+ pattern = BASE_PATTERN + r"(?:/gallery)?/?(?:$|[?#])"
+ test = (
+ ("https://mattcrandall.slickpic.com/gallery/", {
+ "count": ">= 358",
+ "pattern": SlickpicAlbumExtractor.pattern,
+ }),
+ ("https://mattcrandall.slickpic.com/"),
+ )
+
+ def items(self):
+ page = self.request(self.root + "/gallery?viewer").text
+ data = {"_extractor": SlickpicAlbumExtractor}
+ base = self.root + "/albums/"
+
+ yield Message.Version, 1
+ for album in text.extract_iter(page, 'href="' + base, '"'):
+ yield Message.Queue, base + album, data
diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py
new file mode 100644
index 0000000..30420a8
--- /dev/null
+++ b/gallery_dl/extractor/slideshare.py
@@ -0,0 +1,86 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann, Leonardo Taccari
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.slideshare.net/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class SlidesharePresentationExtractor(Extractor):
+ """Extractor for images from a presentation on slideshare.net"""
+ category = "slideshare"
+ subcategory = "presentation"
+ directory_fmt = ("{category}", "{user}")
+ filename_fmt = "{presentation}-{num:>02}.{extension}"
+ archive_fmt = "{presentation}_{num}"
+ pattern = (r"(?:https?://)?(?:www\.)?slideshare\.net"
+ r"/(?:mobile/)?([^/?&#]+)/([^/?&#]+)")
+ test = (
+ (("https://www.slideshare.net"
+ "/Slideshare/get-started-with-slide-share"), {
+ "url": "23685fb9b94b32c77a547d45dc3a82fe7579ea18",
+ "content": "ee54e54898778e92696a7afec3ffabdbd98eb0cc",
+ }),
+ # long title
+ (("https://www.slideshare.net/pragmaticsolutions/warum-sie-nicht-ihren"
+ "-mitarbeitenden-ndern-sollten-sondern-ihr-managementsystem"), {
+ "url": "cf70ca99f57f61affab47ebf8583eb564b21e3a7",
+ }),
+ # mobile URL
+ (("https://www.slideshare.net"
+ "/mobile/uqudent/introduction-to-fixed-prosthodontics"), {
+ "url": "59993ad7b0cb93c73011547eedcd02c622649e9d",
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user, self.presentation = match.groups()
+
+ def items(self):
+ page = self.request("https://www.slideshare.net/" + self.user +
+ "/" + self.presentation).text
+ data = self.get_job_metadata(page)
+ imgs = self.get_image_urls(page)
+ data["count"] = len(imgs)
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for data["num"], url in enumerate(imgs, 1):
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+ def get_job_metadata(self, page):
+ """Collect metadata for extractor-job"""
+ descr, pos = text.extract(
+ page, '<meta name="description" content="', '"')
+ title, pos = text.extract(
+ page, '<span class="j-title-breadcrumb">', '</span>', pos)
+ views, pos = text.extract(
+ page, '<span class="notranslate pippin-data">', 'views<', pos)
+ published, pos = text.extract(
+ page, '<time datetime="', '"', pos)
+ alt_descr, pos = text.extract(
+ page, 'id="slideshow-description-paragraph" class="notranslate">',
+ '</p>', pos)
+
+ if descr.endswith("…") and alt_descr:
+ descr = text.remove_html(alt_descr).strip()
+
+ return {
+ "user": self.user,
+ "presentation": self.presentation,
+ "title": text.unescape(title.strip()),
+ "description": text.unescape(descr),
+ "views": text.parse_int(views.replace(",", "")),
+ "published": published,
+ }
+
+ @staticmethod
+ def get_image_urls(page):
+ """Extract and return a list of all image-urls"""
+ return list(text.extract_iter(page, 'data-full="', '"'))
diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py
new file mode 100644
index 0000000..80348ae
--- /dev/null
+++ b/gallery_dl/extractor/smugmug.py
@@ -0,0 +1,316 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.smugmug.com/"""
+
+from .common import Extractor, Message
+from .. import text, oauth, exception
+
+BASE_PATTERN = (
+ r"(?:smugmug:(?!album:)(?:https?://)?([^/]+)|"
+ r"(?:https?://)?([^.]+)\.smugmug\.com)")
+
+
+class SmugmugExtractor(Extractor):
+ """Base class for smugmug extractors"""
+ category = "smugmug"
+ filename_fmt = ("{category}_{User[NickName]:?/_/}"
+ "{Image[UploadKey]}_{Image[ImageKey]}.{extension}")
+ empty_user = {
+ "Uri": "",
+ "ResponseLevel": "Public",
+ "Name": "",
+ "NickName": "",
+ "QuickShare": False,
+ "RefTag": "",
+ "ViewPassHint": "",
+ "WebUri": "",
+ "Uris": None,
+ }
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.api = SmugmugAPI(self)
+ self.videos = self.config("videos", True)
+ self.session = self.api.session
+
+ def _select_format(self, image):
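+ # prefer the original image if available; otherwise pick the
+ # largest listed image or video rendition by width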
+ details = image["Uris"]["ImageSizeDetails"]
+ media = None
+
+ if self.videos and image["IsVideo"]:
+ fltr = "VideoSize"
+ elif "ImageSizeOriginal" in details:
+ media = details["ImageSizeOriginal"]
+ else:
+ fltr = "ImageSize"
+
+ if not media:
+ sizes = filter(lambda s: s[0].startswith(fltr), details.items())
+ media = max(sizes, key=lambda s: s[1]["Width"])[1]
+ del image["Uris"]
+
+ for key in ("Url", "Width", "Height", "MD5", "Size", "Watermarked",
+ "Bitrate", "Duration"):
+ if key in media:
+ image[key] = media[key]
+ return image["Url"]
+
+
+class SmugmugAlbumExtractor(SmugmugExtractor):
+ """Extractor for smugmug albums"""
+ subcategory = "album"
+ directory_fmt = ("{category}", "{User[NickName]}", "{Album[Name]}")
+ archive_fmt = "a_{Album[AlbumKey]}_{Image[ImageKey]}"
+ pattern = r"smugmug:album:([^:]+)$"
+ test = (
+ ("smugmug:album:ddvxpg", {
+ "url": "0429e9bf50ee600674e448934e3882ca1761ae7b",
+ }),
+ # empty
+ ("smugmug:album:SXvjbW", {
+ "count": 0,
+ }),
+ # no "User"
+ ("smugmug:album:6VRT8G", {
+ "url": "c4a0f4c4bfd514b93cbdeb02b3345bf7ef6604df",
+ }),
+ )
+
+ def __init__(self, match):
+ SmugmugExtractor.__init__(self, match)
+ self.album_id = match.group(1)
+
+ def items(self):
+ album = self.api.album(self.album_id, "User")
+ user = album["Uris"].get("User") or self.empty_user.copy()
+
+ del user["Uris"]
+ del album["Uris"]
+ data = {"Album": album, "User": user}
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ for image in self.api.album_images(self.album_id, "ImageSizeDetails"):
+ url = self._select_format(image)
+ data["Image"] = image
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+
+class SmugmugImageExtractor(SmugmugExtractor):
+ """Extractor for individual smugmug images"""
+ subcategory = "image"
+ archive_fmt = "{Image[ImageKey]}"
+ pattern = BASE_PATTERN + r"(?:/[^/?&#]+)+/i-([^/?&#-]+)"
+ test = (
+ ("https://acapella.smugmug.com/Micro-Macro/Drops/i-g2Dmf9z", {
+ "url": "78f0bf3516b6d670b7319216bdeccb35942ca4cf",
+ "keyword": "b298ef7ed2b1918263b6a7dc6f56e54401584381",
+ "content": "64a8f69a1d824921eebbdf2420087937adfa45cd",
+ }),
+ # video
+ ("https://tstravels.smugmug.com/Dailies/Daily-Dose-2015/i-39JFNzB", {
+ "url": "04d0ab1ff829ca7d78f5acb5548953df08e9a5ee",
+ "keyword": "c708c4b9527a2fb29396c19f7628f9cf4b0b3a39",
+ }),
+ )
+
+ def __init__(self, match):
+ SmugmugExtractor.__init__(self, match)
+ self.image_id = match.group(3)
+
+ def items(self):
+ image = self.api.image(self.image_id, "ImageSizeDetails")
+ url = self._select_format(image)
+
+ data = {"Image": image}
+ text.nameext_from_url(url, data)
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ yield Message.Url, url, data
+
+
+class SmugmugPathExtractor(SmugmugExtractor):
+ """Extractor for smugmug albums from URL paths and users"""
+ subcategory = "path"
+ pattern = BASE_PATTERN + r"((?:/[^/?&#a-fh-mo-z][^/?&#]*)*)/?$"
+ test = (
+ ("https://acapella.smugmug.com/Micro-Macro/Drops/", {
+ "pattern": "smugmug:album:ddvxpg$",
+ }),
+ ("https://acapella.smugmug.com/", {
+ "pattern": SmugmugAlbumExtractor.pattern,
+ "url": "797eb1cbbf5ad8ecac8ee4eedc6466ed77a65d68",
+ }),
+ # gallery node without owner
+ ("https://www.smugmug.com/gallery/n-GLCjnD/", {
+ "pattern": "smugmug:album:6VRT8G$",
+ }),
+ # custom domain
+ ("smugmug:www.creativedogportraits.com/PortfolioGallery/", {
+ "pattern": "smugmug:album:txWXzs$",
+ }),
+ ("smugmug:www.creativedogportraits.com/", {
+ "pattern": "smugmug:album:txWXzs$",
+ }),
+ ("smugmug:https://www.creativedogportraits.com/"),
+ )
+
+ def __init__(self, match):
+ SmugmugExtractor.__init__(self, match)
+ self.domain, self.user, self.path = match.groups()
+
+ def items(self):
+ yield Message.Version, 1
+
+ if not self.user:
+ self.user = self.api.site_user(self.domain)["NickName"]
+
+ if self.path:
+ if self.path.startswith("/gallery/n-"):
+ node = self.api.node(self.path[11:])
+ else:
+ data = self.api.user_urlpathlookup(self.user, self.path)
+ node = data["Uris"]["Node"]
+
+ if node["Type"] == "Album":
+ nodes = (node,)
+ elif node["Type"] == "Folder":
+ nodes = self.album_nodes(node)
+ else:
+ nodes = ()
+
+ for node in nodes:
+ album_id = node["Uris"]["Album"].rpartition("/")[2]
+ node["_extractor"] = SmugmugAlbumExtractor
+ yield Message.Queue, "smugmug:album:" + album_id, node
+
+ else:
+ for album in self.api.user_albums(self.user):
+ uri = "smugmug:album:" + album["AlbumKey"]
+ album["_extractor"] = SmugmugAlbumExtractor
+ yield Message.Queue, uri, album
+
+ def album_nodes(self, root):
+ """Yield all descendant album nodes of 'root'"""
+ for node in self.api.node_children(root["NodeID"]):
+ if node["Type"] == "Album":
+ yield node
+ elif node["Type"] == "Folder":
+ yield from self.album_nodes(node)
+
+
+class SmugmugAPI(oauth.OAuth1API):
+ """Minimal interface for the smugmug API v2"""
+ API_DOMAIN = "api.smugmug.com"
+ API_KEY = "DFqxg4jf7GrtsQ5PnbNB8899zKfnDrdK"
+ API_SECRET = ("fknV35p9r9BwZC4XbTzvCXpcSJRdD83S"
+ "9nMFQm25ndGBzNPnwRDbRnnVBvqt4xTq")
+ HEADERS = {"Accept": "application/json"}
+
+ def album(self, album_id, expands=None):
+ return self._expansion("album/" + album_id, expands)
+
+ def image(self, image_id, expands=None):
+ return self._expansion("image/" + image_id, expands)
+
+ def node(self, node_id, expands=None):
+ return self._expansion("node/" + node_id, expands)
+
+ def user(self, username, expands=None):
+ return self._expansion("user/" + username, expands)
+
+ def album_images(self, album_id, expands=None):
+ return self._pagination("album/" + album_id + "!images", expands)
+
+ def node_children(self, node_id, expands=None):
+ return self._pagination("node/" + node_id + "!children", expands)
+
+ def user_albums(self, username, expands=None):
+ return self._pagination("user/" + username + "!albums", expands)
+
+ def site_user(self, domain):
+ return self._call("!siteuser", domain=domain)["Response"]["User"]
+
+ def user_urlpathlookup(self, username, path):
+ endpoint = "user/" + username + "!urlpathlookup"
+ params = {"urlpath": path}
+ return self._expansion(endpoint, "Node", params)
+
+ def _call(self, endpoint, params=None, domain=API_DOMAIN):
+ url = "https://{}/api/v2/{}".format(domain, endpoint)
+ params = params or {}
+ if self.api_key:
+ params["APIKey"] = self.api_key
+ params["_verbosity"] = "1"
+
+ response = self.request(url, params=params, headers=self.HEADERS)
+ data = response.json()
+
+ if 200 <= data["Code"] < 400:
+ return data
+ if data["Code"] == 404:
+ raise exception.NotFoundError()
+ if data["Code"] == 429:
+ self.log.error("Rate limit reached")
+ else:
+ self.log.error("API request failed")
+ self.log.debug(data)
+ raise exception.StopExtraction()
+
+ def _expansion(self, endpoint, expands, params=None):
+ endpoint = self._extend(endpoint, expands)
+ result = self._apply_expansions(self._call(endpoint, params), expands)
+ if not result:
+ raise exception.NotFoundError()
+ return result[0]
+
+ def _pagination(self, endpoint, expands=None):
+ endpoint = self._extend(endpoint, expands)
+ params = {"start": 1, "count": 100}
+
+ while True:
+ data = self._call(endpoint, params)
+ yield from self._apply_expansions(data, expands)
+
+ if "NextPage" not in data["Response"]["Pages"]:
+ return
+ params["start"] += params["count"]
+
+ @staticmethod
+ def _extend(endpoint, expands):
+ if expands:
+ endpoint += "?_expand=" + expands
+ return endpoint
+
+ @staticmethod
+ def _apply_expansions(data, expands):
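+ # replace expansion URIs inside each object's 'Uris' mapping
+ # with the expanded objects delivered alongside the response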
+
+ def unwrap(response):
+ locator = response["Locator"]
+ return response[locator] if locator in response else []
+
+ objs = unwrap(data["Response"])
+ if not isinstance(objs, list):
+ objs = (objs,)
+
+ if "Expansions" in data:
+ expansions = data["Expansions"]
+ expands = expands.split(",")
+
+ for obj in objs:
+ uris = obj["Uris"]
+
+ for name in expands:
+ if name in uris:
+ uri = uris[name]
+ uris[name] = unwrap(expansions[uri])
+
+ return objs
diff --git a/gallery_dl/extractor/test.py b/gallery_dl/extractor/test.py
new file mode 100644
index 0000000..2f4992c
--- /dev/null
+++ b/gallery_dl/extractor/test.py
@@ -0,0 +1,86 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2017 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Utility extractor to execute tests of other extractors"""
+
+from .common import Extractor, Message
+from .. import extractor, exception
+
+
+class TestExtractor(Extractor):
+ """Extractor to select and run the test URLs of other extractors
+
+ The general form is 'test:<categories>:<subcategories>:<indices>', where
+ <categories> and <subcategories> are comma-separated (sub)category names
+ and <indices> is a comma-separated list of array indices.
+ To select all possible values for a field use the star '*' character or
+ leave the field empty.
+
+ Examples:
+ - test:pixiv
+ run all pixiv tests
+
+ - test:pixiv:user,favorite:0
+ run the first test of the PixivUser- and PixivFavoriteExtractor
+
+ - test:
+ run all tests
+ """
+ category = "test"
+ pattern = r"t(?:est)?:([^:]*)(?::([^:]*)(?::(\*|[\d,]*))?)?$"
+ test = (
+ ("test:pixiv"),
+ ("test:pixiv:user,favorite:0"),
+ ("test:"),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ categories, subcategories, indices = match.groups()
+ self.categories = self._split(categories)
+ self.subcategories = self._split(subcategories)
+ self.indices = self._split(indices) or self
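+ # no indices given: fall back to 'self', whose __contains__
+ # always returns True, selecting every test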
+
+ def items(self):
+ extractors = extractor.extractors()
+
+ if self.categories:
+ extractors = [
+ extr for extr in extractors
+ if extr.category in self.categories
+ ]
+
+ if self.subcategories:
+ extractors = [
+ extr for extr in extractors
+ if extr.subcategory in self.subcategories
+ ]
+
+ tests = [
+ test
+ for extr in extractors
+ for index, test in enumerate(extr._get_tests())
+ if str(index) in self.indices
+ ]
+
+ if not tests:
+ raise exception.NotFoundError("test")
+
+ yield Message.Version, 1
+ for test in tests:
+ yield Message.Queue, test[0], {}
+
+ @staticmethod
+ def __contains__(_):
+ return True
+
+ @staticmethod
+ def _split(value):
+ if value and value != "*":
+ return value.split(",")
+ return None
diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py
new file mode 100644
index 0000000..62a9173
--- /dev/null
+++ b/gallery_dl/extractor/tsumino.py
@@ -0,0 +1,343 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.tsumino.com/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text, exception
+from ..cache import cache
+
+
+class TsuminoBase():
+ """Base class for tsumino extractors"""
+ category = "tsumino"
+ cookiedomain = "www.tsumino.com"
+ root = "https://www.tsumino.com"
+
+ def login(self):
+ username, password = self._get_auth_info()
+ if username:
+ self._update_cookies(self._login_impl(username, password))
+ else:
+ self.session.cookies.setdefault(
+ "ASP.NET_SessionId", "x1drgggilez4cpkttneukrc5")
+
+ @cache(maxage=14*24*3600, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+ url = "{}/Account/Login".format(self.root)
+ headers = {"Referer": url}
+ data = {"Username": username, "Password": password}
+
+ response = self.request(url, method="POST", headers=headers, data=data)
+ if not response.history:
+ raise exception.AuthenticationError()
+ return {".aotsumino": response.history[0].cookies[".aotsumino"]}
+
+
+class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
+ """Extractor for image galleries on tsumino.com"""
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?tsumino\.com"
+ r"/(?:Book/Info|Read/View)/(\d+)")
+ test = (
+ ("https://www.tsumino.com/Book/Info/40996", {
+ "url": "84bf30a86623039fc87855680fada884dc8a1ddd",
+ "keyword": {
+ "title" : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou",
+ "title_en" : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou",
+ "title_jp" : "シコシコ大好きナイチンゲール + 会場限定おまけ本",
+ "gallery_id": 40996,
+ "date" : "2018 June 29",
+ "count" : 42,
+ "collection": "",
+ "artist" : ["Itou Life"],
+ "group" : ["Itou Life"],
+ "parody" : ["Fate/Grand Order"],
+ "characters": list,
+ "tags" : list,
+ "type" : "Doujinshi",
+ "rating" : float,
+ "uploader" : "sehki",
+ "lang" : "en",
+ "language" : "English",
+ "thumbnail" : "http://www.tsumino.com/Image/Thumb/40996",
+ },
+ }),
+ ("https://www.tsumino.com/Read/View/45834"),
+ )
+
+ def __init__(self, match):
+ self.gallery_id = match.group(1)
+ url = "{}/Book/Info/{}".format(self.root, self.gallery_id)
+ GalleryExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+ title = extr('"og:title" content="', '"')
+ title_en, _, title_jp = text.unescape(title).partition("/")
+ title_en = title_en.strip()
+ title_jp = title_jp.strip()
+
+ return {
+ "gallery_id": text.parse_int(self.gallery_id),
+ "title" : title_en or title_jp,
+ "title_en" : title_en,
+ "title_jp" : title_jp,
+ "thumbnail" : extr('"og:image" content="', '"'),
+ "uploader" : text.remove_html(extr('id="Uploader">', '</div>')),
+ "date" : extr('id="Uploaded">', '</div>').strip(),
+ "rating" : text.parse_float(extr(
+ 'id="Rating">', '</div>').partition(" ")[0]),
+ "type" : text.remove_html(extr('id="Category">' , '</div>')),
+ "collection": text.remove_html(extr('id="Collection">', '</div>')),
+ "group" : text.split_html(extr('id="Group">' , '</div>')),
+ "artist" : text.split_html(extr('id="Artist">' , '</div>')),
+ "parody" : text.split_html(extr('id="Parody">' , '</div>')),
+ "characters": text.split_html(extr('id="Character">' , '</div>')),
+ "tags" : text.split_html(extr('id="Tag">' , '</div>')),
+ "language" : "English",
+ "lang" : "en",
+ }
+
+ def images(self, page):
+ url = "{}/Read/Load/?q={}".format(self.root, self.gallery_id)
+ headers = {"Referer": self.chapter_url}
+ response = self.request(url, headers=headers, expect=(404,))
+
+ if response.status_code == 404:
+ url = "{}/Read/View/{}".format(self.root, self.gallery_id)
+ self.log.error(
+ "Failed to get gallery JSON data. Visit '%s' in a browser "
+ "and solve the CAPTCHA to continue.", url)
+ raise exception.StopExtraction()
+
+ base = self.root + "/Image/Object?name="
+ return [
+ (base + text.quote(name), None)
+ for name in response.json()["reader_page_urls"]
+ ]
+
+
+class TsuminoSearchExtractor(TsuminoBase, Extractor):
+ """Extractor for search results on tsumino.com"""
+ subcategory = "search"
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?tsumino\.com"
+ r"/(?:Books/?)?#(.+)")
+ test = (
+ ("https://www.tsumino.com/Books#?Character=Reimu+Hakurei", {
+ "pattern": TsuminoGalleryExtractor.pattern,
+ "range": "1-40",
+ "count": 40,
+ }),
+ (("http://www.tsumino.com/Books#~(Tags~(~"
+ "(Type~7~Text~'Reimu*20Hakurei~Exclude~false)~"
+ "(Type~'1~Text~'Pantyhose~Exclude~false)))#"), {
+ "pattern": TsuminoGalleryExtractor.pattern,
+ "count": ">= 3",
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.query = match.group(1)
+
+ def items(self):
+ yield Message.Version, 1
+ for gallery in self.galleries():
+ url = "{}/Book/Info/{}".format(self.root, gallery["Id"])
+ gallery["_extractor"] = TsuminoGalleryExtractor
+ yield Message.Queue, url, gallery
+
+ def galleries(self):
+ """Return all gallery results matching 'self.query'"""
+ url = "{}/Books/Operate".format(self.root)
+ headers = {
+ "Referer": "{}/".format(self.root),
+ "X-Requested-With": "XMLHttpRequest",
+ }
+ data = {
+ "PageNumber": 1,
+ "Text": "",
+ "Sort": "Newest",
+ "List": "0",
+ "Length": "0",
+ "MinimumRating": "0",
+ "ExcludeList": "0",
+ "CompletelyExcludeHated": "false",
+ }
+ data.update(self._parse(self.query))
+
+ while True:
+ info = self.request(
+ url, method="POST", headers=headers, data=data).json()
+
+ for gallery in info["Data"]:
+ yield gallery["Entry"]
+
+ if info["PageNumber"] >= info["PageCount"]:
+ return
+ data["PageNumber"] += 1
+
+ def _parse(self, query):
+ try:
+ if query.startswith("?"):
+ return self._parse_simple(query)
+ return self._parse_jsurl(query)
+ except Exception as exc:
+ self.log.error("Invalid search query: '%s' (%s)", query, exc)
+ raise exception.StopExtraction()
+
+ @staticmethod
+ def _parse_simple(query):
+ """Parse search query with format '?<key>=value>'"""
+ key, _, value = query.partition("=")
+ tag_types = {
+ "Tag": "1",
+ "Category": "2",
+ "Collection": "3",
+ "Group": "4",
+ "Artist": "5",
+ "Parody": "6",
+ "Character": "7",
+ "Uploader": "100",
+ }
+
+ return {
+ "Tags[0][Type]": tag_types[key[1:].capitalize()],
+ "Tags[0][Text]": text.unquote(value).replace("+", " "),
+ "Tags[0][Exclude]": "false",
+ }
+
+ @staticmethod
+ def _parse_jsurl(data):
+ """Parse search query in JSURL format
+
+ Nested lists and dicts are handled in a special way to deal
+ with the way Tsumino expects its parameters -> expand(...)
+
+ Example: ~(name~'John*20Doe~age~42~children~(~'Mary~'Bill))
+ Ref: https://github.com/Sage/jsurl
+ """
+ if not data:
+ return {}
+ i = 0
+ imax = len(data)
+
+ def eat(expected):
+ nonlocal i
+
+ if data[i] != expected:
+ error = "bad JSURL syntax: expected '{}', got {}".format(
+ expected, data[i])
+ raise ValueError(error)
+ i += 1
+
+ def decode():
+ nonlocal i
+
+ beg = i
+ result = ""
+
+ while i < imax:
+ ch = data[i]
+
+ if ch not in "~)*!":
+ i += 1
+
+ elif ch == "*":
+ if beg < i:
+ result += data[beg:i]
+ if data[i + 1] == "*":
+ result += chr(int(data[i+2:i+6], 16))
+ i += 6
+ else:
+ result += chr(int(data[i+1:i+3], 16))
+ i += 3
+ beg = i
+
+ elif ch == "!":
+ if beg < i:
+ result += data[beg:i]
+ result += "$"
+ i += 1
+ beg = i
+
+ else:
+ break
+
+ return result + data[beg:i]
+
+ def parse_one():
+ nonlocal i
+
+ eat('~')
+ result = ""
+ ch = data[i]
+
+ if ch == "(":
+ i += 1
+
+ if data[i] == "~":
+ result = []
+ if data[i+1] == ")":
+ i += 1
+ else:
+ result.append(parse_one())
+ while data[i] == "~":
+ result.append(parse_one())
+
+ else:
+ result = {}
+
+ if data[i] != ")":
+ while True:
+ key = decode()
+ value = parse_one()
+ for ekey, evalue in expand(key, value):
+ result[ekey] = evalue
+ if data[i] != "~":
+ break
+ i += 1
+ eat(")")
+
+ elif ch == "'":
+ i += 1
+ result = decode()
+
+ else:
+ beg = i
+ i += 1
+
+ while i < imax and data[i] not in "~)":
+ i += 1
+
+ sub = data[beg:i]
+ if ch in "0123456789-":
+ fval = float(sub)
+ ival = int(fval)
+ result = ival if ival == fval else fval
+ else:
+ if sub not in ("true", "false", "null"):
+ raise ValueError("bad value keyword: " + sub)
+ result = sub
+
+ return result
+
+ def expand(key, value):
+ if isinstance(value, list):
+ for index, cvalue in enumerate(value):
+ ckey = "{}[{}]".format(key, index)
+ yield from expand(ckey, cvalue)
+ elif isinstance(value, dict):
+ for ckey, cvalue in value.items():
+ ckey = "{}[{}]".format(key, ckey)
+ yield from expand(ckey, cvalue)
+ else:
+ yield key, value
+
+ return parse_one()
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
new file mode 100644
index 0000000..5679cdc
--- /dev/null
+++ b/gallery_dl/extractor/tumblr.py
@@ -0,0 +1,425 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.tumblr.com/"""
+
+from .common import Extractor, Message
+from .. import text, oauth, extractor, exception
+from datetime import datetime, timedelta
+import re
+import time
+
+
+def _original_inline_image(url):
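+ # rewrite an inline image URL to its 1280px variant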
+ return re.sub(
+ (r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
+ r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)"),
+ r"https://\1_1280.\2", url
+ )
+
+
+def _original_video(url):
+ return re.sub(
+ (r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
+ r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)"),
+ r"https://\1.\2", url
+ )
+
+
+POST_TYPES = frozenset((
+ "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
+
+BASE_PATTERN = (
+ r"(?:tumblr:(?:https?://)?([^/]+)|"
+ r"(?:https?://)?([^.]+\.tumblr\.com))")
+
+
+class TumblrExtractor(Extractor):
+ """Base class for tumblr extractors"""
+ category = "tumblr"
+ directory_fmt = ("{category}", "{name}")
+ filename_fmt = "{category}_{blog_name}_{id}_{num:>02}.{extension}"
+ archive_fmt = "{id}_{num}"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.blog = match.group(1) or match.group(2)
+ self.api = TumblrAPI(self)
+
+ self.types = self._setup_posttypes()
+ self.avatar = self.config("avatar", False)
+ self.inline = self.config("inline", True)
+ self.reblogs = self.config("reblogs", True)
+ self.external = self.config("external", False)
+
+ if len(self.types) == 1:
+ self.api.posts_type = next(iter(self.types))
+ elif not self.types:
+ self.log.warning("no valid post types selected")
+
+ if self.reblogs == "same-blog":
+ self._skip_reblog = self._skip_reblog_same_blog
+
+ def items(self):
+ blog = None
+ yield Message.Version, 1
+
+ for post in self.posts():
+ if post["type"] not in self.types:
+ continue
+ if not blog:
+ blog = self.api.info(self.blog)
+ blog["uuid"] = self.blog
+ yield Message.Directory, blog.copy()
+
+ if self.avatar:
+ url = self.api.avatar(self.blog)
+ yield self._prepare_avatar(url, post.copy(), blog)
+
+ reblog = "reblogged_from_id" in post
+ if reblog and self._skip_reblog(post):
+ continue
+ post["reblogged"] = reblog
+
+ post["blog"] = blog
+ post["date"] = text.parse_timestamp(post["timestamp"])
+ post["num"] = 0
+
+ if "trail" in post:
+ del post["trail"]
+
+ if "photos" in post: # type "photo" or "link"
+ photos = post["photos"]
+ del post["photos"]
+
+ for photo in photos:
+ post["photo"] = photo
+ photo.update(photo["original_size"])
+ del photo["original_size"]
+ del photo["alt_sizes"]
+ yield self._prepare_image(photo["url"], post)
+
+ url = post.get("audio_url") # type: "audio"
+ if url:
+ yield self._prepare(url, post)
+
+ url = post.get("video_url") # type: "video"
+ if url:
+ yield self._prepare(_original_video(url), post)
+
+ if self.inline and "reblog" in post: # inline media
+ # only "chat" posts are missing a "reblog" key in their
+ # API response, but they can't contain images/videos anyway
+ body = post["reblog"]["comment"] + post["reblog"]["tree_html"]
+ for url in re.findall('<img src="([^"]+)"', body):
+ url = _original_inline_image(url)
+ yield self._prepare_image(url, post)
+ for url in re.findall('<source src="([^"]+)"', body):
+ url = _original_video(url)
+ yield self._prepare(url, post)
+
+ if self.external: # external links
+ post["extension"] = None
+ with extractor.blacklist(("tumblr",)):
+ for key in ("permalink_url", "url"):
+ url = post.get(key)
+ if url:
+ yield Message.Queue, url, post
+ break
+
+ def posts(self):
+ """Return an iterable containing all relevant posts"""
+
+ def _setup_posttypes(self):
+ types = self.config("posts", "all")
+
+ if types == "all":
+ return POST_TYPES
+
+ elif not types:
+ return frozenset()
+
+ else:
+ if isinstance(types, str):
+ types = types.split(",")
+ types = frozenset(types)
+
+ invalid = types - POST_TYPES
+ if invalid:
+ types = types & POST_TYPES
+ self.log.warning('invalid post types: "%s"',
+ '", "'.join(sorted(invalid)))
+ return types
+
+ @staticmethod
+ def _prepare(url, post):
+ text.nameext_from_url(url, post)
+ post["num"] += 1
+ post["hash"] = post["filename"].partition("_")[2]
+ return Message.Url, url, post
+
+ @staticmethod
+ def _prepare_image(url, post):
+ text.nameext_from_url(url, post)
+ post["num"] += 1
+
+ parts = post["filename"].split("_")
+ try:
+ post["hash"] = parts[1] if parts[1] != "inline" else parts[2]
+ except IndexError:
+ # filename doesn't follow the usual pattern (#129)
+ post["hash"] = post["filename"]
+
+ return Message.Url, url, post
+
+ @staticmethod
+ def _prepare_avatar(url, post, blog):
+ text.nameext_from_url(url, post)
+ post["num"] = 1
+ post["blog"] = blog
+ post["reblogged"] = False
+ post["type"] = post["id"] = post["hash"] = "avatar"
+ return Message.Url, url, post
+
+ def _skip_reblog(self, _):
+ return not self.reblogs
+
+ def _skip_reblog_same_blog(self, post):
+ return self.blog != post["reblogged_root_uuid"]
+
+
+class TumblrUserExtractor(TumblrExtractor):
+ """Extractor for all images from a tumblr-user"""
+ subcategory = "user"
+ pattern = BASE_PATTERN + r"(?:/page/\d+|/archive)?/?$"
+ test = (
+ ("http://demo.tumblr.com/", {
+ "pattern": r"https://\d+\.media\.tumblr\.com"
+ r"/tumblr_[^/_]+_\d+\.jpg",
+ "count": 1,
+ "options": (("posts", "photo"),),
+ }),
+ ("http://demo.tumblr.com/", {
+ "pattern": (r"https?://(?:$|"
+ r"\d+\.media\.tumblr\.com/.+_1280\.jpg|"
+ r"w+\.tumblr\.com/audio_file/demo/\d+/tumblr_\w+)"),
+ "count": 3,
+ "options": (("posts", "all"), ("external", True))
+ }),
+ ("https://mikf123-hidden.tumblr.com/", { # dashbord-only
+ "count": 2,
+ "keyword": {"tags": ["test", "hidden"]},
+ }),
+ ("https://mikf123-private.tumblr.com/", { # password protected
+ "count": 2,
+ "keyword": {"tags": ["test", "private"]},
+ }),
+ ("https://mikf123-private-hidden.tumblr.com/", { # both
+ "count": 2,
+ "keyword": {"tags": ["test", "private", "hidden"]},
+ }),
+ ("https://demo.tumblr.com/page/2"),
+ ("https://demo.tumblr.com/archive"),
+ ("tumblr:http://www.b-authentique.com/"),
+ ("tumblr:www.b-authentique.com"),
+ )
+
+ def posts(self):
+ return self.api.posts(self.blog, {})
+
+
+class TumblrPostExtractor(TumblrExtractor):
+ """Extractor for images from a single post on tumblr"""
+ subcategory = "post"
+ pattern = BASE_PATTERN + r"/(?:post|image)/(\d+)"
+ test = (
+ ("http://demo.tumblr.com/post/459265350", {
+ "pattern": (r"https://\d+\.media\.tumblr\.com"
+ r"/tumblr_[^/_]+_1280.jpg"),
+ "count": 1,
+ }),
+ ("https://mikf123.tumblr.com/post/167770226574/text-post", {
+ "count": 2,
+ }),
+ ("https://mikf123.tumblr.com/post/181022561719/quote-post", {
+ "count": 1,
+ }),
+ ("https://mikf123.tumblr.com/post/167623351559/link-post", {
+ "count": 2,
+ }),
+ ("https://muyanna.tumblr.com/post/180692431632/answer-post", {
+ "count": 1,
+ }),
+ ("https://mikf123.tumblr.com/post/167633596145/video-post", {
+ "count": 2,
+ }),
+ ("https://mikf123.tumblr.com/post/167770026604/audio-post", {
+ "count": 2,
+ }),
+ ("https://mikf123.tumblr.com/post/172687798174/photo-post", {
+ "count": 4,
+ }),
+ ("https://mikf123.tumblr.com/post/181022380064/chat-post", {
+ "count": 0,
+ }),
+ ("http://pinetre-3.tumblr.com/post/181904381470/via", {
+ "count": 0, # audio post with "null" as URL (#165)
+ }),
+ ("http://ziemniax.tumblr.com/post/109697912859/", {
+ "exception": exception.NotFoundError, # HTML response (#297)
+ }),
+ ("http://demo.tumblr.com/image/459265350"),
+ )
+
+ def __init__(self, match):
+ TumblrExtractor.__init__(self, match)
+ self.post_id = match.group(3)
+ self.reblogs = True
+
+ def posts(self):
+ return self.api.posts(self.blog, {"id": self.post_id})
+
+ @staticmethod
+ def _setup_posttypes():
+ return POST_TYPES
+
+
+class TumblrTagExtractor(TumblrExtractor):
+ """Extractor for images from a tumblr-user by tag"""
+ subcategory = "tag"
+ pattern = BASE_PATTERN + r"/tagged/([^/?&#]+)"
+ test = ("http://demo.tumblr.com/tagged/Times%20Square", {
+ "pattern": (r"https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg"),
+ "count": 1,
+ })
+
+ def __init__(self, match):
+ TumblrExtractor.__init__(self, match)
+ self.tag = text.unquote(match.group(3))
+
+ def posts(self):
+ return self.api.posts(self.blog, {"tag": self.tag})
+
+
+class TumblrLikesExtractor(TumblrExtractor):
+ """Extractor for images from a tumblr-user's liked posts"""
+ subcategory = "likes"
+ directory_fmt = ("{category}", "{name}", "likes")
+ archive_fmt = "f_{blog[name]}_{id}_{num}"
+ pattern = BASE_PATTERN + r"/likes"
+ test = ("http://mikf123.tumblr.com/likes", {
+ "count": 1,
+ })
+
+ def posts(self):
+ return self.api.likes(self.blog)
+
+
+class TumblrAPI(oauth.OAuth1API):
+ """Minimal interface for the Tumblr API v2"""
+ API_KEY = "O3hU2tMi5e4Qs5t3vezEi6L0qRORJ5y9oUpSGsrWu8iA3UCc3B"
+ API_SECRET = "sFdsK3PDdP2QpYMRAoq0oDnw0sFS24XigXmdfnaeNZpJpqAn03"
+ BLOG_CACHE = {}
+
+ def __init__(self, extractor):
+ oauth.OAuth1API.__init__(self, extractor)
+ self.posts_type = None
+
+ def info(self, blog):
+ """Return general information about a blog"""
+ if blog not in self.BLOG_CACHE:
+ self.BLOG_CACHE[blog] = self._call(blog, "info", {})["blog"]
+ return self.BLOG_CACHE[blog]
+
+ def avatar(self, blog, size="512"):
+ """Retrieve a blog avatar"""
+ if self.api_key:
+ url_fmt = "https://api.tumblr.com/v2/blog/{}/avatar/{}?api_key={}"
+ return url_fmt.format(blog, size, self.api_key)
+ params = {"size": size}
+ data = self._call(blog, "avatar", params, allow_redirects=False)
+ return data["avatar_url"]
+
+ def posts(self, blog, params):
+ """Retrieve published posts"""
+ params.update({"offset": 0, "limit": 50, "reblog_info": "true"})
+ if self.posts_type:
+ params["type"] = self.posts_type
+ while True:
+ data = self._call(blog, "posts", params)
+ self.BLOG_CACHE[blog] = data["blog"]
+ yield from data["posts"]
+ params["offset"] += params["limit"]
+ if params["offset"] >= data["total_posts"]:
+ return
+
+ def likes(self, blog):
+ """Retrieve liked posts"""
+ params = {"limit": 50}
+ while True:
+ posts = self._call(blog, "likes", params)["liked_posts"]
+ if not posts:
+ return
+ yield from posts
+ params["before"] = posts[-1]["liked_timestamp"]
+
+ def _call(self, blog, endpoint, params, **kwargs):
+ if self.api_key:
+ params["api_key"] = self.api_key
+ url = "https://api.tumblr.com/v2/blog/{}/{}".format(
+ blog, endpoint)
+
+ response = self.request(url, params=params, **kwargs)
+
+ try:
+ data = response.json()
+ except ValueError:
+ data = response.text
+ status = response.status_code
+ else:
+ status = data["meta"]["status"]
+ if 200 <= status < 400:
+ return data["response"]
+
+ if status == 403:
+ raise exception.AuthorizationError()
+ elif status == 404:
+ raise exception.NotFoundError("user or post")
+ elif status == 429:
+
+ # daily rate limit
+ if response.headers.get("x-ratelimit-perday-remaining") == "0":
+ reset = response.headers.get("x-ratelimit-perday-reset")
+ self.log.error(
+ "Daily API rate limit exceeded: aborting; "
+ "rate limit will reset at %s",
+ self._to_time(reset),
+ )
+ raise exception.StopExtraction()
+
+ # hourly rate limit
+ reset = response.headers.get("x-ratelimit-perhour-reset")
+ if reset:
+ self.log.info(
+ "Hourly API rate limit exceeded; "
+ "waiting until %s for rate limit reset",
+ self._to_time(reset),
+ )
+ time.sleep(int(reset) + 1)
+ return self._call(blog, endpoint, params)
+
+ self.log.error(data)
+ raise exception.StopExtraction()
+
+ @staticmethod
+ def _to_time(reset):
+ try:
+ reset_time = datetime.now() + timedelta(seconds=int(reset))
+ except (ValueError, TypeError):
+ return "?"
+ return reset_time.strftime("%H:%M:%S")
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
new file mode 100644
index 0000000..ad4dc46
--- /dev/null
+++ b/gallery_dl/extractor/twitter.py
@@ -0,0 +1,202 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://twitter.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+from ..cache import cache
+
+
+class TwitterExtractor(Extractor):
+ """Base class for twitter extractors"""
+ category = "twitter"
+ directory_fmt = ("{category}", "{user}")
+ filename_fmt = "{tweet_id}_{num}.{extension}"
+ archive_fmt = "{tweet_id}_{retweet_id}_{num}"
+ root = "https://twitter.com"
+ sizes = (":orig", ":large", ":medium", ":small")
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user = match.group(1)
+ self.retweets = self.config("retweets", True)
+ self.videos = self.config("videos", False)
+
+ def items(self):
+ self.login()
+ yield Message.Version, 1
+ yield Message.Directory, self.metadata()
+
+ for tweet in self.tweets():
+ data = self._data_from_tweet(tweet)
+ if not self.retweets and data["retweet_id"]:
+ continue
+
+ images = text.extract_iter(
+ tweet, 'data-image-url="', '"')
+ for data["num"], url in enumerate(images, 1):
+ text.nameext_from_url(url, data)
+ urls = [url + size for size in self.sizes]
+ yield Message.Urllist, urls, data
+
+ if self.videos and "-videoContainer" in tweet:
+ data["num"] = 1
+ url = "ytdl:{}/{}/status/{}".format(
+ self.root, data["user"], data["tweet_id"])
+ yield Message.Url, url, data
+
+ def metadata(self):
+ """Return general metadata"""
+ return {"user": self.user}
+
+ def tweets(self):
+ """Yield HTML content of all relevant tweets"""
+
+ def login(self):
+ username, password = self._get_auth_info()
+ if username:
+ self._update_cookies(self._login_impl(username, password))
+
+ @cache(maxage=360*24*3600, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+
+ page = self.request(self.root + "/login").text
+ pos = page.index('name="authenticity_token"')
+ token = text.extract(page, 'value="', '"', pos-80)[0]
+
+ url = self.root + "/sessions"
+ data = {
+ "session[username_or_email]": username,
+ "session[password]" : password,
+ "authenticity_token" : token,
+ "ui_metrics" : '{"rf":{},"s":""}',
+ "scribe_log" : "",
+ "redirect_after_login" : "",
+ "remember_me" : "1",
+ }
+ response = self.request(url, method="POST", data=data)
+
+ if "/error" in response.url:
+ raise exception.AuthenticationError()
+ return self.session.cookies
+
+ @staticmethod
+ def _data_from_tweet(tweet):
+ extr = text.extract_from(tweet)
+ return {
+ "tweet_id" : text.parse_int(extr('data-tweet-id="' , '"')),
+ "retweet_id": text.parse_int(extr('data-retweet-id="', '"')),
+ "retweeter" : extr('data-retweeter="' , '"'),
+ "user" : extr('data-screen-name="', '"'),
+ "username" : extr('data-name="' , '"'),
+ "user_id" : text.parse_int(extr('data-user-id="' , '"')),
+ "date" : text.parse_timestamp(extr('data-time="', '"')),
+ }
+
+ def _tweets_from_api(self, url):
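+ # paginate through Twitter's timeline endpoint, using the ID of
+ # the last tweet on each page as the 'max_position' cursor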
+ params = {
+ "include_available_features": "1",
+ "include_entities": "1",
+ "reset_error_state": "false",
+ "lang": "en",
+ }
+ headers = {
+ "X-Requested-With": "XMLHttpRequest",
+ "X-Twitter-Active-User": "yes",
+ "Referer": "{}/{}".format(self.root, self.user)
+ }
+
+ while True:
+ data = self.request(url, params=params, headers=headers).json()
+ if "inner" in data:
+ data = data["inner"]
+
+ for tweet in text.extract_iter(
+ data["items_html"], '<div class="tweet ', '\n</li>'):
+ yield tweet
+
+ if not data["has_more_items"]:
+ return
+
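+ # use the ID of the last tweet on this page as 'max_position'
+ # cursor for the next request; stop once it no longer advances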
+ position = text.parse_int(text.extract(
+ tweet, 'data-tweet-id="', '"')[0])
+ if "max_position" in params and position >= params["max_position"]:
+ return
+ params["max_position"] = position
+
+
+class TwitterTimelineExtractor(TwitterExtractor):
+ """Extractor for all images from a user's timeline"""
+ subcategory = "timeline"
+ pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
+ r"/([^/?&#]+)/?$")
+ test = ("https://twitter.com/supernaturepics", {
+ "range": "1-40",
+ "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4",
+ "keyword": "7210d679606240405e0cf62cbc67596e81a7a250",
+ })
+
+ def tweets(self):
+ url = "{}/i/profiles/show/{}/timeline/tweets".format(
+ self.root, self.user)
+ return self._tweets_from_api(url)
+
+
+class TwitterMediaExtractor(TwitterExtractor):
+ """Extractor for all images from a user's Media Tweets"""
+ subcategory = "media"
+ pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
+ r"/([^/?&#]+)/media(?!\w)")
+ test = ("https://twitter.com/supernaturepics/media", {
+ "range": "1-40",
+ "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4",
+ })
+
+ def tweets(self):
+ url = "{}/i/profiles/show/{}/media_timeline".format(
+ self.root, self.user)
+ return self._tweets_from_api(url)
+
+
+class TwitterTweetExtractor(TwitterExtractor):
+ """Extractor for images from individual tweets"""
+ subcategory = "tweet"
+ pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
+ r"/([^/?&#]+)/status/(\d+)")
+ test = (
+ ("https://twitter.com/supernaturepics/status/604341487988576256", {
+ "url": "0e801d2f98142dd87c3630ded9e4be4a4d63b580",
+ "keyword": "1b8afb93cc04a9f44d89173f8facc61c3a6caf91",
+ "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab",
+ }),
+ # 4 images
+ ("https://twitter.com/perrypumas/status/894001459754180609", {
+ "url": "c8a262a9698cb733fb27870f5a8f75faf77d79f6",
+ "keyword": "43d98ab448193f0d4f30aa571a4b6bda9b6a5692",
+ }),
+ # video
+ ("https://twitter.com/perrypumas/status/1065692031626829824", {
+ "options": (("videos", True),),
+ "pattern": r"ytdl:https://twitter.com/perrypumas/status/\d+",
+ }),
+ )
+
+ def __init__(self, match):
+ TwitterExtractor.__init__(self, match)
+ self.tweet_id = match.group(2)
+
+ def metadata(self):
+ return {"user": self.user, "tweet_id": self.tweet_id}
+
+ def tweets(self):
+ url = "{}/{}/status/{}".format(self.root, self.user, self.tweet_id)
+ page = self.request(url).text
+ return (text.extract(
+ page, '<div class="tweet ', '<ul class="stats')[0],)
diff --git a/gallery_dl/extractor/vanillarock.py b/gallery_dl/extractor/vanillarock.py
new file mode 100644
index 0000000..687ce3c
--- /dev/null
+++ b/gallery_dl/extractor/vanillarock.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://vanilla-rock.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class VanillarockExtractor(Extractor):
+ """Base class for vanillarock extractors"""
+ category = "vanillarock"
+ root = "https://vanilla-rock.com"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.path = match.group(1)
+
+
+class VanillarockPostExtractor(VanillarockExtractor):
+ """Extractor for blog posts on vanilla-rock.com"""
+ subcategory = "post"
+ directory_fmt = ("{category}", "{path}")
+ filename_fmt = "{num:>02}.{extension}"
+ archive_fmt = "{filename}"
+ pattern = (r"(?:https?://)?(?:www\.)?vanilla-rock\.com"
+ r"(/(?!category/|tag/)[^/?&#]+)/?$")
+ test = ("https://vanilla-rock.com/mizuhashi_parsee-5", {
+ "url": "7fb9a4d18d9fa22d7295fee8d94ab5a7a52265dd",
+ "keyword": "b91df99b714e1958d9636748b1c81a07c3ef52c9",
+ })
+
+ def items(self):
+ extr = text.extract_from(self.request(self.root + self.path).text)
+ name = extr("<title>", "</title>")
+
+ imgs = []
+ while True:
+ img = extr('<div class="main-img">', '</div>')
+ if not img:
+ break
+ imgs.append(text.extract(img, 'href="', '"')[0])
+
+ data = {
+ "count": len(imgs),
+ "title": text.unescape(name.rpartition(" | ")[0]),
+ "path" : self.path.strip("/"),
+ "date" : text.parse_datetime(extr(
+ '<div class="date">', '</div>'), "%Y-%m-%d %H:%M"),
+ "tags" : text.split_html(extr(
+ '<div class="cat-tag">', '</div>'))[::2],
+ }
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for data["num"], url in enumerate(imgs, 1):
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+
+class VanillarockTagExtractor(VanillarockExtractor):
+ """Extractor for vanillarock blog posts by tag or category"""
+ subcategory = "tag"
+ pattern = (r"(?:https?://)?(?:www\.)?vanilla-rock\.com"
+ r"(/(?:tag|category)/[^?&#]+)")
+ test = (
+ ("https://vanilla-rock.com/tag/%e5%b0%84%e5%91%bd%e4%b8%b8%e6%96%87", {
+ "pattern": VanillarockPostExtractor.pattern,
+ "count": ">= 12",
+ }),
+ (("https://vanilla-rock.com/category/%e4%ba%8c%e6%ac%a1%e3%82%a8%e3%83"
+ "%ad%e7%94%bb%e5%83%8f/%e8%90%8c%e3%81%88%e3%83%bb%e3%82%bd%e3%83%95"
+ "%e3%83%88%e3%82%a8%e3%83%ad"), {
+ "pattern": VanillarockPostExtractor.pattern,
+ "count": 3,
+ }),
+ )
+
+ def items(self):
+ url = self.root + self.path
+ data = {"_extractor": VanillarockPostExtractor}
+
+ yield Message.Version, 1
+ while url:
+ extr = text.extract_from(self.request(url).text)
+ while True:
+ post = extr('<h2 class="entry-title">', '</h2>')
+ if not post:
+ break
+ yield Message.Queue, text.extract(post, 'href="', '"')[0], data
+ url = text.unescape(extr('class="next page-numbers" href="', '"'))
diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py
new file mode 100644
index 0000000..4326582
--- /dev/null
+++ b/gallery_dl/extractor/wallhaven.py
@@ -0,0 +1,148 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://wallhaven.cc/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class WallhavenExtractor(Extractor):
+ """Base class for wallhaven extractors"""
+ category = "wallhaven"
+ filename_fmt = "{category}_{id}_{resolution}.{extension}"
+ root = "https://wallhaven.cc"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.api = WallhavenAPI(self)
+
+
+class WallhavenSearchExtractor(WallhavenExtractor):
+ """Extractor for search results on wallhaven.cc"""
+ subcategory = "search"
+ directory_fmt = ("{category}", "{search[q]}")
+ archive_fmt = "s_{search[q]}_{id}"
+ pattern = r"(?:https?://)?wallhaven\.cc/search(?:/?\?([^/?#]+))?"
+ test = (
+ ("https://wallhaven.cc/search?q=touhou"),
+ (("https://wallhaven.cc/search?q=id%3A87"
+ "&categories=111&purity=100&sorting=date_added&order=asc&page=3"), {
+ "count": 4,
+ "url": "d024bc11895d758b76ffdb0fa85a627e53f072cf",
+ }),
+ )
+
+ def __init__(self, match):
+ WallhavenExtractor.__init__(self, match)
+ self.params = text.parse_query(match.group(1))
+
+ def items(self):
+ yield Message.Version, 1
+ yield Message.Directory, {"search": self.params}
+ for wp in self.api.search(self.params.copy()):
+ wp["search"] = self.params
+ yield Message.Url, wp["url"], wp
+
+
+class WallhavenImageExtractor(WallhavenExtractor):
+ """Extractor for individual wallpapers on wallhaven.cc"""
+ subcategory = "image"
+ archive_fmt = "{id}"
+ pattern = (r"(?:https?://)?(?:wallhaven\.cc/w/|whvn\.cc/"
+ r"|w\.wallhaven\.cc/[a-z]+/\w\w/wallhaven-)(\w+)")
+ test = (
+ ("https://wallhaven.cc/w/01w334", {
+ "pattern": "https://[^.]+.wallhaven.cc/full/01/[^-]+-01w334.jpg",
+ "content": "497212679383a465da1e35bd75873240435085a2",
+ "keyword": {
+ "id" : "01w334",
+ "width" : 1920,
+ "height" : 1200,
+ "resolution" : "1920x1200",
+ "ratio" : 1.6,
+ "colors" : list,
+ "tags" : list,
+ "file_size" : 278799,
+ "file_type" : "image/jpeg",
+ "purity" : "sfw",
+ "short_url" : "https://whvn.cc/01w334",
+ "source" : str,
+ "uploader" : {
+ "group" : "Owner/Developer",
+ "username" : "AksumkA",
+ },
+ "date" : "type:datetime",
+ "wh_category": "anime",
+ "views" : int,
+ "favorites" : int,
+ },
+ }),
+ # NSFW
+ ("https://wallhaven.cc/w/dge6v3", {
+ "url": "e4b802e70483f659d790ad5d0bd316245badf2ec",
+ }),
+ ("https://whvn.cc/01w334"),
+ ("https://w.wallhaven.cc/full/01/wallhaven-01w334.jpg"),
+ )
+
+ def __init__(self, match):
+ WallhavenExtractor.__init__(self, match)
+ self.wallpaper_id = match.group(1)
+
+ def items(self):
+ data = self.api.info(self.wallpaper_id)
+ yield Message.Version, 1
+ yield Message.Directory, data
+ yield Message.Url, data["url"], data
+
+
+class WallhavenAPI():
+ """Minimal interface to wallhaven's API"""
+
+ def __init__(self, extractor):
+ self.extractor = extractor
+
+ key = extractor.config("api-key")
+ if key is None:
+ key = "25HYZenXTICjzBZXzFSg98uJtcQVrDs2"
+ extractor.log.debug("Using default API Key")
+ else:
+ extractor.log.debug("Using custom API Key")
+ self.headers = {"X-API-Key": key}
+
+ def info(self, wallpaper_id):
+ url = "https://wallhaven.cc/api/v1/w/" + wallpaper_id
+ return self._update(self._call(url)["data"])
+
+ def search(self, params):
+ url = "https://wallhaven.cc/api/v1/search"
+ while True:
+ data = self._call(url, params)
+ yield from map(self._update, data["data"])
+ if data["meta"]["current_page"] >= data["meta"]["last_page"]:
+ return
+ params["page"] = data["meta"]["current_page"] + 1
+
+ def _call(self, url, params=None):
+ return self.extractor.request(
+ url, headers=self.headers, params=params).json()
+
+ @staticmethod
+ def _update(wp):
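+ # normalize the API response: rename fields, parse numeric values,
+ # and derive filename/extension from the image URL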
+ width, _, height = wp["resolution"].partition("x")
+ wp["url"] = wp.pop("path")
+ if "tags" in wp:
+ wp["tags"] = [t["name"] for t in wp["tags"]]
+ wp["date"] = text.parse_datetime(
+ wp.pop("created_at"), "%Y-%m-%d %H:%M:%S")
+ wp["ratio"] = text.parse_float(wp["ratio"])
+ wp["width"] = wp.pop("dimension_x")
+ wp["height"] = wp.pop("dimension_y")
+ wp["wh_category"] = wp["category"]
+ return text.nameext_from_url(wp["url"], wp)
diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py
new file mode 100644
index 0000000..d353144
--- /dev/null
+++ b/gallery_dl/extractor/warosu.py
@@ -0,0 +1,108 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://warosu.org/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class WarosuThreadExtractor(Extractor):
+ """Extractor for images from threads on warosu.org"""
+ category = "warosu"
+ subcategory = "thread"
+ directory_fmt = ("{category}", "{board}", "{thread} - {title}")
+ filename_fmt = "{tim}-{filename}.{extension}"
+ archive_fmt = "{board}_{thread}_{tim}"
+ pattern = r"(?:https?://)?(?:www\.)?warosu\.org/([^/]+)/thread/(\d+)"
+ test = (
+ ("https://warosu.org/jp/thread/16656025", {
+ "url": "889d57246ed67e491e5b8f7f124e50ea7991e770",
+ "keyword": "c00ea4c5460c5986994f17bb8416826d42ca57c0",
+ }),
+ ("https://warosu.org/jp/thread/16658073", {
+ "url": "4500cf3184b067424fd9883249bd543c905fbecd",
+ "keyword": "7534edf4ec51891dbf44d775b73fbbefd52eec71",
+ "content": "d48df0a701e6599312bfff8674f4aa5d4fb8db1c",
+ }),
+ )
+ root = "https://warosu.org"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.board, self.thread = match.groups()
+
+ def items(self):
+ url = "{}/{}/thread/{}".format(self.root, self.board, self.thread)
+ page = self.request(url).text
+ data = self.get_metadata(page)
+ posts = self.posts(page)
+
+ if not data["title"]:
+ title = text.remove_html(posts[0]["com"])
+ data["title"] = text.unescape(title)[:50]
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for post in posts:
+ if "image" in post:
+ for key in ("w", "h", "no", "time", "tim"):
+ post[key] = text.parse_int(post[key])
+ post.update(data)
+ yield Message.Url, post["image"], post
+
+ def get_metadata(self, page):
+ """Collect metadata for extractor-job"""
+ boardname = text.extract(page, "<title>", "</title>")[0]
+ title = text.extract(page, 'filetitle" itemprop="name">', '<')[0]
+ return {
+ "board": self.board,
+ "board_name": boardname.rpartition(" - ")[2],
+ "thread": self.thread,
+ "title": title,
+ }
+
+ def posts(self, page):
+ """Build a list of all post-objects"""
+ page = text.extract(page, '<div class="content">', '<table>')[0]
+ needle = '<table itemscope itemtype="http://schema.org/Comment">'
+ return [self.parse(post) for post in page.split(needle)]
+
+ def parse(self, post):
+ """Build post-object by extracting data from an HTML post"""
+ data = self._extract_post(post)
+ if "<span>File:" in post:
+ self._extract_image(post, data)
+ part = data["image"].rpartition("/")[2]
+ data["tim"], _, data["extension"] = part.partition(".")
+ data["ext"] = "." + data["extension"]
+ return data
+
+ @staticmethod
+ def _extract_post(post):
+ data = text.extract_all(post, (
+ ("no" , 'id="p', '"'),
+ ("name", '<span itemprop="name">', '</span>'),
+ ("time", '<span class="posttime" title="', '000">'),
+ ("now" , '', '<'),
+ ("com" , '<blockquote><p itemprop="text">', '</p></blockquote>'),
+ ))[0]
+ data["com"] = text.unescape(text.remove_html(data["com"].strip()))
+ return data
+
+ @staticmethod
+ def _extract_image(post, data):
+ text.extract_all(post, (
+ ("fsize" , '<span>File: ', ', '),
+ ("w" , '', 'x'),
+ ("h" , '', ', '),
+ ("filename", '', '<'),
+ ("image" , '<br />\n<a href="', '"'),
+ ), 0, data)
+ data["filename"] = text.unquote(data["filename"].rpartition(".")[0])
+ data["image"] = "https:" + data["image"]
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
new file mode 100644
index 0000000..7a4ee8f
--- /dev/null
+++ b/gallery_dl/extractor/weibo.py
@@ -0,0 +1,137 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.weibo.com/"""
+
+from .common import Extractor, Message
+from .. import text
+import json
+
+
+class WeiboExtractor(Extractor):
+ category = "weibo"
+ directory_fmt = ("{category}", "{user[screen_name]}")
+ filename_fmt = "{status[id]}_{num:>02}.{extension}"
+ archive_fmt = "{status[id]}_{num}"
+ root = "https://m.weibo.cn"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.retweets = self.config("retweets", True)
+
+ def items(self):
+ first = True
+
+ for status in self.statuses():
+
+ obj = status
+ num = 1
+
+ if first:
+ yield Message.Version, 1
+ yield Message.Directory, status
+ first = False
+
+ while True:
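+ # handle the status itself first; if it is a retweet and
+ # 'retweets' is enabled, continue with the retweeted status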
+
+ if "pics" in obj:
+ for image in obj["pics"]:
+ pid = image["pid"]
+ if "large" in image:
+ image = image["large"]
+ data = text.nameext_from_url(image["url"], {
+ "num": num,
+ "pid": pid,
+ "width": text.parse_int(image["geo"]["width"]),
+ "height": text.parse_int(image["geo"]["height"]),
+ "status": status,
+ })
+ yield Message.Url, image["url"], data
+ num += 1
+
+ if "page_info" in obj and "media_info" in obj["page_info"]:
+ info = obj["page_info"]["media_info"]
+ url = info.get("stream_url_hd") or info["stream_url"]
+ data = text.nameext_from_url(url, {
+ "num": num,
+ "url": url,
+ "width": 0,
+ "height": 0,
+ "status": status,
+ })
+ yield Message.Url, url, data
+
+ if self.retweets and "retweeted_status" in obj:
+ obj = obj["retweeted_status"]
+ else:
+ break
+
+ def statuses(self):
+ """Return an iterable containing all relevant 'status' objects"""
+
+
+class WeiboUserExtractor(WeiboExtractor):
+ """Extractor for all images of a user on weibo.cn"""
+ subcategory = "user"
+ pattern = (r"(?:https?://)?(?:www\.|m\.)?weibo\.c(?:om|n)"
+ r"/(?:u|p(?:rofile)?)/(\d+)")
+ test = (
+ ("https://m.weibo.cn/u/2314621010", {
+ "range": "1-30",
+ }),
+ ("https://m.weibo.cn/profile/2314621010"),
+ ("https://m.weibo.cn/p/2304132314621010_-_WEIBO_SECOND_PROFILE_WEIBO"),
+ ("https://www.weibo.com/p/1003062314621010/home"),
+ )
+
+ def __init__(self, match):
+ WeiboExtractor.__init__(self, match)
+ self.user_id = match.group(1)
+
+ def statuses(self):
+ url = self.root + "/api/container/getIndex"
+ params = {"page": 1, "containerid": "107603" + self.user_id[-10:]}
+
+ while True:
+ data = self.request(url, params=params).json()
+
+ for card in data["data"]["cards"]:
+ if "mblog" in card:
+ yield card["mblog"]
+
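+ # a page with fewer than 5 cards is treated as the end of the timeline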
+ if len(data["data"]["cards"]) < 5:
+ return
+ params["page"] += 1
+
+
+class WeiboStatusExtractor(WeiboExtractor):
+ """Extractor for images from a status on weibo.cn"""
+ subcategory = "status"
+ pattern = (r"(?:https?://)?(?:www\.|m\.)?weibo\.c(?:om|n)"
+ r"/(?:detail|status|\d+)/(\d+)")
+ test = (
+ ("https://m.weibo.cn/detail/4323047042991618", {
+ "pattern": r"https?://wx\d+.sinaimg.cn/large/\w+.jpg",
+ }),
+ ("https://m.weibo.cn/detail/4339748116375525", {
+ "pattern": r"https?://f.us.sinaimg.cn/\w+\.mp4\?label=mp4_hd",
+ }),
+ ("https://m.weibo.cn/status/4339748116375525"),
+ ("https://m.weibo.cn/5746766133/4339748116375525"),
+ )
+
+ def __init__(self, match):
+ WeiboExtractor.__init__(self, match)
+ self.status_id = match.group(1)
+
+ def statuses(self):
+ url = "{}/detail/{}".format(self.root, self.status_id)
+ page = self.request(url).text
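+ # the status data is embedded as JSON in the '$render_data' variable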
+ data = json.loads(text.extract(
+ page, " var $render_data = [", "][0] || {};")[0])
+ return (data["status"],)
diff --git a/gallery_dl/extractor/wikiart.py b/gallery_dl/extractor/wikiart.py
new file mode 100644
index 0000000..b9c223c
--- /dev/null
+++ b/gallery_dl/extractor/wikiart.py
@@ -0,0 +1,134 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.wikiart.org/"""
+
+from .common import Extractor, Message
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?wikiart\.org/([a-z]+)"
+
+
+class WikiartExtractor(Extractor):
+ """Base class for wikiart extractors"""
+ category = "wikiart"
+ filename_fmt = "{id}_{title}.{extension}"
+ archive_fmt = "{id}"
+ root = "https://www.wikiart.org"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.lang = match.group(1)
+
+ def items(self):
+ data = self.metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for painting in self.paintings():
+ url = painting["image"]
+ painting.update(data)
+ yield Message.Url, url, text.nameext_from_url(url, painting)
+
+ def metadata(self):
+ """Return a dict with general metadata"""
+
+ def paintings(self):
+ """Return an iterable containing all relevant 'painting' objects"""
+
+ def _pagination(self, url, extra_params=None, key="Paintings"):
+ headers = {
+ "X-Requested-With": "XMLHttpRequest",
+ "Referer": url,
+ }
+ params = {
+ "json": "2",
+ "layout": "new",
+ "page": 1,
+ "resultType": "masonry",
+ }
+ if extra_params:
+ params.update(extra_params)
+
+ while True:
+ data = self.request(url, headers=headers, params=params).json()
+ items = data.get(key)
+ if not items:
+ return
+ yield from items
+ params["page"] += 1
+
+
+class WikiartArtistExtractor(WikiartExtractor):
+ """Extractor for an artist's paintings on wikiart.org"""
+ subcategory = "artist"
+ directory_fmt = ("{category}", "{artist[artistName]}")
+ pattern = BASE_PATTERN + r"/(?!\w+-by-)([\w-]+)"
+ test = ("https://www.wikiart.org/en/thomas-cole", {
+ "url": "f1eee8158f5b8b7380382ab730a8f53884715c8b",
+ "keyword": "b62678394ce645815963883d5c9642255307225f",
+ })
+
+ def __init__(self, match):
+ WikiartExtractor.__init__(self, match)
+ self.artist = match.group(2)
+
+ def metadata(self):
+ url = "{}/{}/{}?json=2".format(self.root, self.lang, self.artist)
+ return {"artist": self.request(url).json()}
+
+ def paintings(self):
+ url = "{}/{}/{}/mode/all-paintings".format(
+ self.root, self.lang, self.artist)
+ return self._pagination(url)
+
+
+class WikiartArtworksExtractor(WikiartExtractor):
+ """Extractor for artwork collections on wikiart.org"""
+ subcategory = "artworks"
+ directory_fmt = ("{category}", "Artworks by {group!c}", "{type}")
+ pattern = BASE_PATTERN + r"/paintings-by-([\w-]+)/([\w-]+)"
+ test = ("https://www.wikiart.org/en/paintings-by-media/grisaille", {
+ "url": "f92d55669fa949491c26a5437527adb14b35b8cc",
+ })
+
+ def __init__(self, match):
+ WikiartExtractor.__init__(self, match)
+ self.group = match.group(2)
+ self.type = match.group(3)
+
+ def metadata(self):
+ return {"group": self.group, "type": self.type}
+
+ def paintings(self):
+ url = "{}/{}/paintings-by-{}/{}".format(
+ self.root, self.lang, self.group, self.type)
+ return self._pagination(url)
+
+
+class WikiartArtistsExtractor(WikiartExtractor):
+ """Extractor for artist collections on wikiart.org"""
+ subcategory = "artists"
+ pattern = (BASE_PATTERN + r"/artists-by-([\w-]+)/([\w-]+)")
+ test = ("https://www.wikiart.org/en/artists-by-century/12", {
+ "pattern": WikiartArtistExtractor.pattern,
+ "count": 7,
+ })
+
+ def __init__(self, match):
+ WikiartExtractor.__init__(self, match)
+ self.group = match.group(2)
+ self.type = match.group(3)
+
+ def items(self):
+ url = "{}/{}/App/Search/Artists-by-{}".format(
+ self.root, self.lang, self.group)
+ params = {"json": "3", "searchterm": self.type}
+
+ for artist in self._pagination(url, params, "Artists"):
+ artist["_extractor"] = WikiartArtistExtractor
+ yield Message.Queue, self.root + artist["artistUrl"], artist
diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py
new file mode 100644
index 0000000..9699806
--- /dev/null
+++ b/gallery_dl/extractor/xhamster.py
@@ -0,0 +1,171 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://xhamster.com/"""
+
+from .common import Extractor, Message
+from .. import text
+import json
+
+
+BASE_PATTERN = r"(?:https?://)?(?:[^.]+\.)?xhamster\.(?:com|one|desi)"
+
+
+class XhamsterExtractor(Extractor):
+ """Base class for xhamster extractors"""
+ category = "xhamster"
+ root = "https://xhamster.com"
+
+
+class XhamsterGalleryExtractor(XhamsterExtractor):
+ """Extractor for image galleries on xhamster.com"""
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "{user[name]}",
+ "{gallery[id]} {gallery[title]}")
+ filename_fmt = "{num:>03}_{id}.{extension}"
+ archive_fmt = "{id}"
+ pattern = BASE_PATTERN + r"(/photos/gallery/[^/?&#]+)"
+ test = (
+ ("https://xhamster.com/photos/gallery/11748968", {
+ "pattern": r"https://thumb-p\d+.xhcdn.com/./[\w/-]+_1000.jpg$",
+ "count": ">= 144",
+ "keyword": {
+ "comments": int,
+ "count": int,
+ "favorite": bool,
+ "id": int,
+ "num": int,
+ "height": int,
+ "width": int,
+ "imageURL": str,
+ "pageURL": str,
+ "thumbURL": str,
+ "gallery": {
+ "date": "type:datetime",
+ "description": "",
+ "dislikes": int,
+ "id": 11748968,
+ "likes": int,
+ "tags": ["NON-Porn"],
+ "thumbnail": str,
+ "title": "Make the world better.",
+ "views": int,
+ },
+ "user": {
+ "id": 16874672,
+ "name": "Anonymousrants",
+ "retired": bool,
+ "subscribers": int,
+ "url": "https://xhamster.com/users/anonymousrants",
+ "verified": bool,
+ },
+ },
+ }),
+ ("https://xhamster.com/photos/gallery/make-the-world-better-11748968"),
+ ("https://xhamster.com/photos/gallery/11748968"),
+ ("https://xhamster.one/photos/gallery/11748968"),
+ ("https://xhamster.desi/photos/gallery/11748968"),
+ ("https://en.xhamster.com/photos/gallery/11748968"),
+ )
+
+ def __init__(self, match):
+ XhamsterExtractor.__init__(self, match)
+ self.path = match.group(1)
+ self.data = None
+
+ def items(self):
+ data = self.metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for num, image in enumerate(self.images(), 1):
+ url = image["imageURL"]
+ image.update(data)
+ image["num"] = num
+ yield Message.Url, url, text.nameext_from_url(url, image)
+
+ def metadata(self):
+ self.data = self._data(self.root + self.path)
+ user = self.data["authorModel"]
+ imgs = self.data["photosGalleryModel"]
+
+ return {
+ "user":
+ {
+ "id" : text.parse_int(user["id"]),
+ "url" : user["pageURL"],
+ "name" : user["name"],
+ "retired" : user["retired"],
+ "verified" : user["verified"],
+ "subscribers": user["subscribers"],
+ },
+ "gallery":
+ {
+ "id" : text.parse_int(imgs["id"]),
+ "tags" : [c["name"] for c in imgs["categories"]],
+ "date" : text.parse_timestamp(imgs["created"]),
+ "views" : text.parse_int(imgs["views"]),
+ "likes" : text.parse_int(imgs["rating"]["likes"]),
+ "dislikes" : text.parse_int(imgs["rating"]["dislikes"]),
+ "title" : imgs["title"],
+ "description": imgs["description"],
+ "thumbnail" : imgs["thumbURL"],
+ },
+ "count": text.parse_int(imgs["quantity"]),
+ }
+
+ def images(self):
+ data = self.data
+ self.data = None
+
+ while True:
+ for image in data["photosGalleryModel"]["photos"]:
+ del image["modelName"]
+ yield image
+
+ pgntn = data["pagination"]
+ if pgntn["active"] == pgntn["maxPage"]:
+ return
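+ # drop the last 3 characters of 'pageLinkTemplate' (apparently a
+ # page-number placeholder) and append the number of the next page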
+ url = pgntn["pageLinkTemplate"][:-3] + str(pgntn["next"])
+ data = self._data(url)
+
+ def _data(self, url):
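+ # gallery, user, and pagination data are embedded as JSON
+ # in the 'window.initials' variable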
+ page = self.request(url).text
+ return json.loads(text.extract(
+ page, "window.initials =", "</script>")[0].rstrip("\n\r;"))
+
+
+class XhamsterUserExtractor(XhamsterExtractor):
+ """Extractor for all galleries of an xhamster user"""
+ subcategory = "user"
+ pattern = BASE_PATTERN + r"/users/([^/?&#]+)(?:/photos)?/?(?:$|[?#])"
+ test = (
+ ("https://xhamster.com/users/nickname68/photos", {
+ "pattern": XhamsterGalleryExtractor.pattern,
+ "count": 50,
+ "range": "1-50",
+ }),
+ ("https://xhamster.com/users/nickname68"),
+ )
+
+ def __init__(self, match):
+ XhamsterExtractor.__init__(self, match)
+ self.user = match.group(1)
+
+ def items(self):
+ yield Message.Version, 1
+ url = "{}/users/{}/photos".format(self.root, self.user)
+ data = {"_extractor": XhamsterGalleryExtractor}
+
+ while url:
+ extr = text.extract_from(self.request(url).text)
+ while True:
+ url = extr('thumb-image-container" href="', '"')
+ if not url:
+ break
+ yield Message.Queue, url, data
+ url = extr('data-page="next" href="', '"')
diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py
new file mode 100644
index 0000000..7eec18b
--- /dev/null
+++ b/gallery_dl/extractor/xvideos.py
@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.xvideos.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+import json
+
+
+class XvideosExtractor(Extractor):
+ """Base class for xvideos extractors"""
+ category = "xvideos"
+ root = "https://www.xvideos.com"
+
+ def get_page(self, url, codes=(403, 404)):
+ response = self.request(url, expect=codes)
+ if response.status_code in codes:
+ raise exception.NotFoundError(self.subcategory)
+ return response.text
+
+
+class XvideosGalleryExtractor(XvideosExtractor):
+ """Extractor for user profile galleries from xvideos.com"""
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "{user[name]}", "{title}")
+ filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
+ archive_fmt = "{gallery_id}_{num}"
+ pattern = (r"(?:https?://)?(?:www\.)?xvideos\.com"
+ r"/profiles/([^/?&#]+)/photos/(\d+)")
+ test = (
+ (("https://www.xvideos.com/profiles"
+ "/pervertedcouple/photos/751031/random_stuff"), {
+ "url": "4f0d992e5dc39def2c3ac8e099d17bf09e76e3c7",
+ "keyword": "8d637b372c6231cc4ada92dd5918db5fdbd06520",
+ }),
+ ("https://www.xvideos.com/profiles/pervertedcouple/photos/751032/", {
+ "exception": exception.NotFoundError,
+ }),
+ )
+
+ def __init__(self, match):
+ XvideosExtractor.__init__(self, match)
+ self.user, self.gid = match.groups()
+
+ def items(self):
+ url = "{}/profiles/{}/photos/{}".format(self.root, self.user, self.gid)
+ page = self.get_page(url)
+ data = self.get_metadata(page)
+ imgs = self.get_images(page)
+ data["count"] = len(imgs)
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for url in imgs:
+ data["num"] = text.parse_int(url.rsplit("_", 2)[1])
+ data["extension"] = url.rpartition(".")[2]
+ yield Message.Url, url, data
+
+ def get_metadata(self, page):
+ """Collect metadata for extractor-job"""
+ data = text.extract_all(page, (
+ ("userid" , '"id_user":', ','),
+ ("display", '"display":"', '"'),
+ ("title" , '"title":"', '"'),
+ ("descr" , '<small class="mobile-hide">', '</small>'),
+ ("tags" , '<em>Tagged:</em>', '<'),
+ ))[0]
+
+ return {
+ "user": {
+ "id": text.parse_int(data["userid"]),
+ "name": self.user,
+ "display": data["display"],
+ "description": data["descr"].strip(),
+ },
+ "tags": text.unescape(data["tags"] or "").strip().split(", "),
+ "title": text.unescape(data["title"]),
+ "gallery_id": text.parse_int(self.gid),
+ }
+
+ @staticmethod
+ def get_images(page):
+ """Return a list of all image URLs in this gallery"""
+ return list(text.extract_iter(
+ page, '<a class="embed-responsive-item" href="', '"'))
+
+
+class XvideosUserExtractor(XvideosExtractor):
+ """Extractor for user profiles from xvideos.com"""
+ subcategory = "user"
+ categorytransfer = True
+ pattern = (r"(?:https?://)?(?:www\.)?xvideos\.com"
+ r"/profiles/([^/?&#]+)/?(?:#.*)?$")
+ test = (
+ ("https://www.xvideos.com/profiles/pervertedcouple", {
+ "url": "a413f3e60d6d3a2de79bd44fa3b7a9c03db4336e",
+ "keyword": "a796760d34732adc7ec52a8feb057515209a2ca6",
+ }),
+ ("https://www.xvideos.com/profiles/niwehrwhernvh", {
+ "exception": exception.NotFoundError,
+ }),
+ ("https://www.xvideos.com/profiles/pervertedcouple#_tabPhotos"),
+ )
+
+ def __init__(self, match):
+ XvideosExtractor.__init__(self, match)
+ self.user = match.group(1)
+
+ def items(self):
+ url = "{}/profiles/{}".format(self.root, self.user)
+ page = self.get_page(url)
+ data = json.loads(text.extract(
+ page, "xv.conf=", ";</script>")[0])["data"]
+
+ if not isinstance(data["galleries"], dict):
+ return
+ if "0" in data["galleries"]:
+ del data["galleries"]["0"]
+
+ galleries = [
+ {
+ "gallery_id": text.parse_int(gid),
+ "title": text.unescape(gdata["title"]),
+ "count": gdata["nb_pics"],
+ "_extractor": XvideosGalleryExtractor,
+ }
+ for gid, gdata in data["galleries"].items()
+ ]
+ galleries.sort(key=lambda x: x["gallery_id"])
+
+ yield Message.Version, 1
+ for gallery in galleries:
+ url = "https://www.xvideos.com/profiles/{}/photos/{}".format(
+ self.user, gallery["gallery_id"])
+ yield Message.Queue, url, gallery
diff --git a/gallery_dl/extractor/yandere.py b/gallery_dl/extractor/yandere.py
new file mode 100644
index 0000000..623e7a8
--- /dev/null
+++ b/gallery_dl/extractor/yandere.py
@@ -0,0 +1,68 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://yande.re/"""
+
+from . import booru
+
+
+class YandereExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
+ """Base class for yandere extractors"""
+ category = "yandere"
+ api_url = "https://yande.re/post.json"
+ post_url = "https://yande.re/post/show/{}"
+
+
+class YandereTagExtractor(booru.TagMixin, YandereExtractor):
+ """Extractor for images from yande.re based on search-tags"""
+ pattern = (r"(?:https?://)?(?:www\.)?yande\.re"
+ r"/post\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)")
+ test = ("https://yande.re/post?tags=ouzoku+armor", {
+ "content": "59201811c728096b2d95ce6896fd0009235fe683",
+ })
+
+
+class YanderePoolExtractor(booru.PoolMixin, YandereExtractor):
+ """Extractor for image-pools from yande.re"""
+ pattern = r"(?:https?://)?(?:www\.)?yande\.re/pool/show/(?P<pool>\d+)"
+ test = ("https://yande.re/pool/show/318", {
+ "content": "2a35b9d6edecce11cc2918c6dce4de2198342b68",
+ })
+
+
+class YanderePostExtractor(booru.PostMixin, YandereExtractor):
+ """Extractor for single images from yande.re"""
+ pattern = r"(?:https?://)?(?:www\.)?yande\.re/post/show/(?P<post>\d+)"
+ test = ("https://yande.re/post/show/51824", {
+ "content": "59201811c728096b2d95ce6896fd0009235fe683",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "sasaki_tamaru",
+ "tags_circle": "softhouse_chara",
+ "tags_copyright": "ouzoku",
+ "tags_general": str,
+ },
+ })
+
+
+class YanderePopularExtractor(booru.MoebooruPopularMixin, YandereExtractor):
+ """Extractor for popular images from yande.re"""
+ pattern = (r"(?:https?://)?(?:www\.)?yande\.re"
+ r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)"
+ r"(?:\?(?P<query>[^#]*))?")
+ test = (
+ ("https://yande.re/post/popular_by_month?month=6&year=2014", {
+ "count": 40,
+ }),
+ ("https://yande.re/post/popular_recent"),
+ )
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.api_url = "https://yande.re/post/popular_{scale}.json".format(
+ scale=self.scale)
diff --git a/gallery_dl/extractor/yaplog.py b/gallery_dl/extractor/yaplog.py
new file mode 100644
index 0000000..b3c5501
--- /dev/null
+++ b/gallery_dl/extractor/yaplog.py
@@ -0,0 +1,109 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://yaplog.jp/"""
+
+from .common import Extractor, Message, AsynchronousMixin
+from .. import text, util
+
+
+class YaplogExtractor(AsynchronousMixin, Extractor):
+ """Base class for yaplog extractors"""
+ category = "yaplog"
+ root = "https://yaplog.jp"
+ filename_fmt = "{post[id]}_{post[title]}_{id}.{extension}"
+ directory_fmt = ("{category}", "{post[user]}")
+ archive_fmt = "{post[user]}_{id}"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user = match.group(1)
+
+ def items(self):
+ yield Message.Version, 1
+ for post, urls in self.posts():
+ yield Message.Directory, {"post": post}
+ for num, url in enumerate(urls, 1):
+ page = self.request(url).text if num > 1 else url
+ iurl = text.extract(page, '<img src="', '"')[0]
+ iid, _, ext = iurl.rpartition("/")[2].rpartition(".")
+ image = {
+ "url" : iurl,
+ "num" : num,
+ "id" : text.parse_int(iid.partition("_")[0]),
+ "extension": ext,
+ "post" : post,
+ }
+ yield Message.Url, iurl, image
+
+ def posts(self):
+ """Return an iterable with (data, image page URLs) tuples"""
+
+ def _parse_post(self, url):
+ page = self.request(url).text
+ title, pos = text.extract(page, 'class="title">', '<')
+ date , pos = text.extract(page, 'class="date">' , '<', pos)
+ pid , pos = text.extract(page, '/archive/' , '"', pos)
+ prev , pos = text.extract(page, 'class="last"><a href="', '"', pos)
+
+ urls = list(text.extract_iter(page, '<li><a href="', '"', pos))
+ urls[0] = page # cache HTML of first page
+
+ if len(urls) == 24 and text.extract(page, '(1/', ')')[0] != '24':
+ # an /image/ page lists at most 24 image entries
+ # -> collect the remaining ones from the /archive/ page
+ url = "{}/{}/archive/{}".format(self.root, self.user, pid)
+ page = self.request(url).text
+
+ base = "{}/{}/image/{}/".format(self.root, self.user, pid)
+ for part in util.advance(text.extract_iter(
+ page, base, '"', pos), 24):
+ urls.append(base + part)
+
+ return prev, urls, {
+ "id" : text.parse_int(pid),
+ "title": text.unescape(title[:-3]),
+ "user" : self.user,
+ "date" : date,
+ }
+
+
+class YaplogBlogExtractor(YaplogExtractor):
+ """Extractor for a user's blog on yaplog.jp"""
+ subcategory = "blog"
+ pattern = r"(?:https?://)?(?:www\.)?yaplog\.jp/(\w+)/?(?:$|[?&#])"
+ test = ("https://yaplog.jp/omitakashi3", {
+ "pattern": r"https://img.yaplog.jp/img/18/pc/o/m/i/omitakashi3/0/",
+ "count": ">= 2",
+ })
+
+ def posts(self):
+ url = "{}/{}/image/".format(self.root, self.user)
+ while url:
+ url, images, data = self._parse_post(url)
+ yield data, images
+
+
+class YaplogPostExtractor(YaplogExtractor):
+ """Extractor for images from a blog post on yaplog.jp"""
+ subcategory = "post"
+ pattern = (r"(?:https?://)?(?:www\.)?yaplog\.jp"
+ r"/(\w+)/(?:archive|image)/(\d+)")
+ test = ("https://yaplog.jp/imamiami0726/image/1299", {
+ "url": "896cae20fa718735a57e723c48544e830ff31345",
+ "keyword": "f8d8781e61c4c38238a7622d6df6c905f864e5d3",
+ })
+
+ def __init__(self, match):
+ YaplogExtractor.__init__(self, match)
+ self.post_id = match.group(2)
+
+ def posts(self):
+ url = "{}/{}/image/{}".format(self.root, self.user, self.post_id)
+ _, images, data = self._parse_post(url)
+ return ((data, images),)
diff --git a/gallery_dl/extractor/yuki.py b/gallery_dl/extractor/yuki.py
new file mode 100644
index 0000000..0844c40
--- /dev/null
+++ b/gallery_dl/extractor/yuki.py
@@ -0,0 +1,125 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://yuki.la/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class YukiThreadExtractor(Extractor):
+ """Extractor for images from threads on yuki.la"""
+ category = "yuki"
+ subcategory = "thread"
+ directory_fmt = ("{category}", "{board}", "{thread}{title:? - //}")
+ filename_fmt = "{time}-{filename}.{extension}"
+ archive_fmt = "{board}_{thread}_{tim}"
+ pattern = r"(?:https?://)?yuki\.la/([^/?&#]+)/(\d+)"
+ test = (
+ ("https://yuki.la/gd/309639", {
+ "url": "289e86c5caf673a2515ec5f5f521ac0ae7e189e9",
+ "keyword": "01cbe29ae207a5cb7556bcbd5ed481ecdaf32727",
+ "content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573",
+ }),
+ ("https://yuki.la/a/159767162", {
+ "url": "cd94d0eb646d279c3b7efb9b7898888e5d44fa93",
+ "keyword": "7a4ff90e423c74bd3126fb65d13015decec2fa45",
+ }),
+ # old thread - missing board name in title and multi-line HTML
+ ("https://yuki.la/gif/6877752", {
+ "url": "3dbb2f8453490d002416c5fc2fe95b56c129faf9",
+ "keyword": "563ef4ae80134d845dddaed7ebe56f5fc41056be",
+ }),
+ # even older thread - no thread title
+ ("https://yuki.la/a/9357051", {
+ "url": "010560bf254bd485e48366c3531728bda4b22583",
+ "keyword": "7b736c41e307dcfcb84ef495f29299a6ddd06d67",
+ }),
+ )
+ root = "https://yuki.la"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.board, self.thread = match.groups()
+
+ def items(self):
+ url = "{}/{}/{}".format(self.root, self.board, self.thread)
+ page = self.request(url).text
+ data = self.get_metadata(page)
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for post in self.posts(page):
+ if "image" in post:
+ for key in ("w", "h", "no", "time"):
+ post[key] = text.parse_int(post[key])
+ post.update(data)
+ yield Message.Url, post["image"], post
+
+ def get_metadata(self, page):
+ """Collect metadata for extractor-job"""
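+ # the <title> consists of several " - "-separated parts; very old
+ # threads may lack the board name or even the thread title (see tests)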
+ title = text.extract(page, "<title>", "</title>")[0]
+ try:
+ title, boardname, _ = title.rsplit(" - ", 2)
+ except ValueError:
+ title = boardname = ""
+ else:
+ title = title.partition(" - ")[2]
+ if not title:
+ title, boardname = boardname, ""
+ return {
+ "board": self.board,
+ "board_name": boardname,
+ "thread": text.parse_int(self.thread),
+ "title": text.unescape(title),
+ }
+
+ def posts(self, page):
+ """Build a list of all post-objects"""
+ return [
+ self.parse(post) for post in text.extract_iter(
+ page, '<div class="postContainer', '</blockquote>')
+ ]
+
+ def parse(self, post):
+ """Build post-object by extracting data from an HTML post"""
+ data = self._extract_post(post)
+ if 'class="file"' in post:
+ self._extract_image(post, data)
+ part = data["image"].rpartition("/")[2]
+ data["tim"], _, data["extension"] = part.partition(".")
+ data["ext"] = "." + data["extension"]
+ return data
+
+ @staticmethod
+ def _extract_post(post):
+ data, pos = text.extract_all(post, (
+ ("no" , 'id="pc', '"'),
+ ("name", '<span class="name">', '</span>'),
+ ("time", 'data-utc="', '"'),
+ ("now" , '>', ' <'),
+ ))
+ data["com"] = text.unescape(text.remove_html(
+ post[post.index("<blockquote ", pos):].partition(">")[2]))
+ return data
+
+ @staticmethod
+ def _extract_image(post, data):
+ text.extract_all(post, (
+ (None , '>File:', ''),
+ ("fullname", '<a title="', '"'),
+ ("image" , 'href="', '"'),
+ ("filename", '>', '<'),
+ ("fsize" , '(', ', '),
+ ("w" , '', 'x'),
+ ("h" , '', ')'),
+ ), 0, data)
+ filename = data["fullname"] or data["filename"]
+ data["filename"] = text.unescape(filename.rpartition(".")[0])
+ data["image"] = "https:" + data["image"]
+ del data["fullname"]
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
new file mode 100644
index 0000000..667b9b3
--- /dev/null
+++ b/gallery_dl/job.py
@@ -0,0 +1,492 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import sys
+import time
+import logging
+from . import extractor, downloader, postprocessor
+from . import config, text, util, output, exception
+from .extractor.message import Message
+
+
+class Job():
+ """Base class for Job-types"""
+ ulog = None
+
+ def __init__(self, extr, parent=None):
+ if isinstance(extr, str):
+ extr = extractor.find(extr)
+ if not extr:
+ raise exception.NoExtractorError()
+
+ self.extractor = extr
+ extr.log.extractor = extr
+ extr.log.job = self
+ extr.log.debug("Using %s for '%s'", extr.__class__.__name__, extr.url)
+
+ # url predicates
+ self.pred_url = self._prepare_predicates(
+ "image", [util.UniquePredicate()], True)
+
+ # queue predicates
+ self.pred_queue = self._prepare_predicates(
+ "chapter", [], False)
+
+ # category transfer
+ if parent and parent.extractor.config(
+ "category-transfer", parent.extractor.categorytransfer):
+ self.extractor.category = parent.extractor.category
+ self.extractor.subcategory = parent.extractor.subcategory
+
+ # user-supplied metadata
+ self.userkwds = self.extractor.config("keywords")
+
+ def run(self):
+ """Run the extractor and dispatch its messages"""
+ try:
+ log = self.extractor.log
+ for msg in self.extractor:
+ self.dispatch(msg)
+ except exception.AuthenticationError as exc:
+ msg = str(exc) or "Please provide a valid username/password pair."
+ log.error("Authentication failed: %s", msg)
+ except exception.AuthorizationError:
+ log.error("You do not have permission to access the resource "
+ "at '%s'", self.extractor.url)
+ except exception.NotFoundError as exc:
+ res = str(exc) or "resource (gallery/image/user)"
+ log.error("The %s at '%s' does not exist", res, self.extractor.url)
+ except exception.HttpError as exc:
+ err = exc.args[0]
+ if isinstance(err, Exception):
+ err = "{}: {}".format(err.__class__.__name__, err)
+ log.error("HTTP request failed: %s", err)
+ except exception.FormatError as exc:
+ err, obj = exc.args
+ log.error("Applying %s format string failed: %s: %s",
+ obj, err.__class__.__name__, err)
+ except exception.FilterError as exc:
+ err = exc.args[0]
+ log.error("Evaluating filter expression failed: %s: %s",
+ err.__class__.__name__, err)
+ except exception.StopExtraction:
+ pass
+ except OSError as exc:
+ log.error("Unable to download data: %s: %s",
+ exc.__class__.__name__, exc)
+ log.debug("", exc_info=True)
+ except Exception as exc:
+ log.error(("An unexpected error occurred: %s - %s. "
+ "Please run gallery-dl again with the --verbose flag, "
+ "copy its output and report this issue on "
+ "https://github.com/mikf/gallery-dl/issues ."),
+ exc.__class__.__name__, exc)
+ log.debug("", exc_info=True)
+ self.handle_finalize()
+
+ def dispatch(self, msg):
+ """Call the appropriate message handler"""
+ if msg[0] == Message.Url:
+ _, url, kwds = msg
+ if self.pred_url(url, kwds):
+ self.update_kwdict(kwds)
+ self.handle_url(url, kwds)
+
+ elif msg[0] == Message.Directory:
+ self.update_kwdict(msg[1])
+ self.handle_directory(msg[1])
+
+ elif msg[0] == Message.Queue:
+ _, url, kwds = msg
+ if self.pred_queue(url, kwds):
+ self.handle_queue(url, kwds)
+
+ elif msg[0] == Message.Urllist:
+ _, urls, kwds = msg
+ if self.pred_url(urls[0], kwds):
+ self.update_kwdict(kwds)
+ self.handle_urllist(urls, kwds)
+
+ elif msg[0] == Message.Version:
+ if msg[1] != 1:
+ raise ValueError("unsupported message-version ({}, {})".format(
+ self.extractor.category, msg[1]))
+ # TODO: support for multiple message versions
+
+ def handle_url(self, url, keywords):
+ """Handle Message.Url"""
+
+ def handle_urllist(self, urls, keywords):
+ """Handle Message.Urllist"""
+ self.handle_url(urls[0], keywords)
+
+ def handle_directory(self, keywords):
+ """Handle Message.Directory"""
+
+ def handle_queue(self, url, keywords):
+ """Handle Message.Queue"""
+
+ def handle_finalize(self):
+ """Handle job finalization"""
+
+ def update_kwdict(self, kwdict):
+ """Update 'kwdict' with additional metadata"""
+ kwdict["category"] = self.extractor.category
+ kwdict["subcategory"] = self.extractor.subcategory
+ if self.userkwds:
+ kwdict.update(self.userkwds)
+
+ def _prepare_predicates(self, target, predicates, skip=True):
+ pfilter = self.extractor.config(target + "-filter")
+ if pfilter:
+ try:
+ pred = util.FilterPredicate(pfilter, target)
+ except (SyntaxError, ValueError, TypeError) as exc:
+ self.extractor.log.warning(exc)
+ else:
+ predicates.append(pred)
+
+ prange = self.extractor.config(target + "-range")
+ if prange:
+ try:
+ pred = util.RangePredicate(prange)
+ except ValueError as exc:
+ self.extractor.log.warning(
+ "invalid %s range: %s", target, exc)
+ else:
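+ # when a range starts above 1 and no filter is set,
+ # let the extractor skip ahead to the first wanted file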
+ if skip and pred.lower > 1 and not pfilter:
+ pred.index += self.extractor.skip(pred.lower - 1)
+ predicates.append(pred)
+
+ return util.build_predicate(predicates)
+
+ def _write_unsupported(self, url):
+ if self.ulog:
+ self.ulog.info(url)
+
+ @staticmethod
+ def _filter(kwdict):
+ """Return a copy of 'kwdict' without "private" entries"""
+ return {k: v for k, v in kwdict.items() if k[0] != "_"}
+
+
+class DownloadJob(Job):
+ """Download images into appropriate directory/filename locations"""
+
+ def __init__(self, url, parent=None):
+ Job.__init__(self, url, parent)
+ self.log = logging.getLogger("download")
+ self.pathfmt = None
+ self.archive = None
+ self.sleep = None
+ self.downloaders = {}
+ self.postprocessors = None
+ self.out = output.select()
+
+ def handle_url(self, url, keywords, fallback=None):
+ """Download the resource specified in 'url'"""
+ # prepare download
+ self.pathfmt.set_keywords(keywords)
+
+ if self.postprocessors:
+ for pp in self.postprocessors:
+ pp.prepare(self.pathfmt)
+
+ if self.pathfmt.exists(self.archive):
+ self.handle_skip()
+ return
+
+ if self.sleep:
+ time.sleep(self.sleep)
+
+ # download from URL
+ if not self.download(url):
+
+ # use fallback URLs if available
+ for num, url in enumerate(fallback or (), 1):
+ self.log.info("Trying fallback URL #%d", num)
+ if self.download(url):
+ break
+ else:
+ # download failed
+ self.log.error(
+ "Failed to download %s", self.pathfmt.filename or url)
+ return
+
+ if not self.pathfmt.temppath:
+ self.handle_skip()
+ return
+
+ # run post processors
+ if self.postprocessors:
+ for pp in self.postprocessors:
+ pp.run(self.pathfmt)
+
+ # download succeeded
+ self.pathfmt.finalize()
+ self.out.success(self.pathfmt.path, 0)
+ if self.archive:
+ self.archive.add(keywords)
+ self._skipcnt = 0
+
+ def handle_urllist(self, urls, keywords):
+ """Download the resource specified in 'url'"""
+ fallback = iter(urls)
+ url = next(fallback)
+ self.handle_url(url, keywords, fallback)
+
+ def handle_directory(self, keywords):
+ """Set and create the target directory for downloads"""
+ if not self.pathfmt:
+ self.initialize(keywords)
+ else:
+ self.pathfmt.set_directory(keywords)
+
+ def handle_queue(self, url, keywords):
+ if "_extractor" in keywords:
+ extr = keywords["_extractor"].from_url(url)
+ else:
+ extr = extractor.find(url)
+ if extr:
+ self.__class__(extr, self).run()
+ else:
+ self._write_unsupported(url)
+
+ def handle_finalize(self):
+ if self.postprocessors:
+ for pp in self.postprocessors:
+ pp.finalize()
+
+ def handle_skip(self):
+ self.out.skip(self.pathfmt.path)
+ if self._skipexc:
+ self._skipcnt += 1
+ if self._skipcnt >= self._skipmax:
+ raise self._skipexc()
+
+ def download(self, url):
+ """Download 'url'"""
+ scheme = url.partition(":")[0]
+ downloader = self.get_downloader(scheme)
+ if downloader:
+ return downloader.download(url, self.pathfmt)
+ self._write_unsupported(url)
+ return False
+
+ def get_downloader(self, scheme):
+ """Return a downloader suitable for 'scheme'"""
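+ # 'http' and 'https' URLs share the same downloader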
+ if scheme == "https":
+ scheme = "http"
+ try:
+ return self.downloaders[scheme]
+ except KeyError:
+ pass
+
+ klass = downloader.find(scheme)
+ if klass and config.get(("downloader", scheme, "enabled"), True):
+ instance = klass(self.extractor, self.out)
+ else:
+ instance = None
+ self.log.error("'%s:' URLs are not supported/enabled", scheme)
+ self.downloaders[scheme] = instance
+ return instance
+
+ def initialize(self, keywords=None):
+ """Delayed initialization of PathFormat, etc."""
+ self.pathfmt = util.PathFormat(self.extractor)
+ if keywords:
+ self.pathfmt.set_directory(keywords)
+ self.sleep = self.extractor.config("sleep")
+
+ skip = self.extractor.config("skip", True)
+ if skip:
+ self._skipexc = None
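+ # 'skip' may also be a string like "abort:3" or "exit:5" to stop
+ # extraction or exit after that many consecutive skipped files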
+ if isinstance(skip, str):
+ skip, _, smax = skip.partition(":")
+ if skip == "abort":
+ self._skipexc = exception.StopExtraction
+ elif skip == "exit":
+ self._skipexc = sys.exit
+ self._skipcnt = 0
+ self._skipmax = text.parse_int(smax)
+ else:
+ self.pathfmt.exists = lambda x=None: False
+
+ archive = self.extractor.config("archive")
+ if archive:
+ path = util.expand_path(archive)
+ self.archive = util.DownloadArchive(path, self.extractor)
+
+ postprocessors = self.extractor.config("postprocessors")
+ if postprocessors:
+ self.postprocessors = []
+ for pp_dict in postprocessors:
+ whitelist = pp_dict.get("whitelist")
+ blacklist = pp_dict.get("blacklist")
+ if (whitelist and self.extractor.category not in whitelist or
+ blacklist and self.extractor.category in blacklist):
+ continue
+ name = pp_dict.get("name")
+ pp_cls = postprocessor.find(name)
+ if not pp_cls:
+ postprocessor.log.warning("module '%s' not found", name)
+ continue
+ try:
+ pp_obj = pp_cls(self.pathfmt, pp_dict)
+ except Exception as exc:
+ postprocessor.log.error(
+ "'%s' initialization failed: %s: %s",
+ name, exc.__class__.__name__, exc)
+ else:
+ self.postprocessors.append(pp_obj)
+ self.extractor.log.debug(
+ "Active postprocessor modules: %s", self.postprocessors)
+
+
+class SimulationJob(DownloadJob):
+ """Simulate the extraction process without downloading anything"""
+
+ def handle_url(self, url, keywords, fallback=None):
+ self.pathfmt.set_keywords(keywords)
+ self.out.skip(self.pathfmt.path)
+ if self.sleep:
+ time.sleep(self.sleep)
+ if self.archive:
+ self.archive.add(keywords)
+
+ def handle_directory(self, keywords):
+ if not self.pathfmt:
+ self.initialize()
+
+
+class KeywordJob(Job):
+ """Print available keywords"""
+
+ def handle_url(self, url, keywords):
+ print("\nKeywords for filenames and --filter:")
+ print("------------------------------------")
+ self.print_keywords(keywords)
+ raise exception.StopExtraction()
+
+ def handle_directory(self, keywords):
+ print("Keywords for directory names:")
+ print("-----------------------------")
+ self.print_keywords(keywords)
+
+ def handle_queue(self, url, keywords):
+ if not keywords:
+ self.extractor.log.info(
+ "This extractor delegates work to other extractors "
+ "and does not provide any keywords on its own. Try "
+ "'gallery-dl -K \"%s\"' instead.", url)
+ else:
+ print("Keywords for --chapter-filter:")
+ print("------------------------------")
+ self.print_keywords(keywords)
+ if self.extractor.categorytransfer:
+ print()
+ KeywordJob(url, self).run()
+ raise exception.StopExtraction()
+
+ @staticmethod
+ def print_keywords(keywords, prefix=""):
+ """Print key-value pairs with formatting"""
+ suffix = "]" if prefix else ""
+ for key, value in sorted(keywords.items()):
+ if key[0] == "_":
+ continue
+ key = prefix + key + suffix
+
+ if isinstance(value, dict):
+ KeywordJob.print_keywords(value, key + "[")
+
+ elif isinstance(value, list):
+ if value and isinstance(value[0], dict):
+ KeywordJob.print_keywords(value[0], key + "[][")
+ else:
+ print(key, "[]", sep="")
+ for val in value:
+ print(" -", val)
+
+ else:
+ # string or number
+ print(key, "\n ", value, sep="")
+
+
+class UrlJob(Job):
+ """Print download URLs"""
+ maxdepth = 1
+
+ def __init__(self, url, parent=None, depth=1):
+ Job.__init__(self, url, parent)
+ self.depth = depth
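+ # at maximum depth, print queued URLs instead of descending into them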
+ if depth >= self.maxdepth:
+ self.handle_queue = self.handle_url
+
+ @staticmethod
+ def handle_url(url, _):
+ print(url)
+
+ @staticmethod
+ def handle_urllist(urls, _):
+ prefix = ""
+ for url in urls:
+ print(prefix, url, sep="")
+ prefix = "| "
+
+ def handle_queue(self, url, _):
+ try:
+ UrlJob(url, self, self.depth + 1).run()
+ except exception.NoExtractorError:
+ self._write_unsupported(url)
+
+
+class DataJob(Job):
+ """Collect extractor results and dump them"""
+
+ def __init__(self, url, parent=None, file=sys.stdout, ensure_ascii=True):
+ Job.__init__(self, url, parent)
+ self.file = file
+ self.data = []
+ self.ascii = config.get(("output", "ascii"), ensure_ascii)
+
+ def run(self):
+ # collect data
+ try:
+ for msg in self.extractor:
+ self.dispatch(msg)
+ except exception.StopExtraction:
+ pass
+ except Exception as exc:
+ self.data.append((exc.__class__.__name__, str(exc)))
+ except BaseException:
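+ # ignore KeyboardInterrupt and the like,
+ # but still dump the data collected so far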
+ pass
+
+ # convert numbers to string
+ if config.get(("output", "num-to-str"), False):
+ for msg in self.data:
+ util.transform_dict(msg[-1], util.number_to_string)
+
+ # dump to 'file'
+ util.dump_json(self.data, self.file, self.ascii, 2)
+
+ def handle_url(self, url, kwdict):
+ self.data.append((Message.Url, url, self._filter(kwdict)))
+
+ def handle_urllist(self, urls, kwdict):
+ self.data.append((Message.Urllist, list(urls), self._filter(kwdict)))
+
+ def handle_directory(self, kwdict):
+ self.data.append((Message.Directory, self._filter(kwdict)))
+
+ def handle_queue(self, url, kwdict):
+ self.data.append((Message.Queue, url, self._filter(kwdict)))
+
+ def handle_finalize(self):
+ self.file.close()
diff --git a/gallery_dl/oauth.py b/gallery_dl/oauth.py
new file mode 100644
index 0000000..58126ac
--- /dev/null
+++ b/gallery_dl/oauth.py
@@ -0,0 +1,132 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""OAuth helper functions and classes"""
+
+import hmac
+import time
+import base64
+import random
+import string
+import hashlib
+import urllib.parse
+
+import requests
+import requests.auth
+
+from . import text
+
+
+def nonce(size, alphabet=string.ascii_letters):
+ """Generate a nonce value with 'size' characters"""
+ return "".join(random.choice(alphabet) for _ in range(size))
+
+
+def quote(value, quote=urllib.parse.quote):
+ """Quote 'value' according to the OAuth1.0 standard"""
+ return quote(value, "~")
+
+
+def concat(*args):
+ """Concatenate 'args' as expected by OAuth1.0"""
+ return "&".join(quote(item) for item in args)
+
+
+class OAuth1Session(requests.Session):
+ """Extension to requests.Session to support OAuth 1.0"""
+
+ def __init__(self, consumer_key, consumer_secret,
+ token=None, token_secret=None):
+
+ requests.Session.__init__(self)
+ self.auth = OAuth1Client(
+ consumer_key, consumer_secret,
+ token, token_secret,
+ )
+
+ def rebuild_auth(self, prepared_request, response):
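+ # called by requests when following redirects:
+ # drop the stale signature and re-sign the redirected request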
+ if "Authorization" in prepared_request.headers:
+ del prepared_request.headers["Authorization"]
+ prepared_request.prepare_auth(self.auth)
+
+
+class OAuth1Client(requests.auth.AuthBase):
+ """OAuth1.0a authentication"""
+
+ def __init__(self, consumer_key, consumer_secret,
+ token=None, token_secret=None):
+
+ self.consumer_key = consumer_key
+ self.consumer_secret = consumer_secret
+ self.token = token
+ self.token_secret = token_secret
+
+ def __call__(self, request):
+ oauth_params = [
+ ("oauth_consumer_key", self.consumer_key),
+ ("oauth_nonce", nonce(16)),
+ ("oauth_signature_method", "HMAC-SHA1"),
+ ("oauth_timestamp", str(int(time.time()))),
+ ("oauth_version", "1.0"),
+ ]
+ if self.token:
+ oauth_params.append(("oauth_token", self.token))
+
+ signature = self.generate_signature(request, oauth_params)
+ oauth_params.append(("oauth_signature", signature))
+
+ request.headers["Authorization"] = "OAuth " + ",".join(
+ key + '="' + value + '"' for key, value in oauth_params)
+
+ return request
+
+ def generate_signature(self, request, params):
+ """Generate 'oauth_signature' value"""
+ url, _, query = request.url.partition("?")
+
+ params = params.copy()
+ for key, value in text.parse_query(query).items():
+ params.append((quote(key), quote(value)))
+ params.sort()
+ query = "&".join("=".join(item) for item in params)
+
+ message = concat(request.method, url, query).encode()
+ key = concat(self.consumer_secret, self.token_secret or "").encode()
+ signature = hmac.new(key, message, hashlib.sha1).digest()
+
+ return quote(base64.b64encode(signature).decode())
+
+
+class OAuth1API():
+ """Base class for OAuth1.0 based API interfaces"""
+ API_KEY = None
+ API_SECRET = None
+
+ def __init__(self, extractor):
+ self.log = extractor.log
+ self.extractor = extractor
+
+ api_key = extractor.config("api-key", self.API_KEY)
+ api_secret = extractor.config("api-secret", self.API_SECRET)
+ token = extractor.config("access-token")
+ token_secret = extractor.config("access-token-secret")
+
+ if api_key and api_secret and token and token_secret:
+ self.log.debug("Using OAuth1.0 authentication")
+ self.session = OAuth1Session(
+ api_key, api_secret, token, token_secret)
+ self.api_key = None
+ else:
+ self.log.debug("Using api_key authentication")
+ self.session = extractor.session
+ self.api_key = api_key
+
+ def request(self, url, method="GET", *, expect=range(400, 500), **kwargs):
+ kwargs["expect"] = expect
+ kwargs["session"] = self.session
+ return self.extractor.request(url, method, **kwargs)
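
A standalone sketch of the signing scheme implemented by OAuth1Client.generate_signature(): HMAC-SHA1 over the percent-encoded "method&url&params" base string, keyed by consumer secret and token secret. All key and parameter values below are made-up placeholders:

    import base64
    import hashlib
    import hmac
    import urllib.parse

    def pct(value):
        # percent-encode as required by OAuth1.0 (same as quote() above)
        return urllib.parse.quote(value, "~")

    method, url = "GET", "https://api.example.org/resource"
    params = [
        ("oauth_consumer_key", "consumer-key"),
        ("oauth_nonce", "abcdefgh"),
        ("oauth_signature_method", "HMAC-SHA1"),
        ("oauth_timestamp", "1500000000"),
        ("oauth_version", "1.0"),
    ]
    params.sort()
    query = "&".join("=".join(item) for item in params)
    base_string = "&".join(pct(part) for part in (method, url, query))
    key = pct("consumer-secret") + "&" + pct("")  # no token secret in this sketch
    digest = hmac.new(key.encode(), base_string.encode(), hashlib.sha1).digest()
    print(pct(base64.b64encode(digest).decode()))  # value of 'oauth_signature'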
diff --git a/gallery_dl/option.py b/gallery_dl/option.py
new file mode 100644
index 0000000..f23b79d
--- /dev/null
+++ b/gallery_dl/option.py
@@ -0,0 +1,304 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Command line option parsing"""
+
+import argparse
+import logging
+import json
+from . import job, version
+
+
+class ConfigAction(argparse.Action):
+ """Set argparse results as config values"""
+ def __call__(self, parser, namespace, values, option_string=None):
+ namespace.options.append(((self.dest,), values))
+
+
+class ConfigConstAction(argparse.Action):
+ """Set argparse const values as config values"""
+ def __call__(self, parser, namespace, values, option_string=None):
+ namespace.options.append(((self.dest,), self.const))
+
+
+class ParseAction(argparse.Action):
+ """Parse <key>=<value> options and set them as config values"""
+ def __call__(self, parser, namespace, values, option_string=None):
+ key, _, value = values.partition("=")
+ try:
+ value = json.loads(value)
+ except ValueError:
+ pass
+ key = key.split(".")
+ namespace.options.append((key, value))
+
+
+class Formatter(argparse.HelpFormatter):
+ """Custom HelpFormatter class to customize help output"""
+ def __init__(self, *args, **kwargs):
+ super().__init__(max_help_position=50, *args, **kwargs)
+
+ def _format_action_invocation(self, action):
+ opts = action.option_strings[:]
+ if opts:
+ if action.nargs != 0:
+ args_string = self._format_args(action, "ARG")
+ opts[-1] += " " + args_string
+ return ', '.join(opts)
+ else:
+ return self._metavar_formatter(action, action.dest)(1)[0]
+
+
+def build_parser():
+ """Build and configure an ArgumentParser object"""
+ parser = argparse.ArgumentParser(
+ usage="%(prog)s [OPTION]... URL...",
+ formatter_class=Formatter,
+ add_help=False,
+ )
+
+ general = parser.add_argument_group("General Options")
+ general.add_argument(
+ "-h", "--help",
+ action="help",
+ help="Print this help message and exit",
+ )
+ general.add_argument(
+ "--version",
+ action="version", version=version.__version__,
+ help="Print program version and exit",
+ )
+ general.add_argument(
+ "-d", "--dest",
+ dest="base-directory", metavar="DEST", action=ConfigAction,
+ help="Destination directory",
+ )
+ general.add_argument(
+ "-i", "--input-file",
+ dest="inputfile", metavar="FILE",
+ help="Download URLs found in FILE ('-' for stdin)",
+ )
+ general.add_argument(
+ "--cookies",
+ dest="cookies", metavar="FILE", action=ConfigAction,
+ help="File to load additional cookies from",
+ )
+ general.add_argument(
+ "--proxy",
+ dest="proxy", metavar="URL", action=ConfigAction,
+ help="Use the specified proxy",
+ )
+ general.add_argument(
+ "--clear-cache",
+ dest="clear_cache", action="store_true",
+ help="Delete all cached login sessions, cookies, etc.",
+ )
+
+ output = parser.add_argument_group("Output Options")
+ output.add_argument(
+ "-q", "--quiet",
+ dest="loglevel", default=logging.INFO,
+ action="store_const", const=logging.ERROR,
+ help="Activate quiet mode",
+ )
+ output.add_argument(
+ "-v", "--verbose",
+ dest="loglevel",
+ action="store_const", const=logging.DEBUG,
+ help="Print various debugging information",
+ )
+ output.add_argument(
+ "-g", "--get-urls",
+ dest="list_urls", action="count",
+ help="Print URLs instead of downloading",
+ )
+ output.add_argument(
+ "-j", "--dump-json",
+ dest="jobtype", action="store_const", const=job.DataJob,
+ help="Print JSON information",
+ )
+ output.add_argument(
+ "-s", "--simulate",
+ dest="jobtype", action="store_const", const=job.SimulationJob,
+ help="Simulate data extraction; do not download anything",
+ )
+ output.add_argument(
+ "-K", "--list-keywords",
+ dest="jobtype", action="store_const", const=job.KeywordJob,
+ help=("Print a list of available keywords and example values "
+ "for the given URLs"),
+ )
+ output.add_argument(
+ "--list-modules",
+ dest="list_modules", action="store_true",
+ help="Print a list of available extractor modules",
+ )
+ output.add_argument(
+ "--list-extractors",
+ dest="list_extractors", action="store_true",
+ help=("Print a list of extractor classes "
+ "with description, (sub)category and example URL"),
+ )
+ output.add_argument(
+ "--write-log",
+ dest="logfile", metavar="FILE", action=ConfigAction,
+ help="Write logging output to FILE",
+ )
+ output.add_argument(
+ "--write-unsupported",
+ dest="unsupportedfile", metavar="FILE", action=ConfigAction,
+ help=("Write URLs, which get emitted by other extractors but cannot "
+ "be handled, to FILE"),
+ )
+
+ downloader = parser.add_argument_group("Downloader Options")
+ downloader.add_argument(
+ "-r", "--limit-rate",
+ dest="rate", metavar="RATE", action=ConfigAction,
+ help="Maximum download rate (e.g. 500k or 2.5M)",
+ )
+ downloader.add_argument(
+ "-R", "--retries",
+ dest="retries", metavar="RETRIES", type=int, action=ConfigAction,
+ help="Number of retries (default: 5)",
+ )
+ downloader.add_argument(
+ "--http-timeout",
+ dest="timeout", metavar="SECONDS", type=float, action=ConfigAction,
+ help="Timeout for HTTP connections (defaut: 30.0)",
+ )
+ downloader.add_argument(
+ "--sleep",
+ dest="sleep", metavar="SECONDS", type=float, action=ConfigAction,
+ help="Number of seconds to sleep before each download",
+ )
+ downloader.add_argument(
+ "--no-part",
+ dest="part", nargs=0, action=ConfigConstAction, const=False,
+ help="Do not use .part files",
+ )
+ downloader.add_argument(
+ "--no-check-certificate",
+ dest="verify", nargs=0, action=ConfigConstAction, const=False,
+ help="Disable HTTPS certificate validation",
+ )
+ downloader.add_argument(
+ "--abort-on-skip",
+ dest="skip", nargs=0, action=ConfigConstAction, const="abort",
+ help=("Abort extractor run if a file download would normally be "
+ "skipped, i.e. if a file with the same filename already exists"),
+ )
+
+ configuration = parser.add_argument_group("Configuration Options")
+ configuration.add_argument(
+ "-c", "--config",
+ dest="cfgfiles", metavar="FILE", action="append",
+ help="Additional configuration files",
+ )
+ configuration.add_argument(
+ "--config-yaml",
+ dest="yamlfiles", metavar="FILE", action="append",
+ help=argparse.SUPPRESS,
+ )
+ configuration.add_argument(
+ "-o", "--option",
+ dest="options", metavar="OPT", action=ParseAction, default=[],
+ help="Additional '<key>=<value>' option values",
+ )
+ configuration.add_argument(
+ "--ignore-config",
+ dest="load_config", action="store_false",
+ help="Do not read the default configuration files",
+ )
+
+ authentication = parser.add_argument_group("Authentication Options")
+ authentication.add_argument(
+ "-u", "--username",
+ dest="username", metavar="USER", action=ConfigAction,
+ help="Username to login with",
+ )
+ authentication.add_argument(
+ "-p", "--password",
+ dest="password", metavar="PASS", action=ConfigAction,
+ help="Password belonging to the given username",
+ )
+ authentication.add_argument(
+ "--netrc",
+ dest="netrc", nargs=0, action=ConfigConstAction, const=True,
+ help="Enable .netrc authentication data",
+ )
+
+ selection = parser.add_argument_group("Selection Options")
+ selection.add_argument(
+ "--download-archive",
+ dest="archive", metavar="FILE", action=ConfigAction,
+ help=("Record all downloaded files in the archive file and "
+ "skip downloading any file already in it."),
+ )
+ selection.add_argument(
+ "--range",
+ dest="image-range", metavar="RANGE", action=ConfigAction,
+ help=("Index-range(s) specifying which images to download. "
+ "For example '5-10' or '1,3-5,10-'"),
+ )
+ selection.add_argument(
+ "--chapter-range",
+ dest="chapter-range", metavar="RANGE", action=ConfigAction,
+ help=("Like '--range', but applies to manga-chapters "
+ "and other delegated URLs"),
+ )
+ selection.add_argument(
+ "--filter",
+ dest="image-filter", metavar="EXPR", action=ConfigAction,
+ help=("Python expression controlling which images to download. "
+ "Files for which the expression evaluates to False are ignored. "
+ "Available keys are the filename-specific ones listed by '-K'. "
+ "Example: --filter \"image_width >= 1000 and "
+ "rating in ('s', 'q')\""),
+ )
+ selection.add_argument(
+ "--chapter-filter",
+ dest="chapter-filter", metavar="EXPR", action=ConfigAction,
+ help=("Like '--filter', but applies to manga-chapters "
+ "and other delegated URLs"),
+ )
+
+ postprocessor = parser.add_argument_group("Post-processing Options")
+ postprocessor.add_argument(
+ "--zip",
+ dest="postprocessors",
+ action="append_const", const={"name": "zip"},
+ help="Store downloaded files in a ZIP archive",
+ )
+ postprocessor.add_argument(
+ "--ugoira-conv",
+ dest="postprocessors",
+ action="append_const", const={"name": "ugoira", "ffmpeg-args": (
+ "-c:v", "libvpx", "-crf", "4", "-b:v", "5000k", "-an")},
+ help="Convert Pixiv Ugoira to WebM (requires FFmpeg)",
+ )
+ postprocessor.add_argument(
+ "--write-metadata",
+ dest="postprocessors",
+ action="append_const", const={"name": "metadata"},
+ help="Write metadata to separate JSON files",
+ )
+ postprocessor.add_argument(
+ "--write-tags",
+ dest="postprocessors",
+ action="append_const", const={"name": "metadata", "mode": "tags"},
+ help="Write image tags to separate text files",
+ )
+
+ parser.add_argument(
+ "urls",
+ metavar="URL", nargs="*",
+ help=argparse.SUPPRESS,
+ )
+
+ return parser
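
A small sketch of the parsed result (URL, directory and option value are placeholders): ConfigAction and ParseAction do not set attributes of their own but append (key-path, value) pairs to 'namespace.options':

    from gallery_dl import option

    parser = option.build_parser()
    args = parser.parse_args(
        ["--dest", "/tmp/downloads", "-o", "skip=false", "https://example.org/"])
    print(args.urls)     # ['https://example.org/']
    print(args.options)  # [(('base-directory',), '/tmp/downloads'), (['skip'], False)]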
diff --git a/gallery_dl/output.py b/gallery_dl/output.py
new file mode 100644
index 0000000..327b69a
--- /dev/null
+++ b/gallery_dl/output.py
@@ -0,0 +1,221 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import os
+import sys
+import shutil
+import logging
+from . import config, util
+
+
+# --------------------------------------------------------------------
+# Logging
+
+LOG_FORMAT = "[{name}][{levelname}] {message}"
+LOG_FORMAT_DATE = "%Y-%m-%d %H:%M:%S"
+LOG_LEVEL = logging.INFO
+
+
+class Logger(logging.Logger):
+ """Custom logger that includes extractor and job info in log records"""
+ extractor = util.NONE
+ job = util.NONE
+
+ def makeRecord(self, name, level, fn, lno, msg, args, exc_info,
+ func=None, extra=None, sinfo=None,
+ factory=logging._logRecordFactory):
+ rv = factory(name, level, fn, lno, msg, args, exc_info, func, sinfo)
+ rv.extractor = self.extractor
+ rv.job = self.job
+ return rv
+
+
+def initialize_logging(loglevel):
+ """Setup basic logging functionality before configfiles have been loaded"""
+ # convert levelnames to lowercase
+ for level in (10, 20, 30, 40, 50):
+ name = logging.getLevelName(level)
+ logging.addLevelName(level, name.lower())
+
+ # register custom Logging class
+ logging.Logger.manager.setLoggerClass(Logger)
+
+ # setup basic logging to stderr
+ formatter = logging.Formatter(LOG_FORMAT, LOG_FORMAT_DATE, "{")
+ handler = logging.StreamHandler()
+ handler.setFormatter(formatter)
+ handler.setLevel(loglevel)
+ root = logging.getLogger()
+ root.setLevel(logging.NOTSET)
+ root.addHandler(handler)
+
+ return logging.getLogger("gallery-dl")
+
+
+def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL):
+ """Setup a new logging handler"""
+ opts = config.interpolate(("output", key))
+ if not opts:
+ return None
+ if not isinstance(opts, dict):
+ opts = {"path": opts}
+
+ path = opts.get("path")
+ mode = opts.get("mode", "w")
+ encoding = opts.get("encoding", "utf-8")
+ try:
+ path = util.expand_path(path)
+ handler = logging.FileHandler(path, mode, encoding)
+ except (OSError, ValueError) as exc:
+ logging.getLogger("gallery-dl").warning(
+ "%s: %s", key, exc)
+ return None
+ except TypeError as exc:
+ logging.getLogger("gallery-dl").warning(
+ "%s: missing or invalid path (%s)", key, exc)
+ return None
+
+ level = opts.get("level", lvl)
+ logfmt = opts.get("format", fmt)
+ datefmt = opts.get("format-date", LOG_FORMAT_DATE)
+ formatter = logging.Formatter(logfmt, datefmt, "{")
+ handler.setFormatter(formatter)
+ handler.setLevel(level)
+
+ return handler
+
+
+def configure_logging_handler(key, handler):
+ """Configure a logging handler"""
+ opts = config.interpolate(("output", key))
+ if not opts:
+ return
+ if isinstance(opts, str):
+ opts = {"format": opts}
+ if handler.level == LOG_LEVEL and "level" in opts:
+ handler.setLevel(opts["level"])
+ if "format" in opts or "format-date" in opts:
+ logfmt = opts.get("format", LOG_FORMAT)
+ datefmt = opts.get("format-date", LOG_FORMAT_DATE)
+ formatter = logging.Formatter(logfmt, datefmt, "{")
+ handler.setFormatter(formatter)
+
+
+# --------------------------------------------------------------------
+# Utility functions
+
+def replace_std_streams(errors="replace"):
+ """Replace standard streams and set their error handlers to 'errors'"""
+ for name in ("stdout", "stdin", "stderr"):
+ stream = getattr(sys, name)
+ setattr(sys, name, stream.__class__(
+ stream.buffer,
+ errors=errors,
+ newline=stream.newlines,
+ line_buffering=stream.line_buffering,
+ ))
+
+
+# --------------------------------------------------------------------
+# Downloader output
+
+def select():
+ """Automatically select a suitable output class"""
+ pdict = {
+ "default": PipeOutput,
+ "pipe": PipeOutput,
+ "term": TerminalOutput,
+ "terminal": TerminalOutput,
+ "color": ColorOutput,
+ "null": NullOutput,
+ }
+ omode = config.get(("output", "mode"), "auto").lower()
+ if omode in pdict:
+ return pdict[omode]()
+ elif omode == "auto":
+ if hasattr(sys.stdout, "isatty") and sys.stdout.isatty():
+ return ColorOutput() if ANSI else TerminalOutput()
+ else:
+ return PipeOutput()
+ else:
+ raise Exception("invalid output mode: " + omode)
+
+
+class NullOutput():
+
+ def start(self, path):
+ """Print a message indicating the start of a download"""
+
+ def skip(self, path):
+ """Print a message indicating that a download has been skipped"""
+
+ def success(self, path, tries):
+ """Print a message indicating the completion of a download"""
+
+
+class PipeOutput(NullOutput):
+
+ def skip(self, path):
+ print(CHAR_SKIP, path, sep="", flush=True)
+
+ def success(self, path, tries):
+ print(path, flush=True)
+
+
+class TerminalOutput(NullOutput):
+
+ def __init__(self):
+ self.short = config.get(("output", "shorten"), True)
+ if self.short:
+ self.width = shutil.get_terminal_size().columns - OFFSET
+
+ def start(self, path):
+ print(self.shorten(" " + path), end="", flush=True)
+
+ def skip(self, path):
+ print(self.shorten(CHAR_SKIP + path))
+
+ def success(self, path, tries):
+ print("\r", self.shorten(CHAR_SUCCESS + path), sep="")
+
+ def shorten(self, txt):
+ """Reduce the length of 'txt' to the width of the terminal"""
+ if self.short and len(txt) > self.width:
+ hwidth = self.width // 2 - OFFSET
+ return "".join((
+ txt[:hwidth-1],
+ CHAR_ELLIPSIES,
+ txt[-hwidth-(self.width % 2):]
+ ))
+ return txt
+
+
+class ColorOutput(TerminalOutput):
+
+ def start(self, path):
+ print(self.shorten(path), end="", flush=True)
+
+ def skip(self, path):
+ print("\033[2m", self.shorten(path), "\033[0m", sep="")
+
+ def success(self, path, tries):
+ print("\r\033[1;32m", self.shorten(path), "\033[0m", sep="")
+
+
+if os.name == "nt":
+ ANSI = os.environ.get("TERM") == "ANSI"
+ OFFSET = 1
+ CHAR_SKIP = "# "
+ CHAR_SUCCESS = "* "
+ CHAR_ELLIPSIES = "..."
+else:
+ ANSI = True
+ OFFSET = 0
+ CHAR_SKIP = "# "
+ CHAR_SUCCESS = "✔ "
+ CHAR_ELLIPSIES = "…"
diff --git a/gallery_dl/postprocessor/__init__.py b/gallery_dl/postprocessor/__init__.py
new file mode 100644
index 0000000..093f8e0
--- /dev/null
+++ b/gallery_dl/postprocessor/__init__.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Post-processing modules"""
+
+import importlib
+import logging
+
+modules = [
+ "classify",
+ "exec",
+ "metadata",
+ "ugoira",
+ "zip",
+]
+
+log = logging.getLogger("postprocessor")
+
+
+def find(name):
+ """Return a postprocessor class with the given name"""
+ try:
+ return _cache[name]
+ except KeyError:
+ klass = None
+ try:
+ if name in modules: # prevent unwanted imports
+ module = importlib.import_module("." + name, __package__)
+ klass = module.__postprocessor__
+ except (ImportError, AttributeError, TypeError):
+ pass
+ _cache[name] = klass
+ return klass
+
+
+# --------------------------------------------------------------------
+# internals
+
+_cache = {}
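
A quick sketch of find(): module imports happen lazily on first lookup and the resulting class (or None for unknown names) is cached:

    from gallery_dl import postprocessor

    cls = postprocessor.find("zip")          # imports gallery_dl.postprocessor.zip
    print(cls.__name__)                      # ZipPP
    print(postprocessor.find("zip") is cls)  # True, served from the cache
    print(postprocessor.find("unknown"))     # None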
diff --git a/gallery_dl/postprocessor/classify.py b/gallery_dl/postprocessor/classify.py
new file mode 100644
index 0000000..62460d3
--- /dev/null
+++ b/gallery_dl/postprocessor/classify.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Categorize files by file extension"""
+
+from .common import PostProcessor
+import os
+
+
+class ClassifyPP(PostProcessor):
+
+ DEFAULT_MAPPING = {
+ "Music" : ("mp3", "aac", "flac", "ogg", "wma", "m4a", "wav"),
+ "Video" : ("flv", "ogv", "avi", "mp4", "mpg", "mpeg", "3gp", "mkv",
+ "webm", "vob", "wmv"),
+ "Pictures" : ("jpg", "jpeg", "png", "gif", "bmp", "svg", "webp"),
+ "Archives" : ("zip", "rar", "7z", "tar", "gz", "bz2"),
+ }
+
+ def __init__(self, pathfmt, options):
+ PostProcessor.__init__(self)
+ mapping = options.get("mapping", self.DEFAULT_MAPPING)
+
+ self.mapping = {
+ ext: directory
+ for directory, exts in mapping.items()
+ for ext in exts
+ }
+
+ def prepare(self, pathfmt):
+ ext = pathfmt.keywords.get("extension")
+
+ if ext in self.mapping:
+ self._dir = pathfmt.realdirectory + os.sep + self.mapping[ext]
+ pathfmt.realpath = self._dir + os.sep + pathfmt.filename
+ else:
+ self._dir = None
+
+ def run(self, pathfmt):
+ if self._dir:
+ os.makedirs(self._dir, exist_ok=True)
+
+
+__postprocessor__ = ClassifyPP
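
The constructor inverts the configured {directory: extensions} mapping so that each file only needs a single dictionary lookup by extension; a plain illustration with a made-up mapping:

    mapping = {"Pictures": ("jpg", "png"), "Archives": ("zip",)}
    inverted = {ext: directory
                for directory, exts in mapping.items()
                for ext in exts}
    print(inverted)  # {'jpg': 'Pictures', 'png': 'Pictures', 'zip': 'Archives'}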
diff --git a/gallery_dl/postprocessor/common.py b/gallery_dl/postprocessor/common.py
new file mode 100644
index 0000000..c642f0f
--- /dev/null
+++ b/gallery_dl/postprocessor/common.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Common classes and constants used by postprocessor modules."""
+
+from . import log
+
+
+class PostProcessor():
+ """Base class for postprocessors"""
+ log = log
+
+ def prepare(self, pathfmt):
+ """ """
+
+ def run(self, pathfmt):
+ """Execute the postprocessor for a file"""
+
+ def finalize(self):
+ """Cleanup"""
diff --git a/gallery_dl/postprocessor/exec.py b/gallery_dl/postprocessor/exec.py
new file mode 100644
index 0000000..c86b480
--- /dev/null
+++ b/gallery_dl/postprocessor/exec.py
@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Execute processes"""
+
+from .common import PostProcessor
+import subprocess
+
+
+class ExecPP(PostProcessor):
+
+ def __init__(self, pathfmt, options):
+ PostProcessor.__init__(self)
+
+ try:
+ self.args = options["command"]
+ self.args[0] # test if 'args' is subscriptable
+ except (KeyError, IndexError, TypeError):
+ raise TypeError("option 'command' must be a non-empty list")
+
+ if options.get("async", False):
+ self._exec = subprocess.Popen
+
+ def run(self, pathfmt):
+ self._exec([
+ arg.format_map(pathfmt.keywords)
+ for arg in self.args
+ ])
+
+ def _exec(self, args):
+ retcode = subprocess.Popen(args).wait()
+ if retcode:
+ self.log.warning(
+ "executing '%s' returned non-zero exit status %d",
+ " ".join(args), retcode)
+
+
+__postprocessor__ = ExecPP
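
Each command argument is passed through str.format_map() with the file's keyword dictionary before execution; a plain illustration (command and keyword names are placeholders):

    args = ["notify-send", "finished {category} {filename}.{extension}"]
    kwdict = {"category": "example", "filename": "image01", "extension": "jpg"}
    print([arg.format_map(kwdict) for arg in args])
    # ['notify-send', 'finished example image01.jpg']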
diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py
new file mode 100644
index 0000000..77be9c7
--- /dev/null
+++ b/gallery_dl/postprocessor/metadata.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Write metadata to JSON files"""
+
+from .common import PostProcessor
+from .. import util
+
+
+class MetadataPP(PostProcessor):
+
+ def __init__(self, pathfmt, options):
+ PostProcessor.__init__(self)
+
+ mode = options.get("mode", "json")
+ ext = "txt"
+
+ if mode == "custom":
+ self.write = self._write_custom
+ self.formatter = util.Formatter(options.get("format"))
+ elif mode == "tags":
+ self.write = self._write_tags
+ else:
+ self.write = self._write_json
+ self.indent = options.get("indent", 4)
+ self.ascii = options.get("ascii", False)
+ ext = "json"
+
+ self.extension = options.get("extension", ext)
+
+ def run(self, pathfmt):
+ path = "{}.{}".format(pathfmt.realpath, self.extension)
+ with open(path, "w", encoding="utf-8") as file:
+ self.write(file, pathfmt)
+
+ def _write_custom(self, file, pathfmt):
+ output = self.formatter.format_map(pathfmt.keywords)
+ file.write(output)
+
+ def _write_tags(self, file, pathfmt):
+ kwds = pathfmt.keywords
+ tags = kwds.get("tags") or kwds.get("tag_string")
+
+ if not tags:
+ return
+
+ if not isinstance(tags, list):
+ taglist = tags.split(", ")
+ if len(taglist) < len(tags) / 16:
+ taglist = tags.split(" ")
+ tags = taglist
+
+ file.write("\n".join(tags))
+ file.write("\n")
+
+ def _write_json(self, file, pathfmt):
+ util.dump_json(pathfmt.keywords, file, self.ascii, self.indent)
+
+
+__postprocessor__ = MetadataPP
diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py
new file mode 100644
index 0000000..bd8c5ad
--- /dev/null
+++ b/gallery_dl/postprocessor/ugoira.py
@@ -0,0 +1,132 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Convert pixiv ugoira to webm"""
+
+from .common import PostProcessor
+from .. import util
+import collections
+import subprocess
+import tempfile
+import zipfile
+import os
+
+
+class UgoiraPP(PostProcessor):
+
+ def __init__(self, pathfmt, options):
+ PostProcessor.__init__(self)
+ self.extension = options.get("extension") or "webm"
+ self.args = options.get("ffmpeg-args") or ()
+ self.twopass = options.get("ffmpeg-twopass", False)
+ self.output = options.get("ffmpeg-output", True)
+ self.delete = not options.get("keep-files", False)
+
+ ffmpeg = options.get("ffmpeg-location")
+ self.ffmpeg = util.expand_path(ffmpeg) if ffmpeg else "ffmpeg"
+
+ rate = options.get("framerate", "auto")
+ if rate != "auto":
+ self.calculate_framerate = lambda _: (None, rate)
+
+ if options.get("libx264-prevent-odd", True):
+ # get last video-codec argument
+ vcodec = None
+ for index, arg in enumerate(self.args):
+ arg, _, stream = arg.partition(":")
+ if arg == "-vcodec" or arg in ("-c", "-codec") and (
+ not stream or stream.partition(":")[0] in ("v", "V")):
+ vcodec = self.args[index + 1]
+ # use filter if libx264/5 is explicitly or implicitly used
+ self.prevent_odd = (
+ vcodec in ("libx264", "libx265") or
+ not vcodec and self.extension.lower() in ("mp4", "mkv"))
+ else:
+ self.prevent_odd = False
+
+ def prepare(self, pathfmt):
+ self._frames = None
+
+ if pathfmt.keywords["extension"] != "zip":
+ return
+
+ if "frames" in pathfmt.keywords:
+ self._frames = pathfmt.keywords["frames"]
+ elif "pixiv_ugoira_frame_data" in pathfmt.keywords:
+ self._frames = pathfmt.keywords["pixiv_ugoira_frame_data"]["data"]
+ else:
+ return
+
+ if self.delete:
+ pathfmt.set_extension(self.extension)
+
+ def run(self, pathfmt):
+ if not self._frames:
+ return
+
+ rate_in, rate_out = self.calculate_framerate(self._frames)
+
+ with tempfile.TemporaryDirectory() as tempdir:
+ # extract frames
+ with zipfile.ZipFile(pathfmt.temppath) as zfile:
+ zfile.extractall(tempdir)
+
+ # write ffconcat file
+ ffconcat = tempdir + "/ffconcat.txt"
+ with open(ffconcat, "w") as file:
+ file.write("ffconcat version 1.0\n")
+ for frame in self._frames:
+ file.write("file '{}'\n".format(frame["file"]))
+ file.write("duration {}\n".format(frame["delay"] / 1000))
+ if self.extension != "gif":
+ # repeat the last frame to prevent it from only being
+ # displayed for a very short amount of time
+ file.write("file '{}'\n".format(self._frames[-1]["file"]))
+
+ # collect command-line arguments
+ args = [self.ffmpeg]
+ if rate_in:
+ args += ["-r", str(rate_in)]
+ args += ["-i", ffconcat]
+ if rate_out:
+ args += ["-r", str(rate_out)]
+ if self.prevent_odd:
+ args += ["-vf", "crop=iw-mod(iw\\,2):ih-mod(ih\\,2)"]
+ if self.args:
+ args += self.args
+ self.log.debug("ffmpeg args: %s", args)
+
+ # invoke ffmpeg
+ pathfmt.set_extension(self.extension)
+ if self.twopass:
+ if "-f" not in args:
+ args += ["-f", self.extension]
+ args += ["-passlogfile", tempdir + "/ffmpeg2pass", "-pass"]
+ self._exec(args + ["1", "-y", os.devnull])
+ self._exec(args + ["2", pathfmt.realpath])
+ else:
+ args.append(pathfmt.realpath)
+ self._exec(args)
+
+ if self.delete:
+ pathfmt.delete = True
+ else:
+ pathfmt.set_extension("zip")
+
+ def _exec(self, args):
+ out = None if self.output else subprocess.DEVNULL
+ return subprocess.Popen(args, stdout=out, stderr=out).wait()
+
+ @staticmethod
+ def calculate_framerate(framelist):
+ counter = collections.Counter(frame["delay"] for frame in framelist)
+ fps = "1000/{}".format(min(counter))
+ return (fps, None) if len(counter) == 1 else (None, fps)
+
+
+__postprocessor__ = UgoiraPP
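
calculate_framerate() counts how often each frame delay occurs: a single uniform delay becomes an exact input framerate (-r before -i), anything else becomes an output framerate. A small sketch with made-up frame data:

    from gallery_dl.postprocessor.ugoira import UgoiraPP

    frames = [{"file": "000000.jpg", "delay": 40},
              {"file": "000001.jpg", "delay": 40}]
    print(UgoiraPP.calculate_framerate(frames))  # ('1000/40', None) - uniform 40ms delays

    frames.append({"file": "000002.jpg", "delay": 80})
    print(UgoiraPP.calculate_framerate(frames))  # (None, '1000/40') - mixed delays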
diff --git a/gallery_dl/postprocessor/zip.py b/gallery_dl/postprocessor/zip.py
new file mode 100644
index 0000000..3a0c323
--- /dev/null
+++ b/gallery_dl/postprocessor/zip.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Store files in ZIP archives"""
+
+from .common import PostProcessor
+import zipfile
+import os
+
+
+class ZipPP(PostProcessor):
+
+ COMPRESSION_ALGORITHMS = {
+ "store": zipfile.ZIP_STORED,
+ "zip": zipfile.ZIP_DEFLATED,
+ "bzip2": zipfile.ZIP_BZIP2,
+ "lzma": zipfile.ZIP_LZMA,
+ }
+
+ def __init__(self, pathfmt, options):
+ PostProcessor.__init__(self)
+ self.delete = not options.get("keep-files", False)
+ self.ext = "." + options.get("extension", "zip")
+ algorithm = options.get("compression", "store")
+ if algorithm not in self.COMPRESSION_ALGORITHMS:
+ self.log.warning(
+ "unknown compression algorithm '%s'; falling back to 'store'",
+ algorithm)
+ algorithm = "store"
+
+ self.path = pathfmt.realdirectory
+ self.zfile = zipfile.ZipFile(
+ self.path + self.ext, "a",
+ self.COMPRESSION_ALGORITHMS[algorithm], True)
+
+ def run(self, pathfmt):
+ # 'NameToInfo' is not officially documented, but it's available
+ # for all supported Python versions and using it directly is a lot
+ # better than calling getinfo()
+ if pathfmt.filename not in self.zfile.NameToInfo:
+ self.zfile.write(pathfmt.temppath, pathfmt.filename)
+ pathfmt.delete = self.delete
+
+ def finalize(self):
+ self.zfile.close()
+
+ if self.delete:
+ try:
+ os.rmdir(self.path)
+ except OSError:
+ pass
+
+ if not self.zfile.NameToInfo:
+ try:
+ os.unlink(self.zfile.filename)
+ except OSError:
+ pass
+
+
+__postprocessor__ = ZipPP
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
new file mode 100644
index 0000000..151fa30
--- /dev/null
+++ b/gallery_dl/text.py
@@ -0,0 +1,278 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Collection of functions that work on strings/text"""
+
+import re
+import html
+import os.path
+import datetime
+import urllib.parse
+
+
+INVALID_XML_CHARS = (
+ "\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07",
+ "\x08", "\x0b", "\x0c", "\x0e", "\x0f", "\x10", "\x11", "\x12",
+ "\x13", "\x14", "\x15", "\x16", "\x17", "\x18", "\x19", "\x1a",
+ "\x1b", "\x1c", "\x1d", "\x1e", "\x1f",
+)
+
+
+def clean_xml(xmldata, repl=""):
+ """Replace/Remove invalid control characters in 'xmldata'"""
+ if not isinstance(xmldata, str):
+ try:
+ xmldata = "".join(xmldata)
+ except TypeError:
+ return ""
+ for char in INVALID_XML_CHARS:
+ if char in xmldata:
+ xmldata = xmldata.replace(char, repl)
+ return xmldata
+
+
+def remove_html(txt):
+ """Remove html-tags from a string"""
+ try:
+ return " ".join(re.sub("<[^>]+>", " ", txt).split())
+ except TypeError:
+ return ""
+
+
+def split_html(txt, sep=None):
+ """Split input string by html-tags"""
+ try:
+ return [
+ x.strip() for x in re.split("<[^>]+>", txt)
+ if x and not x.isspace()
+ ]
+ except TypeError:
+ return []
+
+
+def filename_from_url(url):
+ """Extract the last part of an URL to use as a filename"""
+ try:
+ return urllib.parse.urlsplit(url).path.rpartition("/")[2]
+ except (TypeError, AttributeError):
+ return ""
+
+
+def ext_from_url(url):
+ """Extract the filename extension of an URL"""
+ filename = filename_from_url(url)
+ ext = os.path.splitext(filename)[1]
+ return ext[1:].lower()
+
+
+def nameext_from_url(url, data=None):
+ """Extract the last part of an URL and fill 'data' accordingly"""
+ if data is None:
+ data = {}
+ name = unquote(filename_from_url(url))
+ data["filename"], ext = os.path.splitext(name)
+ data["extension"] = ext[1:].lower()
+ return data
+
+
+def clean_path_windows(path):
+ """Remove illegal characters from a path-segment (Windows)"""
+ try:
+ return re.sub(r'[<>:"\\/|?*]', "_", path)
+ except TypeError:
+ return ""
+
+
+def clean_path_posix(path):
+ """Remove illegal characters from a path-segment (Posix)"""
+ try:
+ return path.replace("/", "_")
+ except AttributeError:
+ return ""
+
+
+def extract(txt, begin, end, pos=0):
+ """Extract the text between 'begin' and 'end' from 'txt'
+
+ Args:
+ txt: String to search in
+ begin: First string to be searched for
+ end: Second string to be searched for after 'begin'
+ pos: Starting position for searches in 'txt'
+
+ Returns:
+ The string between the two search-strings 'begin' and 'end' beginning
+ with position 'pos' in 'txt' as well as the position after 'end'.
+
+ If at least one of 'begin' or 'end' is not found, None and the original
+ value of 'pos' are returned
+
+ Examples:
+ extract("abcde", "b", "d") -> "c" , 4
+ extract("abcde", "b", "d", 3) -> None, 3
+ """
+ try:
+ first = txt.index(begin, pos) + len(begin)
+ last = txt.index(end, first)
+ return txt[first:last], last+len(end)
+ except (ValueError, TypeError, AttributeError):
+ return None, pos
+
+
+def rextract(txt, begin, end, pos=-1):
+ """Like extract(), but search for the last 'begin' before 'pos'"""
+ try:
+ lbeg = len(begin)
+ first = txt.rindex(begin, 0, pos)
+ last = txt.index(end, first + lbeg)
+ return txt[first + lbeg:last], first
+ except (ValueError, TypeError, AttributeError):
+ return None, pos
+
+
+def extract_all(txt, rules, pos=0, values=None):
+ """Calls extract for each rule and returns the result in a dict"""
+ if values is None:
+ values = {}
+ for key, begin, end in rules:
+ result, pos = extract(txt, begin, end, pos)
+ if key:
+ values[key] = result
+ return values, pos
+
+
+def extract_iter(txt, begin, end, pos=0):
+ """Yield values that would be returned by repeated calls of extract()"""
+ index = txt.index
+ lbeg = len(begin)
+ lend = len(end)
+ try:
+ while True:
+ first = index(begin, pos) + lbeg
+ last = index(end, first)
+ pos = last + lend
+ yield txt[first:last]
+ except (ValueError, TypeError, AttributeError):
+ return
+
+
+def extract_from(txt, pos=0, default=""):
+ """Returns a function object that extracts from 'txt'"""
+ def extr(begin, end, index=txt.index, txt=txt):
+ nonlocal pos
+ try:
+ first = index(begin, pos) + len(begin)
+ last = index(end, first)
+ pos = last + len(end)
+ return txt[first:last]
+ except (ValueError, TypeError, AttributeError):
+ return default
+ return extr
+
+
+def parse_unicode_escapes(txt):
+ """Convert JSON Unicode escapes in 'txt' into actual characters"""
+ if "\\u" in txt:
+ return re.sub(r"\\u([0-9a-fA-F]{4})", _hex_to_char, txt)
+ return txt
+
+
+def _hex_to_char(match):
+ return chr(int(match.group(1), 16))
+
+
+def parse_bytes(value, default=0, suffixes="bkmgtp"):
+ """Convert a bytes-amount ("500k", "2.5M", ...) to int"""
+ try:
+ last = value[-1].lower()
+ except (TypeError, KeyError, IndexError):
+ return default
+
+ if last in suffixes:
+ mul = 1024 ** suffixes.index(last)
+ value = value[:-1]
+ else:
+ mul = 1
+
+ try:
+ return round(float(value) * mul)
+ except ValueError:
+ return default
+
+
+def parse_int(value, default=0):
+ """Convert 'value' to int"""
+ if not value:
+ return default
+ try:
+ return int(value)
+ except (ValueError, TypeError):
+ return default
+
+
+def parse_float(value, default=0.0):
+ """Convert 'value' to float"""
+ if not value:
+ return default
+ try:
+ return float(value)
+ except (ValueError, TypeError):
+ return default
+
+
+def parse_query(qs):
+ """Parse a query string into key-value pairs"""
+ result = {}
+ try:
+ for key, value in urllib.parse.parse_qsl(qs):
+ if key not in result:
+ result[key] = value
+ except AttributeError:
+ pass
+ return result
+
+
+def parse_timestamp(ts, default=None):
+ """Create a datetime object from a unix timestamp"""
+ try:
+ return datetime.datetime.utcfromtimestamp(int(ts))
+ except (TypeError, ValueError, OverflowError):
+ return default
+
+
+def parse_datetime(date_string, format="%Y-%m-%dT%H:%M:%S%z"):
+ """Create a datetime object by parsing 'date_string'"""
+ try:
+ if format.endswith("%z") and date_string[-3] == ":":
+ # workaround for Python < 3.7: +00:00 -> +0000
+ ds = date_string[:-3] + date_string[-2:]
+ else:
+ ds = date_string
+ d = datetime.datetime.strptime(ds, format)
+ o = d.utcoffset()
+ if o is not None:
+ d = d.replace(tzinfo=None) - o # convert to naive UTC
+ return d
+ except (TypeError, IndexError, KeyError):
+ return None
+ except (ValueError, OverflowError):
+ return date_string
+
+
+if os.name == "nt":
+ clean_path = clean_path_windows
+else:
+ clean_path = clean_path_posix
+
+
+urljoin = urllib.parse.urljoin
+
+quote = urllib.parse.quote
+unquote = urllib.parse.unquote
+
+escape = html.escape
+unescape = html.unescape
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
new file mode 100644
index 0000000..5c0ae41
--- /dev/null
+++ b/gallery_dl/util.py
@@ -0,0 +1,673 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Utility functions and classes"""
+
+import re
+import os
+import sys
+import json
+import shutil
+import string
+import _string
+import sqlite3
+import datetime
+import operator
+import itertools
+import urllib.parse
+from . import text, exception
+
+
+def bencode(num, alphabet="0123456789"):
+ """Encode an integer into a base-N encoded string"""
+ data = ""
+ base = len(alphabet)
+ while num:
+ num, remainder = divmod(num, base)
+ data = alphabet[remainder] + data
+ return data
+
+
+def bdecode(data, alphabet="0123456789"):
+ """Decode a base-N encoded string ( N = len(alphabet) )"""
+ num = 0
+ base = len(alphabet)
+ for c in data:
+ num *= base
+ num += alphabet.index(c)
+ return num
+
+
+def advance(iterable, num):
+ """"Advance the iterable by 'num' steps"""
+ iterator = iter(iterable)
+ next(itertools.islice(iterator, num, num), None)
+ return iterator
+
+
+def raises(obj):
+ """Returns a function that raises 'obj' as exception"""
+ def wrap():
+ raise obj
+ return wrap
+
+
+def combine_dict(a, b):
+ """Recursively combine the contents of 'b' into 'a'"""
+ for key, value in b.items():
+ if key in a and isinstance(value, dict) and isinstance(a[key], dict):
+ combine_dict(a[key], value)
+ else:
+ a[key] = value
+ return a
+
+
+def transform_dict(a, func):
+ """Recursively apply 'func' to all values in 'a'"""
+ for key, value in a.items():
+ if isinstance(value, dict):
+ transform_dict(value, func)
+ else:
+ a[key] = func(value)
+
+
+def number_to_string(value, numbers=(int, float)):
+ """Convert numbers (int, float) to string; Return everything else as is."""
+ return str(value) if value.__class__ in numbers else value
+
+
+def to_string(value):
+ """str() with "better" defaults"""
+ if not value:
+ return ""
+ if value.__class__ is list:
+ try:
+ return ", ".join(value)
+ except Exception:
+ return ", ".join(map(str, value))
+ return str(value)
+
+
+def dump_json(obj, fp=sys.stdout, ensure_ascii=True, indent=4):
+ """Serialize 'obj' as JSON and write it to 'fp'"""
+ json.dump(
+ obj, fp,
+ ensure_ascii=ensure_ascii,
+ indent=indent,
+ default=str,
+ sort_keys=True,
+ )
+ fp.write("\n")
+
+
+def expand_path(path):
+ """Expand environment variables and tildes (~)"""
+ if not path:
+ return path
+ if not isinstance(path, str):
+ path = os.path.join(*path)
+ return os.path.expandvars(os.path.expanduser(path))
+
+
+def code_to_language(code, default=None):
+ """Map an ISO 639-1 language code to its actual name"""
+ return CODES.get((code or "").lower(), default)
+
+
+def language_to_code(lang, default=None):
+ """Map a language name to its ISO 639-1 code"""
+ if lang is None:
+ return default
+ lang = lang.capitalize()
+ for code, language in CODES.items():
+ if language == lang:
+ return code
+ return default
+
+
+CODES = {
+ "ar": "Arabic",
+ "bg": "Bulgarian",
+ "ca": "Catalan",
+ "cs": "Czech",
+ "da": "Danish",
+ "de": "German",
+ "el": "Greek",
+ "en": "English",
+ "es": "Spanish",
+ "fi": "Finnish",
+ "fr": "French",
+ "he": "Hebrew",
+ "hu": "Hungarian",
+ "id": "Indonesian",
+ "it": "Italian",
+ "jp": "Japanese",
+ "ko": "Korean",
+ "ms": "Malay",
+ "nl": "Dutch",
+ "no": "Norwegian",
+ "pl": "Polish",
+ "pt": "Portuguese",
+ "ro": "Romanian",
+ "ru": "Russian",
+ "sv": "Swedish",
+ "th": "Thai",
+ "tr": "Turkish",
+ "vi": "Vietnamese",
+ "zh": "Chinese",
+}
+
+SPECIAL_EXTRACTORS = {"oauth", "recursive", "test"}
+
+
+class UniversalNone():
+ """None-style object that supports more operations than None itself"""
+ __slots__ = ()
+
+ def __getattribute__(self, _):
+ return self
+
+ def __getitem__(self, _):
+ return self
+
+ @staticmethod
+ def __bool__():
+ return False
+
+ @staticmethod
+ def __str__():
+ return "None"
+
+ __repr__ = __str__
+
+
+NONE = UniversalNone()
+
+
+def build_predicate(predicates):
+ if not predicates:
+ return lambda url, kwds: True
+ elif len(predicates) == 1:
+ return predicates[0]
+ else:
+ return ChainPredicate(predicates)
+
+
+class RangePredicate():
+ """Predicate; True if the current index is in the given range"""
+ def __init__(self, rangespec):
+ self.ranges = self.optimize_range(self.parse_range(rangespec))
+ self.index = 0
+
+ if self.ranges:
+ self.lower, self.upper = self.ranges[0][0], self.ranges[-1][1]
+ else:
+ self.lower, self.upper = 0, 0
+
+ def __call__(self, url, kwds):
+ self.index += 1
+
+ if self.index > self.upper:
+ raise exception.StopExtraction()
+
+ for lower, upper in self.ranges:
+ if lower <= self.index <= upper:
+ return True
+ return False
+
+ @staticmethod
+ def parse_range(rangespec):
+ """Parse an integer range string and return the resulting ranges
+
+ Examples:
+ parse_range("-2,4,6-8,10-") -> [(1,2), (4,4), (6,8), (10,INTMAX)]
+ parse_range(" - 3 , 4- 4, 2-6") -> [(1,3), (4,4), (2,6)]
+ """
+ ranges = []
+
+ for group in rangespec.split(","):
+ if not group:
+ continue
+ first, sep, last = group.partition("-")
+ if not sep:
+ beg = end = int(first)
+ else:
+ beg = int(first) if first.strip() else 1
+ end = int(last) if last.strip() else sys.maxsize
+ ranges.append((beg, end) if beg <= end else (end, beg))
+
+ return ranges
+
+ @staticmethod
+ def optimize_range(ranges):
+ """Simplify/Combine a parsed list of ranges
+
+ Examples:
+ optimize_range([(2,4), (4,6), (5,8)]) -> [(2,8)]
+ optimize_range([(1,1), (2,2), (3,6), (8,9)]) -> [(1,6), (8,9)]
+ """
+ if len(ranges) <= 1:
+ return ranges
+
+ ranges.sort()
+ riter = iter(ranges)
+ result = []
+
+ beg, end = next(riter)
+ for lower, upper in riter:
+ if lower > end+1:
+ result.append((beg, end))
+ beg, end = lower, upper
+ elif upper > end:
+ end = upper
+ result.append((beg, end))
+ return result
+
+
+class UniquePredicate():
+ """Predicate; True if given URL has not been encountered before"""
+ def __init__(self):
+ self.urls = set()
+
+ def __call__(self, url, kwds):
+ if url.startswith("text:"):
+ return True
+ if url not in self.urls:
+ self.urls.add(url)
+ return True
+ return False
+
+
+class FilterPredicate():
+ """Predicate; True if evaluating the given expression returns True"""
+ globalsdict = {
+ "parse_int": text.parse_int,
+ "urlsplit": urllib.parse.urlsplit,
+ "datetime": datetime.datetime,
+ "abort": raises(exception.StopExtraction()),
+ "re": re,
+ }
+
+ def __init__(self, filterexpr, target="image"):
+ name = "<{} filter>".format(target)
+ self.codeobj = compile(filterexpr, name, "eval")
+
+ def __call__(self, url, kwds):
+ try:
+ return eval(self.codeobj, self.globalsdict, kwds)
+ except exception.GalleryDLException:
+ raise
+ except Exception as exc:
+ raise exception.FilterError(exc)
+
+
+class ChainPredicate():
+ """Predicate; True if all of its predicates return True"""
+ def __init__(self, predicates):
+ self.predicates = predicates
+
+ def __call__(self, url, kwds):
+ for pred in self.predicates:
+ if not pred(url, kwds):
+ return False
+ return True
+
+
+class ExtendedUrl():
+ """URL with attached config key-value pairs"""
+ def __init__(self, url, gconf, lconf):
+ self.value, self.gconfig, self.lconfig = url, gconf, lconf
+
+ def __str__(self):
+ return self.value
+
+
+class Formatter():
+ """Custom, extended version of string.Formatter
+
+ This string formatter implementation is a mostly performance-optimized
+ variant of the original string.Formatter class. Unnecessary features have
+ been removed (positional arguments, unused argument check) and new
+ formatting options have been added.
+
+ Extra Conversions:
+ - "l": calls str.lower on the target value
+ - "u": calls str.upper
+ - "c": calls str.capitalize
+ - "C": calls string.capwords
+ - "U": calls urllib.parse.unquote
+ - "S": calls util.to_string()
+ - Example: {f!l} -> "example"; {f!u} -> "EXAMPLE"
+
+ Extra Format Specifiers:
+ - "?<before>/<after>/":
+ Adds <before> and <after> to the actual value if it evaluates to True.
+ Otherwise the whole replacement field becomes an empty string.
+ Example: {f:?-+/+-/} -> "-+Example+-" (if "f" contains "Example")
+ -> "" (if "f" is None, 0, "")
+
+ - "L<maxlen>/<replacement>/":
+ Replaces the output with <replacement> if its length (in characters)
+ exceeds <maxlen>. Otherwise everything is left as is.
+ Example: {f:L5/too long/} -> "foo" (if "f" is "foo")
+ -> "too long" (if "f" is "foobar")
+
+ - "J<separator>/":
+ Joins elements of a list (or string) using <separator>
+ Example: {f:J - /} -> "a - b - c" (if "f" is ["a", "b", "c"])
+
+ - "R<old>/<new>/":
+ Replaces all occurrences of <old> with <new>
+ Example: {f:R /_/} -> "f_o_o_b_a_r" (if "f" is "f o o b a r")
+ """
+ CONVERSIONS = {
+ "l": str.lower,
+ "u": str.upper,
+ "c": str.capitalize,
+ "C": string.capwords,
+ "U": urllib.parse.unquote,
+ "S": to_string,
+ "s": str,
+ "r": repr,
+ "a": ascii,
+ }
+
+ def __init__(self, format_string, default=None):
+ self.default = default
+ self.result = []
+ self.fields = []
+
+ for literal_text, field_name, format_spec, conversion in \
+ _string.formatter_parser(format_string):
+ if literal_text:
+ self.result.append(literal_text)
+ if field_name:
+ self.fields.append((
+ len(self.result),
+ self._field_access(field_name, format_spec, conversion)
+ ))
+ self.result.append("")
+
+ def format_map(self, kwargs):
+ """Apply 'kwargs' to the initial format_string and return its result"""
+ for index, func in self.fields:
+ self.result[index] = func(kwargs)
+ return "".join(self.result)
+
+ def _field_access(self, field_name, format_spec, conversion):
+ first, rest = _string.formatter_field_name_split(field_name)
+
+ funcs = []
+ for is_attr, key in rest:
+ if is_attr:
+ func = operator.attrgetter
+ elif ":" in key:
+ func = self._slicegetter
+ else:
+ func = operator.itemgetter
+ funcs.append(func(key))
+
+ if conversion:
+ funcs.append(self.CONVERSIONS[conversion])
+
+ if format_spec:
+ if format_spec[0] == "?":
+ func = self._format_optional
+ elif format_spec[0] == "L":
+ func = self._format_maxlen
+ elif format_spec[0] == "J":
+ func = self._format_join
+ elif format_spec[0] == "R":
+ func = self._format_replace
+ else:
+ func = self._format_default
+ fmt = func(format_spec)
+ else:
+ fmt = str
+
+ if funcs:
+ return self._apply(first, funcs, fmt)
+ return self._apply_simple(first, fmt)
+
+ def _apply_simple(self, key, fmt):
+ def wrap(obj):
+ if key in obj:
+ obj = obj[key]
+ else:
+ obj = self.default
+ return fmt(obj)
+ return wrap
+
+ def _apply(self, key, funcs, fmt):
+ def wrap(obj):
+ try:
+ obj = obj[key]
+ for func in funcs:
+ obj = func(obj)
+ except Exception:
+ obj = self.default
+ return fmt(obj)
+ return wrap
+
+ @staticmethod
+ def _slicegetter(key):
+ start, _, stop = key.partition(":")
+ stop, _, step = stop.partition(":")
+ start = int(start) if start else None
+ stop = int(stop) if stop else None
+ step = int(step) if step else None
+ return operator.itemgetter(slice(start, stop, step))
+
+ @staticmethod
+ def _format_optional(format_spec):
+ def wrap(obj):
+ if not obj:
+ return ""
+ return before + format(obj, format_spec) + after
+ before, after, format_spec = format_spec.split("/", 2)
+ before = before[1:]
+ return wrap
+
+ @staticmethod
+ def _format_maxlen(format_spec):
+ def wrap(obj):
+ obj = format(obj, format_spec)
+ return obj if len(obj) <= maxlen else replacement
+ maxlen, replacement, format_spec = format_spec.split("/", 2)
+ maxlen = text.parse_int(maxlen[1:])
+ return wrap
+
+ @staticmethod
+ def _format_join(format_spec):
+ def wrap(obj):
+ obj = separator.join(obj)
+ return format(obj, format_spec)
+ separator, _, format_spec = format_spec.partition("/")
+ separator = separator[1:]
+ return wrap
+
+ @staticmethod
+ def _format_replace(format_spec):
+ def wrap(obj):
+ obj = obj.replace(old, new)
+ return format(obj, format_spec)
+ old, new, format_spec = format_spec.split("/", 2)
+ old = old[1:]
+ return wrap
+
+ @staticmethod
+ def _format_default(format_spec):
+ def wrap(obj):
+ return format(obj, format_spec)
+ return wrap
+
+
+class PathFormat():
+
+ def __init__(self, extractor):
+ self.filename_fmt = extractor.config(
+ "filename", extractor.filename_fmt)
+ self.directory_fmt = extractor.config(
+ "directory", extractor.directory_fmt)
+ self.kwdefault = extractor.config("keywords-default")
+
+ try:
+ self.formatter = Formatter(self.filename_fmt, self.kwdefault)
+ except Exception as exc:
+ raise exception.FormatError(exc, "filename")
+
+ self.delete = False
+ self.has_extension = False
+ self.keywords = {}
+ self.filename = ""
+ self.directory = self.realdirectory = ""
+ self.path = self.realpath = self.temppath = ""
+
+ self.basedirectory = expand_path(
+ extractor.config("base-directory", (".", "gallery-dl")))
+ if os.altsep:
+ self.basedirectory = self.basedirectory.replace(os.altsep, os.sep)
+
+ def open(self, mode="wb"):
+ """Open file and return a corresponding file object"""
+ return open(self.temppath, mode)
+
+ def exists(self, archive=None):
+ """Return True if the file exists on disk or in 'archive'"""
+ if (archive and archive.check(self.keywords) or
+ self.has_extension and os.path.exists(self.realpath)):
+ if not self.has_extension:
+ # adjust display name
+ self.set_extension("")
+ if self.path[-1] == ".":
+ self.path = self.path[:-1]
+ return True
+ return False
+
+ def set_directory(self, keywords):
+ """Build directory path and create it if necessary"""
+ try:
+ segments = [
+ text.clean_path(
+ Formatter(segment, self.kwdefault)
+ .format_map(keywords).strip())
+ for segment in self.directory_fmt
+ ]
+ except Exception as exc:
+ raise exception.FormatError(exc, "directory")
+
+ self.directory = os.path.join(
+ self.basedirectory,
+ *segments
+ )
+
+ # remove trailing path separator;
+ # occurs if the last argument to os.path.join() is an empty string
+ if self.directory[-1] == os.sep:
+ self.directory = self.directory[:-1]
+
+ self.realdirectory = self.adjust_path(self.directory)
+ os.makedirs(self.realdirectory, exist_ok=True)
+
+ def set_keywords(self, keywords):
+ """Set filename keywords"""
+ self.keywords = keywords
+ self.temppath = ""
+ self.has_extension = bool(keywords.get("extension"))
+ if self.has_extension:
+ self.build_path()
+
+ def set_extension(self, extension, real=True):
+ """Set the 'extension' keyword"""
+ self.has_extension = real
+ self.keywords["extension"] = extension
+ self.build_path()
+
+ def build_path(self):
+ """Use filename-keywords and directory to build a full path"""
+ try:
+ self.filename = text.clean_path(
+ self.formatter.format_map(self.keywords))
+ except Exception as exc:
+ raise exception.FormatError(exc, "filename")
+
+ filename = os.sep + self.filename
+ self.path = self.directory + filename
+ self.realpath = self.realdirectory + filename
+ if not self.temppath:
+ self.temppath = self.realpath
+
+ def part_enable(self, part_directory=None):
+ """Enable .part file usage"""
+ if self.has_extension:
+ self.temppath += ".part"
+ else:
+ self.set_extension("part", False)
+ if part_directory:
+ self.temppath = os.path.join(
+ part_directory,
+ os.path.basename(self.temppath),
+ )
+
+ def part_size(self):
+ """Return size of .part file"""
+ try:
+ return os.stat(self.temppath).st_size
+ except OSError:
+ pass
+ return 0
+
+ def finalize(self):
+ """Move tempfile to its target location"""
+ if self.delete:
+ self.delete = False
+ os.unlink(self.temppath)
+ return
+
+ if self.temppath == self.realpath:
+ return
+
+ try:
+ os.replace(self.temppath, self.realpath)
+ return
+ except OSError:
+ pass
+
+ shutil.copyfile(self.temppath, self.realpath)
+ os.unlink(self.temppath)
+
+ @staticmethod
+ def adjust_path(path):
+ """Enable longer-than-260-character paths on windows"""
+ return "\\\\?\\" + os.path.abspath(path) if os.name == "nt" else path
+
+
+class DownloadArchive():
+
+ def __init__(self, path, extractor):
+ con = sqlite3.connect(path)
+ con.isolation_level = None
+ self.cursor = con.cursor()
+ self.cursor.execute("CREATE TABLE IF NOT EXISTS archive "
+ "(entry PRIMARY KEY) WITHOUT ROWID")
+ self.keygen = (extractor.category + extractor.config(
+ "archive-format", extractor.archive_fmt)
+ ).format_map
+
+ def check(self, kwdict):
+ """Return True if item described by 'kwdict' exists in archive"""
+ key = self.keygen(kwdict)
+ self.cursor.execute(
+ "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,))
+ return self.cursor.fetchone()
+
+ def add(self, kwdict):
+ """Add item described by 'kwdict' to archive"""
+ key = self.keygen(kwdict)
+ self.cursor.execute(
+ "INSERT OR IGNORE INTO archive VALUES (?)", (key,))
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
new file mode 100644
index 0000000..4167bc4
--- /dev/null
+++ b/gallery_dl/version.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+__version__ = "1.8.7"