| author    | 2023-11-27 17:57:01 -0500 |
| committer | 2023-11-27 17:57:01 -0500 |
| commit    | 2a817af4fe41289fa705bdc5ee61372333f43996 (patch) |
| tree      | 544d884724e98184afc1d982f0e9fa59137ef498 /gallery_dl |
| parent    | 7997fa94c82f9a6db63421c0af433f325a8aa607 (diff) |
New upstream version 1.26.3 (upstream/1.26.3)
Diffstat (limited to 'gallery_dl')
38 files changed, 878 insertions, 354 deletions
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index d3a0f58..287faf1 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -18,19 +18,6 @@ __email__ = "mike_faehrmann@web.de" __version__ = version.__version__ -def progress(urls, pformat): - """Wrapper around urls to output a simple progress indicator""" - if pformat is True: - pformat = "[{current}/{total}] {url}\n" - else: - pformat += "\n" - - pinfo = {"total": len(urls)} - for pinfo["current"], pinfo["url"] in enumerate(urls, 1): - output.stderr_write(pformat.format_map(pinfo)) - yield pinfo["url"] - - def main(): try: parser = option.build_parser() @@ -128,6 +115,7 @@ def main(): output.configure_logging(args.loglevel) if args.loglevel >= logging.ERROR: config.set(("output",), "mode", "null") + config.set(("downloader",), "progress", None) elif args.loglevel <= logging.DEBUG: import platform import requests @@ -224,7 +212,7 @@ def main(): return config.initialize() else: - if not args.urls and not args.inputfiles: + if not args.urls and not args.input_files: parser.error( "The following arguments are required: URL\n" "Use 'gallery-dl --help' to get a list of all options.") @@ -238,22 +226,6 @@ def main(): else: jobtype = args.jobtype or job.DownloadJob - urls = args.urls - if args.inputfiles: - for inputfile in args.inputfiles: - try: - if inputfile == "-": - if sys.stdin: - urls += util.parse_inputfile(sys.stdin, log) - else: - log.warning( - "input file: stdin is not readable") - else: - with open(inputfile, encoding="utf-8") as file: - urls += util.parse_inputfile(file, log) - except OSError as exc: - log.warning("input file: %s", exc) - # unsupported file logging handler handler = output.setup_logging_handler( "unsupportedfile", fmt="{message}") @@ -263,25 +235,44 @@ def main(): ulog.propagate = False job.Job.ulog = ulog + # collect input URLs + input_manager = InputManager() + input_manager.log = input_log = logging.getLogger("inputfile") + input_manager.add_list(args.urls) + + if args.input_files: + for input_file, action in args.input_files: + try: + path = util.expand_path(input_file) + input_manager.add_file(path, action) + except Exception as exc: + input_log.error(exc) + return getattr(exc, "code", 128) + pformat = config.get(("output",), "progress", True) - if pformat and len(urls) > 1 and args.loglevel < logging.ERROR: - urls = progress(urls, pformat) - else: - urls = iter(urls) + if pformat and len(input_manager.urls) > 1 and \ + args.loglevel < logging.ERROR: + input_manager.progress(pformat) + # process input URLs retval = 0 - url = next(urls, None) - - while url is not None: + for url in input_manager: try: log.debug("Starting %s for '%s'", jobtype.__name__, url) - if isinstance(url, util.ExtendedUrl): + + if isinstance(url, ExtendedUrl): for opts in url.gconfig: config.set(*opts) with config.apply(url.lconfig): - retval |= jobtype(url.value).run() + status = jobtype(url.value).run() + else: + status = jobtype(url).run() + + if status: + retval |= status else: - retval |= jobtype(url).run() + input_manager.success() + except exception.TerminateExtraction: pass except exception.RestartExtraction: @@ -291,8 +282,7 @@ def main(): log.error("Unsupported URL '%s'", url) retval |= 64 - url = next(urls, None) - + input_manager.next() return retval except KeyboardInterrupt: @@ -304,3 +294,206 @@ def main(): if exc.errno != errno.EPIPE: raise return 1 + + +class InputManager(): + + def __init__(self): + self.urls = [] + self.files = () + self._index = 0 + self._current = None + self._pformat = None 
+ + def add_url(self, url): + self.urls.append(url) + + def add_list(self, urls): + self.urls += urls + + def add_file(self, path, action=None): + """Process an input file. + + Lines starting with '#' and empty lines will be ignored. + Lines starting with '-' will be interpreted as a key-value pair + separated by an '='. where + 'key' is a dot-separated option name and + 'value' is a JSON-parsable string. + These configuration options will be applied + while processing the next URL only. + Lines starting with '-G' are the same as above, except these options + will be applied for *all* following URLs, i.e. they are Global. + Everything else will be used as a potential URL. + + Example input file: + + # settings global options + -G base-directory = "/tmp/" + -G skip = false + + # setting local options for the next URL + -filename="spaces_are_optional.jpg" + -skip = true + + https://example.org/ + + # next URL uses default filename and 'skip' is false. + https://example.com/index.htm # comment1 + https://example.com/404.htm # comment2 + """ + if path == "-" and not action: + try: + lines = sys.stdin.readlines() + except Exception: + raise exception.InputFileError("stdin is not readable") + path = None + else: + try: + with open(path, encoding="utf-8") as fp: + lines = fp.readlines() + except Exception as exc: + raise exception.InputFileError(str(exc)) + + if self.files: + self.files[path] = lines + else: + self.files = {path: lines} + + if action == "c": + action = self._action_comment + elif action == "d": + action = self._action_delete + else: + action = None + + gconf = [] + lconf = [] + indicies = [] + strip_comment = None + append = self.urls.append + + for n, line in enumerate(lines): + line = line.strip() + + if not line or line[0] == "#": + # empty line or comment + continue + + elif line[0] == "-": + # config spec + if len(line) >= 2 and line[1] == "G": + conf = gconf + line = line[2:] + else: + conf = lconf + line = line[1:] + if action: + indicies.append(n) + + key, sep, value = line.partition("=") + if not sep: + raise exception.InputFileError( + "Invalid KEY=VALUE pair '%s' on line %s in %s", + line, n+1, path) + + try: + value = util.json_loads(value.strip()) + except ValueError as exc: + self.log.debug("%s: %s", exc.__class__.__name__, exc) + raise exception.InputFileError( + "Unable to parse '%s' on line %s in %s", + value, n+1, path) + + key = key.strip().split(".") + conf.append((key[:-1], key[-1], value)) + + else: + # url + if " #" in line or "\t#" in line: + if strip_comment is None: + import re + strip_comment = re.compile(r"\s+#.*").sub + line = strip_comment("", line) + if gconf or lconf: + url = ExtendedUrl(line, gconf, lconf) + gconf = [] + lconf = [] + else: + url = line + + if action: + indicies.append(n) + append((url, path, action, indicies)) + indicies = [] + else: + append(url) + + def progress(self, pformat=True): + if pformat is True: + pformat = "[{current}/{total}] {url}\n" + else: + pformat += "\n" + self._pformat = pformat.format_map + + def next(self): + self._index += 1 + + def success(self): + if self._current: + url, path, action, indicies = self._current + lines = self.files[path] + action(lines, indicies) + try: + with open(path, "w", encoding="utf-8") as fp: + fp.writelines(lines) + except Exception as exc: + self.log.warning( + "Unable to update '%s' (%s: %s)", + path, exc.__class__.__name__, exc) + + @staticmethod + def _action_comment(lines, indicies): + for i in indicies: + lines[i] = "# " + lines[i] + + @staticmethod + def 
_action_delete(lines, indicies): + for i in indicies: + lines[i] = "" + + def __iter__(self): + self._index = 0 + return self + + def __next__(self): + try: + item = self.urls[self._index] + except IndexError: + raise StopIteration + + if isinstance(item, tuple): + self._current = item + item = item[0] + else: + self._current = None + + if self._pformat: + output.stderr_write(self._pformat({ + "total" : len(self.urls), + "current": self._index + 1, + "url" : item, + })) + return item + + +class ExtendedUrl(): + """URL with attached config key-value pairs""" + __slots__ = ("value", "gconfig", "lconfig") + + def __init__(self, url, gconf, lconf): + self.value = url + self.gconfig = gconf + self.lconfig = lconf + + def __str__(self): + return self.value diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 30ac001..f493947 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -200,13 +200,15 @@ class HttpDownloader(DownloaderBase): self.log.warning( "File size smaller than allowed minimum (%s < %s)", size, self.minsize) - return False + pathfmt.temppath = "" + return True if self.maxsize and size > self.maxsize: self.release_conn(response) self.log.warning( "File size larger than allowed maximum (%s > %s)", size, self.maxsize) - return False + pathfmt.temppath = "" + return True build_path = False diff --git a/gallery_dl/exception.py b/gallery_dl/exception.py index ef190f2..ee183fc 100644 --- a/gallery_dl/exception.py +++ b/gallery_dl/exception.py @@ -21,6 +21,7 @@ Exception | +-- FilenameFormatError | +-- DirectoryFormatError +-- FilterError + +-- InputFileError +-- NoExtractorError +-- StopExtraction +-- TerminateExtraction @@ -99,6 +100,15 @@ class FilterError(GalleryDLException): code = 32 +class InputFileError(GalleryDLException): + """Error when parsing input file""" + code = 32 + + def __init__(self, message, *args): + GalleryDLException.__init__( + self, message % args if args else message) + + class NoExtractorError(GalleryDLException): """No extractor can handle the given URL""" code = 64 diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 22e4fe3..72239d5 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -117,6 +117,7 @@ modules = [ "piczel", "pillowfort", "pinterest", + "pixeldrain", "pixiv", "pixnet", "plurk", @@ -147,6 +148,7 @@ modules = [ "tapas", "tcbscans", "telegraph", + "tmohentai", "toyhouse", "tsumino", "tumblr", diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py index a92918e..ad0caf9 100644 --- a/gallery_dl/extractor/behance.py +++ b/gallery_dl/extractor/behance.py @@ -89,6 +89,17 @@ class BehanceGalleryExtractor(BehanceExtractor): BehanceExtractor.__init__(self, match) self.gallery_id = match.group(1) + def _init(self): + BehanceExtractor._init(self) + + modules = self.config("modules") + if modules: + if isinstance(modules, str): + modules = modules.split(",") + self.modules = set(modules) + else: + self.modules = {"image", "video", "mediacollection", "embed"} + def items(self): data = self.get_gallery_data() imgs = self.get_images(data) @@ -97,7 +108,8 @@ class BehanceGalleryExtractor(BehanceExtractor): yield Message.Directory, data for data["num"], (url, module) in enumerate(imgs, 1): data["module"] = module - data["extension"] = text.ext_from_url(url) + data["extension"] = (module.get("extension") or + text.ext_from_url(url)) yield Message.Url, url, data def get_gallery_data(self): @@ -133,13 +145,17 @@ class 
BehanceGalleryExtractor(BehanceExtractor): append = result.append for module in data["modules"]: - mtype = module["__typename"] + mtype = module["__typename"][:-6].lower() - if mtype == "ImageModule": + if mtype not in self.modules: + self.log.debug("Skipping '%s' module", mtype) + continue + + if mtype == "image": url = module["imageSizes"]["size_original"]["url"] append((url, module)) - elif mtype == "VideoModule": + elif mtype == "video": try: renditions = module["videoData"]["renditions"] except Exception: @@ -158,7 +174,7 @@ class BehanceGalleryExtractor(BehanceExtractor): append((url, module)) - elif mtype == "MediaCollectionModule": + elif mtype == "mediacollection": for component in module["components"]: for size in component["imageSizes"].values(): if size: @@ -167,12 +183,17 @@ class BehanceGalleryExtractor(BehanceExtractor): append(("/".join(parts), module)) break - elif mtype == "EmbedModule": + elif mtype == "embed": embed = module.get("originalEmbed") or module.get("fluidEmbed") if embed: embed = text.unescape(text.extr(embed, 'src="', '"')) + module["extension"] = "mp4" append(("ytdl:" + embed, module)) + elif mtype == "text": + module["extension"] = "txt" + append(("text:" + module["text"], module)) + return result diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py index d75c349..58ae59d 100644 --- a/gallery_dl/extractor/blogger.py +++ b/gallery_dl/extractor/blogger.py @@ -8,30 +8,22 @@ """Extractors for Blogger blogs""" -from .common import Extractor, Message +from .common import BaseExtractor, Message from .. import text, util import re -BASE_PATTERN = ( - r"(?:blogger:(?:https?://)?([^/]+)|" - r"(?:https?://)?([\w-]+\.blogspot\.com))") - -class BloggerExtractor(Extractor): +class BloggerExtractor(BaseExtractor): """Base class for blogger extractors""" - category = "blogger" - directory_fmt = ("{category}", "{blog[name]}", + basecategory = "blogger" + directory_fmt = ("blogger", "{blog[name]}", "{post[date]:%Y-%m-%d} {post[title]}") filename_fmt = "{num:>03}.{extension}" archive_fmt = "{post[id]}_{num}" - root = "https://www.blogger.com" - - def __init__(self, match): - Extractor.__init__(self, match) - self.blog = match.group(1) or match.group(2) def _init(self): self.api = BloggerAPI(self) + self.blog = self.root.rpartition("/")[2] self.videos = self.config("videos", True) def items(self): @@ -92,6 +84,18 @@ class BloggerExtractor(Extractor): """Return additional metadata""" +BASE_PATTERN = BloggerExtractor.update({ + "blogspot": { + "root": None, + "pattern": r"[\w-]+\.blogspot\.com", + }, + "micmicidol": { + "root": "https://www.micmicidol.club", + "pattern": r"(?:www\.)?micmicidol\.club", + }, +}) + + class BloggerPostExtractor(BloggerExtractor): """Extractor for a single blog post""" subcategory = "post" @@ -100,7 +104,7 @@ class BloggerPostExtractor(BloggerExtractor): def __init__(self, match): BloggerExtractor.__init__(self, match) - self.path = match.group(3) + self.path = match.group(match.lastindex) def posts(self, blog): return (self.api.post_by_path(blog["id"], self.path),) @@ -124,7 +128,7 @@ class BloggerSearchExtractor(BloggerExtractor): def __init__(self, match): BloggerExtractor.__init__(self, match) - self.query = text.unquote(match.group(3)) + self.query = text.unquote(match.group(match.lastindex)) def posts(self, blog): return self.api.blog_search(blog["id"], self.query) @@ -141,7 +145,7 @@ class BloggerLabelExtractor(BloggerExtractor): def __init__(self, match): BloggerExtractor.__init__(self, match) - self.label = 
text.unquote(match.group(3)) + self.label = text.unquote(match.group(match.lastindex)) def posts(self, blog): return self.api.blog_posts(blog["id"], self.label) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 3bec424..f378427 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -78,6 +78,12 @@ class Extractor(): def config(self, key, default=None): return config.interpolate(self._cfgpath, key, default) + def config2(self, key, key2, default=None, sentinel=util.SENTINEL): + value = self.config(key, sentinel) + if value is not sentinel: + return value + return self.config(key2, default) + def config_deprecated(self, key, deprecated, default=None, sentinel=util.SENTINEL, history=set()): value = self.config(deprecated, sentinel) diff --git a/gallery_dl/extractor/cyberdrop.py b/gallery_dl/extractor/cyberdrop.py index 59fd1e5..d864960 100644 --- a/gallery_dl/extractor/cyberdrop.py +++ b/gallery_dl/extractor/cyberdrop.py @@ -7,6 +7,7 @@ """Extractors for https://cyberdrop.me/""" from . import lolisafe +from .common import Message from .. import text @@ -16,24 +17,43 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor): pattern = r"(?:https?://)?(?:www\.)?cyberdrop\.(?:me|to)/a/([^/?#]+)" example = "https://cyberdrop.me/a/ID" + def items(self): + files, data = self.fetch_album(self.album_id) + + yield Message.Directory, data + for data["num"], file in enumerate(files, 1): + file.update(data) + text.nameext_from_url(file["name"], file) + file["name"], sep, file["id"] = file["filename"].rpartition("-") + yield Message.Url, file["url"], file + def fetch_album(self, album_id): - url = self.root + "/a/" + self.album_id - extr = text.extract_from(self.request(url).text) - - files = [] - append = files.append - while True: - url = text.unescape(extr('id="file" href="', '"')) - if not url: - break - append({"file": url, - "_fallback": (self.root + url[url.find("/", 8):],)}) - - return files, { + url = "{}/a/{}".format(self.root, album_id) + page = self.request(url).text + extr = text.extract_from(page) + + desc = extr('property="og:description" content="', '"') + if desc.startswith("A privacy-focused censorship-resistant file " + "sharing platform free for everyone."): + desc = "" + extr('id="title"', "") + + album = { "album_id" : self.album_id, - "album_name" : extr("name: '", "'"), - "date" : text.parse_timestamp(extr("timestamp: ", ",")), - "album_size" : text.parse_int(extr("totalSize: ", ",")), - "description": extr("description: `", "`"), - "count" : len(files), + "album_name" : text.unescape(extr('title="', '"')), + "album_size" : text.parse_bytes(extr( + '<p class="title">', "B")), + "date" : text.parse_datetime(extr( + '<p class="title">', '<'), "%d.%m.%Y"), + "description": text.unescape(text.unescape( # double + desc.rpartition(" [R")[0])), } + + file_ids = list(text.extract_iter(page, 'id="file" href="/f/', '"')) + album["count"] = len(file_ids) + return self._extract_files(file_ids), album + + def _extract_files(self, file_ids): + for file_id in file_ids: + url = "{}/api/f/{}".format(self.root, file_id) + yield self.request(url).json() diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py index 2aed678..6a0e069 100644 --- a/gallery_dl/extractor/erome.py +++ b/gallery_dl/extractor/erome.py @@ -44,11 +44,15 @@ class EromeExtractor(Extractor): pos = page.index('<div class="user-profile', pos) user, pos = text.extract( page, 'href="https://www.erome.com/', '"', pos) + count, pos = text.extract( + 
page, 'fa-camera"></i>', '</span>', pos) + data = { "album_id" : album_id, "title" : text.unescape(title), "user" : text.unquote(user), "_http_headers": {"Referer": url}, + "count" : text.parse_int(count), } yield Message.Directory, data diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 182910c..5dc498f 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -40,6 +40,7 @@ class ExhentaiExtractor(Extractor): if domain == "auto": domain = ("ex" if self.version == "ex" else "e-") + "hentai.org" self.root = "https://" + domain + self.api_url = self.root + "/api.php" self.cookies_domain = "." + domain Extractor.initialize(self) @@ -120,7 +121,6 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self.key_start = None self.key_show = None self.key_next = None - self.api_url = "" self.count = 0 def _init(self): @@ -171,6 +171,21 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): # declared inside 'items()' to be able to access 'data' if not response.history and response.headers.get( "content-type", "").startswith("text/html"): + page = response.text + self.log.warning("'%s'", page) + + if " requires GP" in page: + gp = self.config("gp") + if gp == "stop": + raise exception.StopExtraction("Not enough GP") + elif gp == "wait": + input("Press ENTER to continue.") + return response.url + + self.log.info("Falling back to non-original downloads") + self.original = False + return data["_url_1280"] + self._report_limits(data) return True @@ -212,7 +227,10 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): def metadata_from_page(self, page): extr = text.extract_from(page) - self.api_url = extr('var api_url = "', '"') or (self.root + "/api.php") + + api_url = extr('var api_url = "', '"') + if api_url: + self.api_url = api_url data = { "gid" : self.gallery_id, @@ -296,6 +314,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): data["num"] = self.image_num data["image_token"] = self.key_start = extr('var startkey="', '";') + data["_url_1280"] = iurl self.key_show = extr('var showkey="', '";') self._check_509(iurl, data) @@ -345,6 +364,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): data["num"] = request["page"] data["image_token"] = imgkey + data["_url_1280"] = imgurl self._check_509(imgurl, data) yield url, text.nameext_from_url(url, data) diff --git a/gallery_dl/extractor/fapello.py b/gallery_dl/extractor/fapello.py index d4524e0..aff8e61 100644 --- a/gallery_dl/extractor/fapello.py +++ b/gallery_dl/extractor/fapello.py @@ -10,6 +10,9 @@ from .common import Extractor, Message from .. 
import text, exception +BASE_PATTERN = r"(?:https?://)?(?:www\.)?fapello\.(?:com|su)" + + class FapelloPostExtractor(Extractor): """Extractor for individual posts on fapello.com""" category = "fapello" @@ -17,16 +20,16 @@ class FapelloPostExtractor(Extractor): directory_fmt = ("{category}", "{model}") filename_fmt = "{model}_{id}.{extension}" archive_fmt = "{type}_{model}_{id}" - pattern = (r"(?:https?://)?(?:www\.)?fapello\.com" - r"/(?!search/|popular_videos/)([^/?#]+)/(\d+)") + pattern = BASE_PATTERN + r"/(?!search/|popular_videos/)([^/?#]+)/(\d+)" example = "https://fapello.com/MODEL/12345/" def __init__(self, match): Extractor.__init__(self, match) + self.root = text.root_from_url(match.group(0)) self.model, self.id = match.groups() def items(self): - url = "https://fapello.com/{}/{}/".format(self.model, self.id) + url = "{}/{}/{}/".format(self.root, self.model, self.id) page = text.extr( self.request(url, allow_redirects=False).text, 'class="uk-align-center"', "</div>", None) @@ -48,27 +51,29 @@ class FapelloModelExtractor(Extractor): """Extractor for all posts from a fapello model""" category = "fapello" subcategory = "model" - pattern = (r"(?:https?://)?(?:www\.)?fapello\.com" - r"/(?!top-(?:likes|followers)|popular_videos" + pattern = (BASE_PATTERN + r"/(?!top-(?:likes|followers)|popular_videos" r"|videos|trending|search/?$)" r"([^/?#]+)/?$") example = "https://fapello.com/model/" def __init__(self, match): Extractor.__init__(self, match) + self.root = text.root_from_url(match.group(0)) self.model = match.group(1) def items(self): num = 1 data = {"_extractor": FapelloPostExtractor} while True: - url = "https://fapello.com/ajax/model/{}/page-{}/".format( - self.model, num) + url = "{}/ajax/model/{}/page-{}/".format( + self.root, self.model, num) page = self.request(url).text if not page: return for url in text.extract_iter(page, '<a href="', '"'): + if url == "javascript:void(0);": + continue yield Message.Queue, url, data num += 1 @@ -77,13 +82,14 @@ class FapelloPathExtractor(Extractor): """Extractor for models and posts from fapello.com paths""" category = "fapello" subcategory = "path" - pattern = (r"(?:https?://)?(?:www\.)?fapello\.com" + pattern = (BASE_PATTERN + r"/(?!search/?$)(top-(?:likes|followers)|videos|trending" r"|popular_videos/[^/?#]+)/?$") example = "https://fapello.com/trending/" def __init__(self, match): Extractor.__init__(self, match) + self.root = text.root_from_url(match.group(0)) self.path = match.group(1) def items(self): @@ -93,9 +99,14 @@ class FapelloPathExtractor(Extractor): else: data = {"_extractor": FapelloPostExtractor} + if "fapello.su" in self.root: + self.path = self.path.replace("-", "/") + if self.path == "trending": + data = {"_extractor": FapelloModelExtractor} + while True: - page = self.request("https://fapello.com/ajax/{}/page-{}/".format( - self.path, num)).text + page = self.request("{}/ajax/{}/page-{}/".format( + self.root, self.path, num)).text if not page: return diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index b0699b0..bb684c2 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -38,10 +38,6 @@ class FoolslideExtractor(BaseExtractor): BASE_PATTERN = FoolslideExtractor.update({ - "powermanga": { - "root": "https://read.powermanga.org", - "pattern": r"read(?:er)?\.powermanga\.org", - }, }) diff --git a/gallery_dl/extractor/hentaicosplays.py b/gallery_dl/extractor/hentaicosplays.py index 62df192..d5ff8c8 100644 --- a/gallery_dl/extractor/hentaicosplays.py +++ 
b/gallery_dl/extractor/hentaicosplays.py @@ -42,7 +42,7 @@ class HentaicosplaysGalleryExtractor(GalleryExtractor): def images(self, page): return [ - (url, None) + (url.replace("http:", "https:", 1), None) for url in text.extract_iter( page, '<amp-img class="auto-style" src="', '"') ] diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py index 8ba23c2..c75c90d 100644 --- a/gallery_dl/extractor/hentaifoundry.py +++ b/gallery_dl/extractor/hentaifoundry.py @@ -133,9 +133,25 @@ class HentaifoundryExtractor(Extractor): return text.nameext_from_url(data["src"], data) - def _init_site_filters(self): + def _request_check(self, url, **kwargs): + self.request = self._request_original + + # check for Enter button / front page + # and update PHPSESSID and content filters if necessary + response = self.request(url, **kwargs) + content = response.content + if len(content) < 5000 and \ + b'<div id="entryButtonContainer"' in content: + self._init_site_filters(False) + response = self.request(url, **kwargs) + return response + + def _init_site_filters(self, check_cookies=True): """Set site-internal filters to show all images""" - if self.cookies.get("PHPSESSID", domain=self.cookies_domain): + if check_cookies and self.cookies.get( + "PHPSESSID", domain=self.cookies_domain): + self._request_original = self.request + self.request = self._request_check return url = self.root + "/?enterAgree=1" diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py index 32ca151..20491b5 100644 --- a/gallery_dl/extractor/hiperdex.py +++ b/gallery_dl/extractor/hiperdex.py @@ -30,10 +30,10 @@ class HiperdexBase(): extr = text.extract_from(page) return { - "manga" : text.unescape(extr( - "<title>", "<").rpartition(" Manga - ")[0].strip()), "url" : text.unescape(extr( 'property="og:url" content="', '"')), + "manga" : text.unescape(extr( + '"headline": "', '"')), "score" : text.parse_float(extr( 'id="averagerate">', '<')), "author" : text.remove_html(extr( diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py index b7b6ef1..5c7a1b3 100644 --- a/gallery_dl/extractor/idolcomplex.py +++ b/gallery_dl/extractor/idolcomplex.py @@ -15,15 +15,17 @@ from .. import text, util, exception import collections import re +BASE_PATTERN = r"(?:https?://)?idol\.sankakucomplex\.com(?:/[a-z]{2})?" 
+ class IdolcomplexExtractor(SankakuExtractor): """Base class for idolcomplex extractors""" category = "idolcomplex" + root = "https://idol.sankakucomplex.com" cookies_domain = "idol.sankakucomplex.com" - cookies_names = ("login", "pass_hash") - root = "https://" + cookies_domain + cookies_names = ("_idolcomplex_session",) referer = False - request_interval = 5.0 + request_interval = (4.0, 6.0) def __init__(self, match): SankakuExtractor.__init__(self, match) @@ -32,14 +34,16 @@ class IdolcomplexExtractor(SankakuExtractor): self.start_post = 0 def _init(self): - self.extags = self.config("tags", False) + self.find_tags = re.compile( + r'tag-type-([^"]+)">\s*<div [^>]+>\s*<a href="/\?tags=([^"]+)' + ).findall def items(self): self.login() data = self.metadata() for post_id in util.advance(self.post_ids(), self.start_post): - post = self._parse_post(post_id) + post = self._extract_post(post_id) url = post["file_url"] post.update(data) text.nameext_from_url(url, post) @@ -67,63 +71,75 @@ class IdolcomplexExtractor(SankakuExtractor): def _login_impl(self, username, password): self.log.info("Logging in as %s", username) - url = self.root + "/user/authenticate" + url = self.root + "/users/login" + page = self.request(url).text + + headers = { + "Referer": url, + } + url = self.root + (text.extr(page, '<form action="', '"') or + "/en/user/authenticate") data = { + "authenticity_token": text.unescape(text.extr( + page, 'name="authenticity_token" value="', '"')), "url" : "", "user[name]" : username, "user[password]": password, "commit" : "Login", } - response = self.request(url, method="POST", data=data) + response = self.request(url, method="POST", headers=headers, data=data) - if not response.history or response.url != self.root + "/user/home": + if not response.history or response.url.endswith("/user/home"): raise exception.AuthenticationError() - cookies = response.history[0].cookies - return {c: cookies[c] for c in self.cookies_names} + return {c.name: c.value for c in response.history[0].cookies} - def _parse_post(self, post_id): - """Extract metadata of a single post""" - url = self.root + "/post/show/" + post_id + def _extract_post(self, post_id): + url = self.root + "/posts/" + post_id page = self.request(url, retries=10).text - extr = text.extract + extr = text.extract_from(page) - tags , pos = extr(page, "<title>", " | ") - vavg , pos = extr(page, "itemprop=ratingValue>", "<", pos) - vcnt , pos = extr(page, "itemprop=reviewCount>", "<", pos) - _ , pos = extr(page, "Posted: <", "", pos) - created, pos = extr(page, ' title="', '"', pos) - rating = extr(page, "<li>Rating: ", "<", pos)[0] + tags = extr("<title>", " | ") + vavg = extr('itemprop="ratingValue">', "<") + vcnt = extr('itemprop="reviewCount">', "<") + pid = extr(">Post ID:", "<") + created = extr(' title="', '"') - file_url, pos = extr(page, '<li>Original: <a href="', '"', pos) + file_url = extr('>Original:', 'id=') if file_url: - width , pos = extr(page, '>', 'x', pos) - height, pos = extr(page, '', ' ', pos) + file_url = extr(' href="', '"') + width = extr(">", "x") + height = extr("", " ") else: - width , pos = extr(page, '<object width=', ' ', pos) - height, pos = extr(page, 'height=', '>', pos) - file_url = extr(page, '<embed src="', '"', pos)[0] + width = extr('<object width=', ' ') + height = extr('height=', '>') + file_url = extr('<embed src="', '"') + + rating = extr(">Rating:", "<br") data = { - "id": text.parse_int(post_id), - "md5": file_url.rpartition("/")[2].partition(".")[0], - "tags": text.unescape(tags), + "id" : 
text.parse_int(pid), + "md5" : file_url.rpartition("/")[2].partition(".")[0], + "tags" : text.unescape(tags), "vote_average": text.parse_float(vavg), - "vote_count": text.parse_int(vcnt), - "created_at": created, - "rating": (rating or "?")[0].lower(), - "file_url": "https:" + text.unescape(file_url), - "width": text.parse_int(width), - "height": text.parse_int(height), + "vote_count" : text.parse_int(vcnt), + "created_at" : created, + "date" : text.parse_datetime( + created, "%Y-%m-%d %H:%M:%S.%f"), + "rating" : text.remove_html(rating).lower(), + "file_url" : "https:" + text.unescape(file_url), + "width" : text.parse_int(width), + "height" : text.parse_int(height), } - if self.extags: - tags = collections.defaultdict(list) - tags_html = text.extr(page, '<ul id=tag-sidebar>', '</ul>') - pattern = re.compile(r'tag-type-([^>]+)><a href="/\?tags=([^"]+)') - for tag_type, tag_name in pattern.findall(tags_html or ""): - tags[tag_type].append(text.unquote(tag_name)) - for key, value in tags.items(): - data["tags_" + key] = " ".join(value) + tags = collections.defaultdict(list) + tags_list = [] + tags_html = text.extr(page, '<ul id="tag-sidebar"', '</ul>') + for tag_type, tag_name in self.find_tags(tags_html or ""): + tags[tag_type].append(text.unquote(tag_name)) + for key, value in tags.items(): + data["tags_" + key] = " ".join(value) + tags_list += value + data["tags"] = " ".join(tags_list) return data @@ -178,15 +194,16 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor): while True: page = self.request(self.root, params=params, retries=10).text - pos = page.find("<div id=more-popular-posts-link>") + 1 - yield from text.extract_iter(page, '" id=p', '>', pos) + pos = ((page.find('id="more-popular-posts-link"') + 1) or + (page.find('<span class="thumb') + 1)) + yield from text.extract_iter(page, ' href="/posts/', '"', pos) next_url = text.extract(page, 'next-page-url="', '"', pos)[0] if not next_url: return - next_params = text.parse_query(text.unescape( - next_url).lstrip("?/")) + next_params = text.parse_query(text.unescape(text.unescape( + next_url).lstrip("?/"))) if "next" in next_params: # stop if the same "next" value occurs twice in a row (#265) @@ -201,8 +218,8 @@ class IdolcomplexPoolExtractor(IdolcomplexExtractor): subcategory = "pool" directory_fmt = ("{category}", "pool", "{pool}") archive_fmt = "p_{pool}_{id}" - pattern = r"(?:https?://)?idol\.sankakucomplex\.com/pool/show/(\d+)" - example = "https://idol.sankakucomplex.com/pool/show/12345" + pattern = r"(?:https?://)?idol\.sankakucomplex\.com/pools?/show/(\d+)" + example = "https://idol.sankakucomplex.com/pools/show/12345" per_page = 24 def __init__(self, match): @@ -219,15 +236,17 @@ class IdolcomplexPoolExtractor(IdolcomplexExtractor): return {"pool": self.pool_id} def post_ids(self): - url = self.root + "/pool/show/" + self.pool_id + url = self.root + "/pools/show/" + self.pool_id params = {"page": self.start_page} while True: page = self.request(url, params=params, retries=10).text - ids = list(text.extract_iter(page, '" id=p', '>')) + pos = page.find('id="pool-show"') + 1 + post_ids = list(text.extract_iter( + page, ' href="/posts/', '"', pos)) - yield from ids - if len(ids) < self.per_page: + yield from post_ids + if len(post_ids) < self.per_page: return params["page"] += 1 @@ -236,8 +255,8 @@ class IdolcomplexPostExtractor(IdolcomplexExtractor): """Extractor for single images from idol.sankakucomplex.com""" subcategory = "post" archive_fmt = "{id}" - pattern = r"(?:https?://)?idol\.sankakucomplex\.com/post/show/(\d+)" - 
example = "https://idol.sankakucomplex.com/post/show/12345" + pattern = BASE_PATTERN + r"/posts?/(?:show/)?([0-9a-f]+)" + example = "https://idol.sankakucomplex.com/posts/0123456789abcdef" def __init__(self, match): IdolcomplexExtractor.__init__(self, match) diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py index aca101e..3bdcfdf 100644 --- a/gallery_dl/extractor/imagefap.py +++ b/gallery_dl/extractor/imagefap.py @@ -126,14 +126,15 @@ class ImagefapImageExtractor(ImagefapExtractor): url = "{}/photo/{}/".format(self.root, self.image_id) page = self.request(url).text + url, pos = text.extract( + page, 'original="', '"') info, pos = text.extract( - page, '<script type="application/ld+json">', '</script>') + page, '<script type="application/ld+json">', '</script>', pos) image_id, pos = text.extract( page, 'id="imageid_input" value="', '"', pos) gallery_id, pos = text.extract( page, 'id="galleryid_input" value="', '"', pos) info = util.json_loads(info) - url = info["contentUrl"] return url, text.nameext_from_url(url, { "title": text.unescape(info["name"]), diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index b0789be..8ec6741 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -217,9 +217,10 @@ class InstagramExtractor(Extractor): data["post_shortcode"]) continue - if "video_versions" in item: + video_versions = item.get("video_versions") + if video_versions: video = max( - item["video_versions"], + video_versions, key=lambda x: (x["width"], x["height"], x["type"]), ) media = video diff --git a/gallery_dl/extractor/mangaread.py b/gallery_dl/extractor/mangaread.py index 8f19374..4b017dc 100644 --- a/gallery_dl/extractor/mangaread.py +++ b/gallery_dl/extractor/mangaread.py @@ -50,8 +50,8 @@ class MangareadChapterExtractor(MangareadBase, ChapterExtractor): page = text.extr( page, '<div class="reading-content">', '<div class="entry-header') return [ - (url.strip(), None) - for url in text.extract_iter(page, 'data-src="', '"') + (text.extr(img, 'src="', '"').strip(), None) + for img in text.extract_iter(page, '<img id="image-', '>') ] diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index 3c2b03e..c5fe840 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -45,6 +45,9 @@ class MastodonExtractor(BaseExtractor): attachments = status["media_attachments"] del status["media_attachments"] + if status["reblog"]: + attachments.extend(status["reblog"]["media_attachments"]) + status["instance"] = self.instance acct = status["account"]["acct"] status["instance_remote"] = \ @@ -113,7 +116,10 @@ class MastodonUserExtractor(MastodonExtractor): return api.account_statuses( api.account_id_by_username(self.item), - only_media=not self.config("text-posts", False), + only_media=( + not self.reblogs and + not self.config("text-posts", False) + ), exclude_replies=not self.replies, ) diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py index 9f5cc9d..bc7b308 100644 --- a/gallery_dl/extractor/nitter.py +++ b/gallery_dl/extractor/nitter.py @@ -96,6 +96,8 @@ class NitterExtractor(BaseExtractor): for url in text.extract_iter( attachments, '<source src="', '"'): + if url[0] == "/": + url = self.root + url append(text.nameext_from_url(url, {"url": url})) else: diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index 45313c5..d1f135d 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py 
@@ -11,7 +11,7 @@ from .common import Extractor, Message from .. import text, oauth, util, config, exception from ..output import stdout_write -from ..cache import cache +from ..cache import cache, memcache import urllib.parse import binascii import hashlib @@ -31,6 +31,9 @@ class OAuthBase(Extractor): def _init(self): self.cache = config.get(("extractor", self.category), "cache", True) + if self.cache and cache is memcache: + self.log.warning("cache file is not writeable") + self.cache = False def oauth_config(self, key, default=None): value = config.interpolate(("extractor", self.subcategory), key) diff --git a/gallery_dl/extractor/pixeldrain.py b/gallery_dl/extractor/pixeldrain.py new file mode 100644 index 0000000..34b4ebf --- /dev/null +++ b/gallery_dl/extractor/pixeldrain.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://pixeldrain.com/""" + +from .common import Extractor, Message +from .. import text + +BASE_PATTERN = r"(?:https?://)?pixeldrain\.com" + + +class PixeldrainExtractor(Extractor): + """Base class for pixeldrain extractors""" + category = "pixeldrain" + root = "https://pixeldrain.com" + archive_fmt = "{id}" + + def _init(self): + api_key = self.config("api-key") + if api_key: + self.session.auth = ("", api_key) + + def parse_datetime(self, date_string): + return text.parse_datetime( + date_string, "%Y-%m-%dT%H:%M:%S.%fZ") + + +class PixeldrainFileExtractor(PixeldrainExtractor): + """Extractor for pixeldrain files""" + subcategory = "file" + filename_fmt = "{filename[:230]} ({id}).{extension}" + pattern = BASE_PATTERN + r"/(?:u|api/file)/(\w+)" + example = "https://pixeldrain.com/u/abcdefgh" + + def __init__(self, match): + Extractor.__init__(self, match) + self.file_id = match.group(1) + + def items(self): + url = "{}/api/file/{}".format(self.root, self.file_id) + file = self.request(url + "/info").json() + + file["url"] = url + "?download" + file["date"] = self.parse_datetime(file["date_upload"]) + + text.nameext_from_url(file["name"], file) + yield Message.Directory, file + yield Message.Url, file["url"], file + + +class PixeldrainAlbumExtractor(PixeldrainExtractor): + """Extractor for pixeldrain albums""" + subcategory = "album" + directory_fmt = ("{category}", + "{album[date]:%Y-%m-%d} {album[title]} ({album[id]})") + filename_fmt = "{num:>03} {filename[:230]} ({id}).{extension}" + pattern = BASE_PATTERN + r"/(?:l|api/list)/(\w+)" + example = "https://pixeldrain.com/l/abcdefgh" + + def __init__(self, match): + Extractor.__init__(self, match) + self.album_id = match.group(1) + + def items(self): + url = "{}/api/list/{}".format(self.root, self.album_id) + album = self.request(url).json() + + files = album["files"] + album["count"] = album["file_count"] + album["date"] = self.parse_datetime(album["date_created"]) + + del album["files"] + del album["file_count"] + + yield Message.Directory, {"album": album} + for num, file in enumerate(files, 1): + file["album"] = album + file["num"] = num + file["url"] = url = "{}/api/file/{}?download".format( + self.root, file["id"]) + file["date"] = self.parse_datetime(file["date_upload"]) + text.nameext_from_url(file["name"], file) + yield Message.Url, url, file diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py index c5ce832..7ff40a3 100644 --- 
a/gallery_dl/extractor/pornhub.py +++ b/gallery_dl/extractor/pornhub.py @@ -143,7 +143,7 @@ class PornhubGifExtractor(PornhubExtractor): "url" : extr('"contentUrl": "', '"'), "date" : text.parse_datetime( extr('"uploadDate": "', '"'), "%Y-%m-%d"), - "user" : extr('data-mxptext="', '"'), + "user" : text.remove_html(extr("Created by:", "</div>")), } yield Message.Directory, gif diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index bebea2a..8941258 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -117,7 +117,7 @@ class SankakuPoolExtractor(SankakuExtractor): subcategory = "pool" directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name_en]}") archive_fmt = "p_{pool}_{id}" - pattern = BASE_PATTERN + r"/(?:books|pool/show)/(\d+)" + pattern = BASE_PATTERN + r"/(?:books|pools?/show)/(\d+)" example = "https://sankaku.app/books/12345" def __init__(self, match): @@ -143,7 +143,7 @@ class SankakuPostExtractor(SankakuExtractor): """Extractor for single posts from sankaku.app""" subcategory = "post" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/post(?:s|/show)/([0-9a-f]+)" + pattern = BASE_PATTERN + r"/posts?(?:/show)?/([0-9a-f]+)" example = "https://sankaku.app/post/show/12345" def __init__(self, match): diff --git a/gallery_dl/extractor/tmohentai.py b/gallery_dl/extractor/tmohentai.py new file mode 100644 index 0000000..9c29727 --- /dev/null +++ b/gallery_dl/extractor/tmohentai.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://tmohentai.com/""" + +from .common import GalleryExtractor +from .. 
import text + +BASE_PATTERN = r"(?:https?://)?tmohentai\.com" + + +class TmohentaiGalleryExtractor(GalleryExtractor): + category = "tmohentai" + root = "http://tmohentai.com" + directory_fmt = ("{category}", "{title} ({gallery_id})") + pattern = BASE_PATTERN + r"/(?:contents|reader)/(\w+)" + example = "https://tmohentai.com/contents/12345a67b89c0" + + def __init__(self, match): + self.gallery_id = match.group(1) + url = "{}/contents/{}".format(self.root, self.gallery_id) + GalleryExtractor.__init__(self, match, url) + + def images(self, page): + fmt = "https://imgrojo.tmohentai.com/contents/{}/{{:>03}}.webp".format( + self.gallery_id).format + cnt = page.count('class="lanzador') + return [(fmt(i), None) for i in range(0, cnt)] + + def metadata(self, page): + extr = text.extract_from(page) + + return { + "gallery_id": self.gallery_id, + "title" : text.unescape(extr("<h3>", "<").strip()), + "artists" : text.split_html(extr( + "<label>Artists and Artists Groups</label>", "</ul>")), + "genres" : text.split_html(extr( + "<label>Genders</label>", "</ul>")), + "tags" : text.split_html(extr( + "<label>Tags</label>", "</ul>")), + "uploader" : text.remove_html(extr( + "<label>Uploaded By</label>", "</ul>")), + "language" : extr(" ", "\n"), + } diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 3dab16e..f50ddb7 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -322,12 +322,15 @@ class TumblrDayExtractor(TumblrExtractor): def __init__(self, match): TumblrExtractor.__init__(self, match) year, month, day = match.group(4).split("/") - self.date_min = ( - # 719163 == date(1970, 1, 1).toordinal() - date(int(year), int(month), int(day)).toordinal() - 719163) * 86400 + self.ordinal = date(int(year), int(month), int(day)).toordinal() def _init(self): TumblrExtractor._init(self) + + self.date_min = ( + # 719163 == date(1970, 1, 1).toordinal() + (self.ordinal - 719163) * 86400) + self.api.before = self.date_min + 86400 def posts(self): diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 4766ae5..ca1e906 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -43,6 +43,7 @@ class TwitterExtractor(Extractor): self.quoted = self.config("quoted", False) self.videos = self.config("videos", True) self.cards = self.config("cards", False) + self.ads = self.config("ads", False) self.cards_blacklist = self.config("cards-blacklist") self.syndication = self.config("syndication") @@ -1034,7 +1035,7 @@ class TwitterAPI(): "focalTweetId": tweet_id, "referrer": "profile", "with_rux_injections": False, - "includePromotedContent": True, + "includePromotedContent": False, "withCommunity": True, "withQuickPromoteEligibilityTweetFields": True, "withBirdwatchNotes": True, @@ -1049,7 +1050,7 @@ class TwitterAPI(): variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, - "includePromotedContent": True, + "includePromotedContent": False, "withQuickPromoteEligibilityTweetFields": True, "withVoice": True, "withV2Timeline": True, @@ -1061,7 +1062,7 @@ class TwitterAPI(): variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, - "includePromotedContent": True, + "includePromotedContent": False, "withCommunity": True, "withVoice": True, "withV2Timeline": True, @@ -1498,13 +1499,21 @@ class TwitterAPI(): for entry in tweets: try: - tweet = ((entry.get("content") or entry["item"]) - ["itemContent"]["tweet_results"]["result"]) + item = ((entry.get("content") or 
entry["item"]) + ["itemContent"]) + if "promotedMetadata" in item and not extr.ads: + extr.log.debug( + "Skipping %s (ad)", + (entry.get("entryId") or "").rpartition("-")[2]) + continue + + tweet = item["tweet_results"]["result"] if "tombstone" in tweet: tweet = self._process_tombstone( entry, tweet["tombstone"]) if not tweet: continue + if "tweet" in tweet: tweet = tweet["tweet"] legacy = tweet["legacy"] diff --git a/gallery_dl/extractor/wallpapercave.py b/gallery_dl/extractor/wallpapercave.py index bce1026..faf3b0d 100644 --- a/gallery_dl/extractor/wallpapercave.py +++ b/gallery_dl/extractor/wallpapercave.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- # Copyright 2021 David Hoppenbrouwers +# Copyright 2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -22,7 +23,20 @@ class WallpapercaveImageExtractor(Extractor): def items(self): page = self.request(text.ensure_http_scheme(self.url)).text + + path = None for path in text.extract_iter(page, 'class="download" href="', '"'): image = text.nameext_from_url(path) yield Message.Directory, image yield Message.Url, self.root + path, image + + if path is None: + try: + path = text.rextract( + page, 'href="', '"', page.index('id="tdownload"'))[0] + except Exception: + pass + else: + image = text.nameext_from_url(path) + yield Message.Directory, image + yield Message.Url, self.root + path, image diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py index 8e6b842..3bb635d 100644 --- a/gallery_dl/extractor/warosu.py +++ b/gallery_dl/extractor/warosu.py @@ -90,4 +90,7 @@ class WarosuThreadExtractor(Extractor): data["filename"] = text.unquote(extr( "", "<").rstrip().rpartition(".")[0]) extr("<br>", "") - data["image"] = self.root + extr("<a href=", ">") + + data["image"] = url = extr("<a href=", ">") + if url[0] == "/": + data["image"] = self.root + url diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py index dc9a4f1..3f2f410 100644 --- a/gallery_dl/extractor/webtoons.py +++ b/gallery_dl/extractor/webtoons.py @@ -146,7 +146,12 @@ class WebtoonsComicExtractor(WebtoonsBase, Extractor): if page and path not in page: return - page = self.request(self.root + path).text + response = self.request(self.root + path) + if response.history: + parts = response.url.split("/") + self.path = "/".join(parts[3:-1]) + + page = response.text data["page"] = self.page_no for url in self.get_episode_urls(page): diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index ed05e1f..7413b5a 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -41,9 +41,14 @@ class WeiboExtractor(Extractor): def request(self, url, **kwargs): response = Extractor.request(self, url, **kwargs) - if response.history and "passport.weibo.com" in response.url: - self._sina_visitor_system(response) - response = Extractor.request(self, url, **kwargs) + if response.history: + if "login.sina.com" in response.url: + raise exception.StopExtraction( + "HTTP redirect to login page (%s)", + response.url.partition("?")[0]) + if "passport.weibo.com" in response.url: + self._sina_visitor_system(response) + response = Extractor.request(self, url, **kwargs) return response diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py index a28d8f5..46e574e 100644 --- a/gallery_dl/extractor/xvideos.py +++ b/gallery_dl/extractor/xvideos.py @@ -38,13 +38,13 @@ class 
XvideosGalleryExtractor(XvideosBase, GalleryExtractor): def metadata(self, page): extr = text.extract_from(page) - title = extr('"title":"', '"') user = { "id" : text.parse_int(extr('"id_user":', ',')), "display": extr('"display":"', '"'), "sex" : extr('"sex":"', '"'), "name" : self.user, } + title = extr('"title":"', '"') user["description"] = extr( '<small class="mobile-hide">', '</small>').strip() tags = extr('<em>Tagged:</em>', '<').strip() diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py index 5fe1943..1307399 100644 --- a/gallery_dl/extractor/zerochan.py +++ b/gallery_dl/extractor/zerochan.py @@ -63,14 +63,14 @@ class ZerochanExtractor(BooruExtractor): data = { "id" : text.parse_int(entry_id), - "author" : extr('"author": "', '"'), + "author" : text.parse_unicode_escapes(extr(' "name": "', '"')), "file_url": extr('"contentUrl": "', '"'), "date" : text.parse_datetime(extr('"datePublished": "', '"')), "width" : text.parse_int(extr('"width": "', ' ')), "height" : text.parse_int(extr('"height": "', ' ')), "size" : text.parse_bytes(extr('"contentSize": "', 'B')), "path" : text.split_html(extr( - 'class="breadcrumbs', '</p>'))[2:], + 'class="breadcrumbs', '</nav>'))[2:], "uploader": extr('href="/user/', '"'), "tags" : extr('<ul id="tags"', '</ul>'), "source" : extr('<h2>Source</h2>', '</p><h2>').rpartition( @@ -80,9 +80,9 @@ class ZerochanExtractor(BooruExtractor): html = data["tags"] tags = data["tags"] = [] for tag in html.split("<li class=")[1:]: - category = text.extr(tag, 'alt="', '"') - name = text.extr(tag, ">-->", "</a>") - tags.append(category + ":" + name.strip()) + category = text.extr(tag, 'data-type="', '"') + name = text.extr(tag, 'data-tag="', '"') + tags.append(category.capitalize() + ":" + name) return data diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 1e80cbf..ac2ac7a 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -87,24 +87,27 @@ class Job(): extr.category = pextr.category extr.subcategory = pextr.subcategory - self.metadata_url = extr.config("url-metadata") - self.metadata_http = extr.config("http-metadata") + self.metadata_url = extr.config2("metadata-url", "url-metadata") + self.metadata_http = extr.config2("metadata-http", "http-metadata") + metadata_path = extr.config2("metadata-path", "path-metadata") + metadata_version = extr.config2("metadata-version", "version-metadata") + metadata_extractor = extr.config2( + "metadata-extractor", "extractor-metadata") - version_info = extr.config("version-metadata") - metadata_path = extr.config("path-metadata") - - # user-supplied metadata - kwdict = extr.config("keywords") - if kwdict: - self.kwdict.update(kwdict) if metadata_path: self.kwdict[metadata_path] = path_proxy - if version_info: - self.kwdict[version_info] = { + if metadata_extractor: + self.kwdict[metadata_extractor] = extr + if metadata_version: + self.kwdict[metadata_version] = { "version" : version.__version__, "is_executable" : util.EXECUTABLE, "current_git_head": util.git_head() } + # user-supplied metadata + kwdict = extr.config("keywords") + if kwdict: + self.kwdict.update(kwdict) def run(self): """Execute or run the job""" @@ -375,7 +378,7 @@ class DownloadJob(Job): else: extr._parentdir = pextr._parentdir - pmeta = pextr.config("parent-metadata") + pmeta = pextr.config2("parent-metadata", "metadata-parent") if pmeta: if isinstance(pmeta, str): data = self.kwdict.copy() diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 1982b71..255d9f2 100644 --- a/gallery_dl/option.py +++ 
b/gallery_dl/option.py @@ -44,21 +44,94 @@ class DeprecatedConfigConstAction(argparse.Action): namespace.options.append(((), self.dest, self.const)) -class ParseAction(argparse.Action): - """Parse <key>=<value> options and set them as config values""" +class ConfigParseAction(argparse.Action): + """Parse KEY=VALUE config options""" def __call__(self, parser, namespace, values, option_string=None): key, value = _parse_option(values) key = key.split(".") # splitting an empty string becomes [""] namespace.options.append((key[:-1], key[-1], value)) -class OptionAction(argparse.Action): - """Parse <key>=<value> options for """ +class PPParseAction(argparse.Action): + """Parse KEY=VALUE post processor options""" def __call__(self, parser, namespace, values, option_string=None): key, value = _parse_option(values) namespace.options_pp[key] = value +class InputfileAction(argparse.Action): + """Collect input files""" + def __call__(self, parser, namespace, value, option_string=None): + namespace.input_files.append((value, self.const)) + + +class MtimeAction(argparse.Action): + """Configure mtime post processors""" + def __call__(self, parser, namespace, value, option_string=None): + namespace.postprocessors.append({ + "name": "mtime", + "value": "{" + (self.const or value) + "}", + }) + + +class UgoiraAction(argparse.Action): + """Configure ugoira post processors""" + def __call__(self, parser, namespace, value, option_string=None): + if self.const: + value = self.const + else: + value = value.strip().lower() + + if value in ("webm", "vp9"): + pp = { + "extension" : "webm", + "ffmpeg-args" : ("-c:v", "libvpx-vp9", + "-crf", "12", + "-b:v", "0", "-an"), + } + elif value == "vp9-lossless": + pp = { + "extension" : "webm", + "ffmpeg-args" : ("-c:v", "libvpx-vp9", + "-lossless", "1", + "-pix_fmt", "yuv420p", "-an"), + } + elif value == "vp8": + pp = { + "extension" : "webm", + "ffmpeg-args" : ("-c:v", "libvpx", + "-crf", "4", + "-b:v", "5000k", "-an"), + } + elif value == "mp4": + pp = { + "extension" : "mp4", + "ffmpeg-args" : ("-c:v", "libx264", "-an", "-b:v", "5M"), + "libx264-prevent-odd": True, + } + elif value == "gif": + pp = { + "extension" : "gif", + "ffmpeg-args" : ("-filter_complex", "[0:v] split [a][b];" + "[a] palettegen [p];[b][p] paletteuse"), + "repeat-last-frame": False, + } + elif value in ("mkv", "copy"): + pp = { + "extension" : "mkv", + "ffmpeg-args" : ("-c:v", "copy"), + "repeat-last-frame": False, + } + else: + parser.error("Unsupported Ugoira format '{}'".format(value)) + + pp["name"] = "ugoira" + pp["whitelist"] = ("pixiv", "danbooru") + + namespace.options.append(((), "ugoira", True)) + namespace.postprocessors.append(pp) + + class Formatter(argparse.HelpFormatter): """Custom HelpFormatter class to customize help output""" def __init__(self, prog): @@ -101,12 +174,6 @@ def build_parser(): help="Print program version and exit", ) general.add_argument( - "-i", "--input-file", - dest="inputfiles", metavar="FILE", action="append", - help=("Download URLs found in FILE ('-' for stdin). 
" - "More than one --input-file can be specified"), - ) - general.add_argument( "-f", "--filename", dest="filename", metavar="FORMAT", help=("Filename format string for downloaded files " @@ -149,6 +216,32 @@ def build_parser(): "(ALL to delete everything)", ) + input = parser.add_argument_group("Input Options") + input.add_argument( + "urls", + metavar="URL", nargs="*", + help=argparse.SUPPRESS, + ) + input.add_argument( + "-i", "--input-file", + dest="input_files", metavar="FILE", action=InputfileAction, const=None, + default=[], + help=("Download URLs found in FILE ('-' for stdin). " + "More than one --input-file can be specified"), + ) + input.add_argument( + "-I", "--input-file-comment", + dest="input_files", metavar="FILE", action=InputfileAction, const="c", + help=("Download URLs found in FILE. " + "Comment them out after they were downloaded successfully."), + ) + input.add_argument( + "-x", "--input-file-delete", + dest="input_files", metavar="FILE", action=InputfileAction, const="d", + help=("Download URLs found in FILE. " + "Delete them after they were downloaded successfully."), + ) + output = parser.add_argument_group("Output Options") output.add_argument( "-q", "--quiet", @@ -308,7 +401,8 @@ def build_parser(): configuration = parser.add_argument_group("Configuration Options") configuration.add_argument( "-o", "--option", - dest="options", metavar="KEY=VALUE", action=ParseAction, default=[], + dest="options", metavar="KEY=VALUE", + action=ConfigParseAction, default=[], help=("Additional options. " "Example: -o browser=firefox") , ) @@ -437,43 +531,15 @@ def build_parser(): } postprocessor = parser.add_argument_group("Post-processing Options") postprocessor.add_argument( - "--zip", - dest="postprocessors", - action="append_const", const="zip", - help="Store downloaded files in a ZIP archive", - ) - postprocessor.add_argument( - "--ugoira-conv", - dest="postprocessors", action="append_const", const={ - "name" : "ugoira", - "ffmpeg-args" : ("-c:v", "libvpx", "-crf", "4", "-b:v", "5000k"), - "ffmpeg-twopass": True, - "whitelist" : ("pixiv", "danbooru"), - }, - help="Convert Pixiv Ugoira to WebM (requires FFmpeg)", - ) - postprocessor.add_argument( - "--ugoira-conv-lossless", - dest="postprocessors", action="append_const", const={ - "name" : "ugoira", - "ffmpeg-args" : ("-c:v", "libvpx-vp9", "-lossless", "1", - "-pix_fmt", "yuv420p"), - "ffmpeg-twopass": False, - "whitelist" : ("pixiv", "danbooru"), - }, - help="Convert Pixiv Ugoira to WebM in VP9 lossless mode", + "-P", "--postprocessor", + dest="postprocessors", metavar="NAME", action="append", default=[], + help="Activate the specified post processor", ) postprocessor.add_argument( - "--ugoira-conv-copy", - dest="postprocessors", action="append_const", const={ - "name" : "ugoira", - "extension" : "mkv", - "ffmpeg-args" : ("-c:v", "copy"), - "ffmpeg-twopass" : False, - "repeat-last-frame": False, - "whitelist" : ("pixiv", "danbooru"), - }, - help="Convert Pixiv Ugoira to MKV without re-encoding any frames", + "-O", "--postprocessor-option", + dest="options_pp", metavar="KEY=VALUE", + action=PPParseAction, default={}, + help="Additional post processor options", ) postprocessor.add_argument( "--write-metadata", @@ -500,10 +566,54 @@ def build_parser(): help="Write image tags to separate text files", ) postprocessor.add_argument( - "--mtime-from-date", + "--zip", dest="postprocessors", - action="append_const", const="mtime", - help="Set file modification times according to 'date' metadata", + action="append_const", const="zip", + 
help="Store downloaded files in a ZIP archive", + ) + postprocessor.add_argument( + "--cbz", + dest="postprocessors", + action="append_const", const={ + "name" : "zip", + "extension": "cbz", + }, + help="Store downloaded files in a CBZ archive", + ) + postprocessor.add_argument( + "--mtime", + dest="postprocessors", metavar="NAME", action=MtimeAction, + help=("Set file modification times according to metadata " + "selected by NAME. Examples: 'date' or 'status[date]'"), + ) + postprocessor.add_argument( + "--mtime-from-date", + dest="postprocessors", nargs=0, action=MtimeAction, + const="date|status[date]", + help=argparse.SUPPRESS, + ) + postprocessor.add_argument( + "--ugoira", + dest="postprocessors", metavar="FORMAT", action=UgoiraAction, + help=("Convert Pixiv Ugoira to FORMAT using FFmpeg. " + "Supported formats are 'webm', 'mp4', 'gif', " + "'vp8', 'vp9', 'vp9-lossless', 'copy'."), + ) + postprocessor.add_argument( + "--ugoira-conv", + dest="postprocessors", nargs=0, action=UgoiraAction, const="vp8", + help=argparse.SUPPRESS, + ) + postprocessor.add_argument( + "--ugoira-conv-lossless", + dest="postprocessors", nargs=0, action=UgoiraAction, + const="vp9-lossless", + help=argparse.SUPPRESS, + ) + postprocessor.add_argument( + "--ugoira-conv-copy", + dest="postprocessors", nargs=0, action=UgoiraAction, const="copy", + help=argparse.SUPPRESS, ) postprocessor.add_argument( "--exec", @@ -519,25 +629,9 @@ def build_parser(): dest="postprocessors", metavar="CMD", action=AppendCommandAction, const={ "name": "exec", "event": "finalize"}, - help=("Execute CMD after all files were downloaded successfully. " + help=("Execute CMD after all files were downloaded. " "Example: --exec-after \"cd {_directory} " "&& convert * ../doc.pdf\""), ) - postprocessor.add_argument( - "-P", "--postprocessor", - dest="postprocessors", metavar="NAME", action="append", - help="Activate the specified post processor", - ) - postprocessor.add_argument( - "-O", "--postprocessor-option", - dest="options_pp", metavar="OPT", action=OptionAction, default={}, - help="Additional '<key>=<value>' post processor options", - ) - - parser.add_argument( - "urls", - metavar="URL", nargs="*", - help=argparse.SUPPRESS, - ) return parser diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 6255d49..62aa12d 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -487,82 +487,6 @@ CODES = { } -def parse_inputfile(file, log): - """Filter and process strings from an input file. - - Lines starting with '#' and empty lines will be ignored. - Lines starting with '-' will be interpreted as a key-value pair separated - by an '='. where 'key' is a dot-separated option name and 'value' is a - JSON-parsable value. These configuration options will be applied while - processing the next URL. - Lines starting with '-G' are the same as above, except these options will - be applied for *all* following URLs, i.e. they are Global. - Everything else will be used as a potential URL. - - Example input file: - - # settings global options - -G base-directory = "/tmp/" - -G skip = false - - # setting local options for the next URL - -filename="spaces_are_optional.jpg" - -skip = true - - https://example.org/ - - # next URL uses default filename and 'skip' is false. 
- https://example.com/index.htm # comment1 - https://example.com/404.htm # comment2 - """ - gconf = [] - lconf = [] - strip_comment = None - - for line in file: - line = line.strip() - - if not line or line[0] == "#": - # empty line or comment - continue - - elif line[0] == "-": - # config spec - if len(line) >= 2 and line[1] == "G": - conf = gconf - line = line[2:] - else: - conf = lconf - line = line[1:] - - key, sep, value = line.partition("=") - if not sep: - log.warning("input file: invalid <key>=<value> pair: %s", line) - continue - - try: - value = json_loads(value.strip()) - except ValueError as exc: - log.warning("input file: unable to parse '%s': %s", value, exc) - continue - - key = key.strip().split(".") - conf.append((key[:-1], key[-1], value)) - - else: - # url - if " #" in line or "\t#" in line: - if strip_comment is None: - strip_comment = re.compile(r"\s+#.*").sub - line = strip_comment("", line) - if gconf or lconf: - yield ExtendedUrl(line, gconf, lconf) - gconf = [] - lconf = [] - else: - yield line - - class CustomNone(): """None-style type that supports more operations than regular None""" __slots__ = () @@ -873,15 +797,6 @@ class FilterPredicate(): raise exception.FilterError(exc) -class ExtendedUrl(): - """URL with attached config key-value pairs""" - def __init__(self, url, gconf, lconf): - self.value, self.gconfig, self.lconfig = url, gconf, lconf - - def __str__(self): - return self.value - - class DownloadArchive(): def __init__(self, path, format_string, pragma=None, diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 5050174..5034fb2 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.26.2" +__version__ = "1.26.3" |
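Note: the new -i/--input-file, -I/--input-file-comment and -x/--input-file-delete options introduced in the option.py hunk above all share a single 'input_files' destination; the per-option behaviour (plain, comment out, delete) travels in the action's 'const'. Below is a minimal, self-contained sketch of that argparse pattern, reduced to what the hunk shows; it is illustrative only and not code from this commit.

import argparse

class InputfileAction(argparse.Action):
    """Append a (path, action-code) pair to a shared 'input_files' list"""
    def __call__(self, parser, namespace, value, option_string=None):
        # const distinguishes plain (-i, None), comment-out (-I, "c"),
        # and delete (-x, "d") input files
        namespace.input_files.append((value, self.const))

parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input-file", dest="input_files", metavar="FILE",
                    action=InputfileAction, const=None, default=[])
parser.add_argument("-I", "--input-file-comment", dest="input_files",
                    metavar="FILE", action=InputfileAction, const="c")
parser.add_argument("-x", "--input-file-delete", dest="input_files",
                    metavar="FILE", action=InputfileAction, const="d")

args = parser.parse_args(["-i", "urls.txt", "-I", "done.txt"])
print(args.input_files)  # [('urls.txt', None), ('done.txt', 'c')]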

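Note: the deprecated --mtime-from-date and --ugoira-conv* flags are kept as hidden aliases (help=argparse.SUPPRESS) of the new --mtime and --ugoira options by reusing the same Action class with nargs=0 and a preset const. A minimal sketch of that mechanism for the mtime case, with the post processor list simplified to a bare namespace attribute; illustrative only, not code from this commit.

import argparse

class MtimeAction(argparse.Action):
    """Append an 'mtime' post processor for the selected metadata field"""
    def __call__(self, parser, namespace, value, option_string=None):
        # with nargs=0 (the hidden alias), value is [] and const supplies
        # the metadata field name; otherwise the user-provided NAME is used
        namespace.postprocessors.append({
            "name": "mtime",
            "value": "{" + (self.const or value) + "}",
        })

parser = argparse.ArgumentParser()
parser.add_argument("--mtime", dest="postprocessors", metavar="NAME",
                    action=MtimeAction, default=[])
parser.add_argument("--mtime-from-date", dest="postprocessors", nargs=0,
                    action=MtimeAction, const="date|status[date]",
                    help=argparse.SUPPRESS)

args = parser.parse_args(["--mtime", "date", "--mtime-from-date"])
print(args.postprocessors)
# [{'name': 'mtime', 'value': '{date}'},
#  {'name': 'mtime', 'value': '{date|status[date]}'}]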