From 78e2d1672e4301497f786cd03637de9ddbc717ac Mon Sep 17 00:00:00 2001 From: Unit 193 Date: Mon, 17 Oct 2022 03:44:04 -0400 Subject: New upstream version 1.23.3. --- CHANGELOG.md | 31 +++++++ PKG-INFO | 6 +- README.rst | 4 +- data/man/gallery-dl.1 | 2 +- data/man/gallery-dl.conf.5 | 57 +++++++++++- docs/gallery-dl.conf | 10 ++ gallery_dl.egg-info/PKG-INFO | 6 +- gallery_dl.egg-info/SOURCES.txt | 3 + gallery_dl/__init__.py | 80 +--------------- gallery_dl/extractor/2chen.py | 99 ++++++++++++++++++++ gallery_dl/extractor/8chan.py | 172 +++++++++++++++++++++++++++++++++++ gallery_dl/extractor/__init__.py | 3 + gallery_dl/extractor/artstation.py | 16 +++- gallery_dl/extractor/common.py | 13 ++- gallery_dl/extractor/danbooru.py | 17 +++- gallery_dl/extractor/deviantart.py | 10 +- gallery_dl/extractor/directlink.py | 10 +- gallery_dl/extractor/fanbox.py | 23 +++++ gallery_dl/extractor/generic.py | 6 +- gallery_dl/extractor/hitomi.py | 8 +- gallery_dl/extractor/imagefap.py | 109 +++++++++++++--------- gallery_dl/extractor/instagram.py | 105 ++++++++++++++------- gallery_dl/extractor/nana.py | 115 +++++++++++++++++++++++ gallery_dl/extractor/nijie.py | 49 ++++++++++ gallery_dl/extractor/nozomi.py | 14 +-- gallery_dl/extractor/redgifs.py | 26 +++++- gallery_dl/extractor/tumblr.py | 21 ++++- gallery_dl/extractor/unsplash.py | 2 +- gallery_dl/extractor/vk.py | 33 ++++++- gallery_dl/extractor/wallhaven.py | 2 +- gallery_dl/job.py | 21 +++-- gallery_dl/path.py | 5 +- gallery_dl/postprocessor/metadata.py | 18 ++-- gallery_dl/util.py | 76 ++++++++++++++++ gallery_dl/version.py | 2 +- test/test_postprocessor.py | 1 + test/test_results.py | 4 + 37 files changed, 946 insertions(+), 233 deletions(-) create mode 100644 gallery_dl/extractor/2chen.py create mode 100644 gallery_dl/extractor/8chan.py create mode 100644 gallery_dl/extractor/nana.py diff --git a/CHANGELOG.md b/CHANGELOG.md index c83ab91..5901e37 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,36 @@ # Changelog +## 1.23.3 - 2022-10-15 +### Additions +- [2chen] Add `2chen.moe` extractor ([#2707](https://github.com/mikf/gallery-dl/issues/2707)) +- [8chan] add `thread` and `board` extractors ([#2938](https://github.com/mikf/gallery-dl/issues/2938)) +- [deviantart] add `group` option ([#3018](https://github.com/mikf/gallery-dl/issues/3018)) +- [fanbox] add `content` metadata field ([#3020](https://github.com/mikf/gallery-dl/issues/3020)) +- [instagram] restore `cursor` functionality ([#2991](https://github.com/mikf/gallery-dl/issues/2991)) +- [instagram] restore warnings for private profiles ([#3004](https://github.com/mikf/gallery-dl/issues/3004), [#3045](https://github.com/mikf/gallery-dl/issues/3045)) +- [nana] add `nana` extractors ([#2967](https://github.com/mikf/gallery-dl/issues/2967)) +- [nijie] add `feed` and `followed` extractors ([#3048](https://github.com/mikf/gallery-dl/issues/3048)) +- [tumblr] support `https://www.tumblr.com/BLOGNAME` URLs ([#3034](https://github.com/mikf/gallery-dl/issues/3034)) +- [tumblr] add `offset` option +- [vk] add `tagged` extractor ([#2997](https://github.com/mikf/gallery-dl/issues/2997)) +- add `path-extended` option ([#3021](https://github.com/mikf/gallery-dl/issues/3021)) +- emit debug logging messages before calling time.sleep() ([#2982](https://github.com/mikf/gallery-dl/issues/2982)) +### Changes +- [postprocessor:metadata] assume `"mode": "custom"` when `format` is given +### Fixes +- [artstation] skip missing projects ([#3016](https://github.com/mikf/gallery-dl/issues/3016)) +- [danbooru] fix ugoira metadata extraction ([#3056](https://github.com/mikf/gallery-dl/issues/3056)) +- [deviantart] fix `deviation` extraction ([#2981](https://github.com/mikf/gallery-dl/issues/2981)) +- [hitomi] fall back to `webp` when selected format is not available ([#3030](https://github.com/mikf/gallery-dl/issues/3030)) +- [imagefap] fix and improve folder extraction and gallery pagination ([#3013](https://github.com/mikf/gallery-dl/issues/3013)) +- [instagram] fix login ([#3011](https://github.com/mikf/gallery-dl/issues/3011), [#3015](https://github.com/mikf/gallery-dl/issues/3015)) +- [nozomi] fix extraction ([#3051](https://github.com/mikf/gallery-dl/issues/3051)) +- [redgifs] fix extraction ([#3037](https://github.com/mikf/gallery-dl/issues/3037)) +- [tumblr] sleep between fallback retries ([#2957](https://github.com/mikf/gallery-dl/issues/2957)) +- [vk] unescape error messages +- fix duplicated metadata bug with `-j` ([#3033](https://github.com/mikf/gallery-dl/issues/3033)) +- fix bug when processing input file comments ([#2808](https://github.com/mikf/gallery-dl/issues/2808)) + ## 1.23.2 - 2022-10-01 ### Additions - [artstation] support search filters ([#2970](https://github.com/mikf/gallery-dl/issues/2970)) diff --git a/PKG-INFO b/PKG-INFO index aea8d49..2ecb797 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.23.2 +Version: 1.23.3 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -99,8 +99,8 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ -- `Linux `__ +- `Windows `__ +- `Linux `__ | Executables build from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml diff --git a/README.rst b/README.rst index 5676a0e..1457efc 100644 --- a/README.rst +++ b/README.rst @@ -66,8 +66,8 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ -- `Linux `__ +- `Windows `__ +- `Linux `__ | Executables build from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index c7051c2..cca3dee 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2022-10-01" "1.23.2" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2022-10-15" "1.23.3" "gallery-dl Manual" .\" disable hyphenation .nh diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index 14db723..1c484b6 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2022-10-01" "1.23.2" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2022-10-15" "1.23.3" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -310,6 +310,18 @@ depending on the local operating system * \f[I]"windows"\f[]: \f[I]". "\f[] +.SS extractor.*.path-extended +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +On Windows, use \f[I]extended-length paths\f[] +prefixed with \f[I]\\\\?\\\f[] to work around the 260 characters path length limit. + + .SS extractor.*.extension-map .IP "Type:" 6 \f[I]object\f[] @@ -1211,6 +1223,18 @@ Note: Gathering this information requires a lot of API calls. Use with caution. +.SS extractor.deviantart.group +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Check whether the profile name in a given URL +belongs to a group or a regular user. + + .SS extractor.deviantart.include .IP "Type:" 6 \f[I]string\f[] or \f[I]list\f[] of \f[I]strings\f[] @@ -1974,6 +1998,18 @@ Fetch media from replies to other posts. Also emit metadata for text-only posts without media content. +.SS extractor.nana.favkey +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]null\f[] + +.IP "Description:" 4 +Your \f[I]Nana Favorite Key\f[], +used to access your favorite archives. + + .SS extractor.newgrounds.flash .IP "Type:" 6 \f[I]bool\f[] @@ -2610,6 +2646,19 @@ images from them. Search posts for inline images and videos. +.SS extractor.tumblr.offset +.IP "Type:" 6 +\f[I]integer\f[] + +.IP "Default:" 9 +\f[I]0\f[] + +.IP "Description:" 4 +Custom \f[I]offset\f[] starting value when paginating over blog posts. + +Allows skipping over posts without having to waste API calls. + + .SS extractor.tumblr.original .IP "Type:" 6 \f[I]bool\f[] @@ -3474,9 +3523,13 @@ Disable the use of a proxy by explicitly setting this option to \f[I]null\f[]. \f[I]true\f[] .IP "Description:" 4 -Check the file headers of \f[I]jpg\f[], \f[I]png\f[], and \f[I]gif\f[] files +Check file headers of downloaded files and adjust their filename extensions if they do not match. +For example, this will change the filename extension (\f[I]{extension}\f[]) +of a file called \f[I]example.png\f[] from \f[I]png\f[] to \f[I]jpg\f[] when said file +contains JPEG/JFIF data. + .SS downloader.http.headers .IP "Type:" 6 diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 1c565ec..e507eb0 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -24,6 +24,8 @@ "path-replace": "_", "path-remove": "\\u0000-\\u001f\\u007f", "path-strip": "auto", + "path-extended": true, + "extension-map": { "jpeg": "jpg", "jpe" : "jpg", @@ -71,10 +73,13 @@ { "client-id": null, "client-secret": null, + "auto-watch": false, + "auto-unwatch": false, "comments": false, "extra": false, "flat": true, "folders": false, + "group": true, "include": "gallery", "journals": "html", "mature": true, @@ -189,6 +194,10 @@ "format": "original", "include": "art" }, + "nana": + { + "favkey": null + }, "nijie": { "username": null, @@ -288,6 +297,7 @@ "external": false, "inline": true, "posts": "all", + "offset": 0, "original": true, "reblogs": true }, diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index 016840e..c1bfabf 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery-dl -Version: 1.23.2 +Version: 1.23.3 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -99,8 +99,8 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ -- `Linux `__ +- `Windows `__ +- `Linux `__ | Executables build from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index 73cc80b..b768d5b 100644 --- a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -40,11 +40,13 @@ gallery_dl/downloader/http.py gallery_dl/downloader/text.py gallery_dl/downloader/ytdl.py gallery_dl/extractor/2chan.py +gallery_dl/extractor/2chen.py gallery_dl/extractor/35photo.py gallery_dl/extractor/3dbooru.py gallery_dl/extractor/420chan.py gallery_dl/extractor/4chan.py gallery_dl/extractor/500px.py +gallery_dl/extractor/8chan.py gallery_dl/extractor/8kun.py gallery_dl/extractor/8muses.py gallery_dl/extractor/__init__.py @@ -132,6 +134,7 @@ gallery_dl/extractor/message.py gallery_dl/extractor/moebooru.py gallery_dl/extractor/myhentaigallery.py gallery_dl/extractor/myportfolio.py +gallery_dl/extractor/nana.py gallery_dl/extractor/naver.py gallery_dl/extractor/naverwebtoon.py gallery_dl/extractor/newgrounds.py diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 7504fa4..b64fa2f 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -7,7 +7,6 @@ # published by the Free Software Foundation. import sys -import json import logging from . import version, config, option, output, extractor, job, util, exception @@ -32,81 +31,6 @@ def progress(urls, pformat): yield pinfo["url"] -def parse_inputfile(file, log): - """Filter and process strings from an input file. - - Lines starting with '#' and empty lines will be ignored. - Lines starting with '-' will be interpreted as a key-value pair separated - by an '='. where 'key' is a dot-separated option name and 'value' is a - JSON-parsable value. These configuration options will be applied while - processing the next URL. - Lines starting with '-G' are the same as above, except these options will - be applied for *all* following URLs, i.e. they are Global. - Everything else will be used as a potential URL. - - Example input file: - - # settings global options - -G base-directory = "/tmp/" - -G skip = false - - # setting local options for the next URL - -filename="spaces_are_optional.jpg" - -skip = true - - https://example.org/ - - # next URL uses default filename and 'skip' is false. - https://example.com/index.htm # comment1 - https://example.com/404.htm # comment2 - """ - gconf = [] - lconf = [] - - for line in file: - line = line.strip() - - if not line or line[0] == "#": - # empty line or comment - continue - - elif line[0] == "-": - # config spec - if len(line) >= 2 and line[1] == "G": - conf = gconf - line = line[2:] - else: - conf = lconf - line = line[1:] - - key, sep, value = line.partition("=") - if not sep: - log.warning("input file: invalid = pair: %s", line) - continue - - try: - value = json.loads(value.strip()) - except ValueError as exc: - log.warning("input file: unable to parse '%s': %s", value, exc) - continue - - key = key.strip().split(".") - conf.append((key[:-1], key[-1], value)) - - else: - # url - if " #" in line: - line = line.partition(" #")[0].rstrip() - elif "\t#" in line: - line = line.partition("\t#")[0].rstrip() - if gconf or lconf: - yield util.ExtendedUrl(line, gconf, lconf) - gconf = [] - lconf = [] - else: - yield line - - def main(): try: if sys.stdout and sys.stdout.encoding.lower() != "utf-8": @@ -275,12 +199,12 @@ def main(): try: if inputfile == "-": if sys.stdin: - urls += parse_inputfile(sys.stdin, log) + urls += util.parse_inputfile(sys.stdin, log) else: log.warning("input file: stdin is not readable") else: with open(inputfile, encoding="utf-8") as file: - urls += parse_inputfile(file, log) + urls += util.parse_inputfile(file, log) except OSError as exc: log.warning("input file: %s", exc) diff --git a/gallery_dl/extractor/2chen.py b/gallery_dl/extractor/2chen.py new file mode 100644 index 0000000..8fffeb0 --- /dev/null +++ b/gallery_dl/extractor/2chen.py @@ -0,0 +1,99 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://2chen.moe/""" + +from .common import Extractor, Message +from .. import text + + +class _2chenThreadExtractor(Extractor): + """Extractor for 2chen threads""" + category = "2chen" + subcategory = "thread" + directory_fmt = ("{category}", "{board}", "{thread} {title}") + filename_fmt = "{time} {filename}.{extension}" + archive_fmt = "{board}_{thread}_{hash}" + root = "https://2chen.moe" + pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)/(\d+)" + test = ( + ("https://2chen.moe/jp/303786", { + "count": ">= 10", + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.board, self.thread = match.groups() + + def items(self): + url = "{}/{}/{}".format(self.root, self.board, self.thread) + page = self.request(url, encoding="utf-8").text + data = self.metadata(page) + yield Message.Directory, data + for post in self.posts(page): + if not post["url"]: + continue + post.update(data) + post["url"] = self.root + post["url"] + post["time"] = text.parse_int(post["date"].timestamp()) + yield Message.Url, post["url"], text.nameext_from_url( + post["filename"], post) + + def metadata(self, page): + board, pos = text.extract(page, 'class="board">/', '/<') + title = text.extract(page, "

", "

", pos)[0] + return { + "board" : board, + "thread": self.thread, + "title" : text.unescape(title), + } + + def posts(self, page): + """Return iterable with relevant posts""" + return map(self.parse, text.extract_iter( + page, 'class="glass media', '')) + + def parse(self, post): + extr = text.extract_from(post) + return { + "name" : text.unescape(extr("", "")), + "date" : text.parse_datetime( + extr("")[2], + "%d %b %Y (%a) %H:%M:%S" + ), + "no" : extr('href="#p', '"'), + "url" : extr(' board["pageCount"]: + return + url = "{}/{}/{}.json".format(self.root, self.board, page) + threads = self.request(url).json()["threads"] diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index fed6998..851f660 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -10,11 +10,13 @@ import re modules = [ "2chan", + "2chen", "35photo", "3dbooru", "420chan", "4chan", "500px", + "8chan", "8kun", "8muses", "adultempire", @@ -90,6 +92,7 @@ modules = [ "mememuseum", "myhentaigallery", "myportfolio", + "nana", "naver", "naverwebtoon", "newgrounds", diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py index 62626a1..14d1e6b 100644 --- a/gallery_dl/extractor/artstation.py +++ b/gallery_dl/extractor/artstation.py @@ -76,7 +76,12 @@ class ArtstationExtractor(Extractor): def get_project_assets(self, project_id): """Return all assets associated with 'project_id'""" url = "{}/projects/{}.json".format(self.root, project_id) - data = self.request(url).json() + + try: + data = self.request(url).json() + except exception.HttpError as exc: + self.log.warning(exc) + return data["title"] = text.unescape(data["title"]) data["description"] = text.unescape(text.remove_html( @@ -406,6 +411,10 @@ class ArtstationImageExtractor(ArtstationExtractor): "options": (("external", True),), "pattern": "ytdl:https://www.youtube.com/embed/JNFfJtwwrU0", }), + # 404 (#3016) + ("https://www.artstation.com/artwork/3q3mXB", { + "count": 0, + }), # alternate URL patterns ("https://sungchoi.artstation.com/projects/LQVJr"), ("https://artstn.co/p/LQVJr"), @@ -419,7 +428,10 @@ class ArtstationImageExtractor(ArtstationExtractor): def metadata(self): self.assets = list(ArtstationExtractor.get_project_assets( self, self.project_id)) - self.user = self.assets[0]["user"]["username"] + try: + self.user = self.assets[0]["user"]["username"] + except IndexError: + self.user = "" return ArtstationExtractor.metadata(self) def projects(self): diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index f7ee51f..e304717 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -122,8 +122,7 @@ class Extractor(): seconds = (self._interval() - (time.time() - Extractor.request_timestamp)) if seconds > 0.0: - self.log.debug("Sleeping for %.5s seconds", seconds) - time.sleep(seconds) + self.sleep(seconds, "request") while True: try: @@ -169,8 +168,9 @@ class Extractor(): self.log.debug("%s (%s/%s)", msg, tries, retries+1) if tries > retries: break - time.sleep( - max(tries, self._interval()) if self._interval else tries) + self.sleep( + max(tries, self._interval()) if self._interval else tries, + "retry") tries += 1 raise exception.HttpError(msg, response) @@ -202,6 +202,11 @@ class Extractor(): self.log.info("Waiting until %s for %s.", isotime, reason) time.sleep(seconds) + def sleep(self, seconds, reason): + self.log.debug("Sleeping %.2f seconds (%s)", + seconds, reason) + time.sleep(seconds) + def _get_auth_info(self): """Return authentication information as (username, password) tuple""" username = self.config("username") diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index 8c2ed53..c455ce1 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -88,10 +88,7 @@ class DanbooruExtractor(BaseExtractor): if post["extension"] == "zip": if self.ugoira: - post["frames"] = self.request( - "{}/posts/{}.json?only=pixiv_ugoira_frame_data".format( - self.root, post["id"]) - ).json()["pixiv_ugoira_frame_data"]["data"] + post["frames"] = self._ugoira_frames(post) post["_http_adjust_extension"] = False else: url = post["large_file_url"] @@ -139,6 +136,18 @@ class DanbooruExtractor(BaseExtractor): else: return + def _ugoira_frames(self, post): + data = self.request("{}/posts/{}.json?only=media_metadata".format( + self.root, post["id"]) + ).json()["media_metadata"]["metadata"] + + ext = data["ZIP:ZipFileName"].rpartition(".")[2] + print(post["id"], ext) + fmt = ("{:>06}." + ext).format + delays = data["Ugoira:FrameDelays"] + return [{"file": fmt(index), "delay": delay} + for index, delay in enumerate(delays)] + INSTANCES = { "danbooru": { diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 6897476..cb2aa24 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -72,7 +72,7 @@ class DeviantartExtractor(Extractor): def items(self): self.api = DeviantartOAuthAPI(self) - if self.user: + if self.user and self.config("group", True): profile = self.api.user_profile(self.user) self.group = not profile if self.group: @@ -938,11 +938,11 @@ class DeviantartDeviationExtractor(DeviantartExtractor): def deviations(self): url = "{}/{}/{}/{}".format( self.root, self.user, self.type, self.deviation_id) - appurl = text.extract(self._limited_request(url).text, - 'property="da:appurl" content="', '"')[0] - if not appurl: + uuid = text.extract(self._limited_request(url).text, + '"deviationUuid\\":\\"', '\\')[0] + if not uuid: raise exception.NotFoundError("deviation") - return (self.api.deviation(appurl.rpartition("/")[2]),) + return (self.api.deviation(uuid),) class DeviantartScrapsExtractor(DeviantartExtractor): diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py index 6ddf2ec..8b90250 100644 --- a/gallery_dl/extractor/directlink.py +++ b/gallery_dl/extractor/directlink.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2021 Mike Fährmann +# Copyright 2017-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -19,7 +19,7 @@ class DirectlinkExtractor(Extractor): archive_fmt = filename_fmt pattern = (r"(?i)https?://(?P[^/?#]+)/(?P[^?#]+\." r"(?:jpe?g|jpe|png|gif|web[mp]|mp4|mkv|og[gmv]|opus))" - r"(?:\?(?P[^/?#]*))?(?:#(?P.*))?$") + r"(?:\?(?P[^#]*))?(?:#(?P.*))?$") test = ( (("https://en.wikipedia.org/static/images/project-logos/enwiki.png"), { "url": "18c5d00077332e98e53be9fed2ee4be66154b88d", @@ -31,9 +31,9 @@ class DirectlinkExtractor(Extractor): "keyword": "29dad729c40fb09349f83edafa498dba1297464a", }), # more complex example - ("https://example.org/path/to/file.webm?que=1&ry=2#fragment", { - "url": "114b8f1415cc224b0f26488ccd4c2e7ce9136622", - "keyword": "06014abd503e3b2b58aa286f9bdcefdd2ae336c0", + ("https://example.org/path/to/file.webm?que=1?&ry=2/#fragment", { + "url": "6fb1061390f8aada3db01cb24b51797c7ee42b31", + "keyword": "3d7abc31d45ba324e59bc599c3b4862452d5f29c", }), # percent-encoded characters ("https://example.org/%27%3C%23/%23%3E%27.jpg?key=%3C%26%3E", { diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py index 8481248..f692a90 100644 --- a/gallery_dl/extractor/fanbox.py +++ b/gallery_dl/extractor/fanbox.py @@ -68,6 +68,16 @@ class FanboxExtractor(Extractor): post["html"] = content_body["html"] if post["type"] == "article": post["articleBody"] = content_body.copy() + if "blocks" in content_body: + content = [] + append = content.append + for block in content_body["blocks"]: + if "text" in block: + append(block["text"]) + if "links" in block: + for link in block["links"]: + append(link["url"]) + post["content"] = "\n".join(content) post["date"] = text.parse_datetime(post["publishedDatetime"]) post["text"] = content_body.get("text") if content_body else None @@ -271,6 +281,19 @@ class FanboxPostExtractor(FanboxExtractor): "hasAdultContent": True }, }), + # 'content' metadata (#3020) + ("https://www.fanbox.cc/@official-en/posts/4326303", { + "keyword": { + "content": r"re:(?s)^Greetings from FANBOX.\n \nAs of Monday, " + r"September 5th, 2022, we are happy to announce " + r"the start of the FANBOX hashtag event " + r"#MySetupTour ! \nAbout the event\nTo join this " + r"event .+ \nPlease check this page for further " + r"details regarding the Privacy & Terms.\n" + r"https://fanbox.pixiv.help/.+/10184952456601\n\n\n" + r"Thank you for your continued support of FANBOX.$", + }, + }), ) def __init__(self, match): diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py index bece905..69c07d0 100644 --- a/gallery_dl/extractor/generic.py +++ b/gallery_dl/extractor/generic.py @@ -27,9 +27,9 @@ class GenericExtractor(Extractor): pattern += r""" (?Phttps?://)? # optional http(s) scheme (?P[-\w\.]+) # required domain - (?P/[^?&#]*)? # optional path - (?:\?(?P[^/?#]*))? # optional query - (?:\#(?P.*))?$ # optional fragment + (?P/[^?#]*)? # optional path + (?:\?(?P[^#]*))? # optional query + (?:\#(?P.*))? # optional fragment """ def __init__(self, match): diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index f8b0c3b..cc110aa 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -115,12 +115,16 @@ class HitomiGalleryExtractor(GalleryExtractor): fmt = self.config("format") or "webp" if fmt == "original": - subdomain, fmt, ext = "b", "images", None + subdomain, fmt, ext, check = "b", "images", None, False else: - subdomain, ext = "a", fmt + subdomain, ext, check = "a", fmt, True result = [] for image in self.info["files"]: + if check: + if not image.get("has" + fmt): + fmt = ext = "webp" + check = False ihash = image["hash"] idata = text.nameext_from_url(image["name"]) if ext: diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py index b1c0e9e..2c899eb 100644 --- a/gallery_dl/extractor/imagefap.py +++ b/gallery_dl/extractor/imagefap.py @@ -44,7 +44,9 @@ class ImagefapGalleryExtractor(ImagefapExtractor): ("https://www.imagefap.com/gallery/5486966", { "pattern": r"https://cdnh?\.imagefap\.com" r"/images/full/\d+/\d+/\d+\.jpg", - "keyword": "3e24eace5b09639b881ebd393165862feb46adde", + "keyword": "8d2e562df7a0bc9e8eecb9d1bb68d32b4086bf98", + "archive": False, + "count": 62, }), ("https://www.imagefap.com/gallery.php?gid=7102714"), ("https://beta.imagefap.com/gallery.php?gid=7102714"), @@ -73,32 +75,42 @@ class ImagefapGalleryExtractor(ImagefapExtractor): title, _, descr = descr.partition(" porn picture gallery by ") uploader, _, tags = descr.partition(" to see hottest ") + self._count = text.parse_int(count) return { "gallery_id": text.parse_int(self.gid), "title": text.unescape(title), "uploader": uploader, "tags": tags[:-11].split(", "), - "count": text.parse_int(count), + "count": self._count, } def get_images(self): """Collect image-urls and -metadata""" - num = 0 url = "{}/photo/{}/".format(self.root, self.image_id) params = {"gid": self.gid, "idx": 0, "partial": "true"} + headers = { + "Content-Type": "application/x-www-form-urlencoded", + "X-Requested-With": "XMLHttpRequest", + "Referer": "{}?pgid=&gid={}&page=0".format(url, self.image_id) + } + + num = 0 + total = self._count while True: - pos = 0 - page = self.request(url, params=params).text - for _ in range(24): - imgurl, pos = text.extract(page, '= total: + return + params["idx"] += cnt class ImagefapImageExtractor(ImagefapExtractor): @@ -170,40 +182,49 @@ class ImagefapUserExtractor(ImagefapExtractor): self.user, self.user_id = match.groups() def items(self): - for gid, name in self.get_gallery_data(): - url = "{}/gallery/{}".format(self.root, gid) - data = { - "gallery_id": text.parse_int(gid), - "title": text.unescape(name), - "_extractor": ImagefapGalleryExtractor, - } - yield Message.Queue, url, data - - def get_gallery_data(self): - """Yield all gallery_ids of a specific user""" - folders = self.get_gallery_folders() - url = "{}/ajax_usergallery_folder.php".format(self.root) - params = {"userid": self.user_id} - for folder_id in folders: - params["id"] = folder_id - page = self.request(url, params=params).text - - pos = 0 - while True: - gid, pos = text.extract(page, '", "<", pos) - yield gid, name - - def get_gallery_folders(self): - """Create a list of all folder_ids of a specific user""" + for folder_id in self.folders(): + for gallery_id, name in self.galleries(folder_id): + url = "{}/gallery/{}".format(self.root, gallery_id) + data = { + "gallery_id": text.parse_int(gallery_id), + "title" : text.unescape(name), + "_extractor": ImagefapGalleryExtractor, + } + yield Message.Queue, url, data + + def folders(self): + """Return a list of folder_ids of a specific user""" if self.user: url = "{}/profile/{}/galleries".format(self.root, self.user) else: url = "{}/usergallery.php?userid={}".format( self.root, self.user_id) - page = self.request(url).text - self.user_id, pos = text.extract(page, '?userid=', '"') - folders, pos = text.extract(page, ' id="tgl_all" value="', '"', pos) - return folders.split("|")[:-1] + + response = self.request(url) + self.user = response.url.split("/")[-2] + folders = text.extract(response.text, ' id="tgl_all" value="', '"')[0] + return folders.rstrip("|").split("|") + + def galleries(self, folder_id): + """Yield gallery_ids of a folder""" + if folder_id == "-1": + url = "{}/profile/{}/galleries?folderid=-1".format( + self.root, self.user) + else: + url = "{}/organizer/{}/".format(self.root, folder_id) + params = {"page": 0} + + while True: + extr = text.extract_from(self.request(url, params=params).text) + cnt = 0 + + while True: + gid = extr('", "<") + cnt += 1 + + if cnt < 25: + break + params["page"] += 1 diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 425d541..4775613 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -338,6 +338,14 @@ class InstagramExtractor(Extractor): "username" : user["username"], "full_name": user["full_name"]}) + def _init_cursor(self): + return self.config("cursor") or None + + def _update_cursor(self, cursor): + self.log.debug("Cursor: %s", cursor) + self._cursor = cursor + return cursor + class InstagramUserExtractor(InstagramExtractor): """Extractor for an Instagram user profile""" @@ -409,8 +417,8 @@ class InstagramTaggedExtractor(InstagramExtractor): self.user_id = self.item[3:] return {"tagged_owner_id": self.user_id} + self.user_id = self.api.user_id(self.item) user = self.api.user(self.item) - self.user_id = user["id"] return { "tagged_owner_id" : user["id"], @@ -693,7 +701,15 @@ class InstagramRestAPI(): def user_id(self, screen_name): if screen_name.startswith("id:"): return screen_name[3:] - return self.user(screen_name)["id"] + user = self.user(screen_name) + if user is None: + raise exception.AuthorizationError( + "Login required to access this profile") + if user["is_private"] and not user["followed_by_viewer"]: + name = user["username"] + s = "" if name.endswith("s") else "s" + raise exception.StopExtraction("%s'%s posts are private", name, s) + return user["id"] def user_clips(self, user_id): endpoint = "/v1/clips/user/" @@ -741,6 +757,9 @@ class InstagramRestAPI(): def _pagination(self, endpoint, params=None, media=False): if params is None: params = {} + extr = self.extractor + params["max_id"] = extr._init_cursor() + while True: data = self._call(endpoint, params=params) @@ -752,9 +771,12 @@ class InstagramRestAPI(): if not data.get("more_available"): return - params["max_id"] = data["next_max_id"] + params["max_id"] = extr._update_cursor(data["next_max_id"]) def _pagination_post(self, endpoint, params): + extr = self.extractor + params["max_id"] = extr._init_cursor() + while True: data = self._call(endpoint, method="POST", data=params) @@ -764,9 +786,12 @@ class InstagramRestAPI(): info = data["paging_info"] if not info.get("more_available"): return - params["max_id"] = info["max_id"] + params["max_id"] = extr._update_cursor(info["max_id"]) def _pagination_sections(self, endpoint, params): + extr = self.extractor + params["max_id"] = extr._init_cursor() + while True: info = self._call(endpoint, method="POST", data=params) @@ -774,19 +799,22 @@ class InstagramRestAPI(): if not info.get("more_available"): return - params["max_id"] = info["next_max_id"] params["page"] = info["next_page"] + params["max_id"] = extr._update_cursor(info["next_max_id"]) class InstagramGraphqlAPI(): def __init__(self, extractor): self.extractor = extractor - self.user = InstagramRestAPI(extractor).user self.user_collection = self.user_saved = self.reels_media = \ self.highlights_media = self._login_required self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode + api = InstagramRestAPI(extractor) + self.user = api.user + self.user_id = api.user_id + @staticmethod def _login_required(_=None): raise exception.AuthorizationError("Login required") @@ -824,11 +852,6 @@ class InstagramGraphqlAPI(): return self._pagination(query_hash, variables, "hashtag", "edge_hashtag_to_media") - def user_id(self, screen_name): - if screen_name.startswith("id:"): - return screen_name[3:] - return self.user(screen_name)["id"] - def user_clips(self, user_id): query_hash = "bc78b344a68ed16dd5d7f264681c4c76" variables = {"id": user_id, "first": 50} @@ -871,9 +894,8 @@ class InstagramGraphqlAPI(): def _pagination(self, query_hash, variables, key_data="user", key_edge=None): - cursor = self.extractor.config("cursor") - if cursor: - variables["after"] = cursor + extr = self.extractor + variables["after"] = extr._init_cursor() while True: data = self._call(query_hash, variables)[key_data] @@ -890,35 +912,55 @@ class InstagramGraphqlAPI(): raise exception.StopExtraction( "%s'%s posts are private", self.item, s) - variables["after"] = self._cursor = info["end_cursor"] - self.extractor.log.debug("Cursor: %s", self._cursor) + variables["after"] = extr._update_cursor(info["end_cursor"]) -@cache(maxage=360*24*3600, keyarg=1) +@cache(maxage=90*24*3600, keyarg=1) def _login_impl(extr, username, password): extr.log.info("Logging in as %s", username) + user_agent = ("Mozilla/5.0 (Linux; Android 13) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/106.0.5249.79 Mobile " + "Safari/537.36 Instagram 255.1.0.17.102") + + headers = { + "User-Agent" : user_agent, + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "none", + "Sec-Fetch-User": "?1", + } url = extr.root + "/accounts/login/" - page = extr.request(url).text + response = extr.request(url, headers=headers) + + extract = text.extract_from(response.text) + csrf_token = extract('"csrf_token":"', '"') + device_id = extract('"device_id":"', '"') + rollout_hash = extract('"rollout_hash":"', '"') + + cset = extr.session.cookies.set + cset("csrftoken", csrf_token, domain=extr.cookiedomain) + cset("ig_did", device_id, domain=extr.cookiedomain) headers = { - "X-Web-Device-Id" : text.extract(page, '"device_id":"', '"')[0], + "User-Agent" : user_agent, + "Accept" : "*/*", + "X-CSRFToken" : csrf_token, + "X-Instagram-AJAX": rollout_hash, "X-IG-App-ID" : "936619743392459", - "X-ASBD-ID" : "437806", + "X-ASBD-ID" : "198387", "X-IG-WWW-Claim" : "0", "X-Requested-With": "XMLHttpRequest", + "Origin" : extr.root, "Referer" : url, + "Sec-Fetch-Dest" : "empty", + "Sec-Fetch-Mode" : "cors", + "Sec-Fetch-Site" : "same-origin", } - url = extr.root + "/data/shared_data/" - data = extr.request(url, headers=headers).json() - - headers["X-CSRFToken"] = data["config"]["csrf_token"] - headers["X-Instagram-AJAX"] = data["rollout_hash"] - headers["Origin"] = extr.root data = { - "username" : username, - "enc_password" : "#PWD_INSTAGRAM_BROWSER:0:{}:{}".format( + "enc_password" : "#PWD_INSTAGRAM_BROWSER:0:{}:{}".format( int(time.time()), password), + "username" : username, "queryParams" : "{}", "optIntoOneTap" : "false", "stopDeletionNonce" : "", @@ -930,11 +972,8 @@ def _login_impl(extr, username, password): if not response.json().get("authenticated"): raise exception.AuthenticationError() - cget = extr.session.cookies.get - return { - name: cget(name) - for name in ("sessionid", "mid", "ig_did") - } + return {cookie.name: cookie.value + for cookie in extr.session.cookies} def id_from_shortcode(shortcode): diff --git a/gallery_dl/extractor/nana.py b/gallery_dl/extractor/nana.py new file mode 100644 index 0000000..6062418 --- /dev/null +++ b/gallery_dl/extractor/nana.py @@ -0,0 +1,115 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://nana.my.id/""" + +from .common import GalleryExtractor, Extractor, Message +from .. import text, exception +import json + + +class NanaGalleryExtractor(GalleryExtractor): + """Extractor for image galleries from nana.my.id""" + category = "nana" + directory_fmt = ("{category}", "{title}") + pattern = r"(?:https?://)?nana\.my\.id/reader/([^/?#]+)" + test = ( + (("https://nana.my.id/reader/" + "059f7de55a4297413bfbd432ce7d6e724dd42bae"), { + "pattern": r"https://nana\.my\.id/reader/" + r"\w+/image/page\?path=.*\.\w+", + "title" : "Everybody Loves Shion", + "artist" : "fuzui", + "tags" : list, + "count" : 29, + }), + (("https://nana.my.id/reader/" + "77c8712b67013e427923573379f5bafcc0c72e46"), { + "pattern": r"https://nana\.my\.id/reader/" + r"\w+/image/page\?path=.*\.\w+", + "title" : "Lovey-Dovey With an Otaku-Friendly Gyaru", + "artist" : "Sueyuu", + "tags" : ["Sueyuu"], + "count" : 58, + }), + ) + + def __init__(self, match): + self.gallery_id = match.group(1) + url = "https://nana.my.id/reader/" + self.gallery_id + GalleryExtractor.__init__(self, match, url) + + def metadata(self, page): + title = text.unescape( + text.extract(page, '  ', '')[0]) + artist = text.unescape(text.extract( + page, '', '')[0])[len(title):-10] + tags = text.extract(page, 'Reader.tags = "', '"')[0] + + return { + "gallery_id": self.gallery_id, + "title" : title, + "artist" : artist[4:] if artist.startswith(" by ") else "", + "tags" : tags.split(", ") if tags else (), + "lang" : "en", + "language" : "English", + } + + def images(self, page): + data = json.loads(text.extract(page, "Reader.pages = ", ".pages")[0]) + return [ + ("https://nana.my.id" + image, None) + for image in data["pages"] + ] + + +class NanaSearchExtractor(Extractor): + """Extractor for nana search results""" + category = "nana" + subcategory = "search" + pattern = r"(?:https?://)?nana\.my\.id(?:/?\?([^#]+))" + test = ( + ('https://nana.my.id/?q=+"elf"&sort=desc', { + "pattern": NanaGalleryExtractor.pattern, + "range": "1-100", + "count": 100, + }), + ("https://nana.my.id/?q=favorites%3A", { + "pattern": NanaGalleryExtractor.pattern, + "count": ">= 2", + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.params = text.parse_query(match.group(1)) + self.params["p"] = text.parse_int(self.params.get("p"), 1) + self.params["q"] = self.params.get("q") or "" + + def items(self): + if "favorites:" in self.params["q"]: + favkey = self.config("favkey") + if not favkey: + raise exception.AuthenticationError( + "'Favorite key' not provided. " + "Please see 'https://nana.my.id/tutorial'") + self.session.cookies.set("favkey", favkey, domain="nana.my.id") + + data = {"_extractor": NanaGalleryExtractor} + while True: + try: + page = self.request( + "https://nana.my.id", params=self.params).text + except exception.HttpError: + return + + for gallery in text.extract_iter( + page, '
', '
'): + url = "https://nana.my.id" + text.extract( + gallery, '", "さんの抜いた")[0] or "") +class NijieFeedExtractor(NijieExtractor): + """Extractor for nijie liked user feed""" + subcategory = "feed" + pattern = BASE_PATTERN + r"/like_user_view\.php" + test = ( + ("https://nijie.info/like_user_view.php", { + "range": "1-10", + "count": 10, + }), + ("https://horne.red/like_user_view.php"), + ) + + def image_ids(self): + return self._pagination("like_user_view") + + @staticmethod + def _extract_user_name(page): + return "" + + +class NijiefollowedExtractor(NijieExtractor): + """Extractor for followed nijie users""" + subcategory = "followed" + pattern = BASE_PATTERN + r"/like_my\.php" + test = ( + ("https://nijie.info/like_my.php"), + ("https://horne.red/like_my.php"), + ) + + def items(self): + self.login() + + url = self.root + "/like_my.php" + params = {"p": 1} + data = {"_extractor": NijieUserExtractor} + + while True: + page = self.request(url, params=params).text + + for user_id in text.extract_iter( + page, '">