| field | value |
|---|---|
| author | 2024-01-08 03:22:30 -0500 |
| committer | 2024-01-08 03:22:30 -0500 |
| commit | 8646342be01c70ec97eacb16f6b88e4360aa585e |
| tree | 7e39f7daad35df3cae62e51073d3a79e79411c92 |
| parent | 0e6f612882a06d191e8d56bbe2bc05020009e379 |
| parent | e949aaf6f6ac93896947d5b736e48e7911926efb |
Update upstream source from tag 'upstream/1.26.6'
Update to upstream version '1.26.6'
with Debian dir 5409f55ee0db2818bae3ba157a2f17afdb377c21
35 files changed, 623 insertions(+), 190 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8907e07..7b135b7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,37 @@
 # Changelog
 
+## 1.26.6 - 2024-01-06
+### Extractors
+#### Additions
+- [batoto] add `chapter` and `manga` extractors ([#1434](https://github.com/mikf/gallery-dl/issues/1434), [#2111](https://github.com/mikf/gallery-dl/issues/2111), [#4979](https://github.com/mikf/gallery-dl/issues/4979))
+- [deviantart] add `avatar` and `background` extractors ([#4995](https://github.com/mikf/gallery-dl/issues/4995))
+- [poringa] add support ([#4675](https://github.com/mikf/gallery-dl/issues/4675), [#4962](https://github.com/mikf/gallery-dl/issues/4962))
+- [szurubooru] support `snootbooru.com` ([#5023](https://github.com/mikf/gallery-dl/issues/5023))
+- [zzup] add `gallery` extractor ([#4517](https://github.com/mikf/gallery-dl/issues/4517), [#4604](https://github.com/mikf/gallery-dl/issues/4604), [#4659](https://github.com/mikf/gallery-dl/issues/4659), [#4863](https://github.com/mikf/gallery-dl/issues/4863), [#5016](https://github.com/mikf/gallery-dl/issues/5016))
+#### Fixes
+- [gelbooru] fix `favorite` extractor ([#4903](https://github.com/mikf/gallery-dl/issues/4903))
+- [idolcomplex] fix extraction & update URL patterns ([#5002](https://github.com/mikf/gallery-dl/issues/5002))
+- [imagechest] fix loading more than 10 images in a gallery ([#4469](https://github.com/mikf/gallery-dl/issues/4469))
+- [jpgfish] update domain
+- [komikcast] fix `manga` extractor ([#5027](https://github.com/mikf/gallery-dl/issues/5027))
+- [komikcast] update domain ([#5027](https://github.com/mikf/gallery-dl/issues/5027))
+- [lynxchan] update `bbw-chan` domain ([#4970](https://github.com/mikf/gallery-dl/issues/4970))
+- [manganelo] fix extraction & recognize `.to` TLDs ([#5005](https://github.com/mikf/gallery-dl/issues/5005))
+- [paheal] restore `extension` metadata ([#4976](https://github.com/mikf/gallery-dl/issues/4976))
+- [rule34us] add fallback for `video-cdn1` videos ([#4985](https://github.com/mikf/gallery-dl/issues/4985))
+- [weibo] fix AttributeError in `user` extractor ([#5022](https://github.com/mikf/gallery-dl/issues/5022))
+#### Improvements
+- [gelbooru] show error for invalid API responses ([#4903](https://github.com/mikf/gallery-dl/issues/4903))
+- [rule34] recognize URLs with `www` subdomain ([#4984](https://github.com/mikf/gallery-dl/issues/4984))
+- [twitter] raise error for invalid `strategy` values ([#4953](https://github.com/mikf/gallery-dl/issues/4953))
+#### Metadata
+- [fanbox] add `metadata` option ([#4921](https://github.com/mikf/gallery-dl/issues/4921))
+- [nijie] add `count` metadata ([#146](https://github.com/mikf/gallery-dl/issues/146))
+- [pinterest] add `count` metadata ([#4981](https://github.com/mikf/gallery-dl/issues/4981))
+### Miscellaneous
+- fix and update zsh completion ([#4972](https://github.com/mikf/gallery-dl/issues/4972))
+- fix `--cookies-from-browser` macOS Firefox profile path
+
 ## 1.26.5 - 2023-12-23
 ### Extractors
 #### Additions
diff --git a/PKG-INFO b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery_dl
-Version: 1.26.5
+Version: 1.26.6
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -112,9 +112,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.5/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.6/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.5/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.6/gallery-dl.bin>`__
 
 
 Nightly Builds
diff --git a/README.rst b/README.rst
@@ -72,9 +72,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.5/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.6/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.5/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.6/gallery-dl.bin>`__
 
 
 Nightly Builds
diff --git a/data/completion/_gallery-dl b/data/completion/_gallery-dl
index 39fabcc..e5153f5 100644
--- a/data/completion/_gallery-dl
+++ b/data/completion/_gallery-dl
@@ -4,10 +4,10 @@
 local curcontext="$curcontext"
 typeset -A opt_args
 local rc=1
 
-_arguments -C -S \
+_arguments -s -S \
 {-h,--help}'[Print this help message and exit]' \
 --version'[Print program version and exit]' \
-{-f,--filename}'[Filename format string for downloaded files ("/O" for "original" filenames)]':'<format>' \
+{-f,--filename}'[Filename format string for downloaded files ('\''/O'\'' for "original" filenames)]':'<format>' \
 {-d,--destination}'[Target location for file downloads]':'<path>' \
 {-D,--directory}'[Exact location for file downloads]':'<path>' \
 {-X,--extractors}'[Load external extractors from PATH]':'<path>' \
@@ -15,7 +15,7 @@ _arguments -C -S \
 --source-address'[Client-side IP address to bind to]':'<ip>' \
 --user-agent'[User-Agent request header]':'<ua>' \
 --clear-cache'[Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything)]':'<module>' \
-{-i,--input-file}'[Download URLs found in FILE ("-" for stdin). More than one --input-file can be specified]':'<file>':_files \
+{-i,--input-file}'[Download URLs found in FILE ('\''-'\'' for stdin). More than one --input-file can be specified]':'<file>':_files \
 {-I,--input-file-comment}'[Download URLs found in FILE. Comment them out after they were downloaded successfully.]':'<file>':_files \
 {-x,--input-file-delete}'[Download URLs found in FILE. Delete them after they were downloaded successfully.]':'<file>':_files \
 {-q,--quiet}'[Activate quiet mode]' \
@@ -53,34 +53,28 @@ _arguments -C -S \
 --config-toml'[Additional configuration files in TOML format]':'<file>':_files \
 --config-create'[Create a basic configuration file]' \
 --config-ignore'[Do not read default configuration files]' \
---ignore-config'[==SUPPRESS==]' \
 {-u,--username}'[Username to login with]':'<user>' \
 {-p,--password}'[Password belonging to the given username]':'<pass>' \
 --netrc'[Enable .netrc authentication data]' \
 {-C,--cookies}'[File to load additional cookies from]':'<file>':_files \
 --cookies-export'[Export session cookies to FILE]':'<file>':_files \
---cookies-from-browser'[Name of the browser to load cookies from, with optional domain prefixed with "/", keyring name prefixed with "+", profile prefixed with ":", and container prefixed with "::" ("none" for no container)]':'<browser[/domain][+keyring][:profile][::container]>' \
+--cookies-from-browser'[Name of the browser to load cookies from, with optional domain prefixed with '\''/'\'', keyring name prefixed with '\''+'\'', profile prefixed with '\'':'\'', and container prefixed with '\''::'\'' ('\''none'\'' for no container)]':'<browser[/domain][+keyring][:profile][::container]>' \
 --download-archive'[Record all downloaded or skipped files in FILE and skip downloading any file already in it]':'<file>':_files \
 {-A,--abort}'[Stop current extractor run after N consecutive file downloads were skipped]':'<n>' \
 {-T,--terminate}'[Stop current and parent extractor runs after N consecutive file downloads were skipped]':'<n>' \
---range'[Index range(s) specifying which files to download. These can be either a constant value, range, or slice (e.g. "5", "8-20", or "1:24:3")]':'<range>' \
---chapter-range'[Like "--range", but applies to manga chapters and other delegated URLs]':'<range>' \
---filter'[Python expression controlling which files to download. Files for which the expression evaluates to False are ignored. Available keys are the filename-specific ones listed by "-K". Example: --filter "image_width >= 1000 and rating in ("s", "q")"]':'<expr>' \
---chapter-filter'[Like "--filter", but applies to manga chapters and other delegated URLs]':'<expr>' \
+--range'[Index range(s) specifying which files to download. These can be either a constant value, range, or slice (e.g. '\''5'\'', '\''8-20'\'', or '\''1:24:3'\'')]':'<range>' \
+--chapter-range'[Like '\''--range'\'', but applies to manga chapters and other delegated URLs]':'<range>' \
+--filter'[Python expression controlling which files to download. Files for which the expression evaluates to False are ignored. Available keys are the filename-specific ones listed by '\''-K'\''. Example: --filter "image_width >= 1000 and rating in ('\''s'\'', '\''q'\'')"]':'<expr>' \
+--chapter-filter'[Like '\''--filter'\'', but applies to manga chapters and other delegated URLs]':'<expr>' \
 {-P,--postprocessor}'[Activate the specified post processor]':'<name>' \
 {-O,--postprocessor-option}'[Additional post processor options]':'<key=value>' \
 --write-metadata'[Write metadata to separate JSON files]' \
 --write-info-json'[Write gallery metadata to a info.json file]' \
---write-infojson'[==SUPPRESS==]' \
 --write-tags'[Write image tags to separate text files]' \
 --zip'[Store downloaded files in a ZIP archive]' \
 --cbz'[Store downloaded files in a CBZ archive]' \
---mtime'[Set file modification times according to metadata selected by NAME. Examples: "date" or "status[date]"]':'<name>' \
---mtime-from-date'[==SUPPRESS==]' \
---ugoira'[Convert Pixiv Ugoira to FORMAT using FFmpeg. Supported formats are "webm", "mp4", "gif", "vp8", "vp9", "vp9-lossless", "copy".]':'<format>' \
---ugoira-conv'[==SUPPRESS==]' \
---ugoira-conv-lossless'[==SUPPRESS==]' \
---ugoira-conv-copy'[==SUPPRESS==]' \
+--mtime'[Set file modification times according to metadata selected by NAME. Examples: '\''date'\'' or '\''status\[date\]'\'']':'<name>' \
+--ugoira'[Convert Pixiv Ugoira to FORMAT using FFmpeg. Supported formats are '\''webm'\'', '\''mp4'\'', '\''gif'\'', '\''vp8'\'', '\''vp9'\'', '\''vp9-lossless'\'', '\''copy'\''.]':'<format>' \
 --exec'[Execute CMD for each downloaded file. Supported replacement fields are {} or {_path}, {_directory}, {_filename}. Example: --exec "convert {} {}.png && rm {}"]':'<cmd>' \
 --exec-after'[Execute CMD after all files were downloaded. Example: --exec-after "cd {_directory} && convert * ../doc.pdf"]':'<cmd>' && rc=0
Examples: "date" or "status[date]"]':'<name>' \ ---mtime-from-date'[==SUPPRESS==]' \ ---ugoira'[Convert Pixiv Ugoira to FORMAT using FFmpeg. Supported formats are "webm", "mp4", "gif", "vp8", "vp9", "vp9-lossless", "copy".]':'<format>' \ ---ugoira-conv'[==SUPPRESS==]' \ ---ugoira-conv-lossless'[==SUPPRESS==]' \ ---ugoira-conv-copy'[==SUPPRESS==]' \ +--mtime'[Set file modification times according to metadata selected by NAME. Examples: '\''date'\'' or '\''status\[date\]'\'']':'<name>' \ +--ugoira'[Convert Pixiv Ugoira to FORMAT using FFmpeg. Supported formats are '\''webm'\'', '\''mp4'\'', '\''gif'\'', '\''vp8'\'', '\''vp9'\'', '\''vp9-lossless'\'', '\''copy'\''.]':'<format>' \ --exec'[Execute CMD for each downloaded file. Supported replacement fields are {} or {_path}, {_directory}, {_filename}. Example: --exec "convert {} {}.png && rm {}"]':'<cmd>' \ --exec-after'[Execute CMD after all files were downloaded. Example: --exec-after "cd {_directory} && convert * ../doc.pdf"]':'<cmd>' && rc=0 diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index caa0d4a..ad6fd4a 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2023-12-23" "1.26.5" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2024-01-06" "1.26.6" "gallery-dl Manual" .\" disable hyphenation .nh diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index b641f29..a57d39b 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2023-12-23" "1.26.5" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2024-01-06" "1.26.6" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -761,6 +761,23 @@ to be passed to \f[I]ssl.SSLContext.set_ciphers()\f[] +.SS extractor.*.tls12 +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +.br +* \f[I]true\f[] +.br +* \f[I]false\f[] for \f[I]patreon\f[], \f[I]pixiv:series\f[] + +.IP "Description:" 4 +Allow selecting TLS 1.2 cipher suites. + +Can be disabled to alter TLS fingerprints +and potentially bypass Cloudflare blocks. + + .SS extractor.*.keywords .IP "Type:" 6 \f[I]object\f[] (name -> value) @@ -1243,6 +1260,29 @@ To use a different formatting for those values other than the default after a colon \f[I]:\f[], for example \f[I]{date:%Y%m%d}\f[]. +.SS extractor.*.write-pages +.IP "Type:" 6 +.br +* \f[I]bool\f[] +.br +* \f[I]string\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +During data extraction, +write received HTTP request data +to enumerated files in the current working directory. + +Special values: + +.br +* \f[I]"all"\f[]: Include HTTP request and response headers. Hide \f[I]Authorization\f[], \f[I]Cookie\f[], and \f[I]Set-Cookie\f[] values. +.br +* \f[I]"ALL"\f[]: Include all HTTP request and response headers. + + .SH EXTRACTOR-SPECIFIC OPTIONS .SS extractor.artstation.external .IP "Type:" 6 @@ -1587,7 +1627,13 @@ A (comma-separated) list of subcategories to include when processing a user profile. Possible values are -\f[I]"gallery"\f[], \f[I]"scraps"\f[], \f[I]"journal"\f[], \f[I]"favorite"\f[], \f[I]"status"\f[]. +\f[I]"avatar"\f[], +\f[I]"background"\f[], +\f[I]"gallery"\f[], +\f[I]"scraps"\f[], +\f[I]"journal"\f[], +\f[I]"favorite"\f[], +\f[I]"status"\f[]. It is possible to use \f[I]"all"\f[] instead of listing all values separately. @@ -1773,7 +1819,7 @@ Minimum wait time in seconds before API requests. 
diff --git a/docs/gallery-dl-example.conf b/docs/gallery-dl-example.conf
index c3f8049..cda584e 100644
--- a/docs/gallery-dl-example.conf
+++ b/docs/gallery-dl-example.conf
@@ -176,16 +176,15 @@
 
     "imgur":
     {
-        "#": "use different directory and filename formats when coming from a reddit post",
-        "directory":
-        {
-            "'_reddit' in locals()": []
-        },
-        "filename":
-        {
-            "'_reddit' in locals()": "{_reddit[id]} {id}.{extension}",
-            ""                     : "{id}.{extension}"
-        }
+        "#": "general imgur settings",
+        "filename": "{id}.{extension}"
+    },
+
+    "reddit>imgur":
+    {
+        "#": "special settings for imgur URLs found in reddit posts",
+        "directory": [],
+        "filename": "{_reddit[id]} {_reddit[title]} {id}.{extension}"
     },
 
     "tumblr":
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index 934609a..d695df9 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery-dl
-Version: 1.26.5
+Version: 1.26.6
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -112,9 +112,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.5/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.6/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.5/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.6/gallery-dl.bin>`__
 
 
 Nightly Builds
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index 30cda54..271b4a9 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -55,6 +55,7 @@
 gallery_dl/extractor/adultempire.py
 gallery_dl/extractor/architizer.py
 gallery_dl/extractor/artstation.py
 gallery_dl/extractor/aryion.py
+gallery_dl/extractor/batoto.py
 gallery_dl/extractor/bbc.py
 gallery_dl/extractor/behance.py
 gallery_dl/extractor/blogger.py
@@ -149,7 +150,6 @@
 gallery_dl/extractor/nijie.py
 gallery_dl/extractor/nitter.py
 gallery_dl/extractor/nozomi.py
 gallery_dl/extractor/nsfwalbum.py
-gallery_dl/extractor/nudecollect.py
 gallery_dl/extractor/oauth.py
 gallery_dl/extractor/paheal.py
 gallery_dl/extractor/patreon.py
@@ -165,6 +165,7 @@
 gallery_dl/extractor/pixiv.py
 gallery_dl/extractor/pixnet.py
 gallery_dl/extractor/plurk.py
 gallery_dl/extractor/poipiku.py
+gallery_dl/extractor/poringa.py
 gallery_dl/extractor/pornhub.py
 gallery_dl/extractor/pornpics.py
 gallery_dl/extractor/postmill.py
@@ -224,6 +225,7 @@
 gallery_dl/extractor/xhamster.py
 gallery_dl/extractor/xvideos.py
 gallery_dl/extractor/ytdl.py
 gallery_dl/extractor/zerochan.py
+gallery_dl/extractor/zzup.py
 gallery_dl/postprocessor/__init__.py
 gallery_dl/postprocessor/classify.py
 gallery_dl/postprocessor/common.py
diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py
index 416cc9a..478abb6 100644
--- a/gallery_dl/cookies.py
+++ b/gallery_dl/cookies.py
@@ -215,9 +215,11 @@ def _firefox_cookies_database(profile=None, container=None):
 
 def _firefox_browser_directory():
     if sys.platform in ("win32", "cygwin"):
-        return os.path.expandvars(r"%APPDATA%\Mozilla\Firefox\Profiles")
+        return os.path.expandvars(
+            r"%APPDATA%\Mozilla\Firefox\Profiles")
     if sys.platform == "darwin":
-        return os.path.expanduser("~/Library/Application Support/Firefox")
+        return os.path.expanduser(
+            "~/Library/Application Support/Firefox/Profiles")
     return os.path.expanduser("~/.mozilla/firefox")
 
 
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 695b8b2..9e33f2c 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -24,6 +24,7 @@ modules = [
     "architizer",
     "artstation",
     "aryion",
+    "batoto",
     "bbc",
     "behance",
     "blogger",
@@ -107,7 +108,6 @@ modules = [
     "nitter",
     "nozomi",
     "nsfwalbum",
-    "nudecollect",
     "paheal",
     "patreon",
     "philomena",
@@ -122,6 +122,7 @@ modules = [
     "pixnet",
     "plurk",
     "poipiku",
+    "poringa",
     "pornhub",
     "pornpics",
     "postmill",
@@ -177,6 +178,7 @@ modules = [
     "xhamster",
     "xvideos",
     "zerochan",
+    "zzup",
     "booru",
     "moebooru",
     "foolfuuka",
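The cookies.py hunk matters because Firefox on macOS keeps its per-profile data one directory deeper than the old code assumed. A minimal sketch of locating a cookies database under the corrected directory — the `glob` lookup and variable names here are illustrative, not gallery-dl's actual resolution logic:

```python
import glob
import os

# Corrected base directory from the diff above (macOS):
base = os.path.expanduser("~/Library/Application Support/Firefox/Profiles")

# Hypothetical lookup: each profile folder holds its own cookies.sqlite.
for db in glob.glob(os.path.join(base, "*", "cookies.sqlite")):
    print(db)  # e.g. .../Profiles/abcd1234.default-release/cookies.sqlite
```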
diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py
new file mode 100644
index 0000000..cd6302e
--- /dev/null
+++ b/gallery_dl/extractor/batoto.py
@@ -0,0 +1,118 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://bato.to/"""
+
+from .common import Extractor, ChapterExtractor, MangaExtractor
+from .. import text, exception
+import re
+
+BASE_PATTERN = (r"(?:https?://)?"
+                r"(?:(?:ba|d|w)to\.to|\.to|(?:batotoo|mangatoto)\.com)")
+
+
+class BatotoBase():
+    """Base class for batoto extractors"""
+    category = "batoto"
+    root = "https://bato.to"
+
+    def request(self, url, **kwargs):
+        kwargs["encoding"] = "utf-8"
+        return Extractor.request(self, url, **kwargs)
+
+
+class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
+    """Extractor for bato.to manga chapters"""
+    pattern = BASE_PATTERN + r"/(?:title/[^/?#]+|chapter)/(\d+)"
+    example = "https://bato.to/title/12345-MANGA/54321"
+
+    def __init__(self, match):
+        self.root = text.root_from_url(match.group(0))
+        self.chapter_id = match.group(1)
+        url = "{}/title/0/{}".format(self.root, self.chapter_id)
+        ChapterExtractor.__init__(self, match, url)
+
+    def metadata(self, page):
+        extr = text.extract_from(page)
+        manga, info, _ = extr("<title>", "<").rsplit(" - ", 3)
+        manga_id = extr("/title/", "/")
+
+        match = re.match(
+            r"(?:Volume\s+(\d+) )?"
+            r"\w+\s+(\d+)(.*)", info)
+        if match:
+            volume, chapter, minor = match.groups()
+            title = text.remove_html(extr(
+                "selected>", "</option")).partition(" : ")[2]
+        else:
+            volume = chapter = 0
+            minor = ""
+            title = info
+
+        return {
+            "manga"        : text.unescape(manga),
+            "manga_id"     : text.parse_int(manga_id),
+            "title"        : text.unescape(title),
+            "volume"       : text.parse_int(volume),
+            "chapter"      : text.parse_int(chapter),
+            "chapter_minor": minor,
+            "chapter_id"   : text.parse_int(self.chapter_id),
+            "date"         : text.parse_timestamp(extr(' time="', '"')[:-3]),
+        }
+
+    def images(self, page):
+        images_container = text.extr(page, 'pageOpts', ':[0,0]}"')
+        images_container = text.unescape(images_container)
+        return [
+            (url, None)
+            for url in text.extract_iter(images_container, r"\"", r"\"")
+        ]
+
+
+class BatotoMangaExtractor(BatotoBase, MangaExtractor):
+    """Extractor for bato.to manga"""
+    reverse = False
+    chapterclass = BatotoChapterExtractor
+    pattern = BASE_PATTERN + r"/(?:title|series)/(\d+)[^/?#]*/?$"
+    example = "https://bato.to/title/12345-MANGA/"
+
+    def __init__(self, match):
+        self.root = text.root_from_url(match.group(0))
+        self.manga_id = match.group(1)
+        url = "{}/title/{}".format(self.root, self.manga_id)
+        MangaExtractor.__init__(self, match, url)
+
+    def chapters(self, page):
+        extr = text.extract_from(page)
+
+        warning = extr(' class="alert alert-warning">', "</div><")
+        if warning:
+            raise exception.StopExtraction("'%s'", text.remove_html(warning))
+
+        data = {
+            "manga_id": text.parse_int(self.manga_id),
+            "manga"   : text.unescape(extr(
+                "<title>", "<").rpartition(" - ")[0]),
+        }
+
+        extr('<div data-hk="0-0-0-0"', "")
+        results = []
+        while True:
+            href = extr('<a href="/title/', '"')
+            if not href:
+                break
+
+            chapter = href.rpartition("-ch_")[2]
+            chapter, sep, minor = chapter.partition(".")
+
+            data["chapter"] = text.parse_int(chapter)
+            data["chapter_minor"] = sep + minor
+            data["date"] = text.parse_datetime(
+                extr('time="', '"'), "%Y-%m-%dT%H:%M:%S.%fZ")
+
+            url = "{}/title/{}".format(self.root, href)
+            results.append((url, data.copy()))
+        return results
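To see how the chapter regex in `metadata()` splits an info string derived from the page `<title>`, here is a small worked example; the sample strings are invented for illustration:

```python
import re

info = "Volume 2 Chapter 15.5"
match = re.match(r"(?:Volume\s+(\d+) )?\w+\s+(\d+)(.*)", info)
volume, chapter, minor = match.groups()
# volume -> "2", chapter -> "15", minor -> ".5"

# Without a volume prefix, group 1 stays None:
match = re.match(r"(?:Volume\s+(\d+) )?\w+\s+(\d+)(.*)", "Episode 7")
# match.groups() -> (None, "7", "")
```

The `\w+` keyword group deliberately matches any label ("Chapter", "Episode", ...), and `text.parse_int()` turns a `None` volume into `0`.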
diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py
index 21166bd..2bf200b 100644
--- a/gallery_dl/extractor/chevereto.py
+++ b/gallery_dl/extractor/chevereto.py
@@ -35,7 +35,7 @@ class CheveretoExtractor(BaseExtractor):
 
 BASE_PATTERN = CheveretoExtractor.update({
     "jpgfish": {
-        "root": "https://jpg2.su",
+        "root": "https://jpg4.su",
         "pattern": r"jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)",
     },
     "pixl": {
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 9b010c5..0dd05ef 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -526,12 +526,15 @@ class Extractor():
         if include == "all":
             include = extractors
         elif isinstance(include, str):
-            include = include.split(",")
+            include = include.replace(" ", "").split(",")
 
         result = [(Message.Version, 1)]
         for category in include:
-            if category in extractors:
+            try:
                 extr, url = extractors[category]
+            except KeyError:
+                self.log.warning("Invalid include '%s'", category)
+            else:
                 result.append((Message.Queue, url, {"_extractor": extr}))
         return iter(result)
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 2ba47e1..4b5f1d7 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -38,7 +38,7 @@ class DeviantartExtractor(Extractor):
 
     def __init__(self, match):
         Extractor.__init__(self, match)
-        self.user = match.group(1) or match.group(2)
+        self.user = (match.group(1) or match.group(2)).lower()
         self.offset = 0
 
     def _init(self):
@@ -104,7 +104,6 @@ class DeviantartExtractor(Extractor):
                 raise exception.StopExtraction()
             else:
                 self.subcategory = "group-" + self.subcategory
-                self.user = self.user.lower()
                 self.group = True
 
         for deviation in self.deviations():
@@ -513,11 +512,13 @@ class DeviantartUserExtractor(DeviantartExtractor):
     def items(self):
         base = "{}/{}/".format(self.root, self.user)
        return self._dispatch_extractors((
-            (DeviantartGalleryExtractor , base + "gallery"),
-            (DeviantartScrapsExtractor  , base + "gallery/scraps"),
-            (DeviantartJournalExtractor , base + "posts"),
-            (DeviantartStatusExtractor  , base + "posts/statuses"),
-            (DeviantartFavoriteExtractor, base + "favourites"),
+            (DeviantartAvatarExtractor    , base + "avatar"),
+            (DeviantartBackgroundExtractor, base + "banner"),
+            (DeviantartGalleryExtractor   , base + "gallery"),
+            (DeviantartScrapsExtractor    , base + "gallery/scraps"),
+            (DeviantartJournalExtractor   , base + "posts"),
+            (DeviantartStatusExtractor    , base + "posts/statuses"),
+            (DeviantartFavoriteExtractor  , base + "favourites"),
         ), ("gallery",))
 
@@ -538,6 +539,47 @@ class DeviantartGalleryExtractor(DeviantartExtractor):
         return self._folder_urls(folders, "gallery", DeviantartFolderExtractor)
 
 
+class DeviantartAvatarExtractor(DeviantartExtractor):
+    """Extractor for an artist's avatar"""
+    subcategory = "avatar"
+    archive_fmt = "a_{_username}_{index}"
+    pattern = BASE_PATTERN + r"/avatar"
+    example = "https://www.deviantart.com/USER/avatar/"
+
+    def deviations(self):
+        profile = self.api.user_profile(self.user.lower())
+        if profile:
+            url = profile["user"]["usericon"]
+            return ({
+                "author"         : profile["user"],
+                "category"       : "avatar",
+                "index"          : text.parse_int(url.rpartition("?")[2]),
+                "is_deleted"     : False,
+                "is_downloadable": False,
+                "published_time" : 0,
+                "title"          : "avatar",
+                "content"        : {
+                    "src": url.replace("/avatars/", "/avatars-big/", 1),
+                },
+            },)
+        return ()
+
+
+class DeviantartBackgroundExtractor(DeviantartExtractor):
+    """Extractor for an artist's banner"""
+    subcategory = "background"
+    archive_fmt = "b_{index}"
+    pattern = BASE_PATTERN + r"/ba(?:nner|ckground)"
+    example = "https://www.deviantart.com/USER/banner/"
+
+    def deviations(self):
+        try:
+            return (self.api.user_profile(self.user.lower())
+                    ["cover_deviation"]["cover_deviation"],)
+        except Exception:
+            return ()
+
+
 class DeviantartFolderExtractor(DeviantartExtractor):
     """Extractor for deviations inside an artist's gallery folder"""
     subcategory = "folder"
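The new avatar extractor synthesizes a deviation dict from the profile's `usericon` URL: the cache-busting query string doubles as an `index`, and swapping `/avatars/` for `/avatars-big/` requests the full-size icon. A quick illustration with an invented URL:

```python
# Hypothetical usericon URL; the real value comes from the DeviantArt API.
url = "https://a.deviantart.net/avatars/u/s/username.png?12"

index = url.rpartition("?")[2]                      # -> "12"
big = url.replace("/avatars/", "/avatars-big/", 1)  # -> ".../avatars-big/u/s/username.png?12"
```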
diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py
index 4572bea..61a3928 100644
--- a/gallery_dl/extractor/fanbox.py
+++ b/gallery_dl/extractor/fanbox.py
@@ -8,6 +8,7 @@
 
 from .common import Extractor, Message
 from .. import text
+from ..cache import memcache
 import re
 
 BASE_PATTERN = (
@@ -27,8 +28,20 @@ class FanboxExtractor(Extractor):
     _warning = True
 
     def _init(self):
+        self.headers = {"Origin": self.root}
         self.embeds = self.config("embeds", True)
 
+        includes = self.config("metadata")
+        if includes:
+            if isinstance(includes, str):
+                includes = includes.split(",")
+            elif not isinstance(includes, (list, tuple)):
+                includes = ("user", "plan")
+            self._meta_user = ("user" in includes)
+            self._meta_plan = ("plan" in includes)
+        else:
+            self._meta_user = self._meta_plan = False
+
         if self._warning:
             if not self.cookies_check(("FANBOXSESSID",)):
                 self.log.warning("no 'FANBOXSESSID' cookie set")
@@ -43,11 +56,9 @@ class FanboxExtractor(Extractor):
         """Return all relevant post objects"""
 
     def _pagination(self, url):
-        headers = {"Origin": self.root}
-
         while url:
             url = text.ensure_http_scheme(url)
-            body = self.request(url, headers=headers).json()["body"]
+            body = self.request(url, headers=self.headers).json()["body"]
             for item in body["items"]:
                 try:
                     yield self._get_post_data(item["id"])
@@ -58,9 +69,8 @@ class FanboxExtractor(Extractor):
 
     def _get_post_data(self, post_id):
         """Fetch and process post data"""
-        headers = {"Origin": self.root}
         url = "https://api.fanbox.cc/post.info?postId="+post_id
-        post = self.request(url, headers=headers).json()["body"]
+        post = self.request(url, headers=self.headers).json()["body"]
 
         content_body = post.pop("body", None)
         if content_body:
@@ -98,8 +108,47 @@ class FanboxExtractor(Extractor):
         post["text"] = content_body.get("text") if content_body else None
         post["isCoverImage"] = False
 
+        if self._meta_user:
+            post["user"] = self._get_user_data(post["creatorId"])
+        if self._meta_plan:
+            plans = self._get_plan_data(post["creatorId"])
+            post["plan"] = plans[post["feeRequired"]]
+
         return content_body, post
 
+    @memcache(keyarg=1)
+    def _get_user_data(self, creator_id):
+        url = "https://api.fanbox.cc/creator.get"
+        params = {"creatorId": creator_id}
+        data = self.request(url, params=params, headers=self.headers).json()
+
+        user = data["body"]
+        user.update(user.pop("user"))
+
+        return user
+
+    @memcache(keyarg=1)
+    def _get_plan_data(self, creator_id):
+        url = "https://api.fanbox.cc/plan.listCreator"
+        params = {"creatorId": creator_id}
+        data = self.request(url, params=params, headers=self.headers).json()
+
+        plans = {0: {
+            "id"             : "",
+            "title"          : "",
+            "fee"            : 0,
+            "description"    : "",
+            "coverImageUrl"  : "",
+            "creatorId"      : creator_id,
+            "hasAdultContent": None,
+            "paymentMethod"  : None,
+        }}
+        for plan in data["body"]:
+            del plan["user"]
+            plans[plan["fee"]] = plan
+
+        return plans
+
     def _get_urls_from_post(self, content_body, post):
         num = 0
         cover_image = post.get("coverImageUrl")
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index b62ff78..eba1539 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -32,10 +32,13 @@ class GelbooruBase():
         url = self.root + "/index.php?page=dapi&q=index&json=1"
         data = self.request(url, params=params).json()
 
-        if key not in data:
-            return ()
+        try:
+            posts = data[key]
+        except KeyError:
+            self.log.error("Incomplete API response (missing '%s')", key)
+            self.log.debug("%s", data)
+            return []
 
-        posts = data[key]
         if not isinstance(posts, list):
             return (posts,)
         return posts
@@ -165,15 +168,16 @@ class GelbooruFavoriteExtractor(GelbooruBase,
             "id"   : self.favorite_id,
             "limit": "1",
         }
-
         count = self._api_request(params, "@attributes")[0]["count"]
         if count <= self.offset:
             return
 
-        pnum, last = divmod(count + 1, self.per_page)
-        if self.offset >= last:
+        pnum, last = divmod(count-1, self.per_page)
+        if self.offset > last:
+            # page number change
             self.offset -= last
-            diff, self.offset = divmod(self.offset, self.per_page)
+            diff, self.offset = divmod(self.offset-1, self.per_page)
             pnum -= diff + 1
 
         skip = self.offset
@@ -183,8 +187,8 @@ class GelbooruFavoriteExtractor(GelbooruBase,
 
         while True:
             favs = self._api_request(params, "favorite")
-
             favs.reverse()
+
             if skip:
                 favs = favs[skip:]
                 skip = 0
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index 0864b9f..0c8af3d 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -168,7 +168,7 @@ INSTANCES = {
     },
     "rule34": {
         "root": "https://rule34.xxx",
-        "pattern": r"rule34\.xxx",
+        "pattern": r"(?:www\.)?rule34\.xxx",
         "api_root": "https://api.rule34.xxx",
     },
     "safebooru": {
diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py
index b9e2c3d..f70a948 100644
--- a/gallery_dl/extractor/idolcomplex.py
+++ b/gallery_dl/extractor/idolcomplex.py
@@ -34,8 +34,11 @@ class IdolcomplexExtractor(SankakuExtractor):
         self.start_post = 0
 
     def _init(self):
+        self.find_pids = re.compile(
+            r" href=[\"#]/\w\w/posts/([0-9a-f]+)"
+        ).findall
         self.find_tags = re.compile(
-            r'tag-type-([^"]+)">\s*<div [^>]+>\s*<a href="/\?tags=([^"]+)'
+            r'tag-type-([^"]+)">\s*<a [^>]*?href="/[^?]*\?tags=([^"]+)'
        ).findall
 
     def items(self):
@@ -149,8 +152,8 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor):
     subcategory = "tag"
     directory_fmt = ("{category}", "{search_tags}")
     archive_fmt = "t_{search_tags}_{id}"
-    pattern = r"(?:https?://)?idol\.sankakucomplex\.com/\?([^#]*)"
-    example = "https://idol.sankakucomplex.com/?tags=TAGS"
+    pattern = BASE_PATTERN + r"/(?:posts/?)?\?([^#]*)"
+    example = "https://idol.sankakucomplex.com/en/posts?tags=TAGS"
     per_page = 20
 
     def __init__(self, match):
@@ -196,7 +199,8 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor):
             page = self.request(self.root, params=params, retries=10).text
             pos = ((page.find('id="more-popular-posts-link"') + 1) or
                    (page.find('<span class="thumb') + 1))
-            yield from text.extract_iter(page, ' href="/posts/', '"', pos)
+
+            yield from self.find_pids(page, pos)
 
             next_url = text.extract(page, 'next-page-url="', '"', pos)[0]
             if not next_url:
@@ -218,7 +222,7 @@ class IdolcomplexPoolExtractor(IdolcomplexExtractor):
     subcategory = "pool"
     directory_fmt = ("{category}", "pool", "{pool}")
     archive_fmt = "p_{pool}_{id}"
-    pattern = r"(?:https?://)?idol\.sankakucomplex\.com/pools?/show/(\d+)"
+    pattern = BASE_PATTERN + r"/pools?/show/(\d+)"
     example = "https://idol.sankakucomplex.com/pools/show/12345"
     per_page = 24
 
@@ -242,8 +246,7 @@ class IdolcomplexPoolExtractor(IdolcomplexExtractor):
         while True:
             page = self.request(url, params=params, retries=10).text
             pos = page.find('id="pool-show"') + 1
-            post_ids = list(text.extract_iter(
-                page, ' href="/posts/', '"', pos))
+            post_ids = self.find_pids(page, pos)
 
             yield from post_ids
             if len(post_ids) < self.per_page:
diff --git a/gallery_dl/extractor/imagechest.py b/gallery_dl/extractor/imagechest.py
index 9aa0332..9199d12 100644
--- a/gallery_dl/extractor/imagechest.py
+++ b/gallery_dl/extractor/imagechest.py
@@ -44,7 +44,7 @@ class ImagechestGalleryExtractor(GalleryExtractor):
         }
 
     def images(self, page):
-        if " More Files</button>" in page:
+        if ' load-all">' in page:
             url = "{}/p/{}/loadAll".format(self.root, self.gallery_id)
             headers = {
                 "X-Requested-With": "XMLHttpRequest",
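Returning to the gelbooru favorite-pagination hunk above: the rewritten arithmetic is easiest to verify with concrete numbers. A worked example assuming `per_page = 50` and invented values `count = 105`, `offset = 10`:

```python
per_page = 50
count, offset = 105, 10   # 105 favorites total, skip the 10 newest

# pnum: index of the newest page; last: index of the final item on that page.
pnum, last = divmod(count - 1, per_page)   # -> pnum=2, last=4 (page 2 holds 5 items)

if offset > last:
    # The offset swallows the whole newest page, so start on an earlier page.
    offset -= last                               # -> 6
    diff, offset = divmod(offset - 1, per_page)  # -> diff=0, offset=5
    pnum -= diff + 1                             # -> pnum=1

print(pnum, offset)  # 1 5 : begin on page 1, skipping 5 items there
```

Skipping all 5 items of page 2 plus 5 items of page 1 accounts for exactly the 10 requested, which the old `count + 1` / `>=` variant got wrong at page boundaries.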
diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py
index a3e0130..7a19be5 100644
--- a/gallery_dl/extractor/komikcast.py
+++ b/gallery_dl/extractor/komikcast.py
@@ -6,19 +6,19 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extractors for https://komikcast.site/"""
+"""Extractors for https://komikcast.lol/"""
 
 from .common import ChapterExtractor, MangaExtractor
 from .. import text
 import re
 
-BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:site|me|com)"
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:lol|site|me|com)"
 
 
 class KomikcastBase():
     """Base class for komikcast extractors"""
     category = "komikcast"
-    root = "https://komikcast.site"
+    root = "https://komikcast.lol"
 
     @staticmethod
     def parse_chapter_string(chapter_string, data=None):
@@ -46,9 +46,9 @@ class KomikcastBase():
 
 class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
-    """Extractor for manga-chapters from komikcast.site"""
+    """Extractor for manga-chapters from komikcast.lol"""
     pattern = BASE_PATTERN + r"(/chapter/[^/?#]+/)"
-    example = "https://komikcast.site/chapter/TITLE/"
+    example = "https://komikcast.lol/chapter/TITLE/"
 
     def metadata(self, page):
         info = text.extr(page, "<title>", " - Komikcast<")
@@ -65,10 +65,10 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
 
 class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
-    """Extractor for manga from komikcast.site"""
+    """Extractor for manga from komikcast.lol"""
     chapterclass = KomikcastChapterExtractor
     pattern = BASE_PATTERN + r"(/(?:komik/)?[^/?#]+)/?$"
-    example = "https://komikcast.site/komik/TITLE"
+    example = "https://komikcast.lol/komik/TITLE"
 
     def chapters(self, page):
         results = []
@@ -76,8 +76,10 @@ class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
 
         for item in text.extract_iter(
                 page, '<a class="chapter-link-item" href="', '</a'):
-            url, _, chapter_string = item.rpartition('">Chapter ')
-            self.parse_chapter_string(chapter_string, data)
+            url, _, chapter = item.rpartition('">Chapter')
+            chapter, sep, minor = chapter.strip().partition(".")
+            data["chapter"] = text.parse_int(chapter)
+            data["chapter_minor"] = sep + minor
             results.append((url, data.copy()))
 
         return results
diff --git a/gallery_dl/extractor/lynxchan.py b/gallery_dl/extractor/lynxchan.py
index 0edd5c1..85b3fef 100644
--- a/gallery_dl/extractor/lynxchan.py
+++ b/gallery_dl/extractor/lynxchan.py
@@ -18,8 +18,8 @@ class LynxchanExtractor(BaseExtractor):
 
 BASE_PATTERN = LynxchanExtractor.update({
     "bbw-chan": {
-        "root": "https://bbw-chan.nl",
-        "pattern": r"bbw-chan\.nl",
+        "root": "https://bbw-chan.link",
+        "pattern": r"bbw-chan\.(?:link|nl)",
     },
     "kohlchan": {
         "root": "https://kohlchan.net",
@@ -40,7 +40,7 @@ class LynxchanThreadExtractor(LynxchanExtractor):
     filename_fmt = "{postId}{num:?-//} {filename}.{extension}"
     archive_fmt = "{boardUri}_{postId}_{num}"
     pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)"
-    example = "https://bbw-chan.nl/a/res/12345.html"
+    example = "https://endchan.org/a/res/12345.html"
 
     def __init__(self, match):
         LynxchanExtractor.__init__(self, match)
@@ -71,7 +71,7 @@ class LynxchanBoardExtractor(LynxchanExtractor):
     """Extractor for LynxChan boards"""
     subcategory = "board"
     pattern = BASE_PATTERN + r"/([^/?#]+)(?:/index|/catalog|/\d+|/?$)"
-    example = "https://bbw-chan.nl/a/"
+    example = "https://endchan.org/a/"
 
     def __init__(self, match):
         LynxchanExtractor.__init__(self, match)
diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py
index 46019ad..232b98d 100644
--- a/gallery_dl/extractor/manganelo.py
+++ b/gallery_dl/extractor/manganelo.py
@@ -10,7 +10,11 @@ from .common import ChapterExtractor, MangaExtractor
 from .. import text
 import re
 
-BASE_PATTERN = r"(?:https?://)?((?:chap|read|www\.|m\.)?mangan(?:at|el)o\.com)"
+BASE_PATTERN = (
+    r"(?:https?://)?"
+    r"((?:chap|read|www\.|m\.)?mangan(?:at|el)o"
+    r"\.(?:to|com))"
+)
 
 
 class ManganeloBase():
@@ -67,10 +71,11 @@ class ManganeloChapterExtractor(ManganeloBase, ChapterExtractor):
 
     def images(self, page):
         page = text.extr(
-            page, 'class="container-chapter-reader', '\n<div')
+            page, 'class="container-chapter-reader', 'class="container')
 
         return [
             (url, None)
             for url in text.extract_iter(page, '<img src="', '"')
+            if not url.endswith("/gohome.png")
         ] or [
             (url, None)
             for url in text.extract_iter(
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
index 57c3118..b991705 100644
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -55,9 +55,12 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
             else:
                 data["user_id"] = data["artist_id"]
                 data["user_name"] = data["artist_name"]
-            yield Message.Directory, data
 
-            for num, url in enumerate(self._extract_images(image_id, page)):
+            urls = list(self._extract_images(image_id, page))
+            data["count"] = len(urls)
+
+            yield Message.Directory, data
+            for num, url in enumerate(urls):
                 image = text.nameext_from_url(url, {
                     "num": num,
                     "url": "https:" + url,
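Of the fixes above, the komikcast chapter parsing is the subtlest: it now splits the text after `">Chapter` into integer and decimal parts itself instead of delegating to `parse_chapter_string`. A worked example with an invented list item, matching the extraction window used in `chapters()`:

```python
# Text between '<a class="chapter-link-item" href="' and '</a' (invented sample):
item = 'https://komikcast.lol/chapter/TITLE-chapter-12-5/">Chapter 12.5'

url, _, chapter = item.rpartition('">Chapter')
chapter, sep, minor = chapter.strip().partition(".")
# url           -> 'https://komikcast.lol/chapter/TITLE-chapter-12-5/'
# chapter       -> '12'
# sep + minor   -> '.5'  (empty string for whole-numbered chapters)
```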
diff --git a/gallery_dl/extractor/nudecollect.py b/gallery_dl/extractor/nudecollect.py
deleted file mode 100644
index bda5d77..0000000
--- a/gallery_dl/extractor/nudecollect.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://nudecollect.com/"""
-
-from .common import GalleryExtractor
-from .. import text
-
-
-class NudecollectExtractor(GalleryExtractor):
-    """Base class for Nudecollect extractors"""
-    category = "nudecollect"
-    directory_fmt = ("{category}", "{title}")
-    filename_fmt = "{slug}_{num:>03}.{extension}"
-    archive_fmt = "{slug}_{num}"
-    root = "https://www.nudecollect.com"
-
-    def request(self, url, **kwargs):
-        kwargs["allow_redirects"] = False
-        return GalleryExtractor.request(self, url, **kwargs)
-
-    @staticmethod
-    def get_title(page):
-        return text.unescape(text.extr(page, "<title>", "</title>"))[31:]
-
-    @staticmethod
-    def get_image(page):
-        return text.extr(page, '<img src="', '"')
-
-
-class NudecollectImageExtractor(NudecollectExtractor):
-    """Extractor for individual images from nudecollect.com"""
-    subcategory = "image"
-    pattern = (r"(?:https?://)?(?:www\.)?nudecollect\.com"
-               r"(/content/([^/?#]+)/image-(\d+)-pics-(\d+)"
-               r"-mirror-(\d+)\.html)")
-    example = ("https://www.nudecollect.com/content/12345_TITLE"
-               "/image-1-pics-108-mirror-1.html")
-
-    def __init__(self, match):
-        NudecollectExtractor.__init__(self, match)
-        _, self.slug, self.num, self.count, self.mirror = match.groups()
-
-    def metadata(self, page):
-        return {
-            "slug"  : self.slug,
-            "title" : self.get_title(page),
-            "count" : text.parse_int(self.count),
-            "mirror": text.parse_int(self.mirror),
-        }
-
-    def images(self, page):
-        return ((self.get_image(page), {"num": text.parse_int(self.num)}),)
-
-
-class NudecollectAlbumExtractor(NudecollectExtractor):
-    """Extractor for image albums on nudecollect.com"""
-    subcategory = "album"
-    pattern = (r"(?:https?://)?(?:www\.)?nudecollect\.com"
-               r"/content/([^/?#]+)/(?:index-mirror-(\d+)-(\d+)"
-               r"|page-\d+-pics-(\d+)-mirror-(\d+))\.html")
-    example = ("https://www.nudecollect.com/content/12345_TITLE"
-               "/index-mirror-01-123.html")
-
-    def __init__(self, match):
-        self.slug = match.group(1)
-        self.mirror = match.group(2) or match.group(5)
-        self.count = text.parse_int(match.group(3) or match.group(4))
-        url = "{}/content/{}/image-1-pics-{}-mirror-{}.html".format(
-            self.root, self.slug, self.count, self.mirror)
-        NudecollectExtractor.__init__(self, match, url)
-
-    def metadata(self, page):
-        return {
-            "slug"  : self.slug,
-            "title" : self.get_title(page),
-            "mirror": text.parse_int(self.mirror),
-        }
-
-    def images(self, page):
-        url = self.get_image(page)
-        p1, _, p2 = url.partition("/image0")
-        ufmt = p1 + "/image{:>05}" + p2[4:]
-        return [(ufmt.format(num), None) for num in range(1, self.count + 1)]
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index 0389ead..89c0d2f 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -32,7 +32,7 @@ class PahealExtractor(Extractor):
             post["tags"] = text.unquote(post["tags"])
             post.update(data)
             yield Message.Directory, post
-            yield Message.Url, url, text.nameext_from_url(url, post)
+            yield Message.Url, url, post
 
     def get_metadata(self):
         """Return general metadata"""
@@ -59,11 +59,13 @@
                 extr(">Source Link<", "</td>"), "href='", "'")),
         }
 
-        dimensions, size, ext = extr("Info</th><td>", ">").split(" // ")
-        post["width"], _, height = dimensions.partition("x")
+        dimensions, size, ext = extr("Info</th><td>", "<").split(" // ")
         post["size"] = text.parse_bytes(size[:-1])
+        post["width"], _, height = dimensions.partition("x")
         post["height"], _, duration = height.partition(", ")
         post["duration"] = text.parse_float(duration[:-1])
+        post["filename"] = "{} - {}".format(post_id, post["tags"])
+        post["extension"] = ext
 
         return post
@@ -112,6 +114,7 @@ class PahealTagExtractor(PahealExtractor):
         tags, data, date = data.split("\n")
         dimensions, size, ext = data.split(" // ")
+        tags = text.unescape(tags)
 
         width, _, height = dimensions.partition("x")
         height, _, duration = height.partition(", ")
@@ -119,9 +122,11 @@
             "id": pid, "md5": md5, "file_url": url,
             "width": width, "height": height,
             "duration": text.parse_float(duration[:-1]),
-            "tags": text.unescape(tags),
+            "tags": tags,
             "size": text.parse_bytes(size[:-1]),
             "date": text.parse_datetime(date, "%B %d, %Y; %H:%M"),
+            "filename" : "{} - {}".format(pid, tags),
+            "extension": ext,
         }
 
     def _extract_data_ex(self, post):
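The paheal `Info` cell is a ` // `-separated string, so the parsing above recovers dimensions, size, and the restored `extension` in one split. A worked example with an invented value (video posts carry a duration suffix; plain images simply leave it empty):

```python
info = "1280x720, 31.5s // 4.3MB // webm"

dimensions, size, ext = info.split(" // ")
width, _, height = dimensions.partition("x")
height, _, duration = height.partition(", ")
# width -> "1280", height -> "720", duration -> "31.5s"
# size[:-1] -> "4.3M" (fed to text.parse_bytes), ext -> "webm"
# duration[:-1] -> "31.5" (fed to text.parse_float)
```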
diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py
index 4b26393..c46a587 100644
--- a/gallery_dl/extractor/pinterest.py
+++ b/gallery_dl/extractor/pinterest.py
@@ -47,6 +47,7 @@ class PinterestExtractor(Extractor):
 
             carousel_data = pin.get("carousel_data")
             if carousel_data:
+                pin["count"] = len(carousel_data["carousel_slots"])
                 for num, slot in enumerate(carousel_data["carousel_slots"], 1):
                     slot["media_id"] = slot.pop("id")
                     pin.update(slot)
@@ -65,7 +66,7 @@ class PinterestExtractor(Extractor):
 
                 if videos or media.get("duration") is None:
                     pin.update(media)
-                    pin["num"] = 0
+                    pin["num"] = pin["count"] = 1
                     pin["media_id"] = ""
 
                     url = media["url"]
diff --git a/gallery_dl/extractor/poringa.py b/gallery_dl/extractor/poringa.py
new file mode 100644
index 0000000..0149d06
--- /dev/null
+++ b/gallery_dl/extractor/poringa.py
@@ -0,0 +1,138 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for http://www.poringa.net/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+from ..cache import cache
+import itertools
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?poringa\.net"
+
+
+class PoringaExtractor(Extractor):
+    category = "poringa"
+    directory_fmt = ("{category}", "{user}", "{post_id}")
+    filename_fmt = "{post_id}_{title}_{num:>03}_{filename}.{extension}"
+    archive_fmt = "{post_id}_{num}"
+    root = "http://www.poringa.net"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.item = match.group(1)
+        self.__cookies = True
+
+    def items(self):
+        for post_id in self.posts():
+            url = "{}/posts/imagenes/{}".format(self.root, post_id)
+
+            try:
+                response = self.request(url)
+            except exception.HttpError as exc:
+                self.log.warning(
+                    "Unable to fetch posts for '%s' (%s)", post_id, exc)
+                continue
+
+            if "/registro-login?" in response.url:
+                self.log.warning("Private post '%s'", post_id)
+                continue
+
+            page = response.text
+            title, pos = text.extract(
+                page, 'property="og:title" content="', '"')
+
+            try:
+                pos = page.index('<div class="main-info', pos)
+                user, pos = text.extract(
+                    page, 'href="http://www.poringa.net/', '"', pos)
+            except ValueError:
+                user = None
+
+            if not user:
+                user = "poringa"
+
+            data = {
+                "post_id"      : post_id,
+                "title"        : text.unescape(title),
+                "user"         : text.unquote(user),
+                "_http_headers": {"Referer": url},
+            }
+
+            main_post = text.extr(
+                page, 'property="dc:content" role="main">', '</div>')
+            urls = list(text.extract_iter(
+                main_post, '<img class="imagen" border="0" src="', '"'))
+            data["count"] = len(urls)
+
+            yield Message.Directory, data
+            for data["num"], url in enumerate(urls, 1):
+                yield Message.Url, url, text.nameext_from_url(url, data)
+
+    def posts(self):
+        return ()
+
+    def request(self, url, **kwargs):
+        if self.__cookies:
+            self.__cookies = False
+            self.cookies_update(_cookie_cache())
+
+        for _ in range(5):
+            response = Extractor.request(self, url, **kwargs)
+            if response.cookies:
+                _cookie_cache.update("", response.cookies)
+            if response.content.find(
+                    b"<title>Please wait a few moments</title>", 0, 600) < 0:
+                return response
+            self.sleep(5.0, "check")
+
+    def _pagination(self, url, params):
+        for params["p"] in itertools.count(1):
+            page = self.request(url, params=params).text
+
+            posts_ids = PoringaPostExtractor.pattern.findall(page)
+            posts_ids = list(dict.fromkeys(posts_ids))
+            yield from posts_ids
+
+            if len(posts_ids) < 19:
+                return
+
+
+class PoringaPostExtractor(PoringaExtractor):
+    """Extractor for posts on poringa.net"""
+    subcategory = "post"
+    pattern = BASE_PATTERN + r"/posts/imagenes/(\d+)"
+    example = "http://www.poringa.net/posts/imagenes/12345/TITLE.html"
+
+    def posts(self):
+        return (self.item,)
+
+
+class PoringaUserExtractor(PoringaExtractor):
+    subcategory = "user"
+    pattern = BASE_PATTERN + r"/(\w+)$"
+    example = "http://www.poringa.net/USER"
+
+    def posts(self):
+        url = self.root + "/buscar/"
+        params = {"q": self.item}
+        return self._pagination(url, params)
+
+
+class PoringaSearchExtractor(PoringaExtractor):
+    subcategory = "search"
+    pattern = BASE_PATTERN + r"/buscar/\?&?q=([^&#]+)"
+    example = "http://www.poringa.net/buscar/?q=QUERY"
+
+    def posts(self):
+        url = self.root + "/buscar/"
+        params = {"q": self.item}
+        return self._pagination(url, params)
+
+
+@cache()
+def _cookie_cache():
+    return ()
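Poringa's search pages repeat post IDs, so `_pagination` de-duplicates them while preserving page order via the `dict.fromkeys` idiom, then stops once a page yields fewer than 19 unique IDs. A quick illustration:

```python
post_ids = ["111", "222", "111", "333", "222"]

unique = list(dict.fromkeys(post_ids))  # dicts preserve insertion order
print(unique)  # -> ['111', '222', '333']
```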
diff --git a/gallery_dl/extractor/rule34us.py b/gallery_dl/extractor/rule34us.py
index 6439a22..cf70ccc 100644
--- a/gallery_dl/extractor/rule34us.py
+++ b/gallery_dl/extractor/rule34us.py
@@ -38,7 +38,11 @@ class Rule34usExtractor(BooruExtractor):
             "height"  : extr(' x ', 'h'),
             "file_url": extr(' src="', '"'),
         }
-        post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0]
+
+        url = post["file_url"]
+        if "//video-cdn1." in url:
+            post["_fallback"] = (url.replace("//video-cdn1.", "//video."),)
+        post["md5"] = url.rpartition("/")[2].partition(".")[0]
 
         tags = collections.defaultdict(list)
         for tag_type, tag_name in self._find_tags(page):
diff --git a/gallery_dl/extractor/szurubooru.py b/gallery_dl/extractor/szurubooru.py
index 5415bf3..08cccab 100644
--- a/gallery_dl/extractor/szurubooru.py
+++ b/gallery_dl/extractor/szurubooru.py
@@ -87,6 +87,10 @@ BASE_PATTERN = SzurubooruExtractor.update({
         "root": "https://booru.bcbnsfw.space",
         "pattern": r"booru\.bcbnsfw\.space",
     },
+    "snootbooru": {
+        "root": "https://snootbooru.com",
+        "pattern": r"snootbooru\.com",
+    },
 })
 
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index fdcefdd..aa9ab9f 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -552,9 +552,11 @@ class TwitterTimelineExtractor(TwitterExtractor):
             return self.api.user_media
         if strategy == "tweets":
             return self.api.user_tweets
+        if strategy == "media":
+            return self.api.user_media
         if strategy == "with_replies":
             return self.api.user_tweets_and_replies
-        return self.api.user_media
+        raise exception.StopExtraction("Invalid strategy '%s'", strategy)
 
 
 class TwitterTweetsExtractor(TwitterExtractor):
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index 7413b5a..3bd0648 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -225,9 +225,6 @@ class WeiboUserExtractor(WeiboExtractor):
     pattern = USER_PATTERN + r"(?:$|#)"
     example = "https://weibo.com/USER"
 
-    def initialize(self):
-        pass
-
     def items(self):
         base = "{}/u/{}?tabtype=".format(self.root, self._user_id())
         return self._dispatch_extractors((
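In the rule34us hunk above, gallery-dl treats a post's `_fallback` entry as alternate URLs to try when the primary download fails; here the fallback simply swaps the broken `video-cdn1` host for `video`. Illustrative values:

```python
# Hypothetical file_url; real values come from the post page.
url = "https://video-cdn1.rule34.us/images/ab/cd/abcdef.mp4"

fallback = ()
if "//video-cdn1." in url:
    fallback = (url.replace("//video-cdn1.", "//video."),)

print(fallback[0])  # -> https://video.rule34.us/images/ab/cd/abcdef.mp4
```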
diff --git a/gallery_dl/extractor/zzup.py b/gallery_dl/extractor/zzup.py
new file mode 100644
index 0000000..45b0cd8
--- /dev/null
+++ b/gallery_dl/extractor/zzup.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from .common import GalleryExtractor
+from .. import text
+
+
+class ZzupGalleryExtractor(GalleryExtractor):
+    category = "zzup"
+    directory_fmt = ("{category}", "{title}")
+    filename_fmt = "{slug}_{num:>03}.{extension}"
+    archive_fmt = "{slug}_{num}"
+    root = "https://zzup.com"
+    pattern = (r"(?:https?://)?(?:www\.)?zzup\.com(/content"
+               r"/[\w=]+/([^/?#]+)/[\w=]+)/(?:index|page-\d+)\.html")
+    example = "https://zzup.com/content/xyz=/12345_TITLE/123=/index.html"
+
+    def __init__(self, match):
+        url = "{}/{}/index.html".format(self.root, match.group(1))
+        GalleryExtractor.__init__(self, match, url)
+        self.slug = match.group(2)
+
+    def metadata(self, page):
+        return {
+            "slug" : self.slug,
+            "title": text.unescape(text.extr(
+                page, "<title>", "</title>"))[:-11],
+        }
+
+    def images(self, page):
+        path = text.extr(
+            page, 'class="picbox"><a target="_blank" href="', '"')
+        count = text.parse_int(text.extr(path, "-pics-", "-mirror"))
+        page = self.request(self.root + path).text
+        url = self.root + text.extr(page, '\n<a href="', '"')
+        p1, _, p2 = url.partition("/image0")
+        ufmt = p1 + "/image{:>05}" + p2[4:]
+        return [(ufmt.format(num), None) for num in range(1, count + 1)]
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index b74d977..15905d6 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.26.5"
+__version__ = "1.26.6"
diff --git a/setup.cfg b/setup.cfg
@@ -1,10 +1,11 @@
 [flake8]
-exclude = .git,__pycache__,build,dist,archive,results
+exclude = .git,__pycache__,build,dist,archive
 ignore = E203,E226,W504
 per-file-ignores =
     setup.py: E501
     gallery_dl/extractor/500px.py: E501
     gallery_dl/extractor/mangapark.py: E501
+    test/results/*.py: E122,E241,E402,E501
 
 [egg_info]
 tag_build = 
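The new zzup `images()` method derives every page URL from the first image link by zero-padding a counter into the `image0...` filename. A worked example with invented values:

```python
count = 3
url = "https://zzup.com/viewimage/abc/image00001-9876.jpg"  # hypothetical first link

p1, _, p2 = url.partition("/image0")   # p2 -> "0001-9876.jpg"
ufmt = p1 + "/image{:>05}" + p2[4:]    # -> ".../image{:>05}-9876.jpg"

print([ufmt.format(num) for num in range(1, count + 1)])
# -> ['.../image00001-9876.jpg', '.../image00002-9876.jpg', '.../image00003-9876.jpg']
```

`"{:>05}"` right-aligns the number in a zero-filled field of width 5, so the generated name for `num=1` reproduces the original link exactly.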
