commit    2a63a9c9b7032a76894c48ac4d9cea732fcaee49
tree      3d5f633ff69cd393036a3dabc4d4533c8484f9ad
parent    195c45911e79c33cf0bb986721365fb06df5a153
author    2019-07-20 05:51:44 -0400
committer 2019-07-20 05:51:44 -0400
New upstream version 1.9.0 (tag: upstream/1.9.0)
49 files changed, 1047 insertions, 556 deletions
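Two mechanics recur throughout the diff below and are worth seeing in isolation: retry counts are now maxima (default 4, down from 5) with -1 meaning infinite retries, and the wait between attempts grows exponentially up to a cap. A condensed sketch of that loop, mirroring the reworked code in gallery_dl/extractor/common.py further below — the function name and the plain requests session are illustrative, and unlike the real code this version retries on any failure rather than giving up early on most 4xx responses:

    import time
    import requests

    def request_with_retries(session, url, retries=4, timeout=30):
        """Fetch 'url', retrying failed attempts; retries=-1 retries forever."""
        if retries < 0:
            retries = float("inf")  # '-1' is interpreted as infinite retries
        tries = 1
        while True:
            try:
                response = session.get(url, timeout=timeout)
                if response.status_code < 400:
                    return response
            except (requests.ConnectionError, requests.Timeout):
                pass
            if tries > retries:
                raise RuntimeError("max retries exceeded for " + url)
            time.sleep(min(2 ** (tries - 1), 1800))  # capped exponential backoff
            tries += 1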
diff --git a/CHANGELOG.md b/CHANGELOG.md
index cd74a9f..625018a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,31 @@
 # Changelog

+## 1.9.0 - 2019-07-19
+### Additions
+- Support for
+  - `erolord` - http://erolord.com/ ([#326](https://github.com/mikf/gallery-dl/issues/326))
+- Add login support for `instagram` ([#195](https://github.com/mikf/gallery-dl/issues/195))
+- Add `--no-download` and `extractor.*.download` options to disable file downloads ([#220](https://github.com/mikf/gallery-dl/issues/220))
+- Add `-A/--abort` to specify the number of consecutive download skips before aborting
+- Interpret `-1` as infinite retries ([#300](https://github.com/mikf/gallery-dl/issues/300))
+- Implement custom log message formats per log-level ([#304](https://github.com/mikf/gallery-dl/issues/304))
+- Implement an `mtime` post-processor that sets file modification times according to metadata fields ([#332](https://github.com/mikf/gallery-dl/issues/332))
+- Implement a `twitter.content` option to enable tweet text extraction ([#333](https://github.com/mikf/gallery-dl/issues/333), [#338](https://github.com/mikf/gallery-dl/issues/338))
+- Enable `date-min/-max/-format` options for `tumblr` ([#337](https://github.com/mikf/gallery-dl/issues/337))
+### Changes
+- Set file modification times according to their `Last-Modified` header when downloading ([#236](https://github.com/mikf/gallery-dl/issues/236), [#277](https://github.com/mikf/gallery-dl/issues/277))
+  - Use `--no-mtime` or `downloader.*.mtime` to disable this behavior
+- Duplicate download URLs are no longer silently ignored (controllable with `extractor.*.image-unique`)
+- Deprecate `--abort-on-skip`
+### Fixes
+- Retry downloads on OpenSSL exceptions ([#324](https://github.com/mikf/gallery-dl/issues/324))
+- Ignore unavailable pins on `sexcom` instead of raising an exception ([#325](https://github.com/mikf/gallery-dl/issues/325))
+- Use Firefox's SSL/TLS ciphers to prevent Cloudflare CAPTCHAs ([#342](https://github.com/mikf/gallery-dl/issues/342))
+- Improve folder name matching on `deviantart` ([#343](https://github.com/mikf/gallery-dl/issues/343))
+- Forward cookies to `youtube-dl` to allow downloading private videos
+- Miscellaneous fixes for `35photo`, `500px`, `newgrounds`, `simplyhentai`
+
+
 ## 1.8.7 - 2019-06-28
 ### Additions
 - Support for

diff --git a/README.rst b/README.rst
@@ -22,6 +22,7 @@ Optional
 - FFmpeg_: Pixiv Ugoira to WebM conversion
 - youtube-dl_: Video downloads
+- pyOpenSSL_: Access Cloudflare protected sites


 Installation
@@ -77,8 +78,8 @@
 Download a standalone executable file,
 put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
 and run it inside a command prompt (like ``cmd.exe``).

-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.8.7/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.8.7/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.9.0/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.9.0/gallery-dl.bin>`__

 These executables include a Python 3.7 interpreter
 and all required Python packages.
@@ -222,7 +223,7 @@ access to *gallery-dl*. Authorize it and you will be shown one or more
 .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
 .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
 .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
-.. _stable: https://github.com/mikf/gallery-dl/archive/v1.8.7.zip
+.. _stable: https://github.com/mikf/gallery-dl/archive/v1.9.0.zip
 .. _dev: https://github.com/mikf/gallery-dl/archive/master.zip
 .. _Python: https://www.python.org/downloads/
@@ -231,6 +232,7 @@ access to *gallery-dl*. Authorize it and you will be shown one or more
 .. _Requests: http://docs.python-requests.org/en/master/
 .. _FFmpeg: https://www.ffmpeg.org/
 .. _youtube-dl: https://ytdl-org.github.io/youtube-dl/
+.. _pyOpenSSL: https://pyopenssl.org/
 .. _Snapd: https://docs.snapcraft.io/installing-snapd
 .. _OAuth: https://en.wikipedia.org/wiki/OAuth

diff --git a/docs/configuration.rst b/docs/configuration.rst
index c606c6c..32a529a 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -217,7 +217,7 @@ extractor.*.user-agent
 ----------------------
 =========== =====
 Type        ``string``
-Default     ``"Mozilla/5.0 (X11; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0"``
+Default     ``"Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0"``
 Description User-Agent header value to be used for HTTP requests.

             Note: This option has no effect on `pixiv` and
@@ -300,8 +300,9 @@ extractor.*.retries
 -------------------
 =========== =====
 Type        ``integer``
-Default     ``5``
-Description Number of times a failed HTTP request is retried before giving up.
+Default     ``4``
+Description Maximum number of times a failed HTTP request is retried before
+            giving up, or ``-1`` for infinite retries.
 =========== =====
@@ -333,6 +334,22 @@ Description Controls whether to verify SSL/TLS certificates for HTTPS requests.
 =========== =====


+extractor.*.download
+--------------------
+=========== =====
+Type        ``bool``
+Default     ``true``
+Description Controls whether to download media files.
+
+            Setting this to ``false`` won't download any files, but all other
+            functions (postprocessors_, `download archive`_, etc.)
+            will be executed as normal.
+=========== =====
+
+.. _postprocessors: `extractor.*.postprocessors`_
+.. _download archive: `extractor.*.archive`_
+
+
 extractor.*.image-range
 -----------------------
 =========== =====
@@ -381,6 +398,40 @@ Description Like `image-filter`__, but applies to delegated URLs

 __ `extractor.*.image-filter`_


+extractor.*.image-unique
+------------------------
+=========== =====
+Type        ``bool``
+Default     ``false``
+Description Ignore image URLs that have been encountered before during the
+            current extractor run.
+=========== =====
+
+
+extractor.*.chapter-unique
+--------------------------
+=========== =====
+Type        ``bool``
+Default     ``false``
+Description Like `image-unique`__, but applies to delegated URLs
+            like manga-chapters, etc.
+=========== =====
+
+__ `extractor.*.image-unique`_
+
+
+extractor.*.date-format
+----------------------------
+=========== =====
+Type        ``string``
+Default     ``"%Y-%m-%dT%H:%M:%S"``
+Description Format string used to parse ``string`` values of
+            `date-min` and `date-max`.
+
+            See |strptime|_ for a list of formatting directives.
+=========== =====
+
+
 Extractor-specific Options
 ==========================
@@ -737,24 +788,9 @@ Description Retrieve additional comments by resolving the ``more`` comment

 extractor.reddit.date-min & .date-max
 -------------------------------------
 =========== =====
-Type        ``integer`` or ``string``
+Type        |Date|_
 Default     ``0`` and ``253402210800`` (timestamp of |datetime.max|_)
 Description Ignore all submissions posted before/after this date.
-
-            * If this is an ``integer``, it represents the date as UTC timestamp.
- * If this is a ``string``, it will get parsed according to date-format_. -=========== ===== - - -extractor.reddit.date-format ----------------------------- -=========== ===== -Type ``string`` -Default ``"%Y-%m-%dT%H:%M:%S"`` -Description An explicit format string used to parse the ``string`` values of - `date-min and date-max`_. - - See |strptime|_ for a list of formatting directives. =========== ===== @@ -831,6 +867,15 @@ Description Download blog avatars. =========== ===== +extractor.tumblr.date-min & .date-max +------------------------------------- +=========== ===== +Type |Date|_ +Default ``0`` and ``null`` +Description Ignore all posts published before/after this date. +=========== ===== + + extractor.tumblr.external ------------------------- =========== ===== @@ -877,6 +922,15 @@ Description A (comma-separated) list of post types to extract images, etc. from. =========== ===== +extractor.twitter.content +------------------------- +=========== ===== +Type ``bool`` +Default ``false`` +Description Extract tweet text as ``content`` metadata. +=========== ===== + + extractor.twitter.retweets -------------------------- =========== ===== @@ -945,6 +999,16 @@ Description Enable/Disable this downloader module. =========== ===== +downloader.*.mtime +------------------ +=========== ===== +Type ``bool`` +Default ``true`` +Description Use |Last-Modified|_ HTTP response headers + to set file modification times. +=========== ===== + + downloader.*.part ----------------- =========== ===== @@ -992,7 +1056,8 @@ downloader.*.retries =========== ===== Type ``integer`` Default `extractor.*.retries`_ -Description Number of retries during file downloads. +Description Maximum number of retries during file downloads + or ``-1`` for infinite retries. =========== ===== @@ -1240,6 +1305,23 @@ Description Custom format string to build content of metadata files. =========== ===== +mtime +----- + +Set file modification time according to its metadata + +mtime.key +--------- +=========== ===== +Type ``string`` +Default ``"date"`` +Description Name of the metadata field whose value should be used. + + This value must either be a UNIX timestamp or a + |datetime|_ object. +=========== ===== + + ugoira ------ @@ -1375,6 +1457,19 @@ Description Path of the SQLite3 database used to cache login sessions, =========== ===== +ciphers +------- +=========== ===== +Type ``bool`` or ``string`` +Default ``true`` +Description * ``true``: Update urllib3's default cipher list + * ``false``: Leave the default cipher list as is + * Any ``string``: Replace urllib3's default ciphers with these + (See `SSLContext.set_ciphers() <https://docs.python.org/3/library/ssl.html#ssl.SSLContext.set_ciphers>`__ + for details) +=========== ===== + + API Tokens & IDs ================ @@ -1479,6 +1574,20 @@ Custom Types ============ +Date +---- +=========== ===== +Type ``string`` or ``integer`` +Examples * ``"2019-01-01T00:00:00"`` + * ``"2019"`` with ``"%Y"`` as date-format_ + * ``1546297200`` +Description A |Date|_ value represents a specific point in time. + + * If given as ``string``, it is parsed according to date-format_. + * If given as ``integer``, it is interpreted as UTC timestamp. +=========== ===== + + Path ---- =========== ===== @@ -1508,7 +1617,7 @@ Logging Configuration =========== ===== Type ``object`` -Example .. code:: +Examples .. code:: { "format": "{asctime} {name}: {message}", @@ -1517,10 +1626,21 @@ Example .. 
code:: "encoding": "ascii" } + { + "level": "debug", + "format": { + "debug" : "debug: {message}", + "info" : "[{name}] {message}", + "warning": "Warning: {message}", + "error" : "ERROR: {message}" + } + } + Description Extended logging output configuration. * format - * Format string for logging messages + * General format string for logging messages + or a dictionary with format strings for each loglevel. In addition to the default `LogRecord attributes <https://docs.python.org/3/library/logging.html#logrecord-attributes>`__, @@ -1587,16 +1707,18 @@ Description An object with the ``name`` of a post-processor and its options. .. |verify| replace:: ``verify`` .. |mature_content| replace:: ``mature_content`` .. |webbrowser.open()| replace:: ``webbrowser.open()`` +.. |datetime| replace:: ``datetime`` .. |datetime.max| replace:: ``datetime.max`` +.. |Date| replace:: ``Date`` .. |Path| replace:: ``Path`` +.. |Last-Modified| replace:: ``Last-Modified`` .. |Logging Configuration| replace:: ``Logging Configuration`` .. |Postprocessor Configuration| replace:: ``Postprocessor Configuration`` .. |strptime| replace:: strftime() and strptime() Behavior .. _base-directory: `extractor.*.base-directory`_ .. _skipped: `extractor.*.skip`_ -.. _`date-min and date-max`: `extractor.reddit.date-min & .date-max`_ -.. _date-format: extractor.reddit.date-format_ +.. _date-format: `extractor.*.date-format`_ .. _deviantart.metadata: extractor.deviantart.metadata_ .. _.netrc: https://stackoverflow.com/tags/.netrc/info @@ -1604,12 +1726,14 @@ Description An object with the ``name`` of a post-processor and its options. .. _requests.request(): https://docs.python-requests.org/en/master/api/#requests.request .. _timeout: https://docs.python-requests.org/en/latest/user/advanced/#timeouts .. _verify: https://docs.python-requests.org/en/master/user/advanced/#ssl-cert-verification +.. _Last-Modified: https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.29 .. _`Requests' proxy documentation`: http://docs.python-requests.org/en/master/user/advanced/#proxies .. _format string: https://docs.python.org/3/library/string.html#formatstrings .. _format strings: https://docs.python.org/3/library/string.html#formatstrings .. _strptime: https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior .. _mature_content: https://www.deviantart.com/developers/http/v1/20160316/object/deviation .. _webbrowser.open(): https://docs.python.org/3/library/webbrowser.html +.. _datetime: https://docs.python.org/3/library/datetime.html#datetime-objects .. _datetime.max: https://docs.python.org/3/library/datetime.html#datetime.datetime.max .. _Authentication: https://github.com/mikf/gallery-dl#5authentication .. 
_youtube-dl: https://github.com/ytdl-org/youtube-dl diff --git a/docs/gallery-dl-example.conf b/docs/gallery-dl-example.conf index a5270d2..04be5e6 100644 --- a/docs/gallery-dl-example.conf +++ b/docs/gallery-dl-example.conf @@ -148,8 +148,13 @@ { "mode": "terminal", "log": { - "format": "{name}: {message}", - "level": "info" + "level": "info", + "format": { + "debug" : "\u001b[0;37m{name}: {message}\u001b[0m", + "info" : "\u001b[1;37m{name}: {message}\u001b[0m", + "warning": "\u001b[1;33m{name}: {message}\u001b[0m", + "error" : "\u001b[1;31m{name}: {message}\u001b[0m" + } }, "logfile": { "path": "~/gallery-dl/log.txt", diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index c792e9e..835ed17 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -8,7 +8,7 @@ "proxy": null, "skip": true, "sleep": 0, - "user-agent": "Mozilla/5.0 (X11; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0", + "user-agent": "Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0", "artstation": { @@ -132,6 +132,7 @@ }, "twitter": { + "content": false, "retweets": true, "videos": false }, @@ -152,8 +153,19 @@ "http": { + "mtime": true, "rate": null, - "retries": 5, + "retries": 4, + "timeout": 30.0, + "verify": true + }, + + "ytdl": + { + "format": null, + "mtime": true, + "rate": null, + "retries": 4, "timeout": 30.0, "verify": true } @@ -164,6 +176,7 @@ "mode": "auto", "progress": true, "shorten": true, + "log": "[{name}][{levelname}] {message}", "logfile": null, "unsupportedfile": null }, diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst index f47ed10..2a1a1ed 100644 --- a/docs/supportedsites.rst +++ b/docs/supportedsites.rst @@ -1,5 +1,7 @@ Supported Sites =============== +Unless otherwise known, assume all sites to be NSFW + ==================== =================================== ================================================== ================ Site URL Capabilities Authentication ==================== =================================== ================================================== ================ @@ -23,6 +25,7 @@ DeviantArt https://www.deviantart.com/ |deviantart-C| Doki Reader https://kobato.hologfx.com/reader/ Chapters, Manga Dynasty Reader https://dynasty-scans.com/ Chapters, individual Images, Search Results e621 https://e621.net/ Pools, Popular Images, Posts, Tag-Searches +EroLord.com http://erolord.com/ Galleries ExHentai https://exhentai.org/ Favorites, Galleries, Search Results Optional Fallen Angels Scans https://www.fascans.com/ Chapters, Manga Fashion Nova https://www.fashionnova.com/ Collections, Products @@ -46,7 +49,7 @@ ImageFap https://imagefap.com/ Images from Users, Gall imgbox https://imgbox.com/ Galleries, individual Images imgth https://imgth.com/ Galleries imgur https://imgur.com/ Albums, individual Images -Instagram https://www.instagram.com/ Images from Users, individual Images, Tag-Searches +Instagram https://www.instagram.com/ Images from Users, individual Images, Tag-Searches Optional Jaimini's Box https://jaiminisbox.com/reader/ Chapters, Manga Joyreactor http://joyreactor.cc/ |joyreactor-C| Keenspot http://www.keenspot.com/ Comics @@ -94,7 +97,7 @@ Sankaku Channel https://chan.sankakucomplex.com/ Pools, Posts, Tag-Searc Sankaku Complex https://www.sankakucomplex.com/ Articles, Tag-Searches Sen Manga https://raw.senmanga.com/ Chapters Sense-Scans http://sensescans.com/reader/ Chapters, Manga -Sex.com https://www.sex.com/ Boards, Pins, Search Results +Sex.com https://www.sex.com/ Boards, Pins, related Pins, Search Results Simply 
Hentai https://www.simply-hentai.com/ Galleries, individual Images, Videos SlickPic https://www.slickpic.com/ Images from Users, Albums SlideShare https://www.slideshare.net/ Presentations diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 3643a5c..806b229 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -122,7 +122,9 @@ def main(): if args.yamlfiles: config.load(args.yamlfiles, strict=True, fmt="yaml") if args.postprocessors: - config.set(("postprocessors", ), args.postprocessors) + config.set(("postprocessors",), args.postprocessors) + if args.abort: + config.set(("skip",), "abort:" + str(args.abort)) for key, value in args.options: config.set(key, value) diff --git a/gallery_dl/downloader/__init__.py b/gallery_dl/downloader/__init__.py index 97972cd..6fb09e1 100644 --- a/gallery_dl/downloader/__init__.py +++ b/gallery_dl/downloader/__init__.py @@ -22,15 +22,24 @@ def find(scheme): try: return _cache[scheme] except KeyError: - klass = None + pass + + klass = None + if scheme == "https": + scheme = "http" + if scheme in modules: # prevent unwanted imports try: - if scheme in modules: # prevent unwanted imports - module = importlib.import_module("." + scheme, __package__) - klass = module.__downloader__ - except (ImportError, AttributeError, TypeError): + module = importlib.import_module("." + scheme, __package__) + except ImportError: pass + else: + klass = module.__downloader__ + + if scheme == "http": + _cache["http"] = _cache["https"] = klass + else: _cache[scheme] = klass - return klass + return klass # -------------------------------------------------------------------- diff --git a/gallery_dl/downloader/common.py b/gallery_dl/downloader/common.py index 4803c85..6e5cd4c 100644 --- a/gallery_dl/downloader/common.py +++ b/gallery_dl/downloader/common.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 Mike Fährmann +# Copyright 2014-2019 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,23 +9,18 @@ """Common classes and constants used by downloader modules.""" import os -import time import logging -from .. import config, util, exception -from requests.exceptions import RequestException -from ssl import SSLError +from .. import config, util class DownloaderBase(): """Base class for downloaders""" scheme = "" - retries = 1 def __init__(self, extractor, output): self.session = extractor.session self.out = output self.log = logging.getLogger("downloader." 
+ self.scheme) - self.downloading = False self.part = self.config("part", True) self.partdir = self.config("part-directory") @@ -34,137 +29,8 @@ class DownloaderBase(): os.makedirs(self.partdir, exist_ok=True) def config(self, key, default=None): - """Interpolate config value for 'key'""" + """Interpolate downloader config value for 'key'""" return config.interpolate(("downloader", self.scheme, key), default) def download(self, url, pathfmt): - """Download the resource at 'url' and write it to a file-like object""" - try: - return self.download_impl(url, pathfmt) - except Exception: - print() - raise - finally: - # remove file from incomplete downloads - if self.downloading and not self.part: - try: - os.remove(pathfmt.temppath) - except (OSError, AttributeError): - pass - - def download_impl(self, url, pathfmt): - """Actual implementaion of the download process""" - adj_ext = None - tries = 0 - msg = "" - - if self.part: - pathfmt.part_enable(self.partdir) - - while True: - self.reset() - if tries: - self.log.warning("%s (%d/%d)", msg, tries, self.retries) - if tries >= self.retries: - return False - time.sleep(tries) - tries += 1 - - # check for .part file - filesize = pathfmt.part_size() - - # connect to (remote) source - try: - offset, size = self.connect(url, filesize) - except exception.DownloadRetry as exc: - msg = exc - continue - except exception.DownloadComplete: - break - except Exception as exc: - self.log.warning(exc) - return False - - # check response - if not offset: - mode = "w+b" - if filesize: - self.log.info("Unable to resume partial download") - else: - mode = "r+b" - self.log.info("Resuming download at byte %d", offset) - - # set missing filename extension - if not pathfmt.has_extension: - pathfmt.set_extension(self.get_extension()) - if pathfmt.exists(): - pathfmt.temppath = "" - return True - - self.out.start(pathfmt.path) - self.downloading = True - with pathfmt.open(mode) as file: - if offset: - file.seek(offset) - - # download content - try: - self.receive(file) - except (RequestException, SSLError) as exc: - msg = exc - print() - continue - - # check filesize - if size and file.tell() < size: - msg = "filesize mismatch ({} < {})".format( - file.tell(), size) - continue - - # check filename extension - adj_ext = self._check_extension(file, pathfmt) - - break - - self.downloading = False - if adj_ext: - pathfmt.set_extension(adj_ext) - return True - - def connect(self, url, offset): - """Connect to 'url' while respecting 'offset' if possible - - Returns a 2-tuple containing the actual offset and expected filesize. - If the returned offset-value is greater than zero, all received data - will be appended to the existing .part file. - Return '0' as second tuple-field to indicate an unknown filesize. 
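    # A sketch: the hunks above and below remove the shared download loop
    # from DownloaderBase, leaving download() as the single entry point that
    # concrete downloaders override. A minimal downloader under the new
    # contract, modeled on the TextDownloader rewrite later in this diff
    # (the "data" scheme is a made-up example, not part of this commit):
    from gallery_dl.downloader.common import DownloaderBase

    class DataDownloader(DownloaderBase):
        scheme = "data"

        def download(self, url, pathfmt):
            # write everything after the 5-char scheme prefix of 'url'
            # into the file described by 'pathfmt'
            if self.part:
                pathfmt.part_enable(self.partdir)
            self.out.start(pathfmt.path)
            with pathfmt.open("wb") as file:
                file.write(url.encode()[5:])
            return True

    __downloader__ = DataDownloader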
- """ - - def receive(self, file): - """Write data to 'file'""" - - def reset(self): - """Reset internal state / cleanup""" - - def get_extension(self): - """Return a filename extension appropriate for the current request""" - - @staticmethod - def _check_extension(file, pathfmt): - """Check filename extension against fileheader""" - extension = pathfmt.keywords["extension"] - if extension in FILETYPE_CHECK: - file.seek(0) - header = file.read(8) - if len(header) >= 8 and not FILETYPE_CHECK[extension](header): - for ext, check in FILETYPE_CHECK.items(): - if ext != extension and check(header): - return ext - return None - - -FILETYPE_CHECK = { - "jpg": lambda h: h[0:2] == b"\xff\xd8", - "png": lambda h: h[0:8] == b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a", - "gif": lambda h: h[0:4] == b"GIF8" and h[5] == 97, -} + """Write data from 'url' into the file specified by 'pathfmt'""" diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 961c1a2..7a95191 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 Mike Fährmann +# Copyright 2014-2019 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,11 +8,17 @@ """Downloader module for http:// and https:// URLs""" +import os import time import mimetypes -from requests.exceptions import ConnectionError, Timeout +from requests.exceptions import RequestException, ConnectionError, Timeout from .common import DownloaderBase -from .. import text, exception +from .. import text + +try: + from OpenSSL.SSL import Error as SSLError +except ImportError: + from ssl import SSLError class HttpDownloader(DownloaderBase): @@ -20,13 +26,16 @@ class HttpDownloader(DownloaderBase): def __init__(self, extractor, output): DownloaderBase.__init__(self, extractor, output) - self.response = None self.retries = self.config("retries", extractor._retries) self.timeout = self.config("timeout", extractor._timeout) self.verify = self.config("verify", extractor._verify) + self.mtime = self.config("mtime", True) self.rate = self.config("rate") + self.downloading = False self.chunk_size = 16384 + if self.retries < 0: + self.retries = float("inf") if self.rate: self.rate = text.parse_bytes(self.rate) if not self.rate: @@ -34,41 +43,132 @@ class HttpDownloader(DownloaderBase): elif self.rate < self.chunk_size: self.chunk_size = self.rate - def connect(self, url, offset): - headers = {} - if offset: - headers["Range"] = "bytes={}-".format(offset) - + def download(self, url, pathfmt): try: - self.response = self.session.request( - "GET", url, stream=True, headers=headers, allow_redirects=True, - timeout=self.timeout, verify=self.verify) - except (ConnectionError, Timeout) as exc: - raise exception.DownloadRetry(exc) - - code = self.response.status_code - if code == 200: # OK - offset = 0 - size = self.response.headers.get("Content-Length") - elif code == 206: # Partial Content - size = self.response.headers["Content-Range"].rpartition("/")[2] - elif code == 416: # Requested Range Not Satisfiable - raise exception.DownloadComplete() - elif code == 429 or 500 <= code < 600: # Server Error - raise exception.DownloadRetry( - "{} Server Error: {} for url: {}".format( - code, self.response.reason, url)) - else: - self.response.raise_for_status() - - return offset, text.parse_int(size) - - def receive(self, file): + return self._download_impl(url, pathfmt) + except Exception: + 
print() + raise + finally: + # remove file from incomplete downloads + if self.downloading and not self.part: + try: + os.unlink(pathfmt.temppath) + except (OSError, AttributeError): + pass + + def _download_impl(self, url, pathfmt): + response = None + adj_ext = None + tries = 0 + msg = "" + + if self.part: + pathfmt.part_enable(self.partdir) + + while True: + if tries: + if response: + response.close() + self.log.warning("%s (%s/%s)", msg, tries, self.retries+1) + if tries > self.retries: + return False + time.sleep(min(2 ** (tries-1), 1800)) + tries += 1 + + # check for .part file + filesize = pathfmt.part_size() + if filesize: + headers = {"Range": "bytes={}-".format(filesize)} + else: + headers = None + + # connect to (remote) source + try: + response = self.session.request( + "GET", url, stream=True, headers=headers, + timeout=self.timeout, verify=self.verify) + except (ConnectionError, Timeout) as exc: + msg = str(exc) + continue + except Exception as exc: + self.log.warning("%s", exc) + return False + + # check response + code = response.status_code + if code == 200: # OK + offset = 0 + size = response.headers.get("Content-Length") + elif code == 206: # Partial Content + offset = filesize + size = response.headers["Content-Range"].rpartition("/")[2] + elif code == 416: # Requested Range Not Satisfiable + break + else: + msg = "{}: {} for url: {}".format(code, response.reason, url) + if code == 429 or 500 <= code < 600: # Server Error + continue + self.log.warning("%s", msg) + return False + size = text.parse_int(size) + + # set missing filename extension + if not pathfmt.has_extension: + pathfmt.set_extension(self.get_extension(response)) + if pathfmt.exists(): + pathfmt.temppath = "" + return True + + # set open mode + if not offset: + mode = "w+b" + if filesize: + self.log.info("Unable to resume partial download") + else: + mode = "r+b" + self.log.info("Resuming download at byte %d", offset) + + # start downloading + self.out.start(pathfmt.path) + self.downloading = True + with pathfmt.open(mode) as file: + if offset: + file.seek(offset) + + # download content + try: + self.receive(response, file) + except (RequestException, SSLError) as exc: + msg = str(exc) + print() + continue + + # check filesize + if size and file.tell() < size: + msg = "filesize mismatch ({} < {})".format( + file.tell(), size) + print() + continue + + # check filename extension + adj_ext = self.check_extension(file, pathfmt) + + break + + self.downloading = False + if adj_ext: + pathfmt.set_extension(adj_ext) + if self.mtime: + pathfmt.keywords["_mtime"] = response.headers.get("Last-Modified") + return True + + def receive(self, response, file): if self.rate: total = 0 # total amount of bytes received start = time.time() # start time - for data in self.response.iter_content(self.chunk_size): + for data in response.iter_content(self.chunk_size): file.write(data) if self.rate: @@ -79,13 +179,8 @@ class HttpDownloader(DownloaderBase): # sleep if less time passed than expected time.sleep(expected - delta) - def reset(self): - if self.response: - self.response.close() - self.response = None - - def get_extension(self): - mtype = self.response.headers.get("Content-Type", "image/jpeg") + def get_extension(self, response): + mtype = response.headers.get("Content-Type", "image/jpeg") mtype = mtype.partition(";")[0] if mtype in MIMETYPE_MAP: @@ -100,6 +195,26 @@ class HttpDownloader(DownloaderBase): "No filename extension found for MIME type '%s'", mtype) return "txt" + @staticmethod + def check_extension(file, 
pathfmt): + """Check filename extension against fileheader""" + extension = pathfmt.keywords["extension"] + if extension in FILETYPE_CHECK: + file.seek(0) + header = file.read(8) + if len(header) >= 8 and not FILETYPE_CHECK[extension](header): + for ext, check in FILETYPE_CHECK.items(): + if ext != extension and check(header): + return ext + return None + + +FILETYPE_CHECK = { + "jpg": lambda h: h[0:2] == b"\xff\xd8", + "png": lambda h: h[0:8] == b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a", + "gif": lambda h: h[0:4] == b"GIF8" and h[5] == 97, +} + MIMETYPE_MAP = { "image/jpeg": "jpg", diff --git a/gallery_dl/downloader/text.py b/gallery_dl/downloader/text.py index ca33863..c57fbd0 100644 --- a/gallery_dl/downloader/text.py +++ b/gallery_dl/downloader/text.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 Mike Fährmann +# Copyright 2014-2019 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -14,24 +14,13 @@ from .common import DownloaderBase class TextDownloader(DownloaderBase): scheme = "text" - def __init__(self, extractor, output): - DownloaderBase.__init__(self, extractor, output) - self.content = b"" - - def connect(self, url, offset): - data = url.encode() - self.content = data[offset + 5:] - return offset, len(data) - 5 - - def receive(self, file): - file.write(self.content) - - def reset(self): - self.content = b"" - - @staticmethod - def get_extension(): - return "txt" + def download(self, url, pathfmt): + if self.part: + pathfmt.part_enable(self.partdir) + self.out.start(pathfmt.path) + with pathfmt.open("wb") as file: + file.write(url.encode()[5:]) + return True __downloader__ = TextDownloader diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py index 57a84d0..da57935 100644 --- a/gallery_dl/downloader/ytdl.py +++ b/gallery_dl/downloader/ytdl.py @@ -20,13 +20,15 @@ class YoutubeDLDownloader(DownloaderBase): def __init__(self, extractor, output): DownloaderBase.__init__(self, extractor, output) + retries = self.config("retries", extractor._retries) options = { "format": self.config("format") or None, "ratelimit": text.parse_bytes(self.config("rate"), None), - "retries": self.config("retries", extractor._retries), + "retries": retries+1 if retries >= 0 else float("inf"), "socket_timeout": self.config("timeout", extractor._timeout), "nocheckcertificate": not self.config("verify", extractor._verify), "nopart": not self.part, + "updatetime": self.config("mtime", True), } options.update(self.config("raw-options") or {}) @@ -36,6 +38,9 @@ class YoutubeDLDownloader(DownloaderBase): self.ytdl = YoutubeDL(options) def download(self, url, pathfmt): + for cookie in self.session.cookies: + self.ytdl.cookiejar.set_cookie(cookie) + try: info_dict = self.ytdl.extract_info(url[5:], download=False) except Exception: diff --git a/gallery_dl/extractor/35photo.py b/gallery_dl/extractor/35photo.py index 50dbfe8..d3e9276 100644 --- a/gallery_dl/extractor/35photo.py +++ b/gallery_dl/extractor/35photo.py @@ -72,7 +72,6 @@ class _35photoExtractor(Extractor): "user" : data["user_login"], "user_id" : data["user_id"], "user_name" : data["user_name"], - "other" : data["otherData"], } if "series" in data: @@ -89,6 +88,8 @@ class _35photoExtractor(Extractor): def _photo_ids(page): """Extract unique photo IDs and return them as sorted list""" # searching for photo-id="..." 
doesn't always work (see unit tests) + if not page: + return () return sorted( set(text.extract_iter(page, "/photo_", "/")), key=text.parse_int, @@ -100,7 +101,7 @@ class _35photoUserExtractor(_35photoExtractor): """Extractor for all images of a user on 35photo.pro""" subcategory = "user" pattern = (r"(?:https?://)?(?:[a-z]+\.)?35photo\.pro" - r"/(?!photo_|genre_)([^/?&#]+)") + r"/(?!photo_|genre_|rating/)([^/?&#]+)") test = ( ("https://35photo.pro/liya", { "pattern": r"https://m\d+.35photo.pro/photos_(main|series)/.*.jpg", @@ -146,7 +147,14 @@ class _35photoGenreExtractor(_35photoExtractor): ("https://35photo.pro/genre_109/", { "range": "1-30", }), - ("https://35photo.pro/genre_109/new/"), + ("https://35photo.pro/genre_103/", { + "range": "1-30", + "count": 30, + }), + ("https://35photo.pro/genre_103/new/", { + "range": "1-30", + "count": 30, + }), ) def __init__(self, match): @@ -165,6 +173,8 @@ class _35photoGenreExtractor(_35photoExtractor): } def photos(self): + if not self.photo_ids: + return () return self._pagination({ "page": "genre", "community_id": self.genre_id, @@ -193,7 +203,6 @@ class _35photoImageExtractor(_35photoExtractor): "user" : "liya", "user_id" : 20415, "user_name" : "Liya Mirzaeva", - "other" : str, }, }) diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py index 00b8ab5..07c2e14 100644 --- a/gallery_dl/extractor/500px.py +++ b/gallery_dl/extractor/500px.py @@ -31,8 +31,7 @@ class _500pxExtractor(Extractor): for photo in self.photos(): url = photo["images"][-1]["url"] - fmt = photo["image_format"] - photo["extension"] = "jpg" if fmt == "jpeg" else fmt + photo["extension"] = photo["image_format"] if data: photo.update(data) if first: @@ -59,7 +58,7 @@ class _500pxExtractor(Extractor): "include_releases" : "true", "liked_by" : "1", "following_sample" : "100", - "image_size" : "32768", + "image_size" : "4096", "ids" : ",".join(str(p["id"]) for p in photos), } @@ -90,7 +89,7 @@ class _500pxUserExtractor(_500pxExtractor): pattern = (r"(?:https?://)?500px\.com" r"/(?!photo/)([^/?&#]+)/?(?:$|\?|#)") test = ("https://500px.com/light_expression_photography", { - "pattern": r"https?://drscdn.500px.org/photo/\d+/m%3D5000/v2", + "pattern": r"https?://drscdn.500px.org/photo/\d+/m%3D4096/v2", "range": "1-99", "count": 99, }) @@ -124,7 +123,7 @@ class _500pxGalleryExtractor(_500pxExtractor): pattern = (r"(?:https?://)?500px\.com" r"/(?!photo/)([^/?&#]+)/galleries/([^/?&#]+)") test = ("https://500px.com/fashvamp/galleries/lera", { - "url": "8a520272ece83278166b4f8556f9c9da43c43c45", + "url": "002dc81dee5b4a655f0e31ad8349e8903b296df6", "count": 3, "keyword": { "gallery": dict, @@ -144,7 +143,7 @@ class _500pxGalleryExtractor(_500pxExtractor): page = self.request(url).text self.csrf_token, pos = text.extract(page, 'csrf-token" content="', '"') self.user_id , pos = text.extract(page, 'App.CuratorId =', '\n', pos) - self.user_id = self.user_id.strip() + self.user_id = self.user_id.strip(" '\";") # get gallery metadata; transform gallery name into id url = "https://api.500px.com/v1/users/{}/galleries/{}".format( @@ -174,37 +173,30 @@ class _500pxImageExtractor(_500pxExtractor): subcategory = "image" pattern = r"(?:https?://)?500px\.com/photo/(\d+)" test = ("https://500px.com/photo/222049255/queen-of-coasts", { - "url": "d1eda7afeaa589f71f05b9bb5c0694e3ffb357cd", + "url": "fbdf7df39325cae02f5688e9f92935b0e7113315", "count": 1, "keyword": { "camera": "Canon EOS 600D", "camera_info": dict, - "collections_count": int, "comments": list, "comments_count": int, - 
"converted": False, - "converted_bits": int, - "created_at": "2017-08-01T04:40:05-04:00", - "crop_version": 0, + "created_at": "2017-08-01T08:40:05+00:00", "description": str, - "editored_by": dict, + "editored_by": None, "editors_choice": False, "extension": "jpg", - "favorites_count": int, "feature": "popular", "feature_date": "2017-08-01T09:58:28+00:00", "focal_length": "208", "height": 3111, "id": 222049255, - "image_format": "jpeg", - "image_url": str, + "image_format": "jpg", + "image_url": list, "images": list, "iso": "100", "lens": "EF-S55-250mm f/4-5.6 IS II", "lens_info": dict, - "license_type": 0, - "licensed_at": None, - "liked": False, + "liked": None, "location": None, "location_details": dict, "name": "Queen Of Coasts", @@ -212,15 +204,11 @@ class _500pxImageExtractor(_500pxExtractor): "privacy": False, "profile": True, "rating": float, - "sales_count": int, "status": 1, - "store_download": False, - "store_height": 3111, - "store_width": 4637, "tags": list, - "taken_at": "2017-05-04T13:36:51-04:00", + "taken_at": "2017-05-04T17:36:51+00:00", "times_viewed": int, - "url": "/photo/222049255/queen-of-coasts-by-olesya-nabieva", + "url": "/photo/222049255/Queen-Of-Coasts-by-Olesya-Nabieva", "user": dict, "user_id": 12847235, "votes_count": int, diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 81d480e..189c163 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -24,6 +24,7 @@ modules = [ "deviantart", "dynastyscans", "e621", + "erolord", "exhentai", "fallenangels", "flickr", diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py index 24197ad..f7b3bc1 100644 --- a/gallery_dl/extractor/artstation.py +++ b/gallery_dl/extractor/artstation.py @@ -79,9 +79,7 @@ class ArtstationExtractor(Extractor): def get_user_info(self, username): """Return metadata for a specific user""" url = "{}/users/{}/quick.json".format(self.root, username.lower()) - response = self.request(url, expect=(404,)) - if response.status_code == 404: - raise exception.NotFoundError("user") + response = self.request(url, notfound="user") return response.json() def _pagination(self, url, params=None): diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 175af63..5c40e2a 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -13,6 +13,7 @@ import time import netrc import queue import logging +import datetime import requests import threading import http.cookiejar @@ -39,10 +40,13 @@ class Extractor(): self._init_headers() self._init_cookies() self._init_proxies() - self._retries = self.config("retries", 5) + self._retries = self.config("retries", 4) self._timeout = self.config("timeout", 30) self._verify = self.config("verify", True) + if self._retries < 0: + self._retries = float("inf") + @classmethod def from_url(cls, url): if isinstance(cls.pattern, str): @@ -63,11 +67,11 @@ class Extractor(): return config.interpolate( ("extractor", self.category, self.subcategory, key), default) - def request(self, url, method="GET", *, session=None, - encoding=None, expect=(), retries=None, **kwargs): - tries = 0 - retries = retries or self._retries - session = session or self.session + def request(self, url, method="GET", *, session=None, retries=None, + encoding=None, fatal=True, notfound=None, **kwargs): + tries = 1 + retries = self._retries if retries is None else retries + session = self.session if session is None else session kwargs.setdefault("timeout", 
self._timeout) kwargs.setdefault("verify", self._verify) @@ -83,26 +87,37 @@ class Extractor(): raise exception.HttpError(exc) else: code = response.status_code - if 200 <= code < 400 or code in expect: + if 200 <= code < 400 or not fatal and \ + (400 <= code < 429 or 431 <= code < 500): if encoding: response.encoding = encoding return response + if notfound and code == 404: + raise exception.NotFoundError(notfound) if cloudflare.is_challenge(response): self.log.info("Solving Cloudflare challenge") url, domain, cookies = cloudflare.solve_challenge( session, response, kwargs) cloudflare.cookies.update(self.category, (domain, cookies)) continue + if cloudflare.is_captcha(response): + try: + import OpenSSL # noqa + except ImportError: + msg = " - Install 'pyOpenSSL' and try again" + else: + msg = "" + self.log.warning("Cloudflare CAPTCHA" + msg) msg = "{}: {} for url: {}".format(code, response.reason, url) - if code < 500 and code != 429: + if code < 500 and code != 429 and code != 430: break - tries += 1 - self.log.debug("%s (%d/%d)", msg, tries, retries) - if tries >= retries: + self.log.debug("%s (%s/%s)", msg, tries, retries+1) + if tries > retries: break - time.sleep(2 ** tries) + time.sleep(min(2 ** (tries-1), 1800)) + tries += 1 raise exception.HttpError(msg) @@ -130,8 +145,8 @@ class Extractor(): headers.clear() headers["User-Agent"] = self.config( - "user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:62.0) " - "Gecko/20100101 Firefox/62.0")) + "user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:68.0) " + "Gecko/20100101 Firefox/68.0")) headers["Accept"] = "*/*" headers["Accept-Language"] = "en-US,en;q=0.5" headers["Accept-Encoding"] = "gzip, deflate" @@ -203,6 +218,20 @@ class Extractor(): return False return True + def _get_date_min_max(self, dmin=None, dmax=None): + """Retrieve and parse 'date-min' and 'date-max' config values""" + def get(key, default): + ts = self.config(key, default) + if isinstance(ts, str): + try: + ts = int(datetime.datetime.strptime(ts, fmt).timestamp()) + except ValueError as exc: + self.log.warning("Unable to parse '%s': %s", key, exc) + ts = default + return ts + fmt = self.config("date-format", "%Y-%m-%dT%H:%M:%S") + return get("date-min", dmin), get("date-max", dmax) + @classmethod def _get_tests(cls): """Yield an extractor's test cases as (URL, RESULTS) tuples""" @@ -403,30 +432,36 @@ def generate_extractors(extractor_data, symtable, classes): http.cookiejar.MozillaCookieJar.magic_re = re.compile( "#( Netscape)? 
HTTP Cookie File", re.IGNORECASE) -# Update default cipher list of urllib3 -# to fix issues with Cloudflare and, by extension, Artstation (#227) -from requests.packages.urllib3.util import ssl_ # noqa -logging.getLogger("gallery-dl").debug("updating default urllib3 ciphers") - -# cipher list taken from urllib3 1.25 -# https://github.com/urllib3/urllib3/blob/1.25/src/urllib3/util/ssl_.py -# with additions from -# https://github.com/Anorov/cloudflare-scrape/pull/242 -ssl_.DEFAULT_CIPHERS = ( - "ECDHE+AESGCM:" - "ECDHE+CHACHA20:" - "DHE+AESGCM:" - "DHE+CHACHA20:" - "ECDH+AESGCM:" - "DH+AESGCM:" - "ECDH+AES:" - "DH+AES:" - "RSA+AESGCM:" - "RSA+AES:" - "!ECDHE+SHA:" - "!AES128-SHA:" - "!aNULL:" - "!eNULL:" - "!MD5:" - "!DSS" -) +# Replace default cipher list of urllib3 to avoid Cloudflare CAPTCHAs +ciphers = config.get(("ciphers",), True) +if ciphers: + logging.getLogger("gallery-dl").debug("Updating urllib3 ciphers") + + if ciphers is True: + ciphers = ( + # Firefox's list + "TLS_AES_128_GCM_SHA256:" + "TLS_CHACHA20_POLY1305_SHA256:" + "TLS_AES_256_GCM_SHA384:" + "ECDHE-ECDSA-AES128-GCM-SHA256:" + "ECDHE-RSA-AES128-GCM-SHA256:" + "ECDHE-ECDSA-CHACHA20-POLY1305:" + "ECDHE-RSA-CHACHA20-POLY1305:" + "ECDHE-ECDSA-AES256-GCM-SHA384:" + "ECDHE-RSA-AES256-GCM-SHA384:" + "ECDHE-ECDSA-AES256-SHA:" + "ECDHE-ECDSA-AES128-SHA:" + "ECDHE-RSA-AES128-SHA:" + "ECDHE-RSA-AES256-SHA:" + "DHE-RSA-AES128-SHA:" + "DHE-RSA-AES256-SHA:" + "AES128-SHA:" + "AES256-SHA:" + "DES-CBC3-SHA" + ) + elif isinstance(ciphers, list): + ciphers = ":".join(ciphers) + + from requests.packages.urllib3.util import ssl_ # noqa + ssl_.DEFAULT_CIPHERS = ciphers + del ssl_ diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index ebab040..63e2913 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -205,8 +205,7 @@ class DeviantartExtractor(Extractor): @staticmethod def _find_folder(folders, name): - pattern = re.compile( - r"[^\w]*" + name.replace("-", r"[^\w]+") + r"[^\w]*$") + pattern = re.compile(r"(?i)\W*" + name.replace("-", r"\W+") + r"\W*$") for folder in folders: if pattern.match(folder["name"]): return folder @@ -416,7 +415,7 @@ class DeviantartDeviationExtractor(DeviantartExtractor): def deviations(self): url = "{}/{}/{}".format(self.root, self.user, self.path) - response = self._html_request(url, expect=range(400, 500)) + response = self._html_request(url, fatal=False) deviation_id = text.extract(response.text, '//deviation/', '"')[0] if response.status_code >= 400 or not deviation_id: raise exception.NotFoundError("image") @@ -767,7 +766,7 @@ class DeviantartAPI(): def user_profile(self, username): """Get user profile information""" endpoint = "user/profile/" + username - return self._call(endpoint, expect_error=True) + return self._call(endpoint, fatal=False) def authenticate(self, refresh_token): """Authenticate the application by requesting an access token""" @@ -797,7 +796,7 @@ class DeviantartAPI(): _refresh_token_cache.update(refresh_token, data["refresh_token"]) return "Bearer " + data["access_token"] - def _call(self, endpoint, params=None, expect_error=False, public=True): + def _call(self, endpoint, params=None, fatal=True, public=True): """Call an API endpoint""" url = "https://www.deviantart.com/api/v1/oauth2/" + endpoint while True: @@ -806,11 +805,7 @@ class DeviantartAPI(): self.authenticate(None if public else self.refresh_token) response = self.extractor.request( - url, - params=params, - headers=self.headers, - expect=range(400, 500), 
- ) + url, headers=self.headers, params=params, fatal=False) data = response.json() status = response.status_code @@ -818,7 +813,7 @@ class DeviantartAPI(): if self.delay > self.delay_min: self.delay -= 1 return data - if expect_error: + if not fatal: return None if data.get("error_description") == "User not found.": raise exception.NotFoundError("user or group") diff --git a/gallery_dl/extractor/erolord.py b/gallery_dl/extractor/erolord.py new file mode 100644 index 0000000..8628039 --- /dev/null +++ b/gallery_dl/extractor/erolord.py @@ -0,0 +1,64 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from http://erolord.com/""" + +from .common import GalleryExtractor +from .. import text, util +import json + + +class ErolordGalleryExtractor(GalleryExtractor): + """Extractor for image galleries from erolord.com""" + category = "erolord" + root = "http://erolord.com" + pattern = r"(?:https?://)?(?:www\.)?erolord.com(/doujin/(\d+)/?)" + test = ("http://erolord.com/doujin/2189055/", { + "url": "7ce6d10a3934102b95c9718a34ccd3d35f55d85f", + "keyword": { + "title" : "Amazon No Hiyaku | Amazon Elixir", + "gallery_id": 2189055, + "count" : 16, + "artist" : ["Morris"], + "group" : list, + "parody" : list, + "characters": list, + "tags" : list, + "lang" : "en", + "language" : "English", + }, + }) + + def __init__(self, match): + GalleryExtractor.__init__(self, match) + self.gallery_id = match.group(2) + + def metadata(self, page): + extr = text.extract_from(page) + split = text.split_html + title, _, language = extr('<h1 class="t64">', '</h1>').rpartition(" ") + language = language.strip("[]") + + return { + "gallery_id": text.parse_int(self.gallery_id), + "title" : text.unescape(title), + # double quotes for anime, circle, tags + # single quotes for characters, artist + "parody" : split(extr('class="sp1">Anime:' , "</div>\r")), + "characters": split(extr("class='sp1'>Characters:", "</div>\r")), + "artist" : split(extr("class='sp1'>Artist:" , "</div>\r")), + "group" : split(extr('class="sp1">Circle:' , "</div>\r")), + "tags" : split(extr('class="sp1">Tags:' , "</div>\r")), + "lang" : util.language_to_code(language), + "language" : language, + } + + def images(self, page): + url = self.root + text.extract(page, 'id="d1"><a href="', '"')[0] + imgs = text.extract(self.request(url).text, 'var imgs=', ';')[0] + return [(self.root + path, None) for path in json.loads(imgs)] diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index d67c58a..20e0746 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -259,7 +259,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): def _gallery_page(self): url = "{}/g/{}/{}/".format( self.root, self.gallery_id, self.gallery_token) - response = self.request(url, expect=range(400, 500)) + response = self.request(url, fatal=False) page = response.text if response.status_code == 404 and "Gallery Not Available" in page: @@ -271,7 +271,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): def _image_page(self): url = "{}/s/{}/{}-{}".format( self.root, self.image_token, self.gallery_id, self.image_num) - page = self.request(url, expect=range(400, 500)).text + page = self.request(url, fatal=False).text if page.startswith(("Invalid page", "Keep trying")): raise exception.NotFoundError("image page") 
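A pattern worth calling out before the next file: every extractor touched above swaps the old expect=(404,) / expect=range(400, 500) plumbing for the two new keywords on Extractor.request(). Usage sketches in that style, lightly abridged from the artstation and exhentai hunks above (self is an extractor instance):

    def get_user_info(self, username):
        url = "{}/users/{}/quick.json".format(self.root, username.lower())
        # a 404 response now raises exception.NotFoundError("user") for us
        response = self.request(url, notfound="user")
        return response.json()

    def _gallery_page(self):
        url = "{}/g/{}/{}/".format(self.root, self.gallery_id, self.gallery_token)
        # fatal=False hands most 4xx responses back instead of raising
        # HttpError, so the caller can inspect response.status_code itself
        response = self.request(url, fatal=False)
        return response.text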
diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 0468c0b..c5e3d17 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -16,16 +16,15 @@ import json class ImgurExtractor(Extractor): """Base class for imgur extractors""" category = "imgur" + root = "https://imgur.com" def __init__(self, match): Extractor.__init__(self, match) self.item_id = match.group(1) self.mp4 = self.config("mp4", True) - def _get_data(self, urlpart): - response = self.request("https://imgur.com/" + urlpart, expect=(404,)) - if response.status_code == 404: - raise exception.NotFoundError(self.subcategory) + def _get_data(self, path): + response = self.request(self.root + path, notfound=self.subcategory) data = text.extract(response.text, "image : ", ",\n")[0] return self._clean(json.loads(data)) @@ -102,7 +101,7 @@ class ImgurImageExtractor(ImgurExtractor): ) def items(self): - image = self._get_data(self.item_id) + image = self._get_data("/" + self.item_id) url = self._prepare(image) yield Message.Version, 1 @@ -165,13 +164,13 @@ class ImgurAlbumExtractor(ImgurExtractor): ) def items(self): - album = self._get_data("a/" + self.item_id + "/all") + album = self._get_data("/a/" + self.item_id + "/all") images = album["album_images"]["images"] del album["album_images"] if int(album["num_images"]) > len(images): - url = ("https://imgur.com/ajaxalbums/getimages/" + - self.item_id + "/hit.json") + url = "{}/ajaxalbums/getimages/{}/hit.json".format( + self.root, self.item_id) images = self.request(url).json()["data"]["images"] yield Message.Version, 1 diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 871236b..475e24b 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -11,7 +11,8 @@ import hashlib import json from .common import Extractor, Message -from .. import text +from .. 
import text, exception +from ..cache import cache class InstagramExtractor(Extractor): @@ -21,11 +22,14 @@ class InstagramExtractor(Extractor): filename_fmt = "{sidecar_media_id:?/_/}{media_id}.{extension}" archive_fmt = "{media_id}" root = "https://www.instagram.com" + cookiedomain = ".instagram.com" + cookienames = ("sessionid",) def get_metadata(self): return {} def items(self): + self.login() yield Message.Version, 1 metadata = self.get_metadata() @@ -40,6 +44,46 @@ class InstagramExtractor(Extractor): yield Message.Url, \ 'ytdl:{}/p/{}/'.format(self.root, data['shortcode']), data + def login(self): + if self._check_cookies(self.cookienames): + return + username, password = self._get_auth_info() + if username: + self.session.cookies.set("ig_cb", "1", domain="www.instagram.com") + self._update_cookies(self._login_impl(username, password)) + + @cache(maxage=360*24*3600, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + + page = self.request(self.root + "/accounts/login/").text + headers = { + "Referer" : self.root + "/accounts/login/", + "X-IG-App-ID" : "936619743392459", + "X-Requested-With": "XMLHttpRequest", + } + + response = self.request(self.root + "/web/__mid/", headers=headers) + headers["X-CSRFToken"] = response.cookies["csrftoken"] + headers["X-Instagram-AJAX"] = text.extract( + page, '"rollout_hash":"', '"')[0] + + url = self.root + "/accounts/login/ajax/" + data = { + "username" : username, + "password" : password, + "queryParams" : "{}", + "optIntoOneTap": "true", + } + response = self.request(url, method="POST", headers=headers, data=data) + + if not response.json().get("authenticated"): + raise exception.AuthenticationError() + return { + key: self.session.cookies.get(key) + for key in ("sessionid", "mid", "csrftoken") + } + def _extract_shared_data(self, page): return json.loads(text.extract(page, 'window._sharedData = ', ';</script>')[0]) diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 9e0aaa3..282c389 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -48,22 +48,20 @@ class NewgroundsExtractor(Extractor): extr = text.extract_from(self.request(page_url).text) full = text.extract_from(json.loads(extr('"full_image_text":', '});'))) data = { + "title" : text.unescape(extr('"og:title" content="', '"')), "description": text.unescape(extr(':description" content="', '"')), - "date" : extr('itemprop="datePublished" content="', '"'), + "date" : text.parse_datetime(extr( + 'itemprop="datePublished" content="', '"')), "rating" : extr('class="rated-', '"'), "favorites" : text.parse_int(extr('id="faves_load">', '<')), "score" : text.parse_float(extr('id="score_number">', '<')), + "tags" : text.split_html(extr( + '<dd class="tags momag">', '</dd>')), "url" : full('src="', '"'), - "title" : text.unescape(full('alt="', '"')), "width" : text.parse_int(full('width="', '"')), "height" : text.parse_int(full('height="', '"')), } - - tags = text.split_html(extr('<dd class="tags momag">', '</dd>')) - tags.sort() - data["tags"] = tags - - data["date"] = text.parse_datetime(data["date"]) + data["tags"].sort() data["index"] = text.parse_int( data["url"].rpartition("/")[2].partition("_")[0]) return data @@ -95,7 +93,7 @@ class NewgroundsUserExtractor(NewgroundsExtractor): test = ( ("https://blitzwuff.newgrounds.com/art", { "url": "24b19c4a135a09889fac7b46a74e427e4308d02b", - "keyword": "2aab0532a894ff3cf88dd01ce5c60f114011b268", + "keyword": 
"98566e0c8096a8099b8d71962fea7e31c8b098d4", }), ("https://blitzwuff.newgrounds.com/"), ) @@ -140,9 +138,9 @@ class NewgroundsVideoExtractor(NewgroundsExtractor): subcategory = "video" filename_fmt = "{category}_{index}.{extension}" pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com/movies/?$" - test = ("https://twistedgrim.newgrounds.com/movies", { + test = ("https://tomfulp.newgrounds.com/movies", { "pattern": r"ytdl:https?://www\.newgrounds\.com/portal/view/\d+", - "count": ">= 29", + "count": ">= 32", }) def get_page_urls(self): diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index abf1eaa..4c48d73 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -106,13 +106,8 @@ class NijieExtractor(AsynchronousMixin, Extractor): params = {"id": self.user_id, "p": 1} while True: - response = self.request(url, params=params, expect=(404,)) - if response.status_code == 404: - raise exception.NotFoundError("artist") - - page = response.text - ids = list(text.extract_iter(page, ' illust_id="', '"')) - yield from ids + page = self.request(url, params=params, notfound="artist").text + yield from text.extract_iter(page, 'illust_id="', '"') if '<a rel="next"' not in page: return @@ -126,7 +121,7 @@ class NijieUserExtractor(NijieExtractor): r"/members(?:_illust)?\.php\?id=(\d+)") test = ( ("https://nijie.info/members_illust.php?id=44", { - "url": "585d821df4716b1098660a0be426d01db4b65f2a", + "url": "66c4ff94c6e77c0765dd88f2d8c663055fda573e", "keyword": "d629c69e3172db1d7e026145e8eb640ac31ac16a", }), ("https://nijie.info/members_illust.php?id=43", { @@ -174,7 +169,7 @@ class NijieImageExtractor(NijieExtractor): r"/view(?:_popup)?\.php\?id=(\d+)") test = ( ("https://nijie.info/view.php?id=70720", { - "url": "a10d4995645b5f260821e32c60a35f73546c2699", + "url": "5497f897311397dafa188521258624346a0af2a3", "keyword": "408393d010307c76d52cbd0a4368d6d357805aea", "content": "d85e3ea896ed5e4da0bca2390ad310a4df716ca6", }), @@ -190,10 +185,8 @@ class NijieImageExtractor(NijieExtractor): self.page = "" def get_job_metadata(self): - response = self.request(self.view_url + self.image_id, expect=(404,)) - if response.status_code == 404: - raise exception.NotFoundError("image") - self.page = response.text + self.page = self.request( + self.view_url + self.image_id, notfound="image").text self.user_id = text.extract( self.page, '"sameAs": "https://nijie.info/members.php?id=', '"')[0] return NijieExtractor.get_job_metadata(self) diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index fa8cd48..f5b8869 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -228,14 +228,14 @@ class PinterestAPI(): params = {"data": json.dumps({"options": options}), "source_url": ""} response = self.extractor.request( - url, params=params, headers=self.HEADERS, expect=range(400, 500)) + url, params=params, headers=self.HEADERS, fatal=False) try: data = response.json() except ValueError: data = {} - if 200 <= response.status_code < 400 and not response.history: + if response.status_code < 400 and not response.history: return data if response.status_code == 404 or response.history: diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index af29c4b..76d4dc4 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -143,9 +143,7 @@ class PixivMeExtractor(PixivExtractor): def items(self): url = "https://pixiv.me/" + self.account response = self.request( - url, method="HEAD", 
allow_redirects=False, expect=(404,)) - if response.status_code == 404: - raise exception.NotFoundError("user") + url, method="HEAD", allow_redirects=False, notfound="user") yield Message.Version, 1 yield Message.Queue, response.headers["Location"], {} @@ -445,7 +443,7 @@ class PixivAppAPI(): data["password"] = password response = self.extractor.request( - url, method="POST", data=data, expect=(400,)) + url, method="POST", data=data, fatal=False) if response.status_code >= 400: raise exception.AuthenticationError() @@ -491,10 +489,9 @@ class PixivAppAPI(): url = "https://app-api.pixiv.net/" + endpoint self.login() - response = self.extractor.request( - url, params=params, expect=range(400, 500)) + response = self.extractor.request(url, params=params, fatal=False) - if 200 <= response.status_code < 400: + if response.status_code < 400: return response.json() if response.status_code == 404: raise exception.NotFoundError() diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 0c5a924..2ba4b99 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -11,7 +11,6 @@ from .common import Extractor, Message from .. import text, util, extractor, exception from ..cache import cache -import datetime import time @@ -235,8 +234,7 @@ class RedditAPI(): url = "https://oauth.reddit.com" + endpoint params["raw_json"] = 1 self.authenticate() - response = self.extractor.request( - url, params=params, expect=range(400, 500)) + response = self.extractor.request(url, params=params, fatal=False) remaining = response.headers.get("x-ratelimit-remaining") if remaining and float(remaining) < 2: wait = int(response.headers["x-ratelimit-reset"]) @@ -252,12 +250,9 @@ class RedditAPI(): return data def _pagination(self, endpoint, params, _empty=()): - date_fmt = self.extractor.config("date-format", "%Y-%m-%dT%H:%M:%S") - date_min = self._parse_datetime("date-min", 0, date_fmt) - date_max = self._parse_datetime("date-max", 253402210800, date_fmt) - id_min = self._parse_id("id-min", 0) id_max = self._parse_id("id-max", 2147483647) + date_min, date_max = self.extractor._get_date_min_max(0, 253402210800) while True: data = self._call(endpoint, params)["data"] @@ -294,16 +289,6 @@ class RedditAPI(): if link_id and extra: yield from self.morechildren(link_id, extra) - def _parse_datetime(self, key, default, fmt): - ts = self.extractor.config(key, default) - if isinstance(ts, str): - try: - ts = int(datetime.datetime.strptime(ts, fmt).timestamp()) - except ValueError as exc: - self.log.warning("Unable to parse '%s': %s", key, exc) - ts = default - return ts - def _parse_id(self, key, default): sid = self.extractor.config(key) return self._decode(sid.rpartition("_")[2].lower()) if sid else default diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py index 22b2b63..55eda9f 100644 --- a/gallery_dl/extractor/sankakucomplex.py +++ b/gallery_dl/extractor/sankakucomplex.py @@ -110,8 +110,8 @@ class SankakucomplexTagExtractor(SankakucomplexExtractor): yield Message.Version, 1 while True: url = "{}/{}/page/{}/".format(self.root, self.path, pnum) - response = self.request(url, expect=(404,)) - if response.status_code == 404: + response = self.request(url, fatal=False) + if response.status_code >= 400: return for url in text.extract_iter(response.text, 'data-direct="', '"'): if url != last: diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py index f63c999..0d92573 100644 --- a/gallery_dl/extractor/seiga.py +++ 
b/gallery_dl/extractor/seiga.py @@ -43,9 +43,7 @@ class SeigaExtractor(Extractor): """Get url for an image with id 'image_id'""" url = "{}/image/source/{}".format(self.root, image_id) response = self.request( - url, method="HEAD", allow_redirects=False, expect=(404,)) - if response.status_code == 404: - raise exception.NotFoundError("image") + url, method="HEAD", allow_redirects=False, notfound="image") return response.headers["Location"].replace("/o/", "/priv/", 1) def login(self): diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py index aa2b16b..afd4eaa 100644 --- a/gallery_dl/extractor/sexcom.py +++ b/gallery_dl/extractor/sexcom.py @@ -23,9 +23,9 @@ class SexcomExtractor(Extractor): def items(self): yield Message.Version, 1 yield Message.Directory, self.metadata() - for url in self.pins(): - pin = self._parse_pin(url) - yield Message.Url, pin["url"], pin + for pin in map(self._parse_pin, self.pins()): + if pin: + yield Message.Url, pin["url"], pin def metadata(self): return {} @@ -49,8 +49,13 @@ class SexcomExtractor(Extractor): return url = text.urljoin(self.root, url) - def _parse_pin(self, pin_url): - extr = text.extract_from(self.request(pin_url).text) + def _parse_pin(self, url): + response = self.request(url, fatal=False) + if response.status_code >= 400: + self.log.warning('Unable to fetch %s ("%s: %s")', + url, response.status_code, response.reason) + return None + extr = text.extract_from(response.text) data = {} data["thumbnail"] = extr('itemprop="thumbnail" content="', '"') @@ -88,10 +93,10 @@ class SexcomExtractor(Extractor): class SexcomPinExtractor(SexcomExtractor): - """Extractor a pinned image or video on www.sex.com""" + """Extractor for a pinned image or video on www.sex.com""" subcategory = "pin" directory_fmt = ("{category}",) - pattern = r"(?:https?://)?(?:www\.)?sex\.com/pin/(\d+)" + pattern = r"(?:https?://)?(?:www\.)?sex\.com/pin/(\d+)(?!.*#related$)" test = ( # picture ("https://www.sex.com/pin/56714360/", { @@ -124,6 +129,10 @@ class SexcomPinExtractor(SexcomExtractor): ("https://www.sex.com/pin/55847384-very-nicely-animated/", { "pattern": "ytdl:https://www.pornhub.com/embed/ph56ef24b6750f2", }), + # 404 + ("https://www.sex.com/pin/55847385/", { + "count": 0, + }), ) def __init__(self, match): @@ -134,6 +143,25 @@ class SexcomPinExtractor(SexcomExtractor): return ("{}/pin/{}/".format(self.root, self.pin_id),) +class SexcomRelatedPinExtractor(SexcomPinExtractor): + """Extractor for related pins on www.sex.com""" + subcategory = "related-pin" + directory_fmt = ("{category}", "related {original_pin[pin_id]}") + pattern = r"(?:https?://)?(?:www\.)?sex\.com/pin/(\d+).*#related$" + test = ("https://www.sex.com/pin/56714360/#related", { + "count": 24, + }) + + def metadata(self): + pin = self._parse_pin(SexcomPinExtractor.pins(self)[0]) + return {"original_pin": pin} + + def pins(self): + url = "{}/pin/related?pinId={}&limit=24&offset=0".format( + self.root, self.pin_id) + return self._pagination(url) + + class SexcomBoardExtractor(SexcomExtractor): """Extractor for pins from a board on www.sex.com""" subcategory = "board" diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py index 35895bb..b2498a0 100644 --- a/gallery_dl/extractor/shopify.py +++ b/gallery_dl/extractor/shopify.py @@ -10,7 +10,6 @@ from .common import Extractor, Message, SharedConfigMixin, generate_extractors from .. 
import text -import time import re @@ -24,19 +23,9 @@ class ShopifyExtractor(SharedConfigMixin, Extractor): Extractor.__init__(self, match) self.item_url = self.root + match.group(1) - def request(self, url, method="GET", expect=range(400, 500), **kwargs): - tries = 0 - kwargs["expect"] = expect - while True: - response = Extractor.request(self, url, method, **kwargs) - if response.status_code not in (429, 430): - return response - tries += 1 - waittime = 2 ** (tries + 2) - self.log.warning( - "HTTP status %s: %s - Waiting for %d seconds", - response.status_code, response.reason, waittime) - time.sleep(waittime) + def request(self, url, **kwargs): + kwargs["retries"] = float("inf") + return Extractor.request(self, url, **kwargs) def items(self): data = self.metadata() @@ -45,9 +34,10 @@ class ShopifyExtractor(SharedConfigMixin, Extractor): headers = {"X-Requested-With": "XMLHttpRequest"} for url in self.products(): - response = self.request(url + ".json", headers=headers) + response = self.request( + url + ".json", headers=headers, fatal=False) if response.status_code >= 400: - self.log.warning('Skipping %s ("%d: %s")', + self.log.warning('Skipping %s ("%s: %s")', url, response.status_code, response.reason) continue product = response.json()["product"] @@ -89,10 +79,14 @@ class ShopifyCollectionExtractor(ShopifyExtractor): while True: page = self.request(self.item_url, params=params).text urls = search_re.findall(page) + last = None if not urls: return for path in urls: + if last == path: + continue + last = path yield self.root + path params["page"] += 1 @@ -113,7 +107,7 @@ EXTRACTORS = { "pattern": r"(?:www\.)?fashionnova\.com", "test-product": ( ("https://www.fashionnova.com/products/essential-slide-red", { - "pattern": r"https?://cdn\.shopify.com/", + "pattern": r"https?://cdn\d*\.shopify.com/", "count": 3, }), ("https://www.fashionnova.com/collections/flats/products/name"), diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py index 44dc6fe..5ad372d 100644 --- a/gallery_dl/extractor/simplyhentai.py +++ b/gallery_dl/extractor/simplyhentai.py @@ -23,7 +23,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor): (("https://original-work.simply-hentai.com" "/amazon-no-hiyaku-amazon-elixir"), { "url": "258289249990502c3138719cb89e995a60861e49", - "keyword": "18ab9defca53dbb2aeb7965193e93e0ea125b76b", + "keyword": "eba83ccdbab3022a2280c77aa747f9458196138b", }), ("https://www.simply-hentai.com/notfound", { "exception": exception.GalleryDLException, @@ -40,30 +40,26 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor): self.session.headers["Referer"] = url def metadata(self, page): - extr = text.extract - title , pos = extr(page, '<meta property="og:title" content="', '"') + extr = text.extract_from(page) + split = text.split_html + + title = extr('<meta property="og:title" content="', '"') if not title: raise exception.NotFoundError("gallery") - gid , pos = extr(page, '/Album/', '/', pos) - series, pos = extr(page, 'box-title">Series</div>', '</div>', pos) - lang , pos = extr(page, 'box-title">Language</div>', '</div>', pos) - chars , pos = extr(page, 'box-title">Characters</div>', '</div>', pos) - tags , pos = extr(page, 'box-title">Tags</div>', '</div>', pos) - artist, pos = extr(page, 'box-title">Artists</div>', '</div>', pos) - date , pos = extr(page, 'Uploaded', '</div>', pos) - lang = text.remove_html(lang) if lang else None - - return { - "gallery_id": text.parse_int(gid), + data = { "title" : text.unescape(title), - "artist" : 
text.split_html(artist),
-            "parody"    : text.split_html(series),
-            "characters": text.split_html(chars),
-            "tags"      : text.split_html(tags),
-            "lang"      : util.language_to_code(lang),
-            "language"  : lang,
-            "date"      : text.remove_html(date),
+            "gallery_id": text.parse_int(extr('/Album/', '/')),
+            "parody"    : split(extr('box-title">Series</div>', '</div>')),
+            "language"  : text.remove_html(extr(
+                'box-title">Language</div>', '</div>')) or None,
+            "characters": split(extr('box-title">Characters</div>', '</div>')),
+            "tags"      : split(extr('box-title">Tags</div>', '</div>')),
+            "artist"    : split(extr('box-title">Artists</div>', '</div>')),
+            "date"      : text.parse_datetime(text.remove_html(
+                extr('Uploaded', '</div>')), "%d.%m.%Y"),
         }
+        data["lang"] = util.language_to_code(data["language"])
+        return data
 
     def images(self, _):
         url = self.chapter_url + "/all-pages"
@@ -102,12 +98,11 @@ class SimplyhentaiImageExtractor(Extractor):
         self.type = match.group(2)
 
     def items(self):
-        page = self.request(self.page_url).text
-        url_search = 'data-src="' if self.type == "image" else '<source src="'
-
-        title, pos = text.extract(page, '"og:title" content="', '"')
-        descr, pos = text.extract(page, '"og:description" content="', '"', pos)
-        url  , pos = text.extract(page, url_search, '"', pos)
+        extr = text.extract_from(self.request(self.page_url).text)
+        title = extr('"og:title" content="', '"')
+        descr = extr('"og:description" content="', '"')
+        url = extr('&quot;image&quot;:&quot;', '&')
+        url = extr("&quot;content&quot;:&quot;", "&") or url
 
         tags = text.extract(descr, " tagged with ", " online for free ")[0]
         if tags:
@@ -140,13 +135,13 @@ class SimplyhentaiVideoExtractor(Extractor):
         ("https://videos.simply-hentai.com/creamy-pie-episode-02", {
             "pattern": r"https://www\.googleapis\.com/drive/v3/files"
                        r"/0B1ecQ8ZVLm3JcHZzQzBnVy1ZUmc\?alt=media&key=[\w-]+",
-            "keyword": "29d63987fed33f0a9f4b3786d1d71b03d793250a",
+            "keyword": "706790708b14773efc1e075ddd3b738a375348a5",
             "count": 1,
         }),
         (("https://videos.simply-hentai.com"
           "/1715-tifa-in-hentai-gang-bang-3d-movie"), {
             "url": "ad9a36ae06c601b6490e3c401834b4949d947eb0",
-            "keyword": "c561341aa3c6999f615abf1971d28fb2a83da2a7",
+            "keyword": "f9dad94fbde9c95859e631ff4f07297a9567b874",
         }),
     )
 
@@ -178,8 +173,9 @@ class SimplyhentaiVideoExtractor(Extractor):
             "title": text.unescape(title),
             "episode": text.parse_int(episode),
             "tags": text.split_html(tags)[::2],
-            "date": text.remove_html(date),
             "type": "video",
+            "date": text.parse_datetime(text.remove_html(
+                date), "%B %d, %Y %H:%M"),
         })
         yield Message.Version, 1
diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py
index 80348ae..2e6508c 100644
--- a/gallery_dl/extractor/smugmug.py
+++ b/gallery_dl/extractor/smugmug.py
@@ -69,11 +69,11 @@ class SmugmugAlbumExtractor(SmugmugExtractor):
     archive_fmt = "a_{Album[AlbumKey]}_{Image[ImageKey]}"
     pattern = r"smugmug:album:([^:]+)$"
     test = (
-        ("smugmug:album:ddvxpg", {
-            "url": "0429e9bf50ee600674e448934e3882ca1761ae7b",
+        ("smugmug:album:cr4C7f", {
+            "url": "1436ee98d5797b308ecce5862e4885944f59c03c",
         }),
         # empty
-        ("smugmug:album:SXvjbW", {
+        ("smugmug:album:Fb7hMs", {
             "count": 0,
         }),
         # no "User"
@@ -109,10 +109,10 @@ class SmugmugImageExtractor(SmugmugExtractor):
     archive_fmt = "{Image[ImageKey]}"
     pattern = BASE_PATTERN + r"(?:/[^/?&#]+)+/i-([^/?&#-]+)"
     test = (
-        ("https://acapella.smugmug.com/Micro-Macro/Drops/i-g2Dmf9z", {
-            "url": "78f0bf3516b6d670b7319216bdeccb35942ca4cf",
-            "keyword": "b298ef7ed2b1918263b6a7dc6f56e54401584381",
-            "content": "64a8f69a1d824921eebbdf2420087937adfa45cd",
+
("https://tdm.smugmug.com/Nature/Dove/i-kCsLJT6", { + "url": "f624ad7293afd6412a7d34e3950a118596c36c85", + "keyword": "ea70e93be5067dca988d871dcf9afac491a189a4", + "content": "ecbd9d7b4f75a637abc8d35319be9ec065a44eb0", }), # video ("https://tstravels.smugmug.com/Dailies/Daily-Dose-2015/i-39JFNzB", { @@ -142,12 +142,12 @@ class SmugmugPathExtractor(SmugmugExtractor): subcategory = "path" pattern = BASE_PATTERN + r"((?:/[^/?&#a-fh-mo-z][^/?&#]*)*)/?$" test = ( - ("https://acapella.smugmug.com/Micro-Macro/Drops/", { - "pattern": "smugmug:album:ddvxpg$", + ("https://tdm.smugmug.com/Nature/Dove", { + "pattern": "smugmug:album:cr4C7f$", }), - ("https://acapella.smugmug.com/", { + ("https://tdm.smugmug.com/", { "pattern": SmugmugAlbumExtractor.pattern, - "url": "797eb1cbbf5ad8ecac8ee4eedc6466ed77a65d68", + "url": "1640028712875b90974e5aecd91b60e6de6138c7", }), # gallery node without owner ("https://www.smugmug.com/gallery/n-GLCjnD/", { diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py index 62a9173..03ee144 100644 --- a/gallery_dl/extractor/tsumino.py +++ b/gallery_dl/extractor/tsumino.py @@ -107,9 +107,9 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor): def images(self, page): url = "{}/Read/Load/?q={}".format(self.root, self.gallery_id) headers = {"Referer": self.chapter_url} - response = self.request(url, headers=headers, expect=(404,)) + response = self.request(url, headers=headers, fatal=False) - if response.status_code == 404: + if response.status_code >= 400: url = "{}/Read/View/{}".format(self.root, self.gallery_id) self.log.error( "Failed to get gallery JSON data. Visit '%s' in a browser " diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 5679cdc..024d6e9 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -65,11 +65,15 @@ class TumblrExtractor(Extractor): if self.reblogs == "same-blog": self._skip_reblog = self._skip_reblog_same_blog + self.date_min, self.api.before = self._get_date_min_max(0, None) + def items(self): blog = None yield Message.Version, 1 for post in self.posts(): + if self.date_min > post["timestamp"]: + return if post["type"] not in self.types: continue if not blog: @@ -207,7 +211,7 @@ class TumblrUserExtractor(TumblrExtractor): ("http://demo.tumblr.com/", { "pattern": (r"https?://(?:$|" r"\d+\.media\.tumblr\.com/.+_1280\.jpg|" - r"w+\.tumblr\.com/audio_file/demo/\d+/tumblr_\w+)"), + r"a\.tumblr\.com/tumblr_\w+)"), "count": 3, "options": (("posts", "all"), ("external", True)) }), @@ -223,6 +227,11 @@ class TumblrUserExtractor(TumblrExtractor): "count": 2, "keyword": {"tags": ["test", "private", "hidden"]}, }), + ("https://mikf123.tumblr.com/", { # date-min/-max/-format (#337) + "count": 4, + "options": (("date-min", "201804"), ("date-max", "201805"), + ("date-format", "%Y%m")) + }), ("https://demo.tumblr.com/page/2"), ("https://demo.tumblr.com/archive"), ("tumblr:http://www.b-authentique.com/"), @@ -280,6 +289,7 @@ class TumblrPostExtractor(TumblrExtractor): TumblrExtractor.__init__(self, match) self.post_id = match.group(3) self.reblogs = True + self.date_min = 0 def posts(self): return self.api.posts(self.blog, {"id": self.post_id}) @@ -328,7 +338,7 @@ class TumblrAPI(oauth.OAuth1API): def __init__(self, extractor): oauth.OAuth1API.__init__(self, extractor) - self.posts_type = None + self.posts_type = self.before = None def info(self, blog): """Return general information about a blog""" @@ -350,6 +360,8 @@ class TumblrAPI(oauth.OAuth1API): 
params.update({"offset": 0, "limit": 50, "reblog_info": "true"}) if self.posts_type: params["type"] = self.posts_type + if self.before: + params["before"] = self.before while True: data = self._call(blog, "posts", params) self.BLOG_CACHE[blog] = data["blog"] @@ -360,7 +372,7 @@ class TumblrAPI(oauth.OAuth1API): def likes(self, blog): """Retrieve liked posts""" - params = {"limit": 50} + params = {"limit": "50", "before": self.before} while True: posts = self._call(blog, "likes", params)["liked_posts"] if not posts: diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index ad4dc46..ccba640 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -11,6 +11,7 @@ from .common import Extractor, Message from .. import text, exception from ..cache import cache +import re class TwitterExtractor(Extractor): @@ -26,8 +27,13 @@ class TwitterExtractor(Extractor): Extractor.__init__(self, match) self.user = match.group(1) self.retweets = self.config("retweets", True) + self.content = self.config("content", False) self.videos = self.config("videos", False) + if self.content: + self._emoji_sub = re.compile( + r'<img class="Emoji [^>]+ alt="([^"]+)"[^>]*>').sub + def items(self): self.login() yield Message.Version, 1 @@ -35,6 +41,7 @@ class TwitterExtractor(Extractor): for tweet in self.tweets(): data = self._data_from_tweet(tweet) + if not self.retweets and data["retweet_id"]: continue @@ -87,10 +94,9 @@ class TwitterExtractor(Extractor): raise exception.AuthenticationError() return self.session.cookies - @staticmethod - def _data_from_tweet(tweet): + def _data_from_tweet(self, tweet): extr = text.extract_from(tweet) - return { + data = { "tweet_id" : text.parse_int(extr('data-tweet-id="' , '"')), "retweet_id": text.parse_int(extr('data-retweet-id="', '"')), "retweeter" : extr('data-retweeter="' , '"'), @@ -99,6 +105,14 @@ class TwitterExtractor(Extractor): "user_id" : text.parse_int(extr('data-user-id="' , '"')), "date" : text.parse_timestamp(extr('data-time="', '"')), } + if self.content: + content = extr('<div class="js-tweet-text-container">', '\n</div>') + if '<img class="Emoji ' in content: + content = self._emoji_sub(r"\1", content) + content = text.unescape(text.remove_html(content, "", "")) + cl, _, cr = content.rpartition("pic.twitter.com/") + data["content"] = cl if cl and len(cr) < 16 else content + return data def _tweets_from_api(self, url): params = { @@ -186,6 +200,11 @@ class TwitterTweetExtractor(TwitterExtractor): "options": (("videos", True),), "pattern": r"ytdl:https://twitter.com/perrypumas/status/\d+", }), + # content with emoji, newlines, hashtags (#338) + ("https://twitter.com/yumi_san0112/status/1151144618936823808", { + "options": (("content", True),), + "keyword": "b13b6c4cd0b0c15b2ea7685479e7fedde3c47b9e", + }), ) def __init__(self, match): @@ -199,4 +218,4 @@ class TwitterTweetExtractor(TwitterExtractor): url = "{}/{}/status/{}".format(self.root, self.user, self.tweet_id) page = self.request(url).text return (text.extract( - page, '<div class="tweet ', '<ul class="stats')[0],) + page, '<div class="tweet ', 'class="js-tweet-stats-container')[0],) diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py index 7eec18b..e253b7f 100644 --- a/gallery_dl/extractor/xvideos.py +++ b/gallery_dl/extractor/xvideos.py @@ -18,12 +18,6 @@ class XvideosExtractor(Extractor): category = "xvideos" root = "https://www.xvideos.com" - def get_page(self, url, codes=(403, 404)): - response = self.request(url, 
expect=codes) - if response.status_code in codes: - raise exception.NotFoundError(self.subcategory) - return response.text - class XvideosGalleryExtractor(XvideosExtractor): """Extractor for user profile galleries from xvideos.com""" @@ -37,7 +31,7 @@ class XvideosGalleryExtractor(XvideosExtractor): (("https://www.xvideos.com/profiles" "/pervertedcouple/photos/751031/random_stuff"), { "url": "4f0d992e5dc39def2c3ac8e099d17bf09e76e3c7", - "keyword": "8d637b372c6231cc4ada92dd5918db5fdbd06520", + "keyword": "65979d63a69576cf692b41d5fbbd995cc40a51b9", }), ("https://www.xvideos.com/profiles/pervertedcouple/photos/751032/", { "exception": exception.NotFoundError, @@ -50,7 +44,7 @@ class XvideosGalleryExtractor(XvideosExtractor): def items(self): url = "{}/profiles/{}/photos/{}".format(self.root, self.user, self.gid) - page = self.get_page(url) + page = self.request(url, notfound=self.subcategory).text data = self.get_metadata(page) imgs = self.get_images(page) data["count"] = len(imgs) @@ -113,7 +107,7 @@ class XvideosUserExtractor(XvideosExtractor): def items(self): url = "{}/profiles/{}".format(self.root, self.user) - page = self.get_page(url) + page = self.request(url, notfound=self.subcategory).text data = json.loads(text.extract( page, "xv.conf=", ";</script>")[0])["data"] diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 667b9b3..20823a6 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -29,15 +29,9 @@ class Job(): extr.log.job = self extr.log.debug("Using %s for '%s'", extr.__class__.__name__, extr.url) - # url predicates - self.pred_url = self._prepare_predicates( - "image", [util.UniquePredicate()], True) + self.pred_url = self._prepare_predicates("image", True) + self.pred_queue = self._prepare_predicates("chapter", False) - # queue predicates - self.pred_queue = self._prepare_predicates( - "chapter", [], False) - - # category transfer if parent and parent.extractor.config( "category-transfer", parent.extractor.categorytransfer): self.extractor.category = parent.extractor.category @@ -142,7 +136,12 @@ class Job(): if self.userkwds: kwdict.update(self.userkwds) - def _prepare_predicates(self, target, predicates, skip=True): + def _prepare_predicates(self, target, skip=True): + predicates = [] + + if self.extractor.config(target + "-unique"): + predicates.append(util.UniquePredicate()) + pfilter = self.extractor.config(target + "-filter") if pfilter: try: @@ -191,14 +190,18 @@ class DownloadJob(Job): def handle_url(self, url, keywords, fallback=None): """Download the resource specified in 'url'""" + postprocessors = self.postprocessors + pathfmt = self.pathfmt + archive = self.archive + # prepare download - self.pathfmt.set_keywords(keywords) + pathfmt.set_keywords(keywords) - if self.postprocessors: - for pp in self.postprocessors: - pp.prepare(self.pathfmt) + if postprocessors: + for pp in postprocessors: + pp.prepare(pathfmt) - if self.pathfmt.exists(self.archive): + if pathfmt.exists(archive): self.handle_skip() return @@ -215,24 +218,24 @@ class DownloadJob(Job): break else: # download failed - self.log.error( - "Failed to download %s", self.pathfmt.filename or url) + self.log.error("Failed to download %s", + pathfmt.filename or url) return - if not self.pathfmt.temppath: + if not pathfmt.temppath: self.handle_skip() return # run post processors - if self.postprocessors: - for pp in self.postprocessors: - pp.run(self.pathfmt) + if postprocessors: + for pp in postprocessors: + pp.run(pathfmt) # download succeeded - self.pathfmt.finalize() - 
self.out.success(self.pathfmt.path, 0) - if self.archive: - self.archive.add(keywords) + pathfmt.finalize() + self.out.success(pathfmt.path, 0) + if archive: + archive.add(keywords) self._skipcnt = 0 def handle_urllist(self, urls, keywords): @@ -281,20 +284,22 @@ class DownloadJob(Job): def get_downloader(self, scheme): """Return a downloader suitable for 'scheme'""" - if scheme == "https": - scheme = "http" try: return self.downloaders[scheme] except KeyError: pass klass = downloader.find(scheme) - if klass and config.get(("downloader", scheme, "enabled"), True): + if klass and config.get(("downloader", klass.scheme, "enabled"), True): instance = klass(self.extractor, self.out) else: instance = None self.log.error("'%s:' URLs are not supported/enabled", scheme) - self.downloaders[scheme] = instance + + if klass.scheme == "http": + self.downloaders["http"] = self.downloaders["https"] = instance + else: + self.downloaders[scheme] = instance return instance def initialize(self, keywords=None): @@ -302,7 +307,10 @@ class DownloadJob(Job): self.pathfmt = util.PathFormat(self.extractor) if keywords: self.pathfmt.set_directory(keywords) + self.sleep = self.extractor.config("sleep") + if not self.extractor.config("download", True): + self.download = self.pathfmt.fix_extension skip = self.extractor.config("skip", True) if skip: diff --git a/gallery_dl/oauth.py b/gallery_dl/oauth.py index 58126ac..8a12755 100644 --- a/gallery_dl/oauth.py +++ b/gallery_dl/oauth.py @@ -126,7 +126,7 @@ class OAuth1API(): self.session = extractor.session self.api_key = api_key - def request(self, url, method="GET", *, expect=range(400, 500), **kwargs): - kwargs["expect"] = expect + def request(self, url, method="GET", **kwargs): + kwargs["fatal"] = False kwargs["session"] = self.session return self.extractor.request(url, method, **kwargs) diff --git a/gallery_dl/option.py b/gallery_dl/option.py index f23b79d..af70fc8 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -11,6 +11,7 @@ import argparse import logging import json +import sys from . import job, version @@ -26,6 +27,14 @@ class ConfigConstAction(argparse.Action): namespace.options.append(((self.dest,), self.const)) +class DeprecatedConfigConstAction(argparse.Action): + """Set argparse const values as config values + deprecation warning""" + def __call__(self, parser, namespace, values, option_string=None): + print("warning: {} is deprecated. Use {} instead.".format( + "/".join(self.option_strings), self.choices), file=sys.stderr) + namespace.options.append(((self.dest,), self.const)) + + class ParseAction(argparse.Action): """Parse <key>=<value> options and set them as config values""" def __call__(self, parser, namespace, values, option_string=None): @@ -164,8 +173,16 @@ def build_parser(): ) downloader.add_argument( "-R", "--retries", - dest="retries", metavar="RETRIES", type=int, action=ConfigAction, - help="Number of retries (default: 5)", + dest="retries", metavar="N", type=int, action=ConfigAction, + help=("Maximum number of retries for failed HTTP requests " + "or -1 for infinite retries (default: 4)"), + ) + downloader.add_argument( + "-A", "--abort", + dest="abort", metavar="N", type=int, + help=("Abort extractor run after N consecutive file downloads have " + "been skipped, e.g. 
if files with the same filename already " + "exist"), ) downloader.add_argument( "--http-timeout", @@ -183,15 +200,26 @@ def build_parser(): help="Do not use .part files", ) downloader.add_argument( + "--no-mtime", + dest="mtime", nargs=0, action=ConfigConstAction, const=False, + help=("Do not set file modification times according to " + "Last-Modified HTTP response headers") + ) + downloader.add_argument( + "--no-download", + dest="download", nargs=0, action=ConfigConstAction, const=False, + help=("Do not download any files") + ) + downloader.add_argument( "--no-check-certificate", dest="verify", nargs=0, action=ConfigConstAction, const=False, help="Disable HTTPS certificate validation", ) downloader.add_argument( "--abort-on-skip", - dest="skip", nargs=0, action=ConfigConstAction, const="abort", - help=("Abort extractor run if a file download would normally be " - "skipped, i.e. if a file with the same filename already exists"), + action=DeprecatedConfigConstAction, + dest="skip", nargs=0, const="abort", choices="-A/--abort", + help=argparse.SUPPRESS, ) configuration = parser.add_argument_group("Configuration Options") @@ -294,6 +322,12 @@ def build_parser(): action="append_const", const={"name": "metadata", "mode": "tags"}, help="Write image tags to separate text files", ) + postprocessor.add_argument( + "--mtime-from-date", + dest="postprocessors", + action="append_const", const={"name": "mtime"}, + help="Set file modification times according to 'date' metadata", + ) parser.add_argument( "urls", diff --git a/gallery_dl/output.py b/gallery_dl/output.py index 327b69a..87c5006 100644 --- a/gallery_dl/output.py +++ b/gallery_dl/output.py @@ -35,6 +35,30 @@ class Logger(logging.Logger): return rv +class Formatter(logging.Formatter): + """Custom formatter that supports different formats per loglevel""" + + def __init__(self, fmt, datefmt): + if not isinstance(fmt, dict): + fmt = {"debug": fmt, "info": fmt, "warning": fmt, "error": fmt} + self.formats = fmt + self.datefmt = datefmt + + def format(self, record): + record.message = record.getMessage() + fmt = self.formats[record.levelname] + if "{asctime" in fmt: + record.asctime = self.formatTime(record, self.datefmt) + msg = fmt.format_map(record.__dict__) + if record.exc_info and not record.exc_text: + record.exc_text = self.formatException(record.exc_info) + if record.exc_text: + msg = msg + "\n" + record.exc_text + if record.stack_info: + msg = msg + "\n" + record.stack_info + return msg + + def initialize_logging(loglevel): """Setup basic logging functionality before configfiles have been loaded""" # convert levelnames to lowercase @@ -46,7 +70,7 @@ def initialize_logging(loglevel): logging.Logger.manager.setLoggerClass(Logger) # setup basic logging to stderr - formatter = logging.Formatter(LOG_FORMAT, LOG_FORMAT_DATE, "{") + formatter = Formatter(LOG_FORMAT, LOG_FORMAT_DATE) handler = logging.StreamHandler() handler.setFormatter(formatter) handler.setLevel(loglevel) @@ -80,13 +104,11 @@ def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL): "%s: missing or invalid path (%s)", key, exc) return None - level = opts.get("level", lvl) - logfmt = opts.get("format", fmt) - datefmt = opts.get("format-date", LOG_FORMAT_DATE) - formatter = logging.Formatter(logfmt, datefmt, "{") - handler.setFormatter(formatter) - handler.setLevel(level) - + handler.setLevel(opts.get("level", lvl)) + handler.setFormatter(Formatter( + opts.get("format", fmt), + opts.get("format-date", LOG_FORMAT_DATE), + )) return handler @@ -100,10 +122,10 @@ def 
configure_logging_handler(key, handler): if handler.level == LOG_LEVEL and "level" in opts: handler.setLevel(opts["level"]) if "format" in opts or "format-date" in opts: - logfmt = opts.get("format", LOG_FORMAT) - datefmt = opts.get("format-date", LOG_FORMAT_DATE) - formatter = logging.Formatter(logfmt, datefmt, "{") - handler.setFormatter(formatter) + handler.setFormatter(Formatter( + opts.get("format", LOG_FORMAT), + opts.get("format-date", LOG_FORMAT_DATE), + )) # -------------------------------------------------------------------- diff --git a/gallery_dl/postprocessor/__init__.py b/gallery_dl/postprocessor/__init__.py index 093f8e0..e63d442 100644 --- a/gallery_dl/postprocessor/__init__.py +++ b/gallery_dl/postprocessor/__init__.py @@ -15,6 +15,7 @@ modules = [ "classify", "exec", "metadata", + "mtime", "ugoira", "zip", ] @@ -27,15 +28,18 @@ def find(name): try: return _cache[name] except KeyError: - klass = None + pass + + klass = None + if name in modules: # prevent unwanted imports try: - if name in modules: # prevent unwanted imports - module = importlib.import_module("." + name, __package__) - klass = module.__postprocessor__ - except (ImportError, AttributeError, TypeError): + module = importlib.import_module("." + name, __package__) + except ImportError: pass - _cache[name] = klass - return klass + else: + klass = module.__postprocessor__ + _cache[name] = klass + return klass # -------------------------------------------------------------------- diff --git a/gallery_dl/postprocessor/mtime.py b/gallery_dl/postprocessor/mtime.py new file mode 100644 index 0000000..03d2f11 --- /dev/null +++ b/gallery_dl/postprocessor/mtime.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Use metadata as file modification time""" + +from .common import PostProcessor +from ..text import parse_int + + +class MtimePP(PostProcessor): + + def __init__(self, pathfmt, options): + PostProcessor.__init__(self) + self.key = options.get("key", "date") + + def run(self, pathfmt): + mtime = pathfmt.keywords.get(self.key) + ts = getattr(mtime, "timestamp", None) + pathfmt.keywords["_mtime"] = ts() if ts else parse_int(mtime) + + +__postprocessor__ = MtimePP diff --git a/gallery_dl/text.py b/gallery_dl/text.py index 151fa30..81e87b5 100644 --- a/gallery_dl/text.py +++ b/gallery_dl/text.py @@ -36,12 +36,15 @@ def clean_xml(xmldata, repl=""): return xmldata -def remove_html(txt): +def remove_html(txt, repl=" ", sep=" "): """Remove html-tags from a string""" try: - return " ".join(re.sub("<[^>]+>", " ", txt).split()) + txt = re.sub("<[^>]+>", repl, txt) except TypeError: return "" + if sep: + return sep.join(txt.split()) + return txt.strip() def split_html(txt, sep=None): diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 5c0ae41..14ae3d2 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -12,6 +12,7 @@ import re import os import sys import json +import time import shutil import string import _string @@ -20,6 +21,7 @@ import datetime import operator import itertools import urllib.parse +from email.utils import mktime_tz, parsedate_tz from . 
import text, exception @@ -530,7 +532,7 @@ class PathFormat(): self.basedirectory = expand_path( extractor.config("base-directory", (".", "gallery-dl"))) - if os.altsep: + if os.altsep and os.altsep in self.basedirectory: self.basedirectory = self.basedirectory.replace(os.altsep, os.sep) def open(self, mode="wb"): @@ -539,13 +541,9 @@ class PathFormat(): def exists(self, archive=None): """Return True if the file exists on disk or in 'archive'""" - if (archive and archive.check(self.keywords) or - self.has_extension and os.path.exists(self.realpath)): - if not self.has_extension: - # adjust display name - self.set_extension("") - if self.path[-1] == ".": - self.path = self.path[:-1] + if archive and archive.check(self.keywords): + return self.fix_extension() + if self.has_extension and os.path.exists(self.realpath): return True return False @@ -588,6 +586,14 @@ class PathFormat(): self.keywords["extension"] = extension self.build_path() + def fix_extension(self, _=None): + if not self.has_extension: + self.set_extension("") + if self.path[-1] == ".": + self.path = self.path[:-1] + self.temppath = self.realpath = self.realpath[:-1] + return True + def build_path(self): """Use filename-keywords and directory to build a full path""" try: @@ -629,17 +635,24 @@ class PathFormat(): os.unlink(self.temppath) return - if self.temppath == self.realpath: - return - - try: - os.replace(self.temppath, self.realpath) - return - except OSError: - pass - - shutil.copyfile(self.temppath, self.realpath) - os.unlink(self.temppath) + if self.temppath != self.realpath: + # move temp file to its actual location + try: + os.replace(self.temppath, self.realpath) + except OSError: + shutil.copyfile(self.temppath, self.realpath) + os.unlink(self.temppath) + + if "_mtime" in self.keywords: + # set file modification time + mtime = self.keywords["_mtime"] + if mtime: + try: + if isinstance(mtime, str): + mtime = mktime_tz(parsedate_tz(mtime)) + os.utime(self.realpath, (time.time(), mtime)) + except Exception: + pass @staticmethod def adjust_path(path): diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 4167bc4..d970ed6 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.8.7" +__version__ = "1.9.0" diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index f326617..3d86110 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -22,6 +22,7 @@ CATEGORY_MAP = { "dokireader" : "Doki Reader", "dynastyscans" : "Dynasty Reader", "e621" : "e621", + "erolord" : "EroLord.com", "exhentai" : "ExHentai", "fallenangels" : "Fallen Angels Scans", "fashionnova" : "Fashion Nova", @@ -108,6 +109,7 @@ AUTH_MAP = { "exhentai" : "Optional", "flickr" : "Optional (OAuth)", "idolcomplex": "Optional", + "instagram" : "Optional", "luscious" : "Optional", "mangoxo" : "Optional", "nijie" : "Required", @@ -235,6 +237,7 @@ def write_output(fobj, columns, extractors): # caption w("Supported Sites\n") w("===============\n") + w("Unless otherwise known, assume all sites to be NSFW\n\n") # table head sep = " ".join("=" * c[1] for c in columns) + "\n" diff --git a/test/test_downloader.py b/test/test_downloader.py index 3f301b0..caed983 100644 --- a/test/test_downloader.py +++ b/test/test_downloader.py @@ -8,13 +8,16 @@ # published by the Free Software Foundation. 
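
The `_mtime` handling added to `PathFormat.finalize()` above accepts either an epoch number (set by the `mtime` post-processor) or a raw `Last-Modified` header string (set by the HTTP downloader). The string case reduces to the stdlib conversion sketched below; `set_mtime_from_header` is an illustrative helper, not part of gallery-dl:

```python
import os
import time
from email.utils import mktime_tz, parsedate_tz

def set_mtime_from_header(path, last_modified):
    """Apply an HTTP Last-Modified value as a file's modification time."""
    parsed = parsedate_tz(last_modified)  # None if the header is malformed
    if parsed:
        # keep the access time current, set mtime to the parsed timestamp
        os.utime(path, (time.time(), mktime_tz(parsed)))

# e.g. set_mtime_from_header(path, response.headers["Last-Modified"])
```
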
import re +import sys import base64 import os.path import tempfile -import unittest import threading import http.server +import unittest +from unittest.mock import Mock, MagicMock, patch + import gallery_dl.downloader as downloader import gallery_dl.extractor as extractor import gallery_dl.config as config @@ -23,6 +26,73 @@ from gallery_dl.output import NullOutput from gallery_dl.util import PathFormat +class MockDownloaderModule(Mock): + __downloader__ = "mock" + + +class TestDownloaderModule(unittest.TestCase): + + @classmethod + def setUpClass(cls): + # allow import of ytdl downloader module without youtube_dl installed + sys.modules["youtube_dl"] = MagicMock() + + @classmethod + def tearDownClass(cls): + del sys.modules["youtube_dl"] + + def tearDown(self): + downloader._cache.clear() + + def test_find(self): + cls = downloader.find("http") + self.assertEqual(cls.__name__, "HttpDownloader") + self.assertEqual(cls.scheme , "http") + + cls = downloader.find("https") + self.assertEqual(cls.__name__, "HttpDownloader") + self.assertEqual(cls.scheme , "http") + + cls = downloader.find("text") + self.assertEqual(cls.__name__, "TextDownloader") + self.assertEqual(cls.scheme , "text") + + cls = downloader.find("ytdl") + self.assertEqual(cls.__name__, "YoutubeDLDownloader") + self.assertEqual(cls.scheme , "ytdl") + + self.assertEqual(downloader.find("ftp"), None) + self.assertEqual(downloader.find("foo"), None) + self.assertEqual(downloader.find(1234) , None) + self.assertEqual(downloader.find(None) , None) + + @patch("importlib.import_module") + def test_cache(self, import_module): + import_module.return_value = MockDownloaderModule() + downloader.find("http") + downloader.find("text") + downloader.find("ytdl") + self.assertEqual(import_module.call_count, 3) + downloader.find("http") + downloader.find("text") + downloader.find("ytdl") + self.assertEqual(import_module.call_count, 3) + + @patch("importlib.import_module") + def test_cache_http(self, import_module): + import_module.return_value = MockDownloaderModule() + downloader.find("http") + downloader.find("https") + self.assertEqual(import_module.call_count, 1) + + @patch("importlib.import_module") + def test_cache_https(self, import_module): + import_module.return_value = MockDownloaderModule() + downloader.find("https") + downloader.find("http") + self.assertEqual(import_module.call_count, 1) + + class TestDownloaderBase(unittest.TestCase): @classmethod @@ -134,9 +204,6 @@ class TestTextDownloader(TestDownloaderBase): def test_text_offset(self): self._run_test("text:foobar", "foo", "foobar", "txt", "txt") - def test_text_extension(self): - self._run_test("text:foobar", None, "foobar", None, "txt") - def test_text_empty(self): self._run_test("text:", None, "", "txt", "txt") diff --git a/test/test_results.py b/test/test_results.py index 8f03f03..41390a8 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -27,6 +27,7 @@ TRAVIS_SKIP = { # temporary issues, etc. 
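
The new downloader tests above pin down the scheme-cache behavior that replaces the old `https` to `http` rewrite in `job.py`: `downloader.find()` resolves both schemes to a single class whose `scheme` attribute is `"http"`, so one instance can be cached under both keys. A direct usage sketch, assuming the 1.9.0 module layout:

```python
import gallery_dl.downloader as downloader

cls = downloader.find("https")
print(cls.__name__, cls.scheme)         # HttpDownloader http
print(downloader.find("http") is cls)   # True - one cached class for both
print(downloader.find("ftp"))           # None - unsupported scheme
```
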
BROKEN = {
     "komikcast",
+    "konachan",
     "mangapark",
 }
 
@@ -161,9 +162,10 @@ class ResultJob(job.DownloadJob):
         self.hash_keyword = hashlib.sha1()
         self.hash_archive = hashlib.sha1()
         self.hash_content = hashlib.sha1()
+
         if content:
             self.fileobj = TestPathfmt(self.hash_content)
-            self.get_downloader("http")._check_extension = lambda a, b: None
+            self.get_downloader("http").check_extension = lambda a, b: None
 
         self.format_directory = TestFormatter(
             "".join(self.extractor.directory_fmt))
@@ -217,6 +219,7 @@ class TestPathfmt():
         self.hashobj = hashobj
         self.path = ""
         self.size = 0
+        self.keywords = {}
         self.has_extension = True
 
     def __enter__(self):
@@ -279,9 +282,10 @@ def setup_test_config():
     config.set(("extractor", "password"), name)
     config.set(("extractor", "nijie", "username"), email)
     config.set(("extractor", "seiga", "username"), email)
-    config.set(("extractor", "danbooru", "username"), None)
-    config.set(("extractor", "twitter" , "username"), None)
-    config.set(("extractor", "mangoxo" , "password"), "VZ8DL3983u")
+    config.set(("extractor", "danbooru" , "username"), None)
+    config.set(("extractor", "instagram", "username"), None)
+    config.set(("extractor", "twitter"  , "username"), None)
+    config.set(("extractor", "mangoxo"  , "password"), "VZ8DL3983u")
     config.set(("extractor", "deviantart", "client-id"), "7777")
     config.set(("extractor", "deviantart", "client-secret"),

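
To close, the release's new behavior toggles can also be set through the same `config.set()` API that `setup_test_config()` uses above. This is a hedged sketch: the key names follow the changelog (`extractor.*.download`, `downloader.*.mtime`, `extractor.*.image-unique`), but the exact nesting shown is an assumption, not taken from this diff:

```python
from gallery_dl import config

config.set(("extractor", "download"), False)        # like --no-download
config.set(("downloader", "http", "mtime"), False)  # like --no-mtime, HTTP only
config.set(("extractor", "image-unique"), True)     # restore pre-1.9.0 dedup
```
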