| field     | value |
|-----------|-------|
| author    | 2019-08-26 19:34:45 -0400 |
| committer | 2019-08-26 19:34:45 -0400 |
| commit    | b75d158d014d6c43d7d785c46c9372a9cf84d144 |
| tree      | 7dca4a7e61fe8b6e2bff2142fc19891e783a7d6d |
| parent    | 64ad8e7bd15df71ab1116eede414558631bcad32 |

New upstream version 1.10.2 (tag: upstream/1.10.2)

47 files changed, 1261 insertions, 529 deletions
```diff
diff --git a/.travis.yml b/.travis.yml
index 6158941..4b3a2cd 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -16,11 +16,18 @@ matrix:
       env: GALLERYDL_TESTS=results
     - language: minimal
       dist: xenial
+      env: GALLERYDL_TESTS=snap
       addons:
         snaps:
           - name: snapcraft
             classic: true
-      env: SNAP_TESTS=true
+      install:
+        - true
+      script:
+        - sudo apt update
+        - snapcraft --destructive-mode
+        - sudo snap try
+        - snap run gallery-dl --verbose https://twitter.com/ubuntu/status/1121001597092364288

 git:
   depth: 3
@@ -31,6 +38,7 @@ branches:
     - /^v\d+\.\d+\.\d+(-\S*)?$/
     - /^test(-\w+)+$/

+install:
+  - pip install -r requirements.txt pyOpenSSL
 script:
-  - 'if test "${SNAP_TESTS}" != true; then ./scripts/run_tests.sh; else true; fi'
-  - 'if test "${SNAP_TESTS}" = true; then sudo apt update && snapcraft --destructive-mode && sudo snap try && snap run gallery-dl --verbose https://twitter.com/ubuntu/status/1121001597092364288; else true; fi'
+  - ./scripts/run_tests.sh
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 58e295c..99df78a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,30 @@
 # Changelog

+## 1.10.2 - 2019-08-23
+### Additions
+- Support for `instagram` stories and IGTV ([#371](https://github.com/mikf/gallery-dl/issues/371), [#373](https://github.com/mikf/gallery-dl/issues/373))
+- Support for individual `imgbb` images ([#363](https://github.com/mikf/gallery-dl/issues/363))
+- `deviantart.quality` option to set the JPEG compression quality for newer images ([#369](https://github.com/mikf/gallery-dl/issues/369))
+- `enumerate` option for `extractor.skip` ([#306](https://github.com/mikf/gallery-dl/issues/306))
+- `adjust-extensions` option to control filename extension adjustments
+- `path-remove` option to remove control characters etc. from filesystem paths
+### Changes
+- Rename `restrict-filenames` to `path-restrict`
+- Adjust `pixiv` metadata and default filename format ([#366](https://github.com/mikf/gallery-dl/issues/366))
+  - Set `filename` to `"{category}_{user[id]}_{id}{suffix}.{extension}"` to restore the old default
+- Improve and optimize directory and filename generation
+### Fixes
+- Allow the `classify` post-processor to handle files with unknown filename extension ([#138](https://github.com/mikf/gallery-dl/issues/138))
+- Fix rate limit handling for OAuth APIs ([#368](https://github.com/mikf/gallery-dl/issues/368))
+- Fix artwork and scraps extraction on `deviantart` ([#376](https://github.com/mikf/gallery-dl/issues/376), [#392](https://github.com/mikf/gallery-dl/issues/392))
+- Distinguish between `imgur` album and gallery URLs ([#380](https://github.com/mikf/gallery-dl/issues/380))
+- Prevent crash when using `--ugoira-conv` ([#382](https://github.com/mikf/gallery-dl/issues/382))
+- Handle multi-image posts on `patreon` ([#383](https://github.com/mikf/gallery-dl/issues/383))
+- Miscellaneous fixes for `*reactor`, `simplyhentai`
+
 ## 1.10.1 - 2019-08-02
 ## Fixes
-- Restore functionality of both domains for `exhentai` extractors
+- Use the correct domain for exhentai.org input URLs

 ## 1.10.0 - 2019-08-01
 ### Warning
```
```diff
diff --git a/README.rst b/README.rst
--- a/README.rst
+++ b/README.rst
@@ -78,8 +78,8 @@ Download a standalone executable file,
 put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
 and run it inside a command prompt (like ``cmd.exe``).

-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.10.1/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.10.1/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.10.2/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.10.2/gallery-dl.bin>`__

 These executables include a Python 3.7 interpreter
 and all required Python packages.
@@ -224,13 +224,13 @@ access to *gallery-dl*. Authorize it and you will be shown one or more
 .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
 .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
 .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
-.. _stable: https://github.com/mikf/gallery-dl/archive/v1.10.1.zip
+.. _stable: https://github.com/mikf/gallery-dl/archive/v1.10.2.zip
 .. _dev: https://github.com/mikf/gallery-dl/archive/master.zip
 .. _Python: https://www.python.org/downloads/
 .. _PyPI: https://pypi.org/
 .. _pip: https://pip.pypa.io/en/stable/
-.. _Requests: http://docs.python-requests.org/en/master/
+.. _Requests: https://2.python-requests.org/en/master/#requests-http-for-humans
 .. _FFmpeg: https://www.ffmpeg.org/
 .. _youtube-dl: https://ytdl-org.github.io/youtube-dl/
 .. _pyOpenSSL: https://pyopenssl.org/
diff --git a/docs/configuration.rst b/docs/configuration.rst
index c6f757d..0e2e355 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -108,21 +108,36 @@ Description Directory path used as the base for all download destinations.
 =========== =====


-extractor.*.restrict-filenames
-------------------------------
+extractor.*.path-restrict
+-------------------------
 =========== =====
 Type        ``string``
 Default     ``"auto"``
-Example     ``"/!? ()[]{}"``
-Description Characters to replace with underscores (``_``) when generating
-            directory and file names.
+Example     ``"/!? (){}"``
+Description Set of characters to replace with underscores (``_``)
+            in generated path segment names.

             Special values:

             * ``"auto"``: Use characters from ``"unix"`` or ``"windows"``
               depending on the local operating system
             * ``"unix"``: ``"/"``
-            * ``"windows"``: ``"<>:\"\\|/?*"``
+            * ``"windows"``: ``"\\\\|/<>:\"?*"``
+
+            Note: In a set with 2 or more characters, ``[]^-\`` need to be
+            escaped with backslashes, e.g. ``"\\[\\]"``
+=========== =====
+
+
+extractor.*.path-remove
+-----------------------
+=========== =====
+Type        ``string``
+Default     ``"\\u0000-\\u001f\\u007f"`` (ASCII control characters)
+Description Set of characters to remove from generated path names.
+
+            Note: In a set with 2 or more characters, ``[]^-\`` need to be
+            escaped with backslashes, e.g. ``"\\[\\]"``
 =========== =====
```
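Both of the options above are, by all appearances, dropped into a regular-expression character class, which is what the escaping note refers to. A minimal standalone sketch of that assumed behavior (not gallery-dl's actual code; defaults taken from the tables above):

```python
import re

def sanitize(segment,
             restrict="\\\\|/<>:\"?*",           # the "windows" path-restrict set
             remove="\\u0000-\\u001f\\u007f"):   # the path-remove default
    # Both strings become regex character classes, hence the escaping rules
    # for []^-\ mentioned in the documentation.
    segment = re.sub("[" + remove + "]", "", segment)
    return re.sub("[" + restrict + "]", "_", segment)

print(sanitize("foo: bar?"))  # foo_ bar_
```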
```diff
@@ -131,8 +146,11 @@ extractor.*.skip
 =========== =====
 Type        ``bool`` or ``string``
 Default     ``true``
-Description Controls the behavior when downloading files whose filename
-            already exists.
+Description Controls the behavior when downloading files that have been
+            downloaded before, i.e. a file with the same filename already
+            exists or its ID is in a `download archive`__.
+
+            __ `extractor.*.archive`_

             * ``true``: Skip downloads
             * ``false``: Overwrite already existing files
@@ -144,6 +162,9 @@ Description Controls the behavior when downloading files whose filename
               * ``"exit"``: Exit the program altogether
               * ``"exit:N"``: Skip downloads and exit the program after ``N``
                 consecutive skips
+
+            * ``"enumerate"``: Append a numeric suffix to the end of the
+              original filename (``file.ext.1``, ``file.ext.2``, etc)
 =========== =====
@@ -555,6 +576,15 @@ Description Download original files if available.
 =========== =====


+extractor.deviantart.quality
+----------------------------
+=========== =====
+Type        ``integer``
+Default     ``100``
+Description JPEG compression quality for newer images hosted on wixmp servers.
+=========== =====
+
+
 extractor.deviantart.refresh-token
 ----------------------------------
 =========== =====
@@ -1098,6 +1128,16 @@ Description Certificate validation during file downloads.
 =========== =====


+downloader.http.adjust-extensions
+---------------------------------
+=========== =====
+Type        ``bool``
+Default     ``true``
+Description Check the file headers of ``jpg``, ``png``, and ``gif`` files
+            and adjust their filename extensions if they do not match.
+=========== =====
+
+
 downloader.ytdl.format
 ----------------------
 =========== =====
@@ -1772,7 +1812,7 @@ Description An object with the ``name`` of a post-processor and its options.
 .. _timeout: https://docs.python-requests.org/en/latest/user/advanced/#timeouts
 .. _verify: https://docs.python-requests.org/en/master/user/advanced/#ssl-cert-verification
 .. _Last-Modified: https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.29
-.. _`Requests' proxy documentation`: http://docs.python-requests.org/en/master/user/advanced/#proxies
+.. _`Requests' proxy documentation`: https://2.python-requests.org/en/master/user/advanced/#proxies
 .. _format string: https://docs.python.org/3/library/string.html#formatstrings
 .. _format strings: https://docs.python.org/3/library/string.html#formatstrings
 .. _strptime: https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior
@@ -1780,5 +1820,5 @@ Description An object with the ``name`` of a post-processor and its options.
 .. _webbrowser.open(): https://docs.python.org/3/library/webbrowser.html
 .. _datetime: https://docs.python.org/3/library/datetime.html#datetime-objects
 .. _datetime.max: https://docs.python.org/3/library/datetime.html#datetime.datetime.max
-.. _Authentication: https://github.com/mikf/gallery-dl#5authentication
+.. _Authentication: https://github.com/mikf/gallery-dl#authentication
 .. _youtube-dl: https://github.com/ytdl-org/youtube-dl
```
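A rough sketch of what the new ``"enumerate"`` skip mode does, as described above (assumed logic, not the actual implementation):

```python
import os

def enumerate_path(path):
    # Append .1, .2, ... to the original filename until the name is unused,
    # so nothing is overwritten and nothing is skipped.
    num = 1
    while os.path.exists("{}.{}".format(path, num)):
        num += 1
    return "{}.{}".format(path, num)   # file.ext -> file.ext.1, file.ext.2, ...
```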
```diff
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index a4a9ee0..b9ff32d 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -8,8 +8,9 @@
         "proxy": null,
         "skip": true,
         "sleep": 0,
+        "path-restrict": "auto",
+        "path-remove": "\\u0000-\\u001f\\u007f",
         "user-agent": "Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0",
-        "restrict-filenames": "auto",

         "artstation":
@@ -30,6 +31,7 @@
             "mature": true,
             "metadata": false,
             "original": true,
+            "quality": 100,
             "wait-min": 0
         },
         "exhentai":
@@ -154,6 +156,7 @@
         "http":
         {
+            "adjust-extensions": true,
             "mtime": true,
             "rate": null,
             "retries": 4,
diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst
index d2fb4ea..05c8555 100644
--- a/docs/supportedsites.rst
+++ b/docs/supportedsites.rst
@@ -28,6 +28,7 @@ Dynasty Reader      https://dynasty-scans.com/       Chapters, individual Im
 E-Hentai            https://e-hentai.org/            Favorites, Galleries, Search Results              Optional
 e621                https://e621.net/                Pools, Popular Images, Posts, Tag-Searches
 EroLord.com         http://erolord.com/              Galleries
+ExHentai            https://exhentai.org/            Favorites, Galleries, Search Results              Optional
 Fallen Angels Scans https://www.fascans.com/         Chapters, Manga
 Fashion Nova        https://www.fashionnova.com/     Collections, Products
 Fireden             https://boards.fireden.net/      Threads
@@ -47,11 +48,11 @@ Hypnohub            https://hypnohub.net/            Pools, Popular Images,
 Idol Complex        https://idol.sankakucomplex.com/ Pools, Posts, Tag-Searches                        Optional
 ImageBam            http://www.imagebam.com/         Galleries, individual Images
 ImageFap            https://imagefap.com/            Images from Users, Galleries, individual Images
-ImgBB               https://imgbb.com/               Images from Users, Albums                         Optional
+ImgBB               https://imgbb.com/               Images from Users, Albums, individual Images      Optional
 imgbox              https://imgbox.com/              Galleries, individual Images
 imgth               https://imgth.com/               Galleries
-imgur               https://imgur.com/               Albums, individual Images
-Instagram           https://www.instagram.com/       Images from Users, individual Images, Tag-Searches  Optional
+imgur               https://imgur.com/               Albums, Galleries, individual Images
+Instagram           https://www.instagram.com/       |instagram-C|                                     Optional
 Jaimini's Box       https://jaiminisbox.com/reader/  Chapters, Manga
 Joyreactor          http://joyreactor.cc/            |joyreactor-C|
 Keenspot            http://www.keenspot.com/         Comics
@@ -77,7 +78,7 @@ Niconico Seiga      https://seiga.nicovideo.jp/      Images from Users, indi
 nijie               https://nijie.info/              |nijie-C|                                         Required
 NSFWalbum.com       https://nsfwalbum.com/           Albums
 Nyafuu Archive      https://archive.nyafuu.org/      Threads
-Patreon             https://www.patreon.com/         Images from Users, Creators
+Patreon             https://www.patreon.com/         Images from Users, Creators, Posts
 Pawoo               https://pawoo.net/               Images from Users, Images from Statuses
 Photobucket         https://photobucket.com/         Albums, individual Images
 Piczel              https://piczel.tv/               Images from Users, Folders, individual Images
@@ -100,7 +101,7 @@ Sankaku Complex     https://www.sankakucomplex.com/  Articles, Tag-Searches
 Sen Manga           https://raw.senmanga.com/        Chapters
 Sense-Scans         http://sensescans.com/reader/    Chapters, Manga
 Sex.com             https://www.sex.com/             Boards, Pins, related Pins, Search Results
-Simply Hentai       https://www.simply-hentai.com/   Galleries, individual Images, Videos
+Simply Hentai       https://www.simply-hentai.com/   Galleries
 SlickPic            https://www.slickpic.com/        Images from Users, Albums
 SlideShare          https://www.slideshare.net/      Presentations
 SmugMug             https://www.smugmug.com/         |smugmug-C|                                       Optional (OAuth)
@@ -134,6 +135,7 @@ Turboimagehost      https://www.turboimagehost.com/  individual Images
 .. |deviantart-C| replace:: Collections, Deviations, Favorites, Folders, Galleries, Journals, Popular Images, Scraps, Sta.sh
 .. |flickr-C| replace:: Images from Users, Albums, Favorites, Galleries, Groups, individual Images, Search Results
 .. |hentaifoundry-C| replace:: Images from Users, Favorites, individual Images, Popular Images, Recent Images, Scraps
+.. |instagram-C| replace:: Images from Users, Channels, individual Images, Stories, Tag-Searches
 .. |joyreactor-C| replace:: Images from Users, Posts, Search Results, Tag-Searches
 .. |nijie-C| replace:: Images from Users, Doujin, Favorites, individual Images
 .. |pixiv-C| replace:: Images from Users, Favorites, Follows, pixiv.me Links, Rankings, Search Results, Individual Images
```
```diff
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index 7a95191..e3229eb 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -26,6 +26,7 @@ class HttpDownloader(DownloaderBase):

     def __init__(self, extractor, output):
         DownloaderBase.__init__(self, extractor, output)
+        self.adjust_extension = self.config("adjust-extensions", True)
         self.retries = self.config("retries", extractor._retries)
         self.timeout = self.config("timeout", extractor._timeout)
         self.verify = self.config("verify", extractor._verify)
@@ -59,7 +60,6 @@ class HttpDownloader(DownloaderBase):

     def _download_impl(self, url, pathfmt):
         response = None
-        adj_ext = None
         tries = 0
         msg = ""

@@ -103,7 +103,7 @@ class HttpDownloader(DownloaderBase):
             elif code == 206:  # Partial Content
                 offset = filesize
                 size = response.headers["Content-Range"].rpartition("/")[2]
-            elif code == 416:  # Requested Range Not Satisfiable
+            elif code == 416 and filesize:  # Requested Range Not Satisfiable
                 break
             else:
                 msg = "{}: {} for url: {}".format(code, response.reason, url)
@@ -114,7 +114,7 @@ class HttpDownloader(DownloaderBase):
             size = text.parse_int(size)

             # set missing filename extension
-            if not pathfmt.has_extension:
+            if not pathfmt.extension:
                 pathfmt.set_extension(self.get_extension(response))
                 if pathfmt.exists():
                     pathfmt.temppath = ""
@@ -152,15 +152,16 @@ class HttpDownloader(DownloaderBase):
                     continue

                 # check filename extension
-                adj_ext = self.check_extension(file, pathfmt)
+                if self.adjust_extension:
+                    adj_ext = self.check_extension(file, pathfmt.extension)
+                    if adj_ext:
+                        pathfmt.set_extension(adj_ext)

                 break

         self.downloading = False
-        if adj_ext:
-            pathfmt.set_extension(adj_ext)
         if self.mtime:
-            pathfmt.keywords["_mtime"] = response.headers.get("Last-Modified")
+            pathfmt.kwdict["_mtime"] = response.headers.get("Last-Modified")
         return True

     def receive(self, response, file):
@@ -196,9 +197,8 @@ class HttpDownloader(DownloaderBase):
         return "txt"

     @staticmethod
-    def check_extension(file, pathfmt):
+    def check_extension(file, extension):
         """Check filename extension against fileheader"""
-        extension = pathfmt.keywords["extension"]
         if extension in FILETYPE_CHECK:
             file.seek(0)
             header = file.read(8)
```
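The ``FILETYPE_CHECK`` table consulted by ``check_extension()`` is not part of this hunk; a plausible shape for it, using well-known magic numbers rather than the actual source:

```python
# Assumed layout: extension -> predicate over the first few header bytes.
FILETYPE_CHECK = {
    "jpg": lambda h: h[0:3] == b"\xff\xd8\xff",
    "png": lambda h: h[0:8] == b"\x89PNG\r\n\x1a\n",
    "gif": lambda h: h[0:4] == b"GIF8",
}

def check_extension(file, extension):
    """Return a corrected extension, or None if the header already matches."""
    if extension in FILETYPE_CHECK:
        file.seek(0)
        header = file.read(8)
        if header and not FILETYPE_CHECK[extension](header):
            for ext, check in FILETYPE_CHECK.items():
                if check(header):
                    return ext
    return None
```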
```diff
diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py
index a233487..7d8b905 100644
--- a/gallery_dl/downloader/ytdl.py
+++ b/gallery_dl/downloader/ytdl.py
@@ -50,7 +50,7 @@ class YoutubeDLDownloader(DownloaderBase):
             return False

         if "entries" in info_dict:
-            index = pathfmt.keywords.get("_ytdl_index")
+            index = pathfmt.kwdict.get("_ytdl_index")
            if index is None:
                 return self._download_playlist(pathfmt, info_dict)
             else:
@@ -59,7 +59,7 @@ class YoutubeDLDownloader(DownloaderBase):

     def _download_video(self, pathfmt, info_dict):
         if "url" in info_dict:
-            text.nameext_from_url(info_dict["url"], pathfmt.keywords)
+            text.nameext_from_url(info_dict["url"], pathfmt.kwdict)

         pathfmt.set_extension(info_dict["ext"])
         if pathfmt.exists():
             pathfmt.temppath = ""
diff --git a/gallery_dl/extractor/adultempire.py b/gallery_dl/extractor/adultempire.py
index 5ea835f..5e2480a 100644
--- a/gallery_dl/extractor/adultempire.py
+++ b/gallery_dl/extractor/adultempire.py
@@ -21,12 +21,12 @@ class AdultempireGalleryExtractor(GalleryExtractor):
     test = (
         ("https://www.adultempire.com/5998/gallery.html", {
             "range": "1",
-            "keyword": "0533ef1184892be8ac02b17286797c95f389ba63",
+            "keyword": "25c8171f5623678491a0d7bdf38a7a6ebfa4a361",
             "content": "5c6beb31e5e3cdc90ee5910d5c30f9aaec977b9e",
         }),
         ("https://www.adultdvdempire.com/5683/gallery.html", {
             "url": "b12cd1a65cae8019d837505adb4d6a2c1ed4d70d",
-            "keyword": "59fe5d95929efc5040a819a5f77aba7a022bb85a",
+            "keyword": "0fe9a6e3f0a331b95ba77f66a643705ca86e8ec5",
         }),
     )

@@ -42,8 +42,8 @@ class AdultempireGalleryExtractor(GalleryExtractor):
             "studio" : extr(">studio</small>", "<").strip(),
             "date"   : text.parse_datetime(extr(
                 ">released</small>", "<").strip(), "%m/%d/%Y"),
-            "actors" : text.split_html(extr(
-                '<ul class="item-details item-cast-list ', '</ul>'))[1:],
+            "actors" : sorted(text.split_html(extr(
+                '<ul class="item-details item-cast-list ', '</ul>'))[1:]),
         }

     def images(self, page):
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
index f7b3bc1..2892bd4 100644
--- a/gallery_dl/extractor/artstation.py
+++ b/gallery_dl/extractor/artstation.py
@@ -41,6 +41,7 @@ class ArtstationExtractor(Extractor):
                 player = adict["player_embedded"]
                 url = text.extract(player, 'src="', '"')[0]
                 if not url.startswith(self.root):
+                    asset["extension"] = None
                     yield Message.Url, "ytdl:" + url, asset
                     continue
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index c63085a..54a8878 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -41,10 +41,8 @@ class BooruExtractor(SharedConfigMixin, Extractor):
         return pages * self.per_page

     def items(self):
-        data = self.get_metadata()
-
         yield Message.Version, 1
-        yield Message.Directory, data
+        data = self.get_metadata()

         self.reset_page()
         while True:
@@ -59,9 +57,11 @@ class BooruExtractor(SharedConfigMixin, Extractor):
                 if url.startswith("/"):
                     url = text.urljoin(self.api_url, url)
                 image.update(data)
+                text.nameext_from_url(url, image)
                 if self.extags:
                     self.extended_tags(image)
-                yield Message.Url, url, text.nameext_from_url(url, image)
+                yield Message.Directory, image
+                yield Message.Url, url, image

             if len(images) < self.per_page:
                 return
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 5c40e2a..a90af1c 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -87,7 +87,8 @@ class Extractor():
                 raise exception.HttpError(exc)
             else:
                 code = response.status_code
-                if 200 <= code < 400 or not fatal and \
+                if 200 <= code < 400 or fatal is None and \
+                        (400 <= code < 500) or not fatal and \
                         (400 <= code < 429 or 431 <= code < 500):
                     if encoding:
                         response.encoding = encoding
```
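The widened condition in ``common.py`` is dense; paraphrased as a three-state flag (my reading of the hunk, not documented behavior):

```python
def accepts(code, fatal):
    # fatal=True : only 2xx/3xx responses pass; everything else is an error
    # fatal=None : any 4xx is handed back to the caller for inspection
    #              (the DeviantArt API change further down relies on this)
    # fatal=False: 4xx passes except 429/430, which still fall through
    return (200 <= code < 400
            or fatal is None and 400 <= code < 500
            or not fatal and (400 <= code < 429 or 431 <= code < 500))
```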
```diff
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 63e2913..bd1299b 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -27,7 +27,7 @@ BASE_PATTERN = (


 class DeviantartExtractor(Extractor):
-    """Base class for deviantart extractors"""
+    """Base class for deviantart extractors using the OAuth API"""
     category = "deviantart"
     directory_fmt = ("{category}", "{author[username]!l}")
     filename_fmt = "{category}_{index}_{title}.{extension}"
@@ -38,11 +38,15 @@ class DeviantartExtractor(Extractor):
         self.offset = 0
         self.flat = self.config("flat", True)
         self.extra = self.config("extra", False)
+        self.quality = self.config("quality", "100")
         self.original = self.config("original", True)
         self.user = match.group(1) or match.group(2)
         self.group = False
         self.api = DeviantartAPI(self)

+        if self.quality:
+            self.quality = "q_{}".format(self.quality)
+
         if self.original != "image":
             self._update_content = self._update_content_default
         else:
@@ -81,12 +85,15 @@ class DeviantartExtractor(Extractor):
                        text.ext_from_url(content["src"]) != "gif":
                     self._update_content(deviation, content)

-                if deviation["index"] <= 790677560 and \
-                        content["src"].startswith("https://images-wixmp-"):
-                    # https://github.com/r888888888/danbooru/issues/4069
-                    content["src"] = re.sub(
-                        r"(/f/[^/]+/[^/]+)/v\d+/.*",
-                        r"/intermediary\1", content["src"])
+                if content["src"].startswith("https://images-wixmp-"):
+                    if deviation["index"] <= 790677560:
+                        # https://github.com/r888888888/danbooru/issues/4069
+                        content["src"] = re.sub(
+                            r"(/f/[^/]+/[^/]+)/v\d+/.*",
+                            r"/intermediary\1", content["src"])
+                    if self.quality:
+                        content["src"] = re.sub(
+                            r"q_\d+", self.quality, content["src"])

                 yield self.commit(deviation, content)

@@ -133,8 +140,16 @@ class DeviantartExtractor(Extractor):
     @staticmethod
     def commit(deviation, target):
         url = target["src"]
-        deviation["target"] = text.nameext_from_url(url, target.copy())
-        deviation["extension"] = deviation["target"]["extension"]
+        thumb = deviation["thumbs"][0]["src"] if "thumbs" in deviation else url
+        target = text.nameext_from_url(thumb, target.copy())
+        if target["filename"].endswith("-150"):
+            target["filename"] = target["filename"][:-4]
+        if not target["filename"].count("-"):
+            name, _, hid = target["filename"].rpartition("_")
+            target["filename"] = name + "-" + hid
+        deviation["target"] = target
+        deviation["filename"] = target["filename"]
+        deviation["extension"] = target["extension"] = text.ext_from_url(url)
         return Message.Url, url, deviation

     def _commit_journal_html(self, deviation, journal):
@@ -225,14 +240,6 @@ class DeviantartExtractor(Extractor):
             if mtype and mtype.startswith("image/"):
                 content.update(data)

-    def _html_request(self, url, **kwargs):
-        cookies = {"userinfo": (
-            '__167217c8e6aac1a3331f;{"username":"","uniqueid":"ab2e8b184471bf0'
-            'e3f8ed3ee7a3220aa","vd":"Bc7vEx,BdC7Fy,A,J,A,,B,A,B,BdC7Fy,BdC7XU'
-            ',J,J,A,BdC7XU,13,A,B,A,,A,A,B,A,A,,A","attr":56}'
-        )}
-        return self.request(url, cookies=cookies, **kwargs)
-

 class DeviantartGalleryExtractor(DeviantartExtractor):
     """Extractor for all deviations from an artist's gallery"""
@@ -360,68 +367,6 @@ class DeviantartFolderExtractor(DeviantartExtractor):
             deviation["folder"] = self.folder


-class DeviantartDeviationExtractor(DeviantartExtractor):
-    """Extractor for single deviations"""
-    subcategory = "deviation"
-    archive_fmt = "{index}.{extension}"
-    pattern = BASE_PATTERN + r"/((?:art|journal)/[^/?&#]+-\d+)"
-    test = (
-        (("https://www.deviantart.com/shimoda7/art/"
-          "For-the-sake-of-a-memory-10073852"), {
-            "options": (("original", 0),),
-            "content": "6a7c74dc823ebbd457bdd9b3c2838a6ee728091e",
-        }),
-        ("https://www.deviantart.com/zzz/art/zzz-1234567890", {
-            "exception": exception.NotFoundError,
-        }),
-        (("https://www.deviantart.com/myria-moon/art/"
-          "Aime-Moi-part-en-vadrouille-261986576"), {
-            "pattern": (r"https?://s3\.amazonaws\.com/origin-orig\."
-                        r"deviantart\.net/a383/f/2013/135/e/7/[^.]+\.jpg\?"),
-        }),
-        # wixmp URL rewrite
-        (("https://www.deviantart.com/citizenfresh/art/"
-          "Hverarond-14-the-beauty-of-the-earth-789295466"), {
-            "pattern": (r"https://images-wixmp-\w+\.wixmp\.com"
-                        r"/intermediary/f/[^/]+/[^.]+\.jpg$")
-        }),
-        # non-download URL for GIFs (#242)
-        (("https://www.deviantart.com/skatergators/art/"
-          "COM-Monique-Model-781571783"), {
-            "pattern": (r"https://images-wixmp-\w+\.wixmp\.com"
-                        r"/f/[^/]+/[^.]+\.gif\?token="),
-        }),
-        # external URLs from description (#302)
-        (("https://www.deviantart.com/uotapo/art/"
-          "INANAKI-Memorial-Humane7-590297498"), {
-            "options": (("extra", 1), ("original", 0)),
-            "pattern": r"https?://sta\.sh/\w+$",
-            "range": "2-",
-            "count": 4,
-        }),
-        # old-style URLs
-        ("https://shimoda7.deviantart.com"
-         "/art/For-the-sake-of-a-memory-10073852"),
-        ("https://myria-moon.deviantart.com"
-         "/art/Aime-Moi-part-en-vadrouille-261986576"),
-        ("https://zzz.deviantart.com/art/zzz-1234567890"),
-    )
-
-    skip = Extractor.skip
-
-    def __init__(self, match):
-        DeviantartExtractor.__init__(self, match)
-        self.path = match.group(3)
-
-    def deviations(self):
-        url = "{}/{}/{}".format(self.root, self.user, self.path)
-        response = self._html_request(url, fatal=False)
-        deviation_id = text.extract(response.text, '//deviation/', '"')[0]
-        if response.status_code >= 400 or not deviation_id:
-            raise exception.NotFoundError("image")
-        return (self.api.deviation(deviation_id),)
-
-
 class DeviantartStashExtractor(DeviantartExtractor):
     """Extractor for sta.sh-ed deviations"""
     subcategory = "stash"
@@ -558,54 +503,6 @@ class DeviantartJournalExtractor(DeviantartExtractor):
         return self.api.browse_user_journals(self.user, self.offset)


-class DeviantartScrapsExtractor(DeviantartExtractor):
-    """Extractor for an artist's scraps"""
-    subcategory = "scraps"
-    directory_fmt = ("{category}", "{username}", "Scraps")
-    archive_fmt = "s_{username}_{index}.{extension}"
-    pattern = BASE_PATTERN + r"/gallery/\?catpath=scraps\b"
-    test = (
-        ("https://www.deviantart.com/shimoda7/gallery/?catpath=scraps", {
-            "count": 12,
-            "options": (("original", False),),
-        }),
-        ("https://shimoda7.deviantart.com/gallery/?catpath=scraps"),
-    )
-
-    def deviations(self):
-        url = "{}/{}/gallery/?catpath=scraps".format(self.root, self.user)
-        page = self._html_request(url).text
-        csrf, pos = text.extract(page, '"csrf":"', '"')
-        iid , pos = text.extract(page, '"requestid":"', '"', pos)
-
-        url = "https://www.deviantart.com/dapi/v1/gallery/0"
-        data = {
-            "username": self.user,
-            "offset": self.offset,
-            "limit": "24",
-            "catpath": "scraps",
-            "_csrf": csrf,
-            "dapiIid": iid + "-jsok7403-1.1"
-        }
-
-        while True:
-            content = self.request(
-                url, method="POST", data=data).json()["content"]
-
-            for item in content["results"]:
-                if item["html"].startswith('<div class="ad-container'):
-                    continue
-                deviation_url = text.extract(item["html"], 'href="', '"')[0]
-                page = self._html_request(deviation_url).text
-                deviation_id = text.extract(page, '//deviation/', '"')[0]
-                if deviation_id:
-                    yield self.api.deviation(deviation_id)
-
-            if not content["has_more"]:
-                return
-            data["offset"] = content["next_offset"]
-
-
 class DeviantartPopularExtractor(DeviantartExtractor):
     """Extractor for popular deviations"""
     subcategory = "popular"
@@ -649,6 +546,247 @@ class DeviantartPopularExtractor(DeviantartExtractor):
             deviation["popular"] = self.popular


+class DeviantartExtractorV2(Extractor):
+    """Base class for deviantart extractors using the NAPI"""
+    category = "deviantart"
+    directory_fmt = ("{category}", "{author[username]!l}")
+    filename_fmt = "{category}_{index}_{title}.{extension}"
+    root = "https://www.deviantart.com"
+
+    def __init__(self, match=None):
+        Extractor.__init__(self, match)
+        self.offset = 0
+        self.extra = self.config("extra", False)
+        self.quality = self.config("quality", "100")
+        self.user = match.group(1) or match.group(2)
+
+        if self.quality:
+            self.quality = "q_{}".format(self.quality)
+
+    def items(self):
+        url = (
+            self.root + "/_napi/da-browse/shared_api/deviation/extended_fetch"
+        )
+        params = {
+            "deviationid"    : None,
+            "username"       : None,
+            "type"           : None,
+            "include_session": "false",
+        }
+        headers = {
+            "Referer": self.root,
+        }
+
+        yield Message.Version, 1
+        for deviation in self.deviations():
+            params["deviationid"] = deviation["deviationId"]
+            params["username"] = deviation["author"]["username"]
+            params["type"] = "journal" if deviation["isJournal"] else "art"
+            data = self.request(url, params=params, headers=headers).json()
+
+            if "deviation" not in data:
+                self.log.warning("Skipping %s", params["deviationid"])
+                continue
+            deviation = self._extract(data)
+
+            yield Message.Directory, deviation
+            yield Message.Url, deviation["target"]["src"], deviation
+            if self.extra:
+                for match in DeviantartStashExtractor.pattern.finditer(
+                        deviation["description"]):
+                    deviation["_extractor"] = DeviantartStashExtractor
+                    yield Message.Queue, match.group(0), deviation
+
+    def _extract(self, data):
+        deviation = data["deviation"]
+        extended = deviation["extended"]
+        files = deviation["files"]
+        del deviation["extended"]
+        del deviation["files"]
+
+        # prepare deviation metadata
+        deviation["description"] = extended.get("description", "")
+        deviation["username"] = self.user.lower()
+        deviation["stats"] = extended["stats"]
+        deviation["stats"]["comments"] = data["comments"]["total"]
+        deviation["index"] = deviation["deviationId"]
+        deviation["tags"] = [t["name"] for t in extended.get("tags") or ()]
+        deviation["date"] = text.parse_datetime(
+            deviation["publishedTime"])
+        deviation["category_path"] = "/".join(
+            extended[key]["displayNameEn"]
+            for key in ("typeFacet", "contentFacet", "categoryFacet")
+            if key in extended
+        )
+
+        # extract download target
+        target = files[-1]
+        name = files[0]["src"]
+
+        if target["type"] == "gif":
+            pass
+        elif target["type"] == "video":
+            # select largest video
+            target = max(
+                files, key=lambda x: text.parse_int(x.get("quality", "")[:-1]))
+            name = target["src"]
+        elif target["type"] == "flash":
+            if target["src"].startswith("https://sandbox.deviantart.com"):
+                # extract SWF file from "sandbox"
+                target["src"] = text.extract(
+                    self.request(target["src"]).text,
+                    'id="sandboxembed" src="', '"',
+                )[0]
+        elif "download" in extended:
+            target = extended["download"]
+            target["src"] = target["url"]
+            del target["url"]
+
+        # url rewrites
+        if target["src"].startswith("https://images-wixmp-"):
+            if deviation["index"] <= 790677560:
+                # https://github.com/r888888888/danbooru/issues/4069
+                target["src"] = re.sub(
+                    r"(/f/[^/]+/[^/]+)/v\d+/.*",
+                    r"/intermediary\1", target["src"])
+            if self.quality:
+                target["src"] = re.sub(
+                    r"q_\d+", self.quality, target["src"])
+
+        text.nameext_from_url(name, target)
+        if target["filename"].endswith("-150"):
+            target["filename"] = target["filename"][:-4]
+        if not target["filename"].count("-"):
+            name, _, hid = target["filename"].rpartition("_")
+            target["filename"] = name + "-" + hid
+        deviation["target"] = target
+        deviation["filename"] = target["filename"]
+        deviation["extension"] = target["extension"] = (
+            text.ext_from_url(target["src"]))
+        return deviation
+
+
+class DeviantartDeviationExtractor(DeviantartExtractorV2):
+    """Extractor for single deviations"""
+    subcategory = "deviation"
+    archive_fmt = "{index}.{extension}"
+    pattern = BASE_PATTERN + r"/(art|journal)/(?:[^/?&#]+-)?(\d+)"
+    test = (
+        (("https://www.deviantart.com/shimoda7/art/For-the-sake-10073852"), {
+            "options": (("original", 0),),
+            "content": "6a7c74dc823ebbd457bdd9b3c2838a6ee728091e",
+        }),
+        ("https://www.deviantart.com/zzz/art/zzz-1234567890", {
+            "count": 0,
+        }),
+        (("https://www.deviantart.com/myria-moon/art/Aime-Moi-261986576"), {
+            "pattern": (r"https://www.deviantart.com/download/261986576"
+                        r"/[\w-]+\.jpg\?token=\w+&ts=\d+"),
+        }),
+        # wixmp URL rewrite
+        (("https://www.deviantart.com/citizenfresh/art/Hverarond-789295466"), {
+            "pattern": (r"https://images-wixmp-\w+\.wixmp\.com"
+                        r"/intermediary/f/[^/]+/[^.]+\.jpg$")
+        }),
+        # wixmp URL rewrite v2 (#369)
+        (("https://www.deviantart.com/josephbiwald/art/Destiny-2-804940104"), {
+            "pattern": r"https://images-wixmp-\w+\.wixmp\.com/.*,q_100,",
+        }),
+        # non-download URL for GIFs (#242)
+        (("https://www.deviantart.com/skatergators/art/COM-Moni-781571783"), {
+            "pattern": (r"https://images-wixmp-\w+\.wixmp\.com"
+                        r"/f/[^/]+/[^.]+\.gif\?token="),
+        }),
+        # external URLs from description (#302)
+        (("https://www.deviantart.com/uotapo/art/INANAKI-Memo-590297498"), {
+            "options": (("extra", 1), ("original", 0)),
+            "pattern": r"https?://sta\.sh/\w+$",
+            "range": "2-",
+            "count": 4,
+        }),
+        # video
+        ("https://www.deviantart.com/chi-u/art/-VIDEO-Brushes-330774593", {
+            "url": "3b6e6e761d2d393fa61a4dc3ed6e7db51b14d07b",
+            "keyword": {
+                "target": {
+                    "duration": 306,
+                    "extension": "mp4",
+                    "filename": r"re:_video____brushes_\w+_by_chi_u-d5gxnb5",
+                    "filesize": 9963639,
+                    "quality": "1080p",
+                    "src": str,
+                    "type": "video",
+                },
+            }
+        }),
+        # archive
+        ("https://www.deviantart.com/itsvenue/art/-brush-pngs-14-763300948", {
+            "pattern": r"https://.+deviantart.com/download/763300948/.*\.rar",
+        }),
+        # swf
+        ("https://www.deviantart.com/ikatxfruti/art/Bang-Bang-528130222", {
+            "pattern": r"https://images-wixmp-.*wixmp.com/f/.*\.swf",
+        }),
+        # old-style URLs
+        ("https://shimoda7.deviantart.com"
+         "/art/For-the-sake-of-a-memory-10073852"),
+        ("https://myria-moon.deviantart.com"
+         "/art/Aime-Moi-part-en-vadrouille-261986576"),
+        ("https://zzz.deviantart.com/art/zzz-1234567890"),
+    )
+
+    skip = Extractor.skip
+
+    def __init__(self, match):
+        DeviantartExtractorV2.__init__(self, match)
+        self.type = match.group(3)
+        self.deviation_id = match.group(4)
+
+    def deviations(self):
+        return ({
+            "deviationId": self.deviation_id,
+            "author"     : {"username": self.user},
+            "isJournal"  : self.type == "journal",
+        },)
+
+
+class DeviantartScrapsExtractor(DeviantartExtractorV2):
+    """Extractor for an artist's scraps"""
+    subcategory = "scraps"
+    directory_fmt = ("{category}", "{username}", "Scraps")
+    archive_fmt = "s_{username}_{index}.{extension}"
+    pattern = BASE_PATTERN + r"/gallery/(?:\?catpath=)?scraps\b"
+    test = (
+        ("https://www.deviantart.com/shimoda7/gallery/scraps", {
+            "count": 12,
+        }),
+        ("https://www.deviantart.com/shimoda7/gallery/?catpath=scraps"),
+        ("https://shimoda7.deviantart.com/gallery/?catpath=scraps"),
+    )
+
+    def deviations(self):
+        url = self.root + "/_napi/da-user-profile/api/gallery/contents"
+        params = {
+            "username"     : self.user,
+            "offset"       : self.offset,
+            "limit"        : "24",
+            "scraps_folder": "true",
+        }
+        headers = {
+            "Referer": "{}/{}/gallery/scraps".format(self.root, self.user),
+        }
+
+        while True:
+            data = self.request(url, params=params, headers=headers).json()
+
+            for obj in data["results"]:
+                yield obj["deviation"]
+
+            if not data["hasMore"]:
+                return
+            params["offset"] = data["nextOffset"]
+
+
 class DeviantartAPI():
     """Minimal interface for the DeviantArt API

@@ -805,7 +943,7 @@ class DeviantartAPI():
         self.authenticate(None if public else self.refresh_token)
         response = self.extractor.request(
-            url, headers=self.headers, params=params, fatal=False)
+            url, headers=self.headers, params=params, fatal=None)

         data = response.json()
         status = response.status_code
```
```diff
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index ce2e83b..4ec7f00 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -33,16 +33,16 @@ class GelbooruExtractor(booru.XmlParserMixin,
             self.session.cookies["fringeBenefits"] = "yup"

     def items_noapi(self):
-        data = self.get_metadata()
-
         yield Message.Version, 1
-        yield Message.Directory, data
+        data = self.get_metadata()

         for post in self.get_posts():
             post = self.get_post_data(post)
             url = post["file_url"]
             post.update(data)
-            yield Message.Url, url, text.nameext_from_url(url, post)
+            text.nameext_from_url(url, post)
+            yield Message.Directory, post
+            yield Message.Url, url, post

     def get_posts(self):
         """Return an iterable containing all relevant post objects"""
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index c112465..e4f18b3 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -21,7 +21,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
     test = (
         ("https://hitomi.la/galleries/867789.html", {
             "url": "cb759868d090fe0e2655c3e29ebf146054322b6d",
-            "keyword": "067b5d9b9c0f98530cd5dd2444e0f5a5b4b00d38",
+            "keyword": "d097a8db8e810045131b4510c41714004f9eff3a",
         }),
         ("https://hitomi.la/galleries/1036181.html", {
             # "aa" subdomain for gallery-id ending in 1 (#142)
diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py
index 6980185..76b2c38 100644
--- a/gallery_dl/extractor/imagebam.py
+++ b/gallery_dl/extractor/imagebam.py
@@ -41,14 +41,14 @@ class ImagebamGalleryExtractor(ImagebamExtractor):
     pattern = r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([0-9a-z]+)"
     test = (
         ("http://www.imagebam.com/gallery/adz2y0f9574bjpmonaismyrhtjgvey4o", {
-            "url": "fb01925129a1ff1941762eaa3a2783a66de6847f",
+            "url": "76d976788ae2757ac81694736b07b72356f5c4c8",
             "keyword": "9e25b8827474ac93c54855e798d60aa3cbecbd7a",
             "content": "596e6bfa157f2c7169805d50075c2986549973a8",
         }),
         ("http://www.imagebam.com/gallery/op9dwcklwdrrguibnkoe7jxgvig30o5p", {
             # more than 100 images; see issue #219
             "count": 107,
-            "url": "f92ce5b17676b6ea69288f0aef26f4cdbea7fd8d",
+            "url": "32ae6fe5dc3e4ca73ff6252e522d16473595d1d1",
         }),
         ("http://www.imagebam.com/gallery/gsl8teckymt4vbvx1stjkyk37j70va2c", {
             "exception": exception.NotFoundError,
@@ -108,7 +108,7 @@ class ImagebamImageExtractor(ImagebamExtractor):
               r"/(?:image/|(?:[0-9a-f]{2}/){3})([0-9a-f]+)")
     test = (
         ("http://www.imagebam.com/image/94d56c502511890", {
-            "url": "b384893c35a01a09c58018db71ddc4cf2480be95",
+            "url": "5e9ba3b1451f8ded0ae3a1b84402888893915d4a",
             "keyword": "4263d4840007524129792b8587a562b5d20c2687",
             "content": "0c8768055e4e20e7c7259608b67799171b691140",
         }),
```
```diff
diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py
index 442634b..4aa670b 100644
--- a/gallery_dl/extractor/imgbb.py
+++ b/gallery_dl/extractor/imgbb.py
@@ -17,6 +17,7 @@ import json
 class ImgbbExtractor(Extractor):
     """Base class for imgbb extractors"""
     category = "imgbb"
+    directory_fmt = ("{category}", "{user}")
     filename_fmt = "{title} {id}.{extension}"
     archive_fmt = "{id}"
     root = "https://imgbb.com"
@@ -145,7 +146,6 @@ class ImgbbAlbumExtractor(ImgbbExtractor):
 class ImgbbUserExtractor(ImgbbExtractor):
     """Extractor for user profiles in imgbb.com"""
     subcategory = "user"
-    directory_fmt = ("{category}", "{user}")
     pattern = r"(?:https?://)?([^.]+)\.imgbb\.com/?(?:\?([^#]+))?$"
     test = ("https://folkie.imgbb.com", {
         "range": "1-80",
@@ -177,3 +177,34 @@ class ImgbbUserExtractor(ImgbbExtractor):
             "params_hidden[userid]": user,
             "params_hidden[from]"  : "user",
         })
+
+
+class ImgbbImageExtractor(ImgbbExtractor):
+    subcategory = "image"
+    pattern = r"(?:https?://)?ibb\.co/(?!album/)([^/?&#]+)"
+    test = ("https://ibb.co/NLZHgqS", {
+        "url": "fbca86bac09de6fc0304054b2170b423ca1e84fa",
+        "keyword": "5d70e779bad03b2dc5273b627638045168671157",
+    })
+
+    def __init__(self, match):
+        ImgbbExtractor.__init__(self, match)
+        self.image_id = match.group(1)
+
+    def items(self):
+        url = "https://ibb.co/" + self.image_id
+        extr = text.extract_from(self.request(url).text)
+
+        image = {
+            "id"    : self.image_id,
+            "title" : text.unescape(extr('"og:title" content="', '"')),
+            "url"   : extr('"og:image" content="', '"'),
+            "width" : text.parse_int(extr('"og:image:width" content="', '"')),
+            "height": text.parse_int(extr('"og:image:height" content="', '"')),
+            "user"  : extr('rel="author">', '<').lower(),
+        }
+        image["extension"] = text.ext_from_url(image["url"])
+
+        yield Message.Version, 1
+        yield Message.Directory, image
+        yield Message.Url, image["url"], image
diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py
index c5e3d17..8523523 100644
--- a/gallery_dl/extractor/imgur.py
+++ b/gallery_dl/extractor/imgur.py
@@ -20,13 +20,19 @@ class ImgurExtractor(Extractor):

     def __init__(self, match):
         Extractor.__init__(self, match)
-        self.item_id = match.group(1)
+        self.key = match.group(1)
         self.mp4 = self.config("mp4", True)

-    def _get_data(self, path):
+    def _extract_data(self, path):
         response = self.request(self.root + path, notfound=self.subcategory)
-        data = text.extract(response.text, "image : ", ",\n")[0]
-        return self._clean(json.loads(data))
+        data = json.loads(text.extract(
+            response.text, "image : ", ",\n")[0])
+        try:
+            del data["adConfig"]
+            del data["isAd"]
+        except KeyError:
+            pass
+        return data

     def _prepare(self, image):
         image["ext"] = image["ext"].partition("?")[0]
@@ -37,18 +43,9 @@ class ImgurExtractor(Extractor):
         image["extension"] = image["ext"][1:]
         return url

-    @staticmethod
-    def _clean(data):
-        try:
-            del data["adConfig"]
-            del data["isAd"]
-        except KeyError:
-            pass
-        return data
-

 class ImgurImageExtractor(ImgurExtractor):
-    """Extractor for individual images from imgur.com"""
+    """Extractor for individual images on imgur.com"""
     subcategory = "image"
     filename_fmt = "{category}_{hash}{title:?_//}.{extension}"
     archive_fmt = "{hash}"
@@ -101,22 +98,21 @@ class ImgurImageExtractor(ImgurExtractor):
     )

     def items(self):
-        image = self._get_data("/" + self.item_id)
+        image = self._extract_data("/" + self.key)
         url = self._prepare(image)
-        yield Message.Version, 1
         yield Message.Directory, image
         yield Message.Url, url, image


 class ImgurAlbumExtractor(ImgurExtractor):
-    """Extractor for image albums from imgur.com"""
+    """Extractor for imgur albums"""
     subcategory = "album"
     directory_fmt = ("{category}", "{album[hash]}{album[title]:? - //}")
     filename_fmt = "{category}_{album[hash]}_{num:>03}_{hash}.{extension}"
     archive_fmt = "{album[hash]}_{hash}"
     pattern = (r"(?:https?://)?(?:www\.|m\.)?imgur\.com"
-               r"/(?:a|gallery|t/unmuted)/(\w{7}|\w{5})")
+               r"/(?:a|t/unmuted)/(\w{7}|\w{5})")
     test = (
         ("https://imgur.com/a/TcBmP", {
             "url": "ce3552f550a5b5316bd9c7ae02e21e39f30c0563",
@@ -147,7 +143,7 @@ class ImgurAlbumExtractor(ImgurExtractor):
                 "width": int,
             },
         }),
-        ("https://imgur.com/gallery/eD9CT", {  # large album
+        ("https://imgur.com/a/eD9CT", {  # large album
             "url": "4ee94de31ff26be416271bc0b1ea27b9349c9937",
         }),
         ("https://imgur.com/a/RhJXhVT/all", {  # 7 character album hash
@@ -164,13 +160,13 @@ class ImgurAlbumExtractor(ImgurExtractor):
     )

     def items(self):
-        album = self._get_data("/a/" + self.item_id + "/all")
+        album = self._extract_data("/a/" + self.key + "/all")
         images = album["album_images"]["images"]
         del album["album_images"]

         if int(album["num_images"]) > len(images):
             url = "{}/ajaxalbums/getimages/{}/hit.json".format(
-                self.root, self.item_id)
+                self.root, self.key)
             images = self.request(url).json()["data"]["images"]

         yield Message.Version, 1
@@ -180,3 +176,32 @@ class ImgurAlbumExtractor(ImgurExtractor):
             image["num"] = num
             image["album"] = album
             yield Message.Url, url, image
+
+
+class ImgurGalleryExtractor(ImgurExtractor):
+    """Extractor for imgur galleries"""
+    subcategory = "gallery"
+    pattern = (r"(?:https?://)?(?:www\.|m\.)?imgur\.com"
+               r"/gallery/(\w{7}|\w{5})")
+    test = (
+        ("https://imgur.com/gallery/zf2fIms", {  # non-album gallery (#380)
+            "pattern": "https://imgur.com/zf2fIms",
+        }),
+        ("https://imgur.com/gallery/eD9CT", {
+            "pattern": "https://imgur.com/a/eD9CT",
+        }),
+    )
+
+    def items(self):
+        url = self.root + "/a/" + self.key
+        with self.request(url, method="HEAD", fatal=False) as response:
+            code = response.status_code
+
+        if code < 400:
+            extr = ImgurAlbumExtractor
+        else:
+            extr = ImgurImageExtractor
+            url = self.root + "/" + self.key
+
+        yield Message.Version, 1
+        yield Message.Queue, url, {"_extractor": extr}
```
```diff
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 475e24b..e5cfe8b 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -8,11 +8,10 @@

 """Extract images from https://www.instagram.com/"""

-import hashlib
-import json
 from .common import Extractor, Message
 from .. import text, exception
 from ..cache import cache
+import json


 class InstagramExtractor(Extractor):
@@ -37,10 +36,11 @@ class InstagramExtractor(Extractor):
             data.update(metadata)
             yield Message.Directory, data

-            if data['typename'] == 'GraphImage':
+            if data['typename'] in ('GraphImage', 'GraphStoryImage', 'GraphStoryVideo'):
                 yield Message.Url, data['display_url'], \
                     text.nameext_from_url(data['display_url'], data)
             elif data['typename'] == 'GraphVideo':
+                data["extension"] = None
                 yield Message.Url, \
                     'ytdl:{}/p/{}/'.format(self.root, data['shortcode']), data

@@ -140,33 +140,113 @@ class InstagramExtractor(Extractor):

         return medias

+    def _extract_stories(self, url):
+        if self.highlight_id:
+            user_id = ''
+            highlight_id = '"{}"'.format(self.highlight_id)
+            query_hash = '30a89afdd826d78a5376008a7b81c205'
+        else:
+            page = self.request(url).text
+            shared_data = self._extract_shared_data(page)
+
+            # If no stories are present the URL redirects to `ProfilePage'
+            if 'StoriesPage' not in shared_data['entry_data']:
+                return []
+
+            user_id = '"{}"'.format(
+                shared_data['entry_data']['StoriesPage'][0]['user']['id'])
+            highlight_id = ''
+            query_hash = 'cda12de4f7fd3719c0569ce03589f4c4'
+
+        variables = (
+            '{{'
+            '"reel_ids":[{}],"tag_names":[],"location_ids":[],'
+            '"highlight_reel_ids":[{}],"precomposed_overlay":true,'
+            '"show_story_viewer_list":true,'
+            '"story_viewer_fetch_count":50,"story_viewer_cursor":"",'
+            '"stories_video_dash_manifest":false}}'
+        ).format(user_id, highlight_id)
+        headers = {
+            "X-Requested-With": "XMLHttpRequest",
+        }
+        url = '{}/graphql/query/?query_hash={}&variables={}'.format(
+            self.root,
+            query_hash,
+            variables,
+        )
+        shared_data = self.request(url, headers=headers).json()
+
+        # If there are stories present but the user is not authenticated or
+        # does not have permissions no stories are returned.
+        if not shared_data['data']['reels_media']:
+            return []  # no stories present
+
+        medias = []
+        for media in shared_data['data']['reels_media'][0]['items']:
+            media_data = {
+                'owner_id': media['owner']['id'],
+                'username': media['owner']['username'],
+                'date': text.parse_timestamp(media['taken_at_timestamp']),
+                'expires': text.parse_timestamp(media['expiring_at_timestamp']),
+                'media_id': media['id'],
+                'typename': media['__typename'],
+            }
+            if media['__typename'] == 'GraphStoryImage':
+                media_data.update({
+                    'display_url': media['display_url'],
+                    'height': text.parse_int(media['dimensions']['height']),
+                    'width': text.parse_int(media['dimensions']['width']),
+                })
+            elif media['__typename'] == 'GraphStoryVideo':
+                vr = media['video_resources'][0]
+                media_data.update({
+                    'duration': text.parse_float(media['video_duration']),
+                    'display_url': vr['src'],
+                    'height': text.parse_int(vr['config_height']),
+                    'width': text.parse_int(vr['config_width']),
+                })
+            medias.append(media_data)
+
+        return medias
+
     def _extract_page(self, url, page_type):
         shared_data_fields = {
             'ProfilePage': {
+                'page': 'ProfilePage',
                 'node': 'user',
                 'node_id': 'id',
                 'edge_to_medias': 'edge_owner_to_timeline_media',
                 'variables_id': 'id',
-                'query_hash': '66eb9403e44cc12e5b5ecda48b667d41',
+                'query_hash': 'f2405b236d85e8296cf30347c9f08c2a',
+            },
+            'ProfileChannelPage': {
+                'page': 'ProfilePage',
+                'node': 'user',
+                'node_id': 'id',
+                'edge_to_medias': 'edge_felix_video_timeline',
+                'variables_id': 'id',
+                'query_hash': 'bc78b344a68ed16dd5d7f264681c4c76',
             },
             'TagPage': {
+                'page': 'TagPage',
                 'node': 'hashtag',
                 'node_id': 'name',
                 'edge_to_medias': 'edge_hashtag_to_media',
                 'variables_id': 'tag_name',
-                'query_hash': 'f92f56d47dc7a55b606908374b43a314',
+                'query_hash': 'f12c9ec5e46a3173b2969c712ad84744',
             },
         }

         page = self.request(url).text
         shared_data = self._extract_shared_data(page)
         psdf = shared_data_fields[page_type]
+        csrf = shared_data["config"]["csrf_token"]

         while True:
             # Deal with different structure of pages: the first page
             # has interesting data in `entry_data', next pages in `data'.
             if 'entry_data' in shared_data:
-                base_shared_data = shared_data['entry_data'][page_type][0]['graphql']
+                base_shared_data = shared_data['entry_data'][psdf['page']][0]['graphql']

                 # variables_id is available only in the first page
                 variables_id = base_shared_data[psdf['node']][psdf['node_id']]
@@ -192,7 +272,8 @@ class InstagramExtractor(Extractor):
             )
             headers = {
                 "X-Requested-With": "XMLHttpRequest",
-                "X-Instagram-GIS": hashlib.md5(variables.encode()).hexdigest(),
+                "X-CSRFToken": csrf,
+                "X-IG-App-ID": "936619743392459",
             }
             url = '{}/graphql/query/?query_hash={}&variables={}'.format(
                 self.root,
@@ -204,14 +285,20 @@ class InstagramExtractor(Extractor):
     def _extract_profilepage(self, url):
         yield from self._extract_page(url, 'ProfilePage')

+    def _extract_profilechannelpage(self, url):
+        yield from self._extract_page(url, 'ProfileChannelPage')
+
     def _extract_tagpage(self, url):
         yield from self._extract_page(url, 'TagPage')

+    def _extract_storiespage(self, url):
+        yield from self._extract_stories(url)
+

 class InstagramImageExtractor(InstagramExtractor):
     """Extractor for PostPage"""
     subcategory = "image"
-    pattern = r"(?:https?://)?(?:www\.)?instagram\.com/p/([^/?&#]+)"
+    pattern = r"(?:https?://)?(?:www\.)?instagram\.com/(?:p|tv)/([^/?&#]+)"
     test = (
         # GraphImage
         ("https://www.instagram.com/p/BqvsDleB3lV/", {
@@ -258,6 +345,22 @@ class InstagramImageExtractor(InstagramExtractor):
             }
         }),

+        # GraphVideo (IGTV)
+        ("https://www.instagram.com/tv/BkQjCfsBIzi/", {
+            "url": "64208f408e11cbbca86c2df4488e90262ae9d9ec",
+            "keyword": {
+                "date": "type:datetime",
+                "description": str,
+                "height": int,
+                "likes": int,
+                "media_id": "1806097553666903266",
+                "shortcode": "BkQjCfsBIzi",
+                "typename": "GraphVideo",
+                "username": "instagram",
+                "width": int,
+            }
+        }),
+
         # GraphSidecar with 2 embedded GraphVideo objects
         ("https://www.instagram.com/p/BtOvDOfhvRr/", {
             "count": 2,
@@ -283,10 +386,11 @@ class InstagramUserExtractor(InstagramExtractor):
     """Extractor for ProfilePage"""
     subcategory = "user"
     pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
-               r"/(?!p/|explore/|directory/|accounts/)([^/?&#]+)")
+               r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)"
+               r"([^/?&#]+)/?$")
     test = ("https://www.instagram.com/instagram/", {
-        "range": "1-12",
-        "count": ">= 12",
+        "range": "1-16",
+        "count": ">= 16",
     })

     def __init__(self, match):
@@ -298,6 +402,26 @@ class InstagramUserExtractor(InstagramExtractor):
         return self._extract_profilepage(url)


+class InstagramChannelExtractor(InstagramExtractor):
+    """Extractor for ProfilePage channel"""
+    subcategory = "channel"
+    pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
+               r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)"
+               r"([^/?&#]+)/channel")
+    test = ("https://www.instagram.com/instagram/channel/", {
+        "range": "1-16",
+        "count": ">= 16",
+    })
+
+    def __init__(self, match):
+        InstagramExtractor.__init__(self, match)
+        self.username = match.group(1)
+
+    def instagrams(self):
+        url = '{}/{}/channel/'.format(self.root, self.username)
+        return self._extract_profilechannelpage(url)
+
+
 class InstagramTagExtractor(InstagramExtractor):
     """Extractor for TagPage"""
     subcategory = "tag"
@@ -305,8 +429,8 @@ class InstagramTagExtractor(InstagramExtractor):
     pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
               r"/explore/tags/([^/?&#]+)")
     test = ("https://www.instagram.com/explore/tags/instagram/", {
-        "range": "1-12",
-        "count": ">= 12",
+        "range": "1-16",
+        "count": ">= 16",
     })

     def __init__(self, match):
@@ -319,3 +443,22 @@ class InstagramTagExtractor(InstagramExtractor):
     def instagrams(self):
         url = '{}/explore/tags/{}/'.format(self.root, self.tag)
         return self._extract_tagpage(url)
+
+
+class InstagramStoriesExtractor(InstagramExtractor):
+    """Extractor for StoriesPage"""
+    subcategory = "stories"
+    pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
+               r"/stories/([^/?&#]+)(?:/(\d+))?")
+    test = (
+        ("https://www.instagram.com/stories/instagram/"),
+        ("https://www.instagram.com/stories/highlights/18042509488170095/"),
+    )
+
+    def __init__(self, match):
+        InstagramExtractor.__init__(self, match)
+        self.username, self.highlight_id = match.groups()
+
+    def instagrams(self):
+        url = '{}/stories/{}/'.format(self.root, self.username)
+        return self._extract_storiespage(url)
```
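The hand-formatted ``variables`` blob in ``_extract_stories()`` can be easier to follow when written with ``json.dumps``; an equivalent sketch (user id made up, query hash taken from the patch):

```python
import json
from urllib.parse import quote

variables = json.dumps({
    "reel_ids": ["12345"],            # assumed user id
    "tag_names": [],
    "location_ids": [],
    "highlight_reel_ids": [],
    "precomposed_overlay": True,
    "show_story_viewer_list": True,
    "story_viewer_fetch_count": 50,
    "story_viewer_cursor": "",
    "stories_video_dash_manifest": False,
}, separators=(",", ":"))

url = ("https://www.instagram.com/graphql/query/"
       "?query_hash=cda12de4f7fd3719c0569ce03589f4c4"
       "&variables=" + quote(variables))
```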
```diff
diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py
index 879d38b..a73eb86 100644
--- a/gallery_dl/extractor/luscious.py
+++ b/gallery_dl/extractor/luscious.py
@@ -62,7 +62,7 @@ class LusciousAlbumExtractor(LusciousBase, GalleryExtractor):
     test = (
         ("https://luscious.net/albums/okinami-no-koigokoro_277031/", {
             "url": "7e4984a271a1072ac6483e4228a045895aff86f3",
-            "keyword": "ab4e5b71583fd439b4c8012a642aa8b58d8d0758",
+            "keyword": "07c0b915f2ab1cc3bbf28b76e7950fccee1213f3",
             "content": "b3a747a6464509440bd0ff6d1267e6959f8d6ff3",
         }),
         ("https://luscious.net/albums/virgin-killer-sweater_282582/", {
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 282c389..1ca1073 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -93,7 +93,7 @@ class NewgroundsUserExtractor(NewgroundsExtractor):
     test = (
         ("https://blitzwuff.newgrounds.com/art", {
             "url": "24b19c4a135a09889fac7b46a74e427e4308d02b",
-            "keyword": "98566e0c8096a8099b8d71962fea7e31c8b098d4",
+            "keyword": "62981f7bdd66e1f1c72ab1d9b932423c156bc9a1",
         }),
         ("https://blitzwuff.newgrounds.com/"),
     )
```
```diff
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 4884497..ab5932d 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -11,6 +11,8 @@ from .common import Extractor, Message
 from .. import text
 from ..cache import memcache
+import collections
+import json


 class PatreonExtractor(Extractor):
@@ -33,70 +35,92 @@ class PatreonExtractor(Extractor):

         for post in self.posts():
             yield Message.Directory, post
+            ids = set()
             post["num"] = 0
             content = post.get("content")
             postfile = post.get("post_file")

-            for url in text.extract_iter(content or "", 'src="', '"'):
+            for image in post["images"]:
+                url = image.get("download_url")
+                if not url:
+                    continue
+                ids.add(url.split("/")[-2])
+                name = image.get("file_name") or self._filename(url) or url
+
                 post["num"] += 1
-                yield Message.Url, url, text.nameext_from_url(url, post)
+                post["type"] = "image"
+                yield Message.Url, url, text.nameext_from_url(name, post)

-            if postfile:
+            if postfile and postfile["url"].split("/")[-2] not in ids:
                 post["num"] += 1
+                post["type"] = "postfile"
                 text.nameext_from_url(postfile["name"], post)
                 yield Message.Url, postfile["url"], post

             for attachment in post["attachments"]:
                 post["num"] += 1
+                post["type"] = "attachment"
                 text.nameext_from_url(attachment["name"], post)
                 yield Message.Url, attachment["url"], post

+            if content:
+                for url in text.extract_iter(content, 'src="', '"'):
+                    post["num"] += 1
+                    post["type"] = "content"
+                    yield Message.Url, url, text.nameext_from_url(url, post)
+
     def posts(self):
         """Return all relevant post objects"""

     def _pagination(self, url):
         headers = {"Referer": self.root}
-        empty = []

         while url:
             posts = self.request(url, headers=headers).json()

-            if "included" not in posts:
-                return
-
-            # collect attachments
-            attachments = {}
-            for inc in posts["included"]:
-                if inc["type"] == "attachment":
-                    attachments[inc["id"]] = inc["attributes"]
-
-            # update posts
-            for post in posts["data"]:
-                attr = post["attributes"]
-                attr["id"] = text.parse_int(post["id"])
-                attr["date"] = text.parse_datetime(
-                    attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
-                attr["creator"] = self._user(
-                    post["relationships"]["user"]["links"]["related"])
-
-                # add attachments to post attributes
-                files = post["relationships"].get("attachments")
-                if files:
-                    attr["attachments"] = [
-                        attachments[f["id"]]
-                        for f in files["data"]
-                    ]
-                else:
-                    attr["attachments"] = empty
-
-                yield attr
+            if "included" in posts:
+                included = self._transform(posts["included"])
+                for post in posts["data"]:
+                    yield self._process(post, included)

             if "links" not in posts:
                 return
             url = posts["links"].get("next")

+    def _process(self, post, included):
+        """Process and extend a 'post' object"""
+        attr = post["attributes"]
+        attr["id"] = text.parse_int(post["id"])
+        attr["images"] = self._files(post, included, "images")
+        attr["attachments"] = self._files(post, included, "attachments")
+        attr["date"] = text.parse_datetime(
+            attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
+        attr["creator"] = self._user(
+            post["relationships"]["user"]["links"]["related"])
+        return attr
+
+    @staticmethod
+    def _transform(included):
+        """Transform 'included' into an easier to handle format"""
+        result = collections.defaultdict(dict)
+        for inc in included:
+            result[inc["type"]][inc["id"]] = inc["attributes"]
+        return result
+
+    @staticmethod
+    def _files(post, included, key):
+        """Build a list of files"""
+        files = post["relationships"].get(key)
+        if files and files.get("data"):
+            return [
+                included[file["type"]][file["id"]]
+                for file in files["data"]
+            ]
+        return []
+
     @memcache(keyarg=1)
     def _user(self, url):
+        """Fetch user information"""
         user = self.request(url).json()["data"]
         attr = user["attributes"]
         attr["id"] = user["id"]
@@ -104,14 +128,21 @@ class PatreonExtractor(Extractor):
             attr["created"], "%Y-%m-%dT%H:%M:%S.%f%z")
         return attr

+    def _filename(self, url):
+        """Fetch filename from its Content-Disposition header"""
+        response = self.request(url, method="HEAD", fatal=False)
+        cd = response.headers.get("Content-Disposition")
+        return text.extract(cd, 'filename="', '"')[0]
+
     @staticmethod
     def _build_url(endpoint, query):
         return (
             "https://www.patreon.com/api/" + endpoint +

-            "?include=user,attachments,user_defined_tags,campaign,poll.choices"
-            ",poll.current_user_responses.user,poll.current_user_responses.cho"
-            "ice,poll.current_user_responses.poll,access_rules.tier.null"
+            "?include=user,images,attachments,user_defined_tags,campaign,poll."
+            "choices,poll.current_user_responses.user,poll.current_user_respon"
+            "ses.choice,poll.current_user_responses.poll,access_rules.tier.nul"
+            "l"

             "&fields[post]=change_visibility_at,comment_count,content,current_"
             "user_can_delete,current_user_can_view,current_user_has_liked,embe"
@@ -133,7 +164,8 @@ class PatreonCreatorExtractor(PatreonExtractor):
     """Extractor for a creator's works"""
     subcategory = "creator"
     pattern = (r"(?:https?://)?(?:www\.)?patreon\.com"
-               r"/(?!(?:home|join|login|signup)(?:$|[/?&#]))([^/?&#]+)/?")
+               r"/(?!(?:home|join|posts|login|signup)(?:$|[/?&#]))"
+               r"([^/?&#]+)/?")
     test = ("https://www.patreon.com/koveliana", {
         "range": "1-25",
         "count": ">= 25",
@@ -144,6 +176,7 @@ class PatreonCreatorExtractor(PatreonExtractor):
             "creator": dict,
             "date": "type:datetime",
             "id": int,
+            "images": list,
             "like_count": int,
             "post_type": str,
             "published_at": str,
@@ -181,3 +214,26 @@ class PatreonUserExtractor(PatreonExtractor):
             "&filter[is_following]=true"
         ))
         return self._pagination(url)
+
+
+class PatreonPostExtractor(PatreonExtractor):
+    """Extractor for media from a single post"""
+    subcategory = "post"
+    pattern = (r"(?:https?://)?(?:www\.)?patreon\.com"
+               r"/posts/[^/?&#]*?(\d+)")
+    test = ("https://www.patreon.com/posts/precious-metal-23563293", {
+        "count": 4,
+    })
+
+    def __init__(self, match):
+        PatreonExtractor.__init__(self, match)
+        self.post_id = match.group(1)
+
+    def posts(self):
+        url = "{}/posts/{}".format(self.root, self.post_id)
+        page = self.request(url).text
+        data = text.extract(page, "window.patreon.bootstrap,", "\n});")[0]
+        post = json.loads(data + "}")["post"]
+
+        included = self._transform(post["included"])
+        return (self._process(post["data"], included),)
```
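A toy run of the new ``_transform()``/``_files()`` pipeline, with made-up JSON:API data, to show why the ``defaultdict`` regrouping makes the lookups trivial:

```python
import collections

included = [  # hypothetical "included" array from the Patreon API
    {"type": "media", "id": "1", "attributes": {"file_name": "a.png"}},
    {"type": "attachment", "id": "7", "attributes": {"name": "b.zip"}},
]

result = collections.defaultdict(dict)
for inc in included:
    result[inc["type"]][inc["id"]] = inc["attributes"]

# posts can now resolve file references as included[type][id] in O(1):
assert result["media"]["1"]["file_name"] == "a.png"
assert result["attachment"]["7"]["name"] == "b.zip"
```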
PatreonExtractor(Extractor): attr["created"], "%Y-%m-%dT%H:%M:%S.%f%z") return attr + def _filename(self, url): + """Fetch filename from its Content-Disposition header""" + response = self.request(url, method="HEAD", fatal=False) + cd = response.headers.get("Content-Disposition") + return text.extract(cd, 'filename="', '"')[0] + @staticmethod def _build_url(endpoint, query): return ( "https://www.patreon.com/api/" + endpoint + - "?include=user,attachments,user_defined_tags,campaign,poll.choices" - ",poll.current_user_responses.user,poll.current_user_responses.cho" - "ice,poll.current_user_responses.poll,access_rules.tier.null" + "?include=user,images,attachments,user_defined_tags,campaign,poll." + "choices,poll.current_user_responses.user,poll.current_user_respon" + "ses.choice,poll.current_user_responses.poll,access_rules.tier.nul" + "l" "&fields[post]=change_visibility_at,comment_count,content,current_" "user_can_delete,current_user_can_view,current_user_has_liked,embe" @@ -133,7 +164,8 @@ class PatreonCreatorExtractor(PatreonExtractor): """Extractor for a creator's works""" subcategory = "creator" pattern = (r"(?:https?://)?(?:www\.)?patreon\.com" - r"/(?!(?:home|join|login|signup)(?:$|[/?&#]))([^/?&#]+)/?") + r"/(?!(?:home|join|posts|login|signup)(?:$|[/?&#]))" + r"([^/?&#]+)/?") test = ("https://www.patreon.com/koveliana", { "range": "1-25", "count": ">= 25", @@ -144,6 +176,7 @@ class PatreonCreatorExtractor(PatreonExtractor): "creator": dict, "date": "type:datetime", "id": int, + "images": list, "like_count": int, "post_type": str, "published_at": str, @@ -181,3 +214,26 @@ class PatreonUserExtractor(PatreonExtractor): "&filter[is_following]=true" )) return self._pagination(url) + + +class PatreonPostExtractor(PatreonExtractor): + """Extractor for media from a single post""" + subcategory = "post" + pattern = (r"(?:https?://)?(?:www\.)?patreon\.com" + r"/posts/[^/?&#]*?(\d+)") + test = ("https://www.patreon.com/posts/precious-metal-23563293", { + "count": 4, + }) + + def __init__(self, match): + PatreonExtractor.__init__(self, match) + self.post_id = match.group(1) + + def posts(self): + url = "{}/posts/{}".format(self.root, self.post_id) + page = self.request(url).text + data = text.extract(page, "window.patreon.bootstrap,", "\n});")[0] + post = json.loads(data + "}")["post"] + + included = self._transform(post["included"]) + return (self._process(post["data"], included),) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 76d4dc4..4f8ee9c 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -18,8 +18,8 @@ class PixivExtractor(Extractor): """Base class for pixiv extractors""" category = "pixiv" directory_fmt = ("{category}", "{user[id]} {user[account]}") - filename_fmt = "{category}_{user[id]}_{id}{num}.{extension}" - archive_fmt = "{id}{num}.{extension}" + filename_fmt = "{id}_p{num}.{extension}" + archive_fmt = "{id}{suffix}.{extension}" def __init__(self, match): Extractor.__init__(self, match) @@ -40,9 +40,10 @@ class PixivExtractor(Extractor): del work["meta_single_page"] del work["image_urls"] del work["meta_pages"] - work["num"] = "" + work["num"] = 0 work["tags"] = [tag["name"] for tag in work["tags"]] work["date"] = text.parse_datetime(work["create_date"]) + work["suffix"] = "" work.update(metadata) yield Message.Directory, work @@ -55,20 +56,17 @@ class PixivExtractor(Extractor): url = ugoira["zip_urls"]["medium"].replace( "_ugoira600x600", "_ugoira1920x1080") work["frames"] = ugoira["frames"] - work["extension"] = 
"zip" - yield Message.Url, url, work + yield Message.Url, url, text.nameext_from_url(url, work) elif work["page_count"] == 1: url = meta_single_page["original_image_url"] - work["extension"] = url.rpartition(".")[2] - yield Message.Url, url, work + yield Message.Url, url, text.nameext_from_url(url, work) else: - for num, img in enumerate(meta_pages): + for work["num"], img in enumerate(meta_pages): url = img["image_urls"]["original"] - work["num"] = "_p{:02}".format(num) - work["extension"] = url.rpartition(".")[2] - yield Message.Url, url, work + work["suffix"] = "_p{:02}".format(work["num"]) + yield Message.Url, url, text.nameext_from_url(url, work) def works(self): """Return an iterable containing all relevant 'work'-objects""" diff --git a/gallery_dl/extractor/pururin.py b/gallery_dl/extractor/pururin.py index fa4eb81..aa5c9c6 100644 --- a/gallery_dl/extractor/pururin.py +++ b/gallery_dl/extractor/pururin.py @@ -29,7 +29,7 @@ class PururinGalleryExtractor(GalleryExtractor): "artist" : ["Shoda Norihiro"], "group" : ["Obsidian Order"], "parody" : ["Kantai Collection"], - "characters": ["Iowa", "Teitoku"], + "characters": ["Admiral", "Iowa"], "tags" : list, "type" : "Doujinshi", "collection": "", diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py index 59d502a..f97454b 100644 --- a/gallery_dl/extractor/reactor.py +++ b/gallery_dl/extractor/reactor.py @@ -117,6 +117,8 @@ class ReactorExtractor(SharedConfigMixin, Extractor): url = text.extract(image, ' src="', '"')[0] if not url: continue + if url.startswith("//"): + url = "http:" + url width = text.extract(image, ' width="', '"')[0] height = text.extract(image, ' height="', '"')[0] image_id = url.rpartition("-")[2].partition(".")[0] @@ -268,8 +270,8 @@ class JoyreactorPostExtractor(ReactorPostExtractor): "keyword": "dbe148d576f2fc9431020c557ddb78f449e48c47", }), ("http://joyreactor.com/post/3668724", { # youtube embed - "url": "be2589e2e8f3ffcaf41b34bc28bfad850ccea34a", - "keyword": "da61b9e2887db95759950df5fb89c9d32f8e7651", + "url": "bf1666eddcff10c9b58f6be63fa94e4e13074214", + "keyword": "989112c7888e9cc80fd35870180c6c98165d953b", }), ("http://joyreactor.cc/post/1299", { # "malformed" JSON "url": "ac900743ed7cf1baf3db3b531c3bc414bf1ffcde", diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 2ba4b99..94e95e8 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -234,7 +234,7 @@ class RedditAPI(): url = "https://oauth.reddit.com" + endpoint params["raw_json"] = 1 self.authenticate() - response = self.extractor.request(url, params=params, fatal=False) + response = self.extractor.request(url, params=params, fatal=None) remaining = response.headers.get("x-ratelimit-remaining") if remaining and float(remaining) < 2: wait = int(response.headers["x-ratelimit-reset"]) diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index da9735e..bb8a2ae 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -40,17 +40,18 @@ class SankakuExtractor(SharedConfigMixin, Extractor): def items(self): self.login() - data = self.get_metadata() yield Message.Version, 1 - yield Message.Directory, data + data = self.get_metadata() for post_id in util.advance(self.get_posts(), self.start_post): self.wait() post = self.get_post_data(post_id) url = post["file_url"] post.update(data) - yield Message.Url, url, text.nameext_from_url(url, post) + text.nameext_from_url(url, post) + yield Message.Directory, post + yield Message.Url, 
url, post def skip(self, num): self.start_post += num diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py index afd4eaa..38b7813 100644 --- a/gallery_dl/extractor/sexcom.py +++ b/gallery_dl/extractor/sexcom.py @@ -78,6 +78,7 @@ class SexcomExtractor(Extractor): path += "/hd" data["url"] = self.root + path else: + data["extension"] = None data["url"] = "ytdl:" + text.extract( extr('<iframe', '>'), ' src="', '"')[0] else: diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py index 5ad372d..8567155 100644 --- a/gallery_dl/extractor/simplyhentai.py +++ b/gallery_dl/extractor/simplyhentai.py @@ -8,14 +8,16 @@ """Extract hentai-manga from https://www.simply-hentai.com/""" -from .common import GalleryExtractor, Extractor, Message +from .common import GalleryExtractor from .. import text, util, exception +import json class SimplyhentaiGalleryExtractor(GalleryExtractor): """Extractor for image galleries from simply-hentai.com""" category = "simplyhentai" archive_fmt = "{image_id}" + root = "https://www.simply-hentai.com" pattern = (r"(?:https?://)?(?!videos\.)([\w-]+\.simply-hentai\.com" r"(?!/(?:album|gifs?|images?|series)(?:/|$))" r"(?:/(?!(?:page|all-pages)(?:/|\.|$))[^/?&#]+)+)") @@ -23,7 +25,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor): (("https://original-work.simply-hentai.com" "/amazon-no-hiyaku-amazon-elixir"), { "url": "258289249990502c3138719cb89e995a60861e49", - "keyword": "eba83ccdbab3022a2280c77aa747f9458196138b", + "keyword": "8b2400e4b466e8f46802fa5a6b917d2788bb7e8e", }), ("https://www.simply-hentai.com/notfound", { "exception": exception.GalleryDLException, @@ -40,144 +42,30 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor): self.session.headers["Referer"] = url def metadata(self, page): - extr = text.extract_from(page) - split = text.split_html - - title = extr('<meta property="og:title" content="', '"') - if not title: + path = text.extract(page, '<a class="preview" href="', '"')[0] + if not path: raise exception.NotFoundError("gallery") - data = { - "title" : text.unescape(title), - "gallery_id": text.parse_int(extr('/Album/', '/')), - "parody" : split(extr('box-title">Series</div>', '</div>')), - "language" : text.remove_html(extr( - 'box-title">Language</div>', '</div>')) or None, - "characters": split(extr('box-title">Characters</div>', '</div>')), - "tags" : split(extr('box-title">Tags</div>', '</div>')), - "artist" : split(extr('box-title">Artists</div>', '</div>')), - "date" : text.parse_datetime(text.remove_html( - extr('Uploaded', '</div>')), "%d.%m.%Y"), + page = self.request(self.root + path).text + data = json.loads(text.unescape(text.extract( + page, 'data-react-class="Reader" data-react-props="', '"')[0])) + self.manga = manga = data["manga"] + + return { + "title" : manga["title"], + "parody" : manga["series"]["title"], + "language" : manga["language"]["name"], + "lang" : util.language_to_code(manga["language"]["name"]), + "characters": [x["name"] for x in manga["characters"]], + "tags" : [x["name"] for x in manga["tags"]], + "artist" : [x["name"] for x in manga["artists"]], + "gallery_id": text.parse_int(text.extract( + manga["images"][0]["sizes"]["full"], "/Album/", "/")[0]), + "date" : text.parse_datetime( + manga["publish_date"], "%Y-%m-%dT%H:%M:%S.%f%z"), } - data["lang"] = util.language_to_code(data["language"]) - return data def images(self, _): - url = self.chapter_url + "/all-pages" - headers = {"Accept": "application/json"} - images = self.request(url, headers=headers).json() 
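Back in the patreon.py hunks above, `_transform()` re-indexes the API's `included` array by type and id, `_files()` resolves a post's relationship references through that index, and the second-to-last URL segment serves as the ID that keeps a `post_file` from being downloaded twice when it already appeared as an image. A minimal sketch of the same bookkeeping, using made-up objects in the assumed shape of Patreon's API response:

    import collections

    # made-up objects in the assumed shape of Patreon's "included" array
    included = [
        {"type": "media", "id": "1",
         "attributes": {"download_url": "https://c.example/p/x/1/f.jpg"}},
        {"type": "attachment", "id": "7",
         "attributes": {"name": "bonus.zip",
                        "url": "https://c.example/p/x/7/bonus.zip"}},
    ]

    # _transform(): index attributes by (type, id) for O(1) lookup
    index = collections.defaultdict(dict)
    for inc in included:
        index[inc["type"]][inc["id"]] = inc["attributes"]

    # _files(): resolve a post's relationship references through that index
    post = {"relationships": {"images": {"data": [{"type": "media", "id": "1"}]}}}
    images = [index[f["type"]][f["id"]]
              for f in post["relationships"]["images"]["data"]]
    print(images[0]["download_url"])

    # the ID used to de-duplicate 'post_file' against the images above
    print("https://c.example/p/x/1/f.jpg".split("/")[-2])  # '1'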
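The rewritten simply-hentai `metadata()` directly above stops scraping HTML info boxes and instead loads the JSON that the site embeds in a `data-react-props` attribute. The technique in isolation, on a hypothetical page snippet (`text.extract` and `text.unescape` are the same gallery_dl helpers used in the hunk):

    import json
    from gallery_dl import text

    # hypothetical page snippet; the attribute value is HTML-escaped JSON
    page = ('<div data-react-class="Reader" data-react-props='
            '"{&quot;manga&quot;:{&quot;title&quot;:&quot;Example&quot;}}">')

    props = text.extract(page, 'data-react-props="', '"')[0]
    data = json.loads(text.unescape(props))
    print(data["manga"]["title"])  # Example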
return [ - (urls["full"], {"image_id": text.parse_int(image_id)}) - for image_id, urls in sorted(images.items()) + (image["sizes"]["full"], {"image_id": image["id"]}) + for image in self.manga["images"] ] - - -class SimplyhentaiImageExtractor(Extractor): - """Extractor for individual images from simply-hentai.com""" - category = "simplyhentai" - subcategory = "image" - directory_fmt = ("{category}", "{type}s") - filename_fmt = "{category}_{token}{title:?_//}.{extension}" - archive_fmt = "{token}" - pattern = (r"(?:https?://)?(?:www\.)?(simply-hentai\.com" - r"/(image|gif)/[^/?&#]+)") - test = ( - (("https://www.simply-hentai.com/image" - "/pheromomania-vol-1-kanzenban-isao-3949d8b3-400c-4b6"), { - "url": "0338eb137830ab6f81e5f410d3936ef785d063d9", - "keyword": "e10e5588481cab68329ef6ec1e5325206b2079a2", - }), - ("https://www.simply-hentai.com/gif/8915dfcf-0b6a-47c", { - "url": "11c060d7ec4dfd0bd105300b6e1fd454674a5af1", - "keyword": "dd97a4bb449c397d6fec9f43a1303c0fb168ae65", - }), - ) - - def __init__(self, match): - Extractor.__init__(self, match) - self.page_url = "https://www." + match.group(1) - self.type = match.group(2) - - def items(self): - extr = text.extract_from(self.request(self.page_url).text) - title = extr('"og:title" content="' , '"') - descr = extr('"og:description" content="', '"') - url = extr('&quot;image&quot;:&quot;' , '&') - url = extr("&quot;content&quot;:&quot;", "&") or url - - tags = text.extract(descr, " tagged with ", " online for free ")[0] - if tags: - tags = tags.split(", ") - tags[-1] = tags[-1].partition(" ")[2] - else: - tags = [] - - data = text.nameext_from_url(url, { - "title": text.unescape(title) if title else "", - "tags": tags, - "type": self.type, - }) - data["token"] = data["filename"].rpartition("_")[2] - - yield Message.Version, 1 - yield Message.Directory, data - yield Message.Url, url, data - - -class SimplyhentaiVideoExtractor(Extractor): - """Extractor for hentai videos from simply-hentai.com""" - category = "simplyhentai" - subcategory = "video" - directory_fmt = ("{category}", "{type}s") - filename_fmt = "{title}{episode:?_//>02}.{extension}" - archive_fmt = "{title}_{episode}" - pattern = r"(?:https?://)?(videos\.simply-hentai\.com/[^/?&#]+)" - test = ( - ("https://videos.simply-hentai.com/creamy-pie-episode-02", { - "pattern": r"https://www\.googleapis\.com/drive/v3/files" - r"/0B1ecQ8ZVLm3JcHZzQzBnVy1ZUmc\?alt=media&key=[\w-]+", - "keyword": "706790708b14773efc1e075ddd3b738a375348a5", - "count": 1, - }), - (("https://videos.simply-hentai.com" - "/1715-tifa-in-hentai-gang-bang-3d-movie"), { - "url": "ad9a36ae06c601b6490e3c401834b4949d947eb0", - "keyword": "f9dad94fbde9c95859e631ff4f07297a9567b874", - }), - ) - - def __init__(self, match): - Extractor.__init__(self, match) - self.page_url = "https://" + match.group(1) - - def items(self): - page = self.request(self.page_url).text - - title, pos = text.extract(page, "<title>", "</title>") - tags , pos = text.extract(page, ">Tags</div>", "</div>", pos) - date , pos = text.extract(page, ">Upload Date</div>", "</div>", pos) - title = title.rpartition(" - ")[0] - - if "<video" in page: - video_url = text.extract(page, '<source src="', '"', pos)[0] - episode = 0 - else: - # video url from myhentai.tv embed - pos = page.index('<div class="video-frame-container">', pos) - embed_url = text.extract(page, 'src="', '"', pos)[0].replace( - "embedplayer.php?link=", "embed.php?name=") - embed_page = self.request(embed_url).text - video_url = text.extract(embed_page, '"file":"', '"')[0] - title, _, episode = title.rpartition(" 
Episode ") - - data = text.nameext_from_url(video_url, { - "title": text.unescape(title), - "episode": text.parse_int(episode), - "tags": text.split_html(tags)[::2], - "type": "video", - "date": text.parse_datetime(text.remove_html( - date), "%B %d, %Y %H:%M"), - }) - - yield Message.Version, 1 - yield Message.Directory, data - yield Message.Url, video_url, data diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index ccba640..3672a6d 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -54,6 +54,7 @@ class TwitterExtractor(Extractor): if self.videos and "-videoContainer" in tweet: data["num"] = 1 + data["extension"] = None url = "ytdl:{}/{}/status/{}".format( self.root, data["user"], data["tweet_id"]) yield Message.Url, url, data diff --git a/gallery_dl/extractor/wikiart.py b/gallery_dl/extractor/wikiart.py index b9c223c..463733f 100644 --- a/gallery_dl/extractor/wikiart.py +++ b/gallery_dl/extractor/wikiart.py @@ -70,7 +70,7 @@ class WikiartArtistExtractor(WikiartExtractor): pattern = BASE_PATTERN + r"/(?!\w+-by-)([\w-]+)" test = ("https://www.wikiart.org/en/thomas-cole", { "url": "f1eee8158f5b8b7380382ab730a8f53884715c8b", - "keyword": "b62678394ce645815963883d5c9642255307225f", + "keyword": "c61f5a4774b977106000e9554d19cfb9438a7032", }) def __init__(self, match): diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py index 9699806..23750db 100644 --- a/gallery_dl/extractor/xhamster.py +++ b/gallery_dl/extractor/xhamster.py @@ -13,13 +13,16 @@ from .. import text import json -BASE_PATTERN = r"(?:https?://)?(?:[^.]+\.)?xhamster\.(?:com|one|desi)" +BASE_PATTERN = r"(?:https?://)?((?:[^.]+\.)?xhamster\d?\.(?:com|one|desi))" class XhamsterExtractor(Extractor): """Base class for xhamster extractors""" category = "xhamster" - root = "https://xhamster.com" + + def __init__(self, match): + Extractor.__init__(self, match) + self.root = "https://" + match.group(1) class XhamsterGalleryExtractor(XhamsterExtractor): @@ -66,16 +69,21 @@ class XhamsterGalleryExtractor(XhamsterExtractor): }, }, }), + ("https://jp.xhamster2.com/photos/gallery/11748968", { + "pattern": r"https://thumb-p\d+.xhcdn.com/./[\w/-]+_1000.jpg$", + "count": ">= 144", + }), ("https://xhamster.com/photos/gallery/make-the-world-better-11748968"), ("https://xhamster.com/photos/gallery/11748968"), ("https://xhamster.one/photos/gallery/11748968"), ("https://xhamster.desi/photos/gallery/11748968"), + ("https://xhamster2.com/photos/gallery/11748968"), ("https://en.xhamster.com/photos/gallery/11748968"), ) def __init__(self, match): XhamsterExtractor.__init__(self, match) - self.path = match.group(1) + self.path = match.group(2) self.data = None def items(self): @@ -154,7 +162,7 @@ class XhamsterUserExtractor(XhamsterExtractor): def __init__(self, match): XhamsterExtractor.__init__(self, match) - self.user = match.group(1) + self.user = match.group(2) def items(self): yield Message.Version, 1 diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 637561a..6d81e66 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -196,7 +196,7 @@ class DownloadJob(Job): archive = self.archive # prepare download - pathfmt.set_keywords(keywords) + pathfmt.set_filename(keywords) if postprocessors: for pp in postprocessors: @@ -316,7 +316,9 @@ class DownloadJob(Job): skip = self.extractor.config("skip", True) if skip: self._skipexc = None - if isinstance(skip, str): + if skip == "enumerate": + self.pathfmt.check_file = self.pathfmt._enum_file + elif 
isinstance(skip, str): skip, _, smax = skip.partition(":") if skip == "abort": self._skipexc = exception.StopExtraction @@ -334,7 +336,8 @@ class DownloadJob(Job): postprocessors = self.extractor.config("postprocessors") if postprocessors: - self.postprocessors = [] + pp_list = [] + for pp_dict in postprocessors: whitelist = pp_dict.get("whitelist") blacklist = pp_dict.get("blacklist") @@ -353,16 +356,19 @@ class DownloadJob(Job): "'%s' initialization failed: %s: %s", name, exc.__class__.__name__, exc) else: - self.postprocessors.append(pp_obj) - self.extractor.log.debug( - "Active postprocessor modules: %s", self.postprocessors) + pp_list.append(pp_obj) + + if pp_list: + self.postprocessors = pp_list + self.extractor.log.debug( + "Active postprocessor modules: %s", pp_list) class SimulationJob(DownloadJob): """Simulate the extraction process without downloading anything""" def handle_url(self, url, keywords, fallback=None): - self.pathfmt.set_keywords(keywords) + self.pathfmt.set_filename(keywords) self.out.skip(self.pathfmt.path) if self.sleep: time.sleep(self.sleep) diff --git a/gallery_dl/oauth.py b/gallery_dl/oauth.py index 8a12755..69ab4f6 100644 --- a/gallery_dl/oauth.py +++ b/gallery_dl/oauth.py @@ -127,6 +127,6 @@ class OAuth1API(): self.api_key = api_key def request(self, url, method="GET", **kwargs): - kwargs["fatal"] = False + kwargs["fatal"] = None kwargs["session"] = self.session return self.extractor.request(url, method, **kwargs) diff --git a/gallery_dl/option.py b/gallery_dl/option.py index af70fc8..ecc2ee3 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -307,7 +307,8 @@ def build_parser(): "--ugoira-conv", dest="postprocessors", action="append_const", const={"name": "ugoira", "ffmpeg-args": ( - "-c:v", "libvpx", "-crf", "4", "-b:v", "5000k", "-an")}, + "-c:v", "libvpx", "-crf", "4", "-b:v", "5000k", "-an"), + "whitelist": ("pixiv", "danbooru")}, help="Convert Pixiv Ugoira to WebM (requires FFmpeg)", ) postprocessor.add_argument( diff --git a/gallery_dl/postprocessor/classify.py b/gallery_dl/postprocessor/classify.py index 62460d3..4a9bde9 100644 --- a/gallery_dl/postprocessor/classify.py +++ b/gallery_dl/postprocessor/classify.py @@ -33,17 +33,24 @@ class ClassifyPP(PostProcessor): } def prepare(self, pathfmt): - ext = pathfmt.keywords.get("extension") - + ext = pathfmt.extension if ext in self.mapping: - self._dir = pathfmt.realdirectory + os.sep + self.mapping[ext] - pathfmt.realpath = self._dir + os.sep + pathfmt.filename - else: - self._dir = None + # set initial paths to enable download skips + self._build_paths(pathfmt, self.mapping[ext]) def run(self, pathfmt): - if self._dir: - os.makedirs(self._dir, exist_ok=True) + ext = pathfmt.extension + if ext in self.mapping: + # rebuild paths in case the filename extension changed + path = self._build_paths(pathfmt, self.mapping[ext]) + os.makedirs(path, exist_ok=True) + + @staticmethod + def _build_paths(pathfmt, extra): + path = pathfmt.realdirectory + extra + pathfmt.realpath = path + os.sep + pathfmt.filename + pathfmt.path = pathfmt.directory + extra + os.sep + pathfmt.filename + return path __postprocessor__ = ClassifyPP diff --git a/gallery_dl/postprocessor/common.py b/gallery_dl/postprocessor/common.py index c642f0f..b967cf6 100644 --- a/gallery_dl/postprocessor/common.py +++ b/gallery_dl/postprocessor/common.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018 Mike Fährmann +# Copyright 2018-2019 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it 
under the terms of the GNU General Public License version 2 as @@ -23,3 +23,6 @@ class PostProcessor(): def finalize(self): """Cleanup""" + + def __repr__(self): + return self.__class__.__name__ diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py index 77be9c7..467ef11 100644 --- a/gallery_dl/postprocessor/metadata.py +++ b/gallery_dl/postprocessor/metadata.py @@ -36,15 +36,14 @@ class MetadataPP(PostProcessor): def run(self, pathfmt): path = "{}.{}".format(pathfmt.realpath, self.extension) with open(path, "w", encoding="utf-8") as file: - self.write(file, pathfmt) + self.write(file, pathfmt.kwdict) - def _write_custom(self, file, pathfmt): - output = self.formatter.format_map(pathfmt.keywords) + def _write_custom(self, file, kwdict): + output = self.formatter.format_map(kwdict) file.write(output) - def _write_tags(self, file, pathfmt): - kwds = pathfmt.keywords - tags = kwds.get("tags") or kwds.get("tag_string") + def _write_tags(self, file, kwdict): + tags = kwdict.get("tags") or kwdict.get("tag_string") if not tags: return @@ -58,8 +57,8 @@ class MetadataPP(PostProcessor): file.write("\n".join(tags)) file.write("\n") - def _write_json(self, file, pathfmt): - util.dump_json(pathfmt.keywords, file, self.ascii, self.indent) + def _write_json(self, file, kwdict): + util.dump_json(kwdict, file, self.ascii, self.indent) __postprocessor__ = MetadataPP diff --git a/gallery_dl/postprocessor/mtime.py b/gallery_dl/postprocessor/mtime.py index 03d2f11..7065428 100644 --- a/gallery_dl/postprocessor/mtime.py +++ b/gallery_dl/postprocessor/mtime.py @@ -19,9 +19,9 @@ class MtimePP(PostProcessor): self.key = options.get("key", "date") def run(self, pathfmt): - mtime = pathfmt.keywords.get(self.key) + mtime = pathfmt.kwdict.get(self.key) ts = getattr(mtime, "timestamp", None) - pathfmt.keywords["_mtime"] = ts() if ts else parse_int(mtime) + pathfmt.kwdict["_mtime"] = ts() if ts else parse_int(mtime) __postprocessor__ = MtimePP diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py index bd8c5ad..0dbb796 100644 --- a/gallery_dl/postprocessor/ugoira.py +++ b/gallery_dl/postprocessor/ugoira.py @@ -52,13 +52,13 @@ class UgoiraPP(PostProcessor): def prepare(self, pathfmt): self._frames = None - if pathfmt.keywords["extension"] != "zip": + if pathfmt.extension != "zip": return - if "frames" in pathfmt.keywords: - self._frames = pathfmt.keywords["frames"] - elif "pixiv_ugoira_frame_data" in pathfmt.keywords: - self._frames = pathfmt.keywords["pixiv_ugoira_frame_data"]["data"] + if "frames" in pathfmt.kwdict: + self._frames = pathfmt.kwdict["frames"] + elif "pixiv_ugoira_frame_data" in pathfmt.kwdict: + self._frames = pathfmt.kwdict["pixiv_ugoira_frame_data"]["data"] else: return diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 02d998d..79fa175 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -391,10 +391,18 @@ class Formatter(): if field_name: self.fields.append(( len(self.result), - self._field_access(field_name, format_spec, conversion) + self._field_access(field_name, format_spec, conversion), )) self.result.append("") + if len(self.result) == 1: + if self.fields: + self.format_map = self.fields[0][1] + else: + self.format_map = lambda _: format_string + del self.result + del self.fields + def format_map(self, kwargs): """Apply 'kwargs' to the initial format_string and return its result""" for index, func in self.fields: @@ -512,48 +520,63 @@ class Formatter(): class PathFormat(): def __init__(self, extractor): - 
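The `Formatter.__init__` hunk above adds a fast path: a format string that is exactly one replacement field has `format_map` swapped for that field's accessor, and a field-free string becomes a constant function. A toy reimplementation of just that dispatch, not gallery-dl's actual parser:

    import string

    def make_formatter(fmt):
        # toy version of the fast path in Formatter.__init__ above
        parsed = list(string.Formatter().parse(fmt))
        if len(parsed) == 1 and parsed[0][0] == "" and parsed[0][1] is not None:
            name = parsed[0][1]
            return lambda kw: str(kw[name])    # lone "{field}": direct access
        if all(p[1] is None for p in parsed):
            return lambda kw: fmt              # no fields: constant string
        return lambda kw: fmt.format_map(kw)   # general case

    print(make_formatter("{id}")({"id": 123}))           # 123
    print(make_formatter("static")({}))                  # static
    print(make_formatter("{a}-{b}")({"a": 1, "b": 2}))   # 1-2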
self.filename_fmt = extractor.config( - "filename", extractor.filename_fmt) - self.directory_fmt = extractor.config( - "directory", extractor.directory_fmt) - self.kwdefault = extractor.config("keywords-default") + filename_fmt = extractor.config("filename", extractor.filename_fmt) + directory_fmt = extractor.config("directory", extractor.directory_fmt) + kwdefault = extractor.config("keywords-default") try: - self.formatter = Formatter(self.filename_fmt, self.kwdefault) + self.filename_formatter = Formatter( + filename_fmt, kwdefault).format_map except Exception as exc: raise exception.FormatError(exc, "filename") - self.delete = False - self.has_extension = False - self.keywords = {} - self.filename = "" + try: + self.directory_formatters = [ + Formatter(dirfmt, kwdefault).format_map + for dirfmt in directory_fmt + ] + except Exception as exc: + raise exception.FormatError(exc, "directory") + self.directory = self.realdirectory = "" + self.filename = "" + self.extension = "" + self.prefix = "" + self.kwdict = {} + self.delete = False self.path = self.realpath = self.temppath = "" - self.basedirectory = expand_path( + basedir = expand_path( extractor.config("base-directory", (".", "gallery-dl"))) - if os.altsep and os.altsep in self.basedirectory: - self.basedirectory = self.basedirectory.replace(os.altsep, os.sep) + if os.altsep and os.altsep in basedir: + basedir = basedir.replace(os.altsep, os.sep) + if basedir[-1] != os.sep: + basedir += os.sep + self.basedirectory = basedir - restrict = extractor.config("restrict-filenames", "auto") + restrict = extractor.config("path-restrict", "auto") if restrict == "auto": - restrict = "<>:\"\\/|?*" if os.name == "nt" else "/" + restrict = "\\\\|/<>:\"?*" if os.name == "nt" else "/" elif restrict == "unix": restrict = "/" elif restrict == "windows": - restrict = "<>:\"\\/|?*" - self.clean_path = self._build_cleanfunc(restrict) + restrict = "\\\\|/<>:\"?*" + + remove = extractor.config("path-remove", "\x00-\x1f\x7f") + + self.clean_segment = self._build_cleanfunc(restrict, "_") + self.clean_path = self._build_cleanfunc(remove, "") @staticmethod - def _build_cleanfunc(repl): - if not repl: + def _build_cleanfunc(chars, repl): + if not chars: return lambda x: x - elif len(repl) == 1: - def func(x, r=repl): - return x.replace(r, "_") + elif len(chars) == 1: + def func(x, c=chars, r=repl): + return x.replace(c, r) else: - def func(x, sub=re.compile("[" + re.escape(repl) + "]").sub): - return sub("_", x) + def func(x, sub=re.compile("[" + chars + "]").sub, r=repl): + return sub(r, x) return func def open(self, mode="wb"): @@ -562,68 +585,91 @@ class PathFormat(): def exists(self, archive=None): """Return True if the file exists on disk or in 'archive'""" - if archive and archive.check(self.keywords): + if archive and self.kwdict in archive: return self.fix_extension() - if self.has_extension and os.path.exists(self.realpath): - return True + if self.extension and os.path.exists(self.realpath): + return self.check_file() return False - def set_directory(self, keywords): + @staticmethod + def check_file(): + return True + + def _enum_file(self): + num = 1 + while True: + self.prefix = str(num) + "." 
+ self.set_extension(self.extension, False) + if not os.path.exists(self.realpath): + return False + num += 1 + + def set_directory(self, kwdict): """Build directory path and create it if necessary""" + + # Build path segments by applying 'kwdict' to directory format strings try: segments = [ - self.clean_path( - Formatter(segment, self.kwdefault) - .format_map(keywords).strip()) - for segment in self.directory_fmt + self.clean_segment(format_map(kwdict).strip()) + for format_map in self.directory_formatters ] except Exception as exc: raise exception.FormatError(exc, "directory") - self.directory = os.path.join( - self.basedirectory, - *segments - ) + # Join path segments + sep = os.sep + directory = self.clean_path(self.basedirectory + sep.join(segments)) - # remove trailing path separator; - # occurs if the last argument to os.path.join() is an empty string - if self.directory[-1] == os.sep: - self.directory = self.directory[:-1] + # Ensure directory ends with a path separator + if directory[-1] != sep: + directory += sep + self.directory = directory - self.realdirectory = self.adjust_path(self.directory) + # Enable longer-than-260-character paths on Windows + if os.name == "nt": + self.realdirectory = "\\\\?\\" + os.path.abspath(directory) + sep + else: + self.realdirectory = directory + + # Create directory tree os.makedirs(self.realdirectory, exist_ok=True) - def set_keywords(self, keywords): - """Set filename keywords""" - self.keywords = keywords - self.temppath = "" - self.has_extension = bool(keywords.get("extension")) - if self.has_extension: + def set_filename(self, kwdict): + """Set general filename data""" + self.kwdict = kwdict + self.temppath = self.prefix = "" + self.extension = kwdict["extension"] + + if self.extension: self.build_path() def set_extension(self, extension, real=True): - """Set the 'extension' keyword""" - self.has_extension = real - self.keywords["extension"] = extension + """Set filename extension""" + if real: + self.extension = extension + self.kwdict["extension"] = self.prefix + extension self.build_path() def fix_extension(self, _=None): - if not self.has_extension: - self.set_extension("") + """Fix filenames without a given filename extension""" + if not self.extension: + self.set_extension("", False) if self.path[-1] == ".": self.path = self.path[:-1] self.temppath = self.realpath = self.realpath[:-1] return True def build_path(self): - """Use filename-keywords and directory to build a full path""" + """Use filename metadata and directory to build a full path""" + + # Apply 'kwdict' to filename format string try: - self.filename = self.clean_path( - self.formatter.format_map(self.keywords)) + self.filename = filename = self.clean_path(self.clean_segment( + self.filename_formatter(self.kwdict))) except Exception as exc: raise exception.FormatError(exc, "filename") - filename = os.sep + self.filename + # Combine directory and filename to full paths self.path = self.directory + filename self.realpath = self.realdirectory + filename if not self.temppath: @@ -631,7 +677,7 @@ def part_enable(self, part_directory=None): """Enable .part file usage""" - if self.has_extension: + if self.extension: self.temppath += ".part" else: self.set_extension("part", False) @@ -657,16 +703,16 @@ return if self.temppath != self.realpath: - # move temp file to its actual location + # Move temp file to its actual location try: os.replace(self.temppath, self.realpath) except OSError: shutil.copyfile(self.temppath, self.realpath) 
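To make the new `path-restrict`/`path-remove` split concrete: directory and filename segments are cleaned with a replacement character, while the joined path only has control characters removed. A standalone mirror of `_build_cleanfunc` with the default character sets from the hunks above:

    import re

    def build_cleanfunc(chars, repl):
        # mirror of PathFormat._build_cleanfunc above
        if not chars:
            return lambda x: x
        if len(chars) == 1:
            return lambda x: x.replace(chars, repl)
        sub = re.compile("[" + chars + "]").sub
        return lambda x: sub(repl, x)

    clean_segment = build_cleanfunc("\\\\|/<>:\"?*", "_")  # path-restrict (nt)
    clean_path    = build_cleanfunc("\x00-\x1f\x7f", "")   # path-remove default

    print(clean_segment('a<b>: c?'))  # a_b__ c_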
os.unlink(self.temppath) - if "_mtime" in self.keywords: - # set file modification time - mtime = self.keywords["_mtime"] + if "_mtime" in self.kwdict: + # Set file modification time + mtime = self.kwdict["_mtime"] if mtime: try: if isinstance(mtime, str): @@ -675,11 +721,6 @@ class PathFormat(): except Exception: pass - @staticmethod - def adjust_path(path): - """Enable longer-than-260-character paths on windows""" - return "\\\\?\\" + os.path.abspath(path) if os.name == "nt" else path - class DownloadArchive(): @@ -693,8 +734,8 @@ class DownloadArchive(): "archive-format", extractor.archive_fmt) ).format_map - def check(self, kwdict): - """Return True if item described by 'kwdict' exists in archive""" + def __contains__(self, kwdict): + """Return True if the item described by 'kwdict' exists in archive""" key = self.keygen(kwdict) self.cursor.execute( "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,)) diff --git a/gallery_dl/version.py b/gallery_dl/version.py index d9cc3d6..911939d 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.10.1" +__version__ = "1.10.2" diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh index 334671e..d8c8a03 100755 --- a/scripts/run_tests.sh +++ b/scripts/run_tests.sh @@ -2,7 +2,7 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -TESTS_CORE=(config cookies downloader extractor oauth text util) +TESTS_CORE=(config cookies downloader extractor oauth postprocessor text util) TESTS_RESULTS=(results) diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 498e3fc..78963aa 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -24,7 +24,8 @@ CATEGORY_MAP = { "dynastyscans" : "Dynasty Reader", "e621" : "e621", "erolord" : "EroLord.com", - "exhentai" : "E-Hentai", + "e-hentai" : "E-Hentai", + "exhentai" : "ExHentai", "fallenangels" : "Fallen Angels Scans", "fashionnova" : "Fashion Nova", "hbrowse" : "HBrowse", @@ -109,6 +110,7 @@ SUBCATEGORY_MAP = { AUTH_MAP = { "danbooru" : "Optional", "deviantart" : "Optional (OAuth)", + "e-hentai" : "Optional", "exhentai" : "Optional", "flickr" : "Optional (OAuth)", "idolcomplex": "Optional", @@ -203,6 +205,15 @@ def build_extractor_list(): for extrlist in extractors.values(): extrlist.sort(key=subcategory_key) + # ugly hack to add e-hentai.org + eh = [] + for extr in extractors["exhentai"]: + class eh_extr(extr): + category = "e-hentai" + root = "https://e-hentai.org" + eh.append(eh_extr) + extractors["e-hentai"] = eh + # sort lists by category return sorted( extractors.values(), diff --git a/test/test_downloader.py b/test/test_downloader.py index caed983..0f58d4e 100644 --- a/test/test_downloader.py +++ b/test/test_downloader.py @@ -120,7 +120,7 @@ class TestDownloaderBase(unittest.TestCase): } pathfmt = PathFormat(cls.extractor) pathfmt.set_directory(kwdict) - pathfmt.set_keywords(kwdict) + pathfmt.set_filename(kwdict) if content: mode = "w" + ("b" if isinstance(content, bytes) else "") @@ -145,7 +145,7 @@ class TestDownloaderBase(unittest.TestCase): # test filename extension self.assertEqual( - pathfmt.keywords["extension"], + pathfmt.extension, expected_extension, ) self.assertEqual( diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py new file mode 100644 index 0000000..786dc46 --- /dev/null +++ b/test/test_postprocessor.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 
-*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +import os.path +import zipfile +import tempfile +from datetime import datetime, timezone as tz + +import unittest +from unittest.mock import Mock, mock_open, patch + +from gallery_dl import postprocessor, extractor, util, config +from gallery_dl.postprocessor.common import PostProcessor + + +class MockPostprocessorModule(Mock): + __postprocessor__ = "mock" + + +class TestPostprocessorModule(unittest.TestCase): + + def setUp(self): + postprocessor._cache.clear() + + def test_find(self): + for name in (postprocessor.modules): + cls = postprocessor.find(name) + self.assertEqual(cls.__name__, name.capitalize() + "PP") + self.assertIs(cls.__base__, PostProcessor) + + self.assertEqual(postprocessor.find("foo"), None) + self.assertEqual(postprocessor.find(1234) , None) + self.assertEqual(postprocessor.find(None) , None) + + @patch("importlib.import_module") + def test_cache(self, import_module): + import_module.return_value = MockPostprocessorModule() + + for name in (postprocessor.modules): + postprocessor.find(name) + self.assertEqual(import_module.call_count, len(postprocessor.modules)) + + # no new calls to import_module + for name in (postprocessor.modules): + postprocessor.find(name) + self.assertEqual(import_module.call_count, len(postprocessor.modules)) + + +class BasePostprocessorTest(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.extractor = extractor.find("test:") + cls.dir = tempfile.TemporaryDirectory() + cls.fnum = 0 + config.set(("base-directory",), cls.dir.name) + + @classmethod + def tearDownClass(cls): + cls.dir.cleanup() + config.clear() + + def _create(self, options=None, data=None): + kwdict = {"category": "test", "filename": "file", "extension": "ext"} + if options is None: + options = {} + if data is not None: + kwdict.update(data) + + self.pathfmt = util.PathFormat(self.extractor) + self.pathfmt.set_directory(kwdict) + self.pathfmt.set_filename(kwdict) + + pp = postprocessor.find(self.__class__.__name__[:-4].lower()) + return pp(self.pathfmt, options) + + +class ClassifyTest(BasePostprocessorTest): + + def test_classify_default(self): + pp = self._create() + + self.assertEqual(pp.mapping, { + ext: directory + for directory, exts in pp.DEFAULT_MAPPING.items() + for ext in exts + }) + self.pathfmt.set_extension("jpg") + + pp.prepare(self.pathfmt) + path = os.path.join(self.dir.name, "test", "Pictures") + self.assertEqual(self.pathfmt.path, path + "/file.jpg") + self.assertEqual(self.pathfmt.realpath, path + "/file.jpg") + + with patch("os.makedirs") as mkdirs: + pp.run(self.pathfmt) + mkdirs.assert_called_once_with(path, exist_ok=True) + + def test_classify_noop(self): + pp = self._create() + rp = self.pathfmt.realpath + + pp.prepare(self.pathfmt) + self.assertEqual(self.pathfmt.path, rp) + self.assertEqual(self.pathfmt.realpath, rp) + + with patch("os.makedirs") as mkdirs: + pp.run(self.pathfmt) + self.assertEqual(mkdirs.call_count, 0) + + def test_classify_custom(self): + pp = self._create({"mapping": { + "foo/bar": ["foo", "bar"], + }}) + + self.assertEqual(pp.mapping, { + "foo": "foo/bar", + "bar": "foo/bar", + }) + self.pathfmt.set_extension("foo") + + pp.prepare(self.pathfmt) + path = os.path.join(self.dir.name, "test", "foo", "bar") + self.assertEqual(self.pathfmt.path, path + "/file.foo") + 
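The `test_cache` case above asserts that `postprocessor.find()` imports each module at most once. A sketch of that lookup-cache pattern (the real `find()` also handles bad names; `__postprocessor__` is the module attribute seen in classify.py earlier, and gallery_dl is assumed to be importable):

    import importlib

    _cache = {}

    def find(name):
        # import each postprocessor module once, then serve from the cache
        try:
            return _cache[name]
        except KeyError:
            module = importlib.import_module("gallery_dl.postprocessor." + name)
            cls = _cache[name] = module.__postprocessor__
            return cls

    print(find("classify") is find("classify"))  # True; second call never imports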
self.assertEqual(self.pathfmt.realpath, path + "/file.foo") + + with patch("os.makedirs") as mkdirs: + pp.run(self.pathfmt) + mkdirs.assert_called_once_with(path, exist_ok=True) + + +class MetadataTest(BasePostprocessorTest): + + def test_metadata_default(self): + pp = self._create() + + # default arguments + self.assertEqual(pp.write , pp._write_json) + self.assertEqual(pp.ascii , False) + self.assertEqual(pp.indent , 4) + self.assertEqual(pp.extension, "json") + + def test_metadata_json(self): + pp = self._create({ + "mode" : "json", + "ascii" : True, + "indent" : 2, + "extension": "JSON", + }) + + self.assertEqual(pp.write , pp._write_json) + self.assertEqual(pp.ascii , True) + self.assertEqual(pp.indent , 2) + self.assertEqual(pp.extension, "JSON") + + with patch("builtins.open", mock_open()) as m: + pp.prepare(self.pathfmt) + pp.run(self.pathfmt) + + path = self.pathfmt.realpath + ".JSON" + m.assert_called_once_with(path, "w", encoding="utf-8") + self.assertEqual(self._output(m), """{ + "category": "test", + "extension": "ext", + "filename": "file" +} +""") + + def test_metadata_tags(self): + pp = self._create({"mode": "tags"}, {"tags": ["foo", "bar", "baz"]}) + self.assertEqual(pp.write, pp._write_tags) + self.assertEqual(pp.extension, "txt") + + with patch("builtins.open", mock_open()) as m: + pp.prepare(self.pathfmt) + pp.run(self.pathfmt) + + path = self.pathfmt.realpath + ".txt" + m.assert_called_once_with(path, "w", encoding="utf-8") + self.assertEqual(self._output(m), "foo\nbar\nbaz\n") + + def test_metadata_tags_split_1(self): + pp = self._create({"mode": "tags"}, {"tags": "foo, bar, baz"}) + with patch("builtins.open", mock_open()) as m: + pp.prepare(self.pathfmt) + pp.run(self.pathfmt) + self.assertEqual(self._output(m), "foo\nbar\nbaz\n") + + def test_metadata_tags_split_2(self): + pp = self._create( + {"mode": "tags"}, + {"tags": "foobar1 foobar2 foobarbaz"}, + ) + with patch("builtins.open", mock_open()) as m: + pp.prepare(self.pathfmt) + pp.run(self.pathfmt) + self.assertEqual(self._output(m), "foobar1\nfoobar2\nfoobarbaz\n") + + def test_metadata_tags_tagstring(self): + pp = self._create({"mode": "tags"}, {"tag_string": "foo, bar, baz"}) + with patch("builtins.open", mock_open()) as m: + pp.prepare(self.pathfmt) + pp.run(self.pathfmt) + self.assertEqual(self._output(m), "foo\nbar\nbaz\n") + + def test_metadata_custom(self): + pp = self._create( + {"mode": "custom", "format": "{foo}\n{missing}\n"}, + {"foo": "bar"}, + ) + self.assertEqual(pp.write, pp._write_custom) + self.assertEqual(pp.extension, "txt") + self.assertTrue(pp.formatter) + + with patch("builtins.open", mock_open()) as m: + pp.prepare(self.pathfmt) + pp.run(self.pathfmt) + self.assertEqual(self._output(m), "bar\nNone\n") + + @staticmethod + def _output(mock): + return "".join( + call[1][0] + for call in mock.mock_calls + if call[0] == "().write" + ) + + +class MtimeTest(BasePostprocessorTest): + + def test_mtime_default(self): + pp = self._create() + self.assertEqual(pp.key, "date") + + def test_mtime_datetime(self): + pp = self._create(None, {"date": datetime(1980, 1, 1, tzinfo=tz.utc)}) + pp.prepare(self.pathfmt) + pp.run(self.pathfmt) + self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800) + + def test_mtime_timestamp(self): + pp = self._create(None, {"date": 315532800}) + pp.prepare(self.pathfmt) + pp.run(self.pathfmt) + self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800) + + def test_mtime_custom(self): + pp = self._create({"key": "foo"}, {"foo": 315532800}) + pp.prepare(self.pathfmt) + 
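`test_metadata_custom` above depends on `util.Formatter` substituting its default (None) for keys missing from the keyword dict instead of raising KeyError; assuming gallery_dl is importable, the behavior can be reproduced directly:

    from gallery_dl import util

    fmt = util.Formatter("{foo}\n{missing}\n")
    print(fmt.format_map({"foo": "bar"}), end="")
    # bar
    # None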
pp.run(self.pathfmt) + self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800) + + +class ZipTest(BasePostprocessorTest): + + def test_zip_default(self): + pp = self._create() + self.assertEqual(pp.path, self.pathfmt.realdirectory) + self.assertEqual(pp.run, pp._write) + self.assertEqual(pp.delete, True) + self.assertFalse(hasattr(pp, "args")) + self.assertEqual(pp.zfile.compression, zipfile.ZIP_STORED) + self.assertEqual( + pp.zfile.filename, self.pathfmt.realdirectory + ".zip") + + def test_zip_options(self): + pp = self._create({ + "keep-files": True, + "compression": "zip", + "extension": "cbz", + }) + self.assertEqual(pp.delete, False) + self.assertEqual(pp.zfile.compression, zipfile.ZIP_DEFLATED) + self.assertEqual( + pp.zfile.filename, self.pathfmt.realdirectory + ".cbz") + + def test_zip_safe(self): + pp = self._create({"mode": "safe"}) + self.assertEqual(pp.delete, True) + self.assertEqual(pp.path, self.pathfmt.realdirectory) + self.assertEqual(pp.run, pp._write_safe) + self.assertEqual(pp.args, ( + pp.path + ".zip", "a", zipfile.ZIP_STORED, True + )) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/test_results.py b/test/test_results.py index 839a75c..12f2416 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -26,12 +26,9 @@ TRAVIS_SKIP = { # temporary issues, etc. BROKEN = { - "adultempire", - "flickr", + "8chan", "imgth", - "mangafox", "mangapark", - "pixnet", } @@ -90,13 +87,17 @@ class TestExtractorResults(unittest.TestCase): # test archive-id uniqueness self.assertEqual(len(set(tjob.list_archive)), len(tjob.list_archive)) - # test '_extractor' entries if tjob.queue: + # test '_extractor' entries for url, kwdict in zip(tjob.list_url, tjob.list_keyword): if "_extractor" in kwdict: extr = kwdict["_extractor"].from_url(url) self.assertIsInstance(extr, kwdict["_extractor"]) self.assertEqual(extr.url, url) + else: + # test 'extension' entries + for kwdict in tjob.list_keyword: + self.assertIn("extension", kwdict) # test extraction results if "url" in result: @@ -168,7 +169,6 @@ class ResultJob(job.DownloadJob): if content: self.fileobj = TestPathfmt(self.hash_content) - self.get_downloader("http").check_extension = lambda a, b: None self.format_directory = TestFormatter( "".join(self.extractor.directory_fmt)) @@ -222,8 +222,8 @@ class TestPathfmt(): def __init__(self, hashobj): self.hashobj = hashobj self.path = "" self.size = 0 - self.keywords = {} - self.has_extension = True + self.kwdict = {} + self.extension = "jpg" def __enter__(self): return self @@ -280,6 +280,7 @@ def setup_test_config(): config.clear() config.set(("cache", "file"), ":memory:") config.set(("downloader", "part"), False) + config.set(("downloader", "adjust-extensions"), False) config.set(("extractor", "timeout"), 60) config.set(("extractor", "username"), name) config.set(("extractor", "password"), name)
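Tying the earlier job.py and util.py hunks together: with `skip` set to `"enumerate"`, `check_file` is swapped for `_enum_file`, which neither overwrites nor skips an existing file but prefixes the extension with a counter until the name is free. A filesystem-level sketch of just the naming scheme, not the actual implementation:

    import os

    def enum_path(path):
        # naming scheme only: insert "1.", "2.", ... before the extension
        # until the name is unused (cf. the new "prefix" attribute above)
        if not os.path.exists(path):
            return path
        root, ext = os.path.splitext(path)
        num = 1
        while os.path.exists("{}.{}{}".format(root, num, ext)):
            num += 1
        return "{}.{}{}".format(root, num, ext)

    # a second download of "file.ext" would then be written as "file.1.ext"
    print(enum_path("downloads/file.ext"))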

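Finally, renaming `DownloadArchive.check()` to `__contains__` is what lets `PathFormat.exists()` above test membership with `kwdict in archive`. A self-contained miniature of the same sqlite pattern (simplified: the real class derives its key from the extractor's `archive_fmt`):

    import sqlite3

    class MiniArchive:
        """Simplified stand-in for gallery_dl.util.DownloadArchive"""

        def __init__(self, path=":memory:"):
            self.conn = sqlite3.connect(path)
            self.conn.execute(
                "CREATE TABLE IF NOT EXISTS archive (entry TEXT PRIMARY KEY)")

        def __contains__(self, kwdict):
            key = "{category}{id}".format_map(kwdict)
            return self.conn.execute(
                "SELECT 1 FROM archive WHERE entry=? LIMIT 1",
                (key,)).fetchone() is not None

        def add(self, kwdict):
            self.conn.execute(
                "INSERT OR IGNORE INTO archive (entry) VALUES (?)",
                ("{category}{id}".format_map(kwdict),))

    archive = MiniArchive()
    archive.add({"category": "test", "id": 1})
    print({"category": "test", "id": 1} in archive)  # True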