From b75d158d014d6c43d7d785c46c9372a9cf84d144 Mon Sep 17 00:00:00 2001 From: Unit 193 Date: Mon, 26 Aug 2019 19:34:45 -0400 Subject: New upstream version 1.10.2 --- .travis.yml | 14 +- CHANGELOG.md | 24 ++- README.rst | 8 +- docs/configuration.rst | 60 +++++- docs/gallery-dl.conf | 5 +- docs/supportedsites.rst | 12 +- gallery_dl/downloader/http.py | 18 +- gallery_dl/downloader/ytdl.py | 4 +- gallery_dl/extractor/adultempire.py | 8 +- gallery_dl/extractor/artstation.py | 1 + gallery_dl/extractor/booru.py | 8 +- gallery_dl/extractor/common.py | 3 +- gallery_dl/extractor/deviantart.py | 394 +++++++++++++++++++++++------------ gallery_dl/extractor/gelbooru.py | 8 +- gallery_dl/extractor/hitomi.py | 2 +- gallery_dl/extractor/imagebam.py | 6 +- gallery_dl/extractor/imgbb.py | 33 ++- gallery_dl/extractor/imgur.py | 67 ++++-- gallery_dl/extractor/instagram.py | 169 +++++++++++++-- gallery_dl/extractor/luscious.py | 2 +- gallery_dl/extractor/newgrounds.py | 2 +- gallery_dl/extractor/patreon.py | 130 ++++++++---- gallery_dl/extractor/pixiv.py | 20 +- gallery_dl/extractor/pururin.py | 2 +- gallery_dl/extractor/reactor.py | 6 +- gallery_dl/extractor/reddit.py | 2 +- gallery_dl/extractor/sankaku.py | 7 +- gallery_dl/extractor/sexcom.py | 1 + gallery_dl/extractor/simplyhentai.py | 162 +++----------- gallery_dl/extractor/twitter.py | 1 + gallery_dl/extractor/wikiart.py | 2 +- gallery_dl/extractor/xhamster.py | 16 +- gallery_dl/job.py | 20 +- gallery_dl/oauth.py | 2 +- gallery_dl/option.py | 3 +- gallery_dl/postprocessor/classify.py | 23 +- gallery_dl/postprocessor/common.py | 5 +- gallery_dl/postprocessor/metadata.py | 15 +- gallery_dl/postprocessor/mtime.py | 4 +- gallery_dl/postprocessor/ugoira.py | 10 +- gallery_dl/util.py | 179 ++++++++++------ gallery_dl/version.py | 2 +- scripts/run_tests.sh | 2 +- scripts/supportedsites.py | 13 +- test/test_downloader.py | 4 +- test/test_postprocessor.py | 294 ++++++++++++++++++++++++++ test/test_results.py | 17 +- 47 files changed, 1261 insertions(+), 529 deletions(-) create mode 100644 test/test_postprocessor.py diff --git a/.travis.yml b/.travis.yml index 6158941..4b3a2cd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,11 +16,18 @@ matrix: env: GALLERYDL_TESTS=results - language: minimal dist: xenial + env: GALLERYDL_TESTS=snap addons: snaps: - name: snapcraft classic: true - env: SNAP_TESTS=true + install: + - true + script: + - sudo apt update + - snapcraft --destructive-mode + - sudo snap try + - snap run gallery-dl --verbose https://twitter.com/ubuntu/status/1121001597092364288 git: depth: 3 @@ -31,6 +38,7 @@ branches: - /^v\d+\.\d+\.\d+(-\S*)?$/ - /^test(-\w+)+$/ +install: + - pip install -r requirements.txt pyOpenSSL script: - - 'if test "${SNAP_TESTS}" != true; then ./scripts/run_tests.sh; else true; fi' - - 'if test "${SNAP_TESTS}" = true; then sudo apt update && snapcraft --destructive-mode && sudo snap try && snap run gallery-dl --verbose https://twitter.com/ubuntu/status/1121001597092364288; else true; fi' + - ./scripts/run_tests.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 58e295c..99df78a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,30 @@ # Changelog +## 1.10.2 - 2019-08-23 +### Additions +- Support for `instagram` stories and IGTV ([#371](https://github.com/mikf/gallery-dl/issues/371), [#373](https://github.com/mikf/gallery-dl/issues/373)) +- Support for individual `imgbb` images ([#363](https://github.com/mikf/gallery-dl/issues/363)) +- `deviantart.quality` option to set the JPEG compression quality for newer images 
([#369](https://github.com/mikf/gallery-dl/issues/369)) +- `enumerate` option for `extractor.skip` ([#306](https://github.com/mikf/gallery-dl/issues/306)) +- `adjust-extensions` option to control filename extension adjustments +- `path-remove` option to remove control characters etc. from filesystem paths +### Changes +- Rename `restrict-filenames` to `path-restrict` +- Adjust `pixiv` metadata and default filename format ([#366](https://github.com/mikf/gallery-dl/issues/366)) + - Set `filename` to `"{category}_{user[id]}_{id}{suffix}.{extension}"` to restore the old default +- Improve and optimize directory and filename generation +### Fixes +- Allow the `classify` post-processor to handle files with unknown filename extension ([#138](https://github.com/mikf/gallery-dl/issues/138)) +- Fix rate limit handling for OAuth APIs ([#368](https://github.com/mikf/gallery-dl/issues/368)) +- Fix artwork and scraps extraction on `deviantart` ([#376](https://github.com/mikf/gallery-dl/issues/376), [#392](https://github.com/mikf/gallery-dl/issues/392)) +- Distinguish between `imgur` album and gallery URLs ([#380](https://github.com/mikf/gallery-dl/issues/380)) +- Prevent crash when using `--ugoira-conv` ([#382](https://github.com/mikf/gallery-dl/issues/382)) +- Handle multi-image posts on `patreon` ([#383](https://github.com/mikf/gallery-dl/issues/383)) +- Miscellaneous fixes for `*reactor`, `simplyhentai` + ## 1.10.1 - 2019-08-02 ## Fixes -- Restore functionality of both domains for `exhentai` extractors +- Use the correct domain for exhentai.org input URLs ## 1.10.0 - 2019-08-01 ### Warning diff --git a/README.rst b/README.rst index 3bca007..e62a7ec 100644 --- a/README.rst +++ b/README.rst @@ -78,8 +78,8 @@ Download a standalone executable file, put it into your `PATH `__, and run it inside a command prompt (like ``cmd.exe``). -- `Windows `__ -- `Linux `__ +- `Windows `__ +- `Linux `__ These executables include a Python 3.7 interpreter and all required Python packages. @@ -224,13 +224,13 @@ access to *gallery-dl*. Authorize it and you will be shown one or more .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst -.. _stable: https://github.com/mikf/gallery-dl/archive/v1.10.1.zip +.. _stable: https://github.com/mikf/gallery-dl/archive/v1.10.2.zip .. _dev: https://github.com/mikf/gallery-dl/archive/master.zip .. _Python: https://www.python.org/downloads/ .. _PyPI: https://pypi.org/ .. _pip: https://pip.pypa.io/en/stable/ -.. _Requests: http://docs.python-requests.org/en/master/ +.. _Requests: https://2.python-requests.org/en/master/#requests-http-for-humans .. _FFmpeg: https://www.ffmpeg.org/ .. _youtube-dl: https://ytdl-org.github.io/youtube-dl/ .. _pyOpenSSL: https://pyopenssl.org/ diff --git a/docs/configuration.rst b/docs/configuration.rst index c6f757d..0e2e355 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -108,21 +108,36 @@ Description Directory path used as the base for all download destinations. =========== ===== -extractor.*.restrict-filenames ------------------------------- +extractor.*.path-restrict +------------------------- =========== ===== Type ``string`` Default ``"auto"`` -Example ``"/!? ()[]{}"`` -Description Characters to replace with underscores (``_``) when generating - directory and file names. +Example ``"/!? 
(){}"`` +Description Set of characters to replace with underscores (``_``) + in generated path segment names. Special values: * ``"auto"``: Use characters from ``"unix"`` or ``"windows"`` depending on the local operating system * ``"unix"``: ``"/"`` - * ``"windows"``: ``"<>:\"\\|/?*"`` + * ``"windows"``: ``"\\\\|/<>:\"?*"`` + + Note: In a set with 2 or more characters, ``[]^-\`` need to be + escaped with backslashes, e.g. ``"\\[\\]"`` +=========== ===== + + +extractor.*.path-remove +----------------------- +=========== ===== +Type ``string`` +Default ``"\\u0000-\\u001f\\u007f"`` (ASCII control characters) +Description Set of characters to remove from generated path names. + + Note: In a set with 2 or more characters, ``[]^-\`` need to be + escaped with backslashes, e.g. ``"\\[\\]"`` =========== ===== @@ -131,8 +146,11 @@ extractor.*.skip =========== ===== Type ``bool`` or ``string`` Default ``true`` -Description Controls the behavior when downloading files whose filename - already exists. +Description Controls the behavior when downloading files that have been + downloaded before, i.e. a file with the same filename already + exists or its ID is in a `download archive`__. + + __ `extractor.*.archive`_ * ``true``: Skip downloads * ``false``: Overwrite already existing files @@ -144,6 +162,9 @@ Description Controls the behavior when downloading files whose filename * ``"exit"``: Exit the program altogether * ``"exit:N"``: Skip downloads and exit the program after ``N`` consecutive skips + + * ``"enumerate"``: Append a numeric suffix to the end of the + original filename (``file.ext.1``, ``file.ext.2``, etc) =========== ===== @@ -555,6 +576,15 @@ Description Download original files if available. =========== ===== +extractor.deviantart.quality +---------------------------- +=========== ===== +Type ``integer`` +Default ``100`` +Description JPEG compression quality for newer images hosted on wixmp servers. +=========== ===== + + extractor.deviantart.refresh-token ---------------------------------- =========== ===== @@ -1098,6 +1128,16 @@ Description Certificate validation during file downloads. =========== ===== +downloader.http.adjust-extensions +--------------------------------- +=========== ===== +Type ``bool`` +Default ``true`` +Description Check the file headers of ``jpg``, ``png``, and ``gif`` files + and adjust their filename extensions if they do not match. +=========== ===== + + downloader.ytdl.format ---------------------- =========== ===== @@ -1772,7 +1812,7 @@ Description An object with the ``name`` of a post-processor and its options. .. _timeout: https://docs.python-requests.org/en/latest/user/advanced/#timeouts .. _verify: https://docs.python-requests.org/en/master/user/advanced/#ssl-cert-verification .. _Last-Modified: https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.29 -.. _`Requests' proxy documentation`: http://docs.python-requests.org/en/master/user/advanced/#proxies +.. _`Requests' proxy documentation`: https://2.python-requests.org/en/master/user/advanced/#proxies .. _format string: https://docs.python.org/3/library/string.html#formatstrings .. _format strings: https://docs.python.org/3/library/string.html#formatstrings .. _strptime: https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior @@ -1780,5 +1820,5 @@ Description An object with the ``name`` of a post-processor and its options. .. _webbrowser.open(): https://docs.python.org/3/library/webbrowser.html .. _datetime: https://docs.python.org/3/library/datetime.html#datetime-objects .. 
_datetime.max: https://docs.python.org/3/library/datetime.html#datetime.datetime.max -.. _Authentication: https://github.com/mikf/gallery-dl#5authentication +.. _Authentication: https://github.com/mikf/gallery-dl#authentication .. _youtube-dl: https://github.com/ytdl-org/youtube-dl diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index a4a9ee0..b9ff32d 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -8,8 +8,9 @@ "proxy": null, "skip": true, "sleep": 0, + "path-restrict": "auto", + "path-remove": "\\u0000-\\u001f\\u007f", "user-agent": "Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0", - "restrict-filenames": "auto", "artstation": { @@ -30,6 +31,7 @@ "mature": true, "metadata": false, "original": true, + "quality": 100, "wait-min": 0 }, "exhentai": @@ -154,6 +156,7 @@ "http": { + "adjust-extensions": true, "mtime": true, "rate": null, "retries": 4, diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst index d2fb4ea..05c8555 100644 --- a/docs/supportedsites.rst +++ b/docs/supportedsites.rst @@ -28,6 +28,7 @@ Dynasty Reader https://dynasty-scans.com/ Chapters, individual Im E-Hentai https://e-hentai.org/ Favorites, Galleries, Search Results Optional e621 https://e621.net/ Pools, Popular Images, Posts, Tag-Searches EroLord.com http://erolord.com/ Galleries +ExHentai https://exhentai.org/ Favorites, Galleries, Search Results Optional Fallen Angels Scans https://www.fascans.com/ Chapters, Manga Fashion Nova https://www.fashionnova.com/ Collections, Products Fireden https://boards.fireden.net/ Threads @@ -47,11 +48,11 @@ Hypnohub https://hypnohub.net/ Pools, Popular Images, Idol Complex https://idol.sankakucomplex.com/ Pools, Posts, Tag-Searches Optional ImageBam http://www.imagebam.com/ Galleries, individual Images ImageFap https://imagefap.com/ Images from Users, Galleries, individual Images -ImgBB https://imgbb.com/ Images from Users, Albums Optional +ImgBB https://imgbb.com/ Images from Users, Albums, individual Images Optional imgbox https://imgbox.com/ Galleries, individual Images imgth https://imgth.com/ Galleries -imgur https://imgur.com/ Albums, individual Images -Instagram https://www.instagram.com/ Images from Users, individual Images, Tag-Searches Optional +imgur https://imgur.com/ Albums, Galleries, individual Images +Instagram https://www.instagram.com/ |instagram-C| Optional Jaimini's Box https://jaiminisbox.com/reader/ Chapters, Manga Joyreactor http://joyreactor.cc/ |joyreactor-C| Keenspot http://www.keenspot.com/ Comics @@ -77,7 +78,7 @@ Niconico Seiga https://seiga.nicovideo.jp/ Images from Users, indi nijie https://nijie.info/ |nijie-C| Required NSFWalbum.com https://nsfwalbum.com/ Albums Nyafuu Archive https://archive.nyafuu.org/ Threads -Patreon https://www.patreon.com/ Images from Users, Creators +Patreon https://www.patreon.com/ Images from Users, Creators, Posts Pawoo https://pawoo.net/ Images from Users, Images from Statuses Photobucket https://photobucket.com/ Albums, individual Images Piczel https://piczel.tv/ Images from Users, Folders, individual Images @@ -100,7 +101,7 @@ Sankaku Complex https://www.sankakucomplex.com/ Articles, Tag-Searches Sen Manga https://raw.senmanga.com/ Chapters Sense-Scans http://sensescans.com/reader/ Chapters, Manga Sex.com https://www.sex.com/ Boards, Pins, related Pins, Search Results -Simply Hentai https://www.simply-hentai.com/ Galleries, individual Images, Videos +Simply Hentai https://www.simply-hentai.com/ Galleries SlickPic https://www.slickpic.com/ Images from Users, Albums 
SlideShare https://www.slideshare.net/ Presentations SmugMug https://www.smugmug.com/ |smugmug-C| Optional (OAuth) @@ -134,6 +135,7 @@ Turboimagehost https://www.turboimagehost.com/ individual Images .. |deviantart-C| replace:: Collections, Deviations, Favorites, Folders, Galleries, Journals, Popular Images, Scraps, Sta.sh .. |flickr-C| replace:: Images from Users, Albums, Favorites, Galleries, Groups, individual Images, Search Results .. |hentaifoundry-C| replace:: Images from Users, Favorites, individual Images, Popular Images, Recent Images, Scraps +.. |instagram-C| replace:: Images from Users, Channels, individual Images, Stories, Tag-Searches .. |joyreactor-C| replace:: Images from Users, Posts, Search Results, Tag-Searches .. |nijie-C| replace:: Images from Users, Doujin, Favorites, individual Images .. |pixiv-C| replace:: Images from Users, Favorites, Follows, pixiv.me Links, Rankings, Search Results, Individual Images diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 7a95191..e3229eb 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -26,6 +26,7 @@ class HttpDownloader(DownloaderBase): def __init__(self, extractor, output): DownloaderBase.__init__(self, extractor, output) + self.adjust_extension = self.config("adjust-extensions", True) self.retries = self.config("retries", extractor._retries) self.timeout = self.config("timeout", extractor._timeout) self.verify = self.config("verify", extractor._verify) @@ -59,7 +60,6 @@ class HttpDownloader(DownloaderBase): def _download_impl(self, url, pathfmt): response = None - adj_ext = None tries = 0 msg = "" @@ -103,7 +103,7 @@ class HttpDownloader(DownloaderBase): elif code == 206: # Partial Content offset = filesize size = response.headers["Content-Range"].rpartition("/")[2] - elif code == 416: # Requested Range Not Satisfiable + elif code == 416 and filesize: # Requested Range Not Satisfiable break else: msg = "{}: {} for url: {}".format(code, response.reason, url) @@ -114,7 +114,7 @@ class HttpDownloader(DownloaderBase): size = text.parse_int(size) # set missing filename extension - if not pathfmt.has_extension: + if not pathfmt.extension: pathfmt.set_extension(self.get_extension(response)) if pathfmt.exists(): pathfmt.temppath = "" @@ -152,15 +152,16 @@ class HttpDownloader(DownloaderBase): continue # check filename extension - adj_ext = self.check_extension(file, pathfmt) + if self.adjust_extension: + adj_ext = self.check_extension(file, pathfmt.extension) + if adj_ext: + pathfmt.set_extension(adj_ext) break self.downloading = False - if adj_ext: - pathfmt.set_extension(adj_ext) if self.mtime: - pathfmt.keywords["_mtime"] = response.headers.get("Last-Modified") + pathfmt.kwdict["_mtime"] = response.headers.get("Last-Modified") return True def receive(self, response, file): @@ -196,9 +197,8 @@ class HttpDownloader(DownloaderBase): return "txt" @staticmethod - def check_extension(file, pathfmt): + def check_extension(file, extension): """Check filename extension against fileheader""" - extension = pathfmt.keywords["extension"] if extension in FILETYPE_CHECK: file.seek(0) header = file.read(8) diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py index a233487..7d8b905 100644 --- a/gallery_dl/downloader/ytdl.py +++ b/gallery_dl/downloader/ytdl.py @@ -50,7 +50,7 @@ class YoutubeDLDownloader(DownloaderBase): return False if "entries" in info_dict: - index = pathfmt.keywords.get("_ytdl_index") + index = pathfmt.kwdict.get("_ytdl_index") if index is None: 
return self._download_playlist(pathfmt, info_dict) else: @@ -59,7 +59,7 @@ class YoutubeDLDownloader(DownloaderBase): def _download_video(self, pathfmt, info_dict): if "url" in info_dict: - text.nameext_from_url(info_dict["url"], pathfmt.keywords) + text.nameext_from_url(info_dict["url"], pathfmt.kwdict) pathfmt.set_extension(info_dict["ext"]) if pathfmt.exists(): pathfmt.temppath = "" diff --git a/gallery_dl/extractor/adultempire.py b/gallery_dl/extractor/adultempire.py index 5ea835f..5e2480a 100644 --- a/gallery_dl/extractor/adultempire.py +++ b/gallery_dl/extractor/adultempire.py @@ -21,12 +21,12 @@ class AdultempireGalleryExtractor(GalleryExtractor): test = ( ("https://www.adultempire.com/5998/gallery.html", { "range": "1", - "keyword": "0533ef1184892be8ac02b17286797c95f389ba63", + "keyword": "25c8171f5623678491a0d7bdf38a7a6ebfa4a361", "content": "5c6beb31e5e3cdc90ee5910d5c30f9aaec977b9e", }), ("https://www.adultdvdempire.com/5683/gallery.html", { "url": "b12cd1a65cae8019d837505adb4d6a2c1ed4d70d", - "keyword": "59fe5d95929efc5040a819a5f77aba7a022bb85a", + "keyword": "0fe9a6e3f0a331b95ba77f66a643705ca86e8ec5", }), ) @@ -42,8 +42,8 @@ class AdultempireGalleryExtractor(GalleryExtractor): "studio" : extr(">studio", "<").strip(), "date" : text.parse_datetime(extr( ">released", "<").strip(), "%m/%d/%Y"), - "actors" : text.split_html(extr( - '
    = 400 or not deviation_id: - raise exception.NotFoundError("image") - return (self.api.deviation(deviation_id),) - - class DeviantartStashExtractor(DeviantartExtractor): """Extractor for sta.sh-ed deviations""" subcategory = "stash" @@ -558,54 +503,6 @@ class DeviantartJournalExtractor(DeviantartExtractor): return self.api.browse_user_journals(self.user, self.offset) -class DeviantartScrapsExtractor(DeviantartExtractor): - """Extractor for an artist's scraps""" - subcategory = "scraps" - directory_fmt = ("{category}", "{username}", "Scraps") - archive_fmt = "s_{username}_{index}.{extension}" - pattern = BASE_PATTERN + r"/gallery/\?catpath=scraps\b" - test = ( - ("https://www.deviantart.com/shimoda7/gallery/?catpath=scraps", { - "count": 12, - "options": (("original", False),), - }), - ("https://shimoda7.deviantart.com/gallery/?catpath=scraps"), - ) - - def deviations(self): - url = "{}/{}/gallery/?catpath=scraps".format(self.root, self.user) - page = self._html_request(url).text - csrf, pos = text.extract(page, '"csrf":"', '"') - iid , pos = text.extract(page, '"requestid":"', '"', pos) - - url = "https://www.deviantart.com/dapi/v1/gallery/0" - data = { - "username": self.user, - "offset": self.offset, - "limit": "24", - "catpath": "scraps", - "_csrf": csrf, - "dapiIid": iid + "-jsok7403-1.1" - } - - while True: - content = self.request( - url, method="POST", data=data).json()["content"] - - for item in content["results"]: - if item["html"].startswith('
    ', '<').lower(), + } + image["extension"] = text.ext_from_url(image["url"]) + + yield Message.Version, 1 + yield Message.Directory, image + yield Message.Url, image["url"], image diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index c5e3d17..8523523 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -20,13 +20,19 @@ class ImgurExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.item_id = match.group(1) + self.key = match.group(1) self.mp4 = self.config("mp4", True) - def _get_data(self, path): + def _extract_data(self, path): response = self.request(self.root + path, notfound=self.subcategory) - data = text.extract(response.text, "image : ", ",\n")[0] - return self._clean(json.loads(data)) + data = json.loads(text.extract( + response.text, "image : ", ",\n")[0]) + try: + del data["adConfig"] + del data["isAd"] + except KeyError: + pass + return data def _prepare(self, image): image["ext"] = image["ext"].partition("?")[0] @@ -37,18 +43,9 @@ class ImgurExtractor(Extractor): image["extension"] = image["ext"][1:] return url - @staticmethod - def _clean(data): - try: - del data["adConfig"] - del data["isAd"] - except KeyError: - pass - return data - class ImgurImageExtractor(ImgurExtractor): - """Extractor for individual images from imgur.com""" + """Extractor for individual images on imgur.com""" subcategory = "image" filename_fmt = "{category}_{hash}{title:?_//}.{extension}" archive_fmt = "{hash}" @@ -101,22 +98,21 @@ class ImgurImageExtractor(ImgurExtractor): ) def items(self): - image = self._get_data("/" + self.item_id) + image = self._extract_data("/" + self.key) url = self._prepare(image) - yield Message.Version, 1 yield Message.Directory, image yield Message.Url, url, image class ImgurAlbumExtractor(ImgurExtractor): - """Extractor for image albums from imgur.com""" + """Extractor for imgur albums""" subcategory = "album" directory_fmt = ("{category}", "{album[hash]}{album[title]:? 
- //}") filename_fmt = "{category}_{album[hash]}_{num:>03}_{hash}.{extension}" archive_fmt = "{album[hash]}_{hash}" pattern = (r"(?:https?://)?(?:www\.|m\.)?imgur\.com" - r"/(?:a|gallery|t/unmuted)/(\w{7}|\w{5})") + r"/(?:a|t/unmuted)/(\w{7}|\w{5})") test = ( ("https://imgur.com/a/TcBmP", { "url": "ce3552f550a5b5316bd9c7ae02e21e39f30c0563", @@ -147,7 +143,7 @@ class ImgurAlbumExtractor(ImgurExtractor): "width": int, }, }), - ("https://imgur.com/gallery/eD9CT", { # large album + ("https://imgur.com/a/eD9CT", { # large album "url": "4ee94de31ff26be416271bc0b1ea27b9349c9937", }), ("https://imgur.com/a/RhJXhVT/all", { # 7 character album hash @@ -164,13 +160,13 @@ class ImgurAlbumExtractor(ImgurExtractor): ) def items(self): - album = self._get_data("/a/" + self.item_id + "/all") + album = self._extract_data("/a/" + self.key + "/all") images = album["album_images"]["images"] del album["album_images"] if int(album["num_images"]) > len(images): url = "{}/ajaxalbums/getimages/{}/hit.json".format( - self.root, self.item_id) + self.root, self.key) images = self.request(url).json()["data"]["images"] yield Message.Version, 1 @@ -180,3 +176,32 @@ class ImgurAlbumExtractor(ImgurExtractor): image["num"] = num image["album"] = album yield Message.Url, url, image + + +class ImgurGalleryExtractor(ImgurExtractor): + """Extractor for imgur galleries""" + subcategory = "gallery" + pattern = (r"(?:https?://)?(?:www\.|m\.)?imgur\.com" + r"/gallery/(\w{7}|\w{5})") + test = ( + ("https://imgur.com/gallery/zf2fIms", { # non-album gallery (#380) + "pattern": "https://imgur.com/zf2fIms", + }), + ("https://imgur.com/gallery/eD9CT", { + "pattern": "https://imgur.com/a/eD9CT", + }), + ) + + def items(self): + url = self.root + "/a/" + self.key + with self.request(url, method="HEAD", fatal=False) as response: + code = response.status_code + + if code < 400: + extr = ImgurAlbumExtractor + else: + extr = ImgurImageExtractor + url = self.root + "/" + self.key + + yield Message.Version, 1 + yield Message.Queue, url, {"_extractor": extr} diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 475e24b..e5cfe8b 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -8,11 +8,10 @@ """Extract images from https://www.instagram.com/""" -import hashlib -import json from .common import Extractor, Message from .. 
import text, exception from ..cache import cache +import json class InstagramExtractor(Extractor): @@ -37,10 +36,11 @@ class InstagramExtractor(Extractor): data.update(metadata) yield Message.Directory, data - if data['typename'] == 'GraphImage': + if data['typename'] in ('GraphImage', 'GraphStoryImage', 'GraphStoryVideo'): yield Message.Url, data['display_url'], \ text.nameext_from_url(data['display_url'], data) elif data['typename'] == 'GraphVideo': + data["extension"] = None yield Message.Url, \ 'ytdl:{}/p/{}/'.format(self.root, data['shortcode']), data @@ -140,33 +140,113 @@ class InstagramExtractor(Extractor): return medias + def _extract_stories(self, url): + if self.highlight_id: + user_id = '' + highlight_id = '"{}"'.format(self.highlight_id) + query_hash = '30a89afdd826d78a5376008a7b81c205' + else: + page = self.request(url).text + shared_data = self._extract_shared_data(page) + + # If no stories are present the URL redirects to `ProfilePage' + if 'StoriesPage' not in shared_data['entry_data']: + return [] + + user_id = '"{}"'.format( + shared_data['entry_data']['StoriesPage'][0]['user']['id']) + highlight_id = '' + query_hash = 'cda12de4f7fd3719c0569ce03589f4c4' + + variables = ( + '{{' + '"reel_ids":[{}],"tag_names":[],"location_ids":[],' + '"highlight_reel_ids":[{}],"precomposed_overlay":true,' + '"show_story_viewer_list":true,' + '"story_viewer_fetch_count":50,"story_viewer_cursor":"",' + '"stories_video_dash_manifest":false}}' + ).format(user_id, highlight_id) + headers = { + "X-Requested-With": "XMLHttpRequest", + } + url = '{}/graphql/query/?query_hash={}&variables={}'.format( + self.root, + query_hash, + variables, + ) + shared_data = self.request(url, headers=headers).json() + + # If there are stories present but the user is not authenticated or + # does not have permissions no stories are returned. 
+ if not shared_data['data']['reels_media']: + return [] # no stories present + + medias = [] + for media in shared_data['data']['reels_media'][0]['items']: + media_data = { + 'owner_id': media['owner']['id'], + 'username': media['owner']['username'], + 'date': text.parse_timestamp(media['taken_at_timestamp']), + 'expires': text.parse_timestamp(media['expiring_at_timestamp']), + 'media_id': media['id'], + 'typename': media['__typename'], + } + if media['__typename'] == 'GraphStoryImage': + media_data.update({ + 'display_url': media['display_url'], + 'height': text.parse_int(media['dimensions']['height']), + 'width': text.parse_int(media['dimensions']['width']), + }) + elif media['__typename'] == 'GraphStoryVideo': + vr = media['video_resources'][0] + media_data.update({ + 'duration': text.parse_float(media['video_duration']), + 'display_url': vr['src'], + 'height': text.parse_int(vr['config_height']), + 'width': text.parse_int(vr['config_width']), + }) + medias.append(media_data) + + return medias + def _extract_page(self, url, page_type): shared_data_fields = { 'ProfilePage': { + 'page': 'ProfilePage', 'node': 'user', 'node_id': 'id', 'edge_to_medias': 'edge_owner_to_timeline_media', 'variables_id': 'id', - 'query_hash': '66eb9403e44cc12e5b5ecda48b667d41', + 'query_hash': 'f2405b236d85e8296cf30347c9f08c2a', + }, + 'ProfileChannelPage': { + 'page': 'ProfilePage', + 'node': 'user', + 'node_id': 'id', + 'edge_to_medias': 'edge_felix_video_timeline', + 'variables_id': 'id', + 'query_hash': 'bc78b344a68ed16dd5d7f264681c4c76', }, 'TagPage': { + 'page': 'TagPage', 'node': 'hashtag', 'node_id': 'name', 'edge_to_medias': 'edge_hashtag_to_media', 'variables_id': 'tag_name', - 'query_hash': 'f92f56d47dc7a55b606908374b43a314', + 'query_hash': 'f12c9ec5e46a3173b2969c712ad84744', }, } page = self.request(url).text shared_data = self._extract_shared_data(page) psdf = shared_data_fields[page_type] + csrf = shared_data["config"]["csrf_token"] while True: # Deal with different structure of pages: the first page # has interesting data in `entry_data', next pages in `data'. 
if 'entry_data' in shared_data: - base_shared_data = shared_data['entry_data'][page_type][0]['graphql'] + base_shared_data = shared_data['entry_data'][psdf['page']][0]['graphql'] # variables_id is available only in the first page variables_id = base_shared_data[psdf['node']][psdf['node_id']] @@ -192,7 +272,8 @@ class InstagramExtractor(Extractor): ) headers = { "X-Requested-With": "XMLHttpRequest", - "X-Instagram-GIS": hashlib.md5(variables.encode()).hexdigest(), + "X-CSRFToken": csrf, + "X-IG-App-ID": "936619743392459", } url = '{}/graphql/query/?query_hash={}&variables={}'.format( self.root, @@ -204,14 +285,20 @@ class InstagramExtractor(Extractor): def _extract_profilepage(self, url): yield from self._extract_page(url, 'ProfilePage') + def _extract_profilechannelpage(self, url): + yield from self._extract_page(url, 'ProfileChannelPage') + def _extract_tagpage(self, url): yield from self._extract_page(url, 'TagPage') + def _extract_storiespage(self, url): + yield from self._extract_stories(url) + class InstagramImageExtractor(InstagramExtractor): """Extractor for PostPage""" subcategory = "image" - pattern = r"(?:https?://)?(?:www\.)?instagram\.com/p/([^/?&#]+)" + pattern = r"(?:https?://)?(?:www\.)?instagram\.com/(?:p|tv)/([^/?&#]+)" test = ( # GraphImage ("https://www.instagram.com/p/BqvsDleB3lV/", { @@ -258,6 +345,22 @@ class InstagramImageExtractor(InstagramExtractor): } }), + # GraphVideo (IGTV) + ("https://www.instagram.com/tv/BkQjCfsBIzi/", { + "url": "64208f408e11cbbca86c2df4488e90262ae9d9ec", + "keyword": { + "date": "type:datetime", + "description": str, + "height": int, + "likes": int, + "media_id": "1806097553666903266", + "shortcode": "BkQjCfsBIzi", + "typename": "GraphVideo", + "username": "instagram", + "width": int, + } + }), + # GraphSidecar with 2 embedded GraphVideo objects ("https://www.instagram.com/p/BtOvDOfhvRr/", { "count": 2, @@ -283,10 +386,11 @@ class InstagramUserExtractor(InstagramExtractor): """Extractor for ProfilePage""" subcategory = "user" pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" - r"/(?!p/|explore/|directory/|accounts/)([^/?&#]+)") + r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)" + r"([^/?&#]+)/?$") test = ("https://www.instagram.com/instagram/", { - "range": "1-12", - "count": ">= 12", + "range": "1-16", + "count": ">= 16", }) def __init__(self, match): @@ -298,6 +402,26 @@ class InstagramUserExtractor(InstagramExtractor): return self._extract_profilepage(url) +class InstagramChannelExtractor(InstagramExtractor): + """Extractor for ProfilePage channel""" + subcategory = "channel" + pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" + r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)" + r"([^/?&#]+)/channel") + test = ("https://www.instagram.com/instagram/channel/", { + "range": "1-16", + "count": ">= 16", + }) + + def __init__(self, match): + InstagramExtractor.__init__(self, match) + self.username = match.group(1) + + def instagrams(self): + url = '{}/{}/channel/'.format(self.root, self.username) + return self._extract_profilechannelpage(url) + + class InstagramTagExtractor(InstagramExtractor): """Extractor for TagPage""" subcategory = "tag" @@ -305,8 +429,8 @@ class InstagramTagExtractor(InstagramExtractor): pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" r"/explore/tags/([^/?&#]+)") test = ("https://www.instagram.com/explore/tags/instagram/", { - "range": "1-12", - "count": ">= 12", + "range": "1-16", + "count": ">= 16", }) def __init__(self, match): @@ -319,3 +443,22 @@ class InstagramTagExtractor(InstagramExtractor): 
def instagrams(self): url = '{}/explore/tags/{}/'.format(self.root, self.tag) return self._extract_tagpage(url) + + +class InstagramStoriesExtractor(InstagramExtractor): + """Extractor for StoriesPage""" + subcategory = "stories" + pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" + r"/stories/([^/?&#]+)(?:/(\d+))?") + test = ( + ("https://www.instagram.com/stories/instagram/"), + ("https://www.instagram.com/stories/highlights/18042509488170095/"), + ) + + def __init__(self, match): + InstagramExtractor.__init__(self, match) + self.username, self.highlight_id = match.groups() + + def instagrams(self): + url = '{}/stories/{}/'.format(self.root, self.username) + return self._extract_storiespage(url) diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py index 879d38b..a73eb86 100644 --- a/gallery_dl/extractor/luscious.py +++ b/gallery_dl/extractor/luscious.py @@ -62,7 +62,7 @@ class LusciousAlbumExtractor(LusciousBase, GalleryExtractor): test = ( ("https://luscious.net/albums/okinami-no-koigokoro_277031/", { "url": "7e4984a271a1072ac6483e4228a045895aff86f3", - "keyword": "ab4e5b71583fd439b4c8012a642aa8b58d8d0758", + "keyword": "07c0b915f2ab1cc3bbf28b76e7950fccee1213f3", "content": "b3a747a6464509440bd0ff6d1267e6959f8d6ff3", }), ("https://luscious.net/albums/virgin-killer-sweater_282582/", { diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 282c389..1ca1073 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -93,7 +93,7 @@ class NewgroundsUserExtractor(NewgroundsExtractor): test = ( ("https://blitzwuff.newgrounds.com/art", { "url": "24b19c4a135a09889fac7b46a74e427e4308d02b", - "keyword": "98566e0c8096a8099b8d71962fea7e31c8b098d4", + "keyword": "62981f7bdd66e1f1c72ab1d9b932423c156bc9a1", }), ("https://blitzwuff.newgrounds.com/"), ) diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 4884497..ab5932d 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -11,6 +11,8 @@ from .common import Extractor, Message from .. 
import text from ..cache import memcache +import collections +import json class PatreonExtractor(Extractor): @@ -33,70 +35,92 @@ class PatreonExtractor(Extractor): for post in self.posts(): yield Message.Directory, post + ids = set() post["num"] = 0 content = post.get("content") postfile = post.get("post_file") - for url in text.extract_iter(content or "", 'src="', '"'): + for image in post["images"]: + url = image.get("download_url") + if not url: + continue + ids.add(url.split("/")[-2]) + name = image.get("file_name") or self._filename(url) or url + post["num"] += 1 - yield Message.Url, url, text.nameext_from_url(url, post) + post["type"] = "image" + yield Message.Url, url, text.nameext_from_url(name, post) - if postfile: + if postfile and postfile["url"].split("/")[-2] not in ids: post["num"] += 1 + post["type"] = "postfile" text.nameext_from_url(postfile["name"], post) yield Message.Url, postfile["url"], post for attachment in post["attachments"]: post["num"] += 1 + post["type"] = "attachment" text.nameext_from_url(attachment["name"], post) yield Message.Url, attachment["url"], post + if content: + for url in text.extract_iter(content, 'src="', '"'): + post["num"] += 1 + post["type"] = "content" + yield Message.Url, url, text.nameext_from_url(url, post) + def posts(self): """Return all relevant post objects""" def _pagination(self, url): headers = {"Referer": self.root} - empty = [] while url: posts = self.request(url, headers=headers).json() - if "included" not in posts: - return - - # collect attachments - attachments = {} - for inc in posts["included"]: - if inc["type"] == "attachment": - attachments[inc["id"]] = inc["attributes"] - - # update posts - for post in posts["data"]: - attr = post["attributes"] - attr["id"] = text.parse_int(post["id"]) - attr["date"] = text.parse_datetime( - attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z") - attr["creator"] = self._user( - post["relationships"]["user"]["links"]["related"]) - - # add attachments to post attributes - files = post["relationships"].get("attachments") - if files: - attr["attachments"] = [ - attachments[f["id"]] - for f in files["data"] - ] - else: - attr["attachments"] = empty - - yield attr + if "included" in posts: + included = self._transform(posts["included"]) + for post in posts["data"]: + yield self._process(post, included) if "links" not in posts: return url = posts["links"].get("next") + def _process(self, post, included): + """Process and extend a 'post' object""" + attr = post["attributes"] + attr["id"] = text.parse_int(post["id"]) + attr["images"] = self._files(post, included, "images") + attr["attachments"] = self._files(post, included, "attachments") + attr["date"] = text.parse_datetime( + attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z") + attr["creator"] = self._user( + post["relationships"]["user"]["links"]["related"]) + return attr + + @staticmethod + def _transform(included): + """Transform 'included' into an easier to handle format""" + result = collections.defaultdict(dict) + for inc in included: + result[inc["type"]][inc["id"]] = inc["attributes"] + return result + + @staticmethod + def _files(post, included, key): + """Build a list of files""" + files = post["relationships"].get(key) + if files and files.get("data"): + return [ + included[file["type"]][file["id"]] + for file in files["data"] + ] + return [] + @memcache(keyarg=1) def _user(self, url): + """Fetch user information""" user = self.request(url).json()["data"] attr = user["attributes"] attr["id"] = user["id"] @@ -104,14 +128,21 @@ class 
PatreonExtractor(Extractor): attr["created"], "%Y-%m-%dT%H:%M:%S.%f%z") return attr + def _filename(self, url): + """Fetch filename from its Content-Disposition header""" + response = self.request(url, method="HEAD", fatal=False) + cd = response.headers.get("Content-Disposition") + return text.extract(cd, 'filename="', '"')[0] + @staticmethod def _build_url(endpoint, query): return ( "https://www.patreon.com/api/" + endpoint + - "?include=user,attachments,user_defined_tags,campaign,poll.choices" - ",poll.current_user_responses.user,poll.current_user_responses.cho" - "ice,poll.current_user_responses.poll,access_rules.tier.null" + "?include=user,images,attachments,user_defined_tags,campaign,poll." + "choices,poll.current_user_responses.user,poll.current_user_respon" + "ses.choice,poll.current_user_responses.poll,access_rules.tier.nul" + "l" "&fields[post]=change_visibility_at,comment_count,content,current_" "user_can_delete,current_user_can_view,current_user_has_liked,embe" @@ -133,7 +164,8 @@ class PatreonCreatorExtractor(PatreonExtractor): """Extractor for a creator's works""" subcategory = "creator" pattern = (r"(?:https?://)?(?:www\.)?patreon\.com" - r"/(?!(?:home|join|login|signup)(?:$|[/?&#]))([^/?&#]+)/?") + r"/(?!(?:home|join|posts|login|signup)(?:$|[/?&#]))" + r"([^/?&#]+)/?") test = ("https://www.patreon.com/koveliana", { "range": "1-25", "count": ">= 25", @@ -144,6 +176,7 @@ class PatreonCreatorExtractor(PatreonExtractor): "creator": dict, "date": "type:datetime", "id": int, + "images": list, "like_count": int, "post_type": str, "published_at": str, @@ -181,3 +214,26 @@ class PatreonUserExtractor(PatreonExtractor): "&filter[is_following]=true" )) return self._pagination(url) + + +class PatreonPostExtractor(PatreonExtractor): + """Extractor for media from a single post""" + subcategory = "post" + pattern = (r"(?:https?://)?(?:www\.)?patreon\.com" + r"/posts/[^/?&#]*?(\d+)") + test = ("https://www.patreon.com/posts/precious-metal-23563293", { + "count": 4, + }) + + def __init__(self, match): + PatreonExtractor.__init__(self, match) + self.post_id = match.group(1) + + def posts(self): + url = "{}/posts/{}".format(self.root, self.post_id) + page = self.request(url).text + data = text.extract(page, "window.patreon.bootstrap,", "\n});")[0] + post = json.loads(data + "}")["post"] + + included = self._transform(post["included"]) + return (self._process(post["data"], included),) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 76d4dc4..4f8ee9c 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -18,8 +18,8 @@ class PixivExtractor(Extractor): """Base class for pixiv extractors""" category = "pixiv" directory_fmt = ("{category}", "{user[id]} {user[account]}") - filename_fmt = "{category}_{user[id]}_{id}{num}.{extension}" - archive_fmt = "{id}{num}.{extension}" + filename_fmt = "{id}_p{num}.{extension}" + archive_fmt = "{id}{suffix}.{extension}" def __init__(self, match): Extractor.__init__(self, match) @@ -40,9 +40,10 @@ class PixivExtractor(Extractor): del work["meta_single_page"] del work["image_urls"] del work["meta_pages"] - work["num"] = "" + work["num"] = 0 work["tags"] = [tag["name"] for tag in work["tags"]] work["date"] = text.parse_datetime(work["create_date"]) + work["suffix"] = "" work.update(metadata) yield Message.Directory, work @@ -55,20 +56,17 @@ class PixivExtractor(Extractor): url = ugoira["zip_urls"]["medium"].replace( "_ugoira600x600", "_ugoira1920x1080") work["frames"] = ugoira["frames"] - work["extension"] = 
"zip" - yield Message.Url, url, work + yield Message.Url, url, text.nameext_from_url(url, work) elif work["page_count"] == 1: url = meta_single_page["original_image_url"] - work["extension"] = url.rpartition(".")[2] - yield Message.Url, url, work + yield Message.Url, url, text.nameext_from_url(url, work) else: - for num, img in enumerate(meta_pages): + for work["num"], img in enumerate(meta_pages): url = img["image_urls"]["original"] - work["num"] = "_p{:02}".format(num) - work["extension"] = url.rpartition(".")[2] - yield Message.Url, url, work + work["suffix"] = "_p{:02}".format(work["num"]) + yield Message.Url, url, text.nameext_from_url(url, work) def works(self): """Return an iterable containing all relevant 'work'-objects""" diff --git a/gallery_dl/extractor/pururin.py b/gallery_dl/extractor/pururin.py index fa4eb81..aa5c9c6 100644 --- a/gallery_dl/extractor/pururin.py +++ b/gallery_dl/extractor/pururin.py @@ -29,7 +29,7 @@ class PururinGalleryExtractor(GalleryExtractor): "artist" : ["Shoda Norihiro"], "group" : ["Obsidian Order"], "parody" : ["Kantai Collection"], - "characters": ["Iowa", "Teitoku"], + "characters": ["Admiral", "Iowa"], "tags" : list, "type" : "Doujinshi", "collection": "", diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py index 59d502a..f97454b 100644 --- a/gallery_dl/extractor/reactor.py +++ b/gallery_dl/extractor/reactor.py @@ -117,6 +117,8 @@ class ReactorExtractor(SharedConfigMixin, Extractor): url = text.extract(image, ' src="', '"')[0] if not url: continue + if url.startswith("//"): + url = "http:" + url width = text.extract(image, ' width="', '"')[0] height = text.extract(image, ' height="', '"')[0] image_id = url.rpartition("-")[2].partition(".")[0] @@ -268,8 +270,8 @@ class JoyreactorPostExtractor(ReactorPostExtractor): "keyword": "dbe148d576f2fc9431020c557ddb78f449e48c47", }), ("http://joyreactor.com/post/3668724", { # youtube embed - "url": "be2589e2e8f3ffcaf41b34bc28bfad850ccea34a", - "keyword": "da61b9e2887db95759950df5fb89c9d32f8e7651", + "url": "bf1666eddcff10c9b58f6be63fa94e4e13074214", + "keyword": "989112c7888e9cc80fd35870180c6c98165d953b", }), ("http://joyreactor.cc/post/1299", { # "malformed" JSON "url": "ac900743ed7cf1baf3db3b531c3bc414bf1ffcde", diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 2ba4b99..94e95e8 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -234,7 +234,7 @@ class RedditAPI(): url = "https://oauth.reddit.com" + endpoint params["raw_json"] = 1 self.authenticate() - response = self.extractor.request(url, params=params, fatal=False) + response = self.extractor.request(url, params=params, fatal=None) remaining = response.headers.get("x-ratelimit-remaining") if remaining and float(remaining) < 2: wait = int(response.headers["x-ratelimit-reset"]) diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index da9735e..bb8a2ae 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -40,17 +40,18 @@ class SankakuExtractor(SharedConfigMixin, Extractor): def items(self): self.login() - data = self.get_metadata() yield Message.Version, 1 - yield Message.Directory, data + data = self.get_metadata() for post_id in util.advance(self.get_posts(), self.start_post): self.wait() post = self.get_post_data(post_id) url = post["file_url"] post.update(data) - yield Message.Url, url, text.nameext_from_url(url, post) + text.nameext_from_url(url, post) + yield Message.Directory, post + yield Message.Url, 
url, post def skip(self, num): self.start_post += num diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py index afd4eaa..38b7813 100644 --- a/gallery_dl/extractor/sexcom.py +++ b/gallery_dl/extractor/sexcom.py @@ -78,6 +78,7 @@ class SexcomExtractor(Extractor): path += "/hd" data["url"] = self.root + path else: + data["extension"] = None data["url"] = "ytdl:" + text.extract( extr(''), ' src="', '"')[0] else: diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py index 5ad372d..8567155 100644 --- a/gallery_dl/extractor/simplyhentai.py +++ b/gallery_dl/extractor/simplyhentai.py @@ -8,14 +8,16 @@ """Extract hentai-manga from https://www.simply-hentai.com/""" -from .common import GalleryExtractor, Extractor, Message +from .common import GalleryExtractor from .. import text, util, exception +import json class SimplyhentaiGalleryExtractor(GalleryExtractor): """Extractor for image galleries from simply-hentai.com""" category = "simplyhentai" archive_fmt = "{image_id}" + root = "https://www.simply-hentai.com" pattern = (r"(?:https?://)?(?!videos\.)([\w-]+\.simply-hentai\.com" r"(?!/(?:album|gifs?|images?|series)(?:/|$))" r"(?:/(?!(?:page|all-pages)(?:/|\.|$))[^/?&#]+)+)") @@ -23,7 +25,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor): (("https://original-work.simply-hentai.com" "/amazon-no-hiyaku-amazon-elixir"), { "url": "258289249990502c3138719cb89e995a60861e49", - "keyword": "eba83ccdbab3022a2280c77aa747f9458196138b", + "keyword": "8b2400e4b466e8f46802fa5a6b917d2788bb7e8e", }), ("https://www.simply-hentai.com/notfound", { "exception": exception.GalleryDLException, @@ -40,144 +42,30 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor): self.session.headers["Referer"] = url def metadata(self, page): - extr = text.extract_from(page) - split = text.split_html - - title = extr('Series
    ', '')), - "language" : text.remove_html(extr( - 'box-title">Language', '')) or None, - "characters": split(extr('box-title">Characters', '')), - "tags" : split(extr('box-title">Tags', '')), - "artist" : split(extr('box-title">Artists', '')), - "date" : text.parse_datetime(text.remove_html( - extr('Uploaded', '')), "%d.%m.%Y"), + page = self.request(self.root + path).text + data = json.loads(text.unescape(text.extract( + page, 'data-react-class="Reader" data-react-props="', '"')[0])) + self.manga = manga = data["manga"] + + return { + "title" : manga["title"], + "parody" : manga["series"]["title"], + "language" : manga["language"]["name"], + "lang" : util.language_to_code(manga["language"]["name"]), + "characters": [x["name"] for x in manga["characters"]], + "tags" : [x["name"] for x in manga["tags"]], + "artist" : [x["name"] for x in manga["artists"]], + "gallery_id": text.parse_int(text.extract( + manga["images"][0]["sizes"]["full"], "/Album/", "/")[0]), + "date" : text.parse_datetime( + manga["publish_date"], "%Y-%m-%dT%H:%M:%S.%f%z"), } - data["lang"] = util.language_to_code(data["language"]) - return data def images(self, _): - url = self.chapter_url + "/all-pages" - headers = {"Accept": "application/json"} - images = self.request(url, headers=headers).json() return [ - (urls["full"], {"image_id": text.parse_int(image_id)}) - for image_id, urls in sorted(images.items()) + (image["sizes"]["full"], {"image_id": image["id"]}) + for image in self.manga["images"] ] - - -class SimplyhentaiImageExtractor(Extractor): - """Extractor for individual images from simply-hentai.com""" - category = "simplyhentai" - subcategory = "image" - directory_fmt = ("{category}", "{type}s") - filename_fmt = "{category}_{token}{title:?_//}.{extension}" - archive_fmt = "{token}" - pattern = (r"(?:https?://)?(?:www\.)?(simply-hentai\.com" - r"/(image|gif)/[^/?&#]+)") - test = ( - (("https://www.simply-hentai.com/image" - "/pheromomania-vol-1-kanzenban-isao-3949d8b3-400c-4b6"), { - "url": "0338eb137830ab6f81e5f410d3936ef785d063d9", - "keyword": "e10e5588481cab68329ef6ec1e5325206b2079a2", - }), - ("https://www.simply-hentai.com/gif/8915dfcf-0b6a-47c", { - "url": "11c060d7ec4dfd0bd105300b6e1fd454674a5af1", - "keyword": "dd97a4bb449c397d6fec9f43a1303c0fb168ae65", - }), - ) - - def __init__(self, match): - Extractor.__init__(self, match) - self.page_url = "https://www." 
+ match.group(1) - self.type = match.group(2) - - def items(self): - extr = text.extract_from(self.request(self.page_url).text) - title = extr('"og:title" content="' , '"') - descr = extr('"og:description" content="', '"') - url = extr('"image":"' , '&') - url = extr(""content":"", "&") or url - - tags = text.extract(descr, " tagged with ", " online for free ")[0] - if tags: - tags = tags.split(", ") - tags[-1] = tags[-1].partition(" ")[2] - else: - tags = [] - - data = text.nameext_from_url(url, { - "title": text.unescape(title) if title else "", - "tags": tags, - "type": self.type, - }) - data["token"] = data["filename"].rpartition("_")[2] - - yield Message.Version, 1 - yield Message.Directory, data - yield Message.Url, url, data - - -class SimplyhentaiVideoExtractor(Extractor): - """Extractor for hentai videos from simply-hentai.com""" - category = "simplyhentai" - subcategory = "video" - directory_fmt = ("{category}", "{type}s") - filename_fmt = "{title}{episode:?_//>02}.{extension}" - archive_fmt = "{title}_{episode}" - pattern = r"(?:https?://)?(videos\.simply-hentai\.com/[^/?&#]+)" - test = ( - ("https://videos.simply-hentai.com/creamy-pie-episode-02", { - "pattern": r"https://www\.googleapis\.com/drive/v3/files" - r"/0B1ecQ8ZVLm3JcHZzQzBnVy1ZUmc\?alt=media&key=[\w-]+", - "keyword": "706790708b14773efc1e075ddd3b738a375348a5", - "count": 1, - }), - (("https://videos.simply-hentai.com" - "/1715-tifa-in-hentai-gang-bang-3d-movie"), { - "url": "ad9a36ae06c601b6490e3c401834b4949d947eb0", - "keyword": "f9dad94fbde9c95859e631ff4f07297a9567b874", - }), - ) - - def __init__(self, match): - Extractor.__init__(self, match) - self.page_url = "https://" + match.group(1) - - def items(self): - page = self.request(self.page_url).text - - title, pos = text.extract(page, "", "") - tags , pos = text.extract(page, ">Tags", "", pos) - date , pos = text.extract(page, ">Upload Date", "", pos) - title = title.rpartition(" - ")[0] - - if "', pos) - embed_url = text.extract(page, 'src="', '"', pos)[0].replace( - "embedplayer.php?link=", "embed.php?name=") - embed_page = self.request(embed_url).text - video_url = text.extract(embed_page, '"file":"', '"')[0] - title, _, episode = title.rpartition(" Episode ") - - data = text.nameext_from_url(video_url, { - "title": text.unescape(title), - "episode": text.parse_int(episode), - "tags": text.split_html(tags)[::2], - "type": "video", - "date": text.parse_datetime(text.remove_html( - date), "%B %d, %Y %H:%M"), - }) - - yield Message.Version, 1 - yield Message.Directory, data - yield Message.Url, video_url, data diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index ccba640..3672a6d 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -54,6 +54,7 @@ class TwitterExtractor(Extractor): if self.videos and "-videoContainer" in tweet: data["num"] = 1 + data["extension"] = None url = "ytdl:{}/{}/status/{}".format( self.root, data["user"], data["tweet_id"]) yield Message.Url, url, data diff --git a/gallery_dl/extractor/wikiart.py b/gallery_dl/extractor/wikiart.py index b9c223c..463733f 100644 --- a/gallery_dl/extractor/wikiart.py +++ b/gallery_dl/extractor/wikiart.py @@ -70,7 +70,7 @@ class WikiartArtistExtractor(WikiartExtractor): pattern = BASE_PATTERN + r"/(?!\w+-by-)([\w-]+)" test = ("https://www.wikiart.org/en/thomas-cole", { "url": "f1eee8158f5b8b7380382ab730a8f53884715c8b", - "keyword": "b62678394ce645815963883d5c9642255307225f", + "keyword": "c61f5a4774b977106000e9554d19cfb9438a7032", }) def 
__init__(self, match): diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py index 9699806..23750db 100644 --- a/gallery_dl/extractor/xhamster.py +++ b/gallery_dl/extractor/xhamster.py @@ -13,13 +13,16 @@ from .. import text import json -BASE_PATTERN = r"(?:https?://)?(?:[^.]+\.)?xhamster\.(?:com|one|desi)" +BASE_PATTERN = r"(?:https?://)?((?:[^.]+\.)?xhamster\d?\.(?:com|one|desi))" class XhamsterExtractor(Extractor): """Base class for xhamster extractors""" category = "xhamster" - root = "https://xhamster.com" + + def __init__(self, match): + Extractor.__init__(self, match) + self.root = "https://" + match.group(1) class XhamsterGalleryExtractor(XhamsterExtractor): @@ -66,16 +69,21 @@ class XhamsterGalleryExtractor(XhamsterExtractor): }, }, }), + ("https://jp.xhamster2.com/photos/gallery/11748968", { + "pattern": r"https://thumb-p\d+.xhcdn.com/./[\w/-]+_1000.jpg$", + "count": ">= 144", + }), ("https://xhamster.com/photos/gallery/make-the-world-better-11748968"), ("https://xhamster.com/photos/gallery/11748968"), ("https://xhamster.one/photos/gallery/11748968"), ("https://xhamster.desi/photos/gallery/11748968"), + ("https://xhamster2.com/photos/gallery/11748968"), ("https://en.xhamster.com/photos/gallery/11748968"), ) def __init__(self, match): XhamsterExtractor.__init__(self, match) - self.path = match.group(1) + self.path = match.group(2) self.data = None def items(self): @@ -154,7 +162,7 @@ class XhamsterUserExtractor(XhamsterExtractor): def __init__(self, match): XhamsterExtractor.__init__(self, match) - self.user = match.group(1) + self.user = match.group(2) def items(self): yield Message.Version, 1 diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 637561a..6d81e66 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -196,7 +196,7 @@ class DownloadJob(Job): archive = self.archive # prepare download - pathfmt.set_keywords(keywords) + pathfmt.set_filename(keywords) if postprocessors: for pp in postprocessors: @@ -316,7 +316,9 @@ class DownloadJob(Job): skip = self.extractor.config("skip", True) if skip: self._skipexc = None - if isinstance(skip, str): + if skip == "enumerate": + self.pathfmt.check_file = self.pathfmt._enum_file + elif isinstance(skip, str): skip, _, smax = skip.partition(":") if skip == "abort": self._skipexc = exception.StopExtraction @@ -334,7 +336,8 @@ class DownloadJob(Job): postprocessors = self.extractor.config("postprocessors") if postprocessors: - self.postprocessors = [] + pp_list = [] + for pp_dict in postprocessors: whitelist = pp_dict.get("whitelist") blacklist = pp_dict.get("blacklist") @@ -353,16 +356,19 @@ class DownloadJob(Job): "'%s' initialization failed: %s: %s", name, exc.__class__.__name__, exc) else: - self.postprocessors.append(pp_obj) - self.extractor.log.debug( - "Active postprocessor modules: %s", self.postprocessors) + pp_list.append(pp_obj) + + if pp_list: + self.postprocessors = pp_list + self.extractor.log.debug( + "Active postprocessor modules: %s", pp_list) class SimulationJob(DownloadJob): """Simulate the extraction process without downloading anything""" def handle_url(self, url, keywords, fallback=None): - self.pathfmt.set_keywords(keywords) + self.pathfmt.set_filename(keywords) self.out.skip(self.pathfmt.path) if self.sleep: time.sleep(self.sleep) diff --git a/gallery_dl/oauth.py b/gallery_dl/oauth.py index 8a12755..69ab4f6 100644 --- a/gallery_dl/oauth.py +++ b/gallery_dl/oauth.py @@ -127,6 +127,6 @@ class OAuth1API(): self.api_key = api_key def request(self, url, method="GET", **kwargs): - 
kwargs["fatal"] = False + kwargs["fatal"] = None kwargs["session"] = self.session return self.extractor.request(url, method, **kwargs) diff --git a/gallery_dl/option.py b/gallery_dl/option.py index af70fc8..ecc2ee3 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -307,7 +307,8 @@ def build_parser(): "--ugoira-conv", dest="postprocessors", action="append_const", const={"name": "ugoira", "ffmpeg-args": ( - "-c:v", "libvpx", "-crf", "4", "-b:v", "5000k", "-an")}, + "-c:v", "libvpx", "-crf", "4", "-b:v", "5000k", "-an"), + "whitelist": ("pixiv", "danbooru")}, help="Convert Pixiv Ugoira to WebM (requires FFmpeg)", ) postprocessor.add_argument( diff --git a/gallery_dl/postprocessor/classify.py b/gallery_dl/postprocessor/classify.py index 62460d3..4a9bde9 100644 --- a/gallery_dl/postprocessor/classify.py +++ b/gallery_dl/postprocessor/classify.py @@ -33,17 +33,24 @@ class ClassifyPP(PostProcessor): } def prepare(self, pathfmt): - ext = pathfmt.keywords.get("extension") - + ext = pathfmt.extension if ext in self.mapping: - self._dir = pathfmt.realdirectory + os.sep + self.mapping[ext] - pathfmt.realpath = self._dir + os.sep + pathfmt.filename - else: - self._dir = None + # set initial paths to enable download skips + self._build_paths(pathfmt, self.mapping[ext]) def run(self, pathfmt): - if self._dir: - os.makedirs(self._dir, exist_ok=True) + ext = pathfmt.extension + if ext in self.mapping: + # rebuild paths in case the filename extension changed + path = self._build_paths(pathfmt, self.mapping[ext]) + os.makedirs(path, exist_ok=True) + + @staticmethod + def _build_paths(pathfmt, extra): + path = pathfmt.realdirectory + extra + pathfmt.realpath = path + os.sep + pathfmt.filename + pathfmt.path = pathfmt.directory + extra + os.sep + pathfmt.filename + return path __postprocessor__ = ClassifyPP diff --git a/gallery_dl/postprocessor/common.py b/gallery_dl/postprocessor/common.py index c642f0f..b967cf6 100644 --- a/gallery_dl/postprocessor/common.py +++ b/gallery_dl/postprocessor/common.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018 Mike Fährmann +# Copyright 2018-2019 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -23,3 +23,6 @@ class PostProcessor(): def finalize(self): """Cleanup""" + + def __repr__(self): + return self.__class__.__name__ diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py index 77be9c7..467ef11 100644 --- a/gallery_dl/postprocessor/metadata.py +++ b/gallery_dl/postprocessor/metadata.py @@ -36,15 +36,14 @@ class MetadataPP(PostProcessor): def run(self, pathfmt): path = "{}.{}".format(pathfmt.realpath, self.extension) with open(path, "w", encoding="utf-8") as file: - self.write(file, pathfmt) + self.write(file, pathfmt.kwdict) - def _write_custom(self, file, pathfmt): - output = self.formatter.format_map(pathfmt.keywords) + def _write_custom(self, file, kwdict): + output = self.formatter.format_map(kwdict) file.write(output) - def _write_tags(self, file, pathfmt): - kwds = pathfmt.keywords - tags = kwds.get("tags") or kwds.get("tag_string") + def _write_tags(self, file, kwdict): + tags = kwdict.get("tags") or kwdict.get("tag_string") if not tags: return @@ -58,8 +57,8 @@ class MetadataPP(PostProcessor): file.write("\n".join(tags)) file.write("\n") - def _write_json(self, file, pathfmt): - util.dump_json(pathfmt.keywords, file, self.ascii, self.indent) + def _write_json(self, file, kwdict): + 
diff --git a/gallery_dl/postprocessor/common.py b/gallery_dl/postprocessor/common.py
index c642f0f..b967cf6 100644
--- a/gallery_dl/postprocessor/common.py
+++ b/gallery_dl/postprocessor/common.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2018 Mike Fährmann
+# Copyright 2018-2019 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -23,3 +23,6 @@ class PostProcessor():
 
     def finalize(self):
         """Cleanup"""
+
+    def __repr__(self):
+        return self.__class__.__name__
diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py
index 77be9c7..467ef11 100644
--- a/gallery_dl/postprocessor/metadata.py
+++ b/gallery_dl/postprocessor/metadata.py
@@ -36,15 +36,14 @@ class MetadataPP(PostProcessor):
     def run(self, pathfmt):
         path = "{}.{}".format(pathfmt.realpath, self.extension)
         with open(path, "w", encoding="utf-8") as file:
-            self.write(file, pathfmt)
+            self.write(file, pathfmt.kwdict)
 
-    def _write_custom(self, file, pathfmt):
-        output = self.formatter.format_map(pathfmt.keywords)
+    def _write_custom(self, file, kwdict):
+        output = self.formatter.format_map(kwdict)
         file.write(output)
 
-    def _write_tags(self, file, pathfmt):
-        kwds = pathfmt.keywords
-        tags = kwds.get("tags") or kwds.get("tag_string")
+    def _write_tags(self, file, kwdict):
+        tags = kwdict.get("tags") or kwdict.get("tag_string")
         if not tags:
             return
 
@@ -58,8 +57,8 @@ class MetadataPP(PostProcessor):
         file.write("\n".join(tags))
         file.write("\n")
 
-    def _write_json(self, file, pathfmt):
-        util.dump_json(pathfmt.keywords, file, self.ascii, self.indent)
+    def _write_json(self, file, kwdict):
+        util.dump_json(kwdict, file, self.ascii, self.indent)
 
 
 __postprocessor__ = MetadataPP
diff --git a/gallery_dl/postprocessor/mtime.py b/gallery_dl/postprocessor/mtime.py
index 03d2f11..7065428 100644
--- a/gallery_dl/postprocessor/mtime.py
+++ b/gallery_dl/postprocessor/mtime.py
@@ -19,9 +19,9 @@ class MtimePP(PostProcessor):
         self.key = options.get("key", "date")
 
     def run(self, pathfmt):
-        mtime = pathfmt.keywords.get(self.key)
+        mtime = pathfmt.kwdict.get(self.key)
         ts = getattr(mtime, "timestamp", None)
-        pathfmt.keywords["_mtime"] = ts() if ts else parse_int(mtime)
+        pathfmt.kwdict["_mtime"] = ts() if ts else parse_int(mtime)
 
 
 __postprocessor__ = MtimePP
diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py
index bd8c5ad..0dbb796 100644
--- a/gallery_dl/postprocessor/ugoira.py
+++ b/gallery_dl/postprocessor/ugoira.py
@@ -52,13 +52,13 @@ class UgoiraPP(PostProcessor):
     def prepare(self, pathfmt):
         self._frames = None
 
-        if pathfmt.keywords["extension"] != "zip":
+        if pathfmt.extension != "zip":
             return
 
-        if "frames" in pathfmt.keywords:
-            self._frames = pathfmt.keywords["frames"]
-        elif "pixiv_ugoira_frame_data" in pathfmt.keywords:
-            self._frames = pathfmt.keywords["pixiv_ugoira_frame_data"]["data"]
+        if "frames" in pathfmt.kwdict:
+            self._frames = pathfmt.kwdict["frames"]
+        elif "pixiv_ugoira_frame_data" in pathfmt.kwdict:
+            self._frames = pathfmt.kwdict["pixiv_ugoira_frame_data"]["data"]
         else:
             return
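The post-processor changes above are mostly one rename: per-file metadata moves from pathfmt.keywords to pathfmt.kwdict, and the current extension becomes a plain pathfmt.extension attribute. MtimePP's conversion, for instance, boils down to this standalone sketch (the real code uses text.parse_int for the non-datetime fallback):

    from datetime import datetime, timezone

    kwdict = {"date": datetime(1980, 1, 1, tzinfo=timezone.utc)}

    # read kwdict["date"], store a Unix timestamp under kwdict["_mtime"];
    # PathFormat.finalize() later feeds that value to os.utime()
    mtime = kwdict.get("date")
    ts = getattr(mtime, "timestamp", None)
    kwdict["_mtime"] = ts() if ts else int(mtime)

    print(kwdict["_mtime"])  # 315532800.0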
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 02d998d..79fa175 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -391,10 +391,18 @@ class Formatter():
             if field_name:
                 self.fields.append((
                     len(self.result),
-                    self._field_access(field_name, format_spec, conversion)
+                    self._field_access(field_name, format_spec, conversion),
                 ))
                 self.result.append("")
 
+        if len(self.result) == 1:
+            if self.fields:
+                self.format_map = self.fields[0][1]
+            else:
+                self.format_map = lambda _: format_string
+            del self.result
+            del self.fields
+
     def format_map(self, kwargs):
         """Apply 'kwargs' to the initial format_string and return its result"""
         for index, func in self.fields:
@@ -512,48 +520,63 @@ class PathFormat():
 
     def __init__(self, extractor):
-        self.filename_fmt = extractor.config(
-            "filename", extractor.filename_fmt)
-        self.directory_fmt = extractor.config(
-            "directory", extractor.directory_fmt)
-        self.kwdefault = extractor.config("keywords-default")
+        filename_fmt = extractor.config("filename", extractor.filename_fmt)
+        directory_fmt = extractor.config("directory", extractor.directory_fmt)
+        kwdefault = extractor.config("keywords-default")
 
         try:
-            self.formatter = Formatter(self.filename_fmt, self.kwdefault)
+            self.filename_formatter = Formatter(
+                filename_fmt, kwdefault).format_map
         except Exception as exc:
             raise exception.FormatError(exc, "filename")
 
-        self.delete = False
-        self.has_extension = False
-        self.keywords = {}
-        self.filename = ""
+        try:
+            self.directory_formatters = [
+                Formatter(dirfmt, kwdefault).format_map
+                for dirfmt in directory_fmt
+            ]
+        except Exception as exc:
+            raise exception.FormatError(exc, "directory")
+
         self.directory = self.realdirectory = ""
+        self.filename = ""
+        self.extension = ""
+        self.prefix = ""
+        self.kwdict = {}
+        self.delete = False
         self.path = self.realpath = self.temppath = ""
 
-        self.basedirectory = expand_path(
+        basedir = expand_path(
             extractor.config("base-directory", (".", "gallery-dl")))
-        if os.altsep and os.altsep in self.basedirectory:
-            self.basedirectory = self.basedirectory.replace(os.altsep, os.sep)
+        if os.altsep and os.altsep in basedir:
+            basedir = basedir.replace(os.altsep, os.sep)
+        if basedir[-1] != os.sep:
+            basedir += os.sep
+        self.basedirectory = basedir
 
-        restrict = extractor.config("restrict-filenames", "auto")
+        restrict = extractor.config("path-restrict", "auto")
         if restrict == "auto":
-            restrict = "<>:\"\\/|?*" if os.name == "nt" else "/"
+            restrict = "\\\\|/<>:\"?*" if os.name == "nt" else "/"
         elif restrict == "unix":
             restrict = "/"
        elif restrict == "windows":
-            restrict = "<>:\"\\/|?*"
-        self.clean_path = self._build_cleanfunc(restrict)
+            restrict = "\\\\|/<>:\"?*"
+
+        remove = extractor.config("path-remove", "\x00-\x1f\x7f")
+
+        self.clean_segment = self._build_cleanfunc(restrict, "_")
+        self.clean_path = self._build_cleanfunc(remove, "")
 
     @staticmethod
-    def _build_cleanfunc(repl):
-        if not repl:
+    def _build_cleanfunc(chars, repl):
+        if not chars:
             return lambda x: x
-        elif len(repl) == 1:
-            def func(x, r=repl):
-                return x.replace(r, "_")
+        elif len(chars) == 1:
+            def func(x, c=chars, r=repl):
+                return x.replace(c, r)
         else:
-            def func(x, sub=re.compile("[" + re.escape(repl) + "]").sub):
-                return sub("_", x)
+            def func(x, sub=re.compile("[" + chars + "]").sub, r=repl):
+                return sub(r, x)
         return func
 
     def open(self, mode="wb"):
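Two details in the hunk above are easy to miss: segment cleaning ('path-restrict', replaced with underscores) and whole-path cleaning ('path-remove', deleted outright) are now separate functions, and the character set is no longer passed through re.escape, so it acts as a regex character class and may contain ranges — which the new "\x00-\x1f\x7f" default for 'path-remove' depends on. A standalone sketch of the resulting behavior:

    import re

    def build_cleanfunc(chars, repl):
        """Mirror of PathFormat._build_cleanfunc above (illustrative)."""
        if not chars:
            return lambda x: x
        if len(chars) == 1:
            return lambda x: x.replace(chars, repl)
        sub = re.compile("[" + chars + "]").sub
        return lambda x: sub(repl, x)

    clean_segment = build_cleanfunc("/", "_")          # path-restrict on POSIX
    clean_path = build_cleanfunc("\x00-\x1f\x7f", "")  # path-remove default

    print(clean_segment("a/b"))         # a_b
    print(repr(clean_path("a\x07b")))   # 'ab'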
@@ -562,68 +585,91 @@ class PathFormat():
 
     def exists(self, archive=None):
         """Return True if the file exists on disk or in 'archive'"""
-        if archive and archive.check(self.keywords):
+        if archive and self.kwdict in archive:
             return self.fix_extension()
-        if self.has_extension and os.path.exists(self.realpath):
-            return True
+        if self.extension and os.path.exists(self.realpath):
+            return self.check_file()
         return False
 
-    def set_directory(self, keywords):
+    @staticmethod
+    def check_file():
+        return True
+
+    def _enum_file(self):
+        num = 1
+        while True:
+            self.prefix = str(num) + "."
+            self.set_extension(self.extension, False)
+            if not os.path.exists(self.realpath):
+                return False
+            num += 1
+
+    def set_directory(self, kwdict):
         """Build directory path and create it if necessary"""
+
+        # Build path segments by applying 'kwdict' to directory format strings
         try:
             segments = [
-                self.clean_path(
-                    Formatter(segment, self.kwdefault)
-                    .format_map(keywords).strip())
-                for segment in self.directory_fmt
+                self.clean_segment(format_map(kwdict).strip())
+                for format_map in self.directory_formatters
             ]
         except Exception as exc:
             raise exception.FormatError(exc, "directory")
 
-        self.directory = os.path.join(
-            self.basedirectory,
-            *segments
-        )
+        # Join path segements
+        sep = os.sep
+        directory = self.clean_path(self.basedirectory + sep.join(segments))
 
-        # remove trailing path separator;
-        # occurs if the last argument to os.path.join() is an empty string
-        if self.directory[-1] == os.sep:
-            self.directory = self.directory[:-1]
+        # Ensure directory ends with a path separator
+        if directory[-1] != sep:
+            directory += sep
+        self.directory = directory
 
-        self.realdirectory = self.adjust_path(self.directory)
+        # Enable longer-than-260-character paths on Windows
+        if os.name == "nt":
+            self.realdirectory = "\\\\?\\" + os.path.abspath(directory) + sep
+        else:
+            self.realdirectory = directory
+
+        # Create directory tree
         os.makedirs(self.realdirectory, exist_ok=True)
 
-    def set_keywords(self, keywords):
-        """Set filename keywords"""
-        self.keywords = keywords
-        self.temppath = ""
-        self.has_extension = bool(keywords.get("extension"))
-        if self.has_extension:
+    def set_filename(self, kwdict):
+        """Set general filename data"""
+        self.kwdict = kwdict
+        self.temppath = self.prefix = ""
+        self.extension = kwdict["extension"]
+
+        if self.extension:
             self.build_path()
 
     def set_extension(self, extension, real=True):
-        """Set the 'extension' keyword"""
-        self.has_extension = real
-        self.keywords["extension"] = extension
+        """Set filename extension"""
+        if real:
+            self.extension = extension
+        self.kwdict["extension"] = self.prefix + extension
         self.build_path()
 
     def fix_extension(self, _=None):
-        if not self.has_extension:
-            self.set_extension("")
+        """Fix filenames without a given filename extension"""
+        if not self.extension:
+            self.set_extension("", False)
             if self.path[-1] == ".":
                 self.path = self.path[:-1]
                 self.temppath = self.realpath = self.realpath[:-1]
         return True
 
     def build_path(self):
-        """Use filename-keywords and directory to build a full path"""
+        """Use filename metadata and directory to build a full path"""
+
+        # Apply 'kwdict' to filename format string
         try:
-            self.filename = self.clean_path(
-                self.formatter.format_map(self.keywords))
+            self.filename = filename = self.clean_path(self.clean_segment(
+                self.filename_formatter(self.kwdict)))
         except Exception as exc:
             raise exception.FormatError(exc, "filename")
-        filename = os.sep + self.filename
 
+        # Combine directory and filename to full paths
         self.path = self.directory + filename
         self.realpath = self.realdirectory + filename
         if not self.temppath:
@@ -631,7 +677,7 @@ class PathFormat():
 
     def part_enable(self, part_directory=None):
         """Enable .part file usage"""
-        if self.has_extension:
+        if self.extension:
             self.temppath += ".part"
         else:
             self.set_extension("part", False)
@@ -657,16 +703,16 @@ class PathFormat():
             return
 
         if self.temppath != self.realpath:
-            # move temp file to its actual location
+            # Move temp file to its actual location
             try:
                 os.replace(self.temppath, self.realpath)
             except OSError:
                 shutil.copyfile(self.temppath, self.realpath)
                 os.unlink(self.temppath)
 
-        if "_mtime" in self.keywords:
-            # set file modification time
-            mtime = self.keywords["_mtime"]
+        if "_mtime" in self.kwdict:
+            # Set file modification time
+            mtime = self.kwdict["_mtime"]
             if mtime:
                 try:
                     if isinstance(mtime, str):
@@ -675,11 +721,6 @@ class PathFormat():
                 except Exception:
                     pass
 
-    @staticmethod
-    def adjust_path(path):
-        """Enable longer-than-260-character paths on windows"""
-        return "\\\\?\\" + os.path.abspath(path) if os.name == "nt" else path
-
 
 class DownloadArchive():
 
@@ -693,8 +734,8 @@ class DownloadArchive():
             "archive-format", extractor.archive_fmt)
         ).format_map
 
-    def check(self, kwdict):
-        """Return True if item described by 'kwdict' exists in archive"""
+    def __contains__(self, kwdict):
+        """Return True if the item described by 'kwdict' exists in archive"""
         key = self.keygen(kwdict)
         self.cursor.execute(
             "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,))
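Renaming DownloadArchive.check() to __contains__ is what allows the 'self.kwdict in archive' test in PathFormat.exists() above. A toy model of that containment check, assuming the same single-table SQLite schema and a fixed key format in place of the configurable 'archive-format':

    import sqlite3

    class MiniArchive:
        """Toy model of DownloadArchive's containment test (illustrative)."""

        def __init__(self):
            self.connection = sqlite3.connect(":memory:")
            self.connection.execute(
                "CREATE TABLE IF NOT EXISTS archive (entry PRIMARY KEY)")

        def __contains__(self, kwdict):
            key = "{category}{id}".format_map(kwdict)
            cursor = self.connection.execute(
                "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,))
            return cursor.fetchone() is not None

    archive = MiniArchive()
    print({"category": "test", "id": 1} in archive)  # False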
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index d9cc3d6..911939d 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.10.1"
+__version__ = "1.10.2"
diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh
index 334671e..d8c8a03 100755
--- a/scripts/run_tests.sh
+++ b/scripts/run_tests.sh
@@ -2,7 +2,7 @@
 
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
-TESTS_CORE=(config cookies downloader extractor oauth text util)
+TESTS_CORE=(config cookies downloader extractor oauth postprocessor text util)
 TESTS_RESULTS=(results)
 
diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
index 498e3fc..78963aa 100755
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@@ -24,7 +24,8 @@ CATEGORY_MAP = {
     "dynastyscans" : "Dynasty Reader",
     "e621"         : "e621",
     "erolord"      : "EroLord.com",
-    "exhentai"     : "E-Hentai",
+    "e-hentai"     : "E-Hentai",
+    "exhentai"     : "ExHentai",
     "fallenangels" : "Fallen Angels Scans",
     "fashionnova"  : "Fashion Nova",
     "hbrowse"      : "HBrowse",
@@ -109,6 +110,7 @@ SUBCATEGORY_MAP = {
 AUTH_MAP = {
     "danbooru"   : "Optional",
     "deviantart" : "Optional (OAuth)",
+    "e-hentai"   : "Optional",
     "exhentai"   : "Optional",
     "flickr"     : "Optional (OAuth)",
     "idolcomplex": "Optional",
@@ -203,6 +205,15 @@ def build_extractor_list():
     for extrlist in extractors.values():
         extrlist.sort(key=subcategory_key)
 
+    # ugly hack to add e-hentai.org
+    eh = []
+    for extr in extractors["exhentai"]:
+        class eh_extr(extr):
+            category = "e-hentai"
+            root = "https://e-hentai.org"
+        eh.append(eh_extr)
+    extractors["e-hentai"] = eh
+
     # sort lists by category
     return sorted(
         extractors.values(),
diff --git a/test/test_downloader.py b/test/test_downloader.py
index caed983..0f58d4e 100644
--- a/test/test_downloader.py
+++ b/test/test_downloader.py
@@ -120,7 +120,7 @@ class TestDownloaderBase(unittest.TestCase):
         }
         pathfmt = PathFormat(cls.extractor)
         pathfmt.set_directory(kwdict)
-        pathfmt.set_keywords(kwdict)
+        pathfmt.set_filename(kwdict)
 
         if content:
             mode = "w" + ("b" if isinstance(content, bytes) else "")
@@ -145,7 +145,7 @@ class TestDownloaderBase(unittest.TestCase):
 
         # test filename extension
         self.assertEqual(
-            pathfmt.keywords["extension"],
+            pathfmt.extension,
             expected_extension,
         )
         self.assertEqual(
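For code built on top of gallery-dl, the visible API change in these test updates is the rename of PathFormat.set_keywords() to set_filename(). The call sequence mirrors the unit-test setup in the new test file that follows; 'base-directory' and the 'test:' extractor are borrowed from it, so this sketch creates a real directory when run:

    from gallery_dl import config, extractor, util

    config.set(("base-directory",), "/tmp/gallery-dl")
    extr = extractor.find("test:")

    kwdict = {"category": "test", "filename": "file", "extension": "jpg"}
    pathfmt = util.PathFormat(extr)
    pathfmt.set_directory(kwdict)   # builds and creates the target directory
    pathfmt.set_filename(kwdict)    # formerly pathfmt.set_keywords(kwdict)
    print(pathfmt.realpath)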
diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py
new file mode 100644
index 0000000..786dc46
--- /dev/null
+++ b/test/test_postprocessor.py
@@ -0,0 +1,294 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import os.path
+import zipfile
+import tempfile
+from datetime import datetime, timezone as tz
+
+import unittest
+from unittest.mock import Mock, mock_open, patch
+
+from gallery_dl import postprocessor, extractor, util, config
+from gallery_dl.postprocessor.common import PostProcessor
+
+
+class MockPostprocessorModule(Mock):
+    __postprocessor__ = "mock"
+
+
+class TestPostprocessorModule(unittest.TestCase):
+
+    def setUp(self):
+        postprocessor._cache.clear()
+
+    def test_find(self):
+        for name in (postprocessor.modules):
+            cls = postprocessor.find(name)
+            self.assertEqual(cls.__name__, name.capitalize() + "PP")
+            self.assertIs(cls.__base__, PostProcessor)
+
+        self.assertEqual(postprocessor.find("foo"), None)
+        self.assertEqual(postprocessor.find(1234) , None)
+        self.assertEqual(postprocessor.find(None) , None)
+
+    @patch("importlib.import_module")
+    def test_cache(self, import_module):
+        import_module.return_value = MockPostprocessorModule()
+
+        for name in (postprocessor.modules):
+            postprocessor.find(name)
+        self.assertEqual(import_module.call_count, len(postprocessor.modules))
+
+        # no new calls to import_module
+        for name in (postprocessor.modules):
+            postprocessor.find(name)
+        self.assertEqual(import_module.call_count, len(postprocessor.modules))
+
+
+class BasePostprocessorTest(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.extractor = extractor.find("test:")
+        cls.dir = tempfile.TemporaryDirectory()
+        cls.fnum = 0
+        config.set(("base-directory",), cls.dir.name)
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.dir.cleanup()
+        config.clear()
+
+    def _create(self, options=None, data=None):
+        kwdict = {"category": "test", "filename": "file", "extension": "ext"}
+        if options is None:
+            options = {}
+        if data is not None:
+            kwdict.update(data)
+
+        self.pathfmt = util.PathFormat(self.extractor)
+        self.pathfmt.set_directory(kwdict)
+        self.pathfmt.set_filename(kwdict)
+
+        pp = postprocessor.find(self.__class__.__name__[:-4].lower())
+        return pp(self.pathfmt, options)
+
+
+class ClassifyTest(BasePostprocessorTest):
+
+    def test_classify_default(self):
+        pp = self._create()
+
+        self.assertEqual(pp.mapping, {
+            ext: directory
+            for directory, exts in pp.DEFAULT_MAPPING.items()
+            for ext in exts
+        })
+        self.pathfmt.set_extension("jpg")
+
+        pp.prepare(self.pathfmt)
+        path = os.path.join(self.dir.name, "test", "Pictures")
+        self.assertEqual(self.pathfmt.path, path + "/file.jpg")
+        self.assertEqual(self.pathfmt.realpath, path + "/file.jpg")
+
+        with patch("os.makedirs") as mkdirs:
+            pp.run(self.pathfmt)
+        mkdirs.assert_called_once_with(path, exist_ok=True)
+
+    def test_classify_noop(self):
+        pp = self._create()
+        rp = self.pathfmt.realpath
+
+        pp.prepare(self.pathfmt)
+        self.assertEqual(self.pathfmt.path, rp)
+        self.assertEqual(self.pathfmt.realpath, rp)
+
+        with patch("os.makedirs") as mkdirs:
+            pp.run(self.pathfmt)
+        self.assertEqual(mkdirs.call_count, 0)
+
+    def test_classify_custom(self):
+        pp = self._create({"mapping": {
+            "foo/bar": ["foo", "bar"],
+        }})
+
+        self.assertEqual(pp.mapping, {
+            "foo": "foo/bar",
+            "bar": "foo/bar",
+        })
+        self.pathfmt.set_extension("foo")
+
+        pp.prepare(self.pathfmt)
+        path = os.path.join(self.dir.name, "test", "foo", "bar")
+        self.assertEqual(self.pathfmt.path, path + "/file.foo")
+        self.assertEqual(self.pathfmt.realpath, path + "/file.foo")
+
+        with patch("os.makedirs") as mkdirs:
+            pp.run(self.pathfmt)
+        mkdirs.assert_called_once_with(path, exist_ok=True)
+
+
+class MetadataTest(BasePostprocessorTest):
+
+    def test_metadata_default(self):
+        pp = self._create()
+
+        # default arguments
+        self.assertEqual(pp.write    , pp._write_json)
+        self.assertEqual(pp.ascii    , False)
+        self.assertEqual(pp.indent   , 4)
+        self.assertEqual(pp.extension, "json")
+
+    def test_metadata_json(self):
+        pp = self._create({
+            "mode"     : "json",
+            "ascii"    : True,
+            "indent"   : 2,
+            "extension": "JSON",
+        })
+
+        self.assertEqual(pp.write    , pp._write_json)
+        self.assertEqual(pp.ascii    , True)
+        self.assertEqual(pp.indent   , 2)
+        self.assertEqual(pp.extension, "JSON")
+
+        with patch("builtins.open", mock_open()) as m:
+            pp.prepare(self.pathfmt)
+            pp.run(self.pathfmt)
+
+        path = self.pathfmt.realpath + ".JSON"
+        m.assert_called_once_with(path, "w", encoding="utf-8")
+        self.assertEqual(self._output(m), """{
+  "category": "test",
+  "extension": "ext",
+  "filename": "file"
+}
+""")
+
+    def test_metadata_tags(self):
+        pp = self._create({"mode": "tags"}, {"tags": ["foo", "bar", "baz"]})
+        self.assertEqual(pp.write, pp._write_tags)
+        self.assertEqual(pp.extension, "txt")
+
+        with patch("builtins.open", mock_open()) as m:
+            pp.prepare(self.pathfmt)
+            pp.run(self.pathfmt)
+
+        path = self.pathfmt.realpath + ".txt"
+        m.assert_called_once_with(path, "w", encoding="utf-8")
+        self.assertEqual(self._output(m), "foo\nbar\nbaz\n")
+
+    def test_metadata_tags_split_1(self):
+        pp = self._create({"mode": "tags"}, {"tags": "foo, bar, baz"})
+        with patch("builtins.open", mock_open()) as m:
+            pp.prepare(self.pathfmt)
+            pp.run(self.pathfmt)
+        self.assertEqual(self._output(m), "foo\nbar\nbaz\n")
+
+    def test_metadata_tags_split_2(self):
+        pp = self._create(
+            {"mode": "tags"},
+            {"tags": "foobar1 foobar2 foobarbaz"},
+        )
+        with patch("builtins.open", mock_open()) as m:
+            pp.prepare(self.pathfmt)
+            pp.run(self.pathfmt)
+        self.assertEqual(self._output(m), "foobar1\nfoobar2\nfoobarbaz\n")
+
+    def test_metadata_tags_tagstring(self):
+        pp = self._create({"mode": "tags"}, {"tag_string": "foo, bar, baz"})
+        with patch("builtins.open", mock_open()) as m:
+            pp.prepare(self.pathfmt)
+            pp.run(self.pathfmt)
+        self.assertEqual(self._output(m), "foo\nbar\nbaz\n")
+
+    def test_metadata_custom(self):
+        pp = self._create(
+            {"mode": "custom", "format": "{foo}\n{missing}\n"},
+            {"foo": "bar"},
+        )
+        self.assertEqual(pp.write, pp._write_custom)
+        self.assertEqual(pp.extension, "txt")
+        self.assertTrue(pp.formatter)
+
+        with patch("builtins.open", mock_open()) as m:
+            pp.prepare(self.pathfmt)
+            pp.run(self.pathfmt)
+        self.assertEqual(self._output(m), "bar\nNone\n")
+
+    @staticmethod
+    def _output(mock):
+        return "".join(
+            call[1][0]
+            for call in mock.mock_calls
+            if call[0] == "().write"
+        )
+
+
+class MtimeTest(BasePostprocessorTest):
+
+    def test_mtime_default(self):
+        pp = self._create()
+        self.assertEqual(pp.key, "date")
+
+    def test_mtime_datetime(self):
+        pp = self._create(None, {"date": datetime(1980, 1, 1, tzinfo=tz.utc)})
+        pp.prepare(self.pathfmt)
+        pp.run(self.pathfmt)
+        self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800)
+
+    def test_mtime_timestamp(self):
+        pp = self._create(None, {"date": 315532800})
+        pp.prepare(self.pathfmt)
+        pp.run(self.pathfmt)
+        self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800)
+
+    def test_mtime_custom(self):
+        pp = self._create({"key": "foo"}, {"foo": 315532800})
+        pp.prepare(self.pathfmt)
+        pp.run(self.pathfmt)
+        self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800)
+
+
+class ZipTest(BasePostprocessorTest):
+
+    def test_zip_default(self):
+        pp = self._create()
+        self.assertEqual(pp.path, self.pathfmt.realdirectory)
+        self.assertEqual(pp.run, pp._write)
+        self.assertEqual(pp.delete, True)
+        self.assertFalse(hasattr(pp, "args"))
+        self.assertEqual(pp.zfile.compression, zipfile.ZIP_STORED)
+        self.assertEqual(
+            pp.zfile.filename, self.pathfmt.realdirectory + ".zip")
+
+    def test_zip_options(self):
+        pp = self._create({
+            "keep-files": True,
+            "compression": "zip",
+            "extension": "cbz",
+        })
+        self.assertEqual(pp.delete, False)
+        self.assertEqual(pp.zfile.compression, zipfile.ZIP_DEFLATED)
+        self.assertEqual(
+            pp.zfile.filename, self.pathfmt.realdirectory + ".cbz")
+
+    def test_zip_safe(self):
+        pp = self._create({"mode": "safe"})
+        self.assertEqual(pp.delete, True)
+        self.assertEqual(pp.path, self.pathfmt.realdirectory)
+        self.assertEqual(pp.run, pp._write_safe)
+        self.assertEqual(pp.args, (
+            pp.path + ".zip", "a", zipfile.ZIP_STORED, True
+        ))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/test_results.py b/test/test_results.py
index 839a75c..12f2416 100644
--- a/test/test_results.py
+++ b/test/test_results.py
@@ -26,12 +26,9 @@ TRAVIS_SKIP = {
 
 # temporary issues, etc.
 BROKEN = {
-    "adultempire",
-    "flickr",
+    "8chan",
     "imgth",
-    "mangafox",
     "mangapark",
-    "pixnet",
 }
 
 
@@ -90,13 +87,17 @@ class TestExtractorResults(unittest.TestCase):
         # test archive-id uniqueness
         self.assertEqual(len(set(tjob.list_archive)), len(tjob.list_archive))
 
-        # test '_extractor' entries
         if tjob.queue:
+            # test '_extractor' entries
             for url, kwdict in zip(tjob.list_url, tjob.list_keyword):
                 if "_extractor" in kwdict:
                     extr = kwdict["_extractor"].from_url(url)
                     self.assertIsInstance(extr, kwdict["_extractor"])
                     self.assertEqual(extr.url, url)
+        else:
+            # test 'extension' entries
+            for kwdict in tjob.list_keyword:
+                self.assertIn("extension", kwdict)
 
         # test extraction results
         if "url" in result:
@@ -168,7 +169,6 @@ class ResultJob(job.DownloadJob):
 
         if content:
             self.fileobj = TestPathfmt(self.hash_content)
-            self.get_downloader("http").check_extension = lambda a, b: None
 
         self.format_directory = TestFormatter(
             "".join(self.extractor.directory_fmt))
@@ -222,8 +222,8 @@ class TestPathfmt():
         self.hashobj = hashobj
         self.path = ""
         self.size = 0
-        self.keywords = {}
-        self.has_extension = True
+        self.kwdict = {}
+        self.extension = "jpg"
 
     def __enter__(self):
         return self
@@ -280,6 +280,7 @@ def setup_test_config():
     config.clear()
     config.set(("cache", "file"), ":memory:")
     config.set(("downloader", "part"), False)
+    config.set(("downloader", "adjust-extensions"), False)
     config.set(("extractor", "timeout"), 60)
     config.set(("extractor", "username"), name)
     config.set(("extractor", "password"), name)
--
cgit v1.2.3