| author | 2021-06-05 20:55:36 -0400 |
|---|---|
| committer | 2021-06-05 20:55:36 -0400 |
| commit | 8a644b7a06c504263a478d3681eed10b4161b5be (patch) |
| tree | b3d668588e5c0be8c75467e50499f73ff9ec7c05 /gallery_dl/extractor |
| parent | e7eb1f9779f2e223575ab23a6bc1abf2222e7d27 (diff) |
New upstream version 1.17.5 (tag: upstream/1.17.5)
Diffstat (limited to 'gallery_dl/extractor')
27 files changed, 460 insertions, 233 deletions
```diff
diff --git a/gallery_dl/extractor/35photo.py b/gallery_dl/extractor/35photo.py
index edb9d46..27634de 100644
--- a/gallery_dl/extractor/35photo.py
+++ b/gallery_dl/extractor/35photo.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2019-2020 Mike Fährmann
+# Copyright 2019-2021 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -104,7 +104,8 @@ class _35photoUserExtractor(_35photoExtractor):
                r"/(?!photo_|genre_|tags/|rating/)([^/?#]+)")
     test = (
         ("https://35photo.pro/liya", {
-            "pattern": r"https://m\d+.35photo.pro/photos_(main|series)/.*.jpg",
+            "pattern": r"https://([a-z][0-9]\.)?35photo\.pro"
+                       r"/photos_(main|series)/.*\.jpg",
             "count": 9,
         }),
         ("https://35photo.pro/suhoveev", {
@@ -214,7 +215,7 @@ class _35photoImageExtractor(_35photoExtractor):
     test = ("https://35photo.pro/photo_753340/", {
         "count": 1,
         "keyword": {
-            "url" : r"re:https://m\d+.35photo.pro/photos_main/.*.jpg",
+            "url" : r"re:https://35photo\.pro/photos_main/.*\.jpg",
             "id" : 753340,
             "title" : "Winter walk",
             "description": str,
diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py
index 0583eb9..c2c5a66 100644
--- a/gallery_dl/extractor/500px.py
+++ b/gallery_dl/extractor/500px.py
@@ -146,7 +146,7 @@ class _500pxGalleryExtractor(_500pxExtractor):
         }),
         # unavailable photos (#1335)
         ("https://500px.com/p/Light_Expression_Photography/galleries/street", {
-            "count": ">= 7",
+            "count": 0,
         }),
         ("https://500px.com/fashvamp/galleries/lera"),
     )
@@ -172,7 +172,7 @@ class _500pxGalleryExtractor(_500pxExtractor):
         }
         gallery = self._request_graphql(
             "GalleriesDetailQueryRendererQuery", variables,
-            "fb8bb66d31b58903e2f01ebe66bbe7937b982753be3211855b7bce4e286c1a49",
+            "eda3c77ca4efe4b3347ec9c08befe3bd2c58099ebfb1f680d829fcd26d34f12d",
         )["gallery"]
         self._photos = gallery["photos"]
 
@@ -200,8 +200,8 @@ class _500pxGalleryExtractor(_500pxExtractor):
             variables["cursor"] = photos["pageInfo"]["endCursor"]
             photos = self._request_graphql(
                 "GalleriesDetailPaginationContainerQuery", variables,
-                "457c66d976f56863c81795f03e98cb54"
-                "3c7c6cdae7abeab8fe9e8e8a67479fa9",
+                "466cf6661a07e7fdca465edb39118efb"
+                "80fb157c6d3f620c7f518cdae0832c78",
             )["galleryByOwnerIdAndSlugOrToken"]["photos"]
diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py
index ded2ae3..0d0ad70 100644
--- a/gallery_dl/extractor/aryion.py
+++ b/gallery_dl/extractor/aryion.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2020 Mike Fährmann
+# Copyright 2020-2021 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -12,7 +12,6 @@ from .common import Extractor, Message
 from .. import text, util, exception
 from ..cache import cache
 
-
 BASE_PATTERN = r"(?:https?://)?(?:www\.)?aryion\.com/g4"
@@ -33,6 +32,8 @@ class AryionExtractor(Extractor):
         self._needle = "class='gallery-item' id='"
 
     def login(self):
+        if self._check_cookies(self.cookienames):
+            return
         username, password = self._get_auth_info()
         if username:
             self._update_cookies(self._login_impl(username, password))
@@ -73,8 +74,7 @@ class AryionExtractor(Extractor):
     def _pagination(self, url):
         while True:
             page = self.request(url).text
-            yield from text.extract_iter(
-                page, self._needle, "'")
+            yield from text.extract_iter(page, self._needle, "'")
 
             pos = page.find("Next &gt;&gt;")
             if pos < 0:
@@ -173,7 +173,7 @@ class AryionGalleryExtractor(AryionExtractor):
 
     def skip(self, num):
         if self.recursive:
-            num = 0
+            return 0
         self.offset += num
         return num
 
@@ -182,7 +182,7 @@ class AryionGalleryExtractor(AryionExtractor):
             url = "{}/g4/gallery/{}".format(self.root, self.user)
             return self._pagination(url)
         else:
-            self._needle = "class='thumb' href='/g4/view/"
+            self._needle = "thumb' href='/g4/view/"
             url = "{}/g4/latest.php?name={}".format(self.root, self.user)
             return util.advance(self._pagination(url), self.offset)
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 1f86ea5..3b96a4e 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -61,6 +61,7 @@ class DanbooruExtractor(Extractor):
                     "{}/posts/{}.json?only=pixiv_ugoira_frame_data".format(
                         self.root, post["id"])
                 ).json()["pixiv_ugoira_frame_data"]["data"]
+                post["_http_adjust_extension"] = False
             else:
                 url = post["large_file_url"]
                 post["extension"] = "webm"
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 47f589a..9a461a4 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -930,10 +930,12 @@ class DeviantartOAuthAPI():
         self.folders = extractor.config("folders", False)
         self.metadata = extractor.extra or extractor.config("metadata", False)
 
-        self.client_id = extractor.config(
-            "client-id", self.CLIENT_ID)
-        self.client_secret = extractor.config(
-            "client-secret", self.CLIENT_SECRET)
+        self.client_id = extractor.config("client-id")
+        if self.client_id:
+            self.client_secret = extractor.config("client-secret")
+        else:
+            self.client_id = self.CLIENT_ID
+            self.client_secret = self.CLIENT_SECRET
 
         token = extractor.config("refresh-token")
         if token is None or token == "cache":
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 910da7d..64a6cb7 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -128,7 +128,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             "gid": 1200119,
             "height": int,
             "image_token": "re:[0-9a-f]{10}",
-            "lang": "jp",
+            "lang": "ja",
             "language": "Japanese",
             "parent": "",
             "rating": r"re:\d\.\d+",
diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py
index 0bcec2b..5962b9e 100644
--- a/gallery_dl/extractor/foolfuuka.py
+++ b/gallery_dl/extractor/foolfuuka.py
@@ -135,7 +135,7 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor):
             "url": "61896d9d9a2edb556b619000a308a984307b6d30",
         }),
         ("https://thebarchive.com/b/thread/739772332/", {
-            "url": "e8b18001307d130d67db31740ce57c8561b5d80c",
+            "url": "07d39d2cb48f40fb337dc992993d965b0cd5f7cd",
         }),
     )
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index 863cead..df45d0d 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -8,8 +8,10 @@
 
 """Extractors for https://gelbooru.com/"""
 
+from .common import Extractor, Message
 from . import gelbooru_v02
 from .. import text, exception
+import binascii
 
 
 class GelbooruBase():
@@ -131,3 +133,23 @@ class GelbooruPostExtractor(GelbooruBase,
             }
         }),
     )
+
+
+class GelbooruRedirectExtractor(GelbooruBase, Extractor):
+    subcategory = "redirect"
+    pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com"
+               r"/redirect\.php\?s=([^&#]+)")
+    test = (("https://gelbooru.com/redirect.php?s=Ly9nZWxib29ydS5jb20vaW5kZXgu"
+             "cGhwP3BhZ2U9cG9zdCZzPXZpZXcmaWQ9MTgzMDA0Ng=="), {
+        "pattern": r"https://gelbooru.com/index.php"
+                   r"\?page=post&s=view&id=1830046"
+    })
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.redirect_url = text.ensure_http_scheme(
+            binascii.a2b_base64(match.group(1)).decode())
+
+    def items(self):
+        data = {"_extractor": GelbooruPostExtractor}
+        yield Message.Queue, self.redirect_url, data
diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py
index 76b2c38..9370840 100644
--- a/gallery_dl/extractor/imagebam.py
+++ b/gallery_dl/extractor/imagebam.py
@@ -1,12 +1,12 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2014-2019 Mike Fährmann
+# Copyright 2014-2021 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extract images from http://www.imagebam.com/"""
+"""Extractors for https://www.imagebam.com/"""
 
 from .common import Extractor, Message
 from .. import text, exception
@@ -15,34 +15,44 @@ from .. import text, exception
 class ImagebamExtractor(Extractor):
     """Base class for imagebam extractors"""
     category = "imagebam"
-    root = "http://www.imagebam.com"
+    root = "https://www.imagebam.com"
+    cookies = None
 
-    def get_image_data(self, page_url, data):
-        """Fill 'data' and return image URL"""
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.key = match.group(1)
+        if self.cookies:
+            self.session.cookies = self.cookies
+
+    def get_image_data(self, data):
+        page_url = "{}/image/{}".format(self.root, data["image_key"])
         page = self.request(page_url).text
-        image_url = text.extract(page, 'property="og:image" content="', '"')[0]
-        data["extension"] = image_url.rpartition(".")[2]
-        data["image_key"] = page_url.rpartition("/")[2]
-        data["image_id"] = data["image_key"][6:]
-        return image_url
+        image_url, pos = text.extract(page, '<img src="https://images', '"')
+
+        if not image_url:
+            # cache cookies
+            ImagebamExtractor.cookies = self.session.cookies
+            # repeat request to get past "Continue to your image" pages
+            page = self.request(page_url).text
+            image_url, pos = text.extract(
+                page, '<img src="https://images', '"')
 
-    def request_page(self, url):
-        """Retrive the main part of a gallery page"""
-        page = self.request(text.urljoin(self.root, url)).text
-        return text.extract(page, "<fieldset>", "</fieldset>")[0]
+        filename = text.unescape(text.extract(page, 'alt="', '"', pos)[0])
+        data["url"] = "https://images" + image_url
+        data["filename"], _, data["extension"] = filename.rpartition(".")
 
 
 class ImagebamGalleryExtractor(ImagebamExtractor):
     """Extractor for image galleries from imagebam.com"""
     subcategory = "gallery"
-    directory_fmt = ("{category}", "{title} - {gallery_key}")
-    filename_fmt = "{num:>03}-{image_key}.{extension}"
+    directory_fmt = ("{category}", "{title} {gallery_key}")
+    filename_fmt = "{num:>03} {filename}.{extension}"
     archive_fmt = "{gallery_key}_{image_key}"
     pattern = r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([0-9a-z]+)"
     test = (
-        ("http://www.imagebam.com/gallery/adz2y0f9574bjpmonaismyrhtjgvey4o", {
+        ("https://www.imagebam.com/gallery/adz2y0f9574bjpmonaismyrhtjgvey4o", {
             "url": "76d976788ae2757ac81694736b07b72356f5c4c8",
-            "keyword": "9e25b8827474ac93c54855e798d60aa3cbecbd7a",
+            "keyword": "b048478b1bbba3072a7fa9fcc40630b3efad1f6c",
             "content": "596e6bfa157f2c7169805d50075c2986549973a8",
         }),
         ("http://www.imagebam.com/gallery/op9dwcklwdrrguibnkoe7jxgvig30o5p", {
@@ -51,78 +61,67 @@ class ImagebamGalleryExtractor(ImagebamExtractor):
             "url": "32ae6fe5dc3e4ca73ff6252e522d16473595d1d1",
         }),
         ("http://www.imagebam.com/gallery/gsl8teckymt4vbvx1stjkyk37j70va2c", {
-            "exception": exception.NotFoundError,
+            "exception": exception.HttpError,
         }),
     )
 
-    def __init__(self, match):
-        ImagebamExtractor.__init__(self, match)
-        self.gallery_key = match.group(1)
-
     def items(self):
-        url = "{}/gallery/{}".format(self.root, self.gallery_key)
-        page = self.request_page(url)
-        if not page or ">Error<" in page:
-            raise exception.NotFoundError("gallery")
+        url = "{}/gallery/{}".format(self.root, self.key)
+        page = self.request(url).text
 
         data = self.get_metadata(page)
-        imgs = self.get_image_pages(page)
-        data["count"] = len(imgs)
-        data["gallery_key"] = self.gallery_key
+        keys = self.get_image_keys(page)
+        keys.reverse()
+        data["count"] = len(keys)
+        data["gallery_key"] = self.key
 
-        yield Message.Version, 1
         yield Message.Directory, data
-        for data["num"], page_url in enumerate(imgs, 1):
-            image_url = self.get_image_data(page_url, data)
-            yield Message.Url, image_url, data
+        for data["num"], data["image_key"] in enumerate(keys, 1):
+            self.get_image_data(data)
+            yield Message.Url, data["url"], data
 
     @staticmethod
     def get_metadata(page):
         """Return gallery metadata"""
-        return text.extract_all(page, (
-            ("title"      , "'> ", " <span "),
-            (None         , "'>", "</span>"),
-            ("description", ":#FCFCFC;'>", "</div>"),
-        ))[0]
-
-    def get_image_pages(self, page):
-        """Return a list of all image pages"""
-        pages = []
+        title = text.extract(page, 'id="gallery-name">', '<')[0]
+        return {"title": text.unescape(title.strip())}
+
+    def get_image_keys(self, page):
+        """Return a list of all image keys"""
+        keys = []
         while True:
-            pages.extend(text.extract_iter(page, "\n<a href='", "'"))
-            pos = page.find('"pagination_current"')
+            keys.extend(text.extract_iter(
+                page, '<a href="https://www.imagebam.com/image/', '"'))
+            pos = page.find('rel="next" aria-label="Next')
             if pos > 0:
-                url = text.extract(page, "<a href='", "'", pos)[0]
+                url = text.rextract(page, 'href="', '"', pos)[0]
                 if url:
-                    page = self.request_page(url)
+                    page = self.request(url).text
                     continue
-            return pages
+            return keys
 
 
 class ImagebamImageExtractor(ImagebamExtractor):
     """Extractor for single images from imagebam.com"""
     subcategory = "image"
-    filename_fmt = "{image_key}.{extension}"
     archive_fmt = "{image_key}"
     pattern = (r"(?:https?://)?(?:\w+\.)?imagebam\.com"
               r"/(?:image/|(?:[0-9a-f]{2}/){3})([0-9a-f]+)")
     test = (
-        ("http://www.imagebam.com/image/94d56c502511890", {
+        ("https://www.imagebam.com/image/94d56c502511890", {
             "url": "5e9ba3b1451f8ded0ae3a1b84402888893915d4a",
-            "keyword": "4263d4840007524129792b8587a562b5d20c2687",
+            "keyword": "2a4380d4b57554ff793898c2d6ec60987c86d1a1",
             "content": "0c8768055e4e20e7c7259608b67799171b691140",
         }),
         ("http://images3.imagebam.com/1d/8c/44/94d56c502511890.png"),
+        # NSFW (#1534)
+        ("https://www.imagebam.com/image/0850951366904951", {
+            "url": "d37297b17ed1615b4311c8ed511e50ce46e4c748",
+        }),
     )
 
-    def __init__(self, match):
-        ImagebamExtractor.__init__(self, match)
-        self.image_key = match.group(1)
-
     def items(self):
-        page_url = "{}/image/{}".format(self.root, self.image_key)
-        data = {}
-        image_url = self.get_image_data(page_url, data)
-        yield Message.Version, 1
+        data = {"image_key": self.key}
+        self.get_image_data(data)
         yield Message.Directory, data
-        yield Message.Url, image_url, data
+        yield Message.Url, data["url"], data
diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py
index 7009c7a..f925c9e 100644
--- a/gallery_dl/extractor/imgur.py
+++ b/gallery_dl/extractor/imgur.py
@@ -57,7 +57,8 @@ class ImgurImageExtractor(ImgurExtractor):
     subcategory = "image"
     filename_fmt = "{category}_{id}{title:?_//}.{extension}"
     archive_fmt = "{id}"
-    pattern = BASE_PATTERN + r"/(?!gallery|search)(\w{7}|\w{5})[sbtmlh]?\.?"
+    pattern = (BASE_PATTERN + r"/(?!gallery|search)"
+               r"(?:r/\w+/)?(\w{7}|\w{5})[sbtmlh]?")
     test = (
         ("https://imgur.com/21yMxCS", {
             "url": "6f2dcfb86815bdd72808c313e5f715610bc7b9b2",
@@ -110,6 +111,7 @@ class ImgurImageExtractor(ImgurExtractor):
         ("https://imgur.com/zzzzzzz", {  # not found
             "exception": exception.HttpError,
         }),
+        ("https://m.imgur.com/r/Celebs/iHJ7tsM"),
         ("https://www.imgur.com/21yMxCS"),  # www
         ("https://m.imgur.com/21yMxCS"),  # mobile
         ("https://imgur.com/zxaY6"),  # 5 character key
@@ -289,7 +291,7 @@ class ImgurFavoriteExtractor(ImgurExtractor):
 class ImgurSubredditExtractor(ImgurExtractor):
     """Extractor for a subreddits's imgur links"""
     subcategory = "subreddit"
-    pattern = BASE_PATTERN + r"/r/([^/?#]+)"
+    pattern = BASE_PATTERN + r"/r/([^/?#]+)/?$"
     test = ("https://imgur.com/r/pics", {
         "range": "1-100",
         "count": 100,
diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py
index 9b5331a..2f7935b 100644
--- a/gallery_dl/extractor/inkbunny.py
+++ b/gallery_dl/extractor/inkbunny.py
@@ -64,7 +64,7 @@ class InkbunnyExtractor(Extractor):
 class InkbunnyUserExtractor(InkbunnyExtractor):
     """Extractor for inkbunny user profiles"""
     subcategory = "user"
-    pattern = BASE_PATTERN + r"/(?!s/)(gallery/|scraps/)?([^/?#]+)"
+    pattern = BASE_PATTERN + r"/(?!s/)(gallery/|scraps/)?(\w+)(?:$|[/?#])"
     test = (
         ("https://inkbunny.net/soina", {
             "pattern": r"https://[\w.]+\.metapix\.net/files/full"
@@ -138,6 +138,33 @@ class InkbunnyUserExtractor(InkbunnyExtractor):
         return self.api.search(params)
 
 
+class InkbunnyFavoriteExtractor(InkbunnyExtractor):
+    """Extractor for inkbunny user favorites"""
+    subcategory = "favorite"
+    pattern = BASE_PATTERN + r"/userfavorites_process\.php\?favs_user_id=(\d+)"
+    test = (
+        ("https://inkbunny.net/userfavorites_process.php?favs_user_id=20969", {
+            "pattern": r"https://[\w.]+\.metapix\.net/files/full"
+                       r"/\d+/\d+_\w+_.+",
+            "range": "20-50",
+        }),
+    )
+
+    def __init__(self, match):
+        InkbunnyExtractor.__init__(self, match)
+        self.user_id = match.group(1)
+
+    def posts(self):
+        orderby = self.config("orderby", "fav_datetime")
+        params = {
+            "favs_user_id": self.user_id,
+            "orderby" : orderby,
+        }
+        if orderby and orderby.startswith("unread_"):
+            params["unread_submissions"] = "yes"
+        return self.api.search(params)
+
+
 class InkbunnyPostExtractor(InkbunnyExtractor):
     """Extractor for individual Inkbunny posts"""
     subcategory = "post"
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index a027be1..e3db789 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -408,7 +408,7 @@ class InstagramPostsExtractor(InstagramExtractor):
         url = "{}/{}/".format(self.root, self.item)
         user = self._extract_profile_page(url)
 
-        query_hash = "42d2750e44dbac713ff30130659cd891"
+        query_hash = "32b14723a678bd4628d70c1f877b94c9"
         variables = {"id": user["id"], "first": 50}
         edge = self._get_edge_data(user, "edge_owner_to_timeline_media")
         return self._pagination_graphql(query_hash, variables, edge)
@@ -613,7 +613,7 @@ class InstagramPostExtractor(InstagramExtractor):
     )
 
     def posts(self):
-        query_hash = "cf28bf5eb45d62d4dc8e77cdb99d750d"
+        query_hash = "d4e8ae69cb68f66329dcebe82fb69f6d"
         variables = {
             "shortcode" : self.item,
             "child_comment_count" : 3,
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 377e00b..1b5e5e9 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -26,24 +26,41 @@ class KemonopartyExtractor(Extractor):
     def items(self):
         find_inline = re.compile(r'src="(/inline/[^"]+)').findall
 
+        if self.config("metadata"):
+            username = text.unescape(text.extract(
+                self.request(self.user_url).text, "<title>", " | Kemono<")[0])
+        else:
+            username = None
+
         for post in self.posts():
             files = []
-            if post["file"]:
-                files.append(post["file"])
-            if post["attachments"]:
-                files.extend(post["attachments"])
+            append = files.append
+            file = post["file"]
+
+            if file:
+                file["type"] = "file"
+                append(file)
+            for attachment in post["attachments"]:
+                attachment["type"] = "attachment"
+                append(attachment)
             for path in find_inline(post["content"] or ""):
-                files.append({"path": path, "name": path})
+                append({"path": path, "name": path, "type": "inline"})
 
             post["date"] = text.parse_datetime(
                 post["published"], "%a, %d %b %Y %H:%M:%S %Z")
+            if username:
+                post["username"] = username
             yield Message.Directory, post
 
             for post["num"], file in enumerate(files, 1):
+                post["type"] = file["type"]
                 url = file["path"]
                 if url[0] == "/":
-                    url = self.root + url
+                    url = "https://data.kemono.party" + url
+                elif url.startswith("https://kemono.party/"):
+                    url = "https://data.kemono.party" + url[20:]
+
                 text.nameext_from_url(file["name"], post)
                 yield Message.Url, url, post
@@ -64,6 +81,7 @@ class KemonopartyUserExtractor(KemonopartyExtractor):
         KemonopartyExtractor.__init__(self, match)
         service, user_id = match.groups()
         self.api_url = "{}/api/{}/user/{}".format(self.root, service, user_id)
+        self.user_url = "{}/{}/user/{}".format(self.root, service, user_id)
 
     def posts(self):
         url = self.api_url
@@ -84,7 +102,7 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
     pattern = BASE_PATTERN + r"/post/([^/?#]+)"
     test = (
         ("https://kemono.party/fanbox/user/6993449/post/506575", {
-            "pattern": r"https://kemono\.party/files/fanbox"
+            "pattern": r"https://data\.kemono\.party/files/fanbox"
                        r"/6993449/506575/P058kDFYus7DbqAkGlfWTlOr\.jpeg",
             "keyword": {
                 "added": "Wed, 06 May 2020 20:28:02 GMT",
@@ -101,16 +119,21 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
                 "shared_file": False,
                 "subcategory": "post",
                 "title": "c96取り置き",
+                "type": "file",
                 "user": "6993449",
             },
         }),
         # inline image (#1286)
         ("https://kemono.party/fanbox/user/7356311/post/802343", {
-            "pattern": r"https://kemono\.party/inline/fanbox"
+            "pattern": r"https://data\.kemono\.party/inline/fanbox"
                        r"/uaozO4Yga6ydkGIJFAQDixfE\.jpeg",
         }),
+        # kemono.party -> data.kemono.party
+        ("https://kemono.party/gumroad/user/trylsc/post/IURjT", {
+            "pattern": r"https://data\.kemono\.party/(file|attachment)s"
+                       r"/gumroad/trylsc/IURjT/",
+        }),
         ("https://kemono.party/subscribestar/user/alcorart/post/184330"),
-        ("https://kemono.party/gumroad/user/trylsc/post/IURjT"),
     )
 
     def __init__(self, match):
@@ -118,6 +141,7 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
         service, user_id, post_id = match.groups()
         self.api_url = "{}/api/{}/user/{}/post/{}".format(
             self.root, service, user_id, post_id)
+        self.user_url = "{}/{}/user/{}".format(self.root, service, user_id)
 
     def posts(self):
         posts = self.request(self.api_url).json()
diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py
index f8e1473..833d18e 100644
--- a/gallery_dl/extractor/manganelo.py
+++ b/gallery_dl/extractor/manganelo.py
@@ -4,35 +4,23 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extract manga-chapters and entire manga from https://manganelo.com/"""
+"""Extractors for https://manganato.com/"""
 
 from .common import ChapterExtractor, MangaExtractor
 from .. import text
 import re
 
+BASE_PATTERN = \
+    r"(?:https?://)?((?:(?:read)?manganato|(?:www\.)?manganelo)\.com)"
 
-class ManganeloBase():
-    """Base class for manganelo extractors"""
-    category = "manganelo"
-    root = "https://manganelo.com"
-
-    @staticmethod
-    def parse_page(page, data):
-        """Parse metadata on 'page' and add it to 'data'"""
-        text.extract_all(page, (
-            ("manga"  , '<h1>', '</h1>'),
-            ('author' , '</i>Author(s) :</td>', '</tr>'),
-        ), values=data)
-        data["author"] = text.remove_html(data["author"])
-        return data
-
-
-class ManganeloChapterExtractor(ManganeloBase, ChapterExtractor):
+
+class ManganeloChapterExtractor(ChapterExtractor):
     """Extractor for manga-chapters from manganelo.com"""
-    pattern = (r"(?:https?://)?(?:www\.)?manganelo\.com"
-               r"(/chapter/\w+/chapter_[^/?#]+)")
+    category = "manganelo"
+    root = "https://readmanganato.com"
+    pattern = BASE_PATTERN + r"(/(?:manga-\w+|chapter/\w+)/chapter[-_][^/?#]+)"
     test = (
-        ("https://manganelo.com/chapter/gq921227/chapter_23", {
+        ("https://readmanganato.com/manga-gn983696/chapter-23", {
             "pattern": r"https://s\d+\.\w+\.com/mangakakalot/g\d+/gq921227/"
                        r"vol3_chapter_23_24_yen/\d+\.jpg",
             "keyword": "3748087cf41abc97f991530e6fd53b291490d6d0",
@@ -43,11 +31,12 @@ class ManganeloChapterExtractor(ManganeloBase, ChapterExtractor):
             "content": "fbec629c71f66b246bfa0604204407c0d1c8ae38",
             "count": 39,
         }),
+        ("https://manganelo.com/chapter/gq921227/chapter_23"),
     )
 
     def __init__(self, match):
-        self.path = match.group(1)
-        ChapterExtractor.__init__(self, match, self.root + self.path)
+        domain, path = match.groups()
+        ChapterExtractor.__init__(self, match, "https://" + domain + path)
         self.session.headers['Referer'] = self.root
 
     def metadata(self, page):
@@ -85,21 +74,29 @@ class ManganeloChapterExtractor(ManganeloBase, ChapterExtractor):
         ]
 
 
-class ManganeloMangaExtractor(ManganeloBase, MangaExtractor):
+class ManganeloMangaExtractor(MangaExtractor):
     """Extractor for manga from manganelo.com"""
+    category = "manganelo"
+    root = "https://readmanganato.com"
     chapterclass = ManganeloChapterExtractor
-    pattern = (r"(?:https?://)?(?:www\.)?manganelo\.com"
-               r"(/(?:manga/|read_)\w+)")
+    pattern = BASE_PATTERN + r"(/(?:manga[-/]|read_)\w+)/?$"
     test = (
-        ("https://manganelo.com/manga/ol921234", {
-            "url": "6ba7f083a6944e414ad8214b74a0a40cb60d4562",
+        ("https://manganato.com/manga-gu983703", {
+            "pattern": ManganeloChapterExtractor.pattern,
+            "count": ">= 70",
         }),
         ("https://manganelo.com/manga/read_otome_no_teikoku", {
             "pattern": ManganeloChapterExtractor.pattern,
-            "count": ">= 40"
+            "count": ">= 40",
         }),
+        ("https://manganelo.com/manga/ol921234/"),
     )
 
+    def __init__(self, match):
+        domain, path = match.groups()
+        MangaExtractor.__init__(self, match, "https://" + domain + path)
+        self.session.headers['Referer'] = self.root
+
     def chapters(self, page):
         results = []
         data = self.parse_page(page, {"lang": "en", "language": "English"})
@@ -117,3 +114,13 @@ class ManganeloMangaExtractor(ManganeloBase, MangaExtractor):
             data["chapter"] = text.parse_int(chapter)
             data["chapter_minor"] = sep + minor
             results.append((url, data.copy()))
+
+    @staticmethod
+    def parse_page(page, data):
+        """Parse metadata on 'page' and add it to 'data'"""
+        text.extract_all(page, (
+            ("manga"  , '<h1>', '</h1>'),
+            ('author' , '</i>Author(s) :</td>', '</tr>'),
+        ), values=data)
+        data["author"] = text.remove_html(data["author"])
+        return data
diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py
index 558e682..9b6d4ba 100644
--- a/gallery_dl/extractor/mangapark.py
+++ b/gallery_dl/extractor/mangapark.py
@@ -17,7 +17,7 @@ import re
 class MangaparkBase():
     """Base class for mangapark extractors"""
     category = "mangapark"
-    root_fmt = "https://mangapark.{}"
+    root_fmt = "https://v2.mangapark.{}"
     browser = "firefox"
 
     @staticmethod
@@ -51,7 +51,7 @@ class MangaparkBase():
 
 class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor):
     """Extractor for manga-chapters from mangapark.net"""
-    pattern = (r"(?:https?://)?(?:www\.)?mangapark\.(me|net|com)"
+    pattern = (r"(?:https?://)?(?:www\.|v2\.)?mangapark\.(me|net|com)"
                r"/manga/([^?#]+/i\d+)")
     test = (
         ("https://mangapark.net/manga/gosu/i811653/c055/1", {
@@ -117,7 +117,7 @@ class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor):
 class MangaparkMangaExtractor(MangaparkBase, MangaExtractor):
     """Extractor for manga from mangapark.net"""
     chapterclass = MangaparkChapterExtractor
-    pattern = (r"(?:https?://)?(?:www\.)?mangapark\.(me|net|com)"
+    pattern = (r"(?:https?://)?(?:www\.|v2\.)?mangapark\.(me|net|com)"
                r"(/manga/[^/?#]+)/?$")
     test = (
         ("https://mangapark.net/manga/aria", {
diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py
index e1081da..b74355d 100644
--- a/gallery_dl/extractor/nozomi.py
+++ b/gallery_dl/extractor/nozomi.py
@@ -158,7 +158,7 @@ class NozomiTagExtractor(NozomiExtractor):
     """Extractor for posts from tag searches on nozomi.la"""
     subcategory = "tag"
     directory_fmt = ("{category}", "{search_tags}")
-    archive_fmt = "t_{search_tags}_{postid}"
+    archive_fmt = "t_{search_tags}_{dataid}"
     pattern = r"(?:https?://)?nozomi\.la/tag/([^/?#]+)-(\d+)\."
     test = ("https://nozomi.la/tag/3:1_aspect_ratio-1.html", {
         "pattern": r"^https://i.nozomi.la/\w/\w\w/\w+\.\w+$",
@@ -180,7 +180,7 @@ class NozomiSearchExtractor(NozomiExtractor):
     """Extractor for search results on nozomi.la"""
     subcategory = "search"
     directory_fmt = ("{category}", "{search_tags:J }")
-    archive_fmt = "t_{search_tags}_{postid}"
+    archive_fmt = "t_{search_tags}_{dataid}"
     pattern = r"(?:https?://)?nozomi\.la/search\.html\?q=([^&#]+)"
     test = ("https://nozomi.la/search.html?q=hibiscus%203:4_ratio#1", {
         "count": ">= 5",
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 839e0b8..9c32d7a 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -117,12 +117,22 @@ class PatreonExtractor(Extractor):
             attr = post["attributes"]
             attr["id"] = text.parse_int(post["id"])
 
-            if post.get("current_user_can_view", True):
+            if attr.get("current_user_can_view", True):
+
+                relationships = post["relationships"]
                 attr["images"] = self._files(post, included, "images")
                 attr["attachments"] = self._files(post, included, "attachments")
                 attr["date"] = text.parse_datetime(
                     attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
-                user = post["relationships"]["user"]
+
+                tags = relationships.get("user_defined_tags")
+                attr["tags"] = [
+                    tag["id"].replace("user_defined;", "")
+                    for tag in tags["data"]
+                    if tag["type"] == "post_tag"
+                ] if tags else []
+
+                user = relationships["user"]
                 attr["creator"] = (
                     self._user(user["links"]["related"]) or
                     included["user"][user["data"]["id"]])
@@ -299,6 +309,10 @@ class PatreonPostExtractor(PatreonExtractor):
         ("https://www.patreon.com/posts/19987002", {
             "count": 4,
         }),
+        # tags (#1539)
+        ("https://www.patreon.com/posts/free-post-12497641", {
+            "keyword": {"tags": ["AWMedia"]},
+        }),
         ("https://www.patreon.com/posts/not-found-123", {
             "exception": exception.NotFoundError,
         }),
diff --git a/gallery_dl/extractor/pillowfort.py b/gallery_dl/extractor/pillowfort.py
index cbd65d7..3c3fcd4 100644
--- a/gallery_dl/extractor/pillowfort.py
+++ b/gallery_dl/extractor/pillowfort.py
@@ -9,7 +9,9 @@
 """Extractors for https://www.pillowfort.social/"""
 
 from .common import Extractor, Message
-from .. import text
+from ..cache import cache
+from .. import text, exception
+import re
 
 BASE_PATTERN = r"(?:https?://)?www\.pillowfort\.social"
 
@@ -19,94 +21,171 @@ class PillowfortExtractor(Extractor):
     category = "pillowfort"
     root = "https://www.pillowfort.social"
     directory_fmt = ("{category}", "{username}")
-    filename_fmt = ("{post_id} {title|original_post[title]} "
+    filename_fmt = ("{post_id} {title|original_post[title]:?/ /}"
                     "{num:>02}.{extension}")
     archive_fmt = "{id}"
+    cookiedomain = "www.pillowfort.social"
 
     def __init__(self, match):
         Extractor.__init__(self, match)
         self.item = match.group(1)
-        self.reblogs = self.config("reblogs", False)
 
     def items(self):
-        for post in self.posts():
+        self.login()
+        inline = self.config("inline", True)
+        reblogs = self.config("reblogs", False)
+        external = self.config("external", False)
+
+        if inline:
+            inline = re.compile(r'src="(https://img\d+\.pillowfort\.social'
+                                r'/posts/[^"]+)').findall
 
-            if "original_post" in post and not self.reblogs:
+        for post in self.posts():
+            if "original_post" in post and not reblogs:
                 continue
 
-            files = post["media"]
-            del post["media"]
+            files = post.pop("media")
+            if inline:
+                for url in inline(post["content"]):
+                    files.append({"url": url})
 
             post["date"] = text.parse_datetime(
                 post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
+            post["post_id"] = post.pop("id")
             yield Message.Directory, post
 
             post["num"] = 0
             for file in files:
                 url = file["url"]
-                if url:
-                    post.update(file)
+                if not url:
+                    continue
+
+                if file.get("embed_code"):
+                    if not external:
+                        continue
+                    msgtype = Message.Queue
+                else:
                     post["num"] += 1
+                    msgtype = Message.Url
+
+                post.update(file)
+                text.nameext_from_url(url, post)
+                post["hash"], _, post["filename"] = \
+                    post["filename"].partition("_")
+
+                if "id" not in file:
+                    post["id"] = post["hash"]
+                if "created_at" in file:
                     post["date"] = text.parse_datetime(
                         file["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
-                    yield Message.Url, url, text.nameext_from_url(url, post)
+
+                yield msgtype, url, post
+
+    def login(self):
+        cget = self.session.cookies.get
+        if cget("_Pf_new_session", domain=self.cookiedomain) \
+                or cget("remember_user_token", domain=self.cookiedomain):
+            return
+
+        username, password = self._get_auth_info()
+        if username:
+            cookies = self._login_impl(username, password)
+            self._update_cookies(cookies)
+
+    @cache(maxage=14*24*3600, keyarg=1)
+    def _login_impl(self, username, password):
+        self.log.info("Logging in as %s", username)
+
+        url = "https://www.pillowfort.social/users/sign_in"
+        page = self.request(url).text
+        auth = text.extract(page, 'name="authenticity_token" value="', '"')[0]
+
+        headers = {"Origin": self.root, "Referer": url}
+        data = {
+            "utf8"              : "✓",
+            "authenticity_token": auth,
+            "user[email]"       : username,
+            "user[password]"    : password,
+            "user[remember_me]" : "1",
+        }
+        response = self.request(url, method="POST", headers=headers, data=data)
+
+        if not response.history:
+            raise exception.AuthenticationError()
+
+        return {
+            cookie.name: cookie.value
+            for cookie in response.history[0].cookies
+        }
 
 
 class PillowfortPostExtractor(PillowfortExtractor):
     """Extractor for a single pillowfort post"""
     subcategory = "post"
     pattern = BASE_PATTERN + r"/posts/(\d+)"
-    test = ("https://www.pillowfort.social/posts/27510", {
-        "pattern": r"https://img\d+\.pillowfort\.social/posts/\w+_out\d+\.png",
-        "count": 4,
-        "keyword": {
-            "avatar_url": str,
-            "col": 0,
-            "commentable": True,
-            "comments_count": int,
-            "community_id": None,
-            "content": str,
-            "created_at": str,
-            "date": "type:datetime",
-            "deleted": None,
-            "deleted_at": None,
-            "deleted_by_mod": None,
-            "deleted_for_flag_id": None,
-            "embed_code": None,
-            "id": int,
-            "last_activity": str,
-            "last_activity_elapsed": str,
-            "last_edited_at": None,
-            "likes_count": int,
-            "media_type": "picture",
-            "nsfw": False,
-            "num": int,
-            "original_post_id": None,
-            "original_post_user_id": None,
-            "picture_content_type": None,
-            "picture_file_name": None,
-            "picture_file_size": None,
-            "picture_updated_at": None,
-            "post_id": 27510,
-            "post_type": "picture",
-            "privacy": "public",
-            "reblog_copy_info": list,
-            "rebloggable": True,
-            "reblogged_from_post_id": None,
-            "reblogged_from_user_id": None,
-            "reblogs_count": int,
-            "row": int,
-            "small_image_url": None,
-            "tags": list,
-            "time_elapsed": str,
-            "timestamp": str,
-            "title": "What is Pillowfort.io? ",
-            "updated_at": str,
-            "url": r"re:https://img3.pillowfort.social/posts/.*\.png",
-            "user_id": 5,
-            "username": "Staff"
-        },
-    })
+    test = (
+        ("https://www.pillowfort.social/posts/27510", {
+            "pattern": r"https://img\d+\.pillowfort\.social"
+                       r"/posts/\w+_out\d+\.png",
+            "count": 4,
+            "keyword": {
+                "avatar_url": str,
+                "col": 0,
+                "commentable": True,
+                "comments_count": int,
+                "community_id": None,
+                "content": str,
+                "created_at": str,
+                "date": "type:datetime",
+                "deleted": None,
+                "deleted_at": None,
+                "deleted_by_mod": None,
+                "deleted_for_flag_id": None,
+                "embed_code": None,
+                "id": int,
+                "last_activity": str,
+                "last_activity_elapsed": str,
+                "last_edited_at": None,
+                "likes_count": int,
+                "media_type": "picture",
+                "nsfw": False,
+                "num": int,
+                "original_post_id": None,
+                "original_post_user_id": None,
+                "picture_content_type": None,
+                "picture_file_name": None,
+                "picture_file_size": None,
+                "picture_updated_at": None,
+                "post_id": 27510,
+                "post_type": "picture",
+                "privacy": "public",
+                "reblog_copy_info": list,
+                "rebloggable": True,
+                "reblogged_from_post_id": None,
+                "reblogged_from_user_id": None,
+                "reblogs_count": int,
+                "row": int,
+                "small_image_url": None,
+                "tags": list,
+                "time_elapsed": str,
+                "timestamp": str,
+                "title": "What is Pillowfort.io? ",
+                "updated_at": str,
+                "url": r"re:https://img3.pillowfort.social/posts/.*\.png",
+                "user_id": 5,
+                "username": "Staff"
+            },
+        }),
+        ("https://www.pillowfort.social/posts/1557500", {
+            "options": (("external", True), ("inline", False)),
+            "pattern": r"https://twitter\.com/Aliciawitdaart/status"
+                       r"/1282862493841457152",
+        }),
+        ("https://www.pillowfort.social/posts/1672518", {
+            "options": (("inline", True),),
+            "count": 3,
+        }),
+    )
 
     def posts(self):
         url = "{}/posts/{}/json/".format(self.root, self.item)
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index 8bfae06..8076fff 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -29,14 +29,28 @@ class PixivExtractor(Extractor):
         Extractor.__init__(self, match)
         self.api = PixivAppAPI(self)
         self.load_ugoira = self.config("ugoira", True)
-        self.translated_tags = self.config("translated-tags", False)
+        self.max_posts = self.config("max-posts", 0)
 
     def items(self):
-        tkey = "translated_name" if self.translated_tags else "name"
+        tags = self.config("tags", "japanese")
+        if tags == "original":
+            transform_tags = None
+        elif tags == "translated":
+            def transform_tags(work):
+                work["tags"] = list(set(
+                    tag["translated_name"] or tag["name"]
+                    for tag in work["tags"]))
+        else:
+            def transform_tags(work):
+                work["tags"] = [tag["name"] for tag in work["tags"]]
+
         ratings = {0: "General", 1: "R-18", 2: "R-18G"}
         metadata = self.metadata()
 
-        for work in self.works():
+        works = self.works()
+        if self.max_posts:
+            works = itertools.islice(works, self.max_posts)
+        for work in works:
             if not work["user"]["id"]:
                 continue
 
@@ -45,12 +59,10 @@ class PixivExtractor(Extractor):
             del work["meta_single_page"]
             del work["image_urls"]
             del work["meta_pages"]
+
+            if transform_tags:
+                transform_tags(work)
             work["num"] = 0
-            if self.translated_tags:
-                work["untranslated_tags"] = [
-                    tag["name"] for tag in work["tags"]
-                ]
-            work["tags"] = [tag[tkey] or tag["name"] for tag in work["tags"]]
             work["date"] = text.parse_datetime(work["create_date"])
             work["rating"] = ratings.get(work["x_restrict"])
             work["suffix"] = ""
@@ -66,6 +78,7 @@ class PixivExtractor(Extractor):
                 url = ugoira["zip_urls"]["medium"].replace(
                     "_ugoira600x600", "_ugoira1920x1080")
                 work["frames"] = ugoira["frames"]
+                work["_http_adjust_extension"] = False
                 yield Message.Url, url, text.nameext_from_url(url, work)
 
             elif work["page_count"] == 1:
@@ -115,7 +128,8 @@ class PixivUserExtractor(PixivExtractor):
         }),
         # deleted account
         ("http://www.pixiv.net/member_illust.php?id=173531", {
-            "count": 0,
+            "options": (("metadata", True),),
+            "exception": exception.NotFoundError,
         }),
         ("https://www.pixiv.net/en/users/173530"),
         ("https://www.pixiv.net/en/users/173530/manga"),
@@ -138,6 +152,11 @@ class PixivUserExtractor(PixivExtractor):
         self.user_id = u1 or u2 or u3
         self.tag = t1 or t2
 
+    def metadata(self):
+        if self.config("metadata"):
+            return {"user": self.api.user_detail(self.user_id)}
+        return {}
+
     def works(self):
         works = self.api.user_illusts(self.user_id)
diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py
index 971347b..c62a942 100644
--- a/gallery_dl/extractor/reactor.py
+++ b/gallery_dl/extractor/reactor.py
@@ -78,6 +78,8 @@ class ReactorExtractor(Extractor):
 
     def _parse_post(self, post):
         post, _, script = post.partition('<script type="application/ld+json">')
+        if not script:
+            return
         images = text.extract_iter(post, '<div class="image">', '</div>')
         script = script[:script.index("</")].strip()
@@ -210,7 +212,7 @@ class JoyreactorTagExtractor(ReactorTagExtractor):
     pattern = JR_BASE_PATTERN + r"/tag/([^/?#]+)"
     test = (
         ("http://joyreactor.cc/tag/Advent+Cirno", {
-            "count": ">= 17",
+            "count": ">= 15",
         }),
         ("http://joyreactor.com/tag/Cirno", {
             "url": "de1e60c15bfb07a0e9603b00dc3d05f60edc7914",
diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py
index 7ffe5dc..e4075a2 100644
--- a/gallery_dl/extractor/readcomiconline.py
+++ b/gallery_dl/extractor/readcomiconline.py
@@ -1,17 +1,19 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2016-2020 Mike Fährmann
+# Copyright 2016-2021 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extractors for https://readcomiconline.to/"""
+"""Extractors for https://readcomiconline.li/"""
 
 from .common import Extractor, ChapterExtractor, MangaExtractor
 from .. import text, exception
 import re
 
+BASE_PATTERN = r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.(?:li|to)"
+
 
 class ReadcomiconlineBase():
     """Base class for readcomiconline extractors"""
@@ -19,7 +21,7 @@ class ReadcomiconlineBase():
     directory_fmt = ("{category}", "{comic}", "{issue:>03}")
     filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}"
     archive_fmt = "{issue_id}_{page}"
-    root = "https://readcomiconline.to"
+    root = "https://readcomiconline.li"
 
     def request(self, url, **kwargs):
         """Detect and handle redirects to CAPTCHA pages"""
@@ -42,11 +44,10 @@ class ReadcomiconlineBase():
 
 
 class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
-    """Extractor for comic-issues from readcomiconline.to"""
+    """Extractor for comic-issues from readcomiconline.li"""
     subcategory = "issue"
-    pattern = (r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to"
-               r"(/Comic/[^/?#]+/[^/?#]+\?id=(\d+))")
-    test = ("https://readcomiconline.to/Comic/W-i-t-c-h/Issue-130?id=22289", {
+    pattern = BASE_PATTERN + r"(/Comic/[^/?#]+/[^/?#]+\?id=(\d+))"
+    test = ("https://readcomiconline.li/Comic/W-i-t-c-h/Issue-130?id=22289", {
         "url": "30d29c5afc65043bfd384c010257ec2d0ecbafa6",
         "keyword": "2d9ec81ce1b11fac06ebf96ce33cdbfca0e85eb5",
     })
@@ -78,18 +79,17 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
 
 
 class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor):
-    """Extractor for comics from readcomiconline.to"""
+    """Extractor for comics from readcomiconline.li"""
     chapterclass = ReadcomiconlineIssueExtractor
     subcategory = "comic"
-    pattern = (r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to"
-               r"(/Comic/[^/?#]+/?)$")
+    pattern = BASE_PATTERN + r"(/Comic/[^/?#]+/?)$"
     test = (
-        ("https://readcomiconline.to/Comic/W-i-t-c-h", {
-            "url": "e231bc2a293edb465133c37a8e36a7e7d94cab14",
+        ("https://readcomiconline.li/Comic/W-i-t-c-h", {
+            "url": "74eb8b9504b4084fcc9367b341300b2c52260918",
             "keyword": "3986248e4458fa44a201ec073c3684917f48ee0c",
         }),
         ("https://readcomiconline.to/Comic/Bazooka-Jules", {
-            "url": "711674cb78ed10bd2557315f7a67552d01b33985",
+            "url": "2f66a467a772df4d4592e97a059ddbc3e8991799",
             "keyword": "f5ba5246cd787bb750924d9690cb1549199bd516",
         }),
     )
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index 5579017..9808cb8 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -10,7 +10,7 @@
 
 from .booru import BooruExtractor
 from .common import Message
-from .. import text, exception
+from .. import text, util, exception
 from ..cache import cache
 import collections
@@ -206,7 +206,7 @@ class SankakuAPI():
         self.username, self.password = self.extractor._get_auth_info()
         if not self.username:
-            self.authenticate = lambda: None
+            self.authenticate = util.noop
 
     def pools(self, pool_id):
         params = {"lang": "en"}
@@ -250,7 +250,8 @@ class SankakuAPI():
                 success = True
             if not success:
                 code = data.get("code")
-                if code and code.endswith(("invalid-token", "invalid_token")):
+                if code and code.endswith(
+                        ("unauthorized", "invalid-token", "invalid_token")):
                     _authenticate_impl.invalidate(self.username)
                     continue
                 raise exception.StopExtraction(code)
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index c323fe0..afeebb0 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -32,6 +32,7 @@ class TwitterExtractor(Extractor):
     def __init__(self, match):
         Extractor.__init__(self, match)
         self.user = match.group(1)
+        self.textonly = self.config("text-tweets", False)
         self.retweets = self.config("retweets", True)
         self.replies = self.config("replies", True)
         self.twitpic = self.config("twitpic", False)
@@ -64,7 +65,7 @@ class TwitterExtractor(Extractor):
                 self._extract_card(tweet, files)
             if self.twitpic:
                 self._extract_twitpic(tweet, files)
-            if not files:
+            if not files and not self.textonly:
                 continue
 
             tdata = self._transform_tweet(tweet)
@@ -168,7 +169,6 @@ class TwitterExtractor(Extractor):
                 tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
             "user" : self._transform_user(tweet["user"]),
            "lang" : tweet["lang"],
-            "content" : tweet["full_text"],
             "favorite_count": tweet["favorite_count"],
             "quote_count" : tweet["quote_count"],
             "reply_count" : tweet["reply_count"],
@@ -187,6 +187,14 @@ class TwitterExtractor(Extractor):
                 "nick": u["name"],
             } for u in mentions]
 
+        content = tweet["full_text"]
+        urls = entities.get("urls")
+        if urls:
+            for url in urls:
+                content = content.replace(url["url"], url["expanded_url"])
+        txt, _, tco = content.rpartition(" ")
+        tdata["content"] = txt if tco.startswith("https://t.co/") else content
+
         if "in_reply_to_screen_name" in tweet:
             tdata["reply_to"] = tweet["in_reply_to_screen_name"]
 
@@ -489,6 +497,10 @@ class TwitterTweetExtractor(TwitterExtractor):
             "options": (("conversations", True),),
             "count": ">= 50",
         }),
+        # retweet with missing media entities (#1555)
+        ("https://twitter.com/morino_ya/status/1392763691599237121", {
+            "count": 4,
+        }),
     )
 
     def __init__(self, match):
@@ -802,6 +814,10 @@ class TwitterAPI():
                     tweet = retweet
                 elif retweet:
                     tweet["author"] = users[retweet["user_id_str"]]
+                    if "extended_entities" in retweet and \
+                            "extended_entities" not in tweet:
+                        tweet["extended_entities"] = \
+                            retweet["extended_entities"]
                 tweet["user"] = users[tweet["user_id_str"]]
                 yield tweet
diff --git a/gallery_dl/extractor/unsplash.py b/gallery_dl/extractor/unsplash.py
index d13ce0f..e89a5b7 100644
--- a/gallery_dl/extractor/unsplash.py
+++ b/gallery_dl/extractor/unsplash.py
@@ -69,7 +69,8 @@ class UnsplashImageExtractor(UnsplashExtractor):
     subcategory = "image"
     pattern = BASE_PATTERN + r"/photos/([^/?#]+)"
     test = ("https://unsplash.com/photos/lsoogGC_5dg", {
-        "url": "b99a5829ca955b768a206aa9afc391bd3f3dd55e",
+        "pattern": r"https://images\.unsplash\.com/photo-1586348943529-"
+                   r"beaae6c28db9\?ixid=\w+&ixlib=rb-1.2.1",
         "keyword": {
             "alt_description": "re:silhouette of trees near body of water ",
             "blur_hash": "LZP4uQS4jboe%#o0WCa}2doJNaaz",
@@ -114,7 +115,7 @@ class UnsplashImageExtractor(UnsplashExtractor):
                 "id": "uMJXuywXLiU",
                 "instagram_username": "just_midwest_rock",
                 "last_name": "Hoefler",
-                "location": "Madison, WI",
+                "location": None,
                 "name": "Dave Hoefler",
                 "portfolio_url": str,
                 "total_collections": int,
diff --git a/gallery_dl/extractor/weasyl.py b/gallery_dl/extractor/weasyl.py
index f8da191..711d3fa 100644
--- a/gallery_dl/extractor/weasyl.py
+++ b/gallery_dl/extractor/weasyl.py
@@ -77,7 +77,7 @@ class WeasylSubmissionExtractor(WeasylExtractor):
         "keyword": {
             "comments" : int,
             "date" : "dt:2012-04-20 00:38:04",
-            "description" : "<p>(flex)</p>",
+            "description" : "<p>(flex)</p>\n",
             "favorites" : int,
             "folder_name" : "Wesley Stuff",
             "folderid" : 2081,
@@ -160,8 +160,8 @@ class WeasylJournalExtractor(WeasylExtractor):
         "keyword": {
             "title" : "BBCode",
             "date" : "dt:2013-09-19 23:11:23",
-            "content": "<p><a>javascript:alert(42);</a></p>"
-                       "<p>No more of that!</p>",
+            "content": "<p><a>javascript:alert(42);</a></p>\n\n"
+                       "<p>No more of that!</p>\n",
         },
     })
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index a325f87..0b6a153 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2019-2020 Mike Fährmann
+# Copyright 2019-2021 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -27,16 +27,21 @@ class WeiboExtractor(Extractor):
         self.videos = self.config("videos", True)
 
     def items(self):
-        yield Message.Version, 1
+        original_retweets = (self.retweets == "original")
 
         for status in self.statuses():
-            files = self._files_from_status(status)
 
             if self.retweets and "retweeted_status" in status:
-                files = itertools.chain(
-                    files,
-                    self._files_from_status(status["retweeted_status"]),
-                )
+                if original_retweets:
+                    status = status["retweeted_status"]
+                    files = self._files_from_status(status)
+                else:
+                    files = itertools.chain(
+                        self._files_from_status(status),
+                        self._files_from_status(status["retweeted_status"]),
+                    )
+            else:
+                files = self._files_from_status(status)
 
             for num, file in enumerate(files, 1):
                 if num == 1:
@@ -143,6 +148,11 @@ class WeiboStatusExtractor(WeiboExtractor):
         }),
         # non-numeric status ID (#664)
         ("https://weibo.com/3314883543/Iy7fj4qVg"),
+        # original retweets (#1542)
+        ("https://m.weibo.cn/detail/4600272267522211", {
+            "options": (("retweets", "original"),),
+            "keyword": {"status": {"id": "4600167083287033"}},
+        }),
         ("https://m.weibo.cn/status/4339748116375525"),
         ("https://m.weibo.cn/5746766133/4339748116375525"),
     )
diff --git a/gallery_dl/extractor/wikiart.py b/gallery_dl/extractor/wikiart.py
index 7fd60b1..511a609 100644
--- a/gallery_dl/extractor/wikiart.py
+++ b/gallery_dl/extractor/wikiart.py
@@ -71,8 +71,8 @@ class WikiartArtistExtractor(WikiartExtractor):
     directory_fmt = ("{category}", "{artist[artistName]}")
     pattern = BASE_PATTERN + r"/(?!\w+-by-)([\w-]+)/?$"
     test = ("https://www.wikiart.org/en/thomas-cole", {
-        "url": "5ba2fbe6783fcce34e65014d16e5fbc581490c98",
-        "keyword": "eb5b141cf33e6d279afd1518aae24e61cc0adf81",
+        "url": "5140343730331786117fa5f4c013a6153393e28e",
+        "keyword": "4d9cbc50ebddfcb186f31ff70b08833578dd0070",
     })
 
     def __init__(self, match):
@@ -97,8 +97,8 @@ class WikiartImageExtractor(WikiartArtistExtractor):
     pattern = BASE_PATTERN + r"/(?!(?:paintings|artists)-by-)([\w-]+)/([\w-]+)"
     test = (
         ("https://www.wikiart.org/en/thomas-cole/the-departure-1838", {
-            "url": "4d9fd87680a2620eaeaf1f13e3273475dec93231",
-            "keyword": "a1b083d500ce2fd364128e35b026e4ca526000cc",
+            "url": "976cc2545f308a650b5dbb35c29d3cee0f4673b3",
+            "keyword": "8e80cdcb01c1fedb934633d1c4c3ab0419cfbedf",
         }),
        # no year or '-' in slug
        ("https://www.wikiart.org/en/huang-shen/summer", {
```
