From fc004701f923bb954a22c7fec2ae8d607e78cb2b Mon Sep 17 00:00:00 2001
From: Unit 193
Date: Fri, 25 Oct 2024 17:27:30 -0400
Subject: New upstream version 1.27.7.

---
 gallery_dl/extractor/8chan.py        |  53 ++++----
 gallery_dl/extractor/__init__.py     |   1 +
 gallery_dl/extractor/behance.py      |   1 +
 gallery_dl/extractor/bluesky.py      | 101 +++++++++-------
 gallery_dl/extractor/bunkr.py        | 143 ++++++++++++++++------
 gallery_dl/extractor/civitai.py      | 115 +++++++++++++-----
 gallery_dl/extractor/cohost.py       |   2 +-
 gallery_dl/extractor/common.py       |  39 +++---
 gallery_dl/extractor/deviantart.py   |   4 +-
 gallery_dl/extractor/exhentai.py     |   6 +-
 gallery_dl/extractor/foolfuuka.py    |   2 +-
 gallery_dl/extractor/lensdump.py     | 109 ++++++++---------
 gallery_dl/extractor/lolisafe.py     |  10 +-
 gallery_dl/extractor/mangadex.py     |  22 ++++
 gallery_dl/extractor/mangakakalot.py |   6 +-
 gallery_dl/extractor/newgrounds.py   |  60 ++++-----
 gallery_dl/extractor/nozomi.py       |   3 +-
 gallery_dl/extractor/patreon.py      |   7 +-
 gallery_dl/extractor/pinterest.py    | 171 ++++++++++++++++++--------
 gallery_dl/extractor/pixiv.py        |  77 ++++++++++--
 gallery_dl/extractor/postmill.py     |   2 +-
 gallery_dl/extractor/reddit.py       |   8 +-
 gallery_dl/extractor/scrolller.py    | 227 +++++++++++++++++++++++++++++++++++
 gallery_dl/extractor/telegraph.py    |   2 +-
 gallery_dl/extractor/tsumino.py      |   6 +-
 gallery_dl/extractor/urlgalleries.py |  30 +++--
 gallery_dl/extractor/vk.py           |   9 +-
 gallery_dl/extractor/wikimedia.py    |   5 +-
 28 files changed, 887 insertions(+), 334 deletions(-)
 create mode 100644 gallery_dl/extractor/scrolller.py
(limited to 'gallery_dl/extractor')

diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py
index f81d2a1..ce1c52a 100644
--- a/gallery_dl/extractor/8chan.py
+++ b/gallery_dl/extractor/8chan.py
@@ -27,12 +27,22 @@ class _8chanExtractor(Extractor):
         Extractor.__init__(self, match)
 
     def _init(self):
-        now = util.datetime_utcnow()
-        domain = self.root.rpartition("/")[2]
-        self.cookies.set(
-            now.strftime("TOS%Y%m%d"), "1", domain=domain)
-        self.cookies.set(
-            (now - timedelta(1)).strftime("TOS%Y%m%d"), "1", domain=domain)
+        tos = self.cookies_tos_name()
+        self.cookies.set(tos, "1", domain=self.root[8:])
+
+    @memcache()
+    def cookies_tos_name(self):
+        url = self.root + "/.static/pages/confirmed.html"
+        headers = {"Referer": self.root + "/.static/pages/disclaimer.html"}
+        response = self.request(url, headers=headers, allow_redirects=False)
+
+        for cookie in response.cookies:
+            if cookie.name.lower().startswith("tos"):
+                self.log.debug("TOS cookie name: %s", cookie.name)
+                return cookie.name
+
+        self.log.error("Unable to determine TOS cookie name")
+        return "TOS20241009"
 
     @memcache()
     def cookies_prepare(self):
@@ -64,16 +74,14 @@ class _8chanThreadExtractor(_8chanExtractor):
                      "{threadId} {subject[:50]}")
     filename_fmt = "{postId}{num:?-//} {filename[:200]}.{extension}"
     archive_fmt = "{boardUri}_{postId}_{num}"
-    pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)"
+    pattern = BASE_PATTERN + r"/([^/?#]+)/(?:res|last)/(\d+)"
     example = "https://8chan.moe/a/res/12345.html"
 
-    def __init__(self, match):
-        _8chanExtractor.__init__(self, match)
-        _, self.board, self.thread = match.groups()
-
     def items(self):
+        _, board, thread = self.groups
+
         # fetch thread data
-        url = "{}/{}/res/{}.".format(self.root, self.board, self.thread)
+        url = "{}/{}/res/{}.".format(self.root, board, thread)
         self.session.headers["Referer"] = url + "html"
         thread = self.request(url + "json").json()
         thread["postId"] = thread["threadId"]
@@ -106,25 +114,22 @@ class 
_8chanBoardExtractor(_8chanExtractor): pattern = BASE_PATTERN + r"/([^/?#]+)/(?:(\d+)\.html)?$" example = "https://8chan.moe/a/" - def __init__(self, match): - _8chanExtractor.__init__(self, match) - _, self.board, self.page = match.groups() - def items(self): - page = text.parse_int(self.page, 1) - url = "{}/{}/{}.json".format(self.root, self.board, page) - board = self.request(url).json() - threads = board["threads"] + _, board, pnum = self.groups + pnum = text.parse_int(pnum, 1) + url = "{}/{}/{}.json".format(self.root, board, pnum) + data = self.request(url).json() + threads = data["threads"] while True: for thread in threads: thread["_extractor"] = _8chanThreadExtractor url = "{}/{}/res/{}.html".format( - self.root, self.board, thread["threadId"]) + self.root, board, thread["threadId"]) yield Message.Queue, url, thread - page += 1 - if page > board["pageCount"]: + pnum += 1 + if pnum > data["pageCount"]: return - url = "{}/{}/{}.json".format(self.root, self.board, page) + url = "{}/{}/{}.json".format(self.root, board, pnum) threads = self.request(url).json()["threads"] diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 9885195..4e9fa50 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -141,6 +141,7 @@ modules = [ "rule34us", "sankaku", "sankakucomplex", + "scrolller", "seiga", "senmanga", "sexcom", diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py index 72f9195..14598b7 100644 --- a/gallery_dl/extractor/behance.py +++ b/gallery_dl/extractor/behance.py @@ -171,6 +171,7 @@ class BehanceGalleryExtractor(BehanceExtractor): url = text.extr(page, '").partition(">")[2]) - count, _, size = info[1].split(None, 2) + title, size = text.split_html(text.extr( + page, "").partition(">")[2]) - pos = page.index('class="grid-images') - urls = list(text.extract_iter(page, '", "")) + return self._extract_files(items), { "album_id" : self.album_id, - "album_name" : text.unescape(info[0]), - "album_size" : size[1:-1], - "count" : len(urls), - "_http_validate": self._validate, + "album_name" : title, + "album_size" : text.extr(size, "(", ")"), + "count" : len(items), } - def _extract_files(self, urls): - for url in urls: + def _extract_files(self, items): + for item in items: try: - url = self._extract_file(text.unescape(url)) + url = text.extr(item, ' href="', '"') + file = self._extract_file(text.unescape(url)) + + info = text.split_html(item) + file["name"] = info[0] + file["size"] = info[2] + file["date"] = text.parse_datetime( + info[-1], "%H:%M:%S %d/%m/%Y") + + yield file + except exception.StopExtraction: + raise except Exception as exc: self.log.error("%s: %s", exc.__class__.__name__, exc) - continue - yield {"file": text.unescape(url)} - - def _extract_file(self, url): - page = self.request(url).text - url = (text.extr(page, 'Torrent Download (', ')'), } - if data["uploader"].startswith("<"): - data["uploader"] = text.unescape(text.extr( - data["uploader"], ">", "<")) + uploader = data["uploader"] + if uploader and uploader[0] == "<": + data["uploader"] = text.unescape(text.extr(uploader, ">", "<")) f = data["favorites"][0] if f == "N": diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 85dd896..44c4542 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -37,7 +37,7 @@ class FoolfuukaExtractor(BaseExtractor): if not url and "remote_media_link" in media: url = self.remote(media) - if url.startswith("/"): + if url and 
url[0] == "/": url = self.root + url post["filename"], _, post["extension"] = \ diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py index 12e8860..72a6453 100644 --- a/gallery_dl/extractor/lensdump.py +++ b/gallery_dl/extractor/lensdump.py @@ -17,42 +17,30 @@ class LensdumpBase(): category = "lensdump" root = "https://lensdump.com" - def nodes(self, page=None): - if page is None: - page = self.request(self.url).text - - # go through all pages starting from the oldest - page_url = text.urljoin(self.root, text.extr( - text.extr(page, ' id="list-most-oldest-link"', '>'), - 'href="', '"')) - while page_url is not None: - if page_url == self.url: - current_page = page - else: - current_page = self.request(page_url).text - - for node in text.extract_iter( - current_page, ' class="list-item ', '>'): - yield node - - # find url of next page - page_url = text.extr( - text.extr(current_page, ' data-pagination="next"', '>'), - 'href="', '"') - if page_url is not None and len(page_url) > 0: - page_url = text.urljoin(self.root, page_url) - else: - page_url = None + def _pagination(self, page, begin, end): + while True: + yield from text.extract_iter(page, begin, end) + + next = text.extr(page, ' data-pagination="next"', '>') + if not next: + return + + url = text.urljoin(self.root, text.extr(next, 'href="', '"')) + page = self.request(url).text class LensdumpAlbumExtractor(LensdumpBase, GalleryExtractor): subcategory = "album" - pattern = BASE_PATTERN + r"/(?:((?!\w+/albums|a/|i/)\w+)|a/(\w+))" + pattern = BASE_PATTERN + r"/a/(\w+)(?:/?\?([^#]+))?" example = "https://lensdump.com/a/ID" def __init__(self, match): - GalleryExtractor.__init__(self, match, match.string) - self.gallery_id = match.group(1) or match.group(2) + self.gallery_id, query = match.groups() + if query: + url = "{}/a/{}/?{}".format(self.root, self.gallery_id, query) + else: + url = "{}/a/{}".format(self.root, self.gallery_id) + GalleryExtractor.__init__(self, match, url) def metadata(self, page): return { @@ -62,40 +50,48 @@ class LensdumpAlbumExtractor(LensdumpBase, GalleryExtractor): } def images(self, page): - for node in self.nodes(page): - # get urls and filenames of images in current page - json_data = util.json_loads(text.unquote( - text.extr(node, "data-object='", "'") or - text.extr(node, 'data-object="', '"'))) - image_id = json_data.get('name') - image_url = json_data.get('url') - image_title = json_data.get('title') + for image in self._pagination(page, ' class="list-item ', '>'): + + data = util.json_loads(text.unquote( + text.extr(image, "data-object='", "'") or + text.extr(image, 'data-object="', '"'))) + image_id = data.get("name") + image_url = data.get("url") + image_title = data.get("title") if image_title is not None: image_title = text.unescape(image_title) + yield (image_url, { - 'id': image_id, - 'url': image_url, - 'title': image_title, - 'name': json_data.get('filename'), - 'filename': image_id, - 'extension': json_data.get('extension'), - 'height': text.parse_int(json_data.get('height')), - 'width': text.parse_int(json_data.get('width')), + "id" : image_id, + "url" : image_url, + "title" : image_title, + "name" : data.get("filename"), + "filename" : image_id, + "extension": data.get("extension"), + "width" : text.parse_int(data.get("width")), + "height" : text.parse_int(data.get("height")), }) class LensdumpAlbumsExtractor(LensdumpBase, Extractor): """Extractor for album list from lensdump.com""" subcategory = "albums" - pattern = BASE_PATTERN + r"/\w+/albums" - example = 
"https://lensdump.com/USER/albums" + pattern = BASE_PATTERN + r"/(?![ai]/)([^/?#]+)(?:/?\?([^#]+))?" + example = "https://lensdump.com/USER" def items(self): - for node in self.nodes(): - album_url = text.urljoin(self.root, text.extr( - node, 'data-url-short="', '"')) - yield Message.Queue, album_url, { - "_extractor": LensdumpAlbumExtractor} + user, query = self.groups + url = "{}/{}/".format(self.root, user) + if query: + params = text.parse_query(query) + else: + params = {"sort": "date_asc", "page": "1"} + page = self.request(url, params=params).text + + data = {"_extractor": LensdumpAlbumExtractor} + for album_path in self._pagination(page, 'data-url-short="', '"'): + album_url = text.urljoin(self.root, album_path) + yield Message.Queue, album_url, data class LensdumpImageExtractor(LensdumpBase, Extractor): @@ -107,16 +103,13 @@ class LensdumpImageExtractor(LensdumpBase, Extractor): pattern = r"(?:https?://)?(?:(?:i\d?\.)?lensdump\.com|\w\.l3n\.co)/i/(\w+)" example = "https://lensdump.com/i/ID" - def __init__(self, match): - Extractor.__init__(self, match) - self.key = match.group(1) - def items(self): - url = "{}/i/{}".format(self.root, self.key) + key = self.groups[0] + url = "{}/i/{}".format(self.root, key) extr = text.extract_from(self.request(url).text) data = { - "id" : self.key, + "id" : key, "title" : text.unescape(extr( 'property="og:title" content="', '"')), "url" : extr( diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py index 6fc0689..044f4f5 100644 --- a/gallery_dl/extractor/lolisafe.py +++ b/gallery_dl/extractor/lolisafe.py @@ -47,7 +47,15 @@ class LolisafeAlbumExtractor(LolisafeExtractor): url = file["file"] file.update(data) text.nameext_from_url(url, file) - file["name"], sep, file["id"] = file["filename"].rpartition("-") + + if "name" in file: + name = file["name"] + file["name"] = name.rpartition(".")[0] or name + file["id"] = file["filename"].rpartition("-")[2] + else: + file["name"], sep, file["id"] = \ + file["filename"].rpartition("-") + yield Message.Url, url, file def fetch_album(self, album_id): diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index bca7e4d..1f24593 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -174,6 +174,20 @@ class MangadexListExtractor(MangadexExtractor): yield Message.Queue, url, data +class MangadexAuthorExtractor(MangadexExtractor): + """Extractor for mangadex authors""" + subcategory = "author" + pattern = BASE_PATTERN + r"/author/([0-9a-f-]+)" + example = ("https://mangadex.org/author" + "/01234567-89ab-cdef-0123-456789abcdef/NAME") + + def items(self): + for manga in self.api.manga_author(self.uuid): + manga["_extractor"] = MangadexMangaExtractor + url = "{}/title/{}".format(self.root, manga["id"]) + yield Message.Queue, url, manga + + class MangadexAPI(): """Interface for the MangaDex API v5 @@ -195,6 +209,10 @@ class MangadexAPI(): def athome_server(self, uuid): return self._call("/at-home/server/" + uuid) + def author(self, uuid, manga=False): + params = {"includes[]": ("manga",)} if manga else None + return self._call("/author/" + uuid, params)["data"] + def chapter(self, uuid): params = {"includes[]": ("scanlation_group",)} return self._call("/chapter/" + uuid, params)["data"] @@ -210,6 +228,10 @@ class MangadexAPI(): params = {"includes[]": ("artist", "author")} return self._call("/manga/" + uuid, params)["data"] + def manga_author(self, uuid_author): + params = {"authorOrArtist": uuid_author} + return 
self._pagination("/manga", params) + def manga_feed(self, uuid): order = "desc" if self.extractor.config("chapter-reverse") else "asc" params = { diff --git a/gallery_dl/extractor/mangakakalot.py b/gallery_dl/extractor/mangakakalot.py index 0183b25..9fc8681 100644 --- a/gallery_dl/extractor/mangakakalot.py +++ b/gallery_dl/extractor/mangakakalot.py @@ -19,7 +19,7 @@ BASE_PATTERN = r"(?:https?://)?(?:ww[\dw]?\.)?mangakakalot\.tv" class MangakakalotBase(): """Base class for mangakakalot extractors""" category = "mangakakalot" - root = "https://ww6.mangakakalot.tv" + root = "https://ww8.mangakakalot.tv" class MangakakalotChapterExtractor(MangakakalotBase, ChapterExtractor): @@ -40,7 +40,7 @@ class MangakakalotChapterExtractor(MangakakalotBase, ChapterExtractor): match = re.match( r"(?:[Vv]ol\. *(\d+) )?" r"[Cc]hapter *([^:]*)" - r"(?:: *(.+))?", info) + r"(?:: *(.+))?", info or "") volume, chapter, title = match.groups() if match else ("", "", info) chapter, sep, minor = chapter.partition(".") @@ -86,7 +86,7 @@ class MangakakalotMangaExtractor(MangakakalotBase, MangaExtractor): data["chapter"] = text.parse_int(chapter) data["chapter_minor"] = sep + minor - if url.startswith("/"): + if url[0] == "/": url = self.root + url results.append((url, data.copy())) return results diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 2928573..61ffdee 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -14,6 +14,9 @@ from ..cache import cache import itertools import re +BASE_PATTERN = r"(?:https?://)?(?:www\.)?newgrounds\.com" +USER_PATTERN = r"(?:https?://)?([\w-]+)\.newgrounds\.com" + class NewgroundsExtractor(Extractor): """Base class for newgrounds extractors""" @@ -93,7 +96,7 @@ class NewgroundsExtractor(Extractor): def posts(self): """Return URLs of all relevant post pages""" - return self._pagination(self._path) + return self._pagination(self._path, self.groups[1]) def metadata(self): """Return general metadata""" @@ -334,10 +337,10 @@ class NewgroundsExtractor(Extractor): for fmt in formats: yield fmt[1][0]["src"] - def _pagination(self, kind): + def _pagination(self, kind, pnum=1): url = "{}/{}".format(self.user_root, kind) params = { - "page": 1, + "page": text.parse_int(pnum, 1), "isAjaxRequest": "1", } headers = { @@ -400,8 +403,7 @@ class NewgroundsImageExtractor(NewgroundsExtractor): class NewgroundsMediaExtractor(NewgroundsExtractor): """Extractor for a media file from newgrounds.com""" subcategory = "media" - pattern = (r"(?:https?://)?(?:www\.)?newgrounds\.com" - r"(/(?:portal/view|audio/listen)/\d+)") + pattern = BASE_PATTERN + r"(/(?:portal/view|audio/listen)/\d+)" example = "https://www.newgrounds.com/portal/view/12345" def __init__(self, match): @@ -416,35 +418,35 @@ class NewgroundsMediaExtractor(NewgroundsExtractor): class NewgroundsArtExtractor(NewgroundsExtractor): """Extractor for all images of a newgrounds user""" subcategory = _path = "art" - pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/art/?$" + pattern = USER_PATTERN + r"/art(?:(?:/page/|/?\?page=)(\d+))?/?$" example = "https://USER.newgrounds.com/art" class NewgroundsAudioExtractor(NewgroundsExtractor): """Extractor for all audio submissions of a newgrounds user""" subcategory = _path = "audio" - pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/audio/?$" + pattern = USER_PATTERN + r"/audio(?:(?:/page/|/?\?page=)(\d+))?/?$" example = "https://USER.newgrounds.com/audio" class NewgroundsMoviesExtractor(NewgroundsExtractor): """Extractor for 
all movies of a newgrounds user"""
     subcategory = _path = "movies"
-    pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/movies/?$"
+    pattern = USER_PATTERN + r"/movies(?:(?:/page/|/?\?page=)(\d+))?/?$"
     example = "https://USER.newgrounds.com/movies"
 
 
 class NewgroundsGamesExtractor(NewgroundsExtractor):
     """Extractor for a newgrounds user's games"""
     subcategory = _path = "games"
-    pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/games/?$"
+    pattern = USER_PATTERN + r"/games(?:(?:/page/|/?\?page=)(\d+))?/?$"
     example = "https://USER.newgrounds.com/games"
 
 
 class NewgroundsUserExtractor(NewgroundsExtractor):
     """Extractor for a newgrounds user profile"""
     subcategory = "user"
-    pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/?$"
+    pattern = USER_PATTERN + r"/?$"
     example = "https://USER.newgrounds.com"
 
     def initialize(self):
@@ -464,25 +466,22 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor):
     """Extractor for posts favorited by a newgrounds user"""
     subcategory = "favorite"
     directory_fmt = ("{category}", "{user}", "Favorites")
-    pattern = (r"(?:https?://)?([\w-]+)\.newgrounds\.com"
-               r"/favorites(?!/following)(?:/(art|audio|movies))?/?")
+    pattern = (USER_PATTERN + r"/favorites(?!/following)(?:/(art|audio|movies)"
+               r"(?:(?:/page/|/?\?page=)(\d+))?)?")
     example = "https://USER.newgrounds.com/favorites"
 
-    def __init__(self, match):
-        NewgroundsExtractor.__init__(self, match)
-        self.kind = match.group(2)
-
     def posts(self):
-        if self.kind:
-            return self._pagination(self.kind)
+        _, kind, pnum = self.groups
+        if kind:
+            return self._pagination_favorites(kind, pnum)
         return itertools.chain.from_iterable(
-            self._pagination(k) for k in ("art", "audio", "movies")
+            self._pagination_favorites(k) for k in ("art", "audio", "movies")
         )
 
-    def _pagination(self, kind):
+    def _pagination_favorites(self, kind, pnum=1):
         url = "{}/favorites/{}".format(self.user_root, kind)
         params = {
-            "page": 1,
+            "page": text.parse_int(pnum, 1),
             "isAjaxRequest": "1",
         }
         headers = {
@@ -514,12 +513,13 @@ class NewgroundsFollowingExtractor(NewgroundsFavoriteExtractor):
     """Extractor for a newgrounds user's favorited users"""
     subcategory = "following"
-    pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/favorites/(following)"
+    pattern = USER_PATTERN + r"/favorites/(following)"
     example = "https://USER.newgrounds.com/favorites/following"
 
     def items(self):
+        _, kind, pnum = self.groups
         data = {"_extractor": NewgroundsUserExtractor}
-        for url in self._pagination(self.kind):
+        for url in self._pagination_favorites(kind, pnum):
             yield Message.Queue, url, data
 
     @staticmethod
@@ -534,13 +534,12 @@ class NewgroundsSearchExtractor(NewgroundsExtractor):
     """Extractor for newgrounds.com search results"""
     subcategory = "search"
     directory_fmt = ("{category}", "search", "{search_tags}")
-    pattern = (r"(?:https?://)?(?:www\.)?newgrounds\.com"
-               r"/search/conduct/([^/?#]+)/?\?([^#]+)")
+    pattern = BASE_PATTERN + r"/search/conduct/([^/?#]+)/?\?([^#]+)"
     example = "https://www.newgrounds.com/search/conduct/art?terms=QUERY"
 
     def __init__(self, match):
         NewgroundsExtractor.__init__(self, match)
-        self._path, query = match.groups()
+        self._path, query = self.groups
         self.query = text.parse_query(query)
 
     def posts(self):
@@ -550,19 +549,20 @@ class NewgroundsSearchExtractor(NewgroundsExtractor):
                 for s in suitabilities.split(",")}
             self.request(self.root + "/suitabilities",
                          method="POST", data=data)
-        return self._pagination("/search/conduct/" + self._path, self.query)
+        return self._pagination_search( 
+ "/search/conduct/" + self._path, self.query) def metadata(self): return {"search_tags": self.query.get("terms", "")} - def _pagination(self, path, params): + def _pagination_search(self, path, params): url = self.root + path + params["inner"] = "1" + params["page"] = text.parse_int(params.get("page"), 1) headers = { "Accept": "application/json, text/javascript, */*; q=0.01", "X-Requested-With": "XMLHttpRequest", } - params["inner"] = "1" - params["page"] = 1 while True: data = self.request(url, params=params, headers=headers).json() diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py index 8c7ffe5..851f663 100644 --- a/gallery_dl/extractor/nozomi.py +++ b/gallery_dl/extractor/nozomi.py @@ -63,7 +63,8 @@ class NozomiExtractor(Extractor): yield Message.Directory, post for post["num"], image in enumerate(images, 1): post["filename"] = post["dataid"] = did = image["dataid"] - post["is_video"] = video = bool(image.get("is_video")) + post["is_video"] = video = \ + True if image.get("is_video") else False ext = image["type"] if video: diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index d47ffa2..0b64ea3 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -56,6 +56,7 @@ class PatreonExtractor(Extractor): text.nameext_from_url(name, post) if text.ext_from_url(url) == "m3u8": url = "ytdl:" + url + post["_ytdl_manifest"] = "hls" post["extension"] = "mp4" yield Message.Url, url, post else: @@ -310,7 +311,7 @@ class PatreonCreatorExtractor(PatreonExtractor): subcategory = "creator" pattern = (r"(?:https?://)?(?:www\.)?patreon\.com" r"/(?!(?:home|join|posts|login|signup)(?:$|[/?#]))" - r"([^/?#]+)(?:/posts)?/?(?:\?([^#]+))?") + r"(?:c/)?([^/?#]+)(?:/posts)?/?(?:\?([^#]+))?") example = "https://www.patreon.com/USER" def posts(self): @@ -340,9 +341,9 @@ class PatreonCreatorExtractor(PatreonExtractor): user_id = query.get("u") if user_id: - url = "{}/user/posts?u={}".format(self.root, user_id) + url = "{}/user?u={}".format(self.root, user_id) else: - url = "{}/{}/posts".format(self.root, creator) + url = "{}/{}".format(self.root, creator) page = self.request(url, notfound="creator").text try: diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index 8c04ed5..499c579 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -18,8 +18,8 @@ BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.[\w.]+" class PinterestExtractor(Extractor): """Base class for pinterest extractors""" category = "pinterest" - filename_fmt = "{category}_{id}{media_id:?_//}.{extension}" - archive_fmt = "{id}{media_id}" + filename_fmt = "{category}_{id}{media_id|page_id:?_//}.{extension}" + archive_fmt = "{id}{media_id|page_id}" root = "https://www.pinterest.com" def _init(self): @@ -30,12 +30,12 @@ class PinterestExtractor(Extractor): self.root = text.ensure_http_scheme(domain) self.api = PinterestAPI(self) + self.stories = self.config("stories", True) + self.videos = self.config("videos", True) def items(self): data = self.metadata() - videos = self.config("videos", True) - yield Message.Directory, data for pin in self.pins(): if isinstance(pin, tuple): @@ -43,40 +43,35 @@ class PinterestExtractor(Extractor): yield Message.Queue, url, data continue + try: + files = self._extract_files(pin) + except Exception as exc: + self.log.debug("", exc_info=exc) + self.log.warning( + "%s: Error when extracting download URLs (%s: %s)", + pin.get("id"), exc.__class__.__name__, exc) + continue + 
pin.update(data) + pin["count"] = len(files) - carousel_data = pin.get("carousel_data") - if carousel_data: - pin["count"] = len(carousel_data["carousel_slots"]) - for num, slot in enumerate(carousel_data["carousel_slots"], 1): - slot["media_id"] = slot.pop("id") - pin.update(slot) - pin["num"] = num - size, image = next(iter(slot["images"].items())) - url = image["url"].replace("/" + size + "/", "/originals/") - yield Message.Url, url, text.nameext_from_url(url, pin) - - else: - try: - media = self._media_from_pin(pin) - except Exception: - self.log.debug("Unable to fetch download URL for pin %s", - pin.get("id")) - continue + yield Message.Directory, pin + for pin["num"], file in enumerate(files, 1): + url = file["url"] + text.nameext_from_url(url, pin) + pin.update(file) - if videos or media.get("duration") is None: - pin.update(media) - pin["num"] = pin["count"] = 1 + if "media_id" not in file: pin["media_id"] = "" + if "page_id" not in file: + pin["page_id"] = "" - url = media["url"] - text.nameext_from_url(url, pin) + if pin["extension"] == "m3u8": + url = "ytdl:" + url + pin["_ytdl_manifest"] = "hls" + pin["extension"] = "mp4" - if pin["extension"] == "m3u8": - url = "ytdl:" + url - pin["extension"] = "mp4" - - yield Message.Url, url, pin + yield Message.Url, url, pin def metadata(self): """Return general metadata""" @@ -84,26 +79,108 @@ class PinterestExtractor(Extractor): def pins(self): """Return all relevant pin objects""" - @staticmethod - def _media_from_pin(pin): + def _extract_files(self, pin): + story_pin_data = pin.get("story_pin_data") + if story_pin_data and self.stories: + return self._extract_story(pin, story_pin_data) + + carousel_data = pin.get("carousel_data") + if carousel_data: + return self._extract_carousel(pin, carousel_data) + videos = pin.get("videos") - if videos: - video_formats = videos["video_list"] + if videos and self.videos: + return (self._extract_video(videos),) - for fmt in ("V_HLSV4", "V_HLSV3_WEB", "V_HLSV3_MOBILE"): - if fmt in video_formats: - media = video_formats[fmt] - break - else: - media = max(video_formats.values(), - key=lambda x: x.get("width", 0)) + try: + return (pin["images"]["orig"],) + except Exception: + self.log.debug("%s: No files found", pin.get("id")) + return () + + def _extract_story(self, pin, story): + files = [] + story_id = story.get("id") + + for page in story["pages"]: + page_id = page.get("id") + + for block in page["blocks"]: + type = block.get("type") + + if type == "story_pin_image_block": + if 1 == len(page["blocks"]) == len(story["pages"]): + try: + media = pin["images"]["orig"] + except Exception: + media = self._extract_image(page, block) + else: + media = self._extract_image(page, block) + + elif type == "story_pin_video_block": + video = block["video"] + media = self._extract_video(video) + media["media_id"] = video.get("id") or "" + + elif type == "story_pin_paragraph_block": + media = {"url": "text:" + block["text"], + "extension": "txt", + "media_id": block.get("id")} + + else: + self.log.warning("%s: Unsupported story block '%s'", + pin.get("id"), type) + continue - if "V_720P" in video_formats: - media["_fallback"] = (video_formats["V_720P"]["url"],) + media["story_id"] = story_id + media["page_id"] = page_id + files.append(media) + + return files + + def _extract_carousel(self, pin, carousel_data): + files = [] + for slot in carousel_data["carousel_slots"]: + size, image = next(iter(slot["images"].items())) + slot["media_id"] = slot.pop("id") + slot["url"] = image["url"].replace( + "/" + size + "/", 
"/originals/", 1) + files.append(slot) + return files + + def _extract_image(self, page, block): + sig = block.get("image_signature") or page["image_signature"] + url_base = "https://i.pinimg.com/originals/{}/{}/{}/{}.".format( + sig[0:2], sig[2:4], sig[4:6], sig) + url_jpg = url_base + "jpg" + url_png = url_base + "png" + url_webp = url_base + "webp" - return media + try: + media = block["image"]["images"]["originals"] + except Exception: + media = {"url": url_jpg, "_fallback": (url_png, url_webp,)} - return pin["images"]["orig"] + if media["url"] == url_jpg: + media["_fallback"] = (url_png, url_webp,) + else: + media["_fallback"] = (url_jpg, url_png, url_webp,) + media["media_id"] = sig + + return media + + def _extract_video(self, video): + video_formats = video["video_list"] + for fmt in ("V_HLSV4", "V_HLSV3_WEB", "V_HLSV3_MOBILE"): + if fmt in video_formats: + media = video_formats[fmt] + break + else: + media = max(video_formats.values(), + key=lambda x: x.get("width", 0)) + if "V_720P" in video_formats: + media["_fallback"] = (video_formats["V_720P"]["url"],) + return media class PinterestPinExtractor(PinterestExtractor): diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index c2d1243..8c6e6d8 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -38,6 +38,7 @@ class PixivExtractor(Extractor): self.meta_user = self.config("metadata") self.meta_bookmark = self.config("metadata-bookmark") self.meta_comments = self.config("comments") + self.meta_captions = self.config("captions") def items(self): tags = self.config("tags", "japanese") @@ -76,8 +77,8 @@ class PixivExtractor(Extractor): detail = self.api.illust_bookmark_detail(work["id"]) work["tags_bookmark"] = [tag["name"] for tag in detail["tags"] if tag["is_registered"]] - if self.sanity_workaround and not work.get("caption") and \ - not work.get("_mypixiv"): + if self.meta_captions and not work.get("caption") and \ + not work.get("_mypixiv") and not work.get("_ajax"): body = self._request_ajax("/illust/" + str(work["id"])) if body: work["caption"] = text.unescape(body["illustComment"]) @@ -108,10 +109,10 @@ class PixivExtractor(Extractor): if self.load_ugoira: try: return self._extract_ugoira(work) - except exception.StopExtraction as exc: + except Exception as exc: self.log.warning( - "Unable to retrieve Ugoira metatdata (%s - %s)", - work["id"], exc.message) + "%s: Unable to retrieve Ugoira metatdata (%s - %s)", + work["id"], exc.__class__.__name__, exc) elif work["page_count"] == 1: url = meta_single_page["original_image_url"] @@ -186,6 +187,7 @@ class PixivExtractor(Extractor): return None def _extract_ajax(self, work, body): + work["_ajax"] = True url = self._extract_ajax_url(body) if not url: return () @@ -243,12 +245,12 @@ class PixivExtractor(Extractor): original = body["urls"]["original"] if original: return original - except KeyError: + except Exception: pass try: square1200 = body["userIllusts"][body["id"]]["url"] - except KeyError: + except Exception: return parts = square1200.rpartition("_p0")[0].split("/") del parts[3:5] @@ -293,9 +295,6 @@ class PixivExtractor(Extractor): "x_restrict" : 0, } - def _web_to_mobile(self, work): - return work - def works(self): """Return an iterable containing all relevant 'work' objects""" @@ -334,15 +333,17 @@ class PixivUserExtractor(PixivExtractor): class PixivArtworksExtractor(PixivExtractor): """Extractor for artworks of a pixiv user""" subcategory = "artworks" + _warning = True pattern = (BASE_PATTERN + r"/(?:" 
r"(?:en/)?users/(\d+)/(?:artworks|illustrations|manga)" r"(?:/([^/?#]+))?/?(?:$|[?#])" r"|member_illust\.php\?id=(\d+)(?:&([^#]+))?)") example = "https://www.pixiv.net/en/users/12345/artworks" - def __init__(self, match): - PixivExtractor.__init__(self, match) - u1, t1, u2, t2 = match.groups() + def _init(self): + PixivExtractor._init(self) + + u1, t1, u2, t2 = self.groups if t1: t1 = text.unquote(t1) elif t2: @@ -350,6 +351,14 @@ class PixivArtworksExtractor(PixivExtractor): self.user_id = u1 or u2 self.tag = t1 or t2 + if self.sanity_workaround: + self.cookies_domain = d = ".pixiv.net" + self._init_cookies() + if self._warning and not self.cookies.get("PHPSESSID", domain=d): + PixivArtworksExtractor._warning = False + self.log.warning("No 'PHPSESSID' cookie set. Can detect only " + "non R-18 'sanity_level' works.") + def metadata(self): if self.config("metadata"): self.api.user_detail(self.user_id) @@ -358,6 +367,19 @@ class PixivArtworksExtractor(PixivExtractor): def works(self): works = self.api.user_illusts(self.user_id) + if self.sanity_workaround: + body = self._request_ajax( + "/user/{}/profile/all".format(self.user_id)) + try: + ajax_ids = list(map(int, body["illusts"])) + ajax_ids.extend(map(int, body["manga"])) + ajax_ids.sort() + except Exception as exc: + self.log.warning("Unable to collect artwork IDs using AJAX " + "API (%s: %s)", exc.__class__.__name__, exc) + else: + works = self._extend_sanity(works, ajax_ids) + if self.tag: tag = self.tag.lower() works = ( @@ -367,6 +389,35 @@ class PixivArtworksExtractor(PixivExtractor): return works + def _extend_sanity(self, works, ajax_ids): + user = {"id": 1} + index = len(ajax_ids) - 1 + + for work in works: + while index >= 0: + work_id = work["id"] + ajax_id = ajax_ids[index] + + if ajax_id == work_id: + index -= 1 + break + + elif ajax_id > work_id: + index -= 1 + self.log.debug("Inserting work %s", ajax_id) + yield self._make_work(ajax_id, self.sanity_url, user) + + else: # ajax_id < work_id + break + + yield work + + while index >= 0: + ajax_id = ajax_ids[index] + self.log.debug("Inserting work %s", ajax_id) + yield self._make_work(ajax_id, self.sanity_url, user) + index -= 1 + class PixivAvatarExtractor(PixivExtractor): """Extractor for pixiv avatars""" diff --git a/gallery_dl/extractor/postmill.py b/gallery_dl/extractor/postmill.py index 29b351b..8877175 100644 --- a/gallery_dl/extractor/postmill.py +++ b/gallery_dl/extractor/postmill.py @@ -50,7 +50,7 @@ class PostmillExtractor(BaseExtractor): forum = match.group(1) id = int(match.group(2)) - is_text_post = url.startswith("/") + is_text_post = (url[0] == "/") is_image_post = self._search_image_tag(page) is not None data = { "title": title, diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index ce602f6..8577e74 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -31,6 +31,7 @@ class RedditExtractor(Extractor): parentdir = self.config("parent-directory") max_depth = self.config("recursion", 0) previews = self.config("previews", True) + embeds = self.config("embeds", True) videos = self.config("videos", True) if videos: @@ -100,7 +101,7 @@ class RedditExtractor(Extractor): for comment in comments: html = comment["body_html"] or "" href = (' href="' in html) - media = ("media_metadata" in comment) + media = (embeds and "media_metadata" in comment) if media or href: comment["date"] = text.parse_timestamp( @@ -211,8 +212,9 @@ class RedditExtractor(Extractor): def _extract_video_dash(self, submission): 
submission["_ytdl_extra"] = {"title": submission["title"]} try: - return (submission["secure_media"]["reddit_video"]["dash_url"] + - "#__youtubedl_smuggle=%7B%22to_generic%22%3A+1%7D") + url = submission["secure_media"]["reddit_video"]["dash_url"] + submission["_ytdl_manifest"] = "dash" + return url except Exception: return submission["url"] diff --git a/gallery_dl/extractor/scrolller.py b/gallery_dl/extractor/scrolller.py new file mode 100644 index 0000000..9f9f0c4 --- /dev/null +++ b/gallery_dl/extractor/scrolller.py @@ -0,0 +1,227 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://scrolller.com/""" + +from .common import Extractor, Message +from .. import text, util, exception +from ..cache import cache + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?scrolller\.com" + + +class ScrolllerExtractor(Extractor): + """Base class for scrolller extractors""" + category = "scrolller" + root = "https://scrolller.com" + directory_fmt = ("{category}", "{subredditTitle}") + filename_fmt = "{id}{title:? //}.{extension}" + archive_fmt = "{id}" + request_interval = (0.5, 1.5) + + def _init(self): + self.auth_token = None + + def items(self): + self.login() + + for post in self.posts(): + + src = max(post["mediaSources"], key=self._sort_key) + post.update(src) + url = src["url"] + text.nameext_from_url(url, post) + + yield Message.Directory, post + yield Message.Url, url, post + + def posts(self): + return () + + def login(self): + username, password = self._get_auth_info() + if username: + self.auth_token = self._login_impl(username, password) + + @cache(maxage=28*86400, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + + variables = { + "username": username, + "password": password, + } + + try: + data = self._request_graphql("LoginQuery", variables) + except exception.HttpError as exc: + if exc.status == 403: + raise exception.AuthenticationError() + raise + + return data["login"]["token"] + + def _request_graphql(self, opname, variables): + url = "https://api.scrolller.com/api/v2/graphql" + headers = { + "Content-Type" : "text/plain;charset=UTF-8", + "Origin" : self.root, + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Mode": "cors", + "Sec-Fetch-Site": "same-site", + } + data = { + "query" : QUERIES[opname], + "variables" : variables, + "authorization": self.auth_token, + } + return self.request( + url, method="POST", headers=headers, data=util.json_dumps(data), + ).json()["data"] + + def _pagination(self, opname, variables): + while True: + data = self._request_graphql(opname, variables) + + while "items" not in data: + data = data.popitem()[1] + yield from data["items"] + + if not data["iterator"]: + return + variables["iterator"] = data["iterator"] + + def _sort_key(self, src): + return src["width"], not src["isOptimized"] + + +class ScrolllerSubredditExtractor(ScrolllerExtractor): + """Extractor for media from a scrolller subreddit""" + subcategory = "subreddit" + pattern = BASE_PATTERN + r"(/r/[^/?#]+)(?:/?\?([^#]+))?" 
+ example = "https://scrolller.com/r/SUBREDDIT" + + def posts(self): + url, query = self.groups + filter = None + + if query: + params = text.parse_query(query) + if "filter" in params: + filter = params["filter"].upper().rstrip("S") + + variables = { + "url" : url, + "iterator" : None, + "filter" : filter, + "hostsDown": None, + } + return self._pagination("SubredditQuery", variables) + + +class ScrolllerFollowingExtractor(ScrolllerExtractor): + """Extractor for followed scrolller subreddits""" + subcategory = "following" + pattern = BASE_PATTERN + r"/following" + example = "https://scrolller.com/following" + + def items(self): + self.login() + + if not self.auth_token: + raise exception.AuthorizationError("Login required") + + variables = { + "iterator" : None, + "hostsDown": None, + } + + for subreddit in self._pagination("FollowingQuery", variables): + url = self.root + subreddit["url"] + subreddit["_extractor"] = ScrolllerSubredditExtractor + yield Message.Queue, url, subreddit + + +class ScrolllerPostExtractor(ScrolllerExtractor): + """Extractor for media from a single scrolller post""" + subcategory = "post" + pattern = BASE_PATTERN + r"/(?!r/|following$)([^/?#]+)" + example = "https://scrolller.com/title-slug-a1b2c3d4f5" + + def posts(self): + url = "{}/{}".format(self.root, self.groups[0]) + page = self.request(url).text + data = util.json_loads(text.extr( + page, '') + .replace('\\"', '"')) + return (data["item"],) + + +QUERIES = { + + "SubredditQuery": """\ +query SubredditQuery( + $url: String! + $filter: SubredditPostFilter + $iterator: String +) { + getSubreddit( + url: $url + ) { + children( + limit: 50 + iterator: $iterator + filter: $filter + disabledHosts: null + ) { + iterator items { + __typename id url title subredditId subredditTitle + subredditUrl redditPath isNsfw albumUrl hasAudio + fullLengthSource gfycatSource redgifsSource ownerAvatar + username displayName isPaid tags isFavorite + mediaSources { url width height isOptimized } + blurredMediaSources { url width height isOptimized } + } + } + } +} +""", + + "FollowingQuery": """\ +query FollowingQuery( + $iterator: String +) { + getFollowing( + limit: 10 + iterator: $iterator + ) { + iterator items { + __typename id url title secondaryTitle description createdAt isNsfw + subscribers isComplete itemCount videoCount pictureCount albumCount + isPaid username tags isFollowing + banner { url width height isOptimized } + } + } +} +""", + + "LoginQuery": """\ +query LoginQuery( + $username: String!, + $password: String! +) { + login( + username: $username, + password: $password + ) { + username token expiresAt isAdmin status isPremium + } +} +""", + +} diff --git a/gallery_dl/extractor/telegraph.py b/gallery_dl/extractor/telegraph.py index dd5988f..468840b 100644 --- a/gallery_dl/extractor/telegraph.py +++ b/gallery_dl/extractor/telegraph.py @@ -49,7 +49,7 @@ class TelegraphGalleryExtractor(GalleryExtractor): url, pos = text.extract(figure, 'src="', '"') if url.startswith("/embed/"): continue - elif url.startswith("/"): + elif url[0] == "/": url = self.root + url caption, pos = text.extract(figure, "
", "<", pos) num += 1 diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py index bce661a..b196aeb 100644 --- a/gallery_dl/extractor/tsumino.py +++ b/gallery_dl/extractor/tsumino.py @@ -148,8 +148,10 @@ class TsuminoSearchExtractor(TsuminoBase, Extractor): data["PageNumber"] += 1 def _parse(self, query): + if not query: + return {} try: - if query.startswith("?"): + if query[0] == "?": return self._parse_simple(query) return self._parse_jsurl(query) except Exception as exc: @@ -187,8 +189,6 @@ class TsuminoSearchExtractor(TsuminoBase, Extractor): Example: ~(name~'John*20Doe~age~42~children~(~'Mary~'Bill)) Ref: https://github.com/Sage/jsurl """ - if not data: - return {} i = 0 imax = len(data) diff --git a/gallery_dl/extractor/urlgalleries.py b/gallery_dl/extractor/urlgalleries.py index b21709a..f7ce44b 100644 --- a/gallery_dl/extractor/urlgalleries.py +++ b/gallery_dl/extractor/urlgalleries.py @@ -7,7 +7,7 @@ """Extractors for https://urlgalleries.net/""" from .common import GalleryExtractor, Message -from .. import text +from .. import text, exception class UrlgalleriesGalleryExtractor(GalleryExtractor): @@ -16,27 +16,31 @@ class UrlgalleriesGalleryExtractor(GalleryExtractor): root = "urlgalleries.net" request_interval = (0.5, 1.0) pattern = r"(?:https?://)(?:(\w+)\.)?urlgalleries\.net/(?:[\w-]+-)?(\d+)" - example = "https://blog.urlgalleries.net/gallery-12345/TITLE" + example = "https://BLOG.urlgalleries.net/gallery-12345/TITLE" - def __init__(self, match): - self.blog, self.gallery_id = match.groups() + def items(self): + blog, self.gallery_id = self.groups url = "https://{}.urlgalleries.net/porn-gallery-{}/?a=10000".format( - self.blog, self.gallery_id) - GalleryExtractor.__init__(self, match, url) + blog, self.gallery_id) + + with self.request(url, allow_redirects=False, fatal=...) 
as response: + if 300 <= response.status_code < 500: + if response.headers.get("location", "").endswith( + "/not_found_adult.php"): + raise exception.NotFoundError("gallery") + raise exception.HttpError(None, response) + page = response.text - def items(self): - page = self.request(self.gallery_url).text imgs = self.images(page) data = self.metadata(page) data["count"] = len(imgs) - del page - root = "https://{}.urlgalleries.net".format(self.blog) + root = "https://{}.urlgalleries.net".format(blog) yield Message.Directory, data for data["num"], img in enumerate(imgs, 1): - response = self.request( - root + img, method="HEAD", allow_redirects=False) - yield Message.Queue, response.headers["Location"], data + page = self.request(root + img).text + url = text.extr(page, "window.location.href = '", "'") + yield Message.Queue, url, data def metadata(self, page): extr = text.extract_from(page) diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py index 95eeafe..ea034a7 100644 --- a/gallery_dl/extractor/vk.py +++ b/gallery_dl/extractor/vk.py @@ -24,6 +24,13 @@ class VkExtractor(Extractor): root = "https://vk.com" request_interval = (0.5, 1.5) + def _init(self): + self.offset = text.parse_int(self.config("offset")) + + def skip(self, num): + self.offset += num + return num + def items(self): sub = re.compile(r"/imp[fg]/").sub sizes = "wzyxrqpo" @@ -75,7 +82,7 @@ class VkExtractor(Extractor): "al" : "1", "direction": "1", "list" : photos_id, - "offset" : 0, + "offset" : self.offset, } while True: diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index 116f557..4eae537 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -193,7 +193,10 @@ class WikimediaArticleExtractor(WikimediaExtractor): def __init__(self, match): WikimediaExtractor.__init__(self, match) - path = match.group(match.lastindex) + path = self.groups[-1] + if path[2] == "/": + self.root = self.root + "/" + path[:2] + path = path[3:] if path.startswith("wiki/"): path = path[5:] -- cgit v1.2.3