diff options
Diffstat (limited to 'gallery_dl/extractor')
| -rw-r--r-- | gallery_dl/extractor/__init__.py | 2 | ||||
| -rw-r--r-- | gallery_dl/extractor/aryion.py | 17 | ||||
| -rw-r--r-- | gallery_dl/extractor/common.py | 3 | ||||
| -rw-r--r-- | gallery_dl/extractor/furaffinity.py | 37 | ||||
| -rw-r--r-- | gallery_dl/extractor/gofile.py | 124 | ||||
| -rw-r--r-- | gallery_dl/extractor/hitomi.py | 49 | ||||
| -rw-r--r-- | gallery_dl/extractor/instagram.py | 21 | ||||
| -rw-r--r-- | gallery_dl/extractor/kemonoparty.py | 13 | ||||
| -rw-r--r-- | gallery_dl/extractor/kissgoddess.py | 4 | ||||
| -rw-r--r-- | gallery_dl/extractor/mangasee.py | 4 | ||||
| -rw-r--r-- | gallery_dl/extractor/newgrounds.py | 70 | ||||
| -rw-r--r-- | gallery_dl/extractor/pinterest.py | 86 | ||||
| -rw-r--r-- | gallery_dl/extractor/skeb.py | 90 | ||||
| -rw-r--r-- | gallery_dl/extractor/telegraph.py | 95 | ||||
| -rw-r--r-- | gallery_dl/extractor/twibooru.py | 5 | ||||
| -rw-r--r-- | gallery_dl/extractor/twitter.py | 114 | ||||
| -rw-r--r-- | gallery_dl/extractor/unsplash.py | 4 |
17 files changed, 561 insertions, 177 deletions
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 1bec48e..6d6c7ee 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -43,6 +43,7 @@ modules = [ "gelbooru_v01", "gelbooru_v02", "gfycat", + "gofile", "hbrowse", "hentai2read", "hentaicosplays", @@ -125,6 +126,7 @@ modules = [ "speakerdeck", "subscribestar", "tapas", + "telegraph", "toyhouse", "tsumino", "tumblr", diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py index 06ec571..fa590b9 100644 --- a/gallery_dl/extractor/aryion.py +++ b/gallery_dl/extractor/aryion.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020-2021 Mike Fährmann +# Copyright 2020-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -11,6 +11,8 @@ from .common import Extractor, Message from .. import text, util, exception from ..cache import cache +from email.utils import parsedate_tz +from datetime import datetime BASE_PATTERN = r"(?:https?://)?(?:www\.)?aryion\.com/g4" @@ -144,7 +146,8 @@ class AryionExtractor(Extractor): title, _, artist = text.unescape(extr( "<title>g4 :: ", "<")).rpartition(" by ") - data = { + + return { "id" : text.parse_int(post_id), "url" : url, "user" : self.user or artist, @@ -152,7 +155,7 @@ class AryionExtractor(Extractor): "artist": artist, "path" : text.split_html(extr( "cookiecrumb'>", '</span'))[4:-1:2], - "date" : extr("class='pretty-date' title='", "'"), + "date" : datetime(*parsedate_tz(lmod)[:6]), "size" : text.parse_int(clen), "views" : text.parse_int(extr("Views</b>:", "<").replace(",", "")), "width" : text.parse_int(extr("Resolution</b>:", "x")), @@ -167,12 +170,6 @@ class AryionExtractor(Extractor): "_mtime" : lmod, } - d1, _, d2 = data["date"].partition(",") - data["date"] = text.parse_datetime( - d1[:-2] + d2, "%b %d %Y %I:%M %p", -5) - - return data - class AryionGalleryExtractor(AryionExtractor): """Extractor for a user's gallery on eka's portal""" @@ -249,7 +246,7 @@ class AryionPostExtractor(AryionExtractor): "title" : "I'm on subscribestar now too!", "description": r"re:Doesn't hurt to have a backup, right\?", "tags" : ["Non-Vore", "subscribestar"], - "date" : "dt:2019-02-16 19:30:00", + "date" : "dt:2019-02-16 19:30:34", "path" : [], "views" : int, "favorites": int, diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index e3559f9..ff49d89 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -183,7 +183,7 @@ class Extractor(): elif until: if isinstance(until, datetime.datetime): # convert to UTC timestamp - until = (until - util.EPOCH) / util.SECOND + until = util.datetime_to_timestamp(until) else: until = float(until) seconds = until - now @@ -373,7 +373,6 @@ class Extractor(): self.log.warning( "Cookie '%s' will expire in less than %s hour%s", cookie.name, hours + 1, "s" if hours else "") - continue names.discard(cookie.name) if not names: diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index 6a8744a..b63cfc1 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -165,22 +165,24 @@ class FuraffinityExtractor(Extractor): def _pagination_search(self, query): url = self.root + "/search/" data = { - "page" : 0, - "next_page" : "Next", + "page" : 1, "order-by" : "relevancy", "order-direction": "desc", "range" : "all", - "rating-general" : "on", - "rating-mature" : "on", - "rating-adult" : "on", - "type-art" : "on", - "type-music" : "on", - "type-flash" : "on", - "type-story" : "on", - "type-photo" : "on", - "type-poetry" : "on", + "range_from" : "", + "range_to" : "", + "rating-general" : "1", + "rating-mature" : "1", + "rating-adult" : "1", + "type-art" : "1", + "type-music" : "1", + "type-flash" : "1", + "type-story" : "1", + "type-photo" : "1", + "type-poetry" : "1", "mode" : "extended", } + data.update(query) if "page" in query: data["page"] = text.parse_int(query["page"]) @@ -194,7 +196,11 @@ class FuraffinityExtractor(Extractor): if not post_id: return - data["page"] += 1 + + if "next_page" in data: + data["page"] += 1 + else: + data["next_page"] = "Next" class FuraffinityGalleryExtractor(FuraffinityExtractor): @@ -255,9 +261,10 @@ class FuraffinitySearchExtractor(FuraffinityExtractor): "range": "45-50", "count": 6, }), - ("https://www.furaffinity.net/search/cute&rating-general=0", { - "range": "1", - "count": 1, + # first page of search results (#2402) + ("https://www.furaffinity.net/search/?q=leaf&range=1day", { + "range": "1-3", + "count": 3, }), ) diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py new file mode 100644 index 0000000..37d2986 --- /dev/null +++ b/gallery_dl/extractor/gofile.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from .common import Extractor, Message +from .. import exception +from ..cache import memcache + + +class GofileFolderExtractor(Extractor): + category = "gofile" + subcategory = "folder" + root = "https://gofile.io" + directory_fmt = ("{category}", "{name} ({code})") + archive_fmt = "{id}" + pattern = r"(?:https?://)?(?:www\.)?gofile\.io/d/([^/?#]+)" + test = ( + ("https://gofile.io/d/5qHmQj", { + "pattern": r"https://file\d+\.gofile\.io/download" + r"/\w{8}-\w{4}-\w{4}-\w{4}-\w{12}" + r"/test-%E3%83%86%E3%82%B9%E3%83%88-%2522%26!\.png", + "keyword": { + "createTime": int, + "directLink": "re:https://store3.gofile.io/download/direct/.+", + "downloadCount": int, + "extension": "png", + "filename": "test-テスト-%22&!", + "folder": { + "childs": [ + "346429cc-aee4-4996-be3f-e58616fe231f", + "765b6b12-b354-4e14-9a45-f763fa455682", + "2a44600a-4a59-4389-addc-4a0d542c457b" + ], + "code": "5qHmQj", + "createTime": 1648536501, + "id": "45cd45d1-dc78-4553-923f-04091c621699", + "isRoot": True, + "name": "root", + "public": True, + "totalDownloadCount": int, + "totalSize": 364, + "type": "folder" + }, + "id": r"re:\w{8}-\w{4}-\w{4}-\w{4}-\w{12}", + "link": r"re:https://file17.gofile.io/download/.+\.png", + "md5": "re:[0-9a-f]{32}", + "mimetype": "image/png", + "name": "test-テスト-%22&!.png", + "num": int, + "parentFolder": "45cd45d1-dc78-4553-923f-04091c621699", + "serverChoosen": "file17", + "size": 182, + "thumbnail": r"re:https://store3.gofile.io/download/.+\.png", + "type": "file" + }, + }), + ("https://gofile.io/d/346429cc-aee4-4996-be3f-e58616fe231f", { + "content": "0c8768055e4e20e7c7259608b67799171b691140", + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.content_id = match.group(1) + + def items(self): + recursive = self.config("recursive") + + token = self.config("api-token") + if token is None: + self.log.debug("creating temporary account") + token = self._create_account() + self.session.cookies.set("accountToken", token, domain=".gofile.io") + + folder = self._get_content(self.content_id, token) + yield Message.Directory, folder + + num = 0 + contents = folder.pop("contents") + for content_id in folder["childs"]: + content = contents[content_id] + content["folder"] = folder + + if content["type"] == "file": + num += 1 + content["num"] = num + content["filename"], _, content["extension"] = \ + content["name"].rpartition(".") + yield Message.Url, content["link"], content + + elif content["type"] == "folder": + if recursive: + url = "https://gofile.io/d/" + content["id"] + content["_extractor"] = GofileFolderExtractor + yield Message.Queue, url, content + + else: + self.log.debug("'%s' is of unknown type (%s)", + content.get("name"), content["type"]) + + @memcache() + def _create_account(self): + return self._api_request("createAccount")["token"] + + def _get_content(self, content_id, token): + return self._api_request("getContent", { + "contentId" : content_id, + "token" : token, + "websiteToken": "websiteToken", + }) + + def _api_request(self, endpoint, params=None): + response = self.request( + "https://api.gofile.io/" + endpoint, params=params).json() + + if response["status"] != "ok": + if response["status"] == "error-notFound": + raise exception.NotFoundError("content") + raise exception.StopExtraction( + "%s failed (Status: %s)", endpoint, response["status"]) + + return response["data"] diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index 34eaaab..ca7e692 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -28,8 +28,7 @@ class HitomiGalleryExtractor(GalleryExtractor): ("https://hitomi.la/galleries/867789.html", { "pattern": r"https://[a-c]a\.hitomi\.la/webp/\d+/\d+" r"/[0-9a-f]{64}\.webp", - "keyword": "4b584d09d535694d7d757c47daf5c15d116420d2", - "options": (("metadata", True),), + "keyword": "86af5371f38117a07407f11af689bdd460b09710", "count": 16, }), # download test @@ -77,23 +76,18 @@ class HitomiGalleryExtractor(GalleryExtractor): def metadata(self, page): self.info = info = json.loads(page.partition("=")[2]) + iget = info.get - data = self._data_from_gallery_info(info) - if self.config("metadata", False): - data.update(self._data_from_gallery_page(info)) - return data - - def _data_from_gallery_info(self, info): - language = info.get("language") + language = iget("language") if language: language = language.capitalize() - date = info.get("date") + date = iget("date") if date: date += ":00" tags = [] - for tinfo in info.get("tags") or (): + for tinfo in iget("tags") or (): tag = string.capwords(tinfo["tag"]) if tinfo.get("female"): tag += " ♀" @@ -109,35 +103,10 @@ class HitomiGalleryExtractor(GalleryExtractor): "lang" : util.language_to_code(language), "date" : text.parse_datetime(date, "%Y-%m-%d %H:%M:%S%z"), "tags" : tags, - } - - def _data_from_gallery_page(self, info): - url = "{}/galleries/{}.html".format(self.root, info["id"]) - - # follow redirects - while True: - response = self.request(url, fatal=False) - if b"<title>Redirect</title>" not in response.content: - break - url = text.extract( - response.text, 'http-equiv="refresh" content="', '"', - )[0].partition("=")[2] - - if response.status_code >= 400: - return {} - - def prep(value): - return [ - text.unescape(string.capwords(v)) - for v in text.extract_iter(value or "", '.html">', '<') - ] - - extr = text.extract_from(response.text) - return { - "artist" : prep(extr('<h2>', '</h2>')), - "group" : prep(extr('<td>Group</td><td>', '</td>')), - "parody" : prep(extr('<td>Series</td><td>', '</td>')), - "characters": prep(extr('<td>Characters</td><td>', '</td>')), + "artist" : [o["artist"] for o in iget("artists") or ()], + "group" : [o["group"] for o in iget("groups") or ()], + "parody" : [o["parody"] for o in iget("parodys") or ()], + "characters": [o["character"] for o in iget("characters") or ()] } def images(self, _): diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 20a4c1a..e07b64e 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # Copyright 2018-2020 Leonardo Taccari -# Copyright 2018-2021 Mike Fährmann +# Copyright 2018-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -43,6 +43,7 @@ class InstagramExtractor(Extractor): self.login() data = self.metadata() videos = self.config("videos", True) + previews = self.config("previews", False) video_headers = {"User-Agent": "Mozilla/5.0"} for post in self.posts(): @@ -56,14 +57,18 @@ class InstagramExtractor(Extractor): yield Message.Directory, post for file in files: - url = file.get("video_url") - if not url: - url = file["display_url"] - elif not videos: - continue - else: - file["_http_headers"] = video_headers file.update(post) + + url = file.get("video_url") + if url: + if videos: + file["_http_headers"] = video_headers + text.nameext_from_url(url, file) + yield Message.Url, url, file + if not previews: + continue + + url = file["display_url"] yield Message.Url, url, text.nameext_from_url(url, file) def metadata(self): diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 9537263..7287c38 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -42,6 +42,7 @@ class KemonopartyExtractor(Extractor): r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall find_hash = re.compile("/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})").match generators = self._build_file_generators(self.config("files")) + duplicates = self.config("duplicates") comments = self.config("comments") username = dms = None @@ -84,7 +85,7 @@ class KemonopartyExtractor(Extractor): match = find_hash(url) if match: post["hash"] = hash = match.group(1) - if hash in hashes: + if hash in hashes and not duplicates: self.log.debug("Skipping %s (duplicate)", url) continue hashes.add(hash) @@ -273,6 +274,11 @@ class KemonopartyPostExtractor(KemonopartyExtractor): ("https://kemono.party/patreon/user/4158582/post/32099982", { "count": 2, }), + # allow duplicates (#2440) + ("https://kemono.party/patreon/user/4158582/post/32099982", { + "options": (("duplicates", True),), + "count": 3, + }), # DMs (#2008) ("https://kemono.party/patreon/user/34134344/post/38129255", { "options": (("dms", True),), @@ -323,8 +329,9 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): }), (("https://kemono.party/discord" "/server/256559665620451329/channel/462437519519383555#"), { - "pattern": r"https://kemono\.party/data/attachments/discord" - r"/256559665620451329/\d+/\d+/.+", + "pattern": r"https://kemono\.party/data/(" + r"e3/77/e377e3525164559484ace2e64425b0cec1db08.*\.png|" + r"51/45/51453640a5e0a4d23fbf57fb85390f9c5ec154.*\.gif)", "count": ">= 2", }), # 'inline' files diff --git a/gallery_dl/extractor/kissgoddess.py b/gallery_dl/extractor/kissgoddess.py index 85ec806..6e66772 100644 --- a/gallery_dl/extractor/kissgoddess.py +++ b/gallery_dl/extractor/kissgoddess.py @@ -20,7 +20,7 @@ class KissgoddessGalleryExtractor(GalleryExtractor): test = ("https://kissgoddess.com/album/18285.html", { "pattern": r"https://pic\.kissgoddess\.com" r"/gallery/16473/18285/s/\d+\.jpg", - "count": 8, + "count": 19, "keyword": { "gallery_id": 18285, "title": "[Young Champion Extra] 2016.02 No.03 菜乃花 安枝瞳 葉月あや", @@ -45,6 +45,8 @@ class KissgoddessGalleryExtractor(GalleryExtractor): while page: for url in text.extract_iter(page, "<img src='", "'"): yield url, None + for url in text.extract_iter(page, "<img data-original='", "'"): + yield url, None pnum += 1 url = "{}/album/{}_{}.html".format( diff --git a/gallery_dl/extractor/mangasee.py b/gallery_dl/extractor/mangasee.py index 1b3dd18..0b0da65 100644 --- a/gallery_dl/extractor/mangasee.py +++ b/gallery_dl/extractor/mangasee.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021 Mike Fährmann +# Copyright 2021-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -64,7 +64,7 @@ class MangaseeChapterExtractor(MangaseeBase, ChapterExtractor): self.slug = extr('vm.IndexName = "', '"') data = self._transform_chapter(data) - data["manga"] = extr('vm.SeriesName = "', '"') + data["manga"] = text.unescape(extr('vm.SeriesName = "', '"')) return data def images(self, page): diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 6d0e94b..e9fde97 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -113,10 +113,16 @@ class NewgroundsExtractor(Extractor): if self.flash: url += "/format/flash" - response = self.request(url, fatal=False) - if response.status_code >= 400: - return {} - page = response.text + with self.request(url, fatal=False) as response: + if response.status_code >= 400: + return {} + page = response.text + + pos = page.find('id="adults_only"') + if pos >= 0: + msg = text.extract(page, 'class="highlight">', '<', pos)[0] + self.log.warning('"%s"', msg) + extr = text.extract_from(page) data = extract_data(extr, post_url) @@ -230,16 +236,20 @@ class NewgroundsExtractor(Extractor): yield fmt[1][0]["src"] def _pagination(self, kind): - root = self.user_root + url = "{}/{}".format(self.user_root, kind) + params = { + "page": 1, + "isAjaxRequest": "1", + } headers = { - "Accept": "application/json, text/javascript, */*; q=0.01", + "Referer": url, "X-Requested-With": "XMLHttpRequest", - "Referer": root, } - url = "{}/{}/page/1".format(root, kind) while True: - with self.request(url, headers=headers, fatal=False) as response: + with self.request( + url, params=params, headers=headers, + fatal=False) as response: try: data = response.json() except ValueError: @@ -250,14 +260,17 @@ class NewgroundsExtractor(Extractor): msg = ", ".join(text.unescape(e) for e in data["errors"]) raise exception.StopExtraction(msg) - for year in data["sequence"]: - for item in data["years"][str(year)]["items"]: + for year, items in data["items"].items(): + for item in items: page_url = text.extract(item, 'href="', '"')[0] - yield text.urljoin(root, page_url) + if page_url[0] == "/": + page_url = self.root + page_url + yield page_url - if not data["more"]: + more = data.get("load_more") + if not more or len(more) < 8: return - url = text.urljoin(root, data["more"]) + params["page"] += 1 class NewgroundsImageExtractor(NewgroundsExtractor): @@ -293,7 +306,12 @@ class NewgroundsImageExtractor(NewgroundsExtractor): ("https://www.newgrounds.com/art/view/sailoryon/yon-dream-buster", { "url": "84eec95e663041a80630df72719f231e157e5f5d", "count": 2, - }) + }), + # "adult" rated (#2456) + ("https://www.newgrounds.com/art/view/kekiiro/red", { + "options": (("username", None),), + "count": 1, + }), ) def __init__(self, match): @@ -360,6 +378,11 @@ class NewgroundsMediaExtractor(NewgroundsExtractor): "pattern": r"https://uploads\.ungrounded\.net/alternate/1482000" r"/1482860_alternate_102516\.720p\.mp4\?\d+", }), + # "adult" rated (#2456) + ("https://www.newgrounds.com/portal/view/717744", { + "options": (("username", None),), + "count": 1, + }), ) def __init__(self, match): @@ -454,25 +477,28 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor): ) def _pagination(self, kind): - num = 1 + url = "{}/favorites/{}".format(self.user_root, kind) + params = { + "page": 1, + "isAjaxRequest": "1", + } headers = { - "Accept": "application/json, text/javascript, */*; q=0.01", + "Referer": url, "X-Requested-With": "XMLHttpRequest", - "Referer": self.user_root, } while True: - url = "{}/favorites/{}/{}".format(self.user_root, kind, num) - response = self.request(url, headers=headers) + response = self.request(url, params=params, headers=headers) if response.history: return - favs = self._extract_favorites(response.text) + data = response.json() + favs = self._extract_favorites(data.get("component") or "") yield from favs if len(favs) < 24: return - num += 1 + params["page"] += 1 def _extract_favorites(self, page): return [ diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index 25344e8..2079b73 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2021 Mike Fährmann +# Copyright 2016-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -20,8 +20,8 @@ BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.[\w.]+" class PinterestExtractor(Extractor): """Base class for pinterest extractors""" category = "pinterest" - filename_fmt = "{category}_{id}.{extension}" - archive_fmt = "{id}" + filename_fmt = "{category}_{id}{media_id:?_//}.{extension}" + archive_fmt = "{id}{media_id}" root = "https://www.pinterest.com" def __init__(self, match): @@ -35,28 +35,39 @@ class PinterestExtractor(Extractor): yield Message.Directory, data for pin in self.pins(): + pin.update(data) - try: - media = self._media_from_pin(pin) - except Exception: - self.log.debug("Unable to fetch download URL for pin %s", - pin.get("id")) - continue + carousel_data = pin.get("carousel_data") + if carousel_data: + for num, slot in enumerate(carousel_data["carousel_slots"], 1): + slot["media_id"] = slot.pop("id") + pin.update(slot) + pin["num"] = num + size, image = next(iter(slot["images"].items())) + url = image["url"].replace("/" + size + "/", "/originals/") + yield Message.Url, url, text.nameext_from_url(url, pin) - if not videos and media.get("duration") is not None: - continue + else: + try: + media = self._media_from_pin(pin) + except Exception: + self.log.debug("Unable to fetch download URL for pin %s", + pin.get("id")) + continue - pin.update(data) - pin.update(media) - url = media["url"] - text.nameext_from_url(url, pin) + if videos or media.get("duration") is None: + pin.update(media) + pin["num"] = 0 + pin["media_id"] = "" + + url = media["url"] + text.nameext_from_url(url, pin) - if pin["extension"] == "m3u8": - url = "ytdl:" + url - pin["extension"] = "mp4" - pin["_ytdl_extra"] = {"protocol": "m3u8_native"} + if pin["extension"] == "m3u8": + url = "ytdl:" + url + pin["extension"] = "mp4" - yield Message.Url, url, pin + yield Message.Url, url, pin def metadata(self): """Return general metadata""" @@ -124,7 +135,8 @@ class PinterestBoardExtractor(PinterestExtractor): subcategory = "board" directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}") archive_fmt = "{board[id]}_{id}" - pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/(?!_saved)([^/?#&]+)/?$" + pattern = (BASE_PATTERN + r"/(?!pin/)([^/?#&]+)" + "/(?!_saved|_created)([^/?#&]+)/?$") test = ( ("https://www.pinterest.com/g1952849/test-/", { "pattern": r"https://i\.pinimg\.com/originals/", @@ -192,6 +204,28 @@ class PinterestUserExtractor(PinterestExtractor): yield Message.Queue, self.root + url, board +class PinterestCreatedExtractor(PinterestExtractor): + """Extractor for a user's created pins""" + subcategory = "created" + directory_fmt = ("{category}", "{user}") + pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/_created/?$" + test = ("https://www.pinterest.com/amazon/_created", { + "pattern": r"https://i\.pinimg\.com/originals/[0-9a-f]{2}" + r"/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.jpg", + "count": 10, + }) + + def __init__(self, match): + PinterestExtractor.__init__(self, match) + self.user = text.unquote(match.group(1)) + + def metadata(self): + return {"user": self.user} + + def pins(self): + return self.api.user_activity_pins(self.user) + + class PinterestSectionExtractor(PinterestExtractor): """Extractor for board sections on pinterest.com""" subcategory = "section" @@ -385,6 +419,16 @@ class PinterestAPI(): options = {"board_id": board_id, "add_vase": True} return self._pagination("BoardRelatedPixieFeed", options) + def user_activity_pins(self, user): + """Yield pins created by 'user'""" + options = { + "exclude_add_pin_rep": True, + "field_set_key" : "grid_item", + "is_own_profile_pins": False, + "username" : user, + } + return self._pagination("UserActivityPins", options) + def search(self, query): """Yield pins from searches""" options = {"query": query, "scope": "pins", "rs": "typed"} diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py index 965391c..2af917d 100644 --- a/gallery_dl/extractor/skeb.py +++ b/gallery_dl/extractor/skeb.py @@ -8,6 +8,7 @@ from .common import Extractor, Message from .. import text +import itertools class SkebExtractor(Extractor): @@ -22,7 +23,6 @@ class SkebExtractor(Extractor): Extractor.__init__(self, match) self.user_name = match.group(1) self.thumbnails = self.config("thumbnails", False) - self.sent_requests = self.config("sent-requests", False) def items(self): for user_name, post_num in self.posts(): @@ -35,18 +35,18 @@ class SkebExtractor(Extractor): def posts(self): """Return post number""" - def _pagination(self): - url = "{}/api/users/{}/works".format(self.root, self.user_name) - params = {"role": "creator", "sort": "date", "offset": 0} + def _pagination(self, url, params): headers = {"Referer": self.root, "Authorization": "Bearer null"} - do_requests = self.sent_requests + params["offset"] = 0 while True: posts = self.request(url, params=params, headers=headers).json() for post in posts: - post_num = post["path"].rpartition("/")[2] - user_name = post["path"].split("/")[1][1:] + parts = post["path"].split("/") + user_name = parts[1][1:] + post_num = parts[3] + if post["private"]: self.log.debug("Skipping @%s/%s (private)", user_name, post_num) @@ -54,13 +54,7 @@ class SkebExtractor(Extractor): yield user_name, post_num if len(posts) < 30: - if do_requests: - params["offset"] = 0 - params['role'] = "client" - do_requests = False - continue - else: - return + return params["offset"] += 30 def _get_post_data(self, user_name, post_num): @@ -134,6 +128,54 @@ class SkebPostExtractor(SkebExtractor): """Extractor for a single skeb post""" subcategory = "post" pattern = r"(?:https?://)?skeb\.jp/@([^/?#]+)/works/(\d+)" + test = ("https://skeb.jp/@kanade_cocotte/works/38", { + "count": 2, + "keyword": { + "anonymous": False, + "body": "re:はじめまして。私はYouTubeにてVTuberとして活動をしている湊ラ", + "client": { + "avatar_url": "https://pbs.twimg.com/profile_images" + "/1471184042791895042/f0DcWFGl.jpg", + "header_url": None, + "id": 1196514, + "name": "湊ラギ", + "screen_name": "minato_ragi", + }, + "completed_at": "2022-02-27T14:03:45.442Z", + "content_category": "preview", + "creator": { + "avatar_url": "https://pbs.twimg.com/profile_images" + "/1225470417063645184/P8_SiB0V.jpg", + "header_url": "https://pbs.twimg.com/profile_banners" + "/71243217/1647958329/1500x500", + "id": 159273, + "name": "イチノセ奏", + "screen_name": "kanade_cocotte", + }, + "date": "dt:2022-02-27 14:03:45", + "file_id": int, + "file_url": str, + "genre": "art", + "nsfw": False, + "original": { + "byte_size": int, + "duration": None, + "extension": "re:psd|png", + "frame_rate": None, + "height": 3727, + "is_movie": False, + "width": 2810, + }, + "post_num": "38", + "post_url": "https://skeb.jp/@kanade_cocotte/works/38", + "source_body": None, + "source_thanks": None, + "tags": list, + "thanks": None, + "translated_body": False, + "translated_thanks": None, + } + }) def __init__(self, match): SkebExtractor.__init__(self, match) @@ -146,7 +188,23 @@ class SkebPostExtractor(SkebExtractor): class SkebUserExtractor(SkebExtractor): """Extractor for all posts from a skeb user""" subcategory = "user" - pattern = r"(?:https?://)?skeb\.jp/@([^/?#]+)" + pattern = r"(?:https?://)?skeb\.jp/@([^/?#]+)/?$" + test = ("https://skeb.jp/@kanade_cocotte", { + "pattern": r"https://skeb\.imgix\.net/uploads/origins/[\w-]+" + r"\?bg=%23fff&auto=format&txtfont=bold&txtshad=70" + r"&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150" + r"&txt=SAMPLE&w=800&s=\w+", + "range": "1-5", + }) def posts(self): - return self._pagination() + url = "{}/api/users/{}/works".format(self.root, self.user_name) + + params = {"role": "creator", "sort": "date"} + posts = self._pagination(url, params) + + if self.config("sent-requests", False): + params = {"role": "client", "sort": "date"} + posts = itertools.chain(posts, self._pagination(url, params)) + + return posts diff --git a/gallery_dl/extractor/telegraph.py b/gallery_dl/extractor/telegraph.py new file mode 100644 index 0000000..8e9bf2c --- /dev/null +++ b/gallery_dl/extractor/telegraph.py @@ -0,0 +1,95 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractor for https://telegra.ph/""" + +from .common import GalleryExtractor +from .. import text + + +class TelegraphGalleryExtractor(GalleryExtractor): + """Extractor for articles from telegra.ph""" + + category = "telegraph" + root = "https://telegra.ph" + directory_fmt = ("{category}", "{slug}") + filename_fmt = "{num_formatted}_{filename}.{extension}" + archive_fmt = "{slug}_{num}" + pattern = r"(?:https?://)(?:www\.)??telegra\.ph(/[^/?#]+)" + test = ( + ("https://telegra.ph/Telegraph-Test-03-28", { + "pattern": r"https://telegra\.ph/file/[0-9a-f]+\.png", + "keyword": { + "author": "mikf", + "caption": r"re:test|", + "count": 2, + "date": "dt:2022-03-28 16:01:36", + "description": "Just a test", + "post_url": "https://telegra.ph/Telegraph-Test-03-28", + "slug": "Telegraph-Test-03-28", + "title": "Telegra.ph Test", + }, + }), + ("https://telegra.ph/森-03-28", { + "pattern": "https://telegra.ph/file/3ea79d23b0dd0889f215a.jpg", + "count": 1, + "keyword": { + "author": "&", + "caption": "kokiri", + "count": 1, + "date": "dt:2022-03-28 16:31:26", + "description": "コキリの森", + "extension": "jpg", + "filename": "3ea79d23b0dd0889f215a", + "num": 1, + "num_formatted": "1", + "post_url": "https://telegra.ph/森-03-28", + "slug": "森-03-28", + "title": '"森"', + "url": "https://telegra.ph/file/3ea79d23b0dd0889f215a.jpg", + }, + }), + ) + + def metadata(self, page): + extr = text.extract_from(page) + data = { + "title": text.unescape(extr( + 'property="og:title" content="', '"')), + "description": text.unescape(extr( + 'property="og:description" content="', '"')), + "date": text.parse_datetime(extr( + 'property="article:published_time" content="', '"'), + "%Y-%m-%dT%H:%M:%S%z"), + "author": text.unescape(extr( + 'property="article:author" content="', '"')), + "post_url": text.unescape(extr( + 'rel="canonical" href="', '"')), + } + data["slug"] = data["post_url"][19:] + return data + + def images(self, page): + figures = tuple(text.extract_iter(page, "<figure>", "</figure>")) + num_zeroes = len(str(len(figures))) + num = 0 + + result = [] + for figure in figures: + src, pos = text.extract(figure, 'src="', '"') + if src.startswith("/embed/"): + continue + caption, pos = text.extract(figure, "<figcaption>", "<", pos) + url = self.root + src + num += 1 + + result.append((url, { + "url" : url, + "caption" : text.unescape(caption), + "num" : num, + "num_formatted": str(num).zfill(num_zeroes), + })) + return result diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py index ec8ab35..355ca21 100644 --- a/gallery_dl/extractor/twibooru.py +++ b/gallery_dl/extractor/twibooru.py @@ -36,8 +36,9 @@ class TwibooruExtractor(BooruExtractor): post["date"] = text.parse_datetime( post["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") - name, sep, rest = post["name"].rpartition(".") - post["filename"] = name if sep else rest + if "name" in post: + name, sep, rest = post["name"].rpartition(".") + post["filename"] = name if sep else rest class TwibooruPostExtractor(TwibooruExtractor): diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 6d51834..4c46170 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -15,7 +15,7 @@ import json BASE_PATTERN = ( r"(?:https?://)?(?:www\.|mobile\.)?" - r"(?:twitter\.com|nitter\.net)" + r"(?:(?:fx)?twitter\.com|nitter\.net)" ) @@ -217,23 +217,24 @@ class TwitterExtractor(Extractor): if "legacy" in tweet: tweet = tweet["legacy"] + tget = tweet.get entities = tweet["entities"] tdata = { "tweet_id" : text.parse_int(tweet["id_str"]), "retweet_id" : text.parse_int( - tweet.get("retweeted_status_id_str")), + tget("retweeted_status_id_str")), "quote_id" : text.parse_int( - tweet.get("quoted_status_id_str")), + tget("quoted_status_id_str")), "reply_id" : text.parse_int( - tweet.get("in_reply_to_status_id_str")), + tget("in_reply_to_status_id_str")), "date" : text.parse_datetime( tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"), "user" : user, "lang" : tweet["lang"], - "favorite_count": tweet["favorite_count"], - "quote_count" : tweet["quote_count"], - "reply_count" : tweet["reply_count"], - "retweet_count" : tweet["retweet_count"], + "favorite_count": tget("favorite_count"), + "quote_count" : tget("quote_count"), + "reply_count" : tget("reply_count"), + "retweet_count" : tget("retweet_count"), } hashtags = entities.get("hashtags") @@ -248,7 +249,7 @@ class TwitterExtractor(Extractor): "nick": u["name"], } for u in mentions] - content = tweet["full_text"] + content = tget("full_text") or tget("text") or "" urls = entities.get("urls") if urls: for url in urls: @@ -269,33 +270,36 @@ class TwitterExtractor(Extractor): return tdata def _transform_user(self, user): + uid = user.get("rest_id") or user["id_str"] + try: - return self._user_cache[user.get("rest_id") or user["id_str"]] + return self._user_cache[uid] except KeyError: pass - uid = user.get("rest_id") or user["id_str"] if "legacy" in user: user = user["legacy"] + + uget = user.get entities = user["entities"] self._user_cache[uid] = udata = { "id" : text.parse_int(uid), "name" : user["screen_name"], "nick" : user["name"], - "location" : user["location"], + "location" : uget("location"), "date" : text.parse_datetime( - user["created_at"], "%a %b %d %H:%M:%S %z %Y"), - "verified" : user.get("verified", False), - "profile_banner" : user.get("profile_banner_url", ""), - "profile_image" : user.get( + uget("created_at"), "%a %b %d %H:%M:%S %z %Y"), + "verified" : uget("verified", False), + "profile_banner" : uget("profile_banner_url", ""), + "profile_image" : uget( "profile_image_url_https", "").replace("_normal.", "."), - "favourites_count": user["favourites_count"], - "followers_count" : user["followers_count"], - "friends_count" : user["friends_count"], - "listed_count" : user["listed_count"], - "media_count" : user["media_count"], - "statuses_count" : user["statuses_count"], + "favourites_count": uget("favourites_count"), + "followers_count" : uget("followers_count"), + "friends_count" : uget("friends_count"), + "listed_count" : uget("listed_count"), + "media_count" : uget("media_count"), + "statuses_count" : uget("statuses_count"), } descr = user["description"] @@ -653,6 +657,11 @@ class TwitterTweetExtractor(TwitterExtractor): ("https://twitter.com/i/web/status/1486373748911575046", { "count": 4, }), + # age-restricted (#2354) + ("https://twitter.com/mightbecursed/status/1492954264909479936", { + "options": (("syndication", True),), + "count": 1, + }), ) def __init__(self, match): @@ -770,6 +779,7 @@ class TwitterAPI(): } self._nsfw_warning = True + self._syndication = extractor.config("syndication") self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode self._user = None @@ -1153,9 +1163,10 @@ class TwitterAPI(): elif esw("conversationthread-"): tweets.extend(entry["content"]["items"]) elif esw("tombstone-"): - self._report_tombstone( - entry, - entry["content"]["itemContent"]["tombstoneInfo"]) + item = entry["content"]["itemContent"] + item["tweet_results"] = \ + {"result": {"tombstone": item["tombstoneInfo"]}} + tweets.append(entry) elif esw("cursor-bottom-"): cursor = entry["content"] if not cursor.get("stopOnEmptyResponse", True): @@ -1168,8 +1179,10 @@ class TwitterAPI(): tweet = ((entry.get("content") or entry["item"]) ["itemContent"]["tweet_results"]["result"]) if "tombstone" in tweet: - self._report_tombstone(entry, tweet["tombstone"]) - continue + tweet = self._process_tombstone( + entry, tweet["tombstone"]) + if not tweet: + continue if "tweet" in tweet: tweet = tweet["tweet"] legacy = tweet["legacy"] @@ -1259,10 +1272,45 @@ class TwitterAPI(): return variables["cursor"] = cursor - def _report_tombstone(self, entry, tombstone): + def _process_tombstone(self, entry, tombstone): text = (tombstone.get("richText") or tombstone["text"])["text"] - if text.startswith("Age-restricted") and self._nsfw_warning: - self.extractor.log.warning(text) - self._nsfw_warning = False - self.extractor.log.debug( - "Skipping %s (%s)", entry["entryId"].rpartition("-")[2], text) + tweet_id = entry["entryId"].rpartition("-")[2] + + if text.startswith("Age-restricted"): + if self._syndication: + return self._syndication_tweet(tweet_id) + elif self._nsfw_warning: + self._nsfw_warning = False + self.extractor.log.warning('"%s"', text) + + self.extractor.log.debug("Skipping %s (\"%s\")", tweet_id, text) + + def _syndication_tweet(self, tweet_id): + tweet = self.extractor.request( + "https://cdn.syndication.twimg.com/tweet?id=" + tweet_id).json() + + tweet["user"]["description"] = "" + tweet["user"]["entities"] = {"description": {}} + + if "video" in tweet: + video = tweet["video"] + del video["variants"][:-1] + video["variants"][0]["url"] = video["variants"][0]["src"] + tweet["extended_entities"] = {"media": [{ + "video_info" : video, + "original_info": {"width" : 0, "height": 0}, + }]} + elif "photos" in tweet: + for p in tweet["photos"]: + p["media_url_https"] = p["url"] + p["original_info"] = { + "width" : p["width"], + "height": p["height"], + } + tweet["extended_entities"] = {"media": tweet["photos"]} + + return { + "rest_id": tweet["id_str"], + "legacy" : tweet, + "user" : tweet["user"], + } diff --git a/gallery_dl/extractor/unsplash.py b/gallery_dl/extractor/unsplash.py index 2405dc3..6036322 100644 --- a/gallery_dl/extractor/unsplash.py +++ b/gallery_dl/extractor/unsplash.py @@ -193,7 +193,7 @@ class UnsplashSearchExtractor(UnsplashExtractor): """Extractor for unsplash search results""" subcategory = "search" pattern = BASE_PATTERN + r"/s/photos/([^/?#]+)(?:\?([^/?#]+))?" - test = ("https://unsplash.com/s/photos/nature", { + test = ("https://unsplash.com/s/photos/hair-style", { "pattern": r"https://images\.unsplash\.com/((flagged/)?photo-\d+-\w+" r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$", "range": "1-30", @@ -206,7 +206,7 @@ class UnsplashSearchExtractor(UnsplashExtractor): def photos(self): url = self.root + "/napi/search/photos" - params = {"query": text.unquote(self.item)} + params = {"query": text.unquote(self.item.replace('-', ' '))} if self.query: params.update(text.parse_query(self.query)) return self._pagination(url, params, True) |
