summary | refs | log | tree | commit | diff | stats
path: root/gallery_dl/extractor
diff options
context:
space:
mode:
author: [Libravatar] Unit 193 <unit193@unit193.net> 2022-04-09 00:15:19 -0400
committer: [Libravatar] Unit 193 <unit193@unit193.net> 2022-04-09 00:15:19 -0400
commit: 2fe1dfed848fc26b7419e3bfe91a62e686960429 (patch)
tree: 901cb64e2a1748df2bb8c7abc60ff6d72ae4bc27 /gallery_dl/extractor
parent: c2e774d3f5a4499b8beb5a12ab46a0099b16b1e7 (diff)
New upstream version 1.21.1. (upstream/1.21.1)
Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r-- gallery_dl/extractor/__init__.py 2
-rw-r--r-- gallery_dl/extractor/aryion.py 17
-rw-r--r-- gallery_dl/extractor/common.py 3
-rw-r--r-- gallery_dl/extractor/furaffinity.py 37
-rw-r--r-- gallery_dl/extractor/gofile.py 124
-rw-r--r-- gallery_dl/extractor/hitomi.py 49
-rw-r--r-- gallery_dl/extractor/instagram.py 21
-rw-r--r-- gallery_dl/extractor/kemonoparty.py 13
-rw-r--r-- gallery_dl/extractor/kissgoddess.py 4
-rw-r--r-- gallery_dl/extractor/mangasee.py 4
-rw-r--r-- gallery_dl/extractor/newgrounds.py 70
-rw-r--r-- gallery_dl/extractor/pinterest.py 86
-rw-r--r-- gallery_dl/extractor/skeb.py 90
-rw-r--r-- gallery_dl/extractor/telegraph.py 95
-rw-r--r-- gallery_dl/extractor/twibooru.py 5
-rw-r--r-- gallery_dl/extractor/twitter.py 114
-rw-r--r-- gallery_dl/extractor/unsplash.py 4
17 files changed, 561 insertions, 177 deletions
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 1bec48e..6d6c7ee 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -43,6 +43,7 @@ modules = [
"gelbooru_v01",
"gelbooru_v02",
"gfycat",
+ "gofile",
"hbrowse",
"hentai2read",
"hentaicosplays",
@@ -125,6 +126,7 @@ modules = [
"speakerdeck",
"subscribestar",
"tapas",
+ "telegraph",
"toyhouse",
"tsumino",
"tumblr",
diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py
index 06ec571..fa590b9 100644
--- a/gallery_dl/extractor/aryion.py
+++ b/gallery_dl/extractor/aryion.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2020-2021 Mike Fährmann
+# Copyright 2020-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -11,6 +11,8 @@
from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache
+from email.utils import parsedate_tz
+from datetime import datetime
BASE_PATTERN = r"(?:https?://)?(?:www\.)?aryion\.com/g4"
@@ -144,7 +146,8 @@ class AryionExtractor(Extractor):
title, _, artist = text.unescape(extr(
"<title>g4 :: ", "<")).rpartition(" by ")
- data = {
+
+ return {
"id" : text.parse_int(post_id),
"url" : url,
"user" : self.user or artist,
@@ -152,7 +155,7 @@ class AryionExtractor(Extractor):
"artist": artist,
"path" : text.split_html(extr(
"cookiecrumb'>", '</span'))[4:-1:2],
- "date" : extr("class='pretty-date' title='", "'"),
+ "date" : datetime(*parsedate_tz(lmod)[:6]),
"size" : text.parse_int(clen),
"views" : text.parse_int(extr("Views</b>:", "<").replace(",", "")),
"width" : text.parse_int(extr("Resolution</b>:", "x")),
@@ -167,12 +170,6 @@ class AryionExtractor(Extractor):
"_mtime" : lmod,
}
- d1, _, d2 = data["date"].partition(",")
- data["date"] = text.parse_datetime(
- d1[:-2] + d2, "%b %d %Y %I:%M %p", -5)
-
- return data
-
class AryionGalleryExtractor(AryionExtractor):
"""Extractor for a user's gallery on eka's portal"""
@@ -249,7 +246,7 @@ class AryionPostExtractor(AryionExtractor):
"title" : "I'm on subscribestar now too!",
"description": r"re:Doesn't hurt to have a backup, right\?",
"tags" : ["Non-Vore", "subscribestar"],
- "date" : "dt:2019-02-16 19:30:00",
+ "date" : "dt:2019-02-16 19:30:34",
"path" : [],
"views" : int,
"favorites": int,
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index e3559f9..ff49d89 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -183,7 +183,7 @@ class Extractor():
elif until:
if isinstance(until, datetime.datetime):
# convert to UTC timestamp
- until = (until - util.EPOCH) / util.SECOND
+ until = util.datetime_to_timestamp(until)
else:
until = float(until)
seconds = until - now
@@ -373,7 +373,6 @@ class Extractor():
self.log.warning(
"Cookie '%s' will expire in less than %s hour%s",
cookie.name, hours + 1, "s" if hours else "")
- continue
names.discard(cookie.name)
if not names:
diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py
index 6a8744a..b63cfc1 100644
--- a/gallery_dl/extractor/furaffinity.py
+++ b/gallery_dl/extractor/furaffinity.py
@@ -165,22 +165,24 @@ class FuraffinityExtractor(Extractor):
def _pagination_search(self, query):
url = self.root + "/search/"
data = {
- "page" : 0,
- "next_page" : "Next",
+ "page" : 1,
"order-by" : "relevancy",
"order-direction": "desc",
"range" : "all",
- "rating-general" : "on",
- "rating-mature" : "on",
- "rating-adult" : "on",
- "type-art" : "on",
- "type-music" : "on",
- "type-flash" : "on",
- "type-story" : "on",
- "type-photo" : "on",
- "type-poetry" : "on",
+ "range_from" : "",
+ "range_to" : "",
+ "rating-general" : "1",
+ "rating-mature" : "1",
+ "rating-adult" : "1",
+ "type-art" : "1",
+ "type-music" : "1",
+ "type-flash" : "1",
+ "type-story" : "1",
+ "type-photo" : "1",
+ "type-poetry" : "1",
"mode" : "extended",
}
+
data.update(query)
if "page" in query:
data["page"] = text.parse_int(query["page"])
@@ -194,7 +196,11 @@ class FuraffinityExtractor(Extractor):
if not post_id:
return
- data["page"] += 1
+
+ if "next_page" in data:
+ data["page"] += 1
+ else:
+ data["next_page"] = "Next"
class FuraffinityGalleryExtractor(FuraffinityExtractor):
@@ -255,9 +261,10 @@ class FuraffinitySearchExtractor(FuraffinityExtractor):
"range": "45-50",
"count": 6,
}),
- ("https://www.furaffinity.net/search/cute&rating-general=0", {
- "range": "1",
- "count": 1,
+ # first page of search results (#2402)
+ ("https://www.furaffinity.net/search/?q=leaf&range=1day", {
+ "range": "1-3",
+ "count": 3,
}),
)
diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py
new file mode 100644
index 0000000..37d2986
--- /dev/null
+++ b/gallery_dl/extractor/gofile.py
@@ -0,0 +1,124 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from .common import Extractor, Message
+from .. import exception
+from ..cache import memcache
+
+
+class GofileFolderExtractor(Extractor):
+ category = "gofile"
+ subcategory = "folder"
+ root = "https://gofile.io"
+ directory_fmt = ("{category}", "{name} ({code})")
+ archive_fmt = "{id}"
+ pattern = r"(?:https?://)?(?:www\.)?gofile\.io/d/([^/?#]+)"
+ test = (
+ ("https://gofile.io/d/5qHmQj", {
+ "pattern": r"https://file\d+\.gofile\.io/download"
+ r"/\w{8}-\w{4}-\w{4}-\w{4}-\w{12}"
+ r"/test-%E3%83%86%E3%82%B9%E3%83%88-%2522%26!\.png",
+ "keyword": {
+ "createTime": int,
+ "directLink": "re:https://store3.gofile.io/download/direct/.+",
+ "downloadCount": int,
+ "extension": "png",
+ "filename": "test-テスト-%22&!",
+ "folder": {
+ "childs": [
+ "346429cc-aee4-4996-be3f-e58616fe231f",
+ "765b6b12-b354-4e14-9a45-f763fa455682",
+ "2a44600a-4a59-4389-addc-4a0d542c457b"
+ ],
+ "code": "5qHmQj",
+ "createTime": 1648536501,
+ "id": "45cd45d1-dc78-4553-923f-04091c621699",
+ "isRoot": True,
+ "name": "root",
+ "public": True,
+ "totalDownloadCount": int,
+ "totalSize": 364,
+ "type": "folder"
+ },
+ "id": r"re:\w{8}-\w{4}-\w{4}-\w{4}-\w{12}",
+ "link": r"re:https://file17.gofile.io/download/.+\.png",
+ "md5": "re:[0-9a-f]{32}",
+ "mimetype": "image/png",
+ "name": "test-テスト-%22&!.png",
+ "num": int,
+ "parentFolder": "45cd45d1-dc78-4553-923f-04091c621699",
+ "serverChoosen": "file17",
+ "size": 182,
+ "thumbnail": r"re:https://store3.gofile.io/download/.+\.png",
+ "type": "file"
+ },
+ }),
+ ("https://gofile.io/d/346429cc-aee4-4996-be3f-e58616fe231f", {
+ "content": "0c8768055e4e20e7c7259608b67799171b691140",
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.content_id = match.group(1)
+
+ def items(self):
+ recursive = self.config("recursive")
+
+ token = self.config("api-token")
+ if token is None:
+ self.log.debug("creating temporary account")
+ token = self._create_account()
+ self.session.cookies.set("accountToken", token, domain=".gofile.io")
+
+ folder = self._get_content(self.content_id, token)
+ yield Message.Directory, folder
+
+ num = 0
+ contents = folder.pop("contents")
+ for content_id in folder["childs"]:
+ content = contents[content_id]
+ content["folder"] = folder
+
+ if content["type"] == "file":
+ num += 1
+ content["num"] = num
+ content["filename"], _, content["extension"] = \
+ content["name"].rpartition(".")
+ yield Message.Url, content["link"], content
+
+ elif content["type"] == "folder":
+ if recursive:
+ url = "https://gofile.io/d/" + content["id"]
+ content["_extractor"] = GofileFolderExtractor
+ yield Message.Queue, url, content
+
+ else:
+ self.log.debug("'%s' is of unknown type (%s)",
+ content.get("name"), content["type"])
+
+ @memcache()
+ def _create_account(self):
+ return self._api_request("createAccount")["token"]
+
+ def _get_content(self, content_id, token):
+ return self._api_request("getContent", {
+ "contentId" : content_id,
+ "token" : token,
+ "websiteToken": "websiteToken",
+ })
+
+ def _api_request(self, endpoint, params=None):
+ response = self.request(
+ "https://api.gofile.io/" + endpoint, params=params).json()
+
+ if response["status"] != "ok":
+ if response["status"] == "error-notFound":
+ raise exception.NotFoundError("content")
+ raise exception.StopExtraction(
+ "%s failed (Status: %s)", endpoint, response["status"])
+
+ return response["data"]
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index 34eaaab..ca7e692 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -28,8 +28,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
("https://hitomi.la/galleries/867789.html", {
"pattern": r"https://[a-c]a\.hitomi\.la/webp/\d+/\d+"
r"/[0-9a-f]{64}\.webp",
- "keyword": "4b584d09d535694d7d757c47daf5c15d116420d2",
- "options": (("metadata", True),),
+ "keyword": "86af5371f38117a07407f11af689bdd460b09710",
"count": 16,
}),
# download test
@@ -77,23 +76,18 @@ class HitomiGalleryExtractor(GalleryExtractor):
def metadata(self, page):
self.info = info = json.loads(page.partition("=")[2])
+ iget = info.get
- data = self._data_from_gallery_info(info)
- if self.config("metadata", False):
- data.update(self._data_from_gallery_page(info))
- return data
-
- def _data_from_gallery_info(self, info):
- language = info.get("language")
+ language = iget("language")
if language:
language = language.capitalize()
- date = info.get("date")
+ date = iget("date")
if date:
date += ":00"
tags = []
- for tinfo in info.get("tags") or ():
+ for tinfo in iget("tags") or ():
tag = string.capwords(tinfo["tag"])
if tinfo.get("female"):
tag += " ♀"
@@ -109,35 +103,10 @@ class HitomiGalleryExtractor(GalleryExtractor):
"lang" : util.language_to_code(language),
"date" : text.parse_datetime(date, "%Y-%m-%d %H:%M:%S%z"),
"tags" : tags,
- }
-
- def _data_from_gallery_page(self, info):
- url = "{}/galleries/{}.html".format(self.root, info["id"])
-
- # follow redirects
- while True:
- response = self.request(url, fatal=False)
- if b"<title>Redirect</title>" not in response.content:
- break
- url = text.extract(
- response.text, 'http-equiv="refresh" content="', '"',
- )[0].partition("=")[2]
-
- if response.status_code >= 400:
- return {}
-
- def prep(value):
- return [
- text.unescape(string.capwords(v))
- for v in text.extract_iter(value or "", '.html">', '<')
- ]
-
- extr = text.extract_from(response.text)
- return {
- "artist" : prep(extr('<h2>', '</h2>')),
- "group" : prep(extr('<td>Group</td><td>', '</td>')),
- "parody" : prep(extr('<td>Series</td><td>', '</td>')),
- "characters": prep(extr('<td>Characters</td><td>', '</td>')),
+ "artist" : [o["artist"] for o in iget("artists") or ()],
+ "group" : [o["group"] for o in iget("groups") or ()],
+ "parody" : [o["parody"] for o in iget("parodys") or ()],
+ "characters": [o["character"] for o in iget("characters") or ()]
}
def images(self, _):
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 20a4c1a..e07b64e 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright 2018-2020 Leonardo Taccari
-# Copyright 2018-2021 Mike Fährmann
+# Copyright 2018-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -43,6 +43,7 @@ class InstagramExtractor(Extractor):
self.login()
data = self.metadata()
videos = self.config("videos", True)
+ previews = self.config("previews", False)
video_headers = {"User-Agent": "Mozilla/5.0"}
for post in self.posts():
@@ -56,14 +57,18 @@ class InstagramExtractor(Extractor):
yield Message.Directory, post
for file in files:
- url = file.get("video_url")
- if not url:
- url = file["display_url"]
- elif not videos:
- continue
- else:
- file["_http_headers"] = video_headers
file.update(post)
+
+ url = file.get("video_url")
+ if url:
+ if videos:
+ file["_http_headers"] = video_headers
+ text.nameext_from_url(url, file)
+ yield Message.Url, url, file
+ if not previews:
+ continue
+
+ url = file["display_url"]
yield Message.Url, url, text.nameext_from_url(url, file)
def metadata(self):
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 9537263..7287c38 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -42,6 +42,7 @@ class KemonopartyExtractor(Extractor):
r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
find_hash = re.compile("/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})").match
generators = self._build_file_generators(self.config("files"))
+ duplicates = self.config("duplicates")
comments = self.config("comments")
username = dms = None
@@ -84,7 +85,7 @@ class KemonopartyExtractor(Extractor):
match = find_hash(url)
if match:
post["hash"] = hash = match.group(1)
- if hash in hashes:
+ if hash in hashes and not duplicates:
self.log.debug("Skipping %s (duplicate)", url)
continue
hashes.add(hash)
@@ -273,6 +274,11 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
("https://kemono.party/patreon/user/4158582/post/32099982", {
"count": 2,
}),
+ # allow duplicates (#2440)
+ ("https://kemono.party/patreon/user/4158582/post/32099982", {
+ "options": (("duplicates", True),),
+ "count": 3,
+ }),
# DMs (#2008)
("https://kemono.party/patreon/user/34134344/post/38129255", {
"options": (("dms", True),),
@@ -323,8 +329,9 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
}),
(("https://kemono.party/discord"
"/server/256559665620451329/channel/462437519519383555#"), {
- "pattern": r"https://kemono\.party/data/attachments/discord"
- r"/256559665620451329/\d+/\d+/.+",
+ "pattern": r"https://kemono\.party/data/("
+ r"e3/77/e377e3525164559484ace2e64425b0cec1db08.*\.png|"
+ r"51/45/51453640a5e0a4d23fbf57fb85390f9c5ec154.*\.gif)",
"count": ">= 2",
}),
# 'inline' files
diff --git a/gallery_dl/extractor/kissgoddess.py b/gallery_dl/extractor/kissgoddess.py
index 85ec806..6e66772 100644
--- a/gallery_dl/extractor/kissgoddess.py
+++ b/gallery_dl/extractor/kissgoddess.py
@@ -20,7 +20,7 @@ class KissgoddessGalleryExtractor(GalleryExtractor):
test = ("https://kissgoddess.com/album/18285.html", {
"pattern": r"https://pic\.kissgoddess\.com"
r"/gallery/16473/18285/s/\d+\.jpg",
- "count": 8,
+ "count": 19,
"keyword": {
"gallery_id": 18285,
"title": "[Young Champion Extra] 2016.02 No.03 菜乃花 安枝瞳 葉月あや",
@@ -45,6 +45,8 @@ class KissgoddessGalleryExtractor(GalleryExtractor):
while page:
for url in text.extract_iter(page, "<img src='", "'"):
yield url, None
+ for url in text.extract_iter(page, "<img data-original='", "'"):
+ yield url, None
pnum += 1
url = "{}/album/{}_{}.html".format(
diff --git a/gallery_dl/extractor/mangasee.py b/gallery_dl/extractor/mangasee.py
index 1b3dd18..0b0da65 100644
--- a/gallery_dl/extractor/mangasee.py
+++ b/gallery_dl/extractor/mangasee.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021 Mike Fährmann
+# Copyright 2021-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -64,7 +64,7 @@ class MangaseeChapterExtractor(MangaseeBase, ChapterExtractor):
self.slug = extr('vm.IndexName = "', '"')
data = self._transform_chapter(data)
- data["manga"] = extr('vm.SeriesName = "', '"')
+ data["manga"] = text.unescape(extr('vm.SeriesName = "', '"'))
return data
def images(self, page):
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 6d0e94b..e9fde97 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -113,10 +113,16 @@ class NewgroundsExtractor(Extractor):
if self.flash:
url += "/format/flash"
- response = self.request(url, fatal=False)
- if response.status_code >= 400:
- return {}
- page = response.text
+ with self.request(url, fatal=False) as response:
+ if response.status_code >= 400:
+ return {}
+ page = response.text
+
+ pos = page.find('id="adults_only"')
+ if pos >= 0:
+ msg = text.extract(page, 'class="highlight">', '<', pos)[0]
+ self.log.warning('"%s"', msg)
+
extr = text.extract_from(page)
data = extract_data(extr, post_url)
@@ -230,16 +236,20 @@ class NewgroundsExtractor(Extractor):
yield fmt[1][0]["src"]
def _pagination(self, kind):
- root = self.user_root
+ url = "{}/{}".format(self.user_root, kind)
+ params = {
+ "page": 1,
+ "isAjaxRequest": "1",
+ }
headers = {
- "Accept": "application/json, text/javascript, */*; q=0.01",
+ "Referer": url,
"X-Requested-With": "XMLHttpRequest",
- "Referer": root,
}
- url = "{}/{}/page/1".format(root, kind)
while True:
- with self.request(url, headers=headers, fatal=False) as response:
+ with self.request(
+ url, params=params, headers=headers,
+ fatal=False) as response:
try:
data = response.json()
except ValueError:
@@ -250,14 +260,17 @@ class NewgroundsExtractor(Extractor):
msg = ", ".join(text.unescape(e) for e in data["errors"])
raise exception.StopExtraction(msg)
- for year in data["sequence"]:
- for item in data["years"][str(year)]["items"]:
+ for year, items in data["items"].items():
+ for item in items:
page_url = text.extract(item, 'href="', '"')[0]
- yield text.urljoin(root, page_url)
+ if page_url[0] == "/":
+ page_url = self.root + page_url
+ yield page_url
- if not data["more"]:
+ more = data.get("load_more")
+ if not more or len(more) < 8:
return
- url = text.urljoin(root, data["more"])
+ params["page"] += 1
class NewgroundsImageExtractor(NewgroundsExtractor):
@@ -293,7 +306,12 @@ class NewgroundsImageExtractor(NewgroundsExtractor):
("https://www.newgrounds.com/art/view/sailoryon/yon-dream-buster", {
"url": "84eec95e663041a80630df72719f231e157e5f5d",
"count": 2,
- })
+ }),
+ # "adult" rated (#2456)
+ ("https://www.newgrounds.com/art/view/kekiiro/red", {
+ "options": (("username", None),),
+ "count": 1,
+ }),
)
def __init__(self, match):
@@ -360,6 +378,11 @@ class NewgroundsMediaExtractor(NewgroundsExtractor):
"pattern": r"https://uploads\.ungrounded\.net/alternate/1482000"
r"/1482860_alternate_102516\.720p\.mp4\?\d+",
}),
+ # "adult" rated (#2456)
+ ("https://www.newgrounds.com/portal/view/717744", {
+ "options": (("username", None),),
+ "count": 1,
+ }),
)
def __init__(self, match):
@@ -454,25 +477,28 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor):
)
def _pagination(self, kind):
- num = 1
+ url = "{}/favorites/{}".format(self.user_root, kind)
+ params = {
+ "page": 1,
+ "isAjaxRequest": "1",
+ }
headers = {
- "Accept": "application/json, text/javascript, */*; q=0.01",
+ "Referer": url,
"X-Requested-With": "XMLHttpRequest",
- "Referer": self.user_root,
}
while True:
- url = "{}/favorites/{}/{}".format(self.user_root, kind, num)
- response = self.request(url, headers=headers)
+ response = self.request(url, params=params, headers=headers)
if response.history:
return
- favs = self._extract_favorites(response.text)
+ data = response.json()
+ favs = self._extract_favorites(data.get("component") or "")
yield from favs
if len(favs) < 24:
return
- num += 1
+ params["page"] += 1
def _extract_favorites(self, page):
return [
diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py
index 25344e8..2079b73 100644
--- a/gallery_dl/extractor/pinterest.py
+++ b/gallery_dl/extractor/pinterest.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2021 Mike Fährmann
+# Copyright 2016-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -20,8 +20,8 @@ BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.[\w.]+"
class PinterestExtractor(Extractor):
"""Base class for pinterest extractors"""
category = "pinterest"
- filename_fmt = "{category}_{id}.{extension}"
- archive_fmt = "{id}"
+ filename_fmt = "{category}_{id}{media_id:?_//}.{extension}"
+ archive_fmt = "{id}{media_id}"
root = "https://www.pinterest.com"
def __init__(self, match):
@@ -35,28 +35,39 @@ class PinterestExtractor(Extractor):
yield Message.Directory, data
for pin in self.pins():
+ pin.update(data)
- try:
- media = self._media_from_pin(pin)
- except Exception:
- self.log.debug("Unable to fetch download URL for pin %s",
- pin.get("id"))
- continue
+ carousel_data = pin.get("carousel_data")
+ if carousel_data:
+ for num, slot in enumerate(carousel_data["carousel_slots"], 1):
+ slot["media_id"] = slot.pop("id")
+ pin.update(slot)
+ pin["num"] = num
+ size, image = next(iter(slot["images"].items()))
+ url = image["url"].replace("/" + size + "/", "/originals/")
+ yield Message.Url, url, text.nameext_from_url(url, pin)
- if not videos and media.get("duration") is not None:
- continue
+ else:
+ try:
+ media = self._media_from_pin(pin)
+ except Exception:
+ self.log.debug("Unable to fetch download URL for pin %s",
+ pin.get("id"))
+ continue
- pin.update(data)
- pin.update(media)
- url = media["url"]
- text.nameext_from_url(url, pin)
+ if videos or media.get("duration") is None:
+ pin.update(media)
+ pin["num"] = 0
+ pin["media_id"] = ""
+
+ url = media["url"]
+ text.nameext_from_url(url, pin)
- if pin["extension"] == "m3u8":
- url = "ytdl:" + url
- pin["extension"] = "mp4"
- pin["_ytdl_extra"] = {"protocol": "m3u8_native"}
+ if pin["extension"] == "m3u8":
+ url = "ytdl:" + url
+ pin["extension"] = "mp4"
- yield Message.Url, url, pin
+ yield Message.Url, url, pin
def metadata(self):
"""Return general metadata"""
@@ -124,7 +135,8 @@ class PinterestBoardExtractor(PinterestExtractor):
subcategory = "board"
directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}")
archive_fmt = "{board[id]}_{id}"
- pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/(?!_saved)([^/?#&]+)/?$"
+ pattern = (BASE_PATTERN + r"/(?!pin/)([^/?#&]+)"
+ "/(?!_saved|_created)([^/?#&]+)/?$")
test = (
("https://www.pinterest.com/g1952849/test-/", {
"pattern": r"https://i\.pinimg\.com/originals/",
@@ -192,6 +204,28 @@ class PinterestUserExtractor(PinterestExtractor):
yield Message.Queue, self.root + url, board
+class PinterestCreatedExtractor(PinterestExtractor):
+ """Extractor for a user's created pins"""
+ subcategory = "created"
+ directory_fmt = ("{category}", "{user}")
+ pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/_created/?$"
+ test = ("https://www.pinterest.com/amazon/_created", {
+ "pattern": r"https://i\.pinimg\.com/originals/[0-9a-f]{2}"
+ r"/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.jpg",
+ "count": 10,
+ })
+
+ def __init__(self, match):
+ PinterestExtractor.__init__(self, match)
+ self.user = text.unquote(match.group(1))
+
+ def metadata(self):
+ return {"user": self.user}
+
+ def pins(self):
+ return self.api.user_activity_pins(self.user)
+
+
class PinterestSectionExtractor(PinterestExtractor):
"""Extractor for board sections on pinterest.com"""
subcategory = "section"
@@ -385,6 +419,16 @@ class PinterestAPI():
options = {"board_id": board_id, "add_vase": True}
return self._pagination("BoardRelatedPixieFeed", options)
+ def user_activity_pins(self, user):
+ """Yield pins created by 'user'"""
+ options = {
+ "exclude_add_pin_rep": True,
+ "field_set_key" : "grid_item",
+ "is_own_profile_pins": False,
+ "username" : user,
+ }
+ return self._pagination("UserActivityPins", options)
+
def search(self, query):
"""Yield pins from searches"""
options = {"query": query, "scope": "pins", "rs": "typed"}
diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py
index 965391c..2af917d 100644
--- a/gallery_dl/extractor/skeb.py
+++ b/gallery_dl/extractor/skeb.py
@@ -8,6 +8,7 @@
from .common import Extractor, Message
from .. import text
+import itertools
class SkebExtractor(Extractor):
@@ -22,7 +23,6 @@ class SkebExtractor(Extractor):
Extractor.__init__(self, match)
self.user_name = match.group(1)
self.thumbnails = self.config("thumbnails", False)
- self.sent_requests = self.config("sent-requests", False)
def items(self):
for user_name, post_num in self.posts():
@@ -35,18 +35,18 @@ class SkebExtractor(Extractor):
def posts(self):
"""Return post number"""
- def _pagination(self):
- url = "{}/api/users/{}/works".format(self.root, self.user_name)
- params = {"role": "creator", "sort": "date", "offset": 0}
+ def _pagination(self, url, params):
headers = {"Referer": self.root, "Authorization": "Bearer null"}
- do_requests = self.sent_requests
+ params["offset"] = 0
while True:
posts = self.request(url, params=params, headers=headers).json()
for post in posts:
- post_num = post["path"].rpartition("/")[2]
- user_name = post["path"].split("/")[1][1:]
+ parts = post["path"].split("/")
+ user_name = parts[1][1:]
+ post_num = parts[3]
+
if post["private"]:
self.log.debug("Skipping @%s/%s (private)",
user_name, post_num)
@@ -54,13 +54,7 @@ class SkebExtractor(Extractor):
yield user_name, post_num
if len(posts) < 30:
- if do_requests:
- params["offset"] = 0
- params['role'] = "client"
- do_requests = False
- continue
- else:
- return
+ return
params["offset"] += 30
def _get_post_data(self, user_name, post_num):
@@ -134,6 +128,54 @@ class SkebPostExtractor(SkebExtractor):
"""Extractor for a single skeb post"""
subcategory = "post"
pattern = r"(?:https?://)?skeb\.jp/@([^/?#]+)/works/(\d+)"
+ test = ("https://skeb.jp/@kanade_cocotte/works/38", {
+ "count": 2,
+ "keyword": {
+ "anonymous": False,
+ "body": "re:はじめまして。私はYouTubeにてVTuberとして活動をしている湊ラ",
+ "client": {
+ "avatar_url": "https://pbs.twimg.com/profile_images"
+ "/1471184042791895042/f0DcWFGl.jpg",
+ "header_url": None,
+ "id": 1196514,
+ "name": "湊ラギ",
+ "screen_name": "minato_ragi",
+ },
+ "completed_at": "2022-02-27T14:03:45.442Z",
+ "content_category": "preview",
+ "creator": {
+ "avatar_url": "https://pbs.twimg.com/profile_images"
+ "/1225470417063645184/P8_SiB0V.jpg",
+ "header_url": "https://pbs.twimg.com/profile_banners"
+ "/71243217/1647958329/1500x500",
+ "id": 159273,
+ "name": "イチノセ奏",
+ "screen_name": "kanade_cocotte",
+ },
+ "date": "dt:2022-02-27 14:03:45",
+ "file_id": int,
+ "file_url": str,
+ "genre": "art",
+ "nsfw": False,
+ "original": {
+ "byte_size": int,
+ "duration": None,
+ "extension": "re:psd|png",
+ "frame_rate": None,
+ "height": 3727,
+ "is_movie": False,
+ "width": 2810,
+ },
+ "post_num": "38",
+ "post_url": "https://skeb.jp/@kanade_cocotte/works/38",
+ "source_body": None,
+ "source_thanks": None,
+ "tags": list,
+ "thanks": None,
+ "translated_body": False,
+ "translated_thanks": None,
+ }
+ })
def __init__(self, match):
SkebExtractor.__init__(self, match)
@@ -146,7 +188,23 @@ class SkebPostExtractor(SkebExtractor):
class SkebUserExtractor(SkebExtractor):
"""Extractor for all posts from a skeb user"""
subcategory = "user"
- pattern = r"(?:https?://)?skeb\.jp/@([^/?#]+)"
+ pattern = r"(?:https?://)?skeb\.jp/@([^/?#]+)/?$"
+ test = ("https://skeb.jp/@kanade_cocotte", {
+ "pattern": r"https://skeb\.imgix\.net/uploads/origins/[\w-]+"
+ r"\?bg=%23fff&auto=format&txtfont=bold&txtshad=70"
+ r"&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150"
+ r"&txt=SAMPLE&w=800&s=\w+",
+ "range": "1-5",
+ })
def posts(self):
- return self._pagination()
+ url = "{}/api/users/{}/works".format(self.root, self.user_name)
+
+ params = {"role": "creator", "sort": "date"}
+ posts = self._pagination(url, params)
+
+ if self.config("sent-requests", False):
+ params = {"role": "client", "sort": "date"}
+ posts = itertools.chain(posts, self._pagination(url, params))
+
+ return posts
diff --git a/gallery_dl/extractor/telegraph.py b/gallery_dl/extractor/telegraph.py
new file mode 100644
index 0000000..8e9bf2c
--- /dev/null
+++ b/gallery_dl/extractor/telegraph.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractor for https://telegra.ph/"""
+
+from .common import GalleryExtractor
+from .. import text
+
+
+class TelegraphGalleryExtractor(GalleryExtractor):
+ """Extractor for articles from telegra.ph"""
+
+ category = "telegraph"
+ root = "https://telegra.ph"
+ directory_fmt = ("{category}", "{slug}")
+ filename_fmt = "{num_formatted}_{filename}.{extension}"
+ archive_fmt = "{slug}_{num}"
+ pattern = r"(?:https?://)(?:www\.)??telegra\.ph(/[^/?#]+)"
+ test = (
+ ("https://telegra.ph/Telegraph-Test-03-28", {
+ "pattern": r"https://telegra\.ph/file/[0-9a-f]+\.png",
+ "keyword": {
+ "author": "mikf",
+ "caption": r"re:test|",
+ "count": 2,
+ "date": "dt:2022-03-28 16:01:36",
+ "description": "Just a test",
+ "post_url": "https://telegra.ph/Telegraph-Test-03-28",
+ "slug": "Telegraph-Test-03-28",
+ "title": "Telegra.ph Test",
+ },
+ }),
+ ("https://telegra.ph/森-03-28", {
+ "pattern": "https://telegra.ph/file/3ea79d23b0dd0889f215a.jpg",
+ "count": 1,
+ "keyword": {
+ "author": "&",
+ "caption": "kokiri",
+ "count": 1,
+ "date": "dt:2022-03-28 16:31:26",
+ "description": "コキリの森",
+ "extension": "jpg",
+ "filename": "3ea79d23b0dd0889f215a",
+ "num": 1,
+ "num_formatted": "1",
+ "post_url": "https://telegra.ph/森-03-28",
+ "slug": "森-03-28",
+ "title": '"森"',
+ "url": "https://telegra.ph/file/3ea79d23b0dd0889f215a.jpg",
+ },
+ }),
+ )
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+ data = {
+ "title": text.unescape(extr(
+ 'property="og:title" content="', '"')),
+ "description": text.unescape(extr(
+ 'property="og:description" content="', '"')),
+ "date": text.parse_datetime(extr(
+ 'property="article:published_time" content="', '"'),
+ "%Y-%m-%dT%H:%M:%S%z"),
+ "author": text.unescape(extr(
+ 'property="article:author" content="', '"')),
+ "post_url": text.unescape(extr(
+ 'rel="canonical" href="', '"')),
+ }
+ data["slug"] = data["post_url"][19:]
+ return data
+
+ def images(self, page):
+ figures = tuple(text.extract_iter(page, "<figure>", "</figure>"))
+ num_zeroes = len(str(len(figures)))
+ num = 0
+
+ result = []
+ for figure in figures:
+ src, pos = text.extract(figure, 'src="', '"')
+ if src.startswith("/embed/"):
+ continue
+ caption, pos = text.extract(figure, "<figcaption>", "<", pos)
+ url = self.root + src
+ num += 1
+
+ result.append((url, {
+ "url" : url,
+ "caption" : text.unescape(caption),
+ "num" : num,
+ "num_formatted": str(num).zfill(num_zeroes),
+ }))
+ return result
diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py
index ec8ab35..355ca21 100644
--- a/gallery_dl/extractor/twibooru.py
+++ b/gallery_dl/extractor/twibooru.py
@@ -36,8 +36,9 @@ class TwibooruExtractor(BooruExtractor):
post["date"] = text.parse_datetime(
post["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
- name, sep, rest = post["name"].rpartition(".")
- post["filename"] = name if sep else rest
+ if "name" in post:
+ name, sep, rest = post["name"].rpartition(".")
+ post["filename"] = name if sep else rest
class TwibooruPostExtractor(TwibooruExtractor):
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 6d51834..4c46170 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -15,7 +15,7 @@ import json
BASE_PATTERN = (
r"(?:https?://)?(?:www\.|mobile\.)?"
- r"(?:twitter\.com|nitter\.net)"
+ r"(?:(?:fx)?twitter\.com|nitter\.net)"
)
@@ -217,23 +217,24 @@ class TwitterExtractor(Extractor):
if "legacy" in tweet:
tweet = tweet["legacy"]
+ tget = tweet.get
entities = tweet["entities"]
tdata = {
"tweet_id" : text.parse_int(tweet["id_str"]),
"retweet_id" : text.parse_int(
- tweet.get("retweeted_status_id_str")),
+ tget("retweeted_status_id_str")),
"quote_id" : text.parse_int(
- tweet.get("quoted_status_id_str")),
+ tget("quoted_status_id_str")),
"reply_id" : text.parse_int(
- tweet.get("in_reply_to_status_id_str")),
+ tget("in_reply_to_status_id_str")),
"date" : text.parse_datetime(
tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
"user" : user,
"lang" : tweet["lang"],
- "favorite_count": tweet["favorite_count"],
- "quote_count" : tweet["quote_count"],
- "reply_count" : tweet["reply_count"],
- "retweet_count" : tweet["retweet_count"],
+ "favorite_count": tget("favorite_count"),
+ "quote_count" : tget("quote_count"),
+ "reply_count" : tget("reply_count"),
+ "retweet_count" : tget("retweet_count"),
}
hashtags = entities.get("hashtags")
@@ -248,7 +249,7 @@ class TwitterExtractor(Extractor):
"nick": u["name"],
} for u in mentions]
- content = tweet["full_text"]
+ content = tget("full_text") or tget("text") or ""
urls = entities.get("urls")
if urls:
for url in urls:
@@ -269,33 +270,36 @@ class TwitterExtractor(Extractor):
return tdata
def _transform_user(self, user):
+ uid = user.get("rest_id") or user["id_str"]
+
try:
- return self._user_cache[user.get("rest_id") or user["id_str"]]
+ return self._user_cache[uid]
except KeyError:
pass
- uid = user.get("rest_id") or user["id_str"]
if "legacy" in user:
user = user["legacy"]
+
+ uget = user.get
entities = user["entities"]
self._user_cache[uid] = udata = {
"id" : text.parse_int(uid),
"name" : user["screen_name"],
"nick" : user["name"],
- "location" : user["location"],
+ "location" : uget("location"),
"date" : text.parse_datetime(
- user["created_at"], "%a %b %d %H:%M:%S %z %Y"),
- "verified" : user.get("verified", False),
- "profile_banner" : user.get("profile_banner_url", ""),
- "profile_image" : user.get(
+ uget("created_at"), "%a %b %d %H:%M:%S %z %Y"),
+ "verified" : uget("verified", False),
+ "profile_banner" : uget("profile_banner_url", ""),
+ "profile_image" : uget(
"profile_image_url_https", "").replace("_normal.", "."),
- "favourites_count": user["favourites_count"],
- "followers_count" : user["followers_count"],
- "friends_count" : user["friends_count"],
- "listed_count" : user["listed_count"],
- "media_count" : user["media_count"],
- "statuses_count" : user["statuses_count"],
+ "favourites_count": uget("favourites_count"),
+ "followers_count" : uget("followers_count"),
+ "friends_count" : uget("friends_count"),
+ "listed_count" : uget("listed_count"),
+ "media_count" : uget("media_count"),
+ "statuses_count" : uget("statuses_count"),
}
descr = user["description"]
@@ -653,6 +657,11 @@ class TwitterTweetExtractor(TwitterExtractor):
("https://twitter.com/i/web/status/1486373748911575046", {
"count": 4,
}),
+ # age-restricted (#2354)
+ ("https://twitter.com/mightbecursed/status/1492954264909479936", {
+ "options": (("syndication", True),),
+ "count": 1,
+ }),
)
def __init__(self, match):
@@ -770,6 +779,7 @@ class TwitterAPI():
}
self._nsfw_warning = True
+ self._syndication = extractor.config("syndication")
self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode
self._user = None
@@ -1153,9 +1163,10 @@ class TwitterAPI():
elif esw("conversationthread-"):
tweets.extend(entry["content"]["items"])
elif esw("tombstone-"):
- self._report_tombstone(
- entry,
- entry["content"]["itemContent"]["tombstoneInfo"])
+ item = entry["content"]["itemContent"]
+ item["tweet_results"] = \
+ {"result": {"tombstone": item["tombstoneInfo"]}}
+ tweets.append(entry)
elif esw("cursor-bottom-"):
cursor = entry["content"]
if not cursor.get("stopOnEmptyResponse", True):
@@ -1168,8 +1179,10 @@ class TwitterAPI():
tweet = ((entry.get("content") or entry["item"])
["itemContent"]["tweet_results"]["result"])
if "tombstone" in tweet:
- self._report_tombstone(entry, tweet["tombstone"])
- continue
+ tweet = self._process_tombstone(
+ entry, tweet["tombstone"])
+ if not tweet:
+ continue
if "tweet" in tweet:
tweet = tweet["tweet"]
legacy = tweet["legacy"]
@@ -1259,10 +1272,45 @@ class TwitterAPI():
return
variables["cursor"] = cursor
- def _report_tombstone(self, entry, tombstone):
+ def _process_tombstone(self, entry, tombstone):
+ # Handle a tombstone entry: return replacement tweet data obtained via
+ # _syndication_tweet() for age-restricted tweets when the 'syndication'
+ # option is set; otherwise log the entry as skipped and return None.
text = (tombstone.get("richText") or tombstone["text"])["text"]
- if text.startswith("Age-restricted") and self._nsfw_warning:
- self.extractor.log.warning(text)
- self._nsfw_warning = False
- self.extractor.log.debug(
- "Skipping %s (%s)", entry["entryId"].rpartition("-")[2], text)
+ # the tweet ID is the part of 'entryId' after its last "-"
+ tweet_id = entry["entryId"].rpartition("-")[2]
+
+ if text.startswith("Age-restricted"):
+ if self._syndication:
+ return self._syndication_tweet(tweet_id)
+ elif self._nsfw_warning:
+ # emit the age-restriction warning only once per API instance
+ self._nsfw_warning = False
+ self.extractor.log.warning('"%s"', text)
+
+ # reached for every tombstone that was not replaced above;
+ # the method then ends without an explicit return value
+ self.extractor.log.debug("Skipping %s (\"%s\")", tweet_id, text)
+
+    def _syndication_tweet(self, tweet_id):
+        """Request 'tweet_id' from the syndication endpoint and adapt it
+
+        The JSON response is modified in place (placeholder user fields,
+        an 'extended_entities' entry built from 'video' or 'photos') and
+        returned wrapped in a dict with 'rest_id', 'legacy' and 'user'.
+        NOTE(review): presumably this mirrors the shape of regular
+        timeline results so the usual processing applies - confirm.
+        """
+        url = "https://cdn.syndication.twimg.com/tweet?id=" + tweet_id
+        tweet = self.extractor.request(url).json()
+
+        # _transform_user() reads 'description' and 'entities'
+        # unconditionally, so provide empty placeholders for them
+        user = tweet["user"]
+        user["description"] = ""
+        user["entities"] = {"description": {}}
+
+        if "video" in tweet:
+            video = tweet["video"]
+            variants = video["variants"]
+            # discard all but the last variant (in place) and expose
+            # its 'src' value under the 'url' key as well
+            del variants[:-1]
+            remaining = variants[0]
+            remaining["url"] = remaining["src"]
+            media = [{
+                "video_info"   : video,
+                "original_info": {"width" : 0, "height": 0},
+            }]
+            tweet["extended_entities"] = {"media": media}
+        elif "photos" in tweet:
+            photos = tweet["photos"]
+            for photo in photos:
+                photo["media_url_https"] = photo["url"]
+                photo["original_info"] = {
+                    "width" : photo["width"],
+                    "height": photo["height"],
+                }
+            tweet["extended_entities"] = {"media": photos}
+
+        return {
+            "rest_id": tweet["id_str"],
+            "legacy" : tweet,
+            "user"   : user,
+        }
diff --git a/gallery_dl/extractor/unsplash.py b/gallery_dl/extractor/unsplash.py
index 2405dc3..6036322 100644
--- a/gallery_dl/extractor/unsplash.py
+++ b/gallery_dl/extractor/unsplash.py
@@ -193,7 +193,7 @@ class UnsplashSearchExtractor(UnsplashExtractor):
"""Extractor for unsplash search results"""
subcategory = "search"
pattern = BASE_PATTERN + r"/s/photos/([^/?#]+)(?:\?([^/?#]+))?"
- test = ("https://unsplash.com/s/photos/nature", {
+ test = ("https://unsplash.com/s/photos/hair-style", {
"pattern": r"https://images\.unsplash\.com/((flagged/)?photo-\d+-\w+"
r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$",
"range": "1-30",
@@ -206,7 +206,7 @@ class UnsplashSearchExtractor(UnsplashExtractor):
def photos(self):
url = self.root + "/napi/search/photos"
- params = {"query": text.unquote(self.item)}
+ params = {"query": text.unquote(self.item.replace('-', ' '))}
if self.query:
params.update(text.parse_query(self.query))
return self._pagination(url, params, True)