author    Unit 193 <unit193@unit193.net>  2022-06-28 19:54:18 -0400
committer Unit 193 <unit193@unit193.net>  2022-06-28 19:54:18 -0400
commit    ce35450b5308adab049c5bd99095986d4c607027
tree      f0c2b600f8ef720941bdf615164b942c6c4a5d07 /gallery_dl/extractor
parent    25442ea49f031d4d2df3353dd7e9ad2080e332da

New upstream version 1.22.3 (tag: upstream/1.22.3)
Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--  gallery_dl/extractor/__init__.py         |   2
-rw-r--r--  gallery_dl/extractor/common.py           |  18
-rw-r--r--  gallery_dl/extractor/cyberdrop.py        |   5
-rw-r--r--  gallery_dl/extractor/instagram.py        | 223
-rw-r--r--  gallery_dl/extractor/itaku.py            | 183
-rw-r--r--  gallery_dl/extractor/lolisafe.py         |   2
-rw-r--r--  gallery_dl/extractor/nijie.py            | 194
-rw-r--r--  gallery_dl/extractor/poipiku.py          | 169
-rw-r--r--  gallery_dl/extractor/readcomiconline.py  |   9
-rw-r--r--  gallery_dl/extractor/skeb.py             |  34
-rw-r--r--  gallery_dl/extractor/twitter.py          |  88
-rw-r--r--  gallery_dl/extractor/unsplash.py         |  20
-rw-r--r--  gallery_dl/extractor/vk.py               |   4
-rw-r--r--  gallery_dl/extractor/weibo.py            |  48
14 files changed, 765 insertions(+), 234 deletions(-)
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 6d6c7ee..e273f84 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -64,6 +64,7 @@ modules = [
"inkbunny",
"instagram",
"issuu",
+ "itaku",
"kabeuchi",
"keenspot",
"kemonoparty",
@@ -106,6 +107,7 @@ modules = [
"pixiv",
"pixnet",
"plurk",
+ "poipiku",
"pornhub",
"pururin",
"reactor",
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 9cd9059..5c5e29e 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -256,7 +256,7 @@ class Extractor():
else:
headers["User-Agent"] = self.config("user-agent", (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; "
- "rv:91.0) Gecko/20100101 Firefox/91.0"))
+ "rv:102.0) Gecko/20100101 Firefox/102.0"))
headers["Accept"] = "*/*"
headers["Accept-Language"] = "en-US,en;q=0.5"
headers["Accept-Encoding"] = "gzip, deflate"
@@ -713,16 +713,21 @@ _browser_cookies = {}
HTTP_HEADERS = {
"firefox": (
- ("User-Agent", "Mozilla/5.0 ({}; rv:91.0) "
- "Gecko/20100101 Firefox/91.0"),
+ ("User-Agent", "Mozilla/5.0 ({}; rv:102.0) "
+ "Gecko/20100101 Firefox/102.0"),
("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,"
- "image/avif,*/*;q=0.8"),
+ "image/avif,image/webp,*/*;q=0.8"),
("Accept-Language", "en-US,en;q=0.5"),
- ("Accept-Encoding", "gzip, deflate"),
+ ("Accept-Encoding", "gzip, deflate, br"),
("Referer", None),
+ ("DNT", "1"),
("Connection", "keep-alive"),
("Upgrade-Insecure-Requests", "1"),
("Cookie", None),
+ ("Sec-Fetch-Dest", "empty"),
+ ("Sec-Fetch-Mode", "no-cors"),
+ ("Sec-Fetch-Site", "same-origin"),
+ ("TE", "trailers"),
),
"chrome": (
("Upgrade-Insecure-Requests", "1"),
@@ -755,8 +760,7 @@ SSL_CIPHERS = {
"AES128-GCM-SHA256:"
"AES256-GCM-SHA384:"
"AES128-SHA:"
- "AES256-SHA:"
- "DES-CBC3-SHA"
+ "AES256-SHA"
),
"chrome": (
"TLS_AES_128_GCM_SHA256:"
diff --git a/gallery_dl/extractor/cyberdrop.py b/gallery_dl/extractor/cyberdrop.py
index 1afaac8..7a79eca 100644
--- a/gallery_dl/extractor/cyberdrop.py
+++ b/gallery_dl/extractor/cyberdrop.py
@@ -48,10 +48,11 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor):
files = []
append = files.append
while True:
- url = extr('id="file" href="', '"')
+ url = text.unescape(extr('id="file" href="', '"'))
if not url:
break
- append({"file": text.unescape(url)})
+ append({"file": url,
+ "_fallback": (self.root + url[url.find("/", 8):],)})
return files, {
"album_id" : self.album_id,
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index e536e22..31f5b32 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -82,8 +82,12 @@ class InstagramExtractor(Extractor):
if response.history:
- url = response.request.url
+ url = response.url
if "/accounts/login/" in url:
+ if self._username:
+ self.log.debug("Invalidating cached login session for "
+ "'%s'", self._username)
+ _login_impl.invalidate(self._username)
page = "login"
elif "/challenge/" in url:
page = "challenge"
@@ -161,55 +165,15 @@ class InstagramExtractor(Extractor):
return self._pagination_api(endpoint)
def login(self):
+ self._username = None
if not self._check_cookies(self.cookienames):
username, password = self._get_auth_info()
if username:
- self._update_cookies(self._login_impl(username, password))
+ self._username = username
+ self._update_cookies(_login_impl(self, username, password))
self.session.cookies.set(
"csrftoken", self.csrf_token, domain=self.cookiedomain)
- @cache(maxage=360*24*3600, keyarg=1)
- def _login_impl(self, username, password):
- self.log.info("Logging in as %s", username)
-
- url = self.root + "/accounts/login/"
- page = self.request(url).text
-
- headers = {
- "X-Web-Device-Id" : text.extract(page, '"device_id":"', '"')[0],
- "X-IG-App-ID" : "936619743392459",
- "X-ASBD-ID" : "437806",
- "X-IG-WWW-Claim" : "0",
- "X-Requested-With": "XMLHttpRequest",
- "Referer" : url,
- }
- url = self.root + "/data/shared_data/"
- data = self.request(url, headers=headers).json()
-
- headers["X-CSRFToken"] = data["config"]["csrf_token"]
- headers["X-Instagram-AJAX"] = data["rollout_hash"]
- headers["Origin"] = self.root
- data = {
- "username" : username,
- "enc_password" : "#PWD_INSTAGRAM_BROWSER:0:{}:{}".format(
- int(time.time()), password),
- "queryParams" : "{}",
- "optIntoOneTap" : "false",
- "stopDeletionNonce" : "",
- "trustedDeviceRecords": "{}",
- }
- url = self.root + "/accounts/login/ajax/"
- response = self.request(url, method="POST", headers=headers, data=data)
-
- if not response.json().get("authenticated"):
- raise exception.AuthenticationError()
-
- cget = self.session.cookies.get
- return {
- name: cget(name)
- for name in ("sessionid", "mid", "ig_did")
- }
-
def _parse_post_graphql(self, post):
typename = post["__typename"]
@@ -286,37 +250,51 @@ class InstagramExtractor(Extractor):
return data
def _parse_post_api(self, post):
-
- if "media" in post:
- media = post["media"]
- owner = media["user"]
+ if "items" in post:
+ items = post["items"]
+ reel_id = str(post["id"]).rpartition(":")[2]
data = {
- "post_id" : media["pk"],
- "post_shortcode": shortcode_from_id(media["pk"]),
+ "expires": text.parse_timestamp(post.get("expiring_at")),
+ "post_id": reel_id,
+ "post_shortcode": shortcode_from_id(reel_id),
+ }
+ else:
+ data = {
+ "post_id" : post["pk"],
+ "post_shortcode": post["code"],
+ "likes": post["like_count"],
}
- if "carousel_media" in media:
- post["items"] = media["carousel_media"]
+ caption = post["caption"]
+ data["description"] = caption["text"] if caption else ""
+
+ tags = self._find_tags(data["description"])
+ if tags:
+ data["tags"] = sorted(set(tags))
+
+ location = post.get("location")
+ if location:
+ slug = location["short_name"].replace(" ", "-").lower()
+ data["location_id"] = location["pk"]
+ data["location_slug"] = slug
+ data["location_url"] = "{}/explore/locations/{}/{}/".format(
+ self.root, location["pk"], slug)
+
+ if "carousel_media" in post:
+ items = post["carousel_media"]
data["sidecar_media_id"] = data["post_id"]
data["sidecar_shortcode"] = data["post_shortcode"]
else:
- post["items"] = (media,)
-
- else:
- reel_id = str(post["id"]).rpartition(":")[2]
- owner = post["user"]
- data = {
- "expires" : text.parse_timestamp(post.get("expiring_at")),
- "post_id" : reel_id,
- "post_shortcode": shortcode_from_id(reel_id),
- }
+ items = (post,)
+ owner = post["user"]
data["owner_id"] = owner["pk"]
data["username"] = owner.get("username")
data["fullname"] = owner.get("full_name")
- data["_files"] = files = []
+ data["post_url"] = "{}/p/{}/".format(self.root, data["post_shortcode"])
- for num, item in enumerate(post["items"], 1):
+ data["_files"] = files = []
+ for num, item in enumerate(items, 1):
image = item["image_versions2"]["candidates"][0]
@@ -333,7 +311,8 @@ class InstagramExtractor(Extractor):
media = {
"num" : num,
"date" : text.parse_timestamp(item.get("taken_at") or
- media.get("taken_at")),
+ media.get("taken_at") or
+ post.get("taken_at")),
"media_id" : item["pk"],
"shortcode" : (item.get("code") or
shortcode_from_id(item["pk"])),
@@ -342,6 +321,10 @@ class InstagramExtractor(Extractor):
"width" : media["width"],
"height" : media["height"],
}
+
+ if "expiring_at" in item:
+ media["expires"] = text.parse_timestamp(post["expiring_at"])
+
self._extract_tagged_users(item, media)
files.append(media)
@@ -385,31 +368,6 @@ class InstagramExtractor(Extractor):
"username" : user["username"],
"full_name": user["full_name"]})
- def _extract_shared_data(self, page):
- shared_data, pos = text.extract(
- page, "window._sharedData =", ";</script>")
- additional_data, pos = text.extract(
- page, "window.__additionalDataLoaded(", ");</script>", pos)
-
- data = json.loads(shared_data)
- if additional_data:
- next(iter(data["entry_data"].values()))[0] = \
- json.loads(additional_data.partition(",")[2])
- return data
-
- def _get_edge_data(self, user, key):
- cursor = self.config("cursor")
- if cursor or not key:
- return {
- "edges" : (),
- "page_info": {
- "end_cursor" : cursor,
- "has_next_page": True,
- "_virtual" : True,
- },
- }
- return user[key]
-
def _pagination_graphql(self, query_hash, variables):
cursor = self.config("cursor")
if cursor:
@@ -436,8 +394,7 @@ class InstagramExtractor(Extractor):
def _pagination_api(self, endpoint, params=None):
while True:
data = self._request_api(endpoint, params=params)
- for item in data["items"]:
- yield {"media": item}
+ yield from data["items"]
if not data["more_available"]:
return
@@ -446,7 +403,8 @@ class InstagramExtractor(Extractor):
def _pagination_api_post(self, endpoint, params, post=False):
while True:
data = self._request_api(endpoint, method="POST", data=params)
- yield from data["items"]
+ for item in data["items"]:
+ yield item["media"]
info = data["paging_info"]
if not info["more_available"]:
@@ -567,21 +525,7 @@ class InstagramTagExtractor(InstagramExtractor):
return {"tag": text.unquote(self.item)}
def posts(self):
- url = "{}/explore/tags/{}/".format(self.root, self.item)
- page = self._extract_shared_data(
- self.request(url).text)["entry_data"]["TagPage"][0]
-
- if "data" in page:
- return self._pagination_sections(page["data"]["recent"])
-
- hashtag = page["graphql"]["hashtag"]
- query_hash = "9b498c08113f1e09617a1703c22b2f32"
- variables = {"tag_name": hashtag["name"], "first": 50}
- edge = self._get_edge_data(hashtag, "edge_hashtag_to_media")
- return self._pagination_graphql(query_hash, variables, edge)
-
- def _pagination_sections(self, info):
- endpoint = "/v1/tags/instagram/sections/"
+ endpoint = "/v1/tags/{}/sections/".format(self.item)
data = {
"include_persistent": "0",
"max_id" : None,
@@ -591,29 +535,17 @@ class InstagramTagExtractor(InstagramExtractor):
}
while True:
+ info = self._request_api(endpoint, method="POST", data=data)
+
for section in info["sections"]:
- yield from section["layout_content"]["medias"]
+ for media in section["layout_content"]["medias"]:
+ yield media["media"]
if not info.get("more_available"):
return
data["max_id"] = info["next_max_id"]
data["page"] = info["next_page"]
- info = self._request_api(endpoint, method="POST", data=data)
-
- def _pagination_graphql(self, query_hash, variables, data):
- while True:
- for edge in data["edges"]:
- yield edge["node"]
-
- info = data["page_info"]
- if not info["has_next_page"]:
- return
-
- variables["after"] = self._cursor = info["end_cursor"]
- self.log.debug("Cursor: %s", self._cursor)
- data = self._request_graphql(
- query_hash, variables)["hashtag"]["edge_hashtag_to_media"]
class InstagramPostExtractor(InstagramExtractor):
@@ -812,6 +744,49 @@ class InstagramReelsExtractor(InstagramExtractor):
return self._pagination_api_post(endpoint, data)
+@cache(maxage=360*24*3600, keyarg=1)
+def _login_impl(extr, username, password):
+ extr.log.info("Logging in as %s", username)
+
+ url = extr.root + "/accounts/login/"
+ page = extr.request(url).text
+
+ headers = {
+ "X-Web-Device-Id" : text.extract(page, '"device_id":"', '"')[0],
+ "X-IG-App-ID" : "936619743392459",
+ "X-ASBD-ID" : "437806",
+ "X-IG-WWW-Claim" : "0",
+ "X-Requested-With": "XMLHttpRequest",
+ "Referer" : url,
+ }
+ url = extr.root + "/data/shared_data/"
+ data = extr.request(url, headers=headers).json()
+
+ headers["X-CSRFToken"] = data["config"]["csrf_token"]
+ headers["X-Instagram-AJAX"] = data["rollout_hash"]
+ headers["Origin"] = extr.root
+ data = {
+ "username" : username,
+ "enc_password" : "#PWD_INSTAGRAM_BROWSER:0:{}:{}".format(
+ int(time.time()), password),
+ "queryParams" : "{}",
+ "optIntoOneTap" : "false",
+ "stopDeletionNonce" : "",
+ "trustedDeviceRecords": "{}",
+ }
+ url = extr.root + "/accounts/login/ajax/"
+ response = extr.request(url, method="POST", headers=headers, data=data)
+
+ if not response.json().get("authenticated"):
+ raise exception.AuthenticationError()
+
+ cget = extr.session.cookies.get
+ return {
+ name: cget(name)
+ for name in ("sessionid", "mid", "ig_did")
+ }
+
+
def id_from_shortcode(shortcode):
return util.bdecode(shortcode, _ALPHABET)
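`_login_impl` is hoisted from a method to a module-level function so that its `@cache(..., keyarg=1)` entry, keyed on the username, can be dropped via `_login_impl.invalidate(self._username)` whenever Instagram redirects back to the login page. A minimal in-memory sketch of such a decorator (assumption: gallery-dl's real `cache()` also persists entries to disk between runs):

```python
import functools
import time

def cache(maxage, keyarg=0):
    # Memoize by one positional argument, with per-key invalidation
    def decorator(func):
        store = {}
        @functools.wraps(func)
        def wrapper(*args):
            key = args[keyarg]
            value, expires = store.get(key, (None, 0))
            if expires <= time.time():
                value = func(*args)
                store[key] = (value, time.time() + maxage)
            return value
        wrapper.invalidate = lambda key: store.pop(key, None)
        return wrapper
    return decorator
```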
diff --git a/gallery_dl/extractor/itaku.py b/gallery_dl/extractor/itaku.py
new file mode 100644
index 0000000..dfe4b53
--- /dev/null
+++ b/gallery_dl/extractor/itaku.py
@@ -0,0 +1,183 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://itaku.ee/"""
+
+from .common import Extractor, Message
+from ..cache import memcache
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?itaku\.ee"
+
+
+class ItakuExtractor(Extractor):
+ """Base class for itaku extractors"""
+ category = "itaku"
+ root = "https://itaku.ee"
+ directory_fmt = ("{category}", "{owner_username}")
+ filename_fmt = ("{id}{title:? //}.{extension}")
+ archive_fmt = "{id}"
+ request_interval = (0.5, 1.5)
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.api = ItakuAPI(self)
+ self.item = match.group(1)
+ self.videos = self.config("videos", True)
+
+ def items(self):
+ for post in self.posts():
+
+ post["date"] = text.parse_datetime(
+ post["date_added"], "%Y-%m-%dT%H:%M:%S.%f")
+ for category, tags in post.pop("categorized_tags").items():
+ post["tags_" + category.lower()] = [t["name"] for t in tags]
+ post["tags"] = [t["name"] for t in post["tags"]]
+ post["sections"] = [s["title"] for s in post["sections"]]
+
+ if post["video"] and self.videos:
+ url = post["video"]["video"]
+ else:
+ url = post["image"]
+
+ yield Message.Directory, post
+ yield Message.Url, url, text.nameext_from_url(url, post)
+
+
+class ItakuGalleryExtractor(ItakuExtractor):
+ """Extractor for posts from an itaku user gallery"""
+ subcategory = "gallery"
+ pattern = BASE_PATTERN + r"/profile/([^/?#]+)/gallery"
+ test = ("https://itaku.ee/profile/piku/gallery", {
+ "pattern": r"https://d1wmr8tlk3viaj\.cloudfront\.net/gallery_imgs"
+ r"/[^/?#]+\.(jpg|png|gif)",
+ "range": "1-10",
+ "count": 10,
+ })
+
+ def posts(self):
+ return self.api.galleries_images(self.item)
+
+
+class ItakuImageExtractor(ItakuExtractor):
+ subcategory = "image"
+ pattern = BASE_PATTERN + r"/images/(\d+)"
+ test = (
+ ("https://itaku.ee/images/100471", {
+ "pattern": r"https://d1wmr8tlk3viaj\.cloudfront\.net/gallery_imgs"
+ r"/220504_oUNIAFT\.png",
+ "count": 1,
+ "keyword": {
+ "already_pinned": None,
+ "blacklisted": {
+ "blacklisted_tags": [],
+ "is_blacklisted": False
+ },
+ "can_reshare": True,
+ "date_added": "2022-05-05T19:21:17.674148Z",
+ "date_edited": "2022-05-25T14:37:46.220612Z",
+ "description": "sketch from drawpile",
+ "extension": "png",
+ "filename": "220504_oUNIAFT",
+ "hotness_score": 11507.4691939,
+ "id": 100471,
+ "image": "https://d1wmr8tlk3viaj.cloudfront.net/gallery_imgs"
+ "/220504_oUNIAFT.png",
+ "image_xl": "https://d1wmr8tlk3viaj.cloudfront.net"
+ "/gallery_imgs/220504_oUNIAFT/xl.jpg",
+ "liked_by_you": False,
+ "maturity_rating": "SFW",
+ "num_comments": 2,
+ "num_likes": 80,
+ "num_reshares": 2,
+ "obj_tags": 136446,
+ "owner": 16775,
+ "owner_avatar": "https://d1wmr8tlk3viaj.cloudfront.net"
+ "/profile_pics/av2022r_vKYVywc/sm.jpg",
+ "owner_displayname": "Piku",
+ "owner_username": "piku",
+ "reshared_by_you": False,
+ "sections": ["Miku"],
+ "tags": list,
+ "tags_character": ["hatsune_miku"],
+ "tags_copyright": ["vocaloid"],
+ "tags_general" : ["twintails", "green_hair", "flag", "gloves",
+ "green_eyes", "female", "racing_miku"],
+ "title": "Racing Miku 2022 Ver.",
+ "too_mature": False,
+ "uncompressed_filesize": "0.62",
+ "video": None,
+ "visibility": "PUBLIC",
+ },
+ }),
+ # video
+ ("https://itaku.ee/images/19465", {
+ "pattern": r"https://d1wmr8tlk3viaj\.cloudfront\.net/gallery_vids"
+ r"/sleepy_af_OY5GHWw\.mp4",
+ }),
+ )
+
+ def posts(self):
+ return (self.api.image(self.item),)
+
+
+class ItakuAPI():
+
+ def __init__(self, extractor):
+ self.extractor = extractor
+ self.root = extractor.root + "/api"
+ self.headers = {
+ "Accept": "application/json, text/plain, */*",
+ "Referer": extractor.root + "/",
+ }
+
+ def galleries_images(self, username, section=None):
+ endpoint = "/galleries/images/"
+ params = {
+ "cursor" : None,
+ "owner" : self.user(username)["owner"],
+ "section" : section,
+ "date_range": "",
+ "maturity_rating": ("SFW", "Questionable", "NSFW", "Extreme"),
+ "ordering" : "-date_added",
+ "page" : "1",
+ "page_size" : "30",
+ "visibility": ("PUBLIC", "PROFILE_ONLY"),
+ }
+ return self._pagination(endpoint, params, self.image)
+
+ def image(self, image_id):
+ endpoint = "/galleries/images/" + str(image_id)
+ return self._call(endpoint)
+
+ @memcache()
+ def user(self, username):
+ return self._call("/user_profiles/{}/".format(username))
+
+ def _call(self, endpoint, params=None):
+ if not endpoint.startswith("http"):
+ endpoint = self.root + endpoint
+ response = self.extractor.request(
+ endpoint, params=params, headers=self.headers)
+ return response.json()
+
+ def _pagination(self, endpoint, params, extend):
+ data = self._call(endpoint, params)
+
+ while True:
+ if extend:
+ for result in data["results"]:
+ yield extend(result["id"])
+ else:
+ yield from data["results"]
+
+ url_next = data["links"].get("next")
+ if not url_next:
+ return
+
+ data = self._call(url_next)
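`_pagination()` follows the API's `links.next` cursor URL until it is exhausted, optionally re-fetching each result by id through `extend`. A hypothetical standalone call to the same endpoint, with field names taken from the code and test data above:

```python
import requests

resp = requests.get(
    "https://itaku.ee/api/galleries/images/",
    params={"owner": 16775, "page_size": 30, "ordering": "-date_added"},
    headers={"Accept": "application/json, text/plain, */*",
             "Referer": "https://itaku.ee/"},
)
data = resp.json()
ids = [result["id"] for result in data["results"]]
next_url = data["links"].get("next")  # cursor URL for the next page, or None
```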
diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py
index f3bd5d8..2aea44c 100644
--- a/gallery_dl/extractor/lolisafe.py
+++ b/gallery_dl/extractor/lolisafe.py
@@ -85,6 +85,8 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
yield Message.Directory, data
for data["num"], file in enumerate(files, 1):
url = file["file"]
+ if "_fallback" in file:
+ data["_fallback"] = file["_fallback"]
text.nameext_from_url(url, data)
data["name"], sep, data["id"] = data["filename"].rpartition("-")
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
index 832831f..122ea46 100644
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -6,31 +6,31 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://nijie.info/"""
+"""Extractors for nijie instances"""
-from .common import Extractor, Message, AsynchronousMixin
+from .common import BaseExtractor, Message, AsynchronousMixin
from .. import text, exception
from ..cache import cache
-BASE_PATTERN = r"(?:https?://)?(?:www\.)?nijie\.info"
-
-
-class NijieExtractor(AsynchronousMixin, Extractor):
+class NijieExtractor(AsynchronousMixin, BaseExtractor):
"""Base class for nijie extractors"""
- category = "nijie"
+ basecategory = "Nijie"
directory_fmt = ("{category}", "{user_id}")
filename_fmt = "{image_id}_p{num}.{extension}"
archive_fmt = "{image_id}_{num}"
- cookiedomain = "nijie.info"
- cookienames = ("nemail", "nlogin")
- root = "https://nijie.info"
- view_url = "https://nijie.info/view.php?id="
- popup_url = "https://nijie.info/view_popup.php?id="
def __init__(self, match):
- Extractor.__init__(self, match)
- self.user_id = text.parse_int(match.group(1))
+ self._init_category(match)
+ self.cookiedomain = "." + self.root.rpartition("/")[2]
+ self.cookienames = (self.category + "_tok",)
+
+ if self.category == "horne":
+ self._extract_data = self._extract_data_horne
+
+ BaseExtractor.__init__(self, match)
+
+ self.user_id = text.parse_int(match.group(match.lastindex))
self.user_name = None
self.session.headers["Referer"] = self.root + "/"
@@ -39,13 +39,21 @@ class NijieExtractor(AsynchronousMixin, Extractor):
for image_id in self.image_ids():
- response = self.request(self.view_url + image_id, fatal=False)
+ url = "{}/view.php?id={}".format(self.root, image_id)
+ response = self.request(url, fatal=False)
if response.status_code >= 400:
continue
page = response.text
data = self._extract_data(page)
data["image_id"] = text.parse_int(image_id)
+
+ if self.user_name:
+ data["user_id"] = self.user_id
+ data["user_name"] = self.user_name
+ else:
+ data["user_id"] = data["artist_id"]
+ data["user_name"] = data["artist_name"]
yield Message.Directory, data
for image in self._extract_images(page):
@@ -68,24 +76,41 @@ class NijieExtractor(AsynchronousMixin, Extractor):
"description": text.unescape(extr(
'"description": "', '"').replace("&amp;", "&")),
"date" : text.parse_datetime(extr(
- '"datePublished": "', '"') + "+0900",
- "%a %b %d %H:%M:%S %Y%z"),
- "artist_id" : text.parse_int(extr(
- '"sameAs": "https://nijie.info/members.php?id=', '"')),
+ '"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y", 9),
+ "artist_id" : text.parse_int(extr('/members.php?id=', '"')),
+ "artist_name": keywords[1],
+ "tags" : keywords[2:-1],
+ }
+ return data
+
+ @staticmethod
+ def _extract_data_horne(page):
+ """Extract image metadata from 'page'"""
+ extr = text.extract_from(page)
+ keywords = text.unescape(extr(
+ 'name="keywords" content="', '" />')).split(",")
+ data = {
+ "title" : keywords[0].strip(),
+ "description": text.unescape(extr(
+ 'property="og:description" content="', '"')),
+ "artist_id" : text.parse_int(extr('members.php?id=', '"')),
"artist_name": keywords[1],
"tags" : keywords[2:-1],
+ "date" : text.parse_datetime(extr(
+ "itemprop='datePublished' content=", "<").rpartition(">")[2],
+ "%Y-%m-%d %H:%M:%S", 9),
}
- data["user_id"] = data["artist_id"]
- data["user_name"] = data["artist_name"]
return data
@staticmethod
def _extract_images(page):
"""Extract image URLs from 'page'"""
- images = text.extract_iter(page, '<a href="./view_popup.php', '</a>')
+ images = text.extract_iter(page, "/view_popup.php", "</a>")
for num, image in enumerate(images):
- url = "https:" + text.extract(image, 'src="', '"')[0]
- url = url.replace("/__rs_l120x120/", "/")
+ src = text.extract(image, 'src="', '"')[0]
+ if not src:
+ continue
+ url = ("https:" + src).replace("/__rs_l120x120/", "/")
yield text.nameext_from_url(url, {
"num": num,
"url": url,
@@ -112,7 +137,7 @@ class NijieExtractor(AsynchronousMixin, Extractor):
data = {"email": username, "password": password, "save": "on"}
response = self.request(url, method="POST", data=data)
- if "//nijie.info/login.php" in response.text:
+ if "/login.php" in response.text:
raise exception.AuthenticationError()
return self.session.cookies
@@ -132,12 +157,27 @@ class NijieExtractor(AsynchronousMixin, Extractor):
params["p"] += 1
+BASE_PATTERN = NijieExtractor.update({
+ "nijie": {
+ "root": "https://nijie.info",
+ "pattern": r"(?:www\.)?nijie\.info",
+ },
+ "horne": {
+ "root": "https://horne.red",
+ "pattern": r"(?:www\.)?horne\.red",
+ },
+})
+
+
class NijieUserExtractor(NijieExtractor):
"""Extractor for nijie user profiles"""
subcategory = "user"
cookiedomain = None
pattern = BASE_PATTERN + r"/members\.php\?id=(\d+)"
- test = ("https://nijie.info/members.php?id=44",)
+ test = (
+ ("https://nijie.info/members.php?id=44"),
+ ("https://horne.red/members.php?id=58000"),
+ )
def items(self):
fmt = "{}/{{}}.php?id={}".format(self.root, self.user_id).format
@@ -172,6 +212,25 @@ class NijieIllustrationExtractor(NijieExtractor):
"user_name": "ED",
},
}),
+ ("https://horne.red/members_illust.php?id=58000", {
+ "pattern": r"https://pic\.nijie\.net/\d+/horne/\d+/\d+/\d+"
+ r"/illust/\d+_\d+_[0-9a-f]+_[0-9a-f]+\.png",
+ "range": "1-20",
+ "count": 20,
+ "keyword": {
+ "artist_id": 58000,
+ "artist_name": "のえるわ",
+ "date": "type:datetime",
+ "description": str,
+ "image_id": int,
+ "num": int,
+ "tags": list,
+ "title": str,
+ "url": str,
+ "user_id": 58000,
+ "user_name": "のえるわ",
+ },
+ }),
("https://nijie.info/members_illust.php?id=43", {
"exception": exception.NotFoundError,
}),
@@ -182,34 +241,47 @@ class NijieIllustrationExtractor(NijieExtractor):
class NijieDoujinExtractor(NijieExtractor):
- """Extractor for doujin entries of a nijie-user"""
+ """Extractor for doujin entries of a nijie user"""
subcategory = "doujin"
pattern = BASE_PATTERN + r"/members_dojin\.php\?id=(\d+)"
- test = ("https://nijie.info/members_dojin.php?id=6782", {
- "count": ">= 18",
- "keyword": {
- "user_id" : 6782,
- "user_name": "ジョニー@アビオン村",
- },
- })
+ test = (
+ ("https://nijie.info/members_dojin.php?id=6782", {
+ "count": ">= 18",
+ "keyword": {
+ "user_id" : 6782,
+ "user_name": "ジョニー@アビオン村",
+ },
+ }),
+ ("https://horne.red/members_dojin.php?id=58000"),
+ )
def image_ids(self):
return self._pagination("members_dojin")
class NijieFavoriteExtractor(NijieExtractor):
- """Extractor for all favorites/bookmarks of a nijie-user"""
+ """Extractor for all favorites/bookmarks of a nijie user"""
subcategory = "favorite"
directory_fmt = ("{category}", "bookmarks", "{user_id}")
archive_fmt = "f_{user_id}_{image_id}_{num}"
pattern = BASE_PATTERN + r"/user_like_illust_view\.php\?id=(\d+)"
- test = ("https://nijie.info/user_like_illust_view.php?id=44", {
- "count": ">= 16",
- "keyword": {
- "user_id" : 44,
- "user_name": "ED",
- },
- })
+ test = (
+ ("https://nijie.info/user_like_illust_view.php?id=44", {
+ "count": ">= 16",
+ "keyword": {
+ "user_id" : 44,
+ "user_name": "ED",
+ },
+ }),
+ ("https://horne.red/user_like_illust_view.php?id=58000", {
+ "range": "1-5",
+ "count": 5,
+ "keyword": {
+ "user_id" : 58000,
+ "user_name": "のえるわ",
+ },
+ }),
+ )
def image_ids(self):
return self._pagination("user_like_illust_view")
@@ -227,14 +299,17 @@ class NijieNuitaExtractor(NijieExtractor):
directory_fmt = ("{category}", "nuita", "{user_id}")
archive_fmt = "n_{user_id}_{image_id}_{num}"
pattern = BASE_PATTERN + r"/history_nuita\.php\?id=(\d+)"
- test = ("https://nijie.info/history_nuita.php?id=728995", {
- "range": "1-10",
- "count": 10,
- "keyword": {
- "user_id" : 728995,
- "user_name": "莚",
- },
- })
+ test = (
+ ("https://nijie.info/history_nuita.php?id=728995", {
+ "range": "1-10",
+ "count": 10,
+ "keyword": {
+ "user_id" : 728995,
+ "user_name": "莚",
+ },
+ }),
+ ("https://horne.red/history_nuita.php?id=58000"),
+ )
def image_ids(self):
return self._pagination("history_nuita")
@@ -252,7 +327,7 @@ class NijieNuitaExtractor(NijieExtractor):
class NijieImageExtractor(NijieExtractor):
- """Extractor for a work/image from nijie.info"""
+ """Extractor for a nijie work/image"""
subcategory = "image"
pattern = BASE_PATTERN + r"/view(?:_popup)?\.php\?id=(\d+)"
test = (
@@ -265,11 +340,26 @@ class NijieImageExtractor(NijieExtractor):
"count": 0,
}),
("https://nijie.info/view_popup.php?id=70720"),
+ ("https://horne.red/view.php?id=8716", {
+ "count": 4,
+ "keyword": {
+ "artist_id": 58000,
+ "artist_name": "のえるわ",
+ "date": "dt:2018-02-04 14:47:24",
+ "description": "ノエル「そんなことしなくても、"
+ "言ってくれたら咥えるのに・・・♡」",
+ "image_id": 8716,
+ "tags": ["男の娘", "フェラ", "オリキャラ", "うちのこ"],
+ "title": "ノエル「いまどきそんな、恵方巻ネタなんてやらなくても・・・」",
+ "user_id": 58000,
+ "user_name": "のえるわ",
+ },
+ }),
)
def __init__(self, match):
NijieExtractor.__init__(self, match)
- self.image_id = match.group(1)
+ self.image_id = match.group(match.lastindex)
def image_ids(self):
return (self.image_id,)
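nijie.py is generalized from a single-site extractor to a `BaseExtractor` with two instances, nijie.info and horne.red; `NijieExtractor.update()` combines their per-instance patterns into one `BASE_PATTERN`, which is why ids are now read via `match.group(match.lastindex)`. A rough sketch of that combination (assumption: the real method also records each instance's root URL and category):

```python
import re

instances = {
    "nijie": r"(?:www\.)?nijie\.info",
    "horne": r"(?:www\.)?horne\.red",
}
BASE_PATTERN = r"(?:https?://)?(?:" + "|".join(instances.values()) + r")"

# lastindex always points at the trailing id group, no matter how many
# groups the instance alternation contributes
pattern = re.compile(BASE_PATTERN + r"/view(?:_popup)?\.php\?id=(\d+)")
match = pattern.match("https://horne.red/view.php?id=8716")
print(match.group(match.lastindex))  # '8716'
```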
diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py
new file mode 100644
index 0000000..e1846cc
--- /dev/null
+++ b/gallery_dl/extractor/poipiku.py
@@ -0,0 +1,169 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://poipiku.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?poipiku\.com"
+
+
+class PoipikuExtractor(Extractor):
+ """Base class for poipiku extractors"""
+ category = "poipiku"
+ root = "https://poipiku.com"
+ directory_fmt = ("{category}", "{user_id} {user_name}")
+ filename_fmt = "{post_id}_{num}.{extension}"
+ archive_fmt = "{post_id}_{num}"
+ request_interval = (0.5, 1.5)
+
+ def items(self):
+ password = self.config("password", "")
+
+ for post_url in self.posts():
+ parts = post_url.split("/")
+ if post_url[0] == "/":
+ post_url = self.root + post_url
+ page = self.request(post_url).text
+ extr = text.extract_from(page)
+
+ post = {
+ "post_category": extr("<title>[", "]"),
+ "count" : extr("(", " "),
+ "post_id" : parts[-1].partition(".")[0],
+ "user_id" : parts[-2],
+ "user_name" : text.unescape(extr(
+ '<h2 class="UserInfoUserName">', '</').rpartition(">")[2]),
+ "description": text.unescape(extr(
+ 'class="IllustItemDesc" >', '<')),
+ }
+
+ yield Message.Directory, post
+ post["num"] = 0
+
+ while True:
+ thumb = extr('class="IllustItemThumbImg" src="', '"')
+ if not thumb:
+ break
+ elif thumb.startswith("/img/"):
+ continue
+ post["num"] += 1
+ url = text.ensure_http_scheme(thumb[:-8])
+ yield Message.Url, url, text.nameext_from_url(url, post)
+
+ if not extr('</i> show all', '<'):
+ continue
+
+ url = self.root + "/f/ShowAppendFileF.jsp"
+ headers = {
+ "Accept" : "application/json, text/javascript, */*; q=0.01",
+ "X-Requested-With": "XMLHttpRequest",
+ "Origin" : self.root,
+ "Referer": post_url,
+ }
+ data = {
+ "UID": post["user_id"],
+ "IID": post["post_id"],
+ "PAS": password,
+ "MD" : "0",
+ "TWF": "-1",
+ }
+ page = self.request(
+ url, method="POST", headers=headers, data=data).json()["html"]
+
+ for thumb in text.extract_iter(
+ page, 'class="IllustItemThumbImg" src="', '"'):
+ post["num"] += 1
+ url = text.ensure_http_scheme(thumb[:-8])
+ yield Message.Url, url, text.nameext_from_url(url, post)
+
+
+class PoipikuUserExtractor(PoipikuExtractor):
+ """Extractor for posts from a poipiku user"""
+ subcategory = "user"
+ pattern = (BASE_PATTERN + r"/(?:IllustListPcV\.jsp\?PG=(\d+)&ID=)?"
+ r"(\d+)/?(?:$|[?&#])")
+ test = (
+ ("https://poipiku.com/25049/", {
+ "pattern": r"https://img\.poipiku\.com/user_img\d+/000025049"
+ r"/\d+_\w+\.(jpe?g|png)$",
+ "range": "1-10",
+ "count": 10,
+ }),
+ ("https://poipiku.com/IllustListPcV.jsp?PG=1&ID=25049&KWD=")
+ )
+
+ def __init__(self, match):
+ PoipikuExtractor.__init__(self, match)
+ self._page, self.user_id = match.groups()
+
+ def posts(self):
+ url = self.root + "/IllustListPcV.jsp"
+ params = {
+ "PG" : text.parse_int(self._page, 0),
+ "ID" : self.user_id,
+ "KWD": "",
+ }
+
+ while True:
+ page = self.request(url, params=params).text
+
+ cnt = 0
+ for path in text.extract_iter(
+ page, 'class="IllustInfo" href="', '"'):
+ yield path
+ cnt += 1
+
+ if cnt < 48:
+ return
+ params["PG"] += 1
+
+
+class PoipikuPostExtractor(PoipikuExtractor):
+ """Extractor for a poipiku post"""
+ subcategory = "post"
+ pattern = BASE_PATTERN + r"/(\d+)/(\d+)"
+ test = (
+ ("https://poipiku.com/25049/5864576.html", {
+ "pattern": r"https://img\.poipiku\.com/user_img03/000025049"
+ r"/005864576_EWN1Y65gQ\.png$",
+ "keyword": {
+ "count": "1",
+ "description": "",
+ "extension": "png",
+ "filename": "005864576_EWN1Y65gQ",
+ "num": 1,
+ "post_category": "DOODLE",
+ "post_id": "5864576",
+ "user_id": "25049",
+ "user_name": "ユキウサギ",
+ },
+ }),
+ ("https://poipiku.com/2166245/6411749.html", {
+ "pattern": r"https://img\.poipiku\.com/user_img01/002166245"
+ r"/006411749_\w+\.jpeg$",
+ "count": 4,
+ "keyword": {
+ "count": "4",
+ "description": "絵茶の産物ネタバレあるやつ",
+ "num": int,
+ "post_category": "SPOILER",
+ "post_id": "6411749",
+ "user_id": "2166245",
+ "user_name": "wadahito",
+ },
+ }),
+ )
+
+ def __init__(self, match):
+ PoipikuExtractor.__init__(self, match)
+ self.user_id, self.post_id = match.groups()
+
+ def posts(self):
+ return ("/{}/{}.html".format(self.user_id, self.post_id),)
diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py
index ca7a3c6..a477424 100644
--- a/gallery_dl/extractor/readcomiconline.py
+++ b/gallery_dl/extractor/readcomiconline.py
@@ -130,12 +130,13 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor):
def beau(url):
"""https://readcomiconline.li/Scripts/rguard.min.js"""
- if url.startswith("https"):
- return url
-
url = url.replace("_x236", "d")
url = url.replace("_x945", "g")
+ if url.startswith("https"):
+ return url
+
+ url, sep, rest = url.partition("?")
containsS0 = "=s0" in url
url = url[:-3 if containsS0 else -6]
url = url[4:22] + url[25:]
@@ -143,4 +144,4 @@ def beau(url):
url = binascii.a2b_base64(url).decode()
url = url[0:13] + url[17:]
url = url[0:-2] + ("=s0" if containsS0 else "=s1600")
- return "https://2.bp.blogspot.com/" + url
+ return "https://2.bp.blogspot.com/" + url + sep + rest
diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py
index 2af917d..2ecb4b6 100644
--- a/gallery_dl/extractor/skeb.py
+++ b/gallery_dl/extractor/skeb.py
@@ -135,10 +135,11 @@ class SkebPostExtractor(SkebExtractor):
"body": "re:はじめまして。私はYouTubeにてVTuberとして活動をしている湊ラ",
"client": {
"avatar_url": "https://pbs.twimg.com/profile_images"
- "/1471184042791895042/f0DcWFGl.jpg",
- "header_url": None,
+ "/1537488326697287680/yNUbLDgC.jpg",
+ "header_url": "https://pbs.twimg.com/profile_banners"
+ "/1375007870291300358/1655744756/1500x500",
"id": 1196514,
- "name": "湊ラギ",
+ "name": "湊ラギ♦️🎀Vtuber🎀次回6/23予定",
"screen_name": "minato_ragi",
},
"completed_at": "2022-02-27T14:03:45.442Z",
@@ -208,3 +209,30 @@ class SkebUserExtractor(SkebExtractor):
posts = itertools.chain(posts, self._pagination(url, params))
return posts
+
+
+class SkebFollowingExtractor(SkebExtractor):
+ """Extractor for all creators followed by a skeb user"""
+ subcategory = "following"
+ pattern = r"(?:https?://)?skeb\.jp/@([^/?#]+)/following_creators"
+ test = ("https://skeb.jp/@user/following_creators",)
+
+ def items(self):
+ for user in self.users():
+ url = "{}/@{}".format(self.root, user["screen_name"])
+ user["_extractor"] = SkebUserExtractor
+ yield Message.Queue, url, user
+
+ def users(self):
+ url = "{}/api/users/{}/following_creators".format(
+ self.root, self.user_name)
+ headers = {"Referer": self.root, "Authorization": "Bearer null"}
+ params = {"sort": "date", "offset": 0, "limit": 90}
+
+ while True:
+ data = self.request(url, params=params, headers=headers).json()
+ yield from data
+
+ if len(data) < params["limit"]:
+ return
+ params["offset"] += params["limit"]
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 2737d34..a0d6194 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -40,6 +40,7 @@ class TwitterExtractor(Extractor):
self.quoted = self.config("quoted", False)
self.videos = self.config("videos", True)
self.cards = self.config("cards", False)
+ self._user_id = None
self._user_cache = {}
self._init_sizes()
@@ -59,6 +60,10 @@ class TwitterExtractor(Extractor):
self.api = TwitterAPI(self)
metadata = self.metadata()
+ if self.config("expand"):
+ tweets = self._expand_tweets(self.tweets())
+ self.tweets = lambda : tweets
+
for tweet in self.tweets():
if "legacy" in tweet:
@@ -75,7 +80,8 @@ class TwitterExtractor(Extractor):
if "in_reply_to_user_id_str" in data and (
not self.replies or (
self.replies == "self" and
- data["in_reply_to_user_id_str"] != data["user_id_str"]
+ (self._user_id or data["in_reply_to_user_id_str"]) !=
+ data["user_id_str"]
)
):
self.log.debug("Skipping %s (reply)", data["id_str"])
@@ -338,6 +344,22 @@ class TwitterExtractor(Extractor):
user["_extractor"] = cls
yield Message.Queue, fmt(user), user
+ def _expand_tweets(self, tweets):
+ seen = set()
+ for tweet in tweets:
+
+ if "legacy" in tweet:
+ cid = tweet["legacy"]["conversation_id_str"]
+ else:
+ cid = tweet["conversation_id_str"]
+
+ if cid not in seen:
+ seen.add(cid)
+ try:
+ yield from self.api.tweet_detail(cid)
+ except Exception:
+ yield tweet
+
def metadata(self):
"""Return general metadata"""
return {}
@@ -418,12 +440,12 @@ class TwitterTimelineExtractor(TwitterExtractor):
self.user = "id:" + user_id
def tweets(self):
- tweets = (self.api.user_tweets(self.user) if self.retweets else
- self.api.user_media(self.user))
+ tweets = (self.api.user_tweets if self.retweets else
+ self.api.user_media)
# yield initial batch of (media) tweets
tweet = None
- for tweet in tweets:
+ for tweet in tweets(self.user):
yield tweet
if tweet is None:
@@ -442,12 +464,17 @@ class TwitterTimelineExtractor(TwitterExtractor):
if "legacy" in tweet:
tweet = tweet["legacy"]
+ # build search query
+ query = "from:{} max_id:{}".format(username, tweet["id_str"])
+ if self.retweets:
+ query += " include:retweets include:nativeretweets"
+ if not self.textonly:
+ query += (" (filter:images OR"
+ " filter:native_video OR"
+ " card_name:animated_gif)")
+
# yield search results starting from last tweet id
- yield from self.api.search_adaptive(
- "from:{} include:retweets include:nativeretweets max_id:{} "
- "filter:images OR card_name:animated_gif OR filter:native_video"
- .format(username, tweet["id_str"])
- )
+ yield from self.api.search_adaptive(query)
class TwitterTweetsExtractor(TwitterExtractor):
@@ -694,10 +721,10 @@ class TwitterTweetExtractor(TwitterExtractor):
"date" : "dt:2020-08-20 04:00:28",
},
}),
- # all Tweets from a conversation (#1319)
- ("https://twitter.com/BlankArts_/status/1323314488611872769", {
+ # all Tweets from a 'conversation' (#1319)
+ ("https://twitter.com/supernaturepics/status/604341487988576256", {
"options": (("conversations", True),),
- "count": ">= 50",
+ "count": 5,
}),
# retweet with missing media entities (#1555)
("https://twitter.com/morino_ya/status/1392763691599237121", {
@@ -845,8 +872,11 @@ class TwitterAPI():
cookies = extractor.session.cookies
cookiedomain = extractor.cookiedomain
- # CSRF
- csrf_token = cookies.get("ct0", domain=cookiedomain)
+ csrf = extractor.config("csrf")
+ if csrf is None or csrf == "cookies":
+ csrf_token = cookies.get("ct0", domain=cookiedomain)
+ else:
+ csrf_token = None
if not csrf_token:
csrf_token = util.generate_token()
cookies.set("ct0", csrf_token, domain=cookiedomain)
@@ -1000,19 +1030,23 @@ class TwitterAPI():
def _user_id_by_screen_name(self, screen_name):
if screen_name.startswith("id:"):
self._user = util.SENTINEL
- return screen_name[3:]
+ user_id = screen_name[3:]
- user = ()
- try:
- user = self._user = self.user_by_screen_name(screen_name)
- return user["rest_id"]
- except KeyError:
- if "unavailable_message" in user:
- raise exception.NotFoundError("{} ({})".format(
- user["unavailable_message"].get("text"),
- user.get("reason")), False)
- else:
- raise exception.NotFoundError("user")
+ else:
+ user = ()
+ try:
+ user = self._user = self.user_by_screen_name(screen_name)
+ user_id = user["rest_id"]
+ except KeyError:
+ if "unavailable_message" in user:
+ raise exception.NotFoundError("{} ({})".format(
+ user["unavailable_message"].get("text"),
+ user.get("reason")), False)
+ else:
+ raise exception.NotFoundError("user")
+
+ self.extractor._user_id = user_id
+ return user_id
@cache(maxage=3600)
def _guest_token(self):
@@ -1228,6 +1262,8 @@ class TwitterAPI():
tweets.append(entry)
elif esw("cursor-bottom-"):
cursor = entry["content"]
+ if "itemContent" in cursor:
+ cursor = cursor["itemContent"]
if not cursor.get("stopOnEmptyResponse", True):
# keep going even if there are no tweets
tweet = True
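A new `csrf` config option controls where the `ct0` CSRF token comes from: with the default `"cookies"` it is read from the cookie jar, otherwise a random token is minted and stored there. A sketch of such token generation (assumption: gallery-dl's `util.generate_token()` behaves similarly):

```python
import os

def generate_token(size=16):
    # Random hex token; with "csrf" set to anything but "cookies",
    # a fresh value like this is written into the 'ct0' cookie
    return os.urandom(size).hex()
```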
diff --git a/gallery_dl/extractor/unsplash.py b/gallery_dl/extractor/unsplash.py
index ad1617c..c29d730 100644
--- a/gallery_dl/extractor/unsplash.py
+++ b/gallery_dl/extractor/unsplash.py
@@ -30,12 +30,16 @@ class UnsplashExtractor(Extractor):
def items(self):
fmt = self.config("format") or "raw"
+ metadata = self.metadata()
+
for photo in self.photos():
util.delete_items(
photo, ("current_user_collections", "related_collections"))
url = photo["urls"][fmt]
text.nameext_from_url(url, photo)
+ if metadata:
+ photo.update(metadata)
photo["extension"] = "jpg"
photo["date"] = text.parse_datetime(photo["created_at"])
if "tags" in photo:
@@ -44,6 +48,10 @@ class UnsplashExtractor(Extractor):
yield Message.Directory, photo
yield Message.Url, url, photo
+ @staticmethod
+ def metadata():
+ return None
+
def skip(self, num):
pages = num // self.per_page
self.page_start += pages
@@ -172,17 +180,27 @@ class UnsplashFavoriteExtractor(UnsplashExtractor):
class UnsplashCollectionExtractor(UnsplashExtractor):
"""Extractor for an unsplash collection"""
subcategory = "collection"
- pattern = BASE_PATTERN + r"/collections/([^/?#]+)"
+ pattern = BASE_PATTERN + r"/collections/([^/?#]+)(?:/([^/?#]+))?"
test = (
("https://unsplash.com/collections/3178572/winter", {
"pattern": r"https://images\.unsplash\.com/(photo-\d+-\w+"
r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$",
+ "keyword": {"collection_id": "3178572",
+ "collection_title": "winter"},
"range": "1-30",
"count": 30,
}),
+ ("https://unsplash.com/collections/3178572/"),
("https://unsplash.com/collections/_8qJQ2bCMWE/2021.05"),
)
+ def __init__(self, match):
+ UnsplashExtractor.__init__(self, match)
+ self.title = match.group(2) or ""
+
+ def metadata(self):
+ return {"collection_id": self.item, "collection_title": self.title}
+
def photos(self):
url = "{}/napi/collections/{}/photos".format(self.root, self.item)
params = {"order_by": "latest"}
diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py
index 23f6ea2..ab2153f 100644
--- a/gallery_dl/extractor/vk.py
+++ b/gallery_dl/extractor/vk.py
@@ -40,12 +40,12 @@ class VkExtractor(Extractor):
continue
try:
- photo["url"], photo["width"], photo["height"] = photo[size]
+ _, photo["width"], photo["height"] = photo[size]
except ValueError:
# photo without width/height entries (#2535)
- photo["url"] = photo[size + "src"]
photo["width"] = photo["height"] = 0
+ photo["url"] = photo[size + "src"]
photo["id"] = photo["id"].rpartition("_")[2]
photo.update(data)
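The reordering matters because tuple unpacking raises `ValueError` before any target is assigned; with the URL assignment hoisted out of the `try`, `photo["url"]` is now consistently taken from the `<size>src` key on both paths. A standalone illustration with hypothetical keys:

```python
photo = {"z_": ["https://example.invalid/p.jpg", 640],  # missing height (#2535)
         "z_src": "https://example.invalid/p.jpg"}
try:
    _, width, height = photo["z_"]  # too few values -> ValueError
except ValueError:
    width = height = 0
url = photo["z_src"]                # assigned on both paths
```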
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index a7068c8..68871c8 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -52,10 +52,6 @@ class WeiboExtractor(Extractor):
for status in self.statuses():
- status["date"] = text.parse_datetime(
- status["created_at"], "%a %b %d %H:%M:%S %z %Y")
- yield Message.Directory, status
-
if self.retweets and "retweeted_status" in status:
if original_retweets:
status = status["retweeted_status"]
@@ -68,6 +64,10 @@ class WeiboExtractor(Extractor):
else:
files = self._files_from_status(status)
+ status["date"] = text.parse_datetime(
+ status["created_at"], "%a %b %d %H:%M:%S %z %Y")
+ yield Message.Directory, status
+
for num, file in enumerate(files, 1):
if file["url"].startswith("http:"):
file["url"] = "https:" + file["url"][5:]
@@ -191,7 +191,9 @@ class WeiboUserExtractor(WeiboExtractor):
subcategory = "user"
pattern = USER_PATTERN + r"(?:$|#)"
test = (
- ("https://weibo.com/1758989602"),
+ ("https://weibo.com/1758989602", {
+ "pattern": r"^https://weibo\.com/u/1758989602\?tabtype=feed$",
+ }),
("https://weibo.com/u/1758989602"),
("https://weibo.com/p/1758989602"),
("https://m.weibo.cn/profile/2314621010"),
@@ -200,12 +202,13 @@ class WeiboUserExtractor(WeiboExtractor):
)
def items(self):
- base = " {}/u/{}?tabtype=".format(self.root, self._user_id())
+ base = "{}/u/{}?tabtype=".format(self.root, self._user_id())
return self._dispatch_extractors((
- (WeiboHomeExtractor , base + "home"),
- (WeiboFeedExtractor , base + "feed"),
- (WeiboVideosExtractor, base + "newVideo"),
- (WeiboAlbumExtractor , base + "album"),
+ (WeiboHomeExtractor , base + "home"),
+ (WeiboFeedExtractor , base + "feed"),
+ (WeiboVideosExtractor , base + "video"),
+ (WeiboNewvideoExtractor, base + "newVideo"),
+ (WeiboAlbumExtractor , base + "album"),
), ("feed",))
@@ -254,8 +257,27 @@ class WeiboFeedExtractor(WeiboExtractor):
class WeiboVideosExtractor(WeiboExtractor):
- """Extractor for weibo 'newVideo' listings"""
+ """Extractor for weibo 'video' listings"""
subcategory = "videos"
+ pattern = USER_PATTERN + r"\?tabtype=video"
+ test = ("https://weibo.com/1758989602?tabtype=video", {
+ "pattern": r"https://f\.(video\.weibocdn\.com|us\.sinaimg\.cn)"
+ r"/(../)?\w+\.mp4\?label=mp",
+ "range": "1-30",
+ "count": 30,
+ })
+
+ def statuses(self):
+ endpoint = "/profile/getprofilevideolist"
+ params = {"uid": self._user_id()}
+
+ for status in self._pagination(endpoint, params):
+ yield status["video_detail_vo"]
+
+
+class WeiboNewvideoExtractor(WeiboExtractor):
+ """Extractor for weibo 'newVideo' listings"""
+ subcategory = "newvideo"
pattern = USER_PATTERN + r"\?tabtype=newVideo"
test = ("https://weibo.com/1758989602?tabtype=newVideo", {
"pattern": r"https://f\.video\.weibocdn\.com/(../)?\w+\.mp4\?label=mp",
@@ -336,8 +358,8 @@ class WeiboStatusExtractor(WeiboExtractor):
}),
# type == gif
("https://weibo.com/1758989602/LvBhm5DiP", {
- "pattern": r"http://g\.us\.sinaimg.cn/o0/qNZcaAAglx07Wuf921CM01041"
- r"20005tc0E010\.mp4\?label=gif_mp4",
+ "pattern": r"https://g\.us\.sinaimg.cn/o0/qNZcaAAglx07Wuf921CM0104"
+ r"120005tc0E010\.mp4\?label=gif_mp4",
}),
("https://m.weibo.cn/status/4339748116375525"),
("https://m.weibo.cn/5746766133/4339748116375525"),