From c2e774d3f5a4499b8beb5a12ab46a0099b16b1e7 Mon Sep 17 00:00:00 2001
From: Unit 193 ', '<')[0]
- return {"title": text.unescape(title.strip())}
-
- def get_image_keys(self, page):
- """Return a list of all image keys"""
- keys = []
+ def metadata(page):
+ return {"title": text.unescape(text.extract(
+ page, 'id="gallery-name">', '<')[0].strip())}
+
+ def images(self, page):
+ findall = re.compile(r'', "<")[0].rpartition(" | ")[0],
+ }
+
+ def images(self, page):
+ pnum = 1
+
+ while page:
+ for url in text.extract_iter(page, "
= 7",
+ })
+
+ def __init__(self, match):
+ """Set up the base extractor and keep the first URL capture group"""
+ Extractor.__init__(self, match)
+ # items() inserts this value into the "/people/<model>.html" page URL
+ self.model = match.group(1)
+
+ def items(self):
+ url = "{}/people/{}.html".format(self.root, self.model)
+ page = self.request(url).text
+
+ data = {"_extractor": KissgoddessGalleryExtractor}
+ for path in text.extract_iter(page, 'thumb">= 30"
+ })
+ per_page = 25
+
+ def __init__(self, match):
+ """Set up the base extractor and store the requested search tags"""
+ MememuseumExtractor.__init__(self, match)
+ # first capture group of the URL pattern, passed through text.unquote()
+ # (presumably undoing percent-encoding -- confirm against text.unquote)
+ self.tags = text.unquote(match.group(1))
+
+ def metadata(self):
+ """Return a dict exposing the stored tags under 'search_tags'"""
+ return {"search_tags": self.tags}
+
+ def posts(self):
+     """Yield one metadata dict per listed post, page after page"""
+     page_num = 1
+
+     while True:
+         list_url = "{}/post/list/{}/{}".format(
+             self.root, self.tags, page_num)
+         extr = text.extract_from(self.request(list_url).text)
+
+         # an empty 'data-mime' result means no further post on this page
+         mime = extr("data-mime='", "'")
+         while mime:
+             post_id = extr("data-post-id='", "'")
+             # the title attribute carries "<tags> // <WxH> // <size>"
+             tags, dimensions, size = extr("title='", "'").split(" // ")
+             md5 = extr("/_thumbs/", "/")
+             width, _, height = dimensions.partition("x")
+
+             # file extension is the subtype part of the MIME type
+             extension = mime.rpartition("/")[2]
+             file_url = "{}/_images/{}/{}%20-%20{}.{}".format(
+                 self.root, md5, post_id, text.quote(tags), extension)
+
+             yield {
+                 "file_url": file_url,
+                 "id": post_id,
+                 "md5": md5,
+                 "tags": tags,
+                 "width": width,
+                 "height": height,
+                 # last character is dropped before parsing
+                 # (presumably a trailing unit letter -- confirm)
+                 "size": text.parse_bytes(size[:-1]),
+             }
+
+             mime = extr("data-mime='", "'")
+
+         # stop once the page no longer offers a "Next" link
+         if not extr(">Next<", ">"):
+             return
+         page_num += 1
+
+
+class MememuseumPostExtractor(MememuseumExtractor):
+    """Extractor for single images from meme.museum"""
+    subcategory = "post"
+    pattern = r"(?:https?://)?meme\.museum/post/view/(\d+)"
+    test = ("https://meme.museum/post/view/10243", {
+        "pattern": r"https://meme\.museum/_images/105febebcd5ca791ee332adc4997"
+                   r"1f78/10243%20-%20g%20beard%20open_source%20richard_stallm"
+                   r"an%20stallman%20tagme%20text\.jpg",
+        "keyword": "3c8009251480cf17248c08b2b194dc0c4d59580e",
+        "content": "45565f3f141fc960a8ae1168b80e718a494c52d2",
+    })
+
+    def __init__(self, match):
+        """Set up the base extractor and remember the post ID from the URL"""
+        MememuseumExtractor.__init__(self, match)
+        self.post_id = match.group(1)
+
+    def posts(self):
+        """Return a 1-tuple holding the metadata dict of the requested post"""
+        view_url = "{}/post/view/{}".format(self.root, self.post_id)
+        extr = text.extract_from(self.request(view_url).text)
+
+        # NOTE: the extraction calls are kept in their original order;
+        # extr appears to work through the page sequentially
+        tags = extr(": ", "<")
+        md5 = extr("/_thumbs/", "/")
+        image_path = extr("id='main_image' src='", "'")
+        width = extr("data-width=", " ")
+        height = extr("data-height=", " ")
+
+        # attribute values may be wrapped in either quote character
+        quote_chars = "'\""
+        post = {
+            "id": self.post_id,
+            "tags": tags,
+            "md5": md5,
+            "file_url": self.root + image_path,
+            "width": width.strip(quote_chars),
+            "height": height.strip(quote_chars),
+            "size": 0,
+        }
+        return (post,)
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 54e2040..6d0e94b 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -103,7 +103,7 @@ class NewgroundsExtractor(Extractor):
}
def extract_post(self, post_url):
-
+ url = post_url
if "/art/view/" in post_url:
extract_data = self._extract_image_data
elif "/audio/listen/" in post_url:
@@ -111,18 +111,19 @@ class NewgroundsExtractor(Extractor):
else:
extract_data = self._extract_media_data
if self.flash:
- post_url += "/format/flash"
+ url += "/format/flash"
- response = self.request(post_url, fatal=False)
+ response = self.request(url, fatal=False)
if response.status_code >= 400:
return {}
page = response.text
extr = text.extract_from(page)
data = extract_data(extr, post_url)
- data["_comment"] = extr('id="author_comments"', '')
+ data["_comment"] = extr(
+ 'id="author_comments"', '').partition(">")[2]
data["comment"] = text.unescape(text.remove_html(
- data["_comment"].partition(">")[2], "", ""))
+ data["_comment"], "", ""))
data["favorites"] = text.parse_int(extr(
'id="faves_load">', '<').replace(",", ""))
data["score"] = text.parse_float(extr('id="score_number">', '<'))
@@ -134,6 +135,7 @@ class NewgroundsExtractor(Extractor):
data["tags"].sort()
data["user"] = self.user or data["artist"][0]
+ data["post_url"] = post_url
return data
@staticmethod
@@ -171,6 +173,7 @@ class NewgroundsExtractor(Extractor):
def _extract_media_data(self, extr, url):
index = url.split("/")[5]
title = extr('"og:title" content="', '"')
+ descr = extr('"og:description" content="', '"')
src = extr('{"url":"', '"')
if src:
@@ -209,7 +212,7 @@ class NewgroundsExtractor(Extractor):
"title" : text.unescape(title),
"url" : src,
"date" : date,
- "description": text.unescape(extr(
+ "description": text.unescape(descr or extr(
'itemprop="description" content="', '"')),
"rating" : extr('class="rated-', '"'),
"index" : text.parse_int(index),
@@ -319,6 +322,7 @@ class NewgroundsMediaExtractor(NewgroundsExtractor):
"artist" : ["kickinthehead", "danpaladin", "tomfulp"],
"comment" : "re:My fan trailer for Alien Hominid HD!",
"date" : "dt:2013-02-01 09:50:49",
+ "description": "Fan trailer for Alien Hominid HD!",
"favorites" : int,
"filename" : "564957_alternate_31",
"index" : 595355,
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index 6812f35..428f772 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2021 Mike Fährmann
+# Copyright 2017-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -399,7 +399,7 @@ class OAuthPixiv(OAuthBase):
if "error" in data:
print(data)
- if data["error"] == "invalid_request":
+ if data["error"] in ("invalid_request", "invalid_grant"):
print("'code' expired, try again")
return
@@ -417,6 +417,10 @@ class OAuthPixiv(OAuthBase):
2) Login
3) Select the last network monitor entry ('callback?state=...')
4) Copy its 'code' query parameter, paste it below, and press Enter
+
+- This 'code' will expire 30 seconds after logging in.
+- Copy-pasting more than just the 'code' value will work as well,
+ like the entire URL or several query parameters.
""")
code = input("code: ")
return code.rpartition("=")[2].strip()
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 051f1ef..35a015f 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2021 Mike Fährmann
+# Copyright 2019-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -71,6 +71,15 @@ class PatreonExtractor(Extractor):
name = image.get("file_name") or self._filename(url) or url
yield "image", url, name
+ def _image_large(self, post):
+     """Return a 1-tuple for the post's 'large_url' image, or () without one"""
+     info = post.get("image") or {}
+     large = info.get("large_url")
+     if not large:
+         return ()
+     label = info.get("file_name") or self._filename(large) or large
+     return (("image_large", large, label),)
+
def _attachments(self, post):
for attachment in post["attachments"]:
url = self.request(
@@ -212,10 +221,11 @@ class PatreonExtractor(Extractor):
def _build_file_generators(self, filetypes):
if filetypes is None:
- return (self._images, self._attachments,
- self._postfile, self._content)
+ return (self._images, self._image_large,
+ self._attachments, self._postfile, self._content)
genmap = {
"images" : self._images,
+ "image_large": self._image_large,
"attachments": self._attachments,
"postfile" : self._postfile,
"content" : self._content,
diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py
index bf38a77..22c9487 100644
--- a/gallery_dl/extractor/seiga.py
+++ b/gallery_dl/extractor/seiga.py
@@ -1,16 +1,15 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2020 Mike Fährmann
+# Copyright 2016-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://seiga.nicovideo.jp/"""
+"""Extractors for https://seiga.nicovideo.jp/"""
from .common import Extractor, Message
from .. import text, util, exception
-from ..cache import cache
class SeigaExtractor(Extractor):
@@ -25,7 +24,9 @@ class SeigaExtractor(Extractor):
self.start_image = 0
def items(self):
- self.login()
+ if not self._check_cookies(("user_session",)):
+ raise exception.StopExtraction("'user_session' cookie required")
+
images = iter(self.get_images())
data = next(images)
@@ -45,28 +46,6 @@ class SeigaExtractor(Extractor):
url, method="HEAD", allow_redirects=False, notfound="image")
return response.headers["Location"].replace("/o/", "/priv/", 1)
- def login(self):
- """Login and set necessary cookies"""
- if not self._check_cookies(("user_session",)):
- username, password = self._get_auth_info()
- self._update_cookies(self._login_impl(username, password))
-
- @cache(maxage=7*24*3600, keyarg=1)
- def _login_impl(self, username, password):
- if not username or not password:
- raise exception.AuthenticationError(
- "Username and password required")
-
- self.log.info("Logging in as %s", username)
- url = "https://account.nicovideo.jp/api/v1/login"
- data = {"mail_tel": username, "password": password}
-
- self.request(url, method="POST", data=data)
- if "user_session" not in self.session.cookies:
- raise exception.AuthenticationError()
- del self.session.cookies["nicosid"]
- return self.session.cookies
-
class SeigaUserExtractor(SeigaExtractor):
"""Extractor for images of a user from seiga.nicovideo.jp"""
diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py
index 2c806ad..965391c 100644
--- a/gallery_dl/extractor/skeb.py
+++ b/gallery_dl/extractor/skeb.py
@@ -22,10 +22,11 @@ class SkebExtractor(Extractor):
Extractor.__init__(self, match)
self.user_name = match.group(1)
self.thumbnails = self.config("thumbnails", False)
+ self.sent_requests = self.config("sent-requests", False)
def items(self):
- for post_num in self.posts():
- response, post = self._get_post_data(post_num)
+ for user_name, post_num in self.posts():
+ response, post = self._get_post_data(user_name, post_num)
yield Message.Directory, post
for data in self._get_urls_from_post(response, post):
url = data["file_url"]
@@ -38,24 +39,33 @@ class SkebExtractor(Extractor):
url = "{}/api/users/{}/works".format(self.root, self.user_name)
params = {"role": "creator", "sort": "date", "offset": 0}
headers = {"Referer": self.root, "Authorization": "Bearer null"}
+ do_requests = self.sent_requests
while True:
posts = self.request(url, params=params, headers=headers).json()
for post in posts:
post_num = post["path"].rpartition("/")[2]
+ user_name = post["path"].split("/")[1][1:]
if post["private"]:
- self.log.debug("Skipping %s (private)", post_num)
+ self.log.debug("Skipping @%s/%s (private)",
+ user_name, post_num)
continue
- yield post_num
+ yield user_name, post_num
if len(posts) < 30:
- return
+ if do_requests:
+ params["offset"] = 0
+ params['role'] = "client"
+ do_requests = False
+ continue
+ else:
+ return
params["offset"] += 30
- def _get_post_data(self, post_num):
+ def _get_post_data(self, user_name, post_num):
url = "{}/api/users/{}/works/{}".format(
- self.root, self.user_name, post_num)
+ self.root, user_name, post_num)
headers = {"Referer": self.root, "Authorization": "Bearer null"}
resp = self.request(url, headers=headers).json()
creator = resp["creator"]
@@ -130,7 +140,7 @@ class SkebPostExtractor(SkebExtractor):
self.post_num = match.group(2)
def posts(self):
- return (self.post_num,)
+ return ((self.user_name, self.post_num),)
class SkebUserExtractor(SkebExtractor):
diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py
index 91386e8..557c9fb 100644
--- a/gallery_dl/extractor/slideshare.py
+++ b/gallery_dl/extractor/slideshare.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2021 Mike Fährmann, Leonardo Taccari
+# Copyright 2016-2022 Mike Fährmann, Leonardo Taccari
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,11 +8,12 @@
"""Extractors for https://www.slideshare.net/"""
-from .common import Extractor, Message
+from .common import GalleryExtractor
from .. import text
+import json
-class SlidesharePresentationExtractor(Extractor):
+class SlidesharePresentationExtractor(GalleryExtractor):
"""Extractor for images from a presentation on slideshare.net"""
category = "slideshare"
subcategory = "presentation"
@@ -24,13 +25,36 @@ class SlidesharePresentationExtractor(Extractor):
test = (
(("https://www.slideshare.net"
"/Slideshare/get-started-with-slide-share"), {
- "url": "23685fb9b94b32c77a547d45dc3a82fe7579ea18",
- "content": "2e90a01c6ca225579ebf8f98ab46f97a28a5e45c",
+ "pattern": r"https://image\.slidesharecdn\.com/getstartedwithslide"
+ r"share-150520173821-lva1-app6892/95/get-started-with-s"
+ r"lide-share-\d+-1024\.jpg\?cb=\d+",
+ "count": 19,
+ "content": "2b6a191eab60b3978fdacfecf2da302dd45bc108",
+ "keyword": {
+ "comments": "0",
+ "description": "Get Started with SlideShare - "
+ "A Beginngers Guide for Creators",
+ "likes": r"re:\d{3,}",
+ "presentation": "get-started-with-slide-share",
+ "published": "dt:2015-05-20 00:00:00",
+ "title": "Getting Started With SlideShare",
+ "user": "Slideshare",
+ "views": r"re:\d{7,}",
+ },
}),
- # long title
+ # long title and description
(("https://www.slideshare.net/pragmaticsolutions/warum-sie-nicht-ihren"
"-mitarbeitenden-ndern-sollten-sondern-ihr-managementsystem"), {
"url": "cf70ca99f57f61affab47ebf8583eb564b21e3a7",
+ "keyword": {
+ "title": "Warum Sie nicht Ihren Mitarbeitenden ändern "
+ "sollten, sondern Ihr Managementsystem",
+ "description": "Mitarbeitende verhalten sich mehrheitlich so, "
+ "wie das System es ihnen vorgibt. Welche Voraus"
+ "setzungen es braucht, damit Ihre Mitarbeitende"
+ "n ihr ganzes Herzblut einsetzen, bespricht Fre"
+ "di Schmidli in diesem Referat.",
+ },
}),
# mobile URL
(("https://www.slideshare.net"
@@ -40,48 +64,50 @@ class SlidesharePresentationExtractor(Extractor):
)
def __init__(self, match):
- Extractor.__init__(self, match)
self.user, self.presentation = match.groups()
+ url = "https://www.slideshare.net/{}/{}".format(
+ self.user, self.presentation)
+ GalleryExtractor.__init__(self, match, url)
- def items(self):
- page = self.request("https://www.slideshare.net/" + self.user +
- "/" + self.presentation).text
- data = self.get_job_metadata(page)
- imgs = self.get_image_urls(page)
- data["count"] = len(imgs)
- yield Message.Directory, data
- for data["num"], url in enumerate(imgs, 1):
- yield Message.Url, url, text.nameext_from_url(url, data)
+ def metadata(self, page):
+ extr = text.extract_from(page)
+ descr = extr('', '')
+ published = extr('')
+ comments = extr('content="UserComments:', '"')
+ likes = extr('content="UserLikes:', '"')
+ views = extr('content="UserPageVisits:', '"')
- def get_job_metadata(self, page):
- """Collect metadata for extractor-job"""
- descr, pos = text.extract(
- page, '', '', pos)
- views, pos = text.extract(
- page, '', pos)
- published, pos = text.extract(
- page, '', pos)
- title, pos = text.extract(
- page, '', pos)
- alt_descr, pos = text.extract(
- page, '', pos)
-
- if descr.endswith("…") and alt_descr:
- descr = text.remove_html(alt_descr).strip()
+ if descr.endswith("…"):
+ alt_descr = extr(
+ 'id="slideshow-description-text" class="notranslate">', '