author     Unit 193 <unit193@unit193.net>  2020-11-13 19:17:03 -0500
committer  Unit 193 <unit193@unit193.net>  2020-11-13 19:17:03 -0500
commit     209a3c800871cd68edd2bc7ae661a24ecd496d2d (patch)
tree       cf81c47ab57540b58292295c7d5641e9d2668291 /gallery_dl/extractor
parent     5dc7d6f5902ddaee5223d041d5c10060f0c72430 (diff)

New upstream version 1.15.3 (tag: upstream/1.15.3)
Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--  gallery_dl/extractor/2chan.py            8
-rw-r--r--  gallery_dl/extractor/500px.py           12
-rw-r--r--  gallery_dl/extractor/8kun.py            20
-rw-r--r--  gallery_dl/extractor/__init__.py         2
-rw-r--r--  gallery_dl/extractor/common.py           9
-rw-r--r--  gallery_dl/extractor/deviantart.py       3
-rw-r--r--  gallery_dl/extractor/exhentai.py        27
-rw-r--r--  gallery_dl/extractor/gfycat.py           2
-rw-r--r--  gallery_dl/extractor/hentaifoundry.py   34
-rw-r--r--  gallery_dl/extractor/khinsider.py        7
-rw-r--r--  gallery_dl/extractor/mangoxo.py         11
-rw-r--r--  gallery_dl/extractor/paheal.py           8
-rw-r--r--  gallery_dl/extractor/sankakucomplex.py  79
-rw-r--r--  gallery_dl/extractor/twitter.py         98
-rw-r--r--  gallery_dl/extractor/weasyl.py           3
15 files changed, 237 insertions, 86 deletions
diff --git a/gallery_dl/extractor/2chan.py b/gallery_dl/extractor/2chan.py
index c34cfec..51e461e 100644
--- a/gallery_dl/extractor/2chan.py
+++ b/gallery_dl/extractor/2chan.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2019 Mike Fährmann
+# Copyright 2017-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -21,9 +21,9 @@ class _2chanThreadExtractor(Extractor):
archive_fmt = "{board}_{thread}_{tim}"
url_fmt = "https://{server}.2chan.net/{board}/src/{filename}"
pattern = r"(?:https?://)?([^.]+)\.2chan\.net/([^/]+)/res/(\d+)"
- test = ("http://dec.2chan.net/70/res/947.htm", {
- "url": "c5c12b80b290e224b6758507b3bb952044f4595b",
- "keyword": "4bd22e7a9c3636faecd6ea7082509e8655e10dd0",
+ test = ("http://dec.2chan.net/70/res/11048.htm", {
+ "url": "2ecf919139bd5d915930530b3576d67c388a2a49",
+ "keyword": "8def4ec98a89fd4fff8bbcbae603604dcb4a3bb9",
})
def __init__(self, match):
diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py
index 624b14d..df9941a 100644
--- a/gallery_dl/extractor/500px.py
+++ b/gallery_dl/extractor/500px.py
@@ -109,8 +109,8 @@ class _500pxUserExtractor(_500pxExtractor):
variables = {"username": self.user, "pageSize": 20}
photos = self._request_graphql(
"OtherPhotosQuery", variables,
- "54524abbdc809f8d4e10d37839e8ab2d"
- "3035413688cad9c7fbece13b66637e9d",
+ "018a5e5117bd72bdf28066aad02c4f2d"
+ "8acdf7f6127215d231da60e24080eb1b",
)["user"]["photos"]
while True:
@@ -122,8 +122,8 @@ class _500pxUserExtractor(_500pxExtractor):
variables["cursor"] = photos["pageInfo"]["endCursor"]
photos = self._request_graphql(
"OtherPhotosPaginationContainerQuery", variables,
- "6d31e01104456ce642a2c6fc2f936812"
- "b0f2a65c442d03e1521d769c20efe507",
+ "b4af70d42c71a5e43f0be36ce60dc81e"
+ "9742ebc117cde197350f2b86b5977d98",
)["userByUsername"]["photos"]
@@ -153,7 +153,7 @@ class _500pxGalleryExtractor(_500pxExtractor):
def metadata(self):
user = self._request_graphql(
"ProfileRendererQuery", {"username": self.user_name},
- "4d02ff5c13927a3ac73b3eef306490508bc765956940c31051468cf30402a503",
+ "5a17a9af1830b58b94a912995b7947b24f27f1301c6ea8ab71a9eb1a6a86585b",
)["profile"]
self.user_id = str(user["legacyId"])
@@ -166,7 +166,7 @@ class _500pxGalleryExtractor(_500pxExtractor):
}
gallery = self._request_graphql(
"GalleriesDetailQueryRendererQuery", variables,
- "fd367cacf9bebcdc0620bd749dbd8fc9b0ccbeb54fc76b8b4b95e66a8c0cba49",
+ "fb8bb66d31b58903e2f01ebe66bbe7937b982753be3211855b7bce4e286c1a49",
)["gallery"]
self._photos = gallery["photos"]
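The replaced hex strings are GraphQL persisted-query identifiers: the server stores the query text and clients reference it by the SHA-256 hash of that text, so every change to the frontend's queries invalidates the old hashes. A hedged sketch of what such a request typically looks like; the endpoint and request envelope below are assumptions for illustration, not taken from this diff:

    import requests

    def request_graphql(opname, variables, query_hash):
        # Apollo-style persisted-query envelope (assumed)
        return requests.post(
            "https://api.500px.com/graphql",      # assumed endpoint
            json={
                "operationName": opname,
                "variables": variables,
                "extensions": {"persistedQuery": {
                    "version": 1, "sha256Hash": query_hash}},
            },
        ).json()["data"]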
diff --git a/gallery_dl/extractor/8kun.py b/gallery_dl/extractor/8kun.py
index 47fe672..e55bb08 100644
--- a/gallery_dl/extractor/8kun.py
+++ b/gallery_dl/extractor/8kun.py
@@ -20,10 +20,17 @@ class _8kunThreadExtractor(Extractor):
filename_fmt = "{time}{num:?-//} {filename}.{extension}"
archive_fmt = "{board}_{thread}_{tim}"
pattern = r"(?:https?://)?8kun\.top/([^/]+)/res/(\d+)"
- test = ("https://8kun.top/test/res/65248.html", {
- "pattern": r"https://media\.8kun\.top/file_store/\w{64}\.\w+",
- "count": ">= 8",
- })
+ test = (
+ ("https://8kun.top/test/res/65248.html", {
+ "pattern": r"https://media\.8kun\.top/file_store/\w{64}\.\w+",
+ "count": ">= 8",
+ }),
+ # old-style file URLs (#1101)
+ ("https://8kun.top/d/res/13258.html", {
+ "pattern": r"https://media\.8kun\.top/d/src/\d+(-\d)?\.\w+",
+ "range": "1-20",
+ }),
+ )
def __init__(self, match):
Extractor.__init__(self, match)
@@ -56,7 +63,10 @@ class _8kunThreadExtractor(Extractor):
def _process(post, data):
post.update(data)
post["extension"] = post["ext"][1:]
- url = "https://media.8kun.top/file_store/" + post["tim"] + post["ext"]
+ tim = post["tim"]
+ url = ("https://media.8kun.top/" +
+ ("file_store/" if len(tim) > 16 else post["board"] + "/src/") +
+ tim + post["ext"])
return Message.Url, url, post
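The new URL logic keys off the length of "tim": new-style uploads have 64-character hash filenames served from /file_store/, while old-style posts (#1101) keep short numeric timestamps under the board's own /src/ directory. A standalone sketch of the branch, assuming post dicts shaped like the 8kun JSON API output:

    def build_file_url(post):
        tim = post["tim"]
        if len(tim) > 16:
            path = "file_store/"              # new-style: 64-char hash name
        else:
            path = post["board"] + "/src/"    # old-style: per-board timestamp
        return "https://media.8kun.top/" + path + tim + post["ext"]

    # build_file_url({"tim": "1510144378", "ext": ".png", "board": "d"})
    # -> "https://media.8kun.top/d/src/1510144378.png"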
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index b8e39bc..d0c327a 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -185,6 +185,8 @@ def _list_classes():
module = importlib.import_module("."+module_name, __package__)
yield from add_module(module)
+ globals()["_list_classes"] = lambda : _cache
+
def _get_classes(module):
"""Return a list of all extractor classes in a module"""
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 357deac..5efea4a 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -518,6 +518,15 @@ class SharedConfigMixin():
), key, default,
)
+ def config_accumulate(self, key):
+ values = config.accumulate(self._cfgpath, key)
+
+ conf = config.get(("extractor",), self.basecategory)
+ if conf:
+ values[:0] = config.accumulate((self.subcategory,), key, conf=conf)
+
+ return values
+
def generate_extractors(extractor_data, symtable, classes):
"""Dynamically generate Extractor classes"""
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index e40ec51..456a173 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -165,11 +165,12 @@ class DeviantartExtractor(Extractor):
# filename metadata
alphabet = "0123456789abcdefghijklmnopqrstuvwxyz"
+ deviation["index_base36"] = util.bencode(deviation["index"], alphabet)
sub = re.compile(r"\W").sub
deviation["filename"] = "".join((
sub("_", deviation["title"].lower()), "_by_",
sub("_", deviation["author"]["username"].lower()), "-d",
- util.bencode(deviation["index"], alphabet),
+ deviation["index_base36"],
))
@staticmethod
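util.bencode converts the numeric deviation index into the base-36 string DeviantArt uses in its "-d…" filename suffixes; storing it as index_base36 also makes the value available to user-defined filename format strings. A self-contained sketch of that encoding (assumed to match util.bencode for this alphabet):

    ALPHABET = "0123456789abcdefghijklmnopqrstuvwxyz"

    def base36(num):
        digits = ""
        while num:
            num, rem = divmod(num, 36)
            digits = ALPHABET[rem] + digits
        return digits or "0"

    # base36(1000) -> "rs"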
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 06b5ba2..4ead3fb 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -347,24 +347,33 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
@staticmethod
def _parse_image_info(url):
- parts = url.split("/")[4].split("-")
+ for part in url.split("/")[4:]:
+ try:
+ _, size, width, height, _ = part.split("-")
+ break
+ except ValueError:
+ pass
+ else:
+ size = width = height = 0
+
return {
- "width": text.parse_int(parts[2]),
- "height": text.parse_int(parts[3]),
- "size": text.parse_int(parts[1]),
- "cost": 1,
+ "cost" : 1,
+ "size" : text.parse_int(size),
+ "width" : text.parse_int(width),
+ "height": text.parse_int(height),
}
@staticmethod
def _parse_original_info(info):
parts = info.lstrip().split(" ")
size = text.parse_bytes(parts[3] + parts[4][0])
+
return {
- "width": text.parse_int(parts[0]),
- "height": text.parse_int(parts[2]),
- "size": size,
# 1 initial point + 1 per 0.1 MB
- "cost": 1 + math.ceil(size / 100000)
+ "cost" : 1 + math.ceil(size / 100000),
+ "size" : size,
+ "width" : text.parse_int(parts[0]),
+ "height": text.parse_int(parts[2]),
}
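The rewrite replaces a fixed path index with a scan: every path component is tested against the five-field "<token>-<size>-<width>-<height>-<token>" shape, and Python's for/else runs the else branch only if the loop never hit break, i.e. no component matched. A sketch against a hypothetical URL:

    url = "https://host.example/om/a1b2-123456-1280-1810-jpg/file.jpg"

    for part in url.split("/")[4:]:   # skip scheme, host, first path segment
        try:
            _, size, width, height, _ = part.split("-")
            break                     # found the info token
        except ValueError:
            pass                      # wrong field count, keep scanning
    else:
        size = width = height = 0     # no component matched

    # size="123456", width="1280", height="1810"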
diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py
index 493c1d2..f878dbd 100644
--- a/gallery_dl/extractor/gfycat.py
+++ b/gallery_dl/extractor/gfycat.py
@@ -55,7 +55,7 @@ class GfycatExtractor(Extractor):
class GfycatUserExtractor(GfycatExtractor):
"""Extractor for gfycat user profiles"""
subcategory = "user"
- directory_fmt = ("{category}", "{userName}")
+ directory_fmt = ("{category}", "{username|userName}")
pattern = r"(?:https?://)?gfycat\.com/@([^/?#]+)"
test = ("https://gfycat.com/@gretta", {
"pattern": r"https://giant\.gfycat\.com/[A-Za-z]+\.mp4",
diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py
index 0be528d..691cefb 100644
--- a/gallery_dl/extractor/hentaifoundry.py
+++ b/gallery_dl/extractor/hentaifoundry.py
@@ -11,7 +11,7 @@
from .common import Extractor, Message
from .. import text, util
-BASE_PATTERN = r"(?:https?://)?(?:www\.)?hentai-foundry\.com"
+BASE_PATTERN = r"(https?://)?(?:www\.)?hentai-foundry\.com"
class HentaifoundryExtractor(Extractor):
@@ -20,12 +20,14 @@ class HentaifoundryExtractor(Extractor):
directory_fmt = ("{category}", "{user}")
filename_fmt = "{category}_{index}_{title}.{extension}"
archive_fmt = "{index}"
+ cookiedomain = "www.hentai-foundry.com"
root = "https://www.hentai-foundry.com"
per_page = 25
def __init__(self, match):
+ self.root = (match.group(1) or "https://") + "www.hentai-foundry.com"
+ self.user = match.group(2)
Extractor.__init__(self, match)
- self.user = match.group(1)
self.page_url = ""
self.start_post = 0
self.start_page = 1
@@ -75,7 +77,8 @@ class HentaifoundryExtractor(Extractor):
"width" : text.parse_int(extr('width="', '"')),
"height" : text.parse_int(extr('height="', '"')),
"index" : text.parse_int(path.rsplit("/", 2)[1]),
- "src" : "https:" + text.unescape(extr('src="', '"')),
+ "src" : text.urljoin(self.root, text.unescape(extr(
+ 'src="', '"'))),
"description": text.unescape(text.remove_html(extr(
'>Description</div>', '</section>')
.replace("\r\n", "\n"), "", "")),
@@ -121,7 +124,13 @@ class HentaifoundryExtractor(Extractor):
def _init_site_filters(self):
"""Set site-internal filters to show all images"""
url = self.root + "/?enterAgree=1"
- response = self.request(url, method="HEAD")
+ self.request(url, method="HEAD")
+
+ csrf_token = self.session.cookies.get(
+ "YII_CSRF_TOKEN", domain=self.cookiedomain)
+ if not csrf_token:
+ self.log.warning("Unable to update site content filters")
+ return
url = self.root + "/site/filters"
data = {
@@ -148,7 +157,7 @@ class HentaifoundryExtractor(Extractor):
"filter_order" : "date_new",
"filter_type" : "0",
"YII_CSRF_TOKEN" : text.unquote(text.extract(
- response.cookies["YII_CSRF_TOKEN"], "%22", "%22")[0]),
+ csrf_token, "%22", "%22")[0]),
}
self.request(url, method="POST", data=data)
@@ -235,7 +244,7 @@ class HentaifoundryRecentExtractor(HentaifoundryExtractor):
directory_fmt = ("{category}", "Recent Pictures", "{date}")
archive_fmt = "r_{index}"
pattern = BASE_PATTERN + r"/pictures/recent/(\d\d\d\d-\d\d-\d\d)"
- test = ("http://www.hentai-foundry.com/pictures/recent/2018-09-20", {
+ test = ("https://www.hentai-foundry.com/pictures/recent/2018-09-20", {
"pattern": r"https://pictures.hentai-foundry.com/[^/]/[^/?#]+/\d+/",
"range": "20-30",
})
@@ -254,7 +263,7 @@ class HentaifoundryPopularExtractor(HentaifoundryExtractor):
directory_fmt = ("{category}", "Popular Pictures")
archive_fmt = "p_{index}"
pattern = BASE_PATTERN + r"/pictures/popular()"
- test = ("http://www.hentai-foundry.com/pictures/popular", {
+ test = ("https://www.hentai-foundry.com/pictures/popular", {
"pattern": r"https://pictures.hentai-foundry.com/[^/]/[^/?#]+/\d+/",
"range": "20-30",
})
@@ -267,7 +276,7 @@ class HentaifoundryPopularExtractor(HentaifoundryExtractor):
class HentaifoundryImageExtractor(HentaifoundryExtractor):
"""Extractor for a single image from hentaifoundry.com"""
subcategory = "image"
- pattern = (r"(?:https?://)?(?:www\.|pictures\.)?hentai-foundry\.com"
+ pattern = (r"(https?://)?(?:www\.|pictures\.)?hentai-foundry\.com"
r"/(?:pictures/user|[^/?#])/([^/?#]+)/(\d+)")
test = (
(("https://www.hentai-foundry.com"
@@ -290,7 +299,10 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor):
"width" : 495,
},
}),
- ("https://www.hentai-foundry.com/pictures/user/Tenpura/340853/"),
+ ("http://www.hentai-foundry.com/pictures/user/Tenpura/407501/", {
+ "pattern": "http://pictures.hentai-foundry.com/t/Tenpura/407501/",
+ }),
+ ("https://www.hentai-foundry.com/pictures/user/Tenpura/407501/"),
("https://pictures.hentai-foundry.com"
"/t/Tenpura/407501/Tenpura-407501-shimakaze.png"),
)
@@ -298,7 +310,7 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor):
def __init__(self, match):
HentaifoundryExtractor.__init__(self, match)
- self.index = match.group(2)
+ self.index = match.group(3)
def items(self):
post_url = "{}/pictures/user/{}/{}/?enterAgree=1".format(
@@ -359,7 +371,7 @@ class HentaifoundryStoryExtractor(HentaifoundryExtractor):
def __init__(self, match):
HentaifoundryExtractor.__init__(self, match)
- self.index = match.group(2)
+ self.index = match.group(3)
def items(self):
story_url = "{}/stories/user/{}/{}/x?enterAgree=1".format(
diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py
index 6ddf0e8..679b5a0 100644
--- a/gallery_dl/extractor/khinsider.py
+++ b/gallery_dl/extractor/khinsider.py
@@ -23,9 +23,10 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
root = "https://downloads.khinsider.com"
test = (("https://downloads.khinsider.com"
"/game-soundtracks/album/horizon-riders-wii"), {
- "pattern": r"https?://vgmdownloads.com/soundtracks/horizon-riders-wii/"
- r"[^/]+/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack.mp3",
- "keyword": "5b2c35cce638c326cab2a4f7a79f245d008d62ff",
+ "pattern": r"https?://vgm(site|downloads).com"
+ r"/soundtracks/horizon-riders-wii/[^/]+"
+ r"/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack.mp3",
+ "keyword": "12ca70e0709ea15250e577ea388cf2b5b0c65630",
})
def __init__(self, match):
diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py
index 5743498..344dd56 100644
--- a/gallery_dl/extractor/mangoxo.py
+++ b/gallery_dl/extractor/mangoxo.py
@@ -86,7 +86,7 @@ class MangoxoAlbumExtractor(MangoxoExtractor):
"album": {
"id": "lzVOv1Q9",
"name": "re:池永康晟 Ikenaga Yasunari 透出古朴",
- "date": "2019.3.22 14:42",
+ "date": "dt:2019-03-22 14:42:00",
"description": str,
},
"num": int,
@@ -113,23 +113,24 @@ class MangoxoAlbumExtractor(MangoxoExtractor):
def metadata(self, page):
"""Return general metadata"""
title, pos = text.extract(page, '<title>', '</title>')
- count, pos = text.extract(page, 'id="pic-count">', '<', pos)
- cover, pos = text.extract(page, ' src="', '"', pos)
+ _ , pos = text.extract(page, 'class="desc"', '', pos)
cid , pos = text.extract(page, '//www.mangoxo.com/channel/', '"', pos)
cname, pos = text.extract(page, '>', '<', pos)
+ count, pos = text.extract(page, 'id="pic-count">', '<', pos)
+ cover, pos = text.extract(page, ' src="', '"', pos)
date , pos = text.extract(page, '</i>', '<', pos)
descr, pos = text.extract(page, '<pre>', '</pre>', pos)
return {
"channel": {
"id": cid,
- "name": text.unescape(cname),
+ "name": text.unescape(cname.strip()),
"cover": cover,
},
"album": {
"id": self.album_id,
"name": text.unescape(title),
- "date": date.strip(),
+ "date": text.parse_datetime(date.strip(), "%Y.%m.%d %H:%M"),
"description": text.unescape(descr),
},
"count": text.parse_int(count),
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index 57521d6..e0b0496 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -21,6 +21,9 @@ class PahealExtractor(SharedConfigMixin, Extractor):
root = "https://rule34.paheal.net"
def items(self):
+ self.session.cookies.set(
+ "ui-tnc-agreed", "true", domain="rule34.paheal.net")
+
yield Message.Version, 1
yield Message.Directory, self.get_metadata()
@@ -65,7 +68,7 @@ class PahealTagExtractor(PahealExtractor):
page = self.request(url).text
for post in text.extract_iter(
- page, '<img id="thumb_', '>Image Only<'):
+ page, '<img id="thumb_', 'Only</a>'):
yield self._extract_data(post)
if ">Next<" not in page:
@@ -79,7 +82,8 @@ class PahealTagExtractor(PahealExtractor):
md5 , pos = text.extract(post, '/_thumbs/', '/', pos)
url , pos = text.extract(post, '<a href="', '"', pos)
- tags, dimensions, size, _ = data.split(" // ")
+ tags, data, date = data.split("\n")
+ dimensions, size, ext = data.split(" // ")
width, _, height = dimensions.partition("x")
return {
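The markup change means a thumbnail's metadata now arrives as three newline-separated lines (tags, file info, date) instead of a single " // "-joined string, and the file info gained an extension field. A sketch of the new parse against a hypothetical value; the middle variable is renamed info here for readability, where the diff reuses the name data:

    data = "tag_a tag_b\n800x600 // 123.4KB // jpg\nJanuary 01, 2020"

    tags, info, date = data.split("\n")
    dimensions, size, ext = info.split(" // ")
    width, _, height = dimensions.partition("x")
    # tags="tag_a tag_b", width="800", height="600", size="123.4KB", ext="jpg"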
diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py
index f6ad327..972750c 100644
--- a/gallery_dl/extractor/sankakucomplex.py
+++ b/gallery_dl/extractor/sankakucomplex.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,7 +9,7 @@
"""Extractors for https://www.sankakucomplex.com/"""
from .common import Extractor, Message
-from .. import text
+from .. import text, util
import re
@@ -40,6 +40,21 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
"url": "a1e249173fd6c899a8134fcfbd9c925588a63f7c",
"keyword": "e78fcc23c2711befc0969a45ea5082a29efccf68",
}),
+ # videos (#308)
+ (("https://www.sankakucomplex.com/2019/06/11"
+ "/darling-ol-goddess-shows-off-her-plump-lower-area/"), {
+ "pattern": r"/wp-content/uploads/2019/06/[^/]+\d\.mp4",
+ "range": "26-",
+ "count": 5,
+ }),
+ # youtube embeds (#308)
+ (("https://www.sankakucomplex.com/2015/02/12"
+ "/snow-miku-2015-live-magical-indeed/"), {
+ "options": (("embeds", True),),
+ "pattern": r"https://www.youtube.com/embed/",
+ "range": "2-",
+ "count": 2,
+ }),
)
def items(self):
@@ -53,38 +68,44 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
"date" : text.parse_datetime(
extr('property="article:published_time" content="', '"')),
}
- imgs = self.images(extr)
- data["count"] = len(imgs)
+ content = extr('<div class="entry-content">', '</article>')
data["tags"] = text.split_html(extr('="meta-tags">', '</div>'))[::2]
- yield Message.Version, 1
- yield Message.Directory, data
- for img in imgs:
- img.update(data)
- yield Message.Url, img["url"], img
+ files = self._extract_images(content)
+ if self.config("videos", True):
+ files += self._extract_videos(content)
+ if self.config("embeds", False):
+ files += self._extract_embeds(content)
+ data["count"] = len(files)
- def images(self, extr):
- num = 0
- imgs = []
- urls = set()
- orig = re.compile(r"-\d+x\d+\.")
-
- extr('<div class="entry-content">', '')
- while True:
- url = extr('data-lazy-src="', '"')
- if not url:
- return imgs
- if url in urls:
- continue
+ yield Message.Directory, data
+ for num, url in enumerate(files, 1):
+ file = text.nameext_from_url(url)
if url[0] == "/":
url = text.urljoin(self.root, url)
- url = orig.sub(".", url)
- num += 1
- imgs.append(text.nameext_from_url(url, {
- "url" : url,
- "num" : num,
- }))
- urls.add(url)
+ file["url"] = url
+ file["num"] = num
+ file.update(data)
+ yield Message.Url, url, file
+
+ @staticmethod
+ def _extract_images(content):
+ orig_sub = re.compile(r"-\d+x\d+\.").sub
+ return [
+ orig_sub(".", url) for url in
+ util.unique(text.extract_iter(content, 'data-lazy-src="', '"'))
+ ]
+
+ @staticmethod
+ def _extract_videos(content):
+ return re.findall(r"<source [^>]*src=[\"']([^\"']+)", content)
+
+ @staticmethod
+ def _extract_embeds(content):
+ return [
+ "ytdl:" + url for url in
+ re.findall(r"<iframe [^>]*src=[\"']([^\"']+)", content)
+ ]
class SankakucomplexTagExtractor(SankakucomplexExtractor):
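_extract_images reproduces the old loop in two steps: util.unique drops repeated data-lazy-src URLs while keeping their order, and the regex strips WordPress thumbnail size suffixes so the original upload is fetched. A quick check of that suffix regex:

    import re

    orig_sub = re.compile(r"-\d+x\d+\.").sub
    orig_sub(".", "https://example.org/uploads/photo-800x600.jpg")
    # -> "https://example.org/uploads/photo.jpg"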
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 06973b2..fe0b3c5 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -11,7 +11,7 @@
from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache
-
+import json
BASE_PATTERN = (
r"(?:https?://)?(?:www\.|mobile\.)?"
@@ -78,8 +78,8 @@ class TwitterExtractor(Extractor):
def _extract_media(self, tweet, files):
for media in tweet["extended_entities"]["media"]:
- width = media["original_info"].get("width", 0),
- height = media["original_info"].get("height", 0),
+ width = media["original_info"].get("width", 0)
+ height = media["original_info"].get("height", 0)
if "video_info" in media:
if self.videos == "ytdl":
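The dropped trailing commas were a real bug: a trailing comma after an assignment's right-hand side builds a 1-tuple, so width and height were tuples instead of ints.

    width = {"width": 640}.get("width", 0),   # -> (640,)  a 1-tuple
    width = {"width": 640}.get("width", 0)    # -> 640     the int itself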
@@ -321,6 +321,35 @@ class TwitterBookmarkExtractor(TwitterExtractor):
return TwitterAPI(self).timeline_bookmark()
+class TwitterListExtractor(TwitterExtractor):
+ """Extractor for Twitter lists"""
+ subcategory = "list"
+ pattern = BASE_PATTERN + r"/i/lists/(\d+)/?$"
+ test = ("https://twitter.com/i/lists/784214683683127296", {
+ "range": "1-40",
+ "count": 40,
+ "archive": False,
+ })
+
+ def tweets(self):
+ return TwitterAPI(self).timeline_list(self.user)
+
+
+class TwitterListMembersExtractor(TwitterExtractor):
+ """Extractor for members of a Twitter list"""
+ subcategory = "list-members"
+ pattern = BASE_PATTERN + r"/i/lists/(\d+)/members"
+ test = ("https://twitter.com/i/lists/784214683683127296/members",)
+
+ def items(self):
+ self.login()
+ for user in TwitterAPI(self).list_members(self.user):
+ user["_extractor"] = TwitterTimelineExtractor
+ url = "{}/intent/user?user_id={}".format(
+ self.root, user["rest_id"])
+ yield Message.Queue, url, user
+
+
class TwitterSearchExtractor(TwitterExtractor):
"""Extractor for all images from a search timeline"""
subcategory = "search"
@@ -399,7 +428,7 @@ class TwitterTweetExtractor(TwitterExtractor):
# Twitter card (#1005)
("https://twitter.com/billboard/status/1306599586602135555", {
"options": (("cards", True),),
- "pattern": r"https://pbs.twimg.com/card_img/1317274761030856707/",
+ "pattern": r"https://pbs.twimg.com/card_img/\d+/",
}),
# original retweets (#1026)
("https://twitter.com/jessica_3978/status/1296304589591810048", {
@@ -511,6 +540,13 @@ class TwitterAPI():
endpoint = "2/timeline/bookmark.json"
return self._pagination(endpoint)
+ def timeline_list(self, list_id):
+ endpoint = "2/timeline/list.json"
+ params = self.params.copy()
+ params["list_id"] = list_id
+ params["ranking_mode"] = "reverse_chronological"
+ return self._pagination(endpoint, params)
+
def search(self, query):
endpoint = "2/search/adaptive.json"
params = self.params.copy()
@@ -522,12 +558,29 @@ class TwitterAPI():
return self._pagination(
endpoint, params, "sq-I-t-", "sq-cursor-bottom")
- def user_by_screen_name(self, screen_name):
- endpoint = "graphql/-xfUfZsnR_zqjFd-IfrN5A/UserByScreenName"
- params = {
- "variables": '{"screen_name":"' + screen_name + '"'
- ',"withHighlightedLabel":true}'
+ def list_members(self, list_id):
+ endpoint = "graphql/M74V2EwlxxVYGB4DbyAphQ/ListMembers"
+ variables = {
+ "listId": list_id,
+ "count" : 20,
+ "withTweetResult": False,
+ "withUserResult" : False,
}
+ return self._pagination_members(endpoint, variables)
+
+ def list_by_rest_id(self, list_id):
+ endpoint = "graphql/LXXTUytSX1QY-2p8Xp9BFA/ListByRestId"
+ params = {"variables": '{"listId":"' + list_id + '"'
+ ',"withUserResult":false}'}
+ try:
+ return self._call(endpoint, params)["data"]["list"]
+ except KeyError:
+ raise exception.NotFoundError("list")
+
+ def user_by_screen_name(self, screen_name):
+ endpoint = "graphql/jMaTS-_Ea8vh9rpKggJbCQ/UserByScreenName"
+ params = {"variables": '{"screen_name":"' + screen_name + '"'
+ ',"withHighlightedLabel":true}'}
try:
return self._call(endpoint, params)["data"]["user"]
except KeyError:
@@ -627,3 +680,30 @@ class TwitterAPI():
if not cursor or not tweet:
return
params["cursor"] = cursor
+
+ def _pagination_members(self, endpoint, variables):
+ while True:
+ cursor = entry = stop = None
+ params = {"variables": json.dumps(variables)}
+ data = self._call(endpoint, params)
+
+ try:
+ instructions = (data["data"]["list"]["members_timeline"]
+ ["timeline"]["instructions"])
+ except KeyError:
+ raise exception.AuthorizationError()
+
+ for instr in instructions:
+ if instr["type"] == "TimelineAddEntries":
+ for entry in instr["entries"]:
+ if entry["entryId"].startswith("user-"):
+ yield entry["content"]["itemContent"]["user"]
+ elif entry["entryId"].startswith("cursor-bottom-"):
+ cursor = entry["content"]["value"]
+ elif instr["type"] == "TimelineTerminateTimeline":
+ if instr["direction"] == "Bottom":
+ stop = True
+
+ if stop or not cursor or not entry:
+ return
+ variables["cursor"] = cursor
diff --git a/gallery_dl/extractor/weasyl.py b/gallery_dl/extractor/weasyl.py
index 6799784..1dd5b09 100644
--- a/gallery_dl/extractor/weasyl.py
+++ b/gallery_dl/extractor/weasyl.py
@@ -226,7 +226,6 @@ class WeasylFavoriteExtractor(WeasylExtractor):
if not owner_login:
owner_login = text.extract(page, '<a href="/~', '"')[0]
- yield Message.Directory, {"owner_login": owner_login}
for submitid in text.extract_iter(page, "/submissions/", "/", pos):
if submitid == lastid:
@@ -234,6 +233,8 @@ class WeasylFavoriteExtractor(WeasylExtractor):
lastid = submitid
submission = self.request_submission(submitid)
if self.populate_submission(submission):
+ submission["user"] = owner_login
+ yield Message.Directory, submission
yield Message.Url, submission["url"], submission
if "&amp;nextid=" not in page: