Diffstat (limited to 'gallery_dl')
-rw-r--r--  gallery_dl/__init__.py                  13
-rw-r--r--  gallery_dl/cache.py                     39
-rw-r--r--  gallery_dl/cloudflare.py                 4
-rw-r--r--  gallery_dl/downloader/http.py            6
-rw-r--r--  gallery_dl/extractor/3dbooru.py          2
-rw-r--r--  gallery_dl/extractor/8kun.py            91
-rw-r--r--  gallery_dl/extractor/8muses.py           4
-rw-r--r--  gallery_dl/extractor/__init__.py         7
-rw-r--r--  gallery_dl/extractor/bcy.py            188
-rw-r--r--  gallery_dl/extractor/blogger.py         48
-rw-r--r--  gallery_dl/extractor/booru.py           42
-rw-r--r--  gallery_dl/extractor/common.py          39
-rw-r--r--  gallery_dl/extractor/danbooru.py       176
-rw-r--r--  gallery_dl/extractor/deviantart.py      37
-rw-r--r--  gallery_dl/extractor/e621.py           214
-rw-r--r--  gallery_dl/extractor/flickr.py           3
-rw-r--r--  gallery_dl/extractor/furaffinity.py    235
-rw-r--r--  gallery_dl/extractor/hentaifoundry.py    2
-rw-r--r--  gallery_dl/extractor/hentaihand.py     134
-rw-r--r--  gallery_dl/extractor/hentainexus.py      2
-rw-r--r--  gallery_dl/extractor/hiperdex.py       137
-rw-r--r--  gallery_dl/extractor/hitomi.py         146
-rw-r--r--  gallery_dl/extractor/imgbb.py           12
-rw-r--r--  gallery_dl/extractor/imgur.py            8
-rw-r--r--  gallery_dl/extractor/instagram.py       14
-rw-r--r--  gallery_dl/extractor/issuu.py            2
-rw-r--r--  gallery_dl/extractor/kabeuchi.py        92
-rw-r--r--  gallery_dl/extractor/khinsider.py       60
-rw-r--r--  gallery_dl/extractor/kissmanga.py        2
-rw-r--r--  gallery_dl/extractor/luscious.py         4
-rw-r--r--  gallery_dl/extractor/mangareader.py      4
-rw-r--r--  gallery_dl/extractor/mangoxo.py         35
-rw-r--r--  gallery_dl/extractor/newgrounds.py      84
-rw-r--r--  gallery_dl/extractor/nozomi.py           4
-rw-r--r--  gallery_dl/extractor/oauth.py           14
-rw-r--r--  gallery_dl/extractor/paheal.py           4
-rw-r--r--  gallery_dl/extractor/patreon.py         82
-rw-r--r--  gallery_dl/extractor/piczel.py          62
-rw-r--r--  gallery_dl/extractor/pixiv.py           84
-rw-r--r--  gallery_dl/extractor/pururin.py          6
-rw-r--r--  gallery_dl/extractor/realbooru.py        2
-rw-r--r--  gallery_dl/extractor/reddit.py         120
-rw-r--r--  gallery_dl/extractor/sexcom.py          40
-rw-r--r--  gallery_dl/extractor/tsumino.py          4
-rw-r--r--  gallery_dl/extractor/tumblr.py           2
-rw-r--r--  gallery_dl/extractor/twitter.py        118
-rw-r--r--  gallery_dl/extractor/vsco.py            16
-rw-r--r--  gallery_dl/extractor/wallhaven.py        6
-rw-r--r--  gallery_dl/extractor/weibo.py            6
-rw-r--r--  gallery_dl/extractor/xhamster.py         4
-rw-r--r--  gallery_dl/extractor/yaplog.py         128
-rw-r--r--  gallery_dl/job.py                        9
-rw-r--r--  gallery_dl/output.py                    48
-rw-r--r--  gallery_dl/postprocessor/ugoira.py      14
-rw-r--r--  gallery_dl/text.py                       6
-rw-r--r--  gallery_dl/util.py                     254
-rw-r--r--  gallery_dl/version.py                    2
57 files changed, 2153 insertions, 768 deletions
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py
index ffaed3d..6fba5e2 100644
--- a/gallery_dl/__init__.py
+++ b/gallery_dl/__init__.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2019 Mike Fährmann
+# Copyright 2014-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -129,17 +129,8 @@ def main():
for opts in args.options:
config.set(*opts)
- # stream logging handler
- output.configure_logging_handler(
- "log", logging.getLogger().handlers[0])
-
- # file logging handler
- handler = output.setup_logging_handler(
- "logfile", lvl=args.loglevel)
- if handler:
- logging.getLogger().addHandler(handler)
-
# loglevels
+ output.configure_logging(args.loglevel)
if args.loglevel >= logging.ERROR:
config.set(("output",), "mode", "null")
elif args.loglevel <= logging.DEBUG:
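Note: the two handler-setup blocks removed above are folded into a single
output.configure_logging() call. A minimal sketch of what such a helper could
look like, inferred only from this call site (the real output.py change is not
shown here and also handles the "log" and "logfile" config options):

    import logging

    def configure_logging(loglevel):
        root = logging.getLogger()
        # adjust the stream handler installed during startup
        if root.handlers:
            root.handlers[0].setLevel(loglevel)
        # a file handler configured via the "logfile" option
        # would be created and attached here as well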
diff --git a/gallery_dl/cache.py b/gallery_dl/cache.py
index c48b53f..6cde65d 100644
--- a/gallery_dl/cache.py
+++ b/gallery_dl/cache.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2019 Mike Fährmann
+# Copyright 2016-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -96,12 +96,12 @@ class DatabaseCacheDecorator():
# database lookup
fullkey = "%s-%s" % (self.key, key)
- cursor = self.cursor()
- try:
- cursor.execute("BEGIN EXCLUSIVE")
- except sqlite3.OperationalError:
- pass # Silently swallow exception - workaround for Python 3.6
- try:
+ with self.database() as db:
+ cursor = db.cursor()
+ try:
+ cursor.execute("BEGIN EXCLUSIVE")
+ except sqlite3.OperationalError:
+ pass # Silently swallow exception - workaround for Python 3.6
cursor.execute(
"SELECT value, expires FROM data WHERE key=? LIMIT 1",
(fullkey,),
@@ -118,37 +118,38 @@ class DatabaseCacheDecorator():
"INSERT OR REPLACE INTO data VALUES (?,?,?)",
(fullkey, pickle.dumps(value), expires),
)
- finally:
- self.db.commit()
+
self.cache[key] = value, expires
return value
def update(self, key, value):
expires = int(time.time()) + self.maxage
self.cache[key] = value, expires
- self.cursor().execute(
- "INSERT OR REPLACE INTO data VALUES (?,?,?)",
- ("%s-%s" % (self.key, key), pickle.dumps(value), expires),
- )
+ with self.database() as db:
+ db.execute(
+ "INSERT OR REPLACE INTO data VALUES (?,?,?)",
+ ("%s-%s" % (self.key, key), pickle.dumps(value), expires),
+ )
def invalidate(self, key):
try:
del self.cache[key]
except KeyError:
pass
- self.cursor().execute(
- "DELETE FROM data WHERE key=? LIMIT 1",
- ("%s-%s" % (self.key, key),),
- )
+ with self.database() as db:
+ db.execute(
+ "DELETE FROM data WHERE key=?",
+ ("%s-%s" % (self.key, key),),
+ )
- def cursor(self):
+ def database(self):
if self._init:
self.db.execute(
"CREATE TABLE IF NOT EXISTS data "
"(key TEXT PRIMARY KEY, value TEXT, expires INTEGER)"
)
DatabaseCacheDecorator._init = False
- return self.db.cursor()
+ return self.db
def memcache(maxage=None, keyarg=None):
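Note: returning the connection itself (and renaming cursor() to database())
lets callers lean on sqlite3's context-manager protocol: "with db:" opens a
transaction scope that commits on a clean exit and rolls back on an exception,
replacing the old manual self.db.commit() in a finally block. Self-contained
illustration:

    import sqlite3

    db = sqlite3.connect(":memory:")
    db.execute("CREATE TABLE data "
               "(key TEXT PRIMARY KEY, value TEXT, expires INTEGER)")
    with db:  # commits on success, rolls back if the block raises
        db.execute("INSERT OR REPLACE INTO data VALUES (?,?,?)",
                   ("cache-key", "value", 0))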
diff --git a/gallery_dl/cloudflare.py b/gallery_dl/cloudflare.py
index 6e23c83..6ba5480 100644
--- a/gallery_dl/cloudflare.py
+++ b/gallery_dl/cloudflare.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2019 Mike Fährmann
+# Copyright 2015-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -38,7 +38,7 @@ def solve_challenge(session, response, kwargs):
params = cf_kwargs["data"] = collections.OrderedDict()
page = response.text
- url = root + text.extract(page, 'action="', '"')[0]
+ url = root + text.unescape(text.extract(page, 'action="', '"')[0])
params["r"] = text.extract(page, 'name="r" value="', '"')[0]
params["jschl_vc"] = text.extract(page, 'name="jschl_vc" value="', '"')[0]
params["pass"] = text.extract(page, 'name="pass" value="', '"')[0]
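Note: the form's action attribute is HTML-escaped in the page source, so
literal ampersands in its query string arrive as "&amp;"; without
text.unescape() the solved challenge would be submitted to a mangled URL.
Reduced example (markup invented for illustration):

    from gallery_dl import text

    root = "https://example.org"
    page = '<form action="/cf?__cf_chl_jschl_tk__=tk&amp;kind=jschl">'
    path = text.extract(page, 'action="', '"')[0]
    url = root + text.unescape(path)
    # url == "https://example.org/cf?__cf_chl_jschl_tk__=tk&kind=jschl"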
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index 9cd2aa6..844e422 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -193,6 +193,9 @@ class HttpDownloader(DownloaderBase):
mtype = response.headers.get("Content-Type", "image/jpeg")
mtype = mtype.partition(";")[0]
+ if "/" not in mtype:
+ mtype = "image/" + mtype
+
if mtype in MIMETYPE_MAP:
return MIMETYPE_MAP[mtype]
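Note: some servers send a bare subtype such as "Content-Type: jpeg" instead of
a full type/subtype pair; prefixing "image/" makes those values resolve
through MIMETYPE_MAP like any well-formed header. In isolation:

    mtype = "jpeg"                 # bare subtype from a misbehaving server
    if "/" not in mtype:
        mtype = "image/" + mtype   # -> "image/jpeg", which MIMETYPE_MAP
                                   #    resolves to the "jpg" extension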
@@ -231,6 +234,8 @@ MIMETYPE_MAP = {
"image/png": "png",
"image/gif": "gif",
"image/bmp": "bmp",
+ "image/x-bmp": "bmp",
+ "image/x-ms-bmp": "bmp",
"image/webp": "webp",
"image/svg+xml": "svg",
@@ -247,6 +252,7 @@ MIMETYPE_MAP = {
"application/zip": "zip",
"application/x-zip": "zip",
"application/x-zip-compressed": "zip",
+ "application/rar": "rar",
"application/x-rar": "rar",
"application/x-rar-compressed": "rar",
"application/x-7z-compressed": "7z",
diff --git a/gallery_dl/extractor/3dbooru.py b/gallery_dl/extractor/3dbooru.py
index ac96211..3773ee5 100644
--- a/gallery_dl/extractor/3dbooru.py
+++ b/gallery_dl/extractor/3dbooru.py
@@ -67,7 +67,7 @@ class _3dbooruPopularExtractor(booru.MoebooruPopularMixin, _3dbooruExtractor):
r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)"
r"(?:\?(?P<query>[^#]*))?")
test = ("http://behoimi.org/post/popular_by_month?month=2&year=2013", {
- "url": "8b1a5c5b7a10f8f5d3d6124d1aabfee0277078cb",
+ "pattern": r"http://behoimi\.org/data/../../[0-9a-f]{32}\.jpg",
"count": 20,
})
diff --git a/gallery_dl/extractor/8kun.py b/gallery_dl/extractor/8kun.py
new file mode 100644
index 0000000..7162920
--- /dev/null
+++ b/gallery_dl/extractor/8kun.py
@@ -0,0 +1,91 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://8kun.top/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class _8kunThreadExtractor(Extractor):
+ """Extractor for 8kun threads"""
+ category = "8kun"
+ subcategory = "thread"
+ directory_fmt = ("{category}", "{board}", "{thread} {title}")
+ filename_fmt = "{time}{num:?-//} {filename}.{extension}"
+ archive_fmt = "{board}_{thread}_{tim}"
+ pattern = r"(?:https?://)?8kun\.top/([^/]+)/res/(\d+)"
+ test = ("https://8kun.top/test/res/65248.html", {
+ "pattern": r"https://media\.8kun\.top/file_store/\w{64}\.\w+",
+ "count": ">= 8",
+ })
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.board, self.thread = match.groups()
+
+ def items(self):
+ url = "https://8kun.top/{}/res/{}.json".format(self.board, self.thread)
+ posts = self.request(url).json()["posts"]
+ title = posts[0].get("sub") or text.remove_html(posts[0]["com"])
+ process = self._process
+
+ data = {
+ "board" : self.board,
+ "thread": self.thread,
+ "title" : text.unescape(title)[:50],
+ "num" : 0,
+ }
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for post in posts:
+ if "filename" in post:
+ yield process(post, data)
+ if "extra_files" in post:
+ for post["num"], filedata in enumerate(
+ post["extra_files"], 1):
+ yield process(post, filedata)
+
+ @staticmethod
+ def _process(post, data):
+ post.update(data)
+ post["extension"] = post["ext"][1:]
+ url = "https://media.8kun.top/file_store/" + post["tim"] + post["ext"]
+ return Message.Url, url, post
+
+
+class _8kunBoardExtractor(Extractor):
+ """Extractor for 8kun boards"""
+ category = "8kun"
+ subcategory = "board"
+ pattern = r"(?:https?://)?8kun\.top/([^/?&#]+)/(?:index|\d+)\.html"
+ test = (
+ ("https://8kun.top/v/index.html", {
+ "pattern": _8kunThreadExtractor.pattern,
+ "count": ">= 100",
+ }),
+ ("https://8kun.top/v/2.html"),
+ ("https://8kun.top/v/index.html?PageSpeed=noscript"),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.board = match.group(1)
+
+ def items(self):
+ url = "https://8kun.top/{}/threads.json".format(self.board)
+ threads = self.request(url).json()
+
+ for page in threads:
+ for thread in page["threads"]:
+ url = "https://8kun.top/{}/res/{}.html".format(
+ self.board, thread["no"])
+ thread["page"] = page["page"]
+ thread["_extractor"] = _8kunThreadExtractor
+ yield Message.Queue, url, thread
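Note: both extractors work off plain JSON endpoints: /{board}/threads.json
lists every thread on a board, and /{board}/res/{thread}.json returns the
posts of one thread. A rough standalone sketch of the same file-URL
construction (field names taken from the code above; a real request may
additionally need a browser-like User-Agent):

    import requests

    posts = requests.get(
        "https://8kun.top/test/res/65248.json").json()["posts"]
    for post in posts:
        if "filename" in post:   # posts without an attachment lack this key
            print("https://media.8kun.top/file_store/"
                  + post["tim"] + post["ext"])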
diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py
index 089a0e9..dec5972 100644
--- a/gallery_dl/extractor/8muses.py
+++ b/gallery_dl/extractor/8muses.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -40,7 +40,7 @@ class _8musesAlbumExtractor(Extractor):
"parent" : 10454,
"views" : int,
"likes" : int,
- "date" : "type:datetime",
+ "date" : "dt:2018-07-10 00:00:00",
},
},
}),
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 66203fe..74c553d 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -15,9 +15,11 @@ modules = [
"3dbooru",
"4chan",
"500px",
+ "8kun",
"8muses",
"adultempire",
"artstation",
+ "bcy",
"behance",
"blogger",
"bobx",
@@ -28,6 +30,7 @@ modules = [
"exhentai",
"fallenangels",
"flickr",
+ "furaffinity",
"fuskator",
"gelbooru",
"gfycat",
@@ -36,8 +39,10 @@ modules = [
"hentaicafe",
"hentaifoundry",
"hentaifox",
+ "hentaihand",
"hentaihere",
"hentainexus",
+ "hiperdex",
"hitomi",
"hypnohub",
"idolcomplex",
@@ -49,6 +54,7 @@ modules = [
"imgur",
"instagram",
"issuu",
+ "kabeuchi",
"keenspot",
"khinsider",
"kissmanga",
@@ -110,7 +116,6 @@ modules = [
"xhamster",
"xvideos",
"yandere",
- "yaplog",
"yuki",
"foolfuuka",
"foolslide",
diff --git a/gallery_dl/extractor/bcy.py b/gallery_dl/extractor/bcy.py
new file mode 100644
index 0000000..c3049a4
--- /dev/null
+++ b/gallery_dl/extractor/bcy.py
@@ -0,0 +1,188 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://bcy.net/"""
+
+from .common import Extractor, Message
+from .. import text
+import json
+import re
+
+
+class BcyExtractor(Extractor):
+ """Base class for bcy extractors"""
+ category = "bcy"
+ directory_fmt = ("{category}", "{user[id]} {user[name]}")
+ filename_fmt = "{post[id]} {id}.{extension}"
+ archive_fmt = "{post[id]}_{id}"
+ root = "https://bcy.net"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.item_id = match.group(1)
+
+ def items(self):
+ sub = re.compile(r"^https?://p\d+-bcy\.byteimg\.com/img/banciyuan").sub
+ iroot = "https://img-bcy-qn.pstatp.com"
+ noop = self.config("noop")
+
+ for post in self.posts():
+ if not post["image_list"]:
+ continue
+
+ multi = None
+ tags = post.get("post_tags") or ()
+ data = {
+ "user": {
+ "id" : post["uid"],
+ "name" : post["uname"],
+ "avatar" : sub(iroot, post["avatar"].partition("~")[0]),
+ },
+ "post": {
+ "id" : text.parse_int(post["item_id"]),
+ "tags" : [t["tag_name"] for t in tags],
+ "date" : text.parse_timestamp(post["ctime"]),
+ "parody" : post["work"],
+ "content": post["plain"],
+ "likes" : post["like_count"],
+ "shares" : post["share_count"],
+ "replies": post["reply_count"],
+ },
+ }
+
+ yield Message.Directory, data
+ for data["num"], image in enumerate(post["image_list"], 1):
+ data["id"] = image["mid"]
+ data["width"] = image["w"]
+ data["height"] = image["h"]
+
+ url = image["path"].partition("~")[0]
+ text.nameext_from_url(url, data)
+
+ if data["extension"]:
+ if not url.startswith(iroot):
+ url = sub(iroot, url)
+ data["filter"] = ""
+ yield Message.Url, url, data
+
+ else:
+ if not multi:
+ if len(post["multi"]) < len(post["image_list"]):
+ multi = self._data_from_post(post["item_id"])
+ multi = multi["post_data"]["multi"]
+ else:
+ multi = post["multi"]
+ image = multi[data["num"] - 1]
+
+ if image["origin"]:
+ data["filter"] = "watermark"
+ yield Message.Url, image["origin"], data
+
+ if noop:
+ data["extension"] = ""
+ data["filter"] = "noop"
+ yield Message.Url, image["original_path"], data
+
+ def posts(self):
+ """Returns an iterable with all relevant 'post' objects"""
+
+ def _data_from_post(self, post_id):
+ url = "{}/item/detail/{}".format(self.root, post_id)
+ page = self.request(url).text
+ return json.loads(
+ text.extract(page, 'JSON.parse("', '");')[0]
+ .replace('\\\\u002F', '/')
+ .replace('\\"', '"')
+ )["detail"]
+
+
+class BcyUserExtractor(BcyExtractor):
+ """Extractor for user timelines"""
+ subcategory = "user"
+ pattern = r"(?:https?://)?bcy\.net/u/(\d+)"
+ test = (
+ ("https://bcy.net/u/1933712", {
+ "pattern": r"https://img-bcy-qn.pstatp.com/\w+/\d+/post/\w+/.+jpg",
+ "count": ">= 25",
+ }),
+ ("https://bcy.net/u/109282764041", {
+ "pattern": r"https://p\d-bcy.byteimg.com/img/banciyuan/[0-9a-f]+"
+ r"~tplv-banciyuan-logo-v3:.+\.image",
+ "range": "1-25",
+ "count": 25,
+ }),
+ )
+
+ def posts(self):
+ url = self.root + "/apiv3/user/selfPosts"
+ params = {"uid": self.item_id, "since": None}
+
+ while True:
+ data = self.request(url, params=params).json()
+
+ item = None
+ for item in data["data"]["items"]:
+ yield item["item_detail"]
+
+ if not item:
+ return
+ params["since"] = item["since"]
+
+
+class BcyPostExtractor(BcyExtractor):
+ """Extractor for individual posts"""
+ subcategory = "post"
+ pattern = r"(?:https?://)?bcy\.net/item/detail/(\d+)"
+ test = (
+ ("https://bcy.net/item/detail/6355835481002893070", {
+ "url": "301202375e61fd6e0e2e35de6c3ac9f74885dec3",
+ "count": 1,
+ "keyword": {
+ "user": {
+ "id" : 1933712,
+ "name" : "wukloo",
+ "avatar" : "re:https://img-bcy-qn.pstatp.com/Public/",
+ },
+ "post": {
+ "id" : 6355835481002893070,
+ "tags" : list,
+ "date" : "dt:2016-11-22 08:47:46",
+ "parody" : "东方PROJECT",
+ "content": "re:根据微博的建议稍微做了点修改",
+ "likes" : int,
+ "shares" : int,
+ "replies": int,
+ },
+ "id": 8330182,
+ "num": 1,
+ "width" : 3000,
+ "height": 1687,
+ "filename": "712e0780b09011e696f973c3d1568337",
+ "extension": "jpg",
+ },
+ }),
+ # only watermarked images available
+ ("https://bcy.net/item/detail/6780546160802143236", {
+ "pattern": r"https://p\d-bcy.byteimg.com/img/banciyuan/[0-9a-f]+"
+ r"~tplv-banciyuan-logo-v3:.+\.image",
+ "count": 8,
+ "keyword": {"filter": "watermark"}
+ }),
+ # only visible to logged in users
+ ("https://bcy.net/item/detail/6747523535150783495", {
+ "count": 0,
+ }),
+ )
+
+ def posts(self):
+ data = self._data_from_post(self.item_id)
+ post = data["post_data"]
+ post["image_list"] = post["multi"]
+ post["plain"] = text.parse_unicode_escapes(post["plain"])
+ post.update(data["detail_user"])
+ return (post,)
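Note: bcy.net embeds post data as a JSON.parse("...") argument, with quotes
and slashes escaped for the surrounding JavaScript string literal;
_data_from_post() reverses just enough of that escaping to get valid JSON.
The same steps on a made-up page snippet:

    import json
    from gallery_dl import text

    page = 'JSON.parse("{\\"detail\\": {\\"post_data\\": 1}}");'
    data = json.loads(
        text.extract(page, 'JSON.parse("', '");')[0]
        .replace('\\\\u002F', '/')   # pages escape '/' as '\\u002F'
        .replace('\\"', '"')
    )["detail"]
    # data == {"post_data": 1}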
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index 31bbaf8..2657b5d 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,6 +10,7 @@
from .common import Extractor, Message
from .. import text
+import json
import re
BASE_PATTERN = (
@@ -28,6 +29,7 @@ class BloggerExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
+ self.videos = self.config("videos", True)
self.blog = match.group(1) or match.group(2)
self.api = BloggerAPI(self)
@@ -41,24 +43,41 @@ class BloggerExtractor(Extractor):
del blog["selfLink"]
sub = re.compile(r"/s\d+/").sub
- findall = re.compile(
- r'src="(https?://\d+\.bp\.blogspot\.com/[^"]+)"').findall
+ findall_image = re.compile(
+ r'src="(https?://\d+\.bp\.blogspot\.com/[^"]+)').findall
+ findall_video = re.compile(
+ r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall
for post in self.posts(blog):
- images = findall(post["content"])
- if not images:
+ content = post["content"]
+
+ files = findall_image(content)
+ for idx, url in enumerate(files):
+ files[idx] = sub("/s0/", url).replace("http:", "https:", 1)
+
+ if self.videos and 'id="BLOG_video-' in content:
+ page = self.request(post["url"]).text
+ for url in findall_video(page):
+ page = self.request(url).text
+ video_config = json.loads(text.extract(
+ page, 'var VIDEO_CONFIG =', '\n')[0])
+ files.append(max(
+ video_config["streams"],
+ key=lambda x: x["format_id"],
+ )["play_url"])
+
+ if not files:
continue
post["author"] = post["author"]["displayName"]
post["replies"] = post["replies"]["totalItems"]
- post["content"] = text.remove_html(post["content"])
+ post["content"] = text.remove_html(content)
post["date"] = text.parse_datetime(post["published"])
del post["selfLink"]
del post["blog"]
yield Message.Directory, {"blog": blog, "post": post}
- for num, url in enumerate(images, 1):
- url = sub("/s0/", url).replace("http:", "https:", 1)
+ for num, url in enumerate(files, 1):
yield Message.Url, url, text.nameext_from_url(url, {
"blog": blog,
"post": post,
@@ -80,7 +99,7 @@ class BloggerPostExtractor(BloggerExtractor):
"pattern": r"https://3.bp.blogspot.com/.*/s0/Icy-Moonrise-.*.jpg",
"keyword": {
"blog": {
- "date" : "type:datetime",
+ "date" : "dt:2010-11-21 18:19:42",
"description": "",
"id" : "5623928067739466034",
"kind" : "blogger#blog",
@@ -95,7 +114,7 @@ class BloggerPostExtractor(BloggerExtractor):
"post": {
"author" : "Julian Bunker",
"content" : str,
- "date" : "type:datetime",
+ "date" : "dt:2010-12-26 01:08:00",
"etag" : str,
"id" : "6955139236418998998",
"kind" : "blogger#post",
@@ -112,6 +131,11 @@ class BloggerPostExtractor(BloggerExtractor):
("blogger:http://www.julianbunker.com/2010/12/moon-rise.html", {
"url": "9928429fb62f712eb4de80f53625eccecc614aae",
}),
+ # video (#587)
+ (("http://cfnmscenesinmovies.blogspot.com/2011/11/"
+ "cfnm-scene-jenna-fischer-in-office.html"), {
+ "pattern": r"https://.+\.googlevideo\.com/videoplayback",
+ }),
)
def __init__(self, match):
@@ -171,8 +195,8 @@ class BloggerAPI():
def _pagination(self, endpoint, params):
while True:
data = self._call(endpoint, params)
- yield from data["items"]
-
+ if "items" in data:
+ yield from data["items"]
if "nextPageToken" not in data:
return
params["pageToken"] = data["nextPageToken"]
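Note: for video posts the extractor now loads the embedded player page, parses
its VIDEO_CONFIG JSON, and keeps the stream with the highest format_id. The
stream objects below are invented, but the selection logic is the one used
above:

    streams = [
        {"format_id": 18, "play_url": "https://r1.googlevideo.com/sd"},
        {"format_id": 22, "play_url": "https://r1.googlevideo.com/hd"},
    ]
    best = max(streams, key=lambda x: x["format_id"])["play_url"]
    # best == "https://r1.googlevideo.com/hd"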
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index ac45e0b..162e9cc 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -27,7 +27,6 @@ class BooruExtractor(SharedConfigMixin, Extractor):
page_start = 1
page_limit = None
sort = False
- ugoira = True
def __init__(self, match):
super().__init__(match)
@@ -52,11 +51,7 @@ class BooruExtractor(SharedConfigMixin, Extractor):
for image in images:
try:
- if "pixiv_ugoira_frame_data" in image and \
- "large_file_url" in image and not self.ugoira:
- url = image["large_file_url"]
- else:
- url = image["file_url"]
+ url = image["file_url"]
except KeyError:
continue
if url.startswith("/"):
@@ -112,12 +107,6 @@ class XmlParserMixin():
return [post.attrib for post in root]
-class DanbooruPageMixin():
- """Pagination for Danbooru v2"""
- def update_page(self, data):
- self.params["page"] = "b{}".format(data["id"])
-
-
class MoebooruPageMixin():
"""Pagination for Moebooru and Danbooru v1"""
def update_page(self, data):
@@ -214,8 +203,8 @@ class PostMixin():
self.params["tags"] = "id:" + self.post
-class PopularMixin():
- """Extraction and metadata handling for Danbooru v2"""
+class MoebooruPopularMixin():
+ """Extraction and metadata handling for Moebooru and Danbooru v1"""
subcategory = "popular"
directory_fmt = ("{category}", "popular", "{scale}", "{date}")
archive_fmt = "P_{scale[0]}_{date}_{id}"
@@ -225,37 +214,20 @@ class PopularMixin():
def __init__(self, match):
super().__init__(match)
self.params.update(text.parse_query(match.group("query")))
+ self.scale = match.group("scale")
def get_metadata(self, fmt="%Y-%m-%d"):
- date = self.get_date() or datetime.datetime.utcnow().strftime(fmt)
+ date = self.get_date() or datetime.date.today().isoformat()
scale = self.get_scale() or "day"
if scale == "week":
- dt = datetime.datetime.strptime(date, fmt)
- dt -= datetime.timedelta(days=dt.weekday())
- date = dt.strftime(fmt)
+ date = datetime.date.fromisoformat(date)
+ date = (date - datetime.timedelta(days=date.weekday())).isoformat()
elif scale == "month":
date = date[:-3]
return {"date": date, "scale": scale}
- def get_scale(self):
- if "scale" in self.params:
- return self.params["scale"]
- return None
-
- def get_date(self):
- if "date" in self.params:
- return self.params["date"][:10]
- return None
-
-
-class MoebooruPopularMixin(PopularMixin):
- """Extraction and metadata handling for Moebooru and Danbooru v1"""
- def __init__(self, match):
- super().__init__(match)
- self.scale = match.group("scale")
-
def get_date(self):
if "year" in self.params:
return "{:>04}-{:>02}-{:>02}".format(
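Note: for the "week" scale the date is snapped back to the Monday of its week,
so every day within one week maps to the same directory name (note that
datetime.date.fromisoformat() requires Python 3.7):

    import datetime

    date = datetime.date.fromisoformat("2013-06-06")   # a Thursday
    date -= datetime.timedelta(days=date.weekday())    # weekday(): Monday == 0
    print(date.isoformat())                            # 2013-06-03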
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 55b15d4..19ee182 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -16,7 +16,6 @@ import logging
import datetime
import requests
import threading
-import http.cookiejar
from .message import Message
from .. import config, text, util, exception, cloudflare
@@ -40,6 +39,7 @@ class Extractor():
self._cookiefile = None
self._cookiejar = self.session.cookies
+ self._parentdir = ""
self._retries = self.config("retries", 4)
self._timeout = self.config("timeout", 30)
self._verify = self.config("verify", True)
@@ -197,13 +197,13 @@ class Extractor():
self._update_cookies_dict(cookies, self.cookiedomain)
elif isinstance(cookies, str):
cookiefile = util.expand_path(cookies)
- cookiejar = http.cookiejar.MozillaCookieJar()
try:
- cookiejar.load(cookiefile)
- except OSError as exc:
+ with open(cookiefile) as fp:
+ cookies = util.load_cookiestxt(fp)
+ except Exception as exc:
self.log.warning("cookies: %s", exc)
else:
- self._cookiejar.update(cookiejar)
+ self._update_cookies(cookies)
self._cookiefile = cookiefile
else:
self.log.warning(
@@ -218,11 +218,9 @@ class Extractor():
def _store_cookies(self):
"""Store the session's cookiejar in a cookies.txt file"""
if self._cookiefile and self.config("cookies-update", True):
- cookiejar = http.cookiejar.MozillaCookieJar()
- for cookie in self._cookiejar:
- cookiejar.set_cookie(cookie)
try:
- cookiejar.save(self._cookiefile)
+ with open(self._cookiefile, "w") as fp:
+ util.save_cookiestxt(fp, self._cookiejar)
except OSError as exc:
self.log.warning("cookies: %s", exc)
@@ -248,15 +246,22 @@ class Extractor():
def _check_cookies(self, cookienames, *, domain=None):
"""Check if all 'cookienames' are in the session's cookiejar"""
+ if not self._cookiejar:
+ return False
+
if domain is None:
domain = self.cookiedomain
-
names = set(cookienames)
+ now = time.time()
+
for cookie in self._cookiejar:
- if cookie.domain == domain:
- names.discard(cookie.name)
- if not names:
- return True
+ if cookie.name in names and cookie.domain == domain:
+ if cookie.expires and cookie.expires < now:
+ self.log.warning("Cookie '%s' has expired", cookie.name)
+ else:
+ names.discard(cookie.name)
+ if not names:
+ return True
return False
def _get_date_min_max(self, dmin=None, dmax=None):
@@ -491,12 +496,6 @@ def generate_extractors(extractor_data, symtable, classes):
symtable[Extr.__name__] = prev = Extr
-# Reduce strictness of the expected magic string in cookiejar files.
-# (This allows the use of Wget-generated cookiejars without modification)
-http.cookiejar.MozillaCookieJar.magic_re = re.compile(
- "#( Netscape)? HTTP Cookie File", re.IGNORECASE)
-
-
# Undo automatic pyOpenSSL injection by requests
pyopenssl = config.get((), "pyopenssl", False)
if not pyopenssl:
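Note: reading and writing cookies.txt now goes through util.load_cookiestxt()
and util.save_cookiestxt(), presumably added in the util.py part of this
change (not shown here). That also makes the magic_re relaxation above
unnecessary: a lenient hand-rolled parser does not insist on the exact
Netscape header line the way MozillaCookieJar does. Usage mirrors the calls in
_update_cookies()/_store_cookies():

    from gallery_dl import util

    # cookies.txt stores one cookie per line as seven tab-separated fields:
    # domain  include-subdomains  path  secure  expires  name  value
    with open("cookies.txt") as fp:
        cookies = util.load_cookiestxt(fp)
    with open("cookies.txt", "w") as fp:
        util.save_cookiestxt(fp, cookies)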
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index e8d3abf..3fdeaf9 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -1,69 +1,154 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2019 Mike Fährmann
+# Copyright 2014-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://danbooru.donmai.us/"""
+"""Extractors for https://danbooru.donmai.us/"""
-from . import booru
+from .common import Extractor, Message, SharedConfigMixin
+from .. import text
+import datetime
BASE_PATTERN = (
r"(?:https?://)?"
- r"(?P<subdomain>danbooru|hijiribe|sonohara|safebooru)"
- r"\.donmai\.us")
+ r"(danbooru|hijiribe|sonohara|safebooru)"
+ r"\.donmai\.us"
+)
-class DanbooruExtractor(booru.DanbooruPageMixin, booru.BooruExtractor):
+class DanbooruExtractor(SharedConfigMixin, Extractor):
"""Base class for danbooru extractors"""
+ basecategory = "booru"
category = "danbooru"
+ filename_fmt = "{category}_{id}_{md5}.{extension}"
page_limit = 1000
+ page_start = None
+ per_page = 100
def __init__(self, match):
- super().__init__(match)
- self.subdomain = match.group("subdomain")
- self.scheme = "https" if self.subdomain == "danbooru" else "http"
- self.api_url = "{scheme}://{subdomain}.donmai.us/posts.json".format(
- scheme=self.scheme, subdomain=self.subdomain)
+ Extractor.__init__(self, match)
+ self.root = "https://{}.donmai.us".format(match.group(1))
self.ugoira = self.config("ugoira", True)
+ self.params = {}
username, api_key = self._get_auth_info()
if username:
self.log.debug("Using HTTP Basic Auth for user '%s'", username)
self.session.auth = (username, api_key)
-
-class DanbooruTagExtractor(booru.TagMixin, DanbooruExtractor):
- """Extractor for images from danbooru based on search-tags"""
- pattern = BASE_PATTERN + r"/posts\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)"
+ def skip(self, num):
+ pages = num // self.per_page
+ if pages >= self.page_limit:
+ pages = self.page_limit - 1
+ self.page_start = pages + 1
+ return pages * self.per_page
+
+ def items(self):
+ data = self.metadata()
+ for post in self.posts():
+ try:
+ url = post["file_url"]
+ except KeyError:
+ continue
+
+ text.nameext_from_url(url, post)
+ if post["extension"] == "zip":
+ if self.ugoira:
+ post["frames"] = self.request(
+ "{}/posts/{}.json?only=pixiv_ugoira_frame_data".format(
+ self.root, post["id"])
+ ).json()["pixiv_ugoira_frame_data"]["data"]
+ else:
+ url = post["large_file_url"]
+ post["extension"] = "webm"
+
+ post.update(data)
+ yield Message.Directory, post
+ yield Message.Url, url, post
+
+ def metadata(self):
+ return {}
+
+ def posts(self):
+ return self._pagination(self.root + "/posts.json")
+
+ def _pagination(self, url, pagenum=False):
+ params = self.params.copy()
+ params["limit"] = self.per_page
+ params["page"] = self.page_start
+
+ while True:
+ posts = self.request(url, params=params).json()
+ yield from posts
+
+ if len(posts) < self.per_page:
+ return
+
+ if pagenum:
+ params["page"] += 1
+ else:
+ params["page"] = "b{}".format(posts[-1]["id"])
+
+
+class DanbooruTagExtractor(DanbooruExtractor):
+ """Extractor for danbooru posts from tag searches"""
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "t_{search_tags}_{id}"
+ pattern = BASE_PATTERN + r"/posts\?(?:[^&#]*&)*tags=([^&#]+)"
test = (
("https://danbooru.donmai.us/posts?tags=bonocho", {
"content": "b196fb9f1668109d7774a0a82efea3ffdda07746",
}),
# test page transitions
- ("https://danbooru.donmai.us/posts?tags=canvas_%28cocktail_soft%29", {
- "count": ">= 50",
+ ("https://danbooru.donmai.us/posts?tags=mushishi", {
+ "count": ">= 300",
}),
("https://hijiribe.donmai.us/posts?tags=bonocho"),
("https://sonohara.donmai.us/posts?tags=bonocho"),
("https://safebooru.donmai.us/posts?tags=bonocho"),
)
+ def __init__(self, match):
+ DanbooruExtractor.__init__(self, match)
+ self.params["tags"] = text.unquote(match.group(2).replace("+", " "))
+
+ def metadata(self):
+ return {"search_tags": self.params["tags"]}
+
-class DanbooruPoolExtractor(booru.PoolMixin, DanbooruExtractor):
- """Extractor for image-pools from danbooru"""
- pattern = BASE_PATTERN + r"/pools/(?P<pool>\d+)"
+class DanbooruPoolExtractor(DanbooruExtractor):
+ """Extractor for posts from danbooru pools"""
+ subcategory = "pool"
+ directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name]}")
+ archive_fmt = "p_{pool[id]}_{id}"
+ pattern = BASE_PATTERN + r"/pools/(\d+)"
test = ("https://danbooru.donmai.us/pools/7659", {
"content": "b16bab12bea5f7ea9e0a836bf8045f280e113d99",
})
-
-class DanbooruPostExtractor(booru.PostMixin, DanbooruExtractor):
- """Extractor for single images from danbooru"""
- pattern = BASE_PATTERN + r"/posts/(?P<post>\d+)"
+ def __init__(self, match):
+ DanbooruExtractor.__init__(self, match)
+ self.pool_id = match.group(2)
+ self.params["tags"] = "pool:" + self.pool_id
+
+ def metadata(self):
+ url = "{}/pools/{}.json".format(self.root, self.pool_id)
+ pool = self.request(url).json()
+ pool["name"] = pool["name"].replace("_", " ")
+ del pool["post_ids"]
+ return {"pool": pool}
+
+
+class DanbooruPostExtractor(DanbooruExtractor):
+ """Extractor for single danbooru posts"""
+ subcategory = "post"
+ archive_fmt = "{id}"
+ pattern = BASE_PATTERN + r"/posts/(\d+)"
test = (
("https://danbooru.donmai.us/posts/294929", {
"content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
@@ -74,20 +159,47 @@ class DanbooruPostExtractor(booru.PostMixin, DanbooruExtractor):
})
)
+ def __init__(self, match):
+ DanbooruExtractor.__init__(self, match)
+ self.post_id = match.group(2)
-class DanbooruPopularExtractor(booru.PopularMixin, DanbooruExtractor):
+ def posts(self):
+ url = "{}/posts/{}.json".format(self.root, self.post_id)
+ return (self.request(url).json(),)
+
+
+class DanbooruPopularExtractor(DanbooruExtractor):
"""Extractor for popular images from danbooru"""
- pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?(?P<query>[^#]*))?"
+ subcategory = "popular"
+ directory_fmt = ("{category}", "popular", "{scale}", "{date}")
+ archive_fmt = "P_{scale[0]}_{date}_{id}"
+ pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?([^#]*))?"
test = (
("https://danbooru.donmai.us/explore/posts/popular"),
(("https://danbooru.donmai.us/explore/posts/popular"
- "?date=2013-06-06+03%3A34%3A22+-0400&scale=week"), {
- "count": ">= 1",
+ "?date=2013-06-06&scale=week"), {
+ "range": "1-120",
+ "count": 120,
}),
)
def __init__(self, match):
- super().__init__(match)
- urlfmt = "{scheme}://{subdomain}.donmai.us/explore/posts/popular.json"
- self.api_url = urlfmt.format(
- scheme=self.scheme, subdomain=self.subdomain)
+ DanbooruExtractor.__init__(self, match)
+ self.params.update(text.parse_query(match.group(2)))
+
+ def metadata(self):
+ self.page_start = self.page_start or 1
+ scale = self.params.get("scale", "day")
+ date = self.params.get("date") or datetime.date.today().isoformat()
+
+ if scale == "week":
+ date = datetime.date.fromisoformat(date)
+ date = (date - datetime.timedelta(days=date.weekday())).isoformat()
+ elif scale == "month":
+ date = date[:-3]
+
+ return {"date": date, "scale": scale}
+
+ def posts(self):
+ url = self.root + "/explore/posts/popular.json"
+ return self._pagination(url, True)
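Note: _pagination() supports both schemes the API offers: numeric pages for
the popular endpoint and keyset paging everywhere else, where "page=b<id>"
asks for posts with ids below <id> and therefore stays fast on arbitrarily
deep result sets. Generic sketch of the keyset variant:

    def walk(fetch, per_page=100):
        """Iterate a Danbooru-style listing via 'page=b<id>' paging"""
        page = None
        while True:
            posts = fetch(limit=per_page, page=page)
            yield from posts
            if len(posts) < per_page:
                return
            page = "b{}".format(posts[-1]["id"])  # continue below the last id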
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 02a14e3..90b27d1 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -93,9 +93,11 @@ class DeviantartExtractor(Extractor):
if content["src"].startswith("https://images-wixmp-"):
if deviation["index"] <= 790677560:
# https://github.com/r888888888/danbooru/issues/4069
- content["src"] = re.sub(
+ intermediary, count = re.subn(
r"(/f/[^/]+/[^/]+)/v\d+/.*",
r"/intermediary\1", content["src"])
+ if count and self._check_url(intermediary):
+ content["src"] = intermediary
if self.quality:
content["src"] = re.sub(
r"q_\d+", self.quality, content["src"])
@@ -261,6 +263,9 @@ class DeviantartExtractor(Extractor):
if mtype and mtype.startswith("image/"):
content.update(data)
+ def _check_url(self, url):
+ return self.request(url, method="HEAD", fatal=False).status_code < 400
+
class DeviantartUserExtractor(DeviantartExtractor):
"""Extractor for an artist's user profile"""
@@ -717,7 +722,7 @@ class DeviantartExtractorV2(DeviantartExtractor):
# select largest video
target = max(media["types"],
key=lambda x: text.parse_int(x.get("q", "")[:-1]))
- src = target["s"]
+ src = target["b"]
elif target["t"] == "flash":
src = target["s"]
@@ -737,8 +742,10 @@ class DeviantartExtractorV2(DeviantartExtractor):
if src.startswith("https://images-wixmp-"):
if deviation["index"] <= 790677560:
# https://github.com/r888888888/danbooru/issues/4069
- src = re.sub(
+ intermediary, count = re.subn(
r"(/f/[^/]+/[^/]+)/v\d+/.*", r"/intermediary\1", src)
+ if count and self._check_url(intermediary):
+ src = intermediary
if self.quality:
src = re.sub(r"q_\d+", self.quality, src)
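Note: re.subn() replaces re.sub() in both code paths so the extractor can tell
whether an /intermediary/ candidate was actually produced, and _check_url()
then probes the candidate with a cheap HEAD request, falling back to the
original URL when the server rejects it. The rewrite itself:

    import re

    src = ("https://images-wixmp-0123456789abcdef.wixmp.com"
           "/f/abcd/example.jpg/v1/fill/w_300,h_200,q_70/example.jpg")
    intermediary, count = re.subn(
        r"(/f/[^/]+/[^/]+)/v\d+/.*", r"/intermediary\1", src)
    # count == 1; intermediary ends with "/intermediary/f/abcd/example.jpg"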
@@ -811,15 +818,17 @@ class DeviantartDeviationExtractor(DeviantartExtractorV2):
}),
# video
("https://www.deviantart.com/chi-u/art/-VIDEO-Brushes-330774593", {
- "url": "3b6e6e761d2d393fa61a4dc3ed6e7db51b14d07b",
+ "pattern": r"https://wixmp-.+wixmp.com/v/mp4/.+\.720p\.\w+.mp4",
"keyword": {
"filename": r"re:_video____brushes_\w+_by_chi_u-d5gxnb5",
"extension": "mp4",
"target": {
"d": 306,
- "f": 9963639,
- "q": "1080p",
+ "f": 19367585,
+ "h": 720,
+ "q": "720p",
"t": "video",
+ "w": 1364,
"src": str,
},
}
@@ -952,11 +961,15 @@ class DeviantartAPI():
self.folders = extractor.config("folders", False)
self.metadata = extractor.extra or extractor.config("metadata", False)
- self.refresh_token = extractor.config("refresh-token")
- self.client_id = extractor.config("client-id", self.CLIENT_ID)
+ self.client_id = extractor.config(
+ "client-id", self.CLIENT_ID)
self.client_secret = extractor.config(
"client-secret", self.CLIENT_SECRET)
+ self.refresh_token = extractor.config("refresh-token")
+ if self.refresh_token == "cache":
+ self.refresh_token = "#" + str(self.client_id)
+
self.log.debug(
"Using %s API credentials (client-id %s)",
"default" if self.client_id == self.CLIENT_ID else "custom",
@@ -1026,8 +1039,12 @@ class DeviantartAPI():
"type" : kind,
"include_session": "false",
}
- return self.extractor.request(
- url, headers=headers, params=params, fatal=None).json()
+ response = self.extractor.request(
+ url, headers=headers, params=params, fatal=None)
+ if response.status_code == 404:
+ raise exception.StopExtraction(
+ "Your account must use the Eclipse interface.")
+ return response.json()
def deviation_metadata(self, deviations):
""" Fetch deviation metadata for a set of deviations"""
diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py
index f245ddf..bc3f67a 100644
--- a/gallery_dl/extractor/e621.py
+++ b/gallery_dl/extractor/e621.py
@@ -1,71 +1,193 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2019 Mike Fährmann
+# Copyright 2014-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://e621.net/"""
+"""Extractors for https://e621.net/"""
-from . import booru
+from .common import Extractor, Message, SharedConfigMixin
+from .. import text
+import datetime
+import time
-class E621Extractor(booru.MoebooruPageMixin, booru.BooruExtractor):
+BASE_PATTERN = r"(?:https?://)?e(621|926)\.net"
+
+
+class E621Extractor(SharedConfigMixin, Extractor):
"""Base class for e621 extractors"""
+ basecategory = "booru"
category = "e621"
- api_url = "https://e621.net/post/index.json"
- post_url = "https://e621.net/post/show/{}"
+ filename_fmt = "{category}_{id}_{file[md5]}.{extension}"
page_limit = 750
+ page_start = None
+ per_page = 200
+ _last_request = 0
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.root = "https://e{}.net".format(match.group(1))
+ self.params = {}
+
+ username, api_key = self._get_auth_info()
+ if username:
+ self.log.debug("Using HTTP Basic Auth for user '%s'", username)
+ self.session.auth = (username, api_key)
+
+ def request(self, url, **kwargs):
+        delay = 1.0 - (time.time() - E621Extractor._last_request)
+        if delay > 0.0:
+            self.log.debug("Sleeping for %s seconds", delay)
+            time.sleep(delay)
+ kwargs["headers"] = {"User-Agent": "gallery-dl/1.13.0 (by mikf)"}
+ response = Extractor.request(self, url, **kwargs)
+ E621Extractor._last_request = time.time()
+ return response
+
+ def items(self):
+ data = self.metadata()
+ for post in self.posts():
+ file = post["file"]
+
+ if not file["url"]:
+ ihash = file["md5"]
+ file["url"] = "https://static1.{}/data/{}/{}/{}.{}".format(
+ self.root[8:], ihash[0:2], ihash[2:4], ihash, file["ext"])
+
+ post["filename"] = file["md5"]
+ post["extension"] = file["ext"]
+ post.update(data)
+ yield Message.Directory, post
+ yield Message.Url, file["url"], post
+
+ def metadata(self):
+ return {}
+ def posts(self):
+ return self._pagination(self.root + "/posts.json")
-class E621TagExtractor(booru.TagMixin, E621Extractor):
- """Extractor for images from e621.net based on search-tags"""
- pattern = (r"(?:https?://)?(?:www\.)?e621\.net/post"
- r"(?:/index/\d+/|\?tags=)(?P<tags>[^/?&#]+)")
+ def _pagination(self, url):
+ params = self.params.copy()
+ params["limit"] = self.per_page
+ tags = params.get("tags", "")
+
+ while True:
+ posts = self.request(url, params=params).json()["posts"]
+ yield from posts
+
+ if len(posts) < self.per_page:
+ return
+ params["tags"] = "id:<{} {}".format(posts[-1]["id"], tags)
+
+
+class E621TagExtractor(E621Extractor):
+ """Extractor for e621 posts from tag searches"""
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "t_{search_tags}_{id}"
+ pattern = BASE_PATTERN + r"/posts?(?:\?.*?tags=|/index/\d+/)([^&#]+)"
test = (
- ("https://e621.net/post/index/1/anry", {
+ ("https://e621.net/posts?tags=anry", {
"url": "8021e5ea28d47c474c1ffc9bd44863c4d45700ba",
"content": "501d1e5d922da20ee8ff9806f5ed3ce3a684fd58",
}),
+ ("https://e926.net/posts?tags=anry"),
+ ("https://e621.net/post/index/1/anry"),
("https://e621.net/post?tags=anry"),
)
+ def __init__(self, match):
+ E621Extractor.__init__(self, match)
+ self.params["tags"] = text.unquote(match.group(2).replace("+", " "))
+
+ def metadata(self):
+ return {"search_tags": self.params["tags"]}
+
-class E621PoolExtractor(booru.PoolMixin, E621Extractor):
- """Extractor for image-pools from e621.net"""
- pattern = r"(?:https?://)?(?:www\.)?e621\.net/pool/show/(?P<pool>\d+)"
- test = ("https://e621.net/pool/show/73", {
- "url": "842f2fb065c7c339486a9b1d689020b8569888ed",
- "content": "c2c87b7a9150509496cddc75ccab08109922876a",
- })
-
-
-class E621PostExtractor(booru.PostMixin, E621Extractor):
- """Extractor for single images from e621.net"""
- pattern = r"(?:https?://)?(?:www\.)?e621\.net/post/show/(?P<post>\d+)"
- test = ("https://e621.net/post/show/535", {
- "url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529",
- "content": "66f46e96a893fba8e694c4e049b23c2acc9af462",
- "options": (("tags", True),),
- "keyword": {
- "tags_artist": "anry",
- "tags_general": str,
- "tags_species": str,
- },
- })
-
-
-class E621PopularExtractor(booru.MoebooruPopularMixin, E621Extractor):
- """Extractor for popular images from 621.net"""
- pattern = (r"(?:https?://)?(?:www\.)?e621\.net"
- r"/post/popular_by_(?P<scale>day|week|month)"
- r"(?:\?(?P<query>[^#]*))?")
- test = ("https://e621.net/post/popular_by_month?month=6&year=2013", {
- "count": 32,
- })
+class E621PoolExtractor(E621Extractor):
+ """Extractor for e621 pools"""
+ subcategory = "pool"
+ directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name]}")
+ archive_fmt = "p_{pool[id]}_{id}"
+ pattern = BASE_PATTERN + r"/pool(?:s|/show)/(\d+)"
+ test = (
+ ("https://e621.net/pools/73", {
+ "url": "842f2fb065c7c339486a9b1d689020b8569888ed",
+ "content": "c2c87b7a9150509496cddc75ccab08109922876a",
+ }),
+ ("https://e621.net/pool/show/73"),
+ )
def __init__(self, match):
- super().__init__(match)
- self.api_url = "https://e621.net/post/popular_by_{scale}.json".format(
- scale=self.scale)
+ E621Extractor.__init__(self, match)
+ self.pool_id = match.group(2)
+ self.params["tags"] = "pool:" + self.pool_id
+
+ def metadata(self):
+ url = "{}/pools/{}.json".format(self.root, self.pool_id)
+ pool = self.request(url).json()
+ pool["name"] = pool["name"].replace("_", " ")
+ del pool["post_ids"]
+ return {"pool": pool}
+
+
+class E621PostExtractor(E621Extractor):
+ """Extractor for single e621 posts"""
+ subcategory = "post"
+ archive_fmt = "{id}"
+ pattern = BASE_PATTERN + r"/post(?:s|/show)/(\d+)"
+ test = (
+ ("https://e621.net/posts/535", {
+ "url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529",
+ "content": "66f46e96a893fba8e694c4e049b23c2acc9af462",
+ }),
+ ("https://e621.net/post/show/535"),
+ )
+
+ def __init__(self, match):
+ E621Extractor.__init__(self, match)
+ self.post_id = match.group(2)
+
+ def posts(self):
+ url = "{}/posts/{}.json".format(self.root, self.post_id)
+ return (self.request(url).json()["post"],)
+
+
+class E621PopularExtractor(E621Extractor):
+ """Extractor for popular images from e621"""
+ subcategory = "popular"
+ directory_fmt = ("{category}", "popular", "{scale}", "{date}")
+ archive_fmt = "P_{scale[0]}_{date}_{id}"
+ pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?([^#]*))?"
+ test = (
+ ("https://e621.net/explore/posts/popular"),
+ (("https://e621.net/explore/posts/popular"
+ "?date=2019-06-01&scale=month"), {
+ "pattern": r"https://static\d.e621.net/data/../../[0-9a-f]+",
+ "count": ">= 70",
+ })
+ )
+
+ def __init__(self, match):
+ E621Extractor.__init__(self, match)
+ self.params.update(text.parse_query(match.group(2)))
+
+ def metadata(self):
+ scale = self.params.get("scale", "day")
+ date = self.params.get("date") or datetime.date.today().isoformat()
+ date = date[:10]
+
+ if scale == "week":
+ date = datetime.date.fromisoformat(date)
+ date = (date - datetime.timedelta(days=date.weekday())).isoformat()
+ elif scale == "month":
+ date = date[:-3]
+
+ return {"date": date, "scale": scale}
+
+ def posts(self):
+ url = self.root + "/explore/posts/popular.json"
+ return self._pagination(url)
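Note: when the API withholds a post's direct link (file["url"] comes back
empty), items() reconstructs it from the md5 checksum, whose first two byte
pairs double as the directory levels on the static file host:

    ihash = "e6cf16f3b866e9622d5972b79ee10455"   # example file["md5"] value
    url = "https://static1.e621.net/data/{}/{}/{}.{}".format(
        ihash[0:2], ihash[2:4], ihash, "png")
    # https://static1.e621.net/data/e6/cf/e6cf16f3b866e9622d5972b79ee10455.png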
diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py
index 967fd9c..a9d3c9d 100644
--- a/gallery_dl/extractor/flickr.py
+++ b/gallery_dl/extractor/flickr.py
@@ -61,7 +61,8 @@ class FlickrImageExtractor(FlickrExtractor):
test = (
("https://www.flickr.com/photos/departingyyz/16089302239", {
"pattern": pattern,
- "content": "0821a28ee46386e85b02b67cf2720063440a228c",
+ "content": ("3133006c6d657fe54cf7d4c46b82abbcb0efaf9f",
+ "0821a28ee46386e85b02b67cf2720063440a228c"),
"keyword": {
"comments": int,
"description": str,
diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py
new file mode 100644
index 0000000..ba60e19
--- /dev/null
+++ b/gallery_dl/extractor/furaffinity.py
@@ -0,0 +1,235 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.furaffinity.net/"""
+
+from .common import Extractor, Message
+from .. import text, util
+
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?furaffinity\.net"
+
+
+class FuraffinityExtractor(Extractor):
+ """Base class for furaffinity extractors"""
+ category = "furaffinity"
+ directory_fmt = ("{category}", "{user!l}")
+ filename_fmt = "{id} {title}.{extension}"
+ archive_fmt = "{id}"
+ cookiedomain = ".furaffinity.net"
+ root = "https://www.furaffinity.net"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user = match.group(1)
+ self.offset = 0
+
+ def items(self):
+ for post_id in util.advance(self.posts(), self.offset):
+ post = self._parse_post(post_id)
+ if post:
+ yield Message.Directory, post
+ yield Message.Url, post["url"], post
+
+ def posts(self):
+ return self._pagination()
+
+ def skip(self, num):
+ self.offset += num
+ return num
+
+ def _parse_post(self, post_id):
+ url = "{}/view/{}/".format(self.root, post_id)
+ extr = text.extract_from(self.request(url).text)
+ title, _, artist = text.unescape(extr(
+ 'property="og:title" content="', '"')).rpartition(" by ")
+ path = extr('href="//d.facdn.net/', '"')
+
+ if not path:
+ self.log.warning(
+ "Unable to download post %s (\"%s\")",
+ post_id, text.remove_html(
+ extr('System Message', '</section>') or
+ extr('System Message', '</table>')
+ )
+ )
+ return None
+
+ pi = text.parse_int
+ rh = text.remove_html
+
+ data = text.nameext_from_url(path, {
+ "id" : pi(post_id),
+ "title" : title,
+ "artist": artist,
+ "user" : self.user or artist,
+ "url" : "https://d.facdn.net/" + path
+ })
+
+ tags = extr('class="tags-row">', '</section>')
+ if tags:
+ # new site layout
+ data["tags"] = text.split_html(tags)
+ data["description"] = text.unescape(rh(extr(
+ 'class="section-body">', '</div>'), "", ""))
+ data["views"] = pi(rh(extr('class="views">', '</span>')))
+ data["favorites"] = pi(rh(extr('class="favorites">', '</span>')))
+ data["comments"] = pi(rh(extr('class="comments">', '</span>')))
+ data["rating"] = rh(extr('class="rating">', '</span>'))
+ data["fa_category"] = rh(extr('>Category</strong>', '</span>'))
+ data["theme"] = rh(extr('>', '<'))
+ data["species"] = rh(extr('>Species</strong>', '</div>'))
+ data["gender"] = rh(extr('>Gender</strong>', '</div>'))
+ data["width"] = pi(extr("<span>", "x"))
+ data["height"] = pi(extr("", "p"))
+ else:
+ # old site layout
+ data["fa_category"] = extr("<b>Category:</b>", "<").strip()
+ data["theme"] = extr("<b>Theme:</b>", "<").strip()
+ data["species"] = extr("<b>Species:</b>", "<").strip()
+ data["gender"] = extr("<b>Gender:</b>", "<").strip()
+ data["favorites"] = pi(extr("<b>Favorites:</b>", "<"))
+ data["comments"] = pi(extr("<b>Comments:</b>", "<"))
+ data["views"] = pi(extr("<b>Views:</b>", "<"))
+ data["width"] = pi(extr("<b>Resolution:</b>", "x"))
+ data["height"] = pi(extr("", "<"))
+ data["tags"] = text.split_html(extr(
+ 'id="keywords">', '</div>'))[::2]
+ data["rating"] = extr('<img alt="', ' ')
+ data["description"] = text.unescape(text.remove_html(extr(
+ "</table>", "</table>"), "", ""))
+ data["date"] = text.parse_timestamp(data["filename"].partition(".")[0])
+
+ return data
+
+ def _pagination(self):
+ num = 1
+
+ while True:
+ url = "{}/{}/{}/{}/".format(
+ self.root, self.subcategory, self.user, num)
+ page = self.request(url).text
+ post_id = None
+
+ for post_id in text.extract_iter(page, 'id="sid-', '"'):
+ yield post_id
+
+ if not post_id:
+ return
+ num += 1
+
+ def _pagination_favorites(self):
+ path = "/favorites/{}/".format(self.user)
+
+ while path:
+ page = self.request(self.root + path).text
+ yield from text.extract_iter(page, 'id="sid-', '"')
+ path = text.extract(page, 'right" href="', '"')[0]
+
+
+class FuraffinityGalleryExtractor(FuraffinityExtractor):
+ """Extractor for a furaffinity user's gallery"""
+ subcategory = "gallery"
+ pattern = BASE_PATTERN + r"/gallery/([^/?&#]+)"
+ test = ("https://www.furaffinity.net/gallery/mirlinthloth/", {
+ "pattern": r"https://d.facdn.net/art/mirlinthloth/\d+/\d+.\w+\.\w+",
+ "range": "45-50",
+ "count": 6,
+ })
+
+
+class FuraffinityScrapsExtractor(FuraffinityExtractor):
+ """Extractor for a furaffinity user's scraps"""
+ subcategory = "scraps"
+ directory_fmt = ("{category}", "{user!l}", "Scraps")
+ pattern = BASE_PATTERN + r"/scraps/([^/?&#]+)"
+ test = ("https://www.furaffinity.net/scraps/mirlinthloth/", {
+ "pattern": r"https://d.facdn.net/art/[^/]+(/stories)?/\d+/\d+.\w+.\w+",
+ "count": ">= 3",
+ })
+
+
+class FuraffinityFavoriteExtractor(FuraffinityExtractor):
+ """Extractor for a furaffinity user's favorites"""
+ subcategory = "favorite"
+ directory_fmt = ("{category}", "{user!l}", "Favorites")
+ pattern = BASE_PATTERN + r"/favorites/([^/?&#]+)"
+ test = ("https://www.furaffinity.net/favorites/mirlinthloth/", {
+ "pattern": r"https://d.facdn.net/art/[^/]+/\d+/\d+.\w+\.\w+",
+ "range": "45-50",
+ "count": 6,
+ })
+
+ def posts(self):
+ return self._pagination_favorites()
+
+
+class FuraffinityPostExtractor(FuraffinityExtractor):
+ """Extractor for individual posts on furaffinity"""
+ subcategory = "post"
+ pattern = BASE_PATTERN + r"/(?:view|full)/(\d+)"
+ test = (
+ ("https://www.furaffinity.net/view/21835115/", {
+ "url": "eae4ef93d99365c69b31a37561bd800c03d336ad",
+ "keyword": {
+ "artist" : "mirlinthloth",
+ "date" : "dt:2016-11-27 17:24:06",
+ "description": "A Song made playing the game Cosmic DJ.",
+ "extension" : "mp3",
+ "filename" : r"re:\d+\.\w+_dj_fennmink_-_bude_s_4_ever",
+ "id" : 21835115,
+ "tags" : list,
+ "title" : "Bude's 4 Ever",
+ "url" : "re:https://d.facdn.net/art/mirlinthloth/music",
+ "user" : "mirlinthloth",
+ "views" : int,
+ "favorites" : int,
+ "comments" : int,
+ "rating" : "General",
+ "fa_category": "Music",
+ "theme" : "All",
+ "species" : "Unspecified / Any",
+ "gender" : "Any",
+ "width" : 120,
+ "height" : 120,
+ },
+ }),
+ ("https://furaffinity.net/view/21835115/"),
+ ("https://sfw.furaffinity.net/view/21835115/"),
+ ("https://www.furaffinity.net/full/21835115/"),
+ )
+
+ def posts(self):
+ post_id = self.user
+ self.user = None
+ return (post_id,)
+
+
+class FuraffinityUserExtractor(FuraffinityExtractor):
+ """Extractor for furaffinity user profiles"""
+ subcategory = "user"
+ cookiedomain = None
+ pattern = BASE_PATTERN + r"/user/([^/?&#]+)"
+ test = (
+ ("https://www.furaffinity.net/user/mirlinthloth/", {
+ "pattern": r"/gallery/mirlinthloth/$",
+ }),
+ ("https://www.furaffinity.net/user/mirlinthloth/", {
+ "options": (("include", "all"),),
+ "pattern": r"/(gallery|scraps|favorites)/mirlinthloth/$",
+ "count": 3,
+ }),
+ )
+
+ def items(self):
+ base = "{}/{{}}/{}/".format(self.root, self.user)
+ return self._dispatch_extractors((
+ (FuraffinityGalleryExtractor , base.format("gallery")),
+ (FuraffinityScrapsExtractor , base.format("scraps")),
+ (FuraffinityFavoriteExtractor, base.format("favorites")),
+ ), ("gallery",))
diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py
index 19f9481..6e82091 100644
--- a/gallery_dl/extractor/hentaifoundry.py
+++ b/gallery_dl/extractor/hentaifoundry.py
@@ -247,7 +247,7 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor):
"content": "91bf01497c39254b6dfb234a18e8f01629c77fd1",
"keyword": {
"artist" : "Tenpura",
- "date" : "type:datetime",
+ "date" : "dt:2016-02-22 14:41:19",
"description": "Thank you!",
"height" : 700,
"index" : 407501,
diff --git a/gallery_dl/extractor/hentaihand.py b/gallery_dl/extractor/hentaihand.py
new file mode 100644
index 0000000..302999b
--- /dev/null
+++ b/gallery_dl/extractor/hentaihand.py
@@ -0,0 +1,134 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hentaihand.com/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text, util
+import collections
+
+
+class HentaihandGalleryExtractor(GalleryExtractor):
+ """Extractor for image galleries on hentaihand.com"""
+ category = "hentaihand"
+ root = "https://hentaihand.com"
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?hentaihand\.com"
+ r"/(?:comi|view)c/(\d+)")
+ test = (
+ ("https://hentaihand.com/comic/272772/kouda-tomohiro-chiyomi-bl", {
+ "pattern": r"https://i.hentaihand.com/.*/images/full/\d+.jpg$",
+ "count": 19,
+ "keyword": {
+ "artists" : ["kouda tomohiro"],
+ "categories": ["manga"],
+ "date" : "Feb. 6, 2020, 3:19 p.m.",
+ "gallery_id": 272772,
+ "lang" : "en",
+ "language" : "English",
+ "relationships": ["family", "step family"],
+ "tags" : list,
+ "title" : r"re:\[Kouda Tomohiro\] Chiyomi Blizzard",
+ "title_jp" : r"re:\[幸田朋弘\] ちよみブリザード",
+ },
+ }),
+ ("https://hentaihand.com/viewc/272772/kouda-tomohiro-chiyomi-bl"),
+ )
+
+ def __init__(self, match):
+ self.gallery_id = match.group(1)
+ url = "{}/comic/{}".format(self.root, self.gallery_id)
+ GalleryExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+
+ title_en = text.unescape(extr("<h1>", "<"))
+ title_jp = text.unescape(extr("<h2>", "<"))
+ tags = extr('<section id="tags"', "</section>")
+
+ data = {
+ "gallery_id" : text.parse_int(self.gallery_id),
+ "title" : title_en or title_jp,
+ "title_en" : title_en,
+ "title_jp" : title_jp,
+
+ # impossible to parse with strptime()
+ "date" : extr('datetime="', '"'),
+ }
+
+ tdict = collections.defaultdict(list)
+ for path in text.extract_iter(tags, 'href="/', '"'):
+ kind, _, name = path.partition("/")
+ tdict[kind].append(name.replace("+", " "))
+ data.update(tdict)
+
+ if "languages" in data:
+ data["language"] = data["languages"][-1].capitalize()
+ data["lang"] = util.language_to_code(data["language"])
+ del data["languages"]
+ return data
+
+ def images(self, _):
+ url = "{}/viewc/{}/1".format(self.root, self.gallery_id)
+ page = self.request(url).text
+ images = text.extract(page, "var images", ";")[0]
+ return [(img, None) for img in text.extract_iter(images, "'", "'")]
+
+
+class HentaihandTagExtractor(Extractor):
+ """Extractor for tag searches on hentaihand.com"""
+ category = "hentaihand"
+ subcategory = "tag"
+ root = "https://hentaihand.com"
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?hentaihand\.com"
+ r"(/(?:parody|characters|tags|artists|groups|languages"
+ r"|categories|relationships)/[^#]+)")
+ test = (
+ ("https://hentaihand.com/artists/tony+taka", {
+ "pattern": HentaihandGalleryExtractor.pattern,
+ "count": ">= 50",
+ }),
+ ("https://hentaihand.com/artists/tony+taka/popular?page=2"),
+ ("https://hentaihand.com/tags/full+color"),
+ ("https://hentaihand.com/languages/japanese"),
+ ("https://hentaihand.com/categories/manga"),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.path, _, query = match.group(1).partition("?")
+ self.query = text.parse_query(query)
+ self.query["page"] = text.parse_int(self.query.get("page"), 1)
+
+ def items(self):
+ yield Message.Version, 1
+ url = self.root + self.path
+ params = self.query.copy()
+ data = {"_extractor": HentaihandGalleryExtractor}
+
+ while True:
+ page = self.request(url, params=params).text
+
+ for path in text.extract_iter(page, '<a href="/comic/', '"'):
+ yield Message.Queue, self.root + "/comic/" + path, data
+
+ pos = page.find(">(current)<")
+ if pos < 0 or page.find('class="page-link" href="', pos) < 0:
+ break
+ params["page"] += 1
+
+
+class HentaihandSearchExtractor(HentaihandTagExtractor):
+ """Extractor for search results on hentaihand.com"""
+ subcategory = "search"
+ pattern = r"(?i)(?:https?://)?(?:www\.)?hentaihand\.com(/search/?[^#]+)"
+ test = ("https://hentaihand.com/search?q=color", {
+ "pattern": HentaihandGalleryExtractor.pattern,
+ "range": "1-50",
+ "count": 50,
+ })
diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py
index 193cadf..ad97eba 100644
--- a/gallery_dl/extractor/hentainexus.py
+++ b/gallery_dl/extractor/hentainexus.py
@@ -22,7 +22,7 @@ class HentainexusGalleryExtractor(GalleryExtractor):
test = (
("https://hentainexus.com/view/5688", {
"url": "746d0043e20030f1171aae5ea113176607302517",
- "keyword": "c1b7091e2bc2f733f6401711e072ad11cf93dd69",
+ "keyword": "77702b42f8f76ecfe5d8a14cfbbcbd855eb14d7f",
}),
("https://hentainexus.com/read/5688"),
)
diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py
new file mode 100644
index 0000000..e0b0f50
--- /dev/null
+++ b/gallery_dl/extractor/hiperdex.py
@@ -0,0 +1,137 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hiperdex.com/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text
+from ..cache import memcache
+import re
+
+
+class HiperdexBase():
+ """Base class for hiperdex extractors"""
+ category = "hiperdex"
+ root = "https://hiperdex.com"
+
+ @memcache(keyarg=1)
+ def manga_data(self, manga, page=None):
+ if not page:
+ url = "{}/manga/{}/".format(self.root, manga)
+ page = self.request(url).text
+ extr = text.extract_from(page)
+
+ return {
+ "manga" : text.unescape(extr(
+ "<title>", "<").rpartition("&")[0].strip()),
+ "score" : text.parse_float(extr(
+ 'id="averagerate">', '<')),
+ "author" : text.remove_html(extr(
+ 'class="author-content">', '</div>')),
+ "artist" : text.remove_html(extr(
+ 'class="artist-content">', '</div>')),
+ "genre" : text.split_html(extr(
+ 'class="genres-content">', '</div>'))[::2],
+ "type" : extr(
+ 'class="summary-content">', '<').strip(),
+ "release": text.parse_int(text.remove_html(extr(
+ 'class="summary-content">', '</div>'))),
+ "status" : extr(
+ 'class="summary-content">', '<').strip(),
+ "description": text.remove_html(text.unescape(extr(
+ 'class="description-summary">', '</div>'))),
+ "language": "English",
+ "lang" : "en",
+ }
+
+ def chapter_data(self, chapter):
+ chapter, _, minor = chapter.partition("-")
+ data = {
+ "chapter" : text.parse_int(chapter),
+ "chapter_minor": "." + minor if minor and minor != "end" else "",
+ }
+ data.update(self.manga_data(self.manga.lower()))
+ return data
+
+
+class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
+ """Extractor for manga chapters from hiperdex.com"""
+ pattern = (r"(?:https?://)?(?:www\.)?hiperdex\.com"
+ r"(/manga/([^/?&#]+)/([^/?&#]+))")
+ test = ("https://hiperdex.com/manga/domestic-na-kanojo/154-5/", {
+ "url": "111bc3ee14ce91d78c275770ef63b56c9ac15d8d",
+ "keyword": {
+ "artist" : "Sasuga Kei",
+ "author" : "Sasuga Kei",
+ "chapter": 154,
+ "chapter_minor": ".5",
+ "description": "re:Natsuo Fujii is in love with his teacher, Hina",
+ "genre" : list,
+ "manga" : "Domestic na Kanojo",
+ "release": 2014,
+ "score" : float,
+ "type" : "Manga",
+ },
+ })
+
+ def __init__(self, match):
+ path, self.manga, self.chapter = match.groups()
+ ChapterExtractor.__init__(self, match, self.root + path + "/")
+
+ def metadata(self, _):
+ return self.chapter_data(self.chapter)
+
+ def images(self, page):
+ return [
+ (url.strip(), None)
+ for url in re.findall(r'id="image-\d+"\s+src="([^"]+)', page)
+ ]
+
+
+class HiperdexMangaExtractor(HiperdexBase, MangaExtractor):
+ """Extractor for manga from hiperdex.com"""
+ chapterclass = HiperdexChapterExtractor
+ pattern = r"(?:https?://)?(?:www\.)?hiperdex\.com(/manga/([^/?&#]+))/?$"
+ test = ("https://hiperdex.com/manga/youre-not-that-special/", {
+ "count": 51,
+ "pattern": HiperdexChapterExtractor.pattern,
+ "keyword": {
+ "artist" : "Bolp",
+ "author" : "Abyo4",
+ "chapter": int,
+ "chapter_minor": "",
+ "description": "re:I didn’t think much of the creepy girl in ",
+ "genre" : list,
+ "manga" : "You're Not That Special!",
+ "release": 2019,
+ "score" : float,
+ "status" : "Completed",
+ "type" : "Manhwa",
+ },
+ })
+
+ def __init__(self, match):
+ path, self.manga = match.groups()
+ MangaExtractor.__init__(self, match, self.root + path + "/")
+
+ def chapters(self, page):
+ self.manga_data(self.manga, page)
+ results = []
+ last = None
+
+ page = text.extract(page, 'class="page-content-listing', '</ul>')[0]
+ for match in HiperdexChapterExtractor.pattern.finditer(page):
+ path = match.group(1)
+ if last != path:
+ last = path
+ results.append((
+ self.root + path,
+ self.chapter_data(path.rpartition("/")[2]),
+ ))
+
+ return results
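
HiperdexBase.manga_data() is decorated with @memcache(keyarg=1), so the
manga page is fetched and parsed at most once per manga name;
HiperdexMangaExtractor.chapters() primes that cache with the page it
already has, and every later chapter_data() call reuses the result. A
simplified stand-in for the decorator, assuming positional arguments
only (the real implementation lives in gallery_dl/cache.py):

    import functools

    def memcache(keyarg=None):
        # memoize results per key; 'keyarg' selects which positional
        # argument acts as the cache key
        def wrap(func):
            memo = {}
            @functools.wraps(func)
            def wrapper(*args):
                key = args[keyarg] if keyarg is not None else args
                if key not in memo:
                    memo[key] = func(*args)
                return memo[key]
            return wrapper
        return wrap
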
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index d6fdcf2..3baf819 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -6,7 +6,7 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://hitomi.la/"""
+"""Extractors for https://hitomi.la/"""
from .common import GalleryExtractor
from .. import text, util
@@ -27,21 +27,25 @@ class HitomiGalleryExtractor(GalleryExtractor):
"keyword": "6701f8f588f119ef84cd29bdf99a399417b0a6a2",
"count": 16,
}),
+ # download test
("https://hitomi.la/galleries/1401410.html", {
- # download test
"range": "1",
"content": "b3ca8c6c8cc5826cf8b4ceb7252943abad7b8b4c",
}),
+ # Game CG with scenes (#321)
("https://hitomi.la/galleries/733697.html", {
- # Game CG with scenes (#321)
- "url": "21064f9e3c244aca87f1a91967a3fbe79032c4ce",
+ "url": "b4cbc76032852db4a655bf6a2c4d58eae8153c8e",
"count": 210,
}),
+ # fallback for galleries only available through /reader/ URLs
("https://hitomi.la/galleries/1045954.html", {
- # fallback for galleries only available through /reader/ URLs
- "url": "0a67f5e6c3c6a384b578e328f4817fa6ccdf856a",
+ "url": "f3aa914ad148437f72d307268fa0d250eabe8dab",
"count": 1413,
}),
+ # gallery with "broken" redirect
+ ("https://hitomi.la/cg/scathacha-sama-okuchi-ecchi-1291900.html", {
+ "count": 10,
+ }),
("https://hitomi.la/manga/amazon-no-hiyaku-867789.html"),
("https://hitomi.la/manga/867789.html"),
("https://hitomi.la/doujinshi/867789.html"),
@@ -51,84 +55,90 @@ class HitomiGalleryExtractor(GalleryExtractor):
)
def __init__(self, match):
- self.gallery_id = match.group(1)
- self.fallback = False
- url = "{}/galleries/{}.html".format(self.root, self.gallery_id)
+ gid = match.group(1)
+ url = "https://ltn.hitomi.la/galleries/{}.js".format(gid)
GalleryExtractor.__init__(self, match, url)
+ self.info = None
+ self.session.headers["Referer"] = "{}/reader/{}.html".format(
+ self.root, gid)
+
+ def metadata(self, page):
+ self.info = info = json.loads(page.partition("=")[2])
+
+ data = self._data_from_gallery_info(info)
+ if self.config("metadata", True):
+ data.update(self._data_from_gallery_page(info))
+ return data
+
+ def _data_from_gallery_info(self, info):
+ language = info.get("language")
+ if language:
+ language = language.capitalize()
+
+ tags = []
+ for tinfo in info["tags"]:
+ tag = tinfo["tag"]
+ if tinfo.get("female"):
+ tag += " ♀"
+ elif tinfo.get("male"):
+ tag += " ♂"
+ tags.append(string.capwords(tag))
+
+ return {
+ "gallery_id": text.parse_int(info["id"]),
+ "title" : info["title"],
+ "type" : info["type"].capitalize(),
+ "language" : language,
+ "lang" : util.language_to_code(language),
+ "tags" : tags,
+ "date" : text.parse_datetime(
+ info["date"] + ":00", "%Y-%m-%d %H:%M:%S%z"),
+ }
+
+ def _data_from_gallery_page(self, info):
+ url = "{}/galleries/{}.html".format(self.root, info["id"])
- def request(self, url, **kwargs):
- response = GalleryExtractor.request(self, url, fatal=False, **kwargs)
- if response.status_code == 404:
- self.fallback = True
- url = url.replace("/galleries/", "/reader/")
- response = GalleryExtractor.request(self, url, **kwargs)
- elif b"<title>Redirect</title>" in response.content:
+ # follow redirects
+ while True:
+ response = self.request(url, fatal=False)
+ if b"<title>Redirect</title>" not in response.content:
+ break
url = text.extract(response.text, "href='", "'")[0]
if not url.startswith("http"):
url = text.urljoin(self.root, url)
- response = self.request(url, **kwargs)
- return response
- def metadata(self, page):
- if self.fallback:
- return {
- "gallery_id": text.parse_int(self.gallery_id),
- "title": text.unescape(text.extract(
- page, "<title>", "<")[0].rpartition(" | ")[0]),
- }
-
- extr = text.extract_from(page, page.index('<h1><a href="/reader/'))
- data = {
- "gallery_id": text.parse_int(self.gallery_id),
- "title" : text.unescape(extr('.html">', '<').strip()),
- "artist" : self._prep(extr('<h2>', '</h2>')),
- "group" : self._prep(extr('<td>Group</td><td>', '</td>')),
- "type" : self._prep_1(extr('<td>Type</td><td>', '</td>')),
- "language" : self._prep_1(extr('<td>Language</td><td>', '</td>')),
- "parody" : self._prep(extr('<td>Series</td><td>', '</td>')),
- "characters": self._prep(extr('<td>Characters</td><td>', '</td>')),
- "tags" : self._prep(extr('<td>Tags</td><td>', '</td>')),
- "date" : self._date(extr('<span class="date">', '</span>')),
+ if response.status_code >= 400:
+ return {}
+
+ def prep(value):
+ return [
+ text.unescape(string.capwords(v))
+ for v in text.extract_iter(value or "", '.html">', '<')
+ ]
+
+ extr = text.extract_from(response.text)
+ return {
+ "artist" : prep(extr('<h2>', '</h2>')),
+ "group" : prep(extr('<td>Group</td><td>', '</td>')),
+ "parody" : prep(extr('<td>Series</td><td>', '</td>')),
+ "characters": prep(extr('<td>Characters</td><td>', '</td>')),
}
- if data["language"] == "N/a":
- data["language"] = None
- data["lang"] = util.language_to_code(data["language"])
- return data
-
- def images(self, page):
- # set Referer header before image downloads (#239)
- self.session.headers["Referer"] = self.gallery_url
-
- # get 'galleryinfo'
- url = "https://ltn.hitomi.la/galleries/{}.js".format(self.gallery_id)
- page = self.request(url).text
+ def images(self, _):
result = []
- for image in json.loads(page.partition("=")[2]):
+ for image in self.info["files"]:
ihash = image["hash"]
idata = text.nameext_from_url(image["name"])
# see https://ltn.hitomi.la/common.js
- offset = int(ihash[-3:-1], 16) % 3
+ inum = int(ihash[-3:-1], 16)
+ frontends = 2 if inum < 0x30 else 3
+ inum = 1 if inum < 0x09 else inum
+
url = "https://{}a.hitomi.la/images/{}/{}/{}.{}".format(
- chr(97 + offset),
+ chr(97 + (inum % frontends)),
ihash[-1], ihash[-3:-1], ihash,
idata["extension"],
)
result.append((url, idata))
return result
-
- @staticmethod
- def _prep(value):
- return [
- text.unescape(string.capwords(v))
- for v in text.extract_iter(value or "", '.html">', '<')
- ]
-
- @staticmethod
- def _prep_1(value):
- return text.remove_html(value).capitalize()
-
- @staticmethod
- def _date(value):
- return text.parse_datetime(value + ":00", "%Y-%m-%d %H:%M:%S%z")
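
The rewritten images() derives each file's download host from its hash
instead of using a fixed subdomain offset: the two hex digits at
ihash[-3:-1] pick one of two or three image frontends. Extracted into a
standalone helper (it mirrors the logic above, which in turn tracks
https://ltn.hitomi.la/common.js and changes whenever the site does):

    def image_subdomain(ihash):
        inum = int(ihash[-3:-1], 16)         # two hex digits -> 0..255
        frontends = 2 if inum < 0x30 else 3  # number of image hosts
        inum = 1 if inum < 0x09 else inum
        return chr(97 + inum % frontends) + "a"

    # e.g. a hypothetical hash ending in "7c1":
    # int("7c", 16) = 124, 124 % 3 = 1  ->  "ba" (ba.hitomi.la)
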
diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py
index 15152b7..d0aa4f2 100644
--- a/gallery_dl/extractor/imgbb.py
+++ b/gallery_dl/extractor/imgbb.py
@@ -111,13 +111,13 @@ class ImgbbAlbumExtractor(ImgbbExtractor):
test = (
("https://ibb.co/album/i5PggF", {
"range": "1-80",
- "url": "570872b6eb3e11cf10b618922b780fed204c3f09",
- "keyword": "0f2fc956728c36540c577578bd168d2459d6ae4b",
+ "url": "70afec9fcc3a6de62a6b644b487d892d8d47cf1a",
+ "keyword": "569e1d88ebdd27655387559cdf1cd526a3e1ab69",
}),
("https://ibb.co/album/i5PggF?sort=title_asc", {
"range": "1-80",
- "url": "e2e387b8fdb3690bd75d804d0af2833112e385cd",
- "keyword": "a307fc9d2085bdc0eb7c538c8d866c59198d460c",
+ "url": "a2dfc58fe3348fa37e242082bd5a85eaa490d0a5",
+ "keyword": "5bb79c82411c3770d673fac64a0a98fa28111c3b",
}),
# no user data (#471)
("https://ibb.co/album/kYKpwF", {
@@ -192,12 +192,12 @@ class ImgbbImageExtractor(ImgbbExtractor):
subcategory = "image"
pattern = r"(?:https?://)?ibb\.co/(?!album/)([^/?&#]+)"
test = ("https://ibb.co/fUqh5b", {
- "pattern": "https://image.ibb.co/dY5FQb/Arundel-Ireeman-5.jpg",
+ "pattern": r"https://i\.ibb\.co/g3kvx80/Arundel-Ireeman-5\.jpg",
"content": "c5a0965178a8b357acd8aa39660092918c63795e",
"keyword": {
"id" : "fUqh5b",
"title" : "Arundel Ireeman 5",
- "url" : "https://image.ibb.co/dY5FQb/Arundel-Ireeman-5.jpg",
+ "url" : "https://i.ibb.co/g3kvx80/Arundel-Ireeman-5.jpg",
"width" : 960,
"height": 719,
"user" : "folkie",
diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py
index 5084e80..0813ea9 100644
--- a/gallery_dl/extractor/imgur.py
+++ b/gallery_dl/extractor/imgur.py
@@ -1,12 +1,12 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2019 Mike Fährmann
+# Copyright 2015-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://imgur.com/"""
+"""Extractors for https://imgur.com/"""
from .common import Extractor, Message
from .. import text, exception
@@ -65,7 +65,7 @@ class ImgurImageExtractor(ImgurExtractor):
"account_url" : None,
"animated" : False,
"bandwidth" : int,
- "date" : "type:datetime",
+ "date" : "dt:2016-11-10 14:24:35",
"datetime" : 1478787875,
"description" : None,
"edited" : "0",
@@ -142,7 +142,7 @@ class ImgurAlbumExtractor(ImgurExtractor):
"cover_edited": None,
"cover_height": 1400,
"cover_width" : 951,
- "date" : "type:datetime",
+ "date" : "dt:2015-10-09 10:37:50",
"datetime" : 1444387070,
"description" : None,
"favorite" : False,
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 05adac1..96afea1 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -1,12 +1,13 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2019 Leonardo Taccari, Mike Fährmann
+# Copyright 2018-2019 Leonardo Taccari
+# Copyright 2018-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://www.instagram.com/"""
+"""Extractors for https://www.instagram.com/"""
from .common import Extractor, Message
from .. import text, exception
@@ -129,6 +130,7 @@ class InstagramExtractor(Extractor):
'owner_id': media['owner']['id'],
'username': media['owner']['username'],
'fullname': media['owner']['full_name'],
+ "post_shortcode": media['shortcode'],
'description': text.parse_unicode_escapes('\n'.join(
edge['node']['text']
for edge in media['edge_media_to_caption']['edges']
@@ -306,12 +308,13 @@ class InstagramImageExtractor(InstagramExtractor):
r"/v(p/[0-9a-f]+/[0-9A-F]+)?/t51.2885-15/e35"
r"/44877605_725955034447492_3123079845831750529_n.jpg",
"keyword": {
- "date": "type:datetime",
+ "date": "dt:2018-11-29 01:04:04",
"description": str,
"height": int,
"likes": int,
"media_id": "1922949326347663701",
"shortcode": "BqvsDleB3lV",
+ "post_shortcode": "BqvsDleB3lV",
"typename": "GraphImage",
"username": "instagram",
"width": int,
@@ -324,6 +327,7 @@ class InstagramImageExtractor(InstagramExtractor):
"keyword": {
"sidecar_media_id": "1875629777499953996",
"sidecar_shortcode": "BoHk1haB5tM",
+ "post_shortcode": "BoHk1haB5tM",
"likes": int,
"username": "instagram",
}
@@ -333,7 +337,7 @@ class InstagramImageExtractor(InstagramExtractor):
("https://www.instagram.com/p/Bqxp0VSBgJg/", {
"pattern": r"/47129943_191645575115739_8539303288426725376_n\.mp4",
"keyword": {
- "date": "type:datetime",
+ "date": "dt:2018-11-29 19:23:58",
"description": str,
"height": int,
"likes": int,
@@ -349,7 +353,7 @@ class InstagramImageExtractor(InstagramExtractor):
("https://www.instagram.com/tv/BkQjCfsBIzi/", {
"pattern": r"/10000000_1760663964018792_716207142595461120_n\.mp4",
"keyword": {
- "date": "type:datetime",
+ "date": "dt:2018-06-20 19:51:32",
"description": str,
"height": int,
"likes": int,
diff --git a/gallery_dl/extractor/issuu.py b/gallery_dl/extractor/issuu.py
index 49d68ef..b34b288 100644
--- a/gallery_dl/extractor/issuu.py
+++ b/gallery_dl/extractor/issuu.py
@@ -35,7 +35,7 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
"access" : "public",
"articleStories": list,
"contentRating" : dict,
- "date" : "type:datetime",
+ "date" : "dt:2019-09-16 00:00:00",
"description" : "re:Motions, the brand new publication by I",
"documentId" : r"re:\d+-d99ec95935f15091b040cb8060f05510",
"documentName" : "motions-1-2019",
diff --git a/gallery_dl/extractor/kabeuchi.py b/gallery_dl/extractor/kabeuchi.py
new file mode 100644
index 0000000..a8702f1
--- /dev/null
+++ b/gallery_dl/extractor/kabeuchi.py
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://kabe-uchiroom.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+
+class KabeuchiUserExtractor(Extractor):
+ """Extractor for all posts of a user on kabe-uchiroom.com"""
+ category = "kabeuchi"
+ subcategory = "user"
+ directory_fmt = ("{category}", "{twitter_user_id} {twitter_id}")
+ filename_fmt = "{id}_{num:>02}{title:?_//}.{extension}"
+ archive_fmt = "{id}_{num}"
+ root = "https://kabe-uchiroom.com"
+ pattern = r"(?:https?://)?kabe-uchiroom\.com/mypage/?\?id=(\d+)"
+ test = (
+ ("https://kabe-uchiroom.com/mypage/?id=919865303848255493", {
+ "pattern": (r"https://kabe-uchiroom\.com/accounts/upfile/3/"
+ r"919865303848255493/\w+\.jpe?g"),
+ "count": ">= 24",
+ }),
+ ("https://kabe-uchiroom.com/mypage/?id=123456789", {
+ "exception": exception.NotFoundError,
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user_id = match.group(1)
+
+ def items(self):
+ base = "{}/accounts/upfile/{}/{}/".format(
+ self.root, self.user_id[-1], self.user_id)
+ keys = ("image1", "image2", "image3", "image4", "image5", "image6")
+
+ for post in self.posts():
+ if post.get("is_ad") or not post["image1"]:
+ continue
+
+ post["date"] = text.parse_datetime(
+ post["created_at"], "%Y-%m-%d %H:%M:%S")
+ yield Message.Directory, post
+
+ for key in keys:
+ name = post[key]
+ if not name:
+ break
+ url = base + name
+ post["num"] = ord(key[-1]) - 48
+ yield Message.Url, url, text.nameext_from_url(name, post)
+
+ def posts(self):
+ url = "{}/mypage/?id={}".format(self.root, self.user_id)
+ response = self.request(url)
+ if response.history and response.url == self.root + "/":
+ raise exception.NotFoundError("user")
+ target_id = text.extract(response.text, 'user_friend_id = "', '"')[0]
+ return self._pagination(target_id)
+
+ def _pagination(self, target_id):
+ url = "{}/get_posts.php".format(self.root)
+ data = {
+ "user_id" : "0",
+ "target_id" : target_id,
+ "type" : "uploads",
+ "sort_type" : "0",
+ "category_id": "all",
+ "latest_post": "",
+ "page_num" : 0,
+ }
+
+ while True:
+ info = self.request(url, method="POST", data=data).json()
+ datas = info["datas"]
+
+ if not datas or not isinstance(datas, list):
+ return
+ yield from datas
+
+ last_id = datas[-1]["id"]
+ if last_id == info["last_data"]:
+ return
+ data["latest_post"] = last_id
+ data["page_num"] += 1
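
A note on the index arithmetic above: posts store their files in the
fields "image1" through "image6", and ord(key[-1]) - 48 turns the
trailing digit of each field name into the image number, 48 being
ord("0"):

    >>> [(k, ord(k[-1]) - 48) for k in ("image1", "image4", "image6")]
    [('image1', 1), ('image4', 4), ('image6', 6)]
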
diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py
index c9e6959..822a743 100644
--- a/gallery_dl/extractor/khinsider.py
+++ b/gallery_dl/extractor/khinsider.py
@@ -1,12 +1,12 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2019 Mike Fährmann
+# Copyright 2016-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract soundtracks from https://downloads.khinsider.com/"""
+"""Extractors for https://downloads.khinsider.com/"""
from .common import Extractor, Message, AsynchronousMixin
from .. import text, exception
@@ -16,54 +16,52 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
"""Extractor for soundtracks from khinsider.com"""
category = "khinsider"
subcategory = "soundtrack"
- directory_fmt = ("{category}", "{album}")
- archive_fmt = "{album}_{filename}.{extension}"
+ directory_fmt = ("{category}", "{album[name]}")
+ archive_fmt = "{filename}.{extension}"
pattern = (r"(?:https?://)?downloads\.khinsider\.com"
r"/game-soundtracks/album/([^/?&#]+)")
+ root = "https://downloads.khinsider.com"
test = (("https://downloads.khinsider.com"
"/game-soundtracks/album/horizon-riders-wii"), {
- "pattern": r"https?://\d+\.\d+\.\d+\.\d+/ost/horizon-riders-wii/[^/]+"
- r"/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack\.mp3",
- "count": 1,
- "keyword": "b4f460c78dd23e1f1121f4ac784dd67ded7c2679",
+ "pattern": r"https?://vgmdownloads.com/soundtracks/horizon-riders-wii/"
+ r"[^/]+/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack.mp3",
+ "keyword": "5b2c35cce638c326cab2a4f7a79f245d008d62ff",
})
- root = "https://downloads.khinsider.com"
def __init__(self, match):
Extractor.__init__(self, match)
self.album = match.group(1)
def items(self):
- url = (self.root + "/game-soundtracks/album/" + self.album)
+ url = self.root + "/game-soundtracks/album/" + self.album
page = self.request(url, encoding="utf-8").text
- data = self.get_job_metadata(page)
+ if "Download all songs at once:" not in page:
+ raise exception.NotFoundError("soundtrack")
+
+ data = self.metadata(page)
yield Message.Version, 1
yield Message.Directory, data
- for url, track in self.get_album_tracks(page):
+ for track in self.tracks(page):
track.update(data)
- yield Message.Url, url, track
+ yield Message.Url, track["url"], track
- def get_job_metadata(self, page):
- """Collect metadata for extractor-job"""
- if "Download all songs at once:" not in page:
- raise exception.NotFoundError("soundtrack")
- data = text.extract_all(page, (
- ("album", "Album name: <b>", "</b>"),
- ("count", "Number of Files: <b>", "</b>"),
- ("size" , "Total Filesize: <b>", "</b>"),
- ("date" , "Date added: <b>", "</b>"),
- ("type" , "Album type: <b>", "</b>"),
- ))[0]
- data["album"] = text.unescape(data["album"])
- return data
+ def metadata(self, page):
+ extr = text.extract_from(page)
+ return {"album": {
+ "name" : text.unescape(extr("Album name: <b>", "<")),
+ "count": text.parse_int(extr("Number of Files: <b>", "<")),
+ "size" : text.parse_bytes(extr("Total Filesize: <b>", "<")[:-1]),
+ "date" : extr("Date added: <b>", "<"),
+ "type" : extr("Album type: <b>", "<"),
+ }}
- def get_album_tracks(self, page):
- """Collect url and metadata for all tracks of a soundtrack"""
+ def tracks(self, page):
page = text.extract(page, '<table id="songlist">', '</table>')[0]
+
for num, url in enumerate(text.extract_iter(
page, '<td class="clickable-row"><a href="', '"'), 1):
url = text.urljoin(self.root, url)
page = self.request(url, encoding="utf-8").text
- url = text.extract(
- page, '<p><a style="color: #21363f;" href="', '"')[0]
- yield url, text.nameext_from_url(url, {"num": num})
+
+ url = text.extract(page, 'style="color: #21363f;" href="', '"')[0]
+ yield text.nameext_from_url(url, {"num": num, "url": url})
diff --git a/gallery_dl/extractor/kissmanga.py b/gallery_dl/extractor/kissmanga.py
index 7151de0..8809589 100644
--- a/gallery_dl/extractor/kissmanga.py
+++ b/gallery_dl/extractor/kissmanga.py
@@ -94,7 +94,7 @@ class KissmangaChapterExtractor(KissmangaBase, ChapterExtractor):
}),
("https://kissmanga.com/Manga/Houseki-no-Kuni/Oneshot?id=404189", {
"count": 49,
- "keyword": "d44d1b21d08e4dbf888b0c450a3f1bc919588b4f",
+ "keyword": "cea131c9fe9c71309b3270cd86718d4d1198c31c",
}),
("https://kissmanga.com/mAnGa/mOnStEr/Monster-79?id=7608"),
)
diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py
index c80cf14..c31de1c 100644
--- a/gallery_dl/extractor/luscious.py
+++ b/gallery_dl/extractor/luscious.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2019 Mike Fährmann
+# Copyright 2016-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -59,7 +59,7 @@ class LusciousAlbumExtractor(LusciousExtractor):
"cover" : "re:https://\\w+.luscious.net/.+/277031/",
"created" : 1479625853,
"created_by" : "NTRshouldbeillegal",
- "date" : "type:datetime",
+ "date" : "dt:2016-11-20 07:10:53",
"description" : "Enjoy.",
"download_url": "/download/824778/277031/",
"genres" : list,
diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py
index d24d452..31083dc 100644
--- a/gallery_dl/extractor/mangareader.py
+++ b/gallery_dl/extractor/mangareader.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2019 Mike Fährmann
+# Copyright 2015-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -38,7 +38,7 @@ class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor):
pattern = r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?&#]+)/(\d+))"
test = (("https://www.mangareader.net"
"/karate-shoukoushi-kohinata-minoru/11"), {
- "url": "061cc92a07edf17bb991ce0821fa4c77a147a860",
+ "url": "3d8a5b900856d59b8d8e83908d0df392be92c0f4",
"keyword": "133e3e2f7c0529a35bbb16149e34c40546f8dfd6",
})
diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py
index 114a48e..8cd7fa5 100644
--- a/gallery_dl/extractor/mangoxo.py
+++ b/gallery_dl/extractor/mangoxo.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -12,6 +12,7 @@ from .common import Extractor, Message
from .. import text, exception
from ..cache import cache
import hashlib
+import time
class MangoxoExtractor(Extractor):
@@ -35,28 +36,34 @@ class MangoxoExtractor(Extractor):
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)
- page = self.request(self.root + "/login/").text
- token = text.extract(page, 'id="loginToken" value="', '"')[0]
- if not token:
- self.log.debug("failed to extract 'loginToken'")
-
- url = self.root + "/login/loginxmm"
+ url = self.root + "/api/login"
headers = {
"X-Requested-With": "XMLHttpRequest",
"Referer": self.root + "/login",
}
- data = {
- "name": username,
- "password": hashlib.md5(password.encode()).hexdigest(),
- "loginToken": token,
- }
+ data = self._sign_by_md5(username, password)
response = self.request(url, method="POST", headers=headers, data=data)
- if response.json().get("result") != "1":
- raise exception.AuthenticationError()
+ data = response.json()
+ if str(data.get("result")) != "1":
+ raise exception.AuthenticationError(data.get("msg"))
return {"SESSION": self.session.cookies.get("SESSION")}
@staticmethod
+ def _sign_by_md5(username, password):
+ # https://dns.mangoxo.com/libs/plugins/phoenix-ui/js/phoenix-ui.js
+ params = [
+ ("username" , username),
+ ("password" , password),
+ ("timestamp", str(int(time.time()))),
+ ]
+ query = "&".join("=".join(item) for item in sorted(params))
+ query += "&secretKey=996293536"
+ sign = hashlib.md5(query.encode()).hexdigest()
+ params.append(("sign", sign.upper()))
+ return params
+
+ @staticmethod
def _total_pages(page):
return text.parse_int(text.extract(page, "total :", ",")[0])
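
The new login flow signs its POST parameters the same way the site's
own JavaScript does: sort the key-value pairs, serialize them as a
query string, append the shared secretKey, and attach the uppercased
MD5 digest as "sign". A standalone sketch of _sign_by_md5() with
hypothetical credentials:

    import hashlib
    import time

    def sign_by_md5(username, password, secret_key="996293536"):
        params = [
            ("username" , username),
            ("password" , password),
            ("timestamp", str(int(time.time()))),
        ]
        query = "&".join("=".join(item) for item in sorted(params))
        query += "&secretKey=" + secret_key
        digest = hashlib.md5(query.encode()).hexdigest()
        params.append(("sign", digest.upper()))
        return params

    print(sign_by_md5("alice", "hunter2"))  # illustration only
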
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 54e60b0..21afeae 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2019 Mike Fährmann
+# Copyright 2018-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -11,6 +11,7 @@
from .common import Extractor, Message
from .. import text, exception
from ..cache import cache
+import itertools
import json
@@ -35,16 +36,17 @@ class NewgroundsExtractor(Extractor):
for post_url in self.posts():
try:
- file = self.extract_post(post_url)
- url = file["url"]
- # except Exception:
+ post = self.extract_post(post_url)
+ url = post.get("url")
except OSError:
url = None
- if not url:
- self.log.warning("Unable to get download URL for %s", post_url)
- continue
- yield Message.Directory, file
- yield Message.Url, url, text.nameext_from_url(url, file)
+
+ if url:
+ yield Message.Directory, post
+ yield Message.Url, url, text.nameext_from_url(url, post)
+ else:
+ self.log.warning(
+ "Unable to get download URL for '%s'", post_url)
def posts(self):
"""Return urls of all relevant image pages"""
@@ -82,7 +84,10 @@ class NewgroundsExtractor(Extractor):
}
def extract_post(self, post_url):
- page = self.request(post_url).text
+ response = self.request(post_url, fatal=False)
+ if response.status_code >= 400:
+ return {}
+ page = response.text
extr = text.extract_from(page)
if "/art/view/" in post_url:
@@ -97,8 +102,7 @@ class NewgroundsExtractor(Extractor):
data["favorites"] = text.parse_int(extr(
'id="faves_load">', '<').replace(",", ""))
data["score"] = text.parse_float(extr('id="score_number">', '<'))
- data["tags"] = text.split_html(extr(
- '<dd class="tags">', '</dd>'))
+ data["tags"] = text.split_html(extr('<dd class="tags">', '</dd>'))
data["artist"] = [
text.extract(user, '//', '.')[0]
for user in text.extract_iter(page, '<div class="item-user">', '>')
@@ -194,7 +198,7 @@ class NewgroundsImageExtractor(NewgroundsExtractor):
"keyword": {
"artist" : ["tomfulp"],
"comment" : "re:Consider this the bottom threshold for ",
- "date" : "type:datetime",
+ "date" : "dt:2009-06-04 14:44:05",
"description": "re:Consider this the bottom threshold for ",
"favorites" : int,
"filename" : "94_tomfulp_ryu-is-hawt",
@@ -241,7 +245,7 @@ class NewgroundsMediaExtractor(NewgroundsExtractor):
"keyword": {
"artist" : ["psychogoldfish", "tomfulp"],
"comment" : "re:People have been asking me how I like the ",
- "date" : "type:datetime",
+ "date" : "dt:2012-02-08 21:40:56",
"description": "re:People have been asking how I like the ",
"favorites" : int,
"filename" : "527818_alternate_1896",
@@ -259,7 +263,7 @@ class NewgroundsMediaExtractor(NewgroundsExtractor):
"keyword": {
"artist" : ["zj", "tomfulp"],
"comment" : "re:RECORDED 12-09-2014\n\nFrom The ZJ \"Late ",
- "date" : "type:datetime",
+ "date" : "dt:2015-02-23 19:31:59",
"description": "From The ZJ Report Show!",
"favorites" : int,
"index" : 609768,
@@ -334,3 +338,53 @@ class NewgroundsUserExtractor(NewgroundsExtractor):
(NewgroundsAudioExtractor , base + "audio"),
(NewgroundsMoviesExtractor, base + "movies"),
), ("art",))
+
+
+class NewgroundsFavoriteExtractor(NewgroundsExtractor):
+ """Extractor for posts favorited by a newgrounds user"""
+ subcategory = "favorite"
+ directory_fmt = ("{category}", "{user}", "Favorites")
+ pattern = (r"(?:https?://)?([^.]+)\.newgrounds\.com"
+ r"/favorites(?:/(art|audio|movies))?/?")
+ test = (
+ ("https://tomfulp.newgrounds.com/favorites/art", {
+ "range": "1-10",
+ "count": ">= 10",
+ }),
+ ("https://tomfulp.newgrounds.com/favorites/audio"),
+ ("https://tomfulp.newgrounds.com/favorites/movies"),
+ ("https://tomfulp.newgrounds.com/favorites/"),
+ )
+
+ def __init__(self, match):
+ NewgroundsExtractor.__init__(self, match)
+ self.kind = match.group(2)
+
+ def posts(self):
+ if self.kind:
+ return self._pagination(self.kind)
+ return itertools.chain.from_iterable(
+ self._pagination(k) for k in ("art", "audio", "movies")
+ )
+
+ def _pagination(self, kind):
+ num = 1
+ headers = {
+ "Accept": "application/json, text/javascript, */*; q=0.01",
+ "X-Requested-With": "XMLHttpRequest",
+ "Referer": self.user_root,
+ }
+
+ while True:
+ url = "{}/favorites/{}/{}".format(self.user_root, kind, num)
+ response = self.request(url, headers=headers)
+ if response.history:
+ return
+
+ favs = list(text.extract_iter(
+ response.text, 'href="//www.newgrounds.com', '"'))
+ for path in favs:
+ yield self.root + path
+ if len(favs) < 24:
+ return
+ num += 1
diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py
index 97be789..dfe31e3 100644
--- a/gallery_dl/extractor/nozomi.py
+++ b/gallery_dl/extractor/nozomi.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -86,7 +86,7 @@ class NozomiPostExtractor(NozomiExtractor):
"character": ["patchouli knowledge"],
"copyright": ["touhou"],
"dataid" : "re:aaa9f7c632cde1e1a5baaff3fb6a6d857ec73df7fdc5cf5a",
- "date" : "type:datetime",
+ "date" : "dt:2016-07-26 02:32:03",
"extension": "jpg",
"favorites": int,
"filename" : str,
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index 74835bf..2f5b429 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -1,12 +1,12 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2019 Mike Fährmann
+# Copyright 2017-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Utility classes to setup OAuth and link a users account to gallery-dl"""
+"""Utility classes to setup OAuth and link accounts to gallery-dl"""
from .common import Extractor, Message
from . import deviantart, flickr, reddit, smugmug, tumblr
@@ -38,7 +38,7 @@ class OAuthBase(Extractor):
print("Waiting for response. (Cancel with Ctrl+c)")
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
- server.bind(("localhost", 6414))
+ server.bind(("localhost", self.config("port", 6414)))
server.listen(1)
# workaround for ctrl+c not working during server.accept on Windows
@@ -98,7 +98,7 @@ class OAuthBase(Extractor):
def _oauth2_authorization_code_grant(
self, client_id, client_secret, auth_url, token_url,
scope="read", key="refresh_token", auth=True,
- message_template=None):
+ message_template=None, cache=None):
"""Perform an OAuth2 authorization code grant"""
state = "gallery-dl_{}_{}".format(
@@ -162,6 +162,11 @@ class OAuthBase(Extractor):
client_secret=client_secret,
))
+ # write to cache
+ if cache and config.get(("extractor", self.category), "cache"):
+ cache.update("#" + str(client_id), data[key])
+ self.log.info("Writing 'refresh-token' to cache")
+
class OAuthDeviantart(OAuthBase):
subcategory = "deviantart"
@@ -179,6 +184,7 @@ class OAuthDeviantart(OAuthBase):
"https://www.deviantart.com/oauth2/authorize",
"https://www.deviantart.com/oauth2/token",
scope="browse",
+ cache=deviantart._refresh_token_cache,
)
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index a4731d0..931fb13 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2019 Mike Fährmann
+# Copyright 2018-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -45,7 +45,7 @@ class PahealTagExtractor(PahealExtractor):
directory_fmt = ("{category}", "{search_tags}")
pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net"
r"/post/list/([^/?&#]+)")
- test = ("https://rule34.paheal.net/post/list/k-on/1", {
+ test = ("https://rule34.paheal.net/post/list/Ayane_Suzuki/1", {
"pattern": r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20",
"count": ">= 15"
})
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 1e52559..0d51df2 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -12,6 +12,7 @@ from .common import Extractor, Message
from .. import text, exception
from ..cache import memcache
import collections
+import itertools
import json
@@ -33,43 +34,62 @@ class PatreonExtractor(Extractor):
PatreonExtractor._warning = False
for post in self.posts():
- ids = set()
post["num"] = 0
- content = post.get("content")
- postfile = post.get("post_file")
+ hashes = set()
yield Message.Directory, post
yield Message.Metadata, text.nameext_from_url(
post["creator"].get("image_url", ""), post)
- for image in post["images"]:
- url = image.get("download_url")
- if not url:
- continue
- ids.add(url.split("/")[-2])
- name = image.get("file_name") or self._filename(url) or url
+ for kind, url, name in itertools.chain(
+ self._postfile(post),
+ self._images(post),
+ self._attachments(post),
+ self._content(post),
+ ):
+ fhash = url.rsplit("/", 2)[1]
+ if fhash not in hashes:
+ hashes.add(fhash)
+ post["hash"] = fhash
+ post["type"] = kind
+ post["num"] += 1
+ yield Message.Url, url, text.nameext_from_url(name, post)
+ else:
+ self.log.debug("skipping %s (%s %s)", url, fhash, kind)
- post["num"] += 1
- post["type"] = "image"
- yield Message.Url, url, text.nameext_from_url(name, post)
+ @staticmethod
+ def _postfile(post):
+ postfile = post.get("post_file")
+ if postfile:
+ return (("postfile", postfile["url"], postfile["name"]),)
+ return ()
+
+ def _images(self, post):
+ for image in post["images"]:
+ url = image.get("download_url")
+ if url:
+ name = image.get("file_name") or self._filename(url) or url
+ yield "image", url, name
- if postfile and postfile["url"].split("/")[-2] not in ids:
- post["num"] += 1
- post["type"] = "postfile"
- text.nameext_from_url(postfile["name"], post)
- yield Message.Url, postfile["url"], post
+ def _attachments(self, post):
+ for attachment in post["attachments"]:
+ url = self.request(
+ attachment["url"], method="HEAD",
+ allow_redirects=False, fatal=False,
+ ).headers.get("Location")
- for attachment in post["attachments"]:
- post["num"] += 1
- post["type"] = "attachment"
- text.nameext_from_url(attachment["name"], post)
- yield Message.Url, attachment["url"], post
+ if url:
+ yield "attachment", url, attachment["name"]
- if content:
- for url in text.extract_iter(content, 'src="', '"'):
- post["num"] += 1
- post["type"] = "content"
- yield Message.Url, url, text.nameext_from_url(url, post)
+ @staticmethod
+ def _content(post):
+ content = post.get("content")
+ if content:
+ for img in text.extract_iter(
+ content, '<img data-media-id="', '>'):
+ url = text.extract(img, 'src="', '"')[0]
+ if url:
+ yield "content", url, url
def posts(self):
"""Return all relevant post objects"""
@@ -238,11 +258,13 @@ class PatreonPostExtractor(PatreonExtractor):
subcategory = "post"
pattern = r"(?:https?://)?(?:www\.)?patreon\.com/posts/([^/?&#]+)"
test = (
+ # postfile + attachments
("https://www.patreon.com/posts/precious-metal-23563293", {
"count": 4,
}),
- ("https://www.patreon.com/posts/er1-28201153", {
- "count": 1,
+ # postfile + content
+ ("https://www.patreon.com/posts/19987002", {
+ "count": 4,
}),
("https://www.patreon.com/posts/not-found-123", {
"exception": exception.NotFoundError,
diff --git a/gallery_dl/extractor/piczel.py b/gallery_dl/extractor/piczel.py
index 5f50245..35f9f91 100644
--- a/gallery_dl/extractor/piczel.py
+++ b/gallery_dl/extractor/piczel.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2019 Mike Fährmann
+# Copyright 2018-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -22,27 +22,30 @@ class PiczelExtractor(Extractor):
def items(self):
yield Message.Version, 1
- for image in self.unpack(self.images()):
- url = self.root + "/static" + image["image"]["image"]["url"]
- yield Message.Directory, image
- yield Message.Url, url, text.nameext_from_url(url, image)
-
- @staticmethod
- def unpack(images):
- """Unpack 'images' into individual image objects"""
- for image in images:
- if image["multi"]:
- multi = image["images"]
- del image["images"]
- for image["num"], img in enumerate(multi):
- image["image"] = img
- yield image
+ for post in self.posts():
+ post["tags"] = [t["title"] for t in post["tags"] if t["title"]]
+ post["date"] = text.parse_datetime(
+ post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
+
+ if post["multi"]:
+ images = post["images"]
+ del post["images"]
+ yield Message.Directory, post
+ for post["num"], image in enumerate(images):
+ if "id" in image:
+ del image["id"]
+ post.update(image)
+ url = post["image"]["url"]
+ yield Message.Url, url, text.nameext_from_url(url, post)
+
else:
- image["num"] = 0
- yield image
+ yield Message.Directory, post
+ post["num"] = 0
+ url = post["image"]["url"]
+ yield Message.Url, url, text.nameext_from_url(url, post)
- def images(self):
- """Return an iterable with all relevant image objects"""
+ def posts(self):
+ """Return an iterable with all relevant post objects"""
def _pagination(self, url, folder_id=None):
params = {
@@ -53,26 +56,26 @@ class PiczelExtractor(Extractor):
while True:
data = self.request(url, params=params).json()
- yield from data
-
- if len(data) < 32:
+ if not data:
return
params["from_id"] = data[-1]["id"]
+ yield from data
class PiczelUserExtractor(PiczelExtractor):
"""Extractor for all images from a user's gallery"""
subcategory = "user"
pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/([^/?&#]+)/?$"
- test = ("https://piczel.tv/gallery/Maximumwarp", {
- "count": ">= 45",
+ test = ("https://piczel.tv/gallery/Bikupan", {
+ "range": "1-100",
+ "count": ">= 100",
})
def __init__(self, match):
PiczelExtractor.__init__(self, match)
self.user = match.group(1)
- def images(self):
+ def posts(self):
url = "{}/api/users/{}/gallery".format(self.root, self.user)
return self._pagination(url)
@@ -92,7 +95,7 @@ class PiczelFolderExtractor(PiczelExtractor):
PiczelExtractor.__init__(self, match)
self.user, self.folder_id = match.groups()
- def images(self):
+ def posts(self):
url = "{}/api/users/{}/gallery".format(self.root, self.user)
return self._pagination(url, self.folder_id)
@@ -106,6 +109,7 @@ class PiczelImageExtractor(PiczelExtractor):
"content": "df9a053a24234474a19bce2b7e27e0dec23bff87",
"keyword": {
"created_at": "2018-07-22T05:13:58.000Z",
+ "date": "dt:2018-07-22 05:13:58",
"description": None,
"extension": "png",
"favorites_count": int,
@@ -118,7 +122,7 @@ class PiczelImageExtractor(PiczelExtractor):
"nsfw": False,
"num": 0,
"password_protected": False,
- "tags": "fanart, commission, altair, recreators, ",
+ "tags": ["fanart", "commission", "altair", "recreators"],
"title": "Altair",
"user": dict,
"views": int,
@@ -129,6 +133,6 @@ class PiczelImageExtractor(PiczelExtractor):
PiczelExtractor.__init__(self, match)
self.image_id = match.group(1)
- def images(self):
+ def posts(self):
url = "{}/api/gallery/image/{}".format(self.root, self.image_id)
return (self.request(url).json(),)
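
Piczel pagination now advances a 'from_id' cursor and stops only on an
empty response, instead of assuming that a short page (fewer than 32
entries) must be the last one. The pattern in isolation, where
'request' stands in for Extractor.request:

    def pagination(request, url):
        params = {}
        while True:
            data = request(url, params=params).json()
            if not data:
                return                          # empty page: done
            params["from_id"] = data[-1]["id"]  # resume after last id
            yield from data
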
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index 8a10028..eaf97fd 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -12,6 +12,7 @@ from .common import Extractor, Message
from .. import text, exception
from ..cache import cache
from datetime import datetime, timedelta
+import itertools
import hashlib
import time
@@ -27,11 +28,11 @@ class PixivExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
self.api = PixivAppAPI(self)
- self.user_id = -1
self.load_ugoira = self.config("ugoira", True)
def items(self):
- metadata = self.get_metadata()
+ ratings = {0: "General", 1: "R-18", 2: "R-18G"}
+ metadata = self.metadata()
yield Message.Version, 1
for work in self.works():
@@ -46,6 +47,7 @@ class PixivExtractor(Extractor):
work["num"] = 0
work["tags"] = [tag["name"] for tag in work["tags"]]
work["date"] = text.parse_datetime(work["create_date"])
+ work["rating"] = ratings.get(work["x_restrict"])
work["suffix"] = ""
work.update(metadata)
@@ -74,11 +76,9 @@ class PixivExtractor(Extractor):
def works(self):
"""Return an iterable containing all relevant 'work'-objects"""
- def get_metadata(self, user=None):
+ def metadata(self):
"""Collect metadata for extractor-job"""
- if not user:
- user = self.api.user_detail(self.user_id)
- return {"user": user}
+ return {}
class PixivUserExtractor(PixivExtractor):
@@ -102,8 +102,15 @@ class PixivUserExtractor(PixivExtractor):
"&tag=%E6%89%8B%E3%81%B6%E3%82%8D"), {
"url": "25b1cd81153a8ff82eec440dd9f20a4a22079658",
}),
+        # avatar (#595, #623)
+ ("https://www.pixiv.net/en/users/173530", {
+ "options": (("avatar", True),),
+ "content": "22af450d4dbaf4973d370f164f66f48c7382a6de",
+ "range": "1",
+ }),
+ # deleted account
("http://www.pixiv.net/member_illust.php?id=173531", {
- "exception": exception.NotFoundError,
+ "count": 0,
}),
("https://www.pixiv.net/en/users/173530"),
("https://www.pixiv.net/en/users/173530/manga"),
@@ -136,6 +143,27 @@ class PixivUserExtractor(PixivExtractor):
if tag in [t["name"].lower() for t in work["tags"]]
)
+ if self.config("avatar"):
+ user = self.api.user_detail(self.user_id)
+ url = user["profile_image_urls"]["medium"].replace("_170.", ".")
+ avatar = {
+ "create_date" : None,
+ "height" : 0,
+ "id" : "avatar",
+ "image_urls" : None,
+ "meta_pages" : (),
+ "meta_single_page": {"original_image_url": url},
+ "page_count" : 1,
+ "sanity_level" : 0,
+ "tags" : (),
+ "title" : "avatar",
+ "type" : "avatar",
+ "user" : user,
+ "width" : 0,
+ "x_restrict" : 0,
+ }
+ works = itertools.chain((avatar,), works)
+
return works
@@ -203,15 +231,9 @@ class PixivWorkExtractor(PixivExtractor):
def __init__(self, match):
PixivExtractor.__init__(self, match)
self.illust_id = match.group(1) or match.group(2)
- self.load_ugoira = True
- self.work = None
def works(self):
- return (self.work,)
-
- def get_metadata(self, user=None):
- self.work = self.api.illust_detail(self.illust_id)
- return PixivExtractor.get_metadata(self, self.work["user"])
+ return (self.api.illust_detail(self.illust_id),)
class PixivFavoriteExtractor(PixivExtractor):
@@ -220,8 +242,8 @@ class PixivFavoriteExtractor(PixivExtractor):
directory_fmt = ("{category}", "bookmarks",
"{user_bookmark[id]} {user_bookmark[account]}")
archive_fmt = "f_{user_bookmark[id]}_{id}{num}.{extension}"
- pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/"
- r"(?:(?:en/)?users/(\d+)/(bookmarks/artworks|following)"
+ pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/(?:(?:en/)?"
+ r"users/(\d+)/(bookmarks/artworks(?:/([^/?&#]+))?|following)"
r"|bookmark\.php(?:\?([^#]*))?)")
test = (
("https://www.pixiv.net/en/users/173530/bookmarks/artworks", {
@@ -231,20 +253,29 @@ class PixivFavoriteExtractor(PixivExtractor):
"url": "e717eb511500f2fa3497aaee796a468ecf685cc4",
}),
# bookmarks with specific tag
+ (("https://www.pixiv.net/en/users/3137110"
+ "/bookmarks/artworks/%E3%81%AF%E3%82%93%E3%82%82%E3%82%93"), {
+ "url": "379b28275f786d946e01f721e54afe346c148a8c",
+ }),
+ # bookmarks with specific tag (legacy url)
(("https://www.pixiv.net/bookmark.php?id=3137110"
"&tag=%E3%81%AF%E3%82%93%E3%82%82%E3%82%93&p=1"), {
- "count": 2,
+ "url": "379b28275f786d946e01f721e54afe346c148a8c",
}),
# own bookmarks
("https://www.pixiv.net/bookmark.php", {
"url": "90c1715b07b0d1aad300bce256a0bc71f42540ba",
}),
+ # own bookmarks with tag (#596)
+ ("https://www.pixiv.net/bookmark.php?tag=foobar", {
+ "count": 0,
+ }),
# followed users (#515)
("https://www.pixiv.net/en/users/173530/following", {
"pattern": PixivUserExtractor.pattern,
"count": ">= 12",
}),
- # followed users (#515)
+ # followed users (legacy url) (#515)
("https://www.pixiv.net/bookmark.php?id=173530&type=user", {
"pattern": PixivUserExtractor.pattern,
"count": ">= 12",
@@ -255,11 +286,11 @@ class PixivFavoriteExtractor(PixivExtractor):
)
def __init__(self, match):
- uid, kind, query = match.groups()
+ uid, kind, self.tag, query = match.groups()
if query:
self.query = text.parse_query(query)
- uid = self.query.get("id", -1)
+ uid = self.query.get("id")
if not uid:
self.subcategory = "bookmark"
elif self.query.get("type") == "user":
@@ -280,12 +311,15 @@ class PixivFavoriteExtractor(PixivExtractor):
if "tag" in self.query:
tag = text.unquote(self.query["tag"])
+ elif self.tag:
+ tag = text.unquote(self.tag)
+
if "rest" in self.query and self.query["rest"] == "hide":
restrict = "private"
return self.api.user_bookmarks_illust(self.user_id, tag, restrict)
- def get_metadata(self, user=None):
+ def metadata(self):
if self.user_id:
user = self.api.user_detail(self.user_id)
else:
@@ -301,7 +335,7 @@ class PixivFavoriteExtractor(PixivExtractor):
for preview in self.api.user_following(self.user_id):
user = preview["user"]
user["_extractor"] = PixivUserExtractor
- url = "https://www.pixiv.net/member.php?id={}".format(user["id"])
+ url = "https://www.pixiv.net/users/{}".format(user["id"])
yield Message.Queue, url, user
@@ -327,7 +361,7 @@ class PixivRankingExtractor(PixivExtractor):
def works(self):
return self.api.illust_ranking(self.mode, self.date)
- def get_metadata(self, user=None):
+ def metadata(self):
query = text.parse_query(self.query)
mode = query.get("mode", "daily").lower()
@@ -393,7 +427,7 @@ class PixivSearchExtractor(PixivExtractor):
def works(self):
return self.api.search_illust(self.word, self.sort, self.target)
- def get_metadata(self, user=None):
+ def metadata(self):
query = text.parse_query(self.query)
if self.word:
@@ -446,7 +480,7 @@ class PixivFollowExtractor(PixivExtractor):
def works(self):
return self.api.illust_follow()
- def get_metadata(self, user=None):
+ def metadata(self):
self.api.login()
return {"user_follow": self.api.user}
diff --git a/gallery_dl/extractor/pururin.py b/gallery_dl/extractor/pururin.py
index aa5c9c6..721fc2f 100644
--- a/gallery_dl/extractor/pururin.py
+++ b/gallery_dl/extractor/pururin.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -21,8 +21,8 @@ class PururinGalleryExtractor(GalleryExtractor):
("https://pururin.io/gallery/38661/iowant-2", {
"pattern": r"https://cdn.pururin.io/\w+/images/data/\d+/\d+\.jpg",
"keyword": {
- "title" : "Iowant 2!!",
- "title_en" : "Iowant 2!!",
+ "title" : "re:I ?owant 2!!",
+ "title_en" : "re:I ?owant 2!!",
"title_jp" : "",
"gallery_id": 38661,
"count" : 19,
diff --git a/gallery_dl/extractor/realbooru.py b/gallery_dl/extractor/realbooru.py
index 6d89151..70b4833 100644
--- a/gallery_dl/extractor/realbooru.py
+++ b/gallery_dl/extractor/realbooru.py
@@ -30,7 +30,7 @@ class RealbooruTagExtractor(booru.TagMixin, RealbooruExtractor):
pattern = (r"(?:https?://)?(?:www\.)?realbooru\.com/(?:index\.php)?"
r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")
test = ("https://realbooru.com/index.php?page=post&s=list&tags=wine", {
- "count": 64,
+ "count": ">= 64",
})
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 4c83019..a312c1c 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -9,74 +9,104 @@
"""Extractors for https://www.reddit.com/"""
from .common import Extractor, Message
-from .. import text, util, extractor, exception
+from .. import text, util, exception
from ..cache import cache
class RedditExtractor(Extractor):
"""Base class for reddit extractors"""
category = "reddit"
+ directory_fmt = ("{category}", "{subreddit}")
+ filename_fmt = "{id} {title[:242]}.{extension}"
+ archive_fmt = "{filename}"
cookiedomain = None
def __init__(self, match):
Extractor.__init__(self, match)
self.api = RedditAPI(self)
- self.max_depth = int(self.config("recursion", 0))
- self._visited = set()
+ self.max_depth = self.config("recursion", 0)
def items(self):
- subre = RedditSubmissionExtractor.pattern
+ match_submission = RedditSubmissionExtractor.pattern.match
+ match_subreddit = RedditSubredditExtractor.pattern.match
+ match_user = RedditUserExtractor.pattern.match
+
+ parentdir = self.config("parent-directory")
+ videos = self.config("videos", True)
+
submissions = self.submissions()
+ visited = set()
depth = 0
yield Message.Version, 1
- with extractor.blacklist(
- util.SPECIAL_EXTRACTORS,
- [RedditSubredditExtractor, RedditUserExtractor]):
- while True:
- extra = []
- for url, data in self._urls(submissions):
- if url[0] == "#":
+
+ while True:
+ extra = []
+
+ for submission, comments in submissions:
+ urls = []
+
+ if submission:
+ yield Message.Directory, submission
+ visited.add(submission["id"])
+ url = submission["url"]
+
+ if url.startswith("https://i.redd.it/"):
+ text.nameext_from_url(url, submission)
+ yield Message.Url, url, submission
+
+ elif submission["is_video"]:
+ if videos:
+ text.nameext_from_url(url, submission)
+ if videos == "ytdl":
+ url = "https://www.reddit.com" + \
+ submission["permalink"]
+ else:
+ submission["_ytdl_extra"] = {
+ "title": submission["title"],
+ }
+ yield Message.Url, "ytdl:" + url, submission
+
+ elif not submission["is_self"]:
+ urls.append((url, submission))
+
+ elif parentdir:
+ yield Message.Directory, comments[0]
+
+ if self.api.comments:
+ if submission:
+ for url in text.extract_iter(
+ submission["selftext_html"] or "",
+ ' href="', '"'):
+ urls.append((url, submission))
+ for comment in comments:
+ for url in text.extract_iter(
+ comment["body_html"] or "", ' href="', '"'):
+ urls.append((url, comment))
+
+ for url, data in urls:
+ if not url or url[0] == "#":
continue
if url[0] == "/":
url = "https://www.reddit.com" + url
- match = subre.match(url)
+ match = match_submission(url)
if match:
extra.append(match.group(1))
- else:
+ elif not match_user(url) and not match_subreddit(url):
yield Message.Queue, text.unescape(url), data
- if not extra or depth == self.max_depth:
- return
- depth += 1
- submissions = (
- self.api.submission(sid) for sid in extra
- if sid not in self._visited
- )
+ if not extra or depth == self.max_depth:
+ return
+ depth += 1
+ submissions = (
+ self.api.submission(sid) for sid in extra
+                if sid not in visited
+ )
def submissions(self):
"""Return an iterable containing all (submission, comments) tuples"""
- def _urls(self, submissions):
- for submission, comments in submissions:
-
- if submission:
- self._visited.add(submission["id"])
-
- if not submission["is_self"]:
- yield submission["url"], submission
-
- for url in text.extract_iter(
- submission["selftext_html"] or "", ' href="', '"'):
- yield url, submission
-
- if comments:
- for comment in comments:
- for url in text.extract_iter(
- comment["body_html"] or "", ' href="', '"'):
- yield url, comment
-
class RedditSubredditExtractor(RedditExtractor):
"""Extractor for URLs from subreddits on reddit.com"""
@@ -84,7 +114,10 @@ class RedditSubredditExtractor(RedditExtractor):
pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com/r/"
r"([^/?&#]+(?:/[a-z]+)?)/?(?:\?([^#]*))?(?:$|#)")
test = (
- ("https://www.reddit.com/r/lavaporn/"),
+ ("https://www.reddit.com/r/lavaporn/", {
+ "range": "1-20",
+ "count": ">= 20",
+ }),
("https://www.reddit.com/r/lavaporn/top/?sort=top&t=month"),
("https://old.reddit.com/r/lavaporn/"),
("https://np.reddit.com/r/lavaporn/"),
@@ -210,7 +243,7 @@ class RedditAPI():
link_id = "t3_" + submission_id if self.morecomments else None
submission, comments = self._call(endpoint, {"limit": self.comments})
return (submission["data"]["children"][0]["data"],
- self._flatten(comments, link_id) if self.comments else None)
+ self._flatten(comments, link_id) if self.comments else ())
def submissions_subreddit(self, subreddit, params):
"""Collect all (submission, comments)-tuples of a subreddit"""
@@ -290,7 +323,8 @@ class RedditAPI():
raise exception.AuthorizationError()
if data["error"] == 404:
raise exception.NotFoundError()
- raise Exception(data["message"])
+ self.log.debug(data)
+ raise exception.StopExtraction(data.get("message"))
return data
def _pagination(self, endpoint, params):
@@ -315,7 +349,7 @@ class RedditAPI():
except exception.AuthorizationError:
pass
else:
- yield post, None
+ yield post, ()
elif kind == "t1" and self.comments:
yield None, (post,)
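
The reddit hunks above funnel every candidate link through one list: direct i.redd.it files, hrefs scraped from selftext_html, and hrefs from comment body_html, with submission links recursed into (up to max_depth) and user/subreddit links dropped. A minimal sketch of the href-collection idiom, assuming gallery_dl is importable; the HTML sample is invented:

    from gallery_dl import text

    selftext_html = (
        '<p>see <a href="https://i.redd.it/abcdef.jpg">pic</a> and '
        '<a href="/r/pics/comments/xyz123/">this thread</a></p>'
    )

    for url in text.extract_iter(selftext_html, ' href="', '"'):
        if not url or url[0] == "#":
            continue
        if url[0] == "/":  # relative reddit links become absolute
            url = "https://www.reddit.com" + url
        print(text.unescape(url))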
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py
index 2c9746e..521b034 100644
--- a/gallery_dl/extractor/sexcom.py
+++ b/gallery_dl/extractor/sexcom.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -83,11 +83,11 @@ class SexcomExtractor(Extractor):
data["url"] = "ytdl:" + text.extract(
extr('<iframe', '>'), ' src="', '"')[0]
else:
- data["url"] = extr(' src="', '"')
+ data["url"] = text.unescape(extr(' src="', '"').partition("?")[0])
text.nameext_from_url(data["url"], data)
data["uploader"] = extr('itemprop="author">', '<')
- data["date"] = extr('datetime="', '"')
+ data["date"] = text.parse_datetime(extr('datetime="', '"'))
data["tags"] = text.split_html(extr('class="tags"> Tags', '</div>'))
data["comments"] = text.parse_int(extr('Comments (', ')'))
@@ -102,28 +102,28 @@ class SexcomPinExtractor(SexcomExtractor):
test = (
# picture
("https://www.sex.com/pin/56714360/", {
- "url": "599190d6e3d79f9f49dda194a0a58cb0ffa3ab86",
- "content": "963ed681cf53904173c7581b713c7f9471f04db0",
+ "pattern": "https://cdn.sex.com/images/.+/2018/10/02/20037816.jpg",
+ "content": "e579e3283fea812d0545a3f79734b79bc3c51acb",
"keyword": {
- "comments": int,
- "date": "2018-10-02T21:18:17-04:00",
+ "comments" : int,
+ "date" : "dt:2018-10-02 21:18:17",
"extension": "jpg",
- "filename": "20037816",
- "likes": int,
- "pin_id": 56714360,
- "repins": int,
- "tags": list,
+ "filename" : "20037816",
+ "likes" : int,
+ "pin_id" : 56714360,
+ "repins" : int,
+ "tags" : list,
"thumbnail": str,
- "title": "Pin #56714360",
- "type": "picture",
- "uploader": "alguem",
- "url": str,
+ "title" : "Pin #56714360",
+ "type" : "picture",
+ "uploader" : "alguem",
+ "url" : str,
},
}),
# gif
("https://www.sex.com/pin/11465040-big-titted-hentai-gif/", {
- "url": "98a82c5ae7a65c8228e1405ac740f80d4d556de1",
- "content": "a54b37eb39d565094c54ad7d21244fe8f978fb14",
+ "pattern": "https://cdn.sex.com/images/.+/2014/01/26/4829951.gif",
+ "content": "af6726d74d11d819e1c885fe5303f711862eae96",
}),
# video
("https://www.sex.com/pin/55748341/", {
@@ -134,10 +134,6 @@ class SexcomPinExtractor(SexcomExtractor):
("https://www.sex.com/pin/55847384-very-nicely-animated/", {
"pattern": "ytdl:https://www.pornhub.com/embed/ph56ef24b6750f2",
}),
- # 404
- ("https://www.sex.com/pin/55847385/", {
- "count": 0,
- }),
)
def __init__(self, match):
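
The reworked URL extraction above drops the query string before unescaping HTML entities. By example (URL invented), assuming gallery_dl is importable:

    from gallery_dl import text

    raw = ("https://cdn.sex.com/images/example/2018/10/02/"
           "20037816.jpg?width=620&amp;height=675")
    print(text.unescape(raw.partition("?")[0]))
    # https://cdn.sex.com/images/example/2018/10/02/20037816.jpg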
diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py
index 298b7e0..31dbdad 100644
--- a/gallery_dl/extractor/tsumino.py
+++ b/gallery_dl/extractor/tsumino.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -52,7 +52,7 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
"title_en" : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou",
"title_jp" : "シコシコ大好きナイチンゲール + 会場限定おまけ本",
"gallery_id": 40996,
- "date" : "type:datetime",
+ "date" : "dt:2018-06-29 00:00:00",
"count" : 42,
"collection": "",
"artist" : ["Itou Life"],
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index a1f2199..0505fa9 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -310,7 +310,7 @@ class TumblrTagExtractor(TumblrExtractor):
def __init__(self, match):
TumblrExtractor.__init__(self, match)
- self.tag = text.unquote(match.group(3))
+ self.tag = text.unquote(match.group(3).replace("-", " "))
def posts(self):
return self.api.posts(self.blog, {"tag": self.tag})
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index dc558c0..2a04463 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -1,12 +1,12 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2019 Mike Fährmann
+# Copyright 2016-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://twitter.com/"""
+"""Extractors for https://twitter.com/"""
from .common import Extractor, Message
from .. import text, exception
@@ -21,8 +21,11 @@ class TwitterExtractor(Extractor):
directory_fmt = ("{category}", "{user[name]}")
filename_fmt = "{tweet_id}_{num}.{extension}"
archive_fmt = "{tweet_id}_{retweet_id}_{num}"
+ cookiedomain = ".twitter.com"
root = "https://twitter.com"
sizes = (":orig", ":large", ":medium", ":small")
+ user_agent = ("Mozilla/5.0 (Windows NT 6.1; WOW64; "
+ "Trident/7.0; rv:11.0) like Gecko")
def __init__(self, match):
Extractor.__init__(self, match)
@@ -32,7 +35,7 @@ class TwitterExtractor(Extractor):
self.retweets = self.config("retweets", True)
self.twitpic = self.config("twitpic", False)
self.content = self.config("content", False)
- self.videos = self.config("videos", False)
+ self.videos = self.config("videos", True)
if self.content:
self._emoji_sub = re.compile(
@@ -117,7 +120,8 @@ class TwitterExtractor(Extractor):
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)
- page = self.request(self.root + "/login").text
+ headers = {"User-Agent": self.user_agent}
+ page = self.request(self.root + "/login", headers=headers).text
pos = page.index('name="authenticity_token"')
token = text.extract(page, 'value="', '"', pos-80)[0]
@@ -131,11 +135,15 @@ class TwitterExtractor(Extractor):
"redirect_after_login" : "",
"remember_me" : "1",
}
- response = self.request(url, method="POST", data=data)
-
+ response = self.request(url, method="POST", headers=headers, data=data)
if "/error" in response.url:
raise exception.AuthenticationError()
- return self.session.cookies
+
+ return {
+ cookie.name: cookie.value
+ for cookie in self.session.cookies
+ if cookie.domain and "twitter.com" in cookie.domain
+ }
def _data_from_tweet(self, tweet):
extr = text.extract_from(tweet)
@@ -353,7 +361,11 @@ class TwitterTweetExtractor(TwitterExtractor):
# content with emoji, newlines, hashtags (#338)
("https://twitter.com/yumi_san0112/status/1151144618936823808", {
"options": (("content", True),),
- "keyword": "0b7a3d05607b480c1412dfd85f8606478313e7bf",
+ "keyword": {"content": (
+ "re:晴、お誕生日おめでとう🎉!\n実は下の名前が同じなので結構親近感ある"
+ "アイドルです✨\n今年の晴ちゃんめちゃくちゃ可愛い路線攻めてるから、そろ"
+ "そろまたかっこいい晴が見たいですねw\n#結城晴生誕祭2019\n#結城晴生誕祭"
+ )},
}),
# Reply to another tweet (#403)
("https://twitter.com/tyson_hesse/status/1103767554424598528", {
@@ -365,9 +377,12 @@ class TwitterTweetExtractor(TwitterExtractor):
"pattern": r"https://pbs.twimg.com/media/EAel0vUUYAAZ4Bq.jpg:orig",
}),
# quoted tweet (#526)
- ("https://twitter.com/Meiyu_miu/status/1070693241413021696", {
- "count": 4,
- "keyword": "0c627af2b8cdccc7e0da8fd221155c4a4a3141a8",
+ ("https://twitter.com/Pistachio/status/1222690391817932803", {
+ "pattern": r"https://pbs\.twimg\.com/media/EPfMfDUU8AAnByO\.jpg",
+ "keyword": {
+ "author": {"name": "Afro_Herper", "id": 786047748508221440},
+ "user" : {"name": "Pistachio" , "id": 3533231},
+ },
}),
# TwitPic embeds (#579)
("https://twitter.com/i/web/status/112900228289540096", {
@@ -384,11 +399,7 @@ class TwitterTweetExtractor(TwitterExtractor):
def tweets(self):
url = "{}/i/web/status/{}".format(self.root, self.tweet_id)
cookies = {"app_shell_visited": "1"}
- headers = {
- "Referer" : url,
- "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; "
- "Trident/7.0; rv:11.0) like Gecko",
- }
+ headers = {"User-Agent": self.user_agent, "Referer": url}
response = self.request(url, cookies=cookies, headers=headers)
if response.history and response.url == self.root + "/":
@@ -400,6 +411,81 @@ class TwitterTweetExtractor(TwitterExtractor):
return (page[beg:end],)
+
+
+class TwitterBookmarkExtractor(TwitterExtractor):
+ """Extractor for bookmarked tweets"""
+ subcategory = "bookmark"
+ pattern = r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com/i/bookmarks()"
+ test = ("https://twitter.com/i/bookmarks",)
+
+ def items(self):
+ self.login()
+ if not self.logged_in:
+ raise exception.AuthorizationError("Login required")
+ for cookie in self.session.cookies:
+ cookie.expires = None
+
+ url = "https://api.twitter.com/2/timeline/bookmark.json"
+ params = {
+ "include_profile_interstitial_type": "1",
+ "include_blocking": "1",
+ "include_blocked_by": "1",
+ "include_followed_by": "1",
+ "include_want_retweets": "1",
+ "include_mute_edge": "1",
+ "include_can_dm": "1",
+ "include_can_media_tag": "1",
+ "skip_status": "1",
+ "cards_platform": "Web-12",
+ "include_cards": "1",
+ "include_composer_source": "true",
+ "include_ext_alt_text": "true",
+ "include_reply_count": "1",
+ "tweet_mode": "extended",
+ "include_entities": "true",
+ "include_user_entities": "true",
+ "include_ext_media_color": "true",
+ "include_ext_media_availability": "true",
+ "send_error_codes": "true",
+ "simple_quoted_tweets": "true",
+ "count": "100",
+ "cursor": None,
+ "ext": "mediaStats%2CcameraMoment",
+ }
+ headers = {
+ "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR"
+ "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu"
+ "4FA33AGWWjCpTnA",
+ "Origin": self.root,
+ "Referer": self.root + "/i/bookmarks",
+ "x-csrf-token": self.session.cookies.get("ct0"),
+ "x-twitter-active-user": "yes",
+ "x-twitter-auth-type": "Auth2Session",
+ "x-twitter-client-language": "en",
+ }
+
+ while True:
+ response = self.request(
+ url, params=params, headers=headers, fatal=False)
+ if response.status_code >= 400:
+ raise exception.StopExtraction(response.text)
+ data = response.json()
+ tweets = data["globalObjects"]["tweets"]
+
+ if not tweets:
+ return
+ for tweet_id, tweet_data in tweets.items():
+ tweet_url = "{}/i/web/status/{}".format(self.root, tweet_id)
+ tweet_data["_extractor"] = TwitterTweetExtractor
+ yield Message.Queue, tweet_url, tweet_data
+
+ inst = data["timeline"]["instructions"][0]
+ for entry in inst["addEntries"]["entries"]:
+ if entry["entryId"].startswith("cursor-bottom-"):
+ params["cursor"] = \
+ entry["content"]["operation"]["cursor"]["value"]
+ break
+
+
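
The request loop above is plain cursor pagination: fetch a page, queue its tweets, then pull the next cursor out of the bottom timeline entry. Distilled to its shape, with the HTTP call left as a stand-in:

    def paginate(fetch, params):
        """Yield (tweet_id, tweet_data) pairs across all bookmark pages"""
        while True:
            data = fetch(params)               # one API request per page
            tweets = data["globalObjects"]["tweets"]
            if not tweets:                     # an empty page ends the loop
                return
            yield from tweets.items()
            entries = (data["timeline"]["instructions"]
                       [0]["addEntries"]["entries"])
            for entry in entries:              # advance via the bottom cursor
                if entry["entryId"].startswith("cursor-bottom-"):
                    params["cursor"] = \
                        entry["content"]["operation"]["cursor"]["value"]
                    break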
@memcache()
def _guest_token(extr, headers):
return extr.request(
diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py
index a24d3fe..a020064 100644
--- a/gallery_dl/extractor/vsco.py
+++ b/gallery_dl/extractor/vsco.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -34,6 +34,9 @@ class VscoExtractor(Extractor):
yield Message.Directory, {"user": self.user}
for img in self.images():
+ if not img or "responsive_url" not in img:
+ continue
+
if img["is_video"]:
if not videos:
continue
@@ -98,6 +101,8 @@ class VscoExtractor(Extractor):
@staticmethod
def _transform_media(media):
+ if "responsiveUrl" not in media:
+ return None
media["_id"] = media["id"]
media["is_video"] = media["isVideo"]
media["grid_name"] = media["gridName"]
@@ -111,18 +116,19 @@ class VscoExtractor(Extractor):
class VscoUserExtractor(VscoExtractor):
"""Extractor for images from a user on vsco.co"""
subcategory = "user"
- pattern = BASE_PATTERN + r"(?:/images(?:/\d+)?)?/?(?:$|[?#])"
+ pattern = BASE_PATTERN + r"(?:/gallery|/images(?:/\d+)?)?/?(?:$|[?#])"
test = (
- ("https://vsco.co/missuri/images/1", {
+ ("https://vsco.co/missuri/gallery", {
"pattern": r"https://image(-aws.+)?\.vsco\.co/[0-9a-f/]+/vsco\w+",
"range": "1-80",
"count": 80,
}),
+ ("https://vsco.co/missuri/images/1"),
("https://vsco.co/missuri"),
)
def images(self):
- url = "{}/{}/images/1".format(self.root, self.user)
+ url = "{}/{}/gallery".format(self.root, self.user)
data = self._extract_preload_state(url)
tkn = data["users"]["currentUser"]["tkn"]
@@ -186,7 +192,7 @@ class VscoImageExtractor(VscoExtractor):
"grid" : "erenyildiz",
"meta" : dict,
"tags" : list,
- "date" : "type:datetime",
+ "date" : "dt:2019-07-21 19:12:11",
"video" : False,
"width" : 1537,
"height": 1537,
diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py
index 737c253..043da0b 100644
--- a/gallery_dl/extractor/wallhaven.py
+++ b/gallery_dl/extractor/wallhaven.py
@@ -1,12 +1,12 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2019 Mike Fährmann
+# Copyright 2018-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://wallhaven.cc/"""
+"""Extractors for https://wallhaven.cc/"""
from .common import Extractor, Message
from .. import text
@@ -77,7 +77,7 @@ class WallhavenImageExtractor(WallhavenExtractor):
"group" : "Owner/Developer",
"username" : "AksumkA",
},
- "date" : "type:datetime",
+ "date" : "dt:2014-08-31 06:17:19",
"wh_category": "anime",
"views" : int,
"favorites" : int,
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index 49fa082..6a779d9 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -56,7 +56,7 @@ class WeiboExtractor(Extractor):
info = obj["page_info"]["media_info"]
url = info.get("stream_url_hd") or info.get("stream_url")
- if url and not info.get("goto"):
+ if url:
data = text.nameext_from_url(url, {
"num" : num,
"pid" : 0,
@@ -65,6 +65,10 @@ class WeiboExtractor(Extractor):
"height": 0,
"status": status,
})
+ if data["extension"] == "m3u8":
+ url = "ytdl:" + url
+ data["extension"] = "mp4"
+ data["_ytdl_extra"] = {"protocol": "m3u8_native"}
yield Message.Url, url, data
if self.retweets and "retweeted_status" in obj:
diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py
index 62acb28..0422589 100644
--- a/gallery_dl/extractor/xhamster.py
+++ b/gallery_dl/extractor/xhamster.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -49,7 +49,7 @@ class XhamsterGalleryExtractor(XhamsterExtractor):
"pageURL": str,
"thumbURL": str,
"gallery": {
- "date": "type:datetime",
+ "date": "dt:2019-04-16 00:07:31",
"description": "",
"dislikes": int,
"id": 11748968,
diff --git a/gallery_dl/extractor/yaplog.py b/gallery_dl/extractor/yaplog.py
deleted file mode 100644
index b07ba4b..0000000
--- a/gallery_dl/extractor/yaplog.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2019 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://yaplog.jp/"""
-
-from .common import Extractor, Message, AsynchronousMixin
-from .. import text, util
-
-
-BASE_PATTERN = r"(?:https?://)?(?:www\.)?yaplog\.jp/([\w-]+)"
-
-
-class YaplogExtractor(AsynchronousMixin, Extractor):
- """Base class for yaplog extractors"""
- category = "yaplog"
- root = "https://yaplog.jp"
- filename_fmt = "{post[id]}_{post[title]}_{id}.{extension}"
- directory_fmt = ("{category}", "{post[user]}")
- archive_fmt = "{post[user]}_{id}"
-
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.user = match.group(1)
-
- def items(self):
- yield Message.Version, 1
- for post, urls in self.posts():
- yield Message.Directory, {"post": post}
- for num, url in enumerate(urls, 1):
- page = self.request(url).text if num > 1 else url
- iurl = text.extract(page, '<img src="', '"')[0]
- if iurl[0] == "/":
- iurl = text.urljoin(self.root, iurl)
- name, _, ext = iurl.rpartition("/")[2].rpartition(".")
- iid = name.rpartition("_")[0] or name
- image = {
- "url" : iurl,
- "num" : num,
- "id" : text.parse_int(iid, iid),
- "filename" : name,
- "extension": ext,
- "post" : post,
- }
- yield Message.Url, iurl, image
-
- def posts(self):
- """Return an iterable with (data, image page URLs) tuples"""
-
- def _parse_post(self, url):
- page = self.request(url).text
- title, pos = text.extract(page, 'class="title">', '<')
- date , pos = text.extract(page, 'class="date">' , '<', pos)
- pid , pos = text.extract(page, '/archive/' , '"', pos)
- prev , pos = text.extract(page, 'class="last"><a href="', '"', pos)
-
- urls = list(text.extract_iter(page, '<li><a href="', '"', pos))
- if urls:
- urls[0] = page # cache HTML of first page
-
- if len(urls) == 24 and text.extract(page, '(1/', ')')[0] != '24':
- # there are a maximum of 24 image entries in an /image/ page
- # -> search /archive/ page for the rest
- url = "{}/{}/archive/{}".format(self.root, self.user, pid)
- page = self.request(url).text
-
- base = "{}/{}/image/{}/".format(self.root, self.user, pid)
- for part in util.advance(text.extract_iter(
- page, base, '"', pos), 24):
- urls.append(base + part)
-
- return prev, urls, {
- "id" : text.parse_int(pid),
- "title": text.unescape(title[:-3]),
- "user" : self.user,
- "date" : text.parse_datetime(date, "%B %d [%a], %Y, %H:%M"),
- }
-
-
-class YaplogBlogExtractor(YaplogExtractor):
- """Extractor for a user's blog on yaplog.jp"""
- subcategory = "blog"
- pattern = BASE_PATTERN + r"/?(?:$|[?&#])"
- test = ("https://yaplog.jp/omitakashi3", {
- "pattern": r"https://img.yaplog.jp/img/18/pc/o/m/i/omitakashi3/0/",
- "count": ">= 2",
- })
-
- def posts(self):
- url = "{}/{}/image/".format(self.root, self.user)
- while url:
- url, images, data = self._parse_post(url)
- yield data, images
-
-
-class YaplogPostExtractor(YaplogExtractor):
- """Extractor for images from a blog post on yaplog.jp"""
- subcategory = "post"
- pattern = BASE_PATTERN + r"/(?:archive|image)/(\d+)"
- test = (
- ("https://yaplog.jp/imamiami0726/image/1299", {
- "url": "896cae20fa718735a57e723c48544e830ff31345",
- "keyword": "22df8ad6cb534514c6bb2ff000381d156769a620",
- }),
- # complete image URLs (#443)
- ("https://yaplog.jp/msjane/archive/246", {
- "pattern": r"https://yaplog.jp/cv/msjane/img/246/img\d+_t.jpg"
- }),
- # empty post (#443)
- ("https://yaplog.jp/f_l_a_s_c_o/image/872", {
- "count": 0,
- }),
- # blog names with '-' (#443)
- ("https://yaplog.jp/a-pierrot-o/image/3946/22779"),
- )
-
- def __init__(self, match):
- YaplogExtractor.__init__(self, match)
- self.post_id = match.group(2)
-
- def posts(self):
- url = "{}/{}/image/{}".format(self.root, self.user, self.post_id)
- _, images, data = self._parse_post(url)
- return ((data, images),)
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index c717dc2..6ba2572 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -182,7 +182,14 @@ class DownloadJob(Job):
self.downloaders = {}
self.postprocessors = None
self.out = output.select()
- self.visited = parent.visited if parent else set()
+
+ if parent:
+ self.visited = parent.visited
+ pfmt = parent.pathfmt
+ if pfmt and parent.extractor.config("parent-directory"):
+ self.extractor._parentdir = pfmt.directory
+ else:
+ self.visited = set()
def handle_url(self, url, kwdict, fallback=None):
"""Download the resource specified in 'url'"""
diff --git a/gallery_dl/output.py b/gallery_dl/output.py
index 38e2f60..f084950 100644
--- a/gallery_dl/output.py
+++ b/gallery_dl/output.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2019 Mike Fährmann
+# Copyright 2015-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -81,6 +81,36 @@ def initialize_logging(loglevel):
return logging.getLogger("gallery-dl")
+def configure_logging(loglevel):
+ root = logging.getLogger()
+ minlevel = loglevel
+
+ # stream logging handler
+ handler = root.handlers[0]
+ opts = config.interpolate(("output",), "log")
+ if opts:
+ if isinstance(opts, str):
+ opts = {"format": opts}
+ if handler.level == LOG_LEVEL and "level" in opts:
+ handler.setLevel(opts["level"])
+ if "format" in opts or "format-date" in opts:
+ handler.setFormatter(Formatter(
+ opts.get("format", LOG_FORMAT),
+ opts.get("format-date", LOG_FORMAT_DATE),
+ ))
+ if minlevel > handler.level:
+ minlevel = handler.level
+
+ # file logging handler
+ handler = setup_logging_handler("logfile", lvl=loglevel)
+ if handler:
+ root.addHandler(handler)
+ if minlevel > handler.level:
+ minlevel = handler.level
+
+ root.setLevel(minlevel)
+
+
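
configure_logging() consumes two keys from the ("output",) config section: "log", which may be a plain format string or a dict carrying "level", "format", and "format-date", and "logfile" for the optional file handler. Illustrative values, set programmatically:

    from gallery_dl import config

    config.set(("output",), "log", {
        "level": "debug",
        "format": "[{name}][{levelname}] {message}",
        "format-date": "%H:%M:%S",
    })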
def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL):
"""Setup a new logging handler"""
opts = config.interpolate(("output",), key)
@@ -112,22 +142,6 @@ def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL):
return handler
-def configure_logging_handler(key, handler):
- """Configure a logging handler"""
- opts = config.interpolate(("output",), key)
- if not opts:
- return
- if isinstance(opts, str):
- opts = {"format": opts}
- if handler.level == LOG_LEVEL and "level" in opts:
- handler.setLevel(opts["level"])
- if "format" in opts or "format-date" in opts:
- handler.setFormatter(Formatter(
- opts.get("format", LOG_FORMAT),
- opts.get("format-date", LOG_FORMAT_DATE),
- ))
-
-
# --------------------------------------------------------------------
# Utility functions
diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py
index 162eb9e..706e706 100644
--- a/gallery_dl/postprocessor/ugoira.py
+++ b/gallery_dl/postprocessor/ugoira.py
@@ -42,7 +42,7 @@ class UgoiraPP(PostProcessor):
if arg == "-vcodec" or arg in ("-c", "-codec") and (
not stream or stream.partition(":")[0] in ("v", "V")):
vcodec = self.args[index + 1]
- # use filter if libx264/5 is explicitly or implicitly used
+ # use filter when using libx264/5
self.prevent_odd = (
vcodec in ("libx264", "libx265") or
not vcodec and self.extension.lower() in ("mp4", "mkv"))
@@ -91,12 +91,12 @@ class UgoiraPP(PostProcessor):
# collect command-line arguments
args = [self.ffmpeg]
if rate_in:
- args += ["-r", str(rate_in)]
- args += ["-i", ffconcat]
+ args += ("-r", str(rate_in))
+ args += ("-i", ffconcat)
if rate_out:
- args += ["-r", str(rate_out)]
+ args += ("-r", str(rate_out))
if self.prevent_odd:
- args += ["-vf", "crop=iw-mod(iw\\,2):ih-mod(ih\\,2)"]
+ args += ("-vf", "crop=iw-mod(iw\\,2):ih-mod(ih\\,2)")
if self.args:
args += self.args
self.log.debug("ffmpeg args: %s", args)
@@ -106,8 +106,8 @@ class UgoiraPP(PostProcessor):
try:
if self.twopass:
if "-f" not in args:
- args += ["-f", self.extension]
- args += ["-passlogfile", tempdir + "/ffmpeg2pass", "-pass"]
+ args += ("-f", self.extension)
+ args += ("-passlogfile", tempdir + "/ffmpeg2pass", "-pass")
self._exec(args + ["1", "-y", os.devnull])
self._exec(args + ["2", pathfmt.realpath])
else:
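
Switching the right-hand side of += from lists to tuples is behavior-neutral (list += accepts any iterable) and lets CPython reuse a constant tuple instead of building a fresh list on each call. The two-pass assembly above, distilled with invented paths:

    import os

    args = ["ffmpeg", "-r", "30", "-i", "frames.ffconcat"]
    args += ("-f", "webm")                       # a tuple extends a list in place
    args += ("-passlogfile", "/tmp/ffmpeg2pass", "-pass")
    pass1 = args + ["1", "-y", os.devnull]       # first pass discards its output
    pass2 = args + ["2", "/path/to/output.webm"] # second pass writes the file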
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
index 72dad5b..a3f4e0a 100644
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -15,6 +15,8 @@ import datetime
import urllib.parse
+HTML_RE = re.compile("<[^>]+>")
+
INVALID_XML_CHARS = (
"\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07",
"\x08", "\x0b", "\x0c", "\x0e", "\x0f", "\x10", "\x11", "\x12",
@@ -39,7 +41,7 @@ def clean_xml(xmldata, repl=""):
def remove_html(txt, repl=" ", sep=" "):
"""Remove html-tags from a string"""
try:
- txt = re.sub("<[^>]+>", repl, txt)
+ txt = HTML_RE.sub(repl, txt)
except TypeError:
return ""
if sep:
@@ -51,7 +53,7 @@ def split_html(txt, sep=None):
"""Split input string by html-tags"""
try:
return [
- x.strip() for x in re.split("<[^>]+>", txt)
+ x.strip() for x in HTML_RE.split(txt)
if x and not x.isspace()
]
except TypeError:
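
Hoisting the pattern into a module-level HTML_RE means remove_html() and split_html() skip even re's internal pattern-cache lookup on every call; behavior is unchanged:

    import re

    HTML_RE = re.compile("<[^>]+>")

    print(HTML_RE.sub(" ", "<p>hello<br/>world</p>"))  # ' hello world '
    print(HTML_RE.split("<p>hello<br/>world</p>"))     # ['', 'hello', 'world', '']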
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 13bf80e..232047c 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -21,6 +21,7 @@ import datetime
import operator
import itertools
import urllib.parse
+from http.cookiejar import Cookie
from email.utils import mktime_tz, parsedate_tz
from . import text, exception
@@ -135,6 +136,67 @@ def remove_directory(path):
pass
+def load_cookiestxt(fp):
+ """Parse a Netscape cookies.txt file and return a list of its Cookies"""
+ cookies = []
+
+ for line in fp:
+
+ line = line.lstrip()
+ # strip '#HttpOnly_'
+ if line.startswith("#HttpOnly_"):
+ line = line[10:]
+ # ignore empty lines and comments
+ if not line or line[0] in ("#", "$"):
+ continue
+ # strip trailing '\n'
+ if line[-1] == "\n":
+ line = line[:-1]
+
+ domain, domain_specified, path, secure, expires, name, value = \
+ line.split("\t")
+ if not name:
+ name = value
+ value = None
+
+ cookies.append(Cookie(
+ 0, name, value,
+ None, False,
+ domain,
+ domain_specified == "TRUE",
+ domain.startswith("."),
+ path, False,
+ secure == "TRUE",
+ None if expires == "0" or not expires else expires,
+ False, None, None, {},
+ ))
+
+ return cookies
+
+
+def save_cookiestxt(fp, cookies):
+ """Write 'cookies' in Netscape cookies.txt format to 'fp'"""
+ fp.write("# Netscape HTTP Cookie File\n\n")
+
+ for cookie in cookies:
+ if cookie.value is None:
+ name = ""
+ value = cookie.name
+ else:
+ name = cookie.name
+ value = cookie.value
+
+ fp.write("\t".join((
+ cookie.domain,
+ "TRUE" if cookie.domain.startswith(".") else "FALSE",
+ cookie.path,
+ "TRUE" if cookie.secure else "FALSE",
+ "0" if cookie.expires is None else str(cookie.expires),
+ name,
+ value,
+ )) + "\n")
+
+
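
A round-trip sketch for the two helpers above, assuming gallery_dl is importable; file names invented:

    from gallery_dl import util

    with open("cookies.txt") as fp:
        cookies = util.load_cookiestxt(fp)

    for c in cookies:
        print(c.domain, c.name, c.value)

    with open("cookies-copy.txt", "w") as fp:
        util.save_cookiestxt(fp, cookies)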
def code_to_language(code, default=None):
"""Map an ISO 639-1 language code to its actual name"""
return CODES.get((code or "").lower(), default)
@@ -419,63 +481,85 @@ class Formatter():
self.format_map = self.fields[0][1]
else:
self.format_map = lambda _: format_string
- del self.result
- del self.fields
+ del self.result, self.fields
- def format_map(self, kwargs):
- """Apply 'kwargs' to the initial format_string and return its result"""
+ def format_map(self, kwdict):
+ """Apply 'kwdict' to the initial format_string and return its result"""
+ result = self.result
for index, func in self.fields:
- self.result[index] = func(kwargs)
- return "".join(self.result)
+ result[index] = func(kwdict)
+ return "".join(result)
def _field_access(self, field_name, format_spec, conversion):
- first, rest = _string.formatter_field_name_split(field_name)
+ fmt = self._parse_format_spec(format_spec, conversion)
+
+ if "|" in field_name:
+ return self._apply_list([
+ self._parse_field_name(fn)
+ for fn in field_name.split("|")
+ ], fmt)
+ else:
+ key, funcs = self._parse_field_name(field_name)
+ if funcs:
+ return self._apply(key, funcs, fmt)
+ return self._apply_simple(key, fmt)
+ @staticmethod
+ def _parse_field_name(field_name):
+ first, rest = _string.formatter_field_name_split(field_name)
funcs = []
+
for is_attr, key in rest:
if is_attr:
func = operator.attrgetter
- elif ":" in key:
- func = self._slicegetter
else:
func = operator.itemgetter
+ try:
+ if ":" in key:
+ start, _, stop = key.partition(":")
+ stop, _, step = stop.partition(":")
+ start = int(start) if start else None
+ stop = int(stop) if stop else None
+ step = int(step) if step else None
+ key = slice(start, stop, step)
+ except TypeError:
+ pass # key is an integer
+
funcs.append(func(key))
- if conversion:
- funcs.append(self.CONVERSIONS[conversion])
+ return first, funcs
- if format_spec:
- if format_spec[0] == "?":
- func = self._format_optional
- elif format_spec[0] == "L":
- func = self._format_maxlen
- elif format_spec[0] == "J":
- func = self._format_join
- elif format_spec[0] == "R":
- func = self._format_replace
- else:
- func = self._format_default
- fmt = func(format_spec)
- else:
- fmt = str
+ def _parse_format_spec(self, format_spec, conversion):
+ fmt = self._build_format_func(format_spec)
+ if not conversion:
+ return fmt
- if funcs:
- return self._apply(first, funcs, fmt)
- return self._apply_simple(first, fmt)
+ conversion = self.CONVERSIONS[conversion]
+ if fmt is format:
+ return conversion
+ else:
+ def chain(obj):
+ return fmt(conversion(obj))
+ return chain
- def _apply_simple(self, key, fmt):
- def wrap(obj):
- if key in obj:
- obj = obj[key]
- else:
- obj = self.default
- return fmt(obj)
- return wrap
+ def _build_format_func(self, format_spec):
+ if format_spec:
+ fmt = format_spec[0]
+ if fmt == "?":
+ return self._parse_optional(format_spec)
+ if fmt == "L":
+ return self._parse_maxlen(format_spec)
+ if fmt == "J":
+ return self._parse_join(format_spec)
+ if fmt == "R":
+ return self._parse_replace(format_spec)
+ return self._default_format(format_spec)
+ return format
def _apply(self, key, funcs, fmt):
- def wrap(obj):
+ def wrap(kwdict):
try:
- obj = obj[key]
+ obj = kwdict[key]
for func in funcs:
obj = func(obj)
except Exception:
@@ -483,54 +567,66 @@ class Formatter():
return fmt(obj)
return wrap
- @staticmethod
- def _slicegetter(key):
- start, _, stop = key.partition(":")
- stop, _, step = stop.partition(":")
- start = int(start) if start else None
- stop = int(stop) if stop else None
- step = int(step) if step else None
- return operator.itemgetter(slice(start, stop, step))
+ def _apply_simple(self, key, fmt):
+ def wrap(kwdict):
+ return fmt(kwdict[key] if key in kwdict else self.default)
+ return wrap
- @staticmethod
- def _format_optional(format_spec):
- def wrap(obj):
- if not obj:
- return ""
- return before + format(obj, format_spec) + after
+ def _apply_list(self, lst, fmt):
+ def wrap(kwdict):
+ for key, funcs in lst:
+ try:
+ obj = kwdict[key]
+ for func in funcs:
+ obj = func(obj)
+ if obj is not None:
+ break
+ except Exception:
+ pass
+ else:
+ obj = self.default
+ return fmt(obj)
+ return wrap
+
+ def _parse_optional(self, format_spec):
before, after, format_spec = format_spec.split("/", 2)
before = before[1:]
- return wrap
+ fmt = self._build_format_func(format_spec)
- @staticmethod
- def _format_maxlen(format_spec):
- def wrap(obj):
- obj = format(obj, format_spec)
- return obj if len(obj) <= maxlen else replacement
+ def optional(obj):
+ return before + fmt(obj) + after if obj else ""
+ return optional
+
+ def _parse_maxlen(self, format_spec):
maxlen, replacement, format_spec = format_spec.split("/", 2)
maxlen = text.parse_int(maxlen[1:])
- return wrap
+ fmt = self._build_format_func(format_spec)
- @staticmethod
- def _format_join(format_spec):
- def wrap(obj):
- obj = separator.join(obj)
- return format(obj, format_spec)
+ def mlen(obj):
+ obj = fmt(obj)
+ return obj if len(obj) <= maxlen else replacement
+ return mlen
+
+ def _parse_join(self, format_spec):
separator, _, format_spec = format_spec.partition("/")
separator = separator[1:]
- return wrap
+ fmt = self._build_format_func(format_spec)
- @staticmethod
- def _format_replace(format_spec):
- def wrap(obj):
- obj = obj.replace(old, new)
- return format(obj, format_spec)
+ def join(obj):
+ return fmt(separator.join(obj))
+ return join
+
+ def _parse_replace(self, format_spec):
old, new, format_spec = format_spec.split("/", 2)
old = old[1:]
- return wrap
+ fmt = self._build_format_func(format_spec)
+
+ def replace(obj):
+ return fmt(obj.replace(old, new))
+ return replace
@staticmethod
- def _format_default(format_spec):
+ def _default_format(format_spec):
def wrap(obj):
return format(obj, format_spec)
return wrap
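
Taken together, the Formatter rewrite separates field-name parsing (attribute access, item keys, slices, and the new "|" alternation) from format-spec parsing (the "?", "L", "J", "R" prefixes), so the two combine freely. What that accepts, by example with invented values and the semantics shown above:

    from gallery_dl.util import Formatter

    kwdict = {"title": "A Very Long Example Title",
              "tags" : ["a", "b", "c"]}

    print(Formatter("{tags:J, /}").format_map(kwdict))            # a, b, c
    print(Formatter("{title:L10/too long/}").format_map(kwdict))  # too long
    print(Formatter("{title:R /_/}").format_map(kwdict))          # A_Very_Long_Example_Title
    print(Formatter("{name|title}").format_map(kwdict))           # falls back to 'title'
    print(Formatter("{title[:6]}").format_map(kwdict))            # A Very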
@@ -565,12 +661,14 @@ class PathFormat():
self.delete = False
self.path = self.realpath = self.temppath = ""
- basedir = expand_path(
- extractor.config("base-directory", (".", "gallery-dl")))
- if os.altsep and os.altsep in basedir:
- basedir = basedir.replace(os.altsep, os.sep)
- if basedir[-1] != os.sep:
- basedir += os.sep
+ basedir = extractor._parentdir
+ if not basedir:
+ basedir = expand_path(
+ extractor.config("base-directory", (".", "gallery-dl")))
+ if os.altsep and os.altsep in basedir:
+ basedir = basedir.replace(os.altsep, os.sep)
+ if basedir[-1] != os.sep:
+ basedir += os.sep
self.basedirectory = basedir
restrict = extractor.config("path-restrict", "auto")
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 36d729e..9171f15 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.12.3"
+__version__ = "1.13.2"