Diffstat:
-rw-r--r--  CHANGELOG.md                               |  78
-rw-r--r--  PKG-INFO                                   |   6
-rw-r--r--  README.rst                                 |   4
-rw-r--r--  data/man/gallery-dl.1                      |   2
-rw-r--r--  data/man/gallery-dl.conf.5                 | 148
-rw-r--r--  gallery_dl.egg-info/PKG-INFO               |   6
-rw-r--r--  gallery_dl.egg-info/SOURCES.txt            |   2
-rw-r--r--  gallery_dl/extractor/8chan.py              |   6
-rw-r--r--  gallery_dl/extractor/__init__.py           |   2
-rw-r--r--  gallery_dl/extractor/ao3.py                |  45
-rw-r--r--  gallery_dl/extractor/blogger.py            |   4
-rw-r--r--  gallery_dl/extractor/boosty.py             | 357
-rw-r--r--  gallery_dl/extractor/bunkr.py              |   2
-rw-r--r--  gallery_dl/extractor/chevereto.py          |   4
-rw-r--r--  gallery_dl/extractor/civitai.py            | 174
-rw-r--r--  gallery_dl/extractor/cohost.py             |   3
-rw-r--r--  gallery_dl/extractor/danbooru.py           |   4
-rw-r--r--  gallery_dl/extractor/deviantart.py         | 308
-rw-r--r--  gallery_dl/extractor/fanbox.py             |  28
-rw-r--r--  gallery_dl/extractor/flickr.py             |   7
-rw-r--r--  gallery_dl/extractor/gelbooru_v02.py       |   5
-rw-r--r--  gallery_dl/extractor/generic.py            |  51
-rw-r--r--  gallery_dl/extractor/imagefap.py           |   3
-rw-r--r--  gallery_dl/extractor/instagram.py          |   3
-rw-r--r--  gallery_dl/extractor/komikcast.py          |  14
-rw-r--r--  gallery_dl/extractor/lolisafe.py           |   4
-rw-r--r--  gallery_dl/extractor/myhentaigallery.py    |  10
-rw-r--r--  gallery_dl/extractor/newgrounds.py         |  10
-rw-r--r--  gallery_dl/extractor/oauth.py              |   2
-rw-r--r--  gallery_dl/extractor/patreon.py            |  58
-rw-r--r--  gallery_dl/extractor/pixiv.py              | 300
-rw-r--r--  gallery_dl/extractor/pornhub.py            |  39
-rw-r--r--  gallery_dl/extractor/pururin.py            |  72
-rw-r--r--  gallery_dl/extractor/rule34us.py           |   2
-rw-r--r--  gallery_dl/extractor/shimmie2.py           |  37
-rw-r--r--  gallery_dl/extractor/twitter.py            |   3
-rw-r--r--  gallery_dl/postprocessor/ugoira.py         |  14
-rw-r--r--  gallery_dl/text.py                         |  45
-rw-r--r--  gallery_dl/util.py                         |   6
-rw-r--r--  gallery_dl/version.py                      |   2
-rw-r--r--  test/test_formatter.py                     |   2
-rw-r--r--  test/test_results.py                       |   7
-rw-r--r--  test/test_text.py                          |  22
43 files changed, 1449 insertions(+), 452 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ab8f174..bc6a301 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,33 +1,57 @@
-## 1.27.5 - 2024-09-28
+## 1.27.6 - 2024-10-11
### Extractors
#### Additions
-- [ao3] add support ([#6013](https://github.com/mikf/gallery-dl/issues/6013))
-- [civitai] add support ([#3706](https://github.com/mikf/gallery-dl/issues/3706), [#3787](https://github.com/mikf/gallery-dl/issues/3787), [#4129](https://github.com/mikf/gallery-dl/issues/4129), [#5995](https://github.com/mikf/gallery-dl/issues/5995), [#6220](https://github.com/mikf/gallery-dl/issues/6220))
-- [cohost] add support ([#4483](https://github.com/mikf/gallery-dl/issues/4483), [#6191](https://github.com/mikf/gallery-dl/issues/6191))
+- [ao3] add `subscriptions` extractor ([#6247](https://github.com/mikf/gallery-dl/issues/6247))
+- [boosty] add support ([#2387](https://github.com/mikf/gallery-dl/issues/2387))
+- [civitai] add `post` extractors ([#6279](https://github.com/mikf/gallery-dl/issues/6279))
+- [pixiv] support unlisted artworks ([#5162](https://github.com/mikf/gallery-dl/issues/5162))
#### Fixes
-- [8chan] update `TOS` cookie name
-- [deviantart] work around OAuth API returning empty journal texts ([#6196](https://github.com/mikf/gallery-dl/issues/6196), [#6207](https://github.com/mikf/gallery-dl/issues/6207), [#5916](https://github.com/mikf/gallery-dl/issues/5916))
-- [weasyl:favorite] fix pagination ([#6113](https://github.com/mikf/gallery-dl/issues/6113))
+- [cohost] sanitize default filenames ([#6262](https://github.com/mikf/gallery-dl/issues/6262))
+ - limit `headline` length
+ - remove `plainTextBody`
+- [deviantart] fix & improve journal/literature extraction ([#6254](https://github.com/mikf/gallery-dl/issues/6254), [#6207](https://github.com/mikf/gallery-dl/issues/6207), [#6196](https://github.com/mikf/gallery-dl/issues/6196))
+ - extract journal HTML from webpage if possible
+ - support converting `tiptap` markup to HTML
+- [deviantart] fix `stash` folder extraction
+- [flickr] update default API credentials ([#6300](https://github.com/mikf/gallery-dl/issues/6300))
+- [flickr] fix `ZeroDivisionError` ([#6252](https://github.com/mikf/gallery-dl/issues/6252))
+- [imagefap] fix `{num}` in single image default filenames
+- [myhentaigallery] fix `tags` extraction
+- [patreon] extract `attachments_media` files ([#6241](https://github.com/mikf/gallery-dl/issues/6241), [#6268](https://github.com/mikf/gallery-dl/issues/6268))
+- [pixiv] implement workaround for `limit_sanity_level` works ([#4327](https://github.com/mikf/gallery-dl/issues/4327), [#4747](https://github.com/mikf/gallery-dl/issues/4747), [#5054](https://github.com/mikf/gallery-dl/issues/5054), [#5435](https://github.com/mikf/gallery-dl/issues/5435), [#5651](https://github.com/mikf/gallery-dl/issues/5651), [#5655](https://github.com/mikf/gallery-dl/issues/5655))
+- [pornhub] fix `KeyError` when album images are missing ([#6299](https://github.com/mikf/gallery-dl/issues/6299))
+- [rule34us] fix extraction ([#6289](https://github.com/mikf/gallery-dl/issues/6289))
+- [8chan] set TOS cookie for current and previous day
#### Improvements
-- [bluesky] support video downloads ([#6183](https://github.com/mikf/gallery-dl/issues/6183))
-- [deviantart] add `previews` option ([#3782](https://github.com/mikf/gallery-dl/issues/3782), [#6124](https://github.com/mikf/gallery-dl/issues/6124))
-- [deviantart] warn about empty journal texts ([#5916](https://github.com/mikf/gallery-dl/issues/5916))
-- [inkbunny:favorite] update default directory ([#6115](https://github.com/mikf/gallery-dl/issues/6115))
-- [jpgfish] update domain to `jpg5.su` ([#6231](https://github.com/mikf/gallery-dl/issues/6231))
-- [skeb] prevent 429 errors and need for `request_key` cookie
-- [weasyl:favorite] support readable URL format ([#6113](https://github.com/mikf/gallery-dl/issues/6113))
-- [wikimedia] automatically detect API endpoint when none is defined
-- [zzup] support `up.zzup.com` galleries ([#6181](https://github.com/mikf/gallery-dl/issues/6181))
+- [bunkr] support `bunkr.pk` URLs ([#6272](https://github.com/mikf/gallery-dl/issues/6272))
+- [civitai] use tRPC API by default ([#6279](https://github.com/mikf/gallery-dl/issues/6279))
+- [civitai] improve default archive format ([#6302](https://github.com/mikf/gallery-dl/issues/6302))
+- [komikcast] update domain to `komikcast.cz`
+- [newgrounds] detect more comment embeds ([#6253](https://github.com/mikf/gallery-dl/issues/6253))
+- [newgrounds] add more fallback URL formats for `art-images` files
+- [oauth] prevent empty browser names
+- [patreon] use mobile UA ([#6241](https://github.com/mikf/gallery-dl/issues/6241), [#6239](https://github.com/mikf/gallery-dl/issues/6239), [#6140](https://github.com/mikf/gallery-dl/issues/6140))
+- [patreon] handle suspended accounts
+- [pixiv] detect works requiring `My pixiv` access
+#### Metadata
+- [civitai] ensure image files have an `id` ([#6251](https://github.com/mikf/gallery-dl/issues/6251))
+- [gelbooru_v02] unescape HTML entities in categorized tags
+- [generic] ensure `path` metadata is always defined
+- [pixiv] retrieve `caption` from AJAX API when empty ([#4327](https://github.com/mikf/gallery-dl/issues/4327), [#5191](https://github.com/mikf/gallery-dl/issues/5191))
+#### Options
+- [fanbox] add `comments` option, extend `metadata` option ([#6287](https://github.com/mikf/gallery-dl/issues/6287))
+- [pixiv] add `comments` option ([#6287](https://github.com/mikf/gallery-dl/issues/6287))
+#### Removals
+- [blogger] remove `micmicidol.club`
+- [chevereto] remove `deltaporno.com`
+- [lolisafe] remove `xbunkr.com`
+- [pururin] remove module
+- [shimmie2] remove `loudbooru.com`
### Post Processors
-- [ugoira] implement storing "original" frames in ZIP archives ([#6147](https://github.com/mikf/gallery-dl/issues/6147))
-- [ugoira] fix `KeyError: '_ugoira_frame_index'` ([#6154](https://github.com/mikf/gallery-dl/issues/6154))
-### Formatter
-- add `L` conversion - returns the length of a value
-- allow accessing `util.NONE` via global `_nul`
+- [ugoira] fix `BadZipFile` exceptions ([#6285](https://github.com/mikf/gallery-dl/issues/6285))
+- [ugoira] catch all exceptions when extracting ZIP archives ([#6285](https://github.com/mikf/gallery-dl/issues/6285))
+- [ugoira] forward frame data as `_ugoira_frame_data` ([#6154](https://github.com/mikf/gallery-dl/issues/6154), [#6285](https://github.com/mikf/gallery-dl/issues/6285))
### Miscellaneous
-- [cookies] add `cookies-select` option
-- [cookies:firefox] support using domain & container filters together
-- [docker] prevent errors in Dockerfile build
-- [tests] make `#category` result entries optional
-- allow filtering `--list-extractors` results
-- implement alternatives for deprecated `utc` datetime functions
+- [build] remove setuptools and requests version restrictions
+- [docker] build from `python:3.12-alpine`
+- [text] improve `parse_query()` performance
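For reference, the new per-extractor options introduced in this release can be enabled together in a gallery-dl configuration file. A minimal sketch (option names are taken from the changelog entries above; `pixiv.sanity` already defaults to `true` and is shown only for illustration):

    {
        "extractor": {
            "fanbox": {"comments": true},
            "pixiv":  {"comments": true, "sanity": true}
        }
    }
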
diff --git a/PKG-INFO b/PKG-INFO
index 32ff8bc..27d0dd4 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gallery_dl
-Version: 1.27.5
+Version: 1.27.6
Summary: Command-line program to download image galleries and collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -114,9 +114,9 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.5/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.6/gallery-dl.exe>`__
(Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.5/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.6/gallery-dl.bin>`__
Nightly Builds
diff --git a/README.rst b/README.rst
index d0e4a72..fbb7fa5 100644
--- a/README.rst
+++ b/README.rst
@@ -74,9 +74,9 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.5/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.6/gallery-dl.exe>`__
(Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.5/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.6/gallery-dl.bin>`__
Nightly Builds
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index 06effd6..3fedff4 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2024-09-28" "1.27.5" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2024-10-11" "1.27.6" "gallery-dl Manual"
.\" disable hyphenation
.nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index a36e108..ba4bb3e 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2024-09-28" "1.27.5" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2024-10-11" "1.27.6" "gallery-dl Manual"
.\" disable hyphenation
.nh
.\" disable justification (adjust text to left margin only)
@@ -772,7 +772,7 @@ Setting this value to \f[I]"browser"\f[] will try to automatically detect
and use the User-Agent used by the system's default browser.
Note: This option has no effect on
-pixiv, e621, and mangadex
+pixiv, e621, mangadex, and patreon
extractors, as these need specific values to function correctly.
@@ -782,7 +782,7 @@ extractors, as these need specific values to function correctly.
.IP "Default:" 9
.br
-* \f[I]"firefox"\f[]: \f[I]artstation\f[], \f[I]mangasee\f[], \f[I]patreon\f[], \f[I]pixiv:series\f[], \f[I]twitter\f[]
+* \f[I]"firefox"\f[]: \f[I]artstation\f[], \f[I]mangasee\f[], \f[I]twitter\f[]
.br
* \f[I]null\f[]: otherwise
@@ -868,7 +868,7 @@ to be passed to
.IP "Default:" 9
.br
-* \f[I]false\f[]: \f[I]artstation\f[], \f[I]patreon\f[], \f[I]pixiv:series\f[]
+* \f[I]false\f[]: \f[I]artstation\f[]
.br
* \f[I]true\f[]: otherwise
@@ -1761,6 +1761,63 @@ Process reposts.
Download videos.
+.SS extractor.boosty.allowed
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+Request only available posts.
+
+
+.SS extractor.boosty.bought
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Request only purchased posts for \f[I]feed\f[] results.
+
+
+.SS extractor.boosty.metadata
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Provide detailed \f[I]user\f[] metadata.
+
+
+.SS extractor.boosty.videos
+.IP "Type:" 6
+.br
+* \f[I]bool\f[]
+.br
+* \f[I]list\f[] of \f[I]strings\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Example:" 4
+["full_hd", "high", "medium"]
+
+.IP "Description:" 4
+Download videos.
+
+If this is a \f[I]list\f[], it selects which format to try to download.
+.br
+Possibly available formats are
+.br
+\f[I]"quad_hd"\f[], \f[I]"ultra_hd"\f[], \f[I]"full_hd"\f[],
+\f[I]"high"\f[], \f[I]"medium"\f[], \f[I]"low"\f[]
+
+
.SS extractor.bunkr.tlds
.IP "Type:" 6
\f[I]bool\f[]
@@ -1799,7 +1856,7 @@ Available types are
\f[I]string\f[]
.IP "Default:" 9
-\f[I]"rest"\f[]
+\f[I]"trpc"\f[]
.IP "Description:" 4
Selects which API endpoints to use.
@@ -1807,7 +1864,7 @@ Selects which API endpoints to use.
.br
* \f[I]"rest"\f[]: \f[I]Public REST API\f[]
.br
-* \f[I]"trpc"\f[]: Internal TRPC API
+* \f[I]"trpc"\f[]: Internal tRPC API
.SS extractor.civitai.api-key
@@ -1839,6 +1896,28 @@ Available types are
\f[I]gallery\f[].
+.SS extractor.civitai.include
+.IP "Type:" 6
+.br
+* \f[I]string\f[]
+.br
+* \f[I]list\f[] of \f[I]strings\f[]
+
+.IP "Default:" 9
+\f[I]["user-models", "user-posts"]\f[]
+
+.IP "Description:" 4
+A (comma-separated) list of subcategories to include
+when processing a user profile.
+
+Possible values are
+\f[I]"user-models"\f[],
+\f[I]"user-posts"\f[],
+\f[I]"user-images"\f[].
+
+It is possible to use \f[I]"all"\f[] instead of listing all values separately.
+
+
.SS extractor.civitai.nsfw
.IP "Type:" 6
.br
@@ -2620,6 +2699,20 @@ Selects an alternative source to download files from.
* \f[I]"hitomi"\f[]: Download the corresponding gallery from \f[I]hitomi.la\f[]
+.SS extractor.fanbox.comments
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Extract \f[I]comments\f[] metadata.
+
+Note: This requires 1 or more additional API requests per post,
+depending on the number of comments.
+
+
.SS extractor.fanbox.embeds
.IP "Type:" 6
.br
@@ -2657,13 +2750,25 @@ extraction and download for YouTube, Vimeo, and SoundCloud embeds.
.IP "Example:" 4
.br
-* user,plan
+* user,plan,comments
.br
-* ["user", "plan"]
+* ["user", "plan", "comments"]
.IP "Description:" 4
Extract \f[I]plan\f[] and extended \f[I]user\f[] metadata.
+Supported fields when selecting which data to extract are
+
+.br
+* \f[I]comments\f[]
+.br
+* \f[I]plan\f[]
+.br
+* \f[I]user\f[]
+
+Note: \f[I]comments\f[] can also be enabled via
+\f[I]fanbox.comments\f[]
+
.SS extractor.flickr.access-token & .access-token-secret
.IP "Type:" 6
@@ -3987,7 +4092,21 @@ For works bookmarked by
\f[I]your own account\f[],
fetch bookmark tags as \f[I]tags_bookmark\f[] metadata.
-Note: This requires 1 additional API call per bookmarked post.
+Note: This requires 1 additional API request per bookmarked post.
+
+
+.SS extractor.pixiv.comments
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Fetch \f[I]comments\f[] metadata.
+
+Note: This requires 1 or more additional API requests per post,
+depending on the number of comments.
.SS extractor.pixiv.work.related
@@ -4054,6 +4173,17 @@ When downloading galleries, this sets the maximum number of posts to get.
A value of \f[I]0\f[] means no limit.
+.SS extractor.pixiv.sanity
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+Try to fetch \f[I]limit_sanity_level\f[] works via web API.
+
+
.SS extractor.plurk.comments
.IP "Type:" 6
\f[I]bool\f[]
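Taken together, the new boosty options documented above could appear in gallery-dl.conf as follows; a minimal sketch, assuming the documented defaults except for `metadata` and an explicit `videos` format preference:

    {
        "extractor": {
            "boosty": {
                "allowed": true,
                "bought": false,
                "metadata": true,
                "videos": ["full_hd", "high", "medium"]
            }
        }
    }
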
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index 32ff8bc..27d0dd4 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gallery_dl
-Version: 1.27.5
+Version: 1.27.6
Summary: Command-line program to download image galleries and collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -114,9 +114,9 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.5/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.6/gallery-dl.exe>`__
(Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.5/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.6/gallery-dl.bin>`__
Nightly Builds
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index 8ae8026..df9217a 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -67,6 +67,7 @@ gallery_dl/extractor/behance.py
gallery_dl/extractor/blogger.py
gallery_dl/extractor/bluesky.py
gallery_dl/extractor/booru.py
+gallery_dl/extractor/boosty.py
gallery_dl/extractor/bunkr.py
gallery_dl/extractor/catbox.py
gallery_dl/extractor/chevereto.py
@@ -179,7 +180,6 @@ gallery_dl/extractor/poringa.py
gallery_dl/extractor/pornhub.py
gallery_dl/extractor/pornpics.py
gallery_dl/extractor/postmill.py
-gallery_dl/extractor/pururin.py
gallery_dl/extractor/reactor.py
gallery_dl/extractor/readcomiconline.py
gallery_dl/extractor/recursive.py
diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py
index afa3a69..f81d2a1 100644
--- a/gallery_dl/extractor/8chan.py
+++ b/gallery_dl/extractor/8chan.py
@@ -29,8 +29,10 @@ class _8chanExtractor(Extractor):
def _init(self):
now = util.datetime_utcnow()
domain = self.root.rpartition("/")[2]
- self.cookies.set("TOS20240928", "1", domain=domain)
- self.cookies.set(now.strftime("TOS%Y%m%d"), "1", domain=domain)
+ self.cookies.set(
+ now.strftime("TOS%Y%m%d"), "1", domain=domain)
+ self.cookies.set(
+ (now - timedelta(1)).strftime("TOS%Y%m%d"), "1", domain=domain)
@memcache()
def cookies_prepare(self):
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 826771c..9885195 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -32,6 +32,7 @@ modules = [
"behance",
"blogger",
"bluesky",
+ "boosty",
"bunkr",
"catbox",
"chevereto",
@@ -133,7 +134,6 @@ modules = [
"pornhub",
"pornpics",
"postmill",
- "pururin",
"reactor",
"readcomiconline",
"reddit",
diff --git a/gallery_dl/extractor/ao3.py b/gallery_dl/extractor/ao3.py
index 1f570e8..d3ab846 100644
--- a/gallery_dl/extractor/ao3.py
+++ b/gallery_dl/extractor/ao3.py
@@ -29,11 +29,31 @@ class Ao3Extractor(Extractor):
self.login()
base = self.root + "/works/"
- data = {"_extractor": Ao3WorkExtractor}
+ data = {"_extractor": Ao3WorkExtractor, "type": "work"}
for work_id in self.works():
yield Message.Queue, base + work_id, data
+ def items_list(self, type, needle, part=True):
+ self.login()
+
+ base = self.root + "/"
+ data_work = {"_extractor": Ao3WorkExtractor, "type": "work"}
+ data_series = {"_extractor": Ao3SeriesExtractor, "type": "series"}
+ data_user = {"_extractor": Ao3UserExtractor, "type": "user"}
+
+ for item in self._pagination(self.groups[0], needle):
+ path = item.rpartition("/")[0] if part else item
+ url = base + path
+ if item.startswith("works/"):
+ yield Message.Queue, url, data_work
+ elif item.startswith("series/"):
+ yield Message.Queue, url, data_series
+ elif item.startswith("users/"):
+ yield Message.Queue, url, data_user
+ else:
+ self.log.warning("Unsupported %s type '%s'", type, path)
+
def works(self):
return self._pagination(self.groups[0])
@@ -284,19 +304,14 @@ class Ao3UserBookmarkExtractor(Ao3Extractor):
example = "https://archiveofourown.org/users/USER/bookmarks"
def items(self):
- self.login()
+ return self.items_list("bookmark", '<span class="count"><a href="/')
- base = self.root + "/"
- data_work = {"_extractor": Ao3WorkExtractor}
- data_series = {"_extractor": Ao3SeriesExtractor}
- for item in self._pagination(
- self.groups[0], '<span class="count"><a href="/'):
- path = item.rpartition("/")[0]
- url = base + path
- if item.startswith("works/"):
- yield Message.Queue, url, data_work
- elif item.startswith("series/"):
- yield Message.Queue, url, data_series
- else:
- self.log.warning("Unsupported bookmark type '%s'", path)
+class Ao3SubscriptionsExtractor(Ao3Extractor):
+ """Extractor for your AO3 account's subscriptions"""
+ subcategory = "subscriptions"
+ pattern = BASE_PATTERN + r"(/users/([^/?#]+)/subscriptions(?:/?\?.+)?)"
+ example = "https://archiveofourown.org/users/USER/subscriptions"
+
+ def items(self):
+ return self.items_list("subscription", '<dt>\n<a href="/', False)
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index 402408e..37075ea 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -89,10 +89,6 @@ BASE_PATTERN = BloggerExtractor.update({
"root": None,
"pattern": r"[\w-]+\.blogspot\.com",
},
- "micmicidol": {
- "root": "https://www.micmicidol.club",
- "pattern": r"(?:www\.)?micmicidol\.club",
- },
})
diff --git a/gallery_dl/extractor/boosty.py b/gallery_dl/extractor/boosty.py
new file mode 100644
index 0000000..997de4a
--- /dev/null
+++ b/gallery_dl/extractor/boosty.py
@@ -0,0 +1,357 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.boosty.to/"""
+
+from .common import Extractor, Message
+from .. import text, util, exception
+
+BASE_PATTERN = r"(?:https?://)?boosty\.to"
+
+
+class BoostyExtractor(Extractor):
+ """Base class for boosty extractors"""
+ category = "boosty"
+ root = "https://www.boosty.to"
+ directory_fmt = ("{category}", "{user[blogUrl]} ({user[id]})",
+ "{post[date]:%Y-%m-%d} {post[int_id]}")
+ filename_fmt = "{num:>02} {file[id]}.{extension}"
+ archive_fmt = "{file[id]}"
+ cookies_domain = ".boosty.to"
+ cookies_names = ("auth",)
+
+ def _init(self):
+ self.api = BoostyAPI(self)
+
+ self._user = None if self.config("metadata") else False
+ self.only_allowed = self.config("allowed", True)
+ self.only_bought = self.config("bought")
+
+ videos = self.config("videos")
+ if videos is None or videos:
+ if isinstance(videos, str):
+ videos = videos.split(",")
+ elif not isinstance(videos, (list, tuple)):
+ videos = ("quad_hd", "ultra_hd", "full_hd",
+ "high", "medium", "low")
+ self.videos = videos
+
+ def items(self):
+ for post in self.posts():
+ if not post.get("hasAccess"):
+ self.log.warning("Not allowed to access post %s", post["id"])
+ continue
+
+ files = self._process_post(post)
+ data = {
+ "post" : post,
+ "user" : post.pop("user", None),
+ "count": len(files),
+ }
+
+ yield Message.Directory, data
+ for data["num"], file in enumerate(files, 1):
+ data["file"] = file
+ url = file["url"]
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+ def posts(self):
+ """Yield JSON content of all relevant posts"""
+
+ def _process_post(self, post):
+ files = []
+ post["content"] = content = []
+ post["links"] = links = []
+
+ if "createdAt" in post:
+ post["date"] = text.parse_timestamp(post["createdAt"])
+ if self._user:
+ post["user"] = self._user
+
+ for block in post["data"]:
+ try:
+ type = block["type"]
+ if type == "text":
+ if block["modificator"] == "BLOCK_END":
+ continue
+ c = util.json_loads(block["content"])
+ content.append(c[0])
+
+ elif type == "image":
+ files.append(self._update_url(post, block))
+
+ elif type == "ok_video":
+ if not self.videos:
+ self.log.debug("%s: Skipping video %s",
+ post["int_id"], block["id"])
+ continue
+ fmts = {
+ fmt["type"]: fmt["url"]
+ for fmt in block["playerUrls"]
+ if fmt["url"]
+ }
+ formats = [
+ fmts[fmt]
+ for fmt in self.videos
+ if fmt in fmts
+ ]
+ if formats:
+ formats = iter(formats)
+ block["url"] = next(formats)
+ block["_fallback"] = formats
+ files.append(block)
+ else:
+ self.log.warning(
+ "%s: Found no suitable video format for %s",
+ post["int_id"], block["id"])
+
+ elif type == "link":
+ url = block["url"]
+ links.append(url)
+ content.append(url)
+
+ elif type == "audio_file":
+ files.append(self._update_url(post, block))
+
+ else:
+ self.log.debug("%s: Unsupported data type '%s'",
+ post["int_id"], type)
+ except Exception as exc:
+ self.log.debug("%s: %s", exc.__class__.__name__, exc)
+
+ del post["data"]
+ return files
+
+ def _update_url(self, post, block):
+ url = block["url"]
+ sep = "&" if "?" in url else "?"
+
+ signed_query = post.get("signedQuery")
+ if signed_query:
+ url += sep + signed_query[1:]
+ sep = "&"
+
+ migrated = post.get("isMigrated")
+ if migrated is not None:
+ url += sep + "is_migrated=" + str(migrated).lower()
+
+ block["url"] = url
+ return block
+
+
+class BoostyUserExtractor(BoostyExtractor):
+ """Extractor for boosty.to user profiles"""
+ subcategory = "user"
+ pattern = BASE_PATTERN + r"/([^/?#]+)(?:\?([^#]+))?$"
+ example = "https://boosty.to/USER"
+
+ def posts(self):
+ user, query = self.groups
+ params = text.parse_query(query)
+ if self._user is None:
+ self._user = self.api.user(user)
+ return self.api.blog_posts(user, params)
+
+
+class BoostyMediaExtractor(BoostyExtractor):
+ """Extractor for boosty.to user media"""
+ subcategory = "media"
+ directory_fmt = "{category}", "{user[blogUrl]} ({user[id]})", "media"
+ filename_fmt = "{post[id]}_{num}.{extension}"
+ pattern = BASE_PATTERN + r"/([^/?#]+)/media/([^/?#]+)(?:\?([^#]+))?"
+ example = "https://boosty.to/USER/media/all"
+
+ def posts(self):
+ user, media, query = self.groups
+ params = text.parse_query(query)
+ self._user = self.api.user(user)
+ return self.api.blog_media_album(user, media, params)
+
+
+class BoostyFeedExtractor(BoostyExtractor):
+ """Extractor for your boosty.to subscription feed"""
+ subcategory = "feed"
+ pattern = BASE_PATTERN + r"/(?:\?([^#]+))?(?:$|#)"
+ example = "https://boosty.to/"
+
+ def posts(self):
+ params = text.parse_query(self.groups[0])
+ return self.api.feed_posts(params)
+
+
+class BoostyPostExtractor(BoostyExtractor):
+ """Extractor for boosty.to posts"""
+ subcategory = "post"
+ pattern = BASE_PATTERN + r"/([^/?#]+)/posts/([0-9a-f-]+)"
+ example = "https://boosty.to/USER/posts/01234567-89ab-cdef-0123-456789abcd"
+
+ def posts(self):
+ user, post_id = self.groups
+ if self._user is None:
+ self._user = self.api.user(user)
+ return (self.api.post(user, post_id),)
+
+
+class BoostyFollowingExtractor(BoostyExtractor):
+ """Extractor for your boosty.to subscribed users"""
+ subcategory = "following"
+ pattern = BASE_PATTERN + r"/app/settings/subscriptions"
+ example = "https://boosty.to/app/settings/subscriptions"
+
+ def items(self):
+ for user in self.api.user_subscriptions():
+ url = "{}/{}".format(self.root, user["blog"]["blogUrl"])
+ user["_extractor"] = BoostyUserExtractor
+ yield Message.Queue, url, user
+
+
+class BoostyAPI():
+ """Interface for the Boosty API"""
+ root = "https://api.boosty.to"
+
+ def __init__(self, extractor, access_token=None):
+ self.extractor = extractor
+ self.headers = {
+ "Accept": "application/json, text/plain, */*",
+ "Origin": extractor.root,
+ }
+
+ if not access_token:
+ auth = self.extractor.cookies.get("auth", domain=".boosty.to")
+ if auth:
+ access_token = text.extr(
+ auth, "%22accessToken%22%3A%22", "%22")
+ if access_token:
+ self.headers["Authorization"] = "Bearer " + access_token
+
+ def blog_posts(self, username, params):
+ endpoint = "/v1/blog/{}/post/".format(username)
+ params = self._merge_params(params, {
+ "limit" : "5",
+ "offset" : None,
+ "comments_limit": "2",
+ "reply_limit" : "1",
+ })
+ return self._pagination(endpoint, params)
+
+ def blog_media_album(self, username, type="all", params=()):
+ endpoint = "/v1/blog/{}/media_album/".format(username)
+ params = self._merge_params(params, {
+ "type" : type.rstrip("s"),
+ "limit" : "15",
+ "limit_by": "media",
+ "offset" : None,
+ })
+ return self._pagination(endpoint, params, self._transform_media_posts)
+
+ def _transform_media_posts(self, data):
+ posts = []
+
+ for obj in data["mediaPosts"]:
+ post = obj["post"]
+ post["data"] = obj["media"]
+ posts.append(post)
+
+ return posts
+
+ def post(self, username, post_id):
+ endpoint = "/v1/blog/{}/post/{}".format(username, post_id)
+ return self._call(endpoint)
+
+ def feed_posts(self, params=None):
+ endpoint = "/v1/feed/post/"
+ params = self._merge_params(params, {
+ "limit" : "5",
+ "offset" : None,
+ "comments_limit": "2",
+ })
+ if "only_allowed" not in params and self.extractor.only_allowed:
+ params["only_allowed"] = "true"
+ if "only_bought" not in params and self.extractor.only_bought:
+ params["only_bought"] = "true"
+ return self._pagination(endpoint, params, key="posts")
+
+ def user(self, username):
+ endpoint = "/v1/blog/" + username
+ user = self._call(endpoint)
+ user["id"] = user["owner"]["id"]
+ return user
+
+ def user_subscriptions(self, params=None):
+ endpoint = "/v1/user/subscriptions"
+ params = self._merge_params(params, {
+ "limit" : "30",
+ "with_follow": "true",
+ "offset" : None,
+ })
+ return self._pagination_users(endpoint, params)
+
+ def _merge_params(self, params_web, params_api):
+ if params_web:
+ web_to_api = {
+ "isOnlyAllowedPosts": "is_only_allowed",
+ "postsTagsIds" : "tags_ids",
+ "postsFrom" : "from_ts",
+ "postsTo" : "to_ts",
+ }
+ for name, value in params_web.items():
+ name = web_to_api.get(name, name)
+ params_api[name] = value
+ return params_api
+
+ def _call(self, endpoint, params=None):
+ url = self.root + endpoint
+
+ while True:
+ response = self.extractor.request(
+ url, params=params, headers=self.headers,
+ fatal=None, allow_redirects=False)
+
+ if response.status_code < 300:
+ return response.json()
+
+ elif response.status_code < 400:
+ raise exception.AuthenticationError("Invalid API access token")
+
+ elif response.status_code == 429:
+ self.extractor.wait(seconds=600)
+
+ else:
+ self.extractor.log.debug(response.text)
+ raise exception.StopExtraction("API request failed")
+
+ def _pagination(self, endpoint, params, transform=None, key=None):
+ if "is_only_allowed" not in params and self.extractor.only_allowed:
+ params["is_only_allowed"] = "true"
+
+ while True:
+ data = self._call(endpoint, params)
+
+ if transform:
+ yield from transform(data["data"])
+ elif key:
+ yield from data["data"][key]
+ else:
+ yield from data["data"]
+
+ extra = data["extra"]
+ if extra.get("isLast"):
+ return
+ offset = extra.get("offset")
+ if not offset:
+ return
+ params["offset"] = offset
+
+ def _pagination_users(self, endpoint, params):
+ while True:
+ data = self._call(endpoint, params)
+
+ yield from data["data"]
+
+ offset = data["offset"] + data["limit"]
+ if offset > data["total"]:
+ return
+ params["offset"] = offset
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 780bdf1..9022ffc 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -21,7 +21,7 @@ else:
BASE_PATTERN = (
r"(?:bunkr:(?:https?://)?([^/?#]+)|"
r"(?:https?://)?(?:app\.)?(bunkr+"
- r"\.(?:s[kiu]|[cf]i|ru|la|is|to|a[cx]"
+ r"\.(?:s[kiu]|[cf]i|pk|ru|la|is|to|a[cx]"
r"|black|cat|media|red|site|ws|org)))"
)
diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py
index 102945b..aedcea4 100644
--- a/gallery_dl/extractor/chevereto.py
+++ b/gallery_dl/extractor/chevereto.py
@@ -42,10 +42,6 @@ BASE_PATTERN = CheveretoExtractor.update({
"root": "https://img.kiwi",
"pattern": r"img\.kiwi",
},
- "deltaporno": {
- "root": "https://gallery.deltaporno.com",
- "pattern": r"gallery\.deltaporno\.com",
- },
})
diff --git a/gallery_dl/extractor/civitai.py b/gallery_dl/extractor/civitai.py
index 3e657d6..725af3a 100644
--- a/gallery_dl/extractor/civitai.py
+++ b/gallery_dl/extractor/civitai.py
@@ -22,17 +22,17 @@ class CivitaiExtractor(Extractor):
category = "civitai"
root = "https://civitai.com"
directory_fmt = ("{category}", "{username|user[username]}", "images")
- filename_fmt = "{id}.{extension}"
- archive_fmt = "{hash}"
+ filename_fmt = "{file[id]|id|filename}.{extension}"
+ archive_fmt = "{file[hash]|hash}"
request_interval = (0.5, 1.5)
def _init(self):
- if self.config("api") == "trpc":
- self.log.debug("Using tRPC API")
- self.api = CivitaiTrpcAPI(self)
- else:
+ if self.config("api") == "rest":
self.log.debug("Using REST API")
self.api = CivitaiRestAPI(self)
+ else:
+ self.log.debug("Using tRPC API")
+ self.api = CivitaiTrpcAPI(self)
quality = self.config("quality")
if quality:
@@ -53,6 +53,30 @@ class CivitaiExtractor(Extractor):
yield Message.Queue, url, data
return
+ posts = self.posts()
+ if posts:
+ for post in posts:
+
+ if "images" in post:
+ images = post["images"]
+ else:
+ images = self.api.images_post(post["id"])
+
+ post = self.api.post(post["id"])
+ post["date"] = text.parse_datetime(
+ post["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ data = {
+ "post": post,
+ "user": post["user"],
+ }
+ del post["user"]
+
+ yield Message.Directory, data
+ for file in self._image_results(images):
+ file.update(data)
+ yield Message.Url, file["url"], file
+ return
+
images = self.images()
if images:
for image in images:
@@ -68,6 +92,9 @@ class CivitaiExtractor(Extractor):
def models(self):
return ()
+ def posts(self):
+ return ()
+
def images(self):
return ()
@@ -87,13 +114,26 @@ class CivitaiExtractor(Extractor):
url, self._image_quality, name)
)
+ def _image_results(self, images):
+ for num, file in enumerate(images, 1):
+ data = text.nameext_from_url(file["url"], {
+ "num" : num,
+ "file": file,
+ "url" : self._url(file),
+ })
+ if not data["extension"]:
+ data["extension"] = self._image_ext
+ if "id" not in file and data["filename"].isdecimal():
+ file["id"] = text.parse_int(data["filename"])
+ yield data
+
class CivitaiModelExtractor(CivitaiExtractor):
subcategory = "model"
directory_fmt = ("{category}", "{user[username]}",
"{model[id]}{model[name]:? //}",
"{version[id]}{version[name]:? //}")
- filename_fmt = "{filename}.{extension}"
+ filename_fmt = "{file[id]}.{extension}"
archive_fmt = "{file[hash]}"
pattern = BASE_PATTERN + r"/models/(\d+)(?:/?\?modelVersionId=(\d+))?"
example = "https://civitai.com/models/12345/TITLE"
@@ -183,23 +223,11 @@ class CivitaiModelExtractor(CivitaiExtractor):
}
images = self.api.images(params, defaults=False)
- return [
- text.nameext_from_url(file["url"], {
- "num" : num,
- "file": file,
- "url" : self._url(file),
- })
- for num, file in enumerate(images, 1)
- ]
+ return self._image_results(images)
def _extract_files_gallery(self, model, version, user):
images = self.api.images_gallery(model, version, user)
- for num, file in enumerate(images, 1):
- yield text.nameext_from_url(file["url"], {
- "num" : num,
- "file": file,
- "url" : self._url(file),
- })
+ return self._image_results(images)
def _validate_file_model(self, response):
if response.headers.get("Content-Type", "").startswith("text/html"):
@@ -224,6 +252,17 @@ class CivitaiImageExtractor(CivitaiExtractor):
return self.api.image(self.groups[0])
+class CivitaiPostExtractor(CivitaiExtractor):
+ subcategory = "post"
+ directory_fmt = ("{category}", "{username|user[username]}", "posts",
+ "{post[id]}{post[title]:? //}")
+ pattern = BASE_PATTERN + r"/posts/(\d+)"
+ example = "https://civitai.com/posts/12345"
+
+ def posts(self):
+ return ({"id": int(self.groups[0])},)
+
+
class CivitaiTagModelsExtractor(CivitaiExtractor):
subcategory = "tag-models"
pattern = BASE_PATTERN + r"/(?:tag/|models\?tag=)([^/?&#]+)"
@@ -266,8 +305,9 @@ class CivitaiUserExtractor(CivitaiExtractor):
base = "{}/user/{}/".format(self.root, self.groups[0])
return self._dispatch_extractors((
(CivitaiUserModelsExtractor, base + "models"),
+ (CivitaiUserPostsExtractor , base + "posts"),
(CivitaiUserImagesExtractor, base + "images"),
- ), ("user-models", "user-images"))
+ ), ("user-models", "user-posts"))
class CivitaiUserModelsExtractor(CivitaiExtractor):
@@ -281,6 +321,19 @@ class CivitaiUserModelsExtractor(CivitaiExtractor):
return self.api.models(params)
+class CivitaiUserPostsExtractor(CivitaiExtractor):
+ subcategory = "user-posts"
+ directory_fmt = ("{category}", "{username|user[username]}", "posts",
+ "{post[id]}{post[title]:? //}")
+ pattern = USER_PATTERN + r"/posts/?(?:\?([^#]+))?"
+ example = "https://civitai.com/user/USER/posts"
+
+ def posts(self):
+ params = text.parse_query(self.groups[1])
+ params["username"] = text.unquote(self.groups[0])
+ return self.api.posts(params)
+
+
class CivitaiUserImagesExtractor(CivitaiExtractor):
subcategory = "user-images"
pattern = USER_PATTERN + r"/images/?(?:\?([^#]+))?"
@@ -373,7 +426,7 @@ class CivitaiTrpcAPI():
self.root = extractor.root + "/api/trpc/"
self.headers = {
"content-type" : "application/json",
- "x-client-version": "5.0.94",
+ "x-client-version": "5.0.146",
"x-client-date" : "",
"x-client" : "web",
"x-fingerprint" : "undefined",
@@ -399,7 +452,7 @@ class CivitaiTrpcAPI():
endpoint = "image.getInfinite"
if defaults:
- params_ = {
+ params = self._merge_params(params, {
"useIndex" : True,
"period" : "AllTime",
"sort" : "Newest",
@@ -408,12 +461,9 @@ class CivitaiTrpcAPI():
"fromPlatform" : False, # Made On-Site
"browsingLevel": self.nsfw,
"include" : ["cosmetics"],
- }
- params_.update(params)
- else:
- params_ = params
+ })
- return self._pagination(endpoint, params_)
+ return self._pagination(endpoint, params)
def images_gallery(self, model, version, user):
endpoint = "image.getImagesAsPostsInfinite"
@@ -430,6 +480,13 @@ class CivitaiTrpcAPI():
for post in self._pagination(endpoint, params):
yield from post["images"]
+ def images_post(self, post_id):
+ params = {
+ "postId" : int(post_id),
+ "pending": True,
+ }
+ return self.images(params)
+
def model(self, model_id):
endpoint = "model.getById"
params = {"id": int(model_id)}
@@ -444,7 +501,7 @@ class CivitaiTrpcAPI():
endpoint = "model.getAll"
if defaults:
- params_ = {
+ params = self._merge_params(params, {
"period" : "AllTime",
"periodMode" : "published",
"sort" : "Newest",
@@ -455,36 +512,71 @@ class CivitaiTrpcAPI():
"fromPlatform" : False,
"supportsGeneration": False,
"browsingLevel": self.nsfw,
- }
- params_.update(params)
- else:
- params_ = params
+ })
+
+ return self._pagination(endpoint, params)
+
+ def post(self, post_id):
+ endpoint = "post.get"
+ params = {"id": int(post_id)}
+ return self._call(endpoint, params)
- return self._pagination(endpoint, params_)
+ def posts(self, params, defaults=True):
+ endpoint = "post.getInfinite"
+ meta = {"cursor": ("Date",)}
+
+ if defaults:
+ params = self._merge_params(params, {
+ "browsingLevel": self.nsfw,
+ "period" : "AllTime",
+ "periodMode" : "published",
+ "sort" : "Newest",
+ "followed" : False,
+ "draftOnly" : False,
+ "pending" : True,
+ "include" : ["cosmetics"],
+ })
+
+ return self._pagination(endpoint, params, meta)
def user(self, username):
endpoint = "user.getCreator"
params = {"username": username}
return (self._call(endpoint, params),)
- def _call(self, endpoint, params):
+ def _call(self, endpoint, params, meta=None):
url = self.root + endpoint
headers = self.headers
- params = {"input": util.json_dumps({"json": params})}
+ if meta:
+ input = {"json": params, "meta": {"values": meta}}
+ else:
+ input = {"json": params}
+
+ params = {"input": util.json_dumps(input)}
headers["x-client-date"] = str(int(time.time() * 1000))
- response = self.extractor.request(url, headers=headers, params=params)
+ response = self.extractor.request(url, params=params, headers=headers)
return response.json()["result"]["data"]["json"]
- def _pagination(self, endpoint, params):
+ def _pagination(self, endpoint, params, meta=None):
+ if "cursor" not in params:
+ params["cursor"] = None
+ meta_ = {"cursor": ("undefined",)}
+
while True:
- data = self._call(endpoint, params)
+ data = self._call(endpoint, params, meta_)
yield from data["items"]
try:
if not data["nextCursor"]:
return
- params["cursor"] = data["nextCursor"]
except KeyError:
return
+
+ params["cursor"] = data["nextCursor"]
+ meta_ = meta
+
+ def _merge_params(self, params_user, params_default):
+ params_default.update(params_user)
+ return params_default
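Since the tRPC API is now the default, selecting an API backend or adjusting which profile subcategories get processed is done via configuration; a sketch using only the values documented in the man page above:

    {
        "extractor": {
            "civitai": {
                "api": "trpc",
                "include": ["user-models", "user-posts", "user-images"]
            }
        }
    }
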
diff --git a/gallery_dl/extractor/cohost.py b/gallery_dl/extractor/cohost.py
index e1f6040..4722a4f 100644
--- a/gallery_dl/extractor/cohost.py
+++ b/gallery_dl/extractor/cohost.py
@@ -19,8 +19,7 @@ class CohostExtractor(Extractor):
category = "cohost"
root = "https://cohost.org"
directory_fmt = ("{category}", "{postingProject[handle]}")
- filename_fmt = ("{postId}_{headline|plainTextBody:?/_/[:100]}"
- "{num}.{extension}")
+ filename_fmt = ("{postId}_{headline:?/_/[b:200]}{num}.{extension}")
archive_fmt = "{postId}_{num}"
def _init(self):
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 09beb5f..1746647 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -93,7 +93,9 @@ class DanbooruExtractor(BaseExtractor):
if post["extension"] == "zip":
if self.ugoira:
- post["frames"] = self._ugoira_frames(post)
+ post["_ugoira_original"] = False
+ post["_ugoira_frame_data"] = post["frames"] = \
+ self._ugoira_frames(post)
post["_http_adjust_extension"] = False
else:
url = post["large_file_url"]
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 3686e1b..836fae7 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -177,24 +177,7 @@ class DeviantartExtractor(Extractor):
yield self.commit(deviation, deviation["flash"])
if self.commit_journal:
- if "excerpt" in deviation:
- # journal = self.api.deviation_content(
- # deviation["deviationid"])
- if not self.eclipse_api:
- self.eclipse_api = DeviantartEclipseAPI(self)
- content = self.eclipse_api.deviation_extended_fetch(
- deviation["index"],
- deviation["author"]["username"],
- "journal",
- )["deviation"]["textContent"]
- html = content["html"]["markup"]
- if html.startswith("{"):
- html = content["excerpt"].replace("\n", "<br />")
- journal = {"html": html}
- elif "body" in deviation:
- journal = {"html": deviation.pop("body")}
- else:
- journal = None
+ journal = self._extract_journal(deviation)
if journal:
if self.extra:
deviation["_journal"] = journal["html"]
@@ -375,6 +358,204 @@ class DeviantartExtractor(Extractor):
deviation["extension"] = "txt"
return Message.Url, txt, deviation
+ def _extract_journal(self, deviation):
+ if "excerpt" in deviation:
+ # # empty 'html'
+ # return self.api.deviation_content(deviation["deviationid"])
+
+ if "_page" in deviation:
+ page = deviation["_page"]
+ del deviation["_page"]
+ else:
+ page = self._limited_request(deviation["url"]).text
+
+ # extract journal html from webpage
+ html = text.extr(
+ page,
+ "<h2>Literature Text</h2></span><div>",
+ "</div></section></div></div>")
+ if html:
+ return {"html": html}
+
+ self.log.debug("%s: Failed to extract journal HTML from webpage. "
+ "Falling back to __INITIAL_STATE__ markup.",
+ deviation["index"])
+
+ # parse __INITIAL_STATE__ as fallback
+ state = util.json_loads(text.extr(
+ page, 'window.__INITIAL_STATE__ = JSON.parse("', '");')
+ .replace("\\\\", "\\").replace("\\'", "'").replace('\\"', '"'))
+ deviations = state["@@entities"]["deviation"]
+ content = deviations.popitem()[1]["textContent"]
+
+ html = self._textcontent_to_html(deviation, content)
+ if html:
+ return {"html": html}
+ return {"html": content["excerpt"].replace("\n", "<br />")}
+
+ if "body" in deviation:
+ return {"html": deviation.pop("body")}
+ return None
+
+ def _textcontent_to_html(self, deviation, content):
+ html = content["html"]
+ markup = html["markup"]
+
+ if not markup.startswith("{"):
+ return markup
+
+ if html["type"] == "tiptap":
+ try:
+ return self._tiptap_to_html(markup)
+ except Exception as exc:
+ self.log.debug("", exc_info=exc)
+ self.log.error("%s: '%s: %s'", deviation["index"],
+ exc.__class__.__name__, exc)
+
+ self.log.warning("%s: Unsupported '%s' markup.",
+ deviation["index"], html["type"])
+
+ def _tiptap_to_html(self, markup):
+ html = []
+
+ html.append('<div data-editor-viewer="1" '
+ 'class="_83r8m _2CKTq _3NjDa mDnFl">')
+ data = util.json_loads(markup)
+ for block in data["document"]["content"]:
+ self._tiptap_process_content(html, block)
+ html.append("</div>")
+
+ return "".join(html)
+
+ def _tiptap_process_content(self, html, content):
+ type = content["type"]
+
+ if type == "paragraph":
+ children = content.get("content")
+ if children:
+ html.append('<p style="')
+
+ attrs = content["attrs"]
+ if "textAlign" in attrs:
+ html.append("text-align:")
+ html.append(attrs["textAlign"])
+ html.append(";")
+ html.append('margin-inline-start:0px">')
+
+ for block in children:
+ self._tiptap_process_content(html, block)
+ html.append("</p>")
+ else:
+ html.append('<p class="empty-p"><br/></p>')
+
+ elif type == "text":
+ self._tiptap_process_text(html, content)
+
+ elif type == "hardBreak":
+ html.append("<br/><br/>")
+
+ elif type == "horizontalRule":
+ html.append("<hr/>")
+
+ elif type == "da-deviation":
+ self._tiptap_process_deviation(html, content)
+
+ elif type == "da-mention":
+ user = content["attrs"]["user"]["username"]
+ html.append('<a href="https://www.deviantart.com/')
+ html.append(user.lower())
+ html.append('" data-da-type="da-mention" data-user="">@<!-- -->')
+ html.append(user)
+ html.append('</a>')
+
+ else:
+ self.log.warning("Unsupported content type '%s'", type)
+
+ def _tiptap_process_text(self, html, content):
+ marks = content.get("marks")
+ if marks:
+ close = []
+ for mark in marks:
+ type = mark["type"]
+ if type == "link":
+ html.append('<a href="')
+ html.append(text.escape(mark["attrs"]["href"]))
+ html.append('" rel="noopener noreferrer nofollow ugc">')
+ close.append("</a>")
+ elif type == "bold":
+ html.append("<strong>")
+ close.append("</strong>")
+ elif type == "italic":
+ html.append("<em>")
+ close.append("</em>")
+ elif type == "underline":
+ html.append("<u>")
+ close.append("</u>")
+ elif type == "textStyle" and len(mark) <= 1:
+ pass
+ else:
+ self.log.warning("Unsupported text marker '%s'", type)
+ close.reverse()
+ html.append(text.escape(content["text"]))
+ html.extend(close)
+ else:
+ html.append(text.escape(content["text"]))
+
+ def _tiptap_process_deviation(self, html, content):
+ dev = content["attrs"]["deviation"]
+ media = dev.get("media") or ()
+
+ html.append('<div class="jjNX2">')
+ html.append('<figure class="Qf-HY" data-da-type="da-deviation" '
+ 'data-deviation="" '
+ 'data-width="" data-link="" data-alignment="center">')
+
+ if "baseUri" in media:
+ url, formats = self._eclipse_media(media)
+ full = formats["fullview"]
+
+ html.append('<a href="')
+ html.append(text.escape(dev["url"]))
+ html.append('" class="_3ouD5" style="margin:0 auto;display:flex;'
+ 'align-items:center;justify-content:center;'
+ 'overflow:hidden;width:780px;height:')
+ html.append(str(780 * full["h"] / full["w"]))
+ html.append('px">')
+
+ html.append('<img src="')
+ html.append(text.escape(url))
+ html.append('" alt="')
+ html.append(text.escape(dev["title"]))
+ html.append('" style="width:100%;max-width:100%;display:block"/>')
+ html.append("</a>")
+
+ elif "textContent" in dev:
+ html.append('<div class="_32Hs4" style="width:350px">')
+
+ html.append('<a href="')
+ html.append(text.escape(dev["url"]))
+ html.append('" class="_3ouD5">')
+
+ html.append('''\
+<section class="Q91qI aG7Yi" style="width:350px;height:313px">\
+<div class="_16ECM _1xMkk" aria-hidden="true">\
+<svg height="100%" viewBox="0 0 15 12" preserveAspectRatio="xMidYMin slice" \
+fill-rule="evenodd">\
+<linearGradient x1="87.8481761%" y1="16.3690766%" \
+x2="45.4107524%" y2="71.4898596%" id="app-root-3">\
+<stop stop-color="#00FF62" offset="0%"></stop>\
+<stop stop-color="#3197EF" stop-opacity="0" offset="100%"></stop>\
+</linearGradient>\
+<text class="_2uqbc" fill="url(#app-root-3)" text-anchor="end" x="15" y="11">J\
+</text></svg></div><div class="_1xz9u">Literature</div><h3 class="_2WvKD">\
+''')
+ html.append(text.escape(dev["title"]))
+ html.append('</h3><div class="_2CPLm">')
+ html.append(text.escape(dev["textContent"]["excerpt"]))
+ html.append('</div></section></a></div>')
+
+ html.append('</figure></div>')
+
def _extract_content(self, deviation):
content = deviation["content"]
@@ -552,6 +733,23 @@ class DeviantartExtractor(Extractor):
self.log.info("Unwatching %s", username)
self.api.user_friends_unwatch(username)
+ def _eclipse_media(self, media, format="preview"):
+ url = [media["baseUri"], ]
+
+ formats = {
+ fmt["t"]: fmt
+ for fmt in media["types"]
+ }
+
+ tokens = media["token"]
+ if len(tokens) == 1:
+ fmt = formats[format]
+ url.append(fmt["c"].replace("<prettyName>", media["prettyName"]))
+ url.append("?token=")
+ url.append(tokens[-1])
+
+ return "".join(url), formats
+
def _eclipse_to_oauth(self, eclipse_api, deviations):
for obj in deviations:
deviation = obj["deviation"] if "deviation" in obj else obj
@@ -709,43 +907,35 @@ class DeviantartStashExtractor(DeviantartExtractor):
archive_fmt = "{index}.{extension}"
pattern = (r"(?:https?://)?(?:(?:www\.)?deviantart\.com/stash|sta\.sh)"
r"/([a-z0-9]+)")
- example = "https://sta.sh/abcde"
+ example = "https://www.deviantart.com/stash/abcde"
skip = Extractor.skip
def __init__(self, match):
DeviantartExtractor.__init__(self, match)
self.user = None
- self.stash_id = match.group(1)
def deviations(self, stash_id=None):
if stash_id is None:
- stash_id = self.stash_id
- url = "https://sta.sh/" + stash_id
+ stash_id = self.groups[0]
+ url = "https://www.deviantart.com/stash/" + stash_id
page = self._limited_request(url).text
if stash_id[0] == "0":
uuid = text.extr(page, '//deviation/', '"')
if uuid:
deviation = self.api.deviation(uuid)
+ deviation["_page"] = page
deviation["index"] = text.parse_int(text.extr(
page, '\\"deviationId\\":', ','))
yield deviation
return
- for item in text.extract_iter(
- page, 'class="stash-thumb-container', '</div>'):
- url = text.extr(item, '<a href="', '"')
-
- if url:
- stash_id = url.rpartition("/")[2]
- else:
- stash_id = text.extr(item, 'gmi-stashid="', '"')
- stash_id = "2" + util.bencode(text.parse_int(
- stash_id), "0123456789abcdefghijklmnopqrstuvwxyz")
-
- if len(stash_id) > 2:
- yield from self.deviations(stash_id)
+ for sid in text.extract_iter(
+ page, 'href="https://www.deviantart.com/stash/', '"'):
+ if sid == stash_id or sid.endswith("#comments"):
+ continue
+ yield from self.deviations(sid)
class DeviantartFavoriteExtractor(DeviantartExtractor):
@@ -939,11 +1129,14 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
else:
url = "{}/view/{}/".format(self.root, self.deviation_id)
- uuid = text.extr(self._limited_request(url).text,
- '"deviationUuid\\":\\"', '\\')
+ page = self._limited_request(url, notfound="deviation").text
+ uuid = text.extr(page, '"deviationUuid\\":\\"', '\\')
if not uuid:
raise exception.NotFoundError("deviation")
- return (self.api.deviation(uuid),)
+
+ deviation = self.api.deviation(uuid)
+ deviation["_page"] = page
+ return (deviation,)
class DeviantartScrapsExtractor(DeviantartExtractor):
@@ -1816,25 +2009,28 @@ JOURNAL_TEMPLATE_HTML = """text:<!DOCTYPE html>
<head>
<meta charset="utf-8">
<title>{title}</title>
- <link rel="stylesheet" href="https://st.deviantart.net/\
-css/deviantart-network_lc.css?3843780832">
- <link rel="stylesheet" href="https://st.deviantart.net/\
-css/group_secrets_lc.css?3250492874">
- <link rel="stylesheet" href="https://st.deviantart.net/\
-css/v6core_lc.css?4246581581">
- <link rel="stylesheet" href="https://st.deviantart.net/\
-css/sidebar_lc.css?1490570941">
- <link rel="stylesheet" href="https://st.deviantart.net/\
-css/writer_lc.css?3090682151">
- <link rel="stylesheet" href="https://st.deviantart.net/\
-css/v6loggedin_lc.css?3001430805">
+ <link rel="stylesheet" href="https://st.deviantart.net\
+/css/deviantart-network_lc.css?3843780832"/>
+ <link rel="stylesheet" href="https://st.deviantart.net\
+/css/group_secrets_lc.css?3250492874"/>
+ <link rel="stylesheet" href="https://st.deviantart.net\
+/css/v6core_lc.css?4246581581"/>
+ <link rel="stylesheet" href="https://st.deviantart.net\
+/css/sidebar_lc.css?1490570941"/>
+ <link rel="stylesheet" href="https://st.deviantart.net\
+/css/writer_lc.css?3090682151"/>
+ <link rel="stylesheet" href="https://st.deviantart.net\
+/css/v6loggedin_lc.css?3001430805"/>
<style>{css}</style>
- <link rel="stylesheet" href="https://st.deviantart.net/\
-roses/cssmin/core.css?1488405371919" >
- <link rel="stylesheet" href="https://st.deviantart.net/\
-roses/cssmin/peeky.css?1487067424177" >
- <link rel="stylesheet" href="https://st.deviantart.net/\
-roses/cssmin/desktop.css?1491362542749" >
+ <link rel="stylesheet" href="https://st.deviantart.net\
+/roses/cssmin/core.css?1488405371919"/>
+ <link rel="stylesheet" href="https://st.deviantart.net\
+/roses/cssmin/peeky.css?1487067424177"/>
+ <link rel="stylesheet" href="https://st.deviantart.net\
+/roses/cssmin/desktop.css?1491362542749"/>
+ <link rel="stylesheet" href="https://static.parastorage.com/services\
+/da-deviation/2bfd1ff7a9d6bf10d27b98dd8504c0399c3f9974a015785114b7dc6b\
+/app.min.css"/>
</head>
<body id="deviantART-v7" class="bubble no-apps loggedout w960 deviantart">
<div id="output">
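To illustrate the new tiptap support, tracing `_tiptap_to_html()` above on a minimal, hypothetical markup document containing one left-aligned paragraph (output shown wrapped for readability; the method returns a single string):

    input:  {"document": {"content": [
                {"type": "paragraph",
                 "attrs": {"textAlign": "left"},
                 "content": [{"type": "text", "text": "Hello"}]}]}}

    output: <div data-editor-viewer="1" class="_83r8m _2CKTq _3NjDa mDnFl">
            <p style="text-align:left;margin-inline-start:0px">Hello</p></div>
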
diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py
index d8337b6..9bbfb43 100644
--- a/gallery_dl/extractor/fanbox.py
+++ b/gallery_dl/extractor/fanbox.py
@@ -29,7 +29,10 @@ class FanboxExtractor(Extractor):
_warning = True
def _init(self):
- self.headers = {"Origin": self.root}
+ self.headers = {
+ "Accept": "application/json, text/plain, */*",
+ "Origin": self.root,
+ }
self.embeds = self.config("embeds", True)
includes = self.config("metadata")
@@ -40,8 +43,12 @@ class FanboxExtractor(Extractor):
includes = ("user", "plan")
self._meta_user = ("user" in includes)
self._meta_plan = ("plan" in includes)
+ self._meta_comments = ("comments" in includes)
else:
- self._meta_user = self._meta_plan = False
+ self._meta_user = self._meta_plan = self._meta_comments = False
+
+ if self.config("comments"):
+ self._meta_comments = True
if self._warning:
if not self.cookies_check(("FANBOXSESSID",)):
@@ -124,6 +131,11 @@ class FanboxExtractor(Extractor):
plan = plans[0].copy()
plan["fee"] = fee
post["plan"] = plans[fee] = plan
+ if self._meta_comments:
+ if post["commentCount"]:
+ post["comments"] = list(self._get_comment_data(post_id))
+ else:
+ post["commentd"] = ()
return content_body, post
@@ -160,6 +172,18 @@ class FanboxExtractor(Extractor):
return plans
+ def _get_comment_data(self, post_id):
+ url = ("https://api.fanbox.cc/post.listComments"
+ "?limit=10&postId=" + post_id)
+
+ comments = []
+ while url:
+ url = text.ensure_http_scheme(url)
+ body = self.request(url, headers=self.headers).json()["body"]
+ comments.extend(body["items"])
+ url = body["nextUrl"]
+ return comments
+
def _get_urls_from_post(self, content_body, post):
num = 0
cover_image = post.get("coverImageUrl")
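The new `comments` metadata can be requested either through the dedicated `fanbox.comments` option or as part of the extended `metadata` list; a minimal sketch mirroring the man page example:

    {
        "extractor": {
            "fanbox": {
                "metadata": ["user", "plan", "comments"]
            }
        }
    }
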
diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py
index 6aefa11..df252ee 100644
--- a/gallery_dl/extractor/flickr.py
+++ b/gallery_dl/extractor/flickr.py
@@ -22,7 +22,7 @@ class FlickrExtractor(Extractor):
archive_fmt = "{id}"
cookies_domain = None
request_interval = (1.0, 2.0)
- request_interval_min = 0.2
+ request_interval_min = 0.5
def __init__(self, match):
Extractor.__init__(self, match)
@@ -37,7 +37,6 @@ class FlickrExtractor(Extractor):
extract = self.api._extract_format
for photo in self.photos():
try:
- 1/0
photo = extract(photo)
except Exception as exc:
self.log.warning(
@@ -236,8 +235,8 @@ class FlickrAPI(oauth.OAuth1API):
"""
API_URL = "https://api.flickr.com/services/rest/"
- API_KEY = "f8f78d1a40debf471f0b22fa2d00525f"
- API_SECRET = "4f9dae1113e45556"
+ API_KEY = "90c368449018a0cb880ea4889cbb8681"
+ API_SECRET = "e4b83e319c11e9e1"
FORMATS = [
("o" , "Original" , None),
("6k", "X-Large 6K" , 6144),
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index fbbd26c..0baad2f 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -97,6 +97,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
@staticmethod
def _prepare(post):
+ post["tags"] = post["tags"].strip()
post["date"] = text.parse_datetime(
post["created_at"], "%a %b %d %H:%M:%S %z %Y")
@@ -114,7 +115,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
pattern = re.compile(
r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S)
for tag_type, tag_name in pattern.findall(tag_container):
- tags[tag_type].append(text.unquote(tag_name))
+ tags[tag_type].append(text.unescape(text.unquote(tag_name)))
for key, value in tags.items():
post["tags_" + key] = " ".join(value)
@@ -178,7 +179,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
pattern = re.compile(
r'<a class="(?:tag-type-)?([^"]+).*?;tags=([^"&]+)')
for tag_type, tag_name in pattern.findall(tag_container):
- tags[tag_type].append(text.unquote(tag_name))
+ tags[tag_type].append(text.unescape(text.unquote(tag_name)))
for key, value in tags.items():
post["tags_" + key] = " ".join(value)
diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py
index a6c1d5a..370cd43 100644
--- a/gallery_dl/extractor/generic.py
+++ b/gallery_dl/extractor/generic.py
@@ -89,30 +89,33 @@ class GenericExtractor(Extractor):
def metadata(self, page):
"""Extract generic webpage metadata, return them in a dict."""
- data = {}
- data['path'] = self.path.replace("/", "")
- data['pageurl'] = self.url
- data['title'] = text.extr(page, '<title>', "</title>")
- data['description'] = text.extr(
- page, '<meta name="description" content="', '"')
- data['keywords'] = text.extr(
- page, '<meta name="keywords" content="', '"')
- data['language'] = text.extr(
- page, '<meta name="language" content="', '"')
- data['name'] = text.extr(
- page, '<meta itemprop="name" content="', '"')
- data['copyright'] = text.extr(
- page, '<meta name="copyright" content="', '"')
- data['og_site'] = text.extr(
- page, '<meta property="og:site" content="', '"')
- data['og_site_name'] = text.extr(
- page, '<meta property="og:site_name" content="', '"')
- data['og_title'] = text.extr(
- page, '<meta property="og:title" content="', '"')
- data['og_description'] = text.extr(
- page, '<meta property="og:description" content="', '"')
-
- data = {k: text.unescape(data[k]) for k in data if data[k] != ""}
+ data = {
+ "title" : text.extr(
+ page, "<title>", "</title>"),
+ "description" : text.extr(
+ page, '<meta name="description" content="', '"'),
+ "keywords" : text.extr(
+ page, '<meta name="keywords" content="', '"'),
+ "language" : text.extr(
+ page, '<meta name="language" content="', '"'),
+ "name" : text.extr(
+ page, '<meta itemprop="name" content="', '"'),
+ "copyright" : text.extr(
+ page, '<meta name="copyright" content="', '"'),
+ "og_site" : text.extr(
+ page, '<meta property="og:site" content="', '"'),
+ "og_site_name" : text.extr(
+ page, '<meta property="og:site_name" content="', '"'),
+ "og_title" : text.extr(
+ page, '<meta property="og:title" content="', '"'),
+ "og_description": text.extr(
+ page, '<meta property="og:description" content="', '"'),
+    }
+
+ data = {k: text.unescape(v) for k, v in data.items() if v}
+ data["path"] = self.path.replace("/", "")
+ data["pageurl"] = self.url
return data
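
The rewrite builds all extraction results in a single dict literal and then filters and unescapes them in one comprehension instead of ten separate assignments. A reduced sketch of that filter step (stdlib `unescape` standing in for `text.unescape`):

```python
from html import unescape

def clean_metadata(extracted):
    """Drop empty extraction results and HTML-unescape the rest."""
    return {k: unescape(v) for k, v in extracted.items() if v}

print(clean_metadata({"title": "A &amp; B", "keywords": ""}))
# {'title': 'A & B'}
```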
diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py
index 345f51d..28590fc 100644
--- a/gallery_dl/extractor/imagefap.py
+++ b/gallery_dl/extractor/imagefap.py
@@ -19,7 +19,8 @@ class ImagefapExtractor(Extractor):
category = "imagefap"
root = "https://www.imagefap.com"
directory_fmt = ("{category}", "{gallery_id} {title}")
- filename_fmt = "{category}_{gallery_id}_{num:04}_{filename}.{extension}"
+ filename_fmt = ("{category}_{gallery_id}_{num:?/_/>04}"
+ "{filename}.{extension}")
archive_fmt = "{gallery_id}_{image_id}"
request_interval = (2.0, 4.0)
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 422c865..dd1272f 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -14,7 +14,6 @@ from .. import text, util, exception
from ..cache import cache, memcache
import itertools
import binascii
-import json
import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?instagram\.com"
@@ -913,7 +912,7 @@ class InstagramGraphqlAPI():
self.user_collection = self.user_saved = self.reels_media = \
self.highlights_media = self.guide = self.guide_media = \
self._unsupported
- self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode
+ self._json_dumps = util.json_dumps
api = InstagramRestAPI(extractor)
self.user_by_name = api.user_by_name
diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py
index 7a19be5..e39e272 100644
--- a/gallery_dl/extractor/komikcast.py
+++ b/gallery_dl/extractor/komikcast.py
@@ -6,19 +6,19 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://komikcast.lol/"""
+"""Extractors for https://komikcast.cz/"""
from .common import ChapterExtractor, MangaExtractor
from .. import text
import re
-BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:lol|site|me|com)"
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:cz|lol|site|mo?e|com)"
class KomikcastBase():
"""Base class for komikcast extractors"""
category = "komikcast"
- root = "https://komikcast.lol"
+ root = "https://komikcast.cz"
@staticmethod
def parse_chapter_string(chapter_string, data=None):
@@ -46,9 +46,9 @@ class KomikcastBase():
class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
- """Extractor for manga-chapters from komikcast.lol"""
+ """Extractor for komikcast manga chapters"""
pattern = BASE_PATTERN + r"(/chapter/[^/?#]+/)"
- example = "https://komikcast.lol/chapter/TITLE/"
+ example = "https://komikcast.cz/chapter/TITLE/"
def metadata(self, page):
info = text.extr(page, "<title>", " - Komikcast<")
@@ -65,10 +65,10 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
- """Extractor for manga from komikcast.lol"""
+ """Extractor for komikcast manga"""
chapterclass = KomikcastChapterExtractor
pattern = BASE_PATTERN + r"(/(?:komik/)?[^/?#]+)/?$"
- example = "https://komikcast.lol/komik/TITLE"
+ example = "https://komikcast.cz/komik/TITLE"
def chapters(self, page):
results = []
diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py
index 117b88b..6fc0689 100644
--- a/gallery_dl/extractor/lolisafe.py
+++ b/gallery_dl/extractor/lolisafe.py
@@ -20,10 +20,6 @@ class LolisafeExtractor(BaseExtractor):
BASE_PATTERN = LolisafeExtractor.update({
- "xbunkr": {
- "root": "https://xbunkr.com",
- "pattern": r"xbunkr\.com",
- },
})
diff --git a/gallery_dl/extractor/myhentaigallery.py b/gallery_dl/extractor/myhentaigallery.py
index 5e8179e..f09507c 100644
--- a/gallery_dl/extractor/myhentaigallery.py
+++ b/gallery_dl/extractor/myhentaigallery.py
@@ -33,7 +33,7 @@ class MyhentaigalleryGalleryExtractor(GalleryExtractor):
title = extr('<div class="comic-description">\n', '</h1>').lstrip()
if title.startswith("<h1>"):
- title = title[len("<h1>"):]
+ title = title[4:]
if not title:
raise exception.NotFoundError("gallery")
@@ -41,10 +41,10 @@ class MyhentaigalleryGalleryExtractor(GalleryExtractor):
return {
"title" : text.unescape(title),
"gallery_id": text.parse_int(self.gallery_id),
- "tags" : split(extr('<div>\nCategories:', '</div>')),
- "artist" : split(extr('<div>\nArtists:' , '</div>')),
- "group" : split(extr('<div>\nGroups:' , '</div>')),
- "parodies" : split(extr('<div>\nParodies:' , '</div>')),
+ "tags" : split(extr(" Categories:", "</div>")),
+ "artist" : split(extr(" Artists:" , "</div>")),
+ "group" : split(extr(" Groups:" , "</div>")),
+ "parodies" : split(extr(" Parodies:" , "</div>")),
}
def images(self, page):
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index dfa1f6e..2928573 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -32,6 +32,8 @@ class NewgroundsExtractor(Extractor):
self.user_root = "https://{}.newgrounds.com".format(self.user)
def _init(self):
+ self._extract_comment_urls = re.compile(
+ r'(?:<img |data-smartload-)src="([^"]+)').findall
self.flash = self.config("flash", True)
fmt = self.config("format")
@@ -78,8 +80,7 @@ class NewgroundsExtractor(Extractor):
if "_fallback" in post:
del post["_fallback"]
- for url in text.extract_iter(
- post["_comment"], 'data-smartload-src="', '"'):
+ for url in self._extract_comment_urls(post["_comment"]):
post["num"] += 1
post["_index"] = "{}_{:>02}".format(
post["index"], post["num"])
@@ -243,9 +244,12 @@ class NewgroundsExtractor(Extractor):
url = text.ensure_http_scheme(url)
url = url.replace("/medium_views/", "/images/", 1)
if text.ext_from_url(url) == "webp":
+ fallback = [url.replace(".webp", "." + e)
+ for e in ("jpg", "png", "gif") if e != ext]
+ fallback.append(url)
yield {
"image" : url.replace(".webp", "." + ext),
- "_fallback": (url,),
+ "_fallback": fallback,
}
else:
yield {"image": url}
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index 9d025d5..e7540f8 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -83,7 +83,7 @@ class OAuthBase(Extractor):
browser = None
if browser and browser.open(url):
- name = getattr(browser, "name", "Browser")
+ name = getattr(browser, "name", None) or "Browser"
self.log.info("Opening URL in %s:", name.capitalize())
else:
self.log.info("Please open this URL in your browser:")
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index eb6d677..d47ffa2 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -23,16 +23,17 @@ class PatreonExtractor(Extractor):
directory_fmt = ("{category}", "{creator[full_name]}")
filename_fmt = "{id}_{title}_{num:>02}.{extension}"
archive_fmt = "{id}_{num}"
- browser = "firefox"
- tls12 = False
_warning = True
- def items(self):
+ def _init(self):
+ self.session.headers["User-Agent"] = \
+ "Patreon/72.2.28 (Android; Android 14; Scale/2.10)"
if self._warning:
if not self.cookies_check(("session_id",)):
self.log.warning("no 'session_id' cookie set")
PatreonExtractor._warning = False
+ def items(self):
generators = self._build_file_generators(self.config("files"))
for post in self.posts():
@@ -99,6 +100,11 @@ class PatreonExtractor(Extractor):
if url:
yield "attachment", url, attachment["name"]
+ for attachment in post.get("attachments_media") or ():
+ url = attachment.get("download_url")
+ if url:
+ yield "attachment", url, attachment["file_name"]
+
def _content(self, post):
content = post.get("content")
if content:
@@ -137,8 +143,12 @@ class PatreonExtractor(Extractor):
if attr.get("current_user_can_view", True):
relationships = post["relationships"]
- attr["images"] = self._files(post, included, "images")
- attr["attachments"] = self._files(post, included, "attachments")
+ attr["images"] = self._files(
+ post, included, "images")
+ attr["attachments"] = self._files(
+ post, included, "attachments")
+ attr["attachments_media"] = self._files(
+ post, included, "attachments_media")
attr["date"] = text.parse_datetime(
attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
@@ -210,8 +220,8 @@ class PatreonExtractor(Extractor):
return (
"https://www.patreon.com/api/" + endpoint +
- "?include=campaign,access_rules,attachments,audio,images,media,"
- "native_video_insights,poll.choices,"
+ "?include=campaign,access_rules,attachments,attachments_media,"
+ "audio,images,media,native_video_insights,poll.choices,"
"poll.current_user_responses.user,"
"poll.current_user_responses.choice,"
"poll.current_user_responses.poll,"
@@ -303,13 +313,11 @@ class PatreonCreatorExtractor(PatreonExtractor):
r"([^/?#]+)(?:/posts)?/?(?:\?([^#]+))?")
example = "https://www.patreon.com/USER"
- def __init__(self, match):
- PatreonExtractor.__init__(self, match)
- self.creator, self.query = match.groups()
-
def posts(self):
- query = text.parse_query(self.query)
- campaign_id = self._get_campaign_id(query)
+ creator, query = self.groups
+
+ query = text.parse_query(query)
+ campaign_id = self._get_campaign_id(creator, query)
filters = self._get_filters(query)
self.log.debug("campaign_id: %s", campaign_id)
@@ -322,9 +330,9 @@ class PatreonCreatorExtractor(PatreonExtractor):
))
return self._pagination(url)
- def _get_campaign_id(self, query):
- if self.creator.startswith("id:"):
- return self.creator[3:]
+ def _get_campaign_id(self, creator, query):
+ if creator.startswith("id:"):
+ return creator[3:]
campaign_id = query.get("c") or query.get("campaign_id")
if campaign_id:
@@ -334,7 +342,7 @@ class PatreonCreatorExtractor(PatreonExtractor):
if user_id:
url = "{}/user/posts?u={}".format(self.root, user_id)
else:
- url = "{}/{}/posts".format(self.root, self.creator)
+ url = "{}/{}/posts".format(self.root, creator)
page = self.request(url, notfound="creator").text
try:
@@ -377,14 +385,18 @@ class PatreonPostExtractor(PatreonExtractor):
pattern = r"(?:https?://)?(?:www\.)?patreon\.com/posts/([^/?#]+)"
example = "https://www.patreon.com/posts/TITLE-12345"
- def __init__(self, match):
- PatreonExtractor.__init__(self, match)
- self.slug = match.group(1)
-
def posts(self):
- url = "{}/posts/{}".format(self.root, self.slug)
+ url = "{}/posts/{}".format(self.root, self.groups[0])
page = self.request(url, notfound="post").text
- post = self._extract_bootstrap(page)["post"]
+ bootstrap = self._extract_bootstrap(page)
+
+ try:
+ post = bootstrap["post"]
+ except KeyError:
+ self.log.debug(bootstrap)
+ if bootstrap.get("campaignDisciplinaryStatus") == "suspended":
+ self.log.warning("Account suspended")
+ return ()
included = self._transform(post["included"])
return (self._process(post["data"], included),)
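
The `browser`/`tls12` fingerprint workaround is dropped in favor of presenting the Patreon Android app's User-Agent on every request. A minimal sketch of the same idea with `requests` (the commented endpoint is illustrative only):

```python
import requests

session = requests.Session()
session.headers["User-Agent"] = \
    "Patreon/72.2.28 (Android; Android 14; Scale/2.10)"

# Requests made through this session now identify themselves as the
# mobile app rather than a desktop browser, e.g.:
# session.get("https://www.patreon.com/api/posts/12345")
```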
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index c908e44..c2d1243 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -27,11 +27,17 @@ class PixivExtractor(Extractor):
filename_fmt = "{id}_p{num}.{extension}"
archive_fmt = "{id}{suffix}.{extension}"
cookies_domain = None
+ sanity_url = "https://s.pximg.net/common/images/limit_sanity_level_360.png"
+ mypixiv_url = "https://s.pximg.net/common/images/limit_mypixiv_360.png"
def _init(self):
self.api = PixivAppAPI(self)
self.load_ugoira = self.config("ugoira", True)
self.max_posts = self.config("max-posts", 0)
+ self.sanity_workaround = self.config("sanity", True)
+ self.meta_user = self.config("metadata")
+ self.meta_bookmark = self.config("metadata-bookmark")
+ self.meta_comments = self.config("comments")
def items(self):
tags = self.config("tags", "japanese")
@@ -46,11 +52,7 @@ class PixivExtractor(Extractor):
def transform_tags(work):
work["tags"] = [tag["name"] for tag in work["tags"]]
- url_sanity = ("https://s.pximg.net/common/images"
- "/limit_sanity_level_360.png")
ratings = {0: "General", 1: "R-18", 2: "R-18G"}
- meta_user = self.config("metadata")
- meta_bookmark = self.config("metadata-bookmark")
metadata = self.metadata()
works = self.works()
@@ -60,18 +62,26 @@ class PixivExtractor(Extractor):
if not work["user"]["id"]:
continue
- meta_single_page = work["meta_single_page"]
- meta_pages = work["meta_pages"]
- del work["meta_single_page"]
- del work["image_urls"]
- del work["meta_pages"]
+ files = self._extract_files(work)
- if meta_user:
+ if self.meta_user:
work.update(self.api.user_detail(work["user"]["id"]))
- if meta_bookmark and work["is_bookmarked"]:
+ if self.meta_comments:
+ if work["total_comments"]:
+ work["comments"] = list(
+ self.api.illust_comments(work["id"]))
+ else:
+ work["comments"] = ()
+ if self.meta_bookmark and work["is_bookmarked"]:
detail = self.api.illust_bookmark_detail(work["id"])
work["tags_bookmark"] = [tag["name"] for tag in detail["tags"]
if tag["is_registered"]]
+ if self.sanity_workaround and not work.get("caption") and \
+ not work.get("_mypixiv"):
+ body = self._request_ajax("/illust/" + str(work["id"]))
+ if body:
+ work["caption"] = text.unescape(body["illustComment"])
+
if transform_tags:
transform_tags(work)
work["num"] = 0
@@ -81,69 +91,177 @@ class PixivExtractor(Extractor):
work.update(metadata)
yield Message.Directory, work
+ for work["num"], file in enumerate(files):
+ url = file["url"]
+ work.update(file)
+ work["date_url"] = self._date_from_url(url)
+ yield Message.Url, url, text.nameext_from_url(url, work)
- if work["type"] == "ugoira":
- if not self.load_ugoira:
- continue
+ def _extract_files(self, work):
+ meta_single_page = work["meta_single_page"]
+ meta_pages = work["meta_pages"]
+ del work["meta_single_page"]
+ del work["image_urls"]
+ del work["meta_pages"]
+ if work["type"] == "ugoira":
+ if self.load_ugoira:
try:
- ugoira = self.api.ugoira_metadata(work["id"])
+ return self._extract_ugoira(work)
except exception.StopExtraction as exc:
self.log.warning(
"Unable to retrieve Ugoira metatdata (%s - %s)",
- work.get("id"), exc.message)
- continue
-
- url = ugoira["zip_urls"]["medium"]
- work["frames"] = frames = ugoira["frames"]
- work["date_url"] = self._date_from_url(url)
- work["_http_adjust_extension"] = False
-
- if self.load_ugoira == "original":
- base, sep, _ = url.rpartition("_ugoira")
- base = base.replace(
- "/img-zip-ugoira/", "/img-original/", 1) + sep
-
- for ext in ("jpg", "png", "gif"):
- try:
- url = ("{}0.{}".format(base, ext))
- self.request(url, method="HEAD")
- break
- except exception.HttpError:
- pass
- else:
- self.log.warning(
- "Unable to find Ugoira frame URLs (%s)",
- work.get("id"))
- continue
-
- for num, frame in enumerate(frames):
- url = ("{}{}.{}".format(base, num, ext))
- work["num"] = work["_ugoira_frame_index"] = num
- work["suffix"] = "_p{:02}".format(num)
- text.nameext_from_url(url, work)
- yield Message.Url, url, work
-
+ work["id"], exc.message)
+
+ elif work["page_count"] == 1:
+ url = meta_single_page["original_image_url"]
+ if url == self.sanity_url:
+ if self.sanity_workaround:
+ self.log.warning("%s: 'sanity_level' warning", work["id"])
+ body = self._request_ajax("/illust/" + str(work["id"]))
+ return self._extract_ajax(work, body)
else:
- url = url.replace("_ugoira600x600", "_ugoira1920x1080")
- yield Message.Url, url, text.nameext_from_url(url, work)
-
- elif work["page_count"] == 1:
- url = meta_single_page["original_image_url"]
- if url == url_sanity:
self.log.warning(
- "Unable to download work %s ('sanity_level' warning)",
+ "%s: Unable to download work ('sanity_level' warning)",
work["id"])
- continue
- work["date_url"] = self._date_from_url(url)
- yield Message.Url, url, text.nameext_from_url(url, work)
+ elif url == self.mypixiv_url:
+ work["_mypixiv"] = True
+ self.log.warning("%s: 'My pixiv' locked", work["id"])
+ return ()
+ else:
+ return ({"url": url},)
+ else:
+ return [
+ {
+ "url" : img["image_urls"]["original"],
+ "suffix": "_p{:02}".format(num),
+ }
+ for num, img in enumerate(meta_pages)
+ ]
+
+ return ()
+
+ def _extract_ugoira(self, work):
+ ugoira = self.api.ugoira_metadata(work["id"])
+ url = ugoira["zip_urls"]["medium"]
+ work["_ugoira_frame_data"] = work["frames"] = frames = ugoira["frames"]
+ work["date_url"] = self._date_from_url(url)
+ work["_http_adjust_extension"] = False
+
+ if self.load_ugoira == "original":
+ work["_ugoira_original"] = True
+ base, sep, _ = url.rpartition("_ugoira")
+ base = base.replace("/img-zip-ugoira/", "/img-original/", 1) + sep
+
+ for ext in ("jpg", "png", "gif"):
+ try:
+ url = "{}0.{}".format(base, ext)
+ self.request(url, method="HEAD")
+ break
+ except exception.HttpError:
+ pass
else:
- for work["num"], img in enumerate(meta_pages):
- url = img["image_urls"]["original"]
- work["date_url"] = self._date_from_url(url)
- work["suffix"] = "_p{:02}".format(work["num"])
- yield Message.Url, url, text.nameext_from_url(url, work)
+ self.log.warning(
+ "Unable to find Ugoira frame URLs (%s)", work["id"])
+
+ return [
+ {
+ "url": "{}{}.{}".format(base, num, ext),
+ "suffix": "_p{:02}".format(num),
+ "_ugoira_frame_index": num,
+ }
+ for num in range(len(frames))
+ ]
+ else:
+ work["_ugoira_original"] = False
+ url = url.replace("_ugoira600x600", "_ugoira1920x1080", 1)
+ return ({"url": url},)
+
+ def _request_ajax(self, endpoint):
+ url = "{}/ajax{}".format(self.root, endpoint)
+ try:
+ return self.request(url, headers=self.headers_web).json()["body"]
+ except Exception:
+ return None
+
+ def _extract_ajax(self, work, body):
+ url = self._extract_ajax_url(body)
+ if not url:
+ return ()
+
+ for key_app, key_ajax in (
+ ("title" , "illustTitle"),
+ ("image_urls" , "urls"),
+ ("create_date" , "createDate"),
+ ("width" , "width"),
+ ("height" , "height"),
+ ("sanity_level" , "sl"),
+ ("total_view" , "viewCount"),
+ ("total_comments" , "commentCount"),
+ ("total_bookmarks" , "bookmarkCount"),
+ ("restrict" , "restrict"),
+ ("x_restrict" , "xRestrict"),
+ ("illust_ai_type" , "aiType"),
+ ("illust_book_style", "bookStyle"),
+ ):
+ work[key_app] = body[key_ajax]
+
+ work["user"] = {
+ "account" : body["userAccount"],
+ "id" : int(body["userId"]),
+ "is_followed": False,
+ "name" : body["userName"],
+ "profile_image_urls": {},
+ }
+
+ work["tags"] = tags = []
+ for tag in body["tags"]["tags"]:
+ name = tag["tag"]
+ try:
+ translated_name = tag["translation"]["en"]
+ except Exception:
+ translated_name = None
+ tags.append({"name": name, "translated_name": translated_name})
+
+ work["caption"] = text.unescape(body["illustComment"])
+ work["page_count"] = count = body["pageCount"]
+ if count == 1:
+ return ({"url": url},)
+
+ base, _, ext = url.rpartition("_p0.")
+ return [
+ {
+ "url" : "{}_p{}.{}".format(base, num, ext),
+ "suffix": "_p{:02}".format(num),
+ }
+ for num in range(count)
+ ]
+
+ def _extract_ajax_url(self, body):
+ try:
+ original = body["urls"]["original"]
+ if original:
+ return original
+ except KeyError:
+ pass
+
+ try:
+ square1200 = body["userIllusts"][body["id"]]["url"]
+ except KeyError:
+ return
+ parts = square1200.rpartition("_p0")[0].split("/")
+ del parts[3:5]
+ parts[3] = "img-original"
+ base = "/".join(parts)
+
+ for ext in ("jpg", "png", "gif"):
+ try:
+ url = "{}_p0.{}".format(base, ext)
+ self.request(url, method="HEAD")
+ return url
+ except exception.HttpError:
+ pass
@staticmethod
def _date_from_url(url, offset=timedelta(hours=9)):
@@ -175,6 +293,9 @@ class PixivExtractor(Extractor):
"x_restrict" : 0,
}
+ def _web_to_mobile(self, work):
+ return work
+
def works(self):
"""Return an iterable containing all relevant 'work' objects"""
@@ -255,12 +376,12 @@ class PixivAvatarExtractor(PixivExtractor):
pattern = USER_PATTERN + r"/avatar"
example = "https://www.pixiv.net/en/users/12345/avatar"
- def __init__(self, match):
- PixivExtractor.__init__(self, match)
- self.user_id = match.group(1)
+ def _init(self):
+ PixivExtractor._init(self)
+ self.sanity_workaround = self.meta_comments = False
def works(self):
- user = self.api.user_detail(self.user_id)["user"]
+ user = self.api.user_detail(self.groups[0])["user"]
url = user["profile_image_urls"]["medium"].replace("_170.", ".")
return (self._make_work("avatar", url, user),)
@@ -273,12 +394,12 @@ class PixivBackgroundExtractor(PixivExtractor):
pattern = USER_PATTERN + "/background"
example = "https://www.pixiv.net/en/users/12345/background"
- def __init__(self, match):
- PixivExtractor.__init__(self, match)
- self.user_id = match.group(1)
+ def _init(self):
+ PixivExtractor._init(self)
+ self.sanity_workaround = self.meta_comments = False
def works(self):
- detail = self.api.user_detail(self.user_id)
+ detail = self.api.user_detail(self.groups[0])
url = detail["profile"]["background_image_url"]
if not url:
return ()
@@ -335,6 +456,22 @@ class PixivWorkExtractor(PixivExtractor):
return works
+class PixivUnlistedExtractor(PixivExtractor):
+ """Extractor for a unlisted pixiv illustrations"""
+ subcategory = "unlisted"
+ pattern = BASE_PATTERN + r"/(?:en/)?artworks/unlisted/(\w+)"
+ example = "https://www.pixiv.net/en/artworks/unlisted/a1b2c3d4e5f6g7h8i9j0"
+
+ def _extract_files(self, work):
+ body = self._request_ajax("/illust/unlisted/" + work["id"])
+ work["id_unlisted"] = work["id"]
+ work["id"] = text.parse_int(body["illustId"])
+ return self._extract_ajax(work, body)
+
+ def works(self):
+ return ({"id": self.groups[0], "user": {"id": 1}},)
+
+
class PixivFavoriteExtractor(PixivExtractor):
"""Extractor for all favorites/bookmarks of a pixiv user"""
subcategory = "favorite"
@@ -626,8 +763,6 @@ class PixivNovelExtractor(PixivExtractor):
work["tags"] = [tag["name"] for tag in work["tags"]]
ratings = {0: "General", 1: "R-18", 2: "R-18G"}
- meta_user = self.config("metadata")
- meta_bookmark = self.config("metadata-bookmark")
embeds = self.config("embeds")
covers = self.config("covers")
@@ -645,9 +780,15 @@ class PixivNovelExtractor(PixivExtractor):
if self.max_posts:
novels = itertools.islice(novels, self.max_posts)
for novel in novels:
- if meta_user:
+ if self.meta_user:
novel.update(self.api.user_detail(novel["user"]["id"]))
- if meta_bookmark and novel["is_bookmarked"]:
+ if self.meta_comments:
+ if novel["total_comments"]:
+ novel["comments"] = list(
+ self.api.novel_comments(novel["id"]))
+ else:
+ novel["comments"] = ()
+ if self.meta_bookmark and novel["is_bookmarked"]:
detail = self.api.novel_bookmark_detail(novel["id"])
novel["tags_bookmark"] = [tag["name"] for tag in detail["tags"]
if tag["is_registered"]]
@@ -848,6 +989,7 @@ class PixivAppAPI():
self.username = extractor._get_auth_info()[0]
self.user = None
+ extractor.headers_web = extractor.session.headers.copy()
extractor.session.headers.update({
"App-OS" : "ios",
"App-OS-Version": "16.7.2",
@@ -913,6 +1055,10 @@ class PixivAppAPI():
return self._call(
"/v2/illust/bookmark/detail", params)["bookmark_detail"]
+ def illust_comments(self, illust_id):
+ params = {"illust_id": illust_id}
+ return self._pagination("/v3/illust/comments", params, "comments")
+
def illust_follow(self, restrict="all"):
params = {"restrict": restrict}
return self._pagination("/v2/illust/follow", params)
@@ -935,6 +1081,10 @@ class PixivAppAPI():
return self._call(
"/v2/novel/bookmark/detail", params)["bookmark_detail"]
+ def novel_comments(self, novel_id):
+ params = {"novel_id": novel_id}
+ return self._pagination("/v1/novel/comments", params, "comments")
+
def novel_detail(self, novel_id):
params = {"novel_id": novel_id}
return self._call("/v2/novel/detail", params)["novel"]
diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py
index c7283fc..9800eb2 100644
--- a/gallery_dl/extractor/pornhub.py
+++ b/gallery_dl/extractor/pornhub.py
@@ -66,10 +66,19 @@ class PornhubGalleryExtractor(PornhubExtractor):
def items(self):
data = self.metadata()
yield Message.Directory, data
- for num, image in enumerate(self.images(), 1):
+ for num, img in enumerate(self.images(), 1):
+
+ image = {
+ "url" : img["img_large"],
+ "caption": img["caption"],
+ "id" : text.parse_int(img["id"]),
+ "views" : text.parse_int(img["times_viewed"]),
+ "score" : text.parse_int(img["vote_percent"]),
+ "num" : num,
+ }
+
url = image["url"]
image.update(data)
- image["num"] = num
yield Message.Url, url, text.nameext_from_url(url, image)
def metadata(self):
@@ -105,18 +114,20 @@ class PornhubGalleryExtractor(PornhubExtractor):
images = response.json()
key = end = self._first
- while True:
- img = images[key]
- yield {
- "url" : img["img_large"],
- "caption": img["caption"],
- "id" : text.parse_int(img["id"]),
- "views" : text.parse_int(img["times_viewed"]),
- "score" : text.parse_int(img["vote_percent"]),
- }
- key = str(img["next"])
- if key == end:
- return
+ results = []
+ try:
+ while True:
+ img = images[key]
+ results.append(img)
+ key = str(img["next"])
+ if key == end:
+ break
+ except KeyError:
+ self.log.warning("%s: Unable to ensure correct file order",
+ self.gallery_id)
+ return images.values()
+
+ return results
class PornhubGifExtractor(PornhubExtractor):
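
Gallery images arrive as an unordered JSON object whose entries point at one another through `next` IDs, i.e. a linked list keyed by image ID. The rewrite collects the chain into an ordered list first and only falls back to arbitrary order when a link is missing. A self-contained sketch of that traversal (sample data invented):

```python
def order_by_next(images, first):
    """Traverse {id: item} entries linked via item['next'], starting
    at 'first', until the chain loops back to the starting key."""
    results = []
    key = end = first
    try:
        while True:
            img = images[key]
            results.append(img)
            key = str(img["next"])
            if key == end:
                break
    except KeyError:
        return list(images.values())  # broken chain: give up on order
    return results

items = {
    "1": {"id": 1, "next": 3},
    "3": {"id": 3, "next": 2},
    "2": {"id": 2, "next": 1},
}
print([i["id"] for i in order_by_next(items, "1")])  # [1, 3, 2]
```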
diff --git a/gallery_dl/extractor/pururin.py b/gallery_dl/extractor/pururin.py
deleted file mode 100644
index 3a4c614..0000000
--- a/gallery_dl/extractor/pururin.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2019-2023 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://pururin.to/"""
-
-from .common import GalleryExtractor
-from .. import text, util
-
-
-class PururinGalleryExtractor(GalleryExtractor):
- """Extractor for image galleries on pururin.io"""
- category = "pururin"
- root = "https://pururin.to"
- pattern = r"(?:https?://)?(?:www\.)?pururin\.[ti]o/(?:gallery|read)/(\d+)"
- example = "https://pururin.to/gallery/12345/TITLE"
-
- def __init__(self, match):
- self.gallery_id = match.group(1)
- url = "{}/gallery/{}/x".format(self.root, self.gallery_id)
- GalleryExtractor.__init__(self, match, url)
-
- def metadata(self, page):
- extr = text.extract_from(page)
-
- def _lst(e=extr):
- v = text.unescape(e('value="', '"'))
- return [item["name"] for item in util.json_loads(v)] if v else ()
-
- def _str(key, e=extr):
- return text.unescape(text.extr(
- e(key, "</td>"), 'title="', '"')).partition(" / ")[0]
-
- title = text.unescape(extr('<h1><span itemprop="name">', '<'))
- title_en, _, title_ja = title.partition(" / ")
-
- data = {
- "gallery_id": text.parse_int(self.gallery_id),
- "title" : title_en or title_ja,
- "title_en" : title_en,
- "title_ja" : title_ja,
- "language" : _str("<td>Language</td>"),
- "type" : _str("<td>Category</td>"),
- "uploader" : text.remove_html(extr("<td>Uploader</td>", "</td>")),
- "rating" : text.parse_float(extr(
- 'itemprop="ratingValue" content="', '"')),
- "artist" : extr('name="artist_tags"', '') or _lst(),
- "group" : _lst(),
- "parody" : _lst(),
- "tags" : _lst(),
- "characters": _lst(),
- "scanlator" : _lst(),
- "convention": _lst(),
- "collection": _lst(),
- }
- data["lang"] = util.language_to_code(data["language"])
- return data
-
- def images(self, _):
- url = "{}/read/{}/01/x".format(self.root, self.gallery_id)
- page = self.request(url).text
-
- svr, pos = text.extract(page, 'data-svr="', '"')
- img, pos = text.extract(page, 'data-img="', '"', pos)
- data = util.json_loads(text.unescape(img))
-
- base = "{}/{}/".format(svr, data["directory"])
- return [(base + i["filename"], None) for i in data["images"]]
diff --git a/gallery_dl/extractor/rule34us.py b/gallery_dl/extractor/rule34us.py
index cf70ccc..60c1c35 100644
--- a/gallery_dl/extractor/rule34us.py
+++ b/gallery_dl/extractor/rule34us.py
@@ -36,7 +36,7 @@ class Rule34usExtractor(BooruExtractor):
"score" : text.extract(extr('Score: ', '> - <'), ">", "<")[0],
"width" : extr('Size: ', 'w'),
"height" : extr(' x ', 'h'),
- "file_url": extr(' src="', '"'),
+ "file_url": extr('<source src="', '"') or extr('<img src="', '"'),
}
url = post["file_url"]
diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py
index a68f0db..97bad09 100644
--- a/gallery_dl/extractor/shimmie2.py
+++ b/gallery_dl/extractor/shimmie2.py
@@ -69,11 +69,6 @@ class Shimmie2Extractor(BaseExtractor):
BASE_PATTERN = Shimmie2Extractor.update({
- "loudbooru": {
- "root": "https://loudbooru.com",
- "pattern": r"loudbooru\.com",
- "cookies": {"ui-tnc-agreed": "true"},
- },
"giantessbooru": {
"root": "https://sizechangebooru.com",
"pattern": r"(?:sizechange|giantess)booru\.com",
@@ -104,20 +99,15 @@ class Shimmie2TagExtractor(Shimmie2Extractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
file_url_fmt = "{}/_images/{}/{}%20-%20{}.{}"
- pattern = BASE_PATTERN + r"post/list/([^/?#]+)(?:/(\d+))?()"
- example = "https://loudbooru.com/post/list/TAG/1"
-
- def __init__(self, match):
- Shimmie2Extractor.__init__(self, match)
- lastindex = match.lastindex
- self.tags = text.unquote(match.group(lastindex-2))
- self.page = match.group(lastindex-1)
+ pattern = BASE_PATTERN + r"post/list/([^/?#]+)(?:/(\d+))?"
+ example = "https://vidya.pics/post/list/TAG/1"
def metadata(self):
+ self.tags = text.unquote(self.groups[-2])
return {"search_tags": self.tags}
def posts(self):
- pnum = text.parse_int(self.page, 1)
+ pnum = text.parse_int(self.groups[-1], 1)
file_url_fmt = self.file_url_fmt.format
init = True
@@ -171,7 +161,7 @@ class Shimmie2TagExtractor(Shimmie2Extractor):
return
def _posts_giantessbooru(self):
- pnum = text.parse_int(self.page, 1)
+ pnum = text.parse_int(self.groups[-1], 1)
file_url_fmt = (self.root + "/index.php?q=/image/{}.jpg").format
while True:
@@ -206,20 +196,17 @@ class Shimmie2PostExtractor(Shimmie2Extractor):
"""Extractor for single shimmie2 posts"""
subcategory = "post"
pattern = BASE_PATTERN + r"post/view/(\d+)"
- example = "https://loudbooru.com/post/view/12345"
-
- def __init__(self, match):
- Shimmie2Extractor.__init__(self, match)
- self.post_id = match.group(match.lastindex)
+ example = "https://vidya.pics/post/view/12345"
def posts(self):
- url = "{}/post/view/{}".format(self.root, self.post_id)
+ post_id = self.groups[-1]
+ url = "{}/post/view/{}".format(self.root, post_id)
page = self.request(url).text
extr = text.extract_from(page)
quote = self._quote_type(page)
post = {
- "id" : self.post_id,
+ "id" : post_id,
"tags" : extr(": ", "<").partition(" - ")[0].rstrip(")"),
"md5" : extr("/_thumbs/", "/"),
"file_url": self.root + (
@@ -237,12 +224,12 @@ class Shimmie2PostExtractor(Shimmie2Extractor):
return (post,)
def _posts_giantessbooru(self):
- url = "{}/index.php?q=/post/view/{}".format(
- self.root, self.post_id)
+ post_id = self.groups[-1]
+ url = "{}/index.php?q=/post/view/{}".format(self.root, post_id)
extr = text.extract_from(self.request(url).text)
return ({
- "id" : self.post_id,
+ "id" : post_id,
"tags" : extr(": ", "<").partition(" - ")[0].rstrip(")"),
"md5" : "",
"file_url": self.root + extr("id='main_image' src='.", "'"),
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index d4ec343..9c9d505 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -13,7 +13,6 @@ from .. import text, util, exception
from ..cache import cache, memcache
import itertools
import random
-import json
import re
BASE_PATTERN = (r"(?:https?://)?(?:www\.|mobile\.)?"
@@ -1034,7 +1033,7 @@ class TwitterAPI():
self.root = "https://x.com/i/api"
self._nsfw_warning = True
- self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode
+ self._json_dumps = util.json_dumps
cookies = extractor.cookies
cookies_domain = extractor.cookies_domain
diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py
index 87a0ba6..fec4ab0 100644
--- a/gallery_dl/postprocessor/ugoira.py
+++ b/gallery_dl/postprocessor/ugoira.py
@@ -105,11 +105,12 @@ class UgoiraPP(PostProcessor):
}, options)
def prepare(self, pathfmt):
- if "frames" not in pathfmt.kwdict:
+ self._convert_zip = self._convert_files = False
+ if "_ugoira_frame_data" not in pathfmt.kwdict:
self._frames = None
return
- self._frames = pathfmt.kwdict["frames"]
+ self._frames = pathfmt.kwdict["_ugoira_frame_data"]
if pathfmt.extension == "zip":
self._convert_zip = True
if self.delete:
@@ -136,7 +137,6 @@ class UgoiraPP(PostProcessor):
def convert_from_zip(self, pathfmt):
if not self._convert_zip:
return
- self._convert_zip = False
self._zip_source = True
with self._tempdir() as tempdir:
@@ -147,6 +147,13 @@ class UgoiraPP(PostProcessor):
except FileNotFoundError:
pathfmt.realpath = pathfmt.temppath
return
+ except Exception as exc:
+ pathfmt.realpath = pathfmt.temppath
+ self.log.error(
+ "%s: Unable to extract frames from %s (%s: %s)",
+ pathfmt.kwdict.get("id"), pathfmt.filename,
+ exc.__class__.__name__, exc)
+ return self.log.debug("", exc_info=exc)
if self.convert(pathfmt, tempdir):
if self.delete:
@@ -159,7 +166,6 @@ class UgoiraPP(PostProcessor):
def convert_from_files(self, pathfmt):
if not self._convert_files:
return
- self._convert_files = False
self._zip_source = False
with tempfile.TemporaryDirectory() as tempdir:
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
index 8517cdf..5fd5a40 100644
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -238,12 +238,49 @@ def parse_float(value, default=0.0):
def parse_query(qs):
- """Parse a query string into key-value pairs"""
+ """Parse a query string into name-value pairs
+
+ Ignore values whose name has been seen before
+ """
+ if not qs:
+ return {}
+
+ result = {}
+ try:
+ for name_value in qs.split("&"):
+ name, eq, value = name_value.partition("=")
+ if eq:
+ name = unquote(name.replace("+", " "))
+ if name not in result:
+ result[name] = unquote(value.replace("+", " "))
+ except Exception:
+ pass
+ return result
+
+
+def parse_query_list(qs):
+ """Parse a query string into name-value pairs
+
+ Combine values of duplicate names into lists
+ """
+ if not qs:
+ return {}
+
result = {}
try:
- for key, value in urllib.parse.parse_qsl(qs):
- if key not in result:
- result[key] = value
+ for name_value in qs.split("&"):
+ name, eq, value = name_value.partition("=")
+ if eq:
+ name = unquote(name.replace("+", " "))
+ value = unquote(value.replace("+", " "))
+ if name in result:
+ rvalue = result[name]
+ if isinstance(rvalue, list):
+ rvalue.append(value)
+ else:
+ result[name] = [rvalue, value]
+ else:
+ result[name] = value
except Exception:
pass
return result
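
The split into `parse_query()` and `parse_query_list()` makes the handling of duplicate names explicit: the former keeps only the first value, the latter collects duplicates into lists. Expected behavior, as also covered by the new tests below:

```python
from gallery_dl import text

print(text.parse_query("tags=cat&tags=dog&page=2"))
# {'tags': 'cat', 'page': '2'}           -- first value wins

print(text.parse_query_list("tags=cat&tags=dog&page=2"))
# {'tags': ['cat', 'dog'], 'page': '2'}  -- duplicates become a list
```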
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 128f48b..d5bc171 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -253,7 +253,11 @@ def json_default(obj):
json_loads = json._default_decoder.decode
-json_dumps = json.JSONEncoder(default=json_default).encode
+json_dumps = json.JSONEncoder(
+ check_circular=False,
+ separators=(",", ":"),
+ default=json_default,
+).encode
def dump_json(obj, fp=sys.stdout, ensure_ascii=True, indent=4):
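
`json_dumps` now encodes with compact separators (and skips circular-reference checks), which is also why the `{l!j}` expectation in `test_formatter.py` below loses its spaces. The stdlib difference in miniature:

```python
import json

compact = json.JSONEncoder(
    check_circular=False,
    separators=(",", ":"),
).encode

print(json.dumps(["a", "b", "c"]))  # ["a", "b", "c"]
print(compact(["a", "b", "c"]))     # ["a","b","c"]
```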
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 513da41..dd96a9a 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,5 +6,5 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.27.5"
+__version__ = "1.27.6"
__variant__ = None
diff --git a/test/test_formatter.py b/test/test_formatter.py
index f1d752d..c0b504d 100644
--- a/test/test_formatter.py
+++ b/test/test_formatter.py
@@ -64,7 +64,7 @@ class TestFormatter(unittest.TestCase):
self._run_test("{t!d}", datetime.datetime(2010, 1, 1))
self._run_test("{t!d:%Y-%m-%d}", "2010-01-01")
self._run_test("{dt!T}", "1262304000")
- self._run_test("{l!j}", '["a", "b", "c"]')
+ self._run_test("{l!j}", '["a","b","c"]')
self._run_test("{dt!j}", '"2010-01-01 00:00:00"')
self._run_test("{a!g}", "hello-world")
self._run_test("{a!L}", 11)
diff --git a/test/test_results.py b/test/test_results.py
index aa09f2f..ed9c9a9 100644
--- a/test/test_results.py
+++ b/test/test_results.py
@@ -210,6 +210,7 @@ class TestExtractorResults(unittest.TestCase):
if "#urls" in result:
expected = result["#urls"]
if isinstance(expected, str):
+ self.assertTrue(tjob.url_list, msg="#urls")
self.assertEqual(tjob.url_list[0], expected, msg="#urls")
else:
self.assertSequenceEqual(tjob.url_list, expected, msg="#urls")
@@ -235,6 +236,8 @@ class TestExtractorResults(unittest.TestCase):
self.assertIsInstance(value, test, msg=path)
elif isinstance(test, range):
self.assertRange(value, test, msg=path)
+ elif isinstance(test, set):
+ self.assertIn(value, test, msg=path)
elif isinstance(test, list):
subtest = False
for idx, item in enumerate(test):
@@ -286,6 +289,8 @@ class ResultJob(job.DownloadJob):
"".join(self.extractor.directory_fmt)).format_map
self.format_filename = TestFormatter(
self.extractor.filename_fmt).format_map
+ self.format_archive = TestFormatter(
+ self.extractor.archive_fmt).format_map
def run(self):
self._init()
@@ -323,7 +328,7 @@ class ResultJob(job.DownloadJob):
json.dumps(kwdict, sort_keys=True, default=str).encode())
def _update_archive(self, kwdict):
- archive_id = self.extractor.archive_fmt.format_map(kwdict)
+ archive_id = self.format_archive(kwdict)
self.archive_list.append(archive_id)
self.archive_hash.update(archive_id.encode())
diff --git a/test/test_text.py b/test/test_text.py
index 084436b..1b19c47 100644
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -413,6 +413,28 @@ class TestText(unittest.TestCase):
for value in INVALID:
self.assertEqual(f(value), {})
+ def test_parse_query_list(self, f=text.parse_query_list):
+ # standard usage
+ self.assertEqual(f(""), {})
+ self.assertEqual(f("foo=1"), {"foo": "1"})
+ self.assertEqual(f("foo=1&bar=2"), {"foo": "1", "bar": "2"})
+
+ # missing value
+ self.assertEqual(f("bar"), {})
+ self.assertEqual(f("foo=1&bar"), {"foo": "1"})
+ self.assertEqual(f("foo=1&bar&baz=3"), {"foo": "1", "baz": "3"})
+
+ # keys with identical names
+ self.assertEqual(f("foo=1&foo=2"), {"foo": ["1", "2"]})
+ self.assertEqual(
+ f("foo=1&bar=2&foo=3&bar=4&foo=5"),
+ {"foo": ["1", "3", "5"], "bar": ["2", "4"]},
+ )
+
+ # invalid arguments
+ for value in INVALID:
+ self.assertEqual(f(value), {})
+
def test_parse_timestamp(self, f=text.parse_timestamp):
null = util.datetime_utcfromtimestamp(0)
value = util.datetime_utcfromtimestamp(1555816235)