| field | value |
|---|---|
| author | 2025-09-23 07:44:44 -0400 |
| committer | 2025-09-23 07:44:44 -0400 |
| commit | 291c04af647559317fc9f9f392ad43841ec509ad (patch) |
| tree | 13a72906223927180001b362d086c82401cb7843 |
| parent | 065386e00c7a6c8bbe4bb23a545a7fc7b2c09a4a (diff) |
| parent | 42b62671fabfdcf983a9575221420d85f7fbcac1 (diff) |
Update upstream source from tag 'upstream/1.30.8'
Update to upstream version '1.30.8'
with Debian dir 51367313d3355f7d0d16a754c5c63135fb3c72e2
42 files changed, 1344 insertions, 335 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a5eaa4d..5aa64b9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,28 +1,48 @@
-## 1.30.7 - 2025-09-14
+## 1.30.8 - 2025-09-23
 ### Extractors
 #### Additions
-- [bellazon] add support ([#7480](https://github.com/mikf/gallery-dl/issues/7480))
-- [cyberfile] add support ([#5015](https://github.com/mikf/gallery-dl/issues/5015))
-- [fansly] add `creator-media` extractor ([#4401](https://github.com/mikf/gallery-dl/issues/4401))
-- [simpcity] add support ([#3127](https://github.com/mikf/gallery-dl/issues/3127) [#5145](https://github.com/mikf/gallery-dl/issues/5145) [#5879](https://github.com/mikf/gallery-dl/issues/5879) [#8187](https://github.com/mikf/gallery-dl/issues/8187))
+- [chevereto] support `imglike.com` ([#5179](https://github.com/mikf/gallery-dl/issues/5179))
+- [chevereto] add `category` extractor ([#5179](https://github.com/mikf/gallery-dl/issues/5179))
+- [Danbooru] add `random` extractor ([#8270](https://github.com/mikf/gallery-dl/issues/8270))
+- [hdoujin] add support ([#6810](https://github.com/mikf/gallery-dl/issues/6810))
+- [imgpile] add support ([#5044](https://github.com/mikf/gallery-dl/issues/5044))
+- [mangadex] add `covers` extractor ([#4994](https://github.com/mikf/gallery-dl/issues/4994))
+- [mangataro] add support ([#8237](https://github.com/mikf/gallery-dl/issues/8237))
+- [thehentaiworld] add support ([#274](https://github.com/mikf/gallery-dl/issues/274) [#8237](https://github.com/mikf/gallery-dl/issues/8237))
 #### Fixes
-- [aibooru] fix download URLs ([#8212](https://github.com/mikf/gallery-dl/issues/8212))
-- [ao3] fix pagination ([#8206](https://github.com/mikf/gallery-dl/issues/8206))
-- [boosty] fix extracting `accessToken` from cookies ([#8203](https://github.com/mikf/gallery-dl/issues/8203))
-- [comick] update `buildId` on `404` errors ([#8157](https://github.com/mikf/gallery-dl/issues/8157))
-- [facebook] fix `/photo/?fbid=…&set=…` URLs being handled as a set ([#8181](https://github.com/mikf/gallery-dl/issues/8181))
-- [fansly] fix & improve format selection ([#4401](https://github.com/mikf/gallery-dl/issues/4401))
-- [fansly] fix posts with more than 5 files ([#4401](https://github.com/mikf/gallery-dl/issues/4401))
-- [imgbb] fix & update ([#7936](https://github.com/mikf/gallery-dl/issues/7936))
-- [tiktok] fix `KeyError: 'author'` ([#8189](https://github.com/mikf/gallery-dl/issues/8189))
+- [4archive] fix `TypeError` ([#8217](https://github.com/mikf/gallery-dl/issues/8217))
+- [bellazon] fix video attachments ([#8239](https://github.com/mikf/gallery-dl/issues/8239))
+- [bunkr] fix `JSONDecodeError` for files with URL slugs containing apostrophes `'` ([#8150](https://github.com/mikf/gallery-dl/issues/8150))
+- [instagram] ensure manifest data exists before attempting a DASH download ([#8267](https://github.com/mikf/gallery-dl/issues/8267))
+- [schalenetwork] fix extraction ([#6948](https://github.com/mikf/gallery-dl/issues/6948) [#7391](https://github.com/mikf/gallery-dl/issues/7391) [#7728](https://github.com/mikf/gallery-dl/issues/7728))
+- [twitter] fix quoted Tweets being marked as `deleted` ([#8225](https://github.com/mikf/gallery-dl/issues/8225))
 #### Improvements
-- [comick] handle redirects
-- [fansly] provide fallback URL for manifest downloads ([#4401](https://github.com/mikf/gallery-dl/issues/4401))
-- [fansly:creator] support custom wall IDs ([#4401](https://github.com/mikf/gallery-dl/issues/4401))
-- [tungsten:user] support filtering results by tag ([#8061](https://github.com/mikf/gallery-dl/issues/8061))
-- [twitter] continue searches on empty response ([#8173](https://github.com/mikf/gallery-dl/issues/8173))
-- [twitter] implement various `search-…` options ([#8173](https://github.com/mikf/gallery-dl/issues/8173))
+- [2ch] update domain to `2ch.su`, support `2ch.life` URLs ([#8216](https://github.com/mikf/gallery-dl/issues/8216))
+- [bellazon][simpcity][vipergirls] process threads in descending order ([#8248](https://github.com/mikf/gallery-dl/issues/8248))
+- [bellazon] extract `inline` images ([#8247](https://github.com/mikf/gallery-dl/issues/8247))
+- [bellazon] support video embeds ([#8239](https://github.com/mikf/gallery-dl/issues/8239))
+- [bellazon] support `#comment-12345` post links ([#8239](https://github.com/mikf/gallery-dl/issues/8239))
+- [lensdump] support new direct file URL pattern ([#8251](https://github.com/mikf/gallery-dl/issues/8251))
+- [simpcity] extract URLs of `<iframe>` embeds ([#8214](https://github.com/mikf/gallery-dl/issues/8214) [#8256](https://github.com/mikf/gallery-dl/issues/8256))
+- [simpcity] improve post content extraction ([#8214](https://github.com/mikf/gallery-dl/issues/8214))
+#### Metadata
+- [facebook] extract `biography` metadata ([#8233](https://github.com/mikf/gallery-dl/issues/8233))
+- [instagram:tagged] provide full `tagged_…` metadata when using `id:…` URLs ([#8263](https://github.com/mikf/gallery-dl/issues/8263))
+- [iwara] extract more metadata ([#6582](https://github.com/mikf/gallery-dl/issues/6582))
+- [iwara] make `type` available for directories ([#8245](https://github.com/mikf/gallery-dl/issues/8245))
+- [reddit] provide `comment` metadata for all media files ([#8228](https://github.com/mikf/gallery-dl/issues/8228))
+#### Options
+- [bellazon] add `quoted` option ([#8247](https://github.com/mikf/gallery-dl/issues/8247))
+- [bellazon] implement `order-posts` option ([#8248](https://github.com/mikf/gallery-dl/issues/8248))
+- [kemono:discord] implement `order-posts` option ([#8241](https://github.com/mikf/gallery-dl/issues/8241))
+- [simpcity] implement `order-posts` option ([#8248](https://github.com/mikf/gallery-dl/issues/8248))
+- [vipergirls] implement `order-posts` option ([#8248](https://github.com/mikf/gallery-dl/issues/8248))
+### Downloaders
+- [ytdl] fix errors caused by deprecated options removal
+### Post Processors
+- [metadata] add `"mode": "print"` ([#2691](https://github.com/mikf/gallery-dl/issues/2691))
+- [python] add `"mode": "eval"`
+- close archive database connections ([#8243](https://github.com/mikf/gallery-dl/issues/8243))
 ### Miscellaneous
-- [formatter] exclude `<>\` characters from `!R` results ([#8180](https://github.com/mikf/gallery-dl/issues/8180))
-- [formatter] support negative indicies
-- [util] emit debug `Proxy Map` logging message ([#8195](https://github.com/mikf/gallery-dl/issues/8195))
+- [util] define `__enter__` & `__exit__` methods for `NullResponse` objects ([#8227](https://github.com/mikf/gallery-dl/issues/8227))
+- [util] extend list of ISO 639 language codes
diff --git a/PKG-INFO b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gallery_dl
-Version: 1.30.7
+Version: 1.30.8
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -141,9 +141,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.30.7/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.30.8/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86)
   <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.30.7/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.30.8/gallery-dl.bin>`__
 Nightly Builds
diff --git a/README.rst b/README.rst
@@ -79,9 +79,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.30.7/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.30.8/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86)
   <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.30.7/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.30.8/gallery-dl.bin>`__
 Nightly Builds
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index 36ca314..6560c3b 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2025-09-14" "1.30.7" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2025-09-23" "1.30.8" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index 8d2f806..fbf32bc 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2025-09-14" "1.30.7" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2025-09-23" "1.30.8" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
 .\" disable justification (adjust text to left margin only)
@@ -501,6 +501,7 @@ response before \f[I]retrying\f[] the request.
 \f[I][Danbooru]\f[],
 \f[I][E621]\f[],
 \f[I][foolfuuka]:search\f[],
+\f[I]hdoujin\f[],
 \f[I]itaku\f[],
 \f[I]newgrounds\f[],
 \f[I][philomena]\f[],
@@ -512,6 +513,7 @@
 \f[I]scrolller\f[],
 \f[I]sizebooru\f[],
 \f[I]soundgasm\f[],
+\f[I]thehentaiworld\f[],
 \f[I]urlgalleries\f[],
 \f[I]vk\f[],
 \f[I]webtoons\f[],
@@ -1883,6 +1885,34 @@ Supported module types are
 \f[I]image\f[], \f[I]video\f[], \f[I]mediacollection\f[], \f[I]embed\f[], \f[I]text\f[].
+.SS extractor.bellazon.order-posts
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"desc"\f[]
+
+.IP "Description:" 4
+Controls the order in which
+posts of a \f[I]thread\f[] are processed.
+
+\f[I]"asc"\f[]
+Ascending order (oldest first)
+\f[I]"desc"\f[] | \f[I]"reverse"\f[]
+Descending order (newest first)
+
+
+.SS extractor.bellazon.quoted
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Extract files from quoted content.
+
+
 .SS extractor.[blogger].api-key
 .IP "Type:" 6
 \f[I]string\f[]
@@ -3656,6 +3686,87 @@ to attempt to fetch the current value used by gofile.
 Recursively download files from subfolders.
+.SS extractor.hdoujin.crt
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Example:" 4
+.br
+* "0542daa9-352c-4fd5-a497-6c6d5cf07423"
+.br
+* "/12345/a1b2c3d4e5f6?crt=0542daa9-352c-4fd5-a497-6c6d5cf07423"
+
+.IP "Description:" 4
+The \f[I]crt\f[] query parameter value
+sent when fetching gallery data.
+
+To get this value:
+
+.br
+* Open your browser's Developer Tools (F12)
+.br
+* Select Network -> XHR
+.br
+* Open a gallery page
+.br
+* Select the last Network entry and copy its \f[I]crt\f[] value
+
+Note: You will also need your browser's
+\f[I]user-agent\f[]
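Both documented example forms — the bare value and a copied request path — carry the same token. A minimal standalone sketch of normalizing the two forms (the `normalize_crt` name is ours, and the stdlib stands in for gallery-dl's `text` helpers; it mirrors the `_crt` helper added to `schalenetwork.py` further down this diff):

```python
import re
from urllib.parse import parse_qs

def normalize_crt(value):
    """Accept either a bare crt value or a copied '/...?crt=...' path."""
    if re.fullmatch(r"[0-9a-f-]+", value):
        return value
    path, _, qs = value.partition("?")
    # fall back to parsing the whole string if there is no '?'
    params = parse_qs(qs or path)
    return params.get("crt", [None])[0]

print(normalize_crt("0542daa9-352c-4fd5-a497-6c6d5cf07423"))
print(normalize_crt("/12345/a1b2c3d4e5f6?crt=0542daa9-352c-4fd5-a497-6c6d5cf07423"))
# both print: 0542daa9-352c-4fd5-a497-6c6d5cf07423
```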
+.SS extractor.hdoujin.format
+.IP "Type:" 6
+.br
+* \f[I]string\f[]
+.br
+* \f[I]list\f[] of \f[I]strings\f[]
+
+.IP "Default:" 9
+\f[I]["0", "1600", "1280", "980", "780"]\f[]
+
+.IP "Description:" 4
+Name(s) of the image format to download.
+
+When more than one format is given,
+the first available one is selected.
+
+Possible formats are
+.br
+\f[I]"780"\f[], \f[I]"980"\f[], \f[I]"1280"\f[], \f[I]"1600"\f[], \f[I]"0"\f[] (original)
+.br
+
+
+.SS extractor.hdoujin.tags
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Group \f[I]tags\f[] by type and
+provide them as \f[I]tags_<type>\f[] metadata fields,
+for example \f[I]tags_artist\f[] or \f[I]tags_character\f[].
+
+
+.SS extractor.hdoujin.token
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Example:" 4
+.br
+* "3f1a9b72-4e4d-4f4e-9e5d-4a2b99f7c893"
+.br
+* "Bearer 3f1a9b72-4e4d-4f4e-9e5d-4a2b99f7c893"
+.br
+* "Authorization: Bearer 3f1a9b72-4e4d-4f4e-9e5d-4a2b99f7c893"
+
+.IP "Description:" 4
+\f[I]Authorization\f[] header value
+used for requests to \f[I]https://api.hdoujin.org\f[]
+to access \f[I]favorite\f[] galleries.
+
+
 .SS extractor.hentaifoundry.descriptions
 .IP "Type:" 6
 \f[I]string\f[]
@@ -4209,12 +4320,28 @@ Controls the order in which
 \f[I]revisions\f[]
 are returned.
-.br
-* \f[I]"asc"\f[]: Ascending order (oldest first)
-.br
-* \f[I]"desc"\f[]: Descending order (newest first)
-.br
-* \f[I]"reverse"\f[]: Same as \f[I]"asc"\f[]
+\f[I]"asc"\f[] | \f[I]"reverse"\f[]
+Ascending order (oldest first)
+\f[I]"desc"\f[]
+Descending order (newest first)
+
+
+.SS extractor.kemono.discord.order-posts
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"asc"\f[]
+
+.IP "Description:" 4
+Controls the order in which
+\f[I]discord\f[] posts
+are returned.
+
+\f[I]"asc"\f[]
+Ascending order (oldest first)
+\f[I]"desc"\f[] | \f[I]"reverse"\f[]
+Descending order (newest first)
 .SS extractor.khinsider.covers
@@ -4245,54 +4372,6 @@ If the selected format is not available,
 the first in the list gets chosen (usually mp3).
-.SS extractor.schalenetwork.cbz
-.IP "Type:" 6
-\f[I]bool\f[]
-
-.IP "Default:" 9
-\f[I]true\f[]
-
-.IP "Description:" 4
-Download each gallery as a single \f[I].cbz\f[] file.
-
-Disabling this option causes a gallery
-to be downloaded as individual image files.
-
-
-.SS extractor.schalenetwork.format
-.IP "Type:" 6
-.br
-* \f[I]string\f[]
-.br
-* \f[I]list\f[] of \f[I]strings\f[]
-
-.IP "Default:" 9
-\f[I]["0", "1600", "1280", "980", "780"]\f[]
-
-.IP "Description:" 4
-Name(s) of the image format to download.
-
-When more than one format is given,
-the first available one is selected.
-
-Possible formats are
-.br
-\f[I]"780"\f[], \f[I]"980"\f[], \f[I]"1280"\f[], \f[I]"1600"\f[], \f[I]"0"\f[] (original)
-.br
-
-
-.SS extractor.schalenetwork.tags
-.IP "Type:" 6
-\f[I]bool\f[]
-
-.IP "Default:" 9
-\f[I]false\f[]
-
-.IP "Description:" 4
-Group \f[I]tags\f[] by type and
-provide them as \f[I]tags_<type>\f[] metadata fields,
-for example \f[I]tags_artist\f[] or \f[I]tags_character\f[].
-
-
 .SS extractor.lolisafe.domain
 .IP "Type:" 6
 \f[I]string\f[]
@@ -5706,6 +5785,87 @@ Download video embeds from external sites.
 Download videos.
+.SS extractor.schalenetwork.crt
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Example:" 4
+.br
+* "0542daa9-352c-4fd5-a497-6c6d5cf07423"
+.br
+* "/12345/a1b2c3d4e5f6?crt=0542daa9-352c-4fd5-a497-6c6d5cf07423"
+
+.IP "Description:" 4
+The \f[I]crt\f[] query parameter value
+sent when fetching gallery data.
+
+To get this value:
+
+.br
+* Open your browser's Developer Tools (F12)
+.br
+* Select Network -> XHR
+.br
+* Open a gallery page
+.br
+* Select the last Network entry and copy its \f[I]crt\f[] value
+
+Note: You will also need your browser's
+\f[I]user-agent\f[]
+
+
+.SS extractor.schalenetwork.format
+.IP "Type:" 6
+.br
+* \f[I]string\f[]
+.br
+* \f[I]list\f[] of \f[I]strings\f[]
+
+.IP "Default:" 9
+\f[I]["0", "1600", "1280", "980", "780"]\f[]
+
+.IP "Description:" 4
+Name(s) of the image format to download.
+
+When more than one format is given,
+the first available one is selected.
+
+Possible formats are
+.br
+\f[I]"780"\f[], \f[I]"980"\f[], \f[I]"1280"\f[], \f[I]"1600"\f[], \f[I]"0"\f[] (original)
+.br
+
+
+.SS extractor.schalenetwork.tags
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Group \f[I]tags\f[] by type and
+provide them as \f[I]tags_<type>\f[] metadata fields,
+for example \f[I]tags_artist\f[] or \f[I]tags_character\f[].
+
+
+.SS extractor.schalenetwork.token
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Example:" 4
+.br
+* "3f1a9b72-4e4d-4f4e-9e5d-4a2b99f7c893"
+.br
+* "Bearer 3f1a9b72-4e4d-4f4e-9e5d-4a2b99f7c893"
+.br
+* "Authorization: Bearer 3f1a9b72-4e4d-4f4e-9e5d-4a2b99f7c893"
+
+.IP "Description:" 4
+\f[I]Authorization\f[] header value
+used for requests to \f[I]https://api.schale.network\f[]
+to access \f[I]favorite\f[] galleries.
+
+
 .SS extractor.sexcom.gifs
 .IP "Type:" 6
 \f[I]bool\f[]
@@ -5717,6 +5877,23 @@ Download videos.
 Download animated images as \f[I].gif\f[] instead of \f[I].webp\f[]
+.SS extractor.simpcity.order-posts
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"desc"\f[]
+
+.IP "Description:" 4
+Controls the order in which
+posts of a \f[I]thread\f[] are processed.
+
+\f[I]"asc"\f[]
+Ascending order (oldest first)
+\f[I]"desc"\f[] | \f[I]"reverse"\f[]
+Descending order (newest first)
+
+
 .SS extractor.sizebooru.metadata
 .IP "Type:" 6
 \f[I]bool\f[]
@@ -7003,6 +7180,23 @@ Note: Requires \f[I]login\f[] or \f[I]cookies\f[]
+.SS extractor.vipergirls.order-posts
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"desc"\f[]
+
+.IP "Description:" 4
+Controls the order in which
+posts of a \f[I]thread\f[] are processed.
+
+\f[I]"asc"\f[]
+Ascending order (oldest first)
+\f[I]"desc"\f[] | \f[I]"reverse"\f[]
+Descending order (newest first)
+
+
 .SS extractor.vk.offset
 .IP "Type:" 6
 \f[I]integer\f[]
@@ -9226,6 +9420,26 @@ The event(s) for which \f[I]python.function\f[] gets called.
 See \f[I]metadata.event\f[] for a list of available events.
+.SS python.expression
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Example:" 4
+.br
+* "print('Foo Bar')"
+.br
+* "terminate()"
+
+.IP "Description:" 4
+A \f[I]Python expression\f[] to \f[I]evaluate\f[].
+
+Note: Only used with \f[I]"mode": "eval"\f[]
+
+
 .SS python.function
 .IP "Type:" 6
 \f[I]string\f[]
@@ -9248,6 +9462,24 @@ This function is specified as \f[I]<module>:<function name>\f[], where
 It gets called with the current metadata dict as argument.
+.SS python.mode
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"function"\f[]
+
+.IP "Description:" 4
+Selects what Python code to run.
+
+\f[I]"eval"\f[]
+Evaluate an \f[I]expression\f[]
+\f[I]"function"\f[]
+Call a \f[I]function\f[]
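Combined with the `python.expression` option documented above, an entry for an extractor's `postprocessors` list using the new `"eval"` mode might look like this sketch (the `"expression"` value is one of the documented examples; the exact evaluation context is defined by the post processor itself):

```python
import json

# Hypothetical post-processor entry wiring up the new "eval" mode;
# "print('Foo Bar')" is taken from the python.expression examples above.
entry = {
    "name": "python",
    "mode": "eval",
    "expression": "print('Foo Bar')",
}
print(json.dumps(entry, indent=4))
```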
 .SS rename.from
 .IP "Type:" 6
 \f[I]string\f[]
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index 823fcc0..5a1b604 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -154,6 +154,11 @@
         "modules": ["image", "video", "mediacollection", "embed"]
     },
+    "bellazon":
+    {
+        "order-posts": "desc",
+        "quoted"     : false
+    },
     "bilibili":
     {
         "sleep-request": "3.0-6.0"
@@ -386,6 +391,15 @@
         "website-token": null,
         "recursive": false
     },
+    "hdoujin":
+    {
+        "crt"  : "",
+        "token": "",
+        "sleep-request": "0.5-1.5",
+
+        "format": ["0", "1600", "1280", "980", "780"],
+        "tags"  : false
+    },
     "hentaifoundry":
     {
         "descriptions": "text",
@@ -475,7 +489,11 @@
         "max-posts" : null,
         "metadata"  : true,
         "revisions" : false,
-        "order-revisions": "desc"
+        "order-revisions": "desc",
+
+        "discord": {
+            "order-posts": "asc"
+        }
     },
     "khinsider":
     {
@@ -680,11 +698,10 @@
     },
     "schalenetwork":
     {
-        "username": "",
-        "password": "",
+        "crt"  : "",
+        "token": "",
         "sleep-request": "0.5-1.5",
-        "cbz"   : true,
         "format": ["0", "1600", "1280", "980", "780"],
         "tags"  : false
     },
@@ -698,6 +715,12 @@
     {
         "gifs": true
     },
+    "simpcity":
+    {
+        "cookies": null,
+
+        "order-posts": "desc"
+    },
     "sizebooru":
     {
         "sleep-request": "0.5-1.5",
@@ -761,6 +784,10 @@
     {
         "format": ["gif", "mp4", "webm", "webp"]
     },
+    "thehentaiworld":
+    {
+        "sleep-request": "0.5-1.5"
+    },
     "tiktok":
     {
         "audio"  : true,
@@ -856,8 +883,9 @@
         "password": "",
         "sleep-request": "0.5",
-        "domain" : "viper.click",
-        "like"   : false
+        "domain"     : "viper.click",
+        "like"       : false,
+        "order-posts": "desc"
     },
     "vk":
     {
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index 6abd758..a339b24 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gallery_dl
-Version: 1.30.7
+Version: 1.30.8
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -141,9 +141,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.30.7/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.30.8/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86)
   <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.30.7/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.30.8/gallery-dl.bin>`__
 Nightly Builds
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index de3a9ed..93a6880 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -114,6 +114,7 @@ gallery_dl/extractor/girlsreleased.py
 gallery_dl/extractor/girlswithmuscle.py
 gallery_dl/extractor/gofile.py
 gallery_dl/extractor/hatenablog.py
+gallery_dl/extractor/hdoujin.py
 gallery_dl/extractor/hentai2read.py
 gallery_dl/extractor/hentaicosplays.py
 gallery_dl/extractor/hentaifoundry.py
@@ -130,6 +131,7 @@ gallery_dl/extractor/imagefap.py
 gallery_dl/extractor/imagehosts.py
 gallery_dl/extractor/imgbb.py
 gallery_dl/extractor/imgbox.py
+gallery_dl/extractor/imgpile.py
 gallery_dl/extractor/imgth.py
 gallery_dl/extractor/imgur.py
 gallery_dl/extractor/imhentai.py
@@ -161,6 +163,7 @@ gallery_dl/extractor/mangahere.py
 gallery_dl/extractor/manganelo.py
 gallery_dl/extractor/mangapark.py
 gallery_dl/extractor/mangaread.py
+gallery_dl/extractor/mangataro.py
 gallery_dl/extractor/mangoxo.py
 gallery_dl/extractor/mastodon.py
 gallery_dl/extractor/message.py
@@ -238,6 +241,7 @@ gallery_dl/extractor/tapas.py
 gallery_dl/extractor/tcbscans.py
 gallery_dl/extractor/telegraph.py
 gallery_dl/extractor/tenor.py
+gallery_dl/extractor/thehentaiworld.py
 gallery_dl/extractor/tiktok.py
 gallery_dl/extractor/tmohentai.py
 gallery_dl/extractor/toyhouse.py
diff --git a/gallery_dl/extractor/2ch.py b/gallery_dl/extractor/2ch.py
index f5bb7b7..912a251 100644
--- a/gallery_dl/extractor/2ch.py
+++ b/gallery_dl/extractor/2ch.py
@@ -4,37 +4,41 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
-"""Extractors for https://2ch.hk/"""
+"""Extractors for https://2ch.su/"""
 from .common import Extractor, Message
 from .. import text, util
+BASE_PATTERN = r"(?:https?://)?2ch\.(su|life|hk)"
+
 class _2chThreadExtractor(Extractor):
     """Extractor for 2ch threads"""
     category = "2ch"
     subcategory = "thread"
-    root = "https://2ch.hk"
+    root = "https://2ch.su"
     directory_fmt = ("{category}", "{board}", "{thread} {title}")
     filename_fmt = "{tim}{filename:? //}.{extension}"
     archive_fmt = "{board}_{thread}_{tim}"
-    pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/res/(\d+)"
-    example = "https://2ch.hk/a/res/12345.html"
+    pattern = rf"{BASE_PATTERN}/([^/?#]+)/res/(\d+)"
+    example = "https://2ch.su/a/res/12345.html"

     def __init__(self, match):
+        tld = match[1]
+        self.root = f"https://2ch.{'su' if tld == 'hk' else tld}"
         Extractor.__init__(self, match)
-        self.board, self.thread = match.groups()

     def items(self):
-        url = f"{self.root}/{self.board}/res/{self.thread}.json"
+        _, board, thread = self.groups
+        url = f"{self.root}/{board}/res/{thread}.json"
         posts = self.request_json(url)["threads"][0]["posts"]

         op = posts[0]
         title = op.get("subject") or text.remove_html(op["comment"])

         thread = {
-            "board" : self.board,
-            "thread": self.thread,
+            "board" : board,
+            "thread": thread,
             "title" : text.unescape(title)[:50],
         }
@@ -61,16 +65,17 @@ class _2chBoardExtractor(Extractor):
     """Extractor for 2ch boards"""
     category = "2ch"
     subcategory = "board"
-    root = "https://2ch.hk"
-    pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/?$"
-    example = "https://2ch.hk/a/"
+    root = "https://2ch.su"
+    pattern = rf"{BASE_PATTERN}/([^/?#]+)/?$"
+    example = "https://2ch.su/a/"

     def __init__(self, match):
+        tld = match[1]
+        self.root = f"https://2ch.{'su' if tld == 'hk' else tld}"
         Extractor.__init__(self, match)
-        self.board = match[1]

     def items(self):
-        base = f"{self.root}/{self.board}"
+        base = f"{self.root}/{self.groups[1]}"

         # index page
         url = f"{base}/index.json"
diff --git a/gallery_dl/extractor/4archive.py b/gallery_dl/extractor/4archive.py
index c9be2a4..4c43464 100644
--- a/gallery_dl/extractor/4archive.py
+++ b/gallery_dl/extractor/4archive.py
@@ -62,7 +62,8 @@ class _4archiveThreadExtractor(Extractor):
         data = {
             "name": extr('class="name">', "</span>"),
             "date": text.parse_datetime(
-                extr('class="dateTime postNum" >', "<").strip(),
+                (extr('class="dateTime">', "<") or
+                 extr('class="dateTime postNum" >', "<")).strip(),
                 "%Y-%m-%d %H:%M:%S"),
             "no"  : text.parse_int(extr(">Post No.", "<")),
         }
@@ -70,8 +71,7 @@
             extr('class="fileText"', ">File: <a")
             data.update({
                 "url"     : extr('href="', '"'),
-                "filename": extr(
-                    'rel="noreferrer noopener"', "</a>").strip()[1:],
+                "filename": extr('alt="Image: ', '"'),
                 "size"    : text.parse_bytes(extr(" (", ", ")[:-1]),
                 "width"   : text.parse_int(extr("", "x")),
                 "height"  : text.parse_int(extr("", "px")),
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index b32fcd1..abdb6cc 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -73,6 +73,7 @@ modules = [
     "girlswithmuscle",
     "gofile",
     "hatenablog",
+    "hdoujin",
     "hentai2read",
     "hentaicosplays",
     "hentaifoundry",
@@ -88,6 +89,7 @@
     "imagefap",
     "imgbb",
     "imgbox",
+    "imgpile",
     "imgth",
     "imgur",
     "imhentai",
@@ -118,6 +120,7 @@
     "manganelo",
     "mangapark",
     "mangaread",
+    "mangataro",
     "mangoxo",
     "misskey",
     "motherless",
@@ -188,6 +191,7 @@
     "tcbscans",
     "telegraph",
     "tenor",
+    "thehentaiworld",
     "tiktok",
     "tmohentai",
     "toyhouse",
diff --git a/gallery_dl/extractor/bellazon.py b/gallery_dl/extractor/bellazon.py
index 5c9b9cd..5dcb6a5 100644
--- a/gallery_dl/extractor/bellazon.py
+++ b/gallery_dl/extractor/bellazon.py
@@ -20,32 +20,61 @@ class BellazonExtractor(Extractor):
     root = "https://www.bellazon.com/main"
     directory_fmt = ("{category}", "{thread[section]}",
                      "{thread[title]} ({thread[id]})")
-    filename_fmt = "{post[id]}_{num:>02}_{id}.{extension}"
-    archive_fmt = "{post[id]}/{filename}"
+    filename_fmt = "{post[id]}_{num:>02}_{id}_{filename}.{extension}"
+    archive_fmt = "{post[id]}/{id}_{filename}"

     def items(self):
-        extract_urls = text.re(r'<a ([^>]*?href="([^"]+)".*?)</a>').findall
-        native = f"{self.root}/"
+        native = (f"{self.root}/", f"{self.root[6:]}/")
+        extract_urls = text.re(
+            r'(?s)<('
+            r'(?:video .*?<source src|a [^>]*?href)="([^"]+).*?</a>'
+            r'|img [^>]*?src="([^"]+)"[^>]*>'
+            r')'
+        ).findall
+
+        if self.config("quoted", False):
+            strip_quoted = None
+        else:
+            strip_quoted = text.re(r"(?s)<blockquote .*?</blockquote>").sub

         for post in self.posts():
-            urls = extract_urls(post["content"])
+            if strip_quoted is None:
+                urls = extract_urls(post["content"])
+            else:
+                urls = extract_urls(strip_quoted("", post["content"]))
+
             data = {"post": post}
             post["count"] = data["count"] = len(urls)
             yield Message.Directory, data
-            for data["num"], (info, url) in enumerate(urls, 1):
-                url = text.unescape(url)
+            data["num"] = 0
+            for info, url, url_img in urls:
+                url = text.unescape(url or url_img)
+
                 if url.startswith(native):
+                    if "/uploads/emoticons/" in url or "/profile/" in url:
+                        continue
+                    data["num"] += 1
                     if not (alt := text.extr(info, ' alt="', '"')) or (
                             alt.startswith("post-") and "_thumb." in alt):
                         name = url
                     else:
                         name = text.unescape(alt)
+
                     dc = text.nameext_from_url(name, data.copy())
                     dc["id"] = text.extr(info, 'data-fileid="', '"')
                     if ext := text.extr(info, 'data-fileext="', '"'):
                         dc["extension"] = ext
+                    elif "/core/interface/file/attachment.php" in url:
+                        if not dc["id"]:
+                            dc["id"] = url.rpartition("?id=")[2]
+                        if name := text.extr(info, ">", "<").strip():
+                            text.nameext_from_url(name, dc)
+
+                    if url[0] == "/":
+                        url = f"https:{url}"
                     yield Message.Url, url, dc
+
                 else:
                     yield Message.Queue, url, data
@@ -70,6 +99,28 @@ class BellazonExtractor(Extractor):
             pnum += 1
             url = f"{base}/page/{pnum}/"

+    def _pagination_reverse(self, base, pnum=None):
+        base = f"{self.root}{base}"
+
+        url = f"{base}/page/9999/"  # force redirect to highest page number
+        with self.request(url) as response:
+            parts = response.url.rsplit("/", 3)
+            pnum = text.parse_int(parts[2]) if parts[1] == "page" else 1
+            page = response.text
+
+        while True:
+            yield page
+
+            pnum -= 1
+            if pnum > 1:
+                url = f"{base}/page/{pnum}/"
+            elif pnum == 1:
+                url = f"{base}/"
+            else:
+                return
+
+            page = self.request(url).text
+
     def _parse_thread(self, page):
         schema = self._extract_jsonld(page)
         author = schema["author"]
@@ -88,7 +139,7 @@
             "posts": stats[1]["userInteractionCount"],
             "date" : text.parse_datetime(schema["datePublished"]),
             "date_updated": text.parse_datetime(schema["dateModified"]),
-            "description" : text.unescape(schema["text"]),
+            "description" : text.unescape(schema["text"]).strip(),
             "section"     : path[-2],
             "author"      : author["name"],
             "author_url"  : url_a,
@@ -123,7 +174,7 @@ class BellazonPostExtractor(BellazonExtractor):
     subcategory = "post"
     pattern = (rf"{BASE_PATTERN}(/topic/\d+-[\w-]+(?:/page/\d+)?)"
-               rf"/?#findComment-(\d+)")
+               rf"/?#(?:findC|c)omment-(\d+)")
     example = "https://www.bellazon.com/main/topic/123-SLUG/#findComment-12345"

     def posts(self):
@@ -145,10 +196,22 @@ class BellazonThreadExtractor(BellazonExtractor):
     example = "https://www.bellazon.com/main/topic/123-SLUG/"

     def posts(self):
-        for page in self._pagination(*self.groups):
+        if (order := self.config("order-posts")) and \
+                order[0] not in ("d", "r"):
+            pages = self._pagination(*self.groups)
+            reverse = False
+        else:
+            pages = self._pagination_reverse(*self.groups)
+            reverse = True
+
+        for page in pages:
             if "thread" not in self.kwdict:
                 self.kwdict["thread"] = self._parse_thread(page)
-            for html in text.extract_iter(page, "<article ", "</article>"):
+
+            posts = text.extract_iter(page, "<article ", "</article>")
+            if reverse:
+                posts = list(posts)
+                posts.reverse()
+            for html in posts:
                 yield self._parse_post(html)
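The effect of the new `quoted` option is easiest to see in isolation: with the default `quoted = false`, `<blockquote>` sections are removed before any URLs are collected. A sketch with plain `re` and made-up example URLs:

```python
import re

# Mirrors the strip_quoted substitution above: quoted content is
# dropped, so only the post's own links survive URL extraction.
strip_quoted = re.compile(r"(?s)<blockquote .*?</blockquote>").sub

content = (
    '<blockquote class="ipsQuote"><a href="https://example.org/quoted.jpg">q</a>'
    "</blockquote>"
    '<a href="https://example.org/own.jpg">mine</a>'
)
print(strip_quoted("", content))
# -> '<a href="https://example.org/own.jpg">mine</a>'
```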
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index cf5bce1..14ebc48 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -162,7 +162,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
                 file["name"] = util.json_loads(text.extr(
                     item, 'original:', ',\n').replace("\\'", "'"))
                 file["slug"] = util.json_loads(text.extr(
-                    item, 'slug: ', ',\n'))
+                    item, 'slug: ', ',\n').replace("\\'", "'"))
                 file["uuid"] = text.extr(
                     item, 'name: "', ".")
                 file["size"] = text.parse_int(text.extr(
diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py
index 6ba4d08..67fdb39 100644
--- a/gallery_dl/extractor/chevereto.py
+++ b/gallery_dl/extractor/chevereto.py
@@ -50,6 +50,10 @@ BASE_PATTERN = CheveretoExtractor.update({
         "root": "https://imagepond.net",
         "pattern": r"imagepond\.net",
     },
+    "imglike": {
+        "root": "https://imglike.com",
+        "pattern": r"imglike\.com",
+    },
 })
@@ -152,6 +156,18 @@ class CheveretoAlbumExtractor(CheveretoExtractor):
             yield Message.Queue, image, data

+class CheveretoCategoryExtractor(CheveretoExtractor):
+    """Extractor for chevereto categories"""
+    subcategory = "category"
+    pattern = BASE_PATTERN + r"(/category/[^/?#]+)"
+    example = "https://imglike.com/category/TITLE"
+
+    def items(self):
+        data = {"_extractor": CheveretoImageExtractor}
+        for image in self._pagination(self.root + self.path):
+            yield Message.Queue, image, data
+
+
 class CheveretoUserExtractor(CheveretoExtractor):
     """Extractor for chevereto users"""
     subcategory = "user"
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index f8ad07a..29c7763 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -278,6 +278,23 @@ class DanbooruTagExtractor(DanbooruExtractor):
         return self._pagination("/posts.json", {"tags": self.tags}, prefix)

+class DanbooruRandomExtractor(DanbooruTagExtractor):
+    """Extractor for a random danbooru post"""
+    subcategory = "random"
+    pattern = BASE_PATTERN + r"/posts/random(?:\?(?:[^&#]*&)*tags=([^&#]*))?"
+    example = "https://danbooru.donmai.us/posts/random?tags=TAG"
+
+    def metadata(self):
+        tags = self.groups[-1] or ""
+        self.tags = text.unquote(tags.replace("+", " "))
+        return {"search_tags": self.tags}
+
+    def posts(self):
+        posts = self.request_json(self.root + "/posts/random.json",
+                                  params={"tags": self.tags or None})
+        return (posts,) if isinstance(posts, dict) else posts
+
+
 class DanbooruPoolExtractor(DanbooruExtractor):
     """Extractor for Danbooru pools"""
     subcategory = "pool"
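The random extractor's `metadata()` turns the captured `tags=` query value back into a plain search string; the same transformation with the stdlib in place of gallery-dl's `text.unquote`:

```python
from urllib.parse import unquote

# '+' becomes a space, percent-escapes are decoded — exactly what
# DanbooruRandomExtractor stores as "search_tags".
for raw in ("holo+spice", "long_hair%20blue_eyes", ""):
    print(repr(unquote(raw.replace("+", " "))))
# 'holo spice'
# 'long_hair blue_eyes'
# ''
```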
diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py
index bf24941..6061737 100644
--- a/gallery_dl/extractor/facebook.py
+++ b/gallery_dl/extractor/facebook.py
@@ -369,6 +369,16 @@ class FacebookExtractor(Extractor):
                 for edge in (user["profile_tabs"]["profile_user"]
                              ["timeline_nav_app_sections"]["edges"])
             ]
+
+            if bio := text.extr(page, '"best_description":{"text":"', '"'):
+                user["biography"] = self.decode_all(bio)
+            elif (pos := page.find(
+                    '"__module_operation_ProfileCometTileView_profileT')) >= 0:
+                user["biography"] = self.decode_all(text.rextr(
+                    page, '"text":"', '"', pos))
+            else:
+                user["biography"] = text.unescape(text.remove_html(text.extr(
+                    page, "</span></span></h2>", "<ul>")))
         except Exception:
             if user is None:
                 self.log.debug("Failed to extract user data: %s", data)
diff --git a/gallery_dl/extractor/hdoujin.py b/gallery_dl/extractor/hdoujin.py
new file mode 100644
index 0000000..080b899
--- /dev/null
+++ b/gallery_dl/extractor/hdoujin.py
@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hdoujin.org/"""
+
+from . import schalenetwork
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?(hdoujin\.(?:org|net))"
+
+
+class HdoujinBase():
+    """Base class for hdoujin extractors"""
+    category = "hdoujin"
+    root = "https://hdoujin.org"
+    root_api = "https://api.hdoujin.org"
+    root_auth = "https://auth.hdoujin.org"
+
+
+class HdoujinGalleryExtractor(
+        HdoujinBase, schalenetwork.SchalenetworkGalleryExtractor):
+    pattern = rf"{BASE_PATTERN}/(?:g|reader)/(\d+)/(\w+)"
+    example = "https://hdoujin.org/g/12345/67890abcdef/"
+
+
+class HdoujinSearchExtractor(
+        HdoujinBase, schalenetwork.SchalenetworkSearchExtractor):
+    pattern = rf"{BASE_PATTERN}/(?:tag/([^/?#]+)|browse)?(?:/?\?([^#]*))?$"
+    example = "https://hdoujin.org/browse?s=QUERY"
+
+
+class HdoujinFavoriteExtractor(
+        HdoujinBase, schalenetwork.SchalenetworkFavoriteExtractor):
+    pattern = rf"{BASE_PATTERN}/favorites(?:\?([^#]*))?"
+    example = "https://hdoujin.org/favorites"
+
+
+HdoujinBase.extr_class = HdoujinGalleryExtractor
diff --git a/gallery_dl/extractor/imgpile.py b/gallery_dl/extractor/imgpile.py
new file mode 100644
index 0000000..9fc3a9c
--- /dev/null
+++ b/gallery_dl/extractor/imgpile.py
@@ -0,0 +1,119 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://imgpile.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?imgpile\.com"
+
+
+class ImgpileExtractor(Extractor):
+    """Base class for imgpile extractors"""
+    category = "imgpile"
+    root = "https://imgpile.com"
+    directory_fmt = ("{category}", "{post[author]}",
+                     "{post[title]} ({post[id_slug]})")
+    archive_fmt = "{post[id_slug]}_{id}"
+
+    def items(self):
+        pass
+
+
+class ImgpilePostExtractor(ImgpileExtractor):
+    subcategory = "post"
+    pattern = rf"{BASE_PATTERN}/p/(\w+)"
+    example = "https://imgpile.com/p/AbCdEfG"
+
+    def items(self):
+        post_id = self.groups[0]
+        url = f"{self.root}/p/{post_id}"
+        page = self.request(url).text
+        extr = text.extract_from(page)
+
+        post = {
+            "id_slug": post_id,
+            "title"  : text.unescape(extr("<title>", " - imgpile<")),
+            "id"     : text.parse_int(extr('data-post-id="', '"')),
+            "author" : extr('/u/', '"'),
+            "score"  : text.parse_int(text.remove_html(extr(
+                'class="post-score">', "</"))),
+            "views"  : text.parse_int(extr(
+                'class="meta-value">', "<").replace(",", "")),
+            "tags"   : text.split_html(extr(
+                " <!-- Tags -->", '<!-- "')),
+        }
+
+        files = self._extract_files(extr)
+        data = {"post": post}
+        data["count"] = post["count"] = len(files)
+
+        yield Message.Directory, data
+        for data["num"], file in enumerate(files, 1):
+            data.update(file)
+            url = file["url"]
+            yield Message.Url, url, text.nameext_from_url(url, data)
+
+    def _extract_files(self, extr):
+        files = []
+
+        while True:
+            media = extr('lass="post-media', '</div>')
+            if not media:
+                break
+            files.append({
+                "id_slug": text.extr(media, 'data-id="', '"'),
+                "id"     : text.parse_int(text.extr(
+                    media, 'data-media-id="', '"')),
+                "url": f"""http{text.extr(media, '<a href="http', '"')}""",
+            })
+        return files
+
+
+class ImgpileUserExtractor(ImgpileExtractor):
+    subcategory = "user"
+    pattern = rf"{BASE_PATTERN}/u/([^/?#]+)"
+    example = "https://imgpile.com/u/USER"
+
+    def items(self):
+        url = f"{self.root}/api/v1/posts"
+        params = {
+            "limit"     : "100",
+            "sort"      : "latest",
+            "period"    : "all",
+            "visibility": "public",
+            # "moderation_status": "approved",
+            "username"  : self.groups[0],
+        }
+        headers = {
+            "Accept"        : "application/json",
+            # "Referer"     : "https://imgpile.com/u/USER",
+            "Content-Type"  : "application/json",
+            # "X-CSRF-TOKEN": "",
+            "Sec-Fetch-Dest": "empty",
+            "Sec-Fetch-Mode": "cors",
+            "Sec-Fetch-Site": "same-origin",
+        }
+
+        base = f"{self.root}/p/"
+        while True:
+            data = self.request_json(url, params=params, headers=headers)
+
+            if params is not None:
+                params = None
+                self.kwdict["total"] = data["meta"]["total"]
+
+            for item in data["data"]:
+                item["_extractor"] = ImgpilePostExtractor
+                url = f"{base}{item['slug']}"
+                yield Message.Queue, url, item
+
+            url = data["links"].get("next")
+            if not url:
+                return
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 9b8f8c9..00e06b5 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -95,7 +95,7 @@ class InstagramExtractor(Extractor):
             if videos:
                 file["_http_headers"] = videos_headers
                 text.nameext_from_url(url, file)
-                if videos_dash:
+                if videos_dash and "_ytdl_manifest_data" in post:
                     file["_fallback"] = (url,)
                     file["_ytdl_manifest"] = "dash"
                     url = f"ytdl:{post['post_url']}{file['num']}.mp4"
@@ -505,10 +505,12 @@ class InstagramTaggedExtractor(InstagramExtractor):
     def metadata(self):
         if self.item.startswith("id:"):
             self.user_id = self.item[3:]
-            return {"tagged_owner_id": self.user_id}
-
-        self.user_id = self.api.user_id(self.item)
-        user = self.api.user_by_name(self.item)
+            if not self.config("metadata"):
+                return {"tagged_owner_id": self.user_id}
+            user = self.api.user_by_id(self.user_id)
+        else:
+            self.user_id = self.api.user_id(self.item)
+            user = self.api.user_by_name(self.item)

         return {
             "tagged_owner_id" : user["id"],
diff --git a/gallery_dl/extractor/iwara.py b/gallery_dl/extractor/iwara.py
index 179909b..8af2f42 100644
--- a/gallery_dl/extractor/iwara.py
+++ b/gallery_dl/extractor/iwara.py
@@ -45,6 +45,7 @@ class IwaraExtractor(Extractor):
                         image["id"], exc.__class__.__name__, exc)
                     continue

+            group_info["type"] = "image"
             group_info["count"] = len(files)
             yield Message.Directory, group_info
             for num, file in enumerate(files, 1):
@@ -102,34 +103,37 @@
         raise exception.AbortExtraction(f"Unsupported result type '{type}'")

     def extract_media_info(self, item, key, include_file_info=True):
-        title = t.strip() if (t := item.get("title")) else ""
+        info = {
+            "id"      : item["id"],
+            "slug"    : item.get("slug"),
+            "rating"  : item.get("rating"),
+            "likes"   : item.get("numLikes"),
+            "views"   : item.get("numViews"),
+            "comments": item.get("numComments"),
+            "tags"    : [t["id"] for t in item.get("tags") or ()],
+            "title"   : t.strip() if (t := item.get("title")) else "",
+            "description": t.strip() if (t := item.get("body")) else "",
+        }

         if include_file_info:
             file_info = item if key is None else item.get(key) or {}
             filename, _, extension = file_info.get("name", "").rpartition(".")
-            return {
-                "id"       : item["id"],
-                "file_id"  : file_info.get("id"),
-                "title"    : title,
-                "filename" : filename,
-                "extension": extension,
-                "date"     : text.parse_datetime(
-                    file_info.get("createdAt"), "%Y-%m-%dT%H:%M:%S.%fZ"),
-                "date_updated": text.parse_datetime(
-                    file_info.get("updatedAt"), "%Y-%m-%dT%H:%M:%S.%fZ"),
-                "mime"     : file_info.get("mime"),
-                "size"     : file_info.get("size"),
-                "width"    : file_info.get("width"),
-                "height"   : file_info.get("height"),
-                "duration" : file_info.get("duration"),
-                "type"     : file_info.get("type"),
-            }
-        else:
-            return {
-                "id"   : item["id"],
-                "title": title,
-            }
+            info["file_id"] = file_info.get("id")
+            info["filename"] = filename
+            info["extension"] = extension
+            info["date"] = text.parse_datetime(
+                file_info.get("createdAt"), "%Y-%m-%dT%H:%M:%S.%fZ")
+            info["date_updated"] = text.parse_datetime(
+                file_info.get("updatedAt"), "%Y-%m-%dT%H:%M:%S.%fZ")
+            info["mime"] = file_info.get("mime")
+            info["size"] = file_info.get("size")
+            info["width"] = file_info.get("width")
+            info["height"] = file_info.get("height")
+            info["duration"] = file_info.get("duration")
+            info["type"] = file_info.get("type")
+
+        return info

     def extract_user_info(self, profile):
         user = profile.get("user") or {}
diff --git a/gallery_dl/extractor/kemono.py b/gallery_dl/extractor/kemono.py
index fc5972c..1f70031 100644
--- a/gallery_dl/extractor/kemono.py
+++ b/gallery_dl/extractor/kemono.py
@@ -407,7 +407,11 @@ class KemonoDiscordExtractor(KemonoExtractor):
             r"(/[A-Za-z0-9-._~:/?#\[\]@!$&'()*+,;%=]+)").findall
         find_hash = util.re(HASH_PATTERN).match

-        posts = self.api.discord_channel(channel_id)
+        if (order := self.config("order-posts")) and order[0] in ("r", "d"):
+            posts = self.api.discord_channel(channel_id, channel["post_count"])
+        else:
+            posts = self.api.discord_channel(channel_id)
+
         if max_posts := self.config("max-posts"):
             posts = itertools.islice(posts, max_posts)
@@ -627,9 +631,12 @@ class KemonoAPI():
         endpoint = f"/{service}/user/{creator_id}/tags"
         return self._call(endpoint)

-    def discord_channel(self, channel_id):
+    def discord_channel(self, channel_id, post_count=None):
         endpoint = f"/discord/channel/{channel_id}"
-        return self._pagination(endpoint, {}, 150)
+        if post_count is None:
+            return self._pagination(endpoint, {}, 150)
+        else:
+            return self._pagination_reverse(endpoint, {}, 150, post_count)

     def discord_channel_lookup(self, server_id):
         endpoint = f"/discord/channel/lookup/{server_id}"
@@ -670,3 +677,18 @@
             if len(data) < batch:
                 return
             params["o"] += batch
+
+    def _pagination_reverse(self, endpoint, params, batch, count):
+        params["o"] = count // batch * batch
+
+        while True:
+            data = self._call(endpoint, params)
+
+            if not data:
+                return
+            data.reverse()
+            yield from data
+
+            if not params["o"]:
+                return
+            params["o"] -= batch
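`_pagination_reverse` starts at the highest batch-aligned offset and steps back to zero, reversing each page so posts come out newest-first overall. A quick standalone check of the offset arithmetic, using the 150-post batch size from above (in the real code an empty page also terminates the loop early):

```python
def reverse_offsets(count, batch=150):
    """Offsets visited by _pagination_reverse, highest page first."""
    o = count // batch * batch
    while True:
        yield o
        if not o:
            return
        o -= batch

print(list(reverse_offsets(312)))  # [300, 150, 0]
print(list(reverse_offsets(40)))   # [0]
```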
diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py
index c700a29..b0198d5 100644
--- a/gallery_dl/extractor/lensdump.py
+++ b/gallery_dl/extractor/lensdump.py
@@ -100,7 +100,8 @@ class LensdumpImageExtractor(LensdumpBase, Extractor):
     filename_fmt = "{category}_{id}{title:?_//}.{extension}"
     directory_fmt = ("{category}",)
     archive_fmt = "{id}"
-    pattern = r"(?:https?://)?(?:(?:i\d?\.)?lensdump\.com|\w\.l3n\.co)/i/(\w+)"
+    pattern = (r"(?:https?://)?(?:(?:i\d?\.)?lensdump\.com|\w\.l3n\.co)"
+               r"/(?:i/)?(\w+)")
     example = "https://lensdump.com/i/ID"

     def items(self):
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index 225560d..fbed328 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -96,6 +96,57 @@ class MangadexExtractor(Extractor):
         return data

+class MangadexCoversExtractor(MangadexExtractor):
+    """Extractor for mangadex manga covers"""
+    subcategory = "covers"
+    directory_fmt = ("{category}", "{manga}", "Covers")
+    filename_fmt = "{volume:>02}_{lang}.{extension}"
+    archive_fmt = "c_{cover_id}"
+    pattern = (rf"{BASE_PATTERN}/(?:title|manga)/(?!follows|feed$)([0-9a-f-]+)"
+               r"(?:/[^/?#]+)?\?tab=art")
+    example = ("https://mangadex.org/title"
+               "/01234567-89ab-cdef-0123-456789abcdef?tab=art")
+
+    def items(self):
+        base = f"{self.root}/covers/{self.uuid}/"
+        for cover in self.api.covers_manga(self.uuid):
+            data = self._transform_cover(cover)
+            name = data["cover"]
+            text.nameext_from_url(name, data)
+            data["cover_id"] = data["filename"]
+            yield Message.Directory, data
+            yield Message.Url, f"{base}{name}", data
+
+    def _transform_cover(self, cover):
+        relationships = defaultdict(list)
+        for item in cover["relationships"]:
+            relationships[item["type"]].append(item)
+        manga = self.api.manga(relationships["manga"][0]["id"])
+        for item in manga["relationships"]:
+            relationships[item["type"]].append(item)
+
+        cattributes = cover["attributes"]
+        mattributes = manga["attributes"]
+
+        return {
+            "manga"   : (mattributes["title"].get("en") or
+                         next(iter(mattributes["title"].values()))),
+            "manga_id": manga["id"],
+            "status"  : mattributes["status"],
+            "author"  : [author["attributes"]["name"]
+                         for author in relationships["author"]],
+            "artist"  : [artist["attributes"]["name"]
+                         for artist in relationships["artist"]],
+            "tags"    : [tag["attributes"]["name"]["en"]
+                         for tag in mattributes["tags"]],
+            "cover"   : cattributes["fileName"],
+            "lang"    : cattributes.get("locale"),
+            "volume"  : text.parse_int(cattributes["volume"]),
+            "date"    : text.parse_datetime(cattributes["createdAt"]),
+            "date_updated": text.parse_datetime(cattributes["updatedAt"]),
+        }
+
+
 class MangadexChapterExtractor(MangadexExtractor):
     """Extractor for manga-chapters from mangadex.org"""
     subcategory = "chapter"
@@ -239,6 +290,10 @@ class MangadexAPI():
         params = {"includes[]": ("scanlation_group",)}
         return self._call("/chapter/" + uuid, params)["data"]

+    def covers_manga(self, uuid):
+        params = {"manga[]": uuid}
+        return self._pagination_covers("/cover", params)
+
     def list(self, uuid):
         return self._call("/list/" + uuid, None, True)["data"]
@@ -374,6 +429,20 @@
         return self._pagination(endpoint, params, auth)

+    def _pagination_covers(self, endpoint, params=None, auth=False):
+        if params is None:
+            params = {}
+
+        lang = self.extractor.config("lang")
+        if isinstance(lang, str) and "," in lang:
+            lang = lang.split(",")
+        params["locales"] = lang
+        params["contentRating"] = None
+        params["order[volume]"] = \
+            "desc" if self.extractor.config("chapter-reverse") else "asc"
+
+        return self._pagination(endpoint, params, auth)
+
     def _pagination(self, endpoint, params, auth=False):
         config = self.extractor.config
diff --git a/gallery_dl/extractor/mangataro.py b/gallery_dl/extractor/mangataro.py
new file mode 100644
index 0000000..f4cc058
--- /dev/null
+++ b/gallery_dl/extractor/mangataro.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://mangataro.org/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text
+from ..cache import memcache
+
+BASE_PATTERN = r"(?:https?://)?mangataro\.org"
+
+
+class MangataroBase():
+    """Base class for mangataro extractors"""
+    category = "mangataro"
+    root = "https://mangataro.org"
+
+
+class MangataroChapterExtractor(MangataroBase, ChapterExtractor):
+    """Extractor for mangataro manga chapters"""
+    pattern = rf"{BASE_PATTERN}(/read/([^/?#]+)/(?:[^/?#]*-)?(\d+))"
+    example = "https://mangataro.org/read/MANGA/ch123-12345"
+
+    def metadata(self, page):
+        _, slug, chapter_id = self.groups
+        comic = self._extract_jsonld(page)["@graph"][0]
+        chapter = comic["position"]
+        minor = chapter - int(chapter)
+        desc = comic["description"].split(" - ", 3)
+
+        return {
+            **_manga_info(self, slug),
+            "title"        : desc[1] if len(desc) > 3 else "",
+            "chapter"      : int(chapter),
+            "chapter_minor": str(round(minor, 5))[1:] if minor else "",
+            "chapter_id"   : text.parse_int(chapter_id),
+            "chapter_url"  : comic["url"],
+            "date"         : text.parse_datetime(
+                comic["datePublished"], "%Y-%m-%dT%H:%M:%S%z"),
+            "date_updated" : text.parse_datetime(
+                comic["dateModified"], "%Y-%m-%dT%H:%M:%S%z"),
+        }
+
+    def images(self, page):
+        pos = page.find('class="comic-image-container')
+        img, pos = text.extract(page, ' src="', '"', pos)
+
+        images = [(img, None)]
+        images.extend(
+            (url, None)
+            for url in text.extract_iter(page, 'data-src="', '"', pos)
+        )
+        return images
+
+
+class MangataroMangaExtractor(MangataroBase, MangaExtractor):
+    """Extractor for mangataro manga"""
+    chapterclass = MangataroChapterExtractor
+    pattern = rf"{BASE_PATTERN}(/manga/([^/?#]+))"
+    example = "https://mangataro.org/manga/MANGA"
+
+    def chapters(self, page):
+        slug = self.groups[1]
+        manga = _manga_info(self, slug)
+
+        results = []
+        for url in text.extract_iter(text.extr(
+                page, '<div class="chapter-list', '<div id="tab-gallery"'),
+                '<a href="', '"'):
+            chapter, _, chapter_id = url[url.rfind("/")+3:].rpartition("-")
+            chapter, sep, minor = chapter.partition("-")
+            results.append((url, {
+                **manga,
+                "chapter"      : text.parse_int(chapter),
+                "chapter_minor": f".{minor}" if sep else "",
+                "chapter_id"   : text.parse_int(chapter_id),
+            }))
+        return results
+
+
+@memcache(keyarg=1)
+def _manga_info(self, slug):
+    url = f"{self.root}/manga/{slug}"
+    page = self.request(url).text
+    manga = self._extract_jsonld(page)
+
+    return {
+        "manga"      : manga["name"].rpartition(" | ")[0].rpartition(" ")[0],
+        "manga_url"  : manga["url"],
+        "cover"      : manga["image"],
+        "author"     : manga["author"]["name"].split(", "),
+        "genre"      : manga["genre"],
+        "status"     : manga["status"],
+        "description": text.unescape(text.extr(
+            page, 'id="description-content-tab">', "</div></div>")),
+        "tags"       : text.split_html(text.extr(
+            page, ">Genres</h4>", "</div>")),
+        "publisher"  : text.remove_html(text.extr(
+            page, '>Serialization</h4>', "</div>")),
+    }
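The chapter metadata above splits a fractional JSON-LD `position` value into an integer `chapter` and a `.5`-style `chapter_minor` suffix; a quick check of that arithmetic:

```python
# How MangataroChapterExtractor derives chapter/chapter_minor from
# the JSON-LD "position" value (e.g. 12, 12.5, 103.25).
for position in (12, 12.5, 103.25):
    minor = position - int(position)
    chapter = int(position)
    chapter_minor = str(round(minor, 5))[1:] if minor else ""
    print(chapter, repr(chapter_minor))
# 12 ''
# 12 '.5'
# 103 '.25'
```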
def pins(self): - return (self.pin,) - - -class PinterestBoardExtractor(PinterestExtractor): - """Extractor for images from a board from pinterest.com""" - subcategory = "board" - directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}") - archive_fmt = "{board[id]}_{id}" - pattern = (BASE_PATTERN + r"/(?!pin/)([^/?#]+)" - r"/(?!_saved|_created|pins/)([^/?#]+)/?(?:$|\?|#)") - example = "https://www.pinterest.com/USER/BOARD/" - - def __init__(self, match): - PinterestExtractor.__init__(self, match) - self.user = text.unquote(match[1]) - self.board_name = text.unquote(match[2]) - self.board = None - - def metadata(self): - self.board = self.api.board(self.user, self.board_name) - return {"board": self.board} - - def pins(self): - board = self.board - pins = self.api.board_pins(board["id"]) - - if board["section_count"] and self.config("sections", True): - base = f"{self.root}{board['url']}id:" - data = {"_extractor": PinterestSectionExtractor} - sections = [(base + section["id"], data) - for section in self.api.board_sections(board["id"])] - pins = itertools.chain(pins, sections) - - return pins - - class PinterestUserExtractor(PinterestExtractor): """Extractor for a user's boards""" subcategory = "user" @@ -357,6 +305,58 @@ class PinterestSearchExtractor(PinterestExtractor): return self.api.search(self.search) +class PinterestPinExtractor(PinterestExtractor): + """Extractor for images from a single pin from pinterest.com""" + subcategory = "pin" + pattern = BASE_PATTERN + r"/pin/([^/?#]+)(?!.*#related$)" + example = "https://www.pinterest.com/pin/12345/" + + def __init__(self, match): + PinterestExtractor.__init__(self, match) + self.pin_id = match[1] + self.pin = None + + def metadata(self): + self.pin = self.api.pin(self.pin_id) + return self.pin + + def pins(self): + return (self.pin,) + + +class PinterestBoardExtractor(PinterestExtractor): + """Extractor for images from a board from pinterest.com""" + subcategory = "board" + directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}") + archive_fmt = "{board[id]}_{id}" + pattern = (BASE_PATTERN + r"/(?!pin/)([^/?#]+)" + r"/([^/?#]+)/?(?!.*#related$)") + example = "https://www.pinterest.com/USER/BOARD/" + + def __init__(self, match): + PinterestExtractor.__init__(self, match) + self.user = text.unquote(match[1]) + self.board_name = text.unquote(match[2]) + self.board = None + + def metadata(self): + self.board = self.api.board(self.user, self.board_name) + return {"board": self.board} + + def pins(self): + board = self.board + pins = self.api.board_pins(board["id"]) + + if board["section_count"] and self.config("sections", True): + base = f"{self.root}{board['url']}id:" + data = {"_extractor": PinterestSectionExtractor} + sections = [(base + section["id"], data) + for section in self.api.board_sections(board["id"])] + pins = itertools.chain(pins, sections) + + return pins + + class PinterestRelatedPinExtractor(PinterestPinExtractor): """Extractor for related pins of another pin from pinterest.com""" subcategory = "related-pin" diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 9febda9..e20d80e 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -56,6 +56,7 @@ class RedditExtractor(Extractor): urls = [] if submission: + submission["comment"] = None submission["date"] = text.parse_timestamp( submission["created_utc"]) yield Message.Directory, submission @@ -99,14 +100,13 @@ class RedditExtractor(Extractor): elif not submission["is_self"]: 
urls.append((url, submission)) + if selftext and (txt := submission["selftext_html"]): + for url in text.extract_iter(txt, ' href="', '"'): + urls.append((url, submission)) + elif parentdir: yield Message.Directory, comments[0] - if selftext and submission: - for url in text.extract_iter( - submission["selftext_html"] or "", ' href="', '"'): - urls.append((url, submission)) - if self.api.comments: if comments and not submission: submission = comments[0] @@ -115,24 +115,24 @@ class RedditExtractor(Extractor): yield Message.Directory, submission for comment in comments: + media = (embeds and "media_metadata" in comment) html = comment["body_html"] or "" href = (' href="' in html) - media = (embeds and "media_metadata" in comment) - if media or href: - comment["date"] = text.parse_timestamp( - comment["created_utc"]) - if submission: - data = submission.copy() - data["comment"] = comment - else: - data = comment + if not media and not href: + continue + + data = submission.copy() + data["comment"] = comment + comment["date"] = text.parse_timestamp( + comment["created_utc"]) if media: - for embed in self._extract_embed(comment): - submission["num"] += 1 - text.nameext_from_url(embed, submission) - yield Message.Url, embed, submission + for url in self._extract_embed(comment): + data["num"] += 1 + text.nameext_from_url(url, data) + yield Message.Url, url, data + submission["num"] = data["num"] if href: for url in text.extract_iter(html, ' href="', '"'): diff --git a/gallery_dl/extractor/schalenetwork.py b/gallery_dl/extractor/schalenetwork.py index d517287..dc42417 100644 --- a/gallery_dl/extractor/schalenetwork.py +++ b/gallery_dl/extractor/schalenetwork.py @@ -10,7 +10,6 @@ from .common import GalleryExtractor, Extractor, Message from .. import text, exception -from ..cache import cache import collections BASE_PATTERN = ( @@ -27,6 +26,8 @@ class SchalenetworkExtractor(Extractor): category = "schalenetwork" root = "https://niyaniya.moe" root_api = "https://api.schale.network" + root_auth = "https://auth.schale.network" + extr_class = None request_interval = (0.5, 1.5) def _init(self): @@ -38,6 +39,7 @@ class SchalenetworkExtractor(Extractor): def _pagination(self, endpoint, params): url_api = self.root_api + endpoint + cls = self.extr_class while True: data = self.request_json( @@ -49,8 +51,8 @@ class SchalenetworkExtractor(Extractor): return for entry in entries: - url = f"{self.root}/g/{entry['id']}/{entry['public_key']}" - entry["_extractor"] = SchalenetworkGalleryExtractor + url = f"{self.root}/g/{entry['id']}/{entry['key']}" + entry["_extractor"] = cls yield Message.Queue, url, entry try: @@ -60,6 +62,34 @@ class SchalenetworkExtractor(Extractor): pass params["page"] += 1 + def _token(self): + if token := self.config("token"): + return f"Bearer {token.rpartition(' ')[2]}" + raise exception.AuthRequired("'token'", "your favorites") + + def _crt(self): + crt = self.config("crt") + if not crt: + self._require_auth() + + if not text.re(r"^[0-9a-f-]+$").match(crt): + path, _, qs = crt.partition("?") + if not qs: + qs = path + crt = text.parse_query(qs).get("crt") + if not crt: + self._require_auth() + + return crt + + def _require_auth(self, exc=None): + if exc is None: + msg = None + else: + msg = f"{exc.status} {exc.response.reason}" + raise exception.AuthRequired( + "'crt' query parameter & matching '--user-agent'", None, msg) + class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor): """Extractor for schale.network galleries""" @@ -67,7 +97,7 @@ class 
diff --git a/gallery_dl/extractor/schalenetwork.py b/gallery_dl/extractor/schalenetwork.py
index d517287..dc42417 100644
--- a/gallery_dl/extractor/schalenetwork.py
+++ b/gallery_dl/extractor/schalenetwork.py
@@ -10,7 +10,6 @@
 
 from .common import GalleryExtractor, Extractor, Message
 from .. import text, exception
-from ..cache import cache
 import collections
 
 BASE_PATTERN = (
@@ -27,6 +26,8 @@ class SchalenetworkExtractor(Extractor):
     category = "schalenetwork"
     root = "https://niyaniya.moe"
     root_api = "https://api.schale.network"
+    root_auth = "https://auth.schale.network"
+    extr_class = None
     request_interval = (0.5, 1.5)
 
     def _init(self):
@@ -38,6 +39,7 @@ class SchalenetworkExtractor(Extractor):
     def _pagination(self, endpoint, params):
         url_api = self.root_api + endpoint
+        cls = self.extr_class
 
         while True:
             data = self.request_json(
@@ -49,8 +51,8 @@ class SchalenetworkExtractor(Extractor):
                 return
 
             for entry in entries:
-                url = f"{self.root}/g/{entry['id']}/{entry['public_key']}"
-                entry["_extractor"] = SchalenetworkGalleryExtractor
+                url = f"{self.root}/g/{entry['id']}/{entry['key']}"
+                entry["_extractor"] = cls
                 yield Message.Queue, url, entry
 
             try:
@@ -60,6 +62,34 @@ class SchalenetworkExtractor(Extractor):
                 pass
             params["page"] += 1
 
+    def _token(self):
+        if token := self.config("token"):
+            return f"Bearer {token.rpartition(' ')[2]}"
+        raise exception.AuthRequired("'token'", "your favorites")
+
+    def _crt(self):
+        crt = self.config("crt")
+        if not crt:
+            self._require_auth()
+
+        if not text.re(r"^[0-9a-f-]+$").match(crt):
+            path, _, qs = crt.partition("?")
+            if not qs:
+                qs = path
+            crt = text.parse_query(qs).get("crt")
+            if not crt:
+                self._require_auth()
+
+        return crt
+
+    def _require_auth(self, exc=None):
+        if exc is None:
+            msg = None
+        else:
+            msg = f"{exc.status} {exc.response.reason}"
+        raise exception.AuthRequired(
+            "'crt' query parameter & matching '--user-agent'", None, msg)
+
 
 class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
     """Extractor for schale.network galleries"""
@@ -67,7 +97,7 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
     directory_fmt = ("{category}", "{id} {title}")
     archive_fmt = "{id}_{num}"
     request_interval = 0.0
-    pattern = BASE_PATTERN + r"/(?:g|reader)/(\d+)/(\w+)"
+    pattern = rf"{BASE_PATTERN}/(?:g|reader)/(\d+)/(\w+)"
     example = "https://niyaniya.moe/g/12345/67890abcde/"
 
     TAG_TYPES = {
@@ -86,27 +116,10 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
         12: "other",
     }
 
-    def __init__(self, match):
-        GalleryExtractor.__init__(self, match)
-        self.page_url = None
-
-    def _init(self):
-        self.headers = {
-            "Accept" : "*/*",
-            "Referer": self.root + "/",
-            "Origin" : self.root,
-        }
-
-        self.fmt = self.config("format")
-        self.cbz = self.config("cbz", True)
-
-        if self.cbz:
-            self.filename_fmt = "{id} {title}.{extension}"
-            self.directory_fmt = ("{category}",)
-
     def metadata(self, _):
-        url = f"{self.root_api}/books/detail/{self.groups[1]}/{self.groups[2]}"
-        self.data = data = self.request_json(url, headers=self.headers)
+        _, gid, gkey = self.groups
+        url = f"{self.root_api}/books/detail/{gid}/{gkey}"
+        data = self.request_json(url, headers=self.headers)
         data["date"] = text.parse_timestamp(data["created_at"] // 1000)
 
         tags = []
@@ -127,53 +140,42 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
             data["tags_" + types[type]] = values
 
         try:
-            if self.cbz:
-                data["count"] = len(data["thumbnails"]["entries"])
+            data["count"] = len(data["thumbnails"]["entries"])
             del data["thumbnails"]
-            del data["rels"]
         except Exception:
             pass
 
         return data
 
     def images(self, _):
-        data = self.data
-        fmt = self._select_format(data["data"])
+        crt = self._crt()
+        _, gid, gkey = self.groups
+        url = f"{self.root_api}/books/detail/{gid}/{gkey}?crt={crt}"
+        try:
+            data = self.request_json(url, method="POST", headers=self.headers)
+        except exception.HttpError as exc:
+            self._require_auth(exc)
 
-        url = (f"{self.root_api}/books/data/{data['id']}/"
-               f"{data['public_key']}/{fmt['id']}/{fmt['public_key']}")
-        params = {
-            "v": data["updated_at"],
-            "w": fmt["w"],
-        }
+        fmt = self._select_format(data["data"])
 
-        if self.cbz:
-            params["action"] = "dl"
-            base = self.request_json(
-                url, method="POST", params=params, headers=self.headers,
-            )["base"]
-            url = f"{base}?v={data['updated_at']}&w={fmt['w']}"
-            info = text.nameext_from_url(base)
-            if not info["extension"]:
-                info["extension"] = "cbz"
-            return ((url, info),)
-
-        data = self.request_json(url, params=params, headers=self.headers)
+        url = (f"{self.root_api}/books/data/{gid}/{gkey}"
+               f"/{fmt['id']}/{fmt['key']}/{fmt['w']}?crt={crt}")
+        data = self.request_json(url, headers=self.headers)
         base = data["base"]
 
         results = []
         for entry in data["entries"]:
             dimensions = entry["dimensions"]
             info = {
-                "w": dimensions[0],
-                "h": dimensions[1],
+                "width" : dimensions[0],
+                "height": dimensions[1],
                 "_http_headers": self.headers,
             }
             results.append((base + entry["path"], info))
         return results
 
     def _select_format(self, formats):
-        fmt = self.fmt
+        fmt = self.config("format")
 
         if not fmt or fmt == "best":
             fmtids = ("0", "1600", "1280", "980", "780")
@@ -182,7 +184,7 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
         elif isinstance(fmt, list):
             fmtids = fmt
         else:
-            fmtids = (str(self.fmt),)
+            fmtids = (str(fmt),)
 
         for fmtid in fmtids:
             try:
@@ -203,44 +205,39 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
 
 class SchalenetworkSearchExtractor(SchalenetworkExtractor):
     """Extractor for schale.network search results"""
     subcategory = "search"
-    pattern = BASE_PATTERN + r"/\?([^#]*)"
-    example = "https://niyaniya.moe/?s=QUERY"
+    pattern = rf"{BASE_PATTERN}/(?:tag/([^/?#]+)|browse)?(?:/?\?([^#]*))?$"
+    example = "https://niyaniya.moe/browse?s=QUERY"
 
     def items(self):
-        params = text.parse_query(self.groups[1])
+        _, tag, qs = self.groups
+
+        params = text.parse_query(qs)
         params["page"] = text.parse_int(params.get("page"), 1)
+
+        if tag is not None:
+            ns, sep, tag = text.unquote(tag).partition(":")
+            if "+" in tag:
+                tag = tag.replace("+", " ")
+            q = '"' if " " in tag else ""
+            params["s"] = f"{ns}{sep}{q}^{tag}${q}"
+
         return self._pagination("/books", params)
 
 
 class SchalenetworkFavoriteExtractor(SchalenetworkExtractor):
     """Extractor for schale.network favorites"""
     subcategory = "favorite"
-    pattern = BASE_PATTERN + r"/favorites(?:\?([^#]*))?"
+    pattern = rf"{BASE_PATTERN}/favorites(?:\?([^#]*))?"
     example = "https://niyaniya.moe/favorites"
 
     def items(self):
-        self.login()
-
         params = text.parse_query(self.groups[1])
         params["page"] = text.parse_int(params.get("page"), 1)
-        return self._pagination("/favorites", params)
-
-    def login(self):
-        username, password = self._get_auth_info()
-        if username:
-            self.headers["Authorization"] = \
-                "Bearer " + self._login_impl(username, password)
-            return
-
-        raise exception.AuthenticationError("Username and password required")
-
-    @cache(maxage=86400, keyarg=1)
-    def _login_impl(self, username, password):
-        self.log.info("Logging in as %s", username)
+        self.headers["Authorization"] = self._token()
+        return self._pagination(f"/books/favorites?crt={self._crt()}", params)
 
-        url = "https://auth.schale.network/login"
-        data = {"uname": username, "passwd": password}
-        response = self.request(
-            url, method="POST", headers=self.headers, data=data)
-        return response.json()["session"]
+
+SchalenetworkExtractor.extr_class = SchalenetworkGalleryExtractor
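schale.network authentication now rests on a user-supplied `crt` value (plus a bearer `token` for favorites) instead of the removed username/password login. `_crt()` accepts either the bare value or a pasted URL/query string and pulls the `crt` parameter out of it; a standalone sketch of that normalization, using plain `re` and `urllib` in place of gallery-dl's `text` helpers:

    # Standalone sketch of the _crt() normalization shown above.
    import re
    from urllib.parse import parse_qs

    def normalize_crt(crt):
        if re.match(r"^[0-9a-f-]+$", crt):
            return crt                    # already a bare value
        path, _, qs = crt.partition("?")
        values = parse_qs(qs or path).get("crt")  # accept URLs or query strings
        return values[0] if values else None

    print(normalize_crt("0123abcd-89ab"))                            # bare value
    print(normalize_crt("https://niyaniya.moe/g/1/k?crt=0123abcd"))  # full URL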
subcategory = "search" - pattern = BASE_PATTERN + r"/\?([^#]*)" - example = "https://niyaniya.moe/?s=QUERY" + pattern = rf"{BASE_PATTERN}/(?:tag/([^/?#]+)|browse)?(?:/?\?([^#]*))?$" + example = "https://niyaniya.moe/browse?s=QUERY" def items(self): - params = text.parse_query(self.groups[1]) + _, tag, qs = self.groups + + params = text.parse_query(qs) params["page"] = text.parse_int(params.get("page"), 1) + + if tag is not None: + ns, sep, tag = text.unquote(tag).partition(":") + if "+" in tag: + tag = tag.replace("+", " ") + q = '"' + else: + q = "" + q = '"' if " " in tag else "" + params["s"] = f"{ns}{sep}{q}^{tag}${q}" + return self._pagination("/books", params) class SchalenetworkFavoriteExtractor(SchalenetworkExtractor): """Extractor for schale.network favorites""" subcategory = "favorite" - pattern = BASE_PATTERN + r"/favorites(?:\?([^#]*))?" + pattern = rf"{BASE_PATTERN}/favorites(?:\?([^#]*))?" example = "https://niyaniya.moe/favorites" def items(self): - self.login() - params = text.parse_query(self.groups[1]) params["page"] = text.parse_int(params.get("page"), 1) - return self._pagination("/favorites", params) - - def login(self): - username, password = self._get_auth_info() - if username: - self.headers["Authorization"] = \ - "Bearer " + self._login_impl(username, password) - return - - raise exception.AuthenticationError("Username and password required") - - @cache(maxage=86400, keyarg=1) - def _login_impl(self, username, password): - self.log.info("Logging in as %s", username) + self.headers["Authorization"] = self._token() + return self._pagination(f"/books/favorites?crt={self._crt()}", params) - url = "https://auth.schale.network/login" - data = {"uname": username, "passwd": password} - response = self.request( - url, method="POST", headers=self.headers, data=data) - return response.json()["session"] +SchalenetworkExtractor.extr_class = SchalenetworkGalleryExtractor diff --git a/gallery_dl/extractor/simpcity.py b/gallery_dl/extractor/simpcity.py index 8cc7e38..3354289 100644 --- a/gallery_dl/extractor/simpcity.py +++ b/gallery_dl/extractor/simpcity.py @@ -20,18 +20,20 @@ class SimpcityExtractor(Extractor): root = "https://simpcity.cr" def items(self): - extract_urls = text.re(r' href="([^"]+)').findall + extract_urls = text.re( + r'<(?:a [^>]*?href|iframe [^>]*?src)="([^"]+)').findall for post in self.posts(): urls = extract_urls(post["content"]) data = {"post": post} post["count"] = data["count"] = len(urls) + yield Message.Directory, data for data["num"], url in enumerate(urls, 1): yield Message.Queue, url, data def request_page(self, url): try: - return self.request(url).text + return self.request(url) except exception.HttpError as exc: if exc.status == 403 and b">Log in<" in exc.response.content: msg = text.extr(exc.response.text, "blockMessage--error", "</") @@ -44,14 +46,14 @@ class SimpcityExtractor(Extractor): base = f"{self.root}{base}" if pnum is None: - url = base + url = f"{base}/" pnum = 1 else: url = f"{base}/page-{pnum}" pnum = None while True: - page = self.request_page(url) + page = self.request_page(url).text yield page @@ -60,6 +62,31 @@ class SimpcityExtractor(Extractor): pnum += 1 url = f"{base}/page-{pnum}" + def _pagination_reverse(self, base, pnum=None): + base = f"{self.root}{base}" + + url = f"{base}/page-9999" # force redirect to last page + with self.request_page(url) as response: + url = response.url + if url[-1] == "/": + pnum = 1 + else: + pnum = text.parse_int(url[url.rfind("-")+1:], 1) + page = response.text + + while True: + yield page + + 
diff --git a/gallery_dl/extractor/thehentaiworld.py b/gallery_dl/extractor/thehentaiworld.py
new file mode 100644
index 0000000..055d7d8
--- /dev/null
+++ b/gallery_dl/extractor/thehentaiworld.py
@@ -0,0 +1,139 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://thehentaiworld.com/"""
+
+from .common import Extractor, Message
+from .. import text, util
+import collections
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?thehentaiworld\.com"
+
+
+class ThehentaiworldExtractor(Extractor):
+    """Base class for thehentaiworld extractors"""
+    category = "thehentaiworld"
+    root = "https://thehentaiworld.com"
+    filename_fmt = "{title} ({id}{num:?-//}).{extension}"
+    archive_fmt = "{id}_{num}"
+    request_interval = (0.5, 1.5)
+
+    def items(self):
+        for url in self.posts():
+            try:
+                post = self._extract_post(url)
+            except Exception as exc:
+                self.status |= 1
+                self.log.warning("Failed to extract post %s (%s: %s)",
+                                 url, exc.__class__.__name__, exc)
+                continue
+
+            if "file_urls" in post:
+                urls = post["file_urls"]
+                post["count"] = len(urls)
+                yield Message.Directory, post
+                for post["num"], url in enumerate(urls, 1):
+                    text.nameext_from_url(url, post)
+                    yield Message.Url, url, post
+            else:
+                yield Message.Directory, post
+                url = post["file_url"]
+                text.nameext_from_url(url, post)
+                yield Message.Url, url, post
+
+    def _extract_post(self, url):
+        extr = text.extract_from(self.request(url).text)
+
+        post = {
+            "num"  : 0,
+            "count": 1,
+            "title": text.unescape(extr("<title>", "<").strip()),
+            "id"   : text.parse_int(extr(" postid-", " ")),
+            "slug" : extr(" post-", '"'),
+            "tags" : extr('id="tagsHead">', "</ul>"),
+            "date" : text.parse_datetime(extr(
+                "<li>Posted: ", "<"), "%Y-%m-%d"),
+        }
+
+        if "/videos/" in url:
+            post["type"] = "video"
+            post["width"] = post["height"] = 0
+            post["votes"] = text.parse_int(extr("(<strong>", "</strong>"))
+            post["score"] = text.parse_float(extr("<strong>", "<"))
+            post["file_url"] = extr('<source src="', '"')
+        else:
+            post["type"] = "image"
+            post["width"] = text.parse_int(extr("<li>Size: ", " "))
+            post["height"] = text.parse_int(extr("x ", "<"))
+            post["file_url"] = extr('a href="', '"')
+            post["votes"] = text.parse_int(extr("(<strong>", "</strong>"))
+            post["score"] = text.parse_float(extr("<strong>", "<"))
+
+        if doujin := extr('<a id="prev-page"', "</div></div><"):
+            repl = text.re(r"-220x\d+\.").sub
+            post["file_urls"] = [
+                repl(".", url)
+                for url in text.extract_iter(
+                    doujin, 'class="border" src="', '"')
+            ]
+
+        tags = collections.defaultdict(list)
+        pattern = text.re(r'<li><a class="([^"]*)" href="[^"]*">([^<]+)')
+        for tag_type, tag_name in pattern.findall(post["tags"]):
+            tags[tag_type].append(tag_name)
+        post["tags"] = tags_list = []
+        for key, value in tags.items():
+            tags_list.extend(value)
+            post[f"tags_{key}" if key else "tags_general"] = value
+
+        return post
+
+    def _pagination(self, endpoint):
+        base = f"{self.root}{endpoint}"
+        pnum = self.page_start
+
+        while True:
+            url = base if pnum < 2 else f"{base}page/{pnum}/"
+            page = self.request(url).text
+
+            yield from text.extract_iter(text.extr(
+                page, 'id="thumbContainer"', "<script"), ' href="', '"')
+
+            if 'class="next"' not in page:
+                return
+            pnum += 1
+
+
+class ThehentaiworldPostExtractor(ThehentaiworldExtractor):
+    subcategory = "post"
+    pattern = (rf"{BASE_PATTERN}"
+               rf"(/(?:(?:3d-cgi-)?hentai-image|video)s/([^/?#]+))")
+    example = "https://thehentaiworld.com/hentai-images/SLUG/"
+
+    def posts(self):
+        return (f"{self.root}{self.groups[0]}/",)
+
+
+class ThehentaiworldTagExtractor(ThehentaiworldExtractor):
+    subcategory = "tag"
+    per_page = 24
+    page_start = 1
+    post_start = 0
+    directory_fmt = ("{category}", "{search_tags}")
+    pattern = rf"{BASE_PATTERN}/tag/([^/?#]+)"
+    example = "https://thehentaiworld.com/tag/TAG/"
+
+    def posts(self):
+        self.kwdict["search_tags"] = tag = self.groups[0]
+        return util.advance(self._pagination(f"/tag/{tag}/"), self.post_start)
+
+    def skip(self, num):
+        pages, posts = divmod(num, self.per_page)
+        self.page_start += pages
+        self.post_start += posts
+        return num
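`skip()` converts a number of already-handled posts into whole pages plus an in-page offset, so a resumed tag listing can jump straight to the right page instead of refetching everything. The arithmetic, worked for the 24-posts-per-page layout the extractor declares:

    # skip(num) arithmetic from ThehentaiworldTagExtractor, worked by hand.
    per_page = 24
    pages, posts = divmod(60, per_page)
    print(pages, posts)  # 2 12 -> page_start becomes 1 + 2 = 3, then 12 posts are skipped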
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index ed3cfae..e6c84d1 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -2070,7 +2070,7 @@ class TwitterAPI():
             quoted = tweet["quoted_status_result"]["result"]
             quoted["legacy"]["quoted_by"] = (
                 tweet["core"]["user_results"]["result"]
-                ["legacy"]["screen_name"])
+                ["core"]["screen_name"])
             quoted["legacy"]["quoted_by_id_str"] = tweet["rest_id"]
             quoted["sortIndex"] = entry.get("sortIndex")
 
diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py
index e53ecf4..294fc57 100644
--- a/gallery_dl/extractor/vipergirls.py
+++ b/gallery_dl/extractor/vipergirls.py
@@ -51,8 +51,16 @@ class VipergirlsExtractor(Extractor):
             like = False
 
         posts = root.iter("post")
-        if self.page:
-            util.advance(posts, (text.parse_int(self.page[5:]) - 1) * 15)
+        if (order := self.config("order-posts")) and \
+                order[0] not in ("d", "r"):
+            if self.page:
+                util.advance(posts, (text.parse_int(self.page[5:]) - 1) * 15)
+        else:
+            posts = list(posts)
+            if self.page:
+                offset = text.parse_int(self.page[5:]) * 15
+                posts = posts[:offset]
+            posts.reverse()
 
         for post in posts:
             images = list(post)
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index 9d98e68..9369e5d 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -489,9 +489,6 @@ class DownloadJob(Job):
 
         self.extractor.cookies_store()
 
-        if "finalize" in hooks:
-            for callback in hooks["finalize"]:
-                callback(pathfmt)
         if self.status:
             if "finalize-error" in hooks:
                 for callback in hooks["finalize-error"]:
@@ -500,6 +497,9 @@ class DownloadJob(Job):
             if "finalize-success" in hooks:
                 for callback in hooks["finalize-success"]:
                     callback(pathfmt)
+        if "finalize" in hooks:
+            for callback in hooks["finalize"]:
+                callback(pathfmt)
 
     def handle_skip(self):
         pathfmt = self.pathfmt
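The job.py hunks only move the plain `finalize` hooks so that they now fire after `finalize-error`/`finalize-success`; the archive-closing hook registered in postprocessor/common.py below depends on running last. A sketch of the resulting order with hypothetical callbacks (the branch on `status` stands in for the real `self.status` logic):

    # Hypothetical callbacks; demonstrates ordering only.
    hooks = {
        "finalize-success": [lambda pf: print("finalize-success")],
        "finalize": [lambda pf: print("finalize (always last)")],
    }
    status, pathfmt = 0, None

    if status:
        for cb in hooks.get("finalize-error", ()):
            cb(pathfmt)
    else:
        for cb in hooks.get("finalize-success", ()):
            cb(pathfmt)
    for cb in hooks.get("finalize", ()):
        cb(pathfmt)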
diff --git a/gallery_dl/postprocessor/common.py b/gallery_dl/postprocessor/common.py
index 8da8417..9992c56 100644
--- a/gallery_dl/postprocessor/common.py
+++ b/gallery_dl/postprocessor/common.py
@@ -54,7 +54,11 @@ class PostProcessor():
             else:
                 self.log.debug(
                     "Using %s archive '%s'", self.name, archive_path)
+            job.register_hooks({"finalize": self._close_archive})
             return True
 
         self.archive = None
         return False
+
+    def _close_archive(self, _):
+        self.archive.close()
diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py
index c74f92f..a6d2b7f 100644
--- a/gallery_dl/postprocessor/metadata.py
+++ b/gallery_dl/postprocessor/metadata.py
@@ -45,6 +45,15 @@ class MetadataPP(PostProcessor):
                 cfmt = "\n".join(cfmt) + "\n"
             self._content_fmt = formatter.parse(cfmt).format_map
             ext = "txt"
+        elif mode == "print":
+            nl = "\n"
+            if isinstance(cfmt, list):
+                cfmt = f"{nl.join(cfmt)}{nl}"
+            if cfmt[-1] != nl and (cfmt[0] != "\f" or cfmt[1] == "F"):
+                cfmt = f"{cfmt}{nl}"
+            self.write = self._write_custom
+            self._content_fmt = formatter.parse(cfmt).format_map
+            filename = "-"
         elif mode == "jsonl":
             self.write = self._write_json
             self._json_encode = self._make_encoder(options).encode
diff --git a/gallery_dl/postprocessor/python.py b/gallery_dl/postprocessor/python.py
index db71da2..66d9343 100644
--- a/gallery_dl/postprocessor/python.py
+++ b/gallery_dl/postprocessor/python.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2023 Mike Fährmann
+# Copyright 2023-2025 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -17,13 +17,14 @@ class PythonPP(PostProcessor):
     def __init__(self, job, options):
         PostProcessor.__init__(self, job)
 
-        spec = options["function"]
-        module_name, _, function_name = spec.rpartition(":")
-        module = util.import_file(module_name)
-        self.function = getattr(module, function_name)
-
-        if self._init_archive(job, options):
-            self.run = self.run_archive
+        mode = options.get("mode")
+        if mode == "eval" or not mode and options.get("expression"):
+            self.function = util.compile_expression(options["expression"])
+        else:
+            spec = options["function"]
+            module_name, _, function_name = spec.rpartition(":")
+            module = util.import_file(module_name)
+            self.function = getattr(module, function_name)
 
         events = options.get("event")
         if events is None:
@@ -32,6 +33,9 @@ class PythonPP(PostProcessor):
             events = events.split(",")
         job.register_hooks({event: self.run for event in events}, options)
 
+        if self._init_archive(job, options):
+            self.run = self.run_archive
+
     def run(self, pathfmt):
         self.function(pathfmt.kwdict)
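Two user-facing additions sit in these postprocessor hunks: the metadata postprocessor's `"mode": "print"` formats metadata straight to stdout (note the `filename = "-"`), and the python postprocessor now accepts an `expression` compiled through `util.compile_expression` as an alternative to a `module:function` spec. A configuration sketch, written as a Python dict mirroring the JSON config's postprocessors list; the option names come from the hunks above and the tests below, the combination is illustrative only:

    # Config sketch; mirrors the JSON "postprocessors" list as a Python dict.
    postprocessors = [
        {"name": "metadata", "mode": "print", "format": "{category} {id}"},
        {"name": "python", "mode": "eval", "expression": "abort()"},
    ]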
-__version__ = "1.30.7" +__version__ = "1.30.8" __variant__ = None diff --git a/gallery_dl/ytdl.py b/gallery_dl/ytdl.py index cfc6b50..0296498 100644 --- a/gallery_dl/ytdl.py +++ b/gallery_dl/ytdl.py @@ -80,7 +80,10 @@ def parse_command_line(module, argv): parser, opts, args = module.parseOpts(argv) ytdlp = hasattr(module, "cookies") - std_headers = module.std_headers + try: + std_headers = module.utils.networking.std_headers + except AttributeError: + std_headers = module.std_headers try: parse_bytes = module.parse_bytes @@ -345,7 +348,7 @@ def parse_command_line(module, argv): "nopart": opts.nopart, "updatetime": opts.updatetime, "writedescription": opts.writedescription, - "writeannotations": opts.writeannotations, + "writeannotations": getattr(opts, "writeannotations", None), "writeinfojson": opts.writeinfojson, "allow_playlist_files": opts.allow_playlist_files, "clean_infojson": opts.clean_infojson, @@ -378,7 +381,8 @@ def parse_command_line(module, argv): "max_views": opts.max_views, "daterange": date, "cachedir": opts.cachedir, - "youtube_print_sig_code": opts.youtube_print_sig_code, + "youtube_print_sig_code": getattr( + opts, "youtube_print_sig_code", None), "age_limit": opts.age_limit, "download_archive": download_archive_fn, "break_on_existing": getattr(opts, "break_on_existing", None), @@ -394,8 +398,8 @@ def parse_command_line(module, argv): "socket_timeout": opts.socket_timeout, "bidi_workaround": opts.bidi_workaround, "debug_printtraffic": opts.debug_printtraffic, - "prefer_ffmpeg": opts.prefer_ffmpeg, - "include_ads": opts.include_ads, + "prefer_ffmpeg": getattr(opts, "prefer_ffmpeg", None), + "include_ads": getattr(opts, "include_ads", None), "default_search": opts.default_search, "dynamic_mpd": getattr(opts, "dynamic_mpd", None), "extractor_args": getattr(opts, "extractor_args", None), @@ -420,7 +424,7 @@ def parse_command_line(module, argv): opts, "sleep_interval_subtitles", None), "external_downloader": opts.external_downloader, "playlist_items": opts.playlist_items, - "xattr_set_filesize": opts.xattr_set_filesize, + "xattr_set_filesize": getattr(opts, "xattr_set_filesize", None), "match_filter": match_filter, "no_color": getattr(opts, "no_color", None), "ffmpeg_location": opts.ffmpeg_location, @@ -430,7 +434,7 @@ def parse_command_line(module, argv): opts, "hls_split_discontinuity", None), "external_downloader_args": opts.external_downloader_args, "postprocessor_args": opts.postprocessor_args, - "cn_verification_proxy": opts.cn_verification_proxy, + "cn_verification_proxy": getattr(opts, "cn_verification_proxy", None), "geo_verification_proxy": opts.geo_verification_proxy, "geo_bypass": getattr( opts, "geo_bypass", "default"), diff --git a/test/test_extractor.py b/test/test_extractor.py index f8b8f09..a623e1d 100644 --- a/test/test_extractor.py +++ b/test/test_extractor.py @@ -156,6 +156,9 @@ class TestExtractorModule(unittest.TestCase): self.fail(f"{cls.__name__} pattern does not match " f"example URL '{cls.example}'") + self.assertEqual(cls, extr.__class__) + self.assertEqual(cls, extractor.find(cls.example).__class__) + extr.request = fail_request extr.initialize() extr.finalize() diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py index 07bd348..2902fea 100644 --- a/test/test_postprocessor.py +++ b/test/test_postprocessor.py @@ -20,7 +20,7 @@ import collections from datetime import datetime sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from gallery_dl import extractor, output, path, util # noqa E402 +from 
diff --git a/test/test_extractor.py b/test/test_extractor.py
index f8b8f09..a623e1d 100644
--- a/test/test_extractor.py
+++ b/test/test_extractor.py
@@ -156,6 +156,9 @@ class TestExtractorModule(unittest.TestCase):
                 self.fail(f"{cls.__name__} pattern does not match "
                           f"example URL '{cls.example}'")
 
+            self.assertEqual(cls, extr.__class__)
+            self.assertEqual(cls, extractor.find(cls.example).__class__)
+
             extr.request = fail_request
             extr.initialize()
             extr.finalize()
diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py
index 07bd348..2902fea 100644
--- a/test/test_postprocessor.py
+++ b/test/test_postprocessor.py
@@ -20,7 +20,7 @@ import collections
 from datetime import datetime
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from gallery_dl import extractor, output, path, util  # noqa E402
+from gallery_dl import extractor, output, path, util, exception  # noqa E402
 from gallery_dl import postprocessor, config  # noqa E402
 from gallery_dl.postprocessor.common import PostProcessor  # noqa E402
 
@@ -555,6 +555,17 @@ class MetadataTest(BasePostprocessorTest):
         test({"mode": "custom", "format": "{foo}\n{missing}\n"})
         test({"format": "{foo}\n{missing}\n"})
 
+    def test_metadata_mode_print(self):
+        self._create(
+            {"mode": "print", "format": "{foo}\n{missing}"},
+            {"foo": "bar"},
+        )
+
+        with patch("sys.stdout", Mock()) as m:
+            self._trigger()
+
+        self.assertEqual(self._output(m), "bar\nNone\n")
+
     def test_metadata_extfmt(self):
         pp = self._create({
             "extension" : "ignored",
@@ -867,6 +878,18 @@ class PythonTest(BasePostprocessorTest):
             self._trigger()
         self.assertEqual(self.pathfmt.kwdict["_result"], 24)
 
+    def test_eval(self):
+        self._create({"mode": "eval", "expression": "abort()"})
+
+        with self.assertRaises(exception.StopExtraction):
+            self._trigger()
+
+    def test_eval_auto(self):
+        self._create({"expression": "abort()"})
+
+        with self.assertRaises(exception.StopExtraction):
+            self._trigger()
+
     def _write_module(self, path):
         with open(path, "w") as fp:
             fp.write("""
diff --git a/test/test_util.py b/test/test_util.py
index 4a76769..bfaab01 100644
--- a/test/test_util.py
+++ b/test/test_util.py
@@ -1041,6 +1041,9 @@ value = 123
         self.assertEqual(response.links.get("next"), None)
         self.assertEqual(response.close(), None)
 
+        with response as ctx:
+            self.assertIs(response, ctx)
+
 
 class TestExtractor():
     category = "test_category"
diff --git a/test/test_ytdl.py b/test/test_ytdl.py
index 88933e4..1f28c9a 100644
--- a/test/test_ytdl.py
+++ b/test/test_ytdl.py
@@ -42,8 +42,6 @@ class Test_CommandlineArguments(unittest.TestCase):
     def test_proxy(self):
         self._(["--proxy", "socks5://127.0.0.1:1080/"],
                "proxy", "socks5://127.0.0.1:1080/")
-        self._(["--cn-verification-proxy", "https://127.0.0.1"],
-               "cn_verification_proxy", "https://127.0.0.1")
         self._(["--geo-verification-proxy", "127.0.0.1"],
                "geo_verification_proxy", "127.0.0.1")
 
@@ -105,7 +103,10 @@ class Test_CommandlineArguments(unittest.TestCase):
                "geo_bypass_ip_block", "198.51.100.14/24")
 
     def test_headers(self):
-        headers = self.module.std_headers
+        try:
+            headers = self.module.utils.networking.std_headers
+        except AttributeError:
+            headers = self.module.std_headers
 
         self.assertNotEqual(headers["User-Agent"], "Foo/1.0")
         self._(["--user-agent", "Foo/1.0"])
@@ -194,8 +195,6 @@ class Test_CommandlineArguments(unittest.TestCase):
     })
 
     def test_xattr(self):
-        self._("--xattr-set-filesize", "xattr_set_filesize", True)
-
         opts = self._("--xattrs")
         self.assertEqual(opts["postprocessors"][0], {"key": "XAttrMetadata"})
